From 642fe380de54f41febec35bfb8e24b59d31d1176 Mon Sep 17 00:00:00 2001
From: emamaker <emamaker0@gmail.com>
Date: Sun, 30 Jan 2022 17:09:24 +0100
Subject: [PATCH] Initial commit

---
 .gitignore         |   4 ++
 README.md          |  16 ++++++
 browser_manager.py |  64 ++++++++++++++++++++++
 page_viewer.py     | 130 +++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt   |   5 ++
 5 files changed, 219 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 browser_manager.py
 create mode 100644 page_viewer.py
 create mode 100644 requirements.txt
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..24d83a9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+chromedriver
+tor-data-dir*
+tor
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b3655ac
--- /dev/null
+++ b/README.md
@@ -0,0 +1,16 @@
+# PageViewerBot
+A bot to act as a user surfing a website, using [UndetectedChromedriver](https://github.com/ultrafunkamsterdam/undetected-chromedriver) and the [Tor Network](https://torproject.org)
+
+# Usage
+Number of concurrent processes and website to visit are hardcoded for now, and so are other parameters. You will have to change thos directly in the code.<br>
+Tor daemon executable is passed as environment variable TOR_PATH, you can download it on [torproject.org](https://torproject.org) or with your distro's package manager, then launch
+
+> TOR_PATH= _path_to_tor_daemon_ python3 page_viewer.py
+
+# TODO
+[] use ArgumentParser for stuff that needs to be customized at use time
+
+[] Configuration file (?)
+
+# Important disclaimer
+This project was made only for fun and educational purposes. I am not responsible if you can rate limited/temp banned/perm banned by website or anything else that derives from your use of this project
\ No newline at end of file
diff --git a/browser_manager.py b/browser_manager.py
new file mode 100644
index 0000000..2bdffa8
--- /dev/null
+++ b/browser_manager.py
@@ -0,0 +1,64 @@
+import os
+import re
+import stem
+import socket
+import requests
+from contextlib import closing
+import undetected_chromedriver as uc
+
+import threading
+
+def find_free_port():  # bind a throwaway TCP socket and return the port number it was given
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(('', 0))  # port 0 asks the OS to pick any free port
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]  # NOTE(review): the socket is closed on return, so the port could be taken by someone else before the caller uses it
+
+def create_tor_proxy(socks_port,control_port):
+    """Launch a Tor process (binary path read from $TOR_PATH) on the given ports; return it, or None if launching failed."""
+    import stem.process  # `import stem` alone does not load the stem.process submodule used below
+    TOR_PATH = os.environ['TOR_PATH']
+    try:
+        tor_process = stem.process.launch_tor_with_config(
+          config = {
+            'SocksPort': str(socks_port),
+            'ControlPort' : str(control_port),
+            'MaxCircuitDirtiness' : '300',
+            'DataDirectory' : "tor-data-dir-" + (str(threading.get_native_id()))  # one data directory per calling thread
+          },
+          init_msg_handler = lambda line: print(line) if re.search('Bootstrapped', line) else False,
+          tor_cmd = TOR_PATH
+        )
+        print("[INFO] Tor connection created.")
+    except Exception as e :
+        tor_process = None
+        print("[ERROR] Starting new TOR", e)
+        print("[INFO] Using existing tor connection.")
+    return tor_process
+
+def start_browser(use_tor=False):
+    """Start an undetected-chromedriver Chrome, optionally routed through a freshly launched Tor proxy.
+
+    Returns ``(driver, tor_process)``; ``tor_process`` is None when Tor is not used or could not be started.
+    Raises a ``requests`` exception when the external-IP check fails or times out.
+    """
+    options = uc.ChromeOptions()
+    options.add_argument('--no-first-run')
+    options.add_argument('--password-store=basic')
+
+    tor_process = None
+    proxies = {}  # requests expects a mapping (or None) here, not a list
+    if use_tor:
+        SOCKS_PORT = find_free_port()
+        CONTROL_PORT = find_free_port()
+        tor_process = create_tor_proxy(SOCKS_PORT, CONTROL_PORT)
+        # socks5h makes requests resolve host names through the proxy instead of locally
+        proxies = {'http' : f'socks5h://localhost:{SOCKS_PORT}','https' : f'socks5h://localhost:{SOCKS_PORT}'}
+        options.add_argument(f'--proxy-server=socks5://localhost:{SOCKS_PORT}')
+
+    # requests has no default timeout; without one a stalled connection would block this thread forever
+    ip = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=30).json()["origin"]
+    print (f'IP is {ip}')
+    driver = uc.Chrome(options=options)
+    return driver, tor_process
+    
\ No newline at end of file
diff --git a/page_viewer.py b/page_viewer.py
new file mode 100644
index 0000000..c7886e0
--- /dev/null
+++ b/page_viewer.py
@@ -0,0 +1,130 @@
+# Use undetected-chromedriver together with stem to use the tor network
+import undetected_chromedriver as uc
+from tbselenium.utils import launch_tbb_tor_with_stem
+
+import time
+import random
+
+from selenium.webdriver.support.ui import Select
+import selenium.webdriver as webdriver
+import selenium.common.exceptions as sexceptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+import requests
+import browser_manager
+
+import threading
+
+outside_website_scopes = ["blog.altervista.org", "en.altervista.org", "it.altervista.org",  "pinterest",
+                        "facebook.com", "instagram.com", "iubenda.com", "twitter.com", "#"]  # links whose href contains any of these substrings are never clicked
+
+ACCEPT_COOKIES_BTN_SEL = ".iubenda-cs-accept-btn"  # CSS selector clicked right after the page loads to accept cookies
+MAX_REDIRECTION = 15  # upper bound on click iterations performed by visit_page
+LOOK_FOR_ADS = True  # when True, visit_page sometimes tries an ad iframe instead of a regular link
+
+def visit_page(driver, url):
+    """Open ``url``, accept the cookie banner if one shows up, then click around like a visitor; quits ``driver`` at the end."""
+    # go to the page
+    print(f"Visiting: {url}")
+    driver.get(url)
+
+    # time.sleep a little bit to wait for all of the page to be loaded (especially ads) and to simulate a user reading. Keep in mind the profile is preset to accept cookies on this website
+    time.sleep(3)
+    try:
+        click(driver, By.CSS_SELECTOR, ACCEPT_COOKIES_BTN_SEL, 10)
+    except (sexceptions.TimeoutException, sexceptions.ElementClickInterceptedException):
+        print("No clickable cookie banner found, continuing")  # a missing banner must not abort the whole visit
+    time.sleep(4)
+
+    for _ in range(MAX_REDIRECTION):
+        time.sleep(6)
+        # with LOOK_FOR_ADS off always follow a regular link; with it on do so about 65% of the time
+        if not LOOK_FOR_ADS or random.random() < 0.65:
+            try_to_click_elements(driver, get_possibile_webelements(driver))
+        else:
+            # Try a displayed ad iframe; it will probably lead to a link of this type: https://adclick.g.doubleclick.net/aclk?sa=l&ai=...&adurl=https://www.avaya.com/...
+            # if no iframes are available or none of them contains an ad, this will just skip
+            if try_to_click_elements(driver, get_clickable_ads(driver)):
+                time.sleep(4)
+                break
+
+    driver.quit()
+
+def try_to_click_elements(driver, clickable_elements):
+    """Click one random element; return True on success, False if none could be clicked.
+
+    Elements that fail to click are popped from ``clickable_elements`` in place."""
+    while clickable_elements:  # test for emptiness first: random.randint(0, -1) raises ValueError
+        index = random.randint(0, len(clickable_elements) - 1)
+        try:
+            print("trying to click element", clickable_elements[index])
+            clickable_elements[index].click()
+            return True
+        except Exception:  # not a bare except, so KeyboardInterrupt/SystemExit still propagate
+            print(clickable_elements[index], "was not clickable, deleting and trying again")
+            clickable_elements.pop(index)
+    print("All the elements weren't clickable!")
+    return False
+
+
+def get_clickable_ads(driver):
+    """Return the displayed iframes whose ``id`` contains 'ad' (which also covers ids containing 'google_ad')."""
+    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer Selenium 4 releases no longer provide
+    id_ok_ads = [frame for frame in driver.find_elements(By.XPATH, './/iframe') if 'ad' in (frame.get_attribute('id') or '') and frame.is_displayed()]
+    print(id_ok_ads)
+    return id_ok_ads
+
+
+def get_possibile_webelements(driver):
+    """Return the clickable ``<a href>`` elements of the current page that stay inside the target website.
+
+    Links whose href contains any entry of ``outside_website_scopes`` are dropped.
+    """
+    clickable_elements = []
+
+    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer Selenium 4 releases no longer provide
+    for el in driver.find_elements(By.XPATH, './/a'):
+        href = el.get_attribute('href')
+        # anchors without an href are not considered
+        if href is None:
+            continue
+
+        try:
+            # short wait so that only links clickable right now are kept
+            WebDriverWait(driver, 0.25).until(EC.element_to_be_clickable(el))
+        except Exception:
+            continue
+
+        # keep only links that do not point outside of the website
+        if not any(outside_scope in href for outside_scope in outside_website_scopes):
+            clickable_elements.append(el)
+
+    return clickable_elements
+
+def click(driver, by, desc, timeout):  # wait up to `timeout` seconds for the element located by (by, desc) to become clickable, then click it
+    WebDriverWait(driver, timeout).until(EC.element_to_be_clickable((by, desc))).click()  # until() raises if the wait times out
+
+def launch_and_visit(use_tor, page_url):
+    """Start a browser (through Tor when ``use_tor``), run one visit of ``page_url``, then always clean up."""
+    driver, tor_process = browser_manager.start_browser(use_tor)
+    try:
+        visit_page(driver, page_url)
+        time.sleep(5)
+    finally:  # release the browser and the Tor process even when the visit raised
+        driver.quit()
+        if tor_process is not None:  # None when Tor was not requested or failed to launch
+            tor_process.kill()
+
+if __name__ == "__main__":
+    threads = []
+
+    for i in range(0, 3):  # number of concurrent visitors is hard-coded
+        t1 = threading.Thread(target=launch_and_visit, args=(True, 'https://giangillorossi.altervista.org'))  # use_tor=True, target URL hard-coded
+        t1.start()
+
+        threads.append(t1)
+    # wait for every visitor thread to finish
+    for t in threads:
+        t.join()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8adf8f9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+tbselenium
+stem
+undetected-chromedriver==3.1.0
+requests
+requests[socks]
\ No newline at end of file