From 642fe380de54f41febec35bfb8e24b59d31d1176 Mon Sep 17 00:00:00 2001 From: emamaker Date: Sun, 30 Jan 2022 17:09:24 +0100 Subject: [PATCH] Initial commit --- .gitignore | 4 ++ README.md | 16 ++++++ browser_manager.py | 64 ++++++++++++++++++++++ page_viewer.py | 130 +++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 5 ++ 5 files changed, 219 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 browser_manager.py create mode 100644 page_viewer.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..24d83a9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +chromedriver +tor-data-dir* +tor \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b3655ac --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +# PageViewerBot +A bot to act as a user surfing a website, using [UndetectedChromedriver](https://github.com/ultrafunkamsterdam/undetected-chromedriver) and the [Tor Network](https://torproject.org) + +# Usage +Number of concurrent processes and website to visit are hardcoded for now, and so are other parameters. You will have to change those directly in the code.
# browser_manager.py -- start a Chrome instance (undetected-chromedriver),
# optionally routed through a freshly launched Tor daemon.
#
# Usage note: the Tor daemon executable is passed through the TOR_PATH
# environment variable, e.g.
#     TOR_PATH=/path/to/tor python3 page_viewer.py
import os
import re
import socket
import threading
from contextlib import closing


def find_free_port():
    """Return a TCP port number that was free when this function ran.

    The port is obtained by binding a throw-away socket to port 0 and reading
    back the port the OS picked. The socket is closed before returning, so
    another process could in principle grab the port before the caller does.
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        # SO_REUSEADDR only has an effect when set *before* bind().
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(('', 0))
        return s.getsockname()[1]


def create_tor_proxy(socks_port, control_port):
    """Launch a Tor daemon listening on the given SOCKS and control ports.

    Args:
        socks_port: port for Tor's SOCKS listener.
        control_port: port for Tor's control listener.

    Returns:
        The process object returned by stem, or ``None`` when Tor could not
        be started (the error is printed, not raised).

    Raises:
        KeyError: if the ``TOR_PATH`` environment variable is not set.
    """
    tor_path = os.environ['TOR_PATH']

    # `import stem` alone does not load the `stem.process` submodule, so it
    # is imported explicitly here instead of relying on another module
    # having imported it first.
    import stem.process

    def _print_bootstrap_progress(line):
        # Only echo Tor's bootstrap progress lines.
        if re.search('Bootstrapped', line):
            print(line)

    try:
        tor_process = stem.process.launch_tor_with_config(
            config={
                'SocksPort': str(socks_port),
                'ControlPort': str(control_port),
                'MaxCircuitDirtiness': '300',
                # One data directory per thread so that concurrent Tor
                # instances do not share state.
                'DataDirectory': "tor-data-dir-" + str(threading.get_native_id()),
            },
            init_msg_handler=_print_bootstrap_progress,
            tor_cmd=tor_path,
        )
        print("[INFO] Tor connection created.")
    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        tor_process = None
        print("[ERROR] Starting new TOR", e)
        print("[INFO] Using existing tor connection.")

    return tor_process


def start_browser(use_tor=False):
    """Create a Chrome driver, optionally proxied through a new Tor daemon.

    Args:
        use_tor: when true, launch Tor on two free ports and point both the
            IP check and Chrome at its SOCKS port.

    Returns:
        A ``(driver, tor_process)`` tuple. ``tor_process`` is ``None`` when
        ``use_tor`` is false or when Tor failed to start.
    """
    import requests
    import undetected_chromedriver as uc

    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    options.add_argument('--password-store=basic')

    tor_process = None
    proxies = None
    if use_tor:
        socks_port = find_free_port()
        control_port = find_free_port()

        tor_process = create_tor_proxy(socks_port, control_port)
        proxy_url = f'socks5://localhost:{socks_port}'
        proxies = {'http': proxy_url, 'https': proxy_url}

        options.add_argument(f'--proxy-server={proxy_url}')

    # The IP lookup is purely informational: bound it with a timeout and do
    # not let a failure abort the browser start-up.
    try:
        ip = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=30).json()["origin"]
        print(f'IP is {ip}')
    except (requests.RequestException, ValueError, KeyError) as e:
        print("[WARN] Could not determine the external IP:", e)

    try:
        driver = uc.Chrome(options=options)
    except Exception:
        # Do not leave an orphaned Tor daemon behind if Chrome fails to start.
        if tor_process is not None:
            tor_process.kill()
        raise

    return driver, tor_process
# page_viewer.py -- use undetected-chromedriver together with stem (Tor) to
# browse a website like a casual visitor.
#
# Third-party packages listed in requirements.txt: tbselenium, stem,
# undetected-chromedriver==3.1.0, requests, requests[socks].
import random
import threading
import time

# Links whose href contains any of these fragments are never clicked.
outside_website_scopes = ["blog.altervista.org", "en.altervista.org", "it.altervista.org", "pinterest",
                          "facebook.com", "instagram.com", "iubenda.com", "twitter.com", "#"]

ACCEPT_COOKIES_BTN_SEL = ".iubenda-cs-accept-btn"
MAX_REDIRECTION = 15
LOOK_FOR_ADS = True


def visit_page(driver, url):
    """Open *url* and click around on it like a visitor would.

    Up to ``MAX_REDIRECTION`` rounds are performed. In each round either a
    random in-scope link is clicked or, with ``LOOK_FOR_ADS`` enabled and
    roughly 35% probability, an ad iframe is clicked; a successful ad click
    ends the visit.

    The driver is left open: the caller owns it and must quit it.
    """
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    # go to the page
    print(f"Visiting: {url}")
    driver.get(url)

    # Sleep a little to let the whole page load (especially ads) and to
    # simulate a user reading. The profile may already be set to accept
    # cookies on this website, in which case the banner never shows up, so a
    # failed click on it must not abort the visit.
    time.sleep(3)
    try:
        click(driver, By.CSS_SELECTOR, ACCEPT_COOKIES_BTN_SEL, 10)
    except WebDriverException:
        print("Cookie banner not found or not clickable, continuing")
    time.sleep(4)

    for _ in range(MAX_REDIRECTION):
        time.sleep(6)

        if not LOOK_FOR_ADS or random.random() < 0.65:
            # click a random in-scope link
            try_to_click_elements(driver, get_possibile_webelements(driver))
        else:
            # Try the visible iframes whose id looks like an ad slot (such
            # frames typically lead to links like
            # https://adclick.g.doubleclick.net/aclk?...&adurl=...).
            # If no such iframe exists or none is clickable, skip this round.
            if try_to_click_elements(driver, get_clickable_ads(driver)):
                time.sleep(4)
                break


def try_to_click_elements(driver, clickable_elements):
    """Click one randomly chosen element, retrying with others on failure.

    Args:
        driver: unused, kept for interface compatibility.
        clickable_elements: candidate objects exposing a ``click()`` method.
            The caller's list is not modified.

    Returns:
        ``True`` as soon as one click succeeds, ``False`` when the list is
        empty or every candidate failed.
    """
    candidates = list(clickable_elements)

    # Check for emptiness *before* drawing an index: random.randint(0, -1)
    # raises ValueError.
    while candidates:
        index = random.randint(0, len(candidates) - 1)
        element = candidates[index]

        try:
            print("trying to click element", element)
            element.click()
            return True
        except Exception:
            print(element, "was not clickable, deleting and trying again")
            candidates.pop(index)

    print("All the elements weren't clickable!")
    return False


def get_clickable_ads(driver):
    """Return the displayed iframes whose ``id`` contains ``"ad"``.

    Note that the substring test is loose: any id containing "ad" matches,
    which includes the "google_ad..." ids.
    """
    from selenium.webdriver.common.by import By

    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3.
    frames = driver.find_elements(By.XPATH, './/iframe')
    # get_attribute may return None when the attribute is missing.
    id_ok_ads = [
        frame for frame in frames
        if 'ad' in (frame.get_attribute('id') or '') and frame.is_displayed()
    ]
    print(id_ok_ads)

    return id_ok_ads


def get_possibile_webelements(driver):
    """Return the clickable ``<a>`` elements that stay inside the website.

    An anchor is kept when it has an ``href``, becomes clickable within a
    quarter of a second, and its ``href`` contains none of the fragments in
    ``outside_website_scopes``.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    clickable_elements = []
    for el in driver.find_elements(By.XPATH, './/a'):
        try:
            href = el.get_attribute('href')
            if href is None:
                continue
            WebDriverWait(driver, 0.25).until(EC.element_to_be_clickable(el))
        except Exception:
            # Not clickable in time (or the element went stale): skip it.
            continue

        if not any(scope in href for scope in outside_website_scopes):
            clickable_elements.append(el)

    return clickable_elements


def click(driver, by, desc, timeout):
    """Wait up to *timeout* seconds for the element ``(by, desc)`` and click it.

    Raises whatever the wait or the click raises (e.g. a timeout).
    """
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    WebDriverWait(driver, timeout).until(EC.element_to_be_clickable((by, desc))).click()


def launch_and_visit(use_tor, page_url):
    """Start a browser, visit *page_url*, then always clean up.

    The browser is quit exactly once and the Tor process (if one was
    actually started) is killed, even when the visit raises.
    """
    import browser_manager

    driver, tor_process = browser_manager.start_browser(use_tor)
    try:
        visit_page(driver, page_url)
        time.sleep(5)
    finally:
        try:
            driver.quit()
        finally:
            # tor_process is None when Tor was not requested or failed to start.
            if tor_process is not None:
                tor_process.kill()


if __name__ == "__main__":
    threads = []

    for _ in range(3):
        worker = threading.Thread(target=launch_and_visit,
                                  args=(True, 'https://giangillorossi.altervista.org'))
        worker.start()
        threads.append(worker)

    for worker in threads:
        worker.join()
+requests[socks] \ No newline at end of file