# Use undetected-chromedriver together with stem to use the tor network
import undetected_chromedriver as uc
from tbselenium.utils import launch_tbb_tor_with_stem
import time
import random
from selenium.webdriver.support.ui import Select
import selenium.webdriver as webdriver
import selenium.common.exceptions as sexceptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import requests
import browser_manager
import threading

# A link is skipped as a random click target when its href contains any of
# these fragments.
outside_website_scopes = [
    "blog.altervista.org",
    "en.altervista.org",
    "it.altervista.org",
    "pinterest",
    "facebook.com",
    "instagram.com",
    "iubenda.com",
    "twitter.com",
    "#",
]
# CSS selector of the cookie banner's "accept" button.
ACCEPT_COOKIES_BTN_SEL = ".iubenda-cs-accept-btn"
# Upper bound on the number of click attempts performed during one visit.
MAX_REDIRECTION = 15
# When False, only regular links are clicked and ad iframes are never tried.
LOOK_FOR_ADS = True


def visit_page(driver, url) -> None:
    """Open *url*, then click random links (or ad iframes) for a while.

    The driver is always quit before this function returns or raises.

    Args:
        driver: Selenium WebDriver used for the whole visit.
        url: Address of the first page to load.
    """
    try:
        # go to the page
        print(f"Visiting: {url}")
        driver.get(url)
        # Sleep a little to let the whole page load (especially ads) and to
        # simulate a user reading. Keep in mind the profile is preset to
        # accept cookies on this website, so the banner may never show up.
        time.sleep(3)
        try:
            click(driver, By.CSS_SELECTOR, ACCEPT_COOKIES_BTN_SEL, 10)
        except sexceptions.TimeoutException:
            # No clickable accept button within the timeout: carry on with
            # the visit instead of aborting it.
            print("Cookie accept button not found, continuing")
        time.sleep(4)

        for _ in range(MAX_REDIRECTION):
            time.sleep(6)
            if not LOOK_FOR_ADS or random.random() < 0.65:
                # click random element
                try_to_click_elements(driver, get_possibile_webelements(driver))
            # Otherwise try the displayed iframes whose id contains "ad".
            # Such a frame usually wraps a link like
            # https://adclick.g.doubleclick.net/aclk?sa=l&ai=...&adurl=https://www.avaya.com/...
            # If no such iframe is available, or none can be clicked, this
            # iteration just does nothing.
            elif try_to_click_elements(driver, get_clickable_ads(driver)):
                # An ad was clicked: wait a little, then end the visit.
                time.sleep(4)
                break
    finally:
        driver.quit()


def try_to_click_elements(driver, clickable_elements) -> bool:
    """Click one randomly chosen element, retrying with others on failure.

    Elements whose click raises a Selenium error are removed from
    *clickable_elements* (the list is modified in place) and another random
    element is tried.

    Args:
        driver: Unused; kept so existing callers keep working.
        clickable_elements: Mutable list of candidate web elements.

    Returns:
        True as soon as one click succeeds, False when the list is empty or
        every element failed to be clicked.
    """
    while True:
        # The emptiness check must come first: picking a random index from
        # an empty list would raise ValueError.
        if not clickable_elements:
            print("All the elements weren't clickable!")
            return False
        index = random.randint(0, len(clickable_elements) - 1)
        candidate = clickable_elements[index]
        try:
            print("trying to click element", candidate)
            candidate.click()
            return True
        except sexceptions.WebDriverException:
            print(candidate, "was not clickable, deleting and trying again")
            clickable_elements.pop(index)


def get_clickable_ads(driver) -> list:
    """Return the displayed iframes whose ``id`` attribute contains "ad".

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        List of matching iframe elements (possibly empty).
    """
    id_ok_ads = []
    for frame in driver.find_elements(By.XPATH, './/iframe'):
        # Treat a missing id as an empty string so the substring test below
        # cannot fail on None. "ad" also matches ids containing "google_ad".
        frame_id = frame.get_attribute('id') or ''
        if 'ad' in frame_id and frame.is_displayed():
            id_ok_ads.append(frame)
    print(id_ok_ads)
    return id_ok_ads


def get_possibile_webelements(driver) -> list:
    """Collect the clickable links of the current page that may be followed.

    A link qualifies when it has an href, becomes clickable within 0.25
    seconds, and its href contains none of ``outside_website_scopes``.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        List of qualifying anchor elements (possibly empty).
    """
    # Fetch every anchor carrying an href, remembering the href so it is
    # read from the browser only once per element.
    anchors = []
    for element in driver.find_elements(By.XPATH, './/a'):
        href = element.get_attribute('href')
        if href is not None:
            anchors.append((element, href))

    clickable_elements = []
    for element, href in anchors:
        try:
            WebDriverWait(driver, 0.25).until(EC.element_to_be_clickable(element))
        except sexceptions.WebDriverException:
            # Timed out (or the element went stale): not a usable target.
            continue
        if not any(scope in href for scope in outside_website_scopes):
            clickable_elements.append(element)
    return clickable_elements


def click(driver, by, desc, timeout) -> None:
    """Wait until the located element is clickable, then click it.

    Args:
        driver: Selenium WebDriver to search in.
        by: Locator strategy (a ``By`` constant).
        desc: Locator value matching the strategy.
        timeout: Maximum number of seconds to wait.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element does not
            become clickable within *timeout* seconds.
    """
    WebDriverWait(driver, timeout).until(EC.element_to_be_clickable((by, desc))).click()


def launch_and_visit(use_tor, page_url, headless=False) -> None:
    """Start a browser through ``browser_manager``, visit a page, clean up.

    ``browser_manager.close_browser`` is called even when the visit raises.

    Args:
        use_tor: Forwarded to ``browser_manager.start_browser``.
        page_url: Address handed to ``visit_page``.
        headless: Forwarded to ``browser_manager.start_browser``.
    """
    driver, tor_process = browser_manager.start_browser(use_tor=use_tor, headless=headless)
    try:
        visit_page(driver, page_url)
        time.sleep(5)
    finally:
        browser_manager.close_browser(driver, tor_process)


if __name__ == "__main__":
    launch_and_visit(use_tor=True, page_url='https://giangillorossi.altervista.org', headless=False)