# PageViewerBot/page_viewer.py

# Use undetected-chromedriver together with stem to use the tor network
import time
import random
import os

from selenium.webdriver.support.ui import Select
import selenium.webdriver as webdriver
import selenium.common.exceptions as sexceptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import requests
import browser_manager

import threading

# Substrings that mark a link as leading away from the visited site.
# get_possibile_webelements() skips every anchor whose href contains one of
# these (note that "#" therefore also excludes any href containing a fragment).
outside_website_scopes = [
    "blog.altervista.org",
    "en.altervista.org",
    "it.altervista.org",
    "pinterest",
    "facebook.com",
    "instagram.com",
    "iubenda.com",
    "twitter.com",
    "#",
]

# CSS selector of the cookie-consent "accept" button clicked by visit_page().
ACCEPT_COOKIES_BTN_SEL = ".iubenda-cs-accept-btn"

# Upper bound on the number of click attempts visit_page() makes per visit.
MAX_REDIRECTION = 15

# When True, visit_page() sometimes tries to click an ad iframe instead of a link.
LOOK_FOR_ADS = True

def visit_page(driver, url):
    """Open *url* and click around the page, then quit the driver.

    After loading the page the cookie banner is accepted when it shows up.
    Then, up to MAX_REDIRECTION times, either a random in-scope link or
    (when LOOK_FOR_ADS is set, roughly 35% of the time) a displayed ad iframe
    is clicked. The loop stops early as soon as an ad click succeeds.

    Args:
        driver: Selenium WebDriver used for the visit; it is always quit
            before this function returns or raises.
        url: Address of the page to open.
    """
    try:
        # go to the page
        print(f"Visiting: {url}")
        driver.get(url)

        # Sleep a little to let the whole page load (especially ads) and to
        # simulate a user reading.
        time.sleep(3)
        try:
            click(driver, By.CSS_SELECTOR, ACCEPT_COOKIES_BTN_SEL, 10)
        except sexceptions.TimeoutException:
            # The profile may already have the cookies accepted, in which case
            # the banner never appears; that must not abort the visit.
            print("Cookie banner not found, continuing without accepting it")
        time.sleep(4)

        for _ in range(MAX_REDIRECTION):
            time.sleep(6)

            if not LOOK_FOR_ADS or random.random() < 0.65:
                # click random element
                try_to_click_elements(driver, get_possibile_webelements(driver))
            elif try_to_click_elements(driver, get_clickable_ads(driver)):
                # An ad iframe was clicked: give the landing page a moment
                # and end the visit. If no ad was available or clickable,
                # just go on with the next iteration.
                time.sleep(4)
                break
    finally:
        # Release the browser even when something above raised.
        driver.quit()

def try_to_click_elements(driver, clickable_elements):
    """Click one randomly chosen element, retrying with others on failure.

    Elements are picked at random; an element whose ``click()`` raises is
    removed from *clickable_elements* (the list is modified in place) and
    another one is tried.

    Args:
        driver: Unused; kept so existing callers keep working.
        clickable_elements: Mutable list of objects exposing ``click()``.

    Returns:
        True as soon as one click succeeds, False when the list is empty or
        every element failed to be clicked.
    """
    # Check for emptiness *before* drawing an index: random.randint(0, -1)
    # raises ValueError, which used to crash on empty or exhausted lists.
    while clickable_elements:
        index = random.randint(0, len(clickable_elements) - 1)
        element = clickable_elements[index]

        try:
            print("trying to click element", element)
            element.click()
            return True
        except Exception:
            # Best effort: any click failure just disqualifies this element.
            print(element, "was not clickable, deleting and trying again")
            clickable_elements.pop(index)

    print("All the elements weren't clickable!")
    return False


def get_clickable_ads(driver):
    """Return the displayed iframes whose ``id`` contains ``"ad"``.

    Args:
        driver: Selenium WebDriver (or element) to search for iframes.

    Returns:
        List of matching iframe elements; empty when none qualifies.
    """
    ad_frames = []
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed from recent Selenium releases.
    for frame in driver.find_elements(By.XPATH, './/iframe'):
        # A missing id may come back as None; treat it as an empty string.
        frame_id = frame.get_attribute('id') or ''
        # 'ad' also matches ids containing 'google_ad', so one test suffices.
        if 'ad' in frame_id and frame.is_displayed():
            ad_frames.append(frame)

    print(ad_frames)

    return ad_frames


def get_possibile_webelements(driver):
    """Collect the clickable, in-scope links present on the current page.

    An anchor qualifies when it has an ``href``, that href contains none of
    the substrings in ``outside_website_scopes``, and it becomes clickable
    within 0.25 seconds.

    Args:
        driver: Selenium WebDriver whose current page is scanned.

    Returns:
        List of anchor elements, in document order as returned by the driver.
    """
    clickable_elements = []

    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed from recent Selenium releases.
    for el in driver.find_elements(By.XPATH, './/a'):
        href = el.get_attribute('href')
        if href is None:
            continue

        # Filter on the href first so the (comparatively slow) clickability
        # wait is only spent on links that could actually be used.
        if any(outside_scope in href for outside_scope in outside_website_scopes):
            continue

        try:
            WebDriverWait(driver, 0.25).until(EC.element_to_be_clickable(el))
        except Exception:
            # Best effort: a timeout or any driver error means "not clickable".
            continue

        clickable_elements.append(el)

    return clickable_elements

def click(driver, by, desc, timeout):
    """Wait until the element located by (*by*, *desc*) is clickable, then click it.

    Args:
        driver: Selenium WebDriver to wait on.
        by: Locator strategy (for example ``By.CSS_SELECTOR``).
        desc: Locator value matching the chosen strategy.
        timeout: Maximum number of seconds handed to ``WebDriverWait``.
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (by, desc)
    target = waiter.until(EC.element_to_be_clickable(locator))
    target.click()

def launch_and_visit(use_tor, page_url, headless=False):
    """Start a browser, run one visit of *page_url*, then close the browser.

    Any exception raised by the visit is printed rather than propagated, so
    the browser is closed afterwards in either case.

    Args:
        use_tor: Forwarded to ``browser_manager.start_browser``.
        page_url: Address passed on to ``visit_page``.
        headless: Forwarded to ``browser_manager.start_browser``.
    """
    session = browser_manager.start_browser(use_tor=use_tor, headless=headless)
    driver, tor_process = session

    try:
        visit_page(driver, page_url)
    except Exception as err:
        print("Unknown error, exiting. Error log:", err)

    # Short pause before tearing the browser down.
    time.sleep(5)

    browser_manager.close_browser(driver, tor_process)

if __name__ == "__main__":
    # Maximum number of visits running at the same time (one thread each).
    NO_PROCESSES = 10
    # Seconds to wait between checks once the pool is full, so the
    # supervising loop does not spin at 100% CPU.
    POLL_INTERVAL = 0.5
    threads = []

    # Keep the pool topped up forever: start a new visit whenever there is a
    # free slot and drop the threads that have finished.
    while True:
        if len(threads) < NO_PROCESSES:
            print(f"[BOT] Starting thread n.{len(threads)}/{NO_PROCESSES}")
            t1 = threading.Thread(target=launch_and_visit, args=(False, 'https://giangillorossi.altervista.org', False))
            t1.start()

            threads.append(t1)
        else:
            # Nothing to launch right now; wait before polling again.
            time.sleep(POLL_INTERVAL)

        i = 0
        while i < len(threads):
            if not (threads[i].is_alive()):
                print(f"[BOT] Thread n.{i} has stopped, removing from list")
                # Do not advance i: the next thread slides into this index.
                threads.pop(i)
            else:
                i += 1