# PageViewerBot/page_viewer.py

# Use undetected-chromedriver together with stem to use the tor network
import time
import random
import os 

from selenium.webdriver.support.ui import Select
import selenium.webdriver as webdriver
import selenium.common.exceptions as sexceptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import requests
import browser_manager

import threading

# Substrings checked against each link's href: get_possibile_webelements()
# skips every link whose href contains at least one of them.
outside_website_scopes = [
    "blog.altervista.org",
    "en.altervista.org",
    "it.altervista.org",
    "pinterest",
    "facebook.com",
    "instagram.com",
    "iubenda.com",
    "twitter.com",
    "#",
]

# CSS selector of the cookie-consent button that visit_page() clicks.
ACCEPT_COOKIES_BTN_SEL = ".iubenda-cs-accept-btn"
# Maximum number of click rounds visit_page() performs on one visit.
MAX_REDIRECTION = 15
# When True, visit_page() sometimes clicks an ad iframe instead of a link.
LOOK_FOR_ADS = True

def visit_page(driver, url):
    """Open *url* and perform up to MAX_REDIRECTION random click rounds.

    After loading the page the cookie-consent button is clicked, then each
    round either clicks a random in-scope link or, when LOOK_FOR_ADS is
    enabled and the random draw falls in the remaining ~35%, tries to click
    one of the candidates returned by get_clickable_ads(). The loop stops
    early after the first successful click of the latter kind.

    Args:
        driver: The browser driver used to load and interact with the page.
        url: Address of the page to open.

    NOTE(review): automated ad clicks normally violate ad-network policies;
    confirm this is only ever run against properties you are authorised to
    test.
    """
    print(f"Visiting: {url}")
    driver.get(url)

    # Pause so the whole page (ads included) can load and to mimic a user
    # reading. The original author notes the browser profile is preset to
    # accept cookies on this website; click() still waits up to 10 s for the
    # consent button.
    time.sleep(3)
    click(driver, By.CSS_SELECTOR, ACCEPT_COOKIES_BTN_SEL, 10)
    time.sleep(4)

    for _ in range(MAX_REDIRECTION):
        time.sleep(6)

        # The random draw only happens when LOOK_FOR_ADS is enabled.
        follow_link = not LOOK_FOR_ADS or random.random() < 0.65
        if follow_link:
            # Click a random in-scope link on the current page.
            try_to_click_elements(driver, get_possibile_webelements(driver))
            continue

        # Otherwise try the iframes selected by get_clickable_ads(); such a
        # frame will probably lead to a link of the form
        # https://adclick.g.doubleclick.net/aclk?...&adurl=<destination>.
        # If no candidate is available or none can be clicked, this round is
        # simply skipped.
        if try_to_click_elements(driver, get_clickable_ads(driver)):
            time.sleep(4)
            break

def try_to_click_elements(driver, clickable_elements):
    """Click one randomly chosen element, falling back to the others on failure.

    Elements are tried in random order; an element whose ``click()`` raises
    is discarded and another one is picked until a click succeeds or no
    candidates remain.

    Args:
        driver: Unused here; kept so existing callers keep working.
        clickable_elements: Iterable of objects exposing a ``click()`` method.
            It is not modified.

    Returns:
        True as soon as one click succeeds, False if the input is empty or
        every element failed to click.
    """
    # Work on a private copy so the caller's list is left untouched.
    candidates = list(clickable_elements)

    while candidates:
        index = random.randrange(len(candidates))
        element = candidates[index]

        print("trying to click element", element)
        try:
            element.click()
        except Exception:
            # Best effort: any click failure just disqualifies this element.
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(element, "was not clickable, deleting and trying again")
            candidates.pop(index)
        else:
            return True

    print("All the elements weren't clickable!")
    return False


def get_clickable_ads(driver):
    """Return the displayed iframes whose ``id`` attribute contains "ad".

    Args:
        driver: Browser driver used to look up every ``iframe`` on the page.

    Returns:
        List of matching iframe elements, in document order as returned by
        the driver; empty when nothing matches.

    NOTE(review): ``find_elements_by_xpath`` is deprecated in Selenium 4 —
    confirm the pinned Selenium version still provides it.
    """
    ad_frames = []
    for frame in driver.find_elements_by_xpath('.//iframe'):
        # The 'google_ad' test is implied by the 'ad' test; both are kept so
        # the checks mirror the original expression exactly.
        id_matches = 'ad' in frame.get_attribute('id') or 'google_ad' in frame.get_attribute('id')
        # Visibility is only queried for frames whose id already matched.
        if id_matches and frame.is_displayed():
            ad_frames.append(frame)

    print(f"Ads that can be clicked {ad_frames}")

    return ad_frames


def get_possibile_webelements(driver):
    """Collect the links on the current page that are in scope and clickable.

    A link qualifies when it has an ``href`` attribute, that href contains
    none of the substrings in ``outside_website_scopes``, and the element
    becomes clickable within 0.25 seconds.

    Args:
        driver: Browser driver used to look up every ``a`` element.

    Returns:
        List of qualifying link elements; empty when none qualify.

    NOTE(review): ``find_elements_by_xpath`` is deprecated in Selenium 4 —
    confirm the pinned Selenium version still provides it.
    """
    clickable_elements = []

    for el in driver.find_elements_by_xpath('.//a'):
        # Read href once per element instead of once per scope entry.
        href = el.get_attribute('href')
        if href is None:
            continue

        # Discard out-of-scope links before paying for the clickability wait.
        if any(scope in href for scope in outside_website_scopes):
            continue

        try:
            WebDriverWait(driver, 0.25).until(EC.element_to_be_clickable(el))
        except Exception:
            # Best effort: a timeout or any other wait failure just means
            # "not clickable". Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            continue

        clickable_elements.append(el)

    return clickable_elements

def click(driver, by, desc, timeout):
    """Wait until the element located by (*by*, *desc*) is clickable, then click it.

    Args:
        driver: Browser driver to wait on.
        by: Locator strategy (for example ``By.CSS_SELECTOR``).
        desc: Locator value matching the chosen strategy.
        timeout: Maximum time, in seconds, handed to ``WebDriverWait``.
    """
    locator = (by, desc)
    waiter = WebDriverWait(driver, timeout)
    target = waiter.until(EC.element_to_be_clickable(locator))
    target.click()

def launch_and_visit(use_tor, page_url, headless=False):
    """Start a browser, run visit_page() on *page_url*, then close the browser.

    Errors raised while visiting are printed, not propagated. The browser
    (and the Tor process handle returned with it) is passed to
    ``browser_manager.close_browser`` no matter how the visit ends.

    Args:
        use_tor: Forwarded to ``browser_manager.start_browser``.
        page_url: Address handed to visit_page().
        headless: Forwarded to ``browser_manager.start_browser``.
    """
    driver, tor_process = browser_manager.start_browser(use_tor=use_tor, headless=headless)

    try:
        try:
            visit_page(driver, page_url)
        except Exception as e:
            print("Unknown error, exiting. Error log:", e)

        # Short pause before the browser is torn down.
        time.sleep(5)
    finally:
        # Always release the browser/Tor resources, even when something
        # other than an ordinary Exception unwinds through here.
        browser_manager.close_browser(driver, tor_process)

if __name__ == "__main__":
    # Supervisor loop: keeps NO_PROCESSES worker threads running, replacing
    # each one as soon as it finishes. Runs until the process is interrupted.
    NO_PROCESSES = 10  # maximum number of worker threads alive at once
    POLL_INTERVAL = 0.5  # seconds to sleep between liveness checks
    threads = []

    while True:
        # Top the pool back up to NO_PROCESSES workers.
        while len(threads) < NO_PROCESSES:
            print(f"[BOT] Starting thread n.{len(threads)+1}/{NO_PROCESSES}")
            worker = threading.Thread(target=launch_and_visit, args=(True, 'https://giangillorossi.altervista.org', True))
            worker.start()
            threads.append(worker)

        # Drop finished workers. The index only advances past threads that
        # are still alive, because pop() shifts later entries down.
        i = 0
        while i < len(threads):
            if threads[i].is_alive():
                i += 1
            else:
                print(f"[BOT] Thread n.{i+1} has stopped, removing from list")
                threads.pop(i)

        # Sleep between polls instead of spinning a CPU core at 100%.
        time.sleep(POLL_INTERVAL)
Initial commit 2022-01-30 17:09:24 +01:00			`# Use undetected-chromedriver together with stem to use the tor network`
			`import time`
			`import random`
Revert "go back to single-threaded (because of chromedriver)" This reverts commit 56530c96c729e90d166a2d6dcf074dd890c3d5ba. 2022-02-05 09:43:26 +01:00			`import os`
Initial commit 2022-01-30 17:09:24 +01:00
			`from selenium.webdriver.support.ui import Select`
			`import selenium.webdriver as webdriver`
			`import selenium.common.exceptions as sexceptions`
			`from selenium.webdriver.support.ui import WebDriverWait`
			`from selenium.webdriver.support import expected_conditions as EC`
			`from selenium.webdriver.common.by import By`

			`import requests`
			`import browser_manager`

			`import threading`

			`outside_website_scopes = ["blog.altervista.org", "en.altervista.org", "it.altervista.org", "pinterest",`
			`"facebook.com", "instagram.com", "iubenda.com", "twitter.com", "#"]`

			`ACCEPT_COOKIES_BTN_SEL = ".iubenda-cs-accept-btn"`
			`MAX_REDIRECTION = 15`
			`LOOK_FOR_ADS = True`

			`def visit_page(driver, url):`

			`# go to the page`
			`print(f"Visiting: {url}")`
			`driver.get(url)`

			`# time.sleep a little bit to wait for all of the page to be loaded (especially ads) and to simulate a user reading. Keep in mind the profile is preset to accept cookies on this website`
			`time.sleep(3)`
			`click(driver, By.CSS_SELECTOR, ACCEPT_COOKIES_BTN_SEL, 10)`
			`time.sleep(4)`

			`for i in range(0, MAX_REDIRECTION):`
			`time.sleep(6)`

			`# ads = get_possibile_ads(driver)`

			`if not LOOK_FOR_ADS or (LOOK_FOR_ADS and random.random() < 0.65): # or len(ads) == 0:`
			`# click random element`
			`try_to_click_elements(driver, get_possibile_webelements(driver))`
			`else:`
			# Look for an iframe, check if there's an "a" element with an href attribute that contains the "ad" word, it will probably we a link of this type: https://adclick.g.doubleclick.net/aclk?sa=l&ai=Cd8zYPwn0Yaa-CriP7_UPn9eL0AH0mNG7Z-jMmOvlDtrZHhABIJn9jhdg_YqihNQSoAGf08b9A8gBCeACAKgDAcgDCqoEpQJP0Bhs9u34Kcc-HNkWqAWhVh3eYt8wWhsYc-x06cQ2rdO9JUopiRUFeFWuf1P0_jg4FVCG7kbS5eit_70Q9c4AcZaJG7QByYKfaWLsVDTGDUJ4t7v3hl2wGTDD-ep5teyxxtCykaypJMuO3jdhE_YA_7pQy63jkTglMnWGCN4sd-6qNNr_vTk1Uv5nqbV6cQSC8PbEd0RaM78kNNz4Z3i8yFWnyav6kfHnSn6kjR_8bCziGQu7NHf8bQoDb0mhIh6BBV-SU2MMM00Tm91QPNqVePZRY3IKNd8EuIhzGGdYA9hSTtnTQGA6Hfe4BUy8ZEAxMDxVr4s2mm-JRZnHVMmvP5_KksicKf4x9BgwLa1p0N_6-s4dtEgTrNv1Bs-bi3Eeqh1Jh8AE3NCrw9gD4AQBoAYugAfJrLkCqAeOzhuoB5PYG6gH7paxAqgH_p6xAqgH1ckbqAemvhuoB_PRG6gHltgbqAeqm7ECqAffn7EC2AcA0ggJCIjhgBAQARgdsQlalI-L2WNNd4AKA5gLAcgLAYAMAbgMAdgTDYgUAtAVAZgWAYAXAQ&ae=1&num=1&sig=AOD64_24Zd3eetdN63hdYcO7O7s5IFwTRw&client=ca-pub-4188807138211503&nb=9&adurl=https://www.avaya.com/it/prodotti/ccaas/public/lots_of_stuff_after
			`# if no iframes are available or none of them contains an ad, this will just skip`
			`if try_to_click_elements(driver, get_clickable_ads(driver)):`
			`time.sleep(4)`
			`break`
page viewer: fix 'empty range for randrange() (0, 0, 0)' when no clickable elements are found 2022-02-17 15:32:20 +01:00
Initial commit 2022-01-30 17:09:24 +01:00			`def try_to_click_elements(driver, clickable_elements):`
			`while True:`
			`if len(clickable_elements) == 0:`
			`print("All the elements weren't clickable!")`
			`return False`

page viewer: fix 'empty range for randrange() (0, 0, 0)' when no clickable elements are found 2022-02-17 15:32:20 +01:00			`index = random.randint(0, len(clickable_elements) - 1)`

Initial commit 2022-01-30 17:09:24 +01:00			`try:`
			`print("trying to click element", clickable_elements[index])`
			`clickable_elements[index].click()`
			`return True`
			`except:`
			`print(clickable_elements[index], "was not clickable, deleting and trying again")`
			`clickable_elements.pop(index)`


			`def get_clickable_ads(driver):`
			`searchable_ads = [frame for frame in driver.find_elements_by_xpath('.//iframe') ]`
			`id_ok_ads = [frame for frame in searchable_ads if ('ad' in frame.get_attribute('id') or 'google_ad' in frame.get_attribute('id')) and frame.is_displayed() ]`
page viewer: fix 'empty range for randrange() (0, 0, 0)' when no clickable elements are found 2022-02-17 15:32:20 +01:00			`print(f"Ads that can be clicked {id_ok_ads}")`
Initial commit 2022-01-30 17:09:24 +01:00
			`return id_ok_ads`


			`def get_possibile_webelements(driver):`
			`# Fetch all the element with links present on the page and only save the clickable ones`
			`clickable_elements = []`
			`for el in [x for x in driver.find_elements_by_xpath('.//a') if x.get_attribute('href') != None]:`
			`clickable = False`
			`try:`
			`WebDriverWait(driver, 0.25).until(EC.element_to_be_clickable(el))`
			`clickable = True`
			`except:`
			`clickable = False`

			`#print(f'{el}, which brings to, {el.get_attribute("href")} is {clickable} ', end="")`

			`if clickable:`
			`broke = False`

			`for outside_scope in outside_website_scopes:`
			`if outside_scope in el.get_attribute('href'):`
			`broke = True`
			`break`

			`if not broke:`
			`clickable_elements.append(el)`

			`return clickable_elements`

			`def click(driver, by, desc, timeout):`
			`WebDriverWait(driver, timeout).until(EC.element_to_be_clickable((by, desc))).click()`

add option for headless browser 2022-01-31 22:26:53 +01:00			`def launch_and_visit(use_tor, page_url, headless=False):`
			`driver, tor_process = browser_manager.start_browser(use_tor=use_tor, headless=headless)`
Revert "go back to single-threaded (because of chromedriver)" This reverts commit 56530c96c729e90d166a2d6dcf074dd890c3d5ba. 2022-02-05 09:43:26 +01:00
			`try:`
			`visit_page(driver, page_url)`
			`except Exception as e:`
			`print("Unknown error, exiting. Error log:", e)`
Initial commit 2022-01-30 17:09:24 +01:00
			`time.sleep(5)`

go back to single-threaded (because of chromedriver) + delete tor data directory when closing 2022-01-31 22:26:20 +01:00			`browser_manager.close_browser(driver, tor_process)`
Initial commit 2022-01-30 17:09:24 +01:00
			`if __name__ == "__main__":`
replace ended threads in real time instead of waiting for them all to finish 2022-02-06 09:18:28 +01:00			`NO_PROCESSES = 10`
			`threads = []`
Revert "go back to single-threaded (because of chromedriver)" This reverts commit 56530c96c729e90d166a2d6dcf074dd890c3d5ba. 2022-02-05 09:43:26 +01:00
replace ended threads in real time instead of waiting for them all to finish 2022-02-06 09:18:28 +01:00			`while True:`
			`if len(threads) < NO_PROCESSES:`
page viewer: fix 'empty range for randrange() (0, 0, 0)' when no clickable elements are found 2022-02-17 15:32:20 +01:00			`print(f"[BOT] Starting thread n.{len(threads)+1}/{NO_PROCESSES}")`
			`t1 = threading.Thread(target=launch_and_visit, args=(True, 'https://giangillorossi.altervista.org', True))`
Revert "go back to single-threaded (because of chromedriver)" This reverts commit 56530c96c729e90d166a2d6dcf074dd890c3d5ba. 2022-02-05 09:43:26 +01:00			`t1.start()`

			`threads.append(t1)`

replace ended threads in real time instead of waiting for them all to finish 2022-02-06 09:18:28 +01:00			`i = 0`
			`while i < len(threads):`
			`# print(i)`
			`if not (threads[i].is_alive()):`
page viewer: fix 'empty range for randrange() (0, 0, 0)' when no clickable elements are found 2022-02-17 15:32:20 +01:00			`print(f"[BOT] Thread n.{i+1} has stopped, removing from list")`
replace ended threads in real time instead of waiting for them all to finish 2022-02-06 09:18:28 +01:00			`threads.pop(i)`

			`else:`
			`i += 1`





			`# for i in range( 0, 5):`
			`# t1 = threading.Thread(target=launch_and_visit, args=(True, 'https://giangillorossi.altervista.org', True))`
			`# t1.start()`

			`# threads.append(t1)`

			`# for t in threads:`
			`# t.join()`