PageViewerBot/page_viewer.py

153 lines
5.8 KiB
Python

# Use undetected-chromedriver together with stem to use the tor network
import time
import random
import os
from selenium.webdriver.support.ui import Select
import selenium.webdriver as webdriver
import selenium.common.exceptions as sexceptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import requests
import browser_manager
import threading
# Substrings that disqualify a link: any <a> whose href contains one of these
# is skipped by get_possibile_webelements ("#" filters out in-page anchors).
outside_website_scopes = [
    "blog.altervista.org",
    "en.altervista.org",
    "it.altervista.org",
    "pinterest",
    "facebook.com",
    "instagram.com",
    "iubenda.com",
    "twitter.com",
    "#",
]

# CSS selector of the cookie banner's "accept" button, clicked by visit_page.
ACCEPT_COOKIES_BTN_SEL = ".iubenda-cs-accept-btn"

# Maximum number of click iterations performed during a single visit.
MAX_REDIRECTION = 15

# When True, visit_page randomly alternates between regular links and ad iframes;
# when False it only clicks regular links.
LOOK_FOR_ADS = True
def visit_page(driver, url):
    """Open `url`, accept the cookie banner, then click around the page.

    After loading the page and clicking the cookie "accept" button, the
    function performs up to MAX_REDIRECTION iterations. On each iteration it
    either clicks a random in-scope link or, when LOOK_FOR_ADS is set and the
    random draw is >= 0.65, tries to click one of the iframes returned by
    get_clickable_ads. A successful iframe click ends the loop early. The
    driver is quit before returning.

    Args:
        driver: Selenium WebDriver instance used for the visit.
        url: Address of the page to open.
    """
    print(f"Visiting: {url}")
    driver.get(url)

    # Give the page (and late-loading content) time to render before
    # interacting with the cookie banner.
    time.sleep(3)
    click(driver, By.CSS_SELECTOR, ACCEPT_COOKIES_BTN_SEL, 10)
    time.sleep(4)

    for _ in range(MAX_REDIRECTION):
        time.sleep(6)

        # Same truth table as `not A or (A and B)`; random.random() is still
        # only evaluated when LOOK_FOR_ADS is true.
        use_regular_link = not LOOK_FOR_ADS or random.random() < 0.65
        if use_regular_link:
            try_to_click_elements(driver, get_possibile_webelements(driver))
            continue

        # Iframe branch: if no candidate iframe could be clicked, simply move
        # on to the next iteration.
        if try_to_click_elements(driver, get_clickable_ads(driver)):
            time.sleep(4)
            break

    driver.quit()
def try_to_click_elements(driver, clickable_elements):
    """Click one randomly chosen element, retrying with others on failure.

    A random candidate is picked and clicked; if the click raises, that
    candidate is discarded and another one is tried until a click succeeds or
    no candidates are left.

    Args:
        driver: Unused; kept so existing callers keep working.
        clickable_elements: Iterable of objects exposing a ``click()`` method.
            The caller's collection is not modified.

    Returns:
        True as soon as one element was clicked, False if the collection was
        empty or every element failed to click.
    """
    # Work on a private copy so discarding failed candidates does not mutate
    # the caller's list.
    candidates = list(clickable_elements)

    # Checking for emptiness *before* drawing an index avoids the ValueError
    # that random.randint(0, -1) raises on an empty list.
    while candidates:
        index = random.randrange(len(candidates))
        element = candidates[index]
        try:
            print("trying to click element", element)
            element.click()
        except Exception:
            # Best effort: any click failure just removes this candidate.
            print(element, "was not clickable, deleting and trying again")
            candidates.pop(index)
        else:
            return True

    print("All the elements weren't clickable!")
    return False
def get_clickable_ads(driver):
    """Return the displayed iframes whose ``id`` attribute contains "ad".

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        List of iframe WebElements (possibly empty). The list is also printed.
    """
    matching_frames = []
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed from Selenium in 4.3.
    for frame in driver.find_elements(By.XPATH, './/iframe'):
        # Guard against a missing id so the substring test cannot hit None.
        frame_id = frame.get_attribute('id') or ''
        # 'google_ad' already contains 'ad', so a single test covers both.
        if 'ad' in frame_id and frame.is_displayed():
            matching_frames.append(frame)
    print(matching_frames)
    return matching_frames
def get_possibile_webelements(driver):
    """Return the page's in-scope links that are currently clickable.

    A link qualifies when it has an ``href``, the href contains none of the
    substrings in ``outside_website_scopes``, and Selenium reports it as
    clickable within 0.25 seconds.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        List of <a> WebElements, in document order (possibly empty).
    """
    clickable_elements = []
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed from Selenium in 4.3.
    for el in driver.find_elements(By.XPATH, './/a'):
        # Read the href once; every get_attribute call is a driver round trip.
        href = el.get_attribute('href')
        if href is None:
            continue

        # Apply the cheap substring filter first so out-of-scope links never
        # pay for the clickability wait below.
        if any(outside_scope in href for outside_scope in outside_website_scopes):
            continue

        try:
            WebDriverWait(driver, 0.25).until(EC.element_to_be_clickable(el))
        except Exception:
            # Best effort: a timeout or any other failure while probing means
            # the element is treated as not clickable.
            continue

        clickable_elements.append(el)
    return clickable_elements
def click(driver, by, desc, timeout):
    """Wait until the element located by (by, desc) is clickable, then click it.

    Args:
        driver: Selenium WebDriver to search in.
        by: Locator strategy (e.g. By.CSS_SELECTOR).
        desc: Locator value matching the chosen strategy.
        timeout: Maximum time passed to WebDriverWait.

    Any exception raised by the wait or by the click propagates to the caller.
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (by, desc)
    target = waiter.until(EC.element_to_be_clickable(locator))
    target.click()
def launch_and_visit(use_tor, page_url, headless=False):
    """Start a browser, run one visit of `page_url`, then tear everything down.

    Args:
        use_tor: Forwarded to browser_manager.start_browser.
        page_url: Address handed to visit_page.
        headless: Forwarded to browser_manager.start_browser.

    Errors raised by visit_page are logged and swallowed so that a failed
    visit never propagates out of the worker thread.
    """
    driver, tor_process = browser_manager.start_browser(use_tor=use_tor, headless=headless)
    try:
        visit_page(driver, page_url)
    except Exception as e:
        print("Unknown error, exiting. Error log:", e)
    finally:
        # Teardown lives in `finally` so the browser (and the Tor process, if
        # any) is released on every exit path, not only on Exception.
        time.sleep(5)
        browser_manager.close_browser(driver, tor_process)
def main():
    """Keep a fixed-size pool of visiting threads running until interrupted.

    One new thread is started per loop iteration while the pool is below its
    limit; finished threads are dropped from the pool so they get replaced.
    """
    NO_PROCESSES = 10
    # Pause between liveness checks while the pool is full; without it the
    # loop spins at full CPU doing nothing but is_alive() calls.
    poll_interval_seconds = 0.5
    target_url = 'https://giangillorossi.altervista.org'

    threads = []
    while True:
        if len(threads) < NO_PROCESSES:
            print(f"[BOT] Starting thread n.{len(threads)}/{NO_PROCESSES}")
            worker = threading.Thread(target=launch_and_visit, args=(False, target_url, False))
            worker.start()
            threads.append(worker)
        else:
            time.sleep(poll_interval_seconds)

        # Rebuild the pool with only the threads that are still running.
        still_running = []
        for index, worker in enumerate(threads):
            if worker.is_alive():
                still_running.append(worker)
            else:
                print(f"[BOT] Thread n.{index} has stopped, removing from list")
        threads = still_running


if __name__ == "__main__":
    main()