|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +""" |
| 4 | +# author: Bartlomiej "furas" Burek (https://blog.furas.pl) |
| 5 | +# date: 2024.04.29 |
| 6 | +# [While i scrape website, can I search for specific keywords in a searchbar with Selenium (python)? - Stack Overflow](https://stackoverflow.com/questions/78398520/while-i-scrape-website-can-i-search-for-specific-keywords-in-a-searchbar-with-s/78405308#78405308) |
| 7 | +""" |
| 8 | + |
| 9 | +from selenium import webdriver |
| 10 | +from selenium.webdriver.common.by import By |
| 11 | +from selenium.webdriver.common.keys import Keys |
| 12 | +#from selenium.webdriver.support.ui import WebDriverWait |
| 13 | +#from selenium.webdriver.support import expected_conditions as EC |
| 14 | +#from selenium.common.exceptions import NoSuchElementException, TimeoutException |
| 15 | + |
| 16 | +import time |
| 17 | + |
# ---

import selenium

# Report which Selenium release is installed - handy when a locator or API
# call behaves differently across versions.
installed_version = selenium.__version__
print('Selenium:', installed_version)

# ---
| 24 | + |
def scrape_page(driver, keyword):
    """Scrape one already-loaded page of search results.

    Collects the title and URL of every result entry found with the CSS
    selector ``dl.simple-list.results dt``, then visits each collected URL
    (placeholder for extracting per-article details later).

    :param driver: open Selenium WebDriver with a results page loaded.
    :param keyword: search keyword, stored alongside every scraped record.
    :return: list of dicts with keys ``'keyword'``, ``'Titolo'``, ``'URL'``,
             or ``None`` when scraping the page fails entirely.
    """
    try:
        # Dismiss the cookie banner if present. Its absence (e.g. on later
        # pages) is expected, so the failure is only logged, not fatal.
        try:
            print('Clicking cookie banner')
            cookie_banner = driver.find_element(By.XPATH, "//a[b[text()='Chiudi']]")
            cookie_banner.click()
        except Exception as e:
            print('Exception:', e)

        # Find all result entries on the current page.
        elements_dt = driver.find_elements(By.CSS_SELECTOR, "dl.simple-list.results dt")
        print('[DEBUG] len(elements_dt):', len(elements_dt))

        # Records extracted from this page.
        data = []

        # IMPORTANT: do not click the links inside this loop. Navigating away
        # unloads the current page and invalidates every element reference
        # still held in `elements_dt`. Instead, collect the hrefs now and
        # visit them after the loop with `driver.get(href)`.
        for element in elements_dt:
            try:
                article_url = element.find_element(By.XPATH, './/a').get_attribute("href")
                article_title = element.text

                data.append({
                    'keyword': keyword,
                    'Titolo': article_title,
                    'URL': article_url,
                })
                print('[DEBUG] data:', data[-1])
            except Exception as e:
                print("Errore durante il clic sull'elemento:", e)

        # Visit each collected article URL; per-article fields (date,
        # content, ...) can be extracted here and added to `item`.
        for item in data:
            print('[DEBUG] subpage:', item['URL'])
            driver.get(item['URL'])

    except Exception as e:
        print("Errore durante lo scraping della pagina:", e)
        return None

    return data
| 86 | + |
# --- main ---

driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)  # implicit wait applied to every find_element call

# ---

start_url = "https://www.salute.gov.it/portale/home.html"

all_data = []

keywords = ['ukraina', 'covid-19', 'elan musk']

# How many pages of results to scrape per keyword.
# Replace the `range(...)` loop below with `while True:` to walk all pages.
MAX_RESULT_PAGES = 3

# try/finally guarantees the browser is closed even if scraping raises -
# previously an exception inside the loop leaked the Chrome process.
try:
    for word in keywords:

        print("Main Page:", start_url)

        # Reopen the main page for every keyword to get a fresh search bar.
        driver.get(start_url)

        # Type the keyword into the search bar and submit with ENTER.
        print('Search:', word)
        searchbar = driver.find_element(By.ID, "f_cerca")
        searchbar.send_keys(word)
        searchbar.send_keys(Keys.ENTER)

        time.sleep(5)  # wait for results

        # Remember the results URL: the site may redirect to a dedicated
        # results page, and we must come back here after visiting subpages.
        search_results_url = driver.current_url

        # Scrape up to MAX_RESULT_PAGES pages of results (pagination).
        for _ in range(MAX_RESULT_PAGES):
            print("Scraping:", search_results_url)

            # scrape_page only parses the already-loaded page (no .get(url));
            # `word` is passed through so it is stored with every record.
            page_data = scrape_page(driver, word)

            if page_data:
                all_data.extend(page_data)

            # Return to the results page (scrape_page visited subpages) so
            # the link to the next page can be located again.
            driver.get(search_results_url)

            try:
                next_page_link = driver.find_element(By.XPATH, "//a[contains(text(), 'Successive')]")
                search_results_url = next_page_link.get_attribute("href")
                driver.get(search_results_url)  # open next results page by URL
            except Exception as e:
                # No 'Successive' (next) link - this was the last page.
                print('[DEBUG] Exception:', e)
                print('[DEBUG] break')
                break  # exit pagination loop
finally:
    driver.quit()

import pandas as pd
df = pd.DataFrame(all_data)
print(df)

input("Press ENTER to close")
0 commit comments