Skip to content

Commit f6f875a

Browse files
committed
scraping
1 parent ec04026 commit f6f875a

File tree

1 file changed

+149
-0
lines changed
  • __scraping__/salute.gov.it - selenium

1 file changed

+149
-0
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
#!/usr/bin/env python3

"""
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2024.04.29
# [While i scrape website, can I search for specific keywords in a searchbar with Selenium (python)? - Stack Overflow](https://stackoverflow.com/questions/78398520/while-i-scrape-website-can-i-search-for-specific-keywords-in-a-searchbar-with-s/78405308#78405308)
"""

import time

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException

# --- report the Selenium version in use, for debugging ---

import selenium

print('Selenium:', selenium.__version__)

# ---
def scrape_page(driver, keyword):
    """Scrape search-result entries from the currently loaded results page.

    Parameters:
        driver: an already-open Selenium WebDriver positioned on a
            salute.gov.it search-results page.
        keyword: the search term; stored with every scraped record so the
            caller can tell which query produced it.

    Returns:
        A list of dicts with keys ``'keyword'``, ``'Titolo'``, ``'URL'``,
        or ``None`` when scraping the page fails entirely.
    """
    try:
        # Dismiss the cookie banner if it is present; failures are only
        # logged because the banner does not appear on every page load.
        try:
            print('Clicking cookie banner')
            cookie_banner = driver.find_element(By.XPATH, "//a[b[text()='Chiudi']]")
            cookie_banner.click()
        except Exception as e:
            print('Exception:', e)

        # Each result entry is a <dt> holding the title link.
        elements_dt = driver.find_elements(By.CSS_SELECTOR, "dl.simple-list.results dt")
        print('[DEBUG] len(elements_dt):', len(elements_dt))

        # Collected records for this page.
        data = []

        for element in elements_dt:
            try:
                article_url = element.find_element(By.XPATH, './/a').get_attribute("href")
                article_title = element.text

                # NOTE: do NOT click the links here -- navigating away
                # invalidates the remaining `elements_dt` references
                # (stale elements). Collect the hrefs first and visit
                # them after this loop with `driver.get(href)`.
                data.append({
                    'keyword': keyword,
                    'Titolo': article_title,
                    'URL': article_url,
                })

                print('[DEBUG] data:', data[-1])
            except Exception as e:
                print("Errore durante il clic sull'elemento:", e)

        # Visit each collected subpage; extraction of date/content
        # fields can be added here.
        for item in data:
            print('[DEBUG] subpage:', item['URL'])
            driver.get(item['URL'])
            #item['Data'] = ...
            #item['Contenuto'] = ...

    except Exception as e:
        print("Errore durante lo scraping della pagina:", e)
        return None

    return data
# --- main ---

# Launch the browser; implicitly_wait makes every find_element poll
# up to 10s before giving up.
driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)

# ---

start_url = "https://www.salute.gov.it/portale/home.html"

all_data = []

keywords = ['ukraina', 'covid-19', 'elan musk']

# try/finally guarantees driver.quit() runs even when scraping raises,
# so no browser/chromedriver process is leaked.
try:
    for word in keywords:

        print("Main Page:", start_url)

        # open main page
        driver.get(start_url)

        # find the searchbar, type the keyword and submit with ENTER
        print('Search:', word)
        searchbar = driver.find_element(By.ID, "f_cerca")
        searchbar.send_keys(word)
        searchbar.send_keys(Keys.ENTER)

        time.sleep(5)  # wait for results

        # remember the results URL (the search may redirect elsewhere)
        search_results_url = driver.current_url

        # scrape the results with pagination:
        #while True:  # to get all pages
        for _ in range(3):  # get only 3 pages
            print("Scraping:", search_results_url)

            # scrape_page only scrapes the page already loaded; `word`
            # is passed just so it is stored with every record
            page_data = scrape_page(driver, word)

            if page_data:
                all_data.extend(page_data)

            # go back to the results page (scrape_page visited subpages)
            # to find the link to the next page
            driver.get(search_results_url)

            try:
                next_page_link = driver.find_element(By.XPATH, "//a[contains(text(), 'Successive')]")
                search_results_url = next_page_link.get_attribute("href")
                driver.get(search_results_url)  # open next results page by URL
                #next_page_link.click()  # or click the link instead
            except Exception as e:
                # no "Successive" link: last page reached
                print('[DEBUG] Exception:', e)
                print('[DEBUG] break')
                break  # exit pagination loop
finally:
    driver.quit()

df = pd.DataFrame(all_data)
print(df)

input("Press ENTER to close")

0 commit comments

Comments
 (0)