Skip to content

Commit d7f77f5

Browse files
committed
__scraping__
1 parent 556dc31 commit d7f77f5

File tree

11 files changed

+558
-11
lines changed

11 files changed

+558
-11
lines changed

__scraping__/city-data.com/main.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2022.07.21
#
# Scrape the religion/adherents table from a city-data.com city page
# and compute each religion's percentage share of the total.

import requests
from bs4 import BeautifulSoup
import pandas as pd


def _parse_religion_table(html):
    """Return [[religion, count], ...] parsed from the table with id="religion".

    Rows without <td> cells (header rows) are skipped; thousands
    separators are stripped so counts can be converted to int.
    Raises ValueError if a count cell is not numeric, and
    AttributeError if the page has no element with id="religion".
    """
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for row in soup.find(id="religion").find_all('tr'):
        columns = row.find_all('td')
        if columns:  # header rows contain only <th>, so they have no <td>
            religion = columns[0].get_text(strip=True)
            number = columns[1].get_text(strip=True).replace(",", "")
            print(f'religion: {religion} | number: {number}')
            data.append([religion, int(number)])
    return data


if __name__ == "__main__":
    url = "https://www.city-data.com/city/Adak-Alaska.html"
    # timeout so a stalled connection does not hang the script forever
    response = requests.get(url, timeout=30)
    #print('status:', response.status_code)

    data = _parse_religion_table(response.text)

    # ---------------------------------------------

    df = pd.DataFrame(data, columns=['religion', 'number'])
    df['percentage'] = (df['number'] / df['number'].sum()) * 100

    print(df)
32+
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2022.07.30
#
# Scrape the OpenSea "meebits" collection page with Selenium:
# scroll to the bottom to force lazy loading, then collect the
# collection name and, for every asset tile, its name, price and link.

# https://docs.opensea.io/reference/getting-assets

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException

#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager

import time

import pandas as pd

# ----

import selenium
print('Selenium:', selenium.__version__)

# ---

SCROLL_PAUSE_TIME = 0.5


def _scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll down repeatedly until the page height stops growing.

    OpenSea lazy-loads asset tiles, so each scroll step waits `pause`
    seconds for the next batch to render before re-measuring the page.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # wait for the next batch of items to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _get_text(context, xpath, default='?'):
    """Return the text of the first element matching `xpath`, or `default`."""
    try:
        return context.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def _get_attr(context, xpath, attribute, default='?'):
    """Return `attribute` of the first element matching `xpath`, or `default`."""
    try:
        return context.find_element(By.XPATH, xpath).get_attribute(attribute)
    except NoSuchElementException:
        return default


if __name__ == "__main__":
    url = 'https://opensea.io/collection/meebits?search[sortAscending]=false&search[sortBy]=FAVORITE_COUNT'

    #driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))

    driver.get(url)

    time.sleep(5)

    # --- scroll down to the bottom ---

    _scroll_to_bottom(driver)

    # --- get data ---

    assets = []

    collection_name = _get_text(driver, '//h1')
    print('collection_name:', collection_name)

    # NOTE(review): the class lists below are auto-generated by OpenSea's
    # CSS-in-JS and are likely to change between deployments — verify
    # before reuse.
    collection_desc = _get_text(driver, '//div[@class="sc-1xf18x6-0 sc-1aqfqq9-0 sc-1y1ib3i-7 haVRLx dfsEJr eGsklH"]')
    print('collection_desc:', collection_desc)

    for profile in driver.find_elements(By.XPATH, '//div[@role="grid"]/div'):
        print('--- profile ---')

        artname = _get_text(profile, './/div[@class="sc-7qr9y8-0 sc-dw611d-1 iUvoJs fcpvjL"]')
        print('artname:', artname)

        price = _get_text(profile, './/div[@class="sc-7qr9y8-0 iUvoJs Price--amount"]')
        print('price:', price)

        link = _get_attr(profile, './/a', 'href')
        print('link:', link)

        assets.append( [collection_name, artname, price, link] )

    print(assets)

    df = pd.DataFrame(assets, columns=["collection_name", "artname", "price", "link"])
    print(df.to_string())
111+
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2022.07.29
#
# Scrape the OpenSea "azuki" collection page (sorted by favourite count):
# collection name, three description paragraphs, and every visible
# asset's name and price, collected into a pandas DataFrame.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from webdriver_manager.chrome import ChromeDriverManager
#from webdriver_manager.firefox import GeckoDriverManager

import time

import pandas as pd

# ----

import selenium
print('Selenium:', selenium.__version__)

# ---


def _all_texts(driver, xpath):
    """Return the .text of every element matching `xpath` (may be empty)."""
    return [item.text for item in driver.find_elements(By.XPATH, xpath)]


if __name__ == "__main__":
    website = 'https://opensea.io/collection/azuki?search[sortAscending]=false&search[sortBy]=FAVORITE_COUNT'

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    #driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))

    driver.get(website)
    driver.maximize_window()

    time.sleep(1)

    # Presumably this expands the collection description ("see more").
    # NOTE(review): the class list is auto-generated by OpenSea's
    # CSS-in-JS and will break on a redeploy — verify before reuse.
    driver.find_element(By.XPATH, '//div[@class="sc-1xf18x6-0 sc-1aqfqq9-0 haVRLx dfsEJr styledPhoenixText"]').click()

    collection_name = _all_texts(driver, '//h1')
    collection_desc1 = _all_texts(driver, '(//p[1])[1]')
    collection_desc2 = _all_texts(driver, '//p[2]')
    collection_desc3 = _all_texts(driver, '//p[3]')
    name = _all_texts(driver, '//div[@class="sc-7qr9y8-0 sc-dw611d-1 iUvoJs fcpvjL"]')
    price = _all_texts(driver, '//div[@class="sc-7qr9y8-0 iUvoJs Price--amount"]')

    # Column labels fixed: the original had inconsistent/typo keys
    # ('Collection_Des2', 'Colltionec_Des3').
    Collection_Azuki = {
        'Collection_Name': collection_name,
        'Collection_Description1': collection_desc1,
        'Collection_Description2': collection_desc2,
        'Collection_Description3': collection_desc3,
        'Art_Name_fav': name,
        'Art_Price_fav': price,
    }

    # from_dict(orient='index') + transpose tolerates lists of different
    # lengths: short columns are padded with NaN instead of raising.
    df = pd.DataFrame.from_dict(Collection_Azuki, orient='index')
    df = df.transpose()

    print(df.to_string())

0 commit comments

Comments
 (0)