Skip to content

Commit 556dc31

Browse files
committed
__scraping__
1 parent 59f1ccd commit 556dc31

3 files changed

Lines changed: 116 additions & 0 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2022.06.28
# [python - How to convert a soup to a Dataframe - Stack Overflow](https://stackoverflow.com/questions/72779612/how-to-convert-a-soup-to-a-dataframe/72780471?noredirect=1#comment128558903_72780471)

# Download the list of report dates from the Drought Monitor API, then fetch
# the per-date county CSV table for every date and concatenate all of them
# into a single DataFrame.

import requests
import requests_cache
import pandas as pd
import io

# install the cache BEFORE any request so that both the date list and the
# per-date CSV downloads can be reused from cache when the script crashes
# and needs to run again
requests_cache.install_cache('csv_cache')

# without header `Content-Type` the server sends `HTML` instead of `JSON`
headers = {
    'Content-Type': 'application/json; charset=utf-8',
}

url = 'https://droughtmonitor.unl.edu/DmData/GISData.aspx/ReturnDMWeeks'

response = requests.get(url, headers=headers)
response.raise_for_status()  # fail fast on HTTP errors instead of a confusing .json() failure
data = response.json()

# dates arrive as `YYYYMMDD` strings; reformat to `YYYY-MM-DD` for the CSV URL
all_dates = [f"{d[:4]}-{d[4:6]}-{d[6:]}" for d in data['d']]
print(len(all_dates))

# --- before loop ---

all_dfs = []

# url without a date (e.g. `2022-06-21`) at the end
url = 'https://droughtmonitor.unl.edu/DmData/GISData.aspx/?mode=table&aoi=county&date='

# --- loop ---

for date in all_dates:
    print('date:', date)
    csv = requests.get(url + date)
    csv.raise_for_status()  # skip silent parsing of an HTML error page as CSV
    df = pd.read_csv(io.StringIO(csv.text))
    #df.to_csv(f"{date}.csv")  # uncomment to also keep each table on disk
    all_dfs.append(df)

# --- after loop ---

full_df = pd.concat(all_dfs)
print(full_df)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
2+
# date: 2022.06.30
3+
# [python - Unshapable list error when scraping information - Stack Overflow](https://stackoverflow.com/questions/72810484/unshapable-list-error-when-scraping-information/)
4+
5+
import scrapy
6+
from scrapy.crawler import CrawlerProcess
7+
8+
class TestSpider(scrapy.Spider):
    """Scrape lawyer profile pages from rejestradwokatow.pl into flat dicts.

    Each ``<div class="line_list_K">`` row holds a ``<span>`` label and a
    value ``<div>``.  The email row has no text value: the address is kept
    split across ``data-ea``/``data-eb`` attributes (anti-scraping
    obfuscation), so it is reassembled as ``ea@eb``.
    """

    name = 'test'

    start_urls = [
        #'https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9',
        'https://rejestradwokatow.pl/adwokat/abaewicz-agnieszka-51004',
        'https://rejestradwokatow.pl/adwokat/adach-micha-55082',
    ]

    custom_settings = {
        # be polite to the server: one request at a time, 1 s apart
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Yield one ``{label: value}`` dict per profile page."""
        wev = {}

        all_rows = response.xpath("//div[@class='line_list_K']/div")

        for row in all_rows:
            name = row.xpath(".//span/text()").get()
            value = row.xpath(".//div/text()").get()
            if name and value:
                wev[name.strip()] = value.strip()
            elif name and name.strip() == 'Email:':
                # <div class="address_e" data-ea="adwokat.adach" data-eb="gmail.com"></div>
                div = row.xpath('./div')
                # .get() instead of [] so a missing attribute doesn't raise KeyError
                email_a = div.attrib.get('data-ea', '')
                email_b = div.attrib.get('data-eb', '')
                wev[name.strip()] = f'{email_a}@{email_b}'

        print(wev)

        yield wev
51+
52+
# --- run without creating project and save results in `output.csv` ---
# (CrawlerProcess is already imported at the top of the file)

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(TestSpider)
c.start()
62+
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
2+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
3+
Wykonujący zawód,2014-05-20,,,1193,
4+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
5+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
6+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
7+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
8+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
9+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
10+
Wykonujący zawód,2014-05-20,,,1193,

0 commit comments

Comments
 (0)