Skip to content

Commit 556dc31

Browse files
committed
__scraping__
1 parent 59f1ccd commit 556dc31

3 files changed

Lines changed: 116 additions & 0 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2022.06.28
# [python - How to convert a soup to a Dataframe - Stack Overflow](https://stackoverflow.com/questions/72779612/how-to-convert-a-soup-to-a-dataframe/72780471?noredirect=1#comment128558903_72780471)

# Download the list of report dates from the Drought Monitor API, then fetch
# the per-date county CSV table for every date and concatenate all of them
# into a single DataFrame.

import requests
import requests_cache
import pandas as pd
import io

# install the cache BEFORE any request so that both the date list and the
# per-date CSV downloads can be reused from cache when the script crashes
# and needs to run again
requests_cache.install_cache('csv_cache')

# without header `Content-Type` the server sends `HTML` instead of `JSON`
headers = {
    'Content-Type': 'application/json; charset=utf-8',
}

url = 'https://droughtmonitor.unl.edu/DmData/GISData.aspx/ReturnDMWeeks'

response = requests.get(url, headers=headers)
response.raise_for_status()  # fail fast on HTTP errors instead of a confusing .json() failure
data = response.json()

# dates arrive as `YYYYMMDD` strings; reformat to `YYYY-MM-DD` for the CSV URL
all_dates = [f"{d[:4]}-{d[4:6]}-{d[6:]}" for d in data['d']]
print(len(all_dates))

# --- before loop ---

all_dfs = []

# url without a date (e.g. `2022-06-21`) at the end
url = 'https://droughtmonitor.unl.edu/DmData/GISData.aspx/?mode=table&aoi=county&date='

# --- loop ---

for date in all_dates:
    print('date:', date)
    csv = requests.get(url + date)
    csv.raise_for_status()  # skip silent parsing of an HTML error page as CSV
    df = pd.read_csv(io.StringIO(csv.text))
    #df.to_csv(f"{date}.csv")  # uncomment to also keep each table on disk
    all_dfs.append(df)

# --- after loop ---

full_df = pd.concat(all_dfs)
print(full_df)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
2+
# date: 2022.06.30
3+
# [python - Unshapable list error when scraping information - Stack Overflow](https://stackoverflow.com/questions/72810484/unshapable-list-error-when-scraping-information/)
4+
5+
import scrapy
6+
from scrapy.crawler import CrawlerProcess
7+
8+
class TestSpider(scrapy.Spider):
    """Scrape lawyer profile pages from rejestradwokatow.pl into flat dicts.

    Each ``<div class="line_list_K">`` row holds a ``<span>`` label and a
    value ``<div>``.  The email row has no text value: the address is kept
    split across ``data-ea``/``data-eb`` attributes (anti-scraping
    obfuscation), so it is reassembled as ``ea@eb``.
    """

    name = 'test'

    start_urls = [
        #'https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9',
        'https://rejestradwokatow.pl/adwokat/abaewicz-agnieszka-51004',
        'https://rejestradwokatow.pl/adwokat/adach-micha-55082',
    ]

    custom_settings = {
        # be polite to the server: one request at a time, 1 s apart
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Yield one ``{label: value}`` dict per profile page."""
        wev = {}

        all_rows = response.xpath("//div[@class='line_list_K']/div")

        for row in all_rows:
            name = row.xpath(".//span/text()").get()
            value = row.xpath(".//div/text()").get()
            if name and value:
                wev[name.strip()] = value.strip()
            elif name and name.strip() == 'Email:':
                # <div class="address_e" data-ea="adwokat.adach" data-eb="gmail.com"></div>
                div = row.xpath('./div')
                # .get() instead of [] so a missing attribute doesn't raise KeyError
                email_a = div.attrib.get('data-ea', '')
                email_b = div.attrib.get('data-eb', '')
                wev[name.strip()] = f'{email_a}@{email_b}'

        print(wev)

        yield wev
51+
52+
# --- run without creating project and save results in `output.csv` ---
# (CrawlerProcess is already imported at the top of the file)

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(TestSpider)
c.start()
62+
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
2+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
3+
Wykonujący zawód,2014-05-20,,,1193,
4+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
5+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
6+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
7+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
8+
Status:,Data wpisu w aktualnej izbie na listę adwokatów:,Data skreślenia z listy:,Ostatnie miejsce wpisu:,Stary nr wpisu:,Zastępca:
9+
Były adwokat,2013-09-01,2019-07-23,Katowice,1077,Pieprzyk Mirosław
10+
Wykonujący zawód,2014-05-20,,,1193,

0 commit comments

Comments
 (0)