Skip to content

Commit 722c170

Browse files
committed
__scraping__
1 parent 94a89eb commit 722c170

3 files changed

Lines changed: 70 additions & 6 deletions

File tree

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
2+
# date: 2022.06.11
3+
# [python - Accessing Data from Javascript API call - Stack Overflow](https://stackoverflow.com/questions/72570830/accessing-data-from-javascript-api-call/72575730)
4+
5+
# `requests` converts `+` to `%2B` but server needs `+`
6+
7+
import requests
8+
import urllib.parse
9+
10+
data = {
11+
"xmlquery": "<post>\n<param+name=\"Exchange\"+value=\"NMF\"/>\n<param+name=\"SubSystem\"+value=\"Prices\"/>\n<param+name=\"Action\"+value=\"GetMarket\"/>\n<param+name=\"inst__a\"+value=\"0,1,2,5,21,23\"/>\n<param+name=\"ext_xslt\"+value=\"/nordicV3/paging_inst_table.xsl\"/>\n<param+name=\"Market\"+value=\"GITS:CO:CPHCB,GITS:CO:CPHBB,M:GITS:CO:CPHTA,GITS:CO:CPHAU,GITS:CO:CPHSA\"/>\n<param+name=\"RecursiveMarketElement\"+value=\"True\"/>\n<param+name=\"XPath\"+value=\"//inst[@itid='2'+or+@itid='3']\"/>\n<param+name=\"ext_xslt_lang\"+value=\"en\"/>\n<param+name=\"ext_xslt_tableId\"+value=\"bondsSearchDKTable\"/>\n<param+name=\"ext_xslt_options\"+value=\",noflag,\"/>\n<param+name=\"ext_xslt_hiddenattrs\"+value=\",fnm,isrid,dlt,tp,bb,ib,cpt,rps,os,lt,st,itid,lists,it,mkt,\"/>\n<param+name=\"ext_xslt_notlabel\"+value=\",fnm\"/>\n<param+name=\"ext_xslt_jspcbk\"+value=\"doPaging\"/>\n<param+name=\"ext_xslt_jsscbk\"+value=\"doSortPager\"/>\n<param+name=\"ext_xslt_sorder\"+value=\"descending\"/>\n<param+name=\"ext_xslt_sattr\"+value=\"chp\"/>\n<param+name=\"ext_xslt_start\"+value=\"0\"/>\n<param+name=\"ext_xslt_size\"+value=\"100\"/>\n<param+name=\"inst__an\"+value=\"id,nm,fnm,isin,cpnrt,bp,ap,lsp,chp,atap,ed,dlt,cr,isrid,tp,bb,ib,cpt,rps,os,lt,st,itid,lists,it,mkt\"/>\n<param+name=\"app\"+value=\"/obligationer/danmark\"/>\n</post>"
12+
}
13+
14+
# server needs all these headers
15+
headers = {
16+
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
17+
'X-Requested-With': 'XMLHttpRequest',
18+
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
19+
}
20+
21+
url = 'http://www.nasdaqomxnordic.com/webproxy/DataFeedProxy.aspx'
22+
23+
data_str = urllib.parse.urlencode(data, safe="+") # <-- don't convert `+`
24+
25+
response = requests.post(url, data=data_str, headers=headers)
26+
27+
#print(response.text)
28+
29+
# ---------------------------------
30+
31+
import pandas as pd
32+
33+
all_tables = pd.read_html(response.text)
34+
df = all_tables[1]
35+
36+
print(df)

__scraping__/pisa.ucsc.edu - scrapy/main.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def professor_filter(item):
4747
return (re.search(r'\w\.', item) or "Staff" in item)
4848

4949
class ClassesSpider(scrapy.Spider):
50-
50+
5151
name = "classes"
5252

5353
def start_requests(self):
@@ -72,23 +72,25 @@ def parse(self, response):
7272
all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
7373

7474
classDict = {}
75-
75+
7676
for row in all_rows:
7777
classname = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
7878
professor = row.xpath('(.//div[@class="panel-body"]//div)[3]/text()').get().strip()
7979
print(classname, professor)
8080
if professor and professor_filter(professor):
8181
classDict[tuple(classname)] = [professor]
82+
yield {'class': tuple(classname), 'professor': professor} # it will write to file csv
83+
8284
else:
8385
print('skip:', professor)
84-
86+
8587
print(classDict)
86-
88+
8789
#filename = f'class-{page}.html'
8890
#with open(filename, 'wb') as f:
8991
# f.write(response.body)
9092
#self.log(f'Saved file {filename}')
91-
93+
9294

9395
# --- run without project and save in `output.csv` ---
9496

@@ -99,4 +101,4 @@ def parse(self, response):
99101
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
100102
})
101103
c.crawl(ClassesSpider)
102-
c.start()
104+
c.start()
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
class,professor
2+
"ECON 1,Intro Microeconomic",Staff
3+
"ECON 1,Intro Microeconomic","Gonzalez,J.H."
4+
"ECON 2,Intro Macroeconomic","Gonzalez,J.H."
5+
"ECON 10A,Econ of Accounting","Moussa,Y.I."
6+
"ECON 10A,Econ of Accounting","Jones,K.K."
7+
"ECON 11A,Math Methd for Econ","Mendes,B.S."
8+
"ECON 11B,Math Methds Econ II","Katznelson,J.R."
9+
"ECON 100A,Intermed Microecon","Lazzati,N."
10+
"ECON 100A,Intermed Microecon","Robinson,J.M."
11+
"ECON 100B,Intermed Macroecon","Saijo,H."
12+
"ECON 101,Managerial Econ","Gonzalez,J.H."
13+
"ECON 104,Numbr Truth","Bulman,G."
14+
"ECON 111A,Intermed Account I",Staff
15+
"ECON 113,Intro Econometrics","Dobkin,C.E."
16+
"ECON 113,Intro Econometrics","Giuliano,L."
17+
"ECON 114,Adv Quant Methods","Martinez-Iriarte,J."
18+
"ECON 117B,Tax Factors","Moussa,Y.I."
19+
"ECON 125,Econ History Of US","Meininger,A.G."
20+
"ECON 126,Why Succeed","Baden,R.B."
21+
"ECON 133,Security Markets","Pommerenke,K."
22+
"ECON 136,Business Strategy","Baden,R.B."
23+
"ECON 141,Internatl Finance","GU,W."
24+
"ECON 150,Public Finance","Marion,J."
25+
"ECON 161A,Marketing","Owen,S.A."
26+
"ECON 166A,Game Theory","Wei,D."

0 commit comments

Comments
 (0)