Skip to content

Commit 9e0c3df

Browse files
committed
add splash demo
1 parent d00e885 commit 9e0c3df

File tree

1 file changed

+42
-0
lines changed

1 file changed

+42
-0
lines changed

python_splash.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# _*_ coding: utf-8 _*_
2+
3+
"""
4+
Use a Splash server to fetch Ajax-rendered pages.
5+
"""
6+
7+
import json
8+
import requests
9+
10+
# Docker installation: https://splash.readthedocs.io/en/latest/install.html
11+
CRAWLER_URL = "http://weixin.sogou.com/weixin?page=1&type=2&query=%E4%B8%AD%E5%9B%BD"
12+
13+
14+
# render.html
15+
def test_1(url):
    """Render ``url`` through Splash's ``render.html`` endpoint and print it.

    POSTs a JSON-encoded set of render arguments to the Splash server, then
    prints the target URL together with the HTTP status code, followed by
    the response body.

    Args:
        url: Address of the page Splash should load and render.

    Raises:
        requests.RequestException: If the Splash server cannot be reached or
            does not answer within the client-side timeout.
    """
    # NOTE: "xx.xx.xx.xx" is a placeholder; substitute the real Splash host.
    render = "http://xx.xx.xx.xx:8050/render.html"
    body = json.dumps({
        "url": url,
        "wait": 0.5,   # page-load wait time, in seconds
        "images": 0,   # whether to download images (0 = no)
        "timeout": 3,  # Splash-side render timeout, in seconds
        # "allowed_domains": ["sogou.com", ],  # restrict to these domains
        "allowed_content_types": "text/html; charset=utf-8",
    })
    headers = {"Content-Type": "application/json"}

    # Client-side timeout (longer than the Splash-side one above) so that an
    # unreachable or stalled server cannot block this call forever.
    response = requests.post(
        url=render, headers=headers, data=body, timeout=10,
    )
    print(url, response.status_code)
    print(response.text)
31+
32+
# test_1(CRAWLER_URL)
33+
34+
35+
# render.png
36+
def test_2(url):
    """Render ``url`` through Splash's ``render.png`` endpoint and print the status.

    Issues a GET request asking Splash for a PNG rendering of the page, then
    prints the target URL together with the HTTP status code.

    Args:
        url: Address of the page Splash should load and render.

    Raises:
        requests.RequestException: If the Splash server cannot be reached or
            does not answer within the client-side timeout.
    """
    # NOTE: "xx.xx.xx.xx" is a placeholder; substitute the real Splash host.
    render = "http://xx.xx.xx.xx:8050/render.png"

    # Pass the arguments via ``params`` so they are percent-encoded. The
    # target URL carries its own query string; pasting it in verbatim would
    # make everything after its first "&" look like arguments to Splash
    # instead of part of the page address.
    response = requests.get(
        url=render,
        params={"url": url, "timeout": 5},  # Splash-side render timeout: 5 s
        timeout=10,  # client-side guard against a stalled server
    )
    print(url, response.status_code)
41+
42+
# test_2(CRAWLER_URL)

0 commit comments

Comments
 (0)