Skip to content

Commit 8cd2a0a

Browse files
authored
Add files via upload
1 parent 60912d0 commit 8cd2a0a

2 files changed

Lines changed: 169 additions & 0 deletions

File tree

JayZhou/get_detail_jay.py

Lines changed: 91 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,91 @@
1+
import random
2+
import time
3+
import requests
4+
from pymongo import MongoClient
5+
6+
7+
class CommentPhotoCrawler(object):
    """Fetch the basic-profile cards of Weibo users and store them in MongoDB.

    Note: this crawler was written in a hurry and is fairly rough; treat it
    as a reference only.

    User ids are derived from the ``scheme`` values of the ``jay`` collection
    in the local ``Jayzhou`` database. Each profile is upserted into the
    ``jay_detail`` collection, keyed by ``uid``.
    """

    def __init__(self, sleep_time=2):
        """Prepare the login headers and the MongoDB collection.

        Args:
            sleep_time: Stored on the instance; no method of this class
                reads it.
        """
        self.sleep_time = sleep_time
        self.mid = None
        self.login_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 '
                          '(KHTML, like Gecko)Chrome/48.0.2564.116 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Origin': 'https://passport.weibo.cn',
            'Referer': 'https://passport.weibo.cn/signin/login?'
        }
        self.session = None  # created by login()
        client = MongoClient('127.0.0.1', 27017)
        self.db = client.Jayzhou
        self.col = self.db.jay_detail
        # create_index replaces ensure_index, which PyMongo deprecated in 3.0
        # and removed in 4.0.
        self.col.create_index('uid', unique=True)

    def login(self, user, password):
        """Open a requests session and post the login form.

        Args:
            user: Weibo user name.
            password: Weibo password.

        Returns:
            True when the login endpoint answered with HTTP 200, else False.
        """
        self.session = requests.Session()
        login_data = {
            'username': user,
            'password': password,
            'savestate': '1',
            'r': 'https://weibo.cn/',
            'ec': '0',
            'pagerefer': 'https://passport.weibo.cn/signin/welcome',
            'entry': 'mweibo',
            'mainpageflag': '1'
        }  # form data
        login_url = 'https://passport.weibo.cn/sso/login'
        res = self.session.post(login_url, headers=self.login_headers, data=login_data)
        # NOTE(review): HTTP 200 only shows the request was accepted; the
        # response body presumably carries the real login result - confirm.
        if res.status_code == 200:
            print('模拟登录手机网页端微博成功!')
            return True
        print('Login request failed with HTTP status: {}'.format(res.status_code))
        return False

    def get_fans(self):
        """Request the profile page of every known uid and store the result.

        Requires login() to have been called so that ``self.session`` exists.
        Sleeps a random 0-4 seconds after each request.
        """
        base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO' \
                   '&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{}'
        for uid in self.gen_url():
            res = self.session.get(base_url.format(uid, uid))
            if res.status_code == 200:
                try:
                    print(res.json())
                except ValueError:
                    # The body was not JSON; skip this uid instead of
                    # aborting the whole crawl.
                    print('Can not get uid : {}'.format(uid))
                else:
                    self.parse_res(res, uid)
                    print('Successfully got data from uid: {}'.format(uid))
            else:
                print('Can not get uid : {}'.format(uid))
            time.sleep(random.random() * 4)

    def gen_url(self):
        """Build the uid list from the ``jay`` collection.

        The ids are also written to ``uid.txt``, one per line.

        Returns:
            A list with one uid string per distinct ``scheme`` value.
        """
        col = self.db.get_collection('jay')
        # Characters 21..30 of each scheme are kept; presumably the 10-digit
        # uid following a 21-character URL prefix - TODO confirm against the
        # stored data.
        uid_list = [scheme[21:31] for scheme in col.distinct('scheme')]
        with open('uid.txt', 'w', encoding='utf-8') as f:
            f.writelines(u + '\n' for u in uid_list)
        return uid_list

    def parse_res(self, response, uid):
        """Extract name/content pairs from the first two cards and upsert them.

        The first entry of each ``card_group`` is skipped. Entries without
        ``item_name``/``item_content`` are ignored. If the payload does not
        have the expected shape, whatever was collected so far (at least the
        uid) is still stored.

        Args:
            response: Object whose ``json()`` returns the decoded API payload.
            uid: User id the payload belongs to.
        """
        item = {'uid': uid}
        try:
            cards = response.json()['data']['cards'][0:2]
            for index in (0, 1):
                for info in cards[index]['card_group'][1:]:
                    try:
                        item[info['item_name']] = info['item_content']
                    except (KeyError, TypeError):
                        continue
        except (KeyError, IndexError, TypeError, ValueError):
            # Missing keys, fewer than two cards, or a non-JSON body.
            print('Passing uid: {}'.format(uid))

        # update_one replaces Collection.update, which PyMongo removed in 4.0.
        self.col.update_one({'uid': item['uid']}, {'$set': item}, upsert=True)
86+
87+
88+
# Script entry point: log in, then crawl the profile of every known uid.
if __name__ == '__main__':
    com = CommentPhotoCrawler()
    com.login('', '')  # pass in your Weibo user name and password
    com.get_fans()

JayZhou/get_fans_data.py

Lines changed: 78 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,78 @@
1+
import random
2+
import time
3+
import requests
4+
from pymongo import MongoClient
5+
6+
7+
class CommentPhotoCrawler(object):
    """Collect entries from the ``super_newfans`` container pages into MongoDB.

    Every item of the first card's ``card_group`` is upserted into the
    ``jay`` collection of the local ``Jayzhou`` database, keyed by its
    ``scheme`` value.
    """

    def __init__(self, sleep_time=2):
        """Prepare the login headers and the MongoDB collection.

        Args:
            sleep_time: Stored on the instance; no method of this class
                reads it.
        """
        self.sleep_time = sleep_time
        self.mid = None
        self.login_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 '
                          '(KHTML, like Gecko)Chrome/48.0.2564.116 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Origin': 'https://passport.weibo.cn',
            'Referer': 'https://passport.weibo.cn/signin/login?'
        }
        self.session = None  # created by login()
        client = MongoClient('127.0.0.1', 27017)
        db = client.Jayzhou
        self.col = db.jay
        # create_index replaces ensure_index, which PyMongo deprecated in 3.0
        # and removed in 4.0.
        self.col.create_index('scheme', unique=True)

    def login(self, user, password):
        """Open a requests session and post the login form.

        Args:
            user: Weibo user name.
            password: Weibo password.

        Returns:
            True when the login endpoint answered with HTTP 200, else False.
        """
        self.session = requests.Session()
        login_data = {
            'username': user,
            'password': password,
            'savestate': '1',
            'r': 'https://weibo.cn/',
            'ec': '0',
            'pagerefer': 'https://passport.weibo.cn/signin/welcome',
            'entry': 'mweibo',
            'mainpageflag': '1'
        }  # form data
        login_url = 'https://passport.weibo.cn/sso/login'
        res = self.session.post(login_url, headers=self.login_headers, data=login_data)
        # NOTE(review): HTTP 200 only shows the request was accepted; the
        # response body presumably carries the real login result - confirm.
        if res.status_code == 200:
            print('模拟登录手机网页端微博成功!')
            return True
        print('Login request failed with HTTP status: {}'.format(res.status_code))
        return False

    def get_fans(self):
        """Crawl the listing pages in shuffled order, forever.

        Requires login() to have been called so that ``self.session`` exists.
        A 418 or 403 answer triggers a 60-second pause. Every request is
        followed by a random 0-5 second sleep. The method never returns.
        """
        while True:
            for url in self.gen_url():
                res = self.session.get(url)
                # Check the status before decoding: a 418/403 answer need
                # not carry a JSON body.
                if res.status_code in (418, 403):
                    print('Can not get data from url: {}'.format(url))
                    time.sleep(60)
                else:
                    try:
                        payload = res.json()
                    except ValueError:
                        payload = None
                    else:
                        print(payload)
                    if self._save_cards(payload):
                        print('Successfully get data from url: {}'.format(url))
                    else:
                        print('Can not get data from url: {}'.format(url))
                time.sleep(random.random() * 5)

    def _save_cards(self, payload):
        """Upsert the first card's group items; return False if there are no cards."""
        if not isinstance(payload, dict):
            return False
        data = payload.get('data')
        cards = data.get('cards') if isinstance(data, dict) else None
        if not cards:
            return False
        first = cards[0]
        group = first.get('card_group') if isinstance(first, dict) else None
        for item in group or []:
            # Items without a scheme cannot be keyed, so they are skipped.
            if not isinstance(item, dict) or item.get('scheme') is None:
                continue
            # update_one replaces Collection.update, removed in PyMongo 4.0.
            self.col.update_one({'scheme': item['scheme']}, {'$set': item}, upsert=True)
        return True

    @staticmethod
    def gen_url(pages=50):
        """Return the listing URLs for pages 1..pages in random order.

        Args:
            pages: Number of pages to generate; defaults to 50.

        Returns:
            A shuffled list of URL strings.
        """
        base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=' \
                   '2311407a8941058aaf4df5147042ce104568da_-_super_newfans&luicode' \
                   '=10000011&lfid=1008087a8941058aaf4df5147042ce104568da_-_hotuser&page={}'

        url_list = [base_url.format(i) for i in range(1, pages + 1)]
        random.shuffle(url_list)
        return url_list
73+
74+
75+
# Script entry point: log in, then crawl the listing pages until interrupted.
if __name__ == '__main__':
    com = CommentPhotoCrawler()
    com.login('', '')  # pass in your Weibo user name and password
    com.get_fans()

0 commit comments

Comments
 (0)