1+ import random
2+ import time
3+ import requests
4+ from pymongo import MongoClient
5+
6+
class CommentPhotoCrawler(object):
    """Crawl basic profile information of Weibo users into MongoDB.

    Note from the original author: this crawler was written in a hurry, is
    fairly rough, and is meant for reference only.

    Usage: call ``login`` first so that ``self.session`` exists, then call
    ``get_fans``.
    """

    def __init__(self, sleep_time=2):
        """Set up request headers and the MongoDB collection.

        Args:
            sleep_time: Average pause, in seconds, between two profile
                requests in ``get_fans``. The actual pause is drawn
                uniformly from ``[0, 2 * sleep_time)``, so the default of 2
                gives the same ``random() * 4`` pause as before.
        """
        self.sleep_time = sleep_time
        self.mid = None
        self.login_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 '
                          '(KHTML, like Gecko)Chrome/48.0.2564.116 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Origin': 'https://passport.weibo.cn',
            'Referer': 'https://passport.weibo.cn/signin/login?'
        }
        # Created by login(); stays None until then.
        self.session = None
        client = MongoClient('127.0.0.1', 27017)
        self.db = client.Jayzhou
        self.col = self.db.jay_detail
        # create_index replaces ensure_index, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4. The unique index keeps one
        # document per uid.
        self.col.create_index('uid', unique=True)

    def login(self, user, password):
        """Open a new session and post the login form to the mobile site.

        Args:
            user: Weibo user name.
            password: Weibo password.
        """
        self.session = requests.Session()
        login_data = {
            'username': user,
            'password': password,
            'savestate': '1',
            'r': 'https://weibo.cn/',
            'ec': '0',
            'pagerefer': 'https://passport.weibo.cn/signin/welcome',
            'entry': 'mweibo',
            'mainpageflag': '1'
        }  # form data
        login_url = 'https://passport.weibo.cn/sso/login'
        response = self.session.post(login_url, headers=self.login_headers, data=login_data)
        # Only the HTTP status is checked here; the response body is not
        # inspected, so a 200 may still carry an application-level failure.
        if response.status_code == 200:
            print('模拟登录手机网页端微博成功!')
        else:
            print('Login request failed with HTTP status: {}'.format(response.status_code))

    def get_fans(self):
        """Fetch and store the profile of every uid returned by ``gen_url``.

        Requires ``login`` to have been called, because the requests go
        through ``self.session``. A randomised pause follows each request.
        """
        base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO' \
                   '&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{}'
        uid_list = self.gen_url()
        for uid in uid_list:
            res = self.session.get(base_url.format(uid, uid))
            payload = None
            if res.status_code == 200:
                try:
                    # Parse once and reuse the result for printing and storing.
                    payload = res.json()
                except ValueError:
                    # A 200 response without a JSON body is a failed fetch.
                    payload = None
            if payload is None:
                print('Can not get uid : {}'.format(uid))
            else:
                print(payload)
                self._save_profile(payload, uid)
                print('Successfully got data from uid: {}'.format(uid))
            time.sleep(random.random() * 2 * self.sleep_time)

    def gen_url(self):
        """Return the uids derived from the ``jay`` collection.

        Each distinct ``scheme`` value is sliced at ``[21:31]`` to obtain
        the uid (assumes every scheme string carries a 10-character uid at
        that fixed offset -- TODO confirm against the stored data). The
        uids are also written to ``uid.txt``, one per line.

        Returns:
            list: The extracted uid strings.
        """
        col = self.db.get_collection('jay')
        scheme_list = list(col.distinct('scheme'))
        uid_list = [scheme[21:31] for scheme in scheme_list]
        with open('uid.txt', 'w') as f:
            # Plain newline terminator; the previous '\n ' left a stray
            # leading space on every line after the first.
            f.writelines(uid + '\n' for uid in uid_list)
        return uid_list

    def parse_res(self, response, uid):
        """Extract profile fields from ``response`` and upsert them by uid.

        Args:
            response: Object whose ``json()`` method returns the decoded
                API payload.
            uid: User id the payload belongs to.
        """
        self._save_profile(response.json(), uid)

    def _save_profile(self, payload, uid):
        """Collect name/content pairs from the first two cards and upsert them."""
        item = {'uid': uid}
        try:
            cards = payload['data']['cards']
            for index in (0, 1):
                # The first card_group entry is skipped (presumably a
                # section header -- TODO confirm).
                for info in cards[index]['card_group'][1:]:
                    try:
                        item[info['item_name']] = info['item_content']
                    except (KeyError, TypeError):
                        # Entry without a name/content pair: skip it.
                        continue
        except (KeyError, IndexError, TypeError):
            # Unexpected payload shape: keep whatever was collected so far.
            print('Passing uid: {}'.format(uid))

        # update_one replaces Collection.update, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.col.update_one({'uid': item['uid']}, {'$set': item}, upsert=True)
87+
if __name__ == '__main__':
    # Script entry point: log in, then crawl and store the profiles.
    crawler = CommentPhotoCrawler()
    crawler.login('', '')  # pass in your Weibo user name and password
    crawler.get_fans()
0 commit comments