forked from Alfred1984/interesting-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathyouku_comments.py
More file actions
75 lines (66 loc) · 3.63 KB
/
youku_comments.py
File metadata and controls
75 lines (66 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import requests
from pymongo import MongoClient
class CommentCrawler(object):
    """Crawl the comments of Youku videos and upsert them into MongoDB.

    Typical use is ``CommentCrawler().get_comments(video_url, collection)``,
    which resolves the numeric video id from the video page, then walks the
    paginated comment API and stores every comment (keyed by its ``id``
    field) in the given collection.
    """

    # Comment-list endpoint. The two ``{}`` slots are filled with the video id
    # and the 1-based page number.
    # NOTE(review): ``sign`` and ``time`` are hard-coded query parameters --
    # presumably a captured request signature; confirm the API still accepts it.
    COMMENT_API = (
        'https://p.comments.youku.com/ycp/comment/pc/commentList?jsoncallback=n_commentList'
        '&app=100-DDwODVkv&objectId={}&objectType=1&listType=0&'
        'currentPage={}&pageSize=30&sign=edb9eab487e78a7729408772d8691134&time=1562320232'
    )
    # Seconds to wait for any single HTTP request.
    REQUEST_TIMEOUT = 5
    # Stop crawling a video after this many pages fail in a row, so a dead
    # endpoint or an unexpected payload cannot keep the loop running forever.
    MAX_CONSECUTIVE_FAILURES = 5

    def __init__(self, host='127.0.0.1', port=27017, db_name='Changan'):
        """Connect to MongoDB and select the target database.

        Args:
            host: MongoDB host name or address.
            port: MongoDB port.
            db_name: Name of the database that receives the collections.
        """
        self.video_id = None
        client = MongoClient(host, port)
        self.db = client.get_database(db_name)
        self.col = None

    @staticmethod
    def _extract_video_id(html):
        """Return the value of the ``videoId: '...'`` field in *html*, or None.

        The id is read up to its closing quote instead of assuming a fixed
        length, so ids that are shorter or longer than 10 characters are not
        truncated or padded with trailing markup.
        """
        _, marker, tail = html.partition("videoId: '")
        if not marker:
            return None
        video_id, closing_quote, _ = tail.partition("'")
        if not closing_quote or not video_id:
            return None
        return video_id

    @staticmethod
    def _parse_jsonp(text):
        """Decode the JSON object wrapped in a JSONP response body.

        Everything before the first ``{`` and after the last ``}`` is dropped,
        which tolerates a trailing ``;`` or newline after the callback call.

        Raises:
            ValueError: If *text* contains no JSON object or it is malformed.
        """
        start = text.find('{')
        end = text.rfind('}')
        if start == -1 or end < start:
            raise ValueError('no JSON object found in response')
        return json.loads(text[start:end + 1])

    def get_video_id(self, v_url):
        """Fetch the video page at *v_url* and store its id in ``self.video_id``.

        ``self.video_id`` is reset to None first, so a failed lookup never
        leaves the id of a previously crawled video behind.

        Returns:
            The video id string, or None when the page could not be fetched
            or contains no id.
        """
        self.video_id = None
        try:
            res = requests.get(v_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('Request for {} failed: {}'.format(v_url, exc))
            print('Can not get video id, please check the url you just input.')
            return None

        self.video_id = self._extract_video_id(res.text)
        if self.video_id is None:
            print('Can not get video id, please check the url you just input.')
        else:
            print('Video ID for corresponding url is :{}'.format(self.video_id))
        return self.video_id

    def get_comments(self, video_url, collection):
        """Crawl every comment page of a video into a MongoDB collection.

        Args:
            video_url: URL of the Youku video page.
            collection: Name of the collection that receives the comments.

        Pages that fail to download or parse are skipped. Crawling stops at
        the last page reported by the API, or after
        ``MAX_CONSECUTIVE_FAILURES`` failed pages in a row. Errors raised by
        the MongoDB driver are not swallowed and propagate to the caller.
        """
        if self.get_video_id(video_url) is None:
            # Without an id there is nothing to request; do not create an
            # empty collection or reuse a stale id.
            return

        self.col = self.db.get_collection(collection)
        # ``create_index`` replaces the legacy ``ensure_index`` helper.
        self.col.create_index('id', unique=True)
        print('Built collection of: {}'.format(collection))

        page = 1
        total_pages = None
        consecutive_failures = 0
        while True:
            try:
                res = requests.get(
                    self.COMMENT_API.format(self.video_id, page),
                    timeout=self.REQUEST_TIMEOUT,
                )
                data = self._parse_jsonp(res.text)['data']
                # Read the page count before touching the comments so the
                # stop condition is known even if this page's list is unusable.
                total_pages = int(data['totalPage'])
                for com in data['comment']:
                    # ``update_one`` replaces the legacy ``update`` method.
                    self.col.update_one({'id': com['id']}, {'$set': com}, upsert=True)
            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                consecutive_failures += 1
                print('Video: {} Page: {} request failed: {}. Skipping this page.'.format(
                    self.video_id, page, exc))
                if consecutive_failures >= self.MAX_CONSECUTIVE_FAILURES:
                    print('Giving up on video {} after {} consecutive failed pages.'.format(
                        self.video_id, consecutive_failures))
                    break
            else:
                consecutive_failures = 0
                print('Successfully crawl comments of video : {}, page :{}'.format(
                    self.video_id, page))

            # ``>=`` also terminates when the API reports zero pages or a
            # skipped page carried the counter past the end.
            if total_pages is not None and page >= total_pages:
                print('Finished crawling all pages.')
                break
            page += 1
if __name__ == '__main__':
    # Script entry point: crawl the comments of ten Youku videos, storing each
    # video's comments in the MongoDB collection named by its dictionary key.
    targets = {
        'youku1': ('https://v.youku.com/v_show/id_XNDI0NDYyNjk1Mg'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5~A&&s=efbfbd78efbfbd5cefbf'),
        'youku2': ('https://v.youku.com/v_show/id_XNDI0NDQ0ODEwNA'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!2~A&&s=efbfbd78efbfbd5cefbf'),
        'youku3': ('https://v.youku.com/v_show/id_XNDI0NDQ2MzU3Mg'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!3~A&&s=efbfbd78efbfbd5cefbf'),
        'youku4': ('https://v.youku.com/v_show/id_XNDI0NDQ3NTMwMA'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!4~A&&s=efbfbd78efbfbd5cefbf'),
        'youku5': ('https://v.youku.com/v_show/id_XNDI0NDQ5NzE3Ng'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!5~A&&s=efbfbd78efbfbd5cefbf'),
        'youku6': ('https://v.youku.com/v_show/id_XNDI0NDUwODUxMg'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!6~A&&s=efbfbd78efbfbd5cefbf'),
        'youku7': ('https://v.youku.com/v_show/id_XNDI0NDUxOTgyNA'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!7~A&&s=efbfbd78efbfbd5cefbf'),
        'youku8': ('https://v.youku.com/v_show/id_XNDI0NDU1MjQxMg'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!8~A&&s=efbfbd78efbfbd5cefbf'),
        'youku9': ('https://v.youku.com/v_show/id_XNDI0NDYzMzAyOA'
                   '==.html?spm=a2h0j.11185381.listitem_page1.5!9~A&&s=efbfbd78efbfbd5cefbf'),
        'youku10': ('https://v.youku.com/v_show/id_XNDI0NDY1MzA3Ng='
                    '=.html?spm=a2h0j.11185381.listitem_page1.5!10~A&&s=efbfbd78efbfbd5cefbf'),
    }

    crawler = CommentCrawler()
    for collection_name in targets:
        crawler.get_comments(video_url=targets[collection_name], collection=collection_name)