Skip to content

Commit 935c6c5

Browse files
committed
Added get_likes() function in User
1 parent e54d3e8 commit 935c6c5

1 file changed

Lines changed: 57 additions & 0 deletions

File tree

zhihu.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -599,6 +599,63 @@ def get_collections(self):
599599
yield Collection(url, name, self)
600600

601601

602+
def get_likes(self):
    """Yield an ``Answer`` for each answer this user has upvoted.

    The newest activity entry is read from the profile page itself; older
    entries are fetched page by page by POSTing to ``<user_url>/activities``
    with the ``data-time`` of the oldest entry seen so far, until the
    server reports an empty page.

    Yields:
        Answer: built from ``"http://www.zhihu.com"`` plus the answer path
        found in the activity feed. Duplicates within one fetched page are
        dropped, keeping first-seen order.

    Notes:
        * For an anonymous user (``self.user_url is None``) a message is
          printed and nothing is yielded.
        * If a fetched page contains a different number of ``data-time``
          attributes than the size the server reported, a warning is
          printed and iteration stops, because the paging cursor cannot be
          trusted.
    """
    # TODO: first version; upvoted zhuanlan (column) articles are not handled.
    if self.user_url is None:
        print("I'm an anonymous user.")
        return

    site_root = "http://www.zhihu.com"
    item_attrs = {'class': 'zm-profile-section-item zm-item clearfix'}
    main_attrs = {'class': 'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main'}

    r = requests.get(self.user_url)
    soup = BeautifulSoup(r.content)

    newest_item = soup.find("div", attrs=item_attrs)
    if newest_item is None:
        # No activity entry on the profile page: there is nothing to yield
        # and no timestamp to start paging from.
        return

    # Handle the first liked item, which is only present in the profile page.
    first_item = newest_item.find("div", attrs=main_attrs)
    # NOTE(review): on Python 2 this compares a unicode literal against a
    # byte string, which relies on the process default encoding being able
    # to decode the page -- confirm how the module configures it.
    if u"赞同了回答" in str(first_item):
        first_like = first_item.find("a")['href']
        yield Answer(site_root + first_like)

    # Handle the rest of the liked items via the paged activities endpoint.
    post_url = self.user_url + "/activities"
    _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
    header = {
        'Host': "www.zhihu.com",
        'Referer': self.user_url,
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    }

    # The capture group returns the answer path directly (no magic slice
    # offset); ``\d+`` accepts ids of any length instead of exactly 8 digits.
    liked_answer_re = re.compile(
        u"\u8d5e\u540c\u4e86\u56de\u7b54\n\n"
        u"<a class=\"question_link\" target=\"_blank\" "
        u"href=\"(/question/\\d+/answer/\\d+)"
    )
    data_time_re = re.compile(r'data-time="(\d+)"')

    start_time = newest_item["data-time"]
    while True:
        data = {
            'start': start_time,
            '_xsrf': _xsrf,
        }
        r = requests.post(post_url, data=data, headers=header)
        msg = r.json()["msg"]  # parse the JSON body once per response
        response_size = msg[0]
        response_html = msg[1]
        if not response_size > 0:
            return

        # Drop duplicates within this page while keeping first-seen order.
        seen = set()
        for answer_path in liked_answer_re.findall(response_html):
            if answer_path in seen:
                continue
            seen.add(answer_path)
            yield Answer(site_root + answer_path)

        data_times = data_time_re.findall(response_html)
        if len(data_times) != response_size:
            # Message: "error reading time info from the activities feed,
            # possibly because an answer contains data-time text".
            print("读取activities栏时间信息时发生错误,可能因为某答案中包含data-time信息")
            return
        # The last entry of the page is the cursor for the next request.
        start_time = data_times[-1]
656+
657+
658+
602659
class Answer:
603660
answer_url = None
604661
# session = None

0 commit comments

Comments
 (0)