@@ -599,6 +599,63 @@ def get_collections(self):
599599 yield Collection (url , name , self )
600600
601601
def get_likes(self):
    """Yield an ``Answer`` for each answer this user has upvoted.

    The profile page at ``self.user_url`` supplies the first activity
    entry; further entries are fetched page by page by POSTing to
    ``<user_url>/activities``, using the ``data-time`` value of the last
    entry seen as the ``start`` cursor.  Paging stops when the server
    reports an empty page.

    If ``self.user_url`` is ``None`` a notice is printed and nothing is
    yielded.  If a page contains a different number of ``data-time``
    attributes than the item count the server reported, a message is
    printed and iteration ends, because the paging cursor can no longer
    be trusted.
    """
    # TODO: upvoted zhuanlan (column) articles are not returned yet;
    # only upvoted answers are.
    if self.user_url is None:
        print("I'm an anonymous user.")
        return

    site = "http://www.zhihu.com"
    # Label the activity feed puts on an "upvoted an answer" entry.
    liked_marker = u"\u8d5e\u540c\u4e86\u56de\u7b54"
    item_class = 'zm-profile-section-item zm-item clearfix'
    main_class = ('zm-profile-section-main zm-profile-section-activity-main '
                  'zm-profile-activity-page-item-main')

    r = requests.get(self.user_url)
    soup = BeautifulSoup(r.content)

    # The first activity entry is only present in the profile page itself.
    newest = soup.find("div", attrs={'class': item_class})
    if newest is None:
        # No activity entries on the page: nothing to yield and no cursor
        # to start paging from.
        return
    newest_main = newest.find("div", attrs={'class': main_class})
    # Compare unicode with unicode rather than relying on an implicit
    # decode of the tag's byte-string form.
    if liked_marker in unicode(newest_main):
        yield Answer(site + newest_main.find("a")['href'])

    # Everything older is served by the paginated activities endpoint.
    post_url = self.user_url + "/activities"
    _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
    header = {
        'Host': "www.zhihu.com",
        'Referer': self.user_url,
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    }

    def fetch_page(start):
        # Return (item_count, html) for the page that begins at cursor `start`.
        payload = {
            'start': start,
            '_xsrf': _xsrf,
        }
        reply = requests.post(post_url, data=payload, headers=header)
        msg = reply.json()["msg"]
        return msg[0], msg[1]

    # Capture the answer path directly instead of slicing the match at a
    # fixed offset, and accept ids of any length so longer ids are not
    # truncated.
    liked_link = re.compile(
        liked_marker +
        r'\s*<a class="question_link"\s+target="_blank"\s+'
        r'href="(/question/\d+/answer/\d+)'
    )
    data_time = re.compile(r'data-time="(\d+)"')

    count, html = fetch_page(newest["data-time"])
    while count > 0:
        # An answer can be matched more than once in a page; keep the
        # first occurrence of each so the feed order is preserved.
        seen = set()
        for path in liked_link.findall(html):
            if path not in seen:
                seen.add(path)
                yield Answer(site + path)

        stamps = data_time.findall(html)
        if len(stamps) != count:
            print("读取activities栏时间信息时发生错误,可能因为某答案中包含data-time信息")
            return
        # The last entry's timestamp is the cursor for the next page.
        count, html = fetch_page(stamps[-1])
656+
657+
658+
602659class Answer :
603660 answer_url = None
604661 # session = None
0 commit comments