@@ -107,7 +107,7 @@ def __init__(self, url, title=None):
107107
108108 def parser (self ):
109109 r = requests .get (self .url )
110- self .soup = BeautifulSoup (r .content )
110+ self .soup = BeautifulSoup (r .content , "lxml" )
111111
112112 def get_title (self ):
113113 if hasattr (self , "title" ):
@@ -182,7 +182,7 @@ def get_all_answers(self):
182182 for j in xrange (min (answers_num , 20 )):
183183 if self .soup == None :
184184 self .parser ()
185- soup = BeautifulSoup (self .soup .encode ("utf-8" ))
185+ soup = BeautifulSoup (self .soup .encode ("utf-8" ), "lxml" )
186186
187187 is_my_answer = False
188188 if soup .find_all ("div" , class_ = "zm-item-answer" )[j ].find ("span" , class_ = "count" ) == None :
@@ -251,9 +251,9 @@ def get_all_answers(self):
251251
252252 answer_list = r .json ()["msg" ]
253253 for j in xrange (min (answers_num - i * 20 , 20 )):
254- soup = BeautifulSoup (self .soup .encode ("utf-8" ))
254+ soup = BeautifulSoup (self .soup .encode ("utf-8" ), "lxml" )
255255
256- answer_soup = BeautifulSoup (answer_list [j ])
256+ answer_soup = BeautifulSoup (answer_list [j ], "lxml" )
257257
258258 if answer_soup .find ("div" , class_ = "zm-editable-content clearfix" ) == None :
259259 continue
@@ -337,7 +337,7 @@ def __init__(self, user_url, user_id=None):
337337
338338 def parser (self ):
339339 r = requests .get (self .user_url )
340- soup = BeautifulSoup (r .content )
340+ soup = BeautifulSoup (r .content , "lxml" )
341341 self .soup = soup
342342
343343 def get_user_id (self ):
@@ -496,7 +496,7 @@ def get_followees(self):
496496 followee_url = self .user_url + "/followees"
497497 r = requests .get (followee_url )
498498
499- soup = BeautifulSoup (r .content )
499+ soup = BeautifulSoup (r .content , "lxml" )
500500 for i in xrange ((followees_num - 1 ) / 20 + 1 ):
501501 if i == 0 :
502502 user_url_list = soup .find_all ("h2" , class_ = "zm-list-content-title" )
@@ -523,7 +523,7 @@ def get_followees(self):
523523
524524 followee_list = r_post .json ()["msg" ]
525525 for j in xrange (min (followees_num - i * 20 , 20 )):
526- followee_soup = BeautifulSoup (followee_list [j ])
526+ followee_soup = BeautifulSoup (followee_list [j ], "lxml" )
527527 user_link = followee_soup .find ("h2" , class_ = "zm-list-content-title" ).a
528528 yield User (user_link ["href" ], user_link .string .encode ("utf-8" ))
529529
@@ -541,7 +541,7 @@ def get_followers(self):
541541 follower_url = self .user_url + "/followers"
542542 r = requests .get (follower_url )
543543
544- soup = BeautifulSoup (r .content )
544+ soup = BeautifulSoup (r .content , "lxml" )
545545 for i in xrange ((followers_num - 1 ) / 20 + 1 ):
546546 if i == 0 :
547547 user_url_list = soup .find_all ("h2" , class_ = "zm-list-content-title" )
@@ -567,7 +567,7 @@ def get_followers(self):
567567
568568 follower_list = r_post .json ()["msg" ]
569569 for j in xrange (min (followers_num - i * 20 , 20 )):
570- follower_soup = BeautifulSoup (follower_list [j ])
570+ follower_soup = BeautifulSoup (follower_list [j ], "lxml" )
571571 user_link = follower_soup .find ("h2" , class_ = "zm-list-content-title" ).a
572572 yield User (user_link ["href" ], user_link .string .encode ("utf-8" ))
573573
@@ -591,7 +591,7 @@ def get_asks(self):
591591 ask_url = self .user_url + "/asks?page=" + str (i + 1 )
592592 r = requests .get (ask_url )
593593
594- soup = BeautifulSoup (r .content )
594+ soup = BeautifulSoup (r .content , "lxml" )
595595 for question in soup .find_all ("a" , class_ = "question_link" ):
596596 url = "http://www.zhihu.com" + question ["href" ]
597597 title = question .string .encode ("utf-8" )
@@ -611,7 +611,7 @@ def get_answers(self):
611611 for i in xrange ((answers_num - 1 ) / 20 + 1 ):
612612 answer_url = self .user_url + "/answers?page=" + str (i + 1 )
613613 r = requests .get (answer_url )
614- soup = BeautifulSoup (r .content )
614+ soup = BeautifulSoup (r .content , "lxml" )
615615 for answer in soup .find_all ("a" , class_ = "question_link" ):
616616 question_url = "http://www.zhihu.com" + answer ["href" ][0 :18 ]
617617 question_title = answer .string .encode ("utf-8" )
@@ -634,7 +634,7 @@ def get_collections(self):
634634
635635 r = requests .get (collection_url )
636636
637- soup = BeautifulSoup (r .content )
637+ soup = BeautifulSoup (r .content , "lxml" )
638638 for collection in soup .find_all ("div" , class_ = "zm-profile-section-item zg-clear" ):
639639 url = "http://www.zhihu.com" + \
640640 collection .find ("a" , class_ = "zm-profile-fav-item-title" )["href" ]
@@ -650,7 +650,7 @@ def get_likes(self):
650650 yield
651651 else :
652652 r = requests .get (self .user_url )
653- soup = BeautifulSoup (r .content )
653+ soup = BeautifulSoup (r .content , "lxml" )
654654 # Handle the first liked item
655655 first_item = soup .find ("div" , attrs = {'class' :'zm-profile-section-item zm-item clearfix' })
656656 first_item = first_item .find ("div" , attrs = {'class' :'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main' })
@@ -717,7 +717,7 @@ def __init__(self, answer_url, question=None, author=None, upvote=None, content=
717717
718718 def parser (self ):
719719 r = requests .get (self .answer_url )
720- soup = BeautifulSoup (r .content )
720+ soup = BeautifulSoup (r .content , "lxml" )
721721 self .soup = soup
722722
723723 def get_question (self ):
@@ -772,7 +772,7 @@ def get_content(self):
772772 else :
773773 if self .soup == None :
774774 self .parser ()
775- soup = BeautifulSoup (self .soup .encode ("utf-8" ))
775+ soup = BeautifulSoup (self .soup .encode ("utf-8" ), "lxml" )
776776 answer = soup .find ("div" , class_ = "zm-editable-content clearfix" )
777777 soup .body .extract ()
778778 soup .head .insert_after (soup .new_tag ("body" , ** {'class' : 'zhi' }))
@@ -962,7 +962,7 @@ def get_voters(self):
962962 # s = session
963963 # r = s.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)})
964964 r = requests .get (request_url , params = {"params" : "{\" answer_id\" :\" %d\" }" % int (data_aid )})
965- soup = BeautifulSoup (r .content )
965+ soup = BeautifulSoup (r .content , "lxml" )
966966 voters_info = soup .find_all ("span" )[1 :- 1 ]
967967 if len (voters_info ) == 0 :
968968 return
@@ -997,7 +997,7 @@ def __init__(self, url, name=None, creator=None):
997997 self .creator = creator
998998 def parser (self ):
999999 r = requests .get (self .url )
1000- soup = BeautifulSoup (r .content )
1000+ soup = BeautifulSoup (r .content , "lxml" )
10011001 self .soup = soup
10021002
10031003 def get_name (self ):
@@ -1062,7 +1062,7 @@ def get_all_answers(self):
10621062 i = 2
10631063 while True :
10641064 r = requests .get (self .url + "?page=" + str (i ))
1065- answer_soup = BeautifulSoup (r .content )
1065+ answer_soup = BeautifulSoup (r .content , "lxml" )
10661066 answer_list = answer_soup .find_all ("div" , class_ = "zm-item" )
10671067 if len (answer_list ) == 0 :
10681068 break
0 commit comments