@@ -90,8 +90,8 @@ def create_session():
9090 r = s .post ('http://www.zhihu.com/login/email' , data = login_data , headers = header )
9191 if r .json ()["r" ] == 1 :
9292 print "Login Failed, reason is:"
93- for m in r .json ()["msg " ]:
94- print r .json ()["msg " ][m ]
93+ for m in r .json ()["data " ]:
94+ print r .json ()["data " ][m ]
9595 print "Use cookies"
9696 has_cookies = False
9797 for key in cookies :
@@ -218,49 +218,55 @@ def get_all_answers(self):
218218 return
219219 yield
220220 else :
221+ error_answer_count = 0
221222 for i in xrange ((answers_num - 1 ) / 50 + 1 ):
222223 if i == 0 :
223224 for j in xrange (min (answers_num , 50 )):
224225 if self .soup == None :
225226 self .parser ()
226227 soup = BeautifulSoup (self .soup .encode ("utf-8" ))
228+ try :
229+ if soup .find_all ("div" , class_ = "zm-item-answer " )[j ].find ("div" , class_ = " zm-editable-content clearfix" ) == None :
230+ error_answer_count += 1
231+ continue
232+ author = None
233+ if soup .find_all ("h3" , class_ = "zm-item-answer-author-wrap" )[j ].string == u"匿名用户" :
234+ author_url = None
235+ author = User (author_url )
236+ else :
237+ author_tag = soup .find_all ("h3" , class_ = "zm-item-answer-author-wrap" )[j ].find_all ("a" )[1 ]
238+ author_id = author_tag .string .encode ("utf-8" )
239+ author_url = "http://www.zhihu.com" + author_tag ["href" ]
240+ author = User (author_url , author_id )
227241
228- author = None
229- if soup .find_all ("h3" , class_ = "zm-item-answer-author-wrap" )[j ].string == u"匿名用户" :
230- author_url = None
231- author = User (author_url )
232- else :
233- author_tag = soup .find_all ("h3" , class_ = "zm-item-answer-author-wrap" )[j ].find_all ("a" )[1 ]
234- author_id = author_tag .string .encode ("utf-8" )
235- author_url = "http://www.zhihu.com" + author_tag ["href" ]
236- author = User (author_url , author_id )
237-
238- count = soup .find_all ("span" , class_ = "count" )[j ].string
239- if count [- 1 ] == "K" :
240- upvote = int (count [0 :(len (count ) - 1 )]) * 1000
241- elif count [- 1 ] == "W" :
242- upvote = int (count [0 :(len (count ) - 1 )]) * 10000
243- else :
244- upvote = int (count )
245-
246- answer_url = "http://www.zhihu.com" + soup .find_all ("a" , class_ = "answer-date-link" )[j ]["href" ]
247-
248- answer = soup .find_all ("div" , class_ = " zm-editable-content clearfix" )[j ]
249- soup .body .extract ()
250- soup .head .insert_after (soup .new_tag ("body" , ** {'class' : 'zhi' }))
251- soup .body .append (answer )
252- img_list = soup .find_all ("img" , class_ = "content_image lazy" )
253- for img in img_list :
254- img ["src" ] = img ["data-actualsrc" ]
255- img_list = soup .find_all ("img" , class_ = "origin_image zh-lightbox-thumb lazy" )
256- for img in img_list :
257- img ["src" ] = img ["data-actualsrc" ]
258- noscript_list = soup .find_all ("noscript" )
259- for noscript in noscript_list :
260- noscript .extract ()
261- content = soup
262- answer = Answer (answer_url , self , author , upvote , content )
263- yield answer
242+ count = soup .find_all ("span" , class_ = "count" )[j ].string
243+ if count [- 1 ] == "K" :
244+ upvote = int (count [0 :(len (count ) - 1 )]) * 1000
245+ elif count [- 1 ] == "W" :
246+ upvote = int (count [0 :(len (count ) - 1 )]) * 10000
247+ else :
248+ upvote = int (count )
249+
250+ answer_url = "http://www.zhihu.com" + soup .find_all ("a" , class_ = "answer-date-link" )[j ]["href" ]
251+
252+ answer = soup .find_all ("div" , class_ = " zm-editable-content clearfix" )[j - error_answer_count ]
253+ soup .body .extract ()
254+ soup .head .insert_after (soup .new_tag ("body" , ** {'class' : 'zhi' }))
255+ soup .body .append (answer )
256+ img_list = soup .find_all ("img" , class_ = "content_image lazy" )
257+ for img in img_list :
258+ img ["src" ] = img ["data-actualsrc" ]
259+ img_list = soup .find_all ("img" , class_ = "origin_image zh-lightbox-thumb lazy" )
260+ for img in img_list :
261+ img ["src" ] = img ["data-actualsrc" ]
262+ noscript_list = soup .find_all ("noscript" )
263+ for noscript in noscript_list :
264+ noscript .extract ()
265+ content = soup
266+ answer = Answer (answer_url , self , author , upvote , content )
267+ yield answer
268+ except Exception , e :
269+ print e
264270 else :
265271 s = session
266272 post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
@@ -291,43 +297,45 @@ def get_all_answers(self):
291297 soup = BeautifulSoup (self .soup .encode ("utf-8" ))
292298
293299 answer_soup = BeautifulSoup (answer_list [j ])
300+ try :
301+ author = None
302+ if answer_soup .find ("h3" , class_ = "zm-item-answer-author-wrap" ).string == u"匿名用户" :
303+ author_url = None
304+ author = User (author_url )
305+ else :
306+ author_tag = answer_soup .find ("h3" , class_ = "zm-item-answer-author-wrap" ).find_all ("a" )[1 ]
307+ author_id = author_tag .string .encode ("utf-8" )
308+ author_url = "http://www.zhihu.com" + author_tag ["href" ]
309+ author = User (author_url , author_id )
294310
295- author = None
296- if answer_soup .find ("h3" , class_ = "zm-item-answer-author-wrap" ).string == u"匿名用户" :
297- author_url = None
298- author = User (author_url )
299- else :
300- author_tag = answer_soup .find ("h3" , class_ = "zm-item-answer-author-wrap" ).find_all ("a" )[1 ]
301- author_id = author_tag .string .encode ("utf-8" )
302- author_url = "http://www.zhihu.com" + author_tag ["href" ]
303- author = User (author_url , author_id )
304-
305- count = answer_soup .find ("span" , class_ = "count" ).string
306- if count [- 1 ] == "K" :
307- upvote = int (count [0 :(len (count ) - 1 )]) * 1000
308- elif count [- 1 ] == "W" :
309- upvote = int (count [0 :(len (count ) - 1 )]) * 10000
310- else :
311- upvote = int (count )
312-
313- answer_url = "http://www.zhihu.com" + answer_soup .find ("a" , class_ = "answer-date-link" )["href" ]
314-
315- answer = answer_soup .find ("div" , class_ = " zm-editable-content clearfix" )
316- soup .body .extract ()
317- soup .head .insert_after (soup .new_tag ("body" , ** {'class' : 'zhi' }))
318- soup .body .append (answer )
319- img_list = soup .find_all ("img" , class_ = "content_image lazy" )
320- for img in img_list :
321- img ["src" ] = img ["data-actualsrc" ]
322- img_list = soup .find_all ("img" , class_ = "origin_image zh-lightbox-thumb lazy" )
323- for img in img_list :
324- img ["src" ] = img ["data-actualsrc" ]
325- noscript_list = soup .find_all ("noscript" )
326- for noscript in noscript_list :
327- noscript .extract ()
328- content = soup
329- answer = Answer (answer_url , self , author , upvote , content )
330- yield answer
311+ count = answer_soup .find ("span" , class_ = "count" ).string
312+ if count [- 1 ] == "K" :
313+ upvote = int (count [0 :(len (count ) - 1 )]) * 1000
314+ elif count [- 1 ] == "W" :
315+ upvote = int (count [0 :(len (count ) - 1 )]) * 10000
316+ else :
317+ upvote = int (count )
318+
319+ answer_url = "http://www.zhihu.com" + answer_soup .find ("a" , class_ = "answer-date-link" )["href" ]
320+
321+ answer = answer_soup .find ("div" , class_ = " zm-editable-content clearfix" )
322+ soup .body .extract ()
323+ soup .head .insert_after (soup .new_tag ("body" , ** {'class' : 'zhi' }))
324+ soup .body .append (answer )
325+ img_list = soup .find_all ("img" , class_ = "content_image lazy" )
326+ for img in img_list :
327+ img ["src" ] = img ["data-actualsrc" ]
328+ img_list = soup .find_all ("img" , class_ = "origin_image zh-lightbox-thumb lazy" )
329+ for img in img_list :
330+ img ["src" ] = img ["data-actualsrc" ]
331+ noscript_list = soup .find_all ("noscript" )
332+ for noscript in noscript_list :
333+ noscript .extract ()
334+ content = soup
335+ answer = Answer (answer_url , self , author , upvote , content )
336+ yield answer
337+ except Exception , e :
338+ print e
331339
332340 def get_top_i_answers (self , n ):
333341 # if n > self.get_answers_num():
0 commit comments