Skip to content

Commit 9b4dd5a

Browse files
committed
修复 issue 9 中的 bug
1 parent: 511f4f6 · commit: 9b4dd5a

1 file changed

Lines changed: 82 additions & 74 deletions

File tree

zhihu.py

Lines changed: 82 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -90,8 +90,8 @@ def create_session():
9090
r = s.post('http://www.zhihu.com/login/email', data=login_data, headers=header)
9191
if r.json()["r"] == 1:
9292
print "Login Failed, reason is:"
93-
for m in r.json()["msg"]:
94-
print r.json()["msg"][m]
93+
for m in r.json()["data"]:
94+
print r.json()["data"][m]
9595
print "Use cookies"
9696
has_cookies = False
9797
for key in cookies:
@@ -218,49 +218,55 @@ def get_all_answers(self):
218218
return
219219
yield
220220
else:
221+
error_answer_count = 0
221222
for i in xrange((answers_num - 1) / 50 + 1):
222223
if i == 0:
223224
for j in xrange(min(answers_num, 50)):
224225
if self.soup == None:
225226
self.parser()
226227
soup = BeautifulSoup(self.soup.encode("utf-8"))
228+
try:
229+
if soup.find_all("div", class_="zm-item-answer ")[j].find("div", class_=" zm-editable-content clearfix") == None:
230+
error_answer_count += 1
231+
continue
232+
author = None
233+
if soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].string == u"匿名用户":
234+
author_url = None
235+
author = User(author_url)
236+
else:
237+
author_tag = soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].find_all("a")[1]
238+
author_id = author_tag.string.encode("utf-8")
239+
author_url = "http://www.zhihu.com" + author_tag["href"]
240+
author = User(author_url, author_id)
227241

228-
author = None
229-
if soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].string == u"匿名用户":
230-
author_url = None
231-
author = User(author_url)
232-
else:
233-
author_tag = soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].find_all("a")[1]
234-
author_id = author_tag.string.encode("utf-8")
235-
author_url = "http://www.zhihu.com" + author_tag["href"]
236-
author = User(author_url, author_id)
237-
238-
count = soup.find_all("span", class_="count")[j].string
239-
if count[-1] == "K":
240-
upvote = int(count[0:(len(count) - 1)]) * 1000
241-
elif count[-1] == "W":
242-
upvote = int(count[0:(len(count) - 1)]) * 10000
243-
else:
244-
upvote = int(count)
245-
246-
answer_url = "http://www.zhihu.com" + soup.find_all("a", class_="answer-date-link")[j]["href"]
247-
248-
answer = soup.find_all("div", class_=" zm-editable-content clearfix")[j]
249-
soup.body.extract()
250-
soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
251-
soup.body.append(answer)
252-
img_list = soup.find_all("img", class_="content_image lazy")
253-
for img in img_list:
254-
img["src"] = img["data-actualsrc"]
255-
img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
256-
for img in img_list:
257-
img["src"] = img["data-actualsrc"]
258-
noscript_list = soup.find_all("noscript")
259-
for noscript in noscript_list:
260-
noscript.extract()
261-
content = soup
262-
answer = Answer(answer_url, self, author, upvote, content)
263-
yield answer
242+
count = soup.find_all("span", class_="count")[j].string
243+
if count[-1] == "K":
244+
upvote = int(count[0:(len(count) - 1)]) * 1000
245+
elif count[-1] == "W":
246+
upvote = int(count[0:(len(count) - 1)]) * 10000
247+
else:
248+
upvote = int(count)
249+
250+
answer_url = "http://www.zhihu.com" + soup.find_all("a", class_="answer-date-link")[j]["href"]
251+
252+
answer = soup.find_all("div", class_=" zm-editable-content clearfix")[j - error_answer_count]
253+
soup.body.extract()
254+
soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
255+
soup.body.append(answer)
256+
img_list = soup.find_all("img", class_="content_image lazy")
257+
for img in img_list:
258+
img["src"] = img["data-actualsrc"]
259+
img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
260+
for img in img_list:
261+
img["src"] = img["data-actualsrc"]
262+
noscript_list = soup.find_all("noscript")
263+
for noscript in noscript_list:
264+
noscript.extract()
265+
content = soup
266+
answer = Answer(answer_url, self, author, upvote, content)
267+
yield answer
268+
except Exception, e:
269+
print e
264270
else:
265271
s = session
266272
post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
@@ -291,43 +297,45 @@ def get_all_answers(self):
291297
soup = BeautifulSoup(self.soup.encode("utf-8"))
292298

293299
answer_soup = BeautifulSoup(answer_list[j])
300+
try:
301+
author = None
302+
if answer_soup.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户":
303+
author_url = None
304+
author = User(author_url)
305+
else:
306+
author_tag = answer_soup.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[1]
307+
author_id = author_tag.string.encode("utf-8")
308+
author_url = "http://www.zhihu.com" + author_tag["href"]
309+
author = User(author_url, author_id)
294310

295-
author = None
296-
if answer_soup.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户":
297-
author_url = None
298-
author = User(author_url)
299-
else:
300-
author_tag = answer_soup.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[1]
301-
author_id = author_tag.string.encode("utf-8")
302-
author_url = "http://www.zhihu.com" + author_tag["href"]
303-
author = User(author_url, author_id)
304-
305-
count = answer_soup.find("span", class_="count").string
306-
if count[-1] == "K":
307-
upvote = int(count[0:(len(count) - 1)]) * 1000
308-
elif count[-1] == "W":
309-
upvote = int(count[0:(len(count) - 1)]) * 10000
310-
else:
311-
upvote = int(count)
312-
313-
answer_url = "http://www.zhihu.com" + answer_soup.find("a", class_="answer-date-link")["href"]
314-
315-
answer = answer_soup.find("div", class_=" zm-editable-content clearfix")
316-
soup.body.extract()
317-
soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
318-
soup.body.append(answer)
319-
img_list = soup.find_all("img", class_="content_image lazy")
320-
for img in img_list:
321-
img["src"] = img["data-actualsrc"]
322-
img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
323-
for img in img_list:
324-
img["src"] = img["data-actualsrc"]
325-
noscript_list = soup.find_all("noscript")
326-
for noscript in noscript_list:
327-
noscript.extract()
328-
content = soup
329-
answer = Answer(answer_url, self, author, upvote, content)
330-
yield answer
311+
count = answer_soup.find("span", class_="count").string
312+
if count[-1] == "K":
313+
upvote = int(count[0:(len(count) - 1)]) * 1000
314+
elif count[-1] == "W":
315+
upvote = int(count[0:(len(count) - 1)]) * 10000
316+
else:
317+
upvote = int(count)
318+
319+
answer_url = "http://www.zhihu.com" + answer_soup.find("a", class_="answer-date-link")["href"]
320+
321+
answer = answer_soup.find("div", class_=" zm-editable-content clearfix")
322+
soup.body.extract()
323+
soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
324+
soup.body.append(answer)
325+
img_list = soup.find_all("img", class_="content_image lazy")
326+
for img in img_list:
327+
img["src"] = img["data-actualsrc"]
328+
img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
329+
for img in img_list:
330+
img["src"] = img["data-actualsrc"]
331+
noscript_list = soup.find_all("noscript")
332+
for noscript in noscript_list:
333+
noscript.extract()
334+
content = soup
335+
answer = Answer(answer_url, self, author, upvote, content)
336+
yield answer
337+
except Exception, e:
338+
print e
331339

332340
def get_top_i_answers(self, n):
333341
# if n > self.get_answers_num():

0 commit comments

Comments
 (0)