修复 issue 9 中的 bug

egrcc · egrcc · commit 9b4dd5a3d551 · 2015-07-13T21:39:19.000+08:00
diff --git a/zhihu.py b/zhihu.py
@@ -90,8 +90,8 @@ def create_session():
     r = s.post('http://www.zhihu.com/login/email', data=login_data, headers=header)
     if r.json()["r"] == 1:
         print "Login Failed, reason is:"
-        for m in r.json()["msg"]:
-            print r.json()["msg"][m]
+        for m in r.json()["data"]:
+            print r.json()["data"][m]
         print "Use cookies"
         has_cookies = False
         for key in cookies:
@@ -218,49 +218,55 @@ def get_all_answers(self):
             return
             yield
         else:
+            error_answer_count = 0
             for i in xrange((answers_num - 1) / 50 + 1):
                 if i == 0:
                     for j in xrange(min(answers_num, 50)):
                         if self.soup == None:
                             self.parser()
                         soup = BeautifulSoup(self.soup.encode("utf-8"))
+                        try:
+                            if soup.find_all("div", class_="zm-item-answer ")[j].find("div", class_=" zm-editable-content clearfix") == None:
+                                error_answer_count += 1
+                                continue
+                            author = None
+                            if soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].string == u"匿名用户":
+                                author_url = None
+                                author = User(author_url)
+                            else:
+                                author_tag = soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].find_all("a")[1]
+                                author_id = author_tag.string.encode("utf-8")
+                                author_url = "http://www.zhihu.com" + author_tag["href"]
+                                author = User(author_url, author_id)
 
-                        author = None
-                        if soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].string == u"匿名用户":
-                            author_url = None
-                            author = User(author_url)
-                        else:
-                            author_tag = soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].find_all("a")[1]
-                            author_id = author_tag.string.encode("utf-8")
-                            author_url = "http://www.zhihu.com" + author_tag["href"]
-                            author = User(author_url, author_id)
-
-                        count = soup.find_all("span", class_="count")[j].string
-                        if count[-1] == "K":
-                            upvote = int(count[0:(len(count) - 1)]) * 1000
-                        elif count[-1] == "W":
-                            upvote = int(count[0:(len(count) - 1)]) * 10000
-                        else:
-                            upvote = int(count)
-
-                        answer_url = "http://www.zhihu.com" + soup.find_all("a", class_="answer-date-link")[j]["href"]
-
-                        answer = soup.find_all("div", class_=" zm-editable-content clearfix")[j]
-                        soup.body.extract()
-                        soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
-                        soup.body.append(answer)
-                        img_list = soup.find_all("img", class_="content_image lazy")
-                        for img in img_list:
-                            img["src"] = img["data-actualsrc"]
-                        img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
-                        for img in img_list:
-                            img["src"] = img["data-actualsrc"]
-                        noscript_list = soup.find_all("noscript")
-                        for noscript in noscript_list:
-                            noscript.extract()
-                        content = soup
-                        answer = Answer(answer_url, self, author, upvote, content)
-                        yield answer
+                            count = soup.find_all("span", class_="count")[j].string
+                            if count[-1] == "K":
+                                upvote = int(count[0:(len(count) - 1)]) * 1000
+                            elif count[-1] == "W":
+                                upvote = int(count[0:(len(count) - 1)]) * 10000
+                            else:
+                                upvote = int(count)
+
+                            answer_url = "http://www.zhihu.com" + soup.find_all("a", class_="answer-date-link")[j]["href"]
+
+                            answer = soup.find_all("div", class_=" zm-editable-content clearfix")[j - error_answer_count]
+                            soup.body.extract()
+                            soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
+                            soup.body.append(answer)
+                            img_list = soup.find_all("img", class_="content_image lazy")
+                            for img in img_list:
+                                img["src"] = img["data-actualsrc"]
+                            img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
+                            for img in img_list:
+                                img["src"] = img["data-actualsrc"]
+                            noscript_list = soup.find_all("noscript")
+                            for noscript in noscript_list:
+                                noscript.extract()
+                            content = soup
+                            answer = Answer(answer_url, self, author, upvote, content)
+                            yield answer
+                        except Exception, e:
+                            print e
                 else:
                     s = session
                     post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
@@ -291,43 +297,45 @@ def get_all_answers(self):
                         soup = BeautifulSoup(self.soup.encode("utf-8"))
 
                         answer_soup = BeautifulSoup(answer_list[j])
+                        try:
+                            author = None
+                            if answer_soup.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户":
+                                author_url = None
+                                author = User(author_url)
+                            else:
+                                author_tag = answer_soup.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[1]
+                                author_id = author_tag.string.encode("utf-8")
+                                author_url = "http://www.zhihu.com" + author_tag["href"]
+                                author = User(author_url, author_id)
 
-                        author = None
-                        if answer_soup.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户":
-                            author_url = None
-                            author = User(author_url)
-                        else:
-                            author_tag = answer_soup.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[1]
-                            author_id = author_tag.string.encode("utf-8")
-                            author_url = "http://www.zhihu.com" + author_tag["href"]
-                            author = User(author_url, author_id)
-
-                        count = answer_soup.find("span", class_="count").string
-                        if count[-1] == "K":
-                            upvote = int(count[0:(len(count) - 1)]) * 1000
-                        elif count[-1] == "W":
-                            upvote = int(count[0:(len(count) - 1)]) * 10000
-                        else:
-                            upvote = int(count)
-
-                        answer_url = "http://www.zhihu.com" + answer_soup.find("a", class_="answer-date-link")["href"]
-
-                        answer = answer_soup.find("div", class_=" zm-editable-content clearfix")
-                        soup.body.extract()
-                        soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
-                        soup.body.append(answer)
-                        img_list = soup.find_all("img", class_="content_image lazy")
-                        for img in img_list:
-                            img["src"] = img["data-actualsrc"]
-                        img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
-                        for img in img_list:
-                            img["src"] = img["data-actualsrc"]
-                        noscript_list = soup.find_all("noscript")
-                        for noscript in noscript_list:
-                            noscript.extract()
-                        content = soup
-                        answer = Answer(answer_url, self, author, upvote, content)
-                        yield answer
+                            count = answer_soup.find("span", class_="count").string
+                            if count[-1] == "K":
+                                upvote = int(count[0:(len(count) - 1)]) * 1000
+                            elif count[-1] == "W":
+                                upvote = int(count[0:(len(count) - 1)]) * 10000
+                            else:
+                                upvote = int(count)
+
+                            answer_url = "http://www.zhihu.com" + answer_soup.find("a", class_="answer-date-link")["href"]
+
+                            answer = answer_soup.find("div", class_=" zm-editable-content clearfix")
+                            soup.body.extract()
+                            soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
+                            soup.body.append(answer)
+                            img_list = soup.find_all("img", class_="content_image lazy")
+                            for img in img_list:
+                                img["src"] = img["data-actualsrc"]
+                            img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
+                            for img in img_list:
+                                img["src"] = img["data-actualsrc"]
+                            noscript_list = soup.find_all("noscript")
+                            for noscript in noscript_list:
+                                noscript.extract()
+                            content = soup
+                            answer = Answer(answer_url, self, author, upvote, content)
+                            yield answer
+                        except Exception, e:
+                            print e
 
     def get_top_i_answers(self, n):
         # if n > self.get_answers_num():