top

egrcc · egrcc · commit a2ddd54d4a84 · 2014-12-15T17:11:07.000+08:00
diff --git a/README.rst b/README.rst
@@ -28,7 +28,7 @@ zhihu-python 采用 python2.7 编写，用来方便地获取知乎上各种内
 ---------
 
 zhihu-python 主要文件为 zhihu.py ，配置文件为 config.ini , 将这两个文件下载到你的工作目录，并修改
-config.ini 文件中的 email 为你的知乎账户邮箱，修改 password 为你的知乎账户密码。
+config.ini 文件中的 email 为你的知乎账户邮箱，修改 password 为你的知乎账户密码（用作模拟登录）。
 
  
 Question：获取问题信息
@@ -65,6 +65,8 @@ Question 代表一个问题，处理知乎问题相关操作。创建一个 Ques
     topics = question.get_topics()
     # 获取排名第一的回答
     top_answer = question.get_top_answer()
+    # 获取排名前十的十个回答
+    top_answers = question.get_top_i_answers(10)
     # 获取所有回答
     answers = question.get_all_answers()
     
@@ -85,6 +87,9 @@ Question 代表一个问题，处理知乎问题相关操作。创建一个 Ques
     print top_answer 
     # 输出：<zhihu.Answer instance at 0x7f8b6582d0e0>
     # 一个Answer类对象
+    print top_answers
+    # 输出：<generator object get_top_i_answers at 0x7fed676eb320>
+    # 代表前十的Answer的生成器
     print answers 
     # 输出：<generator object get_all_answer at 0x7f8b66ba30a0>
     # 代表所有Answer的生成器
@@ -234,6 +239,8 @@ Collection 代表一个收藏夹，处理收藏夹相关操作。创建一个 Co
     creator = collection.get_creator()
     # 获取该收藏夹的名字
     name = collection.get_name()
+    # 获取该收藏夹下的前十个答案
+    top_answers = collection.get_top_i_answers(10)
     # 获取该收藏夹下的所有答案
     answers = collection.get_all_answers()
     
@@ -242,6 +249,9 @@ Collection 代表一个收藏夹，处理收藏夹相关操作。创建一个 Co
     # 一个User对象
     print creator.get_user_id() # 稷黍
     print name # 给你一个不同的视角
+    print top_answers
+    # <generator object get_top_i_answers at 0x7f378465dc80>
+    # 代表前十个答案的生成器对象
     print answers 
     # <generator object get_all_answer at 0x7fe12a29b280>
     # 代表所有答案的生成器对象
@@ -250,7 +260,38 @@ Collection 代表一个收藏夹，处理收藏夹相关操作。创建一个 Co
 综合实例
 ~~~~~~~~~~~~~~~
 
-有待添加
+将 Question ， Answer ， User ， Collection 结合起来使用。实例如下：
+
+.. code-block:: python
+
+    # -*- coding: utf-8 -*-
+    from zhihu import Question
+    from zhihu import Answer
+    from zhihu import User
+    from zhihu import Collection
+    
+    url = "http://www.zhihu.com/question/24269892"
+    question = Question(url)
+    # 得到排名第一的答案
+    answer = question.get_top_answer()
+    # 得到排名第一的答案的作者
+    user = answer.get_author()
+    # 得到该作者回答过的所有问题的答案
+    user_answers = user.get_answers()
+    # 输出该作者回答过的所有问题的标题
+    for answer in user_answers:
+        print answer.get_question().get_title()
+    # 得到该用户的所有收藏夹
+    user_collections = user.get_collections()
+    for collection in user_collections:
+        # 输出每一个收藏夹的名字
+        print collection.get_name()
+        # 得到该收藏夹下的前十个回答
+        top_answers = collection.get_top_i_answers(10)
+        # 把答案内容转成txt，markdown
+        for answer in top_answers:
+            answer.to_txt()
+            answer.to_md()
 
 
 
diff --git a/test.py b/test.py
@@ -21,6 +21,8 @@ def question_test(url):
     topics = question.get_topics()
     # 获取排名第一的回答
     top_answer = question.get_top_answer()
+    # 获取排名前十的十个回答
+    top_answers = question.get_top_i_answers(10)
     # 获取所有回答
     answers = question.get_all_answers()
 
@@ -39,6 +41,7 @@ def question_test(url):
     for topic in topics:
         print topic , # 输出：情感克制 现实 社会 个人经历
     print top_answer # 输出：<zhihu.Answer instance at 0x7f8b6582d0e0>（Answer类对象）
+    print top_answers # 输出：<generator object get_top_i_answers at 0x7fed676eb320>（代表前十的Answer的生成器）
     print answers # 输出：<generator object get_all_answer at 0x7f8b66ba30a0>（代表所有Answer的生成器）
 
 
@@ -132,6 +135,8 @@ def collection_test(collection_url):
     creator = collection.get_creator()
     # 获取该收藏夹的名字
     name = collection.get_name()
+    # 获取该收藏夹下的前十个答案
+    top_answers = collection.get_top_i_answers(10)
     # 获取该收藏夹下的所有答案
     answers = collection.get_all_answers()
 
@@ -140,10 +145,37 @@ def collection_test(collection_url):
     # 一个User对象
     print creator.get_user_id() # 稷黍
     print name # 给你一个不同的视角
+    print top_answers
+    # <generator object get_top_i_answers at 0x7f378465dc80>
+    # 代表前十个答案的生成器对象
     print answers 
     # <generator object get_all_answer at 0x7fe12a29b280>
     # 代表所有答案的生成器对象
 
+def test():
+    url = "http://www.zhihu.com/question/24269892"
+    question = Question(url)
+    # 得到排名第一的答案
+    answer = question.get_top_answer()
+    # 得到排名第一的答案的作者
+    user = answer.get_author()
+    # 得到该作者回答过的所有问题的答案
+    user_answers = user.get_answers()
+    # 输出该作者回答过的所有问题的标题
+    for answer in user_answers:
+        print answer.get_question().get_title()
+    # 得到该用户的所有收藏夹
+    user_collections = user.get_collections()
+    for collection in user_collections:
+        # 输出每一个收藏夹的名字
+        print collection.get_name()
+        # 得到该收藏夹下的前十个回答
+        top_answers = collection.get_top_i_answers(10)
+        # 把答案内容转成txt，markdown
+        for answer in top_answers:
+            answer.to_txt()
+            answer.to_md()
+
 def main():
     url = "http://www.zhihu.com/question/24269892"
     question_test(url)
@@ -153,6 +185,7 @@ def main():
     user_test(user_url)
     collection_url = "http://www.zhihu.com/collection/36750683"
     collection_test(collection_url)
+    test()
 
 if __name__ == '__main__':
     main()
diff --git a/zhihu.py b/zhihu.py
@@ -18,9 +18,12 @@ class Question:
 
     def __init__(self, url, title = None):
         
-        self.url = url
-        if title != None:
-            self.title = title
+        if url[0:len(url) - 8] != "http://www.zhihu.com/question/":
+            raise ValueError("\"" + url + "\"" + " : it isn't a question url.")
+        else:     
+            self.url = url
+            if title != None:
+                self.title = title
 
     def create_session(self):
         cf = ConfigParser.ConfigParser()
@@ -84,45 +87,45 @@ def get_topics(self):
             topics.append(i.contents[0].encode("utf-8").replace("\n", ""))
         return topics
 
-    def get_top_answer(self):
-
-        if self.get_answers_num() == 0:
-            print "No answer."
-            return 
-        else:
-            if self.soup == None:
-                self.parser()
-            soup = BeautifulSoup(self.soup.encode("utf-8"))
-            author = None
-            if soup.find("h3", class_ = "zm-item-answer-author-wrap") == u"匿名用户":
-                author_url = None
-                author = User(author_url)
-            else:
-                author_tag = soup.find("h3", class_ = "zm-item-answer-author-wrap").find_all("a")[1]
-                author_id = author_tag.string.encode("utf-8")
-                author_url = "http://www.zhihu.com" + author_tag["href"]
-                author = User(author_url, author_id)
-
-            count = soup.find("span", class_ = "count").string
-            if count[-1] == "K":
-                upvote = int(count[0:(len(count) - 1)]) * 1000
-            elif count[-1] == "W":
-                upvote = int(count[0:(len(count) - 1)]) * 10000
-            else:
-                upvote = int(count)
-
-            answer_url = "http://www.zhihu.com" + soup.find("a", class_ = "answer-date-link")["href"]
+    # def get_top_answer(self):
 
-            top_answer = soup.find("div", class_ = " zm-editable-content clearfix")
-            soup.body.extract()
-            soup.head.insert_after(soup.new_tag("body", **{'class':'zhi'}))
-            soup.body.append(top_answer)
-            img_list = soup.find_all("img", class_ = "content_image lazy")
-            for img in img_list:
-                img["src"] = img["data-actualsrc"]
-            content = soup
-            answer = Answer(answer_url, self, author, upvote, content)
-            return answer
+    #     if self.get_answers_num() == 0:
+    #         print "No answer."
+    #         return 
+    #     else:
+    #         if self.soup == None:
+    #             self.parser()
+    #         soup = BeautifulSoup(self.soup.encode("utf-8"))
+    #         author = None
+    #         if soup.find("h3", class_ = "zm-item-answer-author-wrap") == u"匿名用户":
+    #             author_url = None
+    #             author = User(author_url)
+    #         else:
+    #             author_tag = soup.find("h3", class_ = "zm-item-answer-author-wrap").find_all("a")[1]
+    #             author_id = author_tag.string.encode("utf-8")
+    #             author_url = "http://www.zhihu.com" + author_tag["href"]
+    #             author = User(author_url, author_id)
+
+    #         count = soup.find("span", class_ = "count").string
+    #         if count[-1] == "K":
+    #             upvote = int(count[0:(len(count) - 1)]) * 1000
+    #         elif count[-1] == "W":
+    #             upvote = int(count[0:(len(count) - 1)]) * 10000
+    #         else:
+    #             upvote = int(count)
+
+    #         answer_url = "http://www.zhihu.com" + soup.find("a", class_ = "answer-date-link")["href"]
+
+    #         top_answer = soup.find("div", class_ = " zm-editable-content clearfix")
+    #         soup.body.extract()
+    #         soup.head.insert_after(soup.new_tag("body", **{'class':'zhi'}))
+    #         soup.body.append(top_answer)
+    #         img_list = soup.find_all("img", class_ = "content_image lazy")
+    #         for img in img_list:
+    #             img["src"] = img["data-actualsrc"]
+    #         content = soup
+    #         answer = Answer(answer_url, self, author, upvote, content)
+    #         return answer
 
     def get_all_answers(self):
         if self.get_answers_num() == 0:
@@ -222,6 +225,21 @@ def get_all_answers(self):
                         answer = Answer(answer_url, self, author, upvote, content)
                         yield answer
 
+    def get_top_i_answers(self, i):
+        # if i > self.get_answers_num():
+        #     i = self.get_answers_num()
+        j = 0
+        answers = self.get_all_answers()
+        for answer in answers:
+            j = j + 1
+            if j > i:
+                break
+            yield answer
+
+    def get_top_answer(self):
+        for answer in self.get_top_i_answers(1):
+            return answer
+
 
 class User:
 
@@ -232,6 +250,8 @@ class User:
     def __init__(self, user_url, user_id = None):
         if user_url == None:
             self.user_id = "匿名用户"
+        elif user_url[0:28] != "http://www.zhihu.com/people/":
+            raise ValueError("\"" + user_url + "\"" + " : it isn't a user url.")
         else:
             self.user_url = user_url
             if user_id != None:
@@ -257,7 +277,7 @@ def parser(self):
 
     def get_user_id(self):
         if self.user_url == None:
-            print "I'm anonymous user."
+            # print "I'm anonymous user."
             return "匿名用户"
         else:
             if hasattr(self, "user_id"):
@@ -627,6 +647,7 @@ def to_txt(self):
             if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "text"))):
                 os.makedirs(os.path.join(os.path.join(os.getcwd(), "text")))
             file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
+            print file_name
             if os.path.exists(os.path.join(os.path.join(os.getcwd(), "text"), file_name)):
                 f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "a")
                 f.write("\n\n")
@@ -637,6 +658,7 @@ def to_txt(self):
             if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "text"))):
                 os.makedirs(os.path.join(os.path.join(os.getcwd(), "text")))
             file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
+            print file_name
             f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "wt")
             f.write(self.get_question().get_title() + "\n\n")
         f.write("作者: " + self.get_author().get_user_id() + "  赞同: " + str(self.get_upvote()) + "\n\n")
@@ -657,6 +679,7 @@ def to_md(self):
         content = self.get_content()
         if self.get_author().get_user_id() == "匿名用户":
             file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
+            print file_name
             if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "markdown"))):
                 os.makedirs(os.path.join(os.path.join(os.getcwd(), "markdown")))
             if os.path.exists(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name)):
@@ -669,6 +692,7 @@ def to_md(self):
             if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "markdown"))):
                 os.makedirs(os.path.join(os.path.join(os.getcwd(), "markdown")))
             file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
+            print file_name
             f = open(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name), "wt")
             f.write("# " + self.get_question().get_title() + "\n")
         f.write("## 作者: " + self.get_author().get_user_id() + "  赞同: " + str(self.get_upvote()) + "\n")
@@ -684,12 +708,16 @@ class Collection:
     session = None
     soup = None
 
-    def __init__(self, url, name = None, creator = None):      
-        self.url = url
-        if name != None:
-            self.name = name
-        if creator != None:
-            self.creator = creator
+    def __init__(self, url, name = None, creator = None):
+
+        if url[0:len(url) - 8] != "http://www.zhihu.com/collection/":
+            raise ValueError("\"" + url + "\"" + " : it isn't a collection url.")
+        else:
+            self.url = url
+            if name != None:
+                self.name = name
+            if creator != None:
+                self.creator = creator
 
     def create_session(self):
         cf = ConfigParser.ConfigParser()
@@ -790,4 +818,11 @@ def get_all_answers(self):
                         yield Answer(answer_url, question, author)
                 i = i + 1
 
-
+    def get_top_i_answers(self, i):
+        j = 0
+        answers = self.get_all_answers()
+        for answer in answers:
+            j = j + 1
+            if j > i:
+                break
+            yield answer