Skip to content

Commit a2ddd54

Browse files
committed
top
1 parent 9c4149f commit a2ddd54

File tree

3 files changed

+160
-51
lines changed

3 files changed

+160
-51
lines changed

README.rst

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ zhihu-python 采用 python2.7 编写,用来方便地获取知乎上各种内
2828
---------
2929

3030
zhihu-python 主要文件为 zhihu.py ,配置文件为 config.ini , 将这两个文件下载到你的工作目录,并修改
31-
config.ini 文件中的 email 为你的知乎账户邮箱,修改 password 为你的知乎账户密码。
31+
config.ini 文件中的 email 为你的知乎账户邮箱,修改 password 为你的知乎账户密码(用作模拟登录)。
3232

3333

3434
Question:获取问题信息
@@ -65,6 +65,8 @@ Question 代表一个问题,处理知乎问题相关操作。创建一个 Ques
6565
topics = question.get_topics()
6666
# 获取排名第一的回答
6767
top_answer = question.get_top_answer()
68+
# 获取排名前十的十个回答
69+
top_answers = question.get_top_i_answers(10)
6870
# 获取所有回答
6971
answers = question.get_all_answers()
7072
@@ -85,6 +87,9 @@ Question 代表一个问题,处理知乎问题相关操作。创建一个 Ques
8587
print top_answer
8688
# 输出:<zhihu.Answer instance at 0x7f8b6582d0e0>
8789
# 一个Answer类对象
90+
print top_answers
91+
# 输出:<generator object get_top_i_answers at 0x7fed676eb320>
92+
# 代表前十的Answer的生成器
8893
print answers
8994
# 输出:<generator object get_all_answer at 0x7f8b66ba30a0>
9095
# 代表所有Answer的生成器
@@ -234,6 +239,8 @@ Collection 代表一个收藏夹,处理收藏夹相关操作。创建一个 Co
234239
creator = collection.get_creator()
235240
# 获取该收藏夹的名字
236241
name = collection.get_name()
242+
# 获取该收藏夹下的前十个答案
243+
top_answers = collection.get_top_i_answers(10)
237244
# 获取该收藏夹下的所有答案
238245
answers = collection.get_all_answers()
239246
@@ -242,6 +249,9 @@ Collection 代表一个收藏夹,处理收藏夹相关操作。创建一个 Co
242249
# 一个User对象
243250
print creator.get_user_id() # 稷黍
244251
print name # 给你一个不同的视角
252+
print top_answers
253+
# <generator object get_top_i_answers at 0x7f378465dc80>
254+
# 代表前十个答案的生成器对象
245255
print answers
246256
# <generator object get_all_answer at 0x7fe12a29b280>
247257
# 代表所有答案的生成器对象
@@ -250,7 +260,38 @@ Collection 代表一个收藏夹,处理收藏夹相关操作。创建一个 Co
250260
综合实例
251261
~~~~~~~~~~~~~~~
252262

253-
有待添加
263+
将 Question , Answer , User , Collection 结合起来使用。实例如下:
264+
265+
.. code-block:: python
266+
267+
# -*- coding: utf-8 -*-
268+
from zhihu import Question
269+
from zhihu import Answer
270+
from zhihu import User
271+
from zhihu import Collection
272+
273+
url = "http://www.zhihu.com/question/24269892"
274+
question = Question(url)
275+
# 得到排名第一的答案
276+
answer = question.get_top_answer()
277+
# 得到排名第一的答案的作者
278+
user = answer.get_author()
279+
# 得到该作者回答过的所有问题的答案
280+
user_answers = user.get_answers()
281+
# 输出该作者回答过的所有问题的标题
282+
for answer in user_answers:
283+
print answer.get_question().get_title()
284+
# 得到该用户的所有收藏夹
285+
user_collections = user.get_collections()
286+
for collection in user_collections:
287+
# 输出每一个收藏夹的名字
288+
print collection.get_name()
289+
# 得到该收藏夹下的前十个回答
290+
top_answers = collection.get_top_i_answers(10)
291+
# 把答案内容转成txt,markdown
292+
for answer in top_answers:
293+
answer.to_txt()
294+
answer.to_md()
254295
255296
256297

test.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ def question_test(url):
2121
topics = question.get_topics()
2222
# 获取排名第一的回答
2323
top_answer = question.get_top_answer()
24+
# 获取排名前十的十个回答
25+
top_answers = question.get_top_i_answers(10)
2426
# 获取所有回答
2527
answers = question.get_all_answers()
2628

@@ -39,6 +41,7 @@ def question_test(url):
3941
for topic in topics:
4042
print topic , # 输出:情感克制 现实 社会 个人经历
4143
print top_answer # 输出:<zhihu.Answer instance at 0x7f8b6582d0e0>(Answer类对象)
44+
print top_answers # 输出:<generator object get_top_i_answers at 0x7fed676eb320>(代表前十的Answer的生成器)
4245
print answers # 输出:<generator object get_all_answer at 0x7f8b66ba30a0>(代表所有Answer的生成器)
4346

4447

@@ -132,6 +135,8 @@ def collection_test(collection_url):
132135
creator = collection.get_creator()
133136
# 获取该收藏夹的名字
134137
name = collection.get_name()
138+
# 获取该收藏夹下的前十个答案
139+
top_answers = collection.get_top_i_answers(10)
135140
# 获取该收藏夹下的所有答案
136141
answers = collection.get_all_answers()
137142

@@ -140,10 +145,37 @@ def collection_test(collection_url):
140145
# 一个User对象
141146
print creator.get_user_id() # 稷黍
142147
print name # 给你一个不同的视角
148+
print top_answers
149+
# <generator object get_top_i_answers at 0x7f378465dc80>
150+
# 代表前十个答案的生成器对象
143151
print answers
144152
# <generator object get_all_answer at 0x7fe12a29b280>
145153
# 代表所有答案的生成器对象
146154

155+
def test():
    """Combined walk-through: question -> top answer -> author -> collections.

    Prints the title of every question the top answer's author has answered,
    then, for each of that author's collections, prints its name and exports
    its first ten answers as both text and markdown files.
    """
    question_url = "http://www.zhihu.com/question/24269892"
    target_question = Question(question_url)

    # Author of the answer ranked first for the question.
    author = target_question.get_top_answer().get_author()

    # Title of every question this author has answered.
    for authored in author.get_answers():
        print(authored.get_question().get_title())

    # Every collection owned by the author.
    for favourites in author.get_collections():
        # Name of the collection.
        print(favourites.get_name())
        # Export the first ten answers of the collection as txt and markdown.
        for picked in favourites.get_top_i_answers(10):
            picked.to_txt()
            picked.to_md()
178+
147179
def main():
148180
url = "http://www.zhihu.com/question/24269892"
149181
question_test(url)
@@ -153,6 +185,7 @@ def main():
153185
user_test(user_url)
154186
collection_url = "http://www.zhihu.com/collection/36750683"
155187
collection_test(collection_url)
188+
test()
156189

157190
if __name__ == '__main__':
158191
main()

zhihu.py

Lines changed: 84 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@ class Question:
1818

1919
def __init__(self, url, title = None):
2020

21-
self.url = url
22-
if title != None:
23-
self.title = title
21+
if url[0:len(url) - 8] != "http://www.zhihu.com/question/":
22+
raise ValueError("\"" + url + "\"" + " : it isn't a question url.")
23+
else:
24+
self.url = url
25+
if title != None:
26+
self.title = title
2427

2528
def create_session(self):
2629
cf = ConfigParser.ConfigParser()
@@ -84,45 +87,45 @@ def get_topics(self):
8487
topics.append(i.contents[0].encode("utf-8").replace("\n", ""))
8588
return topics
8689

87-
def get_top_answer(self):
88-
89-
if self.get_answers_num() == 0:
90-
print "No answer."
91-
return
92-
else:
93-
if self.soup == None:
94-
self.parser()
95-
soup = BeautifulSoup(self.soup.encode("utf-8"))
96-
author = None
97-
if soup.find("h3", class_ = "zm-item-answer-author-wrap") == u"匿名用户":
98-
author_url = None
99-
author = User(author_url)
100-
else:
101-
author_tag = soup.find("h3", class_ = "zm-item-answer-author-wrap").find_all("a")[1]
102-
author_id = author_tag.string.encode("utf-8")
103-
author_url = "http://www.zhihu.com" + author_tag["href"]
104-
author = User(author_url, author_id)
105-
106-
count = soup.find("span", class_ = "count").string
107-
if count[-1] == "K":
108-
upvote = int(count[0:(len(count) - 1)]) * 1000
109-
elif count[-1] == "W":
110-
upvote = int(count[0:(len(count) - 1)]) * 10000
111-
else:
112-
upvote = int(count)
113-
114-
answer_url = "http://www.zhihu.com" + soup.find("a", class_ = "answer-date-link")["href"]
90+
# def get_top_answer(self):
11591

116-
top_answer = soup.find("div", class_ = " zm-editable-content clearfix")
117-
soup.body.extract()
118-
soup.head.insert_after(soup.new_tag("body", **{'class':'zhi'}))
119-
soup.body.append(top_answer)
120-
img_list = soup.find_all("img", class_ = "content_image lazy")
121-
for img in img_list:
122-
img["src"] = img["data-actualsrc"]
123-
content = soup
124-
answer = Answer(answer_url, self, author, upvote, content)
125-
return answer
92+
# if self.get_answers_num() == 0:
93+
# print "No answer."
94+
# return
95+
# else:
96+
# if self.soup == None:
97+
# self.parser()
98+
# soup = BeautifulSoup(self.soup.encode("utf-8"))
99+
# author = None
100+
# if soup.find("h3", class_ = "zm-item-answer-author-wrap") == u"匿名用户":
101+
# author_url = None
102+
# author = User(author_url)
103+
# else:
104+
# author_tag = soup.find("h3", class_ = "zm-item-answer-author-wrap").find_all("a")[1]
105+
# author_id = author_tag.string.encode("utf-8")
106+
# author_url = "http://www.zhihu.com" + author_tag["href"]
107+
# author = User(author_url, author_id)
108+
109+
# count = soup.find("span", class_ = "count").string
110+
# if count[-1] == "K":
111+
# upvote = int(count[0:(len(count) - 1)]) * 1000
112+
# elif count[-1] == "W":
113+
# upvote = int(count[0:(len(count) - 1)]) * 10000
114+
# else:
115+
# upvote = int(count)
116+
117+
# answer_url = "http://www.zhihu.com" + soup.find("a", class_ = "answer-date-link")["href"]
118+
119+
# top_answer = soup.find("div", class_ = " zm-editable-content clearfix")
120+
# soup.body.extract()
121+
# soup.head.insert_after(soup.new_tag("body", **{'class':'zhi'}))
122+
# soup.body.append(top_answer)
123+
# img_list = soup.find_all("img", class_ = "content_image lazy")
124+
# for img in img_list:
125+
# img["src"] = img["data-actualsrc"]
126+
# content = soup
127+
# answer = Answer(answer_url, self, author, upvote, content)
128+
# return answer
126129

127130
def get_all_answers(self):
128131
if self.get_answers_num() == 0:
@@ -222,6 +225,21 @@ def get_all_answers(self):
222225
answer = Answer(answer_url, self, author, upvote, content)
223226
yield answer
224227

228+
def get_top_i_answers(self, i):
    """Yield at most the first ``i`` answers from ``self.get_all_answers()``.

    :param i: maximum number of answers to yield (an integer count); zero or
        a negative value yields nothing.
    :return: a generator over the leading answers, in the order produced by
        ``get_all_answers()``. Fewer than ``i`` items are yielded when the
        underlying generator runs out first, so no clamping against the
        total number of answers is required.
    """
    from itertools import islice

    # islice stops as soon as ``i`` items have been produced, without pulling
    # item i + 1 from the underlying generator the way a manual
    # "count, then break" loop does.
    for answer in islice(self.get_all_answers(), max(i, 0)):
        yield answer
238+
239+
def get_top_answer(self):
    """Return the first answer from ``get_top_i_answers(1)``.

    :return: the first answer produced, or ``None`` when nothing is produced.
    """
    # next() with a default makes the "no answers" case explicit instead of
    # relying on falling off the end of a for-loop.
    return next(self.get_top_i_answers(1), None)
242+
225243

226244
class User:
227245

@@ -232,6 +250,8 @@ class User:
232250
def __init__(self, user_url, user_id = None):
233251
if user_url == None:
234252
self.user_id = "匿名用户"
253+
elif user_url[0:28] != "http://www.zhihu.com/people/":
254+
raise ValueError("\"" + user_url + "\"" + " : it isn't a user url.")
235255
else:
236256
self.user_url = user_url
237257
if user_id != None:
@@ -257,7 +277,7 @@ def parser(self):
257277

258278
def get_user_id(self):
259279
if self.user_url == None:
260-
print "I'm anonymous user."
280+
# print "I'm anonymous user."
261281
return "匿名用户"
262282
else:
263283
if hasattr(self, "user_id"):
@@ -627,6 +647,7 @@ def to_txt(self):
627647
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "text"))):
628648
os.makedirs(os.path.join(os.path.join(os.getcwd(), "text")))
629649
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
650+
print file_name
630651
if os.path.exists(os.path.join(os.path.join(os.getcwd(), "text"), file_name)):
631652
f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "a")
632653
f.write("\n\n")
@@ -637,6 +658,7 @@ def to_txt(self):
637658
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "text"))):
638659
os.makedirs(os.path.join(os.path.join(os.getcwd(), "text")))
639660
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
661+
print file_name
640662
f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "wt")
641663
f.write(self.get_question().get_title() + "\n\n")
642664
f.write("作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n\n")
@@ -657,6 +679,7 @@ def to_md(self):
657679
content = self.get_content()
658680
if self.get_author().get_user_id() == "匿名用户":
659681
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
682+
print file_name
660683
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "markdown"))):
661684
os.makedirs(os.path.join(os.path.join(os.getcwd(), "markdown")))
662685
if os.path.exists(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name)):
@@ -669,6 +692,7 @@ def to_md(self):
669692
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "markdown"))):
670693
os.makedirs(os.path.join(os.path.join(os.getcwd(), "markdown")))
671694
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
695+
print file_name
672696
f = open(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name), "wt")
673697
f.write("# " + self.get_question().get_title() + "\n")
674698
f.write("## 作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n")
@@ -684,12 +708,16 @@ class Collection:
684708
session = None
685709
soup = None
686710

687-
def __init__(self, url, name = None, creator = None):
688-
self.url = url
689-
if name != None:
690-
self.name = name
691-
if creator != None:
692-
self.creator = creator
711+
def __init__(self, url, name = None, creator = None):
712+
713+
if url[0:len(url) - 8] != "http://www.zhihu.com/collection/":
714+
raise ValueError("\"" + url + "\"" + " : it isn't a collection url.")
715+
else:
716+
self.url = url
717+
if name != None:
718+
self.name = name
719+
if creator != None:
720+
self.creator = creator
693721

694722
def create_session(self):
695723
cf = ConfigParser.ConfigParser()
@@ -790,4 +818,11 @@ def get_all_answers(self):
790818
yield Answer(answer_url, question, author)
791819
i = i + 1
792820

793-
821+
def get_top_i_answers(self, i):
    """Yield at most the first ``i`` answers from ``self.get_all_answers()``.

    :param i: maximum number of answers to yield (an integer count); zero or
        a negative value yields nothing.
    :return: a generator over the leading answers, in the order produced by
        ``get_all_answers()``; fewer than ``i`` items are yielded when the
        underlying generator runs out first.
    """
    from itertools import islice

    # islice stops right after the i-th item, so the underlying generator is
    # never asked for an answer that would only be thrown away.
    for answer in islice(self.get_all_answers(), max(i, 0)):
        yield answer

0 commit comments

Comments
 (0)