Skip to content

Commit a6c34b8

Browse files
committed
windows
1 parent 9c2d15a commit a6c34b8

2 files changed

Lines changed: 115 additions & 32 deletions

File tree

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ zhihu-python:获取知乎信息
66

77
zhihu-python 采用 python2.7 编写,用来方便地获取知乎上各种内容的信息,并且可以方便地将答案备份导出为 txt 或 markdown 文件。由于知乎官方目前没有提供 api,所以有了此项目的存在。
88

9-
**注:本项目代码均在Ubuntu14.04上使用python2.7.6编写和测试通过,windows环境仍然存在编码问题**
9+
**注:本项目代码均在Ubuntu14.04上使用python2.7.6编写和测试通过,其他环境可能存在一定问题**
1010

1111
获取某个问题下的全部回答并导出,很简单:
1212

zhihu.py

Lines changed: 114 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import re
55
import time
66
import json
7+
import platform
78
import requests
89
import html2text
910
import ConfigParser
@@ -21,7 +22,15 @@ def create_session():
2122
password = cf.get("info", "password")
2223
s = requests.session()
2324
login_data = {"email": email, "password": password}
24-
s.post('http://www.zhihu.com/login', login_data)
25+
header = {
26+
'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
27+
'Host': "www.zhihu.com",
28+
'Referer': "http://www.zhihu.com/",
29+
'X-Requested-With': "XMLHttpRequest"
30+
}
31+
r = s.post('http://www.zhihu.com/login', data = login_data, headers = header)
32+
if r.json()["r"] == 1:
33+
raise Exception("login failed.")
2534
session = s
2635

2736

@@ -64,21 +73,33 @@ def parser(self):
6473

6574
def get_title(self):
6675
if hasattr(self, "title"):
67-
return self.title
76+
if platform.system() == 'Windows':
77+
title = self.title.decode('utf-8').encode('gbk')
78+
return title
79+
else:
80+
return self.title
6881
else:
6982
if self.soup == None:
7083
self.parser()
7184
soup = self.soup
7285
title = soup.find("h2", class_ = "zm-item-title").string.encode("utf-8").replace("\n", "")
7386
self.title = title
74-
return title
87+
if platform.system() == 'Windows':
88+
title = title.decode('utf-8').encode('gbk')
89+
return title
90+
else:
91+
return title
7592

7693
def get_detail(self):
7794
if self.soup == None:
7895
self.parser()
7996
soup = self.soup
8097
detail = soup.find("div", id = "zh-question-detail").div.get_text().encode("utf-8")
81-
return detail
98+
if platform.system() == 'Windows':
99+
detail = detail.decode('utf-8').encode('gbk')
100+
return detail
101+
else:
102+
return detail
82103

83104
def get_answers_num(self):
84105
if self.soup == None:
@@ -103,7 +124,10 @@ def get_topics(self):
103124
topic_list = soup.find_all("a", class_ = "zm-item-tag")
104125
topics = []
105126
for i in topic_list:
106-
topics.append(i.contents[0].encode("utf-8").replace("\n", ""))
127+
topic = i.contents[0].encode("utf-8").replace("\n", "")
128+
if platform.system() == 'Windows':
129+
topic = topic.decode('utf-8').encode('gbk')
130+
topics.append(topic)
107131
return topics
108132

109133
# def get_top_answer(self):
@@ -315,18 +339,27 @@ def parser(self):
315339
def get_user_id(self):
316340
if self.user_url == None:
317341
# print "I'm anonymous user."
318-
return "匿名用户"
342+
if platform.system() == 'Windows':
343+
return "匿名用户".decode('utf-8').encode('gbk')
344+
else:
345+
return "匿名用户"
319346
else:
320347
if hasattr(self, "user_id"):
321-
return self.user_id
348+
if platform.system() == 'Windows':
349+
return self.user_id.decode('utf-8').encode('gbk')
350+
else:
351+
return self.user_id
322352
else:
323353
if self.soup == None:
324354
self.parser()
325355
soup = self.soup
326356
user_id = soup.find("div", class_ = "title-section ellipsis") \
327357
.find("span", class_ = "name").string.encode("utf-8")
328358
self.user_id = user_id
329-
return user_id
359+
if platform.system() == 'Windows':
360+
return user_id.decode('utf-8').encode('gbk')
361+
else:
362+
return user_id
330363

331364
def get_followees_num(self):
332365
if self.user_url == None:
@@ -704,11 +737,23 @@ def to_txt(self):
704737
for li in li_list:
705738
li.insert_before(content.new_string("\n"))
706739

707-
if self.get_author().get_user_id() == "匿名用户":
740+
if platform.system() == 'Windows':
741+
anon_user_id = "匿名用户".decode('utf-8').encode('gbk')
742+
else:
743+
anon_user_id = "匿名用户"
744+
if self.get_author().get_user_id() == anon_user_id:
708745
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "text"))):
709746
os.makedirs(os.path.join(os.path.join(os.getcwd(), "text")))
710-
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
747+
if platform.system() == 'Windows':
748+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt".decode('utf-8').encode('gbk')
749+
else:
750+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
711751
print file_name
752+
# if platform.system() == 'Windows':
753+
# file_name = file_name.decode('utf-8').encode('gbk')
754+
# print file_name
755+
# else:
756+
# print file_name
712757
if os.path.exists(os.path.join(os.path.join(os.getcwd(), "text"), file_name)):
713758
f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "a")
714759
f.write("\n\n")
@@ -718,13 +763,27 @@ def to_txt(self):
718763
else:
719764
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "text"))):
720765
os.makedirs(os.path.join(os.path.join(os.getcwd(), "text")))
721-
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
766+
if platform.system() == 'Windows':
767+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt".decode('utf-8').encode('gbk')
768+
else:
769+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.txt"
722770
print file_name
771+
# if platform.system() == 'Windows':
772+
# file_name = file_name.decode('utf-8').encode('gbk')
773+
# print file_name
774+
# else:
775+
# print file_name
723776
f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "wt")
724777
f.write(self.get_question().get_title() + "\n\n")
725-
f.write("作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n\n")
726-
f.write(body.get_text().encode("utf-8"))
727-
f.write("\n" + "原链接: " + self.answer_url)
778+
if platform.system() == 'Windows':
779+
f.write("作者: ".decode('utf-8').encode('gbk') + self.get_author().get_user_id() + " 赞同: ".decode('utf-8').encode('gbk') + str(self.get_upvote()) + "\n\n")
780+
f.write(body.get_text().encode("gbk"))
781+
link_str = "原链接: ".decode('utf-8').encode('gbk')
782+
f.write("\n" + link_str + self.answer_url.decode('utf-8').encode('gbk'))
783+
else:
784+
f.write("作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n\n")
785+
f.write(body.get_text().encode("utf-8"))
786+
f.write("\n" + "原链接: " + self.answer_url)
728787
f.close()
729788

730789
# def to_html(self):
@@ -742,32 +801,49 @@ def to_txt(self):
742801

743802
def to_md(self):
744803
content = self.get_content()
745-
if self.get_author().get_user_id() == "匿名用户":
746-
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
804+
if platform.system() == 'Windows':
805+
anon_user_id = "匿名用户".decode('utf-8').encode('gbk')
806+
else:
807+
anon_user_id = "匿名用户"
808+
if self.get_author().get_user_id() == anon_user_id:
809+
if platform.system() == 'Windows':
810+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md".decode('utf-8').encode('gbk')
811+
else:
812+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
747813
print file_name
814+
# if platform.system() == 'Windows':
815+
# file_name = file_name.decode('utf-8').encode('gbk')
816+
# print file_name
817+
# else:
818+
# print file_name
748819
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "markdown"))):
749820
os.makedirs(os.path.join(os.path.join(os.getcwd(), "markdown")))
750821
if os.path.exists(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name)):
751822
f = open(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name), "a")
752-
# f_2 = open(os.path.join(os.path.join(os.getcwd(), "markdown"), "2_" + file_name), "a")
753823
f.write("\n")
754-
# f_2.write("\n")
755824
else:
756825
f = open(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name), "a")
757-
# f_2 = open(os.path.join(os.path.join(os.getcwd(), "markdown"), "2_" + file_name), "a")
758826
f.write("# " + self.get_question().get_title() + "\n")
759-
# f_2.write("# " + self.get_question().get_title() + "\n")
760827
else:
761828
if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "markdown"))):
762829
os.makedirs(os.path.join(os.path.join(os.getcwd(), "markdown")))
763-
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
830+
if platform.system() == 'Windows':
831+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md".decode('utf-8').encode('gbk')
832+
else:
833+
file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
764834
print file_name
835+
# file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
836+
# if platform.system() == 'Windows':
837+
# file_name = file_name.decode('utf-8').encode('gbk')
838+
# print file_name
839+
# else:
840+
# print file_name
765841
f = open(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name), "wt")
766-
# f_2 = open(os.path.join(os.path.join(os.getcwd(), "markdown"), "2_" + file_name), "wt")
767842
f.write("# " + self.get_question().get_title() + "\n")
768-
# f_2.write("# " + self.get_question().get_title() + "\n")
769-
f.write("## 作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n")
770-
# f_2.write("## 作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n")
843+
if platform.system() == 'Windows':
844+
f.write("## 作者: ".decode('utf-8').encode('gbk') + self.get_author().get_user_id() + " 赞同: ".decode('utf-8').encode('gbk') + str(self.get_upvote()) + "\n")
845+
else:
846+
f.write("## 作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n")
771847
text = html2text.html2text(content.decode('utf-8')).encode("utf-8")
772848

773849
r = re.findall(r'\*\*(.*?)\*\*', text)
@@ -784,12 +860,14 @@ def to_md(self):
784860
for i in r:
785861
text = text.replace(i, i + "\n\n")
786862

787-
f.write(text)
788-
# f_2.write(text)
789-
f.write("#### 原链接: " + self.answer_url)
790-
# f_2.write("#### 原链接: " + self.answer_url)
863+
if platform.system() == 'Windows':
864+
f.write(text.decode('utf-8').encode('gbk'))
865+
link_str = "#### 原链接: ".decode('utf-8').encode('gbk')
866+
f.write(link_str + self.answer_url.decode('utf-8').encode('gbk'))
867+
else:
868+
f.write(text)
869+
f.write("#### 原链接: " + self.answer_url)
791870
f.close()
792-
# f_2.close()
793871

794872

795873

@@ -833,12 +911,17 @@ def parser(self):
833911

834912
def get_name(self):
835913
if hasattr(self, 'name'):
836-
return self.name
914+
if platform.system() == 'Windows':
915+
return self.name.decode('utf-8').encode('gbk')
916+
else:
917+
return self.name
837918
else:
838919
if self.soup == None:
839920
self.parser()
840921
soup = self.soup
841922
self.name = soup.find("h2", id = "zh-fav-head-title").string.encode("utf-8").strip()
923+
if platform.system() == 'Windows':
924+
return self.name.decode('utf-8').encode('gbk')
842925
return self.name
843926

844927
def get_creator(self):

0 commit comments

Comments
 (0)