44import re
55import time
66import json
7+ import platform
78import requests
89import html2text
910import ConfigParser
@@ -21,7 +22,15 @@ def create_session():
2122 password = cf .get ("info" , "password" )
2223 s = requests .session ()
2324 login_data = {"email" : email , "password" : password }
24- s .post ('http://www.zhihu.com/login' , login_data )
25+ header = {
26+ 'User-Agent' : "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0" ,
27+ 'Host' : "www.zhihu.com" ,
28+ 'Referer' : "http://www.zhihu.com/" ,
29+ 'X-Requested-With' : "XMLHttpRequest"
30+ }
31+ r = s .post ('http://www.zhihu.com/login' , data = login_data , headers = header )
32+ if r .json ()["r" ] == 1 :
33+ raise Exception ("login failed." )
2534 session = s
2635
2736
@@ -64,21 +73,33 @@ def parser(self):
6473
def get_title(self):
    """Return the question title as an encoded byte string.

    The title is read from the ``h2`` element with class ``zm-item-title``,
    UTF-8 encoded, stored on ``self.title`` and reused on later calls.
    On Windows the *returned* value is re-encoded to GBK; the cached
    ``self.title`` always stays UTF-8, so repeated calls never convert
    an already-converted value.
    """
    if not hasattr(self, "title"):
        # Parse lazily: only hit the page when no soup is available yet.
        if self.soup is None:
            self.parser()
        heading = self.soup.find("h2", class_="zm-item-title")
        self.title = heading.string.encode("utf-8").replace("\n ", "")

    title = self.title
    if platform.system() == 'Windows':
        # Single conversion point for both the cached and the fresh path.
        title = title.decode('utf-8').encode('gbk')
    return title
7592
def get_detail(self):
    """Return the text of the question detail as an encoded byte string.

    The text is taken from the first ``div`` inside the element with id
    ``zh-question-detail`` and UTF-8 encoded.  On Windows the result is
    re-encoded to GBK before being returned.  Nothing is cached; the soup
    is queried on every call.
    """
    # Parse lazily: only hit the page when no soup is available yet.
    if self.soup is None:
        self.parser()

    container = self.soup.find("div", id="zh-question-detail")
    detail = container.div.get_text().encode("utf-8")
    if platform.system() == 'Windows':
        detail = detail.decode('utf-8').encode('gbk')
    return detail
82103
83104 def get_answers_num (self ):
84105 if self .soup == None :
@@ -103,7 +124,10 @@ def get_topics(self):
103124 topic_list = soup .find_all ("a" , class_ = "zm-item-tag" )
104125 topics = []
105126 for i in topic_list :
106- topics .append (i .contents [0 ].encode ("utf-8" ).replace ("\n " , "" ))
127+ topic = i .contents [0 ].encode ("utf-8" ).replace ("\n " , "" )
128+ if platform .system () == 'Windows' :
129+ topic = topic .decode ('utf-8' ).encode ('gbk' )
130+ topics .append (topic )
107131 return topics
108132
109133 # def get_top_answer(self):
@@ -315,18 +339,27 @@ def parser(self):
def get_user_id(self):
    """Return the user's display name as an encoded byte string.

    When ``self.user_url`` is ``None`` the fixed placeholder "匿名用户" is
    returned.  Otherwise the name is read from the ``span.name`` element
    inside ``div.title-section.ellipsis``, UTF-8 encoded, stored on
    ``self.user_id`` and reused on later calls.

    On Windows the *returned* value is re-encoded to GBK; the cached
    ``self.user_id`` always stays UTF-8.
    """
    if self.user_url is None:
        # print "I'm anonymous user."
        user_id = "匿名用户"
    else:
        if not hasattr(self, "user_id"):
            # Parse lazily: only hit the page when no soup is available yet.
            if self.soup is None:
                self.parser()
            title_section = self.soup.find("div", class_="title-section ellipsis")
            name_tag = title_section.find("span", class_="name")
            self.user_id = name_tag.string.encode("utf-8")
        user_id = self.user_id

    if platform.system() == 'Windows':
        # Single conversion point shared by all three code paths.
        user_id = user_id.decode('utf-8').encode('gbk')
    return user_id
330363
331364 def get_followees_num (self ):
332365 if self .user_url == None :
@@ -704,11 +737,23 @@ def to_txt(self):
704737 for li in li_list :
705738 li .insert_before (content .new_string ("\n " ))
706739
707- if self .get_author ().get_user_id () == "匿名用户" :
740+ if platform .system () == 'Windows' :
741+ anon_user_id = "匿名用户" .decode ('utf-8' ).encode ('gbk' )
742+ else :
743+ anon_user_id = "匿名用户"
744+ if self .get_author ().get_user_id () == anon_user_id :
708745 if not os .path .isdir (os .path .join (os .path .join (os .getcwd (), "text" ))):
709746 os .makedirs (os .path .join (os .path .join (os .getcwd (), "text" )))
710- file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.txt"
747+ if platform .system () == 'Windows' :
748+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.txt" .decode ('utf-8' ).encode ('gbk' )
749+ else :
750+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.txt"
711751 print file_name
752+ # if platform.system() == 'Windows':
753+ # file_name = file_name.decode('utf-8').encode('gbk')
754+ # print file_name
755+ # else:
756+ # print file_name
712757 if os .path .exists (os .path .join (os .path .join (os .getcwd (), "text" ), file_name )):
713758 f = open (os .path .join (os .path .join (os .getcwd (), "text" ), file_name ), "a" )
714759 f .write ("\n \n " )
@@ -718,13 +763,27 @@ def to_txt(self):
718763 else :
719764 if not os .path .isdir (os .path .join (os .path .join (os .getcwd (), "text" ))):
720765 os .makedirs (os .path .join (os .path .join (os .getcwd (), "text" )))
721- file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.txt"
766+ if platform .system () == 'Windows' :
767+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.txt" .decode ('utf-8' ).encode ('gbk' )
768+ else :
769+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.txt"
722770 print file_name
771+ # if platform.system() == 'Windows':
772+ # file_name = file_name.decode('utf-8').encode('gbk')
773+ # print file_name
774+ # else:
775+ # print file_name
723776 f = open (os .path .join (os .path .join (os .getcwd (), "text" ), file_name ), "wt" )
724777 f .write (self .get_question ().get_title () + "\n \n " )
725- f .write ("作者: " + self .get_author ().get_user_id () + " 赞同: " + str (self .get_upvote ()) + "\n \n " )
726- f .write (body .get_text ().encode ("utf-8" ))
727- f .write ("\n " + "原链接: " + self .answer_url )
778+ if platform .system () == 'Windows' :
779+ f .write ("作者: " .decode ('utf-8' ).encode ('gbk' ) + self .get_author ().get_user_id () + " 赞同: " .decode ('utf-8' ).encode ('gbk' ) + str (self .get_upvote ()) + "\n \n " )
780+ f .write (body .get_text ().encode ("gbk" ))
781+ link_str = "原链接: " .decode ('utf-8' ).encode ('gbk' )
782+ f .write ("\n " + link_str + self .answer_url .decode ('utf-8' ).encode ('gbk' ))
783+ else :
784+ f .write ("作者: " + self .get_author ().get_user_id () + " 赞同: " + str (self .get_upvote ()) + "\n \n " )
785+ f .write (body .get_text ().encode ("utf-8" ))
786+ f .write ("\n " + "原链接: " + self .answer_url )
728787 f .close ()
729788
730789 # def to_html(self):
@@ -742,32 +801,49 @@ def to_txt(self):
742801
743802 def to_md (self ):
744803 content = self .get_content ()
745- if self .get_author ().get_user_id () == "匿名用户" :
746- file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.md"
804+ if platform .system () == 'Windows' :
805+ anon_user_id = "匿名用户" .decode ('utf-8' ).encode ('gbk' )
806+ else :
807+ anon_user_id = "匿名用户"
808+ if self .get_author ().get_user_id () == anon_user_id :
809+ if platform .system () == 'Windows' :
810+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.md" .decode ('utf-8' ).encode ('gbk' )
811+ else :
812+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.md"
747813 print file_name
814+ # if platform.system() == 'Windows':
815+ # file_name = file_name.decode('utf-8').encode('gbk')
816+ # print file_name
817+ # else:
818+ # print file_name
748819 if not os .path .isdir (os .path .join (os .path .join (os .getcwd (), "markdown" ))):
749820 os .makedirs (os .path .join (os .path .join (os .getcwd (), "markdown" )))
750821 if os .path .exists (os .path .join (os .path .join (os .getcwd (), "markdown" ), file_name )):
751822 f = open (os .path .join (os .path .join (os .getcwd (), "markdown" ), file_name ), "a" )
752- # f_2 = open(os.path.join(os.path.join(os.getcwd(), "markdown"), "2_" + file_name), "a")
753823 f .write ("\n " )
754- # f_2.write("\n")
755824 else :
756825 f = open (os .path .join (os .path .join (os .getcwd (), "markdown" ), file_name ), "a" )
757- # f_2 = open(os.path.join(os.path.join(os.getcwd(), "markdown"), "2_" + file_name), "a")
758826 f .write ("# " + self .get_question ().get_title () + "\n " )
759- # f_2.write("# " + self.get_question().get_title() + "\n")
760827 else :
761828 if not os .path .isdir (os .path .join (os .path .join (os .getcwd (), "markdown" ))):
762829 os .makedirs (os .path .join (os .path .join (os .getcwd (), "markdown" )))
763- file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.md"
830+ if platform .system () == 'Windows' :
831+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.md" .decode ('utf-8' ).encode ('gbk' )
832+ else :
833+ file_name = self .get_question ().get_title () + "--" + self .get_author ().get_user_id () + "的回答.md"
764834 print file_name
835+ # file_name = self.get_question().get_title() + "--" + self.get_author().get_user_id() + "的回答.md"
836+ # if platform.system() == 'Windows':
837+ # file_name = file_name.decode('utf-8').encode('gbk')
838+ # print file_name
839+ # else:
840+ # print file_name
765841 f = open (os .path .join (os .path .join (os .getcwd (), "markdown" ), file_name ), "wt" )
766- # f_2 = open(os.path.join(os.path.join(os.getcwd(), "markdown"), "2_" + file_name), "wt")
767842 f .write ("# " + self .get_question ().get_title () + "\n " )
768- # f_2.write("# " + self.get_question().get_title() + "\n")
769- f .write ("## 作者: " + self .get_author ().get_user_id () + " 赞同: " + str (self .get_upvote ()) + "\n " )
770- # f_2.write("## 作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n")
843+ if platform .system () == 'Windows' :
844+ f .write ("## 作者: " .decode ('utf-8' ).encode ('gbk' ) + self .get_author ().get_user_id () + " 赞同: " .decode ('utf-8' ).encode ('gbk' ) + str (self .get_upvote ()) + "\n " )
845+ else :
846+ f .write ("## 作者: " + self .get_author ().get_user_id () + " 赞同: " + str (self .get_upvote ()) + "\n " )
771847 text = html2text .html2text (content .decode ('utf-8' )).encode ("utf-8" )
772848
773849 r = re .findall (r'\*\*(.*?)\*\*' , text )
@@ -784,12 +860,14 @@ def to_md(self):
784860 for i in r :
785861 text = text .replace (i , i + "\n \n " )
786862
787- f .write (text )
788- # f_2.write(text)
789- f .write ("#### 原链接: " + self .answer_url )
790- # f_2.write("#### 原链接: " + self.answer_url)
863+ if platform .system () == 'Windows' :
864+ f .write (text .decode ('utf-8' ).encode ('gbk' ))
865+ link_str = "#### 原链接: " .decode ('utf-8' ).encode ('gbk' )
866+ f .write (link_str + self .answer_url .decode ('utf-8' ).encode ('gbk' ))
867+ else :
868+ f .write (text )
869+ f .write ("#### 原链接: " + self .answer_url )
791870 f .close ()
792- # f_2.close()
793871
794872
795873
@@ -833,12 +911,17 @@ def parser(self):
833911
def get_name(self):
    """Return the name scraped from the page header as an encoded byte string.

    The name is read from the ``h2`` element with id ``zh-fav-head-title``,
    UTF-8 encoded, stripped of surrounding whitespace, stored on
    ``self.name`` and reused on later calls.  On Windows the *returned*
    value is re-encoded to GBK; the cached ``self.name`` stays UTF-8.
    """
    if not hasattr(self, 'name'):
        # Parse lazily: only hit the page when no soup is available yet.
        if self.soup is None:
            self.parser()
        heading = self.soup.find("h2", id="zh-fav-head-title")
        self.name = heading.string.encode("utf-8").strip()

    name = self.name
    if platform.system() == 'Windows':
        # Single conversion point for both the cached and the fresh path.
        name = name.decode('utf-8').encode('gbk')
    return name
843926
844927 def get_creator (self ):
0 commit comments