diff --git a/.gitignore b/.gitignore index db4561e..f26d885 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,6 @@ docs/_build/ # PyBuilder target/ + +# Prevent accidental cookie leak +cookie diff --git a/README.rst b/README.rst index 9040ba1..9b5ad28 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,7 @@ zhihu-python:获取知乎信息 =============================== -:Author: `egrcc `_ ( `微博 `_ | `电邮 `_ ) -:Committer: `Eureka22 `_ , `lufo816 `_ , `LuoZijun `_ -:Update: 09/09 2015 - +**注意: 本项目不再维护更新!** .. contents:: @@ -12,9 +9,9 @@ zhihu-python:获取知乎信息 介绍 ---- -zhihu-python 采用 python2.7 编写,用来方便地获取知乎上各种内容的信息,并且可以方便地将答案备份导出为 txt 或 markdown 文件。由于知乎官方目前没有提供 api,所以有了此项目的存在。 +zhihu-python 采用 Python2.7 编写,用来方便地获取知乎上各种内容的信息,并且可以方便地将答案备份导出为 txt 或 markdown 文件。由于知乎官方目前没有提供 api,所以有了此项目的存在。 -使用 python3 的类似项目可以参见:`zhihu-py3 `_ 。 +使用 Python3 的类似项目可以参见:`zhihu-py3 `_ 。使用 PHP 的类似项目可以参见:`zhihu-php `_ 。使用 Go 的类似项目可以参见:`zhihu-go `_ 。 **注: 本项目代码均在 Ubuntu14.04 上使用 python2.7.6 编写和测试通过,其他环境可能存在一定问题。** @@ -78,7 +75,7 @@ zhihu-python 采用 python2.7 编写,用来方便地获取知乎上各种内 * `requests `_ * `html2text `_ * `termcolor `_ - +* `lxml `_ .. code:: bash @@ -255,6 +252,8 @@ User 代表一个用户,处理用户相关操作。创建一个 User 对象需 user = User(user_url) # 获取用户ID user_id = user.get_user_id() + # 获取用户性别 + user_gender = user.get_gender() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 @@ -269,6 +268,8 @@ User 代表一个用户,处理用户相关操作。创建一个 User 对象需 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() + # 获取该用户的头像url + head_img_url = user.get_head_img_url() # 获取该用户关注的人 followees = user.get_followees() @@ -289,6 +290,7 @@ User 代表一个用户,处理用户相关操作。创建一个 User 对象需 print collections_num # 44 print agree_num # 46387 print thanks_num # 11477 + print head_img_url # https://pic2.zhimg.com/0626f4164009f291b26a79d96c6962c5_l.jpg print followees # @@ -349,6 +351,104 @@ Collection 代表一个收藏夹,处理收藏夹相关操作。创建一个 Co print answers # # 代表所有答案的生成器对象 + + +Column:获取专栏信息 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Column 代表一个专栏,处理专栏相关操作。创建一个 Column 对象需传入该专栏的 url ,如: + +.. code-block:: python + + from zhihu import Column + + url = "http://zhuanlan.zhihu.com/daily" + column = Column(url) + +得到 Column 对象后,可以获取该专栏的一些信息: + +.. code-block:: python + + # -*- coding: utf-8 -*- + from zhihu import Column + + url = "http://zhuanlan.zhihu.com/daily" + column = Column(url) + + # 获取该专栏的标题 + title = column.get_title() + # 获取该专栏的描述 + description = column.get_description() + # 获取该专栏的作者 + creator = column.get_creator() + # 获取该专栏的文章数 + posts_num = column.get_posts_num() + # 获取该专栏的所有文章 + posts = column.get_all_posts() + + print title # 输出:知乎日报 + print description + # 输出: + # 知乎日报启动画面接受所有摄影师朋友们的投稿,将作品链接 + #(如 Flickr、LOFTER 等等),发至邮箱 qidong (at) zhihu.com, + # 并附上您的知乎个人页面地址即可。 + # + # 详细投稿要求: http://t.cn/zQyEpN5 + + print creator + # 输出: + # User类对象 + print posts_num # 150 + print posts + # 输出: + # Post类对象 + + +Post:获取专栏文章信息 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Post 代表一个专栏文章,处理专栏文章相关操作。创建一个 Post 对象需传入该文章的 url ,如: + +.. code-block:: python + + from zhihu import Post + + url = "http://zhuanlan.zhihu.com/p/20235601" + post = Post(url) + +得到 Post 对象后,可以获取该文章的一些信息: + +.. code-block:: python + + # -*- coding: utf-8 -*- + from zhihu import Post + + url = "http://zhuanlan.zhihu.com/p/20770968" + post = Post(url) + + # 获取该文章的标题 + title = post.get_title() + # 获取该文章的内容 + content = post.get_content() + # 获取该文章的作者 + author = post.get_author() + # 获取该文章的所属专栏 + column = post.get_column() + # 获取该文章所属话题 + topics = post.get_topics() + + print title # 输出:夜读书|四月十九日 + print content + # 输出: + #

各位,晚上好。
... + # ...... + print author + # 输出: + for topic in topics: + print topic, # 输出:阅读 + print column + # 输出: + # Column类对象 综合实例 @@ -485,7 +585,13 @@ zhihu.User ---- 知乎用户操作类 得到该用户的ID。 **Returns**: 代表 ID 的字符串 + + **get_gender** () + 得到该用户的性别。 + + **Returns**: 代表 性别 的字符串(male/female) + **get_followees_num** () 得到该用户关注的人的个数。 @@ -509,6 +615,14 @@ zhihu.User ---- 知乎用户操作类 得到该用户获得的感谢数。 **Returns**: 代表感谢数的 int 型整数 + + **get_head_img_url** (scale) + + 获取用户头像url。该方法由 `@liuwons `_ 添加。 + + **Parameters**: **scale** int 型整数,代表尺寸: 1(25×25), 3(75×75), 4(100×100), 6(150×150), 10(250×250) + + **Returns**: 对应尺寸头像的图片链接, 字符串 **get_asks_num** () @@ -661,10 +775,101 @@ zhihu.Collection ---- 知乎收藏夹操作类 **Returns**: 包含该收藏夹下前 n 个回答的 generator 对象。其中每一个元素为代表一个回答的 Answer 对象 +zhihu.Column ---- 知乎专栏操作类 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +*class* zhihu. **Column** (*Column_url*) + + Column 以 url 为唯一标识,创建一个 Column 对象实例必须传入一个代表知乎专栏的 url (如:http://zhuanlan.zhihu.com/daily),需包含“http(s)://”。如果传入的不是代表专栏的 url ,程序会报错。通过调用 Column 类的一系列方法,获得该专栏的一些信息。该类由 `@johnnyluck `_ 添加。 + + **Parameters**: + * **column_url** -- 该专栏的链接,字符串 + + **Returns**: 一个 Column 实例对象 + + **get_title** () + + 得到该专栏的题目。 + + **Returns**: 一个代表题目的字符串 + + **get_creator** () + + 得到该专栏的创建者。 + + **Returns**: 一个 User 对象 + + **get_description** () + + 得到该专栏的描述。 + + **Returns**: 一个专栏描述的字符串 + + **get_followers_num** () + + 得到该专栏的关注人数。 + + **Returns**: 一个 int 型的整数 + + **get_posts_num** () + + 得到该专栏的所有文章数。 + + **Returns**: 一个 int 型的整数 + + **get_content** () + + 得到该答案的内容。 + + **Returns**: 一个字符串 + + **get_posts** () + + 得到该专栏的所有文章。 + + **Returns**:包含所有文章的 generator 对象。其中每一个元素为代表一个文章 Post 对象 + + +zhihu.Post ---- 知乎专栏文章操作类 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +*class* zhihu. **Post** (*Post_url*) -联系我 ----------- + Post 以 url 为唯一标识,创建一个 Post 对象实例必须传入一个代表知乎文章的 url (如:http://zhuanlan.zhihu.com/p/20235601),需包含“http(s)://”。如果传入的不是代表文章的 url ,程序会报错。通过调用 Post 类的一系列方法,获得该文章的一些信息。该类由 `@johnnyluck `_ 添加。 + + **Parameters**: + * **post_url** -- 该文章的链接,字符串 + + **Returns**: 一个 Post 实例对象 + + **get_title** () + + 得到该文章的题目。 + + **Returns**: 一个代表题目的字符串 + + **get_author** () + + 得到该文章的作者。 + + **Returns**: 一个 User 对象 + + **get_content** () + + 得到该文章的内容。 + + **Returns**: 一个文章描述的字符串 + + **get_topics** () + + 得到该文章的话题。 + + **Returns**: 一个列表 + + **get_column** () + + 得到该文章的所属专栏。 + + **Returns**:一个 Column 的实例对象 + -- 微博:http://weibo.com/u/2948739432 -- github:https://github.com/egrcc -- email:zhaolujun1994@gmail.com diff --git a/auth.py b/auth.py index 0828d41..ed51290 100644 --- a/auth.py +++ b/auth.py @@ -4,6 +4,7 @@ # Build-in / Std import os, sys, time, platform, random import re, json, cookielib +from getpass import getpass # requirements import requests, termcolor @@ -66,8 +67,8 @@ def __init__(self, message): def download_captcha(): - url = "http://www.zhihu.com/captcha.gif" - r = requests.get(url, params={"r": random.random()} ) + url = "https://www.zhihu.com/captcha.gif" + r = requests.get(url, params={"r": random.random(), "type": "login"}, verify=False) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") image_name = u"verify." + r.headers['content-type'].split("/")[1] @@ -82,27 +83,20 @@ def download_captcha(): elif platform.system() == "Darwin": Logging.info(u"Command: open %s &" % image_name ) os.system("open %s &" % image_name ) - elif platform.system() == "SunOS": - os.system("open %s &" % image_name ) - elif platform.system() == "FreeBSD": - os.system("open %s &" % image_name ) - elif platform.system() == "Unix": - os.system("open %s &" % image_name ) - elif platform.system() == "OpenBSD": - os.system("open %s &" % image_name ) - elif platform.system() == "NetBSD": + elif platform.system() in ("SunOS", "FreeBSD", "Unix", "OpenBSD", "NetBSD"): os.system("open %s &" % image_name ) elif platform.system() == "Windows": - os.system("open %s &" % image_name ) + os.system("%s" % image_name ) else: Logging.info(u"我们无法探测你的作业系统,请自行打开验证码 %s 文件,并输入验证码。" % os.path.join(os.getcwd(), image_name) ) - captcha_code = raw_input( termcolor.colored("请输入验证码: ", "cyan") ) + sys.stdout.write(termcolor.colored(u"请输入验证码: ", "cyan") ) + captcha_code = raw_input( ) return captcha_code def search_xsrf(): url = "http://www.zhihu.com/" - r = requests.get(url) + r = requests.get(url, verify=False) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") results = re.compile(r"\ @@ -158,6 +167,9 @@ def user_test(user_url): if i == 41: break + for topic in topics: + print topic + print asks # # 代表该用户提的所有问题的生成器对象 @@ -194,6 +206,56 @@ def collection_test(collection_url): # 代表所有答案的生成器对象 +def post_test(post_url): + post = Post(post_url) + + # 获取该文章的标题 + title = post.get_title() + # 获取该文章的内容 + content = post.get_content() + # 获取该文章的作者 + author = post.get_author() + # 获取该文章的所属专栏 + column = post.get_column() + # 获取该文章所属话题 + topics = post.get_topics() + + print title # 输出: + print content + for topic in topics: + print topic, # 输出: + print "\n" + print author + # 输出: + # User类对象 + print column + # 输出: + # Column类对象 + + +def column_test(column_url): + + column = Column(column_url) + + # 获取该专栏的标题 + title = column.get_title() + # 获取该专栏的描述 + description = column.get_description() + # 获取该专栏的作者 + creator = column.get_creator() + # 获取该专栏的文章数 + posts_num = column.get_posts_num() + # 获取该专栏的所有文章 + posts = column.get_all_posts() + + print title + print description + print creator + # 输出: + # User类对象 + print posts_num + print posts + def test(): url = "http://www.zhihu.com/question/24269892" question = Question(url) @@ -228,6 +290,10 @@ def main(): user_test(user_url) collection_url = "http://www.zhihu.com/collection/36750683" collection_test(collection_url) + post_url = "http://zhuanlan.zhihu.com/p/20770968" + post_test(post_url) + column_url = "http://zhuanlan.zhihu.com/daily" + column_test(column_url) test() diff --git "a/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" "b/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" deleted file mode 100644 index b1e3921..0000000 --- "a/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" +++ /dev/null @@ -1,21 +0,0 @@ -现实可以有多美好? - -作者: 邓岂 赞同: 23 - -最近高考失利啦,考的很差很差。高三辛苦弄到的加分全都没用上。 -心里想不开决定自杀。 -于是给已经有男朋友的她,发了一条长长的短信,告诉她我要去复读,请她以后不要再联系我了,把我忘了吧。其实现在的我也不再单身,有了一个女朋友。可我在决定去死的时候,选择的唯一一个告别的人却依然是她。 -那晚,我爬到了我家阳台上(24楼),准备一下子跳下去。 -可是她回了我的短信,大意是她明白我想说不仅仅是去复读,而是要做危险的事。她还说,她不舍得忘掉我。 -那一晚心情沉痛,关了手机,从家里骑了2个小时的自行车,到她家的楼下,看着她家的灯火一夜未息。直到天蒙蒙亮,我方才骑车回家。回家打开手机,112个未接电话,其中有98个来自她,另外来自我和她共同的朋友们。 -她们说,半夜收到她给她们发的短信,告诉她们她很担心我会做出很多出格的事情。 -我很偏执,但我看到、知道这一切的时候,我的心肺都融化了。 - -她拒绝过我。甚至我还亲自见证了她是怎样一步一步的爱上那个男生,并最终和她走到了一起。 -有人说,我是这世界上最傻最傻的备胎。可是我想告诉所有这样说的人,做这样一个备胎可是生命中最美好的事儿啊。 - -以上。 - - - -原链接: http://www.zhihu.com/question/24269892/answer/27338490 \ No newline at end of file diff --git a/zhihu.py b/zhihu.py index f5149da..68af702 100755 --- a/zhihu.py +++ b/zhihu.py @@ -1,52 +1,52 @@ # -*- coding: utf-8 -*- ''' - ;$$; - ############# - #############;#####o - ## o######################### + ;$$; + ############# + #############;#####o + ## o######################### ##### $############################### ## ###$ ######! ########################## ## ### $### ################### ###### ### ### ##o####################### ###### ;### #### ##################### - ## ### ###### ######&&################ - ## ### ###### ## ############ ####### - o## ######## ## ################## - ##o ### #### #######o####### - ## ###### ###########&##### - ## #### #############! - ### ######### - #####& ## o#### - ###### ## ####* - ## !## ##### - ## ##* ####; ## - ##### #####o ##### - #### ### ### $###o - ### ## ####! $### - ## ##### - ## ## - ;## ### ; - ##$ ## - ####### ## - ##### &## ## - ### ### ### - ### ### ## - ## ;## ## - ## ### ## - ### ### ## - #### ## - ### ## - ##; ## - ##$ ##& - ## ## - ##; ## - ## ##; - ### ### ##$ - ### ### ## - ###################### #####&&&&&&&&&&&&### - ### $#####$ ############&$o$&################################ - # $&########&o + ## ### ###### ######&&################ + ## ### ###### ## ############ ####### + o## ######## ## ################## + ##o ### #### #######o####### + ## ###### ###########&##### + ## #### #############! + ### ######### + #####& ## o#### + ###### ## ####* + ## !## ##### + ## ##* ####; ## + ##### #####o ##### + #### ### ### $###o + ### ## ####! $### + ## ##### + ## ## + ;## ### ; + ##$ ## + ####### ## + ##### &## ## + ### ### ### + ### ### ## + ## ;## ## + ## ### ## + ### ### ## + #### ## + ### ## + ##; ## + ##$ ##& + ## ## + ##; ## + ## ##; + ### ### ##$ + ### ### ## + ###################### #####&&&&&&&&&&&&### + ### $#####$ ############&$o$&################################ + # $&########&o ''' # Build-in / Std @@ -92,22 +92,216 @@ reload(sys) sys.setdefaultencoding('utf8') +class Post: + url = None + meta = None + slug = None + + def __init__(self, url): + + if not re.compile(r"(http|https)://zhuanlan.zhihu.com/p/\d{8}").match(url): + raise ValueError("\"" + url + "\"" + " : it isn't a question url.") + else: + self.url = url + self.slug = re.compile(r"(http|https)://zhuanlan.zhihu.com/p/(\d{8})").match(url).group(2) + + def parser(self): + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "zhuanlan.zhihu.com", + 'Accept': "application/json, text/plain, */*" + } + r = requests.get('https://zhuanlan.zhihu.com/api/posts/' + self.slug, headers=headers, verify=False) + self.meta = r.json() + + def get_title(self): + if hasattr(self, "title"): + if platform.system() == 'Windows': + title = self.title.decode('utf-8').encode('gbk') + return title + else: + return self.title + else: + if self.meta == None: + self.parser() + meta = self.meta + title = meta['title'] + self.title = title + if platform.system() == 'Windows': + title = title.decode('utf-8').encode('gbk') + return title + else: + return title + + def get_content(self): + if self.meta == None: + self.parser() + meta = self.meta + content = meta['content'] + if platform.system() == 'Windows': + content = content.decode('utf-8').encode('gbk') + return content + else: + return content + + def get_author(self): + if hasattr(self, "author"): + return self.author + else: + if self.meta == None: + self.parser() + meta = self.meta + author_tag = meta['author'] + author = User(author_tag['profileUrl'],author_tag['slug']) + return author + + def get_column(self): + if self.meta == None: + self.parser() + meta = self.meta + column_url = 'https://zhuanlan.zhihu.com/' + meta['column']['slug'] + return Column(column_url, meta['column']['slug']) + + def get_likes(self): + if self.meta == None: + self.parser() + meta = self.meta + return int(meta["likesCount"]) + + def get_topics(self): + if self.meta == None: + self.parser() + meta = self.meta + topic_list = [] + for topic in meta['topics']: + topic_list.append(topic['name']) + return topic_list + +class Column: + url = None + meta = None + + def __init__(self, url, slug=None): + + if not re.compile(r"(http|https)://zhuanlan.zhihu.com/([0-9a-zA-Z]+)").match(url): + raise ValueError("\"" + url + "\"" + " : it isn't a question url.") + else: + self.url = url + if slug == None: + self.slug = re.compile(r"(http|https)://zhuanlan.zhihu.com/([0-9a-zA-Z]+)").match(url).group(2) + else: + self.slug = slug + + def parser(self): + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "zhuanlan.zhihu.com", + 'Accept': "application/json, text/plain, */*" + } + r = requests.get('https://zhuanlan.zhihu.com/api/columns/' + self.slug, headers=headers, verify=False) + self.meta = r.json() + + def get_title(self): + if hasattr(self,"title"): + if platform.system() == 'Windows': + title = self.title.decode('utf-8').encode('gbk') + return title + else: + return self.title + else: + if self.meta == None: + self.parser() + meta = self.meta + title = meta['name'] + self.title = title + if platform.system() == 'Windows': + title = title.decode('utf-8').encode('gbk') + return title + else: + return title + + def get_description(self): + if self.meta == None: + self.parser() + meta = self.meta + description = meta['description'] + if platform.system() == 'Windows': + description = description.decode('utf-8').encode('gbk') + return description + else: + return description + + def get_followers_num(self): + if self.meta == None: + self.parser() + meta = self.meta + followers_num = int(meta['followersCount']) + return followers_num + + def get_posts_num(self): + if self.meta == None: + self.parser() + meta = self.meta + posts_num = int(meta['postsCount']) + return posts_num + + def get_creator(self): + if hasattr(self, "creator"): + return self.creator + else: + if self.meta == None: + self.parser() + meta = self.meta + creator_tag = meta['creator'] + creator = User(creator_tag['profileUrl'],creator_tag['slug']) + return creator + + def get_all_posts(self): + posts_num = self.get_posts_num() + if posts_num == 0: + print "No posts." + return + yield + else: + for i in xrange((posts_num - 1) / 20 + 1): + parm = {'limit': 20, 'offset': 20*i} + url = 'https://zhuanlan.zhihu.com/api/columns/' + self.slug + '/posts' + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(url, params=parm, headers=headers, verify=False) + posts_list = r.json() + for p in posts_list: + post_url = 'https://zhuanlan.zhihu.com/p/' + str(p['slug']) + yield Post(post_url) + class Question: url = None soup = None def __init__(self, url, title=None): - if url[0:len(url) - 8] != "http://www.zhihu.com/question/": + if not re.compile(r"(http|https)://www.zhihu.com/question/\d{8}").match(url): raise ValueError("\"" + url + "\"" + " : it isn't a question url.") else: self.url = url - + if title != None: self.title = title def parser(self): - r = requests.get(self.url) - self.soup = BeautifulSoup(r.content) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.url,headers=headers, verify=False) + self.soup = BeautifulSoup(r.content, "lxml") def get_title(self): if hasattr(self, "title"): @@ -177,27 +371,27 @@ def get_all_answers(self): else: error_answer_count = 0 my_answer_count = 0 - for i in xrange((answers_num - 1) / 50 + 1): + for i in xrange((answers_num - 1) / 20 + 1): if i == 0: - for j in xrange(min(answers_num, 50)): + for j in xrange(min(answers_num, 20)): if self.soup == None: self.parser() - soup = BeautifulSoup(self.soup.encode("utf-8")) + soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml") is_my_answer = False if soup.find_all("div", class_="zm-item-answer")[j].find("span", class_="count") == None: my_answer_count += 1 is_my_answer = True - + if soup.find_all("div", class_="zm-item-answer")[j].find("div", class_="zm-editable-content clearfix") == None: error_answer_count += 1 continue author = None - if soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].string == u"匿名用户": + if soup.find_all("div", class_="zm-item-answer-author-info")[j].get_text(strip='\n') == u"匿名用户": author_url = None author = User(author_url) else: - author_tag = soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].find_all("a")[1] + author_tag = soup.find_all("div", class_="zm-item-answer-author-info")[j].find_all("a")[1] author_id = author_tag.string.encode("utf-8") author_url = "http://www.zhihu.com" + author_tag["href"] author = User(author_url, author_id) @@ -234,9 +428,9 @@ def get_all_answers(self): else: post_url = "http://www.zhihu.com/node/QuestionAnswerListV2" _xsrf = self.soup.find("input", attrs={'name': '_xsrf'})["value"] - offset = i * 50 + offset = i * 20 params = json.dumps( - {"url_token": int(self.url[-8:-1] + self.url[-1]), "pagesize": 50, "offset": offset}) + {"url_token": int(self.url[-8:-1] + self.url[-1]), "pagesize": 20, "offset": offset}) data = { '_xsrf': _xsrf, 'method': "next", @@ -247,23 +441,23 @@ def get_all_answers(self): 'Host': "www.zhihu.com", 'Referer': self.url } - r = requests.post(post_url, data=data, headers=header) + r = requests.post(post_url, data=data, headers=header, verify=False) answer_list = r.json()["msg"] - for j in xrange(min(answers_num - i * 50, 50)): - soup = BeautifulSoup(self.soup.encode("utf-8")) + for j in xrange(min(answers_num - i * 20, 20)): + soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml") + + answer_soup = BeautifulSoup(answer_list[j], "lxml") - answer_soup = BeautifulSoup(answer_list[j]) - if answer_soup.find("div", class_="zm-editable-content clearfix") == None: continue - + author = None - if answer_soup.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户": + if answer_soup.find("div", class_="zm-item-answer-author-info").get_text(strip='\n') == u"匿名用户": author_url = None author = User(author_url) else: - author_tag = answer_soup.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[1] + author_tag = answer_soup.find("div", class_="zm-item-answer-author-info").find_all("a")[1] author_id = author_tag.string.encode("utf-8") author_url = "http://www.zhihu.com" + author_tag["href"] author = User(author_url, author_id) @@ -328,7 +522,7 @@ class User: def __init__(self, user_url, user_id=None): if user_url == None: self.user_id = "匿名用户" - elif user_url[0:28] != "http://www.zhihu.com/people/": + elif user_url.startswith('www.zhihu.com/people', user_url.index('//') + 2) == False: raise ValueError("\"" + user_url + "\"" + " : it isn't a user url.") else: self.user_url = user_url @@ -336,8 +530,15 @@ def __init__(self, user_url, user_id=None): self.user_id = user_id def parser(self): - r = requests.get(self.user_url) - soup = BeautifulSoup(r.content) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.user_url, headers=headers, verify=False) + soup = BeautifulSoup(r.content, "lxml") self.soup = soup def get_user_id(self): @@ -365,6 +566,70 @@ def get_user_id(self): else: return user_id + def get_head_img_url(self, scale=4): + """ + By liuwons (https://github.com/liuwons) + 增加获取知乎识用户的头像url + scale对应的头像尺寸: + 1 - 25×25 + 3 - 75×75 + 4 - 100×100 + 6 - 150×150 + 10 - 250×250 + """ + scale_list = [1, 3, 4, 6, 10] + scale_name = '0s0ml0t000b' + if self.user_url == None: + print "I'm anonymous user." + return None + else: + if scale not in scale_list: + print 'Illegal scale.' + return None + if self.soup == None: + self.parser() + soup = self.soup + url = soup.find("img", class_="Avatar Avatar--l")["src"] + return url[:-5] + scale_name[scale] + url[-4:] + + def get_data_id(self): + """ + By yannisxu (https://github.com/yannisxu) + 增加获取知乎 data-id 的方法来确定标识用户的唯一性 #24 + (https://github.com/egrcc/zhihu-python/pull/24) + """ + if self.user_url == None: + print "I'm anonymous user." + return 0 + else: + if self.soup == None: + self.parser() + soup = self.soup + data_id = soup.find("button", class_="zg-btn zg-btn-follow zm-rich-follow-btn")['data-id'] + return data_id + + def get_gender(self): + """ + By Mukosame (https://github.com/mukosame) + 增加获取知乎识用户的性别 + + """ + if self.user_url == None: + print "I'm anonymous user." + return 'unknown' + else: + if self.soup == None: + self.parser() + soup = self.soup + try: + gender = str(soup.find("span",class_="item gender").i) + if (gender == ''): + return 'female' + else: + return 'male' + except: + return 'unknown' + def get_followees_num(self): if self.user_url == None: print "I'm anonymous user." @@ -389,6 +654,22 @@ def get_followers_num(self): .find_all("a")[1].strong.string) return followers_num + def get_topics_num(self): + if self.user_url == None: + print "I'm anonymous user." + return 0 + else: + if self.soup == None: + self.parser() + soup = self.soup + topics_num = soup.find_all("div", class_="zm-profile-side-section-title")[-1].strong.string.encode("utf-8") + I='' + for i in topics_num: + if i.isdigit(): + I=I+i + topics_num=int(I) + return topics_num + def get_agree_num(self): if self.user_url == None: print "I'm anonymous user." @@ -456,9 +737,16 @@ def get_followees(self): yield else: followee_url = self.user_url + "/followees" - r = requests.get(followee_url) - - soup = BeautifulSoup(r.content) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(followee_url, headers=headers, verify=False) + + soup = BeautifulSoup(r.content, "lxml") for i in xrange((followees_num - 1) / 20 + 1): if i == 0: user_url_list = soup.find_all("h2", class_="zm-list-content-title") @@ -481,11 +769,11 @@ def get_followees(self): 'Referer': followee_url } - r_post = requests.post(post_url, data=data, headers=header) + r_post = requests.post(post_url, data=data, headers=header, verify=False) followee_list = r_post.json()["msg"] for j in xrange(min(followees_num - i * 20, 20)): - followee_soup = BeautifulSoup(followee_list[j]) + followee_soup = BeautifulSoup(followee_list[j], "lxml") user_link = followee_soup.find("h2", class_="zm-list-content-title").a yield User(user_link["href"], user_link.string.encode("utf-8")) @@ -501,9 +789,16 @@ def get_followers(self): yield else: follower_url = self.user_url + "/followers" - r = requests.get(follower_url) - - soup = BeautifulSoup(r.content) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(follower_url, headers=headers, verify=False) + + soup = BeautifulSoup(r.content, "lxml") for i in xrange((followers_num - 1) / 20 + 1): if i == 0: user_url_list = soup.find_all("h2", class_="zm-list-content-title") @@ -525,15 +820,69 @@ def get_followers(self): 'Host': "www.zhihu.com", 'Referer': follower_url } - r_post = requests.post(post_url, data=data, headers=header) + r_post = requests.post(post_url, data=data, headers=header, verify=False) follower_list = r_post.json()["msg"] for j in xrange(min(followers_num - i * 20, 20)): - follower_soup = BeautifulSoup(follower_list[j]) + follower_soup = BeautifulSoup(follower_list[j], "lxml") user_link = follower_soup.find("h2", class_="zm-list-content-title").a yield User(user_link["href"], user_link.string.encode("utf-8")) + def get_topics(self): + if self.user_url == None: + print "I'm anonymous user." + return + yield + else: + topics_num = self.get_topics_num() + # print topics_num + if topics_num == 0: + return + yield + else: + topics_url = self.user_url + "/topics" + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(topics_url, headers=headers, verify=False) + soup = BeautifulSoup(r.content, "lxml") + for i in xrange((topics_num - 1) / 20 + 1): + if i == 0: + topic_list = soup.find_all("div", class_="zm-profile-section-item zg-clear") + for j in xrange(min(topics_num, 20)): + yield topic_list[j].find("strong").string.encode("utf-8") + else: + post_url = topics_url + _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] + offset = i * 20 + data = { + '_xsrf': _xsrf, + 'offset': offset, + 'start': 0 + } + header = { + 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", + 'Host': "www.zhihu.com", + 'Referer': topics_url + } + r_post = requests.post(post_url, data=data, headers=header, verify=False) + + topic_data = r_post.json()["msg"][1] + topic_soup = BeautifulSoup(topic_data, "lxml") + topic_list = topic_soup.find_all("div", class_="zm-profile-section-item zg-clear") + for j in xrange(min(topics_num - i * 20, 20)): + yield topic_list[j].find("strong").string.encode("utf-8") + def get_asks(self): + """ + By ecsys (https://github.com/ecsys) + 增加了获取某用户所有赞过答案的功能 #29 + (https://github.com/egrcc/zhihu-python/pull/29) + """ if self.user_url == None: print "I'm anonymous user." return @@ -546,9 +895,16 @@ def get_asks(self): else: for i in xrange((asks_num - 1) / 20 + 1): ask_url = self.user_url + "/asks?page=" + str(i + 1) - r = requests.get(ask_url) - - soup = BeautifulSoup(r.content) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(ask_url, headers=headers, verify=False) + + soup = BeautifulSoup(r.content, "lxml") for question in soup.find_all("a", class_="question_link"): url = "http://www.zhihu.com" + question["href"] title = question.string.encode("utf-8") @@ -567,8 +923,15 @@ def get_answers(self): else: for i in xrange((answers_num - 1) / 20 + 1): answer_url = self.user_url + "/answers?page=" + str(i + 1) - r = requests.get(answer_url) - soup = BeautifulSoup(r.content) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(answer_url, headers=headers, verify=False) + soup = BeautifulSoup(r.content, "lxml") for answer in soup.find_all("a", class_="question_link"): question_url = "http://www.zhihu.com" + answer["href"][0:18] question_title = answer.string.encode("utf-8") @@ -588,10 +951,16 @@ def get_collections(self): else: for i in xrange((collections_num - 1) / 20 + 1): collection_url = self.user_url + "/collections?page=" + str(i + 1) - - r = requests.get(collection_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(collection_url, headers=headers, verify=False) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") for collection in soup.find_all("div", class_="zm-profile-section-item zg-clear"): url = "http://www.zhihu.com" + \ collection.find("a", class_="zm-profile-fav-item-title")["href"] @@ -599,6 +968,69 @@ def get_collections(self): yield Collection(url, name, self) + def get_likes(self): + # This function only handles liked answers, not including zhuanlan articles + if self.user_url == None: + print "I'm an anonymous user." + return + yield + else: + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.user_url, headers=headers, verify=False) + soup = BeautifulSoup(r.content, "lxml") + # Handle the first liked item + first_item = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'}) + first_item = first_item.find("div", attrs={'class':'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main'}) + if u"赞同了回答" in str(first_item): + first_like = first_item.find("a")['href'] + yield Answer("http://www.zhihu.com" + first_like) + # Handle the rest liked items + post_url = self.user_url + "/activities" + start_time = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'})["data-time"] + _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] + data = { + 'start': start_time, + '_xsrf': _xsrf, + } + header = { + 'Host': "www.zhihu.com", + 'Referer': self.user_url, + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", + } + r = requests.post(post_url, data=data, headers=header, verify=False) + response_size = r.json()["msg"][0] + response_html = r.json()["msg"][1] + while response_size > 0: + all_liked_answers = re.findall(u"\u8d5e\u540c\u4e86\u56de\u7b54\n\n