From bec392089f23de98d9b6d94c1fd0a595d46e7937 Mon Sep 17 00:00:00 2001 From: Yanghangfeng <1437183653@qq.com> Date: Sun, 20 Nov 2016 14:07:13 +0800 Subject: [PATCH 01/65] update readme.md --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3fd8452..218cad7 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,19 @@ -# 功能介绍 +# PythonCrawler: 用python编写的简单爬虫项目集合 +``` + ( + )\ ) ) ) ( ( +(()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( +(_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ +| _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) +| _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| +|_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + by yanghangfeng +``` + + +# 模块介绍 ##### 1. [baiduImg.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg.py): 抓取百度的‘高清摄影’图片 From e468b41b7d59992b407a9bdccb1061052a00b770 Mon Sep 17 00:00:00 2001 From: Yanghangfeng <1437183653@qq.com> Date: Wed, 23 Nov 2016 17:46:13 +0800 Subject: [PATCH 02/65] update file --- ECUT_pos_html.py | 57 ------- README.md | 82 ++++++++-- baiduImg.py | 53 ------- baiduImg2.py | 48 ------ spiderAPI/__init__.py | 0 spiderAPI/baidumap.py | 30 ++++ spiderAPI/dianping.py | 87 +++++++++++ spiderAPI/github.py | 67 +++++++++ spiderAPI/lagou.py | 13 ++ spiderAPI/proxyip.py | 50 +++++++ .../ECUT_get_grade.py | 17 ++- JDSpider.py => spiderFile/JD_spider.py | 84 +++++------ spiderFile/baidu_sy_img.py | 55 +++++++ spiderFile/baidu_wm_img.py | 51 +++++++ GetPhotos2.py => spiderFile/get_photos.py | 49 +++--- .../get_web_all_img.py | 140 +++++++++--------- githubHot.py => spiderFile/github_hot.py | 1 + .../lagou_position_spider.py | 32 ++-- one_img.py => spiderFile/one_img.py | 39 +++-- student_img.py => spiderFile/student_img.py | 55 +++---- .../xz_picture_spider.py | 6 +- 21 files changed, 641 insertions(+), 375 deletions(-) delete mode 100644 ECUT_pos_html.py delete mode 100644 baiduImg.py delete mode 100644 baiduImg2.py create mode 100644 spiderAPI/__init__.py create mode 100644 spiderAPI/baidumap.py create mode 100644 spiderAPI/dianping.py create mode 100644 spiderAPI/github.py create mode 100644 spiderAPI/lagou.py create mode 100644 spiderAPI/proxyip.py rename ECUT_get_grade.py => spiderFile/ECUT_get_grade.py (90%) rename JDSpider.py => spiderFile/JD_spider.py (93%) create mode 100644 spiderFile/baidu_sy_img.py create mode 100644 spiderFile/baidu_wm_img.py rename GetPhotos2.py => spiderFile/get_photos.py (90%) rename getWebAllImg.py => spiderFile/get_web_all_img.py (62%) rename githubHot.py => spiderFile/github_hot.py (99%) rename lagouPositionSpider.py => spiderFile/lagou_position_spider.py (97%) rename one_img.py => spiderFile/one_img.py (96%) rename student_img.py => spiderFile/student_img.py (87%) rename pictureSpider.py => spiderFile/xz_picture_spider.py (96%) diff --git a/ECUT_pos_html.py b/ECUT_pos_html.py deleted file mode 100644 index 470219c..0000000 --- a/ECUT_pos_html.py +++ /dev/null @@ -1,57 +0,0 @@ -import requests -import re -from bs4 import BeautifulSoup as bs - - -def crawl_all_main_url(page=10): - # 默认抓取官网前十页招聘信息的url - all_url_list = [] - for _ in range(1, page+1): - url = 'http://zjc.ecit.edu.cn/jy/app/newslist.php?BigClassName=%D5%D0%C6%B8%D0%C5%CF%A2&Page={0}'.format(_) - page_html = requests.get(url).text - x_url_reg = re.compile('(.*?)') - explain_text = re.findall(explain_text_reg, html)[0] - if ('时间' and '地点') in explain_text: - return True - else: - pass - -def save_html(): - all_url_list = crawl_all_main_url() - for son_url in all_url_list: - if get_title(son_url): - text_html = requests.get(son_url).content.decode('gbk') - domain_url = 'http://zjc.ecit.edu.cn/jy' - img_url_reg = re.compile('border=0 src="\.\.(.*?)"') - child_url = re.findall(img_url_reg, text_html) - if child_url != []: - img_url = domain_url + child_url[0] - re_url = 'src="..{0}"'.format(child_url[0]) - end_url = 'src="{0}"'.format(img_url) - end_html = text_html.replace(re_url, end_url) - soup = bs(end_html, 'lxml') - text_div = soup.find_all('div', id='main')[0] - with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: - text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) - file.write(text_html.encode('utf-8')) - else: - with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: - html = requests.get(son_url).content.decode('gbk') - soup = bs(text_html, 'lxml') - text_div = soup.find_all('div', id='main')[0] - text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) - file.write(text_html.encode('utf-8')) - else: - continue - -if __name__ == '__main__': - save_html() diff --git a/README.md b/README.md index 218cad7..7303b13 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PythonCrawler: 用python编写的简单爬虫项目集合 +# PythonCrawler: 用python编写的爬虫项目集合 ``` ( )\ ) ) ) ( ( @@ -9,30 +9,90 @@ | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| |__/ - by yanghangfeng + ——————by yanghangfeng ``` -# 模块介绍 +# spiderFile模块简介 -##### 1. [baiduImg.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg.py): 抓取百度的‘高清摄影’图片 +##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg.py): 抓取百度的‘高清摄影’图片 -##### 2. [baiduImg2.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg2.py): 抓取百度图片‘唯美意境’模块 +##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg2.py): 抓取百度图片‘唯美意境’模块 -##### 3. [GetPhotos2.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/GetPhotos2.py): 抓取百度贴吧某话题下的所有图片 +##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/GetPhotos2.py): 抓取百度贴吧某话题下的所有图片 -##### 4. [getWebAllImg.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/getWebAllImg.py): 抓取整个网站的图片 +##### 4. [get_web_all_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/getWebAllImg.py): 抓取整个网站的图片 -##### 5. [lagouPositionSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/lagouPositionSpider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件 +##### 5. [lagou_position_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/lagouPositionSpider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件 ##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照 -##### 7. [JDSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/JDSpider.py): 大批量抓取京东商品id和标签 +##### 7. [JD_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/JDSpider.py): 大批量抓取京东商品id和标签 ##### 8. [ECUT_pos_html.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 ##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩 -##### 10. [githubHot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/githubHot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 +##### 10. [github_hot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/githubHot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 -##### 11.[pictureSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/pictureSpider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/pictureSpider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +--- +# spiderAPI模块简介 +#### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 +##### 1.大众点评 +```python +from spiderAPI.dianping import * + +''' +citys = { + '北京': '2', '上海': '1', '广州': '4', '深圳': '7', '成都': '8', '重庆': '9', '杭州': '3', '南京': '5', '沈阳': '18', '苏州': '6', '天津': '10','武汉': '16', '西安': '17', '长沙': '344', '大连': '19', '济南': '22', '宁波': '11', '青岛': '21', '无锡': '13', '厦门': '15', '郑州': '160' +} + +ranktype = { + '最佳餐厅': 'score', '人气餐厅': 'popscore', '口味最佳': 'score1', '环境最佳': 'score2', '服务最佳': 'score3' +} +''' + +result=bestRestaurant(cityId=1, rankType='popscore')#获取人气餐厅 + +shoplist=dpindex(cityId=1, page=1)#商户风云榜 + +restaurantlist=restaurantList('http://www.dianping.com/search/category/2/10/p2')#获取餐厅 + +``` + +##### 2.获取代理IP +爬取http://proxy.ipcn.org,获取可用代理 +```python +from spiderAPI.proxyip import get_enableips + +enableips=get_enableips() + +``` + +##### 3.百度地图 +百度地图提供的API,对查询有一些限制,这里找出了web上查询的接口 +```python +from spiderAPI.baidumap import * + +citys=citys()#获取城市列表 +result=search(keyword="美食", citycode="257", page=1)#获取搜索结果 + +``` + +##### 4.模拟登录github +```python +from spiderAPI.github import GitHub + +github = GitHub() +github.login() # 这一步会提示你输入用户名和密码 +github.show_timeline() # 获取github主页时间线 +# 更多的功能有待你们自己去发掘 +``` + +##### 5.拉勾网 +```python +from spiderAPI.lagou import * + +lagou_spider(key='数据挖掘', page=1) # 获取关键字为数据挖掘的招聘信息 +``` diff --git a/baiduImg.py b/baiduImg.py deleted file mode 100644 index b8a2269..0000000 --- a/baiduImg.py +++ /dev/null @@ -1,53 +0,0 @@ -import requests -import re - -url = 'http://image.baidu.com/search/index' -headers = { - 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', - 'Accept-Encoding' : 'gzip, deflate', - 'Referer' : 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&fm=detail&lm=-1&st=-1&sf=2&fmq=&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&oq=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&rsp=-1', - 'Cookie' : 'HOSUPPORT=1; UBI=fi_PncwhpxZ%7ETaMMzY0i9qXJ9ATcu3rvxFIc-a7KI9byBcYk%7EjBVmPGIbL3LTKKJ2D17mh5VfJ5yjlCncAb2yhPI5sZM51Qo7tpCemygM0VNUzuTBJwYF8OYmi3nsCCzbpo5U9tLSzkZfcQ1rxUcJSzaipThg__; HISTORY=fec845b215cd8e8be424cf320de232722d0050; PTOKEN=ff58b208cc3c16596889e0a20833991d; STOKEN=1b1f4b028b5a4415aa1dd9794ff061d312ad2a822d52418f3f1ffabbc0ac6142; SAVEUSERID=0868a2b4c9d166dc85e605f0dfd153; USERNAMETYPE=3; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_18205_18559_17001_17073_15479_12166_18086_10634; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', - } -def get_html(url, headers): - data = { - 'cl' : '2', - 'ct' : '201326592', - 'face' : '0', - 'fp' : 'result', - 'gsm' : '200001e', - 'ic' : '0', - 'ie' : 'utf-8', - 'ipn' : 'rj', - 'istype' : '2', - 'lm' : '-1', - 'nc' : '1', - 'oe' : 'utf-8', - 'pn' : '30', - 'queryword' : '高清摄影', - 'rn' : '30', - 'st' : '-1', - 'tn' : 'resultjson_com', - 'word' : '高清摄影' - } - - page = requests.get(url, data, headers = headers).text - return page - -def get_img(page, headers): -# img_url_list = [] - reg = re.compile('http://.*?\.jpg') - imglist1 = re.findall(reg, page) - imglist2 = imglist1[0 : len(imglist1) : 3] -# [img_url_list.append(i) for i in imglist if not i in img_url_list] - x = 0 - for imgurl in imglist2: - bin = requests.get(imgurl, headers = headers).content - with open('E:/Pic2/%s.jpg' % x, 'wb') as file: - file.write(bin) - x += 1 - -if __name__ == '__main__': - page = get_html(url, headers) - get_img(page, headers) - diff --git a/baiduImg2.py b/baiduImg2.py deleted file mode 100644 index 35b2507..0000000 --- a/baiduImg2.py +++ /dev/null @@ -1,48 +0,0 @@ -import requests -import re - -url = 'http://image.baidu.com/search/index' -date = { - 'cl' : '2', - 'ct' : '201326592', - 'fp' : 'result', - 'gsm' : '1e', - 'ie' : 'utf-8', - 'ipn' : 'rj', - 'istype' : '2', - 'lm' : '-1', - 'nc' : '1', - 'oe' : 'utf-8', - 'pn' : '30', - 'queryword' : '唯美意境图片', - 'rn' : '30', - 'st' : '-1', - 'tn' : 'resultjson_com', - 'word' : '唯美意境图片' - } -headers = { - 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Accept' : 'text/plain, */*; q=0.01', - 'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', - 'Accept-Encoding' : 'gzip, deflate', - 'X-Requested-With' : 'XMLHttpRequest', - 'Referer' : 'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs3&word=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87&ofr=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1', - 'Cookie' : 'BDqhfp=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87%26%26NaN-1undefined-1undefined%26%260%26%261; Hm_lvt_737dbb498415dd39d8abf5bc2404b290=1455016371,1455712809,1455769605,1455772886; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_15479_12166_18086_10634; Hm_lpvt_737dbb498415dd39d8abf5bc2404b290=1455788775; firstShowTip=1; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', - 'Connection' : 'keep-alive' - } -def get_page(url, date, headers): - page = requests.get(url, date, headers = headers).text - return page - -def get_img(page, headers): - reg = re.compile('http://.*?\.jpg') - imglist = re.findall(reg, page)[::3] - x = 0 - for imgurl in imglist: - with open('E:/Pic/%s.jpg' % x, 'wb') as file: - file.write(requests.get(imgurl, headers = headers).content) - x += 1 - -if __name__ == '__main__': - page = get_page(url, date, headers) - get_img(page, headers) \ No newline at end of file diff --git a/spiderAPI/__init__.py b/spiderAPI/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spiderAPI/baidumap.py b/spiderAPI/baidumap.py new file mode 100644 index 0000000..1af5207 --- /dev/null +++ b/spiderAPI/baidumap.py @@ -0,0 +1,30 @@ +import requests +import json + +headers = { + 'Host': "map.baidu.com", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate", + "Accept-Language": "en-US,en;q=0.5", + "Connection": "keep-alive", + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} + + +def citys(): + html = requests.get( + 'http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=s&da_src=searchBox.button&wd=美食&c=1&src=0&wd2=&sug=0&l=5&b=(7002451.220000001,1994587.88;19470675.22,7343963.88)&from=webmap&biz_forward={%22scaler%22:1,%22styles%22:%22pl%22}&sug_forward=&tn=B_NORMAL_MAP&nn=0&u_loc=12736591.152491,3547888.166124&ie=utf-8&t=1459951988807', headers=headers).text + data = json.loads(html) + result = [] + for item in data['more_city']: + for city in item['city']: + result.append(city) + for item in data['content']: + result.append(item) + return result + + +def search(keyword, citycode, page): + html = requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=con&from=webmap&c=' + str(citycode) + '&wd=' + keyword + '&wd2=&pn=' + str( + page) + '&nn=' + str(page * 10) + '&db=0&sug=0&addr=0&&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&tn=B_NORMAL_MAP&u_loc=12736591.152491,3547888.166124&ie=utf-8', headers=headers).text + data = json.loads(html)['content'] + return data diff --git a/spiderAPI/dianping.py b/spiderAPI/dianping.py new file mode 100644 index 0000000..6ba43db --- /dev/null +++ b/spiderAPI/dianping.py @@ -0,0 +1,87 @@ +import requests +import json +import os +from bs4 import BeautifulSoup + +headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive'} + + +def bestRestaurant(cityId=1, rankType='popscore'): + html = requests.get('http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=%s&categoryId=0' % + (cityId, rankType), headers=headers).text + result = json.loads(html)['shopBeans'] + return result + + +def getCityId(): + citys = {'北京': '2', '上海': '1', '广州': '4', '深圳': '7', '成都': '8', '重庆': '9', '杭州': '3', '南京': '5', '沈阳': '18', '苏州': '6', '天津': '10', + '武汉': '16', '西安': '17', '长沙': '344', '大连': '19', '济南': '22', '宁波': '11', '青岛': '21', '无锡': '13', '厦门': '15', '郑州': '160'} + return citys + + +def getRankType(): + RankType = {'最佳餐厅': 'score', '人气餐厅': 'popscore', + '口味最佳': 'score1', '环境最佳': 'score2', '服务最佳': 'score3'} + return RankType + + +def dpindex(cityId=1, page=1): + url = 'http://dpindex.dianping.com/dpindex?region=&category=&type=rank&city=%s&p=%s' % ( + cityId, page) + html = requests.get(url, headers=headers).text + table = BeautifulSoup(html, 'lxml').find( + 'div', attrs={'class': 'idxmain-subcontainer'}).find_all('li') + result = [] + for item in table: + shop = {} + shop['name'] = item.find('div', attrs={'class': 'field-name'}).get_text() + shop['url'] = item.find('a').get('href') + shop['num'] = item.find('div', attrs={'class': 'field-num'}).get_text() + shop['addr'] = item.find('div', attrs={'class': 'field-addr'}).get_text() + shop['index'] = item.find('div', attrs={'class': 'field-index'}).get_text() + result.append(shop) + return result + + +def restaurantList(url): + html = requests.get(url, headers=headers, timeout=30).text.replace('\r', '').replace('\n', '') + table = BeautifulSoup(html, 'lxml').find('div', id='shop-all-list').find_all('li') + result = [] + for item in table: + shop = {} + soup = item.find('div', attrs={'class': 'txt'}) + tit = soup.find('div', attrs={'class': 'tit'}) + comment = soup.find('div', attrs={'class': 'comment'}) + tag_addr = soup.find('div', attrs={'class': 'tag-addr'}) + shop['name'] = tit.find('a').get_text() + shop['star'] = comment.find('span').get('title') + shop['review-num'] = comment.find('a', + attrs={'class': 'review-num'}).get_text().replace('条点评', '') + shop['mean-price'] = comment.find('a', attrs={'class': 'mean-price'}).get_text() + shop['type'] = tag_addr.find('span', attrs={'class': 'tag'}).get_text() + shop['addr'] = tag_addr.find('span', attrs={'class': 'addr'}).get_text() + try: + comment_list = soup.find('span', attrs={'class': 'comment-list'}).find_all('span') + except: + comment_list = [] + score = [] + for i in comment_list: + score.append(i.get_text()) + shop['score'] = score + tags = [] + try: + for i in tit.find('div', attrs={'class': 'promo-icon'}).find_all('a'): + try: + tags += i.get('class') + except: + tags.append(i.get('class')[0]) + except: + pass + shop['tags'] = tags + result.append(shop) + return result diff --git a/spiderAPI/github.py b/spiderAPI/github.py new file mode 100644 index 0000000..8a74b54 --- /dev/null +++ b/spiderAPI/github.py @@ -0,0 +1,67 @@ +import requests +from bs4 import BeautifulSoup +import json + + +headers = { + 'Host': "github.com", + 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive' +} + + +class GitHub(): + + def __init__(self): + self.session = requests.session() + self.timeline = [] + self.name = '' + self.user = '' + self.passwd = '' + + def login(self): + self.user = input('please input username:') + self.passwd = input('please input password:') + html = self.session.get('https://github.com/login', headers=headers).text + authenticity_token = BeautifulSoup(html, 'lxml').find( + 'input', {'name': 'authenticity_token'}).get('value') + data = { + 'commit': "Sign+in", + 'utf8': "✓", + 'login': self.user, + 'password': self.passwd, + 'authenticity_token': authenticity_token + } + html = self.session.post('https://github.com/session', data=data, headers=headers).text + self.name = BeautifulSoup(html, 'lxml').find( + 'strong', {'class': 'css-truncate-target'}).get_text() + + def get_timeline(self, page=1): + html = self.session.get( + 'https://github.com/dashboard/index/{page}?utf8=%E2%9C%93'.format(page=page), headers=headers).text + table = BeautifulSoup(html, 'lxml').find( + 'div', id='dashboard').find_all('div', {'class': 'alert'}) + for item in table: + line = {} + line['thing'] = item.find('div', {'class': 'title'}).get_text( + ).replace('\r', '').replace('\n', '') + line['time'] = item.find('relative-time').get('datetime') + self.timeline.append(line) + + def show_timeline(self): + keys = ['who', 'do', 'to', 'time'] + for line in self.timeline: + text = line['time'] + ' ' + line['thing'] + print('*' + text + ' ' * (80 - len(text) - 2) + '*') + print('*-*-*' * 16) + + def overview(self, user=None): + if user == None: + user = self.name + html = self.session.get('https://github.com/' + user, headers=headers).text + return overview + + diff --git a/spiderAPI/lagou.py b/spiderAPI/lagou.py new file mode 100644 index 0000000..52493a7 --- /dev/null +++ b/spiderAPI/lagou.py @@ -0,0 +1,13 @@ +import json +import requests as rq + + +def lagou_spider(key=None, page=None): + lagou_url = 'http://www.lagou.com/jobs/positionAjax.json?first=false&pn={0}&kd={1}' + lagou_python_data = [] + for i in range(page): + print('抓取第{0}页'.format(i + 1)) + lagou_url_ = lagou_url.format(i, key) + lagou_data = json.loads(rq.get(lagou_url_).text) + lagou_python_data.extend(lagou_data['content']['positionResult']['result']) + return lagou_python_data diff --git a/spiderAPI/proxyip.py b/spiderAPI/proxyip.py new file mode 100644 index 0000000..f7c3637 --- /dev/null +++ b/spiderAPI/proxyip.py @@ -0,0 +1,50 @@ +import requests +import threading +import re + + +enableips = [] + + +class IsEnable(threading.Thread): + + def __init__(self, ip): + super(IsEnable, self).__init__() + self.ip = ip + self.proxies = { + 'http': 'http://%s' % ip + } + + def run(self): + global enableips + try: + html = requests.get('http://httpbin.org/ip', proxies=self.proxies, timeout=5).text + result = eval(html)['origin'] + if result in self.ip: + enableips.append(self.ip) + except: + return False + + +def parser(url): + html = requests.get(url).text + ips = re.findall('\d+\.\d+\.\d+\.\d+:\d+', html) + return ips + + +def get_enableips(): + global enableips + urls = ['http://proxy.ipcn.org/proxya.html', 'http://proxy.ipcn.org/proxya2.html', + 'http://proxy.ipcn.org/proxyb.html', 'http://proxy.ipcn.org/proxyb2.html'] + for url in urls: + ips = parser(url) + threadings = [] + for ip in ips: + work = IsEnable(ip) + work.setDaemon(True) + threadings.append(work) + for work in threadings: + work.start() + for work in threadings: + work.join() + return enableips diff --git a/ECUT_get_grade.py b/spiderFile/ECUT_get_grade.py similarity index 90% rename from ECUT_get_grade.py rename to spiderFile/ECUT_get_grade.py index 5dd31cf..6d2ff1a 100644 --- a/ECUT_get_grade.py +++ b/spiderFile/ECUT_get_grade.py @@ -3,13 +3,14 @@ import numpy as np import pandas as pd + def warn(*args, **kw): pass import warnings warnings.warn = warn -print('*'*30 + '东华理工大学' + '*'*30) -print('*'*30 + '作者:杨航锋' + '*'*30) -print('*'*30 + '版本:v1.0' + '*'*30) +print('*' * 30 + '东华理工大学' + '*' * 30) +print('*' * 30 + '作者:杨航锋' + '*' * 30) +print('*' * 30 + '版本:v1.0' + '*' * 30) print('\n') print('请输你学号:') username = input() @@ -19,6 +20,7 @@ def warn(*args, **kw): pass login_url = 'https://cas.ecit.cn/index.jsp?service=http://portal.ecit.cn/Authentication' + def get_LT(login_url): html = requests.get(login_url, verify=False).text regex = re.compile('') @@ -86,12 +88,13 @@ def get_LT(login_url): grade_data_.to_csv('./grade_data.csv', index=False) print('成绩已保存在运行此程序的文件夹') elif select == '3': - xw_grade = grade_data_[(grade_data_['课程名'] == '*数学分析(I)') | (grade_data_['课程名'] == '高等代数(I)')|\ - (grade_data_['课程名'] == 'C语言程序设计基础') | (grade_data_['课程名'] == '大学英语(II)')|\ - (grade_data_['课程名'] == '*常微分方程') | (grade_data_['课程名'] == '*概率论')|\ + xw_grade = grade_data_[(grade_data_['课程名'] == '*数学分析(I)') | (grade_data_['课程名'] == '高等代数(I)') | + (grade_data_['课程名'] == 'C语言程序设计基础') | (grade_data_['课程名'] == '大学英语(II)') | + (grade_data_['课程名'] == '*常微分方程') | (grade_data_['课程名'] == '*概率论') | (grade_data_['课程名'] == '数据结构')] print(xw_grade) print('\n') - avg_grade = np.sum((xw_grade.学分.astype(float) * xw_grade.成绩.astype(float))) / np.sum(xw_grade.学分.astype(float)) + avg_grade = np.sum((xw_grade.学分.astype(float) * xw_grade.成绩.astype(float))) / \ + np.sum(xw_grade.学分.astype(float)) print('平均学分绩={0}'.format(avg_grade)) input('按任意键结束') diff --git a/JDSpider.py b/spiderFile/JD_spider.py similarity index 93% rename from JDSpider.py rename to spiderFile/JD_spider.py index 9b8c66d..9d697dd 100644 --- a/JDSpider.py +++ b/spiderFile/JD_spider.py @@ -1,42 +1,42 @@ -import requests -import re -import pandas as pd - -def get_data(): - jj_url1 = 'http://search.jd.com/s_new.php?keyword=%E5%AE%B6%E5%B1%85%E7%94%A8%E5%93%81&enc=utf-8&qrst=1&rt=1&stop=1&pt=1&vt=2&sttr=1&offset=6&page=' - jj_url2 = '&s=53&click=0' - bt_ = [] - _id = [] - url_list = [] - for i in range(1, 10000, 2): - jj_url = jj_url1 + str(i) + jj_url2 - url_list.append(jj_url) - html = requests.get(jj_url).content.decode('utf-8') - reg1 = re.compile('') - bt = re.findall(reg1, html) - id_ = re.findall(reg2, html) - bt_.extend(bt) - _id.extend(id_) - return bt_, _id - -def split_str(_id): - zid = [] - for _ in _id: - zid.append(_.split('_')[2]) - return zid - -def save_data(zid, bt_): - data = pd.DataFrame({ - '标题': bt_, - 'ID': zid - }) - data.to_excel('./家居用品.xlsx', index=False) - -def start_main(): - bt_, _id = get_data() - zid = split_str(_id) - save_data(zid, bt_) - -if __name__ == '__main__': - start_main() +import requests +import re +import pandas as pd + +def get_data(): + jj_url1 = 'http://search.jd.com/s_new.php?keyword=%E5%AE%B6%E5%B1%85%E7%94%A8%E5%93%81&enc=utf-8&qrst=1&rt=1&stop=1&pt=1&vt=2&sttr=1&offset=6&page=' + jj_url2 = '&s=53&click=0' + bt_ = [] + _id = [] + url_list = [] + for i in range(1, 10, 2): + jj_url = jj_url1 + str(i) + jj_url2 + url_list.append(jj_url) + html = requests.get(jj_url).content.decode('utf-8') + reg1 = re.compile('') + bt = re.findall(reg1, html) + id_ = re.findall(reg2, html) + bt_.extend(bt) + _id.extend(id_) + return bt_, _id + +def split_str(_id): + zid = [] + for _ in _id: + zid.append(_.split('_')[2]) + return zid + +def save_data(zid, bt_): + data = pd.DataFrame({ + '标题': bt_, + 'ID': zid + }) + data.to_excel('./家居用品.xlsx', index=False) + +def start_main(): + bt_, _id = get_data() + zid = split_str(_id) + save_data(zid, bt_) + +if __name__ == '__main__': + start_main() diff --git a/spiderFile/baidu_sy_img.py b/spiderFile/baidu_sy_img.py new file mode 100644 index 0000000..faaf6e2 --- /dev/null +++ b/spiderFile/baidu_sy_img.py @@ -0,0 +1,55 @@ +import requests +import re + +url = 'http://image.baidu.com/search/index' +headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate', + 'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&fm=detail&lm=-1&st=-1&sf=2&fmq=&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&oq=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&rsp=-1', + 'Cookie': 'HOSUPPORT=1; UBI=fi_PncwhpxZ%7ETaMMzY0i9qXJ9ATcu3rvxFIc-a7KI9byBcYk%7EjBVmPGIbL3LTKKJ2D17mh5VfJ5yjlCncAb2yhPI5sZM51Qo7tpCemygM0VNUzuTBJwYF8OYmi3nsCCzbpo5U9tLSzkZfcQ1rxUcJSzaipThg__; HISTORY=fec845b215cd8e8be424cf320de232722d0050; PTOKEN=ff58b208cc3c16596889e0a20833991d; STOKEN=1b1f4b028b5a4415aa1dd9794ff061d312ad2a822d52418f3f1ffabbc0ac6142; SAVEUSERID=0868a2b4c9d166dc85e605f0dfd153; USERNAMETYPE=3; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_18205_18559_17001_17073_15479_12166_18086_10634; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', +} + + +def get_html(url, headers): + data = { + 'cl': '2', + 'ct': '201326592', + 'face': '0', + 'fp': 'result', + 'gsm': '200001e', + 'ic': '0', + 'ie': 'utf-8', + 'ipn': 'rj', + 'istype': '2', + 'lm': '-1', + 'nc': '1', + 'oe': 'utf-8', + 'pn': '30', + 'queryword': '高清摄影', + 'rn': '30', + 'st': '-1', + 'tn': 'resultjson_com', + 'word': '高清摄影' + } + + page = requests.get(url, data, headers=headers).text + return page + + +def get_img(page, headers): + # img_url_list = [] + reg = re.compile('http://.*?\.jpg') + imglist1 = re.findall(reg, page) + imglist2 = imglist1[0: len(imglist1): 3] +# [img_url_list.append(i) for i in imglist if not i in img_url_list] + x = 0 + for imgurl in imglist2: + bin = requests.get(imgurl, headers=headers).content + with open('E:/Pic2/%s.jpg' % x, 'wb') as file: + file.write(bin) + x += 1 + +if __name__ == '__main__': + page = get_html(url, headers) + get_img(page, headers) diff --git a/spiderFile/baidu_wm_img.py b/spiderFile/baidu_wm_img.py new file mode 100644 index 0000000..542c782 --- /dev/null +++ b/spiderFile/baidu_wm_img.py @@ -0,0 +1,51 @@ +import requests +import re + +url = 'http://image.baidu.com/search/index' +date = { + 'cl': '2', + 'ct': '201326592', + 'fp': 'result', + 'gsm': '1e', + 'ie': 'utf-8', + 'ipn': 'rj', + 'istype': '2', + 'lm': '-1', + 'nc': '1', + 'oe': 'utf-8', + 'pn': '30', + 'queryword': '唯美意境图片', + 'rn': '30', + 'st': '-1', + 'tn': 'resultjson_com', + 'word': '唯美意境图片' +} +headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept': 'text/plain, */*; q=0.01', + 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': 'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs3&word=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87&ofr=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1', + 'Cookie': 'BDqhfp=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87%26%26NaN-1undefined-1undefined%26%260%26%261; Hm_lvt_737dbb498415dd39d8abf5bc2404b290=1455016371,1455712809,1455769605,1455772886; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_15479_12166_18086_10634; Hm_lpvt_737dbb498415dd39d8abf5bc2404b290=1455788775; firstShowTip=1; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', + 'Connection': 'keep-alive' +} + + +def get_page(url, date, headers): + page = requests.get(url, date, headers=headers).text + return page + + +def get_img(page, headers): + reg = re.compile('http://.*?\.jpg') + imglist = re.findall(reg, page)[::3] + x = 0 + for imgurl in imglist: + with open('E:/Pic/%s.jpg' % x, 'wb') as file: + file.write(requests.get(imgurl, headers=headers).content) + x += 1 + +if __name__ == '__main__': + page = get_page(url, date, headers) + get_img(page, headers) diff --git a/GetPhotos2.py b/spiderFile/get_photos.py similarity index 90% rename from GetPhotos2.py rename to spiderFile/get_photos.py index 24927c4..eae1725 100644 --- a/GetPhotos2.py +++ b/spiderFile/get_photos.py @@ -1,24 +1,25 @@ -import requests -from bs4 import BeautifulSoup - -url = 'http://tieba.baidu.com/p/4178314700' - -def GetHtml(url): - html = requests.get(url).text - return html - -def GetImg(html): - soup = BeautifulSoup(html, 'html.parser') - imglist = [] - for photourl in soup.find_all('img'): - imglist.append(photourl.get('src')) - x = 0 - for imgurl in imglist: - with open('E:/Pic/%s.jpg' % x, 'wb') as file: - file.write(requests.get(imgurl).content) - x += 1 - -if __name__ == '__main__': - html = GetHtml(url) - GetImg(html) - \ No newline at end of file +import requests +from bs4 import BeautifulSoup + +url = 'http://tieba.baidu.com/p/4178314700' + + +def GetHtml(url): + html = requests.get(url).text + return html + + +def GetImg(html): + soup = BeautifulSoup(html, 'html.parser') + imglist = [] + for photourl in soup.find_all('img'): + imglist.append(photourl.get('src')) + x = 0 + for imgurl in imglist: + with open('E:/Pic/%s.jpg' % x, 'wb') as file: + file.write(requests.get(imgurl).content) + x += 1 + +if __name__ == '__main__': + html = GetHtml(url) + GetImg(html) diff --git a/getWebAllImg.py b/spiderFile/get_web_all_img.py similarity index 62% rename from getWebAllImg.py rename to spiderFile/get_web_all_img.py index 56306d9..616befe 100644 --- a/getWebAllImg.py +++ b/spiderFile/get_web_all_img.py @@ -1,67 +1,73 @@ -import re -import time -import requests - -def get_html(url, headers): - html = requests.get(url, timeout = 100, headers = headers).text - return html - -def get_main_url(html): - reg = re.compile('http://.*?\.jpg') - main_imglist = re.findall(reg, html) - return main_imglist - -def get_son_url(html): - initurl = 'http://www.woyaogexing.com' - reg = re.compile('/tupian/weimei/\d+/\d+\.html') - son_urllist_init = re.findall(reg, html) - son_urlist = set(son_urllist_init) - son_url_final = [] - for son_url in son_urlist: - son_url_final.append(initurl + son_url) - return son_url_final #结果是所有含有图片的网页地址 - -def get_all_sonurl(son_url_final, headers): - son_imglist = [] - for sonurl in son_url_final: - son_html = requests.get(sonurl, timeout = 100, headers = headers).text - son_reg = re.compile('http://.*?\.jpg') - son_imglist1 = re.findall(son_reg, son_html) - for temp in son_imglist1: - son_imglist.append(temp) - return son_imglist #结果是所有子网页图片的地址 - -def get_all_img(main_imglist, son_imglist, headers): - global x #使用全局变量使每次的变量不清除,这个问题有待完美解决! - for imgurl in main_imglist: - son_imglist.append(imgurl) - for imgurl in son_imglist: - with open('E:/Pic2/%s.jpg' % x, 'wb') as file: - file.write(requests.get(imgurl, timeout = 100, headers = headers).content) - time.sleep(0.1) - x += 1 - -def turn_page(): - page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] - for i in range(1, 7): - page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') - return page_list - -if __name__ == '__main__': - headers = { - 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Accept' : 'text/plain, */*; q=0.01', - 'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', - 'Accept-Encoding' : 'gzip, deflate', - 'Cookie' : 'bdshare_firstime=1456041345958; Hm_lvt_a077b6b44aeefe3829d03416d9cb4ec3=1456041346; Hm_lpvt_a077b6b44aeefe3829d03416d9cb4ec3=1456048504', - } - x = 0 - page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] - for i in range(2, 20): - page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') - for p in range(6): - html = get_html(page_list[p], headers) - main_imglist = get_main_url(html) - son_url_final = get_son_url(html) - son_imglist = get_all_sonurl(son_url_final, headers) - get_all_img(main_imglist, son_imglist, headers) \ No newline at end of file +import re +import time +import requests + + +def get_html(url, headers): + html = requests.get(url, timeout=100, headers=headers).text + return html + + +def get_main_url(html): + reg = re.compile('http://.*?\.jpg') + main_imglist = re.findall(reg, html) + return main_imglist + + +def get_son_url(html): + initurl = 'http://www.woyaogexing.com' + reg = re.compile('/tupian/weimei/\d+/\d+\.html') + son_urllist_init = re.findall(reg, html) + son_urlist = set(son_urllist_init) + son_url_final = [] + for son_url in son_urlist: + son_url_final.append(initurl + son_url) + return son_url_final # 结果是所有含有图片的网页地址 + + +def get_all_sonurl(son_url_final, headers): + son_imglist = [] + for sonurl in son_url_final: + son_html = requests.get(sonurl, timeout=100, headers=headers).text + son_reg = re.compile('http://.*?\.jpg') + son_imglist1 = re.findall(son_reg, son_html) + for temp in son_imglist1: + son_imglist.append(temp) + return son_imglist # 结果是所有子网页图片的地址 + + +def get_all_img(main_imglist, son_imglist, headers): + global x # 使用全局变量使每次的变量不清除,这个问题有待完美解决! + for imgurl in main_imglist: + son_imglist.append(imgurl) + for imgurl in son_imglist: + with open('E:/Pic2/%s.jpg' % x, 'wb') as file: + file.write(requests.get(imgurl, timeout=100, headers=headers).content) + time.sleep(0.1) + x += 1 + + +def turn_page(): + page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] + for i in range(1, 7): + page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') + return page_list + +if __name__ == '__main__': + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept': 'text/plain, */*; q=0.01', + 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate', + 'Cookie': 'bdshare_firstime=1456041345958; Hm_lvt_a077b6b44aeefe3829d03416d9cb4ec3=1456041346; Hm_lpvt_a077b6b44aeefe3829d03416d9cb4ec3=1456048504', + } + x = 0 + page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] + for i in range(2, 20): + page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') + for p in range(6): + html = get_html(page_list[p], headers) + main_imglist = get_main_url(html) + son_url_final = get_son_url(html) + son_imglist = get_all_sonurl(son_url_final, headers) + get_all_img(main_imglist, son_imglist, headers) diff --git a/githubHot.py b/spiderFile/github_hot.py similarity index 99% rename from githubHot.py rename to spiderFile/github_hot.py index 7c1c8a3..ea9726e 100644 --- a/githubHot.py +++ b/spiderFile/github_hot.py @@ -3,6 +3,7 @@ import pandas as pd import numpy as np + def hot_github(keyword): url = 'https://github.com/trending/{0}'.format(keyword) main_url = 'https://github.com{0}' diff --git a/lagouPositionSpider.py b/spiderFile/lagou_position_spider.py similarity index 97% rename from lagouPositionSpider.py rename to spiderFile/lagou_position_spider.py index 7d83e35..cac4bdb 100644 --- a/lagouPositionSpider.py +++ b/spiderFile/lagou_position_spider.py @@ -1,16 +1,16 @@ -import json -import requests as rq -import pandas as pd - -kw = input('请输入抓取的职位名称:') -lagou_url = 'http://www.lagou.com/jobs/positionAjax.json?first=false&pn={0}&kd={1}' -lagou_python_data = [] -for i in range(1, 31): - print('抓取第{0}页'.format(i)) - lagou_url_ = lagou_url.format(i, kw) - lagou_data = json.loads(rq.get(lagou_url_).text) - lagou_python_data.extend(lagou_data['content']['positionResult']['result']) - -position_data = pd.DataFrame(lagou_python_data) -position_data.to_csv('./招聘{0}职位.csv'.format(kw), index=False) -print('数据已保存到本地文件') +import json +import requests as rq +import pandas as pd + +kw = input('请输入抓取的职位名称:') +lagou_url = 'http://www.lagou.com/jobs/positionAjax.json?first=false&pn={0}&kd={1}' +lagou_python_data = [] +for i in range(1, 31): + print('抓取第{0}页'.format(i)) + lagou_url_ = lagou_url.format(i, kw) + lagou_data = json.loads(rq.get(lagou_url_).text) + lagou_python_data.extend(lagou_data['content']['positionResult']['result']) + +position_data = pd.DataFrame(lagou_python_data) +position_data.to_csv('./招聘{0}职位.csv'.format(kw), index=False) +print('数据已保存到本地文件') diff --git a/one_img.py b/spiderFile/one_img.py similarity index 96% rename from one_img.py rename to spiderFile/one_img.py index 5d41d3c..4212314 100644 --- a/one_img.py +++ b/spiderFile/one_img.py @@ -1,21 +1,18 @@ -import re -import requests - -temp = 'http://caodan.org/page/' -count = 1 -for i in range(1, 1331): - url = temp + str(i) - page = requests.get(url).text - reg = re.compile('src="(http://.*?\.jpg)"') - img_url = re.findall(reg, page) - if img_url != []: - with open('E:/img/%s.jpg' % count, 'wb') as file: - img_data = requests.get(img_url[0]).content - file.write(img_data) - count += 1 - else: - continue -print('OK!') - - - \ No newline at end of file +import re +import requests + +temp = 'http://caodan.org/page/' +count = 1 +for i in range(1, 1331): + url = temp + str(i) + page = requests.get(url).text + reg = re.compile('src="(http://.*?\.jpg)"') + img_url = re.findall(reg, page) + if img_url != []: + with open('E:/img/%s.jpg' % count, 'wb') as file: + img_data = requests.get(img_url[0]).content + file.write(img_data) + count += 1 + else: + continue +print('OK!') diff --git a/student_img.py b/spiderFile/student_img.py similarity index 87% rename from student_img.py rename to spiderFile/student_img.py index a23c213..d3135ea 100644 --- a/student_img.py +++ b/spiderFile/student_img.py @@ -1,26 +1,29 @@ -import requests - -url = '' -banji = [] -zhuanye = [] -for a in range(10): - for b in range(10): - banji.append(str(a) + '0' + str(b)) -for c in range(10): - zhuanye.append('20' + str(c)) - -for year in range(2011, 2015): - for xh in zhuanye: - for nj in banji: - for i in range(1, 35): - if i < 10: - xuehao = str(year) + str(xh) + str(nj) + '0' + str(i) - student_url = url + xuehao - with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: - file.write(requests.get(student_url).content) - else: - xuehao = str(year) + str(xh) + str(nj) + str(i) - student_url = url + xuehao - with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: - file.write(requests.get(student_url).content) -print('OK!') +import requests + +""" +思路:去官网自己的主页,看自己的照片的url然后你懂的。 +""" +url = '' +banji = [] +zhuanye = [] +for a in range(10): + for b in range(10): + banji.append(str(a) + '0' + str(b)) +for c in range(10): + zhuanye.append('20' + str(c)) + +for year in range(2011, 2015): + for xh in zhuanye: + for nj in banji: + for i in range(1, 35): + if i < 10: + xuehao = str(year) + str(xh) + str(nj) + '0' + str(i) + student_url = url + xuehao + with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: + file.write(requests.get(student_url).content) + else: + xuehao = str(year) + str(xh) + str(nj) + str(i) + student_url = url + xuehao + with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: + file.write(requests.get(student_url).content) +print('OK!') diff --git a/pictureSpider.py b/spiderFile/xz_picture_spider.py similarity index 96% rename from pictureSpider.py rename to spiderFile/xz_picture_spider.py index 0e0abca..8e87e6d 100644 --- a/pictureSpider.py +++ b/spiderFile/xz_picture_spider.py @@ -8,10 +8,10 @@ def Spidermain(page=11): 本爬虫的爬取策略为深度优先(DFS) ''' main_url_ = 'http://www.rosiok.com/app/list_12_{0}.html' - for _ in range(1, page+1): + for _ in range(1, page + 1): main_url = main_url_.format(_) domain_url = 'http://www.rosiok.com{0}' - start_html = requests.get(main_url).content.decode('gb2312') + start_html = requests.get(main_url).content.decode('gb2312') kids_url_regex = re.compile('') kids_url = [domain_url.format(i) for i in re.findall(kids_url_regex, start_html)] for kid_url in kids_url: @@ -39,6 +39,6 @@ def Spidermain(page=11): file.write(s.get(pic_url, timeout=5).content) except: pass - + if __name__ == '__main__': Spidermain() From 9810dfab52d0e4f0064fa47b56dcc52d860d3507 Mon Sep 17 00:00:00 2001 From: Yanghangfeng <1437183653@qq.com> Date: Wed, 23 Nov 2016 18:11:21 +0800 Subject: [PATCH 03/65] update readme.md --- README.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 7303b13..50ec563 100644 --- a/README.md +++ b/README.md @@ -9,35 +9,36 @@ | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| |__/ - ——————by yanghangfeng + —————— by yanghangfeng ``` -# spiderFile模块简介 +- # spiderFile模块简介 -##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg.py): 抓取百度的‘高清摄影’图片 +##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片 -##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg2.py): 抓取百度图片‘唯美意境’模块 +##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块 -##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/GetPhotos2.py): 抓取百度贴吧某话题下的所有图片 +##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片 -##### 4. [get_web_all_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/getWebAllImg.py): 抓取整个网站的图片 +##### 4. [get_web_all_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片 -##### 5. [lagou_position_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/lagouPositionSpider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件 +##### 5. [lagou_position_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件 -##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照 +##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照 -##### 7. [JD_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/JDSpider.py): 大批量抓取京东商品id和标签 +##### 7. [JD_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签 -##### 8. [ECUT_pos_html.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 +##### 8. [ECUT_pos_html.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 -##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩 +##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩 -##### 10. [github_hot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/githubHot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 +##### 10. [github_hot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 -##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/pictureSpider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +##### 12.[one_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片 --- -# spiderAPI模块简介 +- # spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 ##### 1.大众点评 ```python From 13b07000e0e7e690971de8bdecf44087f63b3880 Mon Sep 17 00:00:00 2001 From: Yanghangfeng <1437183653@qq.com> Date: Wed, 23 Nov 2016 18:18:02 +0800 Subject: [PATCH 04/65] update readme.md --- spiderFile/ECUT_pos_html.py | 50 +++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 spiderFile/ECUT_pos_html.py diff --git a/spiderFile/ECUT_pos_html.py b/spiderFile/ECUT_pos_html.py new file mode 100644 index 0000000..93b8ae1 --- /dev/null +++ b/spiderFile/ECUT_pos_html.py @@ -0,0 +1,50 @@ +import requests +import re from bs4 +import BeautifulSoup as bs + + +def crawl_all_main_url(page=10): + # 默认抓取官网前十页招聘信息的url + all_url_list = [] + for _ in range(1, page+1): + url = 'http://zjc.ecit.edu.cn/jy/app/newslist.php?BigClassName=%D5%D0%C6%B8%D0%C5%CF%A2&Page={0}'.format(_) + page_html = requests.get(url).text + x_url_reg = re.compile('(.*?)') + explain_text = re.findall(explain_text_reg, html)[0] + if ('时间' and '地点') in explain_text: + return True + else: pass + def save_html(): + all_url_list = crawl_all_main_url() + for son_url in all_url_list: + if get_title(son_url): + text_html = requests.get(son_url).content.decode('gbk') + domain_url = 'http://zjc.ecit.edu.cn/jy' + img_url_reg = re.compile('border=0 src="\.\.(.*?)"') + child_url = re.findall(img_url_reg, text_html) + if child_url != []: + img_url = domain_url + child_url[0] + re_url = 'src="..{0}"'.format(child_url[0]) + end_url = 'src="{0}"'.format(img_url) + end_html = text_html.replace(re_url, end_url) + soup = bs(end_html, 'lxml') + text_div = soup.find_all('div', id='main')[0] + with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: + text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) file.write(text_html.encode('utf-8')) + else: + with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: + html = requests.get(son_url).content.decode('gbk') + soup = bs(text_html, 'lxml') + text_div = soup.find_all('div', id='main')[0] + text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) + file.write(text_html.encode('utf-8')) + else: continue +if __name__ == '__main__': +save_html() From 34742ff14e188b8f05a1334d9a3c5df7b9ed9ec9 Mon Sep 17 00:00:00 2001 From: Yanghangfeng <1437183653@qq.com> Date: Wed, 23 Nov 2016 21:14:28 +0800 Subject: [PATCH 05/65] update reademe --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 50ec563..cb231f1 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ``` -- # spiderFile模块简介 +# spiderFile模块简介 ##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片 @@ -38,7 +38,7 @@ ##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 ##### 12.[one_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片 --- -- # spiderAPI模块简介 +# spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 ##### 1.大众点评 ```python From 42fe51d92615c34ad1ab2f95dc8d2ba84f385a8b Mon Sep 17 00:00:00 2001 From: yanghangfeng <1437183653@qq.com> Date: Fri, 30 Dec 2016 16:21:21 +0800 Subject: [PATCH 06/65] add get_baike.py --- spiderFile/get_baike.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 spiderFile/get_baike.py diff --git a/spiderFile/get_baike.py b/spiderFile/get_baike.py new file mode 100644 index 0000000..eb64fcd --- /dev/null +++ b/spiderFile/get_baike.py @@ -0,0 +1,20 @@ +import re +import requests as rq + +def get_baidubaike(): + + keyword = input('please input wordkey:') + url = 'http://baike.baidu.com/item/{}'.format(keyword) + html = rq.get(url).content.decode('utf-8') + + regex = re.compile('content="(.*?)">') + words = re.findall(regex, html)[0] + return words + +if __name__ == '__main__': + words = get_baidubaike() + print(words) + + + + From 6e98a4854199477e046338e400969d15f5d74507 Mon Sep 17 00:00:00 2001 From: yanghangfeng <1437183653@qq.com> Date: Fri, 30 Dec 2016 16:27:58 +0800 Subject: [PATCH 07/65] update readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cb231f1..63ba3ac 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,9 @@ ##### 10. [github_hot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 ##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 -##### 12.[one_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片 +##### 12.[one_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 +##### 13.[get_baike.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 + --- # spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 From c18ef4c17bb99c2a0fa052b843d76bad75622893 Mon Sep 17 00:00:00 2001 From: yanghangfeng <1437183653@qq.com> Date: Fri, 30 Dec 2016 20:02:28 +0800 Subject: [PATCH 08/65] update one_img.py --- spiderFile/one_img.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/spiderFile/one_img.py b/spiderFile/one_img.py index 4212314..1e206fb 100644 --- a/spiderFile/one_img.py +++ b/spiderFile/one_img.py @@ -1,18 +1,37 @@ import re import requests -temp = 'http://caodan.org/page/' +temp = 'http://wufazhuce.com/one/' count = 1 -for i in range(1, 1331): +for i in range(14, 1580): url = temp + str(i) page = requests.get(url).text reg = re.compile('src="(http://.*?\.jpg)"') img_url = re.findall(reg, page) if img_url != []: - with open('E:/img/%s.jpg' % count, 'wb') as file: - img_data = requests.get(img_url[0]).content - file.write(img_data) - count += 1 - else: - continue + with open('./{}.jpg'.format(count), 'wb') as file: + try: + img_data = requests.get(img_url[0]).content + file.write(img_data) + count += 1 + except: + pass print('OK!') + + + + + + + + + + + + + + + + + + From 2191011b1541e9a600b535b8ab4f14723ab02825 Mon Sep 17 00:00:00 2001 From: Yanghangfeng <1437183653@qq.com> Date: Thu, 23 Feb 2017 12:20:19 +0800 Subject: [PATCH 09/65] add kantuSpider.py --- spiderFile/kantuSpider.py | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 spiderFile/kantuSpider.py diff --git a/spiderFile/kantuSpider.py b/spiderFile/kantuSpider.py new file mode 100644 index 0000000..13cb76e --- /dev/null +++ b/spiderFile/kantuSpider.py @@ -0,0 +1,46 @@ +import re +import os +import time + +import requests as rq + + +def get_all_page(page): + url = 'http://52kantu.cn/?page={}'.format(page) + html = rq.get(url).text + + return html + + +def get_img_url(html): + regex = re.compile(' Date: Thu, 23 Feb 2017 13:19:25 +0800 Subject: [PATCH 10/65] update readme.md --- README.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 63ba3ac..83265bc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PythonCrawler: 用python编写的爬虫项目集合 +# PythonCrawler: 用python编写的爬虫项目集合 ``` ( )\ ) ) ) ( ( @@ -15,30 +15,34 @@ # spiderFile模块简介 -##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片 +##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 ##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块 -##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片 +##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 -##### 4. [get_web_all_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片 +##### 4. [get_web_all_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片。 -##### 5. [lagou_position_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件 +##### 5. [lagou_position_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。 -##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照 +##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照。 -##### 7. [JD_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签 +##### 7. [JD_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签。 ##### 8. [ECUT_pos_html.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 -##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩 +##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩。 ##### 10. [github_hot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 ##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 + ##### 12.[one_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 + ##### 13.[get_baike.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 +##### 14.[kantuSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 + --- # spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 From 79f503230907489092f8bbbdba9d02249c314e32 Mon Sep 17 00:00:00 2001 From: Yanghangfeng <1437183653@qq.com> Date: Tue, 28 Feb 2017 12:22:25 +0800 Subject: [PATCH 11/65] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 83265bc..98b807f 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 -##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块 +##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块。 ##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 From e3e7e2924b4573c97f3eeedf454a935e9fc7ffbf Mon Sep 17 00:00:00 2001 From: bigablecat Date: Tue, 14 Mar 2017 13:59:56 +0800 Subject: [PATCH 12/65] =?UTF-8?q?#=E5=9B=BE=E7=89=87=E8=B7=AF=E5=BE=84?= =?UTF-8?q?=E8=A7=84=E5=88=99=E5=8F=91=E7=94=9F=E4=BA=86=E5=8F=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ''是现有的规则 --- spiderFile/one_img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiderFile/one_img.py b/spiderFile/one_img.py index 1e206fb..4687fee 100644 --- a/spiderFile/one_img.py +++ b/spiderFile/one_img.py @@ -6,7 +6,7 @@ for i in range(14, 1580): url = temp + str(i) page = requests.get(url).text - reg = re.compile('src="(http://.*?\.jpg)"') + reg = re.compile('') img_url = re.findall(reg, page) if img_url != []: with open('./{}.jpg'.format(count), 'wb') as file: From 602bb329c108261d21b739f9aaa544f747338f4f Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Sat, 19 May 2018 19:31:12 +0800 Subject: [PATCH 13/65] update README.md --- README.md | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 98b807f..96ae3b6 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,28 @@ -# PythonCrawler: 用python编写的爬虫项目集合 ``` - ( - )\ ) ) ) ( ( -(()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( -(_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ -| _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) -| _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| -|_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` +#

PythonCrawler: 用 python编写的爬虫项目集合

+

+ + + + + + + + + +

+ # spiderFile模块简介 From d4b0cb1aa3ce85ac7be301a297c878c04cd0998c Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Sat, 19 May 2018 19:32:47 +0800 Subject: [PATCH 14/65] update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 96ae3b6..bafbfe5 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ``` - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 python编写的爬虫项目集合

From 3fb5c3ae29d685d7dc41f3357602b99a9b0b0537 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Sat, 19 May 2018 19:34:58 +0800 Subject: [PATCH 15/65] update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bafbfe5..3a391c3 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ``` - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 python编写的爬虫项目集合

From aa412528725d09ec9bd7ae79e530cc168b1046f2 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Sat, 19 May 2018 19:36:22 +0800 Subject: [PATCH 16/65] update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 3a391c3..c5d9826 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ``` - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 python编写的爬虫项目集合

From 9f0a1afb8bbfb891a6af02b5695bba3c33c0d056 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Tue, 22 May 2018 12:16:06 +0800 Subject: [PATCH 17/65] update readme.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c5d9826..c698629 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ restaurantlist=restaurantList('http://www.dianping.com/search/category/2/10/p2') ``` ##### 2.获取代理IP -爬取http://proxy.ipcn.org,获取可用代理 +爬取[代理IP](http://proxy.ipcn.org) ```python from spiderAPI.proxyip import get_enableips From 1f4ac1645d6f793eff4bcac47f66c8d8b835a213 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Thu, 24 May 2018 13:02:57 +0800 Subject: [PATCH 18/65] update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index c698629..bf4462e 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,12 @@ + + + + + +

From 2024a8e9083df330c55dddb5147fc172bb7dcd3f Mon Sep 17 00:00:00 2001 From: Hangfeng Yang Date: Fri, 3 Aug 2018 12:57:19 +0800 Subject: [PATCH 19/65] Update README.md --- README.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index bf4462e..e37e684 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ``` #

PythonCrawler: 用 python编写的爬虫项目集合

- + @@ -21,11 +21,11 @@ - - + + - - + +

@@ -33,33 +33,33 @@ # spiderFile模块简介 -##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 +##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 -##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块。 +##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块。 -##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 +##### 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 -##### 4. [get_web_all_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片。 +##### 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片。 -##### 5. [lagou_position_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。 +##### 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。 -##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照。 +##### 6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照。 -##### 7. [JD_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签。 +##### 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签。 -##### 8. [ECUT_pos_html.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 +##### 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 -##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩。 +##### 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩。 -##### 10. [github_hot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 +##### 10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 -##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +##### 11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 -##### 12.[one_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 +##### 12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 -##### 13.[get_baike.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 +##### 13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 -##### 14.[kantuSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 +##### 14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 --- # spiderAPI模块简介 From d4ff21c6661613194e5a17cfb71863003e608190 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Mon, 20 Aug 2018 10:59:10 +0800 Subject: [PATCH 20/65] =?UTF-8?q?=E6=B7=BB=E5=8A=A0fuckCTF.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 + spiderFile/fuckCTF.py | 123 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 spiderFile/fuckCTF.py diff --git a/README.md b/README.md index e37e684..e332982 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ ##### 14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 +##### 15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): 通过selenium模拟登入合天网站,自动修改原始密码。 + --- # spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py new file mode 100644 index 0000000..8e7f3a5 --- /dev/null +++ b/spiderFile/fuckCTF.py @@ -0,0 +1,123 @@ +import os +import random +from PIL import Image +from selenium import webdriver + + +class fuckCTF: + + def __init__(self, username, old_password): + self.url = "http://hetianlab.com/" + self.login_url = "http://hetianlab.com/loginLab.do" + self.username = username + self.old_password = old_password + self.new_password = (self.yield_new_password(), "111111")[0] + self.options = webdriver.FirefoxOptions() + self.options.add_argument("-headless") + self.browser = webdriver.Firefox(options=self.options) + print("init ok") + + def login_hetian(self): + self.browser.get(self.login_url) + self.browser.find_element_by_id("userEmail").clear() + self.browser.find_element_by_id("userEmail").send_keys(self.username) + self.browser.find_element_by_id("passwordIn").clear() + self.browser.find_element_by_id("passwordIn").send_keys(self.old_password) + self.browser.get_screenshot_as_file(self.username + '/' + "login.png") + self.browser.find_element_by_id("registButIn").click() + self.browser.get(self.url) + print("login_hetian running ok!") + + def get_personl_information_page(self): + grzx_btn = self.browser.find_element_by_xpath("/html/body/div[1]/div[1]/div/div/div[2]/ul/li[2]/a") + self.browser.execute_script("$(arguments[0]).click()", grzx_btn) + self.browser.get("http://hetianlab.com/getUserInfo.do") + print("get_personl_information_page running ok!") + + def get_password_setting_page(self): + mmsz_btn = self.browser.find_element_by_xpath("/html/body/div[2]/div/div[1]/ul/ul[3]/li[2]") + self.browser.execute_script("$(arguments[0]).click()", mmsz_btn) + self.browser.find_element_by_id("person").click() + self.browser.find_element_by_class_name("check") + print("get_password_setting_page running ok!") + + def yield_new_password(self): + strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") + return "".join(random.choices(strings, k=6)) + + def setting_password(self): + self.browser.find_element_by_id("oldpwd").clear() + self.browser.find_element_by_id("oldpwd").send_keys(self.old_password) + self.browser.find_element_by_id("newpwd").clear() + self.browser.find_element_by_id("newpwd").send_keys(self.new_password) + self.browser.find_element_by_id("quepwd").clear() + self.browser.find_element_by_id("quepwd").send_keys(self.new_password) + print("setting_password running ok!") + + def get_v_code(self): + status = self.browser.get_screenshot_as_file(self.username + '/' + "v_code.png") + if status: + img = Image.open(self.username + '/' + "v_code.png") + img.show() + self.v_code = input("请输入验证码: ") + self.browser.find_element_by_class_name("code").send_keys(self.v_code) + else: + raise("截屏失败!") + print("get_v_code running ok!") + + def submit_data(self): + self.browser.find_element_by_id("submitbtn").click() + self.browser.get_screenshot_as_file(self.username + '/' + "result.png") + self.browser.quit() + print("submit_data running ok!") + + def make_portfolio(self): + if not os.path.exists(self.username): + os.makedirs(self.username) + print("make_portfolio running ok!") + + def save_success_data(self): + with open("./username_and_password_data_successed.log", "a+") as fp: + fp.write( + "username" + ": {}".format(self.username) + "\t" + "password" + ": {}".format(self.new_password) + + "\n" + ) + print("save_success_data running ok!") + + def save_failed_data(self): + with open("./username_and_password_data_failed.log", "a+") as fp: + fp.write( + "username" + ": {}".format(self.username) + "\n" + ) + print("save_failed_data running ok!") + + def main(self): + try: + self.make_portfolio() + self.login_hetian() + self.get_personl_information_page() + self.get_password_setting_page() + self.setting_password() + self.get_v_code() + self.submit_data() + self.save_success_data() + except: + self.save_failed_data() + + +def yield_usernames(n): + prefix = "ctf2018_gzhu" + postfix = "@dh.com" + for num in range(1, n): + if num < 10: + infix = '0' + str(num) + else: + infix = str(num) + yield prefix + infix + postfix + + +if __name__ == "__main__": + for username in yield_usernames(100): + ctfer = fuckCTF(username, "111111") + ctfer.main() From a6590adc1e44f647bc02daff4ef299d6615577c6 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 20 Aug 2018 11:01:42 +0800 Subject: [PATCH 21/65] Update fuckCTF.py --- spiderFile/fuckCTF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 8e7f3a5..5e42979 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -119,5 +119,5 @@ def yield_usernames(n): if __name__ == "__main__": for username in yield_usernames(100): - ctfer = fuckCTF(username, "111111") + ctfer = fuckCTF(username, "******") ctfer.main() From 3e5a3e0200907a85137cb433c4049103663810c3 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 20 Aug 2018 15:19:08 +0800 Subject: [PATCH 22/65] Update fuckCTF.py --- spiderFile/fuckCTF.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 5e42979..17e4b64 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -1,3 +1,10 @@ + +""" +author: 杨航锋 +date:2018.8.19 +""" + + import os import random from PIL import Image From 6a9cf1f0f2723f007f71f98d6b9c35b8f8f51c4c Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 20 Aug 2018 18:57:56 +0800 Subject: [PATCH 23/65] Update fuckCTF.py --- spiderFile/fuckCTF.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 17e4b64..6142056 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -1,7 +1,8 @@ """ author: 杨航锋 -date:2018.8.19 +date : 2018.8.19 +mood : 嗯,比较无聊,甚至还有点想吃黄焖鸡米饭😋 """ From 3ee30111aa1551deea01b8bde85bc8e4b694150c Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 20 Aug 2018 19:56:58 +0800 Subject: [PATCH 24/65] Update fuckCTF.py --- spiderFile/fuckCTF.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 6142056..a89e269 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -8,6 +8,8 @@ import os import random +import functools + from PIL import Image from selenium import webdriver @@ -19,7 +21,7 @@ def __init__(self, username, old_password): self.login_url = "http://hetianlab.com/loginLab.do" self.username = username self.old_password = old_password - self.new_password = (self.yield_new_password(), "111111")[0] + self.new_password = (self.yield_new_password(), "******")[0] self.options = webdriver.FirefoxOptions() self.options.add_argument("-headless") self.browser = webdriver.Firefox(options=self.options) @@ -49,9 +51,10 @@ def get_password_setting_page(self): self.browser.find_element_by_class_name("check") print("get_password_setting_page running ok!") + @gen_decorator def yield_new_password(self): strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") - return "".join(random.choices(strings, k=6)) + yield "".join(random.choices(strings, k=6)) def setting_password(self): self.browser.find_element_by_id("oldpwd").clear() @@ -113,7 +116,20 @@ def main(self): except: self.save_failed_data() - + +def gen_decorator(gen): + @functools.wraps(gen) + def inner(*args, **kwargs): + return next(gen(*args, **kwargs)) + return inner + + +@gen_decorator +def yield_new_password(): + strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") + yield "".join(random.choices(strings, k=6)) + + def yield_usernames(n): prefix = "ctf2018_gzhu" postfix = "@dh.com" From bc0c59b3039ed2af3555546a0d8366bc8842420d Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Mon, 20 Aug 2018 20:11:45 +0800 Subject: [PATCH 25/65] :yum::yum::yum: --- spiderFile/fuckCTF.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index a89e269..d894db2 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -122,12 +122,6 @@ def gen_decorator(gen): def inner(*args, **kwargs): return next(gen(*args, **kwargs)) return inner - - -@gen_decorator -def yield_new_password(): - strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") - yield "".join(random.choices(strings, k=6)) def yield_usernames(n): From a30b6030412adb4a0ea821ff31ff21a0901f2546 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: Tue, 21 Aug 2018 10:36:04 +0800 Subject: [PATCH 26/65] :yum::yum::yum: --- spiderFile/fuckCTF.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index d894db2..6c501ae 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -21,7 +21,7 @@ def __init__(self, username, old_password): self.login_url = "http://hetianlab.com/loginLab.do" self.username = username self.old_password = old_password - self.new_password = (self.yield_new_password(), "******")[0] + self.new_password = (yield_new_password(), "******")[0] self.options = webdriver.FirefoxOptions() self.options.add_argument("-headless") self.browser = webdriver.Firefox(options=self.options) @@ -51,11 +51,6 @@ def get_password_setting_page(self): self.browser.find_element_by_class_name("check") print("get_password_setting_page running ok!") - @gen_decorator - def yield_new_password(self): - strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") - yield "".join(random.choices(strings, k=6)) - def setting_password(self): self.browser.find_element_by_id("oldpwd").clear() self.browser.find_element_by_id("oldpwd").send_keys(self.old_password) @@ -122,12 +117,18 @@ def gen_decorator(gen): def inner(*args, **kwargs): return next(gen(*args, **kwargs)) return inner + + +@gen_decorator +def yield_new_password(): + strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") + yield "".join(random.choices(strings, k=6)) - + def yield_usernames(n): prefix = "ctf2018_gzhu" postfix = "@dh.com" - for num in range(1, n): + for num in range(n): if num < 10: infix = '0' + str(num) else: @@ -137,5 +138,5 @@ def yield_usernames(n): if __name__ == "__main__": for username in yield_usernames(100): - ctfer = fuckCTF(username, "******") + ctfer = fuckCTF(username, "111111") ctfer.main() From 8b29fdf6e60f1e9007724308b5a8d23a6d577055 Mon Sep 17 00:00:00 2001 From: yhf Date: Tue, 21 Aug 2018 11:32:10 +0800 Subject: [PATCH 27/65] Update fuckCTF.py --- spiderFile/fuckCTF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 6c501ae..76597f1 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -138,5 +138,5 @@ def yield_usernames(n): if __name__ == "__main__": for username in yield_usernames(100): - ctfer = fuckCTF(username, "111111") + ctfer = fuckCTF(username, "******") ctfer.main() From b94801a967b236c5d7289904d6a67ce0608ea6a2 Mon Sep 17 00:00:00 2001 From: yhf Date: Fri, 28 Dec 2018 11:09:40 +0800 Subject: [PATCH 28/65] Create LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..02bfa5d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 yhf + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From dd2c6c7f7b6fc4ce2a04088d9cbc71df14a762b6 Mon Sep 17 00:00:00 2001 From: yhf Date: Tue, 22 Jan 2019 16:30:25 +0800 Subject: [PATCH 29/65] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e332982..11e32c8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -``` +```shell ( )\ ) ) ) ( ( (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( From 601fd91f9f50470103cd898b883fd729087131ab Mon Sep 17 00:00:00 2001 From: yhf Date: Wed, 23 Jan 2019 17:04:17 +0800 Subject: [PATCH 30/65] add one_update.py --- spiderFile/one_update.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 spiderFile/one_update.py diff --git a/spiderFile/one_update.py b/spiderFile/one_update.py new file mode 100644 index 0000000..d457785 --- /dev/null +++ b/spiderFile/one_update.py @@ -0,0 +1,38 @@ +import re +import requests as rq + +ROOT_URL = "http://wufazhuce.com/one/" +URL_NUM = 14 + +def yield_url(ROOT_URL, URL_NUM): + return ROOT_URL + str(URL_NUM) + +def get_html(url): + return rq.get(url).content.decode("utf-8") + +def get_data(html): + img_url_regex = re.compile('') + cite_regex = re.compile('
(.*?)
', re.S) + img_url = re.findall(img_url_regex, html)[0] + cite = re.findall(cite_regex, html)[0].strip() + return img_url, cite + +def save_data(img_url, cite, URL_NUM): + with open("./{}.jpg".format(URL_NUM), "wb") as fp: + fp.write(rq.get(img_url).content) + with open("./cite{}.txt".format(URL_NUM), "w") as fp: + fp.write(cite) + return URL_NUM + 1 + +def main(ROOT_URL, URL_NUM, number): + for _ in range(number): + url = yield_url(ROOT_URL, URL_NUM) + html = get_html(url) + img_url, cite = get_data(html) + URL_NUM = save_data(img_url, cite, URL_NUM) + +if __name__ == "__main__": + try: + main(ROOT_URL, URL_NUM, 20) + except: + pass From bdb5d02d7111659c0a43f6fdf0cd2c3f261f7994 Mon Sep 17 00:00:00 2001 From: yhf Date: Wed, 23 Jan 2019 17:04:48 +0800 Subject: [PATCH 31/65] update README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 11e32c8..ceed4fe 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,8 @@ ##### 15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): 通过selenium模拟登入合天网站,自动修改原始密码。 +##### 16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): 更新抓取one文艺网站的代码,添加一句箴言的抓取。 + --- # spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 From 9b598859c520cf510ca1e65485b5e8327b21764d Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 24 Jan 2019 16:33:08 +0800 Subject: [PATCH 32/65] update readme file. --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ceed4fe..5fdc214 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ —————— by yanghangfeng ``` #

PythonCrawler: 用 python编写的爬虫项目集合

+

@@ -67,8 +68,11 @@ --- # spiderAPI模块简介 + #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 + ##### 1.大众点评 + ```python from spiderAPI.dianping import * @@ -100,7 +104,8 @@ enableips=get_enableips() ``` ##### 3.百度地图 -百度地图提供的API,对查询有一些限制,这里找出了web上查询的接口 + +百度地图提供的API,对查询有一些限制,这里找出了web上查询的接口。 ```python from spiderAPI.baidumap import * From c7d9ed106529bfba0eca81a788e889343970d34f Mon Sep 17 00:00:00 2001 From: yhf Date: Fri, 25 Jan 2019 15:49:29 +0800 Subject: [PATCH 33/65] update readme file. --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 5fdc214..e1549be 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ |__/ —————— by yanghangfeng ``` -#

PythonCrawler: 用 python编写的爬虫项目集合

+#

PythonCrawler: 用 python编写的爬虫项目集合:bug:

@@ -31,7 +31,6 @@

- # spiderFile模块简介 ##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 From 7df29c9aa457686f170305761df5dff901563a29 Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 27 Jan 2019 15:31:17 +0800 Subject: [PATCH 34/65] update --- spiderFile/baidu_sy_img.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spiderFile/baidu_sy_img.py b/spiderFile/baidu_sy_img.py index faaf6e2..b663ea0 100644 --- a/spiderFile/baidu_sy_img.py +++ b/spiderFile/baidu_sy_img.py @@ -42,11 +42,11 @@ def get_img(page, headers): reg = re.compile('http://.*?\.jpg') imglist1 = re.findall(reg, page) imglist2 = imglist1[0: len(imglist1): 3] -# [img_url_list.append(i) for i in imglist if not i in img_url_list] + # [img_url_list.append(i) for i in imglist if not i in img_url_list] x = 0 for imgurl in imglist2: bin = requests.get(imgurl, headers=headers).content - with open('E:/Pic2/%s.jpg' % x, 'wb') as file: + with open('./%s.jpg' % x, 'wb') as file: file.write(bin) x += 1 From fba4d61bdc3e9e4fa651ebfcf8b0563f664587af Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 28 Jan 2019 08:49:28 +0800 Subject: [PATCH 35/65] update readme file. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1549be..35e4c3b 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ ##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 -##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块。 +##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片`唯美意境`模块。 ##### 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 From ed73a121534c187ca9735a148f1770c2de575ba7 Mon Sep 17 00:00:00 2001 From: yhf Date: Tue, 19 Feb 2019 13:42:21 +0800 Subject: [PATCH 36/65] update README --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 35e4c3b..bf1dee5 100644 --- a/README.md +++ b/README.md @@ -33,37 +33,37 @@ # spiderFile模块简介 -##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 +1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的‘高清摄影’图片。** -##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片`唯美意境`模块。 +2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** -##### 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 +3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** -##### 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片。 +4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** -##### 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。 +5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** -##### 6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照。 +6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。** -##### 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签。 +7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** -##### 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 +8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** -##### 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩。 +9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** -##### 10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 +10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): **抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。** -##### 11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): **应一位知友的请求,抓取某网站上面所有的写真图片。** -##### 12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 +12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): **抓取one文艺网站的图片。** -##### 13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 +13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): **任意输入一个关键词抓取百度百科的介绍。** -##### 14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 +14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** -##### 15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): 通过selenium模拟登入合天网站,自动修改原始密码。 +15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** -##### 16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): 更新抓取one文艺网站的代码,添加一句箴言的抓取。 +16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** --- # spiderAPI模块简介 From 16e884afcf68c4f003bf79220ae168d5fd6e75c4 Mon Sep 17 00:00:00 2001 From: yhf Date: Tue, 19 Feb 2019 13:43:52 +0800 Subject: [PATCH 37/65] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bf1dee5..47e5c43 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ # spiderFile模块简介 -1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的‘高清摄影’图片。** +1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** From db1bb529108f37548067e31ba276f91d3c05c4e7 Mon Sep 17 00:00:00 2001 From: yhf Date: Tue, 19 Feb 2019 13:46:14 +0800 Subject: [PATCH 38/65] update README --- README.md | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 47e5c43..a6621e5 100644 --- a/README.md +++ b/README.md @@ -34,36 +34,21 @@ # spiderFile模块简介 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** - -2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** - -3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** - -4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** - -5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** - -6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。** - -7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** - -8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** - -9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** - -10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): **抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。** - -11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): **应一位知友的请求,抓取某网站上面所有的写真图片。** - -12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): **抓取one文艺网站的图片。** - -13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): **任意输入一个关键词抓取百度百科的介绍。** - -14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** - -15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** - -16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** +2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** +3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** +4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** +5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** +6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。** +7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** +8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** +9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** +10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): **抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。** +11. [xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): **应一位知友的请求,抓取某网站上面所有的写真图片。** +12. [one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): **抓取one文艺网站的图片。** +13. [get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): **任意输入一个关键词抓取百度百科的介绍。** +14. [kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** +15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** +16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** --- # spiderAPI模块简介 From 3ef305ba7b5b79f36dd1740d0f4043d0e4ba3faf Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 9 May 2019 09:04:09 +0800 Subject: [PATCH 39/65] add get_history_weather.py:leaves: --- spiderFile/get_history_weather.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 spiderFile/get_history_weather.py diff --git a/spiderFile/get_history_weather.py b/spiderFile/get_history_weather.py new file mode 100644 index 0000000..77176fc --- /dev/null +++ b/spiderFile/get_history_weather.py @@ -0,0 +1,31 @@ +import re +import pandas as pd +import requests as rq +from bs4 import BeautifulSoup + + +def get_data(url): + html = rq.get(url).content.decode("gbk") + soup = BeautifulSoup(html, "html.parser") + tr_list = soup.find_all("tr") + dates, conditions, temperatures = [], [], [] + for data in tr_list[1:]: + sub_data = data.text.split() + dates.append(sub_data[0]) + conditions.append("".join(sub_data[1:3])) + temperatures.append("".join(sub_data[3:6])) + _data = pd.DataFrame() + _data["日期"] = dates + _data["天气状况"] = conditions + _data["气温"] = temperatures + return _data + +# 获取广州市2019年第一季度天气状况 +data_1_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201901.html") +data_2_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201902.html") +data_3_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201903.html") + + +data = pd.concat([data_1_month, data_2_month, data_3_month]).reset_index(drop=True) + +data.to_csv("guangzhou_history_weather_data.csv", index=False, encoding="utf-8") From d0a29c2deb36d9986321f38f97ff352c1d589135 Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 9 May 2019 09:05:22 +0800 Subject: [PATCH 40/65] update README:fire: --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a6621e5..b2c5dc4 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ 14. [kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** 15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** +17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** --- # spiderAPI模块简介 From be62473e586df0f910bf05fc375e0daf1eda8374 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 9 Nov 2020 09:59:00 +0800 Subject: [PATCH 41/65] Create search_useful_camera_ip_address.py --- spiderFile/search_useful_camera_ip_address.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 spiderFile/search_useful_camera_ip_address.py diff --git a/spiderFile/search_useful_camera_ip_address.py b/spiderFile/search_useful_camera_ip_address.py new file mode 100644 index 0000000..652b180 --- /dev/null +++ b/spiderFile/search_useful_camera_ip_address.py @@ -0,0 +1,92 @@ +import re +import tqdm +import time +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import NoAlertPresentException, TimeoutException + +# 扫描网站可自己寻找,代码仅演示逻辑 +country = "IN" #印度 +city = "" +login_url = "" +query_url = "" +city_url = "" +USER_NAME = "" +PASSWORD = "" + +# 无头浏览器配置 +chrome_options = Options() +chrome_options.add_argument("--headless") +chrome_options.add_argument("--disable-gpu") +chrome_options.add_argument("log-level=3") +browser = webdriver.Chrome(chrome_options=chrome_options) +browser.set_page_load_timeout(10) + +#登录模块 +browser.get(login_url) +WebDriverWait(browser, 30).until( + EC.presence_of_element_located((By.XPATH, '//*[@name="login_submit"]')) +) +browser.find_element_by_id("username").clear() +browser.find_element_by_id("username").send_keys(USER_NAME) +browser.find_element_by_id("password").clear() +browser.find_element_by_id("password").send_keys(PASSWORD) +browser.find_element_by_name("login_submit").click() + +#抓取潜在的摄像头url,默认抓取两页 +if city: + query_url += city_url + +latent_camera_url = [] +browser.get(query_url) +WebDriverWait(browser, 30).until( + EC.presence_of_element_located((By.CLASS_NAME, 'button')) +) +html = browser.page_source +latent_camera_url += re.findall(' Date: Mon, 9 Nov 2020 10:03:18 +0800 Subject: [PATCH 42/65] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b2c5dc4..f3cfa1e 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ 15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** - +18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** --- # spiderAPI模块简介 From 375a7bd08425abd0b820135f26bf0459ac3b41f3 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 9 Nov 2020 10:04:26 +0800 Subject: [PATCH 43/65] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f3cfa1e..4fd02b1 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ```shell - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 python编写的爬虫项目集合:bug:

From 47370eee4e4e6a5b7bc80aa54945bd26792a4b88 Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 7 Feb 2021 10:54:20 +0800 Subject: [PATCH 44/65] Create get_top_sec_com.py --- spiderFile/get_top_sec_com.py | 67 +++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 spiderFile/get_top_sec_com.py diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py new file mode 100644 index 0000000..0007e77 --- /dev/null +++ b/spiderFile/get_top_sec_com.py @@ -0,0 +1,67 @@ +import re +import os +import joblib +import requests as rq + +import pandas as pd +import matplotlib.pyplot as plt + +class getTopSecCom: + def __init__(self, top=None): + self.headers = {"Referer": "http://quote.eastmoney.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"} + self.bk_url = "http://71.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124034348162124675374_1612595298605&pn=1&pz=85&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f62&fs=b:BK0655&fields=f12,f14&_=1612595298611" + self.shares_api = "https://xueqiu.com/S/" + self.top = top + if not os.path.exists("./useful_sec_com_list"): + self.useful_sec_com_list = self.get_sec_com_code() + else: + with open("./useful_sec_com_list", "rb") as fp: + self.useful_sec_com_list = joblib.load(fp) + + def get_sec_com_code(self): + html = rq.get(self.bk_url, headers=self.headers).content.decode("utf-8") + sec_com_list = eval(re.findall("\[(.*?)\]", html)[0]) + useful_sec_com_list = [[i["f12"], i["f14"]] for i in sec_com_list if "ST" not in i["f14"]] + + # 0和3开头的为深证上市股票前缀为sz,6开头的为上证上市股票前缀为sh + for sec_com in useful_sec_com_list: + if sec_com[0][0] == "6": + sec_com[0] = "sh" + sec_com[0] + else: + sec_com[0] = "sz" + sec_com[0] + with open("useful_sec_com_list", "wb") as fp: + joblib.dump(useful_sec_com_list, fp) + return useful_sec_com_list + + def get_shares_details(self): + all_shares = [] + for sec_com in self.useful_sec_com_list: + url = self.shares_api + sec_com[0] + response = rq.get(url, headers=headers).content.decode("utf-8") + market_value = re.search("总市值:(.*?)亿", response) + if market_value: + all_shares.append([*sec_com, market_value.groups()[0]]) + return all_shares + + def yield_picture(self, save_path): + all_shares = self.get_shares_details() + df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) + df["市值(亿)"] = df["市值(亿)"].astype(float) + df.sort_values(by="市值(亿)", ascending=False, inplace=True) + height = 0.18 * df.shape[0] + if self.top and 0< self.top <= df.shape[0]: + df = df.iloc[:self.top, :] + height = 0.18 * self.top + df.index = range(1, df.shape[0]+1) + + plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.rcParams['axes.unicode_minus'] = False + + + fig = plt.figure(figsize=(2.5, height), dpi=400) + ax = fig.add_subplot(111, frame_on=False) + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) + _ = table(ax, df, loc="center") + fig.savefig(save_path) From 4d86db327934e60e75223219652440ed60c799c2 Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 7 Feb 2021 10:56:46 +0800 Subject: [PATCH 45/65] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4fd02b1..87ed4b0 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** 18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** +19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** --- # spiderAPI模块简介 From ac3c78a8b82168931d9ec36e7f7669528a9aa347 Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 28 Feb 2021 10:56:40 +0800 Subject: [PATCH 46/65] Update get_top_sec_com.py --- spiderFile/get_top_sec_com.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py index 0007e77..3a3186c 100644 --- a/spiderFile/get_top_sec_com.py +++ b/spiderFile/get_top_sec_com.py @@ -38,7 +38,7 @@ def get_shares_details(self): all_shares = [] for sec_com in self.useful_sec_com_list: url = self.shares_api + sec_com[0] - response = rq.get(url, headers=headers).content.decode("utf-8") + response = rq.get(url, headers=self.headers).content.decode("utf-8") market_value = re.search("总市值:(.*?)亿", response) if market_value: all_shares.append([*sec_com, market_value.groups()[0]]) @@ -49,10 +49,10 @@ def yield_picture(self, save_path): df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) df["市值(亿)"] = df["市值(亿)"].astype(float) df.sort_values(by="市值(亿)", ascending=False, inplace=True) - height = 0.18 * df.shape[0] + height = 0.2 * df.shape[0] if self.top and 0< self.top <= df.shape[0]: df = df.iloc[:self.top, :] - height = 0.18 * self.top + height = 0.2 * self.top df.index = range(1, df.shape[0]+1) plt.rcParams['font.sans-serif'] = ['SimHei'] @@ -63,5 +63,5 @@ def yield_picture(self, save_path): ax = fig.add_subplot(111, frame_on=False) ax.xaxis.set_visible(False) ax.yaxis.set_visible(False) - _ = table(ax, df, loc="center") - fig.savefig(save_path) + _ = pd.plotting.table(ax, df, loc="center", cellLoc="center") + plt.savefig(save_path) From 57c0937cc3349facd0f03ec1b21c2d833d1fe8d4 Mon Sep 17 00:00:00 2001 From: yhf Date: Fri, 16 Apr 2021 15:46:10 +0800 Subject: [PATCH 47/65] Update get_top_sec_com.py add async function. --- spiderFile/get_top_sec_com.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py index 3a3186c..be0f706 100644 --- a/spiderFile/get_top_sec_com.py +++ b/spiderFile/get_top_sec_com.py @@ -1,6 +1,8 @@ import re import os import joblib +import asyncio +import aiohttp import requests as rq import pandas as pd @@ -33,6 +35,26 @@ def get_sec_com_code(self): with open("useful_sec_com_list", "wb") as fp: joblib.dump(useful_sec_com_list, fp) return useful_sec_com_list + + async def async_get_shares_details(self, sec_com, url): + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=self.headers) as response: + html = await response.text() + market_value = re.search("总市值:(.*?)亿", html) + if market_value: + return [*sec_com, market_value.groups()[0]] + + async def async_get_all_shares(self): + tasks = [] + for sec_com in self.useful_sec_com_list: + url = self.shares_api + sec_com[0] + tasks.append( + asyncio.create_task( + self.async_get_shares_details(sec_com, url) + ) + ) + done, pendding = await asyncio.wait(tasks) + return [share.result() for share in done if share.result()] def get_shares_details(self): all_shares = [] @@ -45,7 +67,8 @@ def get_shares_details(self): return all_shares def yield_picture(self, save_path): - all_shares = self.get_shares_details() + # all_shares = self.get_shares_details() # 同步代码 + all_shares = asyncio.run(self.async_get_all_shares()) # 异步代码 df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) df["市值(亿)"] = df["市值(亿)"].astype(float) df.sort_values(by="市值(亿)", ascending=False, inplace=True) From 4e2dc05f1935659700358b10604ac6622aced224 Mon Sep 17 00:00:00 2001 From: yhf Date: Fri, 16 Apr 2021 16:01:25 +0800 Subject: [PATCH 48/65] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 87ed4b0..9f1e6e2 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** 18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** -19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** +19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** --- # spiderAPI模块简介 From 27f856ffcf3dc76ef6762c41973d889583604e69 Mon Sep 17 00:00:00 2001 From: yhf Date: Sat, 17 Apr 2021 10:12:58 +0800 Subject: [PATCH 49/65] Create get_tj_accident_info.py --- spiderFile/get_tj_accident_info.py | 77 ++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 spiderFile/get_tj_accident_info.py diff --git a/spiderFile/get_tj_accident_info.py b/spiderFile/get_tj_accident_info.py new file mode 100644 index 0000000..b8b2237 --- /dev/null +++ b/spiderFile/get_tj_accident_info.py @@ -0,0 +1,77 @@ +import re +import joblib +import asyncio +import aiohttp +import requests as rq +from bs4 import BeautifulSoup + +def yield_all_page_url(root_url, page=51): + """生成所有的页面url + @param root_url: 首页url + type root_url: str + @param page: 爬取的页面个数 + type page: int + """ + # 观察网站翻页结构可知 + page_url_list = [f"{root_url}index_{i}.html" for i in range(1, page)] + # 添加首页url + page_url_list.insert(0, root_url) + return page_url_list + +async def get_info_page_url(url, session): + regex = re.compile("
') + html = rq.get(url, headers=HEADERS).content.decode("utf-8") + soup = BeautifulSoup(html) + title = re.search(title_regex, html) + content_1 = soup.find("div", class_="TRS_UEDITOR TRS_WEB") + content_2 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_word") + content_3 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_web") + if content_1: + content = content_1.text + elif content_2: + content = content_2.text + elif content_3: + content = content_3.text + else: + content = "" + return {"title": title.groups()[0], "content": content} + +def get_all_data(all_info_page_url_list): + all_data = [] + for i, url in enumerate(all_info_page_url_list): + all_data.append(get_data(url)) + print(i, url, all_data[-1]) + joblib.dump(all_data, "all_data.joblib") + + +if __name__ == "__main__": + root_url = "http://yjgl.tj.gov.cn/ZWGK6939/SGXX3106/" + agent_part_1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + agent_part_2 = "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36" + HEADERS = {"Host": "yjgl.tj.gov.cn", + "Connection": "keep-alive", + "User-Agent": agent_part_1 + agent_part_2, + "Referer": "http://static.bshare.cn/"} + page_url_list = yield_all_page_url(root_url, page=51) + all_info_page_url_list = asyncio.run(get_all_info_page_url(root_url, page_url_list)) + joblib.dump("all_info_page_url_list", all_info_page_url_list) From cf927e594a2cb1ff224f95347a513c729e2b71ed Mon Sep 17 00:00:00 2001 From: yhf Date: Sat, 17 Apr 2021 10:15:39 +0800 Subject: [PATCH 50/65] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9f1e6e2..4062693 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** 18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** 19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** +20. [get_tf_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **同步和异步编程结合获取天津市应急管理局所有事故信息。** --- # spiderAPI模块简介 From 198e817f146d211c9d4c3a470de44eb372d7c291 Mon Sep 17 00:00:00 2001 From: yhf Date: Wed, 28 Apr 2021 12:14:53 +0800 Subject: [PATCH 51/65] Update get_top_sec_com.py --- spiderFile/get_top_sec_com.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py index be0f706..5229bbd 100644 --- a/spiderFile/get_top_sec_com.py +++ b/spiderFile/get_top_sec_com.py @@ -87,4 +87,4 @@ def yield_picture(self, save_path): ax.xaxis.set_visible(False) ax.yaxis.set_visible(False) _ = pd.plotting.table(ax, df, loc="center", cellLoc="center") - plt.savefig(save_path) + plt.savefig(save_path, bbox_inches="tight") From 66304e7b63b028a9a031428ea7b1d3859d12627a Mon Sep 17 00:00:00 2001 From: yhf Date: Fri, 14 May 2021 16:45:31 +0800 Subject: [PATCH 52/65] Update get_top_sec_com.py --- spiderFile/get_top_sec_com.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py index 5229bbd..caf9ac2 100644 --- a/spiderFile/get_top_sec_com.py +++ b/spiderFile/get_top_sec_com.py @@ -1,5 +1,6 @@ import re import os +import time import joblib import asyncio import aiohttp @@ -71,20 +72,22 @@ def yield_picture(self, save_path): all_shares = asyncio.run(self.async_get_all_shares()) # 异步代码 df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) df["市值(亿)"] = df["市值(亿)"].astype(float) + date = time.strftime("%Y年%m月%d日", time.localtime()) df.sort_values(by="市值(亿)", ascending=False, inplace=True) - height = 0.2 * df.shape[0] - if self.top and 0< self.top <= df.shape[0]: - df = df.iloc[:self.top, :] - height = 0.2 * self.top df.index = range(1, df.shape[0]+1) plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False - fig = plt.figure(figsize=(2.5, height), dpi=400) + fig = plt.figure(dpi=400) ax = fig.add_subplot(111, frame_on=False) ax.xaxis.set_visible(False) - ax.yaxis.set_visible(False) - _ = pd.plotting.table(ax, df, loc="center", cellLoc="center") + ax.yaxis.set_visible(False) + _ = pd.plotting.table(ax, df, loc="best", cellLoc="center") + ax.set_title(f"{date}A股网安版块公司市值排名", fontsize=10) plt.savefig(save_path, bbox_inches="tight") + +if __name__ == "__main__": + m = getTopSecCom() + m.yield_picture("rank.png") From 5bb06b8864226e6977501bf08f56db889e0f8e7c Mon Sep 17 00:00:00 2001 From: yhf Date: Fri, 14 May 2021 16:46:33 +0800 Subject: [PATCH 53/65] Update get_top_sec_com.py --- spiderFile/get_top_sec_com.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py index caf9ac2..f1fce0a 100644 --- a/spiderFile/get_top_sec_com.py +++ b/spiderFile/get_top_sec_com.py @@ -8,6 +8,8 @@ import pandas as pd import matplotlib.pyplot as plt +# import nest_asyncio +# nest_asyncio.apply() class getTopSecCom: def __init__(self, top=None): From bfa05ebfd961c00a5840812a536860eba7f92faf Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 2 Jan 2022 09:11:53 +0800 Subject: [PATCH 54/65] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4062693..bab2523 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ |__/ —————— by yanghangfeng ``` -#

PythonCrawler: 用 python编写的爬虫项目集合:bug:

+#

PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者千万要遵循中华人民共和国法律!)

@@ -38,7 +38,7 @@ 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** -6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。** +6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取学籍证件照。** 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** From 4630508b964288937b0c68dd9a845dcbbc0a605b Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 2 Jan 2022 09:13:18 +0800 Subject: [PATCH 55/65] Update student_img.py --- spiderFile/student_img.py | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/spiderFile/student_img.py b/spiderFile/student_img.py index d3135ea..a66436d 100644 --- a/spiderFile/student_img.py +++ b/spiderFile/student_img.py @@ -1,29 +1,6 @@ import requests """ -思路:去官网自己的主页,看自己的照片的url然后你懂的。 +思路:去官网自己的主页,看自己的学籍照片的url。 """ -url = '' -banji = [] -zhuanye = [] -for a in range(10): - for b in range(10): - banji.append(str(a) + '0' + str(b)) -for c in range(10): - zhuanye.append('20' + str(c)) -for year in range(2011, 2015): - for xh in zhuanye: - for nj in banji: - for i in range(1, 35): - if i < 10: - xuehao = str(year) + str(xh) + str(nj) + '0' + str(i) - student_url = url + xuehao - with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: - file.write(requests.get(student_url).content) - else: - xuehao = str(year) + str(xh) + str(nj) + str(i) - student_url = url + xuehao - with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: - file.write(requests.get(student_url).content) -print('OK!') From 483c276b7c700f16ec7b7f5a62b194a93dedf26e Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 2 Jan 2022 09:14:46 +0800 Subject: [PATCH 56/65] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bab2523..36c5fdb 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ 15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** -18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** +18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **摄像头弱密码安全科普。** 19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** 20. [get_tf_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **同步和异步编程结合获取天津市应急管理局所有事故信息。** --- From 81a5eed902dd2514464ec7da5e5fe465eb8b082f Mon Sep 17 00:00:00 2001 From: yhf Date: Fri, 8 Jul 2022 15:43:54 +0800 Subject: [PATCH 57/65] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 36c5fdb..6b92523 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ |__/ —————— by yanghangfeng ``` -#

PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者千万要遵循中华人民共和国法律!)

+#

PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者务必遵循中华人民共和国法律!)

@@ -38,7 +38,7 @@ 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** -6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取学籍证件照。** +6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取自己学籍证件照。** 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** From 0d60014d61a5b159257a366a9088752d6c6059b3 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 24 Oct 2022 14:59:17 +0800 Subject: [PATCH 58/65] Update README.md --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b92523..277bb18 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,13 @@

- +对于很多小伙伴咨询IP代理的问题,推荐一个产品,链接:http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf +产品介绍: +1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池。 +2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景。 +3、支持HTTP/HTTPS/Socks5协议 +4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手 +5、支持海量IP免费试用 # spiderFile模块简介 From bf967800e286a4241518bc3304d0c593fc7d3062 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 24 Oct 2022 15:01:33 +0800 Subject: [PATCH 59/65] Update README.md --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 277bb18..dd07248 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,14 @@

-对于很多小伙伴咨询IP代理的问题,推荐一个产品,链接:http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf +对于很多小伙伴咨询IP代理的问题,推荐一个产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf) 产品介绍: -1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池。 -2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景。 -3、支持HTTP/HTTPS/Socks5协议 +1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池; +2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景; +3、支持HTTP/HTTPS/Socks5协议; 4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手 -5、支持海量IP免费试用 +; +5、支持海量IP免费试用。 # spiderFile模块简介 From 72ba185ce7a774f0a06515a08db02ff494b6c49e Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 24 Oct 2022 15:30:14 +0800 Subject: [PATCH 60/65] Update README.md --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index dd07248..7dc2c5b 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,14 @@

-对于很多小伙伴咨询IP代理的问题,推荐一个产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf) -产品介绍: -1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池; -2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景; -3、支持HTTP/HTTPS/Socks5协议; + +由于很多小伙伴都咨询IP代理的问题,在这里推荐大家一个好用的产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf) +产品介绍: +1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池; +2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景; +3、支持HTTP/HTTPS/Socks5协议; 4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手 -; +; 5、支持海量IP免费试用。 # spiderFile模块简介 From f9315ea9a0ec52e1e24aa62211ebfdd797903f2c Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 2 Mar 2023 19:02:48 +0800 Subject: [PATCH 61/65] Update README.md --- README.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/README.md b/README.md index 7dc2c5b..c54c2a1 100644 --- a/README.md +++ b/README.md @@ -30,15 +30,6 @@

-由于很多小伙伴都咨询IP代理的问题,在这里推荐大家一个好用的产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf) -产品介绍: -1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池; -2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景; -3、支持HTTP/HTTPS/Socks5协议; -4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手 -; -5、支持海量IP免费试用。 - # spiderFile模块简介 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** From fac1ffc9c9e6a04875b55d54cd67dbf72ac39db2 Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 17 Apr 2025 14:14:18 +0800 Subject: [PATCH 62/65] Update README.md --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index c54c2a1..6f96817 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,18 @@

+# IPWO全球代理资源 | 为采集、跨境与测试项目提供支持(免费试用,爬虫使用强烈推荐!!!) +### 官网地址 +[👉 访问 IPWO 官网](https://www.ipwo.net/?code=WSESV2ONN) +### 产品简介 +* 免费试用,先体验再选择 +* 9000万+真实住宅IP,覆盖220+国家和地区 +* 支持动态住宅代理、静态住宅代理(ISP) +* 适用于数据抓取、电商、广告验证、SEO监控等场景 +* 支持HTTP/HTTPS/SOCKS5协议,兼容性强 +* 纯净IP池,实时更新,99.9%连接成功率 +* 支持指定国家城市地区访问,保护隐私 + # spiderFile模块简介 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** From db47c2a3efb062fb3ba228d73bf148701b30af7d Mon Sep 17 00:00:00 2001 From: yhf Date: Sat, 9 May 2026 11:11:26 +0800 Subject: [PATCH 63/65] Clean up README and update project details Remove ASCII art and update project description. --- README.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/README.md b/README.md index 6f96817..c54c2a1 100644 --- a/README.md +++ b/README.md @@ -30,18 +30,6 @@

-# IPWO全球代理资源 | 为采集、跨境与测试项目提供支持(免费试用,爬虫使用强烈推荐!!!) -### 官网地址 -[👉 访问 IPWO 官网](https://www.ipwo.net/?code=WSESV2ONN) -### 产品简介 -* 免费试用,先体验再选择 -* 9000万+真实住宅IP,覆盖220+国家和地区 -* 支持动态住宅代理、静态住宅代理(ISP) -* 适用于数据抓取、电商、广告验证、SEO监控等场景 -* 支持HTTP/HTTPS/SOCKS5协议,兼容性强 -* 纯净IP池,实时更新,99.9%连接成功率 -* 支持指定国家城市地区访问,保护隐私 - # spiderFile模块简介 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** From b9af210061c835499023d82e95cfaca98eff16d4 Mon Sep 17 00:00:00 2001 From: yhf Date: Tue, 12 May 2026 17:31:49 +0800 Subject: [PATCH 64/65] Revise README with project updates and new features Update README to include new project details and features. --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index c54c2a1..ee46f57 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,19 @@

+# 🚀 重磅福利|Swiftproxy 全球代理 · 开发者必备 +Swiftproxy 代理服务 + +**🌍 全球顶级住宅代理网络,一站式解决爬虫/采集/自动化难题 [🔗官方入口](https://www.swiftproxy.net/?ref=PythonCrawler)** +- [x] **195+国家全覆盖**,**8000万+** 纯净住宅IP池 +- [x] **99.89%超高请求成功率**,稳定低延迟 +- [x] 流量永不过期,无强制月付,灵活随心用 +- [x] 原生支持 **HTTP(S)/SOCKS5** 双协议 +- [x] 智能IP轮换+精准地区定位,强效规避封锁 +- [x] 适配Python/Node/Go/PHP全语言,一键集成 +- [x] 免费试用开启,开箱即用零门槛 + +--- # spiderFile模块简介 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** From a4ebdb9d446ec0c82548ce7d06391bb8e385ccfd Mon Sep 17 00:00:00 2001 From: yhf Date: Tue, 12 May 2026 18:56:08 +0800 Subject: [PATCH 65/65] Clean up README by removing unnecessary content Remove ASCII art and introductory text from README --- README.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/README.md b/README.md index ee46f57..80111dd 100644 --- a/README.md +++ b/README.md @@ -30,18 +30,6 @@

-# 🚀 重磅福利|Swiftproxy 全球代理 · 开发者必备 -Swiftproxy 代理服务 - -**🌍 全球顶级住宅代理网络,一站式解决爬虫/采集/自动化难题 [🔗官方入口](https://www.swiftproxy.net/?ref=PythonCrawler)** -- [x] **195+国家全覆盖**,**8000万+** 纯净住宅IP池 -- [x] **99.89%超高请求成功率**,稳定低延迟 -- [x] 流量永不过期,无强制月付,灵活随心用 -- [x] 原生支持 **HTTP(S)/SOCKS5** 双协议 -- [x] 智能IP轮换+精准地区定位,强效规避封锁 -- [x] 适配Python/Node/Go/PHP全语言,一键集成 -- [x] 免费试用开启,开箱即用零门槛 - --- # spiderFile模块简介