diff --git a/ECUT_pos_html.py b/ECUT_pos_html.py deleted file mode 100644 index 470219c..0000000 --- a/ECUT_pos_html.py +++ /dev/null @@ -1,57 +0,0 @@ -import requests -import re -from bs4 import BeautifulSoup as bs - - -def crawl_all_main_url(page=10): - # 默认抓取官网前十页招聘信息的url - all_url_list = [] - for _ in range(1, page+1): - url = 'http://zjc.ecit.edu.cn/jy/app/newslist.php?BigClassName=%D5%D0%C6%B8%D0%C5%CF%A2&Page={0}'.format(_) - page_html = requests.get(url).text - x_url_reg = re.compile('(.*?)') - explain_text = re.findall(explain_text_reg, html)[0] - if ('时间' and '地点') in explain_text: - return True - else: - pass - -def save_html(): - all_url_list = crawl_all_main_url() - for son_url in all_url_list: - if get_title(son_url): - text_html = requests.get(son_url).content.decode('gbk') - domain_url = 'http://zjc.ecit.edu.cn/jy' - img_url_reg = re.compile('border=0 src="\.\.(.*?)"') - child_url = re.findall(img_url_reg, text_html) - if child_url != []: - img_url = domain_url + child_url[0] - re_url = 'src="..{0}"'.format(child_url[0]) - end_url = 'src="{0}"'.format(img_url) - end_html = text_html.replace(re_url, end_url) - soup = bs(end_html, 'lxml') - text_div = soup.find_all('div', id='main')[0] - with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: - text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) - file.write(text_html.encode('utf-8')) - else: - with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: - html = requests.get(son_url).content.decode('gbk') - soup = bs(text_html, 'lxml') - text_div = soup.find_all('div', id='main')[0] - text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) - file.write(text_html.encode('utf-8')) - else: - continue - -if __name__ == '__main__': - save_html() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..02bfa5d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 yhf + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated 
documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 3fd8452..80111dd 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,119 @@ -# 功能介绍 +```shell + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng +``` +#

PythonCrawler: 用 Python 编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者务必遵循中华人民共和国法律!)

-##### 1. [baiduImg.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg.py): 抓取百度的‘高清摄影’图片 +

+ + + + + + + + + + + + + + + +

-##### 2. [baiduImg2.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/baiduImg2.py): 抓取百度图片‘唯美意境’模块 +--- +# spiderFile模块简介 -##### 3. [GetPhotos2.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/GetPhotos2.py): 抓取百度贴吧某话题下的所有图片 +1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** +2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** +3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** +4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** +5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** +6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取自己学籍证件照。** +7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** +8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** +9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** +10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): **抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。** +11. [xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): **应一位知友的请求,抓取某网站上面所有的写真图片。** +12. [one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): **抓取one文艺网站的图片。** +13. [get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): **任意输入一个关键词抓取百度百科的介绍。** +14. 
[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** +15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** +16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** +17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** +18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **摄像头弱密码安全科普。** +19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全板块公司市值排名情况,并以图片格式保存下来。** +20. [get_tj_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **同步和异步编程结合获取天津市应急管理局所有事故信息。** +--- +# spiderAPI模块简介 -##### 4. [getWebAllImg.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/getWebAllImg.py): 抓取整个网站的图片 +#### 本模块提供一些网站的API爬虫接口,功能可能不是很全,因此可塑性很大,智慧的你如果有兴趣可以继续改进。 -##### 5. [lagouPositionSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/lagouPositionSpider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件 +##### 1.大众点评 -##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照 +```python +from spiderAPI.dianping import * -##### 7. [JDSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/JDSpider.py): 大批量抓取京东商品id和标签 +''' +citys = { + '北京': '2', '上海': '1', '广州': '4', '深圳': '7', '成都': '8', '重庆': '9', '杭州': '3', '南京': '5', '沈阳': '18', '苏州': '6', '天津': '10','武汉': '16', '西安': '17', '长沙': '344', '大连': '19', '济南': '22', '宁波': '11', '青岛': '21', '无锡': '13', '厦门': '15', '郑州': '160' +} -##### 8. 
[ECUT_pos_html.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 +ranktype = { + '最佳餐厅': 'score', '人气餐厅': 'popscore', '口味最佳': 'score1', '环境最佳': 'score2', '服务最佳': 'score3' +} +''' -##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩 +result=bestRestaurant(cityId=1, rankType='popscore')#获取人气餐厅 -##### 10. [githubHot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/githubHot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 +shoplist=dpindex(cityId=1, page=1)#商户风云榜 -##### 11.[pictureSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/pictureSpider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +restaurantlist=restaurantList('http://www.dianping.com/search/category/2/10/p2')#获取餐厅 + +``` + +##### 2.获取代理IP +爬取[代理IP](http://proxy.ipcn.org) +```python +from spiderAPI.proxyip import get_enableips + +enableips=get_enableips() + +``` + +##### 3.百度地图 + +百度地图提供的API,对查询有一些限制,这里找出了web上查询的接口。 +```python +from spiderAPI.baidumap import * + +citys=citys()#获取城市列表 +result=search(keyword="美食", citycode="257", page=1)#获取搜索结果 + +``` + +##### 4.模拟登录github +```python +from spiderAPI.github import GitHub + +github = GitHub() +github.login() # 这一步会提示你输入用户名和密码 +github.show_timeline() # 获取github主页时间线 +# 更多的功能有待你们自己去发掘 +``` + +##### 5.拉勾网 +```python +from spiderAPI.lagou import * + +lagou_spider(key='数据挖掘', page=1) # 获取关键字为数据挖掘的招聘信息 +``` diff --git a/baiduImg.py b/baiduImg.py deleted file mode 100644 index b8a2269..0000000 --- a/baiduImg.py +++ /dev/null @@ -1,53 +0,0 @@ -import requests -import re - -url = 'http://image.baidu.com/search/index' -headers = { - 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', - 'Accept-Encoding' : 'gzip, deflate', - 'Referer' : 
'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&fm=detail&lm=-1&st=-1&sf=2&fmq=&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&oq=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&rsp=-1', - 'Cookie' : 'HOSUPPORT=1; UBI=fi_PncwhpxZ%7ETaMMzY0i9qXJ9ATcu3rvxFIc-a7KI9byBcYk%7EjBVmPGIbL3LTKKJ2D17mh5VfJ5yjlCncAb2yhPI5sZM51Qo7tpCemygM0VNUzuTBJwYF8OYmi3nsCCzbpo5U9tLSzkZfcQ1rxUcJSzaipThg__; HISTORY=fec845b215cd8e8be424cf320de232722d0050; PTOKEN=ff58b208cc3c16596889e0a20833991d; STOKEN=1b1f4b028b5a4415aa1dd9794ff061d312ad2a822d52418f3f1ffabbc0ac6142; SAVEUSERID=0868a2b4c9d166dc85e605f0dfd153; USERNAMETYPE=3; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_18205_18559_17001_17073_15479_12166_18086_10634; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', - } -def get_html(url, headers): - data = { - 'cl' : '2', - 'ct' : '201326592', - 'face' : '0', - 'fp' : 'result', - 'gsm' : '200001e', - 'ic' : '0', - 'ie' : 'utf-8', - 'ipn' : 'rj', - 'istype' : '2', - 'lm' : '-1', - 'nc' : '1', - 'oe' : 'utf-8', - 'pn' : '30', - 'queryword' : '高清摄影', - 'rn' : '30', - 'st' : '-1', - 'tn' : 'resultjson_com', - 'word' : '高清摄影' - } - - page = requests.get(url, data, headers = headers).text - return page - -def get_img(page, headers): -# img_url_list = [] - reg = re.compile('http://.*?\.jpg') - imglist1 = re.findall(reg, page) - imglist2 = imglist1[0 : len(imglist1) : 3] -# [img_url_list.append(i) for i in imglist if not i in img_url_list] - x = 0 - for imgurl in imglist2: - bin = requests.get(imgurl, headers = headers).content - with open('E:/Pic2/%s.jpg' % x, 'wb') as file: 
- file.write(bin) - x += 1 - -if __name__ == '__main__': - page = get_html(url, headers) - get_img(page, headers) - diff --git a/baiduImg2.py b/baiduImg2.py deleted file mode 100644 index 35b2507..0000000 --- a/baiduImg2.py +++ /dev/null @@ -1,48 +0,0 @@ -import requests -import re - -url = 'http://image.baidu.com/search/index' -date = { - 'cl' : '2', - 'ct' : '201326592', - 'fp' : 'result', - 'gsm' : '1e', - 'ie' : 'utf-8', - 'ipn' : 'rj', - 'istype' : '2', - 'lm' : '-1', - 'nc' : '1', - 'oe' : 'utf-8', - 'pn' : '30', - 'queryword' : '唯美意境图片', - 'rn' : '30', - 'st' : '-1', - 'tn' : 'resultjson_com', - 'word' : '唯美意境图片' - } -headers = { - 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Accept' : 'text/plain, */*; q=0.01', - 'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', - 'Accept-Encoding' : 'gzip, deflate', - 'X-Requested-With' : 'XMLHttpRequest', - 'Referer' : 'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs3&word=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87&ofr=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1', - 'Cookie' : 'BDqhfp=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87%26%26NaN-1undefined-1undefined%26%260%26%261; Hm_lvt_737dbb498415dd39d8abf5bc2404b290=1455016371,1455712809,1455769605,1455772886; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_15479_12166_18086_10634; Hm_lpvt_737dbb498415dd39d8abf5bc2404b290=1455788775; firstShowTip=1; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', - 'Connection' : 'keep-alive' - } -def get_page(url, date, headers): - page = requests.get(url, date, headers = headers).text - return page 
- -def get_img(page, headers): - reg = re.compile('http://.*?\.jpg') - imglist = re.findall(reg, page)[::3] - x = 0 - for imgurl in imglist: - with open('E:/Pic/%s.jpg' % x, 'wb') as file: - file.write(requests.get(imgurl, headers = headers).content) - x += 1 - -if __name__ == '__main__': - page = get_page(url, date, headers) - get_img(page, headers) \ No newline at end of file diff --git a/one_img.py b/one_img.py deleted file mode 100644 index 5d41d3c..0000000 --- a/one_img.py +++ /dev/null @@ -1,21 +0,0 @@ -import re -import requests - -temp = 'http://caodan.org/page/' -count = 1 -for i in range(1, 1331): - url = temp + str(i) - page = requests.get(url).text - reg = re.compile('src="(http://.*?\.jpg)"') - img_url = re.findall(reg, page) - if img_url != []: - with open('E:/img/%s.jpg' % count, 'wb') as file: - img_data = requests.get(img_url[0]).content - file.write(img_data) - count += 1 - else: - continue -print('OK!') - - - \ No newline at end of file diff --git a/spiderAPI/__init__.py b/spiderAPI/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spiderAPI/baidumap.py b/spiderAPI/baidumap.py new file mode 100644 index 0000000..1af5207 --- /dev/null +++ b/spiderAPI/baidumap.py @@ -0,0 +1,30 @@ +import requests +import json + +headers = { + 'Host': "map.baidu.com", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate", + "Accept-Language": "en-US,en;q=0.5", + "Connection": "keep-alive", + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} + + +def citys(): + html = requests.get( + 
'http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=s&da_src=searchBox.button&wd=美食&c=1&src=0&wd2=&sug=0&l=5&b=(7002451.220000001,1994587.88;19470675.22,7343963.88)&from=webmap&biz_forward={%22scaler%22:1,%22styles%22:%22pl%22}&sug_forward=&tn=B_NORMAL_MAP&nn=0&u_loc=12736591.152491,3547888.166124&ie=utf-8&t=1459951988807', headers=headers).text + data = json.loads(html) + result = [] + for item in data['more_city']: + for city in item['city']: + result.append(city) + for item in data['content']: + result.append(item) + return result + + +def search(keyword, citycode, page): + html = requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=con&from=webmap&c=' + str(citycode) + '&wd=' + keyword + '&wd2=&pn=' + str( + page) + '&nn=' + str(page * 10) + '&db=0&sug=0&addr=0&&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&tn=B_NORMAL_MAP&u_loc=12736591.152491,3547888.166124&ie=utf-8', headers=headers).text + data = json.loads(html)['content'] + return data diff --git a/spiderAPI/dianping.py b/spiderAPI/dianping.py new file mode 100644 index 0000000..6ba43db --- /dev/null +++ b/spiderAPI/dianping.py @@ -0,0 +1,87 @@ +import requests +import json +import os +from bs4 import BeautifulSoup + +headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive'} + + +def bestRestaurant(cityId=1, rankType='popscore'): + html = requests.get('http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=%s&categoryId=0' % + (cityId, rankType), headers=headers).text + result = json.loads(html)['shopBeans'] + return result + + +def getCityId(): + citys = {'北京': '2', '上海': '1', '广州': '4', '深圳': '7', '成都': '8', '重庆': '9', '杭州': '3', '南京': '5', '沈阳': 
'18', '苏州': '6', '天津': '10', + '武汉': '16', '西安': '17', '长沙': '344', '大连': '19', '济南': '22', '宁波': '11', '青岛': '21', '无锡': '13', '厦门': '15', '郑州': '160'} + return citys + + +def getRankType(): + RankType = {'最佳餐厅': 'score', '人气餐厅': 'popscore', + '口味最佳': 'score1', '环境最佳': 'score2', '服务最佳': 'score3'} + return RankType + + +def dpindex(cityId=1, page=1): + url = 'http://dpindex.dianping.com/dpindex?region=&category=&type=rank&city=%s&p=%s' % ( + cityId, page) + html = requests.get(url, headers=headers).text + table = BeautifulSoup(html, 'lxml').find( + 'div', attrs={'class': 'idxmain-subcontainer'}).find_all('li') + result = [] + for item in table: + shop = {} + shop['name'] = item.find('div', attrs={'class': 'field-name'}).get_text() + shop['url'] = item.find('a').get('href') + shop['num'] = item.find('div', attrs={'class': 'field-num'}).get_text() + shop['addr'] = item.find('div', attrs={'class': 'field-addr'}).get_text() + shop['index'] = item.find('div', attrs={'class': 'field-index'}).get_text() + result.append(shop) + return result + + +def restaurantList(url): + html = requests.get(url, headers=headers, timeout=30).text.replace('\r', '').replace('\n', '') + table = BeautifulSoup(html, 'lxml').find('div', id='shop-all-list').find_all('li') + result = [] + for item in table: + shop = {} + soup = item.find('div', attrs={'class': 'txt'}) + tit = soup.find('div', attrs={'class': 'tit'}) + comment = soup.find('div', attrs={'class': 'comment'}) + tag_addr = soup.find('div', attrs={'class': 'tag-addr'}) + shop['name'] = tit.find('a').get_text() + shop['star'] = comment.find('span').get('title') + shop['review-num'] = comment.find('a', + attrs={'class': 'review-num'}).get_text().replace('条点评', '') + shop['mean-price'] = comment.find('a', attrs={'class': 'mean-price'}).get_text() + shop['type'] = tag_addr.find('span', attrs={'class': 'tag'}).get_text() + shop['addr'] = tag_addr.find('span', attrs={'class': 'addr'}).get_text() + try: + comment_list = soup.find('span', 
attrs={'class': 'comment-list'}).find_all('span') + except: + comment_list = [] + score = [] + for i in comment_list: + score.append(i.get_text()) + shop['score'] = score + tags = [] + try: + for i in tit.find('div', attrs={'class': 'promo-icon'}).find_all('a'): + try: + tags += i.get('class') + except: + tags.append(i.get('class')[0]) + except: + pass + shop['tags'] = tags + result.append(shop) + return result diff --git a/spiderAPI/github.py b/spiderAPI/github.py new file mode 100644 index 0000000..8a74b54 --- /dev/null +++ b/spiderAPI/github.py @@ -0,0 +1,67 @@ +import requests +from bs4 import BeautifulSoup +import json + + +headers = { + 'Host': "github.com", + 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive' +} + + +class GitHub(): + + def __init__(self): + self.session = requests.session() + self.timeline = [] + self.name = '' + self.user = '' + self.passwd = '' + + def login(self): + self.user = input('please input username:') + self.passwd = input('please input password:') + html = self.session.get('https://github.com/login', headers=headers).text + authenticity_token = BeautifulSoup(html, 'lxml').find( + 'input', {'name': 'authenticity_token'}).get('value') + data = { + 'commit': "Sign+in", + 'utf8': "✓", + 'login': self.user, + 'password': self.passwd, + 'authenticity_token': authenticity_token + } + html = self.session.post('https://github.com/session', data=data, headers=headers).text + self.name = BeautifulSoup(html, 'lxml').find( + 'strong', {'class': 'css-truncate-target'}).get_text() + + def get_timeline(self, page=1): + html = self.session.get( + 'https://github.com/dashboard/index/{page}?utf8=%E2%9C%93'.format(page=page), headers=headers).text + table = BeautifulSoup(html, 'lxml').find( + 'div', 
id='dashboard').find_all('div', {'class': 'alert'}) + for item in table: + line = {} + line['thing'] = item.find('div', {'class': 'title'}).get_text( + ).replace('\r', '').replace('\n', '') + line['time'] = item.find('relative-time').get('datetime') + self.timeline.append(line) + + def show_timeline(self): + keys = ['who', 'do', 'to', 'time'] + for line in self.timeline: + text = line['time'] + ' ' + line['thing'] + print('*' + text + ' ' * (80 - len(text) - 2) + '*') + print('*-*-*' * 16) + + def overview(self, user=None): + if user == None: + user = self.name + html = self.session.get('https://github.com/' + user, headers=headers).text + return overview + + diff --git a/spiderAPI/lagou.py b/spiderAPI/lagou.py new file mode 100644 index 0000000..52493a7 --- /dev/null +++ b/spiderAPI/lagou.py @@ -0,0 +1,13 @@ +import json +import requests as rq + + +def lagou_spider(key=None, page=None): + lagou_url = 'http://www.lagou.com/jobs/positionAjax.json?first=false&pn={0}&kd={1}' + lagou_python_data = [] + for i in range(page): + print('抓取第{0}页'.format(i + 1)) + lagou_url_ = lagou_url.format(i, key) + lagou_data = json.loads(rq.get(lagou_url_).text) + lagou_python_data.extend(lagou_data['content']['positionResult']['result']) + return lagou_python_data diff --git a/spiderAPI/proxyip.py b/spiderAPI/proxyip.py new file mode 100644 index 0000000..f7c3637 --- /dev/null +++ b/spiderAPI/proxyip.py @@ -0,0 +1,50 @@ +import requests +import threading +import re + + +enableips = [] + + +class IsEnable(threading.Thread): + + def __init__(self, ip): + super(IsEnable, self).__init__() + self.ip = ip + self.proxies = { + 'http': 'http://%s' % ip + } + + def run(self): + global enableips + try: + html = requests.get('http://httpbin.org/ip', proxies=self.proxies, timeout=5).text + result = eval(html)['origin'] + if result in self.ip: + enableips.append(self.ip) + except: + return False + + +def parser(url): + html = requests.get(url).text + ips = re.findall('\d+\.\d+\.\d+\.\d+:\d+', 
html) + return ips + + +def get_enableips(): + global enableips + urls = ['http://proxy.ipcn.org/proxya.html', 'http://proxy.ipcn.org/proxya2.html', + 'http://proxy.ipcn.org/proxyb.html', 'http://proxy.ipcn.org/proxyb2.html'] + for url in urls: + ips = parser(url) + threadings = [] + for ip in ips: + work = IsEnable(ip) + work.setDaemon(True) + threadings.append(work) + for work in threadings: + work.start() + for work in threadings: + work.join() + return enableips diff --git a/ECUT_get_grade.py b/spiderFile/ECUT_get_grade.py similarity index 90% rename from ECUT_get_grade.py rename to spiderFile/ECUT_get_grade.py index 5dd31cf..6d2ff1a 100644 --- a/ECUT_get_grade.py +++ b/spiderFile/ECUT_get_grade.py @@ -3,13 +3,14 @@ import numpy as np import pandas as pd + def warn(*args, **kw): pass import warnings warnings.warn = warn -print('*'*30 + '东华理工大学' + '*'*30) -print('*'*30 + '作者:杨航锋' + '*'*30) -print('*'*30 + '版本:v1.0' + '*'*30) +print('*' * 30 + '东华理工大学' + '*' * 30) +print('*' * 30 + '作者:杨航锋' + '*' * 30) +print('*' * 30 + '版本:v1.0' + '*' * 30) print('\n') print('请输你学号:') username = input() @@ -19,6 +20,7 @@ def warn(*args, **kw): pass login_url = 'https://cas.ecit.cn/index.jsp?service=http://portal.ecit.cn/Authentication' + def get_LT(login_url): html = requests.get(login_url, verify=False).text regex = re.compile('') @@ -86,12 +88,13 @@ def get_LT(login_url): grade_data_.to_csv('./grade_data.csv', index=False) print('成绩已保存在运行此程序的文件夹') elif select == '3': - xw_grade = grade_data_[(grade_data_['课程名'] == '*数学分析(I)') | (grade_data_['课程名'] == '高等代数(I)')|\ - (grade_data_['课程名'] == 'C语言程序设计基础') | (grade_data_['课程名'] == '大学英语(II)')|\ - (grade_data_['课程名'] == '*常微分方程') | (grade_data_['课程名'] == '*概率论')|\ + xw_grade = grade_data_[(grade_data_['课程名'] == '*数学分析(I)') | (grade_data_['课程名'] == '高等代数(I)') | + (grade_data_['课程名'] == 'C语言程序设计基础') | (grade_data_['课程名'] == '大学英语(II)') | + (grade_data_['课程名'] == '*常微分方程') | (grade_data_['课程名'] == '*概率论') | (grade_data_['课程名'] == 
'数据结构')] print(xw_grade) print('\n') - avg_grade = np.sum((xw_grade.学分.astype(float) * xw_grade.成绩.astype(float))) / np.sum(xw_grade.学分.astype(float)) + avg_grade = np.sum((xw_grade.学分.astype(float) * xw_grade.成绩.astype(float))) / \ + np.sum(xw_grade.学分.astype(float)) print('平均学分绩={0}'.format(avg_grade)) input('按任意键结束') diff --git a/spiderFile/ECUT_pos_html.py b/spiderFile/ECUT_pos_html.py new file mode 100644 index 0000000..93b8ae1 --- /dev/null +++ b/spiderFile/ECUT_pos_html.py @@ -0,0 +1,50 @@ +import requests +import re from bs4 +import BeautifulSoup as bs + + +def crawl_all_main_url(page=10): + # 默认抓取官网前十页招聘信息的url + all_url_list = [] + for _ in range(1, page+1): + url = 'http://zjc.ecit.edu.cn/jy/app/newslist.php?BigClassName=%D5%D0%C6%B8%D0%C5%CF%A2&Page={0}'.format(_) + page_html = requests.get(url).text + x_url_reg = re.compile('(.*?)') + explain_text = re.findall(explain_text_reg, html)[0] + if ('时间' and '地点') in explain_text: + return True + else: pass + def save_html(): + all_url_list = crawl_all_main_url() + for son_url in all_url_list: + if get_title(son_url): + text_html = requests.get(son_url).content.decode('gbk') + domain_url = 'http://zjc.ecit.edu.cn/jy' + img_url_reg = re.compile('border=0 src="\.\.(.*?)"') + child_url = re.findall(img_url_reg, text_html) + if child_url != []: + img_url = domain_url + child_url[0] + re_url = 'src="..{0}"'.format(child_url[0]) + end_url = 'src="{0}"'.format(img_url) + end_html = text_html.replace(re_url, end_url) + soup = bs(end_html, 'lxml') + text_div = soup.find_all('div', id='main')[0] + with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: + text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) file.write(text_html.encode('utf-8')) + else: + with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: + html = requests.get(son_url).content.decode('gbk') + soup = bs(text_html, 'lxml') + text_div = soup.find_all('div', id='main')[0] + text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) + 
file.write(text_html.encode('utf-8')) + else: continue +if __name__ == '__main__': +save_html() diff --git a/JDSpider.py b/spiderFile/JD_spider.py similarity index 93% rename from JDSpider.py rename to spiderFile/JD_spider.py index 9b8c66d..9d697dd 100644 --- a/JDSpider.py +++ b/spiderFile/JD_spider.py @@ -1,42 +1,42 @@ -import requests -import re -import pandas as pd - -def get_data(): - jj_url1 = 'http://search.jd.com/s_new.php?keyword=%E5%AE%B6%E5%B1%85%E7%94%A8%E5%93%81&enc=utf-8&qrst=1&rt=1&stop=1&pt=1&vt=2&sttr=1&offset=6&page=' - jj_url2 = '&s=53&click=0' - bt_ = [] - _id = [] - url_list = [] - for i in range(1, 10000, 2): - jj_url = jj_url1 + str(i) + jj_url2 - url_list.append(jj_url) - html = requests.get(jj_url).content.decode('utf-8') - reg1 = re.compile('') - bt = re.findall(reg1, html) - id_ = re.findall(reg2, html) - bt_.extend(bt) - _id.extend(id_) - return bt_, _id - -def split_str(_id): - zid = [] - for _ in _id: - zid.append(_.split('_')[2]) - return zid - -def save_data(zid, bt_): - data = pd.DataFrame({ - '标题': bt_, - 'ID': zid - }) - data.to_excel('./家居用品.xlsx', index=False) - -def start_main(): - bt_, _id = get_data() - zid = split_str(_id) - save_data(zid, bt_) - -if __name__ == '__main__': - start_main() +import requests +import re +import pandas as pd + +def get_data(): + jj_url1 = 'http://search.jd.com/s_new.php?keyword=%E5%AE%B6%E5%B1%85%E7%94%A8%E5%93%81&enc=utf-8&qrst=1&rt=1&stop=1&pt=1&vt=2&sttr=1&offset=6&page=' + jj_url2 = '&s=53&click=0' + bt_ = [] + _id = [] + url_list = [] + for i in range(1, 10, 2): + jj_url = jj_url1 + str(i) + jj_url2 + url_list.append(jj_url) + html = requests.get(jj_url).content.decode('utf-8') + reg1 = re.compile('') + bt = re.findall(reg1, html) + id_ = re.findall(reg2, html) + bt_.extend(bt) + _id.extend(id_) + return bt_, _id + +def split_str(_id): + zid = [] + for _ in _id: + zid.append(_.split('_')[2]) + return zid + +def save_data(zid, bt_): + data = pd.DataFrame({ + '标题': bt_, + 'ID': zid + }) + 
data.to_excel('./家居用品.xlsx', index=False) + +def start_main(): + bt_, _id = get_data() + zid = split_str(_id) + save_data(zid, bt_) + +if __name__ == '__main__': + start_main() diff --git a/spiderFile/baidu_sy_img.py b/spiderFile/baidu_sy_img.py new file mode 100644 index 0000000..b663ea0 --- /dev/null +++ b/spiderFile/baidu_sy_img.py @@ -0,0 +1,55 @@ +import requests +import re + +url = 'http://image.baidu.com/search/index' +headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate', + 'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&fm=detail&lm=-1&st=-1&sf=2&fmq=&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&oq=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&rsp=-1', + 'Cookie': 'HOSUPPORT=1; UBI=fi_PncwhpxZ%7ETaMMzY0i9qXJ9ATcu3rvxFIc-a7KI9byBcYk%7EjBVmPGIbL3LTKKJ2D17mh5VfJ5yjlCncAb2yhPI5sZM51Qo7tpCemygM0VNUzuTBJwYF8OYmi3nsCCzbpo5U9tLSzkZfcQ1rxUcJSzaipThg__; HISTORY=fec845b215cd8e8be424cf320de232722d0050; PTOKEN=ff58b208cc3c16596889e0a20833991d; STOKEN=1b1f4b028b5a4415aa1dd9794ff061d312ad2a822d52418f3f1ffabbc0ac6142; SAVEUSERID=0868a2b4c9d166dc85e605f0dfd153; USERNAMETYPE=3; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_18205_18559_17001_17073_15479_12166_18086_10634; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', +} + + +def get_html(url, headers): + data = { + 'cl': '2', + 'ct': '201326592', + 'face': '0', + 'fp': 'result', + 'gsm': '200001e', + 'ic': '0', + 'ie': 'utf-8', + 'ipn': 
'rj', + 'istype': '2', + 'lm': '-1', + 'nc': '1', + 'oe': 'utf-8', + 'pn': '30', + 'queryword': '高清摄影', + 'rn': '30', + 'st': '-1', + 'tn': 'resultjson_com', + 'word': '高清摄影' + } + + page = requests.get(url, data, headers=headers).text + return page + + +def get_img(page, headers): + # img_url_list = [] + reg = re.compile('http://.*?\.jpg') + imglist1 = re.findall(reg, page) + imglist2 = imglist1[0: len(imglist1): 3] + # [img_url_list.append(i) for i in imglist if not i in img_url_list] + x = 0 + for imgurl in imglist2: + bin = requests.get(imgurl, headers=headers).content + with open('./%s.jpg' % x, 'wb') as file: + file.write(bin) + x += 1 + +if __name__ == '__main__': + page = get_html(url, headers) + get_img(page, headers) diff --git a/spiderFile/baidu_wm_img.py b/spiderFile/baidu_wm_img.py new file mode 100644 index 0000000..542c782 --- /dev/null +++ b/spiderFile/baidu_wm_img.py @@ -0,0 +1,51 @@ +import requests +import re + +url = 'http://image.baidu.com/search/index' +date = { + 'cl': '2', + 'ct': '201326592', + 'fp': 'result', + 'gsm': '1e', + 'ie': 'utf-8', + 'ipn': 'rj', + 'istype': '2', + 'lm': '-1', + 'nc': '1', + 'oe': 'utf-8', + 'pn': '30', + 'queryword': '唯美意境图片', + 'rn': '30', + 'st': '-1', + 'tn': 'resultjson_com', + 'word': '唯美意境图片' +} +headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept': 'text/plain, */*; q=0.01', + 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': 'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs3&word=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87&ofr=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1', + 'Cookie': 'BDqhfp=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87%26%26NaN-1undefined-1undefined%26%260%26%261; Hm_lvt_737dbb498415dd39d8abf5bc2404b290=1455016371,1455712809,1455769605,1455772886; 
PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_15479_12166_18086_10634; Hm_lpvt_737dbb498415dd39d8abf5bc2404b290=1455788775; firstShowTip=1; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', + 'Connection': 'keep-alive' +} + + +def get_page(url, date, headers): + page = requests.get(url, date, headers=headers).text + return page + + +def get_img(page, headers): + reg = re.compile('http://.*?\.jpg') + imglist = re.findall(reg, page)[::3] + x = 0 + for imgurl in imglist: + with open('E:/Pic/%s.jpg' % x, 'wb') as file: + file.write(requests.get(imgurl, headers=headers).content) + x += 1 + +if __name__ == '__main__': + page = get_page(url, date, headers) + get_img(page, headers) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py new file mode 100644 index 0000000..76597f1 --- /dev/null +++ b/spiderFile/fuckCTF.py @@ -0,0 +1,142 @@ + +""" +author: 杨航锋 +date : 2018.8.19 +mood : 嗯,比较无聊,甚至还有点想吃黄焖鸡米饭😋 +""" + + +import os +import random +import functools + +from PIL import Image +from selenium import webdriver + + +class fuckCTF: + + def __init__(self, username, old_password): + self.url = "http://hetianlab.com/" + self.login_url = "http://hetianlab.com/loginLab.do" + self.username = username + self.old_password = old_password + self.new_password = (yield_new_password(), "******")[0] + self.options = webdriver.FirefoxOptions() + self.options.add_argument("-headless") + self.browser = webdriver.Firefox(options=self.options) + print("init ok") + + def login_hetian(self): + self.browser.get(self.login_url) + self.browser.find_element_by_id("userEmail").clear() + self.browser.find_element_by_id("userEmail").send_keys(self.username) + 
self.browser.find_element_by_id("passwordIn").clear() + self.browser.find_element_by_id("passwordIn").send_keys(self.old_password) + self.browser.get_screenshot_as_file(self.username + '/' + "login.png") + self.browser.find_element_by_id("registButIn").click() + self.browser.get(self.url) + print("login_hetian running ok!") + + def get_personl_information_page(self): + grzx_btn = self.browser.find_element_by_xpath("/html/body/div[1]/div[1]/div/div/div[2]/ul/li[2]/a") + self.browser.execute_script("$(arguments[0]).click()", grzx_btn) + self.browser.get("http://hetianlab.com/getUserInfo.do") + print("get_personl_information_page running ok!") + + def get_password_setting_page(self): + mmsz_btn = self.browser.find_element_by_xpath("/html/body/div[2]/div/div[1]/ul/ul[3]/li[2]") + self.browser.execute_script("$(arguments[0]).click()", mmsz_btn) + self.browser.find_element_by_id("person").click() + self.browser.find_element_by_class_name("check") + print("get_password_setting_page running ok!") + + def setting_password(self): + self.browser.find_element_by_id("oldpwd").clear() + self.browser.find_element_by_id("oldpwd").send_keys(self.old_password) + self.browser.find_element_by_id("newpwd").clear() + self.browser.find_element_by_id("newpwd").send_keys(self.new_password) + self.browser.find_element_by_id("quepwd").clear() + self.browser.find_element_by_id("quepwd").send_keys(self.new_password) + print("setting_password running ok!") + + def get_v_code(self): + status = self.browser.get_screenshot_as_file(self.username + '/' + "v_code.png") + if status: + img = Image.open(self.username + '/' + "v_code.png") + img.show() + self.v_code = input("请输入验证码: ") + self.browser.find_element_by_class_name("code").send_keys(self.v_code) + else: + raise("截屏失败!") + print("get_v_code running ok!") + + def submit_data(self): + self.browser.find_element_by_id("submitbtn").click() + self.browser.get_screenshot_as_file(self.username + '/' + "result.png") + self.browser.quit() + 
print("submit_data running ok!") + + def make_portfolio(self): + if not os.path.exists(self.username): + os.makedirs(self.username) + print("make_portfolio running ok!") + + def save_success_data(self): + with open("./username_and_password_data_successed.log", "a+") as fp: + fp.write( + "username" + ": {}".format(self.username) + "\t" + "password" + ": {}".format(self.new_password) + + "\n" + ) + print("save_success_data running ok!") + + def save_failed_data(self): + with open("./username_and_password_data_failed.log", "a+") as fp: + fp.write( + "username" + ": {}".format(self.username) + "\n" + ) + print("save_failed_data running ok!") + + def main(self): + try: + self.make_portfolio() + self.login_hetian() + self.get_personl_information_page() + self.get_password_setting_page() + self.setting_password() + self.get_v_code() + self.submit_data() + self.save_success_data() + except: + self.save_failed_data() + + +def gen_decorator(gen): + @functools.wraps(gen) + def inner(*args, **kwargs): + return next(gen(*args, **kwargs)) + return inner + + +@gen_decorator +def yield_new_password(): + strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") + yield "".join(random.choices(strings, k=6)) + + +def yield_usernames(n): + prefix = "ctf2018_gzhu" + postfix = "@dh.com" + for num in range(n): + if num < 10: + infix = '0' + str(num) + else: + infix = str(num) + yield prefix + infix + postfix + + +if __name__ == "__main__": + for username in yield_usernames(100): + ctfer = fuckCTF(username, "******") + ctfer.main() diff --git a/spiderFile/get_baike.py b/spiderFile/get_baike.py new file mode 100644 index 0000000..eb64fcd --- /dev/null +++ b/spiderFile/get_baike.py @@ -0,0 +1,20 @@ +import re +import requests as rq + +def get_baidubaike(): + + keyword = input('please input wordkey:') + url = 'http://baike.baidu.com/item/{}'.format(keyword) + html = rq.get(url).content.decode('utf-8') + + regex = re.compile('content="(.*?)">') + words = re.findall(regex, html)[0] + 
return words + +if __name__ == '__main__': + words = get_baidubaike() + print(words) + + + + diff --git a/spiderFile/get_history_weather.py b/spiderFile/get_history_weather.py new file mode 100644 index 0000000..77176fc --- /dev/null +++ b/spiderFile/get_history_weather.py @@ -0,0 +1,31 @@ +import re +import pandas as pd +import requests as rq +from bs4 import BeautifulSoup + + +def get_data(url): + html = rq.get(url).content.decode("gbk") + soup = BeautifulSoup(html, "html.parser") + tr_list = soup.find_all("tr") + dates, conditions, temperatures = [], [], [] + for data in tr_list[1:]: + sub_data = data.text.split() + dates.append(sub_data[0]) + conditions.append("".join(sub_data[1:3])) + temperatures.append("".join(sub_data[3:6])) + _data = pd.DataFrame() + _data["日期"] = dates + _data["天气状况"] = conditions + _data["气温"] = temperatures + return _data + +# 获取广州市2019年第一季度天气状况 +data_1_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201901.html") +data_2_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201902.html") +data_3_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201903.html") + + +data = pd.concat([data_1_month, data_2_month, data_3_month]).reset_index(drop=True) + +data.to_csv("guangzhou_history_weather_data.csv", index=False, encoding="utf-8") diff --git a/GetPhotos2.py b/spiderFile/get_photos.py similarity index 90% rename from GetPhotos2.py rename to spiderFile/get_photos.py index 24927c4..eae1725 100644 --- a/GetPhotos2.py +++ b/spiderFile/get_photos.py @@ -1,24 +1,25 @@ -import requests -from bs4 import BeautifulSoup - -url = 'http://tieba.baidu.com/p/4178314700' - -def GetHtml(url): - html = requests.get(url).text - return html - -def GetImg(html): - soup = BeautifulSoup(html, 'html.parser') - imglist = [] - for photourl in soup.find_all('img'): - imglist.append(photourl.get('src')) - x = 0 - for imgurl in imglist: - with open('E:/Pic/%s.jpg' % x, 'wb') as file: - 
file.write(requests.get(imgurl).content) - x += 1 - -if __name__ == '__main__': - html = GetHtml(url) - GetImg(html) - \ No newline at end of file +import requests +from bs4 import BeautifulSoup + +url = 'http://tieba.baidu.com/p/4178314700' + + +def GetHtml(url): + html = requests.get(url).text + return html + + +def GetImg(html): + soup = BeautifulSoup(html, 'html.parser') + imglist = [] + for photourl in soup.find_all('img'): + imglist.append(photourl.get('src')) + x = 0 + for imgurl in imglist: + with open('E:/Pic/%s.jpg' % x, 'wb') as file: + file.write(requests.get(imgurl).content) + x += 1 + +if __name__ == '__main__': + html = GetHtml(url) + GetImg(html) diff --git a/spiderFile/get_tj_accident_info.py b/spiderFile/get_tj_accident_info.py new file mode 100644 index 0000000..b8b2237 --- /dev/null +++ b/spiderFile/get_tj_accident_info.py @@ -0,0 +1,77 @@ +import re +import joblib +import asyncio +import aiohttp +import requests as rq +from bs4 import BeautifulSoup + +def yield_all_page_url(root_url, page=51): + """生成所有的页面url + @param root_url: 首页url + type root_url: str + @param page: 爬取的页面个数 + type page: int + """ + # 观察网站翻页结构可知 + page_url_list = [f"{root_url}index_{i}.html" for i in range(1, page)] + # 添加首页url + page_url_list.insert(0, root_url) + return page_url_list + +async def get_info_page_url(url, session): + regex = re.compile("') + html = rq.get(url, headers=HEADERS).content.decode("utf-8") + soup = BeautifulSoup(html) + title = re.search(title_regex, html) + content_1 = soup.find("div", class_="TRS_UEDITOR TRS_WEB") + content_2 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_word") + content_3 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_web") + if content_1: + content = content_1.text + elif content_2: + content = content_2.text + elif content_3: + content = content_3.text + else: + content = "" + return {"title": title.groups()[0], "content": content} + +def get_all_data(all_info_page_url_list): + all_data = 
[] + for i, url in enumerate(all_info_page_url_list): + all_data.append(get_data(url)) + print(i, url, all_data[-1]) + joblib.dump(all_data, "all_data.joblib") + + +if __name__ == "__main__": + root_url = "http://yjgl.tj.gov.cn/ZWGK6939/SGXX3106/" + agent_part_1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + agent_part_2 = "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36" + HEADERS = {"Host": "yjgl.tj.gov.cn", + "Connection": "keep-alive", + "User-Agent": agent_part_1 + agent_part_2, + "Referer": "http://static.bshare.cn/"} + page_url_list = yield_all_page_url(root_url, page=51) + all_info_page_url_list = asyncio.run(get_all_info_page_url(root_url, page_url_list)) + joblib.dump("all_info_page_url_list", all_info_page_url_list) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py new file mode 100644 index 0000000..f1fce0a --- /dev/null +++ b/spiderFile/get_top_sec_com.py @@ -0,0 +1,95 @@ +import re +import os +import time +import joblib +import asyncio +import aiohttp +import requests as rq + +import pandas as pd +import matplotlib.pyplot as plt +# import nest_asyncio +# nest_asyncio.apply() + +class getTopSecCom: + def __init__(self, top=None): + self.headers = {"Referer": "http://quote.eastmoney.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"} + self.bk_url = "http://71.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124034348162124675374_1612595298605&pn=1&pz=85&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f62&fs=b:BK0655&fields=f12,f14&_=1612595298611" + self.shares_api = "https://xueqiu.com/S/" + self.top = top + if not os.path.exists("./useful_sec_com_list"): + self.useful_sec_com_list = self.get_sec_com_code() + else: + with open("./useful_sec_com_list", "rb") as fp: + self.useful_sec_com_list = joblib.load(fp) + + def get_sec_com_code(self): + html = rq.get(self.bk_url, 
headers=self.headers).content.decode("utf-8") + sec_com_list = eval(re.findall("\[(.*?)\]", html)[0]) + useful_sec_com_list = [[i["f12"], i["f14"]] for i in sec_com_list if "ST" not in i["f14"]] + + # 0和3开头的为深证上市股票前缀为sz,6开头的为上证上市股票前缀为sh + for sec_com in useful_sec_com_list: + if sec_com[0][0] == "6": + sec_com[0] = "sh" + sec_com[0] + else: + sec_com[0] = "sz" + sec_com[0] + with open("useful_sec_com_list", "wb") as fp: + joblib.dump(useful_sec_com_list, fp) + return useful_sec_com_list + + async def async_get_shares_details(self, sec_com, url): + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=self.headers) as response: + html = await response.text() + market_value = re.search("总市值:(.*?)亿", html) + if market_value: + return [*sec_com, market_value.groups()[0]] + + async def async_get_all_shares(self): + tasks = [] + for sec_com in self.useful_sec_com_list: + url = self.shares_api + sec_com[0] + tasks.append( + asyncio.create_task( + self.async_get_shares_details(sec_com, url) + ) + ) + done, pendding = await asyncio.wait(tasks) + return [share.result() for share in done if share.result()] + + def get_shares_details(self): + all_shares = [] + for sec_com in self.useful_sec_com_list: + url = self.shares_api + sec_com[0] + response = rq.get(url, headers=self.headers).content.decode("utf-8") + market_value = re.search("总市值:(.*?)亿", response) + if market_value: + all_shares.append([*sec_com, market_value.groups()[0]]) + return all_shares + + def yield_picture(self, save_path): + # all_shares = self.get_shares_details() # 同步代码 + all_shares = asyncio.run(self.async_get_all_shares()) # 异步代码 + df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) + df["市值(亿)"] = df["市值(亿)"].astype(float) + date = time.strftime("%Y年%m月%d日", time.localtime()) + df.sort_values(by="市值(亿)", ascending=False, inplace=True) + df.index = range(1, df.shape[0]+1) + + plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.rcParams['axes.unicode_minus'] = False 
+ + + fig = plt.figure(dpi=400) + ax = fig.add_subplot(111, frame_on=False) + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) + _ = pd.plotting.table(ax, df, loc="best", cellLoc="center") + ax.set_title(f"{date}A股网安版块公司市值排名", fontsize=10) + plt.savefig(save_path, bbox_inches="tight") + +if __name__ == "__main__": + m = getTopSecCom() + m.yield_picture("rank.png") diff --git a/getWebAllImg.py b/spiderFile/get_web_all_img.py similarity index 62% rename from getWebAllImg.py rename to spiderFile/get_web_all_img.py index 56306d9..616befe 100644 --- a/getWebAllImg.py +++ b/spiderFile/get_web_all_img.py @@ -1,67 +1,73 @@ -import re -import time -import requests - -def get_html(url, headers): - html = requests.get(url, timeout = 100, headers = headers).text - return html - -def get_main_url(html): - reg = re.compile('http://.*?\.jpg') - main_imglist = re.findall(reg, html) - return main_imglist - -def get_son_url(html): - initurl = 'http://www.woyaogexing.com' - reg = re.compile('/tupian/weimei/\d+/\d+\.html') - son_urllist_init = re.findall(reg, html) - son_urlist = set(son_urllist_init) - son_url_final = [] - for son_url in son_urlist: - son_url_final.append(initurl + son_url) - return son_url_final #结果是所有含有图片的网页地址 - -def get_all_sonurl(son_url_final, headers): - son_imglist = [] - for sonurl in son_url_final: - son_html = requests.get(sonurl, timeout = 100, headers = headers).text - son_reg = re.compile('http://.*?\.jpg') - son_imglist1 = re.findall(son_reg, son_html) - for temp in son_imglist1: - son_imglist.append(temp) - return son_imglist #结果是所有子网页图片的地址 - -def get_all_img(main_imglist, son_imglist, headers): - global x #使用全局变量使每次的变量不清除,这个问题有待完美解决! 
- for imgurl in main_imglist: - son_imglist.append(imgurl) - for imgurl in son_imglist: - with open('E:/Pic2/%s.jpg' % x, 'wb') as file: - file.write(requests.get(imgurl, timeout = 100, headers = headers).content) - time.sleep(0.1) - x += 1 - -def turn_page(): - page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] - for i in range(1, 7): - page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') - return page_list - -if __name__ == '__main__': - headers = { - 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Accept' : 'text/plain, */*; q=0.01', - 'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', - 'Accept-Encoding' : 'gzip, deflate', - 'Cookie' : 'bdshare_firstime=1456041345958; Hm_lvt_a077b6b44aeefe3829d03416d9cb4ec3=1456041346; Hm_lpvt_a077b6b44aeefe3829d03416d9cb4ec3=1456048504', - } - x = 0 - page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] - for i in range(2, 20): - page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') - for p in range(6): - html = get_html(page_list[p], headers) - main_imglist = get_main_url(html) - son_url_final = get_son_url(html) - son_imglist = get_all_sonurl(son_url_final, headers) - get_all_img(main_imglist, son_imglist, headers) \ No newline at end of file +import re +import time +import requests + + +def get_html(url, headers): + html = requests.get(url, timeout=100, headers=headers).text + return html + + +def get_main_url(html): + reg = re.compile('http://.*?\.jpg') + main_imglist = re.findall(reg, html) + return main_imglist + + +def get_son_url(html): + initurl = 'http://www.woyaogexing.com' + reg = re.compile('/tupian/weimei/\d+/\d+\.html') + son_urllist_init = re.findall(reg, html) + son_urlist = set(son_urllist_init) + son_url_final = [] + for son_url in son_urlist: + son_url_final.append(initurl + son_url) + return son_url_final # 结果是所有含有图片的网页地址 + + +def 
get_all_sonurl(son_url_final, headers): + son_imglist = [] + for sonurl in son_url_final: + son_html = requests.get(sonurl, timeout=100, headers=headers).text + son_reg = re.compile('http://.*?\.jpg') + son_imglist1 = re.findall(son_reg, son_html) + for temp in son_imglist1: + son_imglist.append(temp) + return son_imglist # 结果是所有子网页图片的地址 + + +def get_all_img(main_imglist, son_imglist, headers): + global x # 使用全局变量使每次的变量不清除,这个问题有待完美解决! + for imgurl in main_imglist: + son_imglist.append(imgurl) + for imgurl in son_imglist: + with open('E:/Pic2/%s.jpg' % x, 'wb') as file: + file.write(requests.get(imgurl, timeout=100, headers=headers).content) + time.sleep(0.1) + x += 1 + + +def turn_page(): + page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] + for i in range(1, 7): + page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') + return page_list + +if __name__ == '__main__': + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', + 'Accept': 'text/plain, */*; q=0.01', + 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate', + 'Cookie': 'bdshare_firstime=1456041345958; Hm_lvt_a077b6b44aeefe3829d03416d9cb4ec3=1456041346; Hm_lpvt_a077b6b44aeefe3829d03416d9cb4ec3=1456048504', + } + x = 0 + page_list = ['http://www.woyaogexing.com/tupian/weimei/index.html'] + for i in range(2, 20): + page_list.append('http://www.woyaogexing.com/tupian/weimei/index_' + str(i) + '.html') + for p in range(6): + html = get_html(page_list[p], headers) + main_imglist = get_main_url(html) + son_url_final = get_son_url(html) + son_imglist = get_all_sonurl(son_url_final, headers) + get_all_img(main_imglist, son_imglist, headers) diff --git a/githubHot.py b/spiderFile/github_hot.py similarity index 99% rename from githubHot.py rename to spiderFile/github_hot.py index 7c1c8a3..ea9726e 100644 --- a/githubHot.py +++ b/spiderFile/github_hot.py @@ -3,6 +3,7 @@ import 
pandas as pd import numpy as np + def hot_github(keyword): url = 'https://github.com/trending/{0}'.format(keyword) main_url = 'https://github.com{0}' diff --git a/spiderFile/kantuSpider.py b/spiderFile/kantuSpider.py new file mode 100644 index 0000000..13cb76e --- /dev/null +++ b/spiderFile/kantuSpider.py @@ -0,0 +1,46 @@ +import re +import os +import time + +import requests as rq + + +def get_all_page(page): + url = 'http://52kantu.cn/?page={}'.format(page) + html = rq.get(url).text + + return html + + +def get_img_url(html): + regex = re.compile('') + img_url = re.findall(reg, page) + if img_url != []: + with open('./{}.jpg'.format(count), 'wb') as file: + try: + img_data = requests.get(img_url[0]).content + file.write(img_data) + count += 1 + except: + pass +print('OK!') + + + + + + + + + + + + + + + + + + diff --git a/spiderFile/one_update.py b/spiderFile/one_update.py new file mode 100644 index 0000000..d457785 --- /dev/null +++ b/spiderFile/one_update.py @@ -0,0 +1,38 @@ +import re +import requests as rq + +ROOT_URL = "http://wufazhuce.com/one/" +URL_NUM = 14 + +def yield_url(ROOT_URL, URL_NUM): + return ROOT_URL + str(URL_NUM) + +def get_html(url): + return rq.get(url).content.decode("utf-8") + +def get_data(html): + img_url_regex = re.compile('') + cite_regex = re.compile('
(.*?)
', re.S) + img_url = re.findall(img_url_regex, html)[0] + cite = re.findall(cite_regex, html)[0].strip() + return img_url, cite + +def save_data(img_url, cite, URL_NUM): + with open("./{}.jpg".format(URL_NUM), "wb") as fp: + fp.write(rq.get(img_url).content) + with open("./cite{}.txt".format(URL_NUM), "w") as fp: + fp.write(cite) + return URL_NUM + 1 + +def main(ROOT_URL, URL_NUM, number): + for _ in range(number): + url = yield_url(ROOT_URL, URL_NUM) + html = get_html(url) + img_url, cite = get_data(html) + URL_NUM = save_data(img_url, cite, URL_NUM) + +if __name__ == "__main__": + try: + main(ROOT_URL, URL_NUM, 20) + except: + pass diff --git a/spiderFile/search_useful_camera_ip_address.py b/spiderFile/search_useful_camera_ip_address.py new file mode 100644 index 0000000..652b180 --- /dev/null +++ b/spiderFile/search_useful_camera_ip_address.py @@ -0,0 +1,92 @@ +import re +import tqdm +import time +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import NoAlertPresentException, TimeoutException + +# 扫描网站可自己寻找,代码仅演示逻辑 +country = "IN" #印度 +city = "" +login_url = "" +query_url = "" +city_url = "" +USER_NAME = "" +PASSWORD = "" + +# 无头浏览器配置 +chrome_options = Options() +chrome_options.add_argument("--headless") +chrome_options.add_argument("--disable-gpu") +chrome_options.add_argument("log-level=3") +browser = webdriver.Chrome(chrome_options=chrome_options) +browser.set_page_load_timeout(10) + +#登录模块 +browser.get(login_url) +WebDriverWait(browser, 30).until( + EC.presence_of_element_located((By.XPATH, '//*[@name="login_submit"]')) +) +browser.find_element_by_id("username").clear() +browser.find_element_by_id("username").send_keys(USER_NAME) +browser.find_element_by_id("password").clear() 
+browser.find_element_by_id("password").send_keys(PASSWORD) +browser.find_element_by_name("login_submit").click() + +#抓取潜在的摄像头url,默认抓取两页 +if city: + query_url += city_url + +latent_camera_url = [] +browser.get(query_url) +WebDriverWait(browser, 30).until( + EC.presence_of_element_located((By.CLASS_NAME, 'button')) +) +html = browser.page_source +latent_camera_url += re.findall('
') kids_url = [domain_url.format(i) for i in re.findall(kids_url_regex, start_html)] for kid_url in kids_url: @@ -39,6 +39,6 @@ def Spidermain(page=11): file.write(s.get(pic_url, timeout=5).content) except: pass - + if __name__ == '__main__': Spidermain() diff --git a/student_img.py b/student_img.py deleted file mode 100644 index a23c213..0000000 --- a/student_img.py +++ /dev/null @@ -1,26 +0,0 @@ -import requests - -url = '' -banji = [] -zhuanye = [] -for a in range(10): - for b in range(10): - banji.append(str(a) + '0' + str(b)) -for c in range(10): - zhuanye.append('20' + str(c)) - -for year in range(2011, 2015): - for xh in zhuanye: - for nj in banji: - for i in range(1, 35): - if i < 10: - xuehao = str(year) + str(xh) + str(nj) + '0' + str(i) - student_url = url + xuehao - with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: - file.write(requests.get(student_url).content) - else: - xuehao = str(year) + str(xh) + str(nj) + str(i) - student_url = url + xuehao - with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file: - file.write(requests.get(student_url).content) -print('OK!')