Date: Sun, 7 Feb 2021 10:54:20 +0800
Subject: [PATCH 44/65] Create get_top_sec_com.py
---
spiderFile/get_top_sec_com.py | 67 +++++++++++++++++++++++++++++++++++
1 file changed, 67 insertions(+)
create mode 100644 spiderFile/get_top_sec_com.py
diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
new file mode 100644
index 0000000..0007e77
--- /dev/null
+++ b/spiderFile/get_top_sec_com.py
@@ -0,0 +1,67 @@
+import re
+import os
+import joblib
+import requests as rq
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+class getTopSecCom:
+ def __init__(self, top=None):
+ self.headers = {"Referer": "http://quote.eastmoney.com/",
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"}
+ self.bk_url = "http://71.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124034348162124675374_1612595298605&pn=1&pz=85&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f62&fs=b:BK0655&fields=f12,f14&_=1612595298611"
+ self.shares_api = "https://xueqiu.com/S/"
+ self.top = top
+ if not os.path.exists("./useful_sec_com_list"):
+ self.useful_sec_com_list = self.get_sec_com_code()
+ else:
+ with open("./useful_sec_com_list", "rb") as fp:
+ self.useful_sec_com_list = joblib.load(fp)
+
+ def get_sec_com_code(self):
+ html = rq.get(self.bk_url, headers=self.headers).content.decode("utf-8")
+ sec_com_list = eval(re.findall("\[(.*?)\]", html)[0])
+ useful_sec_com_list = [[i["f12"], i["f14"]] for i in sec_com_list if "ST" not in i["f14"]]
+
+ # 0和3开头的为深证上市股票前缀为sz,6开头的为上证上市股票前缀为sh
+ for sec_com in useful_sec_com_list:
+ if sec_com[0][0] == "6":
+ sec_com[0] = "sh" + sec_com[0]
+ else:
+ sec_com[0] = "sz" + sec_com[0]
+ with open("useful_sec_com_list", "wb") as fp:
+ joblib.dump(useful_sec_com_list, fp)
+ return useful_sec_com_list
+
+ def get_shares_details(self):
+ all_shares = []
+ for sec_com in self.useful_sec_com_list:
+ url = self.shares_api + sec_com[0]
+ response = rq.get(url, headers=headers).content.decode("utf-8")
+ market_value = re.search("总市值:(.*?)亿", response)
+ if market_value:
+ all_shares.append([*sec_com, market_value.groups()[0]])
+ return all_shares
+
+ def yield_picture(self, save_path):
+ all_shares = self.get_shares_details()
+ df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"])
+ df["市值(亿)"] = df["市值(亿)"].astype(float)
+ df.sort_values(by="市值(亿)", ascending=False, inplace=True)
+ height = 0.18 * df.shape[0]
+ if self.top and 0< self.top <= df.shape[0]:
+ df = df.iloc[:self.top, :]
+ height = 0.18 * self.top
+ df.index = range(1, df.shape[0]+1)
+
+ plt.rcParams['font.sans-serif'] = ['SimHei']
+ plt.rcParams['axes.unicode_minus'] = False
+
+
+ fig = plt.figure(figsize=(2.5, height), dpi=400)
+ ax = fig.add_subplot(111, frame_on=False)
+ ax.xaxis.set_visible(False)
+ ax.yaxis.set_visible(False)
+ _ = table(ax, df, loc="center")
+ fig.savefig(save_path)
From 4d86db327934e60e75223219652440ed60c799c2 Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 7 Feb 2021 10:56:46 +0800
Subject: [PATCH 45/65] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 4fd02b1..87ed4b0 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@
16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。**
17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。**
18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。**
+19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。**
---
# spiderAPI模块简介
From ac3c78a8b82168931d9ec36e7f7669528a9aa347 Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 28 Feb 2021 10:56:40 +0800
Subject: [PATCH 46/65] Update get_top_sec_com.py
---
spiderFile/get_top_sec_com.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index 0007e77..3a3186c 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -38,7 +38,7 @@ def get_shares_details(self):
all_shares = []
for sec_com in self.useful_sec_com_list:
url = self.shares_api + sec_com[0]
- response = rq.get(url, headers=headers).content.decode("utf-8")
+ response = rq.get(url, headers=self.headers).content.decode("utf-8")
market_value = re.search("总市值:(.*?)亿", response)
if market_value:
all_shares.append([*sec_com, market_value.groups()[0]])
@@ -49,10 +49,10 @@ def yield_picture(self, save_path):
df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"])
df["市值(亿)"] = df["市值(亿)"].astype(float)
df.sort_values(by="市值(亿)", ascending=False, inplace=True)
- height = 0.18 * df.shape[0]
+ height = 0.2 * df.shape[0]
if self.top and 0< self.top <= df.shape[0]:
df = df.iloc[:self.top, :]
- height = 0.18 * self.top
+ height = 0.2 * self.top
df.index = range(1, df.shape[0]+1)
plt.rcParams['font.sans-serif'] = ['SimHei']
@@ -63,5 +63,5 @@ def yield_picture(self, save_path):
ax = fig.add_subplot(111, frame_on=False)
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
- _ = table(ax, df, loc="center")
- fig.savefig(save_path)
+ _ = pd.plotting.table(ax, df, loc="center", cellLoc="center")
+ plt.savefig(save_path)
From 57c0937cc3349facd0f03ec1b21c2d833d1fe8d4 Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 16 Apr 2021 15:46:10 +0800
Subject: [PATCH 47/65] Update get_top_sec_com.py
add async function.
---
spiderFile/get_top_sec_com.py | 25 ++++++++++++++++++++++++-
1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index 3a3186c..be0f706 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -1,6 +1,8 @@
import re
import os
import joblib
+import asyncio
+import aiohttp
import requests as rq
import pandas as pd
@@ -33,6 +35,26 @@ def get_sec_com_code(self):
with open("useful_sec_com_list", "wb") as fp:
joblib.dump(useful_sec_com_list, fp)
return useful_sec_com_list
+
+ async def async_get_shares_details(self, sec_com, url):
+ async with aiohttp.ClientSession() as session:
+ async with session.get(url, headers=self.headers) as response:
+ html = await response.text()
+ market_value = re.search("| 总市值:(.*?)亿", html)
+ if market_value:
+ return [*sec_com, market_value.groups()[0]]
+
+ async def async_get_all_shares(self):
+ tasks = []
+ for sec_com in self.useful_sec_com_list:
+ url = self.shares_api + sec_com[0]
+ tasks.append(
+ asyncio.create_task(
+ self.async_get_shares_details(sec_com, url)
+ )
+ )
+ done, pendding = await asyncio.wait(tasks)
+ return [share.result() for share in done if share.result()]
def get_shares_details(self):
all_shares = []
@@ -45,7 +67,8 @@ def get_shares_details(self):
return all_shares
def yield_picture(self, save_path):
- all_shares = self.get_shares_details()
+ # all_shares = self.get_shares_details() # 同步代码
+ all_shares = asyncio.run(self.async_get_all_shares()) # 异步代码
df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"])
df["市值(亿)"] = df["市值(亿)"].astype(float)
df.sort_values(by="市值(亿)", ascending=False, inplace=True)
From 4e2dc05f1935659700358b10604ac6622aced224 Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 16 Apr 2021 16:01:25 +0800
Subject: [PATCH 48/65] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 87ed4b0..9f1e6e2 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@
16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。**
17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。**
18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。**
-19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。**
+19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。**
---
# spiderAPI模块简介
From 27f856ffcf3dc76ef6762c41973d889583604e69 Mon Sep 17 00:00:00 2001
From: yhf
Date: Sat, 17 Apr 2021 10:12:58 +0800
Subject: [PATCH 49/65] Create get_tj_accident_info.py
---
spiderFile/get_tj_accident_info.py | 77 ++++++++++++++++++++++++++++++
1 file changed, 77 insertions(+)
create mode 100644 spiderFile/get_tj_accident_info.py
diff --git a/spiderFile/get_tj_accident_info.py b/spiderFile/get_tj_accident_info.py
new file mode 100644
index 0000000..b8b2237
--- /dev/null
+++ b/spiderFile/get_tj_accident_info.py
@@ -0,0 +1,77 @@
+import re
+import joblib
+import asyncio
+import aiohttp
+import requests as rq
+from bs4 import BeautifulSoup
+
+def yield_all_page_url(root_url, page=51):
+ """生成所有的页面url
+ @param root_url: 首页url
+ type root_url: str
+ @param page: 爬取的页面个数
+ type page: int
+ """
+ # 观察网站翻页结构可知
+ page_url_list = [f"{root_url}index_{i}.html" for i in range(1, page)]
+ # 添加首页url
+ page_url_list.insert(0, root_url)
+ return page_url_list
+
+async def get_info_page_url(url, session):
+ regex = re.compile("')
+ html = rq.get(url, headers=HEADERS).content.decode("utf-8")
+ soup = BeautifulSoup(html)
+ title = re.search(title_regex, html)
+ content_1 = soup.find("div", class_="TRS_UEDITOR TRS_WEB")
+ content_2 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_word")
+ content_3 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_web")
+ if content_1:
+ content = content_1.text
+ elif content_2:
+ content = content_2.text
+ elif content_3:
+ content = content_3.text
+ else:
+ content = ""
+ return {"title": title.groups()[0], "content": content}
+
+def get_all_data(all_info_page_url_list):
+ all_data = []
+ for i, url in enumerate(all_info_page_url_list):
+ all_data.append(get_data(url))
+ print(i, url, all_data[-1])
+ joblib.dump(all_data, "all_data.joblib")
+
+
+if __name__ == "__main__":
+ root_url = "http://yjgl.tj.gov.cn/ZWGK6939/SGXX3106/"
+ agent_part_1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ agent_part_2 = "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
+ HEADERS = {"Host": "yjgl.tj.gov.cn",
+ "Connection": "keep-alive",
+ "User-Agent": agent_part_1 + agent_part_2,
+ "Referer": "http://static.bshare.cn/"}
+ page_url_list = yield_all_page_url(root_url, page=51)
+ all_info_page_url_list = asyncio.run(get_all_info_page_url(root_url, page_url_list))
+ joblib.dump("all_info_page_url_list", all_info_page_url_list)
From cf927e594a2cb1ff224f95347a513c729e2b71ed Mon Sep 17 00:00:00 2001
From: yhf
Date: Sat, 17 Apr 2021 10:15:39 +0800
Subject: [PATCH 50/65] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 9f1e6e2..4062693 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@
17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。**
18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。**
19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。**
+20. [get_tf_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **同步和异步编程结合获取天津市应急管理局所有事故信息。**
---
# spiderAPI模块简介
From 198e817f146d211c9d4c3a470de44eb372d7c291 Mon Sep 17 00:00:00 2001
From: yhf
Date: Wed, 28 Apr 2021 12:14:53 +0800
Subject: [PATCH 51/65] Update get_top_sec_com.py
---
spiderFile/get_top_sec_com.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index be0f706..5229bbd 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -87,4 +87,4 @@ def yield_picture(self, save_path):
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
_ = pd.plotting.table(ax, df, loc="center", cellLoc="center")
- plt.savefig(save_path)
+ plt.savefig(save_path, bbox_inches="tight")
From 66304e7b63b028a9a031428ea7b1d3859d12627a Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 14 May 2021 16:45:31 +0800
Subject: [PATCH 52/65] Update get_top_sec_com.py
---
spiderFile/get_top_sec_com.py | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index 5229bbd..caf9ac2 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -1,5 +1,6 @@
import re
import os
+import time
import joblib
import asyncio
import aiohttp
@@ -71,20 +72,22 @@ def yield_picture(self, save_path):
all_shares = asyncio.run(self.async_get_all_shares()) # 异步代码
df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"])
df["市值(亿)"] = df["市值(亿)"].astype(float)
+ date = time.strftime("%Y年%m月%d日", time.localtime())
df.sort_values(by="市值(亿)", ascending=False, inplace=True)
- height = 0.2 * df.shape[0]
- if self.top and 0< self.top <= df.shape[0]:
- df = df.iloc[:self.top, :]
- height = 0.2 * self.top
df.index = range(1, df.shape[0]+1)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
- fig = plt.figure(figsize=(2.5, height), dpi=400)
+ fig = plt.figure(dpi=400)
ax = fig.add_subplot(111, frame_on=False)
ax.xaxis.set_visible(False)
- ax.yaxis.set_visible(False)
- _ = pd.plotting.table(ax, df, loc="center", cellLoc="center")
+ ax.yaxis.set_visible(False)
+ _ = pd.plotting.table(ax, df, loc="best", cellLoc="center")
+ ax.set_title(f"{date}A股网安版块公司市值排名", fontsize=10)
plt.savefig(save_path, bbox_inches="tight")
+
+if __name__ == "__main__":
+ m = getTopSecCom()
+ m.yield_picture("rank.png")
From 5bb06b8864226e6977501bf08f56db889e0f8e7c Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 14 May 2021 16:46:33 +0800
Subject: [PATCH 53/65] Update get_top_sec_com.py
---
spiderFile/get_top_sec_com.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index caf9ac2..f1fce0a 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -8,6 +8,8 @@
import pandas as pd
import matplotlib.pyplot as plt
+# import nest_asyncio
+# nest_asyncio.apply()
class getTopSecCom:
def __init__(self, top=None):
From bfa05ebfd961c00a5840812a536860eba7f92faf Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 2 Jan 2022 09:11:53 +0800
Subject: [PATCH 54/65] Update README.md
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 4062693..bab2523 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
|__/
—————— by yanghangfeng
```
-# PythonCrawler: 用 python编写的爬虫项目集合:bug:
+# PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者千万要遵循中华人民共和国法律!)
@@ -38,7 +38,7 @@
3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。**
4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。**
5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。**
-6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。**
+6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取学籍证件照。**
7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。**
8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。**
9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。**
From 4630508b964288937b0c68dd9a845dcbbc0a605b Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 2 Jan 2022 09:13:18 +0800
Subject: [PATCH 55/65] Update student_img.py
---
spiderFile/student_img.py | 25 +------------------------
1 file changed, 1 insertion(+), 24 deletions(-)
diff --git a/spiderFile/student_img.py b/spiderFile/student_img.py
index d3135ea..a66436d 100644
--- a/spiderFile/student_img.py
+++ b/spiderFile/student_img.py
@@ -1,29 +1,6 @@
import requests
"""
-思路:去官网自己的主页,看自己的照片的url然后你懂的。
+思路:去官网自己的主页,看自己的学籍照片的url。
"""
-url = ''
-banji = []
-zhuanye = []
-for a in range(10):
- for b in range(10):
- banji.append(str(a) + '0' + str(b))
-for c in range(10):
- zhuanye.append('20' + str(c))
-for year in range(2011, 2015):
- for xh in zhuanye:
- for nj in banji:
- for i in range(1, 35):
- if i < 10:
- xuehao = str(year) + str(xh) + str(nj) + '0' + str(i)
- student_url = url + xuehao
- with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file:
- file.write(requests.get(student_url).content)
- else:
- xuehao = str(year) + str(xh) + str(nj) + str(i)
- student_url = url + xuehao
- with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file:
- file.write(requests.get(student_url).content)
-print('OK!')
From 483c276b7c700f16ec7b7f5a62b194a93dedf26e Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 2 Jan 2022 09:14:46 +0800
Subject: [PATCH 56/65] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index bab2523..36c5fdb 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@
15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。**
16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。**
17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。**
-18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。**
+18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **摄像头弱密码安全科普。**
19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。**
20. [get_tf_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **同步和异步编程结合获取天津市应急管理局所有事故信息。**
---
From 81a5eed902dd2514464ec7da5e5fe465eb8b082f Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 8 Jul 2022 15:43:54 +0800
Subject: [PATCH 57/65] Update README.md
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 36c5fdb..6b92523 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
|__/
—————— by yanghangfeng
```
-# PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者千万要遵循中华人民共和国法律!)
+# PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者务必遵循中华人民共和国法律!)
@@ -38,7 +38,7 @@
3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。**
4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。**
5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。**
-6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取学籍证件照。**
+6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取自己学籍证件照。**
7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。**
8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。**
9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。**
From 0d60014d61a5b159257a366a9088752d6c6059b3 Mon Sep 17 00:00:00 2001
From: yhf
Date: Mon, 24 Oct 2022 14:59:17 +0800
Subject: [PATCH 58/65] Update README.md
---
README.md | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 6b92523..277bb18 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,13 @@
-
+对于很多小伙伴咨询IP代理的问题,推荐一个产品,链接:http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf
+产品介绍:
+1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池。
+2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景。
+3、支持HTTP/HTTPS/Socks5协议
+4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手
+5、支持海量IP免费试用
# spiderFile模块简介
From bf967800e286a4241518bc3304d0c593fc7d3062 Mon Sep 17 00:00:00 2001
From: yhf
Date: Mon, 24 Oct 2022 15:01:33 +0800
Subject: [PATCH 59/65] Update README.md
---
README.md | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 277bb18..dd07248 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,14 @@
-对于很多小伙伴咨询IP代理的问题,推荐一个产品,链接:http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf
+对于很多小伙伴咨询IP代理的问题,推荐一个产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf)
产品介绍:
-1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池。
-2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景。
-3、支持HTTP/HTTPS/Socks5协议
+1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池;
+2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景;
+3、支持HTTP/HTTPS/Socks5协议;
4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手
-5、支持海量IP免费试用
+;
+5、支持海量IP免费试用。
# spiderFile模块简介
From 72ba185ce7a774f0a06515a08db02ff494b6c49e Mon Sep 17 00:00:00 2001
From: yhf
Date: Mon, 24 Oct 2022 15:30:14 +0800
Subject: [PATCH 60/65] Update README.md
---
README.md | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index dd07248..7dc2c5b 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,14 @@
-对于很多小伙伴咨询IP代理的问题,推荐一个产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf)
-产品介绍:
-1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池;
-2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景;
-3、支持HTTP/HTTPS/Socks5协议;
+
+由于很多小伙伴都咨询IP代理的问题,在这里推荐大家一个好用的产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf)
+产品介绍:
+1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池;
+2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景;
+3、支持HTTP/HTTPS/Socks5协议;
4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手
-;
+;
5、支持海量IP免费试用。
# spiderFile模块简介
From f9315ea9a0ec52e1e24aa62211ebfdd797903f2c Mon Sep 17 00:00:00 2001
From: yhf
Date: Thu, 2 Mar 2023 19:02:48 +0800
Subject: [PATCH 61/65] Update README.md
---
README.md | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/README.md b/README.md
index 7dc2c5b..c54c2a1 100644
--- a/README.md
+++ b/README.md
@@ -30,15 +30,6 @@
-由于很多小伙伴都咨询IP代理的问题,在这里推荐大家一个好用的产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf)
-产品介绍:
-1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池;
-2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景;
-3、支持HTTP/HTTPS/Socks5协议;
-4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手
-;
-5、支持海量IP免费试用。
-
# spiderFile模块简介
1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。**
From fac1ffc9c9e6a04875b55d54cd67dbf72ac39db2 Mon Sep 17 00:00:00 2001
From: yhf
Date: Thu, 17 Apr 2025 14:14:18 +0800
Subject: [PATCH 62/65] Update README.md
---
README.md | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/README.md b/README.md
index c54c2a1..6f96817 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,18 @@
+# IPWO全球代理资源 | 为采集、跨境与测试项目提供支持(免费试用,爬虫使用强烈推荐!!!)
+### 官网地址
+[👉 访问 IPWO 官网](https://www.ipwo.net/?code=WSESV2ONN)
+### 产品简介
+* 免费试用,先体验再选择
+* 9000万+真实住宅IP,覆盖220+国家和地区
+* 支持动态住宅代理、静态住宅代理(ISP)
+* 适用于数据抓取、电商、广告验证、SEO监控等场景
+* 支持HTTP/HTTPS/SOCKS5协议,兼容性强
+* 纯净IP池,实时更新,99.9%连接成功率
+* 支持指定国家城市地区访问,保护隐私
+
# spiderFile模块简介
1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。**
From db47c2a3efb062fb3ba228d73bf148701b30af7d Mon Sep 17 00:00:00 2001
From: yhf
Date: Sat, 9 May 2026 11:11:26 +0800
Subject: [PATCH 63/65] Clean up README and update project details
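Remove the IPWO proxy promotion section added in PATCH 62/65; README.md returns to its previous content (blob c54c2a1). No ASCII art or project description is changed.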
---
README.md | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/README.md b/README.md
index 6f96817..c54c2a1 100644
--- a/README.md
+++ b/README.md
@@ -30,18 +30,6 @@
-# IPWO全球代理资源 | 为采集、跨境与测试项目提供支持(免费试用,爬虫使用强烈推荐!!!)
-### 官网地址
-[👉 访问 IPWO 官网](https://www.ipwo.net/?code=WSESV2ONN)
-### 产品简介
-* 免费试用,先体验再选择
-* 9000万+真实住宅IP,覆盖220+国家和地区
-* 支持动态住宅代理、静态住宅代理(ISP)
-* 适用于数据抓取、电商、广告验证、SEO监控等场景
-* 支持HTTP/HTTPS/SOCKS5协议,兼容性强
-* 纯净IP池,实时更新,99.9%连接成功率
-* 支持指定国家城市地区访问,保护隐私
-
# spiderFile模块简介
1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。**
From b9af210061c835499023d82e95cfaca98eff16d4 Mon Sep 17 00:00:00 2001
From: yhf
Date: Tue, 12 May 2026 17:31:49 +0800
Subject: [PATCH 64/65] Revise README with project updates and new features
Add a Swiftproxy proxy-service promotion section (feature checklist and referral link), followed by a horizontal rule, above the spiderFile module overview.
---
README.md | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/README.md b/README.md
index c54c2a1..ee46f57 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,19 @@
+# 🚀 重磅福利|Swiftproxy 全球代理 · 开发者必备
+
+
+**🌍 全球顶级住宅代理网络,一站式解决爬虫/采集/自动化难题 [🔗官方入口](https://www.swiftproxy.net/?ref=PythonCrawler)**
+- [x] **195+国家全覆盖**,**8000万+** 纯净住宅IP池
+- [x] **99.89%超高请求成功率**,稳定低延迟
+- [x] 流量永不过期,无强制月付,灵活随心用
+- [x] 原生支持 **HTTP(S)/SOCKS5** 双协议
+- [x] 智能IP轮换+精准地区定位,强效规避封锁
+- [x] 适配Python/Node/Go/PHP全语言,一键集成
+- [x] 免费试用开启,开箱即用零门槛
+
+---
# spiderFile模块简介
1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。**
From a4ebdb9d446ec0c82548ce7d06391bb8e385ccfd Mon Sep 17 00:00:00 2001
From: yhf
Date: Tue, 12 May 2026 18:56:08 +0800
Subject: [PATCH 65/65] Clean up README by removing unnecessary content
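Remove the Swiftproxy promotion section added in PATCH 64/65, keeping the horizontal rule above the spiderFile module overview. No ASCII art is changed.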
---
README.md | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/README.md b/README.md
index ee46f57..80111dd 100644
--- a/README.md
+++ b/README.md
@@ -30,18 +30,6 @@
-# 🚀 重磅福利|Swiftproxy 全球代理 · 开发者必备
-
-
-**🌍 全球顶级住宅代理网络,一站式解决爬虫/采集/自动化难题 [🔗官方入口](https://www.swiftproxy.net/?ref=PythonCrawler)**
-- [x] **195+国家全覆盖**,**8000万+** 纯净住宅IP池
-- [x] **99.89%超高请求成功率**,稳定低延迟
-- [x] 流量永不过期,无强制月付,灵活随心用
-- [x] 原生支持 **HTTP(S)/SOCKS5** 双协议
-- [x] 智能IP轮换+精准地区定位,强效规避封锁
-- [x] 适配Python/Node/Go/PHP全语言,一键集成
-- [x] 免费试用开启,开箱即用零门槛
-
---
# spiderFile模块简介
| | |