diff --git a/.github/workflows/docker-image-latest.yml b/.github/workflows/docker-image-latest.yml new file mode 100644 index 000000000..6c7e00ac6 --- /dev/null +++ b/.github/workflows/docker-image-latest.yml @@ -0,0 +1,35 @@ +name: Publish Docker image latest + +on: + push: + branches: + - 'master' + +jobs: + + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + + steps: + - name: Check out the repo + uses: actions/checkout@v2 + + - name: Log in to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v3 + with: + images: jhao104/proxy_pool + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . + push: true + tags: jhao104/proxy_pool:latest diff --git a/.github/workflows/docker-image-tags.yml b/.github/workflows/docker-image-tags.yml new file mode 100644 index 000000000..9a59645ad --- /dev/null +++ b/.github/workflows/docker-image-tags.yml @@ -0,0 +1,36 @@ +name: Publish Docker image tags + +on: + push: + tags: + - '*' + +jobs: + + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + + steps: + - name: Check out the repo + uses: actions/checkout@v2 + + - name: Log in to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v3 + with: + images: jhao104/proxy_pool + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..96369c998 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,22 @@ +name: Deploy Docs + +on: + push: + branches: + - master + +permissions: + contents: write + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - run: pip install mkdocs-material + - run: mkdocs gh-deploy --force \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..74d3a691d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,34 @@ +name: Tests + +on: + push: + branches: [master] + pull_request: + branches: [master, develop] + +jobs: + test: + name: Python ${{ matrix.python-version }} + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-test.txt + + - name: Run tests + run: pytest --cov=. 
--cov-report=term-missing \ No newline at end of file diff --git a/.gitignore b/.gitignore index f09264408..e823c7548 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,12 @@ .idea/ +site/ *.pyc +*.pyc.* +__pycache__/ *.log +.tox +.claude/ +docs/ideas/ +.coverage +.pytest_cache/ +htmlcov/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index a775459a0..000000000 --- a/.travis.yml +++ /dev/null @@ -1,10 +0,0 @@ -language: python -python: - - 2.7 - # - nightly -os: - - linux -install: - - pip install -r requirements.txt - -script: python test.py \ No newline at end of file diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py deleted file mode 100644 index 45db4843a..000000000 --- a/Api/ProxyApi.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: ProxyApi.py - Description : - Author : JHao - date: 2016/12/4 -------------------------------------------------- - Change Activity: - 2016/12/4: -------------------------------------------------- -""" -__author__ = 'JHao' - -import sys - -sys.path.append('../') - -from flask import Flask, jsonify, request -from Util.GetConfig import GetConfig - -from Manager.ProxyManager import ProxyManager - -app = Flask(__name__) - -api_list = { - 'get': u'get an usable proxy', - # 'refresh': u'refresh proxy pool', - 'get_all': u'get all proxy from proxy pool', - 'delete?proxy=127.0.0.1:8080': u'delete an unable proxy', - 'get_status': u'proxy statistics' -} - - -@app.route('/') -def index(): - return jsonify(api_list) - - -@app.route('/get/') -def get(): - proxy = ProxyManager().get() - return proxy if proxy else 'no proxy!' 
- - -@app.route('/refresh/') -def refresh(): - # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 - # ProxyManager().refresh() - pass - return 'success' - - -@app.route('/get_all/') -def getAll(): - proxies = ProxyManager().getAll() - return jsonify(proxies) - - -@app.route('/delete/', methods=['GET']) -def delete(): - proxy = request.args.get('proxy') - ProxyManager().delete(proxy) - return 'success' - - -@app.route('/get_status/') -def getStatus(): - status = ProxyManager().getNumber() - return jsonify(status) - - -def run(): - config = GetConfig() - app.run(host=config.host_ip, port=config.host_port) - - -if __name__ == '__main__': - run() diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..25f087abe --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,111 @@ +# CLAUDE.md + +本文件为 Claude Code (claude.ai/code) 在本仓库中工作时提供指导。 + +## 技术栈 +Python (3.8–3.11)、Flask (API)、Redis/SSDB (存储)、APScheduler (调度)。依赖版本固定记录在 `requirements.txt` 中。 + +## 常用命令 +- 安装依赖:`pip install -r requirements.txt` +- 运行代理爬取/验证调度器:`python proxyPool.py schedule` +- 运行 API 服务器:`python proxyPool.py server` +- 运行单元测试:`pytest tests/unit/` +- 运行 API 测试:`pytest tests/api/` +- 运行集成测试(需真实 Redis):`pytest tests/integration/ -m integration` +- 运行全部测试:`pytest` +- 查看覆盖率:`pytest --cov=. 
--cov-report=term-missing` + +## 测试 + +### 目录结构 +``` +tests/ +├── conftest.py # 共享 fixtures(app、client、fake_redis、proxy_obj、reset_singleton) +├── unit/ # 纯逻辑,零外部依赖 +│ ├── test_proxy.py # Proxy 类:构造、序列化、setter、add_source +│ ├── test_db_client.py # DbClient.parseDbConn URI 解析 +│ ├── test_config.py # ConfigHandler 环境变量覆盖 +│ └── test_validator.py # formatValidator 正则匹配 +├── api/ # Flask 测试客户端,mock ProxyHandler +│ └── test_proxy_api.py # /get /pop /all /count /delete 全路由 +└── integration/ # 需要真实 Redis,标记 @pytest.mark.integration + ├── test_redis_client.py # RedisClient 完整 CRUD + └── test_ssdb_client.py # SsdbClient 完整 CRUD +``` + +### 测试分层 +- **unit/**:不依赖外部服务,用 `unittest.mock` 或 `fakeredis` 模拟,CI 必跑 +- **api/**:使用 Flask `app.test_client()`,mock 掉 `ProxyHandler`,不依赖数据库 +- **integration/**:需要真实 Redis,通过 `@pytest.mark.integration` 标记,按需执行 + +### 测试依赖 +`pytest`、`pytest-cov`、`fakeredis`(纯 Python Redis 模拟,无需真实服务) + +### 关键约定 +- 测试函数命名:`test_` 前缀 + 下划线命名(`test_get_with_https`) +- 每个测试前自动重置 `Singleton._inst`,避免单例泄漏 +- 集成测试与单元测试共存:单元测试用 fakeredis 跑,集成测试标记后按需执行 + +## 高层架构 +免费代理池项目,爬取公开代理源、验证代理可用性、持久化存储到 Redis/SSDB,并通过 Flask RESTful API 提供代理服务。 + +### 核心组件 +- **爬取器** (`fetcher/proxyFetcher.py`):`ProxyFetcher` 类,每个代理源对应一个静态方法,yield 出 `host:port` 字符串。通过 `setting.py` 中的 `PROXY_FETCHER` 列表启用对应爬取器。 +- **数据库层** (`db/`):抽象 `dbClient` 接口,包含 Redis (`redisClient.py`) 和 SSDB (`ssdbClient.py`) 两种实现。通过 `setting.py` 中的 `DB_CONN` 配置连接(格式:`redis://:pwd@ip:port/db` 或 `ssdb://:pwd@ip:port`)。 +- **调度器** (`helper/scheduler.py`):基于 APScheduler 的定时任务,驱动爬取器运行并触发验证。时区通过 `setting.py` 中的 `TIMEZONE` 配置。 +- **验证器** (`helper/validator.py`):使用 `HTTP_URL` (http://httpbin.org) 和 `HTTPS_URL` (https://www.qq.com) 测试代理,超时时间由 `VERIFY_TIMEOUT` 指定(默认 10 秒)。超过 `MAX_FAIL_COUNT` 的代理会被移除。当代理池数量低于 `POOL_SIZE_MIN`(默认 20)时触发重新爬取。 +- **API** (`api/proxyApi.py`):Flask 接口,包含以下端点: + - `/get`:随机获取一个代理(`?type=https` 可筛选 HTTPS 代理) + - `/pop`:获取并删除一个代理 + - `/all`:列出所有代理 + - `/count`:代理数量统计 + - `/delete`:通过 `?proxy=host:port` 删除指定代理 
+ - 服务运行在 `HOST:PORT`(默认 `0.0.0.0:5010`),配置来自 `setting.py`。 +- **命令行入口** (`proxyPool.py`):基于 click 的命令行工具,包含 `schedule` 和 `server` 两个子命令。 + +### 扩展代理源 +1. 在 `fetcher/proxyFetcher.py` 的 `ProxyFetcher` 类中新增一个静态方法,yield 出 `host:port` 字符串。 +2. 将该方法名添加到 `setting.py` 的 `PROXY_FETCHER` 列表中。调度器会自动识别并启用新的代理源。 + +## 关键配置 +所有运行时配置均在 `setting.py` 中: +- `HOST`/`PORT`:API 绑定的地址和端口 +- `DB_CONN`:数据库连接字符串 +- `PROXY_FETCHER`:已启用的爬取器方法名列表 +- `HTTP_URL`/`HTTPS_URL`:验证目标 URL +- `VERIFY_TIMEOUT`:验证超时时间(默认 10 秒) +- `MAX_FAIL_COUNT`:代理被移除前允许的最大失败次数 +- `POOL_SIZE_MIN`:触发重新爬取的最小代理池数量阈值 +- `PROXY_REGION`:是否启用代理地区属性(默认 `True`) +- `TIMEZONE`:调度器时区(默认 `Asia/Shanghai`) + +## 代码风格与命名规范 +- **文件头**:每个 `.py` 文件必须包含以下标准头部: + ```python + # -*- coding: utf-8 -*- + """ + ------------------------------------------------- + File Name: fileName.py + Description : 文件功能描述 + Author : JHao + date: yyyy/mm/dd + ------------------------------------------------- + Change Activity: + yyyy/mm/dd: 修改内容简述 (修改时添加此行) + ------------------------------------------------- + """ + __author__ = 'JHao' + ``` +- **缩进**:4 个空格(Python 标准) +- **文件命名**:驼峰命名,如 `proxyFetcher.py`、`dbClient.py`、`redisClient.py`、`webRequest.py` +- **类命名**:帕斯卡命名,如 `ProxyFetcher`、`RedisClient`、`SsdbClient`、`ProxyValidator` +- **方法命名**:混合风格——数据库/爬取器方法使用驼峰命名(`getAll`、`getCount`、`changeTable`、`freeProxy01`),属性和辅助方法使用下划线命名(`user_agent`、`fail_count`、`check_count`) +- **爬取器方法**:命名为 `freeProxy` + 两位数字(如 `freeProxy01`、`freeProxy02`)。新增爬取器必须遵循此模式 +- **常量**(在 `setting.py` 中):大写下划线命名(`DB_CONN`、`PROXY_FETCHER`、`HTTP_URL`、`MAX_FAIL_COUNT`) +- **变量**:下划线命名(`proxy_obj`、`proxy_str`、`https`) +- **注释/文档字符串**:源文件头部和行内注释通常使用中文(普通话) +- **单例模式**:使用自定义 `Singleton` 元类(`util/singleton.py`)结合 `six.withMetaclass` 实现 + +## 注意事项 +- 运行测试前需先安装测试依赖:`pip install pytest pytest-cov fakeredis` +- 单元测试和 API 测试不依赖外部服务,可直接运行;集成测试需启动 Redis diff --git a/Config.ini b/Config.ini deleted file mode 100644 index 39eb6cdb6..000000000 --- a/Config.ini +++ /dev/null @@ -1,23 +0,0 @@ -[DB] -;Configure the
database information -;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB -type = SSDB -host = localhost -;port = 6379 -port = 8888 -name = proxy - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 1 -freeProxySecond = 1 -freeProxyThird = 1 -freeProxyFourth = 1 -freeProxyFifth = 1 -freeProxySixth = 1 -freeProxySeventh = 1 - -[HOST] -; API接口配置 http://127.0.0.1:5051 -ip = 0.0.0.0 -port = 5010 diff --git a/DB/DbClient.py b/DB/DbClient.py deleted file mode 100644 index 68c5db7a7..000000000 --- a/DB/DbClient.py +++ /dev/null @@ -1,113 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: DbClient.py - Description : DB工厂类 - Author : JHao - date: 2016/12/2 -------------------------------------------------- - Change Activity: - 2016/12/2: -------------------------------------------------- -""" -__author__ = 'JHao' - -import os -import sys - -from Util.GetConfig import GetConfig -from Util.utilClass import Singleton - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - - -class DbClient(object): - """ - DbClient DB工厂类 提供get/put/pop/delete/getAll/changeTable方法 - - 目前存放代理的table/collection/hash有两种: - raw_proxy: 存放原始的代理; - useful_proxy_queue: 存放检验后的代理; - - 抽象方法定义: - get(proxy): 返回proxy的信息; - put(proxy): 存入一个代理; - pop(): 弹出一个代理 - exists(proxy): 判断代理是否存在 - getNumber(raw_proxy): 返回代理总数(一个计数器); - update(proxy, num): 修改代理属性计数器的值; - delete(proxy): 删除指定代理; - getAll(): 返回所有代理; - changeTable(name): 切换 table or collection or hash; - - - 所有方法需要相应类去具体实现: - SSDB:SsdbClient.py - REDIS:RedisClient.py - - """ - - __metaclass__ = Singleton - - def __init__(self): - """ - init - :return: - """ - self.config = GetConfig() - self.__initDbClient() - - def __initDbClient(self): - """ - init DB Client - :return: - """ - __type = None - if "SSDB" == self.config.db_type: - __type = "SsdbClient" - elif "REDIS" == self.config.db_type: - __type = "RedisClient" - elif 
"MONGODB" == self.config.db_type: - __type = "MongodbClient" - else: - pass - assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type) - self.client = getattr(__import__(__type), __type)(name=self.config.db_name, - host=self.config.db_host, - port=self.config.db_port) - - def get(self, key, **kwargs): - return self.client.get(key, **kwargs) - - def put(self, key, **kwargs): - return self.client.put(key, **kwargs) - - def update(self, key, value, **kwargs): - return self.client.update(key, value, **kwargs) - - def delete(self, key, **kwargs): - return self.client.delete(key, **kwargs) - - def exists(self, key, **kwargs): - return self.client.exists(key, **kwargs) - - def pop(self, **kwargs): - return self.client.pop(**kwargs) - - def getAll(self): - return self.client.getAll() - - def changeTable(self, name): - self.client.changeTable(name) - - def getNumber(self): - return self.client.getNumber() - - -if __name__ == "__main__": - account = DbClient() - print(account.get()) - account.changeTable('use') - account.put('ac') - print(account.get()) diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py deleted file mode 100644 index bd0647f51..000000000 --- a/DB/MongodbClient.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 -""" -------------------------------------------------- - File Name: MongodbClient.py - Description : 封装mongodb操作 - Author : JHao netAir - date: 2017/3/3 -------------------------------------------------- - Change Activity: - 2017/3/3: - 2017/9/26:完成对mongodb的支持 -------------------------------------------------- -""" -__author__ = 'Maps netAir' - -from pymongo import MongoClient - - -class MongodbClient(object): - def __init__(self, name, host, port): - self.name = name - self.client = MongoClient(host, port) - self.db = self.client.proxy - - def changeTable(self, name): - self.name = name - - def get(self, proxy): - data = self.db[self.name].find_one({'proxy': proxy}) - return data['num'] if data != None else None - - def 
put(self, proxy, num=1): - if self.db[self.name].find_one({'proxy': proxy}): - return None - else: - self.db[self.name].insert({'proxy': proxy, 'num': num}) - - def pop(self): - data = list(self.db[self.name].aggregate([{'$sample': {'size': 1}}])) - if data: - data = data[0] - value = data['proxy'] - self.delete(value) - return {'proxy': value, 'value': data['num']} - return None - - def delete(self, value): - self.db[self.name].remove({'proxy': value}) - - def getAll(self): - return {p['proxy']: p['num'] for p in self.db[self.name].find()} - - def clean(self): - self.client.drop_database('proxy') - - def delete_all(self): - self.db[self.name].remove() - - def update(self, key, value): - self.db[self.name].update({'proxy': key}, {'$inc': {'num': value}}) - - def exists(self, key): - return True if self.db[self.name].find_one({'proxy': key}) != None else False - - def getNumber(self): - return self.db[self.name].count() - - -if __name__ == "__main__": - db = MongodbClient('first', 'localhost', 27017) - # db.put('127.0.0.1:1') - # db2 = MongodbClient('second', 'localhost', 27017) - # db2.put('127.0.0.1:2') - print(db.pop()) diff --git a/DB/RedisClient.py b/DB/RedisClient.py deleted file mode 100644 index 7d9af4386..000000000 --- a/DB/RedisClient.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python - -''' -self.name为Redis中的一个key -2017/4/17 修改pop -''' - -# ############################ -# 已弃用, -# SsdbClient.py 支持redis -############################## - -import json -import random -import redis -import sys - - -class RedisClient(object): - """ - Reids client - """ - - def __init__(self, name, host, port): - """ - init - :param name: - :param host: - :param port: - :return: - """ - self.name = name - self.__conn = redis.Redis(host=host, port=port, db=0) - - def get(self): - """ - get random result - :return: - """ - key = self.__conn.hgetall(name=self.name) - # return random.choice(key.keys()) if key else None - # 
key.keys()在python3中返回dict_keys,不支持index,不能直接使用random.choice - # 另:python3中,redis返回为bytes,需要解码 - rkey = random.choice(list(key.keys())) if key else None - if isinstance(rkey, bytes): - return rkey.decode('utf-8') - else: - return rkey - # return self.__conn.srandmember(name=self.name) - - def put(self, key): - """ - put an item - :param value: - :return: - """ - key = json.dumps(key) if isinstance(key, (dict, list)) else key - return self.__conn.hincrby(self.name, key, 1) - # return self.__conn.sadd(self.name, value) - - def getvalue(self, key): - value = self.__conn.hget(self.name, key) - return value if value else None - - def pop(self): - """ - pop an item - :return: - """ - key = self.get() - if key: - self.__conn.hdel(self.name, key) - return key - # return self.__conn.spop(self.name) - - def delete(self, key): - """ - delete an item - :param key: - :return: - """ - self.__conn.hdel(self.name, key) - # self.__conn.srem(self.name, value) - - def inckey(self, key, value): - self.__conn.hincrby(self.name, key, value) - - def getAll(self): - # return self.__conn.hgetall(self.name).keys() - # python3 redis返回bytes类型,需要解码 - if sys.version_info.major == 3: - return [key.decode('utf-8') for key in self.__conn.hgetall(self.name).keys()] - else: - return self.__conn.hgetall(self.name).keys() - # return self.__conn.smembers(self.name) - - def get_status(self): - return self.__conn.hlen(self.name) - # return self.__conn.scard(self.name) - - def changeTable(self, name): - self.name = name - - -if __name__ == '__main__': - redis_con = RedisClient('proxy', 'localhost', 6379) - # redis_con.put('abc') - # redis_con.put('123') - # redis_con.put('123.115.235.221:8800') - # redis_con.put(['123', '115', '235.221:8800']) - # print(redis_con.getAll()) - # redis_con.delete('abc') - # print(redis_con.getAll()) - - # print(redis_con.getAll()) - redis_con.changeTable('raw_proxy') - redis_con.pop() - - # redis_con.put('132.112.43.221:8888') - # redis_con.changeTable('proxy') - 
print(redis_con.get_status()) - print(redis_con.getAll()) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py deleted file mode 100644 index 08778fdb6..000000000 --- a/DB/SsdbClient.py +++ /dev/null @@ -1,112 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: SsdbClient.py - Description : 封装SSDB操作 - Author : JHao - date: 2016/12/2 -------------------------------------------------- - Change Activity: - 2016/12/2: - 2017/09/22: PY3中 redis-py返回的数据是bytes型 - 2017/09/27: 修改pop()方法 返回{proxy:value}字典 -------------------------------------------------- -""" -__author__ = 'JHao' - -from Util import EnvUtil - -from redis.connection import BlockingConnectionPool -from redis import Redis -import random - - -class SsdbClient(object): - """ - SSDB client - - SSDB中代理存放的容器为hash: - 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为为None,以后扩展可能会加入代理属性; - 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; - - """ - - def __init__(self, name, host, port): - """ - init - :param name: hash name - :param host: ssdb host - :param port: ssdb port - :return: - """ - self.name = name - self.__conn = Redis(connection_pool=BlockingConnectionPool(host=host, port=port)) - - def get(self, proxy): - """ - get an item - 从hash中获取对应的proxy, 使用前需要调用changeTable() - :param proxy: - :return: - """ - data = self.__conn.hget(name=self.name, key=proxy) - if data: - return data.decode('utf-8') if EnvUtil.PY3 else data - else: - return None - - def put(self, proxy, num=1): - """ - 将代理放入hash, 使用changeTable指定hash name - :param proxy: - :param num: - :return: - """ - data = self.__conn.hincrby(self.name, proxy, num) - return data - - def delete(self, key): - """ - Remove the ``key`` from hash ``name`` - :param key: - :return: - """ - self.__conn.hdel(self.name, key) - - def update(self, key, value): - self.__conn.hincrby(self.name, key, value) - - def pop(self): - """ - 弹出一个代理 - :return: dict {proxy: value} - """ - 
proxies = self.__conn.hkeys(self.name) - if proxies: - proxy = random.choice(proxies) - value = self.__conn.hget(self.name, proxy) - self.delete(proxy) - return {'proxy': proxy.decode('utf-8') if EnvUtil.PY3 else proxy, - 'value': value.decode('utf-8') if EnvUtil.PY3 and value else value} - return None - - def exists(self, key): - return self.__conn.hexists(self.name, key) - - def getAll(self): - item_dict = self.__conn.hgetall(self.name) - if EnvUtil.PY3: - return {key.decode('utf8'): value.decode('utf8') for key, value in item_dict.items()} - else: - return item_dict - - def getNumber(self): - """ - Return the number of elements in hash ``name`` - :return: - """ - return self.__conn.hlen(self.name) - - def changeTable(self, name): - self.name = name diff --git a/Dockerfile b/Dockerfile index 7c815a4e7..3e1ddade2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,30 +1,23 @@ -FROM python:3.6 -WORKDIR /usr/src/app +FROM python:3.10-alpine + +LABEL maintainer="jhao104 " + +WORKDIR /app + +COPY ./requirements.txt . + +# timezone and init process +RUN apk add -U tzdata tini && \ + cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ + apk del tzdata + +# runtime environment +RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \ + pip install --no-cache-dir -r requirements.txt && \ + apk del gcc musl-dev + COPY . . -ENV DEBIAN_FRONTEND noninteractive -ENV TZ Asia/Shanghai -RUN pip install --no-cache-dir -r requirements.txt && \ - apt-get update && \ - apt-get install -y --force-yes git make gcc g++ autoconf && apt-get clean && \ - git clone --depth 1 https://github.com/ideawu/ssdb.git ssdb && \ - cd ssdb && make && make install && cp ssdb-server /usr/bin && \ - apt-get remove -y --force-yes git make gcc g++ autoconf && \ - apt-get autoremove -y && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - cp ssdb.conf /etc && cd .. 
&& yes | rm -r ssdb && \ - mkdir -p /var/lib/ssdb && \ - sed \ - -e 's@home.*@home /var/lib@' \ - -e 's/loglevel.*/loglevel info/' \ - -e 's@work_dir = .*@work_dir = /var/lib/ssdb@' \ - -e 's@pidfile = .*@pidfile = /run/ssdb.pid@' \ - -e 's@level:.*@level: info@' \ - -e 's@ip:.*@ip: 0.0.0.0@' \ - -i /etc/ssdb.conf && \ - echo "# ! /bin/sh " > /usr/src/app/run.sh && \ - echo "cd Run" >> /usr/src/app/run.sh && \ - echo "/usr/bin/ssdb-server /etc/ssdb.conf &" >> /usr/src/app/run.sh && \ - echo "python main.py" >> /usr/src/app/run.sh && \ - chmod 777 run.sh + EXPOSE 5010 -CMD [ "sh", "run.sh" ] + +ENTRYPOINT ["tini", "--", "bash", "proxy_pool.sh", "start", "--fg"] diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py deleted file mode 100644 index 90ca120c3..000000000 --- a/Manager/ProxyManager.py +++ /dev/null @@ -1,102 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: ProxyManager.py - Description : - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: -------------------------------------------------- -""" -__author__ = 'JHao' - -import random - -from Util import EnvUtil -from DB.DbClient import DbClient -from Util.GetConfig import GetConfig -from Util.LogHandler import LogHandler -from ProxyGetter.getFreeProxy import GetFreeProxy - - -class ProxyManager(object): - """ - ProxyManager - """ - - def __init__(self): - self.db = DbClient() - self.config = GetConfig() - self.raw_proxy_queue = 'raw_proxy' - self.log = LogHandler('proxy_manager') - self.useful_proxy_queue = 'useful_proxy' - - def refresh(self): - """ - fetch proxy into Db by ProxyGetter - :return: - """ - for proxyGetter in self.config.proxy_getter_functions: - proxy_set = set() - # fetch raw proxy - for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): - if proxy: - self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) - 
proxy_set.add(proxy.strip()) - - # store raw proxy - for proxy in proxy_set: - self.db.changeTable(self.useful_proxy_queue) - if self.db.exists(proxy): - continue - self.db.changeTable(self.raw_proxy_queue) - self.db.put(proxy) - - def get(self): - """ - return a useful proxy - :return: - """ - self.db.changeTable(self.useful_proxy_queue) - item_dict = self.db.getAll() - if item_dict: - if EnvUtil.PY3: - return random.choice(list(item_dict.keys())) - else: - return random.choice(item_dict.keys()) - return None - # return self.db.pop() - - def delete(self, proxy): - """ - delete proxy from pool - :param proxy: - :return: - """ - self.db.changeTable(self.useful_proxy_queue) - self.db.delete(proxy) - - def getAll(self): - """ - get all proxy from pool as list - :return: - """ - self.db.changeTable(self.useful_proxy_queue) - item_dict = self.db.getAll() - if EnvUtil.PY3: - return list(item_dict.keys()) if item_dict else list() - return item_dict.keys() if item_dict else list() - - def getNumber(self): - self.db.changeTable(self.raw_proxy_queue) - total_raw_proxy = self.db.getNumber() - self.db.changeTable(self.useful_proxy_queue) - total_useful_queue = self.db.getNumber() - return {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue} - -if __name__ == '__main__': - pp = ProxyManager() - pp.refresh() diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py deleted file mode 100644 index d3d3af83f..000000000 --- a/ProxyGetter/getFreeProxy.py +++ /dev/null @@ -1,186 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: GetFreeProxy.py - Description : 抓取免费代理 - Author : JHao - date: 2016/11/25 -------------------------------------------------- - Change Activity: - 2016/11/25: -------------------------------------------------- -""" -import re -import requests - -try: - from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 -except: - import sys # py2 - - reload(sys) - 
sys.setdefaultencoding('utf-8') - -from Util.utilFunction import robustCrawl, getHtmlTree -from Util.WebRequest import WebRequest - -# for debug to disable insecureWarning -requests.packages.urllib3.disable_warnings() - - -class GetFreeProxy(object): - """ - proxy getter - """ - - def __init__(self): - pass - - @staticmethod - def freeProxyFirst(page=10): - """ - 抓取无忧代理 http://www.data5u.com/ - :param page: 页数 - :return: - """ - url_list = ['http://www.data5u.com/', - 'http://www.data5u.com/free/', - 'http://www.data5u.com/free/gngn/index.shtml', - 'http://www.data5u.com/free/gnpt/index.shtml'] - for url in url_list: - html_tree = getHtmlTree(url) - ul_list = html_tree.xpath('//ul[@class="l2"]') - for ul in ul_list: - try: - yield ':'.join(ul.xpath('.//li/text()')[0:2]) - except Exception as e: - pass - - @staticmethod - def freeProxySecond(proxy_number=100): - """ - 抓取代理66 http://www.66ip.cn/ - :param proxy_number: 代理数量 - :return: - """ - url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( - proxy_number) - request = WebRequest() - # html = request.get(url).content - # content为未解码,text为解码后的字符串 - html = request.get(url).text - for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): - yield proxy - - @staticmethod - def freeProxyThird(days=1): - """ - 抓取ip181 http://www.ip181.com/ - :param days: - :return: - """ - url = 'http://www.ip181.com/' - html_tree = getHtmlTree(url) - try: - tr_list = html_tree.xpath('//tr')[1:] - for tr in tr_list: - yield ':'.join(tr.xpath('./td/text()')[0:2]) - except Exception as e: - pass - - @staticmethod - def freeProxyFourth(): - """ - 抓取西刺代理 http://api.xicidaili.com/free2016.txt - :return: - """ - url_list = ['http://www.xicidaili.com/nn', # 高匿 - 'http://www.xicidaili.com/nt', # 透明 - ] - for each_url in url_list: - tree = getHtmlTree(each_url) - proxy_list = tree.xpath('.//table[@id="ip_list"]//tr') - for proxy in proxy_list: - try: - yield 
':'.join(proxy.xpath('./td/text()')[0:2]) - except Exception as e: - pass - - @staticmethod - def freeProxyFifth(): - """ - 抓取guobanjia http://www.goubanjia.com/free/gngn/index.shtml - :return: - """ - url = "http://www.goubanjia.com/free/gngn/index{page}.shtml" - for page in range(1, 10): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('//td[@class="ip"]') - # 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号 - # 需要过滤掉

的内容 - xpath_str = """.//*[not(contains(@style, 'display: none')) - and not(contains(@style, 'display:none')) - and not(contains(@class, 'port')) - ]/text() - """ - for each_proxy in proxy_list: - try: - # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port - ip_addr = ''.join(each_proxy.xpath(xpath_str)) - port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] - yield '{}:{}'.format(ip_addr, port) - except Exception as e: - pass - - @staticmethod - def freeProxySixth(): - """ - 抓取讯代理免费proxy http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10 - :return: - """ - url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' - request = WebRequest() - try: - res = request.get(url).json() - for row in res['RESULT']['rows']: - yield '{}:{}'.format(row['ip'], row['port']) - except Exception as e: - pass - - @staticmethod - def freeProxySeventh(): - """ - 快代理免费https://www.kuaidaili.com/free/inha/1/ - """ - url = 'https://www.kuaidaili.com/free/inha/{page}/' - for page in range(1, 10): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('.//table//tr') - for tr in proxy_list[1:]: - yield ':'.join(tr.xpath('./td/text()')[0:2]) - - -if __name__ == '__main__': - gg = GetFreeProxy() - # for e in gg.freeProxyFirst(): - # print(e) - # - # for e in gg.freeProxySecond(): - # print(e) - # - # for e in gg.freeProxyThird(): - # print(e) - - # for e in gg.freeProxyFourth(): - # print(e) - - # for e in gg.freeProxyFifth(): - # print(e) - - # for e in gg.freeProxySixth(): - # print(e) - for e in gg.freeProxySeventh(): - print(e) diff --git a/README.md b/README.md index b62d6ae5c..0d080bf26 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,8 @@ -爬虫IP代理池 +ProxyPool 爬虫代理IP池 ======= -[![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool) 
+[![Tests](https://github.com/jhao104/proxy_pool/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/jhao104/proxy_pool/actions/workflows/test.yml) [![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) -[![Requirements Status](https://requires.io/github/jhao104/proxy_pool/requirements.svg?branch=master)](https://requires.io/github/jhao104/proxy_pool/requirements/?branch=master) [![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE) [![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) @@ -17,78 +16,119 @@ __ / / /___ / -##### [介绍文档](https://github.com/jhao104/proxy_pool/blob/master/doc/introduce.md) +### ProxyPool -* 支持版本: ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) +爬虫代理IP池项目,主要功能为定时采集网上发布的免费代理验证入库,定时验证入库的代理保证代理的可用性,提供API和CLI两种使用方式。同时你也可以扩展代理源以增加代理池IP的质量和数量。 -* 测试地址: http://123.207.35.36:5010 (单机勿压。感谢) +* 文档: [document](https://jhao104.github.io/proxy_pool/) -### 下载安装 +* 支持版本: +[![](https://img.shields.io/badge/Python-3.8-blue.svg)](https://docs.python.org/3.8/) +[![](https://img.shields.io/badge/Python-3.9-blue.svg)](https://docs.python.org/3.9/) +[![](https://img.shields.io/badge/Python-3.10-blue.svg)](https://docs.python.org/3.10/) +[![](https://img.shields.io/badge/Python-3.11-blue.svg)](https://docs.python.org/3.11/) -* 下载源码: +* 测试地址: http://demo.spiderpy.cn (勿压谢谢) -```shell -git clone git@github.com:jhao104/proxy_pool.git +* 付费代理推荐: [亮数据 Bright Data](https://get.brightdata.com/github_jh)(前身 Luminati).全球代理与网络抓取行业头部领导者。覆盖 195+ 国家的 1.5亿+ 真人住宅IP,亲测成功率极高,轻松突破反爬封锁。需要高质量代理IP的可以注册后联系中文客服。[申请免费试用](https://get.brightdata.com/github_jh) 
(PS:用不明白的同学可以参考这个[使用教程](https://www.cnblogs.com/jhao/p/15611785.html))。 -或者直接到https://github.com/jhao104/proxy_pool 下载zip文件 +   想自建爬虫?接入 [Bright Data MCP Server](https://get.brightdata.com/cd3yy5),让 Claude、Cursor、Windsurf 等 AI 助手直接实时抓取网页——自动破解验证码、绕过地区限制。[Scraper Studio](https://get.brightdata.com/cd3yy5) 支持 AI 一键生成或 JS 代码定制,全托管基础设施运行,无需自购代理、无需搭服务器,分钟级上线。所有产品底层均由同一套顶级代理网络驱动。 + +   API 产品现享7折 + 免费试用额度,注册后可联系中文客服快速上手。(用不明白的同学可参考使用教程,或注册后直接使用互动 AI 智能助手) +👉 [https://get.brightdata.com/cd3yy5](https://get.brightdata.com/cd3yy5) + + +### 运行项目 + +##### 下载代码: + +* git clone + +```bash +git clone https://github.com/jhao104/proxy_pool.git +``` + +* releases + +```bash +https://github.com/jhao104/proxy_pool/releases 下载对应zip文件 ``` -* 安装依赖: +##### 安装依赖: -```shell +```bash pip install -r requirements.txt ``` -* 配置Config.ini: +##### 更新配置: + + +```python +# setting.py 为项目配置文件 + +# 配置API服务 + +HOST = "0.0.0.0" # IP +PORT = 5000 # 监听端口 + + +# 配置数据库 -```shell -# Config.ini 为项目配置文件 -# 配置DB -type = SSDB # 如果使用SSDB或redis数据库,均配置为SSDB -host = localhost # db host -port = 8888 # db port -name = proxy # 默认配置 +DB_CONN = 'redis://:pwd@127.0.0.1:8888/0' -# 配置 ProxyGetter -freeProxyFirst = 1 # 这里是启动的抓取函数,可在ProxyGetter/getFreeProxy.py 扩展 -freeProxySecond = 1 -.... -# 配置 HOST (api服务) -ip = 127.0.0.1 # 监听ip,0.0.0.0开启外网访问 -port = 5010 # 监听端口 -# 上面配置启动后,代理api地址为 http://127.0.0.1:5010 +# 配置 ProxyFetcher +PROXY_FETCHER = [ + "freeProxy01", # 这里是启用的代理抓取方法名,所有fetch方法位于fetcher/proxyFetcher.py + "freeProxy02", + # .... +] ``` -* 启动: +#### 启动项目: + +```bash +# 如果已经具备运行条件, 可用通过proxyPool.py启动。 +# 程序分为: schedule 调度程序 和 server Api服务 -```shell -# 如果你的依赖已经安全完成并且具备运行条件,可以直接在Run下运行main.py -# 到Run目录下: ->>>python main.py +# 启动调度程序 +python proxyPool.py schedule -# 如果运行成功你应该看到有4个main.py进程 +# 启动webApi服务 +python proxyPool.py server -# 你也可以分别运行他们, -# 依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可. 
``` -### 使用 +### Docker Image + +```bash +docker pull jhao104/proxy_pool -  启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin)。 +docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 jhao104/proxy_pool:latest +``` +### docker-compose + +项目目录下运行: +``` bash +docker-compose up -d +``` -  也可以通过api访问http://127.0.0.1:5010 查看。 +### 使用 * Api -| api | method | Description | arg| +启动web服务后, 默认配置下会开启 http://127.0.0.1:5010 的api接口服务: + +| api | method | Description | params| | ----| ---- | ---- | ----| | / | GET | api介绍 | None | -| /get | GET | 随机获取一个代理 | None| -| /get_all | GET | 获取所有代理 |None| -| /get_status | GET | 查看代理数量 |None| -| /delete | GET | 删除代理 |proxy=host:ip| +| /get | GET | 随机获取一个代理| 可选参数: `?type=https` 过滤支持https的代理| +| /pop | GET | 获取并删除一个代理| 可选参数: `?type=https` 过滤支持https的代理| +| /all | GET | 获取所有代理 |可选参数: `?type=https` 过滤支持https的代理| +| /count | GET | 查看代理数量 |None| +| /delete | GET | 删除代理 |`?proxy=host:ip`| + * 爬虫使用 @@ -98,7 +138,7 @@ port = 5010 # 监听端口 import requests def get_proxy(): - return requests.get("http://127.0.0.1:5010/get/").content + return requests.get("http://127.0.0.1:5010/get/").json() def delete_proxy(proxy): requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) @@ -108,65 +148,85 @@ def delete_proxy(proxy): def getHtml(): # .... 
retry_count = 5 - proxy = get_proxy() + proxy = get_proxy().get("proxy") while retry_count > 0: try: - html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy)}) + html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) # 使用代理访问 return html except Exception: retry_count -= 1 - # 出错5次, 删除代理池中代理 + # 删除代理池中代理 delete_proxy(proxy) return None ``` ### 扩展代理 -  项目默认包含几个免费的代理获取方法,但是免费的毕竟质量不好,所以如果直接运行可能拿到的代理质量不理想。所以,提供了代理获取的扩展方法。 +  项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,所以如果直接运行可能拿到的代理质量不理想。所以,提供了代理获取的扩展方法。 -  添加一个新的代理获取方法如下: +  添加一个新的代理源方法如下: -* 1、首先在[GetFreeProxy](https://github.com/jhao104/proxy_pool/blob/b9ccdfaada51b57cfb1bbd0c01d4258971bc8352/ProxyGetter/getFreeProxy.py#L32)类中添加你的获取代理的静态方法, +* 1、首先在[ProxyFetcher](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L21)类中添加自定义的获取代理的静态方法, 该方法需要以生成器(yield)形式返回`host:ip`格式的代理,例如: ```python -class GetFreeProxy(object): +class ProxyFetcher(object): # .... - # 你自己的方法 + # 自定义代理源获取方法 @staticmethod - def freeProxyCustom(): # 命名不和已有重复即可 + def freeProxyCustom1(): # 命名不和已有重复即可 - # 通过某网站或者某接口或某数据库获取代理 任意你喜欢的姿势都行 - # 假设你拿到了一个代理列表 - proxies = ["139.129.166.68:3128", "139.129.166.61:3128", ...] + # 通过某网站或者某接口或某数据库获取代理 + # 假设你已经拿到了一个代理列表 + proxies = ["x.x.x.x:3128", "x.x.x.x:80"] for proxy in proxies: yield proxy - # 确保每个proxy都是 host:ip正确的格式就行 + # 确保每个proxy都是 host:ip正确的格式返回 ``` -* 2、添加好方法后,修改Config.ini文件中的`[ProxyGetter]`项: +* 2、添加好方法后,修改[setting.py](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47)文件中的`PROXY_FETCHER`项: -  在`Config.ini`的`[ProxyGetter]`下添加自定义的方法的名字: +  在`PROXY_FETCHER`下添加自定义方法的名字: -```shell +```python +PROXY_FETCHER = [ + "freeProxy01", + "freeProxy02", + # .... + "freeProxyCustom1" # # 确保名字和你添加方法名字一致 +] +``` -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 0 # 如果要取消某个方法,将其删除或赋为0即可 -.... 
-freeProxyCustom = 1 # 确保名字和你添加方法名字一致 -``` +  `schedule` 进程会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 + +### 免费代理源 + 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): + + | 代理名称 | 状态 | 更新速度 | 可用率 | 地址 | 代码 | + |---------------| ---- | -------- | ------ | ----- |------------------------------------------------| + | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) | + | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) | + | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) | + | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) | + | 冰凌代理 | ✔ | ★★★ | * | [地址](https://www.binglx.cn/) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) | + | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L123) | + | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L133) | + | 免费代理库 | ✔ | ☆ | * | [地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L143) | + | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L154) | + | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L164) | + | 谷德代理 | ✔ | ★★ | *** | [地址](https://www.goodips.com) | [`freeProxy12`](/fetcher/proxyFetcher.py#L174) | -  `ProxyRefreshSchedule`会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 + + 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 ### 问题反馈 -  任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。 +  任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,同时也可以到我的[博客](http://www.spiderpy.cn/blog/message)中留言。   你的反馈会让此项目变得更加完美。 @@ -174,12 +234,15 @@ freeProxyCustom = 1 # 
确保名字和你添加方法名字一致   本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。 -  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,在确认后提交你的代码。 +  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@gladmo](https://github.com/gladmo)| [@scil](https://github.com/scil) +  [@kangnwh](https://github.com/kangnwh) | [@bobobo80](https://github.com/bobobo80) | [@halleywj](https://github.com/halleywj) | [@newlyedward](https://github.com/newlyedward) | [@wang-ye](https://github.com/wang-ye) | [@gladmo](https://github.com/gladmo) | [@bernieyangmh](https://github.com/bernieyangmh) | [@PythonYXY](https://github.com/PythonYXY) | [@zuijiawoniu](https://github.com/zuijiawoniu) | [@netAir](https://github.com/netAir) | [@scil](https://github.com/scil) | [@tangrela](https://github.com/tangrela) | [@highroom](https://github.com/highroom) | [@luocaodan](https://github.com/luocaodan) | [@vc5](https://github.com/vc5) | [@1again](https://github.com/1again) | [@obaiyan](https://github.com/obaiyan) | [@zsbh](https://github.com/zsbh) | [@jiannanya](https://github.com/jiannanya) | [@Jerry12228](https://github.com/Jerry12228) | [@zeyudada](https://github.com/zeyudada) +### Release Notes + [changelog](https://jhao104.github.io/proxy_pool/changelog/) +Featured|HelloGitHub diff --git a/Run/main.py b/Run/main.py deleted file mode 100644 index 6b07654ee..000000000 --- a/Run/main.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -""" 
-------------------------------------------------- - File Name: main.py - Description : 运行主函数 - Author : JHao - date: 2017/4/1 -------------------------------------------------- - Change Activity: - 2017/4/1: -------------------------------------------------- -""" -__author__ = 'JHao' - -import sys -from multiprocessing import Process - -sys.path.append('../') - -from Api.ProxyApi import run as ProxyApiRun -from Schedule.ProxyValidSchedule import run as ValidRun -from Schedule.ProxyRefreshSchedule import run as RefreshRun - - -def run(): - p_list = list() - p1 = Process(target=ProxyApiRun, name='ProxyApiRun') - p_list.append(p1) - p2 = Process(target=ValidRun, name='ValidRun') - p_list.append(p2) - p3 = Process(target=RefreshRun, name='RefreshRun') - p_list.append(p3) - - for p in p_list: - p.daemon = True - p.start() - for p in p_list: - p.join() - - -if __name__ == '__main__': - run() diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py deleted file mode 100644 index 91db84142..000000000 --- a/Schedule/ProxyCheck.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: ProxyCheck - Description : 多线程验证useful_proxy - Author : J_hao - date: 2017/9/26 -------------------------------------------------- - Change Activity: - 2017/9/26: 多线程验证useful_proxy -------------------------------------------------- -""" -__author__ = 'J_hao' - -import sys -from time import sleep -from threading import Thread - -sys.path.append('../') - -from Util.utilFunction import validUsefulProxy -from Manager.ProxyManager import ProxyManager -from Util.LogHandler import LogHandler - -FAIL_COUNT = 1 # 校验失败次数, 超过次数删除代理 - - -class ProxyCheck(ProxyManager, Thread): - def __init__(self): - ProxyManager.__init__(self) - Thread.__init__(self) - self.log = LogHandler('proxy_check') - - def run(self): - self.db.changeTable(self.useful_proxy_queue) - while True: - proxy_item = self.db.pop() - while proxy_item: - proxy = 
proxy_item.get('proxy') - counter = proxy_item.get('value', 1) - if validUsefulProxy(proxy): - # 验证通过计数器加1 - if counter and int(counter) < 1: - self.db.put(proxy, num=int(counter) + 1) - else: - self.db.put(proxy) - self.log.info('ProxyCheck: {} validation pass'.format(proxy)) - else: - self.log.info('ProxyCheck: {} validation fail'.format(proxy)) - # 验证失败,计数器减1 - if counter and int(counter) <= FAIL_COUNT: - self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) - self.db.delete(proxy) - else: - self.db.put(proxy, num=int(counter) - 1) - - proxy_item = self.db.pop() - sleep(60 * 5) - - -if __name__ == '__main__': - p = ProxyCheck() - p.run() diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py deleted file mode 100644 index c358acd8a..000000000 --- a/Schedule/ProxyRefreshSchedule.py +++ /dev/null @@ -1,104 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: ProxyRefreshSchedule.py - Description : 代理定时刷新 - Author : JHao - date: 2016/12/4 -------------------------------------------------- - Change Activity: - 2016/12/4: 代理定时刷新 - 2017/03/06: 使用LogHandler添加日志 - 2017/04/26: raw_proxy_queue验证通过但useful_proxy_queue中已经存在的代理不在放入 -------------------------------------------------- -""" - -import sys -import time -import logging -from threading import Thread -from apscheduler.schedulers.blocking import BlockingScheduler - -sys.path.append('../') - -from Util.utilFunction import validUsefulProxy -from Manager.ProxyManager import ProxyManager -from Util.LogHandler import LogHandler - -__author__ = 'JHao' - -logging.basicConfig() - - -class ProxyRefreshSchedule(ProxyManager): - """ - 代理定时刷新 - """ - - def __init__(self): - ProxyManager.__init__(self) - self.log = LogHandler('refresh_schedule') - - def validProxy(self): - """ - 验证raw_proxy_queue中的代理, 将可用的代理放入useful_proxy_queue - :return: - """ - self.db.changeTable(self.raw_proxy_queue) - raw_proxy_item = self.db.pop() 
- self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime()) - # 计算剩余代理,用来减少重复计算 - remaining_proxies = self.getAll() - while raw_proxy_item: - raw_proxy = raw_proxy_item.get('proxy') - if isinstance(raw_proxy, bytes): - # 兼容Py3 - raw_proxy = raw_proxy.decode('utf8') - - if (raw_proxy not in remaining_proxies) and validUsefulProxy(raw_proxy): - self.db.changeTable(self.useful_proxy_queue) - self.db.put(raw_proxy) - self.log.info('ProxyRefreshSchedule: %s validation pass' % raw_proxy) - else: - self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy) - self.db.changeTable(self.raw_proxy_queue) - raw_proxy_item = self.db.pop() - remaining_proxies = self.getAll() - self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime()) - - -def refreshPool(): - pp = ProxyRefreshSchedule() - pp.validProxy() - - -def main(process_num=30): - p = ProxyRefreshSchedule() - - # 获取新代理 - p.refresh() - - # 检验新代理 - pl = [] - for num in range(process_num): - proc = Thread(target=refreshPool, args=()) - pl.append(proc) - - for num in range(process_num): - pl[num].daemon = True - pl[num].start() - - for num in range(process_num): - pl[num].join() - - -def run(): - main() - sched = BlockingScheduler() - sched.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次 - sched.start() - - -if __name__ == '__main__': - run() diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py deleted file mode 100644 index 8345a83a4..000000000 --- a/Schedule/ProxyValidSchedule.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: ProxyValidSchedule.py - Description : 验证useful_proxy_queue中的代理,将不可用的移出 - Author : JHao - date: 2017/3/31 -------------------------------------------------- - Change Activity: - 2017/3/31: 验证useful_proxy_queue中的代理 -------------------------------------------------- -""" -__author__ = 'JHao' - -import sys - -sys.path.append('../') - -from Schedule.ProxyCheck 
import ProxyCheck - - -class ProxyValidSchedule(object): - def __init__(self): - pass - - def __validProxy(self, threads=5): - """ - 验证useful_proxy代理 - :param threads: 线程数 - :return: - """ - thread_list = list() - for index in range(threads): - thread_list.append(ProxyCheck()) - - for thread in thread_list: - thread.daemon = True - thread.start() - - for thread in thread_list: - thread.join() - - def main(self): - self.__validProxy() - - -def run(): - p = ProxyValidSchedule() - p.main() - - -if __name__ == '__main__': - p = ProxyValidSchedule() - p.main() diff --git a/Schedule/__init__.py b/Schedule/__init__.py deleted file mode 100644 index e94e59d11..000000000 --- a/Schedule/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: __init__.py.py - Description : - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: -------------------------------------------------- -""" -__author__ = 'JHao' \ No newline at end of file diff --git a/Test/__init__.py b/Test/__init__.py deleted file mode 100644 index 898942953..000000000 --- a/Test/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: __init__.py - Description : - Author : J_hao - date: 2017/7/31 -------------------------------------------------- - Change Activity: - 2017/7/31: -------------------------------------------------- -""" -__author__ = 'J_hao' diff --git a/Test/testGetConfig.py b/Test/testGetConfig.py deleted file mode 100644 index 7f44fa6b4..000000000 --- a/Test/testGetConfig.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: testGetConfig - Description : test all function in GetConfig.py - Author : J_hao - date: 2017/7/31 -------------------------------------------------- - Change Activity: - 2017/7/31: 
-------------------------------------------------- -""" -__author__ = 'J_hao' - -from Util.GetConfig import GetConfig - - -# noinspection PyPep8Naming -def testGetConfig(): - """ - test class GetConfig in Util/GetConfig - :return: - """ - gg = GetConfig() - print(gg.db_type) - print(gg.db_name) - print(gg.db_host) - print(gg.db_port) - assert isinstance(gg.proxy_getter_functions, list) - print(gg.proxy_getter_functions) - -if __name__ == '__main__': - testGetConfig() diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py deleted file mode 100644 index df99c79f3..000000000 --- a/Test/testGetFreeProxy.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: testGetFreeProxy - Description : test model ProxyGetter/getFreeProxy - Author : J_hao - date: 2017/7/31 -------------------------------------------------- - Change Activity: - 2017/7/31:function testGetFreeProxy -------------------------------------------------- -""" -__author__ = 'J_hao' - -from ProxyGetter.getFreeProxy import GetFreeProxy -from Util.GetConfig import GetConfig - - -# noinspection PyPep8Naming -def testGetFreeProxy(): - """ - test class GetFreeProxy in ProxyGetter/GetFreeProxy - :return: - """ - gc = GetConfig() - proxy_getter_functions = gc.proxy_getter_functions - for proxyGetter in proxy_getter_functions: - proxy_count = 0 - for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): - if proxy: - print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) - proxy_count += 1 - assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) - - -if __name__ == '__main__': - testGetFreeProxy() diff --git a/Test/testLogHandler.py b/Test/testLogHandler.py deleted file mode 100644 index da309b707..000000000 --- a/Test/testLogHandler.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: testLogHandler - Description : - Author : 
J_hao - date: 2017/8/2 -------------------------------------------------- - Change Activity: - 2017/8/2: -------------------------------------------------- -""" -__author__ = 'J_hao' - -from Util.LogHandler import LogHandler - - -# noinspection PyPep8Naming -def testLogHandler(): - """ - test function LogHandler in Util/LogHandler - :return: - """ - log = LogHandler('test') - log.info('this is a log from test') - - log.resetName(name='test1') - log.info('this is a log from test1') - - log.resetName(name='test2') - log.info('this is a log from test2') - - -if __name__ == '__main__': - testLogHandler() diff --git a/Test/testWebRequest.py b/Test/testWebRequest.py deleted file mode 100644 index 07dd54762..000000000 --- a/Test/testWebRequest.py +++ /dev/null @@ -1,30 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: testWebRequest - Description : test class WebRequest - Author : J_hao - date: 2017/7/31 -------------------------------------------------- - Change Activity: - 2017/7/31: function testWebRequest -------------------------------------------------- -""" -__author__ = 'J_hao' - -from Util.WebRequest import WebRequest - - -# noinspection PyPep8Naming -def testWebRequest(): - """ - test class WebRequest in Util/WebRequest.py - :return: - """ - wr = WebRequest() - request_object = wr.get('https://www.baidu.com/') - assert request_object.status_code == 200 - - -if __name__ == '__main__': - testWebRequest() diff --git a/Util/EnvUtil.py b/Util/EnvUtil.py deleted file mode 100644 index b9df83c55..000000000 --- a/Util/EnvUtil.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: EnvUtil - Description : 环境相关 - Author : J_hao - date: 2017/9/18 -------------------------------------------------- - Change Activity: - 2017/9/18: 区分Python版本 -------------------------------------------------- -""" -__author__ = 'J_hao' - -import sys - -PY3 = 
sys.version_info >= (3,) \ No newline at end of file diff --git a/Util/GetConfig.py b/Util/GetConfig.py deleted file mode 100644 index 24b003f28..000000000 --- a/Util/GetConfig.py +++ /dev/null @@ -1,68 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: GetConfig.py - Description : fetch config from config.ini - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: get db property func -------------------------------------------------- -""" -__author__ = 'JHao' - -import os -from Util.utilClass import ConfigParse -from Util.utilClass import LazyProperty - - -class GetConfig(object): - """ - to get config from config.ini - """ - - def __init__(self): - self.pwd = os.path.split(os.path.realpath(__file__))[0] - self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini') - self.config_file = ConfigParse() - self.config_file.read(self.config_path) - - @LazyProperty - def db_type(self): - return self.config_file.get('DB', 'type') - - @LazyProperty - def db_name(self): - return self.config_file.get('DB', 'name') - - @LazyProperty - def db_host(self): - return self.config_file.get('DB', 'host') - - @LazyProperty - def db_port(self): - return int(self.config_file.get('DB', 'port')) - - @LazyProperty - def proxy_getter_functions(self): - return self.config_file.options('ProxyGetter') - - @LazyProperty - def host_ip(self): - return self.config_file.get('HOST','ip') - - @LazyProperty - def host_port(self): - return int(self.config_file.get('HOST', 'port')) - -if __name__ == '__main__': - gg = GetConfig() - print(gg.db_type) - print(gg.db_name) - print(gg.db_host) - print(gg.db_port) - print(gg.proxy_getter_functions) - print(gg.host_ip) - print(gg.host_port) diff --git a/Util/WebRequest.py b/Util/WebRequest.py deleted file mode 100644 index abbdb17be..000000000 --- a/Util/WebRequest.py +++ /dev/null @@ -1,82 +0,0 @@ -# -*- 
coding: utf-8 -*- -""" -------------------------------------------------- - File Name: WebRequest - Description : Network Requests Class - Author : J_hao - date: 2017/7/31 -------------------------------------------------- - Change Activity: - 2017/7/31: -------------------------------------------------- -""" -__author__ = 'J_hao' - -import requests -import random -import time - - -class WebRequest(object): - def __init__(self, *args, **kwargs): - pass - - @property - def user_agent(self): - """ - return an User-Agent at random - :return: - """ - ua_list = [ - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71', - 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', - 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', - ] - return random.choice(ua_list) - - @property - def header(self): - """ - basic header - :return: - """ - return {'User-Agent': self.user_agent, - 'Accept': '*/*', - 'Connection': 'keep-alive', - 'Accept-Language': 'zh-CN,zh;q=0.8'} - - def get(self, url, header=None, retry_time=5, timeout=30, - retry_flag=list(), retry_interval=5, *args, **kwargs): - """ - get method - :param url: target url - :param header: headers - :param retry_time: retry time when network error - :param timeout: network timeout - :param retry_flag: if retry_flag in content. 
do retry - :param retry_interval: retry interval(second) - :param args: - :param kwargs: - :return: - """ - headers = self.header - if header and isinstance(header, dict): - headers.update(header) - while True: - try: - html = requests.get(url, headers=headers, timeout=timeout) - if any(f in html.content for f in retry_flag): - raise Exception - return html - except Exception as e: - print(e) - retry_time -= 1 - if retry_time <= 0: - # 多次请求失败时,返回百度页面 - return requests.get("https://www.baidu.com/") - time.sleep(retry_interval) diff --git a/Util/utilClass.py b/Util/utilClass.py deleted file mode 100644 index 89112ffd8..000000000 --- a/Util/utilClass.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: utilClass.py - Description : tool class - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: Class LazyProperty - 2016/12/4: rewrite ConfigParser -------------------------------------------------- -""" -__author__ = 'JHao' - - -class LazyProperty(object): - """ - LazyProperty - explain: http://www.spiderpy.cn/blog/5/ - """ - - def __init__(self, func): - self.func = func - - def __get__(self, instance, owner): - if instance is None: - return self - else: - value = self.func(instance) - setattr(instance, self.func.__name__, value) - return value - - -try: - from configparser import ConfigParser # py3 -except: - from ConfigParser import ConfigParser # py2 - - -class ConfigParse(ConfigParser): - """ - rewrite ConfigParser, for support upper option - """ - - def __init__(self): - ConfigParser.__init__(self) - - def optionxform(self, optionstr): - return optionstr - - -class Singleton(type): - """ - Singleton Metaclass - """ - - _inst = {} - - def __call__(cls, *args, **kwargs): - if cls not in cls._inst: - cls._inst[cls] = super(Singleton, cls).__call__(*args) - return cls._inst[cls] diff --git 
a/Util/utilFunction.py b/Util/utilFunction.py deleted file mode 100644 index 2227bfec4..000000000 --- a/Util/utilFunction.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: utilFunction.py - Description : tool function - Author : JHao - date: 2016/11/25 -------------------------------------------------- - Change Activity: - 2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree -------------------------------------------------- -""" -import requests -import time -from lxml import etree - -from Util.LogHandler import LogHandler -from Util.WebRequest import WebRequest - -logger = LogHandler(__name__, stream=False) - - -# noinspection PyPep8Naming -def robustCrawl(func): - def decorate(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - logger.info(u"sorry, 抓取出错。错误原因:") - logger.info(e) - - return decorate - - -# noinspection PyPep8Naming -def verifyProxyFormat(proxy): - """ - 检查代理格式 - :param proxy: - :return: - """ - import re - verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" - return True if re.findall(verify_regex, proxy) else False - - -# noinspection PyPep8Naming -def getHtmlTree(url, **kwargs): - """ - 获取html树 - :param url: - :param kwargs: - :return: - """ - - header = {'Connection': 'keep-alive', - 'Cache-Control': 'max-age=0', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, sdch', - 'Accept-Language': 'zh-CN,zh;q=0.8', - } - # TODO 取代理服务器用代理服务器访问 - wr = WebRequest() - - # delay 2s for per request - time.sleep(2) - - html = wr.get(url=url, header=header).content - return etree.HTML(html) - - -def tcpConnect(proxy): - """ - TCP 三次握手 - :param proxy: - :return: - """ - from socket import socket, AF_INET, 
SOCK_STREAM - s = socket(AF_INET, SOCK_STREAM) - ip, port = proxy.split(':') - result = s.connect_ex((ip, int(port))) - return True if result == 0 else False - - -# noinspection PyPep8Naming -def validUsefulProxy(proxy): - """ - 检验代理是否可用 - :param proxy: - :return: - """ - if isinstance(proxy, bytes): - proxy = proxy.decode('utf8') - proxies = {"http": "http://{proxy}".format(proxy=proxy)} - try: - # 超过20秒的代理就不要了 - r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=20, verify=False) - if r.status_code == 200: - logger.info('%s is ok' % proxy) - return True - except Exception as e: - logger.debug(e) - return False diff --git a/__init__.py b/__init__.py deleted file mode 100644 index c511f3103..000000000 --- a/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: __init__.py - Description : - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: -------------------------------------------------- -""" -__author__ = 'JHao' \ No newline at end of file diff --git a/_config.yml b/_config.yml index ddeb671b6..c4192631f 100644 --- a/_config.yml +++ b/_config.yml @@ -1 +1 @@ -theme: jekyll-theme-time-machine \ No newline at end of file +theme: jekyll-theme-cayman \ No newline at end of file diff --git a/Api/__init__.py b/api/__init__.py similarity index 100% rename from Api/__init__.py rename to api/__init__.py diff --git a/api/proxyApi.py b/api/proxyApi.py new file mode 100644 index 000000000..bd2de57e2 --- /dev/null +++ b/api/proxyApi.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +# !/usr/bin/env python +""" +------------------------------------------------- + File Name: ProxyApi.py + Description : WebApi + Author : JHao + date: 2016/12/4 +------------------------------------------------- + Change Activity: + 2016/12/04: WebApi + 2019/08/14: 集成Gunicorn启动方式 + 2020/06/23: 新增pop接口 + 2022/07/21: 更新count接口 
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+import platform
+from werkzeug.wrappers import Response
+from flask import Flask, jsonify, request
+
+from util.six import iteritems
+from helper.proxy import Proxy
+from handler.proxyHandler import ProxyHandler
+from handler.configHandler import ConfigHandler
+
+app = Flask(__name__)
+conf = ConfigHandler()
+proxy_handler = ProxyHandler()
+
+
+class JsonResponse(Response):
+    """Response class that lets view functions return a plain dict or list."""
+
+    @classmethod
+    def force_type(cls, response, environ=None):
+        """Run dict/list results through jsonify, then delegate to the base class."""
+        if isinstance(response, (dict, list)):
+            response = jsonify(response)
+
+        return super(JsonResponse, cls).force_type(response, environ)
+
+
+app.response_class = JsonResponse
+
+# Endpoint descriptions served by the index route.
+api_list = [
+    {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"},
+    {"url": "/pop", "params": "", "desc": "get and delete a proxy"},
+    {"url": "/delete", "params": "proxy: 'e.g. 127.0.0.1:8080'", "desc": "delete an unable proxy"},
+    {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"},
+    {"url": "/count", "params": "", "desc": "return proxy count"}
+    # 'refresh': 'refresh proxy pool',
+]
+
+
+@app.route('/')
+def index():
+    """Return the list of available API endpoints."""
+    return {'url': api_list}
+
+
+@app.route('/get/')
+def get():
+    """Return one proxy as a dict; ``?type=https`` is passed to the handler as the https flag."""
+    https = request.args.get("type", "").lower() == 'https'
+    proxy = proxy_handler.get(https)
+    return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"}
+
+
+@app.route('/pop/')
+def pop():
+    """Return one proxy as a dict via the handler's pop(); same ``type`` parameter as /get/."""
+    https = request.args.get("type", "").lower() == 'https'
+    proxy = proxy_handler.pop(https)
+    return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"}
+
+
+@app.route('/refresh/')
+def refresh():
+    # TODO: refresh is executed periodically by the scheduler daemon; calling it
+    # directly from the API performs poorly, so it is not used for now.
+    return 'success'
+
+
+@app.route('/all/')
+def getAll():
+    """Return every proxy from the handler as a JSON list of dicts."""
+    https = request.args.get("type", "").lower() == 'https'
+    proxies = proxy_handler.getAll(https)
+    return jsonify([_.to_dict for _ in proxies])
+
+
+@app.route('/delete/', methods=['GET'])
+def delete():
+    """Delete the proxy named by the ``proxy`` query parameter."""
+    proxy = request.args.get('proxy')
+    if not proxy:
+        # A missing/empty parameter used to be forwarded as Proxy(None) down to
+        # the DB layer; answer with the same payload the other routes use when
+        # there is no proxy to work with.
+        # NOTE(review): confirm how Proxy/ProxyHandler treat None if this guard is relaxed.
+        return {"code": 0, "src": "no proxy"}
+    status = proxy_handler.delete(Proxy(proxy))
+    return {"code": 0, "src": status}
+
+
+@app.route('/count/')
+def getCount():
+    """Return proxy totals grouped by protocol type and by source."""
+    proxies = proxy_handler.getAll()
+    http_type_dict = {}
+    source_dict = {}
+    for proxy in proxies:
+        http_type = 'https' if proxy.https else 'http'
+        http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1
+        # A proxy's source field may hold several names joined by '/';
+        # each of them is counted once for this proxy.
+        for source in proxy.source.split('/'):
+            source_dict[source] = source_dict.get(source, 0) + 1
+    return {"http_type": http_type_dict, "source": source_dict, "count": len(proxies)}
+
+
+def runFlask():
+    """Start the API: Flask's built-in server on Windows, an embedded gunicorn elsewhere."""
+    if platform.system() == "Windows":
+        app.run(host=conf.serverHost, port=conf.serverPort)
+    else:
+        # Imported lazily so that the Windows branch never needs gunicorn.
+        import gunicorn.app.base
+
+        class StandaloneApplication(gunicorn.app.base.BaseApplication):
+            """Gunicorn application wrapper that serves an in-process WSGI app."""
+
+            def __init__(self, app, options=None):
+                self.options = options or {}
+                self.application = app
+                super(StandaloneApplication, self).__init__()
+
+            def load_config(self):
+                """Copy every known, non-None option into the gunicorn config."""
+                for key, value in iteritems(self.options):
+                    if key in self.cfg.settings and value is not None:
+                        self.cfg.set(key.lower(), value)
+
+            def load(self):
+                """Return the WSGI application gunicorn should serve."""
+                return self.application
+
+        _options = {
+            'bind': '%s:%s' % (conf.serverHost, conf.serverPort),
+            'workers': 4,
+            'accesslog': '-',  # log to stdout
+            'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"'
+        }
+        StandaloneApplication(app, _options).run()
+
+
+if __name__ == '__main__':
+    runFlask()
diff --git a/DB/__init__.py b/db/__init__.py
similarity index 100%
rename from DB/__init__.py
rename to db/__init__.py
diff --git a/db/dbClient.py b/db/dbClient.py
new file mode 100644
index 000000000..4d9554b18
--- /dev/null
+++ b/db/dbClient.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+# !/usr/bin/env python
+"""
+-------------------------------------------------
+   File Name:     DbClient.py
+   Description :  DB factory class
+   Author :       JHao
+   date:          2016/12/2
+-------------------------------------------------
+   Change Activity:
+                   2016/12/02: DB factory class
+                   2020/07/03:
stop storing raw_proxy
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+import os
+import sys
+
+from util.six import urlparse, withMetaclass
+from util.singleton import Singleton
+
+# Make the sibling client modules importable by bare name (see __initDbClient).
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+
+class DbClient(withMetaclass(Singleton)):
+    """
+    DbClient: DB factory class. Exposes
+    get/put/update/pop/delete/exists/getAll/clear/getCount/changeTable and
+    forwards each call to the backend client selected from the connection URI.
+
+    Methods a backend client has to provide:
+        get(https): return one proxy;
+        put(proxy): store a proxy;
+        pop(https): return one proxy and delete it;
+        update(proxy): update the stored info of a proxy;
+        delete(proxy): delete the given proxy;
+        exists(proxy): tell whether the given proxy is stored;
+        getAll(https): return all proxies;
+        clear(): remove all proxies;
+        getCount(): return proxy statistics;
+        changeTable(name): switch the table/hash being operated on
+
+    Backends dispatched by __initDbClient:
+        ssdb: ssdbClient.py
+        redis: redisClient.py
+    """
+
+    def __init__(self, db_conn):
+        """
+        Parse the connection URI and create the matching backend client.
+        :param db_conn: URI such as db_type://user:pwd@host:port/db
+        """
+        self.parseDbConn(db_conn)
+        self.__initDbClient()
+
+    @classmethod
+    def parseDbConn(cls, db_conn):
+        """
+        Split the connection URI and store its parts as class attributes.
+        :param db_conn: connection URI
+        :return: the class itself
+        """
+        db_conf = urlparse(db_conn)
+        cls.db_type = db_conf.scheme.upper().strip()
+        cls.db_host = db_conf.hostname
+        cls.db_port = db_conf.port
+        cls.db_user = db_conf.username
+        cls.db_pwd = db_conf.password
+        cls.db_name = db_conf.path[1:]  # drop the leading '/'
+        return cls
+
+    def __initDbClient(self):
+        """
+        Import the backend module for self.db_type and instantiate its client.
+        :raises AssertionError: when the DB type is not supported
+        """
+        module_names = {"SSDB": "ssdbClient", "REDIS": "redisClient"}
+        module_name = module_names.get(self.db_type)
+        if not module_name:
+            # Raised explicitly (instead of `assert`) so the check survives
+            # `python -O`; the exception type is kept for existing callers.
+            raise AssertionError('type error, Not support DB type: {}'.format(self.db_type))
+        # The client class is named after the DB type: SsdbClient / RedisClient.
+        client_class = getattr(__import__(module_name), "%sClient" % self.db_type.title())
+        self.client = client_class(host=self.db_host,
+                                   port=self.db_port,
+                                   username=self.db_user,
+                                   password=self.db_pwd,
+                                   db=self.db_name)
+
+    def get(self, https, **kwargs):
+        """Forward to the backend client's get()."""
+        return self.client.get(https, **kwargs)
+
+    def put(self, key, **kwargs):
+        """Forward to the backend client's put(); `key` is the proxy object to store."""
+        return self.client.put(key, **kwargs)
+
+    def update(self, key, value=None, **kwargs):
+        """
+        Forward to the backend client's update().
+        :param key: proxy object to update
+        :param value: optional second positional argument; only forwarded when given,
+                      because the Redis/SSDB clients in this package define update(proxy_obj)
+        """
+        if value is None:
+            return self.client.update(key, **kwargs)
+        return self.client.update(key, value, **kwargs)
+
+    def delete(self, key, **kwargs):
+        """Forward to the backend client's delete()."""
+        return self.client.delete(key, **kwargs)
+
+    def exists(self, key, **kwargs):
+        """Forward to the backend client's exists()."""
+        return self.client.exists(key, **kwargs)
+
+    def pop(self, https, **kwargs):
+        """Forward to the backend client's pop()."""
+        return self.client.pop(https, **kwargs)
+
+    def getAll(self, https):
+        """Forward to the backend client's getAll()."""
+        return self.client.getAll(https)
+
+    def clear(self):
+        """Forward to the backend client's clear()."""
+        return self.client.clear()
+
+    def changeTable(self, name):
+        """Switch the table/hash name the backend client operates on."""
+        self.client.changeTable(name)
+
+    def getCount(self):
+        """Forward to the backend client's getCount()."""
+        return self.client.getCount()
+
+    def test(self):
+        """Run the backend client's connectivity check and return its result."""
+        return self.client.test()
diff --git a/db/redisClient.py b/db/redisClient.py
new file mode 100644
index 000000000..e66614d7e
--- /dev/null
+++ b/db/redisClient.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+"""
+-----------------------------------------------------
+   File Name:     redisClient.py
+   Description :  wrapper around the Redis operations
+   Author :       JHao
+   date:          2019/8/9
+------------------------------------------------------
+   Change Activity:
+                   2019/08/09: wrap the Redis operations
+                   2020/06/23: optimize pop(), switch to the hscan command
+                   2021/05/26: distinguish http/https proxies
+------------------------------------------------------
+"""
+__author__ = 'JHao'
+
+from redis.exceptions import TimeoutError, ConnectionError, ResponseError
+from redis.connection import BlockingConnectionPool
+from handler.logHandler import LogHandler
+from random import choice
+from redis import Redis
+import json
+
+
+class RedisClient(object):
+    """
+    Redis client
+
+    Proxies are stored in a Redis hash:
+    the field is ip:port, the value is the proxy's attributes serialized as JSON;
+
+    """
+
+    def __init__(self, **kwargs):
+        """
+        init
+        :param host: host
+        :param port: port
+        :param password: password
+        :param db: db
+        :return:
+        """
+        self.name = ""
+        # `username` is not forwarded to the connection pool. Use a default so a
+        # caller that omits it no longer fails with KeyError.
+        # NOTE(review): presumably dropped because the target server has no ACL users - confirm.
+        kwargs.pop("username", None)
+        self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True,
+                                                                   timeout=5,
+                                                                   socket_timeout=5,
+                                                                   **kwargs))
+
+    def get(self, https):
+        """
+        Return one randomly chosen proxy.
+        :param https: when true, choose only among proxies whose "https" attribute is truthy
+        :return: the stored JSON value of the proxy, or None when nothing matches
+        """
+        if https:
+            items = self.__conn.hvals(self.name)
+            proxies = [item for item in items if json.loads(item).get("https")]
+            return choice(proxies) if proxies else None
+        else:
+            # Pick a random field first, then fetch only that field's value.
+            proxies = self.__conn.hkeys(self.name)
+            proxy = choice(proxies) if proxies else None
+            return self.__conn.hget(self.name, proxy) if proxy else None
+
+    def put(self, proxy_obj):
+        """
+        Store a proxy in the hash; use changeTable to choose the hash name
+        :param proxy_obj: Proxy obj
+        :return: result of HSET
+        """
+        data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
+        return data
+
+    def pop(self, https):
+        """
+        Return one proxy (chosen like get()) and delete it from the hash.
+        The read and the delete are two separate commands, so this is not atomic.
+        :return: the stored JSON value of the proxy, or None when nothing matches
+        """
+        proxy = self.get(https)
+        if proxy:
+            self.__conn.hdel(self.name, json.loads(proxy).get("proxy", ""))
+        return proxy if proxy else None
+
+    def delete(self, proxy_str):
+        """
+        Remove the given proxy; use changeTable to choose the hash name
+        :param proxy_str: proxy str
+        :return: result of HDEL
+        """
+        return self.__conn.hdel(self.name, proxy_str)
+
+    def exists(self, proxy_str):
+        """
+        Tell whether the given proxy is stored; use changeTable to choose the hash name
+        :param proxy_str: proxy str
+        :return: result of HEXISTS
+        """
+        return self.__conn.hexists(self.name, proxy_str)
+
+    def update(self, proxy_obj):
+        """
+        Overwrite the stored attributes of a proxy
+        :param proxy_obj: Proxy obj
+        :return: result of HSET
+        """
+        return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
+
+    def getAll(self, https):
+        """
+        Return the stored values of all proxies; use changeTable to choose the hash name
+        :param https: when true, keep only proxies whose "https" attribute is truthy
+        :return: list of JSON values
+        """
+        items = self.__conn.hvals(self.name)
+        if https:
+            return [item for item in items if json.loads(item).get("https")]
+        else:
+            return items
+
+    def clear(self):
+        """
+        Remove all proxies by deleting the hash; use changeTable to choose the hash name
+        :return: result of DEL
+        """
+        return self.__conn.delete(self.name)
+
+    def getCount(self):
+        """
+        Return proxy counts
+        :return: dict with the 'total' number of proxies and the number flagged 'https'
+        """
+        proxies = self.getAll(https=False)
+        https_count = sum(1 for item in proxies if json.loads(item).get("https"))
+        return {'total': len(proxies), 'https': https_count}
+
+    def changeTable(self, name):
+        """
+        Switch the hash being operated on
+        :param name: hash name
+        :return:
+        """
+        self.name = name
+
+    def test(self):
+        """
+        Probe the connection by calling getCount().
+        :return: the caught exception (after logging it), or None on success
+        """
+        log = LogHandler('redis_client')
+        try:
+            self.getCount()
+        except TimeoutError as e:
+            log.error('redis connection time out: %s' % str(e), exc_info=True)
+            return e
+        except ConnectionError as e:
+            log.error('redis connection error: %s' % str(e), exc_info=True)
+            return e
+        except ResponseError as e:
+            log.error('redis connection error: %s' % str(e), exc_info=True)
+            return e
+
+
diff --git a/db/ssdbClient.py b/db/ssdbClient.py
new file mode 100644
index 000000000..0f5c00054
--- /dev/null
+++ b/db/ssdbClient.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+# !/usr/bin/env python
+"""
+-------------------------------------------------
+   File Name:     ssdbClient.py
+   Description :  wrapper around the SSDB operations
+   Author :       JHao
+   date:          2016/12/2
+-------------------------------------------------
+   Change Activity:
+                   2016/12/2:
+                   2017/09/22: under PY3 redis-py returns data as bytes
+                   2017/09/27: change pop() to return a {proxy: value} dict
+                   2020/07/03: 2.1.0 improve code structure
+                   2021/05/26: distinguish http and https proxies
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+from redis.exceptions import TimeoutError, ConnectionError, ResponseError
+from redis.connection import BlockingConnectionPool
+from handler.logHandler import LogHandler
+from random import choice
+from redis import Redis
+import json
+
+
+class SsdbClient(object):
+    """
+    SSDB client
+
+    Proxies are stored in an SSDB hash:
+    the field is the proxy's ip:port, the value is the proxy's attributes serialized as JSON;
+    """
+
+    def __init__(self, **kwargs):
+        """
+        init
+        :param host: host
+        :param port: port
+        :param password: password
+        :return:
+        """
+        self.name = ""
+        # `username` is not forwarded to the connection pool. Use a default so a
+        # caller that omits it no longer fails with KeyError.
+        kwargs.pop("username", None)
+        self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True,
+                                                                   timeout=5,
+                                                                   socket_timeout=5,
+                                                                   **kwargs))
+
+    def get(self, https):
+        """
+        Return one randomly chosen proxy from the hash.
+        :param https: when true, choose only among proxies whose "https" attribute is truthy
+        :return: the stored JSON value of the proxy, or None when nothing matches
+        """
+        if https:
+            items_dict = self.__conn.hgetall(self.name)
+            proxies = [item for item in items_dict.values() if json.loads(item).get("https")]
+            return choice(proxies) if proxies else None
+        else:
+            # Pick a random field first, then fetch only that field's value.
+            proxies = self.__conn.hkeys(self.name)
+            proxy = choice(proxies) if proxies else None
+            return self.__conn.hget(self.name, proxy) if proxy else None
+
+    def put(self, proxy_obj):
+        """
+        Store a proxy in the hash
+        :param proxy_obj: Proxy obj
+        :return: result of HSET
+        """
+        result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
+        return result
+
+    def pop(self, https):
+        """
+        Return one proxy (chosen like get()) and delete it from the hash.
+        The read and the delete are two separate commands, so this is not atomic.
+        :return: the stored JSON value of the proxy, or None when nothing matches
+        """
+        proxy = self.get(https)
+        if proxy:
+            self.__conn.hdel(self.name, json.loads(proxy).get("proxy", ""))
+        return proxy if proxy else None
+
+    def delete(self, proxy_str):
+        """
+        Remove the given proxy; use changeTable to choose the hash name
+        :param proxy_str: proxy str
+        :return: result of HDEL (returned so both backends report the same way)
+        """
+        return self.__conn.hdel(self.name, proxy_str)
+
+    def exists(self, proxy_str):
+        """
+        Tell whether the given proxy is stored; use changeTable to choose the hash name
+        :param proxy_str: proxy str
+        :return: result of HEXISTS
+        """
+        return self.__conn.hexists(self.name, proxy_str)
+
+    def update(self, proxy_obj):
+        """
+        Overwrite the stored attributes of a proxy
+        :param proxy_obj: Proxy obj
+        :return: result of HSET (returned so both backends report the same way)
+        """
+        return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
+
+    def getAll(self, https):
+        """
+        Return the stored values of all proxies; use changeTable to choose the hash name
+        :param https: when true, keep only proxies whose "https" attribute is truthy
+        :return: list of JSON values
+        """
+        item_dict = self.__conn.hgetall(self.name)
+        if https:
+            return [item for item in item_dict.values() if json.loads(item).get("https")]
+        else:
+            # Materialize the dict view so both branches return a list.
+            return list(item_dict.values())
+
+    def clear(self):
+        """
+        Remove all proxies by deleting the hash; use changeTable to choose the hash name
+        :return: result of DEL
+        """
+        return self.__conn.delete(self.name)
+
+    def getCount(self):
+        """
+        Return proxy counts
+        :return: dict with the 'total' number of proxies and the number flagged 'https'
+        """
+        proxies = self.getAll(https=False)
+        https_count = sum(1 for item in proxies if json.loads(item).get("https"))
+        return {'total': len(proxies), 'https': https_count}
+
+    def changeTable(self, name):
+        """
+        Switch the hash being operated on
+        :param name: hash name
+        :return:
+        """
+        self.name = name
+
+    def test(self):
+        """
+        Probe the connection by calling getCount().
+        :return: the caught exception (after logging it), or None on success
+        """
+        log = LogHandler('ssdb_client')
+        try:
+            self.getCount()
+        except TimeoutError as e:
+            log.error('ssdb connection time out: %s' % str(e), exc_info=True)
+            return e
+        except ConnectionError as e:
+            log.error('ssdb connection error: %s' % str(e), exc_info=True)
+            return e
+        except ResponseError as e:
+            log.error('ssdb connection error: %s' % str(e), exc_info=True)
+            return e
diff --git a/doc/introduce.md b/doc/introduce.md
deleted file mode 100644
index 13f45317a..000000000
--- a/doc/introduce.md
+++ /dev/null
@@ -1,173 +0,0 @@
-
-## 代理池介绍
-
-本项目通过爬虫方式持续抓取代理网站公布的免费代理IP,实时校验,维护部分可以使用的代理,并通过api的形式提供外部使用。
-
-### 1、问题
-
-构建一个代理IP池,可能有下面这些问题: - -* 代理IP从何而来? - -  许多刚接触爬虫的,都试过去西刺、快代理之类有免费代理的网站去抓些免费代理,还是有一些代理能用。 -当然,如果你有更好的代理接口也可以自己接入。 - -  免费代理的采集也很简单,无非就是:`访问页面`` —> `正则/xpath提取` —> `保存` - -* 如何保证代理质量? - -  可以肯定免费的代理IP大部分都是不能用的,不然别人还提供付费接口干嘛(不过事实上很多代理商的付费IP也不稳定,也有很多是不能用)。 -所以采集回来的代理IP不能直接使用,检测的办法也很简单:可以写个程序不断的用代理访问一个稳定的网站,看是否可以正常访问即可。 -这个过程可以使用多线/进程或异步的方式,因为检测代理是个很慢的过程。 - -* 采集回来的代理如何存储? - -  这里不得不推荐一个国人开发的高性能支持多种数据结构的NoSQL数据库[SSDB](http://ssdb.io/docs/zh_cn/),用于替代Redis。支持队列、hash、set、k-v对,支持T级别数据。是做分布式爬虫很好中间存储工具。 - -* 如何让爬虫更方便的用到这些代理? - -  答案肯定是做成服务咯,Python有这么多的web框架,随便拿一个来写个api供爬虫调用。这样代理和爬虫架构分离有很多好处, -比如:当爬虫完全不用考虑如何校验代理,如何保证拿到的代理可用,这些都由代理池来完成。这样只需要安静的码爬虫代码就行啦。 - -### 2、代理池设计 - -  代理池由四部分组成: - -* ProxyGetter: - -  代理获取接口,目前有5个免费代理源,每调用一次就会抓取这个5个网站的最新代理放入DB,支持自定义扩展额外的代理获取接口; - -* DB: - -  用于存放代理IP,目前支持SSDB和Redis(推荐SSDB)。至于为什么选择SSDB,大家可以参考这篇[文章](https://www.sdk.cn/news/2684),个人觉得SSDB是个不错的Redis替代方案,如果你没有用过SSDB,安装起来也很简单,可以参考[这里](https://github.com/jhao104/memory-notes/blob/master/SSDB/SSDB%E5%AE%89%E8%A3%85%E9%85%8D%E7%BD%AE%E8%AE%B0%E5%BD%95.md); - -* Schedule: - -  计划任务,定时去检测DB中的代理可用性,删除不可用的代理。同时也会主动通过ProxyGetter去获取最新代理放入DB; - -* ProxyApi: - -  代理池的外部接口,由[Flask](http://flask.pocoo.org/)实现,功能是给爬虫提供与代理池交互的接口。 - - -![设计](https://pic2.zhimg.com/v2-f2756da2986aa8a8cab1f9562a115b55_b.png) - -### 3、代码模块 - -  Python中高层次的数据结构,动态类型和动态绑定,使得它非常适合于快速应用开发,也适合于作为胶水语言连接已有的软件部件。用Python来搞这个代理IP池也很简单,代码分为6个模块: - -* Api: - -  api接口相关代码,目前api是由Flask实现,代码也非常简单。客户端请求传给Flask,Flask调用`ProxyManager`中的实现,包括`get/delete/refresh/get_all`; - -* DB: - -  数据库相关代码,目前数据库是支持SSDB/Redis。代码用工厂模式实现,方便日后扩展其他类型数据库; - -* Manager: - -  `get/delete/refresh/get_all`等接口的具体实现类,目前代理池只负责管理proxy,日后可能会有更多功能,比如代理和爬虫的绑定,代理和账号的绑定等等; - -* ProxyGetter: - -  代理获取的相关代码,目前抓取了[快代理](http://www.kuaidaili.com)、[代理66](http://www.66ip.cn/)、[有代理](http://www.youdaili.net/Daili/http/)、[西刺代理](http://api.xicidaili.com/free2016.txt)、[guobanjia](http://www.goubanjia.com/free/gngn/index.shtml)这个五个网站的免费代理,经测试这个5个网站每天更新的可用代理只有六七十个,当然也支持自己扩展代理接口; - -* Schedule: 
- -  定时任务相关代码,现在只是实现定时去刷新代理,并验证可用代理,采用多进程方式; - -* Util: - -  存放一些公共的模块方法或函数,包含`GetConfig`:读取配置文件config.ini的类,`ConfigParse`: 扩展ConfigParser的类,使其对大小写敏感, `Singleton`:实现单例,`LazyProperty`:实现类属性惰性计算。等等; - -* 其他文件: - -  配置文件:`Config.ini``,数据库配置和代理获取接口配置,可以在GetFreeProxy中添加新的代理获取方法,并在Config.ini中注册即可使用; - -### 4、安装 - -下载代码: -``` -git clone git@github.com:jhao104/proxy_pool.git - -或者直接到https://github.com/jhao104/proxy_pool 下载zip文件 -``` - -安装依赖: -``` -pip install -r requirements.txt -``` - -启动: - -``` -如果你的依赖已经安全完成并且具备运行条件,可以直接在Run下运行main.py -到Run目录下: ->>>python main.py - -如果运行成功你应该可以看到有4个main.py进程在 - - -你也可以分别运行他们,依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可 -``` - -docker: -``` -git clone git@github.com:jhao104/proxy_pool.git - -cd proxy_pool - -docker build -t proxy:latest -f Dockerfile . - -docker run -p 5010:5010 -d proxy:latest - -# Wait a few minutes -curl localhost:5010/get/ -# result: xxx.xxx.xxx.xxx:xxxx - -curl localhost:5010/get_all/ -``` - -### 5、使用 -  定时任务启动后,会通过GetFreeProxy中的方法抓取代理存入数据库并验证。此后默认每10分钟会重复执行一次。定时任务启动大概一两分钟后,便可在[SSDB](https://github.com/jhao104/SSDBAdmin)中看到刷新出来的可用的代理: - -![useful_proxy](https://pic2.zhimg.com/v2-12f9b7eb72f60663212f317535a113d1_b.png) - -  启动ProxyApi.py后即可在浏览器中使用接口获取代理,一下是浏览器中的截图: - -  index页面: - -![index](https://pic3.zhimg.com/v2-a867aa3db1d413fea8aeeb4c693f004a_b.png) - -  get: - -![get](https://pic1.zhimg.com/v2-f54b876b428893235533de20f2edbfe0_b.png) - -  get_all: - -![get_all](https://pic3.zhimg.com/v2-5c79f8c07e04f9ef655b9bea406d0306_b.png) - - -  爬虫中使用,如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: -``` -import requests - -def get_proxy(): - return requests.get("http://127.0.0.1:5010/get/").content - -def delete_proxy(proxy): - requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) - -# your spider code - -def spider(): - # .... - requests.get('https://www.example.com', proxies={"http": "http://{}".format(get_proxy())}) - # .... 
- -``` - -  测试地址:http://123.207.35.36:5010 单机勿压测。谢谢 - -### 6、最后 -  时间仓促,功能和代码都比较简陋,以后有时间再改进。喜欢的在github上给个star。感谢! diff --git a/doc/release_notes.md b/doc/release_notes.md deleted file mode 100644 index c1e4eeefa..000000000 --- a/doc/release_notes.md +++ /dev/null @@ -1,9 +0,0 @@ -## Release Notes - -* newest -  1.使用多线程验证useful_pool - -* 1.10 -  1. 第一版; -  2. 支持PY2/PY3; -  3. 代理池基本功能; \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..9d1a10ba4 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '2' +services: + proxy_pool: + build: . + container_name: proxy_pool + ports: + - "5010:5010" + links: + - proxy_redis + environment: + DB_CONN: "redis://@proxy_redis:6379/0" + proxy_redis: + image: "redis" + container_name: proxy_redis \ No newline at end of file diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 000000000..875d378c8 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,79 @@ +# API 使用 + +## 接口列表 + +启动 ProxyPool 的 `server` 后会提供如下 HTTP 接口: + +| 接口 | 方法 | 说明 | 参数 | +|------|------|------|------| +| `/` | GET | 返回 API 列表 | 无 | +| `/get` | GET | 随机返回一个代理 | 可选:`?type=https` 过滤 HTTPS 代理 | +| `/pop` | GET | 返回并删除一个代理 | 可选:`?type=https` 过滤 HTTPS 代理 | +| `/all` | GET | 返回所有代理 | 可选:`?type=https` 过滤 HTTPS 代理 | +| `/count` | GET | 返回代理数量统计 | 无 | +| `/delete` | GET | 删除指定代理 | `?proxy=host:port` | + +## 调用示例 + +### 在爬虫中使用 + +通过调用 API 接口来使用代理池: + +```python +import requests + + +def get_proxy(): + return requests.get("http://127.0.0.1:5010/get/").json() + + +def delete_proxy(proxy): + requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) + + +def get_html(): + retry_count = 5 + proxy = get_proxy().get("proxy") + while retry_count > 0: + try: + # 使用代理访问 + html = requests.get( + "http://www.example.com", + proxies={ + "http": "http://{}".format(proxy), + "https": "https://{}".format(proxy), + }, + ) + return html + except Exception: + retry_count -= 1 + # 删除代理池中代理 + 
delete_proxy(proxy) + return None +``` + +本例中在本地 `127.0.0.1` 启动端口为 `5010` 的 `server`,使用 `/get` 接口获取代理,`/delete` 删除代理。 + +### 获取 HTTPS 代理 + +```python +# 只获取支持 HTTPS 的代理 +proxy = requests.get("http://127.0.0.1:5010/get/?type=https").json() +``` + +### 获取代理统计 + +```python +# 返回代理数量、类型分布、来源分布 +stats = requests.get("http://127.0.0.1:5010/count/").json() +# 示例返回: {"http_type": {"http": 10, "https": 5}, "source": {"freeProxy01": 8, "freeProxy02": 7}, "count": 15} +``` + +## 直接读取数据库 + +除了通过 API 接口,也可以直接读取数据库获取代理。目前支持两种数据库:Redis 和 SSDB。 + +- **Redis**:存储结构为 hash,hash name 为配置项中的 `TABLE_NAME`(默认 `use_proxy`) +- **SSDB**:存储结构为 hash,hash name 为配置项中的 `TABLE_NAME` + +可以在代码中自行读取数据库获取代理列表。 \ No newline at end of file diff --git a/docs/assets/logo.svg b/docs/assets/logo.svg new file mode 100644 index 000000000..3f1c5c65b --- /dev/null +++ b/docs/assets/logo.svg @@ -0,0 +1,12 @@ + + + + + + + + + P + + + \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 000000000..01b5cd39e --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,105 @@ +# 变更日志 + +## Next (unreleased) + +1. 新增代理源 **谷德代理**; (2026-05-14) +2. 引入tox自动化测试, **放弃Python 3.7以下版本支持**; (2026-05-14) +3. 新增统一服务管理脚本 ``proxy_pool.sh``, 支持start/stop/restart/status命令; (2026-05-26) +4. Docker镜像使用 ``tini`` 作为init进程, 正确处理信号转发; (2026-05-26) +5. tox依赖统一从 ``requirements.txt`` 读取并自动重建环境; (2026-05-26) +6. 优化CI配置, 避免PR时重复触发测试; (2026-05-26) +7. **重写测试套件**: 使用pytest重构全部测试, 覆盖unit/api/integration三层; (2026-05-28) + + +## 2.4.2 (2024-01-18) + +1. 代理格式检查支持需认证的代理格式 `username:password@ip:port`; (2023-03-10) +2. 新增代理源 **稻壳代理**; (2023-05-15) +3. 新增代理源 **冰凌代理**; (2023-01-18) + +## 2.4.1 (2022-07-17) + +1. 新增代理源 **FreeProxyList**; (2022-07-21) +2. 新增代理源 **FateZero**; (2022-08-01) +3. 新增代理属性 `region`; (2022-08-16) + +## 2.4.0 (2021-11-17) + +1. 移除无效代理源 **神鸡代理**; (2021-11-16) +2. 移除无效代理源 **极速代理**; (2021-11-16) +3. 移除代理源 **西拉代理**; (2021-11-16) +4. 新增代理源 **蝶鸟IP**; (2021-11-16) +5. 
新增代理源 **PROXY11**; (2021-11-16) +6. 多线程采集代理; (2021-11-17) + +## 2.3.0 (2021-05-27) + +1. 修复Dockerfile时区问题; (2021-04-12) +2. 新增Proxy属性 `source`, 标记代理来源; (2021-04-13) +3. 新增Proxy属性 `https`, 标记支持https的代理; (2021-05-27) + +## 2.2.0 (2021-04-08) + +1. 启动时检查数据库连通性; +2. 新增免费代理源 **米扑代理**; +3. 新增免费代理源 **Pzzqz**; +4. 新增免费代理源 **神鸡代理**; +5. 新增免费代理源 **极速代理**; +6. 新增免费代理源 **小幻代理**; + +## 2.1.1 (2021-02-23) + +1. Fix Bug [#493](https://github.com/jhao104/proxy_pool/issues/493), 新增时区配置; (2020-08-12) +2. 修复 **66代理** 采集; (2020-11-04) +3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04) +4. 新增 **代理盒子** 免费源; (2020-11-04) +5. 新增 `POOL_SIZE_MIN` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23) + +## 2.1.0 (2020-07) + +1. 新增免费代理源 **西拉代理**; (2020-03-30) +2. Fix Bug [#356](https://github.com/jhao104/proxy_pool/issues/356) [#401](https://github.com/jhao104/proxy_pool/issues/401) +3. 优化Docker镜像体积; (2020-06-19) +4. 优化配置方式; +5. 优化代码结构; +6. 不再储存raw_proxy, 抓取后直接验证入库; + +## 2.0.1 (2019-10) + +1. 新增免费代理源 **89免费代理**; +2. 新增免费代理源 **齐云代理**; + +## 2.0.0 (2019-08) + +1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; +2. 优化Proxy调度程序; +3. 扩展Proxy属性; +4. 新增cli工具, 更加方便启动proxyPool; + +## 1.14 (2019-07) + +1. 修复 Queue阻塞导致的 `ProxyValidSchedule` 假死bug; +2. 修改代理源 **云代理** 抓取; +3. 修改代理源 **码农代理** 抓取; +4. 修改代理源 **代理66** 抓取, 引入 `PyExecJS` 模块破解加速乐动态Cookies加密; + +## 1.13 (2019-02) + +1. 使用.py文件替换.ini作为配置文件; +2. 优化代理采集部分; + +## 1.12 (2018-04) + +1. 优化代理格式检查; +2. 增加代理源; +3. Fix Bug [#122](https://github.com/jhao104/proxy_pool/issues/122) [#126](https://github.com/jhao104/proxy_pool/issues/126) + +## 1.11 (2017-08) + +1. 使用多线程验证useful_pool; + +## 1.10 (2016-11) + +1. 第一版; +2. 支持PY2/PY3; +3. 
代理池基本功能; \ No newline at end of file diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 000000000..2baa07ec4 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,114 @@ +# 配置参考 + +配置文件 `setting.py` 位于项目的主目录下,配置主要分为五类:**服务配置**、**数据库配置**、**采集配置**、**校验配置**、**调度配置**。 + +## 服务配置 + +### `HOST` + +API 服务监听的 IP。本机访问设置为 `127.0.0.1`,开启远程访问设置为 `0.0.0.0`。 + +- 默认值:`"0.0.0.0"` + +### `PORT` + +API 服务监听的端口。 + +- 默认值:`5010` + +## 数据库配置 + +### `DB_CONN` + +存放代理 IP 的数据库 URI,配置格式为: + +``` +db_type://[[user]:[pwd]]@ip:port/[db] +``` + +目前支持的 `db_type`:`redis`、`ssdb`。 + +配置示例: + +```python +# Redis +DB_CONN = 'redis://@127.0.0.1:6379' +DB_CONN = 'redis://:123456@127.0.0.1:6379' +DB_CONN = 'redis://:123456@127.0.0.1:6379/15' + +# SSDB +DB_CONN = 'ssdb://@127.0.0.1:8888' +DB_CONN = 'ssdb://:123456@127.0.0.1:8888' +``` + +### `TABLE_NAME` + +存放代理的数据载体名称。SSDB 和 Redis 的存放结构为 hash。 + +- 默认值:`"use_proxy"` + +## 采集配置 + +### `PROXY_FETCHER` + +启用的代理采集方法名列表。代理采集方法位于 `fetcher/proxyFetcher.py` 类中。 + +由于各个代理源的稳定性不容易掌握,当某个代理采集方法失效时,可以在该配置中注释掉其名称。如果有增加某些代理采集方法,也请在该配置中添加其方法名,具体请参考 [扩展代理源](extending/fetcher.md)。 + +调度程序每次执行采集任务时都会再次加载该配置,保证每次运行的采集方法都是有效的。 + +```python +PROXY_FETCHER = [ + "freeProxy01", + "freeProxy02", + # .... 
+] +``` + +## 校验配置 + +### `HTTP_URL` + +用于检验代理是否可用的地址。 + +- 默认值:`"http://httpbin.org"` + +### `HTTPS_URL` + +用于检验代理是否支持 HTTPS 的地址。 + +- 默认值:`"https://www.qq.com"` + +### `VERIFY_TIMEOUT` + +检验代理的超时时间,单位秒。使用代理访问 `HTTP_URL` / `HTTPS_URL` 耗时超过 `VERIFY_TIMEOUT` 时,视为代理不可用。 + +- 默认值:`10` + +### `MAX_FAIL_COUNT` + +检验代理允许的最大失败次数。超过则剔除代理。 + +- 默认值:`0`(即失败一次即删除) + +### `POOL_SIZE_MIN` + +代理检测定时任务运行前,若代理数量小于 `POOL_SIZE_MIN`,则先运行抓取程序。 + +- 默认值:`20` + +## 代理属性 + +### `PROXY_REGION` + +是否启用代理地域属性。开启后会尝试解析代理 IP 的地理位置信息。 + +- 默认值:`True` + +## 调度配置 + +### `TIMEZONE` + +调度器的时区设置。如果在虚拟机上运行时出现 `ValueError: Timezone offset does not match system offset` 错误,请设置该配置项。 + +- 默认值:`"Asia/Shanghai"` \ No newline at end of file diff --git a/docs/docker.md b/docs/docker.md new file mode 100644 index 000000000..bd620ae6a --- /dev/null +++ b/docs/docker.md @@ -0,0 +1,54 @@ +# Docker 部署 + +## 使用 Docker 镜像 + +拉取并运行 Docker 镜像: + +```console +docker pull jhao104/proxy_pool:latest + +docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 jhao104/proxy_pool:latest +``` + +`DB_CONN` 环境变量会覆盖 `setting.py` 中的数据库连接配置。 + +## 使用 docker-compose + +项目根目录下的 `docker-compose.yml` 定义了 ProxyPool 和 Redis 两个服务: + +```yaml +version: '2' +services: + proxy_pool: + build: . 
+ container_name: proxy_pool + ports: + - "5010:5010" + links: + - proxy_redis + environment: + DB_CONN: "redis://@proxy_redis:6379/0" + proxy_redis: + image: "redis" + container_name: proxy_redis +``` + +启动: + +```console +docker-compose up -d +``` + +## 容器环境注意事项 + +在 Docker 容器中,建议使用前台模式启动服务: + +```console +./proxy_pool.sh start --fg +``` + +Dockerfile 中的 ENTRYPOINT 配置: + +```dockerfile +ENTRYPOINT ["tini", "--", "bash", "proxy_pool.sh", "start", "--fg"] +``` \ No newline at end of file diff --git a/docs/extending/fetcher.md b/docs/extending/fetcher.md new file mode 100644 index 000000000..c63b7547b --- /dev/null +++ b/docs/extending/fetcher.md @@ -0,0 +1,42 @@ +# 扩展代理源 + +项目默认包含多个免费的代理获取源,但是免费的毕竟质量有限,如果直接运行可能拿到的代理质量不理想。因此提供了用户自定义扩展代理获取的方法。 + +## 添加新的代理源 + +### 第一步:编写获取方法 + +在 `ProxyFetcher` 类中添加自定义的获取代理的静态方法,该方法需要以生成器(yield)形式返回 `host:port` 格式的代理字符串: + +```python +# fetcher/proxyFetcher.py + +class ProxyFetcher(object): + # .... + # 自定义代理源获取方法 + @staticmethod + def freeProxyCustom01(): # 命名不和已有重复即可 + # 通过某网站或者某接口或某数据库获取代理 + # 假设你已经拿到了一个代理列表 + proxies = ["x.x.x.x:3128", "x.x.x.x:80"] + for proxy in proxies: + yield proxy + # 确保每个 proxy 都是 host:port 正确的格式返回 +``` + +### 第二步:注册到配置 + +修改配置文件 `setting.py` 中的 `PROXY_FETCHER` 项,加入刚才添加的自定义方法的名字: + +```python +PROXY_FETCHER = [ + # .... 
+ "freeProxyCustom01" # 确保名字和你添加的方法名字一致 +] +``` + +调度程序每次执行采集任务时都会重新加载该配置,添加后会自动启用新的代理源。 + +## 命名规范 + +代理获取方法建议命名为 `freeProxy` + 两位数字(如 `freeProxy01`),自定义方法可使用 `freeProxyCustom` + 数字。 \ No newline at end of file diff --git a/docs/extending/validator.md b/docs/extending/validator.md new file mode 100644 index 000000000..1a844cac1 --- /dev/null +++ b/docs/extending/validator.md @@ -0,0 +1,68 @@ +# 扩展校验器 + +## 内置校验 + +项目中使用的代理校验方法全部定义在 `helper/validator.py` 中,通过 `ProxyValidator` 类中提供的装饰器来区分。校验方法返回 `True` 表示校验通过,返回 `False` 表示校验不通过。 + +代理校验方法分为三类: + +| 类型 | 装饰器 | 说明 | +|------|--------|------| +| `preValidator` | `@ProxyValidator.addPreValidator` | 预校验,在代理抓取后验证前调用 | +| `httpValidator` | `@ProxyValidator.addHttpValidator` | 代理可用性校验,通过则认为代理可用 | +| `httpsValidator` | `@ProxyValidator.addHttpsValidator` | 校验代理是否支持 HTTPS | + +每种校验可以定义多个方法,只有**所有**方法都返回 `True` 的情况下才视为该校验通过。 + +### 校验执行顺序 + +```mermaid +graph LR + A[抓取代理] --> B[preValidator] + B -->|通过| C[httpValidator] + B -->|失败| F[丢弃] + C -->|通过| D[代理可用] + C -->|失败| F + D --> E[httpsValidator] + E -->|通过| G[标记 HTTPS=True] + E -->|失败| H[标记 HTTPS=False] +``` + +- `preValidator` 校验通过的代理才会进入可用性校验 +- `httpValidator` 校验通过后认为代理可用,更新入代理池 +- `httpsValidator` 校验通过后视为代理支持 HTTPS,更新代理的 `https` 属性为 `True` + +## 扩展校验 + +在 `helper/validator.py` 中已有自定义校验的示例,自定义函数需返回 `True` 或者 `False`,使用 `ProxyValidator` 中提供的装饰器来区分校验类型。 + +### 示例 1:自定义代理可用性校验 + +```python +@ProxyValidator.addHttpValidator +def customValidatorExample01(proxy): + """自定义代理可用性校验函数""" + proxies = {"http": "http://{proxy}".format(proxy=proxy)} + try: + r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5) + return True if r.status_code == 200 and len(r.content) > 200 else False + except Exception as e: + return False +``` + +### 示例 2:自定义 HTTPS 校验 + +```python +@ProxyValidator.addHttpsValidator +def customValidatorExample02(proxy): + """自定义代理是否支持 HTTPS 校验函数""" + proxies = {"https": "https://{proxy}".format(proxy=proxy)} + try: + r = 
requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False) + return True if r.status_code == 200 and len(r.content) > 200 else False + except Exception as e: + return False +``` + +!!! note + 在运行代理可用性校验时,所有被 `ProxyValidator.addHttpValidator` 装饰的函数会依次按定义顺序执行,只有当所有函数都返回 `True` 时才会判断代理可用。`HttpsValidator` 运行机制也是如此。 \ No newline at end of file diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 000000000..62073ea1b --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,160 @@ +# 快速开始 + +## 下载代码 + +本项目需要下载代码到本地运行,通过 `git` 下载: + +```console +git clone https://github.com/jhao104/proxy_pool.git +``` + +或者下载特定的 [release](https://github.com/jhao104/proxy_pool/releases) 版本。 + +## 安装依赖 + +到项目目录下使用 `pip` 安装依赖库: + +```console +pip install -r requirements.txt +``` + +## 更新配置 + +配置文件 `setting.py` 位于项目的主目录下,常用的配置项: + +```python +# API 服务 +HOST = "0.0.0.0" # 监听 IP +PORT = 5010 # 监听端口 + +# 数据库 +DB_CONN = 'redis://:pwdstring@127.0.0.1:6379/0' + +# 代理采集方法 +PROXY_FETCHER = [ + "freeProxy01", # 所有 fetch 方法位于 fetcher/proxyFetcher.py + "freeProxy02", + # .... 
+] +``` + +更多配置请参考 [配置参考](configuration.md)。 + +## 启动项目 + +完整程序包含两部分:`schedule` 调度程序和 `server` API 服务。调度程序负责采集和验证代理,API 服务提供代理服务 HTTP 接口。 + +### 方式一:使用 `proxy_pool.sh`(推荐) + +`proxy_pool.sh` 提供统一的服务管理接口,支持后台运行和进程管理: + +```console +# 后台启动所有服务 +./proxy_pool.sh start + +# 前台启动(容器环境) +./proxy_pool.sh start --fg + +# 停止服务 +./proxy_pool.sh stop + +# 查看状态 +./proxy_pool.sh status + +# 重启服务 +./proxy_pool.sh restart +``` + +### 方式二:使用 `proxyPool.py` + +`proxyPool.py` 是项目的 Python CLI 入口,可以分别启动调度程序和 API 服务: + +```console +# 启动调度程序 +python proxyPool.py schedule + +# 启动 API 服务 +python proxyPool.py server +``` + +## 服务管理 + +### proxy_pool.sh 可用命令 + +| 命令 | 说明 | +|------|------| +| `start` | 启动所有服务(默认后台运行) | +| `start --fg` | 前台启动,适用于容器环境 | +| `stop` | 停止所有服务 | +| `restart` | 重启所有服务 | +| `status` | 查看服务运行状态 | + +### PID 文件 + +服务启动后会在项目根目录生成 `proxy_pool.pid` 文件,记录所有子进程的 PID。该文件用于 `stop` 命令识别需要终止的进程、`status` 命令检查进程状态、防止重复启动。`stop` 命令执行后会自动删除该文件。 + +## 故障排除 + +### 服务启动失败 + +使用前台启动查看详细日志排查错误: + +```console +./proxy_pool.sh start --fg +``` + +### 端口被占用 + +修改 `setting.py` 中的 `PORT` 配置: + +```python +PORT = 5010 # 修改为其他端口 +``` + +### 无法停止服务 + +手动终止进程: + +```console +# 查看 PID 文件 +cat proxy_pool.pid + +# 手动终止进程 +kill + +# 删除 PID 文件 +rm proxy_pool.pid +``` + +## 运行测试 + +### 安装测试依赖 + +```console +pip install -r requirements-test.txt +``` + +### 运行全部测试 + +```console +pytest +``` + +### 分层运行 + +```console +# 单元测试(零外部依赖,CI 必跑) +pytest tests/unit/ + +# API 路由测试 +pytest tests/api/ + +# 集成测试(RedisClient/SsdbClient CRUD,使用 fakeredis 模拟) +pytest tests/integration/ +``` + +### 查看覆盖率 + +```console +pytest --cov=. --cov-report=term-missing +``` \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..1ed6a938a --- /dev/null +++ b/docs/index.md @@ -0,0 +1,117 @@ +--- +hide: + - navigation + - toc +--- + +

+ +# ProxyPool + +**Python爬虫代理IP池** — 定时采集、验证、存储免费代理,通过 RESTful API 提供服务。 + +[:octicons-mark-github-16: GitHub](https://github.com/jhao104/proxy_pool){ .md-button } +[:octicons-rocket-16: 快速开始](getting-started.md){ .md-button .md-button--primary } + +
+ +
+ +
+ +### :material-access-point-network: 多源采集 + +内置 15+ 免费代理源,支持自定义扩展,定时自动采集。 + +
+ +
+ +### :material-shield-check: 自动验证 + +HTTP/HTTPS 可用性自动校验,剔除失效代理,保证代理质量。 + +
+ +
+ +### :material-database: 持久存储 + +Redis/SSDB 持久化存储,支持集群部署,数据不丢失。 + +
+ +
+ +### :material-api: RESTful API + +提供 `/get`、`/pop`、`/all`、`/count`、`/delete` 等接口,开箱即用。 + +
+ +
+ +### :material-docker: Docker 部署 + +一条命令启动,支持 docker-compose,自带 Redis 服务。 + +
+ +
+ +### :material-clock-fast: 定时调度 + +APScheduler 驱动,自动维护代理池数量,无需人工干预。 + +
+ +
+ +--- + +## 快速开始 + +```bash +# 克隆项目 +git clone https://github.com/jhao104/proxy_pool.git +cd proxy_pool + +# 安装依赖 +pip install -r requirements.txt + +# 启动调度程序(采集和验证代理) +python proxyPool.py schedule + +# 启动 API 服务 +python proxyPool.py server +``` + +启动后访问 `http://127.0.0.1:5010/get` 即可获取一个代理。 + +## API 示例 + +```python +import requests + +# 获取代理 +proxy = requests.get("http://127.0.0.1:5010/get/").json() + +# 使用代理 +html = requests.get( + "http://www.example.com", + proxies={"http": f"http://{proxy['proxy']}"} +) +``` + +## 文档导航 + +| 章节 | 说明 | +|------|------| +| [快速开始](getting-started.md) | 安装、配置、启动项目 | +| [项目结构](project-structure.md) | 目录结构与核心模块说明 | +| [配置参考](configuration.md) | `setting.py` 全部配置项详解 | +| [API 使用](api.md) | RESTful API 端点与调用示例 | +| [Docker 部署](docker.md) | Docker / docker-compose 部署方式 | +| [扩展代理源](extending/fetcher.md) | 自定义代理采集方法 | +| [扩展校验器](extending/validator.md) | 自定义代理校验规则 | +| [变更日志](changelog.md) | 版本发布记录 | \ No newline at end of file diff --git a/docs/project-structure.md b/docs/project-structure.md new file mode 100644 index 000000000..ffde8e34e --- /dev/null +++ b/docs/project-structure.md @@ -0,0 +1,81 @@ +# 项目结构 + +ProxyPool 项目目录结构如下: + +``` +proxy_pool/ +├── api/ # API 服务 +│ └── proxyApi.py # Flask RESTful 接口 +├── db/ # 数据库层 +│ ├── dbClient.py # 抽象数据库接口 +│ ├── redisClient.py # Redis 实现 +│ └── ssdbClient.py # SSDB 实现 +├── fetcher/ # 代理采集器 +│ └── proxyFetcher.py # 各代理源采集方法 +├── handler/ # 业务处理器 +│ ├── configHandler.py # 配置读取 +│ ├── logHandler.py # 日志处理 +│ └── proxyHandler.py # 代理 CRUD 逻辑 +├── helper/ # 核心辅助模块 +│ ├── scheduler.py # APScheduler 定时调度 +│ ├── validator.py # 代理可用性校验 +│ ├── proxy.py # 代理数据模型 +│ ├── fetch.py # 采集任务执行 +│ └── check.py # 校验任务执行 +├── util/ # 工具库 +│ ├── singleton.py # 单例元类 +│ ├── lazyProperty.py # 惰性属性装饰器 +│ ├── six.py # Python 2/3 兼容层 +│ └── webRequest.py # HTTP 请求封装 +├── tests/ # 测试 +│ ├── conftest.py # 共享 fixtures +│ ├── unit/ # 单元测试(零外部依赖) +│ ├── api/ # API 路由测试(Flask test client) +│ └── integration/ # 
集成测试(RedisClient/SsdbClient CRUD) +├── docs/ # MkDocs 文档源文件 +├── proxyPool.py # CLI 入口(click) +├── proxy_pool.sh # 服务管理脚本 +├── setting.py # 全局配置文件 +├── requirements.txt # Python 依赖 +├── requirements-test.txt # 测试依赖(pytest、pytest-cov、fakeredis) +├── pyproject.toml # pytest 配置 +├── Dockerfile # Docker 镜像构建 +├── docker-compose.yml # Docker Compose 编排 +└── tox.ini # 多版本测试配置 +``` + +## 核心模块说明 + +### `proxyPool.py` — 入口 + +基于 click 的命令行入口,提供 `schedule` 和 `server` 两个子命令。`schedule` 启动代理采集和验证调度器,`server` 启动 Flask API 服务。 + +### `api/proxyApi.py` — API 服务 + +Flask 应用,提供 `/get`、`/pop`、`/all`、`/count`、`/delete` 等接口,运行在 `setting.py` 配置的 `HOST:PORT`(默认 `0.0.0.0:5010`)。 + +### `db/` — 数据库层 + +通过 `dbClient.py` 定义统一接口,`redisClient.py` 和 `ssdbClient.py` 分别实现 Redis 和 SSDB 的存取逻辑。使用 `setting.py` 中的 `DB_CONN` 连接字符串选择后端。 + +### `fetcher/proxyFetcher.py` — 代理采集 + +`ProxyFetcher` 类中每个代理源对应一个 `freeProxyXX` 静态方法,yield `host:port` 字符串。通过 `setting.py` 的 `PROXY_FETCHER` 列表启用。 + +### `helper/scheduler.py` — 定时调度 + +基于 APScheduler,按配置间隔驱动采集器和验证器运行,自动维护代理池数量。 + +### `helper/validator.py` — 代理校验 + +使用 `HTTP_URL` 和 `HTTPS_URL` 测试代理可用性,超过 `MAX_FAIL_COUNT` 次失败的代理会被移除。 + +### `handler/` — 业务处理 + +- `configHandler.py`:封装 `setting.py` 配置项的读取 +- `logHandler.py`:统一日志配置 +- `proxyHandler.py`:代理的增删改查操作 + +### `setting.py` — 配置中心 + +所有运行时配置集中在此文件,包括 API 地址、数据库连接、采集器列表、校验参数等。详见 [配置参考](configuration.md)。 diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 000000000..3e8d0df5d --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,127 @@ +/* 科技感主题定制 */ + +/* 首页 Hero 区域 */ +.md-typeset .tx-hero { + background: linear-gradient(135deg, #3949ab 0%, #1a237e 100%); + color: #fff; + padding: 3rem 2rem; + border-radius: 12px; + margin-bottom: 2rem; +} + +.md-typeset .tx-hero h1 { + color: #fff !important; + font-weight: 700; + font-size: 2.4rem; + margin-bottom: 0.5rem; +} + +.md-typeset .tx-hero p { + color: rgba(255, 255, 255, 0.85); + font-size: 1.1rem; + line-height: 
1.6; +} + +/* 特性卡片 */ +.tx-features { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); + gap: 1.2rem; + margin: 2rem 0; +} + +.tx-feature { + padding: 1.5rem; + border: 1px solid var(--md-default-fg-color--lightest); + border-radius: 8px; + transition: border-color 0.2s, box-shadow 0.2s; +} + +.tx-feature:hover { + border-color: var(--md-accent-fg-color); + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08); +} + +.tx-feature h3 { + margin-top: 0; + font-size: 1rem; +} + +.tx-feature p { + margin-bottom: 0; + color: var(--md-default-fg-color--light); + font-size: 0.85rem; +} + +/* 快速开始按钮 */ +.md-typeset .tx-hero .md-button { + border-color: rgba(255, 255, 255, 0.5); + color: #fff; + font-weight: 600; +} + +.md-typeset .tx-hero .md-button:hover { + background-color: rgba(255, 255, 255, 0.15); + border-color: #fff; +} + +.md-typeset .tx-hero .md-button--primary { + background-color: #fff; + color: #1a237e; + border-color: #fff; +} + +.md-typeset .tx-hero .md-button--primary:hover { + background-color: rgba(255, 255, 255, 0.9); +} + +/* 顶部导航栏 */ +.md-header { + background: linear-gradient(90deg, #1a237e 0%, #3949ab 100%); +} + +.md-header__title { + font-weight: 600; +} + +/* 代码块增强 */ +.md-typeset code { + font-size: 0.82rem; +} + +/* 导航标签页 */ +.md-tabs { + background: var(--md-primary-fg-color); +} + +.md-tabs__link { + font-weight: 500; +} + +/* 搜索框 */ +.md-search__form { + background-color: rgba(255, 255, 255, 0.15); +} + +/* 侧边栏 */ +.md-sidebar--primary .md-sidebar__scrollwrap { + background: linear-gradient(180deg, rgba(57, 73, 171, 0.05) 0%, transparent 100%); +} + +/* 表格增强 */ +.md-typeset table:not([class]) { + border-radius: 8px; + overflow: hidden; +} + +.md-typeset table:not([class]) th { + background-color: var(--md-primary-fg-color--light); + color: #fff; + font-weight: 600; +} + +/* Admonition 增强 */ +.md-typeset .admonition, +.md-typeset details { + border-radius: 8px; +} \ No newline at end of file diff --git 
a/ProxyGetter/__init__.py b/fetcher/__init__.py similarity index 89% rename from ProxyGetter/__init__.py rename to fetcher/__init__.py index d1c5cc292..54820a3ba 100644 --- a/ProxyGetter/__init__.py +++ b/fetcher/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ ------------------------------------------------- - File Name: __init__.py.py + File Name: __init__.py Description : Author : JHao date: 2016/11/25 diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py new file mode 100644 index 000000000..a46222101 --- /dev/null +++ b/fetcher/proxyFetcher.py @@ -0,0 +1,404 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: proxyFetcher + Description : + Author : JHao + date: 2016/11/25 +------------------------------------------------- + Change Activity: + 2016/11/25: proxyFetcher +------------------------------------------------- +""" +__author__ = 'JHao' + +import re +import json +from time import sleep + +from lxml import etree + +from util.webRequest import WebRequest + + +class ProxyFetcher(object): + """ + proxy getter + """ + + @staticmethod + def _parse_proxies_from_text(text): + if not text: + return [] + proxy_pattern = re.compile(r'(? 
0: + ip = "".join(tr.xpath("./td[1]/text()")).strip() + port = "".join(tr.xpath("./td[2]/text()")).strip() + yield "%s:%s" % (ip, port) + + @staticmethod + def freeProxy03(): + """ 开心代理 """ + target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"] + for url in target_urls: + tree = WebRequest().get(url).tree + for tr in tree.xpath("//table[@class='active']//tr")[1:]: + ip = "".join(tr.xpath('./td[1]/text()')).strip() + port = "".join(tr.xpath('./td[2]/text()')).strip() + yield "%s:%s" % (ip, port) + + @staticmethod + def freeProxy04(): + """ FreeProxyList https://www.freeproxylists.net/zh/ """ + url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50" + tree = WebRequest().get(url, verify=False).tree + from urllib import parse + + def parse_ip(input_str): + html_str = parse.unquote(input_str) + ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str) + return ips[0] if ips else None + + for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"): + ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip()) + port = "".join(tr.xpath('./td[2]/text()')).strip() + if ip: + yield "%s:%s" % (ip, port) + + @staticmethod + def freeProxy05(page_count=1): + """ 快代理 https://www.kuaidaili.com """ + url_pattern = [ + 'https://www.kuaidaili.com/free/inha/{}/', + 'https://www.kuaidaili.com/free/intr/{}/' + ] + url_list = [] + for page_index in range(1, page_count + 1): + for pattern in url_pattern: + url_list.append(pattern.format(page_index)) + + for url in url_list: + tree = WebRequest().get(url).tree + proxy_list = tree.xpath('.//table//tr') + sleep(1) # 必须sleep 不然第二条请求不到数据 + for tr in proxy_list[1:]: + yield ':'.join(tr.xpath('./td/text()')[0:2]) + + @staticmethod + def freeProxy06(): + """ 冰凌代理 https://www.binglx.cn """ + url = "https://www.binglx.cn/?page=1" + try: + tree = WebRequest().get(url).tree + proxy_list = tree.xpath('.//table//tr') + for tr in 
proxy_list[1:]: + yield ':'.join(tr.xpath('./td/text()')[0:2]) + except Exception as e: + print(e) + + @staticmethod + def freeProxy07(): + """ 云代理 """ + urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"] + for url in urls: + r = WebRequest().get(url, timeout=10) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) + for proxy in proxies: + yield ":".join(proxy) + + @staticmethod + def freeProxy08(): + """ 小幻代理 """ + request = WebRequest() + ti_url = "https://ip.ihuan.me/ti.html" + tqdl_url = "https://ip.ihuan.me/tqdl.html" + ti_resp = request.get(ti_url, timeout=10, verify=False) + form_data = {} + if ti_resp.tree is not None: + for input_tag in ti_resp.tree.xpath("//form//input[@name]"): + name = "".join(input_tag.xpath("./@name")).strip() + value = "".join(input_tag.xpath("./@value")).strip() + if name: + form_data[name] = value + + key = form_data.get("key") + if not key: + key_match = re.search(r'name=["\']key["\'][^>]*value=["\']([^"\']+)', ti_resp.text) + if not key_match: + key_match = re.search(r'key["\']?\s*[:=]\s*["\']([0-9a-f]{16,})', ti_resp.text) + key = key_match.group(1) if key_match else "" + + if not key: + return + + header = { + "Origin": "https://ip.ihuan.me", + "Referer": ti_url, + } + data = form_data.copy() + data.update({ + "num": "2000", + "port": "", + "kill_port": "", + "address": "", + "kill_address": "", + "anonymity": "", + "type": "", + "post": "", + "sort": "1", + "key": key, + }) + r = request.post(tqdl_url, header=header, data=data, timeout=10, verify=False) + proxies = ProxyFetcher._parse_proxies_from_tree(r.tree) + proxies.extend(ProxyFetcher._parse_proxies_from_text(r.text)) + for proxy in ProxyFetcher._yield_unique_proxies(proxies): + yield proxy + + @staticmethod + def freeProxy09(page_count=1): + """ 免费代理库 """ + for i in range(1, page_count + 1): + url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) + html_tree = WebRequest().get(url, 
verify=False).tree + for index, tr in enumerate(html_tree.xpath("//table//tr")): + if index == 0: + continue + yield ":".join(tr.xpath("./td/text()")[0:2]).strip() + + @staticmethod + def freeProxy10(): + """ 89免费代理 """ + r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10) + proxies = re.findall( + r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?', + r.text) + for proxy in proxies: + yield ':'.join(proxy) + + @staticmethod + def freeProxy11(): + """ 稻壳代理 https://www.docip.net/ """ + r = WebRequest().get("https://www.docip.net/data/free.json", timeout=10) + try: + for each in r.json['data']: + yield each['ip'] + except Exception as e: + print(e) + + @staticmethod + def freeProxy12(): + """ 谷德代理 https://www.goodips.com/ """ + url = "https://www.goodips.com/" + tree = WebRequest().get(url, verify=False).tree + for item in tree.xpath("//div[@class='table-list']"): + ip = "".join(item.xpath("./ul/li[1]/text()")).strip() + port = "".join(item.xpath("./ul/li[2]/text()")).strip() + if ip and port: + yield "%s:%s" % (ip, port) + + @staticmethod + def freeProxy13(): + """ FreeVPNNode 中国代理 https://cn.freevpnnode.com/free-proxy-for-china/ """ + # url = "https://cn.freevpnnode.com/free-proxy-for-china/" + url = "https://cn.freevpnnode.com/free-proxy/" + r = WebRequest().get(url, timeout=5, retry_time=1, verify=False) + proxies = ProxyFetcher._parse_proxies_from_tree(r.tree) + proxies.extend(ProxyFetcher._parse_proxies_from_text(r.text)) + for proxy in ProxyFetcher._yield_unique_proxies(proxies): + yield proxy + + @staticmethod + def freeProxy14(): + """ SCDN 代理接口 """ + # url = "https://proxy.scdn.io/get_proxies.php?protocol=&country=%E4%B8%AD%E5%9B%BD&per_page=100&page=1" + url = "https://proxy.scdn.io/get_proxies.php?protocol=&country=&per_page=100&page=1" + r = WebRequest().get(url, timeout=5, retry_time=1, verify=False) + try: + data = r.json + proxies = [] + table_html = data.get("table_html") if isinstance(data, dict) 
else "" + if table_html: + tree = etree.HTML("%s
" % table_html) + proxies.extend(ProxyFetcher._parse_proxies_from_tree(tree)) + + if not proxies: + proxies = ProxyFetcher._parse_proxies_from_json(data) + if not proxies: + proxies = ProxyFetcher._parse_proxies_from_text(r.text) + for proxy in ProxyFetcher._yield_unique_proxies(proxies): + yield proxy + except Exception as e: + print(e) + + @staticmethod + def freeProxy15(): + """ Geonode Free Proxy 中国代理 https://geonode.com/free-proxy-list/ """ + # url = "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&country=CN" + url = "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc" + r = WebRequest().get(url, timeout=5, retry_time=1, verify=False) + try: + proxies = ProxyFetcher._parse_proxies_from_json(r.json) + if not proxies: + proxies = ProxyFetcher._parse_proxies_from_text(r.text) + for proxy in ProxyFetcher._yield_unique_proxies(proxies): + yield proxy + except Exception as e: + print(e) + + # @staticmethod + # def wallProxy01(): + # """ + # PzzQz https://pzzqz.com/ + # """ + # from requests import Session + # from lxml import etree + # session = Session() + # try: + # index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text + # x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp) + # if x_csrf_token: + # data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""} + # proxy_resp = session.post("https://pzzqz.com/", verify=False, + # headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json() + # tree = etree.HTML(proxy_resp["proxy_html"]) + # for tr in tree.xpath("//tr"): + # ip = "".join(tr.xpath("./td[1]/text()")) + # port = "".join(tr.xpath("./td[2]/text()")) + # yield "%s:%s" % (ip, port) + # except Exception as e: + # print(e) + + # @staticmethod + # def freeProxy10(): + # """ + # 墙外网站 cn-proxy + # :return: + # """ + # urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] + # request = WebRequest() + # for url in 
urls: + # r = request.get(url, timeout=10) + # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) + # for proxy in proxies: + # yield ':'.join(proxy) + + # @staticmethod + # def freeProxy11(): + # """ + # https://proxy-list.org/english/index.php + # :return: + # """ + # urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] + # request = WebRequest() + # import base64 + # for url in urls: + # r = request.get(url, timeout=10) + # proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) + # for proxy in proxies: + # yield base64.b64decode(proxy).decode() + + # @staticmethod + # def freeProxy12(): + # urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] + # request = WebRequest() + # for url in urls: + # r = request.get(url, timeout=10) + # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) + # for proxy in proxies: + # yield ':'.join(proxy) + + +if __name__ == '__main__': + p = ProxyFetcher() + for _ in p.freeProxy12(): + print(_) + +# http://nntime.com/proxy-list-01.htm diff --git a/Manager/__init__.py b/handler/__init__.py similarity index 75% rename from Manager/__init__.py rename to handler/__init__.py index e94e59d11..9a42cea96 100644 --- a/Manager/__init__.py +++ b/handler/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ ------------------------------------------------- - File Name: __init__.py.py + File Name: __init__.py Description : Author : JHao date: 2016/12/3 @@ -10,4 +10,6 @@ 2016/12/3: ------------------------------------------------- """ -__author__ = 'JHao' \ No newline at end of file +__author__ = 'JHao' + +# from handler.ProxyManager import ProxyManager diff --git a/handler/configHandler.py b/handler/configHandler.py new file mode 100644 index 000000000..29000bcc6 --- /dev/null +++ b/handler/configHandler.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: configHandler + Description : + 
class ConfigHandler(withMetaclass(Singleton)):
    """
    Accessor for the runtime configuration.

    Every option except ``fetchers`` is looked up in the process environment
    first and falls back to the constant of the same name in ``setting``.

    NOTE(review): ``Singleton`` and ``LazyProperty`` are defined outside this
    block; presumably the handler is shared process-wide and each lazy value
    is computed once -- confirm against ``util``.
    """

    def __init__(self):
        pass

    @LazyProperty
    def serverHost(self):
        """ Host the API server binds to (env ``HOST``). """
        return os.environ.get("HOST", setting.HOST)

    @LazyProperty
    def serverPort(self):
        """ Port the API server binds to (env ``PORT``); a string when taken from the environment. """
        return os.environ.get("PORT", setting.PORT)

    @LazyProperty
    def dbConn(self):
        """ Database connection string (env ``DB_CONN``). """
        return os.getenv("DB_CONN", setting.DB_CONN)

    @LazyProperty
    def tableName(self):
        """ Name of the table holding the proxies (env ``TABLE_NAME``). """
        return os.getenv("TABLE_NAME", setting.TABLE_NAME)

    @property
    def fetchers(self):
        """ Enabled fetcher names; ``setting`` is reloaded on every access. """
        reload_six(setting)
        return setting.PROXY_FETCHER

    @LazyProperty
    def httpUrl(self):
        """ URL used to validate plain-HTTP proxying (env ``HTTP_URL``). """
        return os.getenv("HTTP_URL", setting.HTTP_URL)

    @LazyProperty
    def httpsUrl(self):
        """ URL used to validate HTTPS proxying (env ``HTTPS_URL``). """
        return os.getenv("HTTPS_URL", setting.HTTPS_URL)

    @LazyProperty
    def verifyTimeout(self):
        """ Validation timeout as an int (env ``VERIFY_TIMEOUT``). """
        return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT))

    # @LazyProperty
    # def proxyCheckCount(self):
    #     return int(os.getenv("PROXY_CHECK_COUNT", setting.PROXY_CHECK_COUNT))

    @LazyProperty
    def maxFailCount(self):
        """ Failure count above which a proxy is deleted (env ``MAX_FAIL_COUNT``). """
        return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT))

    # @LazyProperty
    # def maxFailRate(self):
    #     return int(os.getenv("MAX_FAIL_RATE", setting.MAX_FAIL_RATE))

    @LazyProperty
    def poolSizeMin(self):
        """ Pool size below which a fetch is triggered (env ``POOL_SIZE_MIN``). """
        return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN))

    @LazyProperty
    def proxyRegion(self):
        """
        Whether the region of a proxy should be looked up (env ``PROXY_REGION``).

        Environment variables are always strings, so ``bool()`` alone would
        treat "False" or "0" as enabled; the usual "off" spellings are
        therefore mapped to False explicitly.
        """
        value = os.getenv("PROXY_REGION", setting.PROXY_REGION)
        if isinstance(value, str):
            return value.strip().lower() not in ("", "0", "false", "no", "off")
        return bool(value)

    @LazyProperty
    def timezone(self):
        """ Timezone name handed to the scheduler (env ``TIMEZONE``). """
        return os.getenv("TIMEZONE", setting.TIMEZONE)
a/Util/LogHandler.py +++ b/handler/logHandler.py @@ -7,15 +7,16 @@ date: 2017/3/6 ------------------------------------------------- Change Activity: - 2017/3/6: log handler - 2017/9/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) + 2017/03/06: log handler + 2017/09/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) + 2020/07/13: Windows下TimedRotatingFileHandler线程不安全, 不再使用 ------------------------------------------------- """ __author__ = 'JHao' import os - import logging +import platform from logging.handlers import TimedRotatingFileHandler @@ -33,6 +34,12 @@ ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir) LOG_PATH = os.path.join(ROOT_PATH, 'log') +if not os.path.exists(LOG_PATH): + try: + os.mkdir(LOG_PATH) + except FileExistsError: + pass + class LogHandler(logging.Logger): """ @@ -46,7 +53,8 @@ def __init__(self, name, level=DEBUG, stream=True, file=True): if stream: self.__setStreamHandler__() if file: - self.__setFileHandler__() + if platform.system() != "Windows": + self.__setFileHandler__() def __setFileHandler__(self, level=None): """ @@ -83,16 +91,6 @@ def __setStreamHandler__(self, level=None): stream_handler.setLevel(level) self.addHandler(stream_handler) - def resetName(self, name): - """ - reset name - :param name: - :return: - """ - self.name = name - self.removeHandler(self.file_handler) - self.__setFileHandler__() - if __name__ == '__main__': log = LogHandler('test') diff --git a/handler/proxyHandler.py b/handler/proxyHandler.py new file mode 100644 index 000000000..32e215e5d --- /dev/null +++ b/handler/proxyHandler.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: ProxyHandler.py + Description : + Author : JHao + date: 2016/12/3 +------------------------------------------------- + Change Activity: + 2016/12/03: + 2020/05/26: 区分http和https +------------------------------------------------- +""" +__author__ = 'JHao' + +from helper.proxy import Proxy +from db.dbClient import DbClient +from handler.configHandler import 
class ProxyHandler(object):
    """ Proxy CRUD operator: thin layer between callers and the database client. """

    def __init__(self):
        self.conf = ConfigHandler()
        self.db = DbClient(self.conf.dbConn)
        self.db.changeTable(self.conf.tableName)

    def get(self, https=False):
        """
        Return one proxy without removing it from the pool.
        Args:
            https: True/False, forwarded to the database client
        Returns:
            Proxy object, or None when the client returns nothing
        """
        proxy = self.db.get(https)
        return Proxy.createFromJson(proxy) if proxy else None

    def pop(self, https=False):
        """
        Return one proxy and delete it from the pool.
        Args:
            https: True/False, forwarded to the database client
                   (defaults to False, like ``get`` and ``getAll``)
        Returns:
            Proxy object, or None when the client returns nothing
        """
        proxy = self.db.pop(https)
        if proxy:
            return Proxy.createFromJson(proxy)
        return None

    def put(self, proxy):
        """
        Store a proxy in the pool.
        :param proxy: Proxy object
        :return:
        """
        self.db.put(proxy)

    def delete(self, proxy):
        """
        Delete a proxy from the pool.
        :param proxy: Proxy object; its ``proxy`` string is used as the key
        :return: whatever the database client returns
        """
        return self.db.delete(proxy.proxy)

    def getAll(self, https=False):
        """
        Return every proxy in the pool as a list of Proxy objects.
        :param https: True/False, forwarded to the database client
        :return: list of Proxy
        """
        proxies = self.db.getAll(https)
        return [Proxy.createFromJson(_) for _ in proxies]

    def exists(self, proxy):
        """
        Check whether a proxy is already stored.
        :param proxy: Proxy object; its ``proxy`` string is used as the key
        :return: whatever the database client returns
        """
        return self.db.exists(proxy.proxy)

    def getCount(self):
        """
        Return the count reported by the database client.
        :return: dict with the single key 'count'
        """
        total_use_proxy = self.db.getCount()
        return {'count': total_use_proxy}
class DoValidator(object):
    """ Runs the registered validators against a proxy and updates its state. """

    conf = ConfigHandler()

    @classmethod
    def validator(cls, proxy, work_type):
        """
        Validation entry point.
        Args:
            proxy: Proxy Object
            work_type: raw/use
        Returns:
            the same Proxy Object with its check statistics updated
        """
        http_r = cls.httpValidator(proxy)
        # HTTPS is only probed when the plain-HTTP check already passed
        https_r = False if not http_r else cls.httpsValidator(proxy)

        proxy.check_count += 1
        proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        proxy.last_status = True if http_r else False
        if http_r:
            # a success pays back one earlier failure
            if proxy.fail_count > 0:
                proxy.fail_count -= 1
            proxy.https = True if https_r else False
            if work_type == "raw":
                proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else ""
        else:
            proxy.fail_count += 1
        return proxy

    @classmethod
    def httpValidator(cls, proxy):
        """ True only when every registered http validator accepts the proxy. """
        for func in ProxyValidator.http_validator:
            if not func(proxy.proxy):
                return False
        return True

    @classmethod
    def httpsValidator(cls, proxy):
        """ True only when every registered https validator accepts the proxy. """
        for func in ProxyValidator.https_validator:
            if not func(proxy.proxy):
                return False
        return True

    @classmethod
    def preValidator(cls, proxy):
        """ True only when every registered pre validator accepts ``proxy`` (passed through unchanged). """
        for func in ProxyValidator.pre_validator:
            if not func(proxy):
                return False
        return True

    @classmethod
    def regionGetter(cls, proxy):
        """
        Look up the country code of the proxy host.
        Returns:
            the 'country_code' field of the geoip response, or 'error' when
            the lookup fails for any reason
        """
        try:
            # strip optional "username:password@" credentials before taking the host part
            host = proxy.proxy.rsplit('@', 1)[-1].split(':')[0]
            url = 'https://api.ip.sb/geoip/%s' % host
            r = WebRequest().get(url=url, retry_time=1, timeout=2).json
            return r.get('country_code')
        except Exception:
            # best effort: a failed lookup must not fail the validation itself,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            return 'error'
def Checker(tp, queue, thread_num=20):
    """
    Run the threaded proxy check and wait until every worker has finished.
    :param tp: raw/use
    :param queue: Proxy Queue
    :param thread_num: number of worker threads to start (default 20)
    :return:
    """
    thread_list = [
        _ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2))
        for index in range(thread_num)
    ]

    for thread in thread_list:
        # Thread.setDaemon() is deprecated; the attribute is the supported spelling
        thread.daemon = True
        thread.start()

    for thread in thread_list:
        thread.join()
class Fetcher(object):
    """ Runs every configured fetcher in its own thread and yields the collected proxies. """

    name = "fetcher"

    def __init__(self):
        self.log = LogHandler(self.name)
        self.conf = ConfigHandler()

    def run(self):
        """
        fetch proxy with proxyFetcher

        This is a generator: nothing is fetched until it is iterated.
        :return: yields each collected Proxy whose ``proxy`` string passes the pre-validators
        """
        proxy_dict = dict()
        thread_list = list()
        self.log.info("ProxyFetch : start")

        for fetch_source in self.conf.fetchers:
            self.log.info("ProxyFetch - {func}: start".format(func=fetch_source))
            fetcher = getattr(ProxyFetcher, fetch_source, None)
            if not fetcher:
                self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source))
                continue
            if not callable(fetcher):
                self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source))
                continue
            # all workers write into the same dict
            thread_list.append(_ThreadFetcher(fetch_source, proxy_dict))

        for thread in thread_list:
            # Thread.setDaemon() is deprecated; the attribute is the supported spelling
            thread.daemon = True
            thread.start()

        for thread in thread_list:
            thread.join()

        self.log.info("ProxyFetch - all complete!")
        for _ in proxy_dict.values():
            if DoValidator.preValidator(_.proxy):
                yield _
class Proxy(object):
    """
    Value object describing one proxy together with its check statistics.

    The sources that reported the proxy are kept as a list of unique,
    non-empty names and exposed as a single '/'-joined string.
    """

    def __init__(self, proxy, fail_count=0, region="", anonymous="",
                 source="", check_count=0, last_status="", last_time="", https=False):
        self._proxy = proxy
        self._fail_count = fail_count
        self._region = region
        self._anonymous = anonymous
        # "".split('/') is [''], so segments are routed through add_source,
        # which drops empty names and duplicates
        self._source = []
        for name in source.split('/'):
            self.add_source(name)
        self._check_count = check_count
        self._last_status = last_status
        self._last_time = last_time
        self._https = https

    @classmethod
    def createFromJson(cls, proxy_json):
        """ Build a Proxy from the JSON text produced by ``to_json``; missing keys get defaults. """
        _dict = json.loads(proxy_json)
        return cls(proxy=_dict.get("proxy", ""),
                   fail_count=_dict.get("fail_count", 0),
                   region=_dict.get("region", ""),
                   anonymous=_dict.get("anonymous", ""),
                   source=_dict.get("source", ""),
                   check_count=_dict.get("check_count", 0),
                   last_status=_dict.get("last_status", ""),
                   last_time=_dict.get("last_time", ""),
                   https=_dict.get("https", False)
                   )

    @property
    def proxy(self):
        """ Proxy address, ip:port """
        return self._proxy

    @property
    def fail_count(self):
        """ Number of failed checks """
        return self._fail_count

    @fail_count.setter
    def fail_count(self, value):
        self._fail_count = value

    @property
    def region(self):
        """ Geographic location (country/city) """
        return self._region

    @region.setter
    def region(self, value):
        self._region = value

    @property
    def anonymous(self):
        """ Anonymity """
        return self._anonymous

    @property
    def source(self):
        """ Proxy sources, joined with '/' """
        return '/'.join(self._source)

    @property
    def check_count(self):
        """ Number of checks performed """
        return self._check_count

    @check_count.setter
    def check_count(self, value):
        self._check_count = value

    @property
    def last_status(self):
        """ Result of the last check: True -> usable; False -> unusable """
        return self._last_status

    @last_status.setter
    def last_status(self, value):
        self._last_status = value

    @property
    def last_time(self):
        """ Time of the last check """
        return self._last_time

    @last_time.setter
    def last_time(self, value):
        self._last_time = value

    @property
    def https(self):
        """ Whether https is supported """
        return self._https

    @https.setter
    def https(self, value):
        self._https = value

    @property
    def to_dict(self):
        """ Attributes as a dict """
        return {"proxy": self.proxy,
                "https": self.https,
                "fail_count": self.fail_count,
                "region": self.region,
                "anonymous": self.anonymous,
                "source": self.source,
                "check_count": self.check_count,
                "last_status": self.last_status,
                "last_time": self.last_time}

    @property
    def to_json(self):
        """ Attributes as JSON text """
        return json.dumps(self.to_dict, ensure_ascii=False)

    def add_source(self, source_str):
        """
        Record one more source for this proxy.

        Empty names and names already recorded are ignored; insertion order
        is kept so the joined ``source`` string is deterministic.
        """
        if source_str and source_str not in self._source:
            self._source.append(source_str)
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.executors.pool import ProcessPoolExecutor

from util.six import Queue
from helper.fetch import Fetcher
from helper.check import Checker
from handler.logHandler import LogHandler
from handler.proxyHandler import ProxyHandler
from handler.configHandler import ConfigHandler


def __runProxyFetch():
    """Queue everything the Fetcher yields and pass the queue to a "raw" Checker."""
    fetched = Queue()
    for item in Fetcher().run():
        fetched.put(item)

    Checker("raw", fetched)


def __runProxyCheck():
    """Queue all stored proxies and pass the queue to a "use" Checker.

    When the stored total is below ``poolSizeMin`` a fetch run is triggered first.
    """
    handler = ProxyHandler()
    pending = Queue()
    stored_total = handler.db.getCount().get("total", 0)
    if stored_total < handler.conf.poolSizeMin:
        __runProxyFetch()
    for item in handler.getAll():
        pending.put(item)
    Checker("use", pending)


def runScheduler():
    """Run one fetch immediately, then start the blocking scheduler.

    Two interval jobs are registered: fetch every 4 minutes and check every
    2 minutes. ``scheduler.start()` blocks the calling thread.
    """
    __runProxyFetch()

    timezone = ConfigHandler().timezone
    scheduler_log = LogHandler("scheduler")
    scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone)

    scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集")
    scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查")

    scheduler.configure(
        executors={
            'default': {'type': 'threadpool', 'max_workers': 20},
            'processpool': ProcessPoolExecutor(max_workers=5)
        },
        job_defaults={
            'coalesce': False,
            'max_instances': 10
        },
        timezone=timezone,
    )

    scheduler.start()


if __name__ == '__main__':
    runScheduler()
import re
from requests import head
from util.six import withMetaclass
from util.singleton import Singleton
from handler.configHandler import ConfigHandler

conf = ConfigHandler()

HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
          'Accept': '*/*',
          'Connection': 'keep-alive',
          'Accept-Language': 'zh-CN,zh;q=0.8'}

# Optional "user:password@" prefix followed by a dotted quad and a port.
IP_REGEX = re.compile(r"(.*:.*@)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}")


class ProxyValidator(withMetaclass(Singleton)):
    """Registry of validator callables, grouped into pre / http / https lists.

    The ``add*Validator`` classmethods are used as decorators: each appends
    the function to its list and returns the function unchanged.
    """
    pre_validator = []
    http_validator = []
    https_validator = []

    @classmethod
    def addPreValidator(cls, func):
        cls.pre_validator.append(func)
        return func

    @classmethod
    def addHttpValidator(cls, func):
        cls.http_validator.append(func)
        return func

    @classmethod
    def addHttpsValidator(cls, func):
        cls.https_validator.append(func)
        return func


def _proxiesFor(proxy):
    """Build the ``proxies`` mapping passed to requests for one proxy address."""
    return {"http": "http://{proxy}".format(proxy=proxy),
            "https": "https://{proxy}".format(proxy=proxy)}


@ProxyValidator.addPreValidator
def formatValidator(proxy):
    """Check that the proxy string fully matches IP_REGEX."""
    return bool(IP_REGEX.fullmatch(proxy))


@ProxyValidator.addHttpValidator
def httpTimeOutValidator(proxy):
    """HEAD ``conf.httpUrl`` through the proxy; True only on a 200 within the timeout."""
    try:
        response = head(conf.httpUrl, headers=HEADER, proxies=_proxiesFor(proxy),
                        timeout=conf.verifyTimeout)
    except Exception:
        # Any failure (timeout, connection error, ...) counts as "not usable".
        return False
    return response.status_code == 200


@ProxyValidator.addHttpsValidator
def httpsTimeOutValidator(proxy):
    """HEAD ``conf.httpsUrl`` through the proxy (certificate verification off)."""
    try:
        response = head(conf.httpsUrl, headers=HEADER, proxies=_proxiesFor(proxy),
                        timeout=conf.verifyTimeout, verify=False)
    except Exception:
        # Any failure (timeout, connection error, ...) counts as "not usable".
        return False
    return response.status_code == 200


@ProxyValidator.addHttpValidator
def customValidatorExample(proxy):
    """Example of a custom validator: receives the proxy, returns True/False."""
    return True
file mode 100644 index 000000000..60886afe6 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,93 @@ +site_name: ProxyPool +site_description: Python爬虫代理IP池 +site_author: jhao104 +site_url: https://jhao104.github.io/proxy_pool/ + +repo_name: jhao104/proxy_pool +repo_url: https://github.com/jhao104/proxy_pool + +theme: + name: material + language: zh + logo: assets/logo.svg + favicon: assets/logo.svg + icon: + repo: fontawesome/brands/github + palette: + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: light-blue + toggle: + icon: material/brightness-7 + name: 切换到暗色模式 + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: indigo + accent: light-blue + toggle: + icon: material/brightness-4 + name: 切换到亮色模式 + features: + - navigation.instant + - navigation.instant.progress + - navigation.tabs + - navigation.tabs.sticky + - navigation.sections + - navigation.top + - navigation.tracking + - search.suggest + - search.highlight + - search.share + - content.code.copy + - content.code.annotate + - content.tabs.link + - toc.follow + font: + text: Noto Sans SC + code: JetBrains Mono + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - tables + - attr_list + - def_list + - md_in_html + - toc: + permalink: true + +extra_css: + - stylesheets/extra.css + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/jhao104/proxy_pool + generator: false + +plugins: + - search + +nav: + - 首页: index.md + - 用户指南: + - 快速开始: getting-started.md + - 项目结构: project-structure.md + - 配置参考: configuration.md + - API 使用: api.md + - Docker 部署: docker.md + - 开发指南: + - 扩展代理源: extending/fetcher.md + - 扩展校验器: 
import click
from helper.launcher import startServer, startScheduler
from setting import BANNER, VERSION

CONTEXT_SETTINGS = {"help_option_names": ['-h', '--help']}


# NOTE: click shows each function's docstring as its --help text, so the
# docstrings below are user-facing output and are kept verbatim.
@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option(version=VERSION)
def cli():
    """ProxyPool cli工具"""


# "schedule" sub-command: print the banner, then start the scheduler.
@cli.command(name="schedule")
def schedule():
    """ 启动调度程序 """
    click.echo(BANNER)
    startScheduler()


# "server" sub-command: print the banner, then start the API server.
@cli.command(name="server")
def server():
    """ 启动api服务 """
    click.echo(BANNER)
    startServer()


if __name__ == '__main__':
    cli()
#!/usr/bin/env bash
# Start / stop / restart / inspect the proxy_pool server and scheduler.
# PIDs of the two processes are stored one per line in $PID_FILE.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PID_FILE="$SCRIPT_DIR/proxy_pool.pid"
PYTHON="${PYTHON:-python}"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

log_info()  { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn()  { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Print the recorded PIDs (nothing when the PID file is absent).
get_pids() {
    if [ -f "$PID_FILE" ]; then
        cat "$PID_FILE"
    fi
}

# Succeed when the given PID refers to a live process.
is_running() {
    local pid=$1
    kill -0 "$pid" 2>/dev/null
}

# Foreground-mode cleanup handler for EXIT/INT/TERM.
_shutdown() {
    # Disarm first: the "exit" below would otherwise fire the EXIT trap again
    # and run this handler a second time.
    trap - EXIT INT TERM
    log_info "Shutting down..."
    # "|| true": under "set -e" a failed kill must not skip the cleanup below.
    kill $SERVER_PID $SCHEDULER_PID 2>/dev/null || true
    wait
    rm -f "$PID_FILE"
    exit 0
}

# Start both services. Options: --fg | --foreground
cmd_start() {
    local foreground=false

    while [[ $# -gt 0 ]]; do
        case $1 in
            --fg|--foreground) foreground=true; shift ;;
            *) log_error "Unknown option: $1"; exit 1 ;;
        esac
    done

    # Refuse to start when a recorded process is still alive.
    local pids
    pids=$(get_pids)
    if [ -n "$pids" ]; then
        for pid in $pids; do
            if is_running "$pid"; then
                log_warn "Service already running (PID: $pid)"
                log_warn "Use '$0 stop' first, or '$0 restart'"
                exit 1
            fi
        done
    fi

    # Remove a stale PID file.
    rm -f "$PID_FILE"

    cd "$SCRIPT_DIR"

    if [ "$foreground" = true ]; then
        # Foreground mode (container environments)
        log_info "Starting in foreground mode..."

        trap _shutdown EXIT INT TERM

        $PYTHON proxyPool.py server &
        SERVER_PID=$!

        $PYTHON proxyPool.py schedule &
        SCHEDULER_PID=$!

        echo "$SERVER_PID" >> "$PID_FILE"
        echo "$SCHEDULER_PID" >> "$PID_FILE"

        log_info "Services started (PIDs: $SERVER_PID $SCHEDULER_PID)"
        wait
    else
        # Background mode (non-container environments)
        log_info "Starting in background mode..."

        nohup $PYTHON proxyPool.py server > /dev/null 2>&1 &
        SERVER_PID=$!

        nohup $PYTHON proxyPool.py schedule > /dev/null 2>&1 &
        SCHEDULER_PID=$!

        echo "$SERVER_PID" >> "$PID_FILE"
        echo "$SCHEDULER_PID" >> "$PID_FILE"

        sleep 2

        # Verify that both processes survived start-up.
        local failed=false
        if ! is_running "$SERVER_PID"; then
            log_error "Server failed to start"
            failed=true
        fi
        if ! is_running "$SCHEDULER_PID"; then
            log_error "Scheduler failed to start"
            failed=true
        fi

        if [ "$failed" = true ]; then
            cmd_stop
            exit 1
        fi

        log_info "Services started"
        log_info " Server PID: $SERVER_PID"
        log_info " Scheduler PID: $SCHEDULER_PID"
        log_info "Use '$0 stop' to stop, '$0 status' to check"
    fi
}

# Stop all recorded services (TERM first, then KILL for survivors).
cmd_stop() {
    local pids
    pids=$(get_pids)

    if [ -z "$pids" ]; then
        log_warn "No PID file found. Services may not be running."
        # return, not exit: cmd_restart must be able to continue to cmd_start.
        return 0
    fi

    log_info "Stopping services..."

    local stopped=0
    for pid in $pids; do
        if is_running "$pid"; then
            kill "$pid" 2>/dev/null || true
            stopped=$((stopped + 1))
        fi
    done

    # Give the processes a moment to exit.
    sleep 1

    # Force-kill anything still running.
    for pid in $pids; do
        if is_running "$pid"; then
            log_warn "Force killing PID $pid"
            kill -9 "$pid" 2>/dev/null || true
        fi
    done

    rm -f "$PID_FILE"
    log_info "Stopped $stopped service(s)"
}

# Restart services; options are forwarded to cmd_start.
cmd_restart() {
    cmd_stop
    sleep 1
    cmd_start "$@"
}

# Report which recorded PIDs are alive.
cmd_status() {
    local pids
    pids=$(get_pids)

    if [ -z "$pids" ]; then
        log_info "No PID file found. Services are not running."
        return 0
    fi

    local running=0
    local dead=0

    for pid in $pids; do
        if is_running "$pid"; then
            running=$((running + 1))
        else
            dead=$((dead + 1))
        fi
    done

    if [ $running -gt 0 ]; then
        log_info "Services: $running running, $dead dead"
        for pid in $pids; do
            local status="stopped"
            if is_running "$pid"; then
                status="running"
            fi
            echo " PID $pid: $status"
        done
    else
        log_warn "All services are stopped"
        rm -f "$PID_FILE"
    fi
}

# Print usage.
cmd_help() {
    cat <<EOF
Usage: $0 <command> [options]

Commands:
  start [--fg]     Start services (background by default)
                   --fg  Run in foreground (for containers)
  stop             Stop all services
  restart [--fg]   Restart services
  status           Show service status
  help             Show this help

Examples:
  $0 start         # Start in background
  $0 start --fg    # Start in foreground (containers)
  $0 stop          # Stop all services
  $0 status        # Check status

Environment:
  PYTHON           Python executable (default: python)
EOF
}

# Entry point
case "${1:-help}" in
    start)          shift; cmd_start "$@" ;;
    stop)           cmd_stop ;;
    restart)        shift; cmd_restart "$@" ;;
    status)         cmd_status ;;
    help|-h|--help) cmd_help ;;
    *)              log_error "Unknown command: $1"; cmd_help; exit 1 ;;
esac
+testpaths = ["tests"] +markers = [ + "integration: 需要外部服务(如 Redis)的集成测试", +] \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 000000000..a07a17646 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,5 @@ +pytest>=7.0 +pytest-cov>=4.0 +fakeredis>=2.0 +async_timeout>=3.0;python_version<"3.11" +typing_extensions>=4.0;python_version<"3.11" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 698cc8197..fe1f6968e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -APScheduler==3.2.0 -Flask==0.11.1 -requests==2.11.0 -lxml==3.7.1 - -pymongo==3.2.2 -redis==2.10.5 - - +requests==2.31.0 +gunicorn==19.9.0 +lxml==4.9.2 +redis>=4.2.0 +APScheduler==3.10.0;python_version>="3.10" +APScheduler==3.2.0;python_version<"3.10" +click==8.0.1 +Flask==2.1.1 +werkzeug==2.1.0 diff --git a/setting.py b/setting.py new file mode 100644 index 000000000..ff616aabc --- /dev/null +++ b/setting.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: setting.py + Description : 配置文件 + Author : JHao + date: 2019/2/15 +------------------------------------------------- + Change Activity: + 2019/2/15: +------------------------------------------------- +""" + +BANNER = r""" +**************************************************************** +*** ______ ********************* ______ *********** _ ******** +*** | ___ \_ ******************** | ___ \ ********* | | ******** +*** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** +*** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** +*** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** +*** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** +**** __ / / ***** +************************* /___ / ******************************* +************************* ******************************** +**************************************************************** +""" + +VERSION = 
"2.4.0" + +# ############### server config ############### +HOST = "0.0.0.0" + +PORT = 5010 + +# ############### database config ################### +# db connection uri +# example: +# Redis: redis://:password@ip:port/db +# Ssdb: ssdb://:password@ip:port +DB_CONN = 'redis://:pwdstring@127.0.0.1:6379/0' + +# proxy table name +TABLE_NAME = 'use_proxy' + + +# ###### config the proxy fetch function ###### +PROXY_FETCHER = [ + "freeProxy01", + "freeProxy02", + "freeProxy03", + "freeProxy04", + "freeProxy05", + "freeProxy06", + "freeProxy07", + "freeProxy08", + "freeProxy09", + "freeProxy10", + "freeProxy11", + "freeProxy12", + "freeProxy13", + "freeProxy14", + "freeProxy15", +] + +# ############# proxy validator ################# +# 代理验证目标网站 +HTTP_URL = "http://httpbin.org" + +HTTPS_URL = "https://www.qq.com" + +# 代理验证时超时时间 +VERIFY_TIMEOUT = 10 + +# 近PROXY_CHECK_COUNT次校验中允许的最大失败次数,超过则剔除代理 +MAX_FAIL_COUNT = 0 + +# 近PROXY_CHECK_COUNT次校验中允许的最大失败率,超过则剔除代理 +# MAX_FAIL_RATE = 0.1 + +# proxyCheck时代理数量少于POOL_SIZE_MIN触发抓取 +POOL_SIZE_MIN = 20 + +# ############# proxy attributes ################# +# 是否启用代理地域属性 +PROXY_REGION = True + +# ############# scheduler config ################# + +# Set the timezone for the scheduler forcely (optional) +# If it is running on a VM, and +# "ValueError: Timezone offset does not match system offset" +# was raised during scheduling. +# Please uncomment the following line and set a timezone for the scheduler. +# Otherwise it will detect the timezone from the system automatically. 
import pytest
from helper.proxy import Proxy


@pytest.fixture
def mocks(app):
    """Shortcut to the handler mocks stored on the app as ``app._test_mocks``."""
    return app._test_mocks


class TestIndex:
    """GET / lists the available API urls."""

    def test_index_returns_api_list(self, client):
        response = client.get("/")
        assert response.status_code == 200
        payload = response.get_json()
        assert "url" in payload
        assert len(payload["url"]) > 0


class TestGet:
    """GET /get/ returns one proxy, optionally filtered by ``type=https``."""

    def test_get_returns_proxy(self, client, mocks):
        mocks["get"].return_value = Proxy("1.2.3.4:8080", source="test", https=False)

        response = client.get("/get/")
        assert response.status_code == 200
        payload = response.get_json()
        assert payload["proxy"] == "1.2.3.4:8080"
        assert payload["https"] is False

    def test_get_no_proxy(self, client, mocks):
        mocks["get"].return_value = None

        response = client.get("/get/")
        assert response.status_code == 200
        payload = response.get_json()
        assert payload["code"] == 0
        assert payload["src"] == "no proxy"

    def test_get_https_filter(self, client, mocks):
        mocks["get"].return_value = Proxy("5.6.7.8:443", source="test", https=True)

        response = client.get("/get/?type=https")
        assert response.status_code == 200
        payload = response.get_json()
        assert payload["https"] is True
        mocks["get"].assert_called_with(True)

    def test_get_http_filter(self, client, mocks):
        mocks["get"].return_value = None

        client.get("/get/")
        mocks["get"].assert_called_with(False)


class TestPop:
    """GET /pop/ returns a proxy or the "no proxy" payload."""

    def test_pop_returns_proxy(self, client, mocks):
        mocks["pop"].return_value = Proxy("1.2.3.4:8080", source="test")

        response = client.get("/pop/")
        assert response.status_code == 200
        payload = response.get_json()
        assert payload["proxy"] == "1.2.3.4:8080"

    def test_pop_no_proxy(self, client, mocks):
        mocks["pop"].return_value = None

        payload = client.get("/pop/").get_json()
        assert payload["code"] == 0


class TestAll:
    """GET /all/ returns every proxy as a list."""

    def test_all_returns_list(self, client, mocks):
        mocks["getAll"].return_value = [
            Proxy("1.2.3.4:8080", source="test"),
            Proxy("5.6.7.8:443", source="test", https=True),
        ]

        response = client.get("/all/")
        assert response.status_code == 200
        payload = response.get_json()
        assert len(payload) == 2
        assert payload[0]["proxy"] == "1.2.3.4:8080"
        assert payload[1]["proxy"] == "5.6.7.8:443"

    def test_all_empty(self, client, mocks):
        mocks["getAll"].return_value = []

        payload = client.get("/all/").get_json()
        assert payload == []


class TestDelete:
    """GET /delete/ forwards to the handler's delete."""

    def test_delete_calls_handler(self, client, mocks):
        mocks["delete"].return_value = True

        response = client.get("/delete/?proxy=1.2.3.4:8080")
        assert response.status_code == 200
        payload = response.get_json()
        assert payload["code"] == 0
        assert payload["src"] is True
        mocks["delete"].assert_called_once()


class TestCount:
    """GET /count/ aggregates totals by type and by source."""

    def test_count_returns_stats(self, client, mocks):
        mocks["getAll"].return_value = [
            Proxy("1.2.3.4:8080", source="freeProxy01", https=False),
            Proxy("5.6.7.8:443", source="freeProxy02", https=True),
        ]

        response = client.get("/count/")
        assert response.status_code == 200
        payload = response.get_json()
        assert payload["count"] == 2
        assert payload["http_type"]["http"] == 1
        assert payload["http_type"]["https"] == 1
        assert payload["source"]["freeProxy01"] == 1
        assert payload["source"]["freeProxy02"] == 1

    def test_count_empty(self, client, mocks):
        mocks["getAll"].return_value = []

        payload = client.get("/count/").get_json()
        assert payload["count"] == 0
        assert payload["http_type"] == {}
        assert payload["source"] == {}
import sys
import os
from unittest.mock import MagicMock, patch

import pytest
import fakeredis

# Make sure the project root is on sys.path before importing project modules.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from util.singleton import Singleton
from helper.proxy import Proxy


# --------------- Singleton reset ---------------

@pytest.fixture(autouse=True)
def reset_singleton():
    """Empty the Singleton cache around every test so state cannot leak between tests.

    The cache content found before the test is put back afterwards.
    """
    snapshot = Singleton._inst.copy()
    Singleton._inst.clear()
    yield
    Singleton._inst.clear()
    Singleton._inst.update(snapshot)


# --------------- Proxy factories ---------------

@pytest.fixture
def proxy_obj():
    """Standard (non-https) Proxy object for tests."""
    return Proxy("1.2.3.4:8080", source="test", https=False)


@pytest.fixture
def https_proxy_obj():
    """HTTPS Proxy object for tests."""
    return Proxy("5.6.7.8:443", source="test", https=True)


# --------------- Redis / DB ---------------

@pytest.fixture
def fake_redis():
    """fakeredis instance used by the RedisClient/SsdbClient tests."""
    return fakeredis.FakeRedis(decode_responses=True)


@pytest.fixture
def mock_db_client(fake_redis):
    """Patch ``db.dbClient.DbClient`` and yield ``(mock class, fake_redis)``."""
    with patch("db.dbClient.DbClient") as patched_cls:
        yield patched_cls, fake_redis


# --------------- Flask API ---------------

@pytest.fixture
def app():
    """Flask app whose proxy_handler methods are replaced by mocks.

    The mocks are exposed as ``flask_app._test_mocks`` keyed by method name.
    """
    # Patch DbClient so that ProxyHandler does not connect to a real Redis.
    with patch("db.dbClient.DbClient") as patched_db_cls:
        patched_db_cls.return_value = MagicMock()

        from api.proxyApi import app as flask_app, proxy_handler
        flask_app.config["TESTING"] = True

        # Replace the handler methods so tests can configure return values.
        with patch.object(proxy_handler, "get") as get_mock, \
                patch.object(proxy_handler, "pop") as pop_mock, \
                patch.object(proxy_handler, "getAll") as get_all_mock, \
                patch.object(proxy_handler, "delete") as delete_mock:
            flask_app._test_mocks = dict(
                get=get_mock,
                pop=pop_mock,
                getAll=get_all_mock,
                delete=delete_mock,
            )
            yield flask_app


@pytest.fixture
def client(app):
    """Flask test client."""
    return app.test_client()
import json
import pytest
import fakeredis
from unittest.mock import patch, MagicMock
from db.redisClient import RedisClient
from helper.proxy import Proxy


@pytest.fixture
def redis_client(fake_redis):
    """RedisClient whose connection objects are patched to use fakeredis."""
    with patch("db.redisClient.BlockingConnectionPool"):
        with patch("db.redisClient.Redis", return_value=fake_redis):
            client = RedisClient(host="localhost", port=6379,
                                 username=None, password=None, db="0")
            client.changeTable("test_proxy")
            return client


def _make_proxy(proxy_str, https=False, source="test"):
    """Build a Proxy for the tests.

    ``source`` is now passed through to the Proxy; it used to be accepted but
    silently replaced by a value derived from ``https``.
    """
    return Proxy(proxy_str, source=source, https=https)


class TestRedisPutGet:

    def test_put_and_get(self, redis_client):
        proxy = _make_proxy("1.2.3.4:8080")
        redis_client.put(proxy)
        result = redis_client.get(https=False)
        assert result is not None
        data = json.loads(result)
        assert data["proxy"] == "1.2.3.4:8080"

    def test_get_https(self, redis_client):
        proxy = _make_proxy("5.6.7.8:443", https=True)
        redis_client.put(proxy)
        result = redis_client.get(https=True)
        assert result is not None
        data = json.loads(result)
        assert data["https"] is True

    def test_get_https_excludes_http(self, redis_client):
        proxy = _make_proxy("1.2.3.4:8080", https=False)
        redis_client.put(proxy)
        result = redis_client.get(https=True)
        assert result is None

    def test_get_empty_returns_none(self, redis_client):
        result = redis_client.get(https=False)
        assert result is None


class TestRedisExists:

    def test_exists_true(self, redis_client):
        proxy = _make_proxy("1.2.3.4:8080")
        redis_client.put(proxy)
        assert redis_client.exists("1.2.3.4:8080") is True

    def test_exists_false(self, redis_client):
        assert redis_client.exists("9.9.9.9:9999") is False


class TestRedisDelete:

    def test_delete(self, redis_client):
        proxy = _make_proxy("1.2.3.4:8080")
        redis_client.put(proxy)
        redis_client.delete("1.2.3.4:8080")
        assert redis_client.exists("1.2.3.4:8080") is False


class TestRedisPop:

    def test_pop_removes_proxy(self, redis_client):
        proxy = _make_proxy("1.2.3.4:8080")
        redis_client.put(proxy)
        popped = redis_client.pop(https=False)
        assert popped is not None
        assert redis_client.exists("1.2.3.4:8080") is False

    def test_pop_empty_returns_none(self, redis_client):
        result = redis_client.pop(https=False)
        assert result is None


class TestRedisGetAll:

    def test_get_all(self, redis_client):
        redis_client.put(_make_proxy("1.2.3.4:8080"))
        redis_client.put(_make_proxy("5.6.7.8:443", https=True))
        all_proxies = redis_client.getAll(https=False)
        assert len(all_proxies) == 2

    def test_get_all_https_filter(self, redis_client):
        redis_client.put(_make_proxy("1.2.3.4:8080", https=False))
        redis_client.put(_make_proxy("5.6.7.8:443", https=True))
        https_proxies = redis_client.getAll(https=True)
        assert len(https_proxies) == 1


class TestRedisGetCount:

    def test_get_count(self, redis_client):
        redis_client.put(_make_proxy("1.2.3.4:8080", https=False))
        redis_client.put(_make_proxy("5.6.7.8:443", https=True))
        count = redis_client.getCount()
        assert count["total"] == 2
        assert count["https"] == 1

    def test_get_count_empty(self, redis_client):
        count = redis_client.getCount()
        assert count["total"] == 0
        assert count["https"] == 0


class TestRedisClear:

    def test_clear(self, redis_client):
        redis_client.put(_make_proxy("1.2.3.4:8080"))
        redis_client.put(_make_proxy("5.6.7.8:443"))
        redis_client.clear()
        count = redis_client.getCount()
        assert count["total"] == 0


class TestRedisChangeTable:

    def test_change_table_isolation(self, redis_client):
        # Data written to one table must not be visible from another.
        redis_client.put(_make_proxy("1.2.3.4:8080"))
        redis_client.changeTable("other_table")
        assert redis_client.getCount()["total"] == 0
        redis_client.changeTable("test_proxy")
        assert redis_client.getCount()["total"] == 1
import json
import pytest
from unittest.mock import patch
from db.ssdbClient import SsdbClient
from helper.proxy import Proxy


@pytest.fixture
def ssdb_client(fake_redis):
    """SsdbClient whose connection objects are patched to use fakeredis."""
    with patch("db.ssdbClient.BlockingConnectionPool"):
        with patch("db.ssdbClient.Redis", return_value=fake_redis):
            client = SsdbClient(host="localhost", port=8888,
                                username=None, password=None)
            client.changeTable("test_proxy")
            return client


def _make_proxy(proxy_str, https=False, source="test"):
    """Build a Proxy for the tests.

    ``source`` is now passed through to the Proxy; it used to be accepted but
    silently replaced by a value derived from ``https``.
    """
    return Proxy(proxy_str, source=source, https=https)


class TestSsdbPutGet:

    def test_put_and_get(self, ssdb_client):
        proxy = _make_proxy("1.2.3.4:8080")
        ssdb_client.put(proxy)
        result = ssdb_client.get(https=False)
        assert result is not None
        data = json.loads(result)
        assert data["proxy"] == "1.2.3.4:8080"

    def test_get_https(self, ssdb_client):
        proxy = _make_proxy("5.6.7.8:443", https=True)
        ssdb_client.put(proxy)
        result = ssdb_client.get(https=True)
        assert result is not None
        data = json.loads(result)
        assert data["https"] is True

    def test_get_https_excludes_http(self, ssdb_client):
        proxy = _make_proxy("1.2.3.4:8080", https=False)
        ssdb_client.put(proxy)
        result = ssdb_client.get(https=True)
        assert result is None

    def test_get_empty_returns_none(self, ssdb_client):
        result = ssdb_client.get(https=False)
        assert result is None


class TestSsdbExists:

    def test_exists_true(self, ssdb_client):
        proxy = _make_proxy("1.2.3.4:8080")
        ssdb_client.put(proxy)
        assert ssdb_client.exists("1.2.3.4:8080") is True

    def test_exists_false(self, ssdb_client):
        assert ssdb_client.exists("9.9.9.9:9999") is False


class TestSsdbDelete:

    def test_delete(self, ssdb_client):
        proxy = _make_proxy("1.2.3.4:8080")
        ssdb_client.put(proxy)
        ssdb_client.delete("1.2.3.4:8080")
        assert ssdb_client.exists("1.2.3.4:8080") is False


class TestSsdbPop:

    def test_pop_removes_proxy(self, ssdb_client):
        proxy = _make_proxy("1.2.3.4:8080")
        ssdb_client.put(proxy)
        popped = ssdb_client.pop(https=False)
        assert popped is not None
        assert ssdb_client.exists("1.2.3.4:8080") is False

    def test_pop_empty_returns_none(self, ssdb_client):
        result = ssdb_client.pop(https=False)
        assert result is None


class TestSsdbGetAll:

    def test_get_all(self, ssdb_client):
        ssdb_client.put(_make_proxy("1.2.3.4:8080"))
        ssdb_client.put(_make_proxy("5.6.7.8:443", https=True))
        all_proxies = list(ssdb_client.getAll(https=False))
        assert len(all_proxies) == 2

    def test_get_all_https_filter(self, ssdb_client):
        ssdb_client.put(_make_proxy("1.2.3.4:8080", https=False))
        ssdb_client.put(_make_proxy("5.6.7.8:443", https=True))
        https_proxies = list(ssdb_client.getAll(https=True))
        assert len(https_proxies) == 1


class TestSsdbGetCount:

    def test_get_count(self, ssdb_client):
        ssdb_client.put(_make_proxy("1.2.3.4:8080", https=False))
        ssdb_client.put(_make_proxy("5.6.7.8:443", https=True))
        count = ssdb_client.getCount()
        assert count["total"] == 2
        assert count["https"] == 1

    def test_get_count_empty(self, ssdb_client):
        count = ssdb_client.getCount()
        assert count["total"] == 0
        assert count["https"] == 0


class TestSsdbClear:

    def test_clear(self, ssdb_client):
        ssdb_client.put(_make_proxy("1.2.3.4:8080"))
        ssdb_client.put(_make_proxy("5.6.7.8:443"))
        ssdb_client.clear()
        count = ssdb_client.getCount()
        assert count["total"] == 0


class TestSsdbChangeTable:

    def test_change_table_isolation(self, ssdb_client):
        # Data written to one table must not be visible from another.
        ssdb_client.put(_make_proxy("1.2.3.4:8080"))
        ssdb_client.changeTable("other_table")
        assert ssdb_client.getCount()["total"] == 0
        ssdb_client.changeTable("test_proxy")
        assert ssdb_client.getCount()["total"] == 1
ssdb_client): + ssdb_client.put(_make_proxy("1.2.3.4:8080")) + ssdb_client.put(_make_proxy("5.6.7.8:443")) + ssdb_client.clear() + count = ssdb_client.getCount() + assert count["total"] == 0 + + +class TestSsdbChangeTable: + + def test_change_table_isolation(self, ssdb_client): + ssdb_client.put(_make_proxy("1.2.3.4:8080")) + ssdb_client.changeTable("other_table") + assert ssdb_client.getCount()["total"] == 0 + ssdb_client.changeTable("test_proxy") + assert ssdb_client.getCount()["total"] == 1 \ No newline at end of file diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py new file mode 100644 index 000000000..00f30bea9 --- /dev/null +++ b/tests/unit/test_config.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testConfig.py + Description : ConfigHandler环境变量测试 + Author : JHao + date: 2026/5/28 +------------------------------------------------- + Change Activity: + 2026/05/28: +------------------------------------------------- +""" +__author__ = 'JHao' + +import os +import pytest +import setting +from handler.configHandler import ConfigHandler + + +@pytest.fixture(autouse=True) +def clean_env(): + """测试前后清理可能设置的环境变量""" + env_keys = ["DB_CONN", "PORT", "HOST", "TABLE_NAME", "HTTP_URL", + "HTTPS_URL", "VERIFY_TIMEOUT", "MAX_FAIL_COUNT", + "POOL_SIZE_MIN", "PROXY_REGION", "TIMEZONE"] + saved = {k: os.environ.get(k) for k in env_keys} + for k in env_keys: + os.environ.pop(k, None) + yield + for k, v in saved.items(): + if v is not None: + os.environ[k] = v + else: + os.environ.pop(k, None) + + +@pytest.fixture +def conf(): + return ConfigHandler() + + +class TestConfigHandlerDefaults: + + def test_db_conn_default(self, conf): + assert conf.dbConn == setting.DB_CONN + + def test_server_host_default(self, conf): + assert conf.serverHost == setting.HOST + + def 
test_server_port_default(self, conf): + assert str(conf.serverPort) == str(setting.PORT) + + def test_table_name_default(self, conf): + assert conf.tableName == setting.TABLE_NAME + + def test_http_url_default(self, conf): + assert conf.httpUrl == setting.HTTP_URL + + def test_https_url_default(self, conf): + assert conf.httpsUrl == setting.HTTPS_URL + + def test_verify_timeout_default(self, conf): + assert conf.verifyTimeout == setting.VERIFY_TIMEOUT + + def test_max_fail_count_default(self, conf): + assert conf.maxFailCount == setting.MAX_FAIL_COUNT + + def test_pool_size_min_default(self, conf): + assert conf.poolSizeMin == setting.POOL_SIZE_MIN + + def test_timezone_default(self, conf): + assert conf.timezone == setting.TIMEZONE + + def test_fetchers_is_list(self, conf): + assert isinstance(conf.fetchers, list) + assert len(conf.fetchers) > 0 + + +class TestConfigHandlerEnvOverride: + + def test_db_conn_override(self): + os.environ["DB_CONN"] = "redis://:newpwd@10.0.0.1:6380/3" + conf = ConfigHandler() + assert conf.dbConn == "redis://:newpwd@10.0.0.1:6380/3" + + def test_port_override(self): + os.environ["PORT"] = "8080" + conf = ConfigHandler() + assert str(conf.serverPort) == "8080" + + def test_verify_timeout_override(self): + os.environ["VERIFY_TIMEOUT"] = "30" + conf = ConfigHandler() + assert conf.verifyTimeout == 30 + + def test_max_fail_count_override(self): + os.environ["MAX_FAIL_COUNT"] = "5" + conf = ConfigHandler() + assert conf.maxFailCount == 5 \ No newline at end of file diff --git a/tests/unit/test_db_client.py b/tests/unit/test_db_client.py new file mode 100644 index 000000000..31becb5cf --- /dev/null +++ b/tests/unit/test_db_client.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testDbClient.py + Description : DbClient URI解析单元测试 + Author : JHao + date: 2026/5/28 +------------------------------------------------- + Change Activity: + 2026/05/28: 
+------------------------------------------------- +""" +__author__ = 'JHao' + +import pytest +from db.dbClient import DbClient + + +class TestParseDbConn: + + def test_redis_uri(self): + DbClient.parseDbConn("redis://:password@127.0.0.1:6379/1") + assert DbClient.db_type == "REDIS" + assert DbClient.db_pwd == "password" + assert DbClient.db_host == "127.0.0.1" + assert DbClient.db_port == 6379 + assert DbClient.db_name == "1" + + def test_ssdb_uri(self): + DbClient.parseDbConn("ssdb://:password@127.0.0.1:8888") + assert DbClient.db_type == "SSDB" + assert DbClient.db_pwd == "password" + assert DbClient.db_host == "127.0.0.1" + assert DbClient.db_port == 8888 + + def test_redis_uri_no_password(self): + DbClient.parseDbConn("redis://127.0.0.1:6379/0") + assert DbClient.db_type == "REDIS" + assert DbClient.db_pwd is None + assert DbClient.db_host == "127.0.0.1" + assert DbClient.db_port == 6379 + assert DbClient.db_name == "0" + + def test_ssdb_uri_no_password(self): + DbClient.parseDbConn("ssdb://@127.0.0.1:8888") + assert DbClient.db_type == "SSDB" + assert DbClient.db_host == "127.0.0.1" + assert DbClient.db_port == 8888 + + def test_unknown_db_type_raises(self): + with pytest.raises(AssertionError): + DbClient("mysql://127.0.0.1:3306") + + @pytest.mark.parametrize("uri,expected_type", [ + ("redis://:pwd@10.0.0.1:6380/2", "REDIS"), + ("ssdb://:pwd@10.0.0.1:8899", "SSDB"), + ]) + def test_parse_returns_cls(self, uri, expected_type): + """parseDbConn 返回 cls 以支持链式调用""" + result = DbClient.parseDbConn(uri) + assert result is DbClient + assert DbClient.db_type == expected_type \ No newline at end of file diff --git a/tests/unit/test_proxy.py b/tests/unit/test_proxy.py new file mode 100644 index 000000000..d8f1bb680 --- /dev/null +++ b/tests/unit/test_proxy.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testProxy.py + Description : Proxy类单元测试 + Author : JHao + date: 2026/5/28 
+------------------------------------------------- + Change Activity: + 2026/05/28: +------------------------------------------------- +""" +__author__ = 'JHao' + +import json +import pytest +from helper.proxy import Proxy + + +class TestProxyInit: + """Proxy 构造测试""" + + def test_default_values(self): + p = Proxy("1.2.3.4:8080") + assert p.proxy == "1.2.3.4:8080" + assert p.fail_count == 0 + assert p.region == "" + assert p.anonymous == "" + assert p.source == "" + assert p.check_count == 0 + assert p.last_status == "" + assert p.last_time == "" + assert p.https is False + + def test_custom_values(self): + p = Proxy( + "5.6.7.8:443", + fail_count=3, + region="US", + anonymous="high", + source="freeProxy01", + check_count=10, + last_status=True, + last_time="2024-01-01 00:00:00", + https=True, + ) + assert p.proxy == "5.6.7.8:443" + assert p.fail_count == 3 + assert p.region == "US" + assert p.anonymous == "high" + assert p.source == "freeProxy01" + assert p.check_count == 10 + assert p.last_status is True + assert p.last_time == "2024-01-01 00:00:00" + assert p.https is True + + def test_source_with_slash(self): + """source 含 / 时应被拆分为列表,读回时用 / 连接""" + p = Proxy("1.2.3.4:8080", source="freeProxy01/freeProxy02") + assert p.source == "freeProxy01/freeProxy02" + + +class TestProxySerialization: + """序列化 / 反序列化测试""" + + def test_to_dict_keys(self): + p = Proxy("1.2.3.4:8080") + d = p.to_dict + expected_keys = {"proxy", "https", "fail_count", "region", "anonymous", + "source", "check_count", "last_status", "last_time"} + assert set(d.keys()) == expected_keys + + def test_to_dict_values(self): + p = Proxy("1.2.3.4:8080", source="test", https=True) + d = p.to_dict + assert d["proxy"] == "1.2.3.4:8080" + assert d["https"] is True + assert d["source"] == "test" + assert d["fail_count"] == 0 + + def test_to_json_is_valid_json(self): + p = Proxy("1.2.3.4:8080", source="test") + j = p.to_json + d = json.loads(j) + assert d["proxy"] == "1.2.3.4:8080" + + def 
test_create_from_json_roundtrip(self): + """to_json -> createFromJson 往返一致性""" + original = Proxy("10.0.0.1:3128", source="freeProxy01/freeProxy02", + https=True, fail_count=2, region="CN") + restored = Proxy.createFromJson(original.to_json) + assert restored.proxy == original.proxy + assert restored.https == original.https + assert restored.fail_count == original.fail_count + assert restored.region == original.region + assert restored.source == original.source + + def test_create_from_json_minimal(self): + """createFromJson 缺少字段时使用默认值""" + j = '{"proxy": "1.2.3.4:8080"}' + p = Proxy.createFromJson(j) + assert p.proxy == "1.2.3.4:8080" + assert p.fail_count == 0 + assert p.https is False + + def test_create_from_json_with_slash_source(self): + """source 含 / 的 JSON 反序列化""" + j = '{"proxy": "1.2.3.4:8080", "source": "freeProxy01/freeProxy02", "https": false}' + p = Proxy.createFromJson(j) + assert p.source == "freeProxy01/freeProxy02" + + def test_to_dict_to_json_consistency(self): + """to_dict 和 to_json 数据一致""" + p = Proxy("1.2.3.4:8080", source="test", https=True, fail_count=1) + d = p.to_dict + j = json.loads(p.to_json) + assert d == j + + +class TestProxySetters: + """setter 测试""" + + def test_fail_count_setter(self): + p = Proxy("1.2.3.4:8080") + p.fail_count = 5 + assert p.fail_count == 5 + + def test_check_count_setter(self): + p = Proxy("1.2.3.4:8080") + p.check_count = 10 + assert p.check_count == 10 + + def test_last_status_setter(self): + p = Proxy("1.2.3.4:8080") + p.last_status = True + assert p.last_status is True + + def test_last_time_setter(self): + p = Proxy("1.2.3.4:8080") + p.last_time = "2024-01-01 12:00:00" + assert p.last_time == "2024-01-01 12:00:00" + + def test_https_setter(self): + p = Proxy("1.2.3.4:8080") + p.https = True + assert p.https is True + + def test_region_setter(self): + p = Proxy("1.2.3.4:8080") + p.region = "US" + assert p.region == "US" + + +class TestProxyAddSource: + """add_source 测试""" + + def test_add_source(self): + p = 
Proxy("1.2.3.4:8080", source="src1") + p.add_source("src2") + assert "src1" in p.source + assert "src2" in p.source + + def test_add_source_dedup(self): + """重复 source 不应重复添加""" + p = Proxy("1.2.3.4:8080", source="src1") + p.add_source("src1") + assert p.source.count("src1") == 1 + + def test_add_source_empty_string(self): + """空字符串不应添加""" + p = Proxy("1.2.3.4:8080", source="src1") + p.add_source("") + assert p.source == "src1" + + def test_add_source_none(self): + """None 不应添加""" + p = Proxy("1.2.3.4:8080", source="src1") + p.add_source(None) + assert p.source == "src1" \ No newline at end of file diff --git a/tests/unit/test_validator.py b/tests/unit/test_validator.py new file mode 100644 index 000000000..25f374078 --- /dev/null +++ b/tests/unit/test_validator.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testValidator.py + Description : formatValidator正则测试 + Author : JHao + date: 2026/5/28 +------------------------------------------------- + Change Activity: + 2026/05/28: +------------------------------------------------- +""" +__author__ = 'JHao' + +import re +import pytest + +# 直接导入 IP_REGEX 和 formatValidator,不导入整个 validator 模块(避免模块级副作用) +from helper.validator import IP_REGEX, formatValidator + + +class TestIPRegex: + + @pytest.mark.parametrize("proxy", [ + "1.2.3.4:8080", + "192.168.1.1:3128", + "10.0.0.1:80", + "255.255.255.255:65535", + "0.0.0.0:1", + "1.2.3.4:99999", # regex 不校验端口范围 + "999.1.1.1:80", # regex 不校验 IP 范围 + "user:pass@1.2.3.4:8080", + "admin:secret@192.168.1.1:443", + ]) + def test_valid_proxy_format(self, proxy): + assert IP_REGEX.fullmatch(proxy) is not None, f"应匹配: {proxy}" + + @pytest.mark.parametrize("proxy", [ + "", + "abc", + "1.2.3.4", + "1.2.3.4:", + ":8080", + "1.2.3.4:abc", + "1.2.3.4:8080:extra", + "host:8080", + ]) + def test_invalid_proxy_format(self, proxy): + assert IP_REGEX.fullmatch(proxy) is None, f"不应匹配: {proxy}" + + +class TestFormatValidator: + + 
@pytest.mark.parametrize("proxy", [ + "1.2.3.4:8080", + "192.168.1.1:3128", + "user:pass@10.0.0.1:80", + ]) + def test_valid_returns_true(self, proxy): + assert formatValidator(proxy) is True + + @pytest.mark.parametrize("proxy", [ + "", + "abc", + "1.2.3.4", + ]) + def test_invalid_returns_false(self, proxy): + assert formatValidator(proxy) is False \ No newline at end of file diff --git a/tox.ini b/tox.ini new file mode 100644 index 000000000..aa50babb5 --- /dev/null +++ b/tox.ini @@ -0,0 +1,11 @@ +[tox] +envlist = py38,py39,py310,py311 +skip_missing_interpreters = true + +[testenv] +skip_install = true +recreate = true +deps = + -r requirements.txt + -r requirements-test.txt +commands = pytest \ No newline at end of file diff --git a/Util/__init__.py b/util/__init__.py similarity index 56% rename from Util/__init__.py rename to util/__init__.py index d1c5cc292..4a81052c3 100644 --- a/Util/__init__.py +++ b/util/__init__.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- """ ------------------------------------------------- - File Name: __init__.py.py - Description : - Author : JHao - date: 2016/11/25 + File Name: __init__ + Description : + Author : JHao + date: 2020/7/6 ------------------------------------------------- Change Activity: - 2016/11/25: + 2020/7/6: ------------------------------------------------- -""" \ No newline at end of file +""" +__author__ = 'JHao' diff --git a/util/lazyProperty.py b/util/lazyProperty.py new file mode 100644 index 000000000..f028192d2 --- /dev/null +++ b/util/lazyProperty.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: lazyProperty + Description : + Author : JHao + date: 2016/12/3 +------------------------------------------------- + Change Activity: + 2016/12/3: +------------------------------------------------- +""" +__author__ = 'JHao' + + +class LazyProperty(object): + """ + LazyProperty + explain: http://www.spiderpy.cn/blog/5/ + """ + + def __init__(self, func): 
+        self.func = func
+
+    def __get__(self, instance, owner):
+        if instance is None:
+            return self
+        else:
+            value = self.func(instance)
+            setattr(instance, self.func.__name__, value)
+            return value
diff --git a/util/singleton.py b/util/singleton.py
new file mode 100644
index 000000000..1abb7a7c3
--- /dev/null
+++ b/util/singleton.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+   File Name:     singleton
+   Description :
+   Author :       JHao
+   date:          2016/12/3
+-------------------------------------------------
+   Change Activity:
+                   2016/12/3:
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+
+class Singleton(type):
+    """
+    Singleton Metaclass
+
+    Every class built with this metaclass is instantiated at most once;
+    later calls return the cached instance from ``_inst``.
+    """
+
+    # class -> its single instance, shared by all classes using this metaclass
+    _inst = {}
+
+    def __call__(cls, *args, **kwargs):
+        """
+        Return the cached instance of ``cls``, creating it on first call.
+        Arguments (positional and keyword) are only used by that first call.
+        """
+        if cls not in cls._inst:
+            cls._inst[cls] = super(Singleton, cls).__call__(*args, **kwargs)
+        return cls._inst[cls]
diff --git a/util/six.py b/util/six.py
new file mode 100644
index 000000000..d31e12138
--- /dev/null
+++ b/util/six.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+   File Name:     six
+   Description :
+   Author :       JHao
+   date:          2020/6/22
+-------------------------------------------------
+   Change Activity:
+                   2020/6/22:
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+import sys
+
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+    def iteritems(d, **kw):
+        return iter(d.items(**kw))
+else:
+    def iteritems(d, **kw):
+        return d.iteritems(**kw)
+
+if PY3:
+    from urllib.parse import urlparse
+else:
+    from urlparse import urlparse
+
+if PY3:
+    try:
+        from importlib import reload as reload_six
+    except ImportError:
+        from imp import reload as reload_six
+else:
+    reload_six = reload
+
+if PY3:
+    from queue import Empty, Queue
+else:
+    from Queue import Empty, Queue
+
+
+def withMetaclass(meta, *bases):
+    """Create a base class with a metaclass."""
+
+    # This requires a bit of explanation: the basic
idea is to make a dummy
+    # metaclass for one level of class instantiation that replaces itself with
+    # the actual metaclass.
+    class MetaClass(meta):
+
+        def __new__(cls, name, this_bases, d):
+            return meta(name, bases, d)
+
+    return type.__new__(MetaClass, 'temporary_class', (), {})
diff --git a/util/webRequest.py b/util/webRequest.py
new file mode 100644
index 000000000..97164773a
--- /dev/null
+++ b/util/webRequest.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+   File Name:     WebRequest
+   Description :  Network Requests Class
+   Author :       J_hao
+   date:          2017/7/31
+-------------------------------------------------
+   Change Activity:
+                   2017/7/31:
+-------------------------------------------------
+"""
+__author__ = 'J_hao'
+
+from requests.models import Response
+from lxml import etree
+import requests
+import random
+import time
+
+from handler.logHandler import LogHandler
+
+requests.packages.urllib3.disable_warnings()
+
+
+class WebRequest(object):
+    """Thin wrapper around ``requests``: random User-Agent plus retry loop."""
+    name = "web_request"
+
+    def __init__(self, *args, **kwargs):
+        self.log = LogHandler(self.name, file=False)
+        self.response = Response()
+
+    @property
+    def user_agent(self):
+        """
+        Return a User-Agent string picked at random from a fixed list.
+        :return: one User-Agent string
+        """
+        ua_list = [
+            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
+            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
+            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
+            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
+            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
+            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
+            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
+            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
+        ]
+        return random.choice(ua_list)
+
+    @property
+    def header(self):
+        """
+        Default request headers; a new dict is built on every access.
+        :return: dict of header name -> value
+        """
+        return {'User-Agent': self.user_agent,
+                'Accept': '*/*',
+                'Connection': 'keep-alive',
+                'Accept-Language': 'zh-CN,zh;q=0.8'}
+
+    def get(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
+        """
+        Send a GET request, retrying whenever requests raises.
+        :param url: target url
+        :param header: extra headers merged over the defaults (dict only)
+        :param retry_time: total number of attempts
+        :param retry_interval: seconds to sleep between attempts
+        :param timeout: network timeout passed to requests
+        :return: self; the result is available as self.response
+        """
+        headers = self.header
+        if header and isinstance(header, dict):
+            headers.update(header)
+        while True:
+            try:
+                self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs)
+                return self
+            except Exception as e:
+                self.log.error("requests: %s error: %s" % (url, str(e)))
+                retry_time -= 1
+                if retry_time <= 0:
+                    # Attempts exhausted: expose a fresh placeholder Response
+                    # (status_code forced to 200) rather than raising.
+                    resp = Response()
+                    resp.status_code = 200
+                    self.response = resp
+                    return self
+                self.log.info("retry %s second after" % retry_interval)
+                time.sleep(retry_interval)
+
+    def post(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
+        """
+        Send a POST request, retrying whenever requests raises.
+        :param url: target url
+        :param header: extra headers merged over the defaults (dict only)
+        :param retry_time: total number of attempts
+        :param retry_interval: seconds to sleep between attempts
+        :param timeout: network timeout passed to requests
+        :return: self; the result is available as self.response
+        """
+        headers = self.header
+        if header and isinstance(header, dict):
+            headers.update(header)
+        while True:
+            try:
+                self.response = requests.post(url, headers=headers, timeout=timeout, *args, **kwargs)
+                return self
+            except Exception as e:
+                self.log.error("requests: %s error: %s" % (url, str(e)))
+                retry_time -= 1
+                if retry_time <= 0:
+                    # Attempts exhausted: expose a fresh placeholder Response
+                    # (status_code forced to 200) rather than raising.
+                    resp = Response()
+                    resp.status_code = 200
+                    self.response = resp
+                    return self
+                self.log.info("retry %s second after" % retry_interval)
+                time.sleep(retry_interval)
+
+    @property
+    def tree(self):
+        """Parse the response body with lxml; None when the body is empty."""
+        if not self.response.content:
+            return None
+        return etree.HTML(self.response.content)
+
+    @property
+    def text(self):
+        """Response body as text (delegates to the requests Response)."""
+        return self.response.text
+
+    @property
+    def json(self):
+        """Response body parsed as JSON; logs the error and returns {} on failure."""
+        try:
+            return self.response.json()
+        except Exception as e:
+            self.log.error(str(e))
+            return {}