Skip to content

Commit e520954

Browse files
author
tuntun
committed
一个最基础的Scrapy抓取示例
1 parent b3c95bf commit e520954

5 files changed

Lines changed: 64 additions & 5 deletions

File tree

myspiders/blog/blog/items.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,4 +11,6 @@
1111
class BlogItem(scrapy.Item):
    """Scraped record for a single blog article.

    Declares the three fields an item of this type may carry; any other
    key is rejected by ``scrapy.Item``.
    """

    # Headline of the article.
    title = scrapy.Field()
    # URL of the article.
    link = scrapy.Field()
    # Summary text of the article.
    description = scrapy.Field()

myspiders/blog/blog/pipelines.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,22 @@
44
#
55
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
66
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7-
7+
import json
88

99
class BlogPipeline(object):
    """Item pipeline that flattens the ``description`` field of each item.

    The ``description`` value arrives as a sequence of text fragments; this
    pipeline concatenates them into one string and passes the item on.
    """

    def process_item(self, item, spider):
        """Join the ``description`` fragments of *item* into one string.

        Args:
            item: Mapping-like scraped item. A missing or empty
                ``description`` is left untouched.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, modified in place.
        """
        # .get() instead of [] so an item without a description passes
        # through instead of raising KeyError.
        fragments = item.get('description')
        if fragments:
            item['description'] = ''.join(fragments)
        return item

    def close_spider(self, spider):
        """Pipeline hook invoked by Scrapy when the spider is closed.

        Nothing to release: this pipeline holds no open resources.
        """

    def spider_closed(self, spider):
        """Backward-compatible alias for :meth:`close_spider`.

        Scrapy does not call a pipeline method with this name; it is kept
        only so existing explicit callers keep working.
        """
        self.close_spider(spider)

myspiders/blog/blog/settings.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,9 +61,9 @@
6161

6262
# Configure item pipelines
6363
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
64-
#ITEM_PIPELINES = {
65-
# 'blog.pipelines.SomePipeline': 300,
66-
#}
64+
# Enable the project's item pipeline. The integer is the pipeline's order
# value: Scrapy runs enabled pipelines from lower to higher values.
ITEM_PIPELINES = {
    'blog.pipelines.BlogPipeline': 300,
}
6767

6868
# Enable and configure the AutoThrottle extension (disabled by default)
6969
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,27 @@
1+
from scrapy.spiders import Spider
2+
from scrapy.selector import Selector
3+
4+
from blog.items import BlogItem
5+
6+
7+
class BlogIndexSpider(Spider):
    """Spider ``index``: collects the article summaries listed on the start page."""

    name = "index"
    allowed_domains = ["tantengvip.com"]
    start_urls = [
        "http://www.tantengvip.com/",
    ]

    def parse(self, response):
        """Turn every ``<article>`` under ``div#content`` into a ``BlogItem``.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Returns:
            A list with one ``BlogItem`` per matched article node, in
            document order.
        """
        article_nodes = Selector(response).xpath('//div[@id="content"]/article')
        return [self._item_from_article(node) for node in article_nodes]

    @staticmethod
    def _item_from_article(article):
        """Build a ``BlogItem`` from one article selector.

        Every field stores the raw ``extract()`` result, i.e. a list of
        matched strings, not a single string.
        """
        return BlogItem(
            title=article.xpath('header/h1[@class="entry-title"]/a/text()').extract(),
            link=article.xpath('header/h1[@class="entry-title"]/a/@href').extract(),
            # Only text directly inside the first two <p> children is taken.
            description=article.xpath('div[@class="entry-content"]/p[position()<3]/text()').extract(),
        )

myspiders/blog/index.json

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
[{"link": ["http://www.tantengvip.com/2015/11/windows-python27-scrapy/"], "description": "\u7531\u4e8eScrapy\u76ee\u524d\u5bf9Python2.7\u7684\u652f\u6301\u6700\u597d\uff0cPython3\u4e0b\u53ef\u80fd\u4f1a\u6709\u95ee\u9898\uff0c\u6240\u4ee5\u8fd8\u662f\u5728Python2.7\u4e0b\u88c5Scrapy\u6a21\u5757\u3002\u5728Mac\u4e0a\u81ea\u5e26\u7684Python2.7\u73af\u5883\u5b89\u88c5Scrapy\u6a21\u5757\uff0c\u4e00\u8def\u987a\u98ce\uff0c\u76f4\u63a5pip install -i http://pypi.douban.com/simple Scrapy,\u81ea\u52a8\u5b89\u88c5\u6240\u9700\u7684\u6a21\u5757\uff0c\u4e00\u5207OK.", "title": ["\u5728windows\u4e0a\u5b89\u88c5Python2.7 Scrapy\u6a21\u5757"]},
2+
{"link": ["http://www.tantengvip.com/2015/11/redis-set/"], "description": "\u4e3a\u4e86\u5e7f\u5927\u4eb2\u4eec\u80fd\u591f\u597d\u597d\u5241\u624b\uff0c\u6211\u4eec\u4e5f\u662f\u62fc\u4e86\uff0c\u201c\u53cc\u5341\u4e00\u201d\u4e00\u8fc7\uff0c\u6211\u4eec\u5c31\u5f00\u59cb\u51c6\u5907\u201c\u53cc\u5341\u4e8c\u201d\u4e86\uff0c\u5927\u4fc3\u6d3b\u52a8\u6709\u5f88\u591a\uff0c\u672c\u6587\u4ee5\u201c\u4e0b\u5355\u6709\u793c\u201d\u7684\u529f\u80fd\u5b9e\u73b0\uff0c\u8bb2\u8bb2redis\u7684\u51e0\u4e2a\u7528\u6cd5\uff0c\u5982redis\u96c6\u5408\u3001\u81ea\u589e\u7684\u5177\u4f53\u5e94\u7528\u3002\u8fd9\u91cc\u4ec5\u4ec5\u4e3e\u4e2a\u5c0f\u4f8b\u5b50\uff0c\u5728\u4e00\u4e2a\u5927\u578b\u7f51\u7ad9\u591a\u4e2a\u7ec8\u7aef\u4e2d\uff0c\u8981\u8003\u8651\u5404\u79cd\u4e0d\u540c\u7684\u60c5\u51b5\u548c\u5e94\u7528\u573a\u666f\u3002 ", "title": ["Redis\u96c6\u5408\u7684\u5e94\u7528:\u4e0b\u5355\u6709\u793c\u529f\u80fd\u5b9e\u73b0"]},
3+
{"link": ["http://www.tantengvip.com/2015/11/flushdb-redis/"], "description": "\u4f7f\u7528flushdb\u547d\u4ee4\u53ef\u4ee5\u6e05\u9664redis\u5176\u4e2d\u4e00\u4e2a\u6570\u636e\u5e93\u7684\u6240\u6709\u6570\u636e\uff0cflushall\u547d\u4ee4\u6e05\u9664\u6574\u4e2aredis\u6240\u6709\u6570\u636e\u5e93\u7684\u6570\u636e\u3002", "title": ["flushdb\u547d\u4ee4\u6e05\u9664REDIS\u6240\u6709\u6570\u636e"]},
4+
{"link": ["http://www.tantengvip.com/2015/11/python-singleton/"], "description": "\u5f53\u6211\u4eec\u7406\u89e3\u4e86Python\u7684__new__\u65b9\u6cd5\u540e\uff0c\u6211\u4eec\u8fd8\u53ef\u4ee5\u5229\u7528\u5b83\u6765\u505a\u4e00\u4e9b\u5176\u4ed6\u6709\u8da3\u7684\u4e8b\u60c5\uff0c\u6bd4\u5982\u5b9e\u73b0\u8bbe\u8ba1\u6a21\u5f0f\u4e2d\u7684\u5355\u4f8b\u6a21\u5f0f(singleton) \u3002\u56e0\u4e3a\u7c7b\u6bcf\u4e00\u6b21\u5b9e\u4f8b\u5316\u540e\u4ea7\u751f\u7684\u8fc7\u7a0b\u90fd\u662f\u901a\u8fc7__new__\u6765\u63a7\u5236\u7684\uff0c\u6240\u4ee5\u901a\u8fc7\u91cd\u8f7d__new__\u65b9\u6cd5\uff0c\u6211\u4eec\u53ef\u4ee5\u5f88\u7b80\u5355\u7684\u5b9e\u73b0\u5355\u4f8b\u6a21\u5f0f\u3002", "title": ["Python3\u5355\u4f8b\u6a21\u5f0f\u793a\u4f8b"]},
5+
{"link": ["http://www.tantengvip.com/2015/11/pip-mirror/"], "description": "\u4f7f\u7528pip\u5b89\u88c5python\u5305\u7531\u4e8e\u5b98\u7f51\u7ecf\u5e38\u88ab\u5899\u65e0\u6cd5\u5b89\u88c5\uff0c\u53ef\u4ee5\u4f7f\u7528pip\u955c\u50cf\u65b9\u5f0f\u5b89\u88c5python\u5305\u3002 ", "title": ["\u4f7f\u7528pip\u955c\u50cf\u65b9\u5f0f\u5b89\u88c5python\u5305"]},
6+
{"link": ["http://www.tantengvip.com/2015/11/mac-install-redis/"], "description": "\u53bbredis\u5b98\u7f51()\u81ea\u884c\u4e0b\u8f7d\u5b89\u88c5\u5305\u89e3\u538b\u7f29\u5230\u672c\u5730\u6587\u4ef6\u5939\uff0c\u6bd4\u5982\u653e\u5728Mac\u5e94\u7528\u7a0b\u5e8f\u6587\u4ef6\u5939(/Applications/)\uff0c\u5728\u7ec8\u7aef\u8fdb\u5165redis\u6587\u4ef6\u5939\u3002", "title": ["Mac\u4e0b\u5b89\u88c5Redis"]},
7+
{"link": ["http://www.tantengvip.com/2015/11/httpd-vhost-domain-alias/"], "description": "\u5728Apache\u7684httpd-vhosts\u53ef\u4ee5\u914d\u7f6e\u865a\u62df\u4e3b\u673a\u57df\u540d\uff0c\u4e5f\u53ef\u4ee5\u8bbe\u7f6e\u57df\u540d\u522b\u540d\uff0c\u5982\u57df\u540dwww.yii2.com\uff0c\u53ef\u4ee5\u7ed9\u5b83\u8bbe\u7f6e\u4e00\u4e2a\u6216\u591a\u4e2a\u57df\u540d\u522b\u540d\uff0c\u901a\u8fc7www.yii3.com,www.yii4.com\u8bbf\u95ee\u3002 ", "title": ["httpd-vhosts\u8bbe\u7f6e\u57df\u540d\u522b\u540d"]},
8+
{"link": ["http://www.tantengvip.com/2015/11/php-abstract-interface-demo/"], "description": "\u62bd\u8c61\u7c7b(abstract class)\u548c\u63a5\u53e3(interface)\u662f\u9762\u5411\u5bf9\u8c61\u5f88\u91cd\u8981\u7684\u6982\u5ff5\uff0c\u4ed6\u4eec\u5f88\u76f8\u4f3c\uff0c\u90fd\u662f\u5b9a\u4e49\u8981\u5b9e\u73b0\u7684\u65b9\u6cd5\uff0c\u4f46\u53c8\u6709\u4e0d\u540c\u7684\u4f7f\u7528\u573a\u666f\uff0cPHP\u7684\u62bd\u8c61\u7c7b\u548c\u63a5\u53e3\u6709\u4ec0\u4e48\u533a\u522b\uff0c\u770b\u4e0b\u9762\u7684\u793a\u4f8b\u3002 ", "title": ["PHP\u62bd\u8c61\u7c7b\u548c\u63a5\u53e3\u793a\u4f8b\u548c\u533a\u522b"]},
9+
{"link": ["http://www.tantengvip.com/2015/11/python-ebook/"], "description": "\u770b\u4e91\u4e0a\u51e0\u672c\u5b66python\u7684\u514d\u8d39\u7535\u5b50\u4e66\uff1a\u94fe\u63a5\uff1a", "title": ["\u5b66python\u7684\u51e0\u672c\u7535\u5b50\u4e66"]},
10+
{"link": ["http://www.tantengvip.com/2015/11/mac-yii2-migrate/"], "description": "Yii2 migrate\u7684\u6982\u5ff5\uff1a\u4e0b\u9762\u662f\u5728Mac\u4e0b\u4f7f\u7528 Yii 2 migrate \u547d\u4ee4\u5b89\u88c5\u521d\u59cb\u5316\u6570\u636e\u5e93\uff1a", "title": ["Mac\u4e0bYii 2 migrate\u547d\u4ee4\u64cd\u4f5c\u6570\u636e\u5e93"]},
11+
{"link": ["http://www.tantengvip.com/2015/11/html5-localstorage/"], "description": "\u672c\u5730\u5b58\u50a8\u89e3\u51b3\u65b9\u6848\u5f88\u591a\uff0c\u6bd4\u5982Flash SharedObject\u3001Google Gears\u3001Cookie\u3001DOM Storage\u3001User Data\u3001window.name\u3001Silverlight\u3001Open Database\u7b49\u3002", "title": ["HTML5\u7684\u672c\u5730\u5b58\u50a8\u65b9\u6848localStorage"]},
12+
{"link": ["http://www.tantengvip.com/2015/11/github-image-hosting/"], "description": "\u5728\u77e5\u4e4e\u4e0a\u770b\u5230\u4e00\u4e2a\u56de\u7b54\uff0c\u95ee\u9898\u662f\u300e\u300f\uff0c\u7adf\u7136\u6709\u4eba\u8bf4\u53ef\u4ee5\u628aGithub\u5f53\u514d\u8d39\u56fe\u5e8a\uff0c\u540e\u6765\u4e00\u60f3\uff0c\u786e\u5b9e\u53ef\u4ee5\u554a\uff0c\u641c\u7d22\u4e00\u4e0b\u770b\u679c\u7136\u6709\u65b9\u6cd5\uff0c\u5176\u5b9e\u539f\u7406\u4e5f\u5f88\u7b80\u5355\u3002 ", "title": ["\u628aGithub\u5f53\u514d\u8d39\u56fe\u5e8a"]},
13+
{"link": ["http://www.tantengvip.com/2015/11/wordpress-url-get-route/"], "description": "WordPress\u6dfb\u52a0\u8def\u7531\u89c4\u5219\u51fd\u6570add_rewrite_rule\uff0c\u4f7f\u7528\u793a\u4f8b\uff1a\u8fd9\u6837url www.xx.com/haha/tuntun \uff0c\u5b9e\u9645\u4e0a\u5c31\u4f1a\u8bbf\u95ee /index.php?page_id=8&myname=tuntun", "title": ["WordPress\u53c2\u6570\u4f20\u9012\u548c\u8def\u7531\u89c4\u5219"]},
14+
{"link": ["http://www.tantengvip.com/2015/11/wordpress-redis/"], "description": "\u5728WordPress\u6d4b\u8bd5\u4e86\u4e00\u4e0b\u4f7f\u7528Redis\uff0c\u505a\u4e86\u4e2a\u6d4b\u8bd5\u63d2\u4ef6\uff0c\u8fd9\u4e2a\u63d2\u4ef6\u4e0d\u53ef\u6b63\u5f0f\u7528\u4e8e\u9879\u76ee\uff01\u8fd9\u4e2a\u63d2\u4ef6\u4e0d\u53ef\u6b63\u5f0f\u7528\u4e8e\u9879\u76ee\uff01\u8fd9\u4e2a\u63d2\u4ef6\u4e0d\u53ef\u6b63\u5f0f\u7528\u4e8e\u9879\u76ee\uff01\u8fd9\u53ea\u662f\u6d4b\u8bd5\u5728WordPress\u4e2d\u5b9e\u73b0Redis\u7684\u4f7f\u7528\uff0c\u4eceYii2\u7684\u4e00\u4e2aRedis\u7c7b\u7248\u4ee3\u7801\u8fc7\u6765\uff0c\u770b\u80fd\u4e0d\u80fd\u6210\u529f\u5b9e\u73b0Redis\uff0c\u7ed3\u679c\u662fOK\u7684\u3002\u8fd9\u4ec5\u4ec5\u662f \u6d4b\u8bd5\u4ee3\u7801\u3002", "title": ["\u5728WordPress\u4e2d\u4f7f\u7528Redis"]},
15+
{"link": ["http://www.tantengvip.com/2015/11/serialize-json-diff/"], "description": "\u5728PHP\u4e2d\uff0cserialize\u548cjson\u4e24\u79cd\u65b9\u5f0f\u5bf9\u4e00\u4e2a\u5bf9\u8c61\u6216\u6570\u7ec4\u8fdb\u884c\u5e8f\u5217\u5316\u6216\u53cd\u5e8f\u5217\u5316\u6709\u4ec0\u4e48\u533a\u522b\u5462\uff1f\u5047\u8bbe\u4e00\u4e2a\u5bf9\u8c61\u548c\u4e00\u4e2a\u6570\u7ec4\uff1a", "title": ["PHP\u4e2dserialize\u548cjson\u5e8f\u5217\u5316\u4e0e\u53cd\u5e8f\u5217\u5316\u7684\u533a\u522b"]},
16+
{"link": ["http://www.tantengvip.com/2015/11/sourcetree-commit-slowly/"], "description": "SourceTree\u63d0\u4ea4\u4ee3\u7801\u5361\u6b7b\uff0c\u7279\u522b\u662f\u4e00\u6b21\u63d0\u4ea4\u7684\u4ee3\u7801\u6587\u4ef6\u5f88\u591a\u7684\u65f6\u5019\uff0c\u8fd9\u4e2a\u65f6\u5019\u5efa\u8bae\u7528Git\u547d\u4ee4\u884c\u7684\u65b9\u5f0f\u63d0\u4ea4\uff0c\u5c31\u4e0d\u4f1a\u51fa\u73b0\u5361\u6b7b\u7684\u60c5\u51b5\u3002", "title": ["\u89e3\u51b3SourceTree\u8fd0\u884c\u6162\u7684\u65b9\u6cd5"]}]

0 commit comments

Comments
 (0)