Skip to content

Commit 4c9317c

Browse files
author
lichuang
committed
add baidu_search
1 parent 01a756e commit 4c9317c

File tree

8 files changed

+1210
-0
lines changed

8 files changed

+1210
-0
lines changed

baidu_search/baidu_search/__init__.py

Whitespace-only changes.

baidu_search/baidu_search/items.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Define here the models for your scraped items
4+
#
5+
# See documentation in:
6+
# http://doc.scrapy.org/en/latest/topics/items.html
7+
8+
import scrapy
9+
10+
11+
class BaiduSearchItem(scrapy.Item):
    """Container for one scraped Baidu search result.

    No fields are declared yet; add ``scrapy.Field()`` attributes here
    (e.g. ``title = scrapy.Field()``) as the spider starts emitting items.
    """
    pass
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Define your item pipelines here
4+
#
5+
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
6+
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7+
8+
9+
class BaiduSearchPipeline(object):
    """Pass-through item pipeline.

    Currently forwards every item unchanged; this is the hook point for
    future cleaning, validation, or storage logic.
    """

    def process_item(self, item, spider):
        """Return *item* unmodified (``spider`` is unused for now)."""
        return item

baidu_search/baidu_search/result.html

Lines changed: 1041 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Scrapy settings for baidu_search project
4+
#
5+
# For simplicity, this file contains only settings considered important or
6+
# commonly used. You can find more settings consulting the documentation:
7+
#
8+
# http://doc.scrapy.org/en/latest/topics/settings.html
9+
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10+
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11+
12+
# Project identity.
BOT_NAME = 'baidu_search'

# Where Scrapy discovers spiders, and where `scrapy genspider` puts new ones.
SPIDER_MODULES = ['baidu_search.spiders']
NEWSPIDER_MODULE = 'baidu_search.spiders'

# Present ourselves as a desktop Chrome browser so Baidu serves the normal
# search-result markup rather than a bot/verification page.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'

# Deliberately ignore robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Abort slow downloads quickly (seconds); landing pages vary wildly in speed.
DOWNLOAD_TIMEOUT = 5

# --- Commonly tuned settings, left at their Scrapy defaults ---------------
# See http://doc.scrapy.org/en/latest/topics/settings.html

# Concurrency / politeness:
#CONCURRENT_REQUESTS = 32
#DOWNLOAD_DELAY = 3
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies / telnet console:
#COOKIES_ENABLED = False
#TELNETCONSOLE_ENABLED = False

# Default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Middlewares / extensions / pipelines:
#SPIDER_MIDDLEWARES = {
#    'baidu_search.middlewares.MyCustomSpiderMiddleware': 543,
#}
#DOWNLOADER_MIDDLEWARES = {
#    'baidu_search.middlewares.MyCustomDownloaderMiddleware': 543,
#}
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
#ITEM_PIPELINES = {
#    'baidu_search.pipelines.SomePipeline': 300,
#}

# AutoThrottle (http://doc.scrapy.org/en/latest/topics/autothrottle.html):
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# HTTP response caching:
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# This package will contain the spiders of your Scrapy project
2+
#
3+
# Please refer to the documentation for information on how to create and manage
4+
# your spiders.
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# coding:utf-8
2+
3+
import sys

# Python 2 only: force UTF-8 as the default string encoding so printing the
# Chinese query/page text does not raise UnicodeEncodeError. On Python 3 the
# builtin `reload` and `sys.setdefaultencoding` no longer exist, so the
# unguarded original would crash at import time; Python 3 is UTF-8-clean
# and needs no hack.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 — `reload` is a builtin on Python 2
    sys.setdefaultencoding("utf-8")

import scrapy
from w3lib.html import remove_tags
10+
class BaiduSearchSpider(scrapy.Spider):
    """Crawl Baidu search results for a fixed query and visit each hit.

    ``parse`` extracts the title and abstract of every organic result on
    the search page and follows the result link; ``parse_url`` reports the
    landing page's visible-text length together with the SERP metadata.
    """

    name = "baidu_search"
    allowed_domains = ["baidu.com"]
    start_urls = [
        # wd = the search query ("machine learning" in Chinese).
        "https://www.baidu.com/s?wd=机器学习",
    ]

    def parse(self, response):
        """Yield one Request per organic result, carrying title/abstract in meta."""
        containers = response.selector.xpath('//div[contains(@class, "c-container")]')
        for container in containers:
            # Some c-container divs (ads, widgets) carry no h3/a link; the
            # original extract()[0] raised IndexError on them — skip instead.
            links = container.xpath('h3/a')
            if not links:
                continue
            href = links.xpath('@href').extract()[0]
            title = remove_tags(links.extract()[0])
            c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
            # Not every result has an abstract block; default to empty.
            abstract = remove_tags(c_abstract[0]) if c_abstract else ""
            request = scrapy.Request(href, callback=self.parse_url)
            request.meta['title'] = title
            request.meta['abstract'] = abstract
            yield request

    def parse_url(self, response):
        """Print the fetched URL, its SERP title/abstract, and body text length."""
        print("url:", response.url)
        print("title:", response.meta['title'])
        print("abstract:", response.meta['abstract'])
        # Strip all tags from <body> to approximate the visible text.
        content = remove_tags(response.selector.xpath('//body').extract()[0])
        print("content_len:", len(content))

baidu_search/scrapy.cfg

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Automatically created by: scrapy startproject
2+
#
3+
# For more information about the [deploy] section see:
4+
# https://scrapyd.readthedocs.org/en/latest/deploy.html
5+
6+
[settings]
7+
default = baidu_search.settings
8+
9+
[deploy]
10+
#url = http://localhost:6800/
11+
project = baidu_search

0 commit comments

Comments
 (0)