diff --git a/.gitignore b/.gitignore index 0af075f75..cd33b6188 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ target/* *.iml +out/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..c7c99f406 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,4 @@ +language: java +jdk: + - oraclejdk7 + - openjdk6 diff --git a/README.md b/README.md index 723adf42f..66be4c46c 100644 --- a/README.md +++ b/README.md @@ -1,87 +1,157 @@ webmagic --------- -####*一个网络爬虫工具包* +[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) -webmagic的发起源于工作中的需要,其定位是帮助开发者更便捷的开发一个垂直的网络爬虫。webmagic可以便捷的使用xpath和正则表达式进行链接和内容的提取,对于有Java和xpath或者正则基础的开发者,只需编写少量代码即可完成一个定制爬虫。 +>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。作者曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。 -###哲学### +>web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录、抓取静态资源等。 -* Write Less, Do more. 
+webmagic的主要特色: - webmagic是一个开发者的工具包,它的目标是让开发者可以通过更少的代码,实现一个高质量的爬虫。webmagic内部还集成了一些常见的垂直性爬虫的功能,例如针对页面正文的Readability技术,可以直接对页面的正文进行智能提取。 - - 以下是爬取oschina博客的一段代码: - - Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); +* 核心简单但是涵盖爬虫的全部流程,灵活而强大,也是学习爬虫入门的好材料。 +* 提供丰富的抽取页面API。 +* 无配置,但是可通过POJO+注解形式实现一个爬虫。 +* 支持多线程。 +* 支持分布式。 +* 支持爬取js动态渲染的页面。 +* 无框架依赖,可以灵活的嵌入到项目中去。 + +webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: + +python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) + +Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) -* 简单可用 +## 快速开始 - webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),是一个完整的爬虫框架。但是与其他Full-Stack的框架不同,webmagic只引入少量约定,大部分功能都通过简单的API调用完成,目的是尽量降低开发者的学习成本。webmagic以jar包的形式存在,并且不依赖任何框架,在程序可以随处进行调用。 +### 使用maven -* 灵活性 +webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: - 参考scrapy的设计,webmagic将爬虫的扩展点分为processor、schedular、downloader、pipeline三个模块,可以通过扩展这些接口实现强大的扩展功能。如可以通过多个Spider实现多线程抓取;可以通过扩展schedular实现断点续传乃至于分布式爬虫;可以通过扩展pipeline实现业务可定制的持久化功能。 + git clone https://github.com/code4craft/webmagic.git + cd webmagic + mvn clean install + +安装后,在项目中添加对应的依赖即可使用webmagic: + + + us.codecraft + webmagic-core + 0.2.0 + + + us.codecraft + webmagic-extension + 0.2.0 + + +#### 项目结构 ------- +webmagic主要包括两个包: -###Get Started +* **webmagic-core** + + webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 + +* **webmagic-extension** -webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫例子是这样的: + webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 + +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: + +* **webmagic-saxon** + + webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。 - Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); +* **webmagic-selenium** + + webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。 -其中SimplePageProcessor实现如下: 
+在项目中,你可以根据需要依赖不同的包。 - public class SimplePageProcessor implements PageProcessor { +### 不使用maven - private String urlPattern; +不使用maven的用户,可以下载这个二进制打包版本(感谢[oschina](http://www.oschina.net/)): - private static final String UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"; + git clone http://git.oschina.net/flashsword20/webmagic-bin.git - private Site site; +在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 - public SimplePageProcessor(String startUrl, String urlPattern) { - this.site = Site.me().setStartUrl(startUrl). - setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); - this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; +### 第一个爬虫 - } +#### 定制PageProcessor + +PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: + + public class OschinaBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net") + .addStartUrl("http://my.oschina.net/flashsword/blog"); @Override public void process(Page page) { - List requests = page.getHtml().as().rs(urlPattern).toStrings(); - page.addTargetRequests(requests); - page.putField("title", page.getHtml().x("//title")); - page.putField("content", page.getHtml().sc()); + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @Override public Site getSite() { return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()) + .pipeline(new ConsolePipeline()).run(); } } ---- 
+这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。 -TODO +Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。 +执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。 - public class OschinaBlogPageProcesser implements PageProcessor { +#### 使用注解 - @Override - public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(http://my\\.oschina\\.net/\\w+/blog/\\d+)[\"']{1}").toStrings(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().sc()); - page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); - } +webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: - @Override - public Site getSite() { - return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Chrome/26.0.1410.65 Safari/537.31"); - } - } + @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") + public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + public static void main(String[] args) { + OOSpider.create( + Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), + new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + } + +这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 + +### 详细文档 + +见[webmagic manual.md](https://github.com/code4craft/webmagic/blob/master/webmagic%20manual.md)。 + +### 示例 + +webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。 + +作者还有一个使用webmagic进行抽取并持久化到数据库的项目[JobHunter](http://git.oschina.net/flashsword20/jobhunter)。这个项目整合了Spring,自定义了Pipeline,使用mybatis进行数据持久化。 + +### 协议 + +webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) diff --git a/pom.xml b/pom.xml index c424910a6..cacce99b3 100644 --- a/pom.xml +++ b/pom.xml @@ -3,68 +3,104 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.0.1-SNAPSHOT + 0.2.0 4.0.0 - + pom webmagic - - - org.apache.httpcomponents - httpclient - 4.2.1 - - - - junit - junit - 4.7 - test - - - - com.google.guava - guava - 13.0.1 - - - - org.apache.commons - commons-lang3 - 3.1 - - - - log4j - log4j - 1.2.17 - - - - commons-collections - commons-collections - 3.2.1 - - - - net.sourceforge.htmlcleaner - htmlcleaner - 2.4 - - - - org.apache.commons - commons-io - 1.3.2 - + + webmagic-core + webmagic-extension/ + webmagic-samples/ + - + + + + junit + junit + 4.7 + test + + + org.apache.httpcomponents + httpclient + 4.2.4 + + + net.sf.saxon + Saxon-HE + 9.5.1-1 + + + log4j + log4j + 
1.2.17 + + + org.apache.commons + commons-lang3 + 3.1 + + + commons-collections + commons-collections + 3.2.1 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.5 + + + org.apache.commons + commons-io + 1.3.2 + + + org.jsoup + jsoup + 1.7.2 + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + UTF-8 + + + + org.apache.maven.plugins + maven-dependency-plugin + 2.8 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + + + + org.apache.maven.plugins maven-resources-plugin + 2.6 UTF-8 @@ -72,6 +108,7 @@ org.apache.maven.plugins maven-source-plugin + 2.2.1 attach-sources @@ -84,6 +121,10 @@ org.apache.maven.plugins maven-javadoc-plugin + 2.9.1 + + UTF-8 + attach-javadocs @@ -96,10 +137,10 @@ org.apache.maven.plugins maven-release-plugin - 2.0-beta-7 + 2.4.1 - \ No newline at end of file + diff --git a/release-note.md b/release-note.md new file mode 100755 index 000000000..d78963ce9 --- /dev/null +++ b/release-note.md @@ -0,0 +1,55 @@ +Release Notes +---- +*2012-8-9* `version:0.2.0` + +此次更新的主题是"方便"(之前的主题是"灵活")。 + +增加了webmagic-extension模块。 + +增加了注解方式支持,可以通过POJO+注解的方式编写一个爬虫,更符合Java开发习惯。以下是抓取一个博客的完整代码: + + @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") + public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), + new ConsolePageModelPipeline(), OschinaBlog.class). 
+ scheduler(new RedisScheduler("127.0.0.1")).thread(5).run(); + } + + } + +增加基于redis的分布式支持。 + +增加XPath2.0语法支持(webmagic-saxon模块)。 + +增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。 + +修复一些已有bug。 + +补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。 + +*2012-7-25* `version:0.1.0` + +第一个稳定版本。 + +修改了若干API,使得可扩展性更强,为每个任务分配一个ID,可以通过ID区分不同任务。 + +重写了Pipeline接口,将抽取结果集包装到ResultItems对象,而不是通用一个Page对象,便于逻辑分离。 + +增加下载的重试机制,支持gzip,支持自定义UA/cookie。 + +增加多线程抓取功能,只需在初始化的时候指定线程数即可。 + +增加jquery形式的CSS Selector API,可以通过`page.getHtml().$("div.body")`形式抽取元素。 + +完善了文档,架构说明:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796),Javadoc:[http://code4craft.github.io/webmagic/docs](http://code4craft.github.io/webmagic/docs)。 \ No newline at end of file diff --git a/webmagic manual.md b/webmagic manual.md new file mode 100644 index 000000000..3bf1ea5e7 --- /dev/null +++ b/webmagic manual.md @@ -0,0 +1,380 @@ +webmagic使用手册 +------ +>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 + +>web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录、抓取静态资源等。 + +>作者黄亿华([code4crafter@gmail.com](code4crafter@gmail.com))目前就职于大众点评,曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。 + +>webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: + +>python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) + +>Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) + +>webmagic遵循[Apache 2.0协议](http://www.apache.org/licenses/LICENSE-2.0.html),你可以自由进行使用和修改。有使用不便或者问题,欢迎在github[提交issue](https://github.com/code4craft/webmagic/issues),或者在[oschina讨论模块](http://www.oschina.net/question)提问。 + +
+ + +## 快速开始 + +### 使用maven + +webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: + + git clone https://github.com/code4craft/webmagic.git + mvn clean install + +安装后,在项目中添加对应的依赖即可使用webmagic: + + + us.codecraft + webmagic-core + 0.2.0 + + + us.codecraft + webmagic-extension + 0.2.0 + + +#### 项目结构 + +webmagic主要包括两个包: + +* **webmagic-core** + + webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 + +* **webmagic-extension** + + webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 + +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: + +* **webmagic-saxon** + + webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。 + +* **webmagic-selenium** + + webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。 + +在项目中,你可以根据需要依赖不同的包。 + +### 不使用maven + +不使用maven的用户,可以下载这个二进制打包版本(感谢[oschina](http://www.oschina.net/)): + + git clone http://git.oschina.net/flashsword20/webmagic-bin.git + +在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 + +### 第一个爬虫 + +#### 定制PageProcessor + +PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: + + public class OschinaBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net") + .addStartUrl("http://my.oschina.net/flashsword/blog"); + + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()) + .pipeline(new ConsolePipeline()).run(); + } + } + 
+这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。 + +Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。 + +执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。 + +#### 使用注解 + +webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: + + @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") + public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + public static void main(String[] args) { + OOSpider.create( + Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), + new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + } + +这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 + +注解的详细使用方式见后文中得webmagic-extension注解模块。 + +
+ + +## webmagic-core + +webmagic-core是爬虫的核心框架,只包括一个爬虫各功能模块的核心功能。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 + +此节部分内容摘自作者的博文 +[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796)。 + +### webmagic-core的模块划分 + +webmagic-core参考了scrapy的模块划分,分为Spider(整个爬虫的调度框架)、Downloader(页面下载)、PageProcessor(链接提取和页面分析)、Scheduler(URL管理)、Pipeline(离线分析和持久化)几部分。只不过scrapy通过middleware实现扩展,而webmagic则通过定义这几个接口,并将其不同的实现注入主框架类Spider来实现扩展。 + +![image](http://code4craft.github.io/images/posts/webmagic.png) +
+ +#### Spider类(核心调度) + +**Spider**是爬虫的入口类,Spider的接口调用采用了链式的API设计,其他功能全部通过接口注入Spider实现,下面是启动一个比较复杂的Spider的例子。 + + Spider.create(sinaBlogProcessor) + .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")) + .pipeline(new FilePipeline()) + .thread(10).run(); + + +Spider的核心处理流程非常简单,代码如下: + + + private void processRequest(Request request) { + Page page = downloader.download(request, this); + if (page == null) { + sleep(site.getSleepTime()); + return; + } + pageProcessor.process(page); + addRequest(page); + for (Pipeline pipeline : pipelines) { + pipeline.process(page, this); + } + sleep(site.getSleepTime()); + } + +#### PageProcessor(页面分析及链接抽取) + +页面分析是垂直爬虫中需要定制的部分。在webmagic-core里,通过实现**PageProcessor**接口来实现定制爬虫。PageProcessor有两个核心方法:public void process(Page page)和public Site getSite() 。 + +* public void process(Page page) + + 通过对**Page**对象的操作,实现爬虫逻辑。Page对象包括两个最重要的方法:addTargetRequests()可以添加URL到待抓取队列,put()可以将结果保存供后续处理。 + Page的数据可以通过Page.getHtml()和Page.getUrl()获取。 + +* public Site getSite() + + **Site**对象定义了爬虫的域名、起始地址、抓取间隔、编码等信息。 + +**Selector**是webmagic为了简化页面抽取开发的独立模块,是webmagic-core的主要着力点。这里整合了CSS Selector、XPath和正则表达式,并可以进行链式的抽取。 + + + //content是用别的爬虫工具抽取到的正文 + List links = page.getHtml() + .$("div.title") //css 选择,Java里虽然很少有$符号出现,不过貌似$作为方法名是合法的 + .xpath("//@href") //提取链接 + .regex(".*blog.*") //正则匹配过滤 + .all(); //转换为string列表 + +webmagic包括一个对于页面正文的自动抽取的类**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。 + +基于Saxon,webmagic提供了XPath2.0语法的支持。XPath2.0语法支持内部函数、逻辑控制等,是一门完整的语言,如果你熟悉XPath2.0语法,倒是不妨一试(需要引入**webmagic-saxon**包)。 + +**webmagic-samples**包里有一些为某个站点定制的PageProcessor,供学习之用。 + +#### Downloader(页面下载) + +**Downloader**是webmagic中下载页面的接口,主要方法: + +* public Page download(Request request, Task task) + + **Request**对象封装了待抓取的URL及其他信息,而Page则包含了页面下载后的Html及其他信息。Task是一个包装了任务对应的Site信息的抽象接口。 + +* public void setThread(int thread) + + 因为Downloader一般会涉及连接池等功能,而这些功能与多线程密切相关,所以定义了此方法。 + 
+目前有几个Downloader的实现: + +* HttpClientDownloader + + 集成了**Apache HttpClient**的Downloader。Apache HttpClient(4.0后整合到HttpCompenent项目中)是强大的Java http下载器,它支持自定义HTTP头(对于爬虫比较有用的就是User-agent、cookie等)、自动redirect、连接复用、cookie保留、设置代理等诸多强大的功能。 + +* SeleniumDownloader + + 对于一些Javascript动态加载的网页,仅仅使用http模拟下载工具,并不能取到页面的内容。这方面的思路有两种:一种是抽丝剥茧,分析js的逻辑,再用爬虫去重现它;另一种就是:内置一个浏览器,直接获取最后加载完的页面。**webmagic-selenium**包中整合了Selenium到SeleniumDownloader,可以直接进行动态加载页面的抓取。 + +#### Scheduler(URL管理) + +**Scheduler**是webmagic的管理模块,通过实现Scheduler可以定制自己的URL管理器。Scheduler包括两个主要方法: + +* public void push(Request request,Task task) + + 将待抓取URL加入Scheduler。Request对象是对URL的一个封装,还包括优先级、以及一个供存储数据的Map。Task仍然用于区分不同任务,在多个任务公用一个Scheduler时可以此进行区分。 + +* public Request poll(Task task) + + 从Scheduler里取出一条请求,并进行后续执行。 + +webmagic目前有三个Scheduler的实现: + +* QueueScheduler + + 一个简单的内存队列,速度较快,并且是线程安全的。 + +* FileCacheQueueScheduler + + 使用文件保存队列,它可以用于耗时较长的下载任务,在任务中途停止后(手动停止或者程序崩溃),下次执行仍然从中止的URL开始继续爬取。 + +* RedisScheduler + + 使用redis存储URL队列。通过使用同一台redis服务器存储URL,webmagic可以很容易的在多机部署,从而达到分布式爬虫的效果。 + +#### Pipeline(后续处理和持久化) + +**Pipeline**是最终抽取结果进行输出和持久化的接口。它只包括一个方法: + +* public void process(ResultItems resultItems,Task task) + + **ResultItems**是集成了抽取结果的对象。通过ResultItems.get(key)可以获取抽取结果。Task同样是用于区分不同任务的对象。 + +webmagic包括以下几个Pipeline的实现: + +* ConsolePipeline + + 直接输出结果到控制台,测试时使用。 + +* FilePipeline + + 输出结果到文件,每个URL单独保存到一个页面,以URL的MD5结果作为文件名。通过构造函数`public FilePipeline(String path)`定义存储路径,**以下使用文件持久化的类,多数都使用此方法指定路径**。 + +* JsonFilePipeline + + 以JSON输出结果到文件(.json后缀),其他与FilePipeline相同。 + +webmagic目前不支持持久化到数据库,但是结合其他工具,持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以这段代码并没有放到webmagic-samples里来。 + +
+ +## webmagic-extension + +webmagic-extension是为了开发爬虫更方便而实现的一些功能模块。这些功能完全基于webmagic-core的框架,包括注解形式编写爬虫、分页、分布式等功能。 + +### 注解模块 + +webmagic-extension包括注解模块。为什么会有注解方式? + +因为PageProcessor的方式灵活、强大,但是没有解决两个问题: + +* 对于一个站点,如果想抓取多种格式的URL,那么必须在PageProcesser中写判断逻辑,代码难以管理。 +* 抓取结果没有对应Model,并不符合Java程序开发习惯,与一些框架也无法很好整合。 + +注解的核心是Model类,本身是一个POJO,这个Model类用于传递、保存页面最终抓取结果数据。注解方式直接将抽取与数据绑定,以便于编写和维护。 + +注解方式其实也是通过一个PageProcessor的实现--ModelPageProcessor完成,因此对webmagic-core代码没有任何影响。仍然以抓取OschinaBlog的程序为例: + + @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") + public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + public static void main(String[] args) { + OOSpider.create( + Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), + new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + } + +注解部分包括以下内容: + +* #### TargetUrl + + "TargetUrl"表示这个Model对应要抓取的URL,它包含两层意思:符合这个条件的URL会被加入抓取队列;符合这个条件的URL会被这个Model抓取。TargetUrl可以**sourceRegion**指定提取URL的区域(仅支持XPath)。 + + TargetUrl使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。 + + 与TargetUrl相似的还有**HelpUrl**,HelpUrl表示:仅仅抓取该URL用作链接提取,并不对它进行内容抽取。例如博客正文页对应TargetUrl,而列表页则对应HelpUrl。 + +* #### ExtractBy + + * #### 用于字段 + + "ExtractBy"可用于类以及字段。用于字段时,定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式(通过设置type)。 + + ExtractBy还有几个扩展属性。**multi**表示是否抽取列表,当然,设置为multi时,你需要一个List字段去容纳它。**notnull**则表示,此字段不允许为null,若为null则放弃整个对象。 + + * #### 用于类 + + "ExtractBy"用于类时,则限定了字段抽取的区域。用于类时仍支持multi,multi则表示一个页面可以抽取到多个对象。 + + * #### ExtractByRaw & ExtractByUrl + + 
在类使用"ExtractBy"修饰后,字段的"ExtractBy"使用的是其抽取的结果,如果仍然想要抽取原HTML,可以使用"ExtractByRaw"。与此类似的还有"ExtractByUrl",表示从URL中抽取信息。ExtractByUrl只支持正则表达式。 + + * #### ExtractBy2 ExtractBy3 + + "ExtractBy"、"ExtractByRaw"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。 + +* #### AfterExtractor + + AfterExtractor接口是对注解方式抽取能力不足的补充。实现AfterExtractor接口后,会在**使用注解方式填充完字段后**调用**afterProcess()**方法,在这个方法中可以直接访问已抽取的字段、补充需要抽取的字段,甚至做一些简单的输出和持久化操作(并不是很建议这么做)。这部分可以参考[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。 + +* #### OOSpider + OOSpider是注解式爬虫的入口,这里调用**create()**方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,例如: + + OOSpider.create( + Site.me().addStartUrl("http://www.oschina.net"), + new ConsolePageModelPipeline(), + OschinaBlog.clas,OschinaAnswer.class).run(); + + OOSpider会根据TargetUrl调用不同的Model进行解析。 + +* #### PageModelPipeline + 可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。 + +* #### 分页 + + 处理单项数据分页(例如单条新闻多个页面)是爬虫一个比较头疼的问题。webmagic目前对于分页的解决方案是:在注解模式下,Model通过实现**PagedModel**接口,并引入PagedPipeline作为第一个Pipeline来实现。具体可以参考webmagic-samples中抓取网易新闻的代码:**us.codecraft.webmagic.model.samples.News163**。 + + 关于分页,这里有一篇对于webmagic分页实现的详细说明的文章[关于爬虫实现分页的一些思考](http://my.oschina.net/flashsword/blog/150039)。 + 目前分页功能还没有分布式实现,如果实现RedisScheduler进行分布式爬取,请不要使用分页功能。 + +### 分布式 + +webmagic-extension中,通过redis来管理URL,达到分布式的效果。但是对于分布式爬虫,仅仅程序能够分布式运行,还满足不了大规模抓取的需要,webmagic可能后期会加入一些任务管理和监控的功能,也欢迎各位用户为webmagic提交代码,做出贡献。 + + diff --git "a/webmagic-0.2.0\344\275\277\347\224\250\346\226\207\346\241\243.pdf" "b/webmagic-0.2.0\344\275\277\347\224\250\346\226\207\346\241\243.pdf" new file mode 100644 index 000000000..a38a00911 Binary files /dev/null and "b/webmagic-0.2.0\344\275\277\347\224\250\346\226\207\346\241\243.pdf" differ diff --git a/webmagic-core/README.md b/webmagic-core/README.md new file mode 100644 index 000000000..90a6f0a47 --- /dev/null +++ b/webmagic-core/README.md @@ -0,0 +1,3 @@ +webmagic-core +------- 
+webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 \ No newline at end of file diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 607eb13c1..cf42d2a94 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -2,8 +2,11 @@ - us.codecraft - 0.0.1-SNAPSHOT + + us.codecraft + webmagic + 0.2.0 + 4.0.0 webmagic-core @@ -12,94 +15,43 @@ org.apache.httpcomponents httpclient - 4.2.1 junit junit - 4.7 - test - - - - com.google.guava - guava - 13.0.1 org.apache.commons commons-lang3 - 3.1 log4j log4j - 1.2.17 commons-collections commons-collections - 3.2.1 net.sourceforge.htmlcleaner htmlcleaner - 2.4 + + + + org.jsoup + jsoup org.apache.commons commons-io - 1.3.2 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 8f1a4c77a..eb2c13214 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -6,19 +6,27 @@ import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; /** - * User: cairne - * Date: 13-4-21 - * Time: 上午11:22 + *
+ * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
+ *
+ *     主要方法:
+ *     {@link #getUrl()} 获取页面的Url
+ *     {@link #getHtml()}  获取页面的html内容
+ *     {@link #putField(String, Object)} 保存抽取的结果
+ *     {@link #getResultItems()} 获取抽取的结果,该结果会传递给 {@link us.codecraft.webmagic.pipeline.Pipeline} 进行处理
+ *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
+ *
+ * 
+ * + * @author code4crafter@gmail.com
*/ public class Page { private Request request; - private Map fields = new ConcurrentHashMap(); + private ResultItems resultItems = new ResultItems(); private Selectable html; @@ -26,21 +34,30 @@ public class Page { private List targetRequests = new ArrayList(); - public void process() { - fields.put("title", html.x("").r("")); - } - public Page() { } - public Map getFields() { - return fields; + public Page setSkip(boolean skip) { + resultItems.setSkip(skip); + return this; + } - public void putField(String key, Selectable field) { - fields.put(key, field); + /** + * 保存抽取的结果 + * + * @param key 结果的key + * @param field 结果的value + */ + public void putField(String key, Object field) { + resultItems.put(key, field); } + /** + * 获取页面的html内容 + * + * @return html 页面的html内容 + */ public Selectable getHtml() { return html; } @@ -53,41 +70,82 @@ public List getTargetRequests() { return targetRequests; } + /** + * 添加待抓取的链接 + * + * @param requests 待抓取的链接 + */ public void addTargetRequests(List requests) { synchronized (targetRequests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { break; } - s = UrlUtils.fixRelativeUrl(s, url.toString()); + s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s)); } } } - public void addTargetRequests(String requestString) { + /** + * 添加待抓取的链接 + * + * @param requestString 待抓取的链接 + */ + public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } synchronized (targetRequests) { - requestString = UrlUtils.fixRelativeUrl(requestString, url.toString()); + requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); targetRequests.add(new Request(requestString)); } } + /** + * 添加待抓取的页面,在需要传递附加信息时使用 + * + * @param request 待抓取的页面 + */ + public void addTargetRequest(Request request) { + synchronized (targetRequests) { + targetRequests.add(request); + } + } + + /** + * 获取页面的Url + * + 
* @return url 当前页面的url,可用于抽取 + */ public Selectable getUrl() { return url; } + /** + * 设置url + * + * @param url + */ public void setUrl(Selectable url) { this.url = url; } + /** + * 获取抓取请求 + * + * @return request 抓取请求 + */ public Request getRequest() { return request; } public void setRequest(Request request) { this.request = request; + this.resultItems.setRequest(request); + } + + public ResultItems getResultItems() { + return resultItems; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index ccefc7f9a..905dbe591 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,27 +1,116 @@ package us.codecraft.webmagic; +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; + /** - * User: cairne - * Date: 13-4-21 - * Time: 上午11:37 + * Request对象封装了待抓取的url信息。
+ * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
+ *
+ * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。
+ *
+ *      Example:
+ *          抓取${linktext}时,希望提取链接link,并保存linktext的信息。
+ *      在上一个页面:
+ *      public void process(Page page){
+ *          Request request = new Request(link).putExtra("linktext", linktext);
+ *          page.addTargetRequest(request);
+ *      }
+ *      在下一个页面:
+ *      public void process(Page page){
+ *          String linktext = (String)page.getRequest().getExtra("linktext");
+ *      }
+ * 
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 上午11:37 */ -public class Request { +public class Request implements Serializable { + + private static final long serialVersionUID = 2062192774891352043L; private String url; - private Object[] extra; + /** + * 额外参数,可以保存一些需要的上下文信息 + */ + private Map extras; + + private double priority; - public Request(String url, Object... extra) { + public Request() { + } + + /** + * 构建一个request对象 + * + * @param url 必须参数,待抓取的url + */ + public Request(String url) { this.url = url; - this.extra = extra; } - public Object[] getExtra() { - return extra; + public double getPriority() { + return priority; + } + + public Request setPriority(double priority) { + this.priority = priority; + return this; + } + + public Object getExtra(String key) { + if (extras == null) { + return null; + } + return extras.get(key); } + public Request putExtra(String key, Object value) { + if (extras == null) { + extras = new HashMap(); + } + extras.put(key, value); + return this; + } + + /** + * 获取待抓取的url + * + * @return url 待抓取的url + */ public String getUrl() { return url; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Request request = (Request) o; + + if (!url.equals(request.url)) return false; + + return true; + } + + public Map getExtras() { + return extras; + } + + @Override + public int hashCode() { + return url.hashCode(); + } + + public void setExtras(Map extras) { + this.extras = extras; + } + + public void setUrl(String url) { + this.url = url; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java new file mode 100644 index 000000000..7a8e5c39f --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -0,0 +1,64 @@ +package us.codecraft.webmagic; + +import java.util.HashMap; +import java.util.Map; + +/** + * 
保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
+ * @author code4crafter@gmail.com
+ * Date: 13-7-25
+ * Time: 下午12:20
+ */ +public class ResultItems { + + private Map fields = new HashMap(); + + private Request request; + + private boolean skip; + + public T get(String key) { + Object o = fields.get(key); + if (o == null) { + return null; + } + return (T) fields.get(key); + } + + public Map getAll() { + return fields; + } + + public ResultItems put(String key, T value) { + fields.put(key, value); + return this; + } + + public Request getRequest() { + return request; + } + + public ResultItems setRequest(Request request) { + this.request = request; + return this; + } + + /** + * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @return 是否忽略 true 忽略 + */ + public boolean isSkip() { + return skip; + } + + + /** + * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @param skip + * @return this + */ + public ResultItems setSkip(boolean skip) { + this.skip = skip; + return this; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4c032aafb..9ab97fe86 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,12 +1,16 @@ package us.codecraft.webmagic; -import java.util.HashSet; -import java.util.Set; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.util.*; /** - * User: cairne - * Date: 13-4-21 - * Time: 下午12:13 + * Site定义一个待抓取的站点的各种信息。
+ * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午12:13 */ public class Site { @@ -14,14 +18,16 @@ public class Site { private String userAgent; - private String cookie; + private Map cookies = new LinkedHashMap(); - private String encoding; + private String charset; - private String startUrl; + private List startUrls = new ArrayList(); private int sleepTime = 3000; + private int retryTimes = 0; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; @@ -30,70 +36,181 @@ public class Site { DEFAULT_STATUS_CODE_SET.add(200); } + /** + * 创建一个Site对象,等价于new Site() + * + * @return 新建的对象 + */ public static Site me() { return new Site(); } - public Site setCookie(String cookie) { - this.cookie = cookie; + /** + * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的 + * + * @param name cookie的名称 + * @param value cookie的值 + * @return this + */ + public Site addCookie(String name, String value) { + cookies.put(name, value); return this; } + /** + * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。 + * + * @param userAgent userAgent + * @return this + */ public Site setUserAgent(String userAgent) { this.userAgent = userAgent; return this; } - public String getCookie() { - return cookie; + /** + * 获取已经设置的所有cookie + * + * @return 已经设置的所有cookie + */ + public Map getCookies() { + return cookies; } + /** + * 获取已设置的user-agent + * + * @return 已设置的user-agent + */ public String getUserAgent() { return userAgent; } + /** + * 获取已设置的domain + * + * @return 已设置的domain + */ public String getDomain() { + if (domain == null) { + if (startUrls.size() > 0) { + domain = UrlUtils.getDomain(startUrls.get(0)); + } + } return domain; } + /** + * 设置这个站点所在域名,必须项。
+ * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。 + * + * @param domain 爬虫会抓取的域名 + * @return this + */ public Site setDomain(String domain) { this.domain = domain; return this; } - public String getEncoding() { - return encoding; + /** + * 设置页面编码,若不设置则自动根据Html meta信息获取。
+ * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。
+ * + * @param charset 编码格式,主要是"utf-8"、"gbk"两种 + * @return this + */ + public Site setCharset(String charset) { + this.charset = charset; + return this; + } + + /** + * 获取已设置的编码 + * + * @return 已设置的domain + */ + public String getCharset() { + return charset; } - public Site setEncoding(String encoding) { - this.encoding = encoding; + /** + * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。
+ * 默认为200,正常情况下,无须设置此项。
+ * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。
+ * + * @param acceptStatCode 可接受的状态码 + * @return this + */ + public Site setAcceptStatCode(Set acceptStatCode) { + this.acceptStatCode = acceptStatCode; return this; } + /** + * 获取可接受的状态码 + * + * @return 可接受的状态码 + */ public Set getAcceptStatCode() { return acceptStatCode; } - public Site setAcceptStatCode(Set acceptStatCode) { - this.acceptStatCode = acceptStatCode; - return this; + /** + * 获取初始页面的地址列表 + * + * @return 初始页面的地址列表 + */ + public List getStartUrls() { + return startUrls; } - public String getStartUrl() { - return startUrl; + /** + * 增加初始页面的地址,可反复调用此方法增加多个初始地址。 + * + * @param startUrl 初始页面的地址 + * @return this + */ + public Site addStartUrl(String startUrl) { + this.startUrls.add(startUrl); + return this; } - public Site setStartUrl(String startUrl) { - this.startUrl = startUrl; + /** + * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。 + * + * @param sleepTime 单位毫秒 + * @return this + */ + public Site setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; return this; } + /** + * 获取两次抓取之间的间隔 + * + * @return 两次抓取之间的间隔,单位毫秒 + */ public int getSleepTime() { return sleepTime; } - public Site setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; + /** + * 获取重新下载的次数,默认为0 + * + * @return 重新下载的次数 + */ + public int getRetryTimes() { + return retryTimes; + } + + /** + * 设置获取重新下载的次数,默认为0 + * + * @return this + */ + public Site setRetryTimes(int retryTimes) { + this.retryTimes = retryTimes; return this; } @@ -106,20 +223,34 @@ public boolean equals(Object o) { if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) return false; - if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false; if (!domain.equals(site.domain)) return false; - if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false; + if (!startUrls.equals(site.startUrls)) return false; + if (charset != null ? 
!charset.equals(site.charset) : site.charset != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; return true; } + public Task toTask() { + return new Task() { + @Override + public String getUUID() { + return Site.this.getDomain(); + } + + @Override + public Site getSite() { + return Site.this; + } + }; + } + @Override public int hashCode() { int result = domain.hashCode(); + result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); - result = 31 * result + (cookie != null ? cookie.hashCode() : 0); - result = 31 * result + (encoding != null ? encoding.hashCode() : 0); + result = 31 * result + (charset != null ? charset.hashCode() : 0); result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); return result; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index f3ec5f83a..cf627967c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -2,80 +2,238 @@ import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; +import us.codecraft.webmagic.downloader.Destroyable; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.schedular.QueueSchedular; -import us.codecraft.webmagic.schedular.Schedular; +import us.codecraft.webmagic.scheduler.QueueScheduler; +import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.utils.ThreadUtils; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.ExecutorService; 
+import java.util.concurrent.atomic.AtomicInteger; /** - * User: cairne - * Date: 13-4-21 - * Time: 上午6:53 + *
+ * webmagic爬虫的入口类。
+ *
+ * 示例:
+ * 定义一个最简单的爬虫:
+ *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
+ *
+ * 使用FilePipeline保存结果到文件:
+ *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
+ *          .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
+ *
+ * 使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
+ *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
+ *          .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
+ * 
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 上午6:53 */ -public class Spider implements Runnable { +public class Spider implements Runnable, Task { - private Downloader downloader = new HttpClientDownloader(); + private Downloader downloader; private List pipelines = new ArrayList(); private PageProcessor pageProcessor; - private Schedular schedular = new QueueSchedular(); + private List startUrls; + + private Site site; + + private String uuid; + + private Scheduler scheduler = new QueueScheduler(); private Logger logger = Logger.getLogger(getClass()); - public static Spider me() { - return new Spider(); - } + private ExecutorService executorService; + + private int threadNum = 1; + + private AtomicInteger stat = new AtomicInteger(STAT_INIT); + + private final static int STAT_INIT = 0; + + private final static int STAT_RUNNING = 1; - public Spider processor(PageProcessor pageProcessor) { + private final static int STAT_STOPPED = 2; + + /** + * 使用已定义的抽取规则新建一个Spider。 + * + * @param pageProcessor 已定义的抽取规则 + */ + public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; - schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite()); + this.site = pageProcessor.getSite(); + this.startUrls = pageProcessor.getSite().getStartUrls(); + } + + /** + * 使用已定义的抽取规则新建一个Spider。 + * + * @param pageProcessor 已定义的抽取规则 + * @return 新建的Spider + */ + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); + } + + /** + * 重新设置startUrls,会覆盖Site本身的startUrls。 + * + * @param startUrls + * @return this + */ + public Spider startUrls(List startUrls) { + checkIfNotRunning(); + this.startUrls = startUrls; return this; } - public Thread thread() { - return new Thread(this); + /** + * 为爬虫设置一个唯一ID,用于标志任务,默认情况下使用domain作为uuid,对于单domain多任务的情况,请为重复任务设置不同的ID。 + * + * @param uuid 唯一ID + * @return this + */ + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; } - public Spider schedular(Schedular schedular) { - 
this.schedular = schedular; + /** + * 设置调度器。调度器用于保存待抓取URL,并可以进行去重、同步、持久化等工作。默认情况下使用内存中的阻塞队列进行调度。 + * + * @param scheduler 调度器 + * @return this + */ + public Spider scheduler(Scheduler scheduler) { + checkIfNotRunning(); + this.scheduler = scheduler; return this; } + /** + * 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。 + * + * @param pipeline 处理管道 + * @return this + */ public Spider pipeline(Pipeline pipeline) { + checkIfNotRunning(); this.pipelines.add(pipeline); return this; } + public Spider downloader(Downloader downloader) { + checkIfNotRunning(); + this.downloader = downloader; + return this; + } + + protected void checkComponent() { + if (downloader == null) { + this.downloader = new HttpClientDownloader(); + } + if (pipelines.isEmpty()) { + pipelines.add(new ConsolePipeline()); + } + downloader.setThread(threadNum); + } @Override public void run() { - Site site = pageProcessor.getSite(); - Request request = schedular.poll(site); - if (pipelines.isEmpty()){ - pipelines.add(new ConsolePipeline()); + if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) { + throw new IllegalStateException("Spider is already running!"); } - while (request != null) { - Page page = downloader.download(request,site); - if (page == null) { - sleep(site.getSleepTime()); - continue; + checkComponent(); + if (startUrls != null) { + for (String startUrl : startUrls) { + scheduler.push(new Request(startUrl), this); } - pageProcessor.process(page); - addRequest(page); - for (Pipeline pipeline : pipelines) { - pipeline.process(page,site); + } + Request request = scheduler.poll(this); + //singel thread + if (executorService == null) { + while (request != null) { + processRequest(request); + request = scheduler.poll(this); + } + } else { + //multi thread + final AtomicInteger threadAlive = new AtomicInteger(0); + while (true) { + if (request == null) { + //when no request found but some thread is alive, sleep a while. 
+ try { + Thread.sleep(100); + } catch (InterruptedException e) { + } + } else { + final Request requestFinal = request; + threadAlive.incrementAndGet(); + executorService.execute(new Runnable() { + @Override + public void run() { + processRequest(requestFinal); + threadAlive.decrementAndGet(); + } + }); + } + request = scheduler.poll(this); + if (threadAlive.get() == 0) { + request = scheduler.poll(this); + if (request == null) { + break; + } + } } + executorService.shutdown(); + } + stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + //release some resources + destroy(); + } + + private void destroy() { + destroyEach(downloader); + destroyEach(pageProcessor); + for (Pipeline pipeline : pipelines) { + destroyEach(pipeline); + } + } + + private void destroyEach(Object object) { + if (object instanceof Destroyable) { + ((Destroyable) object).destroy(); + } + } + + private void processRequest(Request request) { + Page page = downloader.download(request, this); + if (page == null) { sleep(site.getSleepTime()); - request = schedular.poll(site); + return; } + pageProcessor.process(page); + addRequest(page); + if (!page.getResultItems().isSkip()){ + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } + } + sleep(site.getSleepTime()); } private void sleep(int time) { @@ -83,15 +241,68 @@ private void sleep(int time) { Thread.sleep(time); } catch (InterruptedException e) { e.printStackTrace(); - ; } } private void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { - schedular.push(request,pageProcessor.getSite()); + scheduler.push(request, this); } } } + + private void checkIfNotRunning() { + if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) { + throw new IllegalStateException("Spider is already running!"); + } + } + + public void runAsync() { + Thread thread = new Thread(this); + thread.setDaemon(false); + thread.start(); + } + + /** + * 建立多个线程下载 + * + * 
@param threadNum 线程数 + * @return this + */ + public Spider thread(int threadNum) { + checkIfNotRunning(); + this.threadNum = threadNum; + if (threadNum <= 0) { + throw new IllegalArgumentException("threadNum should be more than one!"); + } + if (threadNum == 1) { + return this; + } + synchronized (this) { + this.executorService = ThreadUtils.newFixedThreadPool(threadNum); + } + return this; + } + + public Spider clearPipeline(){ + pipelines=new ArrayList(); + return this; + } + + @Override + public String getUUID() { + if (uuid != null) { + return uuid; + } + if (site != null) { + return site.getDomain(); + } + return null; + } + + @Override + public Site getSite() { + return site; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java new file mode 100644 index 000000000..14c1d319b --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic; + +/** + * 抓取任务的抽象接口。
+ * @author code4crafter@gmail.com
+ * Date: 13-6-18 + * Time: 下午2:57 + */ +public interface Task { + + /** + * 返回唯一标志该任务的字符串,以区分不同任务。 + * @return uuid + */ + public String getUUID(); + + /** + * 返回任务抓取的站点信息 + * @return site + */ + public Site getSite(); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java new file mode 100644 index 000000000..6dcbde1b3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.downloader; + +/** + * 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
+ * Time: 下午3:10
+ */ +public interface Destroyable { + + public void destroy(); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index e84758487..9a7f59a3a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -2,14 +2,31 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** - * User: cairne - * Date: 13-4-21 - * Time: 下午12:14 + * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午12:14 */ public interface Downloader { - public Page download(Request request,Site site); + /** + * 下载页面,并保存信息到Page对象中。 + * + * @param request + * @param task + * @return page + */ + public Page download(Request request, Task task); + + /** + * 设置线程数,多线程程序一般需要Downloader支持
+ * 如果不考虑多线程的可以不实现这个方法
+ * + * @param thread 线程数量 + */ + public void setThread(int thread); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 269ba6d37..0e0977ae1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,53 +1,102 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; +import org.apache.http.Header; +import org.apache.http.HeaderElement; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; +import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.methods.HttpGet; import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; +import java.io.IOException; + /** - * User: cairne - * Date: 13-4-21 - * Time: 下午12:15 + * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午12:15 */ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); + private int poolSize = 1; + @Override - public Page download(Request request, Site site) { + public Page download(Request request, Task task) { + Site site = task.getSite(); logger.info("downloading page " + request.getUrl()); - HttpClient httpClient = HttpClientPool.getInstance().getClient(site); + HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); + String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); - HttpResponse httpResponse = httpClient.execute(httpGet); + HttpResponse httpResponse = null; + int tried = 0; + boolean retry; + do { + try { + httpResponse = httpClient.execute(httpGet); + retry = false; + } catch (IOException e) { + tried++; + if (tried > site.getRetryTimes()) { + logger.warn("download page " + request.getUrl() + " error", e); + return null; + } + logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); + retry = true; + } + } while (retry); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { - if (site.getEncoding() == null){ + //charset + if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); - site.setEncoding(new PlainText(value).r("charset=([^\\s]+)").toString()); + charset = UrlUtils.getCharset(value); } + // + handleGzip(httpResponse); String content = IOUtils.toString(httpResponse.getEntity().getContent(), - site.getEncoding()); + charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); return page; } else { - logger.warn("code error " + statusCode); + logger.warn("code error " + statusCode + "\t" + request.getUrl()); } } catch (Exception e) { 
logger.warn("download page " + request.getUrl() + " error", e); } return null; } + + @Override + public void setThread(int thread) { + poolSize=thread; + } + + private void handleGzip(HttpResponse httpResponse) { + Header ceheader = httpResponse.getEntity().getContentEncoding(); + if (ceheader != null) { + HeaderElement[] codecs = ceheader.getElements(); + for (HeaderElement codec : codecs) { + if (codec.getName().equalsIgnoreCase("gzip")) { + httpResponse.setEntity( + new GzipDecompressingEntity(httpResponse.getEntity())); + } + } + } + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 4fdf42124..854f1e57a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -1,27 +1,39 @@ package us.codecraft.webmagic.downloader; import org.apache.http.HttpVersion; +import org.apache.http.client.CookieStore; import org.apache.http.client.HttpClient; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; +import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.params.*; import us.codecraft.webmagic.Site; +import java.util.Map; + /** - * User: cairne - * Date: 13-4-21 - * Time: 下午12:29 + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午12:29 */ public class HttpClientPool { - public static final HttpClientPool INSTANCE = new HttpClientPool(5); + public static volatile HttpClientPool INSTANCE; - public static HttpClientPool getInstance() { + public static HttpClientPool getInstance(int poolSize) { + if (INSTANCE == null) { + synchronized (HttpClientPool.class) { + if (INSTANCE == null) { + INSTANCE = new HttpClientPool(poolSize); + } + } + } return INSTANCE; } @@ -43,22 +55,30 @@ private HttpClient generateClient(Site site) { HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); - paramsBean.setContentCharset("UTF-8"); + paramsBean.setContentCharset(site.getCharset()); paramsBean.setUseExpectContinue(false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry); - connectionManager.setMaxTotal(100); + connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); - HttpClient httpClient = new DefaultHttpClient(connectionManager, params); + DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); + generateCookie(httpClient, site); httpClient.getParams().setIntParameter("http.socket.timeout", 60000); httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); return httpClient; } - public void pushBack(HttpClient httpClient) { - + private void generateCookie(DefaultHttpClient httpClient, Site site) { + CookieStore cookieStore = new BasicCookieStore(); + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie.setDomain(site.getDomain()); + cookieStore.addCookie(cookie); + } + httpClient.setCookieStore(cookieStore); } + } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html new file mode 100644 index 000000000..cae5560ea --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html @@ -0,0 +1,5 @@ + + +包含了页面下载的接口Downloader和实现类HttpClientDownloader,该实现类封装了HttpComponent库。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html new file mode 100644 index 000000000..d5ff540a6 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html @@ -0,0 +1,5 @@ + + +包括webmagic入口类Spider和一些数据传递的实体类。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 4115b8ce5..e1648fe71 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -1,23 +1,24 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.selector.Selectable; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; import java.util.Map; /** - * User: cairne - * Date: 13-4-21 - * Time: 下午1:45 + * 命令行输出抽取结果。可用于测试。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午1:45 */ -public class ConsolePipeline implements Pipeline{ +public class ConsolePipeline implements Pipeline { @Override - public void process(Page page,Site site) { - System.out.println("get page: "+page.getUrl()); - for (Map.Entry entry : page.getFields().entrySet()) { - System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); + public void process(ResultItems resultItems, Task task) { + System.out.println("get page: " + resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + System.out.println(entry.getKey() + ":\t" + entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index d8407af57..252ccd5f5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,10 +1,9 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.selector.Selectable; -import us.codecraft.webmagic.utils.UrlUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; import java.io.File; import java.io.FileWriter; @@ -13,41 +12,61 @@ import java.util.Map; /** - * User: cairne - * Date: 13-4-21 - * Time: 下午6:28 + * 持久化到文件的接口。 + * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午6:28 */ public class FilePipeline implements Pipeline { - private String path = "/data/temp/webmagic/"; + private String path = "/data/webmagic/"; + private Logger logger = Logger.getLogger(getClass()); + + /** + * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" + */ public FilePipeline() { } + /** + * 新建一个FilePipeline + * + * @param path 文件保存路径 + */ public FilePipeline(String path) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } this.path = path; } @Override - public void process(Page page, Site site) { - String domain = site.getDomain(); - domain = UrlUtils.getDomain(domain); - String path = this.path + "" + domain + "/"; + public void process(ResultItems resultItems, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); - printWriter.println("url:\t" + page.getUrl()); - for (Map.Entry entry : page.getFields().entrySet()) { - printWriter.println(entry.getKey() + ":\t" + entry.getValue().toStrings()); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); + printWriter.println("url:\t" + resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + if (entry.getValue() instanceof Iterable) { + Iterable value = (Iterable) entry.getValue(); + printWriter.println(entry.getKey() + ":"); + for (Object o : value) { + printWriter.println(o); + } + } else { + printWriter.println(entry.getKey() + ":\t" + entry.getValue()); + } } printWriter.close(); } catch (IOException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. 
+ logger.warn("write file error", e); } - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index ef27cdae7..595a8e87b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -1,14 +1,15 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; /** - * User: cairne + * Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。 + * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:39 */ public interface Pipeline { - public void process(Page page,Site site); + public void process(ResultItems resultItems,Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html new file mode 100644 index 000000000..498183ebd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html @@ -0,0 +1,5 @@ + + +包含了处理页面抽取结果的接口Pipeline和它的几个实现类。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 22a24c96f..3963d0805 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,21 +4,23 @@ import us.codecraft.webmagic.Site; /** - * User: cairne + * 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。
+ * Implement this interface to build various kinds of spiders.
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午11:42 */ public interface PageProcessor { /** - * extends the class to implements variaty spiders + * 定义如何处理页面,包括链接提取、内容抽取等。 * @param page */ public void process(Page page); /** - * the site the processor for - * @return + * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。 + * @return site */ public Site getSite(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index a8165bb42..ff9646054 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -7,7 +7,8 @@ import java.util.List; /** - * User: cairne + * 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。
+ * @author code4crafter@gmail.com
* Date: 13-4-22 * Time: 下午9:15 */ @@ -20,22 +21,28 @@ public class SimplePageProcessor implements PageProcessor { private Site site; public SimplePageProcessor(String startUrl, String urlPattern) { - this.site = Site.me().setStartUrl(startUrl). + this.site = Site.me().addStartUrl(startUrl). setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); + //compile "*" expression to regex this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; } @Override public void process(Page page) { - List requests = page.getHtml().as().rs(urlPattern).toStrings(); + List requests = page.getHtml().links().regex(urlPattern).all(); + //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); - page.putField("title", page.getHtml().x("//title")); - page.putField("content", page.getHtml().sc()); + //xpath方式抽取 + page.putField("title", page.getHtml().xpath("//title")); + //sc表示使用Readability技术抽取正文 + page.putField("html", page.getHtml().toString()); + page.putField("content", page.getHtml().smartContent()); } @Override public Site getSite() { + //定义抽取站点的相关参数 return site; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html new file mode 100644 index 000000000..47274a1fd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html @@ -0,0 +1,5 @@ + + +包含了封装页面处理逻辑的接口PageProcessor和一个实现类SimplePageProcessor。实现PageProcessor即可定制一个自己的爬虫。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java deleted file mode 100644 index a5b71f5af..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java +++ /dev/null @@ -1,17 +0,0 @@ -package us.codecraft.webmagic.schedular; - -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; - -/** - * User: cairne - * Date: 
13-4-21 - * Time: 下午1:12 - */ -public interface Schedular { - - public void push(Request request,Site site); - - public Request poll(Site site); - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java similarity index 68% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 071f708c4..723b5f93c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,8 +1,8 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import java.util.HashSet; import java.util.Set; @@ -10,11 +10,12 @@ import java.util.concurrent.LinkedBlockingQueue; /** - * User: cairne + * 内存队列实现的线程安全Scheduler。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:13 */ -public class QueueSchedular implements Schedular { +public class QueueScheduler implements Scheduler { private Logger logger = Logger.getLogger(getClass()); @@ -23,7 +24,7 @@ public class QueueSchedular implements Schedular { private Set urls = new HashSet(); @Override - public synchronized void push(Request request,Site site) { + public synchronized void push(Request request,Task task) { if (logger.isDebugEnabled()){ logger.debug("push to queue "+request.getUrl()); } @@ -34,7 +35,7 @@ public synchronized void push(Request request,Site site) { } @Override - public synchronized Request poll(Site site) { + public synchronized Request poll(Task task) { return queue.poll(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java new file mode 100644 index 000000000..fc39b450e --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.scheduler; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; + +/** + * 包含url管理和调度的接口。包括url抓取队列,url去重等功能。
+ * Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午1:12 + */ +public interface Scheduler { + + /** + * 加入一个待抓取的链接 + * @param request 待抓取的链接 + * @param task 定义的任务,以满足单Scheduler多Task的情况 + */ + public void push(Request request,Task task); + + /** + * 返回下一个要抓取的链接 + * @param task 定义的任务,以满足单Scheduler多Task的情况 + * @return 下一个要抓取的链接 + */ + public Request poll(Task task); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html new file mode 100644 index 000000000..7887dd536 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html @@ -0,0 +1,5 @@ + + +包含url管理和调度的接口Scheduler及它的几个实现类。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java new file mode 100644 index 000000000..997b6cf19 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.selector; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 下午5:29
+ */ +public class AndSelector implements Selector { + + private List selectors = new ArrayList(); + + public AndSelector(Selector... selectors) { + for (Selector selector : selectors) { + this.selectors.add(selector); + } + } + + @Override + public String select(String text) { + for (Selector selector : selectors) { + if (text == null) { + return null; + } + text = selector.select(text); + } + return text; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + boolean first = true; + for (Selector selector : selectors) { + if (first) { + results = selector.selectList(text); + first = false; + } else { + List resultsTemp = new ArrayList(); + for (String result : results) { + resultsTemp.addAll(selector.selectList(result)); + } + results = resultsTemp; + if (results == null || results.size() == 0) { + return results; + } + } + } + return results; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java new file mode 100644 index 000000000..90a9d1d7d --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.selector; + +import org.apache.commons.collections.CollectionUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +/** + * css风格的选择器。包装了Jsoup。
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 上午9:39 + */ +public class CssSelector implements Selector { + + private String selectorText; + + public CssSelector(String selectorText) { + this.selectorText = selectorText; + } + + @Override + public String select(String text) { + Document doc = Jsoup.parse(text); + Elements elements = doc.select(selectorText); + if (CollectionUtils.isEmpty(elements)) { + return null; + } + return elements.get(0).outerHtml(); + } + + @Override + public List selectList(String text) { + List strings = new ArrayList(); + Document doc = Jsoup.parse(text); + Elements elements = doc.select(selectorText); + if (CollectionUtils.isNotEmpty(elements)) { + for (Element element : elements) { + strings.add(element.outerHtml()); + } + } + return strings; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index c385ff99c..114eef996 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -4,9 +4,10 @@ import java.util.List; /** - * User: cairne - * Date: 13-4-21 - * Time: 上午7:54 + * 可抽取的html文本。
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 上午7:54 */ public class Html extends PlainText { @@ -18,10 +19,8 @@ public Html(String text) { super(text); } - @Override - public Selectable x(String xpath) { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); - return select(xpathSelector,strings); + public static Html create(String text) { + return new Html(text); } @Override @@ -29,7 +28,7 @@ protected Selectable select(Selector selector, List strings) { List results = new ArrayList(); for (String string : strings) { String result = selector.select(string); - if (result!=null){ + if (result != null) { results.add(result); } } @@ -47,27 +46,27 @@ protected Selectable selectList(Selector selector, List strings) { } @Override - public Selectable sc() { + public Selectable smartContent() { SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); - return select(smartContentSelector,strings); + return select(smartContentSelector, strings); } @Override - public Selectable a() { + public Selectable links() { XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); - return select(xpathSelector,strings); + return selectList(xpathSelector, strings); } @Override - public Selectable as() { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); - return selectList(xpathSelector,strings); + public Selectable xpath(String xpath) { + XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); + return selectList(xpathSelector, strings); } @Override - public Selectable xs(String xpath) { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); - return selectList(xpathSelector, strings); + public Selectable $(String selector) { + CssSelector cssSelector = new CssSelector(selector); + return selectList(cssSelector,strings); } } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java new file mode 100644 index 000000000..48f9fb93e --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.selector; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 下午5:29
+ */ +public class OrSelector implements Selector { + + private List selectors = new ArrayList(); + + public OrSelector(Selector... selectors) { + for (Selector selector : selectors) { + this.selectors.add(selector); + } + } + + @Override + public String select(String text) { + for (Selector selector : selectors) { + text = selector.select(text); + if (text!=null){ + return text; + } + } + return null; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + for (Selector selector : selectors) { + List strings = selector.selectList(text); + results.addAll(strings); + } + return results; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 91ab7abd2..d06a53105 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -6,7 +6,8 @@ import java.util.List; /** - * User: cairne + * 可抽取的纯文本,不包括xpath和css selector实现。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午7:54 */ @@ -24,39 +25,32 @@ public PlainText(String text) { this.strings = results; } - @Override - public Selectable x(String xpath) { - throw new UnsupportedOperationException(); + public static PlainText create(String text) { + return new PlainText(text); } @Override - public Selectable xs(String xpath) { + public Selectable xpath(String xpath) { throw new UnsupportedOperationException(); } @Override - public Selectable sc() { + public Selectable $(String selector) { throw new UnsupportedOperationException(); } @Override - public Selectable a() { + public Selectable smartContent() { throw new UnsupportedOperationException(); } @Override - public Selectable as() { + public Selectable links() { throw new UnsupportedOperationException(); } @Override - public Selectable r(String regex) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); - return select(regexSelector, strings); - } - - @Override - public Selectable rs(String regex) { + public Selectable regex(String regex) { RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); return selectList(regexSelector, strings); } @@ -82,20 +76,20 @@ protected Selectable selectList(Selector selector, List strings) { } @Override - public Selectable rp(String regex, String replacement) { + public Selectable replace(String regex, String replacement) { ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); return select(replaceSelector, strings); } @Override - public List toStrings() { + public List all() { return strings; } @Override public String toString() { - if (CollectionUtils.isNotEmpty(toStrings())) { - return toStrings().get(0); + if (CollectionUtils.isNotEmpty(all())) { + return all().get(0); } else { return null; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java index 8b14e8b75..04467bcce 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java @@ -1,7 +1,8 @@ package us.codecraft.webmagic.selector; /** - * User: cairne + * 封装正则表达式抽取接口的类。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午7:39 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 671cbe79c..e95138b7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -9,7 +9,8 @@ import java.util.regex.PatternSyntaxException; /** - * User: cairne + * 正则表达式抽取器。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午7:09 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java index 5f788982a..38b95f787 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java @@ -6,7 +6,8 @@ import java.util.regex.PatternSyntaxException; /** - * User: cairne + * 对文本进行替换。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午7:09 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 0fcc4208d..42f3d1083 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -3,57 +3,42 @@ import java.util.List; /** - * User: cairne + * 可进行抽取的文本。
+ * @author code4crafter@gmail.com
* Date: 13-4-20 * Time: 下午7:51 */ public interface Selectable { /** - * select with xpath + * select list with xpath * * @param xpath * @return new Selectable after extract */ - public Selectable x(String xpath); + public Selectable xpath(String xpath); /** - * select list with xpath + * select list with css selector * - * @param xpath + * @param selector css selector expression * @return new Selectable after extract */ - public Selectable xs(String xpath); + public Selectable $(String selector); /** * select smart content with ReadAbility algorithm * * @return content */ - public Selectable sc(); - - /** - * select a link - * - * @return - */ - public Selectable a(); + public Selectable smartContent(); /** * select all links * - * @return - */ - public Selectable as(); - - - /** - * select with regex - * - * @param regex - * @return new Selectable after extract + * @return all links */ - public Selectable r(String regex); + public Selectable links(); /** * select list with regex @@ -61,7 +46,7 @@ public interface Selectable { * @param regex * @return new Selectable after extract */ - public Selectable rs(String regex); + public Selectable regex(String regex); /** * replace with regex @@ -70,7 +55,7 @@ public interface Selectable { * @param replacement * @return new Selectable after extract */ - public Selectable rp(String regex, String replacement); + public Selectable replace(String regex, String replacement); /** * single string result @@ -84,5 +69,5 @@ public interface Selectable { * * @return multi string result */ - public List toStrings(); + public List all(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 914e8ab01..4af2b449a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -3,7 +3,8 @@ import java.util.List; /** - * 
User: cairne + * 抽取器。
+ * @author code4crafter@gmail.com
* Date: 13-4-20 * Time: 下午8:02 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index af1996943..1dd56e01c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -7,7 +7,8 @@ import java.util.concurrent.ConcurrentHashMap; /** - * User: cairne + * 产生selector的工厂。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午7:56 */ @@ -70,7 +71,7 @@ public T newSelector(Class clazz, String... param) { } else { throw new UnsupportedOperationException(); } - } catch (ReflectiveOperationException e) { + } catch (Exception e) { throw new IllegalArgumentException("init object error", e); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index c2e36dff9..89748975d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -8,8 +8,9 @@ import java.util.concurrent.atomic.AtomicInteger; /** - * 找到clear - * User: cairne + * readability算法,基础是找到所有p标签的父节点 + * 写的比较乱,最终效果还在尝试中 + * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午4:42 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 6de2f08ab..02afe2912 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -6,7 +6,8 @@ import java.util.List; /** - * User: cairne + * xpath的选择器。包装了HtmlCleaner。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午9:39 */ @@ -52,12 +53,12 @@ public List selectList(String text) { try { Object[] objects = tagNode.evaluateXPath(xpathStr); if (objects != null && objects.length >= 1) { - for (int i = 0; i < objects.length; i++) { - if (objects[i] instanceof TagNode) { - TagNode tagNode1 = (TagNode) objects[i]; + for (Object object : objects) { + if (object instanceof TagNode) { + TagNode tagNode1 = (TagNode) object; results.add(htmlCleaner.getInnerHtml(tagNode1)); } else { - results.add(objects[i].toString()); + results.add(object.toString()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html new file mode 100644 index 000000000..3c9ef7b25 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html @@ -0,0 +1,5 @@ + + +提供了便捷抽取页面内容的工具,对外核心接口是Selectable,内部抽取则是通过实现Selector来定制。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java new file mode 100644 index 000000000..d6876c719 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.utils; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +/** + * 线程工具类。
+ * @author code4crafter@gmail.com + * Date: 13-6-23 + * Time: 下午7:11 + */ +public class ThreadUtils { + + public static ExecutorService newFixedThreadPool(int threadSize) { + return new ThreadPoolExecutor(threadSize, threadSize, 0L, TimeUnit.MILLISECONDS, + new LinkedBlockingQueue(1) { + + private static final long serialVersionUID = -9028058603126367678L; + + @Override + public boolean offer(Runnable e) { + try { + put(e); + return true; + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + } + return false; + } + }); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 124ca6412..9f038bc87 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -6,7 +6,8 @@ import java.util.regex.Pattern; /** - * User: cairne + * url及html处理工具类。
+ * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:52 */ @@ -14,7 +15,13 @@ public class UrlUtils { private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/"); - public static String fixRelativeUrl(String url, String refer) { + /** + * 将url相对地址转化为绝对地址 + * @param url url地址 + * @param refer url地址来自哪个页面 + * @return url绝对地址 + */ + public static String canonicalizeUrl(String url, String refer) { if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { return url; } @@ -62,12 +69,12 @@ public static String getHost(String url) { private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); - public static String removeProtocal(String url) { + public static String removeProtocol(String url) { return patternForProtocal.matcher(url).replaceAll(""); } public static String getDomain(String url) { - String domain = removeProtocal(url); + String domain = removeProtocol(url); int i = StringUtils.indexOf(domain, "/", 1); if (i > 0) { domain = StringUtils.substring(domain, 0, i); @@ -75,7 +82,7 @@ public static String getDomain(String url) { return domain; } - private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"']*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); + private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); public static String fixAllRelativeHrefs(String html, String url) { StringBuilder stringBuilder = new StringBuilder(); @@ -84,7 +91,7 @@ public static String fixAllRelativeHrefs(String html, String url) { while (matcher.find()) { stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); stringBuilder.append(matcher.group(1)); - stringBuilder.append("\"" + fixRelativeUrl(matcher.group(2), url) + "\""); + stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } stringBuilder.append(StringUtils.substring(html, lastEnd)); diff --git
a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html new file mode 100644 index 000000000..bfbe8dfcd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html @@ -0,0 +1,5 @@ + + +提供一些处理链接的静态工具类。 + + diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index f79909840..c90001460 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.selector.Html; /** - * User: cairne + * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午8:42 */ @@ -14,7 +14,8 @@ public class HtmlTest { @Test public void testRegexSelector() { Html selectable = new Html("aaaaaaab"); - Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); + Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java deleted file mode 100644 index 5cb9848ff..000000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ /dev/null @@ -1,131 +0,0 @@ -package us.codecraft.webmagic; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.samples.HuxiuProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; - -/** - * User: cairne - * Date: 13-4-20 - * Time: 下午7:46 - */ -public class SpiderTest { - - - @Test - public void testSpider() throws InterruptedException { - Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); - me.run(); - } - - @Test - public void testGlobalSpider(){ -// PageProcessor pageProcessor = new MeicanProcessor(); -// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). -// processor(pageProcessor).run(); - SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); - pageProcessor2.getSite().setEncoding("GBK"); - System.out.println(pageProcessor2.getSite().getEncoding()); - Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")). 
- processor(pageProcessor2).run(); - - - } - - @Test - public void test(){ - System.out.println(System.getProperty("java.io.tmpdir")); - } - - - @Ignore - @Test - public void languageSchema() { - - - /** - * - * _hrefs = rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") - * title = r(""(.*)"") - * body = x("//dd[@class='w133']") - * - * site.domain = "sh.58.com" - * site.ua="" - * site.cookie="aa:bb" - * - */ - - /** - * - * - * if (page == r('') && refer(1) == 1) { - * - * type = _refer(1) - * content = _text.t().c() - * title = x("asd@asd").r("",1) - * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) - * - * body=body[r(_currentUrl).g(1)] - * tags[%] = (tags[%] + xs('')) . r('') - * - * _targetUrls.add('' + x('').r('')) - * _sourceUrls.add() - * _header.put("",""); - * _cookie.add("asdsadasdsa"); - * - * - * } - * - * _cookie.add(_cookie['']) - * - * if (page == r('') && refer(1) == 1) - * ( - * _targetUrl = '' + x('') & r('') - * _sourceUrl = '' - * ) - * - */ - - /** - * - * - * - * - * - * - * - * - * - * - */ - - /** - * - * if (model.url('') && model.refer(1) == 1) - * ( - * - * model.set(type, model.refer(1)) - * content = t(_html) > c() - * title = x(_html, 'asd@asd') > r('',1) - * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') - * tags[%] = tags + xs('') > r('') - * model.setTargetUrl(); - * - * _targetUrl = '' + x('') & r('') - * _sourceUrl = '' - * ) - * - * _cookie.add(_cookie['']) - * - * if (page == r('') && refer(1) == 1) - * ( - * _targetUrl = '' + x('') & r('') - * _sourceUrl = '' - * ) - * - */ - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java new file mode 100644 index 000000000..936aece62 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -0,0 +1,25 @@ +package 
us.codecraft.webmagic.downloader; + +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; + +/** + * Author: code4crafter@gmail.com + * Date: 13-6-18 + * Time: 上午8:22 + */ +public class HttpClientDownloaderTest { + + @Ignore + @Test + public void testCookie() { + Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site.toTask()); + Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java deleted file mode 100644 index 7aa2fc77d..000000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.junit.Test; - -import java.io.IOException; -import java.net.URL; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 上午10:35 - */ -public class HtmlCleanerTest { - - @Test - public void test() throws IOException { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - - CleanerProperties props = htmlCleaner.getProperties(); - - TagNode node = htmlCleaner.clean(new URL("http://www.huanqiu.com"),"UTF-8"); - System.out.println(node.getAllElementsList(true)); - System.out.println(node); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 6128f1708..849a4d6e5 100644 ---
a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * User: cairne + * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午7:13 */ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java deleted file mode 100644 index 4620a242b..000000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java +++ /dev/null @@ -1,3051 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.junit.Test; - -import java.io.IOException; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 上午10:35 - */ -public class SmartConentSelectorTest { - - @Test - public void test() throws IOException { - String text ="\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 全文Feed的终极解决方案 - 阮一峰的网络日志\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
阮一峰的网络日志 » 首页 » 档案\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "

分类

\n" + - " \n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "

全文Feed的终极解决方案

\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "

作者: 阮一峰

\n" + - "\n" + - "

日期: 2010年4月17日

\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - " \n" + - "

正如我们都知道的,全文Feed最有用。

\n" + - " \n" + - "\n" + - "\n" + - " \n" + - "

但是,世界上的大部分Feed,都是摘要Feed,甚至是标题Feed。我们只好自己动手,制作全文Feed。

\n" + - "\n" + - "

传统的制作方法非常麻烦,需要针对不同的网站,编写不同的内容提取规则。要是有一个傻瓜型的\"全文Feed生成器\",把摘要Feed往里面一扔,全文Feed就自动生成了,那该多好。

\n" + - "\n" + - "

FiveFilters.org提供的生成器,大概最接近于这种要求。

\n" + - "\n" + - "

\n" + - "\n" + - "

举例来说,网易的社会新闻Feed(http://news.163.com/special/00011K6L/rss_sh.xml)是一个摘要Feed。

\n" + - "\n" + - "

\n" + - "\n" + - "

我们把这个网址,送进FiveFilters.org,点击\"Create Feed\"按钮,全文Feed就自动产生了!(查看效果

\n" + - "\n" + - "

但是,这个生成器并不是百用百灵,比如新浪的Feed(http://rss.sina.com.cn/news/society/focus15.xml)就无法抓取全文。

\n" + - "\n" + - "

好在今年3月份,它开源了。作者Keyvan Minoukadeh将所有代码都公开了,所以如果遇到不能生效的Feed,现在我们就可以修改源码了。因此理论上,几乎所有的摘要Feed都可以自动转成全文Feed了。

\n" + - "\n" + - "

源码存放在launchpad.net上,需要安装Bazaar的客户端才能下载。我为大家提供方便,把它们压缩成一个zip文件,点击下载(1.0版,217KB)。

\n" + - "\n" + - "

下载后,上传到支持PHP 5.2的虚拟主机上,就可以直接使用。使用的时候,需要将cache子目录设为可写(权限777)。在config-sample.php文件中,可以查看设置选项,修改默认值后,将文件名改为config.php,就会生效。(不修改亦可,config文件并不是必需的。)

\n" + - "\n" + - "

这个程序的核心是readability.php文件,它负责判断当前网页中,那一部分属于页面的主要内容,然后将其抓取出来。实现原理照搬了arc90的ReadAbility脚本。简单说,思路是这样的:1)检查页面中所有p元素的父容器;2)根据相关特征,为每一个父容器计算一个特征值;3)特征值最大的容器,就是放置主要内容的容器。

\n" + - "\n" + - "

具体实现请阅读代码,源码写得非常清晰,而且有详细的注释。如果遇到不能抓取全文的Feed,你就要自己修改readability.php,增加相应的规则。比如,在我提供下载的代码中,我就设置了新浪网的规则,新浪网的全文Feed就能自动生成了。

\n" + - "\n" + - "

这个程序使用的是AGPL许可证,这就是说你可以自由地使用、修改、发布这个程序,但是只要你向他人提供基于这个程序的服务,你就必须公开源码。

\n" + - "\n" + - "

作者Keyvan Minoukadeh允诺,只要使用者向他捐款200美元,就发布2.0版。如果你喜欢这个程序,建议向他捐款

\n" + - "\n" + - "

P.S.

\n" + - "\n" + - "

这几天,我还发现了一个非常优秀的开源相册软件ZenPhoto,也推荐使用。

\n" + - "\n" + - "

UPDATE(2010.6.3)

\n" + - "\n" + - "

Full TEXT RSS 1.5版下载(283KB)

\n" + - "\n" + - "

UPDATE(2010.11.10)

\n" + - "\n" + - "

Full TEXT RSS 2.1版下载(362KB)

\n" + - "\n" + - "

(完)

\n" + - " \n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "

文档信息

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - "
\n" + - "
\n" + - "

相关文章

\n" + - "
    \n" + - "\n" + - "
  • 2012.12.21: Javascript异步编程的4种方法\n" + - "\n" + - "
    \n" + - " 你可能知道,Javascript语言的执行环境是\"单线程\"(single thread)。\n" + - "
    \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - "
  • 2012.12.14: 奥巴马筹款网站的制作过程\n" + - "\n" + - "
    \n" + - " 1.\n" + - "\n" + - "Kyle Rush是一个网站工程师。\n" + - "
    \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "

功能链接

\n" + - "
    \n" + - "
  • 前一篇:\"草原新城\"康巴什
  • \n" + - "
  • 后一篇:网络时代的音乐家生存指南
  • \n" + - "
  • 更多内容请访问:首页 » 档案 » \n" + - "IT技术 \n" + - "
  • \n" + - "\n" + - "
  • \n" + - "\n" + - "
    \n" + - "\n" + - "站内搜索:\n" + - "\n" + - "\n" + - "Web\n" + - "\n" + - "www.ruanyifeng.com\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
    \n" + - "\n" + - "
  • \n" + - "
  • Feed订阅:
  • \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "

广告(购买广告位)

\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "

留言(23条)

\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " zp\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

能不能介绍些Movable Type的文章,我比较喜欢它的静态页面,国内关于它的资料好像还不多。特别是MT5出来后,多页面功能可能会让刚接触的人晕头转向。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 火点\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

很好,谢谢作者,只是赶到花了大量的时间在新闻上似乎有点不利于思考。

\n" + - "\n" + - "

用一个图书管理软件(BLM)整理了大学期间看过的书,仅有180本左右,汗颜,这就是我的大学……

\n" + - "\n" + - "

现在参加工作了,好在业余时间还算充裕,希望可以多读一些书。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " AlbertDiao\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 野草博客\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

嗯,野草一直在用他:)

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用AlbertDiao的发言:
\n" + - "\n" + - "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + - "\n" + - "
\n" + - "\n" + - "

流量会越来越便宜,真正昂贵的是你的时间。所以还是全文Feed好。

\n" + - "\n" + - "
\n" + - "
引用zp的发言:
\n" + - "\n" + - "

能不能介绍些Movable Type的文章。

\n" + - "\n" + - "
\n" + - "\n" + - "

我有这个打算,但是文章不太好写,还需要准备。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " luops\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

昨晚测试了此订阅
\n" + - " 同时我也保留了原订阅。
\n" + - "今天发现,同样订阅了163新闻的情况下
\n" + - "全文订阅比官方订阅少了很多新闻
\n" + - "不知其他童靴有没有这样子情况

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 鲜为人志\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

呵呵~ 这样都可以啊~

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " roy_hu\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用AlbertDiao的发言:
\n" + - "\n" + - "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + - "\n" + - "
\n" + - "\n" + - "

我更喜欢全文博客,因为在手机上看Google Reader,自动都排好了版,而看全文的时候需要浏览器排版,没有Google Reader那样专门设计给手机的看着舒服。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Jack\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

另外,也可以用YAHOO PIPE 和YQL来抓取全文。这样除了可以把非全文的FEED变成全文输出外,还可以处理根本没有FEED输出的网页。(不过有很多网页需要处理一下GB2312和UNICODE转换。).而且这样还有一个最大的好处,就是不用建立自己的服务器。

\n" + - "\n" + - "


\n" + - "下面两个FEED 就是用这种办法生成的。
\n" + - "http://feeds.feedburner.com/wenxuecity_news

\n" + - "\n" + - "

http://feeds.feedburner.com/boxun_headline

\n" + - "\n" + - "

可以用GOOGLE READER 来读取它们。也不失为一种间接翻越G/F/W 的办法。
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用luops的发言:
\n" + - "\n" + - "

全文订阅比官方订阅少了很多新闻

\n" + - "\n" + - "
\n" + - "\n" + - "

全文Feed默认只有4个条目,下载代码后,你可以自己修改这个值。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 坏坏鼠\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

不懂编程只会用GR的文科生飘过~~~
\n" + - "ps:阮老师的这篇文章GR里也只是显示标题,所以漂洋过海地过来了(牛博编辑的那个频道,已经将你的博客订阅了呵O(∩_∩)O)~~

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 111\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用luops的发言:
\n" + - "\n" + - "

全文订阅比官方订阅少了很多新闻

\n" + - "\n" + - "
\n" + - "\n" + - "


\n" + - "是这样的,丢失了好多,时效性好差
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " kuber\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

想请教一下你怎么修改规则来全文输出新浪网rss的, 我也碰到几个Feed,缺省的配置不能正确处理.
\n" + - "另外我建议设立一个地方大家可以交流一下脚本不能处理的feed,以及修改的方法, 这样各人不用重复浪费时间了.

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 111\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

下载了lz的代码,发布到网站上,功能可用了。rss数量自己设置就好。

\n" + - "\n" + - "

杯具的是网站只有内网地址,gr不认生成的feed地址。

\n" + - "\n" + - "

只能CS订阅,不喜。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " lietlie\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

http://mrss.dokoda.jp/
\n" + - "虽然是小鬼子的网站,但是是我找到的能够全文Feed最好的在线工具了,和LZ推荐的网站相比,可以输出所有项目,而没有4条目的限制,当然也不必自己搭建服务器,日文内容很简单,如果使用的是FF或Chrome浏览器还可以利用Google的自动翻译功能将大致内容翻译为中文(FF利用Google Toolbar)——其实即使不翻译一样很容易使用。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用kuber的发言:
\n" + - "\n" + - "

想请教一下你怎么修改规则来全文输出新浪网rss的, 我也碰到几个Feed,缺省的配置不能正确处理.

\n" + - "\n" + - "
\n" + - "\n" + - "

新浪的内容容器,有一个比较怪的ID名。只要搜索这个字符串,就能提取内容了。

\n" + - "\n" + - "

最终,你还是需要读readability.php的代码,只要读懂了,我觉得任何页面都能提取。
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 诗沐\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

哇 源码写得相当清爽啊~注释习惯很棒

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " xangd\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

有人在appspot上部署了一个python的port
\n" + - "http://andrewtrusty.appspot.com/readability/
\n" + - "这个没有4篇post的限制

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " neotrue\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

很好用,谢谢!

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " harvey\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

博主,作者把1.5版本放出来了,
\n" + - "可否再麻烦你打包一下,我bazzar一直不成功

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用harvey的发言:
\n" + - "\n" + - "

博主,作者把1.5版本放出来了,
\n" + - "可否再麻烦你打包一下,我bazzar一直不成功

\n" + - "\n" + - "
\n" + - "\n" + - "

已经加上去了,:-)

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 张治国\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

博主,全文Feed默认只有4个条目,下载代码后,修改哪段代码可以改变这个值啊,config-sample.PHP中的数值吗?我是新手,希望博主指点一下,谢谢。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " felix\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

看不懂readability,不知道博主能否提供一下过滤页面上的干扰字符的方法
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - "
\n" + - "

我要发表看法

\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "

\n" + - "

\n" + - "
\n" + - "
\n" + - "

\n" + - "

«-必填

\n" + - "
\n" + - "
\n" + - "

\n" + - "

«-必填,不公开

\n" + - "
\n" + - "
\n" + - "

\n" + - "

«-我信任你,不会填写广告链接

\n" + - "
\n" + - "
\n" + - "

\n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

正在发表您的评论,请稍候

\n" + - "

\n" + - " \n" + - "\n" + - "

\n" + - "
\n" + - "\n" + - "

«- 点击按钮

\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "

联系方式 | ruanyifeng.com 2003 - 2012\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
分享按钮 \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - ""; - Html html = new Html(text); - Selectable sc = html.sc(); - System.out.println(sc); - } - - @Test - public void test2(){ - String text = "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " 地球上最后的夜晚 (豆瓣)\n" + - " \n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - " 提醒\n" + - " \n" + - "
\n" + - "
\n" + - "

加载中...

\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "
\n" + - "
    \n" + - " \n" + - " \n" + - "
  • \n" + - " 豆瓣\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 读书\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 电影\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 音乐\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 同城\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 小组\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 阅读\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 豆瓣FM\n" + - "
  • \n" + - " \n" + - "
  • \n" + - " 更多\n" + - "
    \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
    九点
    阿尔法城
    移动应用
    \n" + - "
    \n" + - "
  • \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " 豆瓣读书\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " 搜索:\n" + - " \n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "

\n" + - " 地球上最后的夜晚\n" + - "
\n" + - "

\n" + - "\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \"地球上最后的夜晚\"\n" + - " \n" + - "\n" + - "
\n" + - "

\n" + - " 更新描述或封面\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 原作名: Last Evenings on Earth
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 作者: \n" + - " \n" + - " [智利] 罗贝托·波拉尼奥\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 译者: \n" + - " \n" + - " 赵德明\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 出版社: 上海人民出版社
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 出版年: 2013-4-1
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 页数: 288
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 定价: 45.00元
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 丛书: 罗贝托·波拉尼奥作品系列
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " ISBN: 9787208112025
\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " 8.4\n" + - " \n" + - "\n" + - " \n" + - "

\n" + - "

\n" + - " (\n" + - " \n" + - " 11人评价\n" + - " \n" + - " )\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 45.5%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 9.1%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 18.2%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 18.2%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 9.1%
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "
\n" + - " \n" + - " 想读\n" + - " \n" + - " \n" + - " 在读\n" + - " \n" + - " \n" + - " 读过\n" + - " \n" + - "
\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " 评价: \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "
    \n" + - "
  • \n" + - "  写笔记\n" + - "
  • \n" + - "\n" + - "
  • \n" + - "  写书评\n" + - "
  • \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - " 加入购书单\n" + - " 已在购书单\n" + - "
    \n" + - "
  • \n" + - "\n" + - "
  • \n" + - " \n" + - "\n" + - "\n" + - "
    \n" + - " \n" + - "\n" + - " \n" + - "\n" + - " \n" + - "\n" + - " 添加到豆列\n" + - "
    \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - " \n" + - " \n" + - "
  • \n" + - " 分享到   \n" + - "
  • \n" + - " \n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - " 推荐\n" + - " \n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 内容简介\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 作者简介\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "

罗贝托•波拉尼奥(Roberto Bolaño,1953—2003)出生于智利,父亲是卡车司机和业余拳击手,母亲在学校教授数学和统计学。1968年全家移居墨西哥。1973年波拉尼奥再次回到智利投身社会主义革命却遭到逮捕,差点被杀害。逃回墨西哥后他和好友推动了融合超现实主义、达达主义以及街头剧场的“现实以下主义”(Infrarealism)运动,意图激发拉丁美洲年轻人对生活与文学的热爱。1977年他前往欧洲,最后在西班牙波拉瓦海岸结婚定居。2003年因为肝脏功能损坏,等不到器官移植而在巴塞罗那去世,年仅五十岁。

波拉尼奥四十岁才开始写小说,作品数量却十分惊人,身后留下十部小说、四部短篇小说集以及三部诗集。1998年出版的《荒野侦探》在拉美文坛引起的轰动,不亚于三十年前《百年孤独》出版时的盛况。而其身后出版的《2666》更是引发欧美舆论压倒性好评,均致以...

(展开全部)

\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "

罗贝托•波拉尼奥(Roberto Bolaño,1953—2003)出生于智利,父亲是卡车司机和业余拳击手,母亲在学校教授数学和统计学。1968年全家移居墨西哥。1973年波拉尼奥再次回到智利投身社会主义革命却遭到逮捕,差点被杀害。逃回墨西哥后他和好友推动了融合超现实主义、达达主义以及街头剧场的“现实以下主义”(Infrarealism)运动,意图激发拉丁美洲年轻人对生活与文学的热爱。1977年他前往欧洲,最后在西班牙波拉瓦海岸结婚定居。2003年因为肝脏功能损坏,等不到器官移植而在巴塞罗那去世,年仅五十岁。

波拉尼奥四十岁才开始写小说,作品数量却十分惊人,身后留下十部小说、四部短篇小说集以及三部诗集。1998年出版的《荒野侦探》在拉美文坛引起的轰动,不亚于三十年前《百年孤独》出版时的盛况。而其身后出版的《2666》更是引发欧美舆论压倒性好评,均致以杰作、伟大、里程碑、天才等等赞誉。苏珊•桑塔格、约翰•班维尔、科尔姆•托宾、斯蒂芬•金等众多作家对波拉尼奥赞赏有加,更有评论认为此书的出版自此将作者带至塞万提斯,斯特恩,梅尔维尔,普鲁斯特,穆齐尔与品钦的同一队列。

\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 目录\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " 圣西尼……………………………………3
\n" + - " 亨利·西蒙·勒普兰斯…………………… 27
\n" + - " 恩里克·马丁……………………………39
\n" + - " 一件文学奇事…………………… ……59
\n" + - " 通话…………………… ………………75
\n" + - " 毛毛虫…………………………………83
\n" + - " · · · · · ·\n" + - " (更多)\n" + - "
\n" + - "\n" + - "
\n" + - " 圣西尼……………………………………3
\n" + - " 亨利·西蒙·勒普兰斯…………………… 27
\n" + - " 恩里克·马丁……………………………39
\n" + - " 一件文学奇事…………………… ……59
\n" + - " 通话…………………… ………………75
\n" + - " 毛毛虫…………………………………83
\n" + - " 安妮·穆尔的生平 ……………………101
\n" + - " “小眼”席尔瓦 ………………………139
\n" + - " 戈麦斯帕拉西奥 ……………………159
\n" + - " 地球上最后的夜晚………………… 173
\n" + - " 1978 年的几天………………………205
\n" + - " 在法国和比利时闲逛…………………225
\n" + - " 牙科医生…………………… ………245
\n" + - " 邀舞卡……………………………… 273
\n" + - " · · · · · · (收起)\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " "地球上最后的夜晚"试读\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - "

情况是这样的:B 和B 父去阿卡普尔科度假。一大早,清晨六点,父子俩就要出发。那天夜里,B 睡在父亲家里。没梦,或者就算有梦,一睁眼也忘了。听见父亲在卫生间。向窗外望去,一片漆黑。B 不开灯,穿衣裳。等走出卧室的时候,父亲已经在桌旁看前一天的体育报纸了。早饭已经做好了。咖啡,牧场煎蛋。B 问候父亲后,走进卫生间。\n" + - "B 父的汽车是1970 年的福特野马。六点半,父子俩上车,开..

\n" + - "\n" + - "
· · · · · · (查看全部试读)
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 豆瓣成员常用的标签(共38个)\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
罗贝托-波拉尼奥(68)   拉美文学(35)   外国文学(24)   小说(22)   智利文学(14)   波拉尼奥(10)   智利(10)   小说集(10)  
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "

丛书信息

\n" + - "
\n" + - "  罗贝托·波拉尼奥作品系列 (共6册),\n" + - "这套丛书还有\n" + - "《2666》,《荒野侦探》,《2666》,《荒野侦探》,《护身符》。
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - " 书评  · · · · · · \n" + - "

\n" + - " \n" + - " 我来评论这本书\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \"DeadKennedy\"/\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \">\"\n" + - " \n" + - " \"<\"\n" + - "
\n" + - " 信仰的挽歌\n" + - "

\n" + - "
\n" + - " \n" + - " DeadKennedy   \n" + - " \n" + - " \n" + - "

\n" + - "
\n" + - " Elegy to Faith\n" + - "\n" + - "\n" + - "波拉诺难得的短篇集。\n" + - "\n" + - "\n" + - "比之长篇,波拉诺的短篇是其能力的代表。他的长篇像话剧台词,冗长,精彩,让人迷失其中,在读过大概三百页之后似乎明白一些他在说什么。而他的短篇则像电台DJ的串词,明了,信息丰富,基本是波拉诺的自传和自白。很多篇目就是作家自身经历的镜像。是一些关于动荡,个人自由,劳动,知识份子,流放和坚持的故事。纽约时报评论这本书为“流放民谣”。\n" + - "\n" + - "\n" + - "比如写自身经历的:......\n" + - "\n" + - "

\n" + - " \n" + - " 2012-02-14 13:53    \n" + - " 2/2有用\n" + - " \n" + - " \n" + - " 来自 New Directions2007版\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \"DeadKennedy\"/\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \">\"\n" + - " \n" + - " \"<\"\n" + - "
\n" + - " 信仰的挽歌\n" + - "

\n" + - "
\n" + - " \n" + - " DeadKennedy   \n" + - " \n" + - " \n" + - "

\n" + - "
\n" + - " Elegy to Faith\n" + - "\n" + - "\n" + - "波拉诺难得的短篇集。\n" + - "\n" + - "\n" + - "比之长篇,波拉诺的短篇是其能力的代表。他的长篇像话剧台词,冗长,精彩,让人迷失其中,在读过大概三百页之后似乎明白一些他在说什么。而他的短篇则像电台DJ的串词,明了,信息丰富,基本是波拉诺的自传和自白。很多篇目就是作家自身经历的镜像。是一些关于动荡,个人自由,劳动,知识份子,流放和坚持的故事。纽约时报评论这本书为“流放民谣”。\n" + - "\n" + - "\n" + - "比如写自身经历的:......\n" + - "\n" + - "

\n" + - " \n" + - " 2012-02-14 13:53    \n" + - " 2/2有用\n" + - " \n" + - " \n" + - " 来自 New Directions2007版\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 论坛\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
平装or精装?孔亚雷or赵德明?来自Nihilum5 回应2013-04-21
書到底出了沒啊?来自阿城199114 回应2013-04-13
不是翻译问题,是根本看不懂来自呆呆双鱼女1 回应2013-04-20
\n" + - "\n" + - "\n" + - "

>\n" + - " 在这本书的论坛里发言\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "

\n" + - " 在哪儿买这本书?\n" + - "

\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 加入购书单\n" + - " \n" + - " 已在购书单 \n" + - " 查看\n" + - " 删除\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " 多本比价,批量购买\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 这本书的其他版本 \n" + - "  · · · · · ·\n" + - "  (\n" + - " 全部3\n" + - " ) \n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 以下豆列推荐\n" + - "  · · · · · ·\n" + - "  (\n" + - " 全部\n" + - " ) \n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "

谁读这本书?

\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - " \n" + - "
\"小K\"
\n" + - "
小K
\n" + - "
13分钟前 想读
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \n" + - "
\"杰森辛普森\"
\n" + - "
杰森辛普森
\n" + - "
28分钟前 想读
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \n" + - "
\"Aby\"
\n" + - "
Aby
\n" + - "
37分钟前 想读
\n" + - "\n" + - "
\n" + - "\n" + - " tags:对人生的诠释\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \n" + - "
\"老男孩\"
\n" + - "
老男孩
\n" + - "
1小时前 想读
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - "\n" + - "\n" + - "

> 5人在读

\n" + - "

> 12人读过

\n" + - "

> 658人想读

\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 喜欢这本书的人常去的小组\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "
\"托马斯·品钦\"/
\n" + - " \n" + - "
托马斯·品钦 (711)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"短经典\"/
\n" + - " \n" + - "
短经典 (787)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"寻找:布鲁诺.舒尔茨\"/
\n" + - " \n" + - "
寻找:布鲁诺.舒尔茨 (466)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"胡安·鲁尔福\"/
\n" + - " \n" + - "
胡安·鲁尔福 (613)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"V.S.奈保尔\"/
\n" + - " \n" + - "
V.S.奈保尔 (445)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"胡利奥·科塔萨尔\"/
\n" + - " \n" + - "
胡利奥·科塔萨尔 (1053)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"中国当代书籍装帧摭评\"/
\n" + - " \n" + - "
中国当代书籍装帧摭评 (1373)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"泼先生\"/
\n" + - " \n" + - "
泼先生 (485)\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "
\n" + - "

> 收藏这本书的1个小组

\n" + - "
\n" + - "

> \n" + - " 加到我的小组收藏里 \n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

二手市场

\n" + - "
\n" + - "
    \n" + - "
  • \n" + - " > 点这儿转让\n" + - "\n" + - " 有658人想读,手里有一本闲着?\n" + - "
  • \n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "

订阅关于地球上最后的夜晚的评论:
\n" + - " feed: rss 2.0

\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " © 2005-2013 douban.com, all rights reserved\n" + - "\n" + - "\n" + - "\n" + - " 关于豆瓣\n" + - " · 在豆瓣工作\n" + - " · 联系我们\n" + - " · 免责声明\n" + - " \n" + - " · 帮助中心\n" + - " · 开发者\n" + - " · 图书馆合作\n" + - " · 手机读书\n" + - " · 豆瓣广告\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n"; - - Html html = new Html(text); - System.out.println(html.sc()); - - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java deleted file mode 100644 index 96ea6e8cc..000000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ /dev/null @@ -1,2750 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.junit.Assert; -import org.junit.Test; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 上午10:06 - */ -public class XpathSelectorTest { - - String huxiuHtml = "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "产品情感化设计的两个层面-观点-@虎嗅网\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t\t\t\n" + - "\t\n" + - "
\n" + - "

\"虎嗅网\"

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "订阅虎嗅\n" + - "RSS\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "

产品情感化设计的两个层面

\n" + - "\n" + - "
\n" + - "
\n" + - " 2013-4-22 16:10\n" + - " \n" + - " \t评论(0)\n" + - " \n" + - "产品\n" + - "投稿\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
\n" + - " \"\"\n" + - " 用户之所以选择一款产品,首要的一点在于产品的功能或内容满足了用户。而随着产品的发展,同类型的产品基础功能都大致相同,产品之间的竞争越来越难在功能层面拉开差距。现在产品人员也更加开始在用户体验上下功夫了,而对用户体验的不断追求也就上升到了情感层面。

谈起产品情感化设计,可以拿手机通讯录中添加联系人头像来举例子,单就这个功能点而言,最基础的只要用户能够添加联系人的头像即可,而如果在这个功能上添加用户情感化的元素后,就可以在用户的头像展示上给予更大空间,让用户能够更大的发挥自己的个性。我们也发现新浪微博和开心网个人主页的设计也都增加了个人封面的展示。产品情感化对于功能本身是没有影响的,而情感因素后,产品对用户还会更有吸引力。短期来看,个性化和给用户更大的发挥空间是产品情感化设计的两个很重要的方向。

产品的情感化设计有两个不同的做法:一个是在已有功能上进行扩展,如上文所提到的通讯录中上传头像的功能,是对用户表达欲的满足,用户情感的单向表达;另一种做法则是做一个完全情感化的产品,用户情感的双向表达,是用户之间情感内容的交流,产品扮演的只是桥梁作用,例如小恩爱、抬杠这样的产品。其实所有涉及到用户互动性的产品对于情感化的拓展空间都很大,但是与普通社交不同的是,产品的情感化在于人与人之间更深层次的交流。在我个人看来,社交网站中的发状态功能已经仅仅是用户表达的工具,极少含有感情因素,但是像Facebook推出的暗恋功能却是一个情感化产品,产品的情感化不仅在于让用户将自己的情感寄予到产品中,而且产品要想具有情感化很重要的一点在于产品本身能够起到挖掘用户情感的作用。

前面所提到的两种做法区别在于,前者是基于已有需求而进行的情感化设计,而后者则是完全情感化的产品,就成功率来讲,显然是前者更大一些。本身有需求的产品对于产品的情感化发展不仅奠定了基础,而且也烘托了氛围,做好了铺垫。如果是做一个完全情感化的产品,失败的可能性很大。当产品的功能满足了用户的情感表达,那就意味着产品可以满足用户的需求,而当产品本身所扮演的角色无法成为用户的寄托,那么产品就会面临失败。可想而知,情感化的产品肯定属于UGC类型,对于用户内容的质量要求会比较高,当技术水平不够高、功能操作不够便捷的时候,自然就提高了使用门槛。而且这种类型的产品对于氛围的烘托本身就会有相对高的要求。

如果单从功能角度去衡量,用户情感的单向表达属于功能层面,而用户情感的双向表达属于内容层面。除此之外,产品情感化还有文案和产品风格上的表现。

你是一个资深网虫,或许你也有所感觉,现在的网站文案已经越来越有人情味了。例如提示文案不是“你的账号密码错误”而是“密码不对哦”,文案中增加了语气词。这只是其中的一种表达方式,除此之外,你会看到产品设计中的很多引导方式也更有趣味性,文案内容的情感化也会增加用户的接受程度。

最近自己在使用产品中也有个很大的感触,就是产品风格对用户的吸引,同样是天气类应用,功能上相差无几,但是不同的风格却可以吸引不同的受众。有的是大众普通的风格,有的是小清新风格,有的是卡通风格等等,可以理解为用户对不同风格产品的选择背后的原因就是用户个人情感的不同,而用户的这种情感不能改变只能顺从。

更深层次的讲,产品情感化的关键在于产品功能与用户情感的承接,满足人们情感的诉求。从心理学上讲人的本性有很多,例如表达欲、攀比心理,但从人的本性和产品的情感化进行匹配,会有太多的点,在这里就不一一例举了,大家可以在产品的使用过程中逐渐感受。而之所以要选择利用人性情感的哪一点来设计产品就要根据具体的产品目标来衡量了。

文章来源:马虎眼    作者微信账号:mahuyan


本文由\n" + - "云瑞\n" + - "授权虎嗅网发表,并经虎嗅网编辑。转载此文章须经作者同意,并请附上出处(虎嗅网)及本页链接。
原文链接http://www.huxiu.com/article/13380/1.html\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - " \n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " 分享(0):\n" + - "
\n" + - "
\n" + - "
    \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - " \n" + - "
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
收藏\n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - " 没劲 \n" + - " 喜欢 \n" + - "
\t\t \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "

参与讨论,请先登录|注册

\n" + - "

\n" + - "\n" + - "\n" + - "

\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\t\t\t
\n" + - "

作者:云瑞

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
个人签名
\n" + - "
人人都爱互联网
\n" + - "
\n" + - "
\n" + - "\t\t\t\t\n" + - "

作者其他文章

\n" + - "\n" + - " \n" + - "更多文章\n" + - "
\n" + - "\n" + - "
\n" + - "

您不能错过的作者

\n" + - "
  • \n" + - "

    \"葛甲\"

    \n" + - "

    葛甲

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"吴澍\"

    \n" + - "

    吴澍

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"知乎精选\"

    \n" + - "

    知乎精选

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"译言\"

    \n" + - "

    译言

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"潘乱\"

    \n" + - "

    潘乱

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"王云辉\"

    \n" + - "

    王云辉

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"阑夕\"

    \n" + - "

    阑夕

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"胡晓东\"

    \n" + - "

    胡晓东

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"阳淼\"

    \n" + - "

    阳淼

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"魏武挥\"

    \n" + - "

    魏武挥

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"高低买个皮夹克\"

    \n" + - "

    高低买个皮夹克

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"潘越飞\"

    \n" + - "

    潘越飞

    \n" + - "
  • \n" + - "\t\n" + - "
\n" + - "
\n" + - "\n" + - " \t\t\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\t
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "

关于我们|加入我们|广告及服务|常见问题解答|提交建议\n" + - "\n" + - "

\n" + - "

Copyright © 虎嗅网\n" + - "( 京ICP备12013432 )

\n" + - "
\n" + - "
\n" + - "\n" + - " 
\n" + - "\n" + - "回顶部\n" + - "\n" + - "\t\t\t
\n" + - "\t\t\t\n" + - "\t\t\t\n" + - "\n"; - - String blogHtml = "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 一个基于Python装饰器的用户输入验证设计方案 - SamChi的个人空间 - 开源中国社区\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - "\t\t
\n" + - " \t开源中国社区\n" + - "
\n" + - " \t\t
JetBrains 开发工具全场3折,详情»
\n" + - "
\n" + - " \t\n" + - "
\n" + - "
\n" + - "\t
\n" + - "\t
\n" + - "\t\t
\n" + - "\t\t当前访客身份:\n" + - "\t\t\t\t黄亿华 [ 我的空间 | 退出 ]\n" + - "\t\t\t\t\t\t\t\n" + - "\t\t\t\t\t\t你有0新留言\t\t\t\n" + - "\t\t\t\t\t\t\t\t\n" + - "\t\t
\n" + - "\t\t
\n" + - " \t\t
\n" + - "\t\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + - " \t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t
\n" + - " \t\t\t\t\n" + - " \t\t\t\t\n" + - " \t\t\t\t\n" + - "
软件
\n" + - " \n" + - "
\n" + - "\t\t\t\t\t\t\t\n" + - " \t\t
\n" + - "\t\t
\n" + - "\t\t
\n" + - "\t
\n" + - "\t
\t\n" + - "\n" + - "
\n" + - "
\n" + - "\t \"SamChi\"\n" + - " \n" + - " SamChi\n" + - "\t\t\n" + - "\t\t\t\n" + - " \t\t\t\t\t\t\t\n" + - " \n" + - "
\n" + - "
\n" + - " \t关注(21)\n" + - " \t粉丝(52)\n" + - " \t积分(37)\n" + - "
\n" + - "
\n" + - "
\n" + - "这个人很懒,啥也没写
\n" + - "\n" + - "
\n" + - "\t.发送留言\n" + - "\t.请教问题\n" + - "
\n" + - " 博客分类\n" + - " \n" + - "
\n" + - "
\n" + - " 最新评论 \n" + - "
    \n" + - "\t\t
  • \n" + - "\t\t@其斤君羊:说的很对 做什么事情都得从身边做起 更何况创业 ...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@techstan:不错\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@摩云飞:谢谢博主的总结,很有价值\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@尚楠:正在学Python,谢了\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@knightuniverse:其实我觉得,很多时候,不论是做项目还是做产品,...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@moyun:顶一个\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@SamChi:引用来自“Martinium”的评论 alert('I am admi...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@Martinium:alert('I am admin, bitch!'); 这句话亮了。...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@Ben:引用来自“ExtremeTalk”的评论 引用来自“Ben”...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@ExtremeTalk:引用来自“Ben”的评论 引用来自“ExtremeTalk”...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t
\n" + - "
\n" + - "访客统计\n" + - "
    \n" + - "\t
  • 3
  • \n" + - "
  • 33
  • \n" + - "
  • 36
  • \n" + - "
  • 842
  • \n" + - "
  • 13706
  • \n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - " \t\n" + - "\t
\n" + - "\t\n" + - " \t
\t\t\n" + - "
\n" + - "

一个基于Python装饰器的用户输入验证设计方案

\n" + - "
\n" + - " \t\t \t\t \t \n" + - "\t\t\t\t\n" + - "\n" + - "8人收藏此文章,\n" + - "\n" + - "\t\n" + - "\t\t\t\n" + - " \t\t \t\t发表于7天前(2013-04-15 16:46) , \n" + - " \t\t已有127次阅读 ,共0个评论\n" + - " \t\t \t
\n" + - "
\n" + - "\t

情景

\n" + - "

最近初学Python, 语法大概熟悉了之后就开始拿web.py做点小东西,web.py非常轻量,用起来感觉很舒服。但不过无论什么语言或者框架,web开发中有一个最大烦人之处就是表单验证,web.py提供了web.form来进行表单验证的统一处理,这个东西虽然用起来很简单,但是感觉还是不太合心意,首先这套验证机制跟web.py框架耦合的程度太高,而自己的架构是这样的,业务逻辑跟web逻辑完全分离,web仅仅是交互形式的一种,即使添加客户端C/S形式的服务或者是向开发者提供API,业务逻辑也是完全可用,不需要修改,这样对用户输入的验证是属于业务逻辑这一块,不应该跟web表单耦合在一起;另外感觉web.py这套东西还是有些简单,只支持每个表单的正则验证和最后表单提交的整体验证,而很多时候可能需要对用户进行丰富的错误提示,比如针对用户名的错误会具体到是不能为空还是长度错误或者格式错误等, 这个用web.py的form验证就感觉很别扭了。于是就决定自己设计一个用户输入的验证方案。

\n" + - "

设计

\n" + - "

web项目的开发多数都是遵循这么一个结构的设计,即DAO->Service->Controller->View, 按我前面说的,对用户的输入验证应是发生在Service这一层上,这一层的设计是接受用户输入的参数,然后进行验证处理,再进行业务相关的计算,最后输入结果。每个Service接口都应该返回一个结果,我一般都会把这个结果的内容抽象成一个一致类型的对象:

\n" + - "
class Result(object):\n" +
-            "    \n" +
-            "    u''' 操作结果抽象 '''\n" +
-            "    \n" +
-            "    def __init__(self, code, value=None):\n" +
-            "        self.code = code   #操作结果代号\n" +
-            "        self.value = value #操作结果值\n" +
-            "        \n" +
-            "    def __str__(self):\n" +
-            "        return "operation result, code: %s, value: %s" % (self.code, self.value)
\n" + - "

这个结果对象包含两个属性,一个是操作结果的代码,一个是操作的值,举个例子,比如用户注册的接口,如果注册成功,那么就会返回一个这样的Result对象,code属性是'success', value属性是新注册用户分配的ID,如果用户名已经被占用,那么code属性就是'username_exised', value属性的值是None。客户端拿到code属性的值可以做响应的处理,如果是直接面向最终用户的web应用,那么就会去找到这个code对应的错误信息来展示给用户,所有的错误信息我是组织在一个单独的Python模块中(opresult.py):

\n" + - "
reg = {\n" +
-            "       'success':u'注册成功',\n" +
-            "       'username_empty':u'用户名不得为空',\n" +
-            "       'username_format':u'用户名必须只能有数字、字母下划线组成',\n" +
-            "       'username_length':u'用户名长度必须在5到10个字符之间',\n" +
-            "       'username_existed':u'用户名已经存在',\n" +
-            "       'password_empty':u'密码不得为空',\n" +
-            "       'repassword_error':u'两次密码输入不一致',       \n" +
-            "       }
reg是注册的接口名称,这样客户端通过接口名称和code就可以获取对应的提示。 \n" + - "

由此,用户输入验证就是要把接口参数同这些code联系起来。对于参数验证,Python有天生的语言优势,那就是装饰器。一开始就想到了使用装饰器来描述参数验证需求,但这个装饰器需要哪些信息?怎么个形式?这个得从表单验证的需求开始看起,个人总结表单验证大抵不过这些判断条件:

\n" + - "

1. 是否允许为空

\n" + - "

2. 长度限制:比如密码的长度一般会不允许少于多少位

\n" + - "

3. 格式限制:比如Email地址,需要正则判断

\n" + - "

4. 逻辑限制:比如注册时判断用户名是否已经存在

\n" + - "

初步根据这些判断条件设计出这么一个方案:

\n" + - "
@checkarg(username={'allow_empty':False, \n" +
-            "                    'regex':r'^[a-zA-Z\\d_]+$',\n" +
-            "                    'min-length':5, 'max-length':10, \n" +
-            "                    'check_logic':[check_username_usable]},\n" +
-            "          password={'allow_empty':False,'regex':r'.{6,}'},\n" +
-            "          repassword={'allow-empty':False, 'check_logic':\n" +
-            "                      [(lambda **kw:(kw['password'] == kw['repassword'], "repassword_error"))]})\n" +
-            "def reg(username, password, repassword):\n" +
-            "    ....
\n" + - "

每一个参数使用一个字典来描述验证信息, allow_empty是表示是否为空,regex为验证的正则表达式,min-length和max-length用来描述长度,check_logic用来配置其他的验证逻辑。然后如何把这些验证结果同code进行匹配呢?最开始是在这个验证信息的字典中有一项'code':{'allow_empty':'username_empty'}通过这样的形式去匹配错误提示,但是感觉这样整的这个参数太复杂了(感觉现在已经挺复杂了- -b),于是决定这个地方使用约定优于配置的形式,code的值为'参数名_错误类型'的形式,比如allow_empty如果验证了为空,那么会自动返回名为username_empty的code,如果是一些额外的处理逻辑呢?没法做约定,怎么办?那么就约定这些检测函数返回一个元组,第一个元素为一个bool值,表示成功失败,第二个参数为code,表示失败原因,比如判断两次密码是否输入一致的那个lambda:

\n" + - "
lambda **kw:(kw['password'] == kw['repassword'], "repassword_error"
\n" + - "

嗯,大体就是这样的一个设计。

\n" + - "

实现

\n" + - "

根据上面的设计,把最终的装饰器实现了出来, 逻辑比较简单,关于装饰器设计的一些细节可以参阅Python参考手册:

\n" + - "
regex_cache = {}\n" +
-            "     \n" +
-            "def checkarg(**args):\n" +
-            "    \n" +
-            "    u'''参数检测装饰器'''\n" +
-            "    \n" +
-            "    def _checkarg(function):\n" +
-            "        \n" +
-            "        def __checkarg(**func_kw):\n" +
-            "            for key in func_kw:\n" +
-            "                if key in args:\n" +
-            "                    \n" +
-            "                    #要验证的值\n" +
-            "                    value = func_kw[key]\n" +
-            "                    \n" +
-            "                    #验证规则\n" +
-            "                    valid_rules = args[key]\n" +
-            "                    \n" +
-            "                    #检测空\n" +
-            "                    allow_empty = valid_rules.get('allow_empty')\n" +
-            "                    if not allow_empty:\n" +
-            "                        if not value or not value.strip():\n" +
-            "                            return Result(key + "_empty")\n" +
-            "                    elif not value:\n" +
-            "                        #如果是空的并且忽略空检测,那么下面的就不需要检查了\n" +
-            "                        continue;\n" +
-            "                    \n" +
-            "                    #检测长度\n" +
-            "                    if 'min-length' in valid_rules:\n" +
-            "                        min_length = valid_rules['min-length']\n" +
-            "                        if min_length > len(value):\n" +
-            "                            return Result(key + "_length")\n" +
-            "                        \n" +
-            "                    if 'max-length' in valid_rules:\n" +
-            "                        max_length = valid_rules['max-length']\n" +
-            "                        if max_length < len(value):\n" +
-            "                            return Result(key + "_length")\n" +
-            "                    \n" +
-            "                    #检测正则\n" +
-            "                    if 'regex' in valid_rules:\n" +
-            "                        #获取编译后的正则\n" +
-            "                        regex = valid_rules['regex']\n" +
-            "                        regexcmp = regex_cache.get(regex)\n" +
-            "                        if not regexcmp:\n" +
-            "                            regexcmp = re.compile(regex)\n" +
-            "                            regex_cache[regex] = regexcmp\n" +
-            "                        if not regexcmp.search(value):\n" +
-            "                            return Result(key + "_format")\n" +
-            "                    \n" +
-            "                    #检测其他逻辑\n" +
-            "                    check_logics = valid_rules.get('check_logic')\n" +
-            "                    if check_logics:\n" +
-            "                        for logic in check_logics:\n" +
-            "                            result, code = logic(**func_kw)\n" +
-            "                            if not result:\n" +
-            "                                return Result(code)\n" +
-            "                                \n" +
-            "            function(**func_kw)\n" +
-            "        return __checkarg\n" +
-            "                            \n" +
-            "    return _checkarg
\n" + - "\t \t \n" + - "
\n" + - "\t\t\n" + - "
\n" + - "\t \t\n" + - "\t \t \n" + - "
\t\t\n" + - "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + - "\t \t
\n" + - "\n" + - " \n" + - "\t
\n" + - "\n" + - "\t\n" + - "\t
\n" + - "\t\n" + - "\t\n" + - "\t\t分享到: \n" + - "\t\t\n" + - "\t\t\n" + - "\t\n" + - " 已有 0人顶\n" + - "\t\n" + - "\t
\n" + - "\t\t\n" + - "
\n" + - "
\n" + - "
\n" + - "

共有 0 条网友评论

\n" + - "\t\t\t

尚无网友评论

\n" + - "\t\t
    \n" + - "\t\t
\n" + - "
\n" + - "\t
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\t \n" + - "\t \n" + - "\t 文明上网,理性发言\n" + - "
\n" + - "\t回到页首 | 回到评论列表\n" + - "
\n" + - "
\n" + - "\t\n" + - "
\n" + - "\t关闭相关文章阅读\n" + - "\t\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - "\t
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + - "\t开源中国手机客户端:\n" + - "\tAndroid\n" + - "\tiPhone\n" + - "\tWP7\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - ""; - - String html = "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " 再次吐槽easyui - 开源中国 OSChina.NET\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "
\n" + - "\t
\n" + - "
\n" + - " \t\n" + - "
\n" + - "\t\t
\n" + - " \t\t \t\t黄亿华,您好 \n" + - "\t\t\t\n" + - "\t\t\t\t我的空间\n" + - "\t\t\t\t\n" + - "\t\t\t | \n" + - "\t\t\t添加软件 | 投递新闻 | 退出\n" + - " \t\t\t\t
\n" + - "\t\t
\n" + - "\t
\n" + - "
\n" + - "
\n" + - "

讨论区

\n" + - "
\n" + - "\t
当前位置:
\n" + - "\t
\n" + - "\t\t\t\t\t \t\t讨论区 »\n" + - " \t\t技术问答\t\t\t\t\t\t\t\t» EasyUI\n" + - "\t\t\t\t\t\t\t\t\t\t
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\t
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t\n" + - "\t
\n" + - "\t
\n" + - "\t\t
\"午后冬日\"
\n" + - "\t\t
\n" + - "\t\t\t

再次吐槽easyui

\n" + - "\t\t\t
\n" + - "\t\t\t\t午后冬日\n" + - "\t\t\t\t发表于 2013-4-21 02:28 13小时前,\n" + - "\t\t\t\t3回/289阅,\n" + - "\t\t\t\t最后回答: 4小时前\t\t\t\t\t\t\t\t\t\t\t
\n" + - "\t\t
\n" + - "\t\t\n" + - "\t\t
\n" + - "\t
\n" + - "\t\t \t \t\t\t\t\t\n" + - "\t\t

Java、PHP、Ruby、iOS、Python 等 JetBrains 开发工具低至 99 元(3折),详情»

\n" + - "\t\t
\n" + - "\t\t\t\t\t\t
刚用到easyui treegrid组件,发现这货第一次加载时候并没有传默认参数,展开某一列时候才传递id:xx的参数。这样和后台总是疙里疙瘩,像没事就拌嘴的两口子,查网上都遇到相同问题,最好解决方案就是通过 \n" + - "onBeforeExpand事件来扩展,自行解决。看到官方例子中简洁的代码,感觉easyui耍流氓了,真搞不懂为何要这样实现
\n" + - "\t\t\t\t\t\t
\n" + - "\t\t\t\t标签:\t\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t\t\t\tEasyUI \t\t\t\t\t\t\t\t\t\t\t
\n" + - "\t\t\t\t\t\t
\n" + - "\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t我想问同样的问题\n" + - "\t\t\t\t\t\t\n" + - "\t\t\t共0个人想要问同样的问题\n" + - "\t\t\t\t\t\t补充话题说明»\n" + - "\t\t\t
\n" + - "\t\t\t\t\t\t
\n" + - "\t
    \n" + - "
    \t\t
    \n" + - "\t\t\n" + - "\t\t
    \n" + - " \n" + - "\t\t\t\t
    \n" + - "\t\t\t
    分享到
    \n" + - "\t\t\t\n" + - "\t\t\t
    1
    \n" + - "\t\t\t\n" + - "\t\t\t
    \n" + - "\t\t\t\t\t\t\t\t \t\t\t\n" + - "\t\t\t\t\t\t\t\t0\n" + - "\t\t\t\t|\n" + - "\t\t\t\t\t\t\t\t \t\t\t\n" + - "\t\t\t\t\t\t\t\t0\n" + - "\t\t\t
    \n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t\t
    \n" + - "\t\t\t\t\t\t
    \n" + - "\t\t\t\n" + - " \t

    \t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t\t按评价排序 |\n" + - "\t\t\t\t\t显示最新答案 | 回页面顶部\n" + - "\t\t\t\t\n" + - "\t\t\t\t共有3个答案 我要回答»\n" + - "\t\t\t

    \n" + - "\t\t\t \t
    • \n" + - "\t
      \"布谷鸟\"
      \n" + - "\t
      \n" + - "\t\t
      布谷鸟 回答于 2013-04-21 09:28
      \t\t\n" + - " \t
      \n" + - "\t\t\t \t\t \t\t举报\n" + - " \t
      \n" + - "\t\t
      \n" + - "\t\t
      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      --- 共有 1 条评论 --- \n" + - "
        \n" + - "\t\t
      • \n" + - "\t\t\"午后冬日\"\n" + - "\t\t\n" + - "\t\t前端水平实在有限,自己搞的总是感觉不伦不类,只能用这些框架,再集成其它插件,切换主题时风格又不一致。\n" + - "\t\t(4小时前 by 午后冬日)\n" + - "\t\t回复\n" + - "\t\t\n" + - "\t\t
        \n" + - "\t
      • \n" + - "\t
      \n" + - "\n" + - "
      \n" + - "\t
      \t\t\t\t\t\t有帮助(1) |\n" + - "\t\t没帮助(0) |\n" + - "\t\t评论(1) |\n" + - " \t引用此答案\t
      \n" + - "
    • \n" + - "\t
      \"静风流云\"
      \n" + - "\t
      \n" + - "\t\t
      静风流云 回答于 2013-04-21 11:08
      \t\t\n" + - " \t
      \n" + - "\t\t\t \t\t \t\t举报\n" + - " \t
      \n" + - "\t\t
      \n" + - "\t\t

      没办法,原来项目也是因为客户特殊的需求,对layout选型的时候,犹豫了好久,最终放弃了。
      幸亏来了一个厉害的前端,解决问题,够用就好。

      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      --- 共有 1 条评论 --- \n" + - "
        \n" + - "\t\t
      • \n" + - "\t\t\"午后冬日\"\n" + - "\t\t\n" + - "\t\t我也是犹豫了好久,看过很多前端框架,总是不太满意。个人开发前台后台数据库全部要自己搞定,郁闷ing\n" + - "\t\t(4小时前 by 午后冬日)\n" + - "\t\t回复\n" + - "\t\t\n" + - "\t\t
        \n" + - "\t
      • \n" + - "\t
      \n" + - "\n" + - "
      \n" + - "\t
      \t\t\t\t\t\t有帮助(0) |\n" + - "\t\t没帮助(0) |\n" + - "\t\t评论(1) |\n" + - " \t引用此答案\t
      \n" + - "
    • \n" + - "\t
      \"布谷鸟\"
      \n" + - "\t
      \n" + - "\t\t
      布谷鸟 回答于 2013-04-21 11:29
      \t\t\n" + - " \t
      \n" + - "\t\t\t \t\t \t\t举报\n" + - " \t
      \n" + - "\t\t
      \n" + - "\t\t

      引用来自“布谷鸟”的答案

      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      前后端你一个人搞啊?那确实很麻烦。面面俱到的话,工作量很大。但是如果需要实现的功能不是很多,而时间也不紧迫的话,事情干起来也还不错。如非必须,建议逐步弃用这些前端框架,在一些比较能够提升体验的地方选用一些适当的插件即可,如此也不再需要担心风格的问题,你看osc后台截图,界面那叫一个丑,用得方便顺手就够了
      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      \t\t\t\t\t\t有帮助(0) |\n" + - "\t\t没帮助(0) |\n" + - "\t\t评论(0) |\n" + - " \t引用此答案\t
      \n" + - "
    \n" + - "\t\t\t\t
    \n" + - "\t\t
    \n" + - "\t\t\t
    \"黄亿华\"
    \n" + - "\t\t\t
    \n" + - "\t\t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t

    \n" + - "\t\t\t\t回答案顶部 | 回页面顶部\n" + - "\t\t\t
    \n" + - "\t\t\t
    \n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t
    \t\n" + - "\t\n" + - "\n" + - "\n" + - "\n" + - "\t
    \n" + - "\t
    \n" + - " \t\n" + - "\t
    \n" + - "\t\t
    \n" + - "\t\t\t有什么技术问题吗?\n" + - "\t\t\t我要提问\n" + - "\t\t\t
    \n" + - "\t\t
    \n" + - "\t\t\n" + - "\t\t\t\t\t\t
    \n" + - "\t\t\t全部(29)...午后冬日的其他问题\n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t\t\t\t
    \n" + - "\t\t\n" + - "\t\t
    \n" + - "\t\t\n" + - "\t\t
    \n" + - "\t\t\t类似的话题\n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t
    \n" + - "\t
    \n" + - "
    \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
    \n" + - "\t
    \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
    © 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + - "\t开源中国手机客户端:\n" + - "\tAndroid\n" + - "\tiPhone\n" + - "\tWP7\n" + - "
    \n" + - "
    \n" + - "
    \n" + - "\n" + - "\n" + - ""; - - @Test - public void test(){ - String text = "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " jsoup 解析页面商品信息 - - ITeye技术网站\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
    \n" + - "
    \n" + - " 首页\n" + - " 资讯\n" + - " 精华\n" + - " 论坛\n" + - " 问答\n" + - " 博客\n" + - " 专栏\n" + - " 群组\n" + - " 更多 \n" + - "
    \n" + - " 招聘\n" + - " 搜索\n" + - "
    \n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - " 欢迎flashsword20\n" + - " 0\n" + - " \n" + - " \"Newpm\"收件箱(3)\n" + - " \n" + - " 我的应用\n" + - "
    \n" + - " 我的关注\n" + - " 我的群组\n" + - " 我的简历\n" + - " 我的相册\n" + - " 我的收藏\n" + - " 我的代码\n" + - " 我的微博\n" + - "
    \n" + - " 我的博客\n" + - " 设置\n" + - "
    \n" + - "
    \n" + - " \n" + - " \n" + - "
    \n" + - "
    \n" + - " \n" + - " \n" + - "
    \n" + - "
    \n" + - "
    \n" + - "
    \n" + - "
    \n" + - " \n" + - "
    \n" + - "
    \n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "
    \n" + - "
    \n" + - "

    \n" + - " jsoup 解析页面商品信息\n" + - " \n" + - "

    \n" + - " \n" + - "
     
    \n" + - "
    \n" + - "\n" + - "
    \n" + - "

    今天用了jsoup 解析页面商品信息,感觉比用xpath获取信息准确多了

    \n" + - "

    \n" + - "

    下面就记录一下:

    \n" + - "

    一、首先去 http://jsoup.org/download 下载jsoup的jar包。

    \n" + - "

    \n" + - "

    二、下面记录下相关代码:

    \n" + - "

    \n" + - "

    \n" + - "

    Document doc = Jsoup.connect(url).get(); //将htm转换成Document类型数据结构

    \n" + - "


    doc.select(\"div:has(div) div#spec-n1:has(img) img\").first().attr(\"src\")); //查找div下含有div的标签

    \n" + - "

    \n" + - "

    并且 div的id='spec-n1',此div第一个img标签,img里属性是src的值。

    \n" + - "

    \n" + - "

    doc.select(\"div:has(div) div.crumb:has(a) a:eq(4)\").text(); //查找class='crumb'的div下第4个a标签

    \n" + - "

    下的值。

    \n" + - "

    \n" + - "

    doc.select(\"div:has(div) div#name:has(h1)\").text(); //查找id='name'的div下的h1标签的值。

    \n" + - "

    \n" + - "

    doc.select(\"tbody:has(tr) td.tdTitle:contains(品牌) + td\").text(); //查找class='tdTitle'的td标签里

    \n" + - "

    \n" + - "

    含有‘品牌’td的下一个td标签中内容。

    \n" + - "

    \n" + - "

    doc.select(\"script[type=text/javascript]:not([src~=[a-zA-Z0-9./\\\\s]+)\"); //查找含有此<script

    \n" + - "

    \n" + - "

    type=\"text/javascript\">……</script>内容,不含有script标签中有src属性的script,如:

    \n" + - "

    \n" + - "

    <script src=\"url\" type=\"text/javascript\"></script>。

    \n" + - "
    \n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - "
    \n" + - " \n" + - "
    分享到:\n" + - " \n" + - " \n" + - "
    \n" + - "
    \n" + - "\n" + - " \n" + - "
    \n" + - " \n" + - "
    \n" + - "\n" + - "
    \n" + - "
    评论
    \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
    \n" + - "\n" + - "
    \n" + - "
    发表评论
    \n" + - "
    \n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "
    \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "

    (快捷键 Alt+S / Ctrl+Enter)

    \n" + - "
    \n" + - " \n" + - "
    \n" + - "
    \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
    \n" + - "\n" + - "
    \n" + - "
    \n" + - "
    \n" + - "
    \"masong1987的博客\"
    \n" + - "
    masong1987
    \n" + - "
    \n" + - "\n" + - "
    \n" + - "
      \n" + - "
    • 浏览: 5401 次
    • \n" + - "
    • 性别: \"Icon_minigender_1\"
    • \n" + - "
    • 来自: 北京
    • \n" + - "
    • \n" + - " \n" + - "
    • \n" + - " 发短消息\n" + - " \n" + - " 更多访客>>\n" + - " \n" + - "
      \n" + - "
      \"flashsword20的博客\"
      \n" + - " \n" + - "
      \n" + - " \n" + - "
      \n" + - "
      \"dylinshi126的博客\"
      \n" + - " \n" + - "
      \n" + - " \n" + - "
      \n" + - "
      \"machoo的博客\"
      \n" + - " \n" + - "
      \n" + - " \n" + - "
      \n" + - "
      \"arson的博客\"
      \n" + - " \n" + - "
      \n" + - " \n" + - "
    \n" + - "\n" + - " \n" + - "\n" + - "
    \n" + - "
    文章分类
    \n" + - " \n" + - "
    \n" + - "
    \n" + - "
    社区版块
    \n" + - " \n" + - "
    \n" + - "
    \n" + - "
    存档分类
    \n" + - " \n" + - "
    \n" + - " \n" + - " \n" + - "\n" + - "
    \n" + - "
    最新评论
    \n" + - " \n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - "
    \n" + - "
    \n" + - "\n" + - "
    \n" + - "
    \n" + - "
    \n" + - " 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。若作者同意转载,必须以超链接形式标明文章原始出处和作者。
    \n" + - " © 2003-2012 ITeye.com. All rights reserved. [ 京ICP证110151号 京公网安备110105010620 ]\n" + - "
    \n" + - "
    \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n"; - String text2="
    aaa
    "; - XpathSelector xpathSelector = new XpathSelector("//div[@id='main']/div[@class='blog_main']/div[1][@class='blog_title']/h3/a"); - String select = xpathSelector.select(text); - Assert.assertEquals("jsoup 解析页面商品信息",select); - } - - @Test - public void testOschina(){ - Html html1 = new Html(html); - Assert.assertEquals("再次吐槽easyui",html1.x(".//*[@class='QTitle']/h1/a").toString()); - } - - @Test - public void testOschinaBlog(){ - Html html1 = new Html(blogHtml); - System.out.println(html1.sc()); - } - - @Test - public void testHuxiuBlog(){ - Html html1 = new Html(huxiuHtml); - System.out.println(html1.sc()); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index d42400586..7ac7aa06a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午2:22 */ @@ -12,23 +12,20 @@ public class UrlUtilsTest { @Test public void testFixRelativeUrl() { - String fixrelativeurl = UrlUtils.fixRelativeUrl("aa", "http://www.dianping.com/sh/ss/com"); + String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com"); System.out.println("fix: " + fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("..aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); -// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com"); -// System.out.println("fix: " + fixrelativeurl); -// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com/"); -// System.out.println("fix: " + fixrelativeurl); + fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); } @Test @@ -628,7 +625,6 @@ public void testFixRelativeHtml(){ "\t\t\t\n" + "\n"; String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/"); - String text = "订阅虎嗅"; Assert.assertTrue(html.contains(" - 
+ diff --git a/webmagic-extension/README.md b/webmagic-extension/README.md new file mode 100644 index 000000000..71d3c48bc --- /dev/null +++ b/webmagic-extension/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 \ No newline at end of file diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml new file mode 100644 index 000000000..63034f235 --- /dev/null +++ b/webmagic-extension/pom.xml @@ -0,0 +1,36 @@ + + + + us.codecraft + webmagic + 0.2.0 + + 4.0.0 + + webmagic-extension + + + + com.alibaba + fastjson + 1.1.35 + + + redis.clients + jedis + 2.0.0 + + + us.codecraft + webmagic-core + ${project.version} + + + junit + junit + + + + \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java new file mode 100644 index 000000000..7d46cc213 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic; + +import java.util.Collection; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-4
    + * Time: 下午5:18
    + */ +public interface PagedModel { + + public String getPageKey(); + + public Collection getOtherPages(); + + public String getPage(); + + public PagedModel combine(PagedModel pagedModel); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java new file mode 100644 index 000000000..cca5b2065 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -0,0 +1,97 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +import java.io.*; + +/** + * 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。
    + * @author code4crafer@gmail.com + * Date: 13-6-24 + * Time: 上午7:24 + */ +public class FileDownloader implements Downloader { + + private String path = "/data/temp/webmagic/"; + + private Downloader downloaderWhenFileMiss; + + private Logger logger = Logger.getLogger(getClass()); + + public FileDownloader() { + this("/data/temp/webmagic/", null); + } + + public FileDownloader(String path) { + this(path, null); + } + + public FileDownloader(String path, Downloader downloaderWhenFileMiss) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } + this.path = path; + this.downloaderWhenFileMiss = downloaderWhenFileMiss; + } + + @Override + public Page download(Request request, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + Page page = null; + try { + final File file = new File(path + DigestUtils.md5Hex(request.getUrl())); + BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); + String line = null; + line = bufferedReader.readLine(); + if (line.equals("url:\t" + request.getUrl())) { + final String html = getHtml(bufferedReader); + page = new Page(); + page.setRequest(request); + page.setUrl(PlainText.create(request.getUrl())); + page.setHtml(Html.create(html)); + } + } catch (IOException e) { + if (e instanceof FileNotFoundException) { + logger.info("File not exist for url " + request.getUrl()); + } else { + logger.warn("File read error for url " + request.getUrl(), e); + } + } + if (page == null) { + page = downloadWhenMiss(request, task); + } + return page; + } + + @Override + public void setThread(int thread) { + + } + + private String getHtml(BufferedReader bufferedReader) throws IOException { + String line; + StringBuilder htmlBuilder= new StringBuilder(); + line = bufferedReader.readLine(); + line = StringUtils.removeStart(line, "html:\t"); + htmlBuilder.append(line); + while ((line=bufferedReader.readLine())!=null){ + htmlBuilder.append(line); + } + return htmlBuilder.toString(); + } + + private Page 
downloadWhenMiss(Request request, Task task) { + Page page = null; + if (downloaderWhenFileMiss != null) { + page = downloaderWhenFileMiss.download(request, task); + } + return page; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java new file mode 100644 index 000000000..3927d116e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Page; + +/** + * 实现这个接口即可在抽取后进行后处理。
    + * + * @author code4crafter@gmail.com
    + * Date: 13-8-3
    + * Time: 上午9:42
    + */ +public interface AfterExtractor { + + public void afterProcess(Page page); +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java new file mode 100644 index 000000000..c841f10d4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-3
    + * Time: 下午3:41
    + */ +public class ConsolePageModelPipeline implements PageModelPipeline { + @Override + public void process(Object o, Task task) { + System.out.println(ToStringBuilder.reflectionToString(o)); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java new file mode 100644 index 000000000..04940766c --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.selector.Selector; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-1
    + * Time: 下午9:48
    + */ +class Extractor { + + protected Selector selector; + + protected final Source source; + + protected final boolean notNull; + + protected final boolean multi; + + static enum Source {Html, Url, RawHtml} + + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { + this.selector = selector; + this.source = source; + this.notNull = notNull; + this.multi = multi; + } + + Selector getSelector() { + return selector; + } + + Source getSource() { + return source; + } + + boolean isNotNull() { + return notNull; + } + + boolean isMulti() { + return multi; + } + + void setSelector(Selector selector) { + this.selector = selector; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java new file mode 100644 index 000000000..4ec1bbce0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.selector.Selector; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-1
    + * Time: 下午9:48
    + */ +class FieldExtractor extends Extractor{ + + private final Field field; + + private Method setterMethod; + + public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) { + super(selector, source, notNull,multi); + this.field = field; + } + + Field getField() { + return field; + } + + Selector getSelector() { + return selector; + } + + Source getSource() { + return source; + } + + void setSetterMethod(Method setterMethod) { + this.setterMethod = setterMethod; + } + + Method getSetterMethod() { + return setterMethod; + } + + boolean isNotNull() { + return notNull; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java new file mode 100644 index 000000000..af762ece0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -0,0 +1,89 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selector; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * 基于PageProcessor的扩展点。
    + * @author code4crafter@gmail.com
    + * Date: 13-8-1
    + * Time: 下午8:46
    + */ +class ModelPageProcessor implements PageProcessor { + + private List pageModelExtractorList = new ArrayList(); + + private Site site; + + private Set targetUrlPatterns = new HashSet(); + + public static ModelPageProcessor create(Site site, Class... clazzs) { + ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); + for (Class clazz : clazzs) { + modelPageProcessor.addPageModel(clazz); + } + return modelPageProcessor; + } + + + public ModelPageProcessor addPageModel(Class clazz) { + PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); + targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); + targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); + pageModelExtractorList.add(pageModelExtractor); + return this; + } + + private ModelPageProcessor(Site site) { + this.site = site; + } + + @Override + public void process(Page page) { + for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { + extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); + extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); + Object process = pageModelExtractor.process(page); + if (process == null || (process instanceof List && ((List) process).size() == 0)) { + page.getResultItems().setSkip(true); + } + postProcessPageModel(pageModelExtractor.getClazz(), process); + page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); + } + } + + private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { + List links; + if (urlRegionSelector == null) { + links = page.getHtml().links().all(); + } else { + links = urlRegionSelector.selectList(page.getHtml().toString()); + } + for (String link : links) { + for (Pattern targetUrlPattern : urlPatterns) { + Matcher matcher = targetUrlPattern.matcher(link); + if (matcher.find()) { + page.addTargetRequest(new 
Request(matcher.group(1))); + } + } + } + } + + protected void postProcessPageModel(Class clazz, Object object) { + } + + @Override + public Site getSite() { + return site; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java new file mode 100644 index 000000000..07d6c5a27 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.pipeline.Pipeline; + +import java.lang.annotation.Annotation; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * 基于Pipeline的扩展点,用于实现注解格式的Pipeline。
    + * 与PageModelPipeline是一对多的关系(原谅作者没有更好的名字了)。
    + * @author code4crafter@gmail.com
    + * Date: 13-8-2
    + * Time: 上午10:47
    + */ +class ModelPipeline implements Pipeline { + + private Map pageModelPipelines = new ConcurrentHashMap(); + + public ModelPipeline() { + } + + public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipeline) { + pageModelPipelines.put(clazz, pageModelPipeline); + return this; + } + + @Override + public void process(ResultItems resultItems, Task task) { + for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { + Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); + if (o != null) { + Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class); + if (annotation == null || !((ExtractBy) annotation).multi()) { + classPageModelPipelineEntry.getValue().process(o, task); + } else { + List list = (List) o; + for (Object o1 : list) { + classPageModelPipelineEntry.getValue().process(o1, task); + } + } + } + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java new file mode 100644 index 000000000..e5a41e1d0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -0,0 +1,56 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; + +/** + * 基于Model的Spider,封装后的入口类。
    + * @author code4crafter@gmail.com
    + * Date: 13-8-3
    + * Time: 上午9:51
    + */ +public class OOSpider extends Spider { + + private ModelPageProcessor modelPageProcessor; + + private ModelPipeline modelPipeline; + + protected OOSpider(ModelPageProcessor modelPageProcessor) { + super(modelPageProcessor); + this.modelPageProcessor = modelPageProcessor; + } + + /** + * 创建一个爬虫。
    + * @param site + * @param pageModelPipeline + * @param pageModels + */ + public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + this(ModelPageProcessor.create(site, pageModels)); + this.modelPipeline = new ModelPipeline(); + super.pipeline(modelPipeline); + if (pageModelPipeline!=null){ + for (Class pageModel : pageModels) { + this.modelPipeline.put(pageModel, pageModelPipeline); + } + } + } + + public static OOSpider create(Site site, Class... pageModels) { + return new OOSpider(site, null, pageModels); + } + + public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(site, pageModelPipeline, pageModels); + } + + public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { + for (Class pageModel : pageModels) { + modelPageProcessor.addPageModel(pageModel); + modelPipeline.put(pageModel, pageModelPipeline); + } + return this; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java new file mode 100644 index 000000000..2f9004b5e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -0,0 +1,355 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.annotation.*; +import us.codecraft.webmagic.selector.*; + +import java.lang.annotation.Annotation; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。
    + * + * @author code4crafter@gmail.com
    + * Date: 13-8-1
    + * Time: 下午9:33
    + */
+class PageModelExtractor {
+
+    /** Compiled {@link TargetUrl} patterns; {@link #process(Page)} ignores pages whose URL matches none of them. */
+    private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
+
+    /** XPath region declared by {@code TargetUrl.sourceRegion()}, or {@code null} when it is left empty. */
+    private Selector targetUrlRegionSelector;
+
+    /** Compiled {@link HelpUrl} patterns. */
+    private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
+
+    /** XPath region declared by {@code HelpUrl.sourceRegion()}, or {@code null} when it is left empty. */
+    private Selector helpUrlRegionSelector;
+
+    private Class clazz;
+
+    private List<FieldExtractor> fieldExtractors;
+
+    /** Class-level rule taken from a type-level {@link ExtractBy}; {@code null} when the class has none. */
+    private Extractor extractor;
+
+    /**
+     * Builds an extractor for the given annotated model class.
+     *
+     * @param clazz the model class to inspect
+     * @return the initialised extractor
+     * @throws IllegalStateException if a field carries conflicting annotations or has the wrong type
+     */
+    public static PageModelExtractor create(Class clazz) {
+        PageModelExtractor pageModelExtractor = new PageModelExtractor();
+        pageModelExtractor.init(clazz);
+        return pageModelExtractor;
+    }
+
+    /** Reads the class-level annotations, then builds one {@link FieldExtractor} per annotated field. */
+    private void init(Class clazz) {
+        this.clazz = clazz;
+        initClassExtractors();
+        fieldExtractors = new ArrayList<FieldExtractor>();
+        for (Field field : clazz.getDeclaredFields()) {
+            field.setAccessible(true);
+            FieldExtractor fieldExtractor = exclusiveOf(getAnnotationExtractBy(clazz, field),
+                    getAnnotationExtractByRaw(clazz, field));
+            // ExtractBy2 & ExtractBy3 only refine a rule that came from ExtractBy / ExtractByRaw
+            if (fieldExtractor != null) {
+                addAnnotationExtractBy2(fieldExtractor);
+                addAnnotationExtractBy3(fieldExtractor);
+            }
+            fieldExtractor = exclusiveOf(fieldExtractor, getAnnotationExtractByUrl(clazz, field));
+            if (fieldExtractor != null) {
+                if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
+                    throw new IllegalStateException("Field " + field.getName() + " must be string");
+                } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
+                    throw new IllegalStateException("Field " + field.getName() + " must be list");
+                }
+                fieldExtractors.add(fieldExtractor);
+            }
+        }
+    }
+
+    /** Returns whichever argument is non-null; rejects the case where both annotations are present. */
+    private static FieldExtractor exclusiveOf(FieldExtractor first, FieldExtractor second) {
+        if (first != null && second != null) {
+            throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
+        }
+        return first != null ? first : second;
+    }
+
+    /**
+     * Maps the name of one of the annotation {@code Type} enums to a selector.
+     * The four annotations each declare their own (identical) enum, hence the lookup by name.
+     * Anything other than {@code Css} or {@code Regex} falls back to XPath, as before.
+     */
+    private static Selector createSelector(String typeName, String value) {
+        if ("Css".equals(typeName)) {
+            return new CssSelector(value);
+        }
+        if ("Regex".equals(typeName)) {
+            return new RegexSelector(value);
+        }
+        return new XpathSelector(value);
+    }
+
+    /** Registers the field's setter on the extractor when the class declares one. */
+    private static void attachSetter(FieldExtractor fieldExtractor, Class clazz, Field field) {
+        Method setterMethod = getSetterMethod(clazz, field);
+        if (setterMethod != null) {
+            fieldExtractor.setSetterMethod(setterMethod);
+        }
+    }
+
+    /** Builds the extractor for {@link ExtractByUrl}; a blank rule matches the whole URL ({@code .*}). */
+    private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
+        ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
+        if (extractByUrl == null) {
+            return null;
+        }
+        String regexPattern = extractByUrl.value();
+        if (regexPattern.trim().equals("")) {
+            regexPattern = ".*";
+        }
+        FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern),
+                FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
+        attachSetter(fieldExtractor, clazz, field);
+        return fieldExtractor;
+    }
+
+    /** Builds the extractor for a field-level {@link ExtractBy}, or returns {@code null} if absent. */
+    private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
+        ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
+        if (extractBy == null) {
+            return null;
+        }
+        Selector selector = createSelector(extractBy.type().name(), extractBy.value());
+        FieldExtractor fieldExtractor = new FieldExtractor(field, selector,
+                FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
+        attachSetter(fieldExtractor, clazz, field);
+        return fieldExtractor;
+    }
+
+    /** Chains the {@link ExtractBy2} rule, if present, onto the extractor's current selector. */
+    private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
+        ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
+        if (extractBy != null) {
+            Selector selector = createSelector(extractBy.type().name(), extractBy.value());
+            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
+        }
+    }
+
+    /** Chains the {@link ExtractBy3} rule, if present, onto the extractor's current selector. */
+    private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
+        ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
+        if (extractBy != null) {
+            Selector selector = createSelector(extractBy.type().name(), extractBy.value());
+            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
+        }
+    }
+
+    /** Builds the extractor for {@link ExtractByRaw} (source: the whole page), or {@code null} if absent. */
+    private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
+        ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
+        if (extractByRaw == null) {
+            return null;
+        }
+        Selector selector = createSelector(extractByRaw.type().name(), extractByRaw.value());
+        FieldExtractor fieldExtractor = new FieldExtractor(field, selector,
+                FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
+        attachSetter(fieldExtractor, clazz, field);
+        return fieldExtractor;
+    }
+
+    /**
+     * Looks up the declared {@code setXxx(fieldType)} method for a field and makes it accessible.
+     *
+     * @return the setter, or {@code null} when the class declares none
+     */
+    public static Method getSetterMethod(Class clazz, Field field) {
+        String name = "set" + StringUtils.capitalize(field.getName());
+        try {
+            Method declaredMethod = clazz.getDeclaredMethod(name, field.getType());
+            declaredMethod.setAccessible(true);
+            return declaredMethod;
+        } catch (NoSuchMethodException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Converts the simplified URL syntax of {@link TargetUrl} / {@link HelpUrl} into a regex:
+     * "." becomes a literal dot and "*" becomes a run of characters other than quotes and '#'.
+     */
+    private static Pattern compileUrlPattern(String s) {
+        return Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")");
+    }
+
+    /** Reads {@link TargetUrl}, {@link HelpUrl} and the type-level {@link ExtractBy} from the class. */
+    private void initClassExtractors() {
+        Annotation annotation = clazz.getAnnotation(TargetUrl.class);
+        if (annotation == null) {
+            // no TargetUrl: every page is a target
+            targetUrlPatterns.add(Pattern.compile(".*"));
+        } else {
+            TargetUrl targetUrl = (TargetUrl) annotation;
+            for (String s : targetUrl.value()) {
+                targetUrlPatterns.add(compileUrlPattern(s));
+            }
+            if (!targetUrl.sourceRegion().equals("")) {
+                targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
+            }
+        }
+        annotation = clazz.getAnnotation(HelpUrl.class);
+        if (annotation != null) {
+            HelpUrl helpUrl = (HelpUrl) annotation;
+            for (String s : helpUrl.value()) {
+                helpUrlPatterns.add(compileUrlPattern(s));
+            }
+            if (!helpUrl.sourceRegion().equals("")) {
+                helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
+            }
+        }
+        annotation = clazz.getAnnotation(ExtractBy.class);
+        if (annotation != null) {
+            ExtractBy extractBy = (ExtractBy) annotation;
+            // honour type() here as well; it used to be ignored and XPath was always assumed
+            extractor = new Extractor(createSelector(extractBy.type().name(), extractBy.value()),
+                    Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
+        }
+    }
+
+    /**
+     * Extracts the model from a page.
+     *
+     * @param page the downloaded page
+     * @return {@code null} when the page URL matches no target pattern; otherwise a single model
+     *         object (possibly {@code null}), or a list of model objects when the class-level rule
+     *         is declared with {@code multi = true}
+     */
+    public Object process(Page page) {
+        boolean matched = false;
+        for (Pattern targetPattern : targetUrlPatterns) {
+            if (targetPattern.matcher(page.getUrl().toString()).matches()) {
+                matched = true;
+                break;
+            }
+        }
+        if (!matched) {
+            return null;
+        }
+        if (extractor == null) {
+            return processSingle(page, page.getHtml().toString());
+        }
+        if (extractor.multi) {
+            List<Object> os = new ArrayList<Object>();
+            List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
+            if (list != null) {
+                for (String s : list) {
+                    Object o = processSingle(page, s);
+                    if (o != null) {
+                        os.add(o);
+                    }
+                }
+            }
+            return os;
+        }
+        String select = extractor.getSelector().select(page.getHtml().toString());
+        return processSingle(page, select);
+    }
+
+    /** Picks the text a field rule runs against: the whole page, the page URL, or the current region. */
+    private static String sourceText(FieldExtractor fieldExtractor, Page page, String html) {
+        switch (fieldExtractor.getSource()) {
+            case RawHtml:
+                return page.getHtml().toString();
+            case Url:
+                return page.getUrl().toString();
+            case Html:
+            default:
+                return html;
+        }
+    }
+
+    /**
+     * Instantiates the model and fills every annotated field from {@code html}.
+     *
+     * @return the populated object, or {@code null} when a {@code notNull} field yields nothing
+     */
+    private Object processSingle(Page page, String html) {
+        Object o = null;
+        try {
+            o = clazz.newInstance();
+            for (FieldExtractor fieldExtractor : fieldExtractors) {
+                String source = sourceText(fieldExtractor, page, html);
+                if (fieldExtractor.isMulti()) {
+                    List<String> value = fieldExtractor.getSelector().selectList(source);
+                    if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
+                        return null;
+                    }
+                    setField(o, fieldExtractor, value);
+                } else {
+                    String value = fieldExtractor.getSelector().select(source);
+                    if (value == null && fieldExtractor.isNotNull()) {
+                        return null;
+                    }
+                    setField(o, fieldExtractor, value);
+                }
+            }
+            if (AfterExtractor.class.isAssignableFrom(clazz)) {
+                ((AfterExtractor) o).afterProcess(page);
+            }
+        } catch (InstantiationException e) {
+            e.printStackTrace();
+        } catch (IllegalAccessException e) {
+            e.printStackTrace();
+        } catch (InvocationTargetException e) {
+            e.printStackTrace();
+        }
+        return o;
+    }
+
+    /**
+     * Stores a value through the setter when one exists, otherwise directly into the field.
+     * Writing the field after calling the setter (as was done before) silently discarded
+     * anything the setter computed.
+     */
+    private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
+        if (fieldExtractor.getSetterMethod() != null) {
+            fieldExtractor.getSetterMethod().invoke(o, value);
+        } else {
+            fieldExtractor.getField().set(o, value);
+        }
+    }
+
+    Class getClazz() {
+        return clazz;
+    }
+
+    List<Pattern> getTargetUrlPatterns() {
+        return targetUrlPatterns;
+    }
+
+    List<Pattern> getHelpUrlPatterns() {
+        return helpUrlPatterns;
+    }
+
+    Selector getTargetUrlRegionSelector() {
+        return targetUrlRegionSelector;
+    }
+
+    Selector getHelpUrlRegionSelector() {
+        return helpUrlRegionSelector;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java
new file mode 100644
index 000000000..a70137f80
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java
@@ -0,0 +1,14 @@
+package us.codecraft.webmagic.model;
+
+import us.codecraft.webmagic.Task;
+
+/**
+ * @author code4crafter@gmail.com
    + * Date: 13-8-3
+ * Time: 9:34 AM
+ */
+public interface PageModelPipeline<T> {
+
+    /**
+     * Handles one extracted page model.
+     *
+     * @param t    the extracted model object
+     * @param task the task the object was extracted for
+     */
+    public void process(T t, Task task);
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
new file mode 100644
index 000000000..8c12ce1fe
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
@@ -0,0 +1,50 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Defines the extraction rule of a class or a field.
    + *
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD, ElementType.TYPE})
+public @interface ExtractBy {
+
+    /**
+     * The extraction rule.
+     *
+     * @return the extraction rule
+     */
+    String value();
+
+    public enum Type {XPath, Regex, Css}
+
+    /**
+     * The kind of rule: XPath, CSS selector or regular expression. Defaults to XPath.
+     *
+     * @return the rule type
+     */
+    Type type() default Type.XPath;
+
+    /**
+     * Whether this is a key field that must not be empty. If true and nothing can be extracted for the field, the whole object is discarded. Defaults to false.
+     *
+     * @return whether the field is a mandatory key field
+     */
+    boolean notNull() default false;
+
+    /**
+     * Whether multiple results are extracted.
+     * On a field, the field must be a List to hold the results.
+     * On a class, it means several objects are extracted from a single page.
+     *
+     * @return whether multiple results are extracted
+     */
+    boolean multi() default false;
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
new file mode 100644
index 000000000..2a4f0802c
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
@@ -0,0 +1,24 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Defines a further extraction rule for a field; only usable after ExtractBy or ExtractByRaw.
    + *
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD})
+public @interface ExtractBy2 {
+
+    /**
+     * The extraction rule; PageModelExtractor combines it with the field's existing selector through AndSelector.
+     *
+     * @return the extraction rule
+     */
+    String value();
+
+    public enum Type {XPath, Regex, Css}
+
+    /**
+     * The kind of rule: XPath, CSS selector or regular expression. Defaults to XPath.
+     *
+     * @return the rule type
+     */
+    Type type() default Type.XPath;
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
new file mode 100644
index 000000000..741682d42
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
@@ -0,0 +1,23 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Defines a further extraction rule for a field; only usable after ExtractBy or ExtractByRaw.
    + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD})
+public @interface ExtractBy3 {
+
+    /**
+     * The extraction rule; PageModelExtractor applies it after ExtractBy2, combining it with the field's selector through AndSelector.
+     *
+     * @return the extraction rule
+     */
+    String value();
+
+    public enum Type { XPath, Regex, Css}
+
+    /**
+     * The kind of rule: XPath, CSS selector or regular expression. Defaults to XPath.
+     *
+     * @return the rule type
+     */
+    Type type() default Type.XPath;
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
new file mode 100644
index 000000000..a3ae3e5c5
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
@@ -0,0 +1,49 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * For classes that already use ExtractBy at class level: put this on a field to extract from the whole page content instead.
    + *
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD, ElementType.TYPE})
+public @interface ExtractByRaw {
+
+    /**
+     * The extraction rule.
+     *
+     * @return the extraction rule
+     */
+    String value();
+
+    public enum Type {XPath, Regex, Css}
+
+    /**
+     * The kind of rule: XPath, CSS selector or regular expression. Defaults to XPath.
+     *
+     * @return the rule type
+     */
+    Type type() default Type.XPath;
+
+    /**
+     * Whether this is a key field that must not be empty. If true and nothing can be extracted for the field, the whole object is discarded. Defaults to false.
+     *
+     * @return whether the field is a mandatory key field
+     */
+    boolean notNull() default false;
+
+    /**
+     * Whether multiple results are extracted.
+     * The field must then be a List to hold the results.
+     *
+     * @return whether multiple results are extracted
+     */
+    boolean multi() default false;
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java
new file mode 100644
index 000000000..51b5f0dff
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java
@@ -0,0 +1,40 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Defines the extraction rule of a field that is filled from the page URL (regular expressions only).
    + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD})
+public @interface ExtractByUrl{
+
+    /**
+     * The extraction rule, a regular expression. PageModelExtractor treats a blank value as ".*".
+     *
+     * @return the extraction rule
+     */
+    String value() default "";
+
+    /**
+     * Whether this is a key field that must not be empty. If true and nothing can be extracted for the field, the whole object is discarded. Defaults to false.
+     *
+     * @return whether the field is a mandatory key field
+     */
+    boolean notNull() default false;
+
+    /**
+     * Whether multiple results are extracted.
+     * On a field, the field must be a List to hold the results.
+     * On a class, it means several objects are extracted from a single page (NOTE: this annotation is declared for fields only).
+     *
+     * @return whether multiple results are extracted
+     */
+    boolean multi() default false;
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java
new file mode 100644
index 000000000..9a0cce4f6
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java
@@ -0,0 +1,30 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Defines auxiliary URLs that help the crawl.
    + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.TYPE})
+public @interface HelpUrl {
+
+    /**
+     * The list of URL rules associated with the class.
+     * webmagic adapts the regular-expression syntax: "." stands only for a literal "." rather than any character, and "\*" stands for ".\*"; for example "http://\*.oschina.net/\*" covers the URLs of every second-level domain of oschina.
+     *
+     * @return the URL rules
+     */
+    String[] value();
+
+    /**
+     * The region of the page from which URLs are collected (XPath only).
+     * @return the region from which URLs are collected
+     */
+    String sourceRegion() default "";
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java
new file mode 100644
index 000000000..e12fca396
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java
@@ -0,0 +1,32 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Defines the scope and source a class is extracted from; sourceRegion can restrict the region with XPath syntax.
    + *
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.TYPE})
+public @interface TargetUrl {
+
+    /**
+     * The list of URL rules associated with the class.
+     * webmagic adapts the regular-expression syntax: "." stands only for a literal "." rather than any character, and "\*" stands for ".\*"; for example "http://\*.oschina.net/\*" covers the URLs of every second-level domain of oschina.
+     *
+     * @return the URL rules
+     */
+    String[] value();
+
+    /**
+     * The region of the page from which URLs are collected (XPath only).
+     * @return the region from which URLs are collected
+     */
+    String sourceRegion() default "";
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html
new file mode 100644
index 000000000..1e3004fbe
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html
@@ -0,0 +1,5 @@
+
+
+webmagic注解抓取方式所定义的注解。
+
+
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html
new file mode 100644
index 000000000..d62cc0027
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html
@@ -0,0 +1,5 @@
+
+
+webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。
+
+
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
new file mode 100644
index 000000000..53dba9e4e
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
@@ -0,0 +1,61 @@
+package us.codecraft.webmagic.pipeline;
+
+import com.alibaba.fastjson.JSON;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.log4j.Logger;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+/**
+ * Pipeline that persists results to files in JSON format.
+ *
+ * @author code4crafter@gmail.com
    + * Date: 13-4-21
+ * Time: 6:28 PM
+ */
+public class JsonFilePipeline implements Pipeline {
+
+    /** Base directory; always ends with a path separator. */
+    private String path = "/data/webmagic/";
+
+    private Logger logger = Logger.getLogger(getClass());
+
+    /**
+     * Creates a pipeline that uses the default directory "/data/webmagic/".
+     */
+    public JsonFilePipeline() {
+
+    }
+
+    /**
+     * Creates a pipeline that writes below the given directory.
+     *
+     * @param path directory the files are saved in; a trailing separator is appended when missing
+     */
+    public JsonFilePipeline(String path) {
+        if (!path.endsWith("/") && !path.endsWith("\\")) {
+            path += "/";
+        }
+        this.path = path;
+    }
+
+    /**
+     * Writes all result items of one page as JSON to
+     * {@code <path>/<task uuid>/<md5 of the request url>.json}.
+     * I/O failures are logged and otherwise ignored.
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        // this.path already ends with a separator, so none is inserted here
+        String dirPath = this.path + task.getUUID() + "/";
+        File dir = new File(dirPath);
+        // mkdirs() also returns false when another thread created the directory first
+        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
+            logger.warn("can not create directory " + dirPath);
+            return;
+        }
+        String fileName = dirPath + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json";
+        PrintWriter printWriter = null;
+        try {
+            // JSON is written as UTF-8 regardless of the platform default charset
+            printWriter = new PrintWriter(new java.io.OutputStreamWriter(new java.io.FileOutputStream(fileName), "UTF-8"));
+            printWriter.write(JSON.toJSONString(resultItems.getAll()));
+            // PrintWriter swallows IOExceptions; checkError() flushes and reports them
+            if (printWriter.checkError()) {
+                logger.warn("write file error " + fileName);
+            }
+        } catch (IOException e) {
+            logger.warn("write file error", e);
+        } finally {
+            if (printWriter != null) {
+                printWriter.close();
+            }
+        }
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
new file mode 100644
index 000000000..beda66734
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
@@ -0,0 +1,84 @@
+package us.codecraft.webmagic.pipeline;
+
+import us.codecraft.webmagic.PagedModel;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.utils.DoubleKeyMap;
+
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Pipeline that merges the parts of a paged model.
    + * Do not use this together with the Redis-based distributed crawler.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 13-8-4
+ * Time: 5:15 PM
+ */
+public class PagedPipeline implements Pipeline {
+
+    /** pageKey -> page -> whether that page has been processed yet. */
+    private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);
+
+    /** pageKey -> page -> the model extracted from that page. */
+    private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);
+
+    /**
+     * Replaces every {@link PagedModel} result by the combination of all its pages once the last
+     * page has arrived, and removes it from the results while pages are still missing.
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        Map<String, Object> resultItemsAll = resultItems.getAll();
+        Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
+        while (iterator.hasNext()) {
+            handleObject(iterator);
+        }
+    }
+
+    /** Consumes the iterator's next entry; entries that are not a {@link PagedModel} are left untouched. */
+    private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
+        Map.Entry<String, Object> objectEntry = iterator.next();
+        Object o = objectEntry.getValue();
+        if (o instanceof PagedModel) {
+            PagedModel pagedModel = (PagedModel) o;
+            pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
+            if (pagedModel.getOtherPages() != null) {
+                for (String otherPage : pagedModel.getOtherPages()) {
+                    Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
+                    if (aBoolean == null) {
+                        pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
+                    }
+                }
+            }
+            //check if all pages are processed
+            Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey());
+            objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
+            if (booleanMap == null) {
+                return;
+            }
+            for (Map.Entry<String, Boolean> stringBooleanEntry : booleanMap.entrySet()) {
+                if (!stringBooleanEntry.getValue()) {
+                    // a page is still outstanding: hold this part back
+                    iterator.remove();
+                    return;
+                }
+            }
+            List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
+            entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
+            if (entryList.size() != 0) {
+                // numeric page keys sort numerically, anything else lexicographically
+                Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() {
+                    @Override
+                    public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
+                        try {
+                            int i1 = Integer.parseInt(o1.getKey());
+                            int i2 = Integer.parseInt(o2.getKey());
+                            // explicit comparison: i1 - i2 overflows for operands of opposite sign
+                            return i1 < i2 ? -1 : (i1 == i2 ? 0 : 1);
+                        } catch (NumberFormatException e) {
+                            return o1.getKey().compareTo(o2.getKey());
+                        }
+                    }
+                });
+                PagedModel value = entryList.get(0).getValue();
+                for (int i = 1; i < entryList.size(); i++) {
+                    value = value.combine(entryList.get(i).getValue());
+                }
+                objectEntry.setValue(value);
+            }
+        }
+    }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java similarity index 69% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 94002ed74..a8dc23a54 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,9 +1,9 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; import java.io.*; import java.util.LinkedHashSet; @@ -16,11 +16,12 @@ import java.util.concurrent.atomic.AtomicInteger; /** - * User: cairne + * 磁盘文件实现的url管理模块,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
    + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午1:13 */ -public class FileCacheQueueSchedular implements Schedular { +public class FileCacheQueueScheduler implements Scheduler { private Logger logger = Logger.getLogger(getClass()); @@ -28,7 +29,7 @@ public class FileCacheQueueSchedular implements Schedular { private String fileUrlAllName = ".urls.txt"; - private Site site; + private Task task; private String fileCursor = ".cursor.txt"; @@ -44,13 +45,11 @@ public class FileCacheQueueSchedular implements Schedular { private Set urls; - public FileCacheQueueSchedular(Site site) { - this.site = site; - } - - public FileCacheQueueSchedular(Site site, String filePath) { + public FileCacheQueueScheduler(String filePath) { + if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){ + filePath+="/"; + } this.filePath = filePath; - this.site = site; } private void flush() { @@ -58,16 +57,17 @@ private void flush() { fileCursorWriter.flush(); } - private void init() { + private void init(Task task) { + this.task = task; File file = new File(filePath); - if (!file.exists()){ + if (!file.exists()) { file.mkdirs(); } readFile(); initWriter(); initFlushThread(); inited.set(true); - logger.info("init cache schedular success"); + logger.info("init cache scheduler success"); } private void initFlushThread() { @@ -81,10 +81,10 @@ public void run() { private void initWriter() { try { - fileUrlWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileUrlAllName, true)); - fileCursorWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileCursor, false)); + fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true)); + fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false)); } catch (IOException e) { - throw new RuntimeException("init cache schedular error", e); + throw new RuntimeException("init cache scheduler error", e); } } @@ -95,35 +95,40 @@ private void readFile() { readCursorFile(); readUrlFile(); } catch 
(IOException e) { + logger.error("init file error",e); } } private void readUrlFile() throws IOException { String line; - BufferedReader fileUrlReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileUrlAllName)); + BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName))); int lineReaded = 0; while ((line = fileUrlReader.readLine()) != null) { urls.add(line.trim()); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(new Request(line, site)); + queue.add(new Request(line)); } } } private void readCursorFile() throws IOException { - BufferedReader fileCursorReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileCursor)); - String line = null; + BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); + String line; //read the last number while ((line = fileCursorReader.readLine()) != null) { cursor = new AtomicInteger(NumberUtils.toInt(line)); } } + private String getFileName(String filename) { + return filePath + task.getUUID() + filename; + } + @Override - public synchronized void push(Request request,Site site) { + public synchronized void push(Request request, Task task) { if (!inited.get()) { - init(); + init(task); } if (logger.isDebugEnabled()) { logger.debug("push to queue " + request.getUrl()); @@ -136,9 +141,9 @@ public synchronized void push(Request request,Site site) { } @Override - public synchronized Request poll(Site site) { + public synchronized Request poll(Task task) { if (!inited.get()) { - init(); + init(task); } fileCursorWriter.println(cursor.incrementAndGet()); return queue.poll(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java new file mode 100644 index 000000000..e7c5bcd42 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ 
-0,0 +1,65 @@ +package us.codecraft.webmagic.scheduler; + +import com.alibaba.fastjson.JSON; +import org.apache.commons.codec.digest.DigestUtils; +import redis.clients.jedis.Jedis; +import redis.clients.jedis.JedisPool; +import redis.clients.jedis.JedisPoolConfig; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; + +/** + * 使用redis管理url,构建一个分布式的爬虫。
    + * + * @author code4crafter@gmail.com
    + * Date: 13-7-25
    + * Time: 上午7:07
    + */
+public class RedisScheduler implements Scheduler {
+
+    /** Charset for everything stored as bytes, so that all crawler nodes agree on it. */
+    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset.forName("UTF-8");
+
+    private JedisPool pool;
+
+    private static final String QUEUE_PREFIX = "queue_";
+
+    private static final String SET_PREFIX = "set_";
+
+    private static final String ITEM_PREFIX = "item_";
+
+    /**
+     * @param host host name of the Redis server
+     */
+    public RedisScheduler(String host) {
+        pool = new JedisPool(new JedisPoolConfig(), host);
+    }
+
+    /**
+     * Queues a request unless its URL was already seen for this task.
+     * Requests that carry extras are additionally stored as JSON under a key derived from the URL.
+     */
+    @Override
+    public synchronized void push(Request request, Task task) {
+        Jedis jedis = pool.getResource();
+        try {
+            // a sorted set is used to de-duplicate URLs
+            if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
+                // a list holds the queue
+                jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
+                jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
+                if (request.getExtras() != null) {
+                    String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
+                    byte[] bytes = JSON.toJSONString(request).getBytes(UTF_8);
+                    jedis.set(key.getBytes(UTF_8), bytes);
+                }
+            }
+        } finally {
+            // hand the connection back even when a Redis call throws
+            pool.returnResource(jedis);
+        }
+    }
+
+    /**
+     * Takes the next request of the task from the queue.
+     *
+     * @return the stored request (with extras) or a plain request for the URL;
+     *         {@code null} when the queue is empty
+     */
+    @Override
+    public synchronized Request poll(Task task) {
+        Jedis jedis = pool.getResource();
+        try {
+            String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
+            if (url == null) {
+                return null;
+            }
+            String key = ITEM_PREFIX + DigestUtils.shaHex(url);
+            byte[] bytes = jedis.get(key.getBytes(UTF_8));
+            if (bytes != null) {
+                return JSON.parseObject(new String(bytes, UTF_8), Request.class);
+            }
+            return new Request(url);
+        } finally {
+            // previously two of the three return paths never returned the connection,
+            // which drained the pool after a handful of polls
+            pool.returnResource(jedis);
+        }
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
new file mode 100755
index 000000000..b284a1574
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
@@ -0,0 +1,111 @@
+package us.codecraft.webmagic.utils;
+
+import java.util.Map;
+
+/**
+ * A map addressed by two keys, stored as a map of sub-maps.
+ *
+ * @author code4crafter@gmail.com
+ * Date Dec 14, 2012
+ */
+public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
+    private Map<K1, Map<K2, V>> map;
+
+    public DoubleKeyMap() {
+        init();
+    }
+
+    /**
+     * Wraps an existing first-level map; sub-maps created later are {@code HashMap}s.
+     */
+    public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
+        this(map, DEFAULT_CLAZZ);
+    }
+
+    /**
+     * @param protoMapClass map class instantiated for the first level and for every sub-map
+     */
+    @SuppressWarnings("rawtypes")
+    public DoubleKeyMap(Class<? extends Map> protoMapClass) {
+        super(protoMapClass);
+        init();
+    }
+
+    /** Creates the first-level map when none was supplied. */
+    private void init() {
+        if (map == null) {
+            map = this.<K1, Map<K2, V>>newMap();
+        }
+    }
+
+    /**
+     * Wraps an existing first-level map and creates sub-maps from protoMapClass.
+     *
+     * @param map           the first-level map to wrap; a new one is created when {@code null}
+     * @param protoMapClass map class instantiated for new sub-maps
+     */
+    @SuppressWarnings("rawtypes")
+    public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
+        super(protoMapClass);
+        this.map = map;
+        init();
+    }
+
+    /**
+     * @param key the first-level key
+     * @return the sub-map stored under the key, or {@code null}
+     */
+    public Map<K2, V> get(K1 key) {
+        return map.get(key);
+    }
+
+    /**
+     * @param key1 the first-level key
+     * @param key2 the second-level key
+     * @return the value, or {@code null} when either level has no entry
+     */
+    public V get(K1 key1, K2 key2) {
+        if (get(key1) == null) {
+            return null;
+        }
+        return get(key1).get(key2);
+    }
+
+
+    /**
+     * Stores a whole sub-map under a first-level key, replacing any existing one.
+     *
+     * @param key1   the first-level key
+     * @param submap the sub-map to store
+     * @return always {@code null}; the signature is kept for compatibility
+     */
+    public V put(K1 key1, Map<K2, V> submap) {
+        // this method used to call itself and ended in a StackOverflowError
+        map.put(key1, submap);
+        return null;
+    }
+
+    /**
+     * @param key1  the first-level key
+     * @param key2  the second-level key
+     * @param value the value to store
+     * @return the value previously stored under both keys, or {@code null}
+     */
+    public V put(K1 key1, K2 key2, V value) {
+        if (map.get(key1) == null) {
+            map.put(key1, this.<K2, V>newMap());
+        }
+        return get(key1).put(key2, value);
+    }
+
+    /**
+     * @param key1 the first-level key
+     * @param key2 the second-level key
+     * @return the removed value, or {@code null} when there was none
+     */
+    public V remove(K1 key1, K2 key2) {
+        if (get(key1) == null) {
+            return null;
+        }
+        V remove = get(key1).remove(key2);
+        // drop the first-level entry too once its sub-map is empty
+        if (get(key1).size() == 0) {
+            remove(key1);
+        }
+        return remove;
+    }
+
+    /**
+     * @param key1 the first-level key
+     * @return the removed sub-map, or {@code null} when there was none
+     */
+    public Map<K2, V> remove(K1 key1) {
+        Map<K2, V> remove = map.remove(key1);
+        return remove;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
new file mode 100755
index 000000000..89fdc9ab0
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
@@ -0,0 +1,42 @@
+package us.codecraft.webmagic.utils;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date Dec 14, 2012
+ */
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Base class of the multi-key maps; creates the underlying maps from a prototype class.
+ *
+ * @author yihua.huang
+ */
+public abstract class MultiKeyMapBase {
+
+    @SuppressWarnings("rawtypes")
+    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;
+    @SuppressWarnings("rawtypes")
+    private Class<? extends Map> protoMapClass = DEFAULT_CLAZZ;
+
+    public MultiKeyMapBase() {
+    }
+
+    /**
+     * @param protoMapClass map class to instantiate; needs an accessible no-argument constructor
+     */
+    @SuppressWarnings("rawtypes")
+    public MultiKeyMapBase(Class<? extends Map> protoMapClass) {
+        this.protoMapClass = protoMapClass;
+    }
+
+    /**
+     * Creates an empty map of the prototype class.
+     *
+     * @throws IllegalArgumentException if the prototype class cannot be instantiated
+     */
+    @SuppressWarnings("unchecked")
+    protected <K, V2> Map<K, V2> newMap() {
+        try {
+            return (Map<K, V2>) protoMapClass.newInstance();
+        } catch (InstantiationException e) {
+            throw new IllegalArgumentException("wrong proto type map "
+                    + protoMapClass, e);
+        } catch (IllegalAccessException e) {
+            throw new IllegalArgumentException("wrong proto type map "
+                    + protoMapClass, e);
+        }
+    }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
new file mode 100644
index 000000000..0819e437d
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
@@ -0,0 +1,45 @@
+package us.codecraft.webmagic.scheduler;
+
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Task;
+
+/**
+ * @author code4crafter@gmail.com
    + * Date: 13-7-25
    + * Time: 上午7:51
    + */ +public class RedisSchedulerTest { + + private RedisScheduler redisScheduler; + + @Before + public void setUp() { + redisScheduler = new RedisScheduler("localhost"); + } + + @Ignore("environment depended") + @Test + public void test() { + Task task = new Task() { + @Override + public String getUUID() { + return "1"; + } + + @Override + public Site getSite() { + return null; + } + }; + Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"); + request.putExtra("1","2"); + redisScheduler.push(request, task); + Request poll = redisScheduler.poll(task); + System.out.println(poll); + + } +} diff --git a/webmagic-lucene/README.md b/webmagic-lucene/README.md new file mode 100644 index 000000000..77050ab08 --- /dev/null +++ b/webmagic-lucene/README.md @@ -0,0 +1,3 @@ +webmagic-lucene +-------- +尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。 \ No newline at end of file diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml new file mode 100644 index 000000000..d7b4665c0 --- /dev/null +++ b/webmagic-lucene/pom.xml @@ -0,0 +1,37 @@ + + + + webmagic + us.codecraft + 0.2.0 + + 4.0.0 + + webmagic-lucene + + + + org.apache.lucene + lucene-analyzers-common + 4.4.0 + + + org.apache.lucene + lucene-queryparser + 4.4.0 + + + us.codecraft + webmagic-extension + ${project.version} + + + junit + junit + + + + + \ No newline at end of file diff --git a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java new file mode 100644 index 000000000..6fe270210 --- /dev/null +++ b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -0,0 +1,92 @@ +package us.codecraft.webmagic.pipeline; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import 
org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-5
    + * Time: 下午2:11
    + */ +public class LucenePipeline implements Pipeline { + + private Directory directory; + + private Analyzer analyzer; + + private IndexWriterConfig config; + + private void init() throws IOException { + analyzer = new StandardAnalyzer(Version.LUCENE_44); + directory = new RAMDirectory(); + config = new IndexWriterConfig(Version.LUCENE_44, analyzer); + } + + public LucenePipeline() { + try { + init(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public List search(String fieldName, String value) throws IOException, ParseException { + List documents = new ArrayList(); + DirectoryReader ireader = DirectoryReader.open(directory); + IndexSearcher isearcher = new IndexSearcher(ireader); + // Parse a simple query that searches for "text": + QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer); + Query query = parser.parse(value); + ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + // Iterate through the results: + for (int i = 0; i < hits.length; i++) { + Document hitDoc = isearcher.doc(hits[i].doc); + documents.add(hitDoc); + } + ireader.close(); + return documents; + } + + @Override + public void process(ResultItems resultItems, Task task) { + if (resultItems.isSkip()){ + return; + } + Document doc = new Document(); + Map all = resultItems.getAll(); + if (all==null){ + return; + } + for (Map.Entry objectEntry : all.entrySet()) { + doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED)); + } + try { + IndexWriter indexWriter = new IndexWriter(directory, config); + indexWriter.addDocument(doc); + indexWriter.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java new file mode 100644 index 000000000..b35037024 --- /dev/null +++ 
b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.lucene; + +import org.apache.lucene.document.Document; +import org.apache.lucene.queryparser.classic.ParseException; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.LucenePipeline; + +import java.io.IOException; +import java.util.List; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-2
    + * Time: 上午7:52
    + */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "OschinaBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + LucenePipeline pipeline = new LucenePipeline(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync(); + while (true) { + try { + List search = pipeline.search("title", "webmagic"); + System.out.println(search); + Thread.sleep(3000); + } catch (IOException e) { + e.printStackTrace(); + } catch (ParseException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } +} diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml deleted file mode 100644 index 1128f7ac1..000000000 --- a/webmagic-plugin/pom.xml +++ /dev/null @@ -1,72 +0,0 @@ - - - us.codecraft - 0.0.1-SNAPSHOT - 4.0.0 - - webmagic-plugin - - - - us.codecraft - webmagic-core - 0.0.1-SNAPSHOT - - - junit - junit - 4.7 - test - - - org.freemarker - freemarker - 2.3.19 - - - - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - - \ No newline at end of file diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java deleted file mode 100644 index 8487064ec..000000000 --- 
a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ /dev/null @@ -1,57 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import freemarker.template.Configuration; -import freemarker.template.Template; -import freemarker.template.TemplateException; -import org.apache.commons.codec.digest.DigestUtils; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.utils.UrlUtils; - -import java.io.*; - -/** - * User: cairne - * Date: 13-6-8 - * Time: 下午9:00 - */ -public class FreemarkerPipeline implements Pipeline { - - private Configuration configuration; - - private Template template; - - private String path = "/data/temp/webmagic/ftl/"; - - public FreemarkerPipeline(String template, String path) throws IOException { - configuration = new Configuration(); - configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile())); - this.template = configuration.getTemplate(template); - this.path = path; - File file = new File(path); - } - - public FreemarkerPipeline(String template) throws IOException { - this(template, "/data/temp/webmagic/ftl/"); - } - - - @Override - public void process(Page page, Site site) { - String domain = site.getDomain(); - domain = UrlUtils.getDomain(domain); - String path = this.path + "" + domain + "/"; - File file = new File(path); - if (!file.exists()) { - file.mkdirs(); - } - try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); - template.process(page.getFields(), printWriter); - printWriter.close(); - } catch (TemplateException e) { - } catch (IOException e) { - e.printStackTrace(); - } - } -} diff --git a/webmagic-plugin/src/main/resources/ftl/wordpress.ftl b/webmagic-plugin/src/main/resources/ftl/wordpress.ftl deleted file mode 100644 index 61820b727..000000000 --- a/webmagic-plugin/src/main/resources/ftl/wordpress.ftl +++ 
/dev/null @@ -1,23 +0,0 @@ - - $it.Title - http://127.0.0.1/wordpress/?p=$it.Id - ${date} - admin - http://127.0.0.1/wordpress/?p=$it.Id - - - - <#--$it.Id--> - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - $tags - \ No newline at end of file diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java deleted file mode 100644 index d52154f13..000000000 --- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java +++ /dev/null @@ -1,19 +0,0 @@ -package us.codecraft.webmagic; - -import org.junit.Test; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; - -import java.io.IOException; - -/** - * User: cairne - * Date: 13-6-9 - * Time: 上午7:14 - */ -public class FreemarkerPipelineTest { - - @Test - public void test() throws IOException { - FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl"); - } -} diff --git a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl b/webmagic-plugin/src/test/resources/ftl/wordpress.ftl deleted file mode 100644 index 61820b727..000000000 --- a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,23 +0,0 @@ - - $it.Title - http://127.0.0.1/wordpress/?p=$it.Id - ${date} - admin - http://127.0.0.1/wordpress/?p=$it.Id - - - - <#--$it.Id--> - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - $tags - \ No newline at end of file diff --git a/webmagic-samples/README.md b/webmagic-samples/README.md new file mode 100644 index 000000000..7cdad186f --- /dev/null +++ b/webmagic-samples/README.md @@ -0,0 +1,3 @@ +webmagic-samples +------- +webmagic的一些示例。包括抓取常见博客、信息类网站等。 \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index bfa1bfad2..9d00d2f0c 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -2,9 +2,11 @@ - - us.codecraft - 0.0.1-SNAPSHOT + 
+ us.codecraft + webmagic + 0.2.0 + 4.0.0 webmagic-samples @@ -13,18 +15,16 @@ us.codecraft webmagic-core - 0.0.1-SNAPSHOT + ${project.version} us.codecraft - webmagic-plugin - 0.0.1-SNAPSHOT + webmagic-extension + ${project.version} junit junit - 4.7 - test @@ -32,42 +32,19 @@ org.apache.maven.plugins - maven-resources-plugin + maven-jar-plugin + 2.4 - UTF-8 + + + true + ./lib/ + us.codecraft.webmagic.main.QuickStarter + + - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java new file mode 100644 index 000000000..52be27210 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -0,0 +1,70 @@ +package us.codecraft.webmagic.main; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.samples.IteyeBlog; +import us.codecraft.webmagic.model.samples.News163; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.pipeline.ConsolePipeline; +import us.codecraft.webmagic.pipeline.PagedPipeline; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Scanner; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-7
    + * Time: 下午9:24
    + */ +public class QuickStarter { + + private static Map clazzMap; + + private static Map urlMap; + + private static void init(){ + clazzMap = new LinkedHashMap(); + clazzMap.put("1", OschinaBlog.class); + clazzMap.put("2", IteyeBlog.class); + clazzMap.put("3", News163.class); + urlMap = new LinkedHashMap(); + urlMap.put("1", "http://my.oschina.net/flashsword/blog"); + urlMap.put("2", "http://flashsword20.iteye.com/"); + urlMap.put("3", "http://news.163.com/"); + } + + public static void main(String[] args) { + init(); + String key = null; + key = readKey(key); + System.out.println("The demo started and will last 20 seconds..."); + //Start spider + OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync(); + + try { + Thread.sleep(20000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + System.out.println("The demo stopped!"); + System.out.println("To more usage, try to customize your own Spider!"); + System.exit(0); + } + + private static String readKey(String key) { + Scanner stdin = new Scanner(System.in); + System.out.println("Choose a Spider demo:"); + for (Map.Entry classEntry : clazzMap.entrySet()) { + System.out.println(classEntry.getKey()+"\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey())); + } + while (key == null) { + key = new String(stdin.nextLine()); + if (clazzMap.get(key) == null) { + System.out.println("Invalid choice!"); + key = null; + } + } + return key; + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java new file mode 100644 index 000000000..509aaf9f0 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.model.samples; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-2
    + * Time: 上午8:10
    + */ +public interface Blog { + + public String getTitle(); + + public String getContent(); +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java new file mode 100644 index 000000000..ae9452526 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-2
    + * Time: 上午7:52
    + */ +@TargetUrl("http://*.iteye.com/blog/*") +public class IteyeBlog implements Blog{ + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "IteyeBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run(); + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java new file mode 100644 index 000000000..8c0e32dce --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.PagedModel; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.*; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractBy2; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.ConsolePipeline; +import us.codecraft.webmagic.pipeline.PagedPipeline; +import us.codecraft.webmagic.scheduler.RedisScheduler; + +import java.util.Collection; +import java.util.List; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-4
    + * Time: 下午8:17
    + */ +@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") +public class News163 implements PagedModel { + + @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html") + private String pageKey; + + @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) + private String page; + + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false) + @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex) + private List otherPage; + + @ExtractBy("//h1[@id=\"h1title\"]/text()") + private String title; + + @ExtractBy("//div[@id=\"epContentLeft\"]") + private String content; + + @Override + public String getPageKey() { + return pageKey; + } + + @Override + public Collection getOtherPages() { + return otherPage; + } + + @Override + public String getPage() { + if (page == null) { + return "1"; + } + return page; + } + + @Override + public PagedModel combine(PagedModel pagedModel) { + News163 news163 = new News163(); + news163.title = this.title; + News163 pagedModel1 = (News163) pagedModel; + news163.content = this.content + pagedModel1.content; + return news163; + } + + @Override + public String toString() { + return "News163{" + + "content='" + content + '\'' + + ", title='" + title + '\'' + + ", otherPage=" + otherPage + + '}'; + } + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) + .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java new file mode 100644 index 000000000..e878633b6 --- /dev/null +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.*; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-3
    + * Time: 下午8:25
    + */ +@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") +@HelpUrl("http://www.oschina.net/question/*") +@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true) +public class OschinaAnswer implements AfterExtractor{ + + @ExtractBy("//img/@title") + private String user; + + @ExtractBy("//div[@class='detail']") + private String content; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run(); + } + + @Override + public void afterProcess(Page page) { + + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java new file mode 100644 index 000000000..c1e3ea340 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-8-2
    + * Time: 上午7:52
    + */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") + ,new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index 53b10520b..a1189e45d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午8:08 */ @@ -17,27 +17,27 @@ public class DiandianBlogProcessor implements PageProcessor { @Override public void process(Page page) { - //a()表示提取链接,as()表示提取所有链接 + //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 - //r()表示用正则表达式提取一条内容,rs()表示提取多条内容 - //toString()表示取单条结果,toStrings()表示取多条 - List requests = page.getHtml().as().rs("(.*/post/.*)").toStrings(); + //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 + //toString()表示取单条结果,all()表示取多条 + List requests = page.getHtml().links().regex("(.*/post/.*)").all(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 - page.putField("title", page.getHtml().x("//title").r("(.*?)\\|")); - //sc()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 - page.putField("content", page.getHtml().sc()); - page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/")); - page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)")); + page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString()); + //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 + page.putField("content", page.getHtml().smartContent()); + page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); + page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)")); } @Override public Site getSite() { //site定义抽取配置,以及开始url等 if (site == null) { - site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/"). + site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java deleted file mode 100644 index dd601adb3..000000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class DianpingBlogProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")){ - page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().sc()); - } - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 03389f5bb..115f18342 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -8,7 +8,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午8:08 */ @@ -18,23 +18,23 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); + List requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all(); page.addTargetRequests(requests); - requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); + requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all(); page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ - page.putField("title", page.getHtml().x("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().x("//div[@class='pcb']//tbody")); - page.putField("date",page.getHtml().r("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); - page.putField("id",new PlainText("1000"+page.getUrl().r("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); + page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); + page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody")); + page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); + page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } } @Override public Site getSite() { if (site==null){ - site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); + site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); } return site; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 78211c4c0..4ffe127b4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午1:48 */ @@ -15,14 +15,14 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all(); page.addTargetRequests(strings); - page.putField("title",page.getHtml().r("(.*)")); - page.putField("body",page.getHtml().x("//dd[@class='w133']")); + page.putField("title",page.getHtml().regex("(.*)")); + page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); } @Override public Site getSite() { - return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 82552f956..89b74d63f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午8:08 */ @@ -15,15 +15,15 @@ public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("content",page.getHtml().smartContent()); } @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/"). + return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java new file mode 100644 index 000000000..b43c3c569 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.samples; + +import org.apache.commons.collections.CollectionUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.RedisScheduler; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class InfoQMiniBookProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all()); + List all = page.getHtml().links().regex(".*\\.pdf").all(); + if (CollectionUtils.isNotEmpty(all)) { + page.putField("pdf", all); + } else { + page.getResultItems().setSkip(true); + } + } + + @Override + public Site getSite() { + if (site == null) { + site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new InfoQMiniBookProcessor()) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new FilePipeline("/data/temp/webmagic/")) + .thread(5) + .run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java new file mode 100644 index 000000000..c0b3f7316 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
    + * Date: 13-7-26
    + * Time: 上午7:31
    + */ +public class IteyeBlogProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all()); + page.putField("title",page.getHtml().xpath("//title").toString()); + page.putField("content",page.getHtml().smartContent().toString()); + } + + @Override + public Site getSite() { + if (site == null) { + site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). + setSleepTime(100).setRetryTimes(3); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 58a2cb81c..aff18a6d3 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.processor.PageProcessor; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-5-20 * Time: 下午5:31 */ @@ -13,15 +13,15 @@ public class KaichibaProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; - page.addTargetRequests("http://kaichiba.com/shop/"+i); - page.putField("title",page.getHtml().x("//Title")); - page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", "")); + int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; + page.addTargetRequest("http://kaichiba.com/shop/" + i); + page.putField("title",page.getHtml().xpath("//Title")); + page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", "")); } @Override public Site getSite() { - return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). + return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 637aec172..a4e6e43b1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-5-20 * Time: 下午5:31 */ @@ -15,19 +15,19 @@ public class MeicanProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); + List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); if (requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings()); - page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); + page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); + page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); } @Override public Site getSite() { - return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). + return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index ca46de671..2337da598 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -7,22 +7,22 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午8:08 */ public class NjuBBSProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().rs("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); + List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().smartContent()); } @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). + return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 2166d9b1b..8ba7063b0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -2,29 +2,37 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午1:48 */ public class OschinaBlogPageProcesser implements PageProcessor { + private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + @Override public void process(Page page) { - List strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().sc()); - page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @Override public Site getSite() { - return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index cdfbc1e0e..522eb2c6a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午1:48 */ @@ -15,15 +15,15 @@ public class OschinaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all(); page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a")); - page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); + page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); + page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); } @Override public Site getSite() { - return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/"). + return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 67ef671e7..49418b605 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午8:08 */ @@ -18,15 +18,15 @@ public void process(Page page) { //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone - List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().smartContent()); } @Override public Site getSite() { - return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). + return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index 79065265f..b4c5bc885 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -1,11 +1,12 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午1:48 */ @@ -15,20 +16,24 @@ public class SinaBlogProcesser implements PageProcessor { @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().as().rs("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); - page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2")); - page.putField("content",page.getHtml().x("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id",page.getUrl().r("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); - page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)")); -// page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a")); + page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all()); + page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); + page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); + page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); + page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); +// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a")); } @Override public Site getSite() { if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000). + site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; } + + public static void main(String[] args) { + Spider.create(new SinaBlogProcesser()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index 7a8920bef..ecc55b424 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-21 * Time: 下午1:48 */ @@ -15,14 +15,14 @@ public class TianyaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all(); page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b")); - page.putField("body",page.getHtml().sc()); + page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); + page.putField("body",page.getHtml().smartContent()); } @Override public Site getSite() { - return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. 
} } diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml new file mode 100644 index 000000000..a6630f813 --- /dev/null +++ b/webmagic-samples/src/main/resources/log4j.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java deleted file mode 100644 index f79909840..000000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ /dev/null @@ -1,20 +0,0 @@ -package us.codecraft.webmagic; - -import org.junit.Assert; -import org.junit.Test; -import us.codecraft.webmagic.selector.Html; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 上午8:42 - */ -public class HtmlTest { - - @Test - public void testRegexSelector() { - Html selectable = new Html("aaaaaaab"); - Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); - - } -} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 838c76b8e..dbfa81548 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -5,36 +5,39 @@ import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** - * User: cairne + * @author code4crafter@gmail.com
    * Date: 13-4-20 * Time: 下午7:46 */ public class SpiderTest { + @Ignore @Test public void testSpider() throws InterruptedException { - Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); + Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline()); me.run(); } + @Ignore @Test public void testGlobalSpider(){ // PageProcessor pageProcessor = new MeicanProcessor(); -// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). +// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); - System.out.println(pageProcessor2.getSite().getEncoding()); + System.out.println(pageProcessor2.getSite().getCharset()); pageProcessor2.getSite().setSleepTime(500); - Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")). - processor(pageProcessor2).run(); + Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + run(); } + @Ignore @Test public void test(){ System.out.println(System.getProperty("java.io.tmpdir")); @@ -48,7 +51,7 @@ public void languageSchema() { /** * - * _hrefs = rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") + * _hrefs = regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") * title = r(""(.*)"") * body = x("//dd[@class='w133']") * @@ -69,7 +72,7 @@ public void languageSchema() { * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) * * body=body[r(_currentUrl).g(1)] - * tags[%] = (tags[%] + xs('')) . r('') + * tags[%] = (tags[%] + xpath('')) . 
r('') * * _targetUrls.add('' + x('').r('')) * _sourceUrls.add() @@ -111,7 +114,7 @@ public void languageSchema() { * content = t(_html) > c() * title = x(_html, 'asd@asd') > r('',1) * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') - * tags[%] = tags + xs('') > r('') + * tags[%] = tags + xpath('') > r('') * model.setTargetUrl(); * * _targetUrl = '' + x('') & r('') diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java deleted file mode 100644 index 18b0680d9..000000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.processor; - -import org.junit.Test; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.ConsolePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; -import us.codecraft.webmagic.samples.DiandianBlogProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; - -import java.io.IOException; - -/** - * User: cairne - * Date: 13-6-9 - * Time: 上午8:02 - */ -public class DiandianProcessorTest { - - @Test - public void test() throws IOException { - DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); - //pipeline是抓取结束后的处理 - //ftl文件放到classpath:ftl/文件夹下 - //默认放到/data/temp/webmagic/ftl/[domain]目录下 - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - //Spider.me()是简化写法,其实就是new一个啦 - //Spider.pipeline()设定一个pipeline,支持链式调用 - //ConsolePipeline输出结果到控制台 - //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 - //Spider.run()执行 - Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")). 
- processor(diaoyuwengProcessor).run(); - } -} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 1e77c7c76..0371eb23c 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,26 +1,28 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; /** - * User: cairne - * Date: 13-6-9 - * Time: 上午8:02 + * @author code4crafter@gmail.com
    + * Date: 13-6-9 + * Time: 上午8:02 */ public class DiaoyuwengProcessorTest { + @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")). - processor(diaoyuwengProcessor).run(); + JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); + Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 0a5cc1b03..026f8d5f7 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -1,34 +1,35 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.samples.SinaBlogProcesser; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; /** - * User: cairne - * Date: 13-6-9 - * Time: 上午8:02 + * @author code4crafter@gmail.com
    + * Date: 13-6-9 + * Time: 上午8:02 */ public class SinablogProcessorTest { + @Ignore @Test public void test() throws IOException { SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser(); //pipeline是抓取结束后的处理 - //ftl文件放到classpath:ftl/文件夹下 - //默认放到/data/temp/webmagic/ftl/[domain]目录下 - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); + //默认放到/data/webmagic/ftl/[domain]目录下 + JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); //Spider.me()是简化写法,其实就是new一个啦 //Spider.pipeline()设定一个pipeline,支持链式调用 //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(sinaBlogProcesser.getSite(), "/data/temp/webmagic/cache/")). - processor(sinaBlogProcesser).run(); + Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
+ run(); } } diff --git a/webmagic-saxon/README.md b/webmagic-saxon/README.md new file mode 100644 index 000000000..0471c68b6 --- /dev/null +++ b/webmagic-saxon/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。 \ No newline at end of file diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml new file mode 100644 index 000000000..a2db76851 --- /dev/null +++ b/webmagic-saxon/pom.xml @@ -0,0 +1,30 @@ + + + + us.codecraft + webmagic + 0.2.0 + + 4.0.0 + + webmagic-saxon + + + + us.codecraft + webmagic-core + ${project.version} + + + net.sf.saxon + Saxon-HE + + + junit + junit + + + + \ No newline at end of file diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java new file mode 100644 index 000000000..98b1efe4b --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -0,0 +1,178 @@ +package us.codecraft.webmagic.selector; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; +import org.apache.log4j.Logger; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import 
java.util.concurrent.ConcurrentHashMap; + +/** + * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
    + * + * @author code4crafter@gmail.com
    + * Date: 13-4-21 + * Time: 上午9:39 + */ +public class Xpath2Selector implements Selector { + + private String xpathStr; + + private XPathExpression xPathExpression; + + private Logger logger = Logger.getLogger(getClass()); + + public Xpath2Selector(String xpathStr) { + this.xpathStr = xpathStr; + try { + init(); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException("XPath error!", e); + } + } + + enum XPath2NamespaceContext implements NamespaceContext { + + INSTANCE; + + private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + + private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + + private void put(String prefix, String namespaceURI) { + prefix2NamespaceMap.put(prefix, namespaceURI); + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null) { + prefixes = new ArrayList(); + namespace2PrefixMap.put(namespaceURI, prefixes); + } + prefixes.add(prefix); + } + + private XPath2NamespaceContext() { + put("fn", NamespaceConstant.FN); + put("xslt", NamespaceConstant.XSLT); + } + + @Override + public String getNamespaceURI(String prefix) { + return prefix2NamespaceMap.get(prefix); + } + + @Override + public String getPrefix(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.get(0); + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.iterator(); + } + } + + private void init() throws XPathExpressionException { + XPathEvaluator xPathEvaluator = new XPathEvaluator(); + xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE); + xPathExpression = xPathEvaluator.compile(xpathStr); + } + + @Override + public String select(String text) { + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = 
htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + if (nodeList.getLength() == 0) { + return null; + } + Node item = nodeList.item(0); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + return item.getTextContent(); + } else { + StreamResult xmlOutput = new StreamResult(new StringWriter()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.transform(new DOMSource(item), xmlOutput); + return xmlOutput.getWriter().toString(); + } + } + return result.toString(); + } catch (Exception e) { + logger.error("select text error! 
" + xpathStr, e); + } + return null; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (int i = 0; i < nodeList.getLength(); i++) { + Node item = nodeList.item(i); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + results.add(item.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(item), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } + } else { + results.add(result.toString()); + } + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return results; + } +} diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java new file mode 100644 index 000000000..b62304069 --- /dev/null +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -0,0 +1,1395 @@ +package us.codecraft.webmagic.selector; + +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +/** + * @author code4crafter@gmail.com
    Date: 13-4-21 Time: 上午10:06 + */ +public class XpathSelectorTest { + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " 再次吐槽easyui - 开源中国 OSChina.NET\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "
    \n" + + "\t
    \n" + + "
    \n" + + " \t\n" + + "
    \n" + + "\t\t
    \n" + + " \t\t \t\t黄亿华,您好 \n" + + "\t\t\t\n" + + "\t\t\t\t我的空间\n" + + "\t\t\t\t\n" + + "\t\t\t | \n" + + "\t\t\t添加软件 | 投递新闻 | 退出\n" + + " \t\t\t\t
    \n" + + "\t\t
    \n" + + "\t
    \n" + + "
    \n" + + "
    \n" + + "

    讨论区

    \n" + + "
    \n" + + "\t
    当前位置:
    \n" + + "\t
    \n" + + "\t\t\t\t\t \t\t讨论区 »\n" + + " \t\t技术问答\t\t\t\t\t\t\t\t» EasyUI\n" + + "\t\t\t\t\t\t\t\t\t\t
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "\t
    \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "\t\n" + + "\t
    \n" + + "\t
    \n" + + "\t\t
    \"午后冬日\"
    \n" + + "\t\t
    \n" + + "\t\t\t

    再次吐槽easyui

    \n" + + "\t\t\t
    \n" + + "\t\t\t\t午后冬日\n" + + "\t\t\t\t发表于 2013-4-21 02:28 13小时前,\n" + + "\t\t\t\t3回/289阅,\n" + + "\t\t\t\t最后回答: 4小时前\t\t\t\t\t\t\t\t\t\t\t
    \n" + + "\t\t
    \n" + + "\t\t\n" + + "\t\t
    \n" + + "\t
    \n" + + "\t\t \t \t\t\t\t\t\n" + + "\t\t

    Java、PHP、Ruby、iOS、Python 等 JetBrains 开发工具低至 99 元(3折),详情»

    \n" + + "\t\t
    \n" + + "\t\t\t\t\t\t
    刚用到easyui treegrid组件,发现这货第一次加载时候并没有传默认参数,展开某一列时候才传递id:xx的参数。这样和后台总是疙里疙瘩,像没事就拌嘴的两口子,查网上都遇到相同问题,最好解决方案就是通过 \n" + + "onBeforeExpand事件来扩展,自行解决。看到官方例子中简洁的代码,感觉easyui耍流氓了,真搞不懂为何要这样实现
    \n" + + "\t\t\t\t\t\t
    \n" + + "\t\t\t\t标签:\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\tEasyUI \t\t\t\t\t\t\t\t\t\t\t
    \n" + + "\t\t\t\t\t\t
    \n" + + "\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t我想问同样的问题\n" + + "\t\t\t\t\t\t\n" + + "\t\t\t共0个人想要问同样的问题\n" + + "\t\t\t\t\t\t补充话题说明»\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
    \n" + + "\t
      \n" + + "
      \t\t
      \n" + + "\t\t\n" + + "\t\t
      \n" + + " \n" + + "\t\t\t\t
      \n" + + "\t\t\t
      分享到
      \n" + + "\t\t\t\n" + + "\t\t\t
      1
      \n" + + "\t\t\t\n" + + "\t\t\t
      \n" + + "\t\t\t\t\t\t\t\t \t\t\t\n" + + "\t\t\t\t\t\t\t\t0\n" + + "\t\t\t\t|\n" + + "\t\t\t\t\t\t\t\t \t\t\t\n" + + "\t\t\t\t\t\t\t\t0\n" + + "\t\t\t
      \n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t\t
      \n" + + "\t\t\t\t\t\t
      \n" + + "\t\t\t\n" + + " \t

      \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t按评价排序 |\n" + + "\t\t\t\t\t显示最新答案 | 回页面顶部\n" + + "\t\t\t\t\n" + + "\t\t\t\t共有3个答案 我要回答»\n" + + "\t\t\t

      \n" + + "\t\t\t \t
      • \n" + + "\t
        \"布谷鸟\"
        \n" + + "\t
        \n" + + "\t\t
        布谷鸟 回答于 2013-04-21 09:28
        \t\t\n" + + " \t
        \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
        \n" + + "\t\t
        \n" + + "\t\t
        对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        --- 共有 1 条评论 --- \n" + + "
          \n" + + "\t\t
        • \n" + + "\t\t\"午后冬日\"\n" + + "\t\t\n" + + "\t\t前端水平实在有限,自己搞的总是感觉不伦不类,只能用这些框架,再集成其它插件,切换主题时风格又不一致。\n" + + "\t\t(4小时前 by 午后冬日)\n" + + "\t\t回复\n" + + "\t\t\n" + + "\t\t
          \n" + + "\t
        • \n" + + "\t
        \n" + + "\n" + + "
        \n" + + "\t
        \t\t\t\t\t\t有帮助(1) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(1) |\n" + + " \t引用此答案\t
        \n" + + "
      • \n" + + "\t
        \"静风流云\"
        \n" + + "\t
        \n" + + "\t\t
        静风流云 回答于 2013-04-21 11:08
        \t\t\n" + + " \t
        \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
        \n" + + "\t\t
        \n" + + "\t\t

        没办法,原来项目也是因为客户特殊的需求,对layout选型的时候,犹豫了好久,最终放弃了。
        幸亏来了一个厉害的前端,解决问题,够用就好。

        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        --- 共有 1 条评论 --- \n" + + "
          \n" + + "\t\t
        • \n" + + "\t\t\"午后冬日\"\n" + + "\t\t\n" + + "\t\t我也是犹豫了好久,看过很多前端框架,总是不太满意。个人开发前台后台数据库全部要自己搞定,郁闷ing\n" + + "\t\t(4小时前 by 午后冬日)\n" + + "\t\t回复\n" + + "\t\t\n" + + "\t\t
          \n" + + "\t
        • \n" + + "\t
        \n" + + "\n" + + "
        \n" + + "\t
        \t\t\t\t\t\t有帮助(0) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(1) |\n" + + " \t引用此答案\t
        \n" + + "
      • \n" + + "\t
        \"布谷鸟\"
        \n" + + "\t
        \n" + + "\t\t
        布谷鸟 回答于 2013-04-21 11:29
        \t\t\n" + + " \t
        \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
        \n" + + "\t\t
        \n" + + "\t\t

        引用来自“布谷鸟”的答案

        对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
        前后端你一个人搞啊?那确实很麻烦。面面俱到的话,工作量很大。但是如果需要实现的功能不是很多,而时间也不紧迫的话,事情干起来也还不错。如非必须,建议逐步弃用这些前端框架,在一些比较能够提升体验的地方选用一些适当的插件即可,如此也不再需要担心风格的问题,你看osc后台截图,界面那叫一个丑,用得方便顺手就够了
        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        \t\t\t\t\t\t有帮助(0) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(0) |\n" + + " \t引用此答案\t
        \n" + + "
      \n" + + "\t\t\t\t
      \n" + + "\t\t
      \n" + + "\t\t\t
      \"黄亿华\"
      \n" + + "\t\t\t
      \n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t

      \n" + + "\t\t\t\t回答案顶部 | 回页面顶部\n" + + "\t\t\t
      \n" + + "\t\t\t
      \n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t
      \t\n" + + "\t\n" + + "\n" + + "\n" + + "\n" + + "\t
      \n" + + "\t
      \n" + + " \t\n" + + "\t
      \n" + + "\t\t
      \n" + + "\t\t\t有什么技术问题吗?\n" + + "\t\t\t我要提问\n" + + "\t\t\t
      \n" + + "\t\t
      \n" + + "\t\t\n" + + "\t\t\t\t\t\t
      \n" + + "\t\t\t全部(29)...午后冬日的其他问题\n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t\t\t\t
      \n" + + "\t\t\n" + + "\t\t
      \n" + + "\t\t\n" + + "\t\t
      \n" + + "\t\t\t类似的话题\n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t
      \n" + + "\t
      \n" + + "
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
      \n" + + "\t
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + "
      © 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + "
      \n" + "
      \n" + + "
      \n" + "\n" + "\n" + + ""; + + @Test + public void test() { + String text = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " jsoup 解析页面商品信息 - - ITeye技术网站\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + " 首页\n" + + " 资讯\n" + + " 精华\n" + + " 论坛\n" + + " 问答\n" + + " 博客\n" + + " 专栏\n" + + " 群组\n" + + " 更多 \n" + + "
      \n" + + " 招聘\n" + + " 搜索\n" + + "
      \n" + + "
      \n" + + "\n" + + "
      \n" + + " \n" + + " 欢迎flashsword20\n" + + " 0\n" + + " \n" + + " \"Newpm\"收件箱(3)\n" + + " \n" + + " 我的应用\n" + + "
      \n" + + " 我的关注\n" + + " 我的群组\n" + + " 我的简历\n" + + " 我的相册\n" + + " 我的收藏\n" + + " 我的代码\n" + + " 我的微博\n" + + "
      \n" + + " 我的博客\n" + + " 设置\n" + + "
      \n" + + "
      \n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + "
      \n" + + "
      \n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
      \n" + + "
      \n" + + "

      \n" + + " jsoup 解析页面商品信息\n" + + " \n" + + "

      \n" + + " \n" + + "
       
      \n" + + "
      \n" + + "\n" + + "
      \n" + + "

      今天用了jsoup 解析页面商品信息,感觉比用xpath获取信息准确多了

      \n" + + "

      \n" + + "

      下面就记录一下:

      \n" + + "

      一、首先去 http://jsoup.org/download 下载jsoup的jar包。

      \n" + + "

      \n" + + "

      二、下面记录下相关代码:

      \n" + + "

      \n" + + "

      \n" + + "

      Document doc = Jsoup.connect(url).get(); //将htm转换成Document类型数据结构

      \n" + + "


      doc.select(\"div:has(div) div#spec-n1:has(img) img\").first().attr(\"src\")); //查找div下含有div的标签

      \n" + + "

      \n" + + "

      并且 div的id='spec-n1',此div第一个img标签,img里属性是src的值。

      \n" + + "

      \n" + + "

      doc.select(\"div:has(div) div.crumb:has(a) a:eq(4)\").text(); //查找class='crumb'的div下第4个a标签

      \n" + + "

      下的值。

      \n" + + "

      \n" + + "

      doc.select(\"div:has(div) div#name:has(h1)\").text(); //查找id='name'的div下的h1标签的值。

      \n" + + "

      \n" + + "

      doc.select(\"tbody:has(tr) td.tdTitle:contains(品牌) + td\").text(); //查找class='tdTitle'的td标签里

      \n" + + "

      \n" + + "

      含有‘品牌’td的下一个td标签中内容。

      \n" + + "

      \n" + + "

      doc.select(\"script[type=text/javascript]:not([src~=[a-zA-Z0-9./\\\\s]+)\"); //查找含有此<script

      \n" + + "

      \n" + + "

      type=\"text/javascript\">……</script>内容,不含有script标签中有src属性的script,如:

      \n" + + "

      \n" + + "

      <script src=\"url\" type=\"text/javascript\"></script>。

      \n" + + "
      \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + "
      \n" + + " \n" + + "
      分享到:\n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + "\n" + + " \n" + + "
      \n" + + " \n" + + "
      \n" + + "\n" + + "
      \n" + + "
      评论
      \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
      \n" + + "\n" + + "
      \n" + + "
      发表评论
      \n" + + "
      \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "

      (快捷键 Alt+S / Ctrl+Enter)

      \n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
      \n" + + "\n" + + "
      \n" + + "
      \n" + + "
      \n" + + "
      \"masong1987的博客\"
      \n" + + "
      masong1987
      \n" + + "
      \n" + + "\n" + + "
      \n" + + "
        \n" + + "
      • 浏览: 5401 次
      • \n" + + "
      • 性别: \"Icon_minigender_1\"
      • \n" + + "
      • 来自: 北京
      • \n" + + "
      • \n" + + " \n" + + "
      • \n" + + " 发短消息\n" + + " \n" + + " 更多访客>>\n" + + " \n" + + "
        \n" + + "
        \"flashsword20的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
        \n" + + "
        \"dylinshi126的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
        \n" + + "
        \"machoo的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
        \n" + + "
        \"arson的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
      \n" + + "\n" + + " \n" + + "\n" + + "
      \n" + + "
      文章分类
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "
      社区版块
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "
      存档分类
      \n" + + " \n" + + "
      \n" + + " \n" + + " \n" + + "\n" + + "
      \n" + + "
      最新评论
      \n" + + " \n" + + "
      \n" + + "\n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "\n" + + "
      \n" + + "
      \n" + + "
      \n" + + " 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。若作者同意转载,必须以超链接形式标明文章原始出处和作者。
      \n" + + " © 2003-2012 ITeye.com. All rights reserved. [ 京ICP证110151号 京公网安备110105010620 ]\n" + + "
      \n" + + "
      \n" + + " \n" + + " \n" + "\n" + " \n" + " \n" + " \n" + "\n"; + String text2 = "
      aaa
      "; + XpathSelector xpathSelector = new XpathSelector( + "//div[@id='main']/div[@class='blog_main']/div[1][@class='blog_title']/h3/a"); + String select = xpathSelector.select(text); + Assert.assertEquals("jsoup 解析页面商品信息", select); + } + + @Test + public void testOschina() { + Html html1 = new Html(html); + Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); + Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); + } + + @Test + public void testXPath2() { + String text = "

      眉山:扎实推进农业农村工作 促农持续增收
      \n" + + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

      "; + XpathSelector xpathSelector = new XpathSelector("//h1/text()"); + System.out.println(xpathSelector.select(text)); + } + + @Test + public void testXpath2Selector() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); + String select = xpath2Selector.select(html); + Assert.assertNotNull(select); + } + + @Ignore("take long time") + @Test + public void performanceTest() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + long time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpathSelector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + } + +} diff --git a/webmagic-selenium/README.md b/webmagic-selenium/README.md new file mode 100644 index 000000000..c8583c3ab --- /dev/null +++ b/webmagic-selenium/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。 \ No newline at end of file diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml new file mode 100644 index 000000000..814b7b31b --- /dev/null +++ b/webmagic-selenium/pom.xml @@ -0,0 +1,31 @@ + + + + us.codecraft + webmagic + 0.2.0 + + 4.0.0 + + webmagic-selenium + + + + org.seleniumhq.selenium + selenium-java + 2.33.0 + + + us.codecraft + webmagic-core + ${project.version} + + + junit + junit + + + + \ No newline at end of file diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java new file mode 100644 index 000000000..0fa0eea5e 
--- /dev/null +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -0,0 +1,110 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.apache.log4j.Logger; +import org.openqa.selenium.By; +import org.openqa.selenium.Cookie; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.downloader.Destroyable; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.util.Map; + +/** + * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
      + * 需要下载Selenium driver支持。
      + * + * @author code4crafter@gmail.com
      + * Date: 13-7-26
      + * Time: 下午1:37
      + */ +public class SeleniumDownloader implements Downloader, Destroyable { + + private volatile WebDriverPool webDriverPool; + + private Logger logger = Logger.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + /** + * 新建 + * + * @param chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); + } + + /** + * set sleep time to wait until load success + * + * @param sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver; + try { + webDriver = webDriverPool.get(); + } catch (InterruptedException e) { + logger.warn("interrupted", e); + return null; + } + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + Page page = new Page(); + page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + webDriverPool.returnToPool(webDriver); + return page; + } + + private void checkInit() { + if (webDriverPool == null) { + synchronized (this){ + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void 
destroy() { + webDriverPool.closeAll(); + } +} diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java new file mode 100644 index 000000000..71ba290e1 --- /dev/null +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -0,0 +1,88 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.BlockingDeque; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author code4crafter@gmail.com
      + * Date: 13-7-26
      + * Time: 下午1:41
      + */ +class WebDriverPool { + + private final static int DEFAULT_CAPACITY = 5; + + private final int capacity; + + private final static int STAT_RUNNING = 1; + + private final static int STAT_CLODED = 2; + + private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); + + /** + * store webDrivers created + */ + private List webDriverList = Collections.synchronizedList(new ArrayList()); + + /** + * store webDrivers available + */ + private BlockingDeque innerQueue = new LinkedBlockingDeque(); + + public WebDriverPool(int capacity) { + this.capacity = capacity; + } + + public WebDriverPool() { + this(DEFAULT_CAPACITY); + } + + public WebDriver get() throws InterruptedException { + checkRunning(); + WebDriver poll = innerQueue.poll(); + if (poll != null) { + return poll; + } + if (webDriverList.size() < capacity) { + synchronized (webDriverList) { + if (webDriverList.size() < capacity) { + ChromeDriver e = new ChromeDriver(); + innerQueue.add(e); + webDriverList.add(e); + } + } + + } + return innerQueue.take(); + } + + public void returnToPool(WebDriver webDriver) { + checkRunning(); + innerQueue.add(webDriver); + } + + protected void checkRunning() { + if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { + throw new IllegalStateException("Already closed!"); + } + } + + public void closeAll() { + boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED); + if (!b) { + throw new IllegalStateException("Already closed!"); + } + for (WebDriver webDriver : webDriverList) { + webDriver.close(); + } + + } +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java new file mode 100644 index 000000000..b7bcd80b3 --- /dev/null +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Ignore; +import org.junit.Test; +import 
org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.remote.DesiredCapabilities; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +/** + * @author code4crafter@gmail.com
      + * Date: 13-7-26
      + * Time: 下午12:27
      + */ +public class SeleniumTest { + + @Ignore("need chrome driver") + @Test + public void testSelenium() { + System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); + Map contentSettings = new HashMap(); + contentSettings.put("images", 2); + + Map preferences = new HashMap(); + preferences.put("profile.default_content_settings", contentSettings); + + DesiredCapabilities caps = DesiredCapabilities.chrome(); + caps.setCapability("chrome.prefs", preferences); + caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); + WebDriver webDriver = new ChromeDriver(caps); + webDriver.get("http://huaban.com/"); + WebElement webElement = webDriver.findElement(By.xpath("/html")); + System.out.println(webElement.getAttribute("outerHTML")); + webDriver.close(); + } +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java new file mode 100644 index 000000000..2b8c24711 --- /dev/null +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
      + * Date: 13-7-26
      + * Time: 下午2:46
      + */ +public class SeleniumDownloaderTest { + + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + + @Ignore("need chrome driver") + @Test + public void test() { + SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + long time1 = System.currentTimeMillis(); + for (int i = 0; i < 100; i++) { + Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + } + System.out.println(System.currentTimeMillis() - time1); + } + + @Ignore + @Test + public void testBaiduWenku() { + SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + seleniumDownloader.setSleepTime(10000); + long time1 = System.currentTimeMillis(); + Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all()); + } + +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java new file mode 100644 index 000000000..a711a19a8 --- /dev/null +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.junit.Ignore; +import org.junit.Test; +import org.openqa.selenium.WebDriver; + +/** + * @author code4crafter@gmail.com
      + * Date: 13-7-26
      + * Time: 下午2:12
      + */ +public class WebDriverPoolTest { + + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + + @Ignore("need chrome driver") + @Test + public void test() { + System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); + WebDriverPool webDriverPool = new WebDriverPool(5); + for (int i = 0; i < 5; i++) { + try { + WebDriver webDriver = webDriverPool.get(); + System.out.println(i); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + webDriverPool.closeAll(); + } +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java new file mode 100644 index 000000000..1696a3f95 --- /dev/null +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * 花瓣网抽取器。
      + * 使用Selenium做页面动态渲染。
      + * @author code4crafter@gmail.com
      + * Date: 13-7-26
      + * Time: 下午4:08
      + */ +public class HuabanProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all()); + if (page.getUrl().toString().contains("pins")) { + page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString()); + } else { + page.getResultItems().setSkip(true); + } + } + + @Override + public Site getSite() { + if (site == null) { + site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/").setSleepTime(0); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new HuabanProcessor()).thread(5) + .pipeline(new FilePipeline("/data/webmagic/test/")) + .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) + .runAsync(); + } +}