From e7e7f7c8874435a5582e256ec0baa776d89bfc64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BA=BF=E5=8D=8E?= Date: Sun, 9 Jun 2013 22:55:49 +0900 Subject: [PATCH 01/81] update samples --- README.md | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 723adf42f..e4adfd2c1 100644 --- a/README.md +++ b/README.md @@ -61,27 +61,7 @@ webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫 } } ---- - -TODO - - - public class OschinaBlogPageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(http://my\\.oschina\\.net/\\w+/blog/\\d+)[\"']{1}").toStrings(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().sc()); - page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Chrome/26.0.1410.65 Safari/537.31"); - } - } +### 示例 +可参考作者博客[使用webmagic抓取页面并保存为wordpress文件](http://my.oschina.net/flashsword/blog/136846) From e7c9ba8369f0e70a91456d942588346ce1c112d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BA=BF=E5=8D=8E?= Date: Sun, 9 Jun 2013 22:00:39 +0800 Subject: [PATCH 02/81] update comment --- .../us/codecraft/webmagic/selector/SmartContentSelector.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index c2e36dff9..10ab15c6c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -8,7 +8,8 @@ import java.util.concurrent.atomic.AtomicInteger; /** - * 找到clear + * readability算法,基础是找到所有p标签的父节点 + * 写的比较乱,最终效果还在尝试中 * User: cairne * Date: 13-4-21 * Time: 下午4:42 From c90fb42a2c0e299be7bb667c6d338eb6915e9859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BA=BF=E5=8D=8E?= Date: Thu, 13 Jun 2013 15:51:05 +0800 Subject: [PATCH 03/81] update pom --- pom.xml | 99 ++++----------------------------------------------------- 1 file changed, 6 insertions(+), 93 deletions(-) diff --git a/pom.xml b/pom.xml index c424910a6..3772cd011 100644 --- a/pom.xml +++ b/pom.xml @@ -8,98 +8,11 @@ webmagic - - - org.apache.httpcomponents - httpclient - 4.2.1 - + + ./webmagic-core + ./webmagic-plugin/ + ./webmagic-samples/ + - - junit - junit - 4.7 - test - - - com.google.guava - guava - 13.0.1 - - - - org.apache.commons - commons-lang3 - 3.1 - - - - log4j - log4j - 1.2.17 - - - - commons-collections - commons-collections - 3.2.1 - - - - net.sourceforge.htmlcleaner - htmlcleaner - 2.4 - - - - org.apache.commons - commons-io - 1.3.2 - - - - - - - - org.apache.maven.plugins - 
maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - - \ No newline at end of file + From de3e9055610b38b56fd3952d4d7e0340222240a2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 13 Jun 2013 15:58:42 +0800 Subject: [PATCH 04/81] update readme --- README.md | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e4adfd2c1..8724e8902 100644 --- a/README.md +++ b/README.md @@ -2,25 +2,28 @@ webmagic --------- ####*一个网络爬虫工具包* -webmagic的发起源于工作中的需要,其定位是帮助开发者更便捷的开发一个垂直的网络爬虫。webmagic可以便捷的使用xpath和正则表达式进行链接和内容的提取,对于有Java和xpath或者正则基础的开发者,只需编写少量代码即可完成一个定制爬虫。 +webmagic的发起源于工作中的需要,其定位是帮助开发者更便捷的开发一个垂直的网络爬虫。 -###哲学### +webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),开发者可以便捷的使用xpath和正则表达式进行链接和内容的提取,只需编写少量代码即可完成一个定制爬虫。 -* Write Less, Do more. 
+###特色### - webmagic是一个开发者的工具包,它的目标是让开发者可以通过更少的代码,实现一个高质量的爬虫。webmagic内部还集成了一些常见的垂直性爬虫的功能,例如针对页面正文的Readability技术,可以直接对页面的正文进行智能提取。 +* ####垂直爬虫#### + webmagic着重于页面抽取的工作。开发者可以使用xpath和正则表达式进行链接和内容的提取,支持链式API调用,以及单复数转换。 + + String content = page.getHtml().x("//div[@class='body']").r("这段话比较重要(.*)").toString(); +* ####嵌入式&无配置#### + webmagic与其他Full-Stack的框架不同,没有配置文件,大部分功能都通过简单的API调用完成。webmagic以jar包的形式存在,并且不依赖任何框架,在程序可以随处进行调用。 + 以下是爬取oschina博客的一段代码: Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); + +* ####可扩展#### + 参考[`scrapy`](https://github.com/scrapy/scrapy)的设计,webmagic将爬虫的扩展点分为Processor、Schedular、Downloader、Pipeline三个模块,可以通过扩展这些接口实现强大的扩展功能。如可以通过多个Spider实现多线程抓取;可以通过扩展Schedular实现断点续传乃至于分布式爬虫;可以通过扩展Pipeline实现业务可定制的持久化功能。 -* 简单可用 - - webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),是一个完整的爬虫框架。但是与其他Full-Stack的框架不同,webmagic只引入少量约定,大部分功能都通过简单的API调用完成,目的是尽量降低开发者的学习成本。webmagic以jar包的形式存在,并且不依赖任何框架,在程序可以随处进行调用。 - -* 灵活性 - - 参考scrapy的设计,webmagic将爬虫的扩展点分为processor、schedular、downloader、pipeline三个模块,可以通过扩展这些接口实现强大的扩展功能。如可以通过多个Spider实现多线程抓取;可以通过扩展schedular实现断点续传乃至于分布式爬虫;可以通过扩展pipeline实现业务可定制的持久化功能。 + ------ @@ -65,3 +68,7 @@ webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫 可参考作者博客[使用webmagic抓取页面并保存为wordpress文件](http://my.oschina.net/flashsword/blog/136846) +### 协议 + +webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) + From a48523ed4b3173c0c11be0a69d9f1716cd2b5d17 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 16 Jun 2013 14:47:02 +0800 Subject: [PATCH 05/81] fix a pom error --- README.md | 15 +++++++++++---- pom.xml | 2 +- .../webmagic/processor/SimplePageProcessor.java | 4 ++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8724e8902..e92a4405c 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,9 @@ webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载 ###Get Started -webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫例子是这样的: - - Spider.me().processor(new 
SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); +webmagic定制的核心是PageProcessor接口。 -其中SimplePageProcessor实现如下: +例如,我们要实现一个简单的通用爬虫SimplePageProcessor,代码如下: public class SimplePageProcessor implements PageProcessor { @@ -53,16 +51,25 @@ webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫 @Override public void process(Page page) { List requests = page.getHtml().as().rs(urlPattern).toStrings(); + //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); + //xpath方式抽取 page.putField("title", page.getHtml().x("//title")); + //sc表示使用Readability技术抽取正文 page.putField("content", page.getHtml().sc()); } @Override public Site getSite() { + //定义抽取站点的相关参数 return site; } } + +调用这个爬虫的代码如下: + + Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); + ### 示例 diff --git a/pom.xml b/pom.xml index 3772cd011..68927f20b 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ us.codecraft 0.0.1-SNAPSHOT 4.0.0 - + pom webmagic diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index a8165bb42..eb8f56ea2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -29,13 +29,17 @@ public SimplePageProcessor(String startUrl, String urlPattern) { @Override public void process(Page page) { List requests = page.getHtml().as().rs(urlPattern).toStrings(); + //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); + //xpath方式抽取 page.putField("title", page.getHtml().x("//title")); + //sc表示使用Readability技术抽取正文 page.putField("content", page.getHtml().sc()); } @Override public Site getSite() { + //定义抽取站点的相关参数 return site; } } From 1a2f8fb247750a6c3dc0ef4e3d682e07f9f94278 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 
Jun 2013 11:12:22 +0800 Subject: [PATCH 06/81] clean some code --- .../webmagic/processor/PageProcessor.java | 2 +- .../webmagic/selector/Selectable.java | 4 +- .../us/codecraft/webmagic/SpiderTest.java | 131 - .../webmagic/selector/HtmlCleanerTest.java | 28 - .../selector/SmartConentSelectorTest.java | 3051 ------------- .../webmagic/selector/XpathSelectorTest.java | 4068 ++++++----------- webmagic-core/src/test/resources/log4j.xml | 2 +- .../java/us/codecraft/webmagic/HtmlTest.java | 20 - .../us/codecraft/webmagic/SpiderTest.java | 3 + .../processor/DiandianProcessorTest.java | 2 + .../processor/DiaoyuwengProcessorTest.java | 2 + .../processor/SinablogProcessorTest.java | 2 + 12 files changed, 1350 insertions(+), 5965 deletions(-) delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java delete mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 22a24c96f..e3c74a904 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -18,7 +18,7 @@ public interface PageProcessor { /** * the site the processor for - * @return + * @return site */ public Site getSite(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 0fcc4208d..68ca47d44 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java 
@@ -35,14 +35,14 @@ public interface Selectable { /** * select a link * - * @return + * @return first link */ public Selectable a(); /** * select all links * - * @return + * @return all links */ public Selectable as(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java deleted file mode 100644 index 5cb9848ff..000000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ /dev/null @@ -1,131 +0,0 @@ -package us.codecraft.webmagic; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.samples.HuxiuProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; - -/** - * User: cairne - * Date: 13-4-20 - * Time: 下午7:46 - */ -public class SpiderTest { - - - @Test - public void testSpider() throws InterruptedException { - Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); - me.run(); - } - - @Test - public void testGlobalSpider(){ -// PageProcessor pageProcessor = new MeicanProcessor(); -// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). -// processor(pageProcessor).run(); - SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); - pageProcessor2.getSite().setEncoding("GBK"); - System.out.println(pageProcessor2.getSite().getEncoding()); - Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")). 
- processor(pageProcessor2).run(); - - - } - - @Test - public void test(){ - System.out.println(System.getProperty("java.io.tmpdir")); - } - - - @Ignore - @Test - public void languageSchema() { - - - /** - * - * _hrefs = rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") - * title = r(""(.*)"") - * body = x("//dd[@class='w133']") - * - * site.domain = "sh.58.com" - * site.ua="" - * site.cookie="aa:bb" - * - */ - - /** - * - * - * if (page == r('') && refer(1) == 1) { - * - * type = _refer(1) - * content = _text.t().c() - * title = x("asd@asd").r("",1) - * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) - * - * body=body[r(_currentUrl).g(1)] - * tags[%] = (tags[%] + xs('')) . r('') - * - * _targetUrls.add('' + x('').r('')) - * _sourceUrls.add() - * _header.put("",""); - * _cookie.add("asdsadasdsa"); - * - * - * } - * - * _cookie.add(_cookie['']) - * - * if (page == r('') && refer(1) == 1) - * ( - * _targetUrl = '' + x('') & r('') - * _sourceUrl = '' - * ) - * - */ - - /** - * - * - * - * - * - * - * - * - * - * - */ - - /** - * - * if (model.url('') && model.refer(1) == 1) - * ( - * - * model.set(type, model.refer(1)) - * content = t(_html) > c() - * title = x(_html, 'asd@asd') > r('',1) - * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') - * tags[%] = tags + xs('') > r('') - * model.setTargetUrl(); - * - * _targetUrl = '' + x('') & r('') - * _sourceUrl = '' - * ) - * - * _cookie.add(_cookie['']) - * - * if (page == r('') && refer(1) == 1) - * ( - * _targetUrl = '' + x('') & r('') - * _sourceUrl = '' - * ) - * - */ - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java deleted file mode 100644 index 7aa2fc77d..000000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.selector; - 
-import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.junit.Test; - -import java.io.IOException; -import java.net.URL; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 上午10:35 - */ -public class HtmlCleanerTest { - - @Test - public void test() throws IOException { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - - CleanerProperties props = htmlCleaner.getProperties(); - - TagNode node = htmlCleaner.clean(new URL("http://www.huanqiu.com"),"UTF-8"); - System.out.println(node.getAllElementsList(true)); - System.out.println(node); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java deleted file mode 100644 index 4620a242b..000000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java +++ /dev/null @@ -1,3051 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.junit.Test; - -import java.io.IOException; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 上午10:35 - */ -public class SmartConentSelectorTest { - - @Test - public void test() throws IOException { - String text ="\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 全文Feed的终极解决方案 - 阮一峰的网络日志\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
阮一峰的网络日志 » 首页 » 档案\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "

分类

\n" + - " \n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "

全文Feed的终极解决方案

\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "

作者: 阮一峰

\n" + - "\n" + - "

日期: 2010年4月17日

\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - " \n" + - "

正如我们都知道的,全文Feed最有用。

\n" + - " \n" + - "\n" + - "\n" + - " \n" + - "

但是,世界上的大部分Feed,都是摘要Feed,甚至是标题Feed。我们只好自己动手,制作全文Feed。

\n" + - "\n" + - "

传统的制作方法非常麻烦,需要针对不同的网站,编写不同的内容提取规则。要是有一个傻瓜型的\"全文Feed生成器\",把摘要Feed往里面一扔,全文Feed就自动生成了,那该多好。

\n" + - "\n" + - "

FiveFilters.org提供的生成器,大概最接近于这种要求。

\n" + - "\n" + - "

\n" + - "\n" + - "

举例来说,网易的社会新闻Feed(http://news.163.com/special/00011K6L/rss_sh.xml)是一个摘要Feed。

\n" + - "\n" + - "

\n" + - "\n" + - "

我们把这个网址,送进FiveFilters.org,点击\"Create Feed\"按钮,全文Feed就自动产生了!(查看效果

\n" + - "\n" + - "

但是,这个生成器并不是百用百灵,比如新浪的Feed(http://rss.sina.com.cn/news/society/focus15.xml)就无法抓取全文。

\n" + - "\n" + - "

好在今年3月份,它开源了。作者Keyvan Minoukadeh将所有代码都公开了,所以如果遇到不能生效的Feed,现在我们就可以修改源码了。因此理论上,几乎所有的摘要Feed都可以自动转成全文Feed了。

\n" + - "\n" + - "

源码存放在launchpad.net上,需要安装Bazaar的客户端才能下载。我为大家提供方便,把它们压缩成一个zip文件,点击下载(1.0版,217KB)。

\n" + - "\n" + - "

下载后,上传到支持PHP 5.2的虚拟主机上,就可以直接使用。使用的时候,需要将cache子目录设为可写(权限777)。在config-sample.php文件中,可以查看设置选项,修改默认值后,将文件名改为config.php,就会生效。(不修改亦可,config文件并不是必需的。)

\n" + - "\n" + - "

这个程序的核心是readability.php文件,它负责判断当前网页中,那一部分属于页面的主要内容,然后将其抓取出来。实现原理照搬了arc90的ReadAbility脚本。简单说,思路是这样的:1)检查页面中所有p元素的父容器;2)根据相关特征,为每一个父容器计算一个特征值;3)特征值最大的容器,就是放置主要内容的容器。

\n" + - "\n" + - "

具体实现请阅读代码,源码写得非常清晰,而且有详细的注释。如果遇到不能抓取全文的Feed,你就要自己修改readability.php,增加相应的规则。比如,在我提供下载的代码中,我就设置了新浪网的规则,新浪网的全文Feed就能自动生成了。

\n" + - "\n" + - "

这个程序使用的是AGPL许可证,这就是说你可以自由地使用、修改、发布这个程序,但是只要你向他人提供基于这个程序的服务,你就必须公开源码。

\n" + - "\n" + - "

作者Keyvan Minoukadeh允诺,只要使用者向他捐款200美元,就发布2.0版。如果你喜欢这个程序,建议向他捐款

\n" + - "\n" + - "

P.S.

\n" + - "\n" + - "

这几天,我还发现了一个非常优秀的开源相册软件ZenPhoto,也推荐使用。

\n" + - "\n" + - "

UPDATE(2010.6.3)

\n" + - "\n" + - "

Full TEXT RSS 1.5版下载(283KB)

\n" + - "\n" + - "

UPDATE(2010.11.10)

\n" + - "\n" + - "

Full TEXT RSS 2.1版下载(362KB)

\n" + - "\n" + - "

(完)

\n" + - " \n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "

文档信息

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - "
\n" + - "
\n" + - "

相关文章

\n" + - "
    \n" + - "\n" + - "
  • 2012.12.21: Javascript异步编程的4种方法\n" + - "\n" + - "
    \n" + - " 你可能知道,Javascript语言的执行环境是\"单线程\"(single thread)。\n" + - "
    \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - "
  • 2012.12.14: 奥巴马筹款网站的制作过程\n" + - "\n" + - "
    \n" + - " 1.\n" + - "\n" + - "Kyle Rush是一个网站工程师。\n" + - "
    \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "

功能链接

\n" + - "
    \n" + - "
  • 前一篇:\"草原新城\"康巴什
  • \n" + - "
  • 后一篇:网络时代的音乐家生存指南
  • \n" + - "
  • 更多内容请访问:首页 » 档案 » \n" + - "IT技术 \n" + - "
  • \n" + - "\n" + - "
  • \n" + - "\n" + - "
    \n" + - "\n" + - "站内搜索:\n" + - "\n" + - "\n" + - "Web\n" + - "\n" + - "www.ruanyifeng.com\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
    \n" + - "\n" + - "
  • \n" + - "
  • Feed订阅:
  • \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "

广告(购买广告位)

\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "

留言(23条)

\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " zp\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

能不能介绍些Movable Type的文章,我比较喜欢它的静态页面,国内关于它的资料好像还不多。特别是MT5出来后,多页面功能可能会让刚接触的人晕头转向。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 火点\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

很好,谢谢作者,只是赶到花了大量的时间在新闻上似乎有点不利于思考。

\n" + - "\n" + - "

用一个图书管理软件(BLM)整理了大学期间看过的书,仅有180本左右,汗颜,这就是我的大学……

\n" + - "\n" + - "

现在参加工作了,好在业余时间还算充裕,希望可以多读一些书。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " AlbertDiao\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 野草博客\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

嗯,野草一直在用他:)

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用AlbertDiao的发言:
\n" + - "\n" + - "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + - "\n" + - "
\n" + - "\n" + - "

流量会越来越便宜,真正昂贵的是你的时间。所以还是全文Feed好。

\n" + - "\n" + - "
\n" + - "
引用zp的发言:
\n" + - "\n" + - "

能不能介绍些Movable Type的文章。

\n" + - "\n" + - "
\n" + - "\n" + - "

我有这个打算,但是文章不太好写,还需要准备。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " luops\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

昨晚测试了此订阅
\n" + - " 同时我也保留了原订阅。
\n" + - "今天发现,同样订阅了163新闻的情况下
\n" + - "全文订阅比官方订阅少了很多新闻
\n" + - "不知其他童靴有没有这样子情况

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 鲜为人志\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

呵呵~ 这样都可以啊~

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " roy_hu\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用AlbertDiao的发言:
\n" + - "\n" + - "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + - "\n" + - "
\n" + - "\n" + - "

我更喜欢全文博客,因为在手机上看Google Reader,自动都排好了版,而看全文的时候需要浏览器排版,没有Google Reader那样专门设计给手机的看着舒服。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Jack\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

另外,也可以用YAHOO PIPE 和YQL来抓取全文。这样除了可以把非全文的FEED变成全文输出外,还可以处理根本没有FEED输出的网页。(不过有很多网页需要处理一下GB2312和UNICODE转换。).而且这样还有一个最大的好处,就是不用建立自己的服务器。

\n" + - "\n" + - "


\n" + - "下面两个FEED 就是用这种办法生成的。
\n" + - "http://feeds.feedburner.com/wenxuecity_news

\n" + - "\n" + - "

http://feeds.feedburner.com/boxun_headline

\n" + - "\n" + - "

可以用GOOGLE READER 来读取它们。也不失为一种间接翻越G/F/W 的办法。
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用luops的发言:
\n" + - "\n" + - "

全文订阅比官方订阅少了很多新闻

\n" + - "\n" + - "
\n" + - "\n" + - "

全文Feed默认只有4个条目,下载代码后,你可以自己修改这个值。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 坏坏鼠\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

不懂编程只会用GR的文科生飘过~~~
\n" + - "ps:阮老师的这篇文章GR里也只是显示标题,所以漂洋过海地过来了(牛博编辑的那个频道,已经将你的博客订阅了呵O(∩_∩)O)~~

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 111\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用luops的发言:
\n" + - "\n" + - "

全文订阅比官方订阅少了很多新闻

\n" + - "\n" + - "
\n" + - "\n" + - "


\n" + - "是这样的,丢失了好多,时效性好差
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " kuber\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

想请教一下你怎么修改规则来全文输出新浪网rss的, 我也碰到几个Feed,缺省的配置不能正确处理.
\n" + - "另外我建议设立一个地方大家可以交流一下脚本不能处理的feed,以及修改的方法, 这样各人不用重复浪费时间了.

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 111\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

下载了lz的代码,发布到网站上,功能可用了。rss数量自己设置就好。

\n" + - "\n" + - "

杯具的是网站只有内网地址,gr不认生成的feed地址。

\n" + - "\n" + - "

只能CS订阅,不喜。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " lietlie\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

http://mrss.dokoda.jp/
\n" + - "虽然是小鬼子的网站,但是是我找到的能够全文Feed最好的在线工具了,和LZ推荐的网站相比,可以输出所有项目,而没有4条目的限制,当然也不必自己搭建服务器,日文内容很简单,如果使用的是FF或Chrome浏览器还可以利用Google的自动翻译功能将大致内容翻译为中文(FF利用Google Toolbar)——其实即使不翻译一样很容易使用。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用kuber的发言:
\n" + - "\n" + - "

想请教一下你怎么修改规则来全文输出新浪网rss的, 我也碰到几个Feed,缺省的配置不能正确处理.

\n" + - "\n" + - "
\n" + - "\n" + - "

新浪的内容容器,有一个比较怪的ID名。只要搜索这个字符串,就能提取内容了。

\n" + - "\n" + - "

最终,你还是需要读readability.php的代码,只要读懂了,我觉得任何页面都能提取。
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 诗沐\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

哇 源码写得相当清爽啊~注释习惯很棒

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " xangd\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

有人在appspot上部署了一个python的port
\n" + - "http://andrewtrusty.appspot.com/readability/
\n" + - "这个没有4篇post的限制

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " neotrue\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

很好用,谢谢!

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " harvey\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

博主,作者把1.5版本放出来了,
\n" + - "可否再麻烦你打包一下,我bazzar一直不成功

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " Ruan YiFeng\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
引用harvey的发言:
\n" + - "\n" + - "

博主,作者把1.5版本放出来了,
\n" + - "可否再麻烦你打包一下,我bazzar一直不成功

\n" + - "\n" + - "
\n" + - "\n" + - "

已经加上去了,:-)

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " 张治国\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

博主,全文Feed默认只有4个条目,下载代码后,修改哪段代码可以改变这个值啊,config-sample.PHP中的数值吗?我是新手,希望博主指点一下,谢谢。

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - " felix\n" + - "\n" + - " 说:\n" + - " \n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - "

看不懂readability,不知道博主能否提供一下过滤页面上的干扰字符的方法
\n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - "
\n" + - "

我要发表看法

\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "

\n" + - "

\n" + - "
\n" + - "
\n" + - "

\n" + - "

«-必填

\n" + - "
\n" + - "
\n" + - "

\n" + - "

«-必填,不公开

\n" + - "
\n" + - "
\n" + - "

\n" + - "

«-我信任你,不会填写广告链接

\n" + - "
\n" + - "
\n" + - "

\n" + - "

\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

正在发表您的评论,请稍候

\n" + - "

\n" + - " \n" + - "\n" + - "

\n" + - "
\n" + - "\n" + - "

«- 点击按钮

\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "

联系方式 | ruanyifeng.com 2003 - 2012\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
分享按钮 \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - ""; - Html html = new Html(text); - Selectable sc = html.sc(); - System.out.println(sc); - } - - @Test - public void test2(){ - String text = "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " 地球上最后的夜晚 (豆瓣)\n" + - " \n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - " 提醒\n" + - " \n" + - "
\n" + - "
\n" + - "

加载中...

\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "
\n" + - "
    \n" + - " \n" + - " \n" + - "
  • \n" + - " 豆瓣\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 读书\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 电影\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 音乐\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 同城\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 小组\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 阅读\n" + - "
  • \n" + - " \n" + - " \n" + - "
  • \n" + - " 豆瓣FM\n" + - "
  • \n" + - " \n" + - "
  • \n" + - " 更多\n" + - "
    \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
    九点
    阿尔法城
    移动应用
    \n" + - "
    \n" + - "
  • \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " 豆瓣读书\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " 搜索:\n" + - " \n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "

\n" + - " 地球上最后的夜晚\n" + - "
\n" + - "

\n" + - "\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \"地球上最后的夜晚\"\n" + - " \n" + - "\n" + - "
\n" + - "

\n" + - " 更新描述或封面\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 原作名: Last Evenings on Earth
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 作者: \n" + - " \n" + - " [智利] 罗贝托·波拉尼奥\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 译者: \n" + - " \n" + - " 赵德明\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 出版社: 上海人民出版社
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 出版年: 2013-4-1
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 页数: 288
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 定价: 45.00元
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " 丛书: 罗贝托·波拉尼奥作品系列
\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " ISBN: 9787208112025
\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "

\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " 8.4\n" + - " \n" + - "\n" + - " \n" + - "

\n" + - "

\n" + - " (\n" + - " \n" + - " 11人评价\n" + - " \n" + - " )\n" + - "

\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 45.5%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 9.1%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 18.2%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 18.2%
\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - " 9.1%
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "
\n" + - " \n" + - " 想读\n" + - " \n" + - " \n" + - " 在读\n" + - " \n" + - " \n" + - " 读过\n" + - " \n" + - "
\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " 评价: \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "
    \n" + - "
  • \n" + - "  写笔记\n" + - "
  • \n" + - "\n" + - "
  • \n" + - "  写书评\n" + - "
  • \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - " 加入购书单\n" + - " 已在购书单\n" + - "
    \n" + - "
  • \n" + - "\n" + - "
  • \n" + - " \n" + - "\n" + - "\n" + - "
    \n" + - " \n" + - "\n" + - " \n" + - "\n" + - " \n" + - "\n" + - " 添加到豆列\n" + - "
    \n" + - "\n" + - "
  • \n" + - "\n" + - " \n" + - " \n" + - " \n" + - "
  • \n" + - " 分享到   \n" + - "
  • \n" + - " \n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - " 推荐\n" + - " \n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 内容简介\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 作者简介\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "

罗贝托•波拉尼奥(Roberto Bolaño,1953—2003)出生于智利,父亲是卡车司机和业余拳击手,母亲在学校教授数学和统计学。1968年全家移居墨西哥。1973年波拉尼奥再次回到智利投身社会主义革命却遭到逮捕,差点被杀害。逃回墨西哥后他和好友推动了融合超现实主义、达达主义以及街头剧场的“现实以下主义”(Infrarealism)运动,意图激发拉丁美洲年轻人对生活与文学的热爱。1977年他前往欧洲,最后在西班牙波拉瓦海岸结婚定居。2003年因为肝脏功能损坏,等不到器官移植而在巴塞罗那去世,年仅五十岁。

波拉尼奥四十岁才开始写小说,作品数量却十分惊人,身后留下十部小说、四部短篇小说集以及三部诗集。1998年出版的《荒野侦探》在拉美文坛引起的轰动,不亚于三十年前《百年孤独》出版时的盛况。而其身后出版的《2666》更是引发欧美舆论压倒性好评,均致以...

(展开全部)

\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "

罗贝托•波拉尼奥(Roberto Bolaño,1953—2003)出生于智利,父亲是卡车司机和业余拳击手,母亲在学校教授数学和统计学。1968年全家移居墨西哥。1973年波拉尼奥再次回到智利投身社会主义革命却遭到逮捕,差点被杀害。逃回墨西哥后他和好友推动了融合超现实主义、达达主义以及街头剧场的“现实以下主义”(Infrarealism)运动,意图激发拉丁美洲年轻人对生活与文学的热爱。1977年他前往欧洲,最后在西班牙波拉瓦海岸结婚定居。2003年因为肝脏功能损坏,等不到器官移植而在巴塞罗那去世,年仅五十岁。

波拉尼奥四十岁才开始写小说,作品数量却十分惊人,身后留下十部小说、四部短篇小说集以及三部诗集。1998年出版的《荒野侦探》在拉美文坛引起的轰动,不亚于三十年前《百年孤独》出版时的盛况。而其身后出版的《2666》更是引发欧美舆论压倒性好评,均致以杰作、伟大、里程碑、天才等等赞誉。苏珊•桑塔格、约翰•班维尔、科尔姆•托宾、斯蒂芬•金等众多作家对波拉尼奥赞赏有加,更有评论认为此书的出版自此将作者带至塞万提斯,斯特恩,梅尔维尔,普鲁斯特,穆齐尔与品钦的同一队列。

\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 目录\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " 圣西尼……………………………………3
\n" + - " 亨利·西蒙·勒普兰斯…………………… 27
\n" + - " 恩里克·马丁……………………………39
\n" + - " 一件文学奇事…………………… ……59
\n" + - " 通话…………………… ………………75
\n" + - " 毛毛虫…………………………………83
\n" + - " · · · · · ·\n" + - " (更多)\n" + - "
\n" + - "\n" + - "
\n" + - " 圣西尼……………………………………3
\n" + - " 亨利·西蒙·勒普兰斯…………………… 27
\n" + - " 恩里克·马丁……………………………39
\n" + - " 一件文学奇事…………………… ……59
\n" + - " 通话…………………… ………………75
\n" + - " 毛毛虫…………………………………83
\n" + - " 安妮·穆尔的生平 ……………………101
\n" + - " “小眼”席尔瓦 ………………………139
\n" + - " 戈麦斯帕拉西奥 ……………………159
\n" + - " 地球上最后的夜晚………………… 173
\n" + - " 1978 年的几天………………………205
\n" + - " 在法国和比利时闲逛…………………225
\n" + - " 牙科医生…………………… ………245
\n" + - " 邀舞卡……………………………… 273
\n" + - " · · · · · · (收起)\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " "地球上最后的夜晚"试读\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - "

情况是这样的:B 和B 父去阿卡普尔科度假。一大早,清晨六点,父子俩就要出发。那天夜里,B 睡在父亲家里。没梦,或者就算有梦,一睁眼也忘了。听见父亲在卫生间。向窗外望去,一片漆黑。B 不开灯,穿衣裳。等走出卧室的时候,父亲已经在桌旁看前一天的体育报纸了。早饭已经做好了。咖啡,牧场煎蛋。B 问候父亲后,走进卫生间。\n" + - "B 父的汽车是1970 年的福特野马。六点半,父子俩上车,开..

\n" + - "\n" + - "
· · · · · · (查看全部试读)
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 豆瓣成员常用的标签(共38个)\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
罗贝托-波拉尼奥(68)   拉美文学(35)   外国文学(24)   小说(22)   智利文学(14)   波拉尼奥(10)   智利(10)   小说集(10)  
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "

丛书信息

\n" + - "
\n" + - "  罗贝托·波拉尼奥作品系列 (共6册),\n" + - "这套丛书还有\n" + - "《2666》,《荒野侦探》,《2666》,《荒野侦探》,《护身符》。
\n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - " 书评  · · · · · · \n" + - "

\n" + - " \n" + - " 我来评论这本书\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \"DeadKennedy\"/\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \">\"\n" + - " \n" + - " \"<\"\n" + - "
\n" + - " 信仰的挽歌\n" + - "

\n" + - "
\n" + - " \n" + - " DeadKennedy   \n" + - " \n" + - " \n" + - "

\n" + - "
\n" + - " Elegy to Faith\n" + - "\n" + - "\n" + - "波拉诺难得的短篇集。\n" + - "\n" + - "\n" + - "比之长篇,波拉诺的短篇是其能力的代表。他的长篇像话剧台词,冗长,精彩,让人迷失其中,在读过大概三百页之后似乎明白一些他在说什么。而他的短篇则像电台DJ的串词,明了,信息丰富,基本是波拉诺的自传和自白。很多篇目就是作家自身经历的镜像。是一些关于动荡,个人自由,劳动,知识份子,流放和坚持的故事。纽约时报评论这本书为“流放民谣”。\n" + - "\n" + - "\n" + - "比如写自身经历的:......\n" + - "\n" + - "

\n" + - " \n" + - " 2012-02-14 13:53    \n" + - " 2/2有用\n" + - " \n" + - " \n" + - " 来自 New Directions2007版\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - " \"DeadKennedy\"/\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \">\"\n" + - " \n" + - " \"<\"\n" + - "
\n" + - " 信仰的挽歌\n" + - "

\n" + - "
\n" + - " \n" + - " DeadKennedy   \n" + - " \n" + - " \n" + - "

\n" + - "
\n" + - " Elegy to Faith\n" + - "\n" + - "\n" + - "波拉诺难得的短篇集。\n" + - "\n" + - "\n" + - "比之长篇,波拉诺的短篇是其能力的代表。他的长篇像话剧台词,冗长,精彩,让人迷失其中,在读过大概三百页之后似乎明白一些他在说什么。而他的短篇则像电台DJ的串词,明了,信息丰富,基本是波拉诺的自传和自白。很多篇目就是作家自身经历的镜像。是一些关于动荡,个人自由,劳动,知识份子,流放和坚持的故事。纽约时报评论这本书为“流放民谣”。\n" + - "\n" + - "\n" + - "比如写自身经历的:......\n" + - "\n" + - "

\n" + - " \n" + - " 2012-02-14 13:53    \n" + - " 2/2有用\n" + - " \n" + - " \n" + - " 来自 New Directions2007版\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 论坛\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
平装or精装?孔亚雷or赵德明?来自Nihilum5 回应2013-04-21
書到底出了沒啊?来自阿城199114 回应2013-04-13
不是翻译问题,是根本看不懂来自呆呆双鱼女1 回应2013-04-20
\n" + - "\n" + - "\n" + - "

>\n" + - " 在这本书的论坛里发言\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "

\n" + - " 在哪儿买这本书?\n" + - "

\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 加入购书单\n" + - " \n" + - " 已在购书单 \n" + - " 查看\n" + - " 删除\n" + - " \n" + - " \n" + - " \n" + - "\n" + - " 多本比价,批量购买\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 这本书的其他版本 \n" + - "  · · · · · ·\n" + - "  (\n" + - " 全部3\n" + - " ) \n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 以下豆列推荐\n" + - "  · · · · · ·\n" + - "  (\n" + - " 全部\n" + - " ) \n" + - "\n" + - "

\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "

谁读这本书?

\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - " \n" + - "
\"小K\"
\n" + - "
小K
\n" + - "
13分钟前 想读
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \n" + - "
\"杰森辛普森\"
\n" + - "
杰森辛普森
\n" + - "
28分钟前 想读
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \n" + - "
\"Aby\"
\n" + - "
Aby
\n" + - "
37分钟前 想读
\n" + - "\n" + - "
\n" + - "\n" + - " tags:对人生的诠释\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - " \n" + - "
\"老男孩\"
\n" + - "
老男孩
\n" + - "
1小时前 想读
\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "

\n" + - "
\n" + - "\n" + - "\n" + - "

> 5人在读

\n" + - "

> 12人读过

\n" + - "

> 658人想读

\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "

\n" + - "\n" + - " 喜欢这本书的人常去的小组\n" + - "  · · · · · ·\n" + - "\n" + - "

\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - "
\n" + - "
\"托马斯·品钦\"/
\n" + - " \n" + - "
托马斯·品钦 (711)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"短经典\"/
\n" + - " \n" + - "
短经典 (787)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"寻找:布鲁诺.舒尔茨\"/
\n" + - " \n" + - "
寻找:布鲁诺.舒尔茨 (466)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"胡安·鲁尔福\"/
\n" + - " \n" + - "
胡安·鲁尔福 (613)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"V.S.奈保尔\"/
\n" + - " \n" + - "
V.S.奈保尔 (445)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"胡利奥·科塔萨尔\"/
\n" + - " \n" + - "
胡利奥·科塔萨尔 (1053)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"中国当代书籍装帧摭评\"/
\n" + - " \n" + - "
中国当代书籍装帧摭评 (1373)\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\"泼先生\"/
\n" + - " \n" + - "
泼先生 (485)\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "
\n" + - "

> 收藏这本书的1个小组

\n" + - "
\n" + - "

> \n" + - " 加到我的小组收藏里 \n" + - "

\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "

二手市场

\n" + - "
\n" + - "
    \n" + - "
  • \n" + - " > 点这儿转让\n" + - "\n" + - " 有658人想读,手里有一本闲着?\n" + - "
  • \n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "

订阅关于地球上最后的夜晚的评论:
\n" + - " feed: rss 2.0

\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - " © 2005-2013 douban.com, all rights reserved\n" + - "\n" + - "\n" + - "\n" + - " 关于豆瓣\n" + - " · 在豆瓣工作\n" + - " · 联系我们\n" + - " · 免责声明\n" + - " \n" + - " · 帮助中心\n" + - " · 开发者\n" + - " · 图书馆合作\n" + - " · 手机读书\n" + - " · 豆瓣广告\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n"; - - Html html = new Html(text); - System.out.println(html.sc()); - - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 96ea6e8cc..469ff26c6 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -4,2747 +4,1353 @@ import org.junit.Test; /** - * User: cairne - * Date: 13-4-21 - * Time: 上午10:06 + * User: cairne Date: 13-4-21 Time: 上午10:06 */ public class XpathSelectorTest { - String huxiuHtml = "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "产品情感化设计的两个层面-观点-@虎嗅网\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t\t\t\n" + - "\t\n" + - "
\n" + - "

\"虎嗅网\"

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "订阅虎嗅\n" + - "RSS\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "

产品情感化设计的两个层面

\n" + - "\n" + - "
\n" + - "
\n" + - " 2013-4-22 16:10\n" + - " \n" + - " \t评论(0)\n" + - " \n" + - "产品\n" + - "投稿\n" + - "\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
\n" + - " \"\"\n" + - " 用户之所以选择一款产品,首要的一点在于产品的功能或内容满足了用户。而随着产品的发展,同类型的产品基础功能都大致相同,产品之间的竞争越来越难在功能层面拉开差距。现在产品人员也更加开始在用户体验上下功夫了,而对用户体验的不断追求也就上升到了情感层面。

谈起产品情感化设计,可以拿手机通讯录中添加联系人头像来举例子,单就这个功能点而言,最基础的只要用户能够添加联系人的头像即可,而如果在这个功能上添加用户情感化的元素后,就可以在用户的头像展示上给予更大空间,让用户能够更大的发挥自己的个性。我们也发现新浪微博和开心网个人主页的设计也都增加了个人封面的展示。产品情感化对于功能本身是没有影响的,而情感因素后,产品对用户还会更有吸引力。短期来看,个性化和给用户更大的发挥空间是产品情感化设计的两个很重要的方向。

产品的情感化设计有两个不同的做法:一个是在已有功能上进行扩展,如上文所提到的通讯录中上传头像的功能,是对用户表达欲的满足,用户情感的单向表达;另一种做法则是做一个完全情感化的产品,用户情感的双向表达,是用户之间情感内容的交流,产品扮演的只是桥梁作用,例如小恩爱、抬杠这样的产品。其实所有涉及到用户互动性的产品对于情感化的拓展空间都很大,但是与普通社交不同的是,产品的情感化在于人与人之间更深层次的交流。在我个人看来,社交网站中的发状态功能已经仅仅是用户表达的工具,极少含有感情因素,但是像Facebook推出的暗恋功能却是一个情感化产品,产品的情感化不仅在于让用户将自己的情感寄予到产品中,而且产品要想具有情感化很重要的一点在于产品本身能够起到挖掘用户情感的作用。

前面所提到的两种做法区别在于,前者是基于已有需求而进行的情感化设计,而后者则是完全情感化的产品,就成功率来讲,显然是前者更大一些。本身有需求的产品对于产品的情感化发展不仅奠定了基础,而且也烘托了氛围,做好了铺垫。如果是做一个完全情感化的产品,失败的可能性很大。当产品的功能满足了用户的情感表达,那就意味着产品可以满足用户的需求,而当产品本身所扮演的角色无法成为用户的寄托,那么产品就会面临失败。可想而知,情感化的产品肯定属于UGC类型,对于用户内容的质量要求会比较高,当技术水平不够高、功能操作不够便捷的时候,自然就提高了使用门槛。而且这种类型的产品对于氛围的烘托本身就会有相对高的要求。

如果单从功能角度去衡量,用户情感的单向表达属于功能层面,而用户情感的双向表达属于内容层面。除此之外,产品情感化还有文案和产品风格上的表现。

你是一个资深网虫,或许你也有所感觉,现在的网站文案已经越来越有人情味了。例如提示文案不是“你的账号密码错误”而是“密码不对哦”,文案中增加了语气词。这只是其中的一种表达方式,除此之外,你会看到产品设计中的很多引导方式也更有趣味性,文案内容的情感化也会增加用户的接受程度。

最近自己在使用产品中也有个很大的感触,就是产品风格对用户的吸引,同样是天气类应用,功能上相差无几,但是不同的风格却可以吸引不同的受众。有的是大众普通的风格,有的是小清新风格,有的是卡通风格等等,可以理解为用户对不同风格产品的选择背后的原因就是用户个人情感的不同,而用户的这种情感不能改变只能顺从。

更深层次的讲,产品情感化的关键在于产品功能与用户情感的承接,满足人们情感的诉求。从心理学上讲人的本性有很多,例如表达欲、攀比心理,但从人的本性和产品的情感化进行匹配,会有太多的点,在这里就不一一例举了,大家可以在产品的使用过程中逐渐感受。而之所以要选择利用人性情感的哪一点来设计产品就要根据具体的产品目标来衡量了。

文章来源:马虎眼    作者微信账号:mahuyan


本文由\n" + - "云瑞\n" + - "授权虎嗅网发表,并经虎嗅网编辑。转载此文章须经作者同意,并请附上出处(虎嗅网)及本页链接。
原文链接http://www.huxiu.com/article/13380/1.html\n" + - "
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - " \n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - " 分享(0):\n" + - "
\n" + - "
\n" + - "
    \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - "
  • \n" + - " \n" + - "
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
收藏\n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - " 没劲 \n" + - " 喜欢 \n" + - "
\t\t \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "

参与讨论,请先登录|注册

\n" + - "

\n" + - "\n" + - "\n" + - "

\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - " \n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\t\t\t
\n" + - "

作者:云瑞

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
个人签名
\n" + - "
人人都爱互联网
\n" + - "
\n" + - "
\n" + - "\t\t\t\t\n" + - "

作者其他文章

\n" + - "\n" + - " \n" + - "更多文章\n" + - "
\n" + - "\n" + - "
\n" + - "

您不能错过的作者

\n" + - "
  • \n" + - "

    \"葛甲\"

    \n" + - "

    葛甲

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"吴澍\"

    \n" + - "

    吴澍

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"知乎精选\"

    \n" + - "

    知乎精选

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"译言\"

    \n" + - "

    译言

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"潘乱\"

    \n" + - "

    潘乱

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"王云辉\"

    \n" + - "

    王云辉

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"阑夕\"

    \n" + - "

    阑夕

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"胡晓东\"

    \n" + - "

    胡晓东

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"阳淼\"

    \n" + - "

    阳淼

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"魏武挥\"

    \n" + - "

    魏武挥

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"高低买个皮夹克\"

    \n" + - "

    高低买个皮夹克

    \n" + - "
  • \n" + - "
  • \n" + - "

    \"潘越飞\"

    \n" + - "

    潘越飞

    \n" + - "
  • \n" + - "\t\n" + - "
\n" + - "
\n" + - "\n" + - " \t\t\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\t
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "

关于我们|加入我们|广告及服务|常见问题解答|提交建议\n" + - "\n" + - "

\n" + - "

Copyright © 虎嗅网\n" + - "( 京ICP备12013432 )

\n" + - "
\n" + - "
\n" + - "\n" + - " 
\n" + - "\n" + - "回顶部\n" + - "\n" + - "\t\t\t
\n" + - "\t\t\t\n" + - "\t\t\t\n" + - "\n"; - - String blogHtml = "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " 一个基于Python装饰器的用户输入验证设计方案 - SamChi的个人空间 - 开源中国社区\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - "\t\t
\n" + - " \t开源中国社区\n" + - "
\n" + - " \t\t
JetBrains 开发工具全场3折,详情»
\n" + - "
\n" + - " \t\n" + - "
\n" + - "
\n" + - "\t
\n" + - "\t
\n" + - "\t\t
\n" + - "\t\t当前访客身份:\n" + - "\t\t\t\t黄亿华 [ 我的空间 | 退出 ]\n" + - "\t\t\t\t\t\t\t\n" + - "\t\t\t\t\t\t你有0新留言\t\t\t\n" + - "\t\t\t\t\t\t\t\t\n" + - "\t\t
\n" + - "\t\t
\n" + - " \t\t
\n" + - "\t\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + - " \t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t
\n" + - " \t\t\t\t\n" + - " \t\t\t\t\n" + - " \t\t\t\t\n" + - "
软件
\n" + - " \n" + - "
\n" + - "\t\t\t\t\t\t\t\n" + - " \t\t
\n" + - "\t\t
\n" + - "\t\t
\n" + - "\t
\n" + - "\t
\t\n" + - "\n" + - "
\n" + - "
\n" + - "\t \"SamChi\"\n" + - " \n" + - " SamChi\n" + - "\t\t\n" + - "\t\t\t\n" + - " \t\t\t\t\t\t\t\n" + - " \n" + - "
\n" + - "
\n" + - " \t关注(21)\n" + - " \t粉丝(52)\n" + - " \t积分(37)\n" + - "
\n" + - "
\n" + - "
\n" + - "这个人很懒,啥也没写
\n" + - "\n" + - "
\n" + - "\t.发送留言\n" + - "\t.请教问题\n" + - "
\n" + - " 博客分类\n" + - " \n" + - "
\n" + - "
\n" + - " 最新评论 \n" + - "
    \n" + - "\t\t
  • \n" + - "\t\t@其斤君羊:说的很对 做什么事情都得从身边做起 更何况创业 ...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@techstan:不错\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@摩云飞:谢谢博主的总结,很有价值\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@尚楠:正在学Python,谢了\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@knightuniverse:其实我觉得,很多时候,不论是做项目还是做产品,...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@moyun:顶一个\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@SamChi:引用来自“Martinium”的评论 alert('I am admi...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@Martinium:alert('I am admin, bitch!'); 这句话亮了。...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@Ben:引用来自“ExtremeTalk”的评论 引用来自“Ben”...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@ExtremeTalk:引用来自“Ben”的评论 引用来自“ExtremeTalk”...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t
\n" + - "
\n" + - "访客统计\n" + - "
    \n" + - "\t
  • 3
  • \n" + - "
  • 33
  • \n" + - "
  • 36
  • \n" + - "
  • 842
  • \n" + - "
  • 13706
  • \n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - " \t\n" + - "\t
\n" + - "\t\n" + - " \t
\t\t\n" + - "
\n" + - "

一个基于Python装饰器的用户输入验证设计方案

\n" + - "
\n" + - " \t\t \t\t \t \n" + - "\t\t\t\t\n" + - "\n" + - "8人收藏此文章,\n" + - "\n" + - "\t\n" + - "\t\t\t\n" + - " \t\t \t\t发表于7天前(2013-04-15 16:46) , \n" + - " \t\t已有127次阅读 ,共0个评论\n" + - " \t\t \t
\n" + - "
\n" + - "\t

情景

\n" + - "

最近初学Python, 语法大概熟悉了之后就开始拿web.py做点小东西,web.py非常轻量,用起来感觉很舒服。但不过无论什么语言或者框架,web开发中有一个最大烦人之处就是表单验证,web.py提供了web.form来进行表单验证的统一处理,这个东西虽然用起来很简单,但是感觉还是不太合心意,首先这套验证机制跟web.py框架耦合的程度太高,而自己的架构是这样的,业务逻辑跟web逻辑完全分离,web仅仅是交互形式的一种,即使添加客户端C/S形式的服务或者是向开发者提供API,业务逻辑也是完全可用,不需要修改,这样对用户输入的验证是属于业务逻辑这一块,不应该跟web表单耦合在一起;另外感觉web.py这套东西还是有些简单,只支持每个表单的正则验证和最后表单提交的整体验证,而很多时候可能需要对用户进行丰富的错误提示,比如针对用户名的错误会具体到是不能为空还是长度错误或者格式错误等, 这个用web.py的form验证就感觉很别扭了。于是就决定自己设计一个用户输入的验证方案。

\n" + - "

设计

\n" + - "

web项目的开发多数都是遵循这么一个结构的设计,即DAO->Service->Controller->View, 按我前面说的,对用户的输入验证应是发生在Service这一层上,这一层的设计是接受用户输入的参数,然后进行验证处理,再进行业务相关的计算,最后输入结果。每个Service接口都应该返回一个结果,我一般都会把这个结果的内容抽象成一个一致类型的对象:

\n" + - "
class Result(object):\n" +
-            "    \n" +
-            "    u''' 操作结果抽象 '''\n" +
-            "    \n" +
-            "    def __init__(self, code, value=None):\n" +
-            "        self.code = code   #操作结果代号\n" +
-            "        self.value = value #操作结果值\n" +
-            "        \n" +
-            "    def __str__(self):\n" +
-            "        return "operation result, code: %s, value: %s" % (self.code, self.value)
\n" + - "

这个结果对象包含两个属性,一个是操作结果的代码,一个是操作的值,举个例子,比如用户注册的接口,如果注册成功,那么就会返回一个这样的Result对象,code属性是'success', value属性是新注册用户分配的ID,如果用户名已经被占用,那么code属性就是'username_exised', value属性的值是None。客户端拿到code属性的值可以做响应的处理,如果是直接面向最终用户的web应用,那么就会去找到这个code对应的错误信息来展示给用户,所有的错误信息我是组织在一个单独的Python模块中(opresult.py):

\n" + - "
reg = {\n" +
-            "       'success':u'注册成功',\n" +
-            "       'username_empty':u'用户名不得为空',\n" +
-            "       'username_format':u'用户名必须只能有数字、字母下划线组成',\n" +
-            "       'username_length':u'用户名长度必须在5到10个字符之间',\n" +
-            "       'username_existed':u'用户名已经存在',\n" +
-            "       'password_empty':u'密码不得为空',\n" +
-            "       'repassword_error':u'两次密码输入不一致',       \n" +
-            "       }
reg是注册的接口名称,这样客户端通过接口名称和code就可以获取对应的提示。 \n" + - "

由此,用户输入验证就是要把接口参数同这些code联系起来。对于参数验证,Python有天生的语言优势,那就是装饰器。一开始就想到了使用装饰器来描述参数验证需求,但这个装饰器需要哪些信息?怎么个形式?这个得从表单验证的需求开始看起,个人总结表单验证大抵不过这些判断条件:

\n" + - "

1. 是否允许为空

\n" + - "

2. 长度限制:比如密码的长度一般会不允许少于多少位

\n" + - "

3. 格式限制:比如Email地址,需要正则判断

\n" + - "

4. 逻辑限制:比如注册时判断用户名是否已经存在

\n" + - "

初步根据这些判断条件设计出这么一个方案:

\n" + - "
@checkarg(username={'allow_empty':False, \n" +
-            "                    'regex':r'^[a-zA-Z\\d_]+$',\n" +
-            "                    'min-length':5, 'max-length':10, \n" +
-            "                    'check_logic':[check_username_usable]},\n" +
-            "          password={'allow_empty':False,'regex':r'.{6,}'},\n" +
-            "          repassword={'allow-empty':False, 'check_logic':\n" +
-            "                      [(lambda **kw:(kw['password'] == kw['repassword'], "repassword_error"))]})\n" +
-            "def reg(username, password, repassword):\n" +
-            "    ....
\n" + - "

每一个参数使用一个字典来描述验证信息, allow_empty是表示是否为空,regex为验证的正则表达式,min-length和max-length用来描述长度,check_logic用来配置其他的验证逻辑。然后如何把这些验证结果同code进行匹配呢?最开始是在这个验证信息的字典中有一项'code':{'allow_empty':'username_empty'}通过这样的形式去匹配错误提示,但是感觉这样整的这个参数太复杂了(感觉现在已经挺复杂了- -b),于是决定这个地方使用约定优于配置的形式,code的值为'参数名_错误类型'的形式,比如allow_empty如果验证了为空,那么会自动返回名为username_empty的code,如果是一些额外的处理逻辑呢?没法做约定,怎么办?那么就约定这些检测函数返回一个元组,第一个元素为一个bool值,表示成功失败,第二个参数为code,表示失败原因,比如判断两次密码是否输入一致的那个lambda:

\n" + - "
lambda **kw:(kw['password'] == kw['repassword'], "repassword_error"
\n" + - "

嗯,大体就是这样的一个设计。

\n" + - "

实现

\n" + - "

根据上面的设计,把最终的装饰器实现了出来, 逻辑比较简单,关于装饰器设计的一些细节可以参阅Python参考手册:

\n" + - "
regex_cache = {}\n" +
-            "     \n" +
-            "def checkarg(**args):\n" +
-            "    \n" +
-            "    u'''参数检测装饰器'''\n" +
-            "    \n" +
-            "    def _checkarg(function):\n" +
-            "        \n" +
-            "        def __checkarg(**func_kw):\n" +
-            "            for key in func_kw:\n" +
-            "                if key in args:\n" +
-            "                    \n" +
-            "                    #要验证的值\n" +
-            "                    value = func_kw[key]\n" +
-            "                    \n" +
-            "                    #验证规则\n" +
-            "                    valid_rules = args[key]\n" +
-            "                    \n" +
-            "                    #检测空\n" +
-            "                    allow_empty = valid_rules.get('allow_empty')\n" +
-            "                    if not allow_empty:\n" +
-            "                        if not value or not value.strip():\n" +
-            "                            return Result(key + "_empty")\n" +
-            "                    elif not value:\n" +
-            "                        #如果是空的并且忽略空检测,那么下面的就不需要检查了\n" +
-            "                        continue;\n" +
-            "                    \n" +
-            "                    #检测长度\n" +
-            "                    if 'min-length' in valid_rules:\n" +
-            "                        min_length = valid_rules['min-length']\n" +
-            "                        if min_length > len(value):\n" +
-            "                            return Result(key + "_length")\n" +
-            "                        \n" +
-            "                    if 'max-length' in valid_rules:\n" +
-            "                        max_length = valid_rules['max-length']\n" +
-            "                        if max_length < len(value):\n" +
-            "                            return Result(key + "_length")\n" +
-            "                    \n" +
-            "                    #检测正则\n" +
-            "                    if 'regex' in valid_rules:\n" +
-            "                        #获取编译后的正则\n" +
-            "                        regex = valid_rules['regex']\n" +
-            "                        regexcmp = regex_cache.get(regex)\n" +
-            "                        if not regexcmp:\n" +
-            "                            regexcmp = re.compile(regex)\n" +
-            "                            regex_cache[regex] = regexcmp\n" +
-            "                        if not regexcmp.search(value):\n" +
-            "                            return Result(key + "_format")\n" +
-            "                    \n" +
-            "                    #检测其他逻辑\n" +
-            "                    check_logics = valid_rules.get('check_logic')\n" +
-            "                    if check_logics:\n" +
-            "                        for logic in check_logics:\n" +
-            "                            result, code = logic(**func_kw)\n" +
-            "                            if not result:\n" +
-            "                                return Result(code)\n" +
-            "                                \n" +
-            "            function(**func_kw)\n" +
-            "        return __checkarg\n" +
-            "                            \n" +
-            "    return _checkarg
\n" + - "\t \t \n" + - "
\n" + - "\t\t\n" + - "
\n" + - "\t \t\n" + - "\t \t \n" + - "
\t\t\n" + - "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + - "\t \t
\n" + - "\n" + - " \n" + - "\t
\n" + - "\n" + - "\t\n" + - "\t
\n" + - "\t\n" + - "\t\n" + - "\t\t分享到: \n" + - "\t\t\n" + - "\t\t\n" + - "\t\n" + - " 已有 0人顶\n" + - "\t\n" + - "\t
\n" + - "\t\t\n" + - "
\n" + - "
\n" + - "
\n" + - "

共有 0 条网友评论

\n" + - "\t\t\t

尚无网友评论

\n" + - "\t\t
    \n" + - "\t\t
\n" + - "
\n" + - "\t
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\t \n" + - "\t \n" + - "\t 文明上网,理性发言\n" + - "
\n" + - "\t回到页首 | 回到评论列表\n" + - "
\n" + - "
\n" + - "\t\n" + - "
\n" + - "\t关闭相关文章阅读\n" + - "\t\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - "\t
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + - "\t开源中国手机客户端:\n" + - "\tAndroid\n" + - "\tiPhone\n" + - "\tWP7\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - ""; - - String html = "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " 再次吐槽easyui - 开源中国 OSChina.NET\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "
\n" + - "\t
\n" + - "
\n" + - " \t\n" + - "
\n" + - "\t\t
\n" + - " \t\t \t\t黄亿华,您好 \n" + - "\t\t\t\n" + - "\t\t\t\t我的空间\n" + - "\t\t\t\t\n" + - "\t\t\t | \n" + - "\t\t\t添加软件 | 投递新闻 | 退出\n" + - " \t\t\t\t
\n" + - "\t\t
\n" + - "\t
\n" + - "
\n" + - "
\n" + - "

讨论区

\n" + - "
\n" + - "\t
当前位置:
\n" + - "\t
\n" + - "\t\t\t\t\t \t\t讨论区 »\n" + - " \t\t技术问答\t\t\t\t\t\t\t\t» EasyUI\n" + - "\t\t\t\t\t\t\t\t\t\t
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\t
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t\n" + - "\t
\n" + - "\t
\n" + - "\t\t
\"午后冬日\"
\n" + - "\t\t
\n" + - "\t\t\t

再次吐槽easyui

\n" + - "\t\t\t
\n" + - "\t\t\t\t午后冬日\n" + - "\t\t\t\t发表于 2013-4-21 02:28 13小时前,\n" + - "\t\t\t\t3回/289阅,\n" + - "\t\t\t\t最后回答: 4小时前\t\t\t\t\t\t\t\t\t\t\t
\n" + - "\t\t
\n" + - "\t\t\n" + - "\t\t
\n" + - "\t
\n" + - "\t\t \t \t\t\t\t\t\n" + - "\t\t

Java、PHP、Ruby、iOS、Python 等 JetBrains 开发工具低至 99 元(3折),详情»

\n" + - "\t\t
\n" + - "\t\t\t\t\t\t
刚用到easyui treegrid组件,发现这货第一次加载时候并没有传默认参数,展开某一列时候才传递id:xx的参数。这样和后台总是疙里疙瘩,像没事就拌嘴的两口子,查网上都遇到相同问题,最好解决方案就是通过 \n" + - "onBeforeExpand事件来扩展,自行解决。看到官方例子中简洁的代码,感觉easyui耍流氓了,真搞不懂为何要这样实现
\n" + - "\t\t\t\t\t\t
\n" + - "\t\t\t\t标签:\t\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t\t\t\tEasyUI \t\t\t\t\t\t\t\t\t\t\t
\n" + - "\t\t\t\t\t\t
\n" + - "\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t我想问同样的问题\n" + - "\t\t\t\t\t\t\n" + - "\t\t\t共0个人想要问同样的问题\n" + - "\t\t\t\t\t\t补充话题说明»\n" + - "\t\t\t
\n" + - "\t\t\t\t\t\t
\n" + - "\t
    \n" + - "
    \t\t
    \n" + - "\t\t\n" + - "\t\t
    \n" + - " \n" + - "\t\t\t\t
    \n" + - "\t\t\t
    分享到
    \n" + - "\t\t\t\n" + - "\t\t\t
    1
    \n" + - "\t\t\t\n" + - "\t\t\t
    \n" + - "\t\t\t\t\t\t\t\t \t\t\t\n" + - "\t\t\t\t\t\t\t\t0\n" + - "\t\t\t\t|\n" + - "\t\t\t\t\t\t\t\t \t\t\t\n" + - "\t\t\t\t\t\t\t\t0\n" + - "\t\t\t
    \n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t\t
    \n" + - "\t\t\t\t\t\t
    \n" + - "\t\t\t\n" + - " \t

    \t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t\t按评价排序 |\n" + - "\t\t\t\t\t显示最新答案 | 回页面顶部\n" + - "\t\t\t\t\n" + - "\t\t\t\t共有3个答案 我要回答»\n" + - "\t\t\t

    \n" + - "\t\t\t \t
    • \n" + - "\t
      \"布谷鸟\"
      \n" + - "\t
      \n" + - "\t\t
      布谷鸟 回答于 2013-04-21 09:28
      \t\t\n" + - " \t
      \n" + - "\t\t\t \t\t \t\t举报\n" + - " \t
      \n" + - "\t\t
      \n" + - "\t\t
      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      --- 共有 1 条评论 --- \n" + - "
        \n" + - "\t\t
      • \n" + - "\t\t\"午后冬日\"\n" + - "\t\t\n" + - "\t\t前端水平实在有限,自己搞的总是感觉不伦不类,只能用这些框架,再集成其它插件,切换主题时风格又不一致。\n" + - "\t\t(4小时前 by 午后冬日)\n" + - "\t\t回复\n" + - "\t\t\n" + - "\t\t
        \n" + - "\t
      • \n" + - "\t
      \n" + - "\n" + - "
      \n" + - "\t
      \t\t\t\t\t\t有帮助(1) |\n" + - "\t\t没帮助(0) |\n" + - "\t\t评论(1) |\n" + - " \t引用此答案\t
      \n" + - "
    • \n" + - "\t
      \"静风流云\"
      \n" + - "\t
      \n" + - "\t\t
      静风流云 回答于 2013-04-21 11:08
      \t\t\n" + - " \t
      \n" + - "\t\t\t \t\t \t\t举报\n" + - " \t
      \n" + - "\t\t
      \n" + - "\t\t

      没办法,原来项目也是因为客户特殊的需求,对layout选型的时候,犹豫了好久,最终放弃了。
      幸亏来了一个厉害的前端,解决问题,够用就好。

      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      --- 共有 1 条评论 --- \n" + - "
        \n" + - "\t\t
      • \n" + - "\t\t\"午后冬日\"\n" + - "\t\t\n" + - "\t\t我也是犹豫了好久,看过很多前端框架,总是不太满意。个人开发前台后台数据库全部要自己搞定,郁闷ing\n" + - "\t\t(4小时前 by 午后冬日)\n" + - "\t\t回复\n" + - "\t\t\n" + - "\t\t
        \n" + - "\t
      • \n" + - "\t
      \n" + - "\n" + - "
      \n" + - "\t
      \t\t\t\t\t\t有帮助(0) |\n" + - "\t\t没帮助(0) |\n" + - "\t\t评论(1) |\n" + - " \t引用此答案\t
      \n" + - "
    • \n" + - "\t
      \"布谷鸟\"
      \n" + - "\t
      \n" + - "\t\t
      布谷鸟 回答于 2013-04-21 11:29
      \t\t\n" + - " \t
      \n" + - "\t\t\t \t\t \t\t举报\n" + - " \t
      \n" + - "\t\t
      \n" + - "\t\t

      引用来自“布谷鸟”的答案

      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      前后端你一个人搞啊?那确实很麻烦。面面俱到的话,工作量很大。但是如果需要实现的功能不是很多,而时间也不紧迫的话,事情干起来也还不错。如非必须,建议逐步弃用这些前端框架,在一些比较能够提升体验的地方选用一些适当的插件即可,如此也不再需要担心风格的问题,你看osc后台截图,界面那叫一个丑,用得方便顺手就够了
      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      \n" + - "\t
      \t\t\t\t\t\t有帮助(0) |\n" + - "\t\t没帮助(0) |\n" + - "\t\t评论(0) |\n" + - " \t引用此答案\t
      \n" + - "
    \n" + - "\t\t\t\t
    \n" + - "\t\t
    \n" + - "\t\t\t
    \"黄亿华\"
    \n" + - "\t\t\t
    \n" + - "\t\t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t

    \n" + - "\t\t\t\t回答案顶部 | 回页面顶部\n" + - "\t\t\t
    \n" + - "\t\t\t
    \n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t
    \t\n" + - "\t\n" + - "\n" + - "\n" + - "\n" + - "\t
    \n" + - "\t
    \n" + - " \t\n" + - "\t
    \n" + - "\t\t
    \n" + - "\t\t\t有什么技术问题吗?\n" + - "\t\t\t我要提问\n" + - "\t\t\t
    \n" + - "\t\t
    \n" + - "\t\t\n" + - "\t\t\t\t\t\t
    \n" + - "\t\t\t全部(29)...午后冬日的其他问题\n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t\t\t\t
    \n" + - "\t\t\n" + - "\t\t
    \n" + - "\t\t\n" + - "\t\t
    \n" + - "\t\t\t类似的话题\n" + - "\t\t\t\n" + - "\t\t
    \n" + - "\t
    \n" + - "\t
    \n" + - "
    \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
    \n" + - "\t
    \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
    © 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + - "\t开源中国手机客户端:\n" + - "\tAndroid\n" + - "\tiPhone\n" + - "\tWP7\n" + - "
    \n" + - "
    \n" + - "
    \n" + - "\n" + - "\n" + - ""; + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " 再次吐槽easyui - 开源中国 OSChina.NET\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "
    \n" + + "\t
    \n" + + "
    \n" + + " \t\n" + + "
    \n" + + "\t\t
    \n" + + " \t\t \t\t黄亿华,您好 \n" + + "\t\t\t\n" + + "\t\t\t\t我的空间\n" + + "\t\t\t\t\n" + + "\t\t\t | \n" + + "\t\t\t添加软件 | 投递新闻 | 退出\n" + + " \t\t\t\t
    \n" + + "\t\t
    \n" + + "\t
    \n" + + "
    \n" + + "
    \n" + + "

    讨论区

    \n" + + "
    \n" + + "\t
    当前位置:
    \n" + + "\t
    \n" + + "\t\t\t\t\t \t\t讨论区 »\n" + + " \t\t技术问答\t\t\t\t\t\t\t\t» EasyUI\n" + + "\t\t\t\t\t\t\t\t\t\t
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "\t
    \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "\t\n" + + "\t
    \n" + + "\t
    \n" + + "\t\t
    \"午后冬日\"
    \n" + + "\t\t
    \n" + + "\t\t\t

    再次吐槽easyui

    \n" + + "\t\t\t
    \n" + + "\t\t\t\t午后冬日\n" + + "\t\t\t\t发表于 2013-4-21 02:28 13小时前,\n" + + "\t\t\t\t3回/289阅,\n" + + "\t\t\t\t最后回答: 4小时前\t\t\t\t\t\t\t\t\t\t\t
    \n" + + "\t\t
    \n" + + "\t\t\n" + + "\t\t
    \n" + + "\t
    \n" + + "\t\t \t \t\t\t\t\t\n" + + "\t\t

    Java、PHP、Ruby、iOS、Python 等 JetBrains 开发工具低至 99 元(3折),详情»

    \n" + + "\t\t
    \n" + + "\t\t\t\t\t\t
    刚用到easyui treegrid组件,发现这货第一次加载时候并没有传默认参数,展开某一列时候才传递id:xx的参数。这样和后台总是疙里疙瘩,像没事就拌嘴的两口子,查网上都遇到相同问题,最好解决方案就是通过 \n" + + "onBeforeExpand事件来扩展,自行解决。看到官方例子中简洁的代码,感觉easyui耍流氓了,真搞不懂为何要这样实现
    \n" + + "\t\t\t\t\t\t
    \n" + + "\t\t\t\t标签:\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\tEasyUI \t\t\t\t\t\t\t\t\t\t\t
    \n" + + "\t\t\t\t\t\t
    \n" + + "\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t我想问同样的问题\n" + + "\t\t\t\t\t\t\n" + + "\t\t\t共0个人想要问同样的问题\n" + + "\t\t\t\t\t\t补充话题说明»\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
    \n" + + "\t
      \n" + + "
      \t\t
      \n" + + "\t\t\n" + + "\t\t
      \n" + + " \n" + + "\t\t\t\t
      \n" + + "\t\t\t
      分享到
      \n" + + "\t\t\t\n" + + "\t\t\t
      1
      \n" + + "\t\t\t\n" + + "\t\t\t
      \n" + + "\t\t\t\t\t\t\t\t \t\t\t\n" + + "\t\t\t\t\t\t\t\t0\n" + + "\t\t\t\t|\n" + + "\t\t\t\t\t\t\t\t \t\t\t\n" + + "\t\t\t\t\t\t\t\t0\n" + + "\t\t\t
      \n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t\t
      \n" + + "\t\t\t\t\t\t
      \n" + + "\t\t\t\n" + + " \t

      \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t按评价排序 |\n" + + "\t\t\t\t\t显示最新答案 | 回页面顶部\n" + + "\t\t\t\t\n" + + "\t\t\t\t共有3个答案 我要回答»\n" + + "\t\t\t

      \n" + + "\t\t\t \t
      • \n" + + "\t
        \"布谷鸟\"
        \n" + + "\t
        \n" + + "\t\t
        布谷鸟 回答于 2013-04-21 09:28
        \t\t\n" + + " \t
        \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
        \n" + + "\t\t
        \n" + + "\t\t
        对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        --- 共有 1 条评论 --- \n" + + "
          \n" + + "\t\t
        • \n" + + "\t\t\"午后冬日\"\n" + + "\t\t\n" + + "\t\t前端水平实在有限,自己搞的总是感觉不伦不类,只能用这些框架,再集成其它插件,切换主题时风格又不一致。\n" + + "\t\t(4小时前 by 午后冬日)\n" + + "\t\t回复\n" + + "\t\t\n" + + "\t\t
          \n" + + "\t
        • \n" + + "\t
        \n" + + "\n" + + "
        \n" + + "\t
        \t\t\t\t\t\t有帮助(1) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(1) |\n" + + " \t引用此答案\t
        \n" + + "
      • \n" + + "\t
        \"静风流云\"
        \n" + + "\t
        \n" + + "\t\t
        静风流云 回答于 2013-04-21 11:08
        \t\t\n" + + " \t
        \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
        \n" + + "\t\t
        \n" + + "\t\t

        没办法,原来项目也是因为客户特殊的需求,对layout选型的时候,犹豫了好久,最终放弃了。
        幸亏来了一个厉害的前端,解决问题,够用就好。

        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        --- 共有 1 条评论 --- \n" + + "
          \n" + + "\t\t
        • \n" + + "\t\t\"午后冬日\"\n" + + "\t\t\n" + + "\t\t我也是犹豫了好久,看过很多前端框架,总是不太满意。个人开发前台后台数据库全部要自己搞定,郁闷ing\n" + + "\t\t(4小时前 by 午后冬日)\n" + + "\t\t回复\n" + + "\t\t\n" + + "\t\t
          \n" + + "\t
        • \n" + + "\t
        \n" + + "\n" + + "
        \n" + + "\t
        \t\t\t\t\t\t有帮助(0) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(1) |\n" + + " \t引用此答案\t
        \n" + + "
      • \n" + + "\t
        \"布谷鸟\"
        \n" + + "\t
        \n" + + "\t\t
        布谷鸟 回答于 2013-04-21 11:29
        \t\t\n" + + " \t
        \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
        \n" + + "\t\t
        \n" + + "\t\t

        引用来自“布谷鸟”的答案

        对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
        前后端你一个人搞啊?那确实很麻烦。面面俱到的话,工作量很大。但是如果需要实现的功能不是很多,而时间也不紧迫的话,事情干起来也还不错。如非必须,建议逐步弃用这些前端框架,在一些比较能够提升体验的地方选用一些适当的插件即可,如此也不再需要担心风格的问题,你看osc后台截图,界面那叫一个丑,用得方便顺手就够了
        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        \n" + + "\t
        \t\t\t\t\t\t有帮助(0) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(0) |\n" + + " \t引用此答案\t
        \n" + + "
      \n" + + "\t\t\t\t
      \n" + + "\t\t
      \n" + + "\t\t\t
      \"黄亿华\"
      \n" + + "\t\t\t
      \n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t

      \n" + + "\t\t\t\t回答案顶部 | 回页面顶部\n" + + "\t\t\t
      \n" + + "\t\t\t
      \n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t
      \t\n" + + "\t\n" + + "\n" + + "\n" + + "\n" + + "\t
      \n" + + "\t
      \n" + + " \t\n" + + "\t
      \n" + + "\t\t
      \n" + + "\t\t\t有什么技术问题吗?\n" + + "\t\t\t我要提问\n" + + "\t\t\t
      \n" + + "\t\t
      \n" + + "\t\t\n" + + "\t\t\t\t\t\t
      \n" + + "\t\t\t全部(29)...午后冬日的其他问题\n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t\t\t\t
      \n" + + "\t\t\n" + + "\t\t
      \n" + + "\t\t\n" + + "\t\t
      \n" + + "\t\t\t类似的话题\n" + + "\t\t\t\n" + + "\t\t
      \n" + + "\t
      \n" + + "\t
      \n" + + "
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
      \n" + + "\t
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + "
      © 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + "
      \n" + "
      \n" + + "
      \n" + "\n" + "\n" + + ""; @Test - public void test(){ - String text = "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " jsoup 解析页面商品信息 - - ITeye技术网站\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - "\n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
      \n" + - "
      \n" + - " 首页\n" + - " 资讯\n" + - " 精华\n" + - " 论坛\n" + - " 问答\n" + - " 博客\n" + - " 专栏\n" + - " 群组\n" + - " 更多 \n" + - "
      \n" + - " 招聘\n" + - " 搜索\n" + - "
      \n" + - "
      \n" + - "\n" + - "
      \n" + - " \n" + - " 欢迎flashsword20\n" + - " 0\n" + - " \n" + - " \"Newpm\"收件箱(3)\n" + - " \n" + - " 我的应用\n" + - "
      \n" + - " 我的关注\n" + - " 我的群组\n" + - " 我的简历\n" + - " 我的相册\n" + - " 我的收藏\n" + - " 我的代码\n" + - " 我的微博\n" + - "
      \n" + - " 我的博客\n" + - " 设置\n" + - "
      \n" + - "
      \n" + - " \n" + - " \n" + - "
      \n" + - "
      \n" + - " \n" + - " \n" + - "
      \n" + - "
      \n" + - "
      \n" + - "
      \n" + - "
      \n" + - " \n" + - "
      \n" + - "
      \n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "
      \n" + - "
      \n" + - "

      \n" + - " jsoup 解析页面商品信息\n" + - " \n" + - "

      \n" + - " \n" + - "
       
      \n" + - "
      \n" + - "\n" + - "
      \n" + - "

      今天用了jsoup 解析页面商品信息,感觉比用xpath获取信息准确多了

      \n" + - "

      \n" + - "

      下面就记录一下:

      \n" + - "

      一、首先去 http://jsoup.org/download 下载jsoup的jar包。

      \n" + - "

      \n" + - "

      二、下面记录下相关代码:

      \n" + - "

      \n" + - "

      \n" + - "

      Document doc = Jsoup.connect(url).get(); //将htm转换成Document类型数据结构

      \n" + - "


      doc.select(\"div:has(div) div#spec-n1:has(img) img\").first().attr(\"src\")); //查找div下含有div的标签

      \n" + - "

      \n" + - "

      并且 div的id='spec-n1',此div第一个img标签,img里属性是src的值。

      \n" + - "

      \n" + - "

      doc.select(\"div:has(div) div.crumb:has(a) a:eq(4)\").text(); //查找class='crumb'的div下第4个a标签

      \n" + - "

      下的值。

      \n" + - "

      \n" + - "

      doc.select(\"div:has(div) div#name:has(h1)\").text(); //查找id='name'的div下的h1标签的值。

      \n" + - "

      \n" + - "

      doc.select(\"tbody:has(tr) td.tdTitle:contains(品牌) + td\").text(); //查找class='tdTitle'的td标签里

      \n" + - "

      \n" + - "

      含有‘品牌’td的下一个td标签中内容。

      \n" + - "

      \n" + - "

      doc.select(\"script[type=text/javascript]:not([src~=[a-zA-Z0-9./\\\\s]+)\"); //查找含有此<script

      \n" + - "

      \n" + - "

      type=\"text/javascript\">……</script>内容,不含有script标签中有src属性的script,如:

      \n" + - "

      \n" + - "

      <script src=\"url\" type=\"text/javascript\"></script>。

      \n" + - "
      \n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - " \n" + - "
      \n" + - " \n" + - "
      分享到:\n" + - " \n" + - " \n" + - "
      \n" + - "
      \n" + - "\n" + - " \n" + - "
      \n" + - " \n" + - "
      \n" + - "\n" + - "
      \n" + - "
      评论
      \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
      \n" + - "\n" + - "
      \n" + - "
      发表评论
      \n" + - "
      \n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "
      \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "

      (快捷键 Alt+S / Ctrl+Enter)

      \n" + - "
      \n" + - " \n" + - "
      \n" + - "
      \n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
      \n" + - "\n" + - "
      \n" + - "
      \n" + - "
      \n" + - "
      \"masong1987的博客\"
      \n" + - "
      masong1987
      \n" + - "
      \n" + - "\n" + - "
      \n" + - "
        \n" + - "
      • 浏览: 5401 次
      • \n" + - "
      • 性别: \"Icon_minigender_1\"
      • \n" + - "
      • 来自: 北京
      • \n" + - "
      • \n" + - " \n" + - "
      • \n" + - " 发短消息\n" + - " \n" + - " 更多访客>>\n" + - " \n" + - "
        \n" + - "
        \"flashsword20的博客\"
        \n" + - " \n" + - "
        \n" + - " \n" + - "
        \n" + - "
        \"dylinshi126的博客\"
        \n" + - " \n" + - "
        \n" + - " \n" + - "
        \n" + - "
        \"machoo的博客\"
        \n" + - " \n" + - "
        \n" + - " \n" + - "
        \n" + - "
        \"arson的博客\"
        \n" + - " \n" + - "
        \n" + - " \n" + - "
      \n" + - "\n" + - " \n" + - "\n" + - "
      \n" + - "
      文章分类
      \n" + - " \n" + - "
      \n" + - "
      \n" + - "
      社区版块
      \n" + - " \n" + - "
      \n" + - "
      \n" + - "
      存档分类
      \n" + - " \n" + - "
      \n" + - " \n" + - " \n" + - "\n" + - "
      \n" + - "
      最新评论
      \n" + - " \n" + - "
      \n" + - "\n" + - "
      \n" + - " \n" + - "
      \n" + - "
      \n" + - "\n" + - "
      \n" + - "
      \n" + - "
      \n" + - " 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。若作者同意转载,必须以超链接形式标明文章原始出处和作者。
      \n" + - " © 2003-2012 ITeye.com. All rights reserved. [ 京ICP证110151号 京公网安备110105010620 ]\n" + - "
      \n" + - "
      \n" + - " \n" + - " \n" + - "\n" + - " \n" + - " \n" + - " \n" + - "\n"; - String text2="
      aaa
      "; - XpathSelector xpathSelector = new XpathSelector("//div[@id='main']/div[@class='blog_main']/div[1][@class='blog_title']/h3/a"); + public void test() { + String text = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " jsoup 解析页面商品信息 - - ITeye技术网站\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + " 首页\n" + + " 资讯\n" + + " 精华\n" + + " 论坛\n" + + " 问答\n" + + " 博客\n" + + " 专栏\n" + + " 群组\n" + + " 更多 \n" + + "
      \n" + + " 招聘\n" + + " 搜索\n" + + "
      \n" + + "
      \n" + + "\n" + + "
      \n" + + " \n" + + " 欢迎flashsword20\n" + + " 0\n" + + " \n" + + " \"Newpm\"收件箱(3)\n" + + " \n" + + " 我的应用\n" + + "
      \n" + + " 我的关注\n" + + " 我的群组\n" + + " 我的简历\n" + + " 我的相册\n" + + " 我的收藏\n" + + " 我的代码\n" + + " 我的微博\n" + + "
      \n" + + " 我的博客\n" + + " 设置\n" + + "
      \n" + + "
      \n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + "
      \n" + + "
      \n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
      \n" + + "
      \n" + + "

      \n" + + " jsoup 解析页面商品信息\n" + + " \n" + + "

      \n" + + " \n" + + "
       
      \n" + + "
      \n" + + "\n" + + "
      \n" + + "

      今天用了jsoup 解析页面商品信息,感觉比用xpath获取信息准确多了

      \n" + + "

      \n" + + "

      下面就记录一下:

      \n" + + "

      一、首先去 http://jsoup.org/download 下载jsoup的jar包。

      \n" + + "

      \n" + + "

      二、下面记录下相关代码:

      \n" + + "

      \n" + + "

      \n" + + "

      Document doc = Jsoup.connect(url).get(); //将htm转换成Document类型数据结构

      \n" + + "


      doc.select(\"div:has(div) div#spec-n1:has(img) img\").first().attr(\"src\")); //查找div下含有div的标签

      \n" + + "

      \n" + + "

      并且 div的id='spec-n1',此div第一个img标签,img里属性是src的值。

      \n" + + "

      \n" + + "

      doc.select(\"div:has(div) div.crumb:has(a) a:eq(4)\").text(); //查找class='crumb'的div下第4个a标签

      \n" + + "

      下的值。

      \n" + + "

      \n" + + "

      doc.select(\"div:has(div) div#name:has(h1)\").text(); //查找id='name'的div下的h1标签的值。

      \n" + + "

      \n" + + "

      doc.select(\"tbody:has(tr) td.tdTitle:contains(品牌) + td\").text(); //查找class='tdTitle'的td标签里

      \n" + + "

      \n" + + "

      含有‘品牌’td的下一个td标签中内容。

      \n" + + "

      \n" + + "

      doc.select(\"script[type=text/javascript]:not([src~=[a-zA-Z0-9./\\\\s]+)\"); //查找含有此<script

      \n" + + "

      \n" + + "

      type=\"text/javascript\">……</script>内容,不含有script标签中有src属性的script,如:

      \n" + + "

      \n" + + "

      <script src=\"url\" type=\"text/javascript\"></script>。

      \n" + + "
      \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + "
      \n" + + " \n" + + "
      分享到:\n" + + " \n" + + " \n" + + "
      \n" + + "
      \n" + + "\n" + + " \n" + + "
      \n" + + " \n" + + "
      \n" + + "\n" + + "
      \n" + + "
      评论
      \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
      \n" + + "\n" + + "
      \n" + + "
      发表评论
      \n" + + "
      \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "

      (快捷键 Alt+S / Ctrl+Enter)

      \n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
      \n" + + "\n" + + "
      \n" + + "
      \n" + + "
      \n" + + "
      \"masong1987的博客\"
      \n" + + "
      masong1987
      \n" + + "
      \n" + + "\n" + + "
      \n" + + "
        \n" + + "
      • 浏览: 5401 次
      • \n" + + "
      • 性别: \"Icon_minigender_1\"
      • \n" + + "
      • 来自: 北京
      • \n" + + "
      • \n" + + " \n" + + "
      • \n" + + " 发短消息\n" + + " \n" + + " 更多访客>>\n" + + " \n" + + "
        \n" + + "
        \"flashsword20的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
        \n" + + "
        \"dylinshi126的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
        \n" + + "
        \"machoo的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
        \n" + + "
        \"arson的博客\"
        \n" + + " \n" + + "
        \n" + + " \n" + + "
      \n" + + "\n" + + " \n" + + "\n" + + "
      \n" + + "
      文章分类
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "
      社区版块
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "
      存档分类
      \n" + + " \n" + + "
      \n" + + " \n" + + " \n" + + "\n" + + "
      \n" + + "
      最新评论
      \n" + + " \n" + + "
      \n" + + "\n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \n" + + "\n" + + "
      \n" + + "
      \n" + + "
      \n" + + " 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。若作者同意转载,必须以超链接形式标明文章原始出处和作者。
      \n" + + " © 2003-2012 ITeye.com. All rights reserved. [ 京ICP证110151号 京公网安备110105010620 ]\n" + + "
      \n" + + "
      \n" + + " \n" + + " \n" + "\n" + " \n" + " \n" + " \n" + "\n"; + String text2 = "
      aaa
      "; + XpathSelector xpathSelector = new XpathSelector( + "//div[@id='main']/div[@class='blog_main']/div[1][@class='blog_title']/h3/a"); String select = xpathSelector.select(text); - Assert.assertEquals("jsoup 解析页面商品信息",select); + Assert.assertEquals("jsoup 解析页面商品信息", select); } @Test - public void testOschina(){ + public void testOschina() { Html html1 = new Html(html); - Assert.assertEquals("再次吐槽easyui",html1.x(".//*[@class='QTitle']/h1/a").toString()); + Assert.assertEquals("再次吐槽easyui", html1.x(".//*[@class='QTitle']/h1/a").toString()); } - @Test - public void testOschinaBlog(){ - Html html1 = new Html(blogHtml); - System.out.println(html1.sc()); - } - - @Test - public void testHuxiuBlog(){ - Html html1 = new Html(huxiuHtml); - System.out.println(html1.sc()); - } } diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml index a58e889b9..9084694eb 100644 --- a/webmagic-core/src/test/resources/log4j.xml +++ b/webmagic-core/src/test/resources/log4j.xml @@ -24,7 +24,7 @@ - + diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java deleted file mode 100644 index f79909840..000000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ /dev/null @@ -1,20 +0,0 @@ -package us.codecraft.webmagic; - -import org.junit.Assert; -import org.junit.Test; -import us.codecraft.webmagic.selector.Html; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 上午8:42 - */ -public class HtmlTest { - - @Test - public void testRegexSelector() { - Html selectable = new Html("aaaaaaab"); - Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); - - } -} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 838c76b8e..6f795d743 100644 --- 
a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -15,12 +15,14 @@ public class SpiderTest { + @Ignore @Test public void testSpider() throws InterruptedException { Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); me.run(); } + @Ignore @Test public void testGlobalSpider(){ // PageProcessor pageProcessor = new MeicanProcessor(); @@ -35,6 +37,7 @@ public void testGlobalSpider(){ } + @Ignore @Test public void test(){ System.out.println(System.getProperty("java.io.tmpdir")); diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 18b0680d9..63021342f 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.ConsolePipeline; @@ -16,6 +17,7 @@ */ public class DiandianProcessorTest { + @Ignore @Test public void test() throws IOException { DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 1e77c7c76..18498d36b 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import 
us.codecraft.webmagic.pipeline.FilePipeline; @@ -16,6 +17,7 @@ */ public class DiaoyuwengProcessorTest { + @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 0a5cc1b03..1fada8199 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -16,6 +17,7 @@ */ public class SinablogProcessorTest { + @Ignore @Test public void test() throws IOException { SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser(); From 62a8a6d1e75484a736374c4c59a612b8268ebc58 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 Jun 2013 11:14:44 +0800 Subject: [PATCH 07/81] invite travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..be7dfb8ca --- /dev/null +++ b/.travis.yml @@ -0,0 +1 @@ +language: java \ No newline at end of file From 7ffd5551e2618e9d20ac80d45d9f8b1b8cddedaa Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 Jun 2013 11:23:29 +0800 Subject: [PATCH 08/81] fix pom --- pom.xml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pom.xml b/pom.xml index 68927f20b..39f068c5d 100644 --- a/pom.xml +++ b/pom.xml @@ -14,5 +14,20 @@ ./webmagic-samples/
      + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + + + + + + From d0a3a2cbc34eb7448e4e79946b2230c47c884baf Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 Jun 2013 11:27:44 +0800 Subject: [PATCH 09/81] fix pom --- README.md | 3 +++ webmagic-core/pom.xml | 9 +++++++++ webmagic-plugin/pom.xml | 9 +++++++++ webmagic-samples/pom.xml | 9 +++++++++ 4 files changed, 30 insertions(+) diff --git a/README.md b/README.md index e92a4405c..063eca22f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ webmagic --------- + +[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) + ####*一个网络爬虫工具包* webmagic的发起源于工作中的需要,其定位是帮助开发者更便捷的开发一个垂直的网络爬虫。 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 607eb13c1..c0ef6a16f 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -62,6 +62,15 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + + org.apache.maven.plugins maven-resources-plugin diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 1128f7ac1..c2a08adb4 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -29,6 +29,15 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + + org.apache.maven.plugins maven-resources-plugin diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index bfa1bfad2..4e345a286 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -30,6 +30,15 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + + org.apache.maven.plugins maven-resources-plugin From b53bad227775dc480f26726b45b07cece45f22b5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 Jun 2013 13:21:47 +0800 Subject: [PATCH 10/81] add thanks --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 063eca22f..afa66ed55 100644 --- a/README.md +++ b/README.md @@ -82,3 +82,12 @@ 
webmagic定制的核心是PageProcessor接口。 webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) +### 致谢 + +webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: + +python爬虫**scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) + +Java爬虫**Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) + + From 5a2c38475daf4f735a4d41855d4463f721b23a74 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 Jun 2013 13:29:17 +0800 Subject: [PATCH 11/81] format --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afa66ed55..4611c2597 100644 --- a/README.md +++ b/README.md @@ -86,8 +86,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: -python爬虫**scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) +python爬虫** scrapy **[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) -Java爬虫**Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) +Java爬虫** Spiderman **[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) From fb8fadbe93ce8f5bf7c2ee746d1fe4f5411d5514 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 07:24:19 +0800 Subject: [PATCH 12/81] change author info --- .../src/main/java/us/codecraft/webmagic/Page.java | 2 +- .../src/main/java/us/codecraft/webmagic/Request.java | 3 ++- .../src/main/java/us/codecraft/webmagic/Site.java | 2 +- .../src/main/java/us/codecraft/webmagic/Spider.java | 2 +- .../us/codecraft/webmagic/downloader/Downloader.java | 9 ++++++++- .../webmagic/downloader/HttpClientDownloader.java | 2 +- .../us/codecraft/webmagic/downloader/HttpClientPool.java | 2 +- .../us/codecraft/webmagic/pipeline/ConsolePipeline.java | 2 +- .../us/codecraft/webmagic/pipeline/FilePipeline.java | 2 +- .../java/us/codecraft/webmagic/pipeline/Pipeline.java | 2 +- 
.../us/codecraft/webmagic/processor/PageProcessor.java | 2 +- .../webmagic/processor/SimplePageProcessor.java | 2 +- .../webmagic/schedular/FileCacheQueueSchedular.java | 2 +- .../us/codecraft/webmagic/schedular/QueueSchedular.java | 2 +- .../java/us/codecraft/webmagic/schedular/Schedular.java | 2 +- .../main/java/us/codecraft/webmagic/selector/Html.java | 2 +- .../java/us/codecraft/webmagic/selector/PlainText.java | 2 +- .../java/us/codecraft/webmagic/selector/RegexResult.java | 2 +- .../us/codecraft/webmagic/selector/RegexSelector.java | 2 +- .../us/codecraft/webmagic/selector/ReplaceSelector.java | 2 +- .../java/us/codecraft/webmagic/selector/Selectable.java | 2 +- .../java/us/codecraft/webmagic/selector/Selector.java | 2 +- .../us/codecraft/webmagic/selector/SelectorFactory.java | 2 +- .../webmagic/selector/SmartContentSelector.java | 2 +- .../us/codecraft/webmagic/selector/XpathSelector.java | 2 +- .../main/java/us/codecraft/webmagic/utils/UrlUtils.java | 2 +- .../src/test/java/us/codecraft/webmagic/HtmlTest.java | 2 +- .../codecraft/webmagic/selector/RegexSelectorTest.java | 2 +- .../codecraft/webmagic/selector/XpathSelectorTest.java | 2 +- .../java/us/codecraft/webmagic/utils/UrlUtilsTest.java | 2 +- .../codecraft/webmagic/pipeline/FreemarkerPipeline.java | 2 +- .../us/codecraft/webmagic/FreemarkerPipelineTest.java | 2 +- .../webmagic/samples/DiandianBlogProcessor.java | 2 +- .../webmagic/samples/DianpingBlogProcessor.java | 2 +- .../codecraft/webmagic/samples/DiaoyuwengProcessor.java | 2 +- .../us/codecraft/webmagic/samples/F58PageProcesser.java | 2 +- .../us/codecraft/webmagic/samples/HuxiuProcessor.java | 2 +- .../us/codecraft/webmagic/samples/KaichibaProcessor.java | 2 +- .../us/codecraft/webmagic/samples/MeicanProcessor.java | 2 +- .../us/codecraft/webmagic/samples/NjuBBSProcessor.java | 2 +- .../webmagic/samples/OschinaBlogPageProcesser.java | 2 +- .../codecraft/webmagic/samples/OschinaPageProcesser.java | 2 +- 
.../codecraft/webmagic/samples/QzoneBlogProcessor.java | 2 +- .../us/codecraft/webmagic/samples/SinaBlogProcesser.java | 2 +- .../codecraft/webmagic/samples/TianyaPageProcesser.java | 2 +- .../src/test/java/us/codecraft/webmagic/SpiderTest.java | 2 +- .../webmagic/processor/DiandianProcessorTest.java | 2 +- .../webmagic/processor/DiaoyuwengProcessorTest.java | 2 +- .../webmagic/processor/SinablogProcessorTest.java | 2 +- 49 files changed, 57 insertions(+), 49 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 8f1a4c77a..63b1b53a6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -10,7 +10,7 @@ import java.util.concurrent.ConcurrentHashMap; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午11:22 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index ccefc7f9a..ce7870b8d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,7 +1,8 @@ package us.codecraft.webmagic; /** - * User: cairne + * Request对象是 + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午11:37 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4c032aafb..05117f087 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -4,7 +4,7 @@ import java.util.Set; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午12:13 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index f3ec5f83a..bbab1a533 
100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -14,7 +14,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午6:53 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index e84758487..2711ba4c8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -5,11 +5,18 @@ import us.codecraft.webmagic.Site; /** - * User: cairne + * Downloader是webmagic抓取页面的核心接口。 + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午12:14 */ public interface Downloader { + /** + * + * @param request + * @param site + * @return + */ public Page download(Request request,Site site); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 269ba6d37..2eb55c0ab 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -14,7 +14,7 @@ /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午12:15 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 4fdf42124..066a24e51 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -13,7 +13,7 @@ import us.codecraft.webmagic.Site; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 
下午12:29 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 4115b8ce5..635bab62c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -7,7 +7,7 @@ import java.util.Map; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:45 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index d8407af57..ca3144ed7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -13,7 +13,7 @@ import java.util.Map; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午6:28 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index ef27cdae7..e5da1ea80 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Site; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:39 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index e3c74a904..982005974 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Site; /** - * User: cairne + * Author: code4crafter@gmail.com * 
Date: 13-4-21 * Time: 上午11:42 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index eb8f56ea2..c5d637712 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-22 * Time: 下午9:15 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index 94002ed74..56c5f332b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:13 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java index 071f708c4..b9c39c327 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java @@ -10,7 +10,7 @@ import java.util.concurrent.LinkedBlockingQueue; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:13 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java index a5b71f5af..965ad2587 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Site; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:12 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index c385ff99c..22bf73d69 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -4,7 +4,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午7:54 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 91ab7abd2..1d7a41e5d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -6,7 +6,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午7:54 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java index 8b14e8b75..06c00dd1c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.selector; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午7:39 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 671cbe79c..3b9871892 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -9,7 +9,7 @@ import java.util.regex.PatternSyntaxException; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午7:09 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java index 5f788982a..e240f0898 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java @@ -6,7 +6,7 @@ import java.util.regex.PatternSyntaxException; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午7:09 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 68ca47d44..f02cbc4c0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -3,7 +3,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-20 * Time: 下午7:51 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 914e8ab01..3e1b056e1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -3,7 +3,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-20 * Time: 下午8:02 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index af1996943..633cb4abf 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -7,7 +7,7 @@ import java.util.concurrent.ConcurrentHashMap; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午7:56 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index 10ab15c6c..919a795b3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -10,7 +10,7 @@ /** * readability算法,基础是找到所有p标签的父节点 * 写的比较乱,最终效果还在尝试中 - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午4:42 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 6de2f08ab..991f92674 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -6,7 +6,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午9:39 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 124ca6412..fd26499ce 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -6,7 +6,7 @@ import java.util.regex.Pattern; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:52 */ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java 
b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index f79909840..0f8c06166 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.selector.Html; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午8:42 */ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 6128f1708..3b80f92dc 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午7:13 */ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 469ff26c6..239630a93 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * User: cairne Date: 13-4-21 Time: 上午10:06 + * Author: code4crafter@gmail.com Date: 13-4-21 Time: 上午10:06 */ public class XpathSelectorTest { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index d42400586..36361bfe7 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午2:22 */ diff --git 
a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 8487064ec..9afebb669 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -11,7 +11,7 @@ import java.io.*; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-6-8 * Time: 下午9:00 */ diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java index d52154f13..610edf55b 100644 --- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java @@ -6,7 +6,7 @@ import java.io.IOException; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-6-9 * Time: 上午7:14 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index 53b10520b..b17b05dbf 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java index dd601adb3..cc91f89ad 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java @@ -7,7 +7,7 @@ 
import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 03389f5bb..bcea9a31e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -8,7 +8,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 78211c4c0..346b1e225 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 82552f956..8286d7816 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 58a2cb81c..866c903c9 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.processor.PageProcessor; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-5-20 * Time: 下午5:31 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 637aec172..30ba84d43 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-5-20 * Time: 下午5:31 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index ca46de671..3b61d7649 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 2166d9b1b..58d87a7f6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index cdfbc1e0e..4f84a3f54 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 67ef671e7..e252eeff0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index 79065265f..07f6d4768 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.processor.PageProcessor; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index 7a8920bef..564f1efb6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * User: cairne + * 
Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 6f795d743..79f428ebe 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -8,7 +8,7 @@ import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-4-20 * Time: 下午7:46 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 63021342f..74de9c0e0 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -11,7 +11,7 @@ import java.io.IOException; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-6-9 * Time: 上午8:02 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 18498d36b..012b5e6fd 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -11,7 +11,7 @@ import java.io.IOException; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-6-9 * Time: 上午8:02 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 1fada8199..72e6be0d6 100644 --- 
a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -11,7 +11,7 @@ import java.io.IOException; /** - * User: cairne + * Author: code4crafter@gmail.com * Date: 13-6-9 * Time: 上午8:02 */ From 61a1fe31c4aa2b794e00b5cd474f5997d911c6df Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 08:32:11 +0800 Subject: [PATCH 13/81] add cookie support & add docs --- .../main/java/us/codecraft/webmagic/Page.java | 9 +++++- .../java/us/codecraft/webmagic/Request.java | 30 ++++++++++++++++++- .../main/java/us/codecraft/webmagic/Site.java | 26 ++++++++-------- .../java/us/codecraft/webmagic/Spider.java | 4 ++- .../webmagic/downloader/Downloader.java | 5 ++-- .../webmagic/downloader/HttpClientPool.java | 21 ++++++++++--- .../downloader/HttpClientDownloaderTest.java | 23 ++++++++++++++ .../webmagic/samples/KaichibaProcessor.java | 2 +- 8 files changed, 97 insertions(+), 23 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 63b1b53a6..eee1a8a74 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -10,6 +10,7 @@ import java.util.concurrent.ConcurrentHashMap; /** + * Page保存了抓取的结果,并可定义下一次抓取的链接内容。 * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午11:22 @@ -65,7 +66,7 @@ public void addTargetRequests(List requests) { } } - public void addTargetRequests(String requestString) { + public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } @@ -75,6 +76,12 @@ public void addTargetRequests(String requestString) { } } + public void addTargetRequest(Request request) { 
+ synchronized (targetRequests) { + targetRequests.add(request); + } + } + public Selectable getUrl() { return url; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index ce7870b8d..ecb8b4ef9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,7 +1,22 @@ package us.codecraft.webmagic; /** - * Request对象是 + * Request对象封装了待抓取的url信息。
      + * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
      + * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。
      + *
      + *      Example:
      + *          抓取${linktext}时,希望提取链接link,并保存linktext的信息。
      + *      在上一个页面:
      + *      public void process(Page page){
      + *          Request request = new Request(link,linktext);
      + *          page.addTargetRequest(request)
      + *      }
      + *      在下一个页面:
      + *      public void process(Page page){
      + *          String linktext =  (String)page.getRequest().getExtra()[0];
      + *      }
      + * 
      * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午11:37 @@ -12,15 +27,28 @@ public class Request { private Object[] extra; + /** + * 构建一个request对象 + * @param url 必须参数,待抓取的url + * @param extra 额外参数,可以保存一些需要的上下文信息 + */ public Request(String url, Object... extra) { this.url = url; this.extra = extra; } + /** + * 获取预存的对象 + * @return object[] 预存的对象数组 + */ public Object[] getExtra() { return extra; } + /** + * 获取待抓取的url + * @return url 待抓取的url + */ public String getUrl() { return url; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 05117f087..413d8d80a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /** + * Site定义一个待抓取的站点的各种信息。 * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午12:13 @@ -14,11 +14,11 @@ public class Site { private String userAgent; - private String cookie; + private Map cookies = new LinkedHashMap(); private String encoding; - private String startUrl; + private List startUrls; private int sleepTime = 3000; @@ -34,8 +34,8 @@ public static Site me() { return new Site(); } - public Site setCookie(String cookie) { - this.cookie = cookie; + public Site setCookie(String name,String value) { + cookies.put(name,value); return this; } @@ -44,8 +44,8 @@ public Site setUserAgent(String userAgent) { return this; } - public String getCookie() { - return cookie; + public Map getCookies() { + return cookies; } public String getUserAgent() { @@ -79,12 +79,12 @@ public Site setAcceptStatCode(Set acceptStatCode) { return this; } - public String getStartUrl() { - return startUrl; + public List getStartUrls() { + return startUrls; } public Site setStartUrl(String startUrl) { - this.startUrl = startUrl; + this.startUrls.add(startUrl); return this; } @@ 
-106,8 +106,8 @@ public boolean equals(Object o) { if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) return false; - if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false; if (!domain.equals(site.domain)) return false; + if (!startUrls.equals(site.startUrls)) return false; if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; @@ -117,8 +117,8 @@ public boolean equals(Object o) { @Override public int hashCode() { int result = domain.hashCode(); + result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); - result = 31 * result + (cookie != null ? cookie.hashCode() : 0); result = 31 * result + (encoding != null ? encoding.hashCode() : 0); result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); return result; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index bbab1a533..180d7529b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -36,7 +36,9 @@ public static Spider me() { public Spider processor(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; - schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite()); + for (String startUrl : pageProcessor.getSite().getStartUrls()) { + schedular.push(new Request(startUrl), pageProcessor.getSite()); + } return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index 2711ba4c8..f276fde2b 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Site; /** - * Downloader是webmagic抓取页面的核心接口。 + * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。 * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午12:14 @@ -13,10 +13,11 @@ public interface Downloader { /** + * 下载页面,并保存信息到Page对象中。 * * @param request * @param site * @return */ - public Page download(Request request,Site site); + public Page download(Request request, Site site); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 066a24e51..891ff188b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -1,17 +1,22 @@ package us.codecraft.webmagic.downloader; import org.apache.http.HttpVersion; +import org.apache.http.client.CookieStore; import org.apache.http.client.HttpClient; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; +import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.params.*; import us.codecraft.webmagic.Site; +import java.util.Map; + /** * Author: code4crafter@gmail.com * Date: 13-4-21 @@ -50,15 +55,23 @@ private HttpClient generateClient(Site site) { schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); 
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry); - connectionManager.setMaxTotal(100); + connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); - HttpClient httpClient = new DefaultHttpClient(connectionManager, params); + DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); + generateCookie(httpClient, site); httpClient.getParams().setIntParameter("http.socket.timeout", 60000); httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); return httpClient; } - public void pushBack(HttpClient httpClient) { - + private void generateCookie(DefaultHttpClient httpClient, Site site) { + CookieStore cookieStore = new BasicCookieStore(); + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie.setDomain(site.getDomain()); + cookieStore.addCookie(cookie); + } + httpClient.setCookieStore(cookieStore); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java new file mode 100644 index 000000000..7f00e17c0 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Assert; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; + +/** + * Author: code4crafer@gmail.com + * Date: 13-6-18 + * Time: 上午8:22 + */ +public class HttpClientDownloaderTest { + + @Test + public void testCookie() { + Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Page 
download = httpClientDownloader.download(new Request("http://www.diandian.com"), site); + Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 866c903c9..f2d405062 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor { public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; - page.addTargetRequests("http://kaichiba.com/shop/"+i); + page.addTargetRequest("http://kaichiba.com/shop/" + i); page.putField("title",page.getHtml().x("//Title")); page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", "")); } From ff49617966c03ca1accd3018bad92c0c8c9ab387 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 10:02:11 +0800 Subject: [PATCH 14/81] add warning --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4611c2597..7f535362b 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,10 @@ webmagic的发起源于工作中的需要,其定位是帮助开发者更便捷 webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),开发者可以便捷的使用xpath和正则表达式进行链接和内容的提取,只需编写少量代码即可完成一个定制爬虫。 +#### 请注意 + +webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参与到webmagic的试用和修改中来。** 如果只是想以外部jar包的方式,引用webmagic并进行自己的业务开发,建议你等待webmagic的第一个稳定版本。** + ###特色### * ####垂直爬虫#### @@ -86,8 +90,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: -python爬虫** scrapy **[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) +python爬虫 ** scrapy 
**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) -Java爬虫** Spiderman **[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) +Java爬虫 ** Spiderman **[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) From d7031f2515b0d619e16b398d58c524738d8591ce Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 11:21:53 +0800 Subject: [PATCH 15/81] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7f535362b..92972ce9b 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载 #### 请注意 -webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参与到webmagic的试用和修改中来。** 如果只是想以外部jar包的方式,引用webmagic并进行自己的业务开发,建议你等待webmagic的第一个稳定版本。** +webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参与到webmagic的试用和修改中来。 ** 如果只是想以外部jar包的方式,引用webmagic并进行自己的业务开发,建议你等待webmagic的第一个稳定版本。** ###特色### From 188036b532ee0165561316fae0429e4d038f3462 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 14:34:09 +0800 Subject: [PATCH 16/81] add id --- .../main/java/us/codecraft/webmagic/Site.java | 24 +++++++++++++++---- .../java/us/codecraft/webmagic/Spider.java | 1 - .../webmagic/pipeline/FilePipeline.java | 4 ++-- .../schedular/FileCacheQueueSchedular.java | 16 ++++++++----- .../processor/DiaoyuwengProcessorTest.java | 2 -- 5 files changed, 31 insertions(+), 16 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 413d8d80a..674ac5bc9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -12,13 +12,18 @@ public class Site { private String domain; + /** + * for identify a task + */ + private String identifier; + private String userAgent; - private Map cookies = new LinkedHashMap(); + private Map cookies = new LinkedHashMap(); private String encoding; - private 
List startUrls; + private List startUrls = new ArrayList(); private int sleepTime = 3000; @@ -34,8 +39,8 @@ public static Site me() { return new Site(); } - public Site setCookie(String name,String value) { - cookies.put(name,value); + public Site setCookie(String name, String value) { + cookies.put(name, value); return this; } @@ -44,7 +49,7 @@ public Site setUserAgent(String userAgent) { return this; } - public Map getCookies() { + public Map getCookies() { return cookies; } @@ -61,6 +66,15 @@ public Site setDomain(String domain) { return this; } + public String getIdentifier() { + return identifier; + } + + public Site setIdentifier(String identifier) { + this.identifier = identifier; + return this; + } + public String getEncoding() { return encoding; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 180d7529b..7f34850d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -85,7 +85,6 @@ private void sleep(int time) { Thread.sleep(time); } catch (InterruptedException e) { e.printStackTrace(); - ; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index ca3144ed7..84a94ce77 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -33,7 +33,7 @@ public FilePipeline(String path) { public void process(Page page, Site site) { String domain = site.getDomain(); domain = UrlUtils.getDomain(domain); - String path = this.path + "" + domain + "/"; + String path = this.path + "" + domain + "#" + site.getIdentifier() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); @@ -46,7 +46,7 @@ public void process(Page page, Site site) { } 
printWriter.close(); } catch (IOException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + e.printStackTrace(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index 56c5f332b..b3086a283 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -60,7 +60,7 @@ private void flush() { private void init() { File file = new File(filePath); - if (!file.exists()){ + if (!file.exists()) { file.mkdirs(); } readFile(); @@ -81,8 +81,8 @@ public void run() { private void initWriter() { try { - fileUrlWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileUrlAllName, true)); - fileCursorWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileCursor, false)); + fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true)); + fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false)); } catch (IOException e) { throw new RuntimeException("init cache schedular error", e); } @@ -100,7 +100,7 @@ private void readFile() { private void readUrlFile() throws IOException { String line; - BufferedReader fileUrlReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileUrlAllName)); + BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName))); int lineReaded = 0; while ((line = fileUrlReader.readLine()) != null) { urls.add(line.trim()); @@ -112,7 +112,7 @@ private void readUrlFile() throws IOException { } private void readCursorFile() throws IOException { - BufferedReader fileCursorReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileCursor)); + BufferedReader fileCursorReader = new 
BufferedReader(new FileReader(getFileName(fileCursor))); String line = null; //read the last number while ((line = fileCursorReader.readLine()) != null) { @@ -120,8 +120,12 @@ private void readCursorFile() throws IOException { } } + private String getFileName(String filename) { + return filePath + site.getDomain() + "#" + site.getIdentifier() + filename; + } + @Override - public synchronized void push(Request request,Site site) { + public synchronized void push(Request request, Site site) { if (!inited.get()) { init(); } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 012b5e6fd..0c24b5771 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.processor; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -17,7 +16,6 @@ */ public class DiaoyuwengProcessorTest { - @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); From 245e963d9976db19ddff675eede13234242454b1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 17:42:31 +0800 Subject: [PATCH 17/81] add uuid to spider --- .../main/java/us/codecraft/webmagic/Site.java | 16 +----- .../java/us/codecraft/webmagic/Spider.java | 56 +++++++++++++++---- .../main/java/us/codecraft/webmagic/Task.java | 12 ++++ .../webmagic/pipeline/ConsolePipeline.java | 4 +- .../webmagic/pipeline/FilePipeline.java | 9 +-- .../codecraft/webmagic/pipeline/Pipeline.java | 4 +- .../processor/SimplePageProcessor.java | 2 +- .../schedular/FileCacheQueueSchedular.java | 20 +++---- .../webmagic/schedular/QueueSchedular.java | 6 +- 
.../webmagic/schedular/Schedular.java | 6 +- .../samples/DiandianBlogProcessor.java | 2 +- .../samples/DianpingBlogProcessor.java | 2 +- .../webmagic/samples/DiaoyuwengProcessor.java | 2 +- .../webmagic/samples/F58PageProcesser.java | 2 +- .../webmagic/samples/HuxiuProcessor.java | 2 +- .../webmagic/samples/KaichibaProcessor.java | 2 +- .../webmagic/samples/MeicanProcessor.java | 2 +- .../webmagic/samples/NjuBBSProcessor.java | 2 +- .../samples/OschinaBlogPageProcesser.java | 2 +- .../samples/OschinaPageProcesser.java | 2 +- .../webmagic/samples/QzoneBlogProcessor.java | 2 +- .../webmagic/samples/SinaBlogProcesser.java | 2 +- .../webmagic/samples/TianyaPageProcesser.java | 2 +- .../src/main/resources/ftl/wordpress.ftl | 6 +- 24 files changed, 98 insertions(+), 69 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/Task.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 674ac5bc9..a4d88d895 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -12,11 +12,6 @@ public class Site { private String domain; - /** - * for identify a task - */ - private String identifier; - private String userAgent; private Map cookies = new LinkedHashMap(); @@ -66,15 +61,6 @@ public Site setDomain(String domain) { return this; } - public String getIdentifier() { - return identifier; - } - - public Site setIdentifier(String identifier) { - this.identifier = identifier; - return this; - } - public String getEncoding() { return encoding; } @@ -97,7 +83,7 @@ public List getStartUrls() { return startUrls; } - public Site setStartUrl(String startUrl) { + public Site addStartUrl(String startUrl) { this.startUrls.add(startUrl); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 
7f34850d8..f7f560cb3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -18,7 +18,7 @@ * Date: 13-4-21 * Time: 上午6:53 */ -public class Spider implements Runnable { +public class Spider implements Runnable, Task { private Downloader downloader = new HttpClientDownloader(); @@ -26,6 +26,12 @@ public class Spider implements Runnable { private PageProcessor pageProcessor; + private List startUrls; + + private Site site; + + private String uuid; + private Schedular schedular = new QueueSchedular(); private Logger logger = Logger.getLogger(getClass()); @@ -36,9 +42,18 @@ public static Spider me() { public Spider processor(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; - for (String startUrl : pageProcessor.getSite().getStartUrls()) { - schedular.push(new Request(startUrl), pageProcessor.getSite()); - } + this.site = pageProcessor.getSite(); + return this; + } + + public Spider startUrls(List startUrls) { + this.startUrls = startUrls; + return this; + } + + public Spider startUrl(String startUrl) { + startUrls = new ArrayList(); + startUrls.add(startUrl); return this; } @@ -59,13 +74,15 @@ public Spider pipeline(Pipeline pipeline) { @Override public void run() { - Site site = pageProcessor.getSite(); - Request request = schedular.poll(site); - if (pipelines.isEmpty()){ + for (String startUrl : pageProcessor.getSite().getStartUrls()) { + schedular.push(new Request(startUrl), this); + } + Request request = schedular.poll(this); + if (pipelines.isEmpty()) { pipelines.add(new ConsolePipeline()); } while (request != null) { - Page page = downloader.download(request,site); + Page page = downloader.download(request, site); if (page == null) { sleep(site.getSleepTime()); continue; @@ -73,13 +90,19 @@ public void run() { pageProcessor.process(page); addRequest(page); for (Pipeline pipeline : pipelines) { - pipeline.process(page,site); + pipeline.process(page, this); } 
sleep(site.getSleepTime()); - request = schedular.poll(site); + request = schedular.poll(this); } } + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; + } + + private void sleep(int time) { try { Thread.sleep(time); @@ -91,8 +114,19 @@ private void sleep(int time) { private void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { - schedular.push(request,pageProcessor.getSite()); + schedular.push(request, this); } } } + + @Override + public String getUUID() { + if (uuid != null) { + return uuid; + } + if (site != null) { + return site.getDomain(); + } + return null; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java new file mode 100644 index 000000000..0eaf6c95c --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java @@ -0,0 +1,12 @@ +package us.codecraft.webmagic; + +/** + * Author: code4crafer@gmail.com + * Date: 13-6-18 + * Time: 下午2:57 + */ +public interface Task { + + public String getUUID(); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 635bab62c..7b93876dc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Selectable; import java.util.Map; @@ -14,7 +14,7 @@ public class ConsolePipeline implements Pipeline{ @Override - public void process(Page page,Site site) { + public void process(Page page,Task task) { System.out.println("get page: "+page.getUrl()); for (Map.Entry entry : 
page.getFields().entrySet()) { System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 84a94ce77..2311a75dd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -2,9 +2,8 @@ import org.apache.commons.codec.digest.DigestUtils; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Selectable; -import us.codecraft.webmagic.utils.UrlUtils; import java.io.File; import java.io.FileWriter; @@ -30,10 +29,8 @@ public FilePipeline(String path) { } @Override - public void process(Page page, Site site) { - String domain = site.getDomain(); - domain = UrlUtils.getDomain(domain); - String path = this.path + "" + domain + "#" + site.getIdentifier() + "/"; + public void process(Page page, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index e5da1ea80..b2b51e0ae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Author: code4crafter@gmail.com @@ -10,5 +10,5 @@ */ public interface Pipeline { - public void process(Page page,Site site); + public void process(Page page,Task task); } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index c5d637712..9f4eed36f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -20,7 +20,7 @@ public class SimplePageProcessor implements PageProcessor { private Site site; public SimplePageProcessor(String startUrl, String urlPattern) { - this.site = Site.me().setStartUrl(startUrl). + this.site = Site.me().addStartUrl(startUrl). setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index b3086a283..e9d4adb7c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -2,8 +2,8 @@ import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; import java.io.*; import java.util.LinkedHashSet; @@ -28,7 +28,7 @@ public class FileCacheQueueSchedular implements Schedular { private String fileUrlAllName = ".urls.txt"; - private Site site; + private Task task; private String fileCursor = ".cursor.txt"; @@ -44,13 +44,13 @@ public class FileCacheQueueSchedular implements Schedular { private Set urls; - public FileCacheQueueSchedular(Site site) { - this.site = site; + public FileCacheQueueSchedular(Task task) { + this.task = task; } - public FileCacheQueueSchedular(Site site, String filePath) { + public 
FileCacheQueueSchedular(Task task, String filePath) { this.filePath = filePath; - this.site = site; + this.task = task; } private void flush() { @@ -106,7 +106,7 @@ private void readUrlFile() throws IOException { urls.add(line.trim()); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(new Request(line, site)); + queue.add(new Request(line)); } } } @@ -121,11 +121,11 @@ private void readCursorFile() throws IOException { } private String getFileName(String filename) { - return filePath + site.getDomain() + "#" + site.getIdentifier() + filename; + return filePath + task.getUUID() + "/" + filename; } @Override - public synchronized void push(Request request, Site site) { + public synchronized void push(Request request, Task task) { if (!inited.get()) { init(); } @@ -140,7 +140,7 @@ public synchronized void push(Request request, Site site) { } @Override - public synchronized Request poll(Site site) { + public synchronized Request poll(Task task) { if (!inited.get()) { init(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java index b9c39c327..8c3da3b00 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java @@ -2,7 +2,7 @@ import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import java.util.HashSet; import java.util.Set; @@ -23,7 +23,7 @@ public class QueueSchedular implements Schedular { private Set urls = new HashSet(); @Override - public synchronized void push(Request request,Site site) { + public synchronized void push(Request request,Task task) { if (logger.isDebugEnabled()){ logger.debug("push to queue "+request.getUrl()); } @@ -34,7 +34,7 @@ public synchronized void push(Request request,Site site) { } @Override - public 
synchronized Request poll(Site site) { + public synchronized Request poll(Task task) { return queue.poll(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java index 965ad2587..8e4edb420 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.schedular; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Author: code4crafter@gmail.com @@ -10,8 +10,8 @@ */ public interface Schedular { - public void push(Request request,Site site); + public void push(Request request,Task task); - public Request poll(Site site); + public Request poll(Task task); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index b17b05dbf..a9351a1be 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -37,7 +37,7 @@ public void process(Page page) { public Site getSite() { //site定义抽取配置,以及开始url等 if (site == null) { - site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/"). + site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java index cc91f89ad..fafb7de25 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java @@ -27,7 +27,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). + return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index bcea9a31e..8d64bbca5 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -33,7 +33,7 @@ public void process(Page page) { @Override public Site getSite() { if (site==null){ - site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). + site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 346b1e225..82db2dd7a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -23,6 +23,6 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 8286d7816..36f69466f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -23,7 +23,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/"). + return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index f2d405062..11f046271 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -21,7 +21,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). + return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 30ba84d43..aaeca8f88 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -27,7 +27,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). + return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 3b61d7649..58e19c4ea 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -22,7 +22,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). + return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 58d87a7f6..bcc2d6eec 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -24,7 +24,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). + return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index 4f84a3f54..d85ca8142 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -23,7 +23,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/"). + return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index e252eeff0..fac491dfd 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -26,7 +26,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). + return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index 07f6d4768..37c686490 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -26,7 +26,7 @@ public void process(Page page) { @Override public Site getSite() { if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000). + site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index 564f1efb6..db5f9ff21 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -23,6 +23,6 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. 
} } diff --git a/webmagic-samples/src/main/resources/ftl/wordpress.ftl b/webmagic-samples/src/main/resources/ftl/wordpress.ftl index f2feeb16c..c2442ab62 100644 --- a/webmagic-samples/src/main/resources/ftl/wordpress.ftl +++ b/webmagic-samples/src/main/resources/ftl/wordpress.ftl @@ -1,13 +1,13 @@ ${title} - http://127.0.0.1/wordpress/?p=${id} + http://127.0.0.1/wordpress/?p=${uuid} ${date} admin - http://127.0.0.1/wordpress/?p=${id} + http://127.0.0.1/wordpress/?p=${uuid} - ${id} + ${uuid} ${date} ${date} open From faba1b81a6f5cfd74f3c7ae2c19b05f92d6b79d9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 17:46:00 +0800 Subject: [PATCH 18/81] update readme, markdown syntax ** --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 92972ce9b..e5dc333eb 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载 #### 请注意 -webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参与到webmagic的试用和修改中来。 ** 如果只是想以外部jar包的方式,引用webmagic并进行自己的业务开发,建议你等待webmagic的第一个稳定版本。** +webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参与到webmagic的试用和修改中来。 **如果只是想以外部jar包的方式,引用webmagic并进行自己的业务开发,建议你等待webmagic的第一个稳定版本。** ###特色### @@ -90,8 +90,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: -python爬虫 ** scrapy **[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) +python爬虫 **scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) -Java爬虫 ** Spiderman **[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) +Java爬虫 **Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) From 4c71f22bd1b9bd135a34e0d1c7e4ae4688a2a987 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 18:02:30 +0800 Subject: [PATCH 19/81] fix compile error --- .../schedular/FileCacheQueueSchedular.java | 14 +++++--------- .../webmagic/pipeline/FreemarkerPipeline.java | 14 +++++++------- 
.../java/us/codecraft/webmagic/SpiderTest.java | 2 +- .../webmagic/processor/DiandianProcessorTest.java | 2 +- .../processor/DiaoyuwengProcessorTest.java | 4 +++- .../webmagic/processor/SinablogProcessorTest.java | 2 +- 6 files changed, 18 insertions(+), 20 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index e9d4adb7c..882f49824 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -44,13 +44,8 @@ public class FileCacheQueueSchedular implements Schedular { private Set urls; - public FileCacheQueueSchedular(Task task) { - this.task = task; - } - - public FileCacheQueueSchedular(Task task, String filePath) { + public FileCacheQueueSchedular(String filePath) { this.filePath = filePath; - this.task = task; } private void flush() { @@ -58,7 +53,8 @@ private void flush() { fileCursorWriter.flush(); } - private void init() { + private void init(Task task) { + this.task = task; File file = new File(filePath); if (!file.exists()) { file.mkdirs(); @@ -127,7 +123,7 @@ private String getFileName(String filename) { @Override public synchronized void push(Request request, Task task) { if (!inited.get()) { - init(); + init(task); } if (logger.isDebugEnabled()) { logger.debug("push to queue " + request.getUrl()); @@ -142,7 +138,7 @@ public synchronized void push(Request request, Task task) { @Override public synchronized Request poll(Task task) { if (!inited.get()) { - init(); + init(task); } fileCursorWriter.println(cursor.incrementAndGet()); return queue.poll(); diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 9afebb669..112197113 100644 
--- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -5,10 +5,12 @@ import freemarker.template.TemplateException; import org.apache.commons.codec.digest.DigestUtils; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.utils.UrlUtils; +import us.codecraft.webmagic.Task; -import java.io.*; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; /** * Author: code4crafter@gmail.com @@ -37,10 +39,8 @@ public FreemarkerPipeline(String template) throws IOException { @Override - public void process(Page page, Site site) { - String domain = site.getDomain(); - domain = UrlUtils.getDomain(domain); - String path = this.path + "" + domain + "/"; + public void process(Page page, Task task) { + String path = this.path + "" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 79f428ebe..b67ffc4b7 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -31,7 +31,7 @@ public void testGlobalSpider(){ SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); System.out.println(pageProcessor2.getSite().getEncoding()); pageProcessor2.getSite().setSleepTime(500); - Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")). + Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). 
processor(pageProcessor2).run(); diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 74de9c0e0..721a5eb86 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -30,7 +30,7 @@ public void test() throws IOException { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")). + Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). processor(diaoyuwengProcessor).run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 0c24b5771..8f03b6a8a 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -16,11 +17,12 @@ */ public class DiaoyuwengProcessorTest { + @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), 
"/data/temp/webmagic/cache/")). + Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). processor(diaoyuwengProcessor).run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 72e6be0d6..8fb258378 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -30,7 +30,7 @@ public void test() throws IOException { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(sinaBlogProcesser.getSite(), "/data/temp/webmagic/cache/")). + Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). 
processor(sinaBlogProcesser).run(); } } From d86716508f0e98f9a1e667418d341961cc455fd0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 18:30:45 +0800 Subject: [PATCH 20/81] fix samples --- README.md | 4 ++-- .../java/us/codecraft/webmagic/Spider.java | 6 +---- .../webmagic/pipeline/FreemarkerPipeline.java | 2 +- .../webmagic/FreemarkerPipelineTest.java | 2 +- .../src/test/resources/ftl/wordpress.ftl | 23 ------------------- ...gProcessor.java => DianpingProcessor.java} | 15 ++++++++---- 6 files changed, 15 insertions(+), 37 deletions(-) delete mode 100644 webmagic-plugin/src/test/resources/ftl/wordpress.ftl rename webmagic-samples/src/main/java/us/codecraft/webmagic/samples/{DianpingBlogProcessor.java => DianpingProcessor.java} (61%) diff --git a/README.md b/README.md index e5dc333eb..421443f0a 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: -python爬虫 **scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) +python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) -Java爬虫 **Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) +Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index f7f560cb3..6464d6187 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -57,10 +57,6 @@ public Spider startUrl(String startUrl) { return this; } - public Thread thread() { - return new Thread(this); - } - public Spider schedular(Schedular schedular) { this.schedular = schedular; return this; @@ -74,7 +70,7 @@ public Spider pipeline(Pipeline pipeline) { @Override public void run() { 
- for (String startUrl : pageProcessor.getSite().getStartUrls()) { + for (String startUrl : startUrls) { schedular.push(new Request(startUrl), this); } Request request = schedular.poll(this); diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 112197113..218276d2e 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -30,7 +30,7 @@ public FreemarkerPipeline(String template, String path) throws IOException { configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile())); this.template = configuration.getTemplate(template); this.path = path; - File file = new File(path); + new File(path); } public FreemarkerPipeline(String template) throws IOException { diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java index 610edf55b..9e6b9958c 100644 --- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java @@ -13,7 +13,7 @@ public class FreemarkerPipelineTest { @Test - public void test() throws IOException { + public void testTemplateLoad() throws IOException { FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl"); } } diff --git a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl b/webmagic-plugin/src/test/resources/ftl/wordpress.ftl deleted file mode 100644 index 61820b727..000000000 --- a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,23 +0,0 @@ - - $it.Title - http://127.0.0.1/wordpress/?p=$it.Id - ${date} - admin - http://127.0.0.1/wordpress/?p=$it.Id - - - - 
<#--$it.Id--> - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - $tags - \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java similarity index 61% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index fafb7de25..63aa0f03d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -2,6 +2,7 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -11,15 +12,14 @@ * Date: 13-4-21 * Time: 下午8:08 */ -public class DianpingBlogProcessor implements PageProcessor { +public class DianpingProcessor implements PageProcessor { @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); + List requests = page.getHtml().as().rs(".*shop.*").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); + requests = page.getHtml().rs(".*search/category/.*").toStrings(); page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")){ + if (page.getUrl().toString().contains("shop")) { page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); page.putField("content", page.getHtml().sc()); } @@ -30,4 +30,9 @@ public Site getSite() { return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + DianpingProcessor dianpingProcessor = new DianpingProcessor(); + Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); + } } From a26b662ed8dfde95c915aaa35caf76b4fdc0b26a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 22:13:40 +0800 Subject: [PATCH 21/81] update docs --- .../main/java/us/codecraft/webmagic/Page.java | 18 ++++++++++++++---- .../java/us/codecraft/webmagic/Request.java | 2 +- .../main/java/us/codecraft/webmagic/Site.java | 2 +- .../java/us/codecraft/webmagic/Spider.java | 2 +- .../main/java/us/codecraft/webmagic/Task.java | 2 +- .../webmagic/downloader/Downloader.java | 4 ++-- .../downloader/HttpClientDownloader.java | 2 +- .../webmagic/downloader/HttpClientPool.java | 2 +- .../webmagic/pipeline/ConsolePipeline.java | 2 +- .../webmagic/pipeline/FilePipeline.java | 2 +- .../codecraft/webmagic/pipeline/Pipeline.java | 2 +- .../webmagic/processor/PageProcessor.java | 2 +- .../processor/SimplePageProcessor.java | 2 +- .../schedular/FileCacheQueueSchedular.java | 2 +- .../webmagic/schedular/QueueSchedular.java | 2 +- .../webmagic/schedular/Schedular.java | 2 +- .../us/codecraft/webmagic/selector/Html.java | 2 +- .../codecraft/webmagic/selector/PlainText.java | 2 +- .../webmagic/selector/RegexResult.java | 2 +- .../webmagic/selector/RegexSelector.java | 2 +- .../webmagic/selector/ReplaceSelector.java | 2 +- .../webmagic/selector/Selectable.java | 2 +- .../codecraft/webmagic/selector/Selector.java | 2 +- .../webmagic/selector/SelectorFactory.java | 2 +- .../selector/SmartContentSelector.java | 2 +- .../webmagic/selector/XpathSelector.java | 2 +- .../us/codecraft/webmagic/utils/UrlUtils.java | 2 +- .../java/us/codecraft/webmagic/HtmlTest.java | 2 +- .../webmagic/selector/RegexSelectorTest.java | 2 +- 
.../webmagic/selector/XpathSelectorTest.java | 2 +- .../codecraft/webmagic/utils/UrlUtilsTest.java | 2 +- .../webmagic/pipeline/FreemarkerPipeline.java | 2 +- .../webmagic/FreemarkerPipelineTest.java | 2 +- .../samples/DiandianBlogProcessor.java | 2 +- .../webmagic/samples/DianpingProcessor.java | 2 +- .../webmagic/samples/DiaoyuwengProcessor.java | 2 +- .../webmagic/samples/F58PageProcesser.java | 2 +- .../webmagic/samples/HuxiuProcessor.java | 2 +- .../webmagic/samples/KaichibaProcessor.java | 2 +- .../webmagic/samples/MeicanProcessor.java | 2 +- .../webmagic/samples/NjuBBSProcessor.java | 2 +- .../samples/OschinaBlogPageProcesser.java | 2 +- .../webmagic/samples/OschinaPageProcesser.java | 2 +- .../webmagic/samples/QzoneBlogProcessor.java | 2 +- .../webmagic/samples/SinaBlogProcesser.java | 2 +- .../webmagic/samples/TianyaPageProcesser.java | 2 +- .../java/us/codecraft/webmagic/SpiderTest.java | 2 +- .../processor/DiandianProcessorTest.java | 2 +- .../processor/DiaoyuwengProcessorTest.java | 2 +- .../processor/SinablogProcessorTest.java | 2 +- 50 files changed, 64 insertions(+), 54 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index eee1a8a74..ec9f959e8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -10,10 +10,16 @@ import java.util.concurrent.ConcurrentHashMap; /** - * Page保存了抓取的结果,并可定义下一次抓取的链接内容。 - * Author: code4crafter@gmail.com - * Date: 13-4-21 - * Time: 上午11:22 + *
      + *Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
      + *
      + *     主要方法:
      + *     {@link #getUrl()} 获取页面的Url
      + *     {@link #getHtml()}  获取页面的html内容
      + *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
      + *
      + * 
      + * @author code4crafter@gmail.com
      */ public class Page { @@ -34,6 +40,10 @@ public void process() { public Page() { } + /** + * + * @return fields + */ public Map getFields() { return fields; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index ecb8b4ef9..37ede0dc4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -17,7 +17,7 @@ * String linktext = (String)page.getRequest().getExtra()[0]; * } * - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午11:37 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index a4d88d895..5c208dd04 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -4,7 +4,7 @@ /** * Site定义一个待抓取的站点的各种信息。 - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午12:13 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 6464d6187..67e9c94d5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -14,7 +14,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午6:53 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java index 0eaf6c95c..2aab74a63 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic; /** - * Author: code4crafer@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-6-18 * Time: 下午2:57 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index f276fde2b..e3ecff879 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -6,7 +6,7 @@ /** * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。 - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午12:14 */ @@ -17,7 +17,7 @@ public interface Downloader { * * @param request * @param site - * @return + * @return page */ public Page download(Request request, Site site); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2eb55c0ab..4332fa329 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -14,7 +14,7 @@ /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午12:15 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 891ff188b..4e57e16f9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -18,7 +18,7 @@ import java.util.Map; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午12:29 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 7b93876dc..866db9239 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -7,7 +7,7 @@ import java.util.Map; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:45 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 2311a75dd..c7cd9c56a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -12,7 +12,7 @@ import java.util.Map; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午6:28 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index b2b51e0ae..1be447c45 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Task; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:39 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 982005974..c36ae980d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Site; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午11:42 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 9f4eed36f..3ffc9a32e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-22 * Time: 下午9:15 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index 882f49824..0a93e52db 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:13 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java index 8c3da3b00..20576fc7c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java @@ -10,7 +10,7 @@ import java.util.concurrent.LinkedBlockingQueue; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:13 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java index 8e4edb420..8df776040 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Task; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:12 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 22bf73d69..3b3c80af9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -4,7 +4,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:54 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 1d7a41e5d..a11c9a293 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -6,7 +6,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:54 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java index 06c00dd1c..9f4e2f060 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.selector; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:39 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 3b9871892..49fbffd0b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -9,7 +9,7 @@ import java.util.regex.PatternSyntaxException; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:09 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java index e240f0898..1ce7c4d59 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java @@ -6,7 +6,7 @@ import java.util.regex.PatternSyntaxException; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:09 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index f02cbc4c0..921e6c3f4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -3,7 +3,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-20 * Time: 下午7:51 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 3e1b056e1..35632b3aa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -3,7 +3,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-20 * Time: 下午8:02 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 633cb4abf..3c87ac9c4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -7,7 +7,7 @@ import java.util.concurrent.ConcurrentHashMap; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:56 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index 919a795b3..89748975d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -10,7 +10,7 @@ /** * readability算法,基础是找到所有p标签的父节点 * 写的比较乱,最终效果还在尝试中 - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午4:42 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 991f92674..c2b408eb3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -6,7 +6,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午9:39 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index fd26499ce..74e486c7f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -6,7 +6,7 @@ import java.util.regex.Pattern; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:52 */ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index 0f8c06166..6dacc9833 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.selector.Html; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午8:42 */ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 3b80f92dc..849a4d6e5 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:13 */ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 239630a93..8ee888597 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * Author: code4crafter@gmail.com Date: 13-4-21 Time: 上午10:06 + * @author code4crafter@gmail.com
      Date: 13-4-21 Time: 上午10:06 */ public class XpathSelectorTest { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 36361bfe7..4cfdc046b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -4,7 +4,7 @@ import org.junit.Test; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午2:22 */ diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 218276d2e..f512f2628 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -13,7 +13,7 @@ import java.io.PrintWriter; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-6-8 * Time: 下午9:00 */ diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java index 9e6b9958c..68ff90464 100644 --- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java @@ -6,7 +6,7 @@ import java.io.IOException; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-6-9 * Time: 上午7:14 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index a9351a1be..a5b355cb1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 63aa0f03d..2f28e6a39 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -8,7 +8,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 8d64bbca5..f5032ff75 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -8,7 +8,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 82db2dd7a..385e3f278 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 36f69466f..1fa0b7b55 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 11f046271..8ea4afe27 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.processor.PageProcessor; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-5-20 * Time: 下午5:31 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index aaeca8f88..98fe8de7d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-5-20 * Time: 下午5:31 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 58e19c4ea..e4cc33cfc 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index bcc2d6eec..0d6354d8a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index d85ca8142..b708ec51d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index fac491dfd..400ebd5ec 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午8:08 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index 37c686490..5bc2fc68a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.processor.PageProcessor; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index db5f9ff21..a15ef74a2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -7,7 +7,7 @@ import java.util.List; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:48 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index b67ffc4b7..6293884c1 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -8,7 +8,7 @@ import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-4-20 * Time: 下午7:46 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 721a5eb86..b87815c4d 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -11,7 +11,7 @@ import java.io.IOException; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-6-9 * Time: 上午8:02 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 8f03b6a8a..2b2caaca1 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -11,7 +11,7 @@ import java.io.IOException; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-6-9 * Time: 上午8:02 */ diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 8fb258378..9613c9e4e 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -11,7 +11,7 @@ import java.io.IOException; /** - * Author: code4crafter@gmail.com + * @author code4crafter@gmail.com
      * Date: 13-6-9 * Time: 上午8:02 */ From 4d7b2753980b54b41a2bb7ae25d0c9c0b96fba39 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 18 Jun 2013 22:39:37 +0800 Subject: [PATCH 22/81] docs --- .../main/java/us/codecraft/webmagic/Page.java | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index ec9f959e8..5bf5f26ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -16,6 +16,8 @@ * 主要方法: * {@link #getUrl()} 获取页面的Url * {@link #getHtml()} 获取页面的html内容 + * {@link #putField(String, us.codecraft.webmagic.selector.Selectable)} 保存抽取的结果 + * {@link #getFields()} 获取抽取的结果,在 {@link us.codecraft.webmagic.pipeline.Pipeline} 中调用 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接 * * @@ -33,25 +35,30 @@ public class Page { private List targetRequests = new ArrayList(); - public void process() { - fields.put("title", html.x("").r("")); - } - public Page() { } /** - * - * @return fields + * 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用 + * @return fields 抽取的结果 */ public Map getFields() { return fields; } + /** + * 保存抽取的结果 + * @param key 结果的key + * @param field 结果的value + */ public void putField(String key, Selectable field) { fields.put(key, field); } + /** + * 获取页面的html内容 + * @return html 页面的html内容 + */ public Selectable getHtml() { return html; } @@ -64,6 +71,10 @@ public List getTargetRequests() { return targetRequests; } + /** + * 添加待抓取的链接 + * @param requests 待抓取的链接 + */ public void addTargetRequests(List requests) { synchronized (targetRequests) { for (String s : requests) { @@ -76,6 +87,10 @@ public void addTargetRequests(List requests) { } } + /** + * 添加待抓取的链接 + * @param requestString 待抓取的链接 + */ public void addTargetRequest(String requestString) { if 
(StringUtils.isBlank(requestString) || requestString.equals("#")) { return; @@ -86,20 +101,36 @@ public void addTargetRequest(String requestString) { } } + /** + * 添加待抓取的页面,在需要传递附加信息时使用 + * @param request 待抓取的页面 + */ public void addTargetRequest(Request request) { synchronized (targetRequests) { targetRequests.add(request); } } + /** + * 获取页面的Url + * @return url 当前页面的url,可用于抽取 + */ public Selectable getUrl() { return url; } + /** + * 设置url + * @param url + */ public void setUrl(Selectable url) { this.url = url; } + /** + * 获取抓取请求 + * @return request 抓取请求 + */ public Request getRequest() { return request; } From e550b72bfccda65c6f5b6f25b1855619d7b7de73 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 19 Jun 2013 08:20:21 +0800 Subject: [PATCH 23/81] add package infos --- .../main/java/us/codecraft/webmagic/downloader/package.html | 5 +++++ .../src/main/java/us/codecraft/webmagic/package.html | 5 +++++ .../main/java/us/codecraft/webmagic/pipeline/package.html | 5 +++++ .../main/java/us/codecraft/webmagic/processor/package.html | 5 +++++ .../main/java/us/codecraft/webmagic/schedular/package.html | 5 +++++ .../main/java/us/codecraft/webmagic/selector/Selector.java | 2 +- .../main/java/us/codecraft/webmagic/selector/package.html | 5 +++++ .../src/main/java/us/codecraft/webmagic/utils/package.html | 5 +++++ 8 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/package.html create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html create mode 100644 
webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html new file mode 100644 index 000000000..cae5560ea --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html @@ -0,0 +1,5 @@ + + +包含了页面下载的接口Downloader和实现类HttpClientDownloader,该实现类封装了HttpComponent库。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html new file mode 100644 index 000000000..d5ff540a6 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html @@ -0,0 +1,5 @@ + + +包括webmagic入口类Spider和一些数据传递的实体类。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html new file mode 100644 index 000000000..498183ebd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html @@ -0,0 +1,5 @@ + + +包含了处理页面抽取结果的接口Pipeline和它的几个实现类。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html new file mode 100644 index 000000000..47274a1fd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html @@ -0,0 +1,5 @@ + + +包含了封装页面处理逻辑的接口PageProcessor和一个实现类SimplePageProcessor。实现PageProcessor即可定制一个自己的爬虫。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html new file mode 100644 index 000000000..0e35610fe --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html @@ -0,0 +1,5 @@ + + +包含url管理和调度的接口Schedular及它的几个实现类。 + + diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 35632b3aa..f7771cfbd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -7,7 +7,7 @@ * Date: 13-4-20 * Time: 下午8:02 */ -public interface Selector { +interface Selector { public String select(String text); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html new file mode 100644 index 000000000..3c9ef7b25 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html @@ -0,0 +1,5 @@ + + +提供了便捷抽取页面内容的工具,对外核心接口是Selectable,内部抽取则是通过实现Selector来定制。 + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html new file mode 100644 index 000000000..bfbe8dfcd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html @@ -0,0 +1,5 @@ + + +提供一些处理链接的静态工具类。 + + From 9762834e60b88a6efef16696a86c6a8170c8c22c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 19 Jun 2013 09:57:41 +0800 Subject: [PATCH 24/81] update Select api: remove x() s() etc. 
--- .../downloader/HttpClientDownloader.java | 2 +- .../processor/SimplePageProcessor.java | 6 ++-- .../us/codecraft/webmagic/selector/Html.java | 18 ++-------- .../webmagic/selector/PlainText.java | 26 +++----------- .../webmagic/selector/Selectable.java | 34 +++---------------- .../java/us/codecraft/webmagic/HtmlTest.java | 2 +- .../webmagic/selector/XpathSelectorTest.java | 2 +- .../samples/DiandianBlogProcessor.java | 16 ++++----- .../webmagic/samples/DianpingProcessor.java | 8 ++--- .../webmagic/samples/DiaoyuwengProcessor.java | 12 +++---- .../webmagic/samples/F58PageProcesser.java | 6 ++-- .../webmagic/samples/HuxiuProcessor.java | 6 ++-- .../webmagic/samples/KaichibaProcessor.java | 6 ++-- .../webmagic/samples/MeicanProcessor.java | 8 ++--- .../webmagic/samples/NjuBBSProcessor.java | 6 ++-- .../samples/OschinaBlogPageProcesser.java | 8 ++--- .../samples/OschinaPageProcesser.java | 6 ++-- .../webmagic/samples/QzoneBlogProcessor.java | 6 ++-- .../webmagic/samples/SinaBlogProcesser.java | 12 +++---- .../webmagic/samples/TianyaPageProcesser.java | 6 ++-- .../us/codecraft/webmagic/SpiderTest.java | 6 ++-- 21 files changed, 75 insertions(+), 127 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 4332fa329..c1ecff3e8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -33,7 +33,7 @@ public Page download(Request request, Site site) { if (site.getAcceptStatCode().contains(statusCode)) { if (site.getEncoding() == null){ String value = httpResponse.getEntity().getContentType().getValue(); - site.setEncoding(new PlainText(value).r("charset=([^\\s]+)").toString()); + site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString()); } String content = 
IOUtils.toString(httpResponse.getEntity().getContent(), site.getEncoding()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 3ffc9a32e..0d5244629 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -28,13 +28,13 @@ public SimplePageProcessor(String startUrl, String urlPattern) { @Override public void process(Page page) { - List requests = page.getHtml().as().rs(urlPattern).toStrings(); + List requests = page.getHtml().links().regex(urlPattern).toStrings(); //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); //xpath方式抽取 - page.putField("title", page.getHtml().x("//title")); + page.putField("title", page.getHtml().xpath("//title")); //sc表示使用Readability技术抽取正文 - page.putField("content", page.getHtml().sc()); + page.putField("content", page.getHtml().smartContent()); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 3b3c80af9..3cc84f79e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -18,12 +18,6 @@ public Html(String text) { super(text); } - @Override - public Selectable x(String xpath) { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); - return select(xpathSelector,strings); - } - @Override protected Selectable select(Selector selector, List strings) { List results = new ArrayList(); @@ -47,25 +41,19 @@ protected Selectable selectList(Selector selector, List strings) { } @Override - public Selectable sc() { + public Selectable smartContent() { SmartContentSelector smartContentSelector = 
SelectorFactory.getInstatnce().newSmartContentSelector(); return select(smartContentSelector,strings); } @Override - public Selectable a() { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); - return select(xpathSelector,strings); - } - - @Override - public Selectable as() { + public Selectable links() { XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); return selectList(xpathSelector,strings); } @Override - public Selectable xs(String xpath) { + public Selectable xpath(String xpath) { XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); return selectList(xpathSelector, strings); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index a11c9a293..935ababa3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -25,38 +25,22 @@ public PlainText(String text) { } @Override - public Selectable x(String xpath) { + public Selectable xpath(String xpath) { throw new UnsupportedOperationException(); } @Override - public Selectable xs(String xpath) { + public Selectable smartContent() { throw new UnsupportedOperationException(); } @Override - public Selectable sc() { + public Selectable links() { throw new UnsupportedOperationException(); } @Override - public Selectable a() { - throw new UnsupportedOperationException(); - } - - @Override - public Selectable as() { - throw new UnsupportedOperationException(); - } - - @Override - public Selectable r(String regex) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); - return select(regexSelector, strings); - } - - @Override - public Selectable rs(String regex) { + public Selectable regex(String regex) { RegexSelector regexSelector = 
SelectorFactory.getInstatnce().newRegexSelector(regex); return selectList(regexSelector, strings); } @@ -82,7 +66,7 @@ protected Selectable selectList(Selector selector, List strings) { } @Override - public Selectable rp(String regex, String replacement) { + public Selectable replace(String regex, String replacement) { ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); return select(replaceSelector, strings); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 921e6c3f4..630808d3b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -9,51 +9,27 @@ */ public interface Selectable { - /** - * select with xpath - * - * @param xpath - * @return new Selectable after extract - */ - public Selectable x(String xpath); - /** * select list with xpath * * @param xpath * @return new Selectable after extract */ - public Selectable xs(String xpath); + public Selectable xpath(String xpath); /** * select smart content with ReadAbility algorithm * * @return content */ - public Selectable sc(); - - /** - * select a link - * - * @return first link - */ - public Selectable a(); + public Selectable smartContent(); /** * select all links * * @return all links */ - public Selectable as(); - - - /** - * select with regex - * - * @param regex - * @return new Selectable after extract - */ - public Selectable r(String regex); + public Selectable links(); /** * select list with regex @@ -61,7 +37,7 @@ public interface Selectable { * @param regex * @return new Selectable after extract */ - public Selectable rs(String regex); + public Selectable regex(String regex); /** * replace with regex @@ -70,7 +46,7 @@ public interface Selectable { * @param replacement * @return new Selectable after extract */ - 
public Selectable rp(String regex, String replacement); + public Selectable replace(String regex, String replacement); /** * single string result diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index 6dacc9833..fcdbfeffa 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -14,7 +14,7 @@ public class HtmlTest { @Test public void testRegexSelector() { Html selectable = new Html("aaaaaaab"); - Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); + Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 8ee888597..bebbb83dc 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1350,7 +1350,7 @@ public void test() { @Test public void testOschina() { Html html1 = new Html(html); - Assert.assertEquals("再次吐槽easyui", html1.x(".//*[@class='QTitle']/h1/a").toString()); + Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index a5b355cb1..e5aafe7a3 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -17,20 +17,20 @@ public class DiandianBlogProcessor implements PageProcessor { @Override public void process(Page page) { - 
//a()表示提取链接,as()表示提取所有链接 + //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 - //r()表示用正则表达式提取一条内容,rs()表示提取多条内容 + //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 //toString()表示取单条结果,toStrings()表示取多条 - List requests = page.getHtml().as().rs("(.*/post/.*)").toStrings(); + List requests = page.getHtml().links().regex("(.*/post/.*)").toStrings(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 - page.putField("title", page.getHtml().x("//title").r("(.*?)\\|")); - //sc()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 - page.putField("content", page.getHtml().sc()); - page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/")); - page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)")); + page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|")); + //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 + page.putField("content", page.getHtml().smartContent()); + page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); + page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 2f28e6a39..7a211882e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -15,13 +15,13 @@ public class DianpingProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().as().rs(".*shop.*").toStrings(); + List requests = page.getHtml().links().regex(".*shop.*").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().rs(".*search/category/.*").toStrings(); + requests = page.getHtml().regex(".*search/category/.*").toStrings(); page.addTargetRequests(requests); if 
(page.getUrl().toString().contains("shop")) { - page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().sc()); + page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']")); + page.putField("content", page.getHtml().smartContent()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index f5032ff75..13ed2e115 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -18,15 +18,15 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); + List requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); + requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ - page.putField("title", page.getHtml().x("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().x("//div[@class='pcb']//tbody")); - page.putField("date",page.getHtml().r("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); - page.putField("id",new PlainText("1000"+page.getUrl().r("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); + page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); + 
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody")); + page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); + page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 385e3f278..9d5140a2e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -15,10 +15,10 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); page.addTargetRequests(strings); - page.putField("title",page.getHtml().r("(.*)")); - page.putField("body",page.getHtml().x("//dd[@class='w133']")); + page.putField("title",page.getHtml().regex("(.*)")); + page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 1fa0b7b55..26c60cc27 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -15,10 +15,10 @@ public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); + List requests = 
page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("content",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 8ea4afe27..0a51b364e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -13,10 +13,10 @@ public class KaichibaProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; + int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; page.addTargetRequest("http://kaichiba.com/shop/" + i); - page.putField("title",page.getHtml().x("//Title")); - page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", "")); + page.putField("title",page.getHtml().xpath("//Title")); + page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", "")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 98fe8de7d..bd218113b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ 
-15,14 +15,14 @@ public class MeicanProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); + List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); if (requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings()); - page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings()); + page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); + page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index e4cc33cfc..a7e9c9ba9 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -14,10 +14,10 @@ public class NjuBBSProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().rs("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); + List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - 
page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 0d6354d8a..9293b41c4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -15,11 +15,11 @@ public class OschinaBlogPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings(); + List strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings(); page.addTargetRequests(strings); - page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().sc()); - page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); + page.putField("content", page.getHtml().smartContent()); + page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index b708ec51d..f88ce06d2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -15,10 +15,10 @@ public class OschinaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = 
page.getHtml().rs("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a")); - page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); + page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); + page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 400ebd5ec..bf4dcc2a1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -18,10 +18,10 @@ public void process(Page page) { //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone - List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); + 
page.putField("content",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index 5bc2fc68a..bb77931fe 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -15,12 +15,12 @@ public class SinaBlogProcesser implements PageProcessor { @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().as().rs("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); - page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2")); - page.putField("content",page.getHtml().x("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id",page.getUrl().r("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); - page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)")); -// page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a")); + page.addTargetRequests(page.getHtml().links().regex("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); + page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); + page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); + page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); + page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); +// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index a15ef74a2..278657f09 100644 --- 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -15,10 +15,10 @@ public class TianyaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b")); - page.putField("body",page.getHtml().sc()); + page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); + page.putField("body",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 6293884c1..681aac78e 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -51,7 +51,7 @@ public void languageSchema() { /** * - * _hrefs = rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") + * _hrefs = regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") * title = r(""(.*)"") * body = x("//dd[@class='w133']") * @@ -72,7 +72,7 @@ public void languageSchema() { * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) * * body=body[r(_currentUrl).g(1)] - * tags[%] = (tags[%] + xs('')) . r('') + * tags[%] = (tags[%] + xpath('')) . 
r('') * * _targetUrls.add('' + x('').r('')) * _sourceUrls.add() @@ -114,7 +114,7 @@ public void languageSchema() { * content = t(_html) > c() * title = x(_html, 'asd@asd') > r('',1) * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') - * tags[%] = tags + xs('') > r('') + * tags[%] = tags + xpath('') > r('') * model.setTargetUrl(); * * _targetUrl = '' + x('') & r('') From 6611d6e64f3b86152bfaf5832fd1c09190a0c0e3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 19 Jun 2013 10:14:17 +0800 Subject: [PATCH 25/81] update api in READEME --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 421443f0a..2b687e6bc 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参 * ####垂直爬虫#### webmagic着重于页面抽取的工作。开发者可以使用xpath和正则表达式进行链接和内容的提取,支持链式API调用,以及单复数转换。 - String content = page.getHtml().x("//div[@class='body']").r("这段话比较重要(.*)").toString(); + String content = page.getHtml().xpath("//div[@class='body']").regex("这段话比较重要(.*)").toString(); * ####嵌入式&无配置#### webmagic与其他Full-Stack的框架不同,没有配置文件,大部分功能都通过简单的API调用完成。webmagic以jar包的形式存在,并且不依赖任何框架,在程序可以随处进行调用。 @@ -57,13 +57,13 @@ webmagic定制的核心是PageProcessor接口。 @Override public void process(Page page) { - List requests = page.getHtml().as().rs(urlPattern).toStrings(); + List requests = page.getHtml().links().regex(urlPattern).toStrings(); //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); //xpath方式抽取 - page.putField("title", page.getHtml().x("//title")); + page.putField("title", page.getHtml().xpath("//title")); //sc表示使用Readability技术抽取正文 - page.putField("content", page.getHtml().sc()); + page.putField("content", page.getHtml().smartContent()); } @Override From 6ede0e1b053808008339bb3d838514017e6156df Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 07:53:48 +0800 Subject: [PATCH 26/81] update Spider api --- .../main/java/us/codecraft/webmagic/Site.java | 114 
+++++++++++++++--- .../java/us/codecraft/webmagic/Spider.java | 45 +++---- ...ular.java => FileCacheQueueScheduler.java} | 4 +- ...ueueSchedular.java => QueueScheduler.java} | 2 +- .../{Schedular.java => Scheduler.java} | 2 +- .../us/codecraft/webmagic/selector/Html.java | 14 ++- .../webmagic/selector/PlainText.java | 4 + .../downloader/HttpClientDownloaderTest.java | 2 +- .../webmagic/samples/DianpingProcessor.java | 2 +- .../us/codecraft/webmagic/SpiderTest.java | 10 +- .../processor/DiandianProcessorTest.java | 6 +- .../processor/DiaoyuwengProcessorTest.java | 10 +- .../processor/SinablogProcessorTest.java | 10 +- 13 files changed, 161 insertions(+), 64 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/schedular/{FileCacheQueueSchedular.java => FileCacheQueueScheduler.java} (97%) rename webmagic-core/src/main/java/us/codecraft/webmagic/schedular/{QueueSchedular.java => QueueScheduler.java} (94%) rename webmagic-core/src/main/java/us/codecraft/webmagic/schedular/{Schedular.java => Scheduler.java} (90%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 5c208dd04..f9e0fd611 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -3,10 +3,12 @@ import java.util.*; /** - * Site定义一个待抓取的站点的各种信息。 + * Site定义一个待抓取的站点的各种信息。
      + * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。
      + * * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 下午12:13 + * Date: 13-4-21 + * Time: 下午12:13 */ public class Site { @@ -30,73 +32,157 @@ public class Site { DEFAULT_STATUS_CODE_SET.add(200); } + /** + * 创建一个Site对象,等价于new Site() + * + * @return 新建的对象 + */ public static Site me() { return new Site(); } - public Site setCookie(String name, String value) { + /** + * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的 + * + * @param name cookie的名称 + * @param value cookie的值 + * @return this + */ + public Site addCookie(String name, String value) { cookies.put(name, value); return this; } + /** + * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。 + * + * @param userAgent userAgent + * @return this + */ public Site setUserAgent(String userAgent) { this.userAgent = userAgent; return this; } + /** + * 获取已经设置的所有cookie + * + * @return 已经设置的所有cookie + */ public Map getCookies() { return cookies; } + /** + * 获取已设置的user-agent + * + * @return 已设置的user-agent + */ public String getUserAgent() { return userAgent; } + /** + * 获取已设置的domain + * + * @return + */ public String getDomain() { return domain; } + /** + * 设置这个站点所在域名,必须项。
      + * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。 + * + * @param domain 爬虫会抓取的域名 + * @return this + */ public Site setDomain(String domain) { this.domain = domain; return this; } - public String getEncoding() { - return encoding; - } - + /** + * 设置页面编码,若不设置则自动根据Html meta信息获取。
      + * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。
      + * + * @param encoding 编码格式,主要是"utf-8"、"gbk"两种 + * @return this + */ public Site setEncoding(String encoding) { this.encoding = encoding; return this; } - public Set getAcceptStatCode() { - return acceptStatCode; + /** + * 获取已设置的编码 + * + * @return 已设置的domain + */ + public String getEncoding() { + return encoding; } + /** + * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。
      + * 默认为200,正常情况下,无须设置此项。
      + * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。
      + * + * @param acceptStatCode 可接受的状态码 + * @return this + */ public Site setAcceptStatCode(Set acceptStatCode) { this.acceptStatCode = acceptStatCode; return this; } + /** + * 获取可接受的状态码 + * + * @return 可接受的状态码 + */ + public Set getAcceptStatCode() { + return acceptStatCode; + } + + /** + * 获取初始页面的地址列表 + * @return 初始页面的地址列表 + */ public List getStartUrls() { return startUrls; } + /** + * 增加初始页面的地址,可反复调用此方法增加多个初始地址。 + * @param startUrl 初始页面的地址 + * @return this + */ public Site addStartUrl(String startUrl) { this.startUrls.add(startUrl); return this; } - public int getSleepTime() { - return sleepTime; - } - + /** + * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。 + * + * @param sleepTime 单位毫秒 + * @return this + */ public Site setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } + /** + * 获取两次抓取之间的间隔 + * @return 两次抓取之间的间隔,单位毫秒 + */ + public int getSleepTime() { + return sleepTime; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 67e9c94d5..8c662eb4a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -7,13 +7,18 @@ import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.schedular.QueueSchedular; -import us.codecraft.webmagic.schedular.Schedular; +import us.codecraft.webmagic.schedular.QueueScheduler; +import us.codecraft.webmagic.schedular.Scheduler; import java.util.ArrayList; import java.util.List; /** + *
      + * webmagic爬虫的入口类。
      + *      示例:
      + *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
      + * 
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午6:53 @@ -32,18 +37,17 @@ public class Spider implements Runnable, Task { private String uuid; - private Schedular schedular = new QueueSchedular(); + private Scheduler scheduler = new QueueScheduler(); private Logger logger = Logger.getLogger(getClass()); - public static Spider me() { - return new Spider(); - } - - public Spider processor(PageProcessor pageProcessor) { + public Spider(PageProcessor pageProcessor){ this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); - return this; + } + + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); } public Spider startUrls(List startUrls) { @@ -57,8 +61,13 @@ public Spider startUrl(String startUrl) { return this; } - public Spider schedular(Schedular schedular) { - this.schedular = schedular; + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; + } + + public Spider schedular(Scheduler scheduler) { + this.scheduler = scheduler; return this; } @@ -71,9 +80,9 @@ public Spider pipeline(Pipeline pipeline) { @Override public void run() { for (String startUrl : startUrls) { - schedular.push(new Request(startUrl), this); + scheduler.push(new Request(startUrl), this); } - Request request = schedular.poll(this); + Request request = scheduler.poll(this); if (pipelines.isEmpty()) { pipelines.add(new ConsolePipeline()); } @@ -89,16 +98,10 @@ public void run() { pipeline.process(page, this); } sleep(site.getSleepTime()); - request = schedular.poll(this); + request = scheduler.poll(this); } } - public Spider setUUID(String uuid) { - this.uuid = uuid; - return this; - } - - private void sleep(int time) { try { Thread.sleep(time); @@ -110,7 +113,7 @@ private void sleep(int time) { private void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { - schedular.push(request, this); + scheduler.push(request, this); } } } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java index 0a93e52db..246f7e0a7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java @@ -20,7 +20,7 @@ * Date: 13-4-21 * Time: 下午1:13 */ -public class FileCacheQueueSchedular implements Schedular { +public class FileCacheQueueScheduler implements Scheduler { private Logger logger = Logger.getLogger(getClass()); @@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular { private Set urls; - public FileCacheQueueSchedular(String filePath) { + public FileCacheQueueScheduler(String filePath) { this.filePath = filePath; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java similarity index 94% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java index 20576fc7c..697688596 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java @@ -14,7 +14,7 @@ * Date: 13-4-21 * Time: 下午1:13 */ -public class QueueSchedular implements Schedular { +public class QueueScheduler implements Scheduler { private Logger logger = Logger.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java similarity index 90% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java index 8df776040..7e0213275 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java @@ -8,7 +8,7 @@ * Date: 13-4-21 * Time: 下午1:12 */ -public interface Schedular { +public interface Scheduler { public void push(Request request,Task task); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 3cc84f79e..0b3637267 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -5,8 +5,8 @@ /** * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 上午7:54 + * Date: 13-4-21 + * Time: 上午7:54 */ public class Html extends PlainText { @@ -18,12 +18,16 @@ public Html(String text) { super(text); } + public static Html create(String text) { + return new Html(text); + } + @Override protected Selectable select(Selector selector, List strings) { List results = new ArrayList(); for (String string : strings) { String result = selector.select(string); - if (result!=null){ + if (result != null) { results.add(result); } } @@ -43,13 +47,13 @@ protected Selectable selectList(Selector selector, List strings) { @Override public Selectable smartContent() { SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); - return select(smartContentSelector,strings); + return select(smartContentSelector, strings); } @Override public Selectable links() { XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); - return selectList(xpathSelector,strings); + return selectList(xpathSelector, strings); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 935ababa3..cedee6301 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -24,6 +24,10 @@ public PlainText(String text) { this.strings = results; } + public static PlainText create(String text) { + return new PlainText(text); + } + @Override public Selectable xpath(String xpath) { throw new UnsupportedOperationException(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 7f00e17c0..b2bcca2e5 100644 --- 
a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -15,7 +15,7 @@ public class HttpClientDownloaderTest { @Test public void testCookie() { - Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); + Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site); Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 7a211882e..c7233e8b2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -33,6 +33,6 @@ public Site getSite() { public static void main(String[] args) { DianpingProcessor dianpingProcessor = new DianpingProcessor(); - Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); + Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 681aac78e..39018d908 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor; -import 
us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
      @@ -18,7 +18,7 @@ public class SpiderTest { @Ignore @Test public void testSpider() throws InterruptedException { - Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); + Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline()); me.run(); } @@ -26,13 +26,13 @@ public void testSpider() throws InterruptedException { @Test public void testGlobalSpider(){ // PageProcessor pageProcessor = new MeicanProcessor(); -// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). +// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); System.out.println(pageProcessor2.getSite().getEncoding()); pageProcessor2.getSite().setSleepTime(500); - Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(pageProcessor2).run(); + Spider.create(pageProcessor2).pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
+ run(); } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index b87815c4d..00491d9f0 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiandianBlogProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import java.io.IOException; @@ -30,7 +30,7 @@ public void test() throws IOException { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(diaoyuwengProcessor).run(); + Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
+ run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 2b2caaca1..a189126c8 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -6,14 +6,14 @@ import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import java.io.IOException; /** * @author code4crafter@gmail.com
      - * Date: 13-6-9 - * Time: 上午8:02 + * Date: 13-6-9 + * Time: 上午8:02 */ public class DiaoyuwengProcessorTest { @@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest { public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(diaoyuwengProcessor).run(); + Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 9613c9e4e..4a2638328 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -6,14 +6,14 @@ import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.SinaBlogProcesser; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import java.io.IOException; /** * @author code4crafter@gmail.com
      - * Date: 13-6-9 - * Time: 上午8:02 + * Date: 13-6-9 + * Time: 上午8:02 */ public class SinablogProcessorTest { @@ -30,7 +30,7 @@ public void test() throws IOException { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(sinaBlogProcesser).run(); + Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + run(); } } From 2788ff3d28ffb557fb2cab45ad88981f284efab1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 07:54:55 +0800 Subject: [PATCH 27/81] fix spell error=.= --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 +- .../codecraft/webmagic/schedular/FileCacheQueueScheduler.java | 4 ++-- .../src/test/java/us/codecraft/webmagic/SpiderTest.java | 4 ++-- .../codecraft/webmagic/processor/DiandianProcessorTest.java | 2 +- .../codecraft/webmagic/processor/DiaoyuwengProcessorTest.java | 2 +- .../codecraft/webmagic/processor/SinablogProcessorTest.java | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 8c662eb4a..4c4943a12 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -66,7 +66,7 @@ public Spider setUUID(String uuid) { return this; } - public Spider schedular(Scheduler scheduler) { + public Spider scheduler(Scheduler scheduler) { this.scheduler = scheduler; return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java index 246f7e0a7..e0d2c9481 
100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java @@ -63,7 +63,7 @@ private void init(Task task) { initWriter(); initFlushThread(); inited.set(true); - logger.info("init cache schedular success"); + logger.info("init cache scheduler success"); } private void initFlushThread() { @@ -80,7 +80,7 @@ private void initWriter() { fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true)); fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false)); } catch (IOException e) { - throw new RuntimeException("init cache schedular error", e); + throw new RuntimeException("init cache scheduler error", e); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 39018d908..f2668f253 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -26,12 +26,12 @@ public void testSpider() throws InterruptedException { @Test public void testGlobalSpider(){ // PageProcessor pageProcessor = new MeicanProcessor(); -// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). +// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). 
// processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); System.out.println(pageProcessor2.getSite().getEncoding()); pageProcessor2.getSite().setSleepTime(500); - Spider.create(pageProcessor2).pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 00491d9f0..ddca9c07f 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -30,7 +30,7 @@ public void test() throws IOException { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index a189126c8..33bcf9c61 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest { public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 4a2638328..a0160e186 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -30,7 +30,7 @@ public void test() throws IOException { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
run(); } } From 763b7fe509531b5a9b6ac6facd2d0afd261ebeba Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 08:21:48 +0800 Subject: [PATCH 28/81] update javadoc --- .../java/us/codecraft/webmagic/Request.java | 1 + .../main/java/us/codecraft/webmagic/Site.java | 2 +- .../java/us/codecraft/webmagic/Spider.java | 58 +++++++++++++++---- .../main/java/us/codecraft/webmagic/Task.java | 5 ++ .../webmagic/samples/DianpingProcessor.java | 2 +- .../processor/DiandianProcessorTest.java | 1 + 6 files changed, 56 insertions(+), 13 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 37ede0dc4..42dd079fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -3,6 +3,7 @@ /** * Request对象封装了待抓取的url信息。
      * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
      + *
      * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。
      *
        *      Example:
      diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
      index f9e0fd611..423b0a6aa 100644
      --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
      +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
      @@ -85,7 +85,7 @@ public String getUserAgent() {
           /**
            * 获取已设置的domain
            *
      -     * @return
      +     * @return 已设置的domain
            */
           public String getDomain() {
               return domain;
      diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
      index 4c4943a12..dc0102ce8 100644
      --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
      +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
      @@ -15,9 +15,19 @@
       
       /**
        * 
      - * webmagic爬虫的入口类。
      - *      示例:
      + *webmagic爬虫的入口类。
      + *
      + *示例:
      + *定义一个最简单的爬虫:
        *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
      + *
      + *使用FilePipeline保存结果到文件:
      + *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
      + *          .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
      + *
      + *使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
      + *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
      + *          .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
        * 
      * @author code4crafter@gmail.com
      * Date: 13-4-21 @@ -41,36 +51,60 @@ public class Spider implements Runnable, Task { private Logger logger = Logger.getLogger(getClass()); - public Spider(PageProcessor pageProcessor){ + /** + * 使用已定义的抽取规则新建一个Spider。 + * @param pageProcessor 已定义的抽取规则 + */ + public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); + this.startUrls = pageProcessor.getSite().getStartUrls(); } + /** + * 使用已定义的抽取规则新建一个Spider。 + * @param pageProcessor 已定义的抽取规则 + * @return 新建的Spider + */ public static Spider create(PageProcessor pageProcessor) { return new Spider(pageProcessor); } + /** + * 重新设置startUrls,会覆盖Site本身的startUrls。 + * @param startUrls + * @return this + */ public Spider startUrls(List startUrls) { this.startUrls = startUrls; return this; } - public Spider startUrl(String startUrl) { - startUrls = new ArrayList(); - startUrls.add(startUrl); - return this; - } - + /** + * 为爬虫设置一个唯一ID,用于标志任务,默认情况下使用domain作为uuid,对于单domain多任务的情况,请为重复任务设置不同的ID。 + * @param uuid 唯一ID + * @return this + */ public Spider setUUID(String uuid) { this.uuid = uuid; return this; } + /** + * 设置调度器。调度器用于保存待抓取URL,并可以进行去重、同步、持久化等工作。默认情况下使用内存中的阻塞队列进行调度。 + * @param scheduler 调度器 + * @return this + */ public Spider scheduler(Scheduler scheduler) { this.scheduler = scheduler; return this; } + /** + * 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。 + * @param pipeline 处理管道 + * @return this + */ public Spider pipeline(Pipeline pipeline) { this.pipelines.add(pipeline); return this; @@ -79,8 +113,10 @@ public Spider pipeline(Pipeline pipeline) { @Override public void run() { - for (String startUrl : startUrls) { - scheduler.push(new Request(startUrl), this); + if (startUrls != null) { + for (String startUrl : startUrls) { + scheduler.push(new Request(startUrl), this); + } } Request request = scheduler.poll(this); if (pipelines.isEmpty()) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java index 2aab74a63..136b467d9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java @@ -1,12 +1,17 @@ package us.codecraft.webmagic; /** + * 抓取任务的抽象接口。
      * @author code4crafter@gmail.com
      * Date: 13-6-18 * Time: 下午2:57 */ public interface Task { + /** + * 返回唯一标志该任务的字符串,以区分不同任务。 + * @return uuid + */ public String getUUID(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index c7233e8b2..33ac3d786 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -33,6 +33,6 @@ public Site getSite() { public static void main(String[] args) { DianpingProcessor dianpingProcessor = new DianpingProcessor(); - Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); + Spider.create(dianpingProcessor).run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index ddca9c07f..13910b528 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -30,6 +30,7 @@ public void test() throws IOException { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 + Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
run(); } From 6c227d31f3a22e0af7433d39b0fc592cfc1f7ba4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 08:22:53 +0800 Subject: [PATCH 29/81] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2b687e6bc..4bd8c7f04 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参 以下是爬取oschina博客的一段代码: - Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); + Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); * ####可扩展#### 参考[`scrapy`](https://github.com/scrapy/scrapy)的设计,webmagic将爬虫的扩展点分为Processor、Schedular、Downloader、Pipeline三个模块,可以通过扩展这些接口实现强大的扩展功能。如可以通过多个Spider实现多线程抓取;可以通过扩展Schedular实现断点续传乃至于分布式爬虫;可以通过扩展Pipeline实现业务可定制的持久化功能。 @@ -75,7 +75,7 @@ webmagic定制的核心是PageProcessor接口。 调用这个爬虫的代码如下: - Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); + Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); ### 示例 From ba3e90a71a972028c8c4ed4124da43a837919274 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 17:39:06 +0800 Subject: [PATCH 30/81] update api to support jdk 1.6 --- .../codecraft/webmagic/schedular/FileCacheQueueScheduler.java | 2 +- .../java/us/codecraft/webmagic/selector/SelectorFactory.java | 2 +- webmagic-plugin/pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java index e0d2c9481..1f5298a00 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java @@ -117,7 +117,7 
@@ private void readCursorFile() throws IOException { } private String getFileName(String filename) { - return filePath + task.getUUID() + "/" + filename; + return filePath + task.getUUID() + filename; } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 3c87ac9c4..2fa78d323 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -70,7 +70,7 @@ public T newSelector(Class clazz, String... param) { } else { throw new UnsupportedOperationException(); } - } catch (ReflectiveOperationException e) { + } catch (Exception e) { throw new IllegalArgumentException("init object error", e); } } diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index c2a08adb4..0a37a30ce 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -23,7 +23,7 @@ org.freemarker freemarker - 2.3.19 + 2.3.15 From de09d312d820e9e5bc221a78bb055c370055f17b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 17:43:33 +0800 Subject: [PATCH 31/81] bugfix --- webmagic-samples/src/main/resources/ftl/wordpress.ftl | 6 +++--- .../codecraft/webmagic/processor/DiandianProcessorTest.java | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/webmagic-samples/src/main/resources/ftl/wordpress.ftl b/webmagic-samples/src/main/resources/ftl/wordpress.ftl index c2442ab62..f2feeb16c 100644 --- a/webmagic-samples/src/main/resources/ftl/wordpress.ftl +++ b/webmagic-samples/src/main/resources/ftl/wordpress.ftl @@ -1,13 +1,13 @@ ${title} - http://127.0.0.1/wordpress/?p=${uuid} + http://127.0.0.1/wordpress/?p=${id} ${date} admin - http://127.0.0.1/wordpress/?p=${uuid} + http://127.0.0.1/wordpress/?p=${id} - ${uuid} + ${id} ${date} ${date} open diff --git 
a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 13910b528..3402479de 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -17,7 +17,6 @@ */ public class DiandianProcessorTest { - @Ignore @Test public void test() throws IOException { DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); From c387ef69415d1ba7a082b774da07c6c5fc0ffd21 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 17:53:07 +0800 Subject: [PATCH 32/81] ignore long time test --- .../us/codecraft/webmagic/processor/DiandianProcessorTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 3402479de..13910b528 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -17,6 +17,7 @@ */ public class DiandianProcessorTest { + @Ignore @Test public void test() throws IOException { DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); From 5a6d8a4b8777a18c58d81058175f2b813f274544 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 17:54:46 +0800 Subject: [PATCH 33/81] add jdk --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index be7dfb8ca..c7c99f406 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1 +1,4 @@ -language: java \ No newline at end of file +language: java +jdk: + - oraclejdk7 + - openjdk6 From af94f7d39a5174ac7aea211642ddccfad8109722 Mon Sep 17 00:00:00 
2001 From: "yihua.huang" Date: Thu, 20 Jun 2013 17:57:31 +0800 Subject: [PATCH 34/81] ignore unstable test --- .../codecraft/webmagic/downloader/HttpClientDownloaderTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index b2bcca2e5..5e83422ae 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.downloader; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -13,6 +14,7 @@ */ public class HttpClientDownloaderTest { + @Ignore @Test public void testCookie() { Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); From 68fd8a68a8a21078ec607a653d1d5e69f1bd5992 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 23 Jun 2013 16:57:01 +0800 Subject: [PATCH 35/81] add page skip --- .../src/main/java/us/codecraft/webmagic/Page.java | 10 ++++++++++ .../webmagic/pipeline/FreemarkerPipeline.java | 7 +++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 5bf5f26ed..8a167ac7b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -35,6 +35,16 @@ public class Page { private List targetRequests = new ArrayList(); + private boolean skip; + + public boolean isSkip() { + return skip; + } + + public void setSkip(boolean skip) { + this.skip = skip; + } + public Page() { } diff --git 
a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index f512f2628..8741ef498 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -14,8 +14,8 @@ /** * @author code4crafter@gmail.com
      - * Date: 13-6-8 - * Time: 下午9:00 + * Date: 13-6-8 + * Time: 下午9:00 */ public class FreemarkerPipeline implements Pipeline { @@ -40,6 +40,9 @@ public FreemarkerPipeline(String template) throws IOException { @Override public void process(Page page, Task task) { + if (page.isSkip()) { + return; + } String path = this.path + "" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { From 3b31c533e499fedc5167125018a3c7fa88464db8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 23 Jun 2013 17:05:10 +0800 Subject: [PATCH 36/81] add extra for page --- .../main/java/us/codecraft/webmagic/Page.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 8a167ac7b..5b1ceaf41 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -37,10 +37,20 @@ public class Page { private boolean skip; + private Object extra; + + /** + * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @return 是否忽略 true 忽略 + */ public boolean isSkip() { return skip; } + /** + * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @param skip 是否忽略 true 忽略 + */ public void setSkip(boolean skip) { this.skip = skip; } @@ -148,4 +158,22 @@ public Request getRequest() { public void setRequest(Request request) { this.request = request; } + + /** + * 获取附加对象 + * @param 对象类型 + * @return 对象内容 + */ + public T getExtra() { + return (T)extra; + } + + /** + * 设置附加对象 + * @param extra 对象内容 + * @param 对象类型 + */ + public void setExtra(T extra) { + this.extra = extra; + } } From cd5941fd60224694e4782b8f3f5d1442fa669075 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 23 Jun 2013 17:06:43 +0800 Subject: [PATCH 37/81] add extra --- .../java/us/codecraft/webmagic/pipeline/ConsolePipeline.java | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 866db9239..6c2abba8f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -19,5 +19,8 @@ public void process(Page page,Task task) { for (Map.Entry entry : page.getFields().entrySet()) { System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); } + if (page.getExtra()!=null){ + System.out.println(page.getExtra()); + } } } From c80ae654999aa7cf79c712247690a3a5c7df49cf Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 23 Jun 2013 18:56:31 +0800 Subject: [PATCH 38/81] add gzip support --- .../downloader/HttpClientDownloader.java | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index c1ecff3e8..1b628cd75 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,8 +1,11 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; +import org.apache.http.Header; +import org.apache.http.HeaderElement; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; +import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.methods.HttpGet; import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; @@ -26,15 +29,19 @@ public class HttpClientDownloader implements Downloader { public Page download(Request request, Site site) { logger.info("downloading page " + request.getUrl()); HttpClient httpClient = 
HttpClientPool.getInstance().getClient(site); + String encoding = site.getEncoding(); try { HttpGet httpGet = new HttpGet(request.getUrl()); HttpResponse httpResponse = httpClient.execute(httpGet); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { - if (site.getEncoding() == null){ + //charset + if (encoding == null){ String value = httpResponse.getEntity().getContentType().getValue(); site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString()); } + // + handleGzip(httpResponse); String content = IOUtils.toString(httpResponse.getEntity().getContent(), site.getEncoding()); Page page = new Page(); @@ -50,4 +57,17 @@ public Page download(Request request, Site site) { } return null; } + + private void handleGzip(HttpResponse httpResponse) { + Header ceheader = httpResponse.getEntity().getContentEncoding(); + if (ceheader != null) { + HeaderElement[] codecs = ceheader.getElements(); + for (int i = 0; i < codecs.length; i++) { + if (codecs[i].getName().equalsIgnoreCase("gzip")) { + httpResponse.setEntity( + new GzipDecompressingEntity(httpResponse.getEntity())); + } + } + } + } } From 16d581ca5f6432f6b1ebb7ca82bad4c6fc422232 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 23 Jun 2013 21:09:26 +0800 Subject: [PATCH 39/81] add multithread support --- .../java/us/codecraft/webmagic/Spider.java | 97 +++++++++++++++++-- .../codecraft/webmagic/utils/ThreadUtils.java | 33 +++++++ 2 files changed, 120 insertions(+), 10 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index dc0102ce8..709b6579c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,9 +9,12 @@ import 
us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.schedular.QueueScheduler; import us.codecraft.webmagic.schedular.Scheduler; +import us.codecraft.webmagic.utils.ThreadUtils; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicInteger; /** *
      @@ -51,6 +54,16 @@ public class Spider implements Runnable, Task {
       
           private Logger logger = Logger.getLogger(getClass());
       
      +    private ExecutorService executorService;
      +
      +    private AtomicInteger stat = new AtomicInteger(STAT_INIT);
      +
      +    private final static int STAT_INIT = 0;
      +
      +    private final static int STAT_RUNNING = 1;
      +
      +    private final static int STAT_STOPPED = 2;
      +
           /**
            * 使用已定义的抽取规则新建一个Spider。
            * @param pageProcessor 已定义的抽取规则
      @@ -76,6 +89,7 @@ public static Spider create(PageProcessor pageProcessor) {
            * @return this
            */
           public Spider startUrls(List startUrls) {
      +        checkIfNotRunning();
               this.startUrls = startUrls;
               return this;
           }
      @@ -96,6 +110,7 @@ public Spider setUUID(String uuid) {
            * @return this
            */
           public Spider scheduler(Scheduler scheduler) {
      +        checkIfNotRunning();
               this.scheduler = scheduler;
               return this;
           }
      @@ -106,6 +121,7 @@ public Spider scheduler(Scheduler scheduler) {
            * @return this
            */
           public Spider pipeline(Pipeline pipeline) {
      +        checkIfNotRunning();
               this.pipelines.add(pipeline);
               return this;
           }
      @@ -113,6 +129,9 @@ public Spider pipeline(Pipeline pipeline) {
       
           @Override
           public void run() {
      +        if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
      +            throw new IllegalStateException("Spider is already running!");
      +        }
               if (startUrls != null) {
                   for (String startUrl : startUrls) {
                       scheduler.push(new Request(startUrl), this);
      @@ -122,20 +141,56 @@ public void run() {
               if (pipelines.isEmpty()) {
                   pipelines.add(new ConsolePipeline());
               }
      -        while (request != null) {
      -            Page page = downloader.download(request, site);
      -            if (page == null) {
      -                sleep(site.getSleepTime());
      -                continue;
      +        //singel thread
      +        if (executorService==null){
      +            while (request != null) {
      +                processRequest(request);
      +                request = scheduler.poll(this);
                   }
      -            pageProcessor.process(page);
      -            addRequest(page);
      -            for (Pipeline pipeline : pipelines) {
      -                pipeline.process(page, this);
      +        } else {
      +            final AtomicInteger threadAlive = new AtomicInteger(0);
      +            while (true) {
      +                if (request == null) {
      +                    try {
      +                        Thread.sleep(100);
      +                    } catch (InterruptedException e) {
      +                    }
      +                } else {
      +                    final Request requestFinal = request;
      +                    threadAlive.incrementAndGet();
      +                    executorService.execute(new Runnable() {
      +                        @Override
      +                        public void run() {
      +                            processRequest(requestFinal);
      +                            threadAlive.decrementAndGet();
      +                        }
      +                    });
      +                }
      +                request = scheduler.poll(this);
      +                if (threadAlive.get() == 0) {
      +                    request = scheduler.poll(this);
      +                    if (request == null) {
      +                        break;
      +                    }
      +                }
                   }
      +            executorService.shutdown();
      +        }
      +        stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
      +    }
      +
      +    private void processRequest(Request request) {
      +        Page page = downloader.download(request, site);
      +        if (page == null) {
                   sleep(site.getSleepTime());
      -            request = scheduler.poll(this);
      +            return;
      +        }
      +        pageProcessor.process(page);
      +        addRequest(page);
      +        for (Pipeline pipeline : pipelines) {
      +            pipeline.process(page, this);
               }
      +        sleep(site.getSleepTime());
           }
       
           private void sleep(int time) {
      @@ -154,6 +209,28 @@ private void addRequest(Page page) {
               }
           }
       
      +    private void checkIfNotRunning(){
      +        if (!stat.compareAndSet(STAT_INIT,STAT_INIT)){
      +            throw new IllegalStateException("Spider is already running!");
      +        }
      +    }
      +
      +    /**
      +     * 建立多个线程下载
      +     * @param threadNum 线程数
      +     * @return
      +     */
      +    public Spider thread(int threadNum) {
      +        checkIfNotRunning();
      +        if (threadNum <= 1) {
      +            throw new IllegalArgumentException("threadNum should be more than one!");
      +        }
      +        synchronized (this){
      +            this.executorService = ThreadUtils.newFixedThreadPool(threadNum);
      +        }
      +        return this;
      +    }
      +
           @Override
           public String getUUID() {
               if (uuid != null) {
      diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java
      new file mode 100644
      index 000000000..ebe61198d
      --- /dev/null
      +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java
      @@ -0,0 +1,33 @@
      +package us.codecraft.webmagic.utils;
      +
      +import java.util.concurrent.ExecutorService;
      +import java.util.concurrent.LinkedBlockingQueue;
      +import java.util.concurrent.ThreadPoolExecutor;
      +import java.util.concurrent.TimeUnit;
      +
      +/**
      + * @author code4crafer@gmail.com
      + *         Date: 13-6-23
      + *         Time: 下午7:11
      + */
      +public class ThreadUtils {
      +
      +    public static ExecutorService newFixedThreadPool(int threadSize) {
      +        return new ThreadPoolExecutor(threadSize, threadSize, 0L, TimeUnit.MILLISECONDS,
      +                new LinkedBlockingQueue(1) {
      +
      +                    private static final long serialVersionUID = -9028058603126367678L;
      +
      +                    @Override
      +                    public boolean offer(Runnable e) {
      +                        try {
      +                            put(e);
      +                            return true;
      +                        } catch (InterruptedException ie) {
      +                            Thread.currentThread().interrupt();
      +                        }
      +                        return false;
      +                    }
      +                });
      +    }
      +}
      
      From 82ab749d14a890d9fe5ad7f130e4d748f6189b7d Mon Sep 17 00:00:00 2001
      From: "yihua.huang" 
      Date: Sun, 23 Jun 2013 22:16:04 +0800
      Subject: [PATCH 40/81] add runasync
      
      ---
       .../src/main/java/us/codecraft/webmagic/Spider.java   | 11 +++++++++++
       1 file changed, 11 insertions(+)
      
      diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
      index 709b6579c..a5e062436 100644
      --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
      +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
      @@ -215,6 +215,17 @@ private void checkIfNotRunning(){
               }
           }
       
      +    public void runAsync(){
      +        Thread thread = new Thread(){
      +            @Override
      +            public void run() {
      +                Spider.this.run();
      +            }
      +        };
      +        thread.setDaemon(false);
      +        thread.start();
      +    }
      +
           /**
            * 建立多个线程下载
            * @param threadNum 线程数
      
      From 323b2f22956e6e4dcf814d00141605a1bcceb261 Mon Sep 17 00:00:00 2001
      From: "yihua.huang" 
      Date: Mon, 24 Jun 2013 14:42:49 +0800
      Subject: [PATCH 41/81] add offline cache and process
      
      ---
       .../main/java/us/codecraft/webmagic/Site.java | 30 +++++--
       .../java/us/codecraft/webmagic/Spider.java    | 20 +++--
       .../main/java/us/codecraft/webmagic/Task.java |  6 ++
       .../webmagic/downloader/Downloader.java       |  6 +-
       .../webmagic/downloader/FileDownloader.java   | 88 +++++++++++++++++++
       .../downloader/HttpClientDownloader.java      | 12 +--
       .../webmagic/pipeline/FilePipeline.java       | 14 ++-
       .../us/codecraft/webmagic/utils/UrlUtils.java |  2 +-
       .../downloader/HttpClientDownloaderTest.java  |  2 +-
       .../webmagic/samples/DiaoyuwengProcessor.java |  2 +-
       .../webmagic/samples/GlobalProcessor.java     | 45 ++++++++++
       .../webmagic/samples/KaichibaProcessor.java   |  2 +-
       .../webmagic/samples/MeicanProcessor.java     |  2 +-
       .../us/codecraft/webmagic/SpiderTest.java     |  2 +-
       14 files changed, 196 insertions(+), 37 deletions(-)
       create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java
       create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java
      
      diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
      index 423b0a6aa..676584ad5 100644
      --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
      +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
      @@ -18,7 +18,7 @@ public class Site {
       
           private Map cookies = new LinkedHashMap();
       
      -    private String encoding;
      +    private String charset;
       
           private List startUrls = new ArrayList();
       
      @@ -107,11 +107,11 @@ public Site setDomain(String domain) {
            * 设置页面编码,若不设置则自动根据Html meta信息获取。
      * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。
      * - * @param encoding 编码格式,主要是"utf-8"、"gbk"两种 + * @param charset 编码格式,主要是"utf-8"、"gbk"两种 * @return this */ - public Site setEncoding(String encoding) { - this.encoding = encoding; + public Site setCharset(String charset) { + this.charset = charset; return this; } @@ -120,8 +120,8 @@ public Site setEncoding(String encoding) { * * @return 已设置的domain */ - public String getEncoding() { - return encoding; + public String getCharset() { + return charset; } /** @@ -194,18 +194,32 @@ public boolean equals(Object o) { return false; if (!domain.equals(site.domain)) return false; if (!startUrls.equals(site.startUrls)) return false; - if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false; + if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; return true; } + public Task toTask(){ + return new Task() { + @Override + public String getUUID() { + return Site.this.getDomain(); + } + + @Override + public Site getSite() { + return Site.this; + } + }; + } + @Override public int hashCode() { int result = domain.hashCode(); result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); - result = 31 * result + (encoding != null ? encoding.hashCode() : 0); + result = 31 * result + (charset != null ? charset.hashCode() : 0); result = 31 * result + (acceptStatCode != null ? 
acceptStatCode.hashCode() : 0); return result; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a5e062436..b2a2fa6b1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -126,6 +126,12 @@ public Spider pipeline(Pipeline pipeline) { return this; } + public Spider downloader(Downloader downloader) { + checkIfNotRunning(); + this.downloader = downloader; + return this; + } + @Override public void run() { @@ -180,7 +186,7 @@ public void run() { } private void processRequest(Request request) { - Page page = downloader.download(request, site); + Page page = downloader.download(request, this); if (page == null) { sleep(site.getSleepTime()); return; @@ -216,12 +222,7 @@ private void checkIfNotRunning(){ } public void runAsync(){ - Thread thread = new Thread(){ - @Override - public void run() { - Spider.this.run(); - } - }; + Thread thread = new Thread(this); thread.setDaemon(false); thread.start(); } @@ -252,4 +253,9 @@ public String getUUID() { } return null; } + + @Override + public Site getSite() { + return site; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java index 136b467d9..14c1d319b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java @@ -14,4 +14,10 @@ public interface Task { */ public String getUUID(); + /** + * 返回任务抓取的站点信息 + * @return site + */ + public Site getSite(); + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index e3ecff879..9a8bac110 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -2,7 +2,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。 @@ -16,8 +16,8 @@ public interface Downloader { * 下载页面,并保存信息到Page对象中。 * * @param request - * @param site + * @param task * @return page */ - public Page download(Request request, Site site); + public Page download(Request request, Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java new file mode 100644 index 000000000..b4a49ac37 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -0,0 +1,88 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +import java.io.*; + +/** + * @author code4crafer@gmail.com + * Date: 13-6-24 + * Time: 上午7:24 + */ +public class FileDownloader implements Downloader { + + private String path = "/data/temp/webmagic/"; + + private Downloader downloaderWhenFileMiss; + + private Logger logger = Logger.getLogger(getClass()); + + public FileDownloader() { + this("/data/temp/webmagic/", null); + } + + public FileDownloader(String path) { + this(path, null); + } + + public FileDownloader(String path, Downloader downloaderWhenFileMiss) { + this.path = path; + this.downloaderWhenFileMiss = downloaderWhenFileMiss; + } + + @Override + public Page download(Request request, Task task) { + String path = this.path + "/" + 
task.getUUID() + "/"; + Page page = null; + try { + final File file = new File(path + DigestUtils.md5Hex(request.getUrl())); + BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); + String line = null; + line = bufferedReader.readLine(); + if (line.equals("url:\t" + request.getUrl())) { + final String html = getHtml(bufferedReader); + page = new Page(); + page.setRequest(request); + page.setUrl(PlainText.create(request.getUrl())); + page.setHtml(Html.create(html)); + } + } catch (IOException e) { + if (e instanceof FileNotFoundException) { + logger.info("File not exist for url " + request.getUrl()); + } else { + logger.warn("File read error for url " + request.getUrl(), e); + } + } + if (page == null) { + page = downloadWhenMiss(request, task); + } + return page; + } + + private String getHtml(BufferedReader bufferedReader) throws IOException { + String line; + StringBuilder htmlBuilder= new StringBuilder(); + line = bufferedReader.readLine(); + line = StringUtils.removeStart(line, "html:\t"); + htmlBuilder.append(line); + while ((line=bufferedReader.readLine())!=null){ + htmlBuilder.append(line); + } + return htmlBuilder.toString(); + } + + private Page downloadWhenMiss(Request request, Task task) { + Page page = null; + if (downloaderWhenFileMiss != null) { + page = downloaderWhenFileMiss.download(request, task); + } + return page; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 1b628cd75..d2c2d62e7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -11,6 +11,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; 
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; @@ -26,24 +27,25 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); @Override - public Page download(Request request, Site site) { + public Page download(Request request, Task task) { + Site site = task.getSite(); logger.info("downloading page " + request.getUrl()); HttpClient httpClient = HttpClientPool.getInstance().getClient(site); - String encoding = site.getEncoding(); + String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); HttpResponse httpResponse = httpClient.execute(httpGet); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { //charset - if (encoding == null){ + if (charset == null){ String value = httpResponse.getEntity().getContentType().getValue(); - site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString()); + charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); } // handleGzip(httpResponse); String content = IOUtils.toString(httpResponse.getEntity().getContent(), - site.getEncoding()); + charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index c7cd9c56a..b079dcc40 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,15 +1,14 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; -import 
us.codecraft.webmagic.selector.Selectable; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; -import java.util.Map; /** * @author code4crafter@gmail.com
      @@ -20,6 +19,8 @@ public class FilePipeline implements Pipeline { private String path = "/data/temp/webmagic/"; + private Logger logger = Logger.getLogger(getClass()); + public FilePipeline() { } @@ -36,15 +37,12 @@ public void process(Page page, Task task) { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()))); printWriter.println("url:\t" + page.getUrl()); - for (Map.Entry entry : page.getFields().entrySet()) { - printWriter.println(entry.getKey() + ":\t" + entry.getValue().toStrings()); - } + printWriter.println("html:\t" + page.getHtml()); printWriter.close(); } catch (IOException e) { - e.printStackTrace(); + logger.warn("write file error",e); } - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 74e486c7f..667aaf25f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -75,7 +75,7 @@ public static String getDomain(String url) { return domain; } - private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"']*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); + private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); public static String fixAllRelativeHrefs(String html, String url) { StringBuilder stringBuilder = new StringBuilder(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 5e83422ae..936aece62 100644 --- 
a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -19,7 +19,7 @@ public class HttpClientDownloaderTest { public void testCookie() { Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); - Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site); + Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site.toTask()); Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 13ed2e115..695d2e2ab 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -34,7 +34,7 @@ public void process(Page page) { public Site getSite() { if (site==null){ site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); } return site; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java new file mode 100644 index 000000000..07f0101a1 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.FileDownloader; +import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; + +import java.util.List; + +/** + * Author yihua.huang@dianping.com + * Date: 13-6-24 + * Time: 下午2:12 + */ +public class GlobalProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + final List requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings(); + page.addTargetRequests(requests); + + } + + @Override + public Site getSite() { + if (site==null){ + site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new GlobalProcessor()).thread(10) + .scheduler(new FileCacheQueueScheduler("/data/webmagic/github")) + 
.downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader())) + .pipeline(new FilePipeline("/data/webmagic/douban")) + .run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 0a51b364e..aff18a6d3 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -21,7 +21,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). + return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index bd218113b..39f5723ec 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -27,7 +27,7 @@ public void process(Page page) { @Override public Site getSite() { - return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). + return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index f2668f253..76a423fbf 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -29,7 +29,7 @@ public void testGlobalSpider(){ // Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); - System.out.println(pageProcessor2.getSite().getEncoding()); + System.out.println(pageProcessor2.getSite().getCharset()); pageProcessor2.getSite().setSleepTime(500); Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
run(); From 4ce808573d97420425a8e48330fd4b8a9ed1cbe9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 18 Jul 2013 17:22:26 +0800 Subject: [PATCH 42/81] add retry --- .gitignore | 1 + webmagic-core/pom.xml | 2 +- .../main/java/us/codecraft/webmagic/Site.java | 19 +++++++ .../downloader/HttpClientDownloader.java | 28 ++++++++-- webmagic-samples/pom.xml | 32 +++++++++++ .../samples/DianpingIndexProcessor.java | 53 +++++++++++++++++++ .../webmagic/samples/DianpingProcessor.java | 30 ++++++----- .../webmagic/samples/GlobalProcessor.java | 13 +++-- .../webmagic/samples/GuoxueProcessor.java | 20 +++++++ .../processor/DiaoyuwengProcessorTest.java | 2 - 10 files changed, 175 insertions(+), 25 deletions(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java diff --git a/.gitignore b/.gitignore index 0af075f75..cd33b6188 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ target/* *.iml +out/ diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index c0ef6a16f..df482f725 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -12,7 +12,7 @@ org.apache.httpcomponents httpclient - 4.2.1 + 4.2.4 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 676584ad5..2c6118c75 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -24,6 +24,8 @@ public class Site { private int sleepTime = 3000; + private int retryTimes = 0; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; @@ -183,6 +185,23 @@ public int getSleepTime() { return sleepTime; } + /** + * 获取重新下载的次数,默认为0 + * @return 重新下载的次数 + */ + public int getRetryTimes() { + return retryTimes; + } + + /** + * 
设置获取重新下载的次数,默认为0 + * @return this + */ + public Site setRetryTimes(int retryTimes) { + this.retryTimes = retryTimes; + return this; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d2c2d62e7..e4ae0ff6e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -16,11 +16,13 @@ import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; +import java.io.IOException; + /** * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 下午12:15 + * Date: 13-4-21 + * Time: 下午12:15 */ public class HttpClientDownloader implements Downloader { @@ -34,11 +36,27 @@ public Page download(Request request, Task task) { String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); - HttpResponse httpResponse = httpClient.execute(httpGet); + HttpResponse httpResponse = null; + int tried = 0; + boolean retry; + do { + try { + httpResponse = httpClient.execute(httpGet); + retry = false; + } catch (IOException e) { + tried++; + if (tried > site.getRetryTimes()) { + logger.warn("download page " + request.getUrl() + " error", e); + return null; + } + logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!"); + retry = true; + } + } while (retry); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { //charset - if (charset == null){ + if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); } @@ -52,7 +70,7 @@ public Page download(Request request, Task task) { page.setRequest(request); return page; } else { - logger.warn("code error " + statusCode); + logger.warn("code error " + statusCode + "\t" + request.getUrl()); } } catch (Exception e) { logger.warn("download page " + request.getUrl() + " error", e); diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 4e345a286..f1f680667 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -39,6 +39,25 @@ 1.6 + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + + + + org.apache.maven.plugins maven-resources-plugin @@ -70,6 +89,19 @@ + + org.apache.maven.plugins + maven-jar-plugin + + + + true + ./lib/ + us.codecraft.webmagic.samples.DianpingIndexProcessor + + + + 
org.apache.maven.plugins maven-release-plugin diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java new file mode 100644 index 000000000..1f5da5187 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author code4crafter@gmail.com
      + * Date: 13-4-21 Time: 下午8:08 + */ +public class DianpingIndexProcessor implements PageProcessor { + @Override + public void process(Page page) { + if (page.getUrl().toString().equals("http://www.dianping.com/citylist")) { + page.addTargetRequests(page.getHtml().links().regex("http://www\\.dianping\\.com/\\w+$").toStrings()); + return; + } + Pattern p = Pattern.compile("http://www\\.dianping\\.com/\\w+"); + Matcher matcher = p.matcher(page.getUrl().toString()); + if (matcher.matches()) { + page.addTargetRequests(page.getHtml().xpath("//li[@class='term-list-item']//a/@href").regex("http://www\\.dianping\\.com/search/.*").toStrings()); + } else { + p = Pattern.compile("http://www\\.dianping\\.com/search/.*"); + matcher = p.matcher(page.getUrl().toString()); + if (matcher.matches()) { + String result = page.getHtml().regex("您要查看的内容不存在").toString(); + if (result != null) { + System.err.println("No!Url not exist!" + page.getUrl()); + } + } + } + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/citylist") + .setSleepTime(0).setUserAgent("I'm a performance tester created by yihua.huang"); + } + + public static void main(String[] args) { + int sleepTime = 0; + if (args.length > 0) { + sleepTime = Integer.parseInt(args[0]); + } + DianpingIndexProcessor dianpingProcessor = new DianpingIndexProcessor(); + dianpingProcessor.getSite().setSleepTime(sleepTime); + Spider.create(dianpingProcessor).thread(10).run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 33ac3d786..056da0aae 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.samples; -import 
us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; @@ -9,30 +9,36 @@ /** * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 下午8:08 + * Date: 13-4-21 + * Time: 下午8:08 */ public class DianpingProcessor implements PageProcessor { + + private Site site; + @Override public void process(Page page) { - List requests = page.getHtml().links().regex(".*shop.*").toStrings(); + List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().regex(".*search/category/.*").toStrings(); - page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")) { - page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().smartContent()); - } } @Override public Site getSite() { - return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + if (site == null) { + site = Site.me().setDomain("info-search-web361.alpha.dp:8080").addStartUrl("http://info11-search-web361.alpha.dp:8080/search/category/1/0"). + setSleepTime(100). 
+ setUserAgent("I'm a performance tester created by yihua.huang"); + } + return site; } public static void main(String[] args) { + int sleepTime = 0; + if (args.length > 0) { + sleepTime = Integer.parseInt(args[0]); + } DianpingProcessor dianpingProcessor = new DianpingProcessor(); + dianpingProcessor.getSite().setSleepTime(sleepTime).setRetryTimes(10); Spider.create(dianpingProcessor).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index 07f0101a1..383422f62 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -22,7 +22,7 @@ public class GlobalProcessor implements PageProcessor { @Override public void process(Page page) { - final List requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings(); + final List requests = page.getHtml().links().toStrings(); page.addTargetRequests(requests); } @@ -30,16 +30,19 @@ public void process(Page page) { @Override public Site getSite() { if (site==null){ - site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + site = Site.me().setDomain("www.2345.com") + .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/") + .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3") + .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; } public static void main(String[] args) { Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new 
FileCacheQueueScheduler("/data/webmagic/github")) - .downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader())) - .pipeline(new FilePipeline("/data/webmagic/douban")) + .scheduler(new FileCacheQueueScheduler("/data/webmagic/test")) + .downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader())) + .pipeline(new FilePipeline("/data/webmagic/test")) .run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java new file mode 100644 index 000000000..54d995e59 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.SimplePageProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-14
      + * Time: 上午8:33
      + */ +public class GuoxueProcessor { + + public static void main(String[] args) { + SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*"); + simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500); + Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run(); + } +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 33bcf9c61..5680d1237 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.processor; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -17,7 +16,6 @@ */ public class DiaoyuwengProcessorTest { - @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); From 0ac2bded860e5505b371bf95d9f51ce37a8a7874 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 18 Jul 2013 18:12:16 +0800 Subject: [PATCH 43/81] remove --- .gitignore | 1 + .../samples/DianpingIndexProcessor.java | 53 ------------------- 2 files changed, 1 insertion(+), 53 deletions(-) delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java diff --git a/.gitignore b/.gitignore index cd33b6188..c28a71a54 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ target/* *.iml out/ +Dianping*.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java 
deleted file mode 100644 index 1f5da5187..000000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java +++ /dev/null @@ -1,53 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * @author code4crafter@gmail.com
      - * Date: 13-4-21 Time: 下午8:08 - */ -public class DianpingIndexProcessor implements PageProcessor { - @Override - public void process(Page page) { - if (page.getUrl().toString().equals("http://www.dianping.com/citylist")) { - page.addTargetRequests(page.getHtml().links().regex("http://www\\.dianping\\.com/\\w+$").toStrings()); - return; - } - Pattern p = Pattern.compile("http://www\\.dianping\\.com/\\w+"); - Matcher matcher = p.matcher(page.getUrl().toString()); - if (matcher.matches()) { - page.addTargetRequests(page.getHtml().xpath("//li[@class='term-list-item']//a/@href").regex("http://www\\.dianping\\.com/search/.*").toStrings()); - } else { - p = Pattern.compile("http://www\\.dianping\\.com/search/.*"); - matcher = p.matcher(page.getUrl().toString()); - if (matcher.matches()) { - String result = page.getHtml().regex("您要查看的内容不存在").toString(); - if (result != null) { - System.err.println("No!Url not exist!" + page.getUrl()); - } - } - } - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/citylist") - .setSleepTime(0).setUserAgent("I'm a performance tester created by yihua.huang"); - } - - public static void main(String[] args) { - int sleepTime = 0; - if (args.length > 0) { - sleepTime = Integer.parseInt(args[0]); - } - DianpingIndexProcessor dianpingProcessor = new DianpingIndexProcessor(); - dianpingProcessor.getSite().setSleepTime(sleepTime); - Spider.create(dianpingProcessor).thread(10).run(); - } -} From 9e391fbe05d286e5bc6f01e2e2725a6e45ebe9cb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 18 Jul 2013 18:23:15 +0800 Subject: [PATCH 44/81] ignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index c28a71a54..cd33b6188 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ target/* *.iml out/ -Dianping*.java From 5708b6e9c325e703d866fbbd68d9cb2e288bf364 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 19 Jul 2013 
12:34:22 +0800 Subject: [PATCH 45/81] +sina blog --- .../webmagic/selector/XpathSelectorTest.java | 1 + .../codecraft/webmagic/samples/SinaBlogProcesser.java | 11 ++++++++--- .../webmagic/processor/DiaoyuwengProcessorTest.java | 2 ++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index bebbb83dc..e13b80978 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1351,6 +1351,7 @@ public void test() { public void testOschina() { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); + System.out.println(html1.regex("(.*?)").links().toStrings()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index bb77931fe..baa375d81 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -1,7 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -15,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor { @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().links().regex("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); + page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings()); page.putField("title", 
page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); @@ -26,9 +27,13 @@ public void process(Page page) { @Override public Site getSite() { if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000). + site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; } + + public static void main(String[] args) { + Spider.create(new SinaBlogProcesser()).run(); + } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 5680d1237..33bcf9c61 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -16,6 +17,7 @@ */ public class DiaoyuwengProcessorTest { + @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); From a5b58bc8fb091863287a4e3cda4d44fa27e5a12b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 20 Jul 2013 08:34:18 +0800 Subject: [PATCH 46/81] invite jsoup and cssselector --- webmagic-core/pom.xml | 6 +++ .../java/us/codecraft/webmagic/Spider.java | 2 + .../webmagic/selector/CssSelector.java | 47 
+++++++++++++++++++ .../us/codecraft/webmagic/selector/Html.java | 6 +++ .../webmagic/selector/PlainText.java | 5 ++ .../webmagic/selector/Selectable.java | 8 ++++ .../java/us/codecraft/webmagic/HtmlTest.java | 1 + .../webmagic/selector/XpathSelectorTest.java | 2 +- 8 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index df482f725..7d787aa85 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -52,6 +52,12 @@ 2.4
      + + org.jsoup + jsoup + 1.7.2 + + org.apache.commons commons-io diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index b2a2fa6b1..1288ff8ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -154,9 +154,11 @@ public void run() { request = scheduler.poll(this); } } else { + //multi thread final AtomicInteger threadAlive = new AtomicInteger(0); while (true) { if (request == null) { + //when no request found but some thread is alive, sleep a while. try { Thread.sleep(100); } catch (InterruptedException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java new file mode 100644 index 000000000..c2d654af5 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.selector; + +import org.apache.commons.collections.CollectionUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
      + * Date: 13-4-21 + * Time: 上午9:39 + */ +public class CssSelector implements Selector { + + private String selectorText; + + public CssSelector(String selectorText) { + this.selectorText = selectorText; + } + + @Override + public String select(String text) { + Document doc = Jsoup.parse(text); + Elements elements = doc.select(selectorText); + if (CollectionUtils.isNotEmpty(elements)) { + return null; + } + return elements.get(0).outerHtml(); + } + + @Override + public List selectList(String text) { + List strings = new ArrayList(); + Document doc = Jsoup.parse(text); + Elements elements = doc.select(selectorText); + if (CollectionUtils.isNotEmpty(elements)) { + for (Element element : elements) { + strings.add(element.outerHtml()); + } + } + return strings; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 0b3637267..099f50746 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -62,4 +62,10 @@ public Selectable xpath(String xpath) { return selectList(xpathSelector, strings); } + @Override + public Selectable $(String selector) { + CssSelector cssSelector = new CssSelector(selector); + return selectList(cssSelector,strings); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index cedee6301..0137de814 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -33,6 +33,11 @@ public Selectable xpath(String xpath) { throw new UnsupportedOperationException(); } + @Override + public Selectable $(String selector) { + throw new UnsupportedOperationException(); + } + @Override public Selectable smartContent() { throw new 
UnsupportedOperationException(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 630808d3b..f4aa9a530 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -17,6 +17,14 @@ public interface Selectable { */ public Selectable xpath(String xpath); + /** + * select list with jquery selector + * + * @param + * @return + */ + public Selectable $(String selector); + /** * select smart content with ReadAbility algorithm * diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index fcdbfeffa..c90001460 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -17,4 +17,5 @@ public void testRegexSelector() { Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index e13b80978..3ef0a9279 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1351,7 +1351,7 @@ public void test() { public void testOschina() { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); - System.out.println(html1.regex("(.*?)").links().toStrings()); + Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings()); } } From 74cd7cab5c54b07c6eec5c0b0e6fea879d49afe5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 18:26:54 +0800 Subject: [PATCH 
47/81] update java doc --- .../src/main/java/us/codecraft/webmagic/Page.java | 4 ++-- .../webmagic/downloader/HttpClientDownloader.java | 6 +++--- .../webmagic/pipeline/ConsolePipeline.java | 1 + .../codecraft/webmagic/pipeline/FilePipeline.java | 8 ++++++++ .../us/codecraft/webmagic/pipeline/Pipeline.java | 1 + .../webmagic/processor/PageProcessor.java | 6 ++++-- .../webmagic/processor/SimplePageProcessor.java | 2 ++ .../schedular/FileCacheQueueScheduler.java | 4 +++- .../webmagic/schedular/QueueScheduler.java | 1 + .../us/codecraft/webmagic/schedular/Scheduler.java | 12 ++++++++++++ .../us/codecraft/webmagic/schedular/package.html | 2 +- .../codecraft/webmagic/selector/CssSelector.java | 1 + .../codecraft/webmagic/selector/RegexResult.java | 1 + .../us/codecraft/webmagic/selector/Selectable.java | 2 +- .../codecraft/webmagic/selector/XpathSelector.java | 9 +++++---- .../java/us/codecraft/webmagic/utils/UrlUtils.java | 14 ++++++++++---- .../us/codecraft/webmagic/utils/UrlUtilsTest.java | 11 +++++------ .../codecraft/webmagic/FreemarkerPipelineTest.java | 2 +- 18 files changed, 62 insertions(+), 25 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 5b1ceaf41..b2dd3db71 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -101,7 +101,7 @@ public void addTargetRequests(List requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { break; } - s = UrlUtils.fixRelativeUrl(s, url.toString()); + s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s)); } } @@ -116,7 +116,7 @@ public void addTargetRequest(String requestString) { return; } synchronized (targetRequests) { - requestString = UrlUtils.fixRelativeUrl(requestString, url.toString()); + requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); 
targetRequests.add(new Request(requestString)); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index e4ae0ff6e..ac3ea0fb0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -58,7 +58,7 @@ public Page download(Request request, Task task) { //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); - charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); + charset = UrlUtils.getCharset(value); } // handleGzip(httpResponse); @@ -82,8 +82,8 @@ private void handleGzip(HttpResponse httpResponse) { Header ceheader = httpResponse.getEntity().getContentEncoding(); if (ceheader != null) { HeaderElement[] codecs = ceheader.getElements(); - for (int i = 0; i < codecs.length; i++) { - if (codecs[i].getName().equalsIgnoreCase("gzip")) { + for (HeaderElement codec : codecs) { + if (codec.getName().equalsIgnoreCase("gzip")) { httpResponse.setEntity( new GzipDecompressingEntity(httpResponse.getEntity())); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 6c2abba8f..dff2ded75 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -7,6 +7,7 @@ import java.util.Map; /** + * 命令行输出抽取结果。可用于测试。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:45 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index b079dcc40..e48e2bb44 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -11,6 +11,7 @@ import java.io.PrintWriter; /** + * 持久化到文件的接口。 * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午6:28 @@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline { private Logger logger = Logger.getLogger(getClass()); + /** + * 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/" + */ public FilePipeline() { } + /** + * 新建一个FilePipeline + * @param path 文件保存路径 + */ public FilePipeline(String path) { this.path = path; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index 1be447c45..408392d9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -4,6 +4,7 @@ import us.codecraft.webmagic.Task; /** + * Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。 * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:39 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index c36ae980d..3963d0805 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,6 +4,8 @@ import us.codecraft.webmagic.Site; /** + * 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。
      + * extends the class to implements various spiders.
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午11:42 @@ -11,13 +13,13 @@ public interface PageProcessor { /** - * extends the class to implements variaty spiders + * 定义如何处理页面,包括链接提取、内容抽取等。 * @param page */ public void process(Page page); /** - * the site the processor for + * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。 * @return site */ public Site getSite(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 0d5244629..47d3748b9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -7,6 +7,7 @@ import java.util.List; /** + * 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。
      * @author code4crafter@gmail.com
      * Date: 13-4-22 * Time: 下午9:15 @@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor { public SimplePageProcessor(String startUrl, String urlPattern) { this.site = Site.me().addStartUrl(startUrl). setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); + //compile "*" expression to regex this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java index 1f5298a00..77a6c0b35 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java @@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** + * 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:13 @@ -91,6 +92,7 @@ private void readFile() { readCursorFile(); readUrlFile(); } catch (IOException e) { + logger.error("init file error",e); } } @@ -109,7 +111,7 @@ private void readUrlFile() throws IOException { private void readCursorFile() throws IOException { BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); - String line = null; + String line; //read the last number while ((line = fileCursorReader.readLine()) != null) { cursor = new AtomicInteger(NumberUtils.toInt(line)); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java index 697688596..613e4062c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java @@ -10,6 +10,7 @@ import java.util.concurrent.LinkedBlockingQueue; /** + * 内存队列实现的线程安全Scheduler。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:13 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java index 7e0213275..bf440baf8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java @@ -4,14 +4,26 @@ import us.codecraft.webmagic.Task; /** + * 包含url管理和调度的接口。包括url抓取队列,url去重等功能。
      + * Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:12 */ public interface Scheduler { + /** + * 加入一个待抓取的链接 + * @param request 待抓取的链接 + * @param task 定义的任务,以满足单Scheduler多Task的情况 + */ public void push(Request request,Task task); + /** + * 返回下一个要抓取的链接 + * @param task 定义的任务,以满足单Scheduler多Task的情况 + * @return + */ public Request poll(Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html index 0e35610fe..7887dd536 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html @@ -1,5 +1,5 @@ -包含url管理和调度的接口Schedular及它的几个实现类。 +包含url管理和调度的接口Scheduler及它的几个实现类。 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index c2d654af5..10dfb6230 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -10,6 +10,7 @@ import java.util.List; /** + * css风格的选择器。包装了Jsoup。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午9:39 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java index 9f4e2f060..04467bcce 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; /** + * 封装正则表达式抽取接口的类。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:39 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index f4aa9a530..932115cdf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -18,7 +18,7 @@ public interface Selectable { public Selectable xpath(String xpath); /** - * select list with jquery selector + * select list with css selector * * @param * @return diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index c2b408eb3..02afe2912 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -6,6 +6,7 @@ import java.util.List; /** + * xpath的选择器。包装了HtmlCleaner。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午9:39 @@ -52,12 +53,12 @@ public List selectList(String text) { try { Object[] objects = tagNode.evaluateXPath(xpathStr); if (objects != null && objects.length >= 1) { - for (int i = 0; i < objects.length; i++) { - if (objects[i] instanceof TagNode) { - TagNode tagNode1 = (TagNode) objects[i]; + for (Object object : objects) { + if (object instanceof TagNode) { + TagNode tagNode1 = (TagNode) object; results.add(htmlCleaner.getInnerHtml(tagNode1)); } else { - results.add(objects[i].toString()); + results.add(object.toString()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 667aaf25f..0b7201d61 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -14,7 +14,13 @@ public class UrlUtils { private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/"); - public static String fixRelativeUrl(String url, String refer) { + /** + * 将url想对地址转化为绝对地址 + * @param url url地址 + * @param refer url地址来自哪个页面 + * @return + */ + public static String canonicalizeUrl(String url, String refer) { if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { return url; } @@ -62,12 +68,12 @@ public static String getHost(String url) { private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); - public static String removeProtocal(String url) { + public static String removeProtocol(String url) { return patternForProtocal.matcher(url).replaceAll(""); } public static String getDomain(String url) { - String domain = removeProtocal(url); + String domain = removeProtocol(url); int i = StringUtils.indexOf(domain, "/", 1); if (i > 0) { domain = StringUtils.substring(domain, 0, i); @@ -84,7 +90,7 @@ public static String fixAllRelativeHrefs(String html, String url) { while (matcher.find()) { 
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); stringBuilder.append(matcher.group(1)); - stringBuilder.append("\"" + fixRelativeUrl(matcher.group(2), url) + "\""); + stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } stringBuilder.append(StringUtils.substring(html, lastEnd)); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 4cfdc046b..cd55b2c77 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -12,18 +12,18 @@ public class UrlUtilsTest { @Test public void testFixRelativeUrl() { - String fixrelativeurl = UrlUtils.fixRelativeUrl("aa", "http://www.dianping.com/sh/ss/com"); + String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com"); System.out.println("fix: " + fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("..aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = 
UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); // fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com"); // System.out.println("fix: " + fixrelativeurl); @@ -628,7 +628,6 @@ public void testFixRelativeHtml(){ "\t\t\t\n" + "\n"; String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/"); - String text = "订阅虎嗅"; Assert.assertTrue(html.contains(" Date: Wed, 24 Jul 2013 19:49:00 +0800 Subject: [PATCH 48/81] update java docs --- .../main/java/us/codecraft/webmagic/schedular/Scheduler.java | 2 +- .../src/main/java/us/codecraft/webmagic/selector/Html.java | 1 + .../main/java/us/codecraft/webmagic/selector/PlainText.java | 1 + .../java/us/codecraft/webmagic/selector/RegexSelector.java | 1 + .../java/us/codecraft/webmagic/selector/ReplaceSelector.java | 1 + .../main/java/us/codecraft/webmagic/selector/Selectable.java | 5 +++-- .../main/java/us/codecraft/webmagic/selector/Selector.java | 1 + .../java/us/codecraft/webmagic/selector/SelectorFactory.java | 1 + .../main/java/us/codecraft/webmagic/utils/ThreadUtils.java | 1 + .../src/main/java/us/codecraft/webmagic/utils/UrlUtils.java | 3 ++- 10 files changed, 13 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java index bf440baf8..8d9649be1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java @@ -22,7 +22,7 @@ public interface Scheduler { /** * 返回下一个要抓取的链接 * @param task 定义的任务,以满足单Scheduler多Task的情况 - * @return + * @return 下一个要抓取的链接 */ public Request poll(Task task); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 099f50746..114eef996 100644 
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -4,6 +4,7 @@ import java.util.List; /** + * 可抽取的html文本。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:54 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 0137de814..9e8d19418 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -6,6 +6,7 @@ import java.util.List; /** + * 可抽取的纯文本,不包括xpath和css selector实现。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:54 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 49fbffd0b..e95138b7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -9,6 +9,7 @@ import java.util.regex.PatternSyntaxException; /** + * 正则表达式抽取器。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:09 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java index 1ce7c4d59..38b95f787 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java @@ -6,6 +6,7 @@ import java.util.regex.PatternSyntaxException; /** + * 对文本进行替换。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:09 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 932115cdf..1b0ba10a3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -3,6 +3,7 @@ import java.util.List; /** + * 可进行抽取的文本。
      * @author code4crafter@gmail.com
      * Date: 13-4-20 * Time: 下午7:51 @@ -20,8 +21,8 @@ public interface Selectable { /** * select list with css selector * - * @param - * @return + * @param selector css selector expression + * @return new Selectable after extract */ public Selectable $(String selector); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index f7771cfbd..845c0b6c5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -3,6 +3,7 @@ import java.util.List; /** + * 抽取器。
      * @author code4crafter@gmail.com
      * Date: 13-4-20 * Time: 下午8:02 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 2fa78d323..1dd56e01c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -7,6 +7,7 @@ import java.util.concurrent.ConcurrentHashMap; /** + * 产生selector的工厂。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 上午7:56 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java index ebe61198d..d6876c719 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java @@ -6,6 +6,7 @@ import java.util.concurrent.TimeUnit; /** + * 线程工具类。
      * @author code4crafer@gmail.com * Date: 13-6-23 * Time: 下午7:11 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 0b7201d61..9f038bc87 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -6,6 +6,7 @@ import java.util.regex.Pattern; /** + * url及html处理工具类。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:52 @@ -18,7 +19,7 @@ public class UrlUtils { * 将url想对地址转化为绝对地址 * @param url url地址 * @param refer url地址来自哪个页面 - * @return + * @return url绝对地址 */ public static String canonicalizeUrl(String url, String refer) { if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { From 68059fef69dd8b1d277d30c603ed18d7cbeb113b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 20:38:49 +0800 Subject: [PATCH 49/81] update java docs --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 +- .../main/java/us/codecraft/webmagic/downloader/Downloader.java | 2 +- .../java/us/codecraft/webmagic/downloader/FileDownloader.java | 1 + .../us/codecraft/webmagic/downloader/HttpClientDownloader.java | 1 + .../codecraft/webmagic/schedular/FileCacheQueueScheduler.java | 2 +- 5 files changed, 5 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 1288ff8ed..57e29b18d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -232,7 +232,7 @@ public void runAsync(){ /** * 建立多个线程下载 * @param threadNum 线程数 - * @return + * @return this */ public Spider thread(int threadNum) { checkIfNotRunning(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index 9a8bac110..c431fc3b4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Task; /** - * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。 + * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午12:14 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java index b4a49ac37..d22bf0815 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -12,6 +12,7 @@ import java.io.*; /** + * 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。
      * @author code4crafer@gmail.com * Date: 13-6-24 * Time: 上午7:24 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index ac3ea0fb0..7eb627724 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -20,6 +20,7 @@ /** + * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午12:15 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java index 77a6c0b35..f5393a33e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java @@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** - * 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
      + * 磁盘文件实现的url管理模块,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
      * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午1:13 From d998312db72b29daa9e6afa31d902423ffad76b8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 20:45:45 +0800 Subject: [PATCH 50/81] update version --- webmagic-core/pom.xml | 2 +- webmagic-plugin/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 7d787aa85..b0de214cd 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.0.1-SNAPSHOT + 0.1.0 4.0.0 webmagic-core diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 0a37a30ce..9add44714 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -3,7 +3,7 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.0.1-SNAPSHOT + 0.1.0 4.0.0 webmagic-plugin diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index f1f680667..a921c454e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -4,7 +4,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.0.1-SNAPSHOT + 0.1.0 4.0.0 webmagic-samples From 9de17009430f350bc151d1e1538cecf2830ec6fa Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 20:48:58 +0800 Subject: [PATCH 51/81] update docs --- webmagic-plugin/pom.xml | 2 +- webmagic-samples/pom.xml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 9add44714..6eb7d615e 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -12,7 +12,7 @@ us.codecraft webmagic-core - 0.0.1-SNAPSHOT + 0.1.0 junit diff --git 
a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a921c454e..ac2092f6a 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -13,12 +13,12 @@ us.codecraft webmagic-core - 0.0.1-SNAPSHOT + 0.1.0 us.codecraft webmagic-plugin - 0.0.1-SNAPSHOT + 0.1.0 junit From acce8600641661f7a50a01ae9866d6b27d3dec99 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 20:58:37 +0800 Subject: [PATCH 52/81] readme --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4bd8c7f04..3b8d40b19 100644 --- a/README.md +++ b/README.md @@ -77,10 +77,15 @@ webmagic定制的核心是PageProcessor接口。 Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); +### 技术架构及原理 + +见作者的一篇文章:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796) ### 示例 -可参考作者博客[使用webmagic抓取页面并保存为wordpress文件](http://my.oschina.net/flashsword/blog/136846) +webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。 + +作者还有一个使用webmagic进行抽取并持久化到数据库的项目[JobHunter](http://git.oschina.net/flashsword20/jobhunter)。这个项目整合了Spring,自定义了Pipeline,使用mybatis进行数据持久化。 ### 协议 From 8169923d85927ccbf736c7efdcbc5fa8fe19c378 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 21:09:40 +0800 Subject: [PATCH 53/81] release note --- release-note.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100755 release-note.md diff --git a/release-note.md b/release-note.md new file mode 100755 index 000000000..893fc86c2 --- /dev/null +++ b/release-note.md @@ -0,0 +1,17 @@ +Release Notes +---- +*2012-7-16* `version:0.1.0` + +第一个稳定版本。 + +修改了若干API,使得可扩展性更强,为每个任务分配一个ID,可以通过ID区分不同任务。 + +增加下载的重试机制,支持gzip,支持自定义UA/cookie。 + +增加jquery形式的CSS Selector API,可以通过`page.getHtml().$("div.body")`形式抽取元素。 + +完善了文档,架构说明:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796),Javadoc:[http://code4craft.github.io/webmagic/docs](http://code4craft.github.io/webmagic/docs)。 + + + 
+ From e9cef8fc4cfe492af25e7e478b0233873d47fa6c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 22:07:54 +0800 Subject: [PATCH 54/81] release --- README.md | 4 ---- release-note.md | 8 +++----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3b8d40b19..ade3b5e4a 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,6 @@ webmagic的发起源于工作中的需要,其定位是帮助开发者更便捷 webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),开发者可以便捷的使用xpath和正则表达式进行链接和内容的提取,只需编写少量代码即可完成一个定制爬虫。 -#### 请注意 - -webmagic正处于开发阶段,目前还没有稳定版本。欢迎开发者参与到webmagic的试用和修改中来。 **如果只是想以外部jar包的方式,引用webmagic并进行自己的业务开发,建议你等待webmagic的第一个稳定版本。** - ###特色### * ####垂直爬虫#### diff --git a/release-note.md b/release-note.md index 893fc86c2..7dbe92a96 100755 --- a/release-note.md +++ b/release-note.md @@ -8,10 +8,8 @@ Release Notes 增加下载的重试机制,支持gzip,支持自定义UA/cookie。 -增加jquery形式的CSS Selector API,可以通过`page.getHtml().$("div.body")`形式抽取元素。 - -完善了文档,架构说明:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796),Javadoc:[http://code4craft.github.io/webmagic/docs](http://code4craft.github.io/webmagic/docs)。 - - +增加多线程抓取功能,只需在初始化的时候指定线程数即可。 +增加jquery形式的CSS Selector API,可以通过`page.getHtml().$("div.body")`形式抽取元素。 +完善了文档,架构说明:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796),Javadoc:[http://code4craft.github.io/webmagic/docs](http://code4craft.github.io/webmagic/docs)。 \ No newline at end of file From 02a7f4e77d39bf8c180a0467a427d0ee23f83e93 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 22:10:48 +0800 Subject: [PATCH 55/81] readme --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ade3b5e4a..78756ac26 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,9 @@ webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载 Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); * ####可扩展#### - 
参考[`scrapy`](https://github.com/scrapy/scrapy)的设计,webmagic将爬虫的扩展点分为Processor、Schedular、Downloader、Pipeline三个模块,可以通过扩展这些接口实现强大的扩展功能。如可以通过多个Spider实现多线程抓取;可以通过扩展Schedular实现断点续传乃至于分布式爬虫;可以通过扩展Pipeline实现业务可定制的持久化功能。 + 参考[`scrapy`](https://github.com/scrapy/scrapy)的设计,webmagic将爬虫的扩展点分为Processor、Schedular、Downloader、Pipeline三个模块,可以通过扩展这些接口实现强大的扩展功能。如可以通过多个Spider实现多线程抓取;可以通过扩展Schedular实现断点续传乃至于分布式爬虫;可以通过扩展Pipeline实现业务可定制的持久化功能。webmagic的架构原理见作者的一篇文章:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796) - +![image](http://code4craft.github.io/images/posts/webmagic.png) ------ @@ -73,9 +73,6 @@ webmagic定制的核心是PageProcessor接口。 Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); -### 技术架构及原理 - -见作者的一篇文章:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796) ### 示例 From 4e3ee63badf3af3cadcffcb8d06db0cbff663858 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 25 Jul 2013 08:20:21 +0800 Subject: [PATCH 56/81] invite redis for distribution --- webmagic-plugin/pom.xml | 5 +++ .../webmagic/scheduler/RedisScheduler.java | 45 +++++++++++++++++++ .../scheduler/RedisSchedulerTest.java | 41 +++++++++++++++++ .../webmagic/samples/GlobalProcessor.java | 13 +++--- 4 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java create mode 100644 webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 6eb7d615e..634f09d34 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -25,6 +25,11 @@ freemarker 2.3.15 + + redis.clients + jedis + 2.0.0 + diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java new file mode 100644 index 000000000..e87ee3357 --- /dev/null +++ 
b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.scheduler; + +import redis.clients.jedis.Jedis; +import redis.clients.jedis.JedisPool; +import redis.clients.jedis.JedisPoolConfig; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.schedular.Scheduler; + +/** + * 使用redis管理url,构建一个分布式的爬虫。
      + * @author yihua.huang@dianping.com
      + * @date: 13-7-25
      + * Time: 上午7:07
      + */ +public class RedisScheduler implements Scheduler{ + + private JedisPool pool; + + private static final String QUEUE_PREFIX = "queue_"; + + private static final String SET_PREFIX = "set_"; + + public RedisScheduler(String host){ + pool = new JedisPool(new JedisPoolConfig(), host); + } + + @Override + public synchronized void push(Request request, Task task) { + Jedis jedis = pool.getResource(); + if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){ + jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl()); + jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl()); + } + pool.returnResource(jedis); + } + + @Override + public synchronized Request poll(Task task) { + Jedis jedis = pool.getResource(); + String url = jedis.lpop(QUEUE_PREFIX+task.getUUID()); + pool.returnResource(jedis); + return new Request(url); + } +} diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java new file mode 100644 index 000000000..0f556d284 --- /dev/null +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.scheduler; + +import org.junit.Before; +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-25
      + * Time: 上午7:51
      + */ +public class RedisSchedulerTest { + + private RedisScheduler redisScheduler; + + @Before + public void setUp() { + redisScheduler = new RedisScheduler("localhost"); + } + + @Test + public void test() { + Task task = new Task() { + @Override + public String getUUID() { + return "1"; + } + + @Override + public Site getSite() { + return null; + } + }; + redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task); + Request poll = redisScheduler.poll(task); + System.out.println(poll.getUrl()); + + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index 383422f62..f7c5f7fad 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -3,11 +3,9 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.downloader.FileDownloader; -import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -40,9 +38,12 @@ public Site getSite() { public static void main(String[] args) { Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new FileCacheQueueScheduler("/data/webmagic/test")) - .downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader())) - .pipeline(new FilePipeline("/data/webmagic/test")) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new FilePipeline("/data/webmagic/test/")) + .runAsync(); + Spider.create(new GlobalProcessor()).thread(10) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new 
FilePipeline("/data/webmagic/test/")) .run(); } } From 2aee2e0f3efc267d91bbabea4503ae6c2c2ce895 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 25 Jul 2013 13:32:39 +0800 Subject: [PATCH 57/81] update pipeline api --- .../main/java/us/codecraft/webmagic/Page.java | 55 ++-------------- .../us/codecraft/webmagic/ResultItems.java | 64 +++++++++++++++++++ .../java/us/codecraft/webmagic/Spider.java | 2 +- .../webmagic/pipeline/ConsolePipeline.java | 14 ++-- .../webmagic/pipeline/FilePipeline.java | 13 ++-- .../codecraft/webmagic/pipeline/Pipeline.java | 4 +- .../processor/SimplePageProcessor.java | 3 +- .../webmagic/selector/PlainText.java | 6 +- .../webmagic/selector/Selectable.java | 2 +- .../webmagic/selector/XpathSelectorTest.java | 2 +- .../webmagic/pipeline/FreemarkerPipeline.java | 10 +-- .../webmagic/scheduler/RedisScheduler.java | 2 + .../scheduler/RedisSchedulerTest.java | 3 +- .../samples/DiandianBlogProcessor.java | 6 +- .../webmagic/samples/DianpingProcessor.java | 2 +- .../webmagic/samples/DiaoyuwengProcessor.java | 4 +- .../webmagic/samples/F58PageProcesser.java | 2 +- .../webmagic/samples/GlobalProcessor.java | 2 +- .../webmagic/samples/HuxiuProcessor.java | 2 +- .../webmagic/samples/MeicanProcessor.java | 4 +- .../webmagic/samples/NjuBBSProcessor.java | 2 +- .../samples/OschinaBlogPageProcesser.java | 2 +- .../samples/OschinaPageProcesser.java | 2 +- .../webmagic/samples/QzoneBlogProcessor.java | 2 +- .../webmagic/samples/SinaBlogProcesser.java | 2 +- .../webmagic/samples/TianyaPageProcesser.java | 2 +- 26 files changed, 119 insertions(+), 95 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index b2dd3db71..40f17f0aa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -6,8 
+6,6 @@ import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; /** *
      @@ -27,7 +25,7 @@ public class Page {
       
           private Request request;
       
      -    private Map<String, Selectable> fields = new ConcurrentHashMap<String, Selectable>();
      +    private ResultItems resultItems = new ResultItems();
       
           private Selectable html;
       
      @@ -35,44 +33,16 @@ public class Page {
       
           private List targetRequests = new ArrayList();
       
      -    private boolean skip;
      -
      -    private Object extra;
      -
      -    /**
      -     * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
      -     * @return 是否忽略 true 忽略
      -     */
      -    public boolean isSkip() {
      -        return skip;
      -    }
      -
      -    /**
      -     * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
      -     * @param skip 是否忽略 true 忽略
      -     */
      -    public void setSkip(boolean skip) {
      -        this.skip = skip;
      -    }
      -
           public Page() {
           }
       
      -    /**
      -     * 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
      -     * @return fields 抽取的结果
      -     */
      -    public Map<String, Selectable> getFields() {
      -        return fields;
      -    }
      -
           /**
            * 保存抽取的结果
            * @param key 结果的key
            * @param field 结果的value
            */
      -    public void putField(String key, Selectable field) {
      -        fields.put(key, field);
      +    public void putField(String key, Object field) {
      +        resultItems.put(key, field);
           }
       
           /**
      @@ -157,23 +127,10 @@ public Request getRequest() {
       
           public void setRequest(Request request) {
               this.request = request;
      +        this.resultItems.setRequest(request);
           }
       
      -    /**
      -     * 获取附加对象
      -     * @param <T> 对象类型
      -     * @return 对象内容
      -     */
      -    public <T> T getExtra() {
      -        return (T)extra;
      -    }
      -
      -    /**
      -     * 设置附加对象
      -     * @param extra 对象内容
      -     * @param <T> 对象类型
      -     */
      -    public <T> void setExtra(T extra) {
      -        this.extra = extra;
      +    public ResultItems getResultItems() {
      +        return resultItems;
           }
       }
      diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
      new file mode 100644
      index 000000000..0c1d94c5b
      --- /dev/null
      +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
      @@ -0,0 +1,64 @@
      +package us.codecraft.webmagic;
      +
      +import java.util.HashMap;
      +import java.util.Map;
      +
      +/**
      + * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
      + * @author yihua.huang@dianping.com
      + * @date: 13-7-25
      + * Time: 下午12:20
      + */ +public class ResultItems { + + private Map fields = new HashMap(); + + private Request request; + + private boolean skip; + + public T get(String key) { + Object o = fields.get(key); + if (o == null) { + return null; + } + return (T) fields.get(key); + } + + public Map getAll() { + return fields; + } + + public ResultItems put(String key, T value) { + fields.put(key, value); + return this; + } + + public Request getRequest() { + return request; + } + + public ResultItems setRequest(Request request) { + this.request = request; + return this; + } + + /** + * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @return 是否忽略 true 忽略 + */ + public boolean isSkip() { + return skip; + } + + + /** + * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @param skip + * @return this + */ + public ResultItems setSkip(boolean skip) { + this.skip = skip; + return this; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 57e29b18d..a51ed9601 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -196,7 +196,7 @@ private void processRequest(Request request) { pageProcessor.process(page); addRequest(page); for (Pipeline pipeline : pipelines) { - pipeline.process(page, this); + pipeline.process(page.getResultItems(), this); } sleep(site.getSleepTime()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index dff2ded75..72c3bf3ff 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -1,8 +1,7 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; -import 
us.codecraft.webmagic.selector.Selectable; import java.util.Map; @@ -15,13 +14,10 @@ public class ConsolePipeline implements Pipeline{ @Override - public void process(Page page,Task task) { - System.out.println("get page: "+page.getUrl()); - for (Map.Entry entry : page.getFields().entrySet()) { - System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); - } - if (page.getExtra()!=null){ - System.out.println(page.getExtra()); + public void process(ResultItems resultItems,Task task) { + System.out.println("get page: "+resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + System.out.println(entry.getKey()+":\t"+entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index e48e2bb44..0948bfe0f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -2,13 +2,14 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.log4j.Logger; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.util.Map; /** * 持久化到文件的接口。 @@ -38,16 +39,18 @@ public FilePipeline(String path) { } @Override - public void process(Page page, Task task) { + public void process(ResultItems resultItems, Task task) { String path = this.path + "/" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()))); - printWriter.println("url:\t" + page.getUrl()); - printWriter.println("html:\t" + page.getHtml()); + PrintWriter printWriter = new PrintWriter(new 
FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()))); + printWriter.println("url:\t" + resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + printWriter.println(entry.getKey()+":\t"+entry.getValue()); + } printWriter.close(); } catch (IOException e) { logger.warn("write file error",e); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index 408392d9a..595a8e87b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; /** @@ -11,5 +11,5 @@ */ public interface Pipeline { - public void process(Page page,Task task); + public void process(ResultItems resultItems,Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 47d3748b9..ff9646054 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -30,12 +30,13 @@ public SimplePageProcessor(String startUrl, String urlPattern) { @Override public void process(Page page) { - List requests = page.getHtml().links().regex(urlPattern).toStrings(); + List requests = page.getHtml().links().regex(urlPattern).all(); //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); //xpath方式抽取 page.putField("title", page.getHtml().xpath("//title")); //sc表示使用Readability技术抽取正文 + page.putField("html", page.getHtml().toString()); page.putField("content", page.getHtml().smartContent()); } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 9e8d19418..d06a53105 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -82,14 +82,14 @@ public Selectable replace(String regex, String replacement) { } @Override - public List toStrings() { + public List all() { return strings; } @Override public String toString() { - if (CollectionUtils.isNotEmpty(toStrings())) { - return toStrings().get(0); + if (CollectionUtils.isNotEmpty(all())) { + return all().get(0); } else { return null; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 1b0ba10a3..42f3d1083 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -69,5 +69,5 @@ public interface Selectable { * * @return multi string result */ - public List toStrings(); + public List all(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 3ef0a9279..30d8a817b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1351,7 +1351,7 @@ public void test() { public void testOschina() { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); - Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings()); + Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); } } diff --git 
a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 8741ef498..9a045eff0 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -4,7 +4,7 @@ import freemarker.template.Template; import freemarker.template.TemplateException; import org.apache.commons.codec.digest.DigestUtils; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import java.io.File; @@ -39,8 +39,8 @@ public FreemarkerPipeline(String template) throws IOException { @Override - public void process(Page page, Task task) { - if (page.isSkip()) { + public void process(ResultItems resultItems, Task task) { + if (resultItems.isSkip()) { return; } String path = this.path + "" + task.getUUID() + "/"; @@ -49,8 +49,8 @@ public void process(Page page, Task task) { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); - template.process(page.getFields(), printWriter); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); + template.process(resultItems.getAll(), printWriter); printWriter.close(); } catch (TemplateException e) { } catch (IOException e) { diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e87ee3357..481981dba 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -28,7 +28,9 @@ public RedisScheduler(String host){ @Override public synchronized void 
push(Request request, Task task) { Jedis jedis = pool.getResource(); + //使用SortedSet进行url去重 if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){ + //使用List保存队列 jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl()); jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl()); } diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 0f556d284..6db21a8f9 100644 --- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.scheduler; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -20,6 +21,7 @@ public void setUp() { redisScheduler = new RedisScheduler("localhost"); } + @Ignore("environment depended") @Test public void test() { Task task = new Task() { @@ -35,7 +37,6 @@ public Site getSite() { }; redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task); Request poll = redisScheduler.poll(task); - System.out.println(poll.getUrl()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index e5aafe7a3..a1189e45d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -20,13 +20,13 @@ public void process(Page page) { //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 - //toString()表示取单条结果,toStrings()表示取多条 - List requests = 
page.getHtml().links().regex("(.*/post/.*)").toStrings(); + //toString()表示取单条结果,all()表示取多条 + List requests = page.getHtml().links().regex("(.*/post/.*)").all(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 - page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|")); + page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString()); //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 page.putField("content", page.getHtml().smartContent()); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 056da0aae..b7e3ee02d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings(); + List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").all(); page.addTargetRequests(requests); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 695d2e2ab..115f18342 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = 
page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); + List requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all(); page.addTargetRequests(requests); - requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); + requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all(); page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 9d5140a2e..4ffe127b4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all(); page.addTargetRequests(strings); page.putField("title",page.getHtml().regex("(.*)")); page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index f7c5f7fad..0e3f9a327 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java 
@@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor { @Override public void process(Page page) { - final List requests = page.getHtml().links().toStrings(); + final List requests = page.getHtml().links().all(); page.addTargetRequests(requests); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 26c60cc27..89b74d63f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); page.putField("content",page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 39f5723ec..a4e6e43b1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); + List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); if 
(requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings()); + page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index a7e9c9ba9..2337da598 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -14,7 +14,7 @@ public class NjuBBSProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); + List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 9293b41c4..f2dbe8e10 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = 
page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings(); + List strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all(); page.addTargetRequests(strings); page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); page.putField("content", page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index f88ce06d2..522eb2c6a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all(); page.addTargetRequests(strings); page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index bf4dcc2a1..49418b605 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -18,7 +18,7 @@ public void process(Page page) { 
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone - List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index baa375d81..b4c5bc885 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor { @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings()); + page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all()); page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index 
278657f09..ecc55b424 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all(); page.addTargetRequests(strings); page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); page.putField("body",page.getHtml().smartContent()); From 57a00da4ca9acdd3043c3182dfaa8a71beb4fc3e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 25 Jul 2013 13:48:56 +0800 Subject: [PATCH 58/81] add results --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 78756ac26..75bea5287 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载 * ####可扩展#### 参考[`scrapy`](https://github.com/scrapy/scrapy)的设计,webmagic将爬虫的扩展点分为Processor、Schedular、Downloader、Pipeline三个模块,可以通过扩展这些接口实现强大的扩展功能。如可以通过多个Spider实现多线程抓取;可以通过扩展Schedular实现断点续传乃至于分布式爬虫;可以通过扩展Pipeline实现业务可定制的持久化功能。webmagic的架构原理见作者的一篇文章:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796) -![image](http://code4craft.github.io/images/posts/webmagic.png) +![image](http://code4craft.github.io/images/posts/webmagic-0.1.0.png) ------ From 223a36f587245735521eccc59901f49e75cb7ab3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 25 Jul 2013 13:51:03 +0800 Subject: [PATCH 59/81] release notes --- release-note.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/release-note.md b/release-note.md index 7dbe92a96..e9c8d9927 100755 --- a/release-note.md +++ b/release-note.md @@ -1,11 +1,13 @@ Release Notes ---- -*2012-7-16* `version:0.1.0` 
+*2012-7-25* `version:0.1.0` 第一个稳定版本。 修改了若干API,使得可扩展性更强,为每个任务分配一个ID,可以通过ID区分不同任务。 +重写了Pipeline接口,将抽取结果集包装到ResultItems对象,而不是通用一个Page对象,便于逻辑分离。 + 增加下载的重试机制,支持gzip,支持自定义UA/cookie。 增加多线程抓取功能,只需在初始化的时候指定线程数即可。 From 433e87da78a6dd130f5222705e3809b692c6c47e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 08:24:08 +0800 Subject: [PATCH 60/81] add iteye sample --- .../java/us/codecraft/webmagic/Spider.java | 37 +++++++++++------- .../webmagic/pipeline/FilePipeline.java | 2 +- .../webmagic/samples/IteyeBlogProcessor.java | 38 +++++++++++++++++++ webmagic-samples/src/main/resources/log4j.xml | 26 +++++++++++++ 4 files changed, 89 insertions(+), 14 deletions(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java create mode 100644 webmagic-samples/src/main/resources/log4j.xml diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a51ed9601..f3065422e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -18,23 +18,24 @@ /** *
      - *webmagic爬虫的入口类。
      + * webmagic爬虫的入口类。
        *
      - *示例:
      - *定义一个最简单的爬虫:
      + * 示例:
      + * 定义一个最简单的爬虫:
        *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
        *
      - *使用FilePipeline保存结果到文件:
      + * 使用FilePipeline保存结果到文件:
        *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
        *          .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
        *
      - *使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
      + * 使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
        *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
        *          .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
        * 
      + * * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 上午6:53 + * Date: 13-4-21 + * Time: 上午6:53 */ public class Spider implements Runnable, Task { @@ -66,6 +67,7 @@ public class Spider implements Runnable, Task { /** * 使用已定义的抽取规则新建一个Spider。 + * * @param pageProcessor 已定义的抽取规则 */ public Spider(PageProcessor pageProcessor) { @@ -76,6 +78,7 @@ public Spider(PageProcessor pageProcessor) { /** * 使用已定义的抽取规则新建一个Spider。 + * * @param pageProcessor 已定义的抽取规则 * @return 新建的Spider */ @@ -85,6 +88,7 @@ public static Spider create(PageProcessor pageProcessor) { /** * 重新设置startUrls,会覆盖Site本身的startUrls。 + * * @param startUrls * @return this */ @@ -96,6 +100,7 @@ public Spider startUrls(List startUrls) { /** * 为爬虫设置一个唯一ID,用于标志任务,默认情况下使用domain作为uuid,对于单domain多任务的情况,请为重复任务设置不同的ID。 + * * @param uuid 唯一ID * @return this */ @@ -106,6 +111,7 @@ public Spider setUUID(String uuid) { /** * 设置调度器。调度器用于保存待抓取URL,并可以进行去重、同步、持久化等工作。默认情况下使用内存中的阻塞队列进行调度。 + * * @param scheduler 调度器 * @return this */ @@ -117,6 +123,7 @@ public Spider scheduler(Scheduler scheduler) { /** * 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。 + * * @param pipeline 处理管道 * @return this */ @@ -148,7 +155,7 @@ public void run() { pipelines.add(new ConsolePipeline()); } //singel thread - if (executorService==null){ + if (executorService == null) { while (request != null) { processRequest(request); request = scheduler.poll(this); @@ -217,13 +224,13 @@ private void addRequest(Page page) { } } - private void checkIfNotRunning(){ - if (!stat.compareAndSet(STAT_INIT,STAT_INIT)){ + private void checkIfNotRunning() { + if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) { throw new IllegalStateException("Spider is already running!"); } } - public void runAsync(){ + public void runAsync() { Thread thread = new Thread(this); thread.setDaemon(false); thread.start(); @@ -231,15 +238,19 @@ public void runAsync(){ /** * 建立多个线程下载 + * * @param threadNum 线程数 * @return this */ public Spider thread(int threadNum) { checkIfNotRunning(); - if (threadNum <= 1) { + if 
(threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } - synchronized (this){ + if (threadNum == 1) { + return this; + } + synchronized (this) { this.executorService = ThreadUtils.newFixedThreadPool(threadNum); } return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 0948bfe0f..10d97a83c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -46,7 +46,7 @@ public void process(ResultItems resultItems, Task task) { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()))); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { printWriter.println(entry.getKey()+":\t"+entry.getValue()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java new file mode 100644 index 000000000..188f3a1f9 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -0,0 +1,38 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 上午7:31
      + */ +public class IteyeBlogProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all()); + page.putField("title",page.getHtml().xpath("//title").toString()); + page.putField("content",page.getHtml().smartContent().toString()); + } + + @Override + public Site getSite() { + if (site == null) { + site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31") + .setSleepTime(100).setRetryTimes(3); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline()).run(); + } +} diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml new file mode 100644 index 000000000..a6630f813 --- /dev/null +++ b/webmagic-samples/src/main/resources/log4j.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + From 2153045f77b52a166590d00dc3636d5447dd1cbe Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 11:52:23 +0800 Subject: [PATCH 61/81] try invite selenium --- pom.xml | 114 +++++++++++++++++- webmagic-core/pom.xml | 73 +---------- webmagic-plugin/pom.xml | 62 +--------- webmagic-samples/pom.xml | 97 ++------------- .../webmagic/samples/IteyeBlogProcessor.java | 5 +- webmagic-selenium/pom.xml | 37 ++++++ 6 files changed, 165 insertions(+), 223 deletions(-) create mode 100644 webmagic-selenium/pom.xml diff --git a/pom.xml b/pom.xml index 39f068c5d..f0b3a7d74 100644 --- a/pom.xml +++ b/pom.xml @@ -3,18 +3,65 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.0.1-SNAPSHOT + 0.1.0 
4.0.0 pom webmagic - ./webmagic-core - ./webmagic-plugin/ - ./webmagic-samples/ - + webmagic-core + webmagic-plugin/ + webmagic-samples/ + webmagic-selenium + - + + + + junit + junit + 4.7 + test + + + org.apache.httpcomponents + httpclient + 4.2.4 + + + log4j + log4j + 1.2.17 + + + org.apache.commons + commons-lang3 + 3.1 + + + commons-collections + commons-collections + 3.2.1 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.4 + + + org.apache.commons + commons-io + 1.3.2 + + + org.jsoup + jsoup + 1.7.2 + + + + + org.apache.maven.plugins @@ -25,6 +72,61 @@ 1.6 + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + + + + + + org.apache.maven.plugins + maven-resources-plugin + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.0-beta-7 + diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b0de214cd..60c37c025 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -2,8 +2,11 @@ - us.codecraft - 0.1.0 + + us.codecraft + webmagic + 0.1.0 + 4.0.0 webmagic-core @@ -12,109 +15,43 @@ org.apache.httpcomponents httpclient - 4.2.4 junit junit - 4.7 - test - - - - com.google.guava - guava - 13.0.1 org.apache.commons commons-lang3 - 3.1 log4j log4j - 1.2.17 commons-collections commons-collections - 3.2.1 net.sourceforge.htmlcleaner htmlcleaner - 2.4 org.jsoup jsoup - 1.7.2 org.apache.commons commons-io - 1.3.2 - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.6 - 1.6 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - 
org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - \ No newline at end of file diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 634f09d34..b75dc9e76 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -2,8 +2,11 @@ - us.codecraft - 0.1.0 + + us.codecraft + webmagic + 0.1.0 + 4.0.0 webmagic-plugin @@ -12,13 +15,11 @@ us.codecraft webmagic-core - 0.1.0 + ${project.version} junit junit - 4.7 - test org.freemarker @@ -32,55 +33,4 @@ - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.6 - 1.6 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ac2092f6a..8af7672d1 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -2,9 +2,11 @@ - - us.codecraft - 0.1.0 + + us.codecraft + webmagic + 0.1.0 + 4.0.0 webmagic-samples @@ -13,102 +15,17 @@ us.codecraft webmagic-core - 0.1.0 + ${project.version} us.codecraft webmagic-plugin - 0.1.0 + ${project.version} junit junit - 4.7 - test - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.6 - 1.6 - - - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - package - - copy-dependencies - - - ${project.build.directory}/lib - false - false - true - - - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - true - ./lib/ - us.codecraft.webmagic.samples.DianpingIndexProcessor - - - - - - 
org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index 188f3a1f9..76f9cc30e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -26,13 +26,12 @@ public void process(Page page) { public Site getSite() { if (site == null) { site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31") - .setSleepTime(100).setRetryTimes(3); + setSleepTime(100).setRetryTimes(3); } return site; } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline()).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); } } diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml new file mode 100644 index 000000000..209fbe8f2 --- /dev/null +++ b/webmagic-selenium/pom.xml @@ -0,0 +1,37 @@ + + + + + us.codecraft + webmagic + 0.1.0 + + 4.0.0 + webmagic-selenium + + + + us.codecraft + webmagic-core + ${project.version} + + + us.codecraft + webmagic-plugin + ${project.version} + + + junit + junit + + + org.seleniumhq.selenium + selenium-java + 2.33.0 + + + + + \ No newline at end of file From 18d835c73592afa378960a9834697ca7fccbb72a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 11:55:40 +0800 Subject: [PATCH 62/81] update readme --- webmagic-core/README.md | 3 +++ webmagic-plugin/README.md | 6 ++++++ webmagic-selenium/README.md | 3 +++ 3 files changed, 12 insertions(+) create mode 100644 webmagic-core/README.md create 
mode 100644 webmagic-plugin/README.md create mode 100644 webmagic-selenium/README.md diff --git a/webmagic-core/README.md b/webmagic-core/README.md new file mode 100644 index 000000000..4964e1677 --- /dev/null +++ b/webmagic-core/README.md @@ -0,0 +1,3 @@ +webmagic-core +------- +webmagic核心部分。 \ No newline at end of file diff --git a/webmagic-plugin/README.md b/webmagic-plugin/README.md new file mode 100644 index 000000000..e97c2563e --- /dev/null +++ b/webmagic-plugin/README.md @@ -0,0 +1,6 @@ +webmagic-plugin +------- +webmagic的插件模块。 +目前仅实现了freemarker模板渲染,和redis实现分布式爬虫。 + +这部分依赖比较杂,以后考虑多分几个包。 \ No newline at end of file diff --git a/webmagic-selenium/README.md b/webmagic-selenium/README.md new file mode 100644 index 000000000..5e5ce8211 --- /dev/null +++ b/webmagic-selenium/README.md @@ -0,0 +1,3 @@ +webmagic-selenium +------- +尝试使用selenium来进行页面动态渲染,开发中。 \ No newline at end of file From a8da1124fb13bc89eb4199eeef73e57f2b37e004 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 12:04:08 +0800 Subject: [PATCH 63/81] update structrue --- pom.xml | 1 - webmagic-plugin/pom.xml | 15 ++++------- webmagic-plugin/webmagic-misc/pom.xml | 27 +++++++++++++++++++ .../webmagic/pipeline/FreemarkerPipeline.java | 0 .../webmagic/scheduler/RedisScheduler.java | 0 .../src/main/resources/ftl/wordpress.ftl | 0 .../webmagic/FreemarkerPipelineTest.java | 0 .../scheduler/RedisSchedulerTest.java | 0 .../webmagic-selenium}/README.md | 0 .../webmagic-selenium}/pom.xml | 16 +---------- webmagic-samples/pom.xml | 2 +- 11 files changed, 34 insertions(+), 27 deletions(-) create mode 100644 webmagic-plugin/webmagic-misc/pom.xml rename webmagic-plugin/{ => webmagic-misc}/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java (100%) rename webmagic-plugin/{ => webmagic-misc}/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java (100%) rename webmagic-plugin/{ => webmagic-misc}/src/main/resources/ftl/wordpress.ftl (100%) rename webmagic-plugin/{ => 
webmagic-misc}/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java (100%) rename webmagic-plugin/{ => webmagic-misc}/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java (100%) rename {webmagic-selenium => webmagic-plugin/webmagic-selenium}/README.md (100%) rename {webmagic-selenium => webmagic-plugin/webmagic-selenium}/pom.xml (55%) diff --git a/pom.xml b/pom.xml index f0b3a7d74..086437d11 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,6 @@ webmagic-core webmagic-plugin/ webmagic-samples/ - webmagic-selenium diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index b75dc9e76..22257222c 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -7,7 +7,12 @@ webmagic 0.1.0 + pom 4.0.0 + + webmagic-misc + webmagic-selenium + webmagic-plugin @@ -21,16 +26,6 @@ junit junit - - org.freemarker - freemarker - 2.3.15 - - - redis.clients - jedis - 2.0.0 - \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-plugin/webmagic-misc/pom.xml new file mode 100644 index 000000000..c54561596 --- /dev/null +++ b/webmagic-plugin/webmagic-misc/pom.xml @@ -0,0 +1,27 @@ + + + + us.codecraft + webmagic-plugin + 0.1.0 + + 4.0.0 + + webmagic-misc + + + + org.freemarker + freemarker + 2.3.15 + + + redis.clients + jedis + 2.0.0 + + + + \ No newline at end of file diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java similarity index 100% rename from webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java similarity index 100% rename from 
webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java diff --git a/webmagic-plugin/src/main/resources/ftl/wordpress.ftl b/webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl similarity index 100% rename from webmagic-plugin/src/main/resources/ftl/wordpress.ftl rename to webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java similarity index 100% rename from webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java rename to webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java similarity index 100% rename from webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java rename to webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java diff --git a/webmagic-selenium/README.md b/webmagic-plugin/webmagic-selenium/README.md similarity index 100% rename from webmagic-selenium/README.md rename to webmagic-plugin/webmagic-selenium/README.md diff --git a/webmagic-selenium/pom.xml b/webmagic-plugin/webmagic-selenium/pom.xml similarity index 55% rename from webmagic-selenium/pom.xml rename to webmagic-plugin/webmagic-selenium/pom.xml index 209fbe8f2..0da4504f7 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-plugin/webmagic-selenium/pom.xml @@ -5,27 +5,13 @@ us.codecraft - webmagic + webmagic-plugin 0.1.0 4.0.0 webmagic-selenium - - us.codecraft - webmagic-core - ${project.version} - - - us.codecraft 
- webmagic-plugin - ${project.version} - - - junit - junit - org.seleniumhq.selenium selenium-java diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 8af7672d1..b8454900a 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -19,7 +19,7 @@ us.codecraft - webmagic-plugin + webmagic-misc ${project.version} From 9a88d906d5cb39553db62d363365032669558ef0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 12:05:22 +0800 Subject: [PATCH 64/81] readme --- webmagic-plugin/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-plugin/README.md b/webmagic-plugin/README.md index e97c2563e..536d59602 100644 --- a/webmagic-plugin/README.md +++ b/webmagic-plugin/README.md @@ -3,4 +3,4 @@ webmagic-plugin webmagic的插件模块。 目前仅实现了freemarker模板渲染,和redis实现分布式爬虫。 -这部分依赖比较杂,以后考虑多分几个包。 \ No newline at end of file +另外有一个使用Selenium来动态渲染页面的模块在开发中。 \ No newline at end of file From c785103ed12a8c02ab59661c1ce2ab9ccff6e1f8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 14:29:19 +0800 Subject: [PATCH 65/81] invite selenium --- .../downloader/SeleniumDownloader.java | 48 +++++++++++ .../selenium/downloader/WebDriverPool.java | 82 +++++++++++++++++++ .../webmagic/selenium/SeleniumTest.java | 29 +++++++ .../downloader/WebDriverPoolTest.java | 28 +++++++ 4 files changed, 187 insertions(+) create mode 100644 webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java create mode 100644 webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java create mode 100644 webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java create mode 100644 webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java diff --git 
a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java new file mode 100644 index 000000000..8fd1c6a2d --- /dev/null +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.selenium.downloader; + +import org.apache.log4j.Logger; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.UrlUtils; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 下午1:37
      + */ +public class SeleniumDownloader implements Downloader { + + private WebDriverPool webDriverPool; + + private Logger logger = Logger.getLogger(getClass()); + + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); + } + + @Override + public Page download(Request request, Task task) { + WebDriver webDriver = null; + try { + webDriver = webDriverPool.get(); + } catch (InterruptedException e) { + logger.warn("interrupted",e); + return null; + } + webDriver.get(request.getUrl()); + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + Page page = new Page(); + page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + return page; + } +} diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java new file mode 100644 index 000000000..039cef98e --- /dev/null +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.selenium.downloader; + +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.BlockingDeque; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 下午1:41
      + */ +class WebDriverPool { + + private final static int DEFAULT_CAPACITY = 5; + + private final int capacity; + + private final static int STAT_RUNNING = 1; + + private final static int STAT_CLODED = 2; + + private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); + + private List webDriverList = Collections.synchronizedList(new ArrayList()); + + public WebDriverPool(int capacity) { + this.capacity = capacity; + } + + public WebDriverPool() { + this(DEFAULT_CAPACITY); + } + + private BlockingDeque innerQueue = new LinkedBlockingDeque(); + + public WebDriver get() throws InterruptedException { + checkRunning(); + WebDriver poll = innerQueue.poll(); + if (poll != null) { + return poll; + } + if (webDriverList.size() < capacity) { + synchronized (webDriverList) { + if (webDriverList.size() < capacity) { + ChromeDriver e = new ChromeDriver(); + innerQueue.add(e); + webDriverList.add(e); + } + } + + } + return innerQueue.take(); + } + + public void returnToPool(WebDriver webDriver) { + checkRunning(); + innerQueue.add(webDriver); + } + + protected void checkRunning() { + if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { + throw new IllegalStateException("Already closed!"); + } + } + + public void closeAll() { + boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED); + if (!b) { + throw new IllegalStateException("Already closed!"); + } + for (WebDriver webDriver : webDriverList) { + webDriver.close(); + } + + } +} diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java new file mode 100644 index 000000000..fc0a9ec62 --- /dev/null +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.selenium; + +import org.junit.Test; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import 
org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; + +import java.util.List; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 下午12:27
      + */ +public class SeleniumTest { + + @Test + public void test(){ + System.getProperties().setProperty("webdriver.chrome.driver","/Users/yihua/Downloads/chromedriver"); + WebDriver webDriver = new ChromeDriver(); + webDriver.get("http://huaban.com/"); + List elements = webDriver.findElements(By.xpath("/html")); + for (WebElement element : elements) { + System.out.println(element.getAttribute("outerHTML")); + } + webDriver.close(); + } +} diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java new file mode 100644 index 000000000..d38216f8f --- /dev/null +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.selenium.downloader; + +import org.junit.Test; +import org.openqa.selenium.WebDriver; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 下午2:12
      + */ +public class WebDriverPoolTest { + + @Test + public void test(){ + String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); + WebDriverPool webDriverPool =new WebDriverPool(5); + for (int i=0;i<5;i++){ + try { + WebDriver webDriver = webDriverPool.get(); + System.out.println(i); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + webDriverPool.closeAll(); + } +} From 2f6d17966673e9cb65261dbcc0bae68ebf3d92fe Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 14:41:30 +0800 Subject: [PATCH 66/81] fix a httpclient pool size bug --- .../main/java/us/codecraft/webmagic/Spider.java | 9 ++++++++- .../downloader/HttpClientDownloader.java | 15 +++++++++++++-- .../webmagic/downloader/HttpClientPool.java | 17 ++++++++++++----- .../webmagic/samples/GlobalProcessor.java | 4 ++-- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index f3065422e..a568f93a7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -39,7 +39,7 @@ */ public class Spider implements Runnable, Task { - private Downloader downloader = new HttpClientDownloader(); + private Downloader downloader; private List pipelines = new ArrayList(); @@ -139,12 +139,18 @@ public Spider downloader(Downloader downloader) { return this; } + protected void checkComponent() { + if (downloader == null) { + this.downloader = new HttpClientDownloader(); + } + } @Override public void run() { if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) { throw new IllegalStateException("Spider is already running!"); } + checkComponent(); if (startUrls != null) { for (String startUrl : startUrls) { scheduler.push(new Request(startUrl), this); @@ -247,6 +253,7 @@ public Spider 
thread(int threadNum) { if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } + downloader = new HttpClientDownloader(threadNum); if (threadNum == 1) { return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 7eb627724..d76341986 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -21,6 +21,7 @@ /** * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。
      + * * @author code4crafter@gmail.com
      * Date: 13-4-21 * Time: 下午12:15 @@ -29,11 +30,21 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); + private int poolSize; + + public HttpClientDownloader(int poolSize) { + this.poolSize = poolSize; + } + + public HttpClientDownloader() { + this(5); + } + @Override public Page download(Request request, Task task) { Site site = task.getSite(); logger.info("downloading page " + request.getUrl()); - HttpClient httpClient = HttpClientPool.getInstance().getClient(site); + HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); @@ -50,7 +61,7 @@ public Page download(Request request, Task task) { logger.warn("download page " + request.getUrl() + " error", e); return null; } - logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!"); + logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); retry = true; } } while (retry); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 4e57e16f9..854f1e57a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -19,14 +19,21 @@ /** * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 下午12:29 + * Date: 13-4-21 + * Time: 下午12:29 */ public class HttpClientPool { - public static final HttpClientPool INSTANCE = new HttpClientPool(5); + public static volatile HttpClientPool INSTANCE; - public static HttpClientPool getInstance() { + public static HttpClientPool getInstance(int poolSize) { + if (INSTANCE == null) { + synchronized (HttpClientPool.class) { + if (INSTANCE == null) { + INSTANCE = new HttpClientPool(poolSize); + } + } + } return INSTANCE; } @@ -48,7 +55,7 @@ private HttpClient generateClient(Site site) { HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); - paramsBean.setContentCharset("UTF-8"); + paramsBean.setContentCharset(site.getCharset()); paramsBean.setUseExpectContinue(false); SchemeRegistry schemeRegistry = new SchemeRegistry(); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index 0e3f9a327..2bdf342fe 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -27,8 +27,8 @@ public void process(Page page) { @Override public Site getSite() { - if (site==null){ - site = Site.me().setDomain("www.2345.com") + if (site == null) { + site = Site.me().setDomain("www.2345.com").setSleepTime(0) .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/") .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3") .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); From bc98059220dfe2aa94761bd08d801414e56da7bb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 
15:05:29 +0800 Subject: [PATCH 67/81] complete selenium --- .../downloader/SeleniumDownloader.java | 24 +++++++++++-- .../downloader/SeleniumDownloaderTest.java | 36 +++++++++++++++++++ .../downloader/WebDriverPoolTest.java | 9 ++--- 3 files changed, 63 insertions(+), 6 deletions(-) create mode 100644 webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 8fd1c6a2d..171ca44fc 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -2,16 +2,20 @@ import org.apache.log4j.Logger; import org.openqa.selenium.By; +import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; +import java.util.Map; + /** * @author yihua.huang@dianping.com
      * @date: 13-7-26
      @@ -25,24 +29,40 @@ public class SeleniumDownloader implements Downloader { public SeleniumDownloader(String chromeDriverPath) { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); + webDriverPool = new WebDriverPool(); + } + + public SeleniumDownloader(String chromeDriverPath, int poolSize) { + System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); + webDriverPool = new WebDriverPool(poolSize); } @Override public Page download(Request request, Task task) { - WebDriver webDriver = null; + WebDriver webDriver; try { webDriver = webDriverPool.get(); } catch (InterruptedException e) { - logger.warn("interrupted",e); + logger.warn("interrupted", e); return null; } webDriver.get(request.getUrl()); + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); + manage.addCookie(cookie); + } + } WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); + webDriverPool.returnToPool(webDriver); return page; } + } diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java new file mode 100644 index 000000000..d21068459 --- /dev/null +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.selenium.downloader; + +import org.junit.Ignore; +import org.junit.Test; +import 
us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 下午2:46
      + */ +public class SeleniumDownloaderTest { + + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + + @Ignore("need chrome driver") + @Test + public void test(){ + SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + } +} diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java index d38216f8f..38e4f86f5 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java @@ -10,12 +10,13 @@ */ public class WebDriverPoolTest { + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + @Test - public void test(){ - String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + public void test() { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); - WebDriverPool webDriverPool =new WebDriverPool(5); - for (int i=0;i<5;i++){ + WebDriverPool webDriverPool = new WebDriverPool(5); + for (int i = 0; i < 5; i++) { try { WebDriver webDriver = webDriverPool.get(); System.out.println(i); From be3528532b0967e3bd5e5bada9e691f4db0f61ed Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 15:09:01 +0800 Subject: [PATCH 68/81] add selenium ignore --- .../test/java/us/codecraft/webmagic/selenium/SeleniumTest.java | 2 ++ .../webmagic/selenium/downloader/SeleniumDownloaderTest.java | 1 + 
.../webmagic/selenium/downloader/WebDriverPoolTest.java | 2 ++ 3 files changed, 5 insertions(+) diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java index fc0a9ec62..e9e1c7b60 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.selenium; +import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; @@ -15,6 +16,7 @@ */ public class SeleniumTest { + @Ignore("need chrome driver") @Test public void test(){ System.getProperties().setProperty("webdriver.chrome.driver","/Users/yihua/Downloads/chromedriver"); diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java index d21068459..d50c20a32 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java @@ -20,6 +20,7 @@ public class SeleniumDownloaderTest { @Test public void test(){ SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { @Override public String getUUID() { diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java index 
38e4f86f5..1efc69b16 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.selenium.downloader; +import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.WebDriver; @@ -12,6 +13,7 @@ public class WebDriverPoolTest { private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + @Ignore("need chrome driver") @Test public void test() { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); From 726f00044cb1f58d47ff9ca60ab1b1e6834a6786 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 15:27:47 +0800 Subject: [PATCH 69/81] release resource --- .../java/us/codecraft/webmagic/Spider.java | 17 ++++++++++ .../webmagic/downloader/Destroyable.java | 13 ++++++++ .../downloader/SeleniumDownloader.java | 13 +++++++- .../downloader/SeleniumDownloaderTest.java | 31 ++++++++++--------- 4 files changed, 59 insertions(+), 15 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a568f93a7..cd092a308 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -2,6 +2,7 @@ import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; +import us.codecraft.webmagic.downloader.Destroyable; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.ConsolePipeline; @@ -198,6 +199,22 @@ public void run() { executorService.shutdown(); } stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + 
//release some resources + destroy(); + } + + private void destroy() { + destroyEach(downloader); + destroyEach(pageProcessor); + for (Pipeline pipeline : pipelines) { + destroyEach(pipeline); + } + } + + private void destroyEach(Object object){ + if (object instanceof Destroyable) { + ((Destroyable)object).destroy(); + } } private void processRequest(Request request) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java new file mode 100644 index 000000000..4f07528d6 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.downloader; + +/** + * 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。
      + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 下午3:10
      + */ +public interface Destroyable { + + public void destroy(); + +} diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 171ca44fc..1ed8b4db4 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -9,6 +9,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.downloader.Destroyable; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -17,16 +18,22 @@ import java.util.Map; /** + * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
      + * 需要下载Selenium driver支持。
      * @author yihua.huang@dianping.com
      * @date: 13-7-26
      * Time: 下午1:37
      */ -public class SeleniumDownloader implements Downloader { +public class SeleniumDownloader implements Downloader,Destroyable { private WebDriverPool webDriverPool; private Logger logger = Logger.getLogger(getClass()); + /** + * 新建 + * @param chromeDriverPath + */ public SeleniumDownloader(String chromeDriverPath) { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); webDriverPool = new WebDriverPool(); @@ -65,4 +72,8 @@ public Page download(Request request, Task task) { return page; } + @Override + public void destroy() { + webDriverPool.closeAll(); + } } diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java index d50c20a32..615ad8674 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java @@ -14,24 +14,27 @@ */ public class SeleniumDownloaderTest { - private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + private String chromeDriverPath = ""; @Ignore("need chrome driver") @Test - public void test(){ + public void test() { SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + long time1 = System.currentTimeMillis(); + for (int i = 0; i < 100; i++) { + Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } - Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { - @Override - public String getUUID() { - return "huaban.com"; - } - - @Override - public Site getSite() { - return Site.me(); - } - }); - 
System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + } + System.out.println(System.currentTimeMillis() - time1); } } From d3d81603aa9c747e49627d4e84efa2de7e0e69be Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 16:32:51 +0800 Subject: [PATCH 70/81] add huaban processor --- .../webmagic/pipeline/ConsolePipeline.java | 3 ++ .../webmagic/pipeline/FilePipeline.java | 3 ++ .../downloader/SeleniumDownloader.java | 1 + .../webmagic/selenium/SeleniumTest.java | 12 ++--- webmagic-samples/pom.xml | 5 +++ .../webmagic/samples/HuabanProcessor.java | 45 +++++++++++++++++++ 6 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 72c3bf3ff..8be5fabb6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -15,6 +15,9 @@ public class ConsolePipeline implements Pipeline{ @Override public void process(ResultItems resultItems,Task task) { + if (resultItems.isSkip()){ + return; + } System.out.println("get page: "+resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { System.out.println(entry.getKey()+":\t"+entry.getValue()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 10d97a83c..cbce8324a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -45,6 +45,9 @@ public void process(ResultItems resultItems, Task task) { if (!file.exists()) { file.mkdirs(); } + if (resultItems.isSkip()){ + return; + } try { PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 1ed8b4db4..b4dd3720b 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -53,6 +53,7 @@ public Page download(Request request, Task task) { logger.warn("interrupted", e); return null; } + logger.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java index e9e1c7b60..6cf50c39b 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java @@ -7,8 +7,6 @@ import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; -import java.util.List; - /** * @author yihua.huang@dianping.com
      * @date: 13-7-26
      @@ -18,14 +16,12 @@ public class SeleniumTest { @Ignore("need chrome driver") @Test - public void test(){ - System.getProperties().setProperty("webdriver.chrome.driver","/Users/yihua/Downloads/chromedriver"); + public void testSelenium() { + System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); WebDriver webDriver = new ChromeDriver(); webDriver.get("http://huaban.com/"); - List elements = webDriver.findElements(By.xpath("/html")); - for (WebElement element : elements) { - System.out.println(element.getAttribute("outerHTML")); - } + WebElement webElement = webDriver.findElement(By.xpath("/html")); + System.out.println(webElement.getAttribute("outerHTML")); webDriver.close(); } } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index b8454900a..8a7e00ce0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -22,6 +22,11 @@ webmagic-misc ${project.version}
      + + us.codecraft + webmagic-selenium + ${project.version} + junit junit diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java new file mode 100644 index 000000000..23434f33e --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.RedisScheduler; +import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader; + +/** + * @author yihua.huang@dianping.com
      + * @date: 13-7-26
      + * Time: 下午4:08
      + */ +public class HuabanProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all()); + if (page.getUrl().toString().contains("pins")) { + page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString()); + } else { + page.getResultItems().setSkip(true); + } + } + + @Override + public Site getSite() { + if (site == null) { + site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/"); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new HuabanProcessor()) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new FilePipeline("/data/webmagic/test/")) + .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) + .runAsync(); + } +} From b71409a5ba6b057a3463c1989fc22ffb75910310 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 16:34:09 +0800 Subject: [PATCH 71/81] sleep time --- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java index 23434f33e..a542da410 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -30,13 +30,13 @@ public void process(Page page) { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/"); + site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/").setSleepTime(1000); } return site; } public static void main(String[] args) { - Spider.create(new HuabanProcessor()) + Spider.create(new HuabanProcessor()).thread(5) .scheduler(new 
RedisScheduler("localhost")) .pipeline(new FilePipeline("/data/webmagic/test/")) .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) From 0d8bc820d794bf7878bf07f2632eb0a88d8bf090 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 16:55:36 +0800 Subject: [PATCH 72/81] update --- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java index a542da410..d8c5f0573 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -9,6 +9,8 @@ import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader; /** + * 花瓣网抽取器。
      + * 使用Selenium做页面动态渲染。
      * @author yihua.huang@dianping.com
      * @date: 13-7-26
      * Time: 下午4:08
      @@ -30,7 +32,7 @@ public void process(Page page) { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/").setSleepTime(1000); + site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/").setSleepTime(0); } return site; } From 0003d350def7faba2dd25ca909b839223b4c606d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 17:11:07 +0800 Subject: [PATCH 73/81] delete --- .../webmagic/samples/DianpingProcessor.java | 44 ------------------- 1 file changed, 44 deletions(-) delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java deleted file mode 100644 index b7e3ee02d..000000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ /dev/null @@ -1,44 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class DianpingProcessor implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").all(); - page.addTargetRequests(requests); - } - - @Override - public Site getSite() { - if (site == null) { - site = Site.me().setDomain("info-search-web361.alpha.dp:8080").addStartUrl("http://info11-search-web361.alpha.dp:8080/search/category/1/0"). - setSleepTime(100). - setUserAgent("I'm a performance tester created by yihua.huang"); - } - return site; - } - - public static void main(String[] args) { - int sleepTime = 0; - if (args.length > 0) { - sleepTime = Integer.parseInt(args[0]); - } - DianpingProcessor dianpingProcessor = new DianpingProcessor(); - dianpingProcessor.getSite().setSleepTime(sleepTime).setRetryTimes(10); - Spider.create(dianpingProcessor).run(); - } -} From d76223eb0912a08417d7316886f7d1c2c597595f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 19:02:41 +0800 Subject: [PATCH 74/81] fix redisScheduler thread problem --- .../webmagic/scheduler/RedisScheduler.java | 42 +++++++++++++++---- .../webmagic/selenium/SeleniumTest.java | 16 ++++++- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 481981dba..575beefc7 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -7,13 +7,17 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.schedular.Scheduler; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + /** * 
使用redis管理url,构建一个分布式的爬虫。
      + * * @author yihua.huang@dianping.com
      * @date: 13-7-25
      * Time: 上午7:07
      */ -public class RedisScheduler implements Scheduler{ +public class RedisScheduler implements Scheduler { private JedisPool pool; @@ -21,7 +25,11 @@ public class RedisScheduler implements Scheduler{ private static final String SET_PREFIX = "set_"; - public RedisScheduler(String host){ + private ReentrantLock lock = new ReentrantLock(); + + private Condition condition = lock.newCondition(); + + public RedisScheduler(String host) { pool = new JedisPool(new JedisPoolConfig(), host); } @@ -29,10 +37,16 @@ public RedisScheduler(String host){ public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); //使用SortedSet进行url去重 - if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){ - //使用List保存队列 - jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl()); - jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl()); + if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { + try { + lock.lock(); + //使用List保存队列 + jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); + jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); + condition.signal(); + } finally { + lock.unlock(); + } } pool.returnResource(jedis); } @@ -40,7 +54,21 @@ public synchronized void push(Request request, Task task) { @Override public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); - String url = jedis.lpop(QUEUE_PREFIX+task.getUUID()); + String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + if (url == null) { + try { + lock.lock(); + while (url == null) { + try { + condition.await(); + url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + } catch (InterruptedException e) { + } + } + } finally { + lock.unlock(); + } + } pool.returnResource(jedis); return new Request(url); } diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java 
b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java index 6cf50c39b..a403b91ce 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java @@ -6,6 +6,11 @@ import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.remote.DesiredCapabilities; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; /** * @author yihua.huang@dianping.com
      @@ -18,7 +23,16 @@ public class SeleniumTest { @Test public void testSelenium() { System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); - WebDriver webDriver = new ChromeDriver(); + Map contentSettings = new HashMap(); + contentSettings.put("images", 2); + + Map preferences = new HashMap(); + preferences.put("profile.default_content_settings", contentSettings); + + DesiredCapabilities caps = DesiredCapabilities.chrome(); + caps.setCapability("chrome.prefs", preferences); + caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); + WebDriver webDriver = new ChromeDriver(caps); webDriver.get("http://huaban.com/"); WebElement webElement = webDriver.findElement(By.xpath("/html")); System.out.println(webElement.getAttribute("outerHTML")); From 2ed275863ae5856d14adc1cdf6811e166656dd62 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 19:12:55 +0800 Subject: [PATCH 75/81] fix redis null pointer exception --- .../webmagic/scheduler/RedisScheduler.java | 36 ++++--------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 575beefc7..382642b7d 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -7,9 +7,6 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.schedular.Scheduler; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; - /** * 使用redis管理url,构建一个分布式的爬虫。
      * @@ -25,10 +22,6 @@ public class RedisScheduler implements Scheduler { private static final String SET_PREFIX = "set_"; - private ReentrantLock lock = new ReentrantLock(); - - private Condition condition = lock.newCondition(); - public RedisScheduler(String host) { pool = new JedisPool(new JedisPoolConfig(), host); } @@ -38,15 +31,9 @@ public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); //使用SortedSet进行url去重 if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { - try { - lock.lock(); - //使用List保存队列 - jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); - jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); - condition.signal(); - } finally { - lock.unlock(); - } + //使用List保存队列 + jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); + jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); } pool.returnResource(jedis); } @@ -55,21 +42,10 @@ public synchronized void push(Request request, Task task) { public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); - if (url == null) { - try { - lock.lock(); - while (url == null) { - try { - condition.await(); - url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); - } catch (InterruptedException e) { - } - } - } finally { - lock.unlock(); - } - } pool.returnResource(jedis); + if (url==null){ + return null; + } return new Request(url); } } From 58874c4cd3b685773777ba72ab14b8b10f7e5a19 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 21:22:57 +0800 Subject: [PATCH 76/81] add list output support --- .../webmagic/pipeline/ConsolePipeline.java | 9 ++++ .../webmagic/pipeline/FilePipeline.java | 22 ++++++--- .../samples/InfoQMiniBookProcessor.java | 49 +++++++++++++++++++ 3 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 8be5fabb6..97470e04f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -20,6 +20,15 @@ public void process(ResultItems resultItems,Task task) { } System.out.println("get page: "+resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { + if (entry.getValue() instanceof Iterable) { + Iterable value = (Iterable) entry.getValue(); + System.out.println(entry.getKey() + ":"); + for (Object o : value) { + System.out.println(o); + } + } else { + System.out.println(entry.getKey() + ":\t" + entry.getValue()); + } System.out.println(entry.getKey()+":\t"+entry.getValue()); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index cbce8324a..01f8d8b82 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -13,9 +13,10 @@ /** * 持久化到文件的接口。 + * * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 下午6:28 + * Date: 13-4-21 + * Time: 下午6:28 */ public class FilePipeline implements Pipeline { @@ -32,6 +33,7 @@ public FilePipeline() { /** * 新建一个FilePipeline + * * @param path 文件保存路径 */ public FilePipeline(String path) { @@ -45,18 +47,26 @@ public void process(ResultItems resultItems, Task task) { if (!file.exists()) { file.mkdirs(); } - if (resultItems.isSkip()){ + if (resultItems.isSkip()) { return; } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html")); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { - printWriter.println(entry.getKey()+":\t"+entry.getValue()); + if (entry.getValue() instanceof Iterable) { + Iterable value = (Iterable) entry.getValue(); + printWriter.println(entry.getKey() + ":"); + for (Object o : value) { + printWriter.println(o); + } + } else { + printWriter.println(entry.getKey() + ":\t" + entry.getValue()); + } } printWriter.close(); } catch (IOException e) { - logger.warn("write file error",e); + logger.warn("write file error", e); } } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java new file mode 100644 index 000000000..b43c3c569 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.samples; + +import org.apache.commons.collections.CollectionUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import 
us.codecraft.webmagic.scheduler.RedisScheduler; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
      + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class InfoQMiniBookProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all()); + List all = page.getHtml().links().regex(".*\\.pdf").all(); + if (CollectionUtils.isNotEmpty(all)) { + page.putField("pdf", all); + } else { + page.getResultItems().setSkip(true); + } + } + + @Override + public Site getSite() { + if (site == null) { + site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new InfoQMiniBookProcessor()) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new FilePipeline("/data/temp/webmagic/")) + .thread(5) + .run(); + } +} From 5a7d90db9b264a501a0c320f41c315e442ab7706 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 29 Jul 2013 10:59:23 +0800 Subject: [PATCH 77/81] fix a spider init problem --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index cd092a308..2a8b78fb1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -270,7 +270,9 @@ public Spider thread(int threadNum) { if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } - downloader = new HttpClientDownloader(threadNum); + if (downloader==null || downloader instanceof HttpClientDownloader){ + downloader = new 
HttpClientDownloader(threadNum); + } if (threadNum == 1) { return this; } From 5388a4a4f692055b9d9dcf004e1b723923917b3f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 29 Jul 2013 18:00:12 +0800 Subject: [PATCH 78/81] add selenium download timeout --- .../downloader/SeleniumDownloader.java | 22 ++++++++++++++++++- .../downloader/SeleniumDownloaderTest.java | 3 ++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index b4dd3720b..54e3c9c0c 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -20,18 +20,22 @@ /** * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
      * 需要下载Selenium driver支持。
      + * * @author yihua.huang@dianping.com
      * @date: 13-7-26
      * Time: 下午1:37
      */ -public class SeleniumDownloader implements Downloader,Destroyable { +public class SeleniumDownloader implements Downloader, Destroyable { private WebDriverPool webDriverPool; private Logger logger = Logger.getLogger(getClass()); + private int sleepTime = 0; + /** * 新建 + * * @param chromeDriverPath */ public SeleniumDownloader(String chromeDriverPath) { @@ -44,6 +48,16 @@ public SeleniumDownloader(String chromeDriverPath, int poolSize) { webDriverPool = new WebDriverPool(poolSize); } + /** + * set sleep time to wait until load success + * @param sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + @Override public Page download(Request request, Task task) { WebDriver webDriver; @@ -55,6 +69,11 @@ public Page download(Request request, Task task) { } logger.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); if (site.getCookies() != null) { @@ -65,6 +84,7 @@ public Page download(Request request, Task task) { } WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); + // Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java index 615ad8674..4aa99195e 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java +++ 
b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java @@ -14,7 +14,7 @@ */ public class SeleniumDownloaderTest { - private String chromeDriverPath = ""; + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; @Ignore("need chrome driver") @Test @@ -37,4 +37,5 @@ public Site getSite() { } System.out.println(System.currentTimeMillis() - time1); } + } From bd2ced87d7a782d2310e946de8ce8e493898f3a5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 29 Jul 2013 20:01:44 +0800 Subject: [PATCH 79/81] =?UTF-8?q?=E4=B8=BAdownloader=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=BA=86=E4=B8=80=E4=B8=AA=E6=96=B0=E6=96=B9=E6=B3=95=EF=BC=8C?= =?UTF-8?q?=E5=8F=AF=E8=AE=BE=E7=BD=AE=E7=BA=BF=E7=A8=8B=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/Spider.java | 17 +++++++------ .../webmagic/downloader/Downloader.java | 13 ++++++++-- .../webmagic/downloader/FileDownloader.java | 5 ++++ .../downloader/HttpClientDownloader.java | 13 ++++------ .../downloader/SeleniumDownloader.java | 25 +++++++++++++------ 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 2a8b78fb1..a25fd024c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -58,6 +58,8 @@ public class Spider implements Runnable, Task { private ExecutorService executorService; + private int threadNum = 1; + private AtomicInteger stat = new AtomicInteger(STAT_INIT); private final static int STAT_INIT = 0; @@ -144,6 +146,10 @@ protected void checkComponent() { if (downloader == null) { this.downloader = new HttpClientDownloader(); } + if (pipelines.isEmpty()) { + pipelines.add(new ConsolePipeline()); + } + 
downloader.setThread(threadNum); } @Override @@ -158,9 +164,6 @@ public void run() { } } Request request = scheduler.poll(this); - if (pipelines.isEmpty()) { - pipelines.add(new ConsolePipeline()); - } //singel thread if (executorService == null) { while (request != null) { @@ -211,9 +214,9 @@ private void destroy() { } } - private void destroyEach(Object object){ + private void destroyEach(Object object) { if (object instanceof Destroyable) { - ((Destroyable)object).destroy(); + ((Destroyable) object).destroy(); } } @@ -267,12 +270,10 @@ public void runAsync() { */ public Spider thread(int threadNum) { checkIfNotRunning(); + this.threadNum = threadNum; if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } - if (downloader==null || downloader instanceof HttpClientDownloader){ - downloader = new HttpClientDownloader(threadNum); - } if (threadNum == 1) { return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index c431fc3b4..9a7f59a3a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -6,9 +6,10 @@ /** * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
      + * * @author code4crafter@gmail.com
      - * Date: 13-4-21 - * Time: 下午12:14 + * Date: 13-4-21 + * Time: 下午12:14 */ public interface Downloader { @@ -20,4 +21,12 @@ public interface Downloader { * @return page */ public Page download(Request request, Task task); + + /** + * 设置线程数,多线程程序一般需要Downloader支持
      + * 如果不考虑多线程的可以不实现这个方法
      + * + * @param thread 线程数量 + */ + public void setThread(int thread); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java index d22bf0815..722a2eb71 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -67,6 +67,11 @@ public Page download(Request request, Task task) { return page; } + @Override + public void setThread(int thread) { + + } + private String getHtml(BufferedReader bufferedReader) throws IOException { String line; StringBuilder htmlBuilder= new StringBuilder(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d76341986..7956cd1ef 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -32,14 +32,6 @@ public class HttpClientDownloader implements Downloader { private int poolSize; - public HttpClientDownloader(int poolSize) { - this.poolSize = poolSize; - } - - public HttpClientDownloader() { - this(5); - } - @Override public Page download(Request request, Task task) { Site site = task.getSite(); @@ -90,6 +82,11 @@ public Page download(Request request, Task task) { return null; } + @Override + public void setThread(int thread) { + poolSize=thread; + } + private void handleGzip(HttpResponse httpResponse) { Header ceheader = httpResponse.getEntity().getContentEncoding(); if (ceheader != null) { diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java 
b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 54e3c9c0c..76ac0508d 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -27,12 +27,14 @@ */ public class SeleniumDownloader implements Downloader, Destroyable { - private WebDriverPool webDriverPool; + private volatile WebDriverPool webDriverPool; private Logger logger = Logger.getLogger(getClass()); private int sleepTime = 0; + private int poolSize = 1; + /** * 新建 * @@ -40,16 +42,11 @@ public class SeleniumDownloader implements Downloader, Destroyable { */ public SeleniumDownloader(String chromeDriverPath) { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); - webDriverPool = new WebDriverPool(); - } - - public SeleniumDownloader(String chromeDriverPath, int poolSize) { - System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); - webDriverPool = new WebDriverPool(poolSize); } /** * set sleep time to wait until load success + * * @param sleepTime * @return this */ @@ -60,6 +57,7 @@ public SeleniumDownloader setSleepTime(int sleepTime) { @Override public Page download(Request request, Task task) { + checkInit(); WebDriver webDriver; try { webDriver = webDriverPool.get(); @@ -93,6 +91,19 @@ public Page download(Request request, Task task) { return page; } + private void checkInit() { + if (webDriverPool == null) { + synchronized (this){ + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + @Override public void destroy() { webDriverPool.closeAll(); From 878b121d4a7fef766bcd230700528a016aa9d97b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 29 Jul 2013 20:05:44 +0800 Subject: [PATCH 80/81] add comments 
for selenium driver --- .../selenium/downloader/SeleniumDownloader.java | 1 - .../webmagic/selenium/downloader/WebDriverPool.java | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 76ac0508d..1b689d4d2 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -82,7 +82,6 @@ public Page download(Request request, Task task) { } WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); - // Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java index 039cef98e..faed8d63a 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java @@ -27,8 +27,16 @@ class WebDriverPool { private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); + /** + * store webDrivers created + */ private List webDriverList = Collections.synchronizedList(new ArrayList()); + /** + * store webDrivers available + */ + private BlockingDeque innerQueue = new LinkedBlockingDeque(); + public WebDriverPool(int capacity) { this.capacity = capacity; } @@ -37,8 +45,6 @@ public WebDriverPool() { 
this(DEFAULT_CAPACITY); } - private BlockingDeque innerQueue = new LinkedBlockingDeque(); - public WebDriver get() throws InterruptedException { checkRunning(); WebDriver poll = innerQueue.poll(); From fc008fc9b3f18fce159bc94e3b97054a469ca940 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 30 Jul 2013 09:36:00 +0800 Subject: [PATCH 81/81] dep --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 75bea5287..e6212c73d 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载 ###Get Started webmagic定制的核心是PageProcessor接口。 + +项目使用maven托管,如果没用maven的可以去[http://git.oschina.net/flashsword20/webmagic-bin](http://git.oschina.net/flashsword20/webmagic-bin)库下载依赖包(这个仓库代码没有实时同步更新,不过依赖应该不会有变化)。 例如,我们要实现一个简单的通用爬虫SimplePageProcessor,代码如下: @@ -73,7 +75,6 @@ webmagic定制的核心是PageProcessor接口。 Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); - ### 示例 webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。