Skip to content

Commit 0ada4cb

Browse files
committed
爬虫 (web crawler)
1 parent 69af7f7 commit 0ada4cb

File tree

4 files changed

+219
-0
lines changed

4 files changed

+219
-0
lines changed

crawler/pom.xml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven module descriptor for the "crawler" sub-module.
  Dependency versions for okhttp and jsoup are intentionally omitted here:
  they are managed by the parent POM's <dependencyManagement> section.
-->
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>xiaomo</artifactId>
        <groupId>info.xiaomo</groupId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>crawler</artifactId>

    <dependencies>
        <!-- Shared project core module. -->
        <dependency>
            <groupId>info.xiaomo</groupId>
            <artifactId>core</artifactId>
            <version>1.0.0-SNAPSHOT</version>
        </dependency>
        <!-- HTTP client used for fetching pages (version from parent). -->
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
        </dependency>
        <!-- HTML parser used to extract links (version from parent). -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Packages the module as an executable Spring Boot jar. -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package info.xiaomo.crawler;
2+
3+
import org.springframework.boot.SpringApplication;
4+
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
5+
import org.springframework.context.annotation.ComponentScan;
6+
import org.springframework.context.annotation.Configuration;
7+
8+
/**
9+
* 把今天最好的表现当作明天最新的起点..~
10+
* いま 最高の表現 として 明日最新の始発..~
11+
* Today the best performance as tomorrow newest starter!
12+
* Created by IntelliJ IDEA.
13+
*
14+
* @author: xiaomo
15+
* @github: https://github.com/qq83387856
16+
* @email: hupengbest@163.com
17+
* @QQ_NO: 83387856
18+
* @Date: 2016/4/1 15:38
19+
* @Copyright(©) 2015 by xiaomo.
20+
**/
21+
@Configuration
22+
@EnableAutoConfiguration
23+
@ComponentScan("info.xiaomo")
24+
public class CrawlerMain {
25+
public static void main(String[] args) throws Exception {
26+
SpringApplication.run(CrawlerMain.class, args);
27+
}
28+
29+
}
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
package info.xiaomo.crawler.spider;

import okhttp3.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * A small multi-threaded web crawler.
 *
 * <p>Seed URLs are put on a shared {@link BlockingQueue}; worker threads drain
 * the queue, fetch each page over OkHttp (with an on-disk HTTP cache), parse
 * it with Jsoup, and enqueue every discovered {@code a[href]} link. A per-host
 * counter caps how many pages are fetched from any single host.
 *
 * <p>Thread-safety: {@code fetchedUrls}, {@code queue} and {@code hostnames}
 * are all concurrent/synchronized structures shared by the workers.
 */
public class Crawler {

    private static final Logger LOGGER = LoggerFactory.getLogger(Crawler.class);

    /** Maximum number of pages fetched from any single host. */
    private static final int MAX_FETCHES_PER_HOST = 100;

    /** On-disk HTTP cache size: 100 MiB. */
    private static final long CACHE_BYTE_COUNT = 1024L * 1024 * 100;

    /** Idle time after which a worker assumes the crawl is drained and exits. */
    private static final long QUEUE_POLL_TIMEOUT_SECONDS = 10;

    /** URLs already fetched (insertion-ordered, synchronized). */
    private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(new LinkedHashSet<HttpUrl>());

    /** Frontier of URLs still to fetch. */
    private final BlockingQueue<HttpUrl> queue = new LinkedBlockingQueue<>();

    /** Per-host fetch counters enforcing MAX_FETCHES_PER_HOST. */
    private final ConcurrentMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();

    private OkHttpClient client = null;

    private Crawler() {
        init();
    }

    private static Crawler getInstance() {
        return CrawlerHolder.INSTANCE;
    }

    public static void main(String[] args) throws ExecutionException, InterruptedException {
        String[] urls = {"https://www.baidu.com/"};
        List<Future<String>> results = Crawler.getInstance().initUrl(urls).parallelDrainQueue(3);
        for (Future<String> future : results) {
            System.out.println(future.get());
        }
    }

    /**
     * Seeds the frontier with the given URLs.
     *
     * @param urls absolute URLs to start crawling from
     * @return this crawler, for chaining
     */
    private Crawler initUrl(String[] urls) {
        for (String url : urls) {
            queue.add(HttpUrl.parse(url));
        }
        return this;
    }

    /** Builds the OkHttp client with an on-disk cache. */
    private void init() {
        // BUGFIX: the cache directory was hard-coded to "C:\\test" (Windows-only);
        // use a subdirectory of the platform temp dir instead.
        File dir = new File(System.getProperty("java.io.tmpdir"), "crawler-cache");
        Cache cache = new Cache(dir, CACHE_BYTE_COUNT);
        client = new OkHttpClient.Builder().cache(cache).build();
    }

    /**
     * Starts {@code threadCount} workers that drain the frontier concurrently.
     *
     * @param threadCount number of worker threads
     * @return one Future per worker; each completes (with {@code null}) when
     *         the worker has seen the queue stay empty for the poll timeout
     */
    public List<Future<String>> parallelDrainQueue(int threadCount) {
        ExecutorService executor = Executors.newFixedThreadPool(threadCount);
        List<Future<String>> results = new ArrayList<>(threadCount);
        for (int i = 0; i < threadCount; i++) {
            Future<String> future = executor.submit(new Callable<String>() {
                @Override
                public String call() throws Exception {
                    try {
                        drainQueue();
                    } catch (Exception e) {
                        // BUGFIX: was e.printStackTrace(); route through the logger.
                        LOGGER.error("crawler worker failed", e);
                    }
                    return null;
                }
            });
            results.add(future);
        }
        // BUGFIX: the executor was never shut down, so its non-daemon threads kept
        // the JVM alive. shutdown() lets already-submitted workers finish normally.
        executor.shutdown();
        return results;
    }

    /**
     * Fetches URLs from the frontier until it stays empty for the poll timeout.
     *
     * @throws InterruptedException if the worker is interrupted while waiting
     */
    private void drainQueue() throws InterruptedException {
        // BUGFIX: queue.take() blocks forever once the frontier empties, so workers
        // (and main's future.get()) never returned. poll(timeout) lets a worker exit
        // after the queue has been quiet for QUEUE_POLL_TIMEOUT_SECONDS.
        for (HttpUrl url; (url = queue.poll(QUEUE_POLL_TIMEOUT_SECONDS, TimeUnit.SECONDS)) != null; ) {
            // Set.add returns false for duplicates — skip already-fetched URLs.
            if (!fetchedUrls.add(url)) {
                continue;
            }
            try {
                fetch(url);
            } catch (IOException e) {
                // One bad page must not kill the worker; log and move on.
                LOGGER.warn("fetch failed for url {}", url, e);
            }
        }
    }

    /**
     * Fetches one page, logs the result, and enqueues all links found in it.
     * Non-200 and non-HTML responses are discarded.
     *
     * @param url the page to fetch
     * @throws IOException on network failure
     */
    private void fetch(HttpUrl url) throws IOException {
        // Atomically obtain (or create) the counter for this host.
        AtomicInteger hostnameCount = new AtomicInteger();
        AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
        if (previous != null) {
            hostnameCount = previous;
        }
        // Enforce the per-host cap so one site cannot monopolize the crawl.
        if (hostnameCount.incrementAndGet() > MAX_FETCHES_PER_HOST) {
            return;
        }

        Request request = new Request.Builder().url(url).build();
        Response response = client.newCall(request).execute();
        // BUGFIX: close the body on every path (was closed only in some branches),
        // so the connection is always released back to the pool.
        try {
            String responseSource = response.networkResponse() != null
                    ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
                    : "(cache)";
            int responseCode = response.code();

            LOGGER.info("ThreadName:【{}】,ResponseCode:【{}】,URL:【{}】,ResponseSource:【{}】",
                    Thread.currentThread().getName(), responseCode, url, responseSource);

            String contentType = response.header("Content-Type");
            if (responseCode != 200 || contentType == null) {
                return;
            }
            MediaType mediaType = MediaType.parse(contentType);
            if (mediaType == null || !mediaType.subtype().equalsIgnoreCase("html")) {
                return;
            }

            // Extract every a[href], resolve it against the final request URL
            // (so redirects are honored), and push it onto the frontier.
            Document document = Jsoup.parse(response.body().string(), url.toString());
            for (Element element : document.select("a[href]")) {
                String href = element.attr("href");
                HttpUrl link = response.request().url().resolve(href);
                if (link != null) {
                    queue.add(link);
                }
            }
        } finally {
            response.body().close();
        }
    }

    /** Lazy-initialization holder for the singleton instance. */
    private static class CrawlerHolder {
        private static final Crawler INSTANCE = new Crawler();
    }
}

pom.xml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
<module>multipleSource</module>
3737
<!-- 数据接口 -->
3838
<module>api</module>
39+
<module>crawler</module>
3940
</modules>
4041
<packaging>pom</packaging>
4142

@@ -91,6 +92,8 @@
9192
<springfox-swagger-ui.version>2.2.2</springfox-swagger-ui.version>
9293
<!-- mybatis -->
9394
<mybatis.version>1.1.1</mybatis.version>
95+
<jsoup.version>1.9.2</jsoup.version>
96+
<okhttp.version>3.4.1</okhttp.version>
9497
</properties>
9598

9699

@@ -172,6 +175,16 @@
172175
<artifactId>mybatis-spring-boot-starter</artifactId>
173176
<version>${mybatis.version}</version>
174177
</dependency>
178+
<dependency>
179+
<groupId>com.squareup.okhttp3</groupId>
180+
<artifactId>okhttp</artifactId>
181+
<version>${okhttp.version}</version>
182+
</dependency>
183+
<dependency>
184+
<groupId>org.jsoup</groupId>
185+
<artifactId>jsoup</artifactId>
186+
<version>${jsoup.version}</version>
187+
</dependency>
175188
</dependencies>
176189
</dependencyManagement>
177190

0 commit comments

Comments
 (0)