Skip to content

Commit 2e602c2

Browse files
committed
java简易爬虫项目
1 parent 7f67fcc commit 2e602c2

9 files changed

Lines changed: 218 additions & 0 deletions

File tree

IMOOCSpider/.classpath

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<classpath>
3+
<classpathentry kind="src" path="src"/>
4+
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
5+
<classpathentry kind="output" path="bin"/>
6+
</classpath>

IMOOCSpider/.project

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<projectDescription>
3+
<name>IMOOCSpider</name>
4+
<comment></comment>
5+
<projects>
6+
</projects>
7+
<buildSpec>
8+
<buildCommand>
9+
<name>org.eclipse.jdt.core.javabuilder</name>
10+
<arguments>
11+
</arguments>
12+
</buildCommand>
13+
</buildSpec>
14+
<natures>
15+
<nature>org.eclipse.jdt.core.javanature</nature>
16+
</natures>
17+
</projectDescription>
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
eclipse.preferences.version=1
2+
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3+
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
4+
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5+
org.eclipse.jdt.core.compiler.compliance=1.8
6+
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7+
org.eclipse.jdt.core.compiler.debug.localVariable=generate
8+
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9+
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10+
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11+
org.eclipse.jdt.core.compiler.source=1.8

IMOOCSpider/bin/bean/Imooc.class

2.77 KB
Binary file not shown.

IMOOCSpider/bin/main/Main.class

826 Bytes
Binary file not shown.
2.82 KB
Binary file not shown.

IMOOCSpider/src/bean/Imooc.java

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
package bean;
2+
3+
import java.util.HashMap;
4+
import java.util.Map;
5+
import java.util.regex.Matcher;
6+
import java.util.regex.Pattern;
7+
8+
import spider.Spider;
9+
10+
/**
11+
* 单个问题页面对象
12+
* @author hjy
13+
*
14+
*/
15+
public class Imooc {
16+
public String question;
17+
public String quesUrl;
18+
public String quesDescription;
19+
public Map<String,String> answers;
20+
public String nextUrl;
21+
22+
public Imooc(String url) {
23+
question="";
24+
quesUrl=url;
25+
quesDescription="";
26+
answers = new HashMap<String,String>();
27+
nextUrl="";
28+
29+
// 获取单个问题页面源码
30+
String codeSource = Spider.getSource(url);
31+
32+
// 正则表达式获取question
33+
Pattern pattern=Pattern.compile("js-qa-wenda-title.+?>(.+?)</h1>");
34+
Matcher matcher=pattern.matcher(codeSource);
35+
if(matcher.find()){
36+
question = matcher.group(1);
37+
}
38+
39+
// 正则表达式获取问题描述
40+
pattern=Pattern.compile("js-qa-wenda.+?rich-text\">(.+?)</div>");
41+
matcher=pattern.matcher(codeSource);
42+
if(matcher.find()){
43+
quesDescription = matcher.group(1).replace("<p>", "").replace("</p>", "");
44+
}
45+
46+
// 正则表达式获取答案列表
47+
pattern=Pattern.compile("detail-name.+?>(.+?)</a>.+?answer-content.+?>(.+?)</div>");//获取回答者name
48+
matcher=pattern.matcher(codeSource);
49+
while(matcher.find()){
50+
String answer = matcher.group(2).replace("<p>", "");
51+
answer = answer.replace("</p>", "");
52+
answer = answer.replace("<br />", "");
53+
answers.put(matcher.group(1),answer);
54+
}
55+
56+
// 正则表达式获取下一个url 爬取获取相关问题的url
57+
pattern=Pattern.compile("class=\"relwenda\".+?href=\"(.+?)\".+?</a>");//获取回答者name
58+
matcher=pattern.matcher(codeSource);
59+
while(matcher.find()){
60+
nextUrl="http://www.imooc.com"+matcher.group(1);
61+
// 防止相关文题是本问题无线循环下去
62+
if(!nextUrl.equals(quesUrl)){
63+
break;
64+
}
65+
}
66+
}
67+
68+
@Override
69+
public String toString() {
70+
return "问题为:"+ question +"\n问题地址为:"+quesUrl+
71+
"\n问题的表述为:"+quesDescription+"\n"
72+
+ "回答的内容为:"+answers+"\n指向下一个链接地址为:"+nextUrl+"\n";
73+
}
74+
75+
76+
// 测试方法
77+
public static void main(String[] args) {
78+
String url = "http://www.imooc.com/wenda/detail/345010";
79+
Imooc imooc = new Imooc(url);
80+
System.out.println(imooc);
81+
}
82+
}

IMOOCSpider/src/main/Main.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package main;
2+
3+
import java.util.ArrayList;
4+
import java.util.List;
5+
6+
import bean.Imooc;
7+
8+
public class Main {
9+
10+
public static void main(String[] args) {
11+
String url = "http://www.imooc.com/wenda/detail/345252";
12+
// List<Imooc> imoocs = new ArrayList<Imooc>();
13+
Imooc imooc;
14+
// 限定爬取数量
15+
for(int i=0; i<5;i++){
16+
imooc = new Imooc(url);
17+
// imoocs.add(imooc);
18+
url = imooc.nextUrl;
19+
System.out.println(imooc);
20+
}
21+
}
22+
}

IMOOCSpider/src/spider/Spider.java

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package spider;
2+
3+
import java.io.BufferedReader;
4+
import java.io.InputStreamReader;
5+
import java.net.URL;
6+
import java.net.URLConnection;
7+
import java.util.ArrayList;
8+
import java.util.List;
9+
import java.util.regex.Matcher;
10+
import java.util.regex.Pattern;
11+
12+
import bean.Imooc;
13+
14+
/**
 * Helpers for downloading a web page and extracting question links from it.
 */
public class Spider {

    /** Site root prepended to the relative links captured by {@link #getImoocPage}. */
    private static final String SITE_ROOT = "http://www.imooc.com";

    /** Maximum time to wait for the connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Maximum time to wait for data while reading, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 15000;

    /** Extracts the charset name from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the content behind {@code url} and returns it as one string
     * with the line terminators removed.
     *
     * <p>This method never throws: on any failure the stack trace is printed
     * and whatever was read up to that point (possibly the empty string) is
     * returned.
     *
     * @param url address of the page to fetch
     * @return the page text with all lines concatenated; never {@code null}
     */
    public static String getSource(String url){
        StringBuilder result = new StringBuilder();
        try {
            // Open the connection to the URL.
            URLConnection conn = new URL(url).openConnection();
            // Without timeouts a stalled server would block the crawler forever.
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);

            // try-with-resources closes the reader (and the underlying stream)
            // even when reading fails part-way through.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), charsetOf(conn)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            // Best effort by contract: report the problem and return what we have.
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Picks the charset to decode a response with: the one declared in its
     * Content-Type header, or UTF-8 when none is declared or it is unsupported.
     */
    private static java.nio.charset.Charset charsetOf(URLConnection conn) {
        String contentType = conn.getContentType();
        if (contentType != null) {
            Matcher matcher = CHARSET_PATTERN.matcher(contentType);
            if (matcher.find()) {
                try {
                    return java.nio.charset.Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall through to UTF-8.
                }
            }
        }
        return java.nio.charset.StandardCharsets.UTF_8;
    }

    /**
     * Manual smoke test: downloads the Q&amp;A index page and prints the
     * question URLs found in it.
     */
    public static void main(String[] args) {
        String url = "http://www.imooc.com/wenda";
        String regex = "class=\"content\".+?href=\"(.+?)\".+?</a>";
        String result = getSource(url);
        List<String> wendaUrl = getImoocPage(result,regex);
        System.out.println(wendaUrl);
    }

    /**
     * Collects the question-page URLs found in a page.
     *
     * @param quesSource page source to scan; {@code null} is treated as empty
     * @param regex regular expression whose first capturing group is the
     *              site-relative link of a question page
     * @return absolute URLs in order of appearance; empty when nothing matches
     */
    public static List<String> getImoocPage(String quesSource,String regex){
        List<String> quesUrl = new ArrayList<String>();
        if (quesSource == null) {
            return quesUrl;
        }
        Matcher matcher = Pattern.compile(regex).matcher(quesSource);
        while (matcher.find()) {
            quesUrl.add(SITE_ROOT + matcher.group(1));
        }
        return quesUrl;
    }

}

0 commit comments

Comments
 (0)