Skip to content

Commit c31f1bf

Browse files
committed
百度知道医学知识的抓取并存储
1 parent dc89915 commit c31f1bf

File tree

2 files changed

+201
-0
lines changed

2 files changed

+201
-0
lines changed

medical_question/GetAnswers.java

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import org.dom4j.DocumentHelper;
2+
import org.jsoup.Jsoup;
3+
import org.jsoup.nodes.Document;
4+
import org.jsoup.nodes.Element;
5+
import org.jsoup.select.Elements;
6+
7+
import java.io.File;
8+
import java.io.IOException;
9+
import java.net.SocketTimeoutException;
10+
import java.util.*;
11+
12+
public class GetAnswers {
13+
14+
Map<String,String> categoryLink;//获取八个"类别"的名字和链接
15+
16+
public static void main(String[] args) {
17+
new GetAnswers().launch();
18+
}
19+
20+
public void launch() {
21+
getCategoryLink();
22+
getAllAnswer();
23+
}
24+
25+
private void getAllAnswer() {
26+
Set<String> categoryName = categoryLink.keySet();
27+
for (String name:categoryName) {
28+
if (name.equals("呼吸内科") || name.equals("内分泌科") || name.equals("肾内科") || name.equals("消化内科") || name.equals("血液科") || name.equals("风湿科")) {
29+
continue;
30+
}
31+
String webAddress = categoryLink.get(name);//“页”的链接
32+
33+
//链接地址不为空就表明还有下一页(到最后一页后设置为空)
34+
//循环遍历每一页
35+
int pageCount = 1;//记录抓取的页数
36+
int allPageCount = 0;//记录总共需要抓取的网页页数
37+
38+
try {
39+
Document tempDom = Jsoup.connect(webAddress).userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get();
40+
allPageCount = Integer.parseInt(tempDom.select(".pager-last").attr("href").split("=")[1].split("#")[0])/25 + 1;
41+
} catch (IOException e) {
42+
e.printStackTrace();
43+
}
44+
45+
for (int i = 1;i <= allPageCount;i++) {
46+
List<String> answerAddressList = new LinkedList<String>();//先获取一页之内每一个有回答的问题的链接,存储在这个List里
47+
Document dom = null;
48+
try {
49+
dom = Jsoup.connect(webAddress + "?pn=" + (i - 1) * 25 + "#list").userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get();
50+
Elements allQuestion = dom.select(".question-list").select(".question-item");
51+
for (Element question:allQuestion) {
52+
if (!question.select(".title-line").select(".question-answer-num").text().equals("0回答")) {
53+
answerAddressList.add(question.select(".title-line").select(".title-container").select(".question-title").attr("abs:href"));
54+
}
55+
}
56+
} catch (SocketTimeoutException e) {
57+
System.out.println("连接超时:" + webAddress);
58+
} catch (IOException e) {
59+
e.printStackTrace();
60+
System.out.println("意外错误:" + webAddress);
61+
}
62+
//上面这部分获取了一页上所有问题的链接
63+
64+
//下面就进每一个问题的页面,抓取信息放到dom4j的Document中,最后用多线程写入文件
65+
Map<String, org.dom4j.Document> answerDom4jDocument = new HashMap<String, org.dom4j.Document>();//问题题目对应题目的document(题目是文件的文件名)
66+
for (String answerAddress:answerAddressList) {
67+
//System.out.println("正在抓[" + name + "]类别下的" + answerAddress);
68+
try {
69+
Document answerDom = Jsoup.connect(answerAddress).userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get();
70+
String type = name;
71+
String title = answerDom.title();
72+
String answerNum = answerDom.select("#wgt-answers").select(".hd").select("h2").text().split("条")[0];
73+
74+
org.dom4j.Document document = DocumentHelper.createDocument();//这个问题的dom树
75+
answerDom4jDocument.put(title,document);
76+
org.dom4j.Element root = document.addElement("root");
77+
root.addElement("question").addText(title).addAttribute("type",name);
78+
org.dom4j.Element answers = root.addElement("answers").addAttribute("number",answerNum);
79+
80+
//答案有三种【题主选择的最佳答案,网友选择的最佳答案,其他答案】
81+
82+
Elements allAnswer = answerDom.select(".bd.answer");
83+
//分别处理每一页的所有答案
84+
for (Element answer:allAnswer) {
85+
String text = answer.select(".answer-text").text();
86+
String username = answer.select(".line.info.f-aid").select(".user-name").text();
87+
if (username.equals("")) {
88+
username = answer.select(".line.info.f-aid").select(".mavin-name").text();
89+
}
90+
String grade = answer.select(".line.info.f-aid").text().split(" ")[answer.select(".line.info.f-aid").text().split(" ").length - 1];
91+
if (grade.equals("最快回答")) {
92+
grade = answer.select(".line.info.f-aid").text().split(" ")[answer.select(".line.info.f-aid").text().split(" ").length - 2];
93+
}
94+
95+
//不是“*级”那就是一个专家的称号
96+
String author = "null";
97+
if (!grade.contains("级")) {
98+
author = grade;
99+
grade = answer.select(".line.info.f-aid").select(".f-orange.f-yahei.ml-5").select("span").text();
100+
}
101+
102+
String support = answer.select(".line.content").select(".grid-r.f-aid").select(".evaluate").attr("data-evaluate");
103+
String unsupport = answer.select(".line.content").select(".grid-r.f-aid").select(".evaluate.evaluate-bad").attr("data-evaluate");
104+
105+
org.dom4j.Element ans = answers.addElement("answer").addAttribute("username",username).addAttribute("grade",grade).addAttribute("author",author);
106+
ans.addElement("text").addText(text);
107+
ans.addElement("support").addText(support);
108+
ans.addElement("unsupport").addText(unsupport);
109+
110+
if (answer.hasClass("wgt-replyer-best")) {
111+
ans.addElement("best_answer").addText("yes");
112+
} else {
113+
ans.addElement("best_answer").addText("no");
114+
}
115+
}
116+
117+
118+
} catch (SocketTimeoutException e) {
119+
System.out.println("连接超时:" + answerAddress);
120+
} catch (IOException e) {
121+
e.printStackTrace();
122+
System.out.println("意外错误:" + answerAddress);
123+
}
124+
}
125+
126+
Set<String> answerTitle = answerDom4jDocument.keySet();
127+
//多线程写入文件
128+
for (String title:answerTitle) {
129+
//分类目录不存在时创建文件夹
130+
if (!new File("/home/geekgao/medical_question/" + name).exists()) {
131+
new File("/home/geekgao/medical_question/" + name).mkdir();
132+
}
133+
134+
new WriteAnswersToFile("/home/geekgao/medical_question/" + name + "/" + System.currentTimeMillis() + ".xml",answerDom4jDocument.get(title)).start();
135+
}
136+
137+
System.out.println("[" + name + "]类别第" + pageCount++ +"页已写入文件.");
138+
139+
}
140+
}
141+
}
142+
143+
public GetAnswers() {
144+
categoryLink = new HashMap<String, String>();
145+
}
146+
147+
private void getCategoryLink() {
148+
try {
149+
Document dom = Jsoup.connect("http://zhidao.baidu.com/browse/790").userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get();
150+
Element categorys = dom.select(".category-list").first();
151+
Elements allLi = categorys.select("li");
152+
153+
for (Element li:allLi) {
154+
categoryLink.put(li.text(), "http://zhidao.baidu.com" + li.select("a").attr("href"));
155+
}
156+
157+
} catch (SocketTimeoutException e) {
158+
System.out.println("连接超时:http://zhidao.baidu.com/browse/790");
159+
} catch (IOException e) {
160+
e.printStackTrace();
161+
System.out.println("连接超时:http://zhidao.baidu.com/browse/790");
162+
}
163+
}
164+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import org.dom4j.Document;
2+
import org.dom4j.io.OutputFormat;
3+
import org.dom4j.io.XMLWriter;
4+
5+
import java.io.File;
6+
import java.io.FileWriter;
7+
import java.io.IOException;
8+
9+
10+
public class WriteAnswersToFile extends Thread {
11+
12+
private String address;//输出文件到哪个地址
13+
private Document dom;//将这个dom放入文件里存储
14+
15+
/**
16+
*
17+
* @param address 文件将存储到这个地址
18+
* @param dom asd 即将存储到硬盘的xml文件
19+
*/
20+
public WriteAnswersToFile(String address,Document dom) {
21+
this.address = address;
22+
this.dom = dom;
23+
}
24+
25+
public void run() {
26+
OutputFormat outFormat = OutputFormat.createPrettyPrint();
27+
outFormat.setEncoding("UTF-8");
28+
29+
try {
30+
XMLWriter xml = new XMLWriter(new FileWriter(new File(address)),outFormat);
31+
xml.write(dom);
32+
xml.close();
33+
} catch (IOException e) {
34+
e.printStackTrace();
35+
}
36+
}
37+
}

0 commit comments

Comments
 (0)