|
| 1 | +import org.dom4j.DocumentHelper; |
| 2 | +import org.jsoup.Jsoup; |
| 3 | +import org.jsoup.nodes.Document; |
| 4 | +import org.jsoup.nodes.Element; |
| 5 | +import org.jsoup.select.Elements; |
| 6 | + |
| 7 | +import java.io.File; |
| 8 | +import java.io.IOException; |
| 9 | +import java.net.SocketTimeoutException; |
| 10 | +import java.util.*; |
| 11 | + |
| 12 | +public class GetAnswers { |
| 13 | + |
| 14 | + Map<String,String> categoryLink;//获取八个"类别"的名字和链接 |
| 15 | + |
| 16 | + public static void main(String[] args) { |
| 17 | + new GetAnswers().launch(); |
| 18 | + } |
| 19 | + |
| 20 | + public void launch() { |
| 21 | + getCategoryLink(); |
| 22 | + getAllAnswer(); |
| 23 | + } |
| 24 | + |
| 25 | + private void getAllAnswer() { |
| 26 | + Set<String> categoryName = categoryLink.keySet(); |
| 27 | + for (String name:categoryName) { |
| 28 | + if (name.equals("呼吸内科") || name.equals("内分泌科") || name.equals("肾内科") || name.equals("消化内科") || name.equals("血液科") || name.equals("风湿科")) { |
| 29 | + continue; |
| 30 | + } |
| 31 | + String webAddress = categoryLink.get(name);//“页”的链接 |
| 32 | + |
| 33 | + //链接地址不为空就表明还有下一页(到最后一页后设置为空) |
| 34 | + //循环遍历每一页 |
| 35 | + int pageCount = 1;//记录抓取的页数 |
| 36 | + int allPageCount = 0;//记录总共需要抓取的网页页数 |
| 37 | + |
| 38 | + try { |
| 39 | + Document tempDom = Jsoup.connect(webAddress).userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get(); |
| 40 | + allPageCount = Integer.parseInt(tempDom.select(".pager-last").attr("href").split("=")[1].split("#")[0])/25 + 1; |
| 41 | + } catch (IOException e) { |
| 42 | + e.printStackTrace(); |
| 43 | + } |
| 44 | + |
| 45 | + for (int i = 1;i <= allPageCount;i++) { |
| 46 | + List<String> answerAddressList = new LinkedList<String>();//先获取一页之内每一个有回答的问题的链接,存储在这个List里 |
| 47 | + Document dom = null; |
| 48 | + try { |
| 49 | + dom = Jsoup.connect(webAddress + "?pn=" + (i - 1) * 25 + "#list").userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get(); |
| 50 | + Elements allQuestion = dom.select(".question-list").select(".question-item"); |
| 51 | + for (Element question:allQuestion) { |
| 52 | + if (!question.select(".title-line").select(".question-answer-num").text().equals("0回答")) { |
| 53 | + answerAddressList.add(question.select(".title-line").select(".title-container").select(".question-title").attr("abs:href")); |
| 54 | + } |
| 55 | + } |
| 56 | + } catch (SocketTimeoutException e) { |
| 57 | + System.out.println("连接超时:" + webAddress); |
| 58 | + } catch (IOException e) { |
| 59 | + e.printStackTrace(); |
| 60 | + System.out.println("意外错误:" + webAddress); |
| 61 | + } |
| 62 | + //上面这部分获取了一页上所有问题的链接 |
| 63 | + |
| 64 | + //下面就进每一个问题的页面,抓取信息放到dom4j的Document中,最后用多线程写入文件 |
| 65 | + Map<String, org.dom4j.Document> answerDom4jDocument = new HashMap<String, org.dom4j.Document>();//问题题目对应题目的document(题目是文件的文件名) |
| 66 | + for (String answerAddress:answerAddressList) { |
| 67 | + //System.out.println("正在抓[" + name + "]类别下的" + answerAddress); |
| 68 | + try { |
| 69 | + Document answerDom = Jsoup.connect(answerAddress).userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get(); |
| 70 | + String type = name; |
| 71 | + String title = answerDom.title(); |
| 72 | + String answerNum = answerDom.select("#wgt-answers").select(".hd").select("h2").text().split("条")[0]; |
| 73 | + |
| 74 | + org.dom4j.Document document = DocumentHelper.createDocument();//这个问题的dom树 |
| 75 | + answerDom4jDocument.put(title,document); |
| 76 | + org.dom4j.Element root = document.addElement("root"); |
| 77 | + root.addElement("question").addText(title).addAttribute("type",name); |
| 78 | + org.dom4j.Element answers = root.addElement("answers").addAttribute("number",answerNum); |
| 79 | + |
| 80 | + //答案有三种【题主选择的最佳答案,网友选择的最佳答案,其他答案】 |
| 81 | + |
| 82 | + Elements allAnswer = answerDom.select(".bd.answer"); |
| 83 | + //分别处理每一页的所有答案 |
| 84 | + for (Element answer:allAnswer) { |
| 85 | + String text = answer.select(".answer-text").text(); |
| 86 | + String username = answer.select(".line.info.f-aid").select(".user-name").text(); |
| 87 | + if (username.equals("")) { |
| 88 | + username = answer.select(".line.info.f-aid").select(".mavin-name").text(); |
| 89 | + } |
| 90 | + String grade = answer.select(".line.info.f-aid").text().split(" ")[answer.select(".line.info.f-aid").text().split(" ").length - 1]; |
| 91 | + if (grade.equals("最快回答")) { |
| 92 | + grade = answer.select(".line.info.f-aid").text().split(" ")[answer.select(".line.info.f-aid").text().split(" ").length - 2]; |
| 93 | + } |
| 94 | + |
| 95 | + //不是“*级”那就是一个专家的称号 |
| 96 | + String author = "null"; |
| 97 | + if (!grade.contains("级")) { |
| 98 | + author = grade; |
| 99 | + grade = answer.select(".line.info.f-aid").select(".f-orange.f-yahei.ml-5").select("span").text(); |
| 100 | + } |
| 101 | + |
| 102 | + String support = answer.select(".line.content").select(".grid-r.f-aid").select(".evaluate").attr("data-evaluate"); |
| 103 | + String unsupport = answer.select(".line.content").select(".grid-r.f-aid").select(".evaluate.evaluate-bad").attr("data-evaluate"); |
| 104 | + |
| 105 | + org.dom4j.Element ans = answers.addElement("answer").addAttribute("username",username).addAttribute("grade",grade).addAttribute("author",author); |
| 106 | + ans.addElement("text").addText(text); |
| 107 | + ans.addElement("support").addText(support); |
| 108 | + ans.addElement("unsupport").addText(unsupport); |
| 109 | + |
| 110 | + if (answer.hasClass("wgt-replyer-best")) { |
| 111 | + ans.addElement("best_answer").addText("yes"); |
| 112 | + } else { |
| 113 | + ans.addElement("best_answer").addText("no"); |
| 114 | + } |
| 115 | + } |
| 116 | + |
| 117 | + |
| 118 | + } catch (SocketTimeoutException e) { |
| 119 | + System.out.println("连接超时:" + answerAddress); |
| 120 | + } catch (IOException e) { |
| 121 | + e.printStackTrace(); |
| 122 | + System.out.println("意外错误:" + answerAddress); |
| 123 | + } |
| 124 | + } |
| 125 | + |
| 126 | + Set<String> answerTitle = answerDom4jDocument.keySet(); |
| 127 | + //多线程写入文件 |
| 128 | + for (String title:answerTitle) { |
| 129 | + //分类目录不存在时创建文件夹 |
| 130 | + if (!new File("/home/geekgao/medical_question/" + name).exists()) { |
| 131 | + new File("/home/geekgao/medical_question/" + name).mkdir(); |
| 132 | + } |
| 133 | + |
| 134 | + new WriteAnswersToFile("/home/geekgao/medical_question/" + name + "/" + System.currentTimeMillis() + ".xml",answerDom4jDocument.get(title)).start(); |
| 135 | + } |
| 136 | + |
| 137 | + System.out.println("[" + name + "]类别第" + pageCount++ +"页已写入文件."); |
| 138 | + |
| 139 | + } |
| 140 | + } |
| 141 | + } |
| 142 | + |
| 143 | + public GetAnswers() { |
| 144 | + categoryLink = new HashMap<String, String>(); |
| 145 | + } |
| 146 | + |
| 147 | + private void getCategoryLink() { |
| 148 | + try { |
| 149 | + Document dom = Jsoup.connect("http://zhidao.baidu.com/browse/790").userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 QIHU 360EE").timeout(5000).get(); |
| 150 | + Element categorys = dom.select(".category-list").first(); |
| 151 | + Elements allLi = categorys.select("li"); |
| 152 | + |
| 153 | + for (Element li:allLi) { |
| 154 | + categoryLink.put(li.text(), "http://zhidao.baidu.com" + li.select("a").attr("href")); |
| 155 | + } |
| 156 | + |
| 157 | + } catch (SocketTimeoutException e) { |
| 158 | + System.out.println("连接超时:http://zhidao.baidu.com/browse/790"); |
| 159 | + } catch (IOException e) { |
| 160 | + e.printStackTrace(); |
| 161 | + System.out.println("连接超时:http://zhidao.baidu.com/browse/790"); |
| 162 | + } |
| 163 | + } |
| 164 | +} |
0 commit comments