|
| 1 | +package bean; |
| 2 | + |
| 3 | +import java.util.HashMap; |
| 4 | +import java.util.Map; |
| 5 | +import java.util.regex.Matcher; |
| 6 | +import java.util.regex.Pattern; |
| 7 | + |
| 8 | +import spider.Spider; |
| 9 | + |
| 10 | +/** |
| 11 | + * 单个问题页面对象 |
| 12 | + * @author hjy |
| 13 | + * |
| 14 | + */ |
| 15 | +public class Imooc { |
| 16 | + public String question; |
| 17 | + public String quesUrl; |
| 18 | + public String quesDescription; |
| 19 | + public Map<String,String> answers; |
| 20 | + public String nextUrl; |
| 21 | + |
| 22 | + public Imooc(String url) { |
| 23 | + question=""; |
| 24 | + quesUrl=url; |
| 25 | + quesDescription=""; |
| 26 | + answers = new HashMap<String,String>(); |
| 27 | + nextUrl=""; |
| 28 | + |
| 29 | +// 获取单个问题页面源码 |
| 30 | + String codeSource = Spider.getSource(url); |
| 31 | + |
| 32 | +// 正则表达式获取question |
| 33 | + Pattern pattern=Pattern.compile("js-qa-wenda-title.+?>(.+?)</h1>"); |
| 34 | + Matcher matcher=pattern.matcher(codeSource); |
| 35 | + if(matcher.find()){ |
| 36 | + question = matcher.group(1); |
| 37 | + } |
| 38 | + |
| 39 | +// 正则表达式获取问题描述 |
| 40 | + pattern=Pattern.compile("js-qa-wenda.+?rich-text\">(.+?)</div>"); |
| 41 | + matcher=pattern.matcher(codeSource); |
| 42 | + if(matcher.find()){ |
| 43 | + quesDescription = matcher.group(1).replace("<p>", "").replace("</p>", ""); |
| 44 | + } |
| 45 | + |
| 46 | +// 正则表达式获取答案列表 |
| 47 | + pattern=Pattern.compile("detail-name.+?>(.+?)</a>.+?answer-content.+?>(.+?)</div>");//获取回答者name |
| 48 | + matcher=pattern.matcher(codeSource); |
| 49 | + while(matcher.find()){ |
| 50 | + String answer = matcher.group(2).replace("<p>", ""); |
| 51 | + answer = answer.replace("</p>", ""); |
| 52 | + answer = answer.replace("<br />", ""); |
| 53 | + answers.put(matcher.group(1),answer); |
| 54 | + } |
| 55 | + |
| 56 | +// 正则表达式获取下一个url 爬取获取相关问题的url |
| 57 | + pattern=Pattern.compile("class=\"relwenda\".+?href=\"(.+?)\".+?</a>");//获取回答者name |
| 58 | + matcher=pattern.matcher(codeSource); |
| 59 | + while(matcher.find()){ |
| 60 | + nextUrl="http://www.imooc.com"+matcher.group(1); |
| 61 | +// 防止相关文题是本问题无线循环下去 |
| 62 | + if(!nextUrl.equals(quesUrl)){ |
| 63 | + break; |
| 64 | + } |
| 65 | + } |
| 66 | + } |
| 67 | + |
| 68 | + @Override |
| 69 | + public String toString() { |
| 70 | + return "问题为:"+ question +"\n问题地址为:"+quesUrl+ |
| 71 | + "\n问题的表述为:"+quesDescription+"\n" |
| 72 | + + "回答的内容为:"+answers+"\n指向下一个链接地址为:"+nextUrl+"\n"; |
| 73 | + } |
| 74 | + |
| 75 | + |
| 76 | +// 测试方法 |
| 77 | + public static void main(String[] args) { |
| 78 | + String url = "http://www.imooc.com/wenda/detail/345010"; |
| 79 | + Imooc imooc = new Imooc(url); |
| 80 | + System.out.println(imooc); |
| 81 | + } |
| 82 | +} |
0 commit comments