/**
 * Queues a single URL to be fetched.
 * <p>
 * Blank strings, the bare {@code "#"} anchor and {@code javascript:} pseudo-links are
 * ignored, which is the same filtering the list-based {@code addTargetRequests} overloads
 * apply. Any other value is passed through {@code UrlUtils.canonicalizeUrl} together with
 * this object's {@code url} and then wrapped in a new {@code Request}.
 *
 * @param requestString URL to fetch
 */
public void addTargetRequest(String requestString) {
    if (StringUtils.isBlank(requestString)
            || requestString.equals("#")
            || requestString.startsWith("javascript:")) {
        // Nothing fetchable here; silently drop it like the list overloads do.
        return;
    }
    String canonicalUrl = UrlUtils.canonicalizeUrl(requestString, url.toString());
    targetRequests.add(new Request(canonicalUrl));
}
/**
 * Queues every usable URL from the given list for fetching.
 * <p>
 * Entries that are blank, equal to {@code "#"} or start with {@code javascript:} are
 * skipped. The remaining ones are canonicalized with {@code UrlUtils.canonicalizeUrl}
 * against this object's {@code url} and added as new {@code Request}s.
 *
 * @param requests URLs to fetch
 */
public void addTargetRequests(List<String> requests) {
    for (String candidate : requests) {
        boolean fetchable = !StringUtils.isBlank(candidate)
                && !candidate.equals("#")
                && !candidate.startsWith("javascript:");
        if (fetchable) {
            String canonicalUrl = UrlUtils.canonicalizeUrl(candidate, url.toString());
            targetRequests.add(new Request(canonicalUrl));
        }
    }
}
/**
 * Queues every usable URL from the given list for fetching, with an explicit priority.
 * <p>
 * Entries that are blank, equal to {@code "#"} or start with {@code javascript:} are
 * skipped. The remaining ones are canonicalized with {@code UrlUtils.canonicalizeUrl}
 * against this object's {@code url}; each resulting {@code Request} has
 * {@code setPriority(priority)} applied before it is added.
 *
 * @param requests URLs to fetch
 * @param priority priority passed to {@code Request.setPriority} for every queued request
 */
public void addTargetRequests(List<String> requests, long priority) {
    for (String link : requests) {
        if (!StringUtils.isBlank(link) && !link.equals("#") && !link.startsWith("javascript:")) {
            Request request = new Request(UrlUtils.canonicalizeUrl(link, url.toString()));
            targetRequests.add(request.setPriority(priority));
        }
    }
}
public void process(Page page) { Html html = page.getHtml(); List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); if(questionList != null && questionList.size() > 1) { //i=0是列名称,所以i从1开始 for( int i = 1 ; i < questionList.size(); i++) { System.out.println(questionList.get(i)); Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>"); String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); System.out.println(comment); String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); System.out.println(answerNum); String createTime = tempHtml.xpath("//td[3]/text()").toString(); System.out.println(createTime); /* Document doc = Jsoup.parse(questionList.get(i)); Html hmt = Html.create(questionList.get(i)) ; String str = hmt.links().toString(); String content = doc.getElementsByTag("a").text(); String ss = doc.text();*/ } } }
/**
 * Stores the text of the {@code entry-title public} heading link under the
 * {@code "reponame"} field.
 *
 * @param page page to extract from
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
    String repoName = page.getHtml()
            .xpath("//h1[@class='entry-title public']/strong/a/text()")
            .toString();
    page.putField("reponame", repoName);
    return RequestMatcher.MatchOther.YES;
}
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all()); page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString()); page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString()); page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString()); if (page.getResultItems().get("title")==null){ //skip this page page.setSkip(true); } }
/**
 * Stores the lemma title text under {@code "name"} and the selection of the
 * {@code lemma-summary} block under {@code "description"}.
 *
 * @param page page to extract from
 */
@Override
public void process(Page page) {
    String lemmaTitle = page.getHtml()
            .css("dl.lemmaWgt-lemmaTitle h1", "text")
            .toString();
    page.putField("name", lemmaTitle);

    // The description is stored as the selection itself, not converted to a string.
    page.putField(
            "description",
            page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
}
/**
 * Extracts an object (or a list of objects) from the page if its URL matches one of
 * {@code targetUrlPatterns}.
 * <p>
 * Without an {@code objectExtractor} the whole page is handed to {@code processSingle}.
 * With one, its selector is applied to the page's raw text first: in {@code multi} mode
 * every selected fragment is processed on its own and the non-null results are collected,
 * otherwise the single selected fragment is processed.
 *
 * @param page page to extract from
 * @return {@code null} when the page URL matches none of the target patterns; a
 *         {@code List<Object>} in {@code multi} mode; otherwise whatever
 *         {@code processSingle} returns
 */
public Object process(Page page) {
    boolean matched = false;
    for (Pattern targetPattern : targetUrlPatterns) {
        if (targetPattern.matcher(page.getUrl().toString()).matches()) {
            matched = true;
            // One matching pattern is enough; no need to test the remaining ones.
            break;
        }
    }
    if (!matched) {
        return null;
    }

    if (objectExtractor == null) {
        return processSingle(page, null, true);
    }

    if (!objectExtractor.multi) {
        String select = objectExtractor.getSelector().select(page.getRawText());
        return processSingle(page, select, false);
    }

    List<Object> os = new ArrayList<Object>();
    List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
    for (String s : list) {
        Object o = processSingle(page, s, false);
        if (o != null) {
            os.add(o);
        }
    }
    return os;
}
/**
 * Stores a tab-separated {@code province / district / zip code} line under
 * {@code "result"} and queues the six-digit ip138 links found on the page.
 * <p>
 * Province and district are read from the current request's extras and copied onto every
 * queued request, each of which gets priority 2.
 *
 * @param page district page to extract from
 */
private void processDistrict(Page page) {
    String province = page.getRequest().getExtra("province").toString();
    String district = page.getRequest().getExtra("district").toString();
    String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> zipLinks = page.getHtml()
            .links()
            .regex("http://www\\.ip138\\.com/\\d{6}[/]?$")
            .all();
    for (String zipLink : zipLinks) {
        Request followUp = new Request(zipLink)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(followUp);
    }
}
@Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; page.addTargetRequest("http://kaichiba.com/shop/" + i); page.putField("title",page.getHtml().xpath("//Title")); page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", "")); }
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); GithubRepo githubRepo = new GithubRepo(); githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString()); githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); if (githubRepo.getName() == null) { //skip this page page.setSkip(true); } else { page.putField("repo", githubRepo); } }
/**
 * Logs the page URL, queues GitHub owner and owner/repository links, and stores the text
 * of the {@code vcard-fullname} span under {@code "username"}.
 *
 * @param page page to extract from
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
    log.info("Extracting from " + page.getUrl());

    page.addTargetRequests(
            page.getHtml()
                    .links()
                    .regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+")
                    .all());
    page.addTargetRequests(
            page.getHtml()
                    .links()
                    .regex("https://github\\.com/[\\w\\-]+")
                    .all());

    String fullName = page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString();
    page.putField("username", fullName);

    return RequestMatcher.MatchOther.YES;
}
/**
 * Queues the thread-list pagination links and the first-page thread links found on the
 * page, and extracts {@code "title"}, {@code "content"}, {@code "date"} and {@code "id"}
 * from thread pages.
 * <p>
 * A page counts as a thread page only when its URL matches the
 * {@code thread-<id>-1-1.html} pattern. A plain {@code contains("thread")} test is not
 * enough, because the list URL itself carries {@code do=thread} and {@code type=thread}
 * in its query string and would be extracted with an id built from a failed match. The
 * {@code .} before {@code html} is escaped so that it only matches a literal dot.
 *
 * @param page page to extract from
 */
@Override
public void process(Page page) {
    List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
    page.addTargetRequests(requests);
    requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
    page.addTargetRequests(requests);

    String threadUrlRegex = "http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html";
    if (page.getUrl().regex(threadUrlRegex).match()) {
        page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
        page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
        page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
        // The stored id is the numeric thread id from the URL prefixed with "1000".
        page.putField("id", new PlainText("1000" + page.getUrl().regex(threadUrlRegex).toString()));
    }
}
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); }
/**
 * Queues the item and related-question links, then stores the vote count, content and
 * author link of answers whose vote count is at least {@code voteNum}.
 * <p>
 * Every qualifying answer writes the same three fields, so a later qualifying answer
 * overwrites an earlier one. An answer whose vote text is missing or is not a plain
 * integer is ignored instead of aborting the whole page with a
 * {@code NumberFormatException}. When no answer qualifies the page is flagged with
 * {@code setSkip(true)}.
 *
 * @param page page to extract from
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer markup once and reuse it for every selector below.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();

        int voteCount;
        try {
            voteCount = Integer.parseInt(vote);
        } catch (NumberFormatException ignored) {
            // Null or non-numeric vote text: this answer cannot be ranked, so skip it.
            continue;
        }

        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }

    if (!exist) {
        page.setSkip(true);
    }
}
@Override public void process(Page page) { //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 //toString()表示取单条结果,all()表示取多条 List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString()); //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 page.putField("content", page.getHtml().smartContent()); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)")); }
/**
 * Queues links matching {@code COUNTRY_PAGE} and {@code COUNTRY_EFFECT_PAGE}; when the
 * page's own URL matches {@code COUNTRY_EFFECT_PAGE}, stores the heading text under
 * {@code "countryName"}.
 *
 * @param page page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex(COUNTRY_PAGE).all());
    page.addTargetRequests(
            page.getHtml().links().regex(COUNTRY_EFFECT_PAGE).all());

    boolean isEffectPage = page.getUrl().regex(COUNTRY_EFFECT_PAGE).match();
    if (!isEffectPage) {
        return;
    }

    String countryName = page.getHtml()
            .xpath("//div[@class='bgGray5']/div/h3[@class='fl f30 lh30 mt10']/text()")
            .toString();
    page.putField("countryName", countryName);
}