/**
 * Summarises the thumbnail list of the page as text and queues further photo pages.
 *
 * <p>For each {@code li} selected by the thumbnail-list XPath, one {@code img:} line (the
 * anchor's {@code href}) and one {@code title:} line (the image's {@code alt}) are appended.
 * The resulting text is stored under the empty-string field key; if no text was produced
 * the page is flagged as skipped. Links matching the photo-page pattern are queued in
 * either case.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    List<Selectable> thumbItems = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();

    StringBuilder summary = new StringBuilder();
    for (Selectable item : thumbItems) {
        String href = item.xpath("//a/@href").get();
        String alt = item.xpath("//img/@alt").get();
        summary.append("img:").append(href).append("\n")
                .append("title:").append(alt).append("\n");
    }

    String text = summary.toString();
    page.putField("", text);
    if (text.isEmpty()) {
        // Nothing was extracted, so mark the page as skipped.
        page.setSkip(true);
    }

    List<String> photoLinks = page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all();
    page.addTargetRequests(photoLinks);
}
/**
 * Queues mini-book pages and records every PDF link found on the current page.
 *
 * <p>When the page contains no link ending in {@code .pdf}, its result items are marked
 * as skipped instead of storing a {@code pdf} field.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    List<String> minibookLinks = page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all();
    page.addTargetRequests(minibookLinks);

    List<String> pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
    if (!CollectionUtils.isNotEmpty(pdfLinks)) {
        // No PDF on this page: nothing worth emitting.
        page.getResultItems().setSkip(true);
        return;
    }
    page.putField("pdf", pdfLinks);
}
@Override public void process(Page page) { //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 //toString()表示取单条结果,all()表示取多条 List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString()); //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 page.putField("content", page.getHtml().smartContent()); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)")); }
/**
 * Dispatches the page to the country, district or province handler based on its URL.
 *
 * <p>The exact URL {@code http://www.ip138.com/post/} goes to {@code processCountry};
 * a URL ending in six digits (optionally followed by {@code /}) goes to
 * {@code processDistrict}; everything else goes to {@code processProvince}.
 *
 * @param page the downloaded page to dispatch
 */
@Override
public void process(Page page) {
    boolean isCountryPage = page.getUrl().toString().equals("http://www.ip138.com/post/");
    if (isCountryPage) {
        processCountry(page);
        return;
    }

    String districtUrl = page.getUrl().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").toString();
    if (districtUrl == null) {
        processProvince(page);
    } else {
        processDistrict(page);
    }
}
@Override public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); //文章页 } else { page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("date", page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); } }
/**
 * Queues GitHub repository and owner links, then maps the page to a {@code GithubRepo}.
 *
 * <p>The mapped object is stored under the {@code repo} field; when the mapper returns
 * {@code null} the page is marked as skipped instead.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    List<String> repoLinks = page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all();
    page.addTargetRequests(repoLinks);
    List<String> ownerLinks = page.getHtml().links().regex("(https://github\\.com/\\w+)").all();
    page.addTargetRequests(ownerLinks);

    GithubRepo mapped = githubRepoPageMapper.get(page);
    if (mapped != null) {
        page.putField("repo", mapped);
    } else {
        page.setSkip(true);
    }
}
/**
 * Logs the page URL, queues GitHub repository and owner links, and stores the text of
 * the {@code vcard-fullname} span under the {@code username} field.
 *
 * @param page the downloaded page to extract from
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
    log.info("Extracting from " + page.getUrl());

    List<String> repoLinks = page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all();
    page.addTargetRequests(repoLinks);
    List<String> ownerLinks = page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all();
    page.addTargetRequests(ownerLinks);

    String fullName = page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString();
    page.putField("username", fullName);

    return RequestMatcher.MatchOther.YES;
}
/**
 * Extracts the zip code of a district page and queues further six-digit pages.
 *
 * <p>The {@code province} and {@code district} values are read from the extras of the
 * current request (both must be present, since {@code toString()} is called on them).
 * The result is a single tab-separated line of province, district and zip code stored
 * under the {@code result} field. Each queued follow-up request carries the same
 * province/district extras and priority 2.
 *
 * @param page the downloaded district page
 */
private void processDistrict(Page page) {
    String provinceName = page.getRequest().getExtra("province").toString();
    String districtName = page.getRequest().getExtra("district").toString();
    String zip = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{provinceName, districtName, zip};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> zipPageLinks = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
    for (String zipPageLink : zipPageLinks) {
        Request followUp = new Request(zipPageLink)
                .setPriority(2)
                .putExtra("province", provinceName)
                .putExtra("district", districtName);
        page.addTargetRequest(followUp);
    }
}
/**
 * Queues the user's thread-list pages and first-page thread links, and extracts
 * title, content, date and id from thread pages.
 *
 * <p>Fields are only extracted when the page URL itself matches the thread pattern
 * {@code thread-<digits>-1-1.html}. The previous {@code contains("thread")} check was
 * also true for the listing URLs (which contain {@code do=thread} and
 * {@code type=thread}), producing the bogus id {@code "1000null"} for them.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
    page.addTargetRequests(requests);
    // The dot before "html" is escaped so it only matches a literal '.'.
    requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
    page.addTargetRequests(requests);

    String threadId = page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html").toString();
    if (threadId == null) {
        // Not a thread page (e.g. a listing page): nothing to extract.
        return;
    }

    page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
    page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
    page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
    // The id is the numeric thread id prefixed with "1000".
    page.putField("id", new PlainText("1000" + threadId));
}
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); }
@Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); if (requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); }
@Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; page.addTargetRequest("http://kaichiba.com/shop/" + i); page.putField("title",page.getHtml().xpath("//Title")); page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", "")); }
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); GithubRepo githubRepo = new GithubRepo(); githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString()); githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); if (githubRepo.getName() == null) { //skip this page page.setSkip(true); } else { page.putField("repo", githubRepo); } }
/**
 * Queues every link containing {@code /yewu/} and extracts the page title and the
 * {@code dd} elements.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex(".*/yewu/.*").all());

    // The title is taken with a regex over the raw markup rather than an XPath.
    page.putField("title", page.getHtml().regex("<title>(.*)</title>"));
    page.putField("body", page.getHtml().xpath("//dd"));
}
/**
 * Processes JSON responses: list responses yield article requests, other responses
 * yield the article's title and content.
 *
 * <p>A URL matching {@code LIST_URL} is a list response; every {@code _id} under
 * {@code $.data} is turned into an article API request. Any other response is read as a
 * single article via {@code $.data.title} and {@code $.data.content}.
 *
 * @param page the downloaded JSON response
 */
@Override
public void process(Page page) {
    if (!page.getUrl().regex(LIST_URL).match()) {
        // Article response
        String articleJson = page.getRawText();
        page.putField("title", new JsonPathSelector("$.data.title").select(articleJson));
        page.putField("content", new JsonPathSelector("$.data.content").select(articleJson));
        return;
    }

    // List response
    List<String> articleIds = new JsonPathSelector("$.data[*]._id").selectList(page.getRawText());
    if (CollectionUtils.isNotEmpty(articleIds)) {
        for (String articleId : articleIds) {
            page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
        }
    }
}
/**
 * Queues every link containing {@code article} and extracts the heading text and the
 * tidy text of the {@code neirong_box} container.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex(".*article.*").all());

    page.putField("title", page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
    page.putField("content", page.getHtml().xpath("//div[@id='neirong_box']/tidyText()"));
}
/**
 * Queues the blog-post links of {@code yanghaoli.iteye.com} and stores the page's
 * {@code <title>} element and smart-extracted content as strings.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    List<String> blogLinks = page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all();
    page.addTargetRequests(blogLinks);

    String title = page.getHtml().xpath("//title").toString();
    page.putField("title", title);

    String content = page.getHtml().smartContent().toString();
    page.putField("content", content);
}
@Override public void process(Page page) { List<String> requests = page.getHtml().links().regex(urlPattern).all(); //add urls to fetch page.addTargetRequests(requests); //extract by XPath page.putField("title", page.getHtml().xpath("//title")); page.putField("html", page.getHtml().toString()); //extract by Readability page.putField("content", page.getHtml().smartContent()); }
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all()); page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString()); page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString()); page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString()); if (page.getResultItems().get("title")==null){ //skip this page page.setSkip(true); } }
@Override public void process(Page page) { if (page.getUrl().regex(URL_POST).match()) { page.putField("goodsName", page.getHtml().xpath("//div[@id='description']/h1/tidyText()")); if (page.getResultItems().get("goodsName") == null) { .xpath("//div[@id='colors']/ul/html()")); } else { page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);