/**
 * Logs the page URL, queues the GitHub links found on the page and stores the
 * text of the {@code vcard-fullname} span under the {@code username} field.
 *
 * @param page the page being processed
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
    String logMessage = "Extracting from " + page.getUrl();
    log.info(logMessage);

    // Two-segment paths (owner/repository) are queued first, then single-segment paths.
    String twoSegmentPattern = "https://github\\.com/[\\w\\-]+/[\\w\\-]+";
    String oneSegmentPattern = "https://github\\.com/[\\w\\-]+";
    page.addTargetRequests(page.getHtml().links().regex(twoSegmentPattern).all());
    page.addTargetRequests(page.getHtml().links().regex(oneSegmentPattern).all());

    String fullName = page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString();
    page.putField("username", fullName);

    return RequestMatcher.MatchOther.YES;
}
@Override public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); //文章页 } else { page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("date", page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); } }
/**
 * Collects the link and image alt text of every thumbnail list item into one
 * text blob, stores it under the empty field key, and queues further photo pages.
 *
 * <p>The page is marked as skipped when no list item produced any text.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    StringBuilder summary = new StringBuilder();
    List<Selectable> thumbItems = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
    for (Selectable item : thumbItems) {
        String href = item.xpath("//a/@href").get();
        summary.append("img:" + href + "\n");
        String alt = item.xpath("//img/@alt").get();
        summary.append("title:" + alt + "\n");
    }

    String text = summary.toString();
    page.putField("", text);
    if (text.isEmpty()) {
        page.setSkip(true);
    }

    String photoPagePattern = "http://www\\.mama\\.cn/photo/.*\\.html";
    page.addTargetRequests(page.getHtml().links().regex(photoPagePattern).all());
}
@Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; page.addTargetRequest("http://kaichiba.com/shop/" + i); page.putField("title",page.getHtml().xpath("//Title")); page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", "")); }
/**
 * Queues every minibooks link on the page and publishes the PDF links it contains.
 *
 * <p>When the page has no link ending in {@code .pdf}, its result items are
 * marked as skipped instead.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    String minibooksPattern = "http://www\\.infoq\\.com/cn/minibooks/.*";
    page.addTargetRequests(page.getHtml().links().regex(minibooksPattern).all());

    List<String> pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
    if (!CollectionUtils.isNotEmpty(pdfLinks)) {
        page.getResultItems().setSkip(true);
        return;
    }
    page.putField("pdf", pdfLinks);
}
public void process(Page page) { Html html = page.getHtml(); List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); if(questionList != null && questionList.size() > 1) { //i=0是列名称,所以i从1开始 for( int i = 1 ; i < questionList.size(); i++) { System.out.println(questionList.get(i)); Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>"); String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); System.out.println(comment); String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); System.out.println(answerNum); String createTime = tempHtml.xpath("//td[3]/text()").toString(); System.out.println(createTime); /* Document doc = Jsoup.parse(questionList.get(i)); Html hmt = Html.create(questionList.get(i)) ; String str = hmt.links().toString(); String content = doc.getElementsByTag("a").text(); String ss = doc.text();*/ } } }
// Fragment of a page-processing routine; the enclosing method is outside this view.
// type   - sixth "/"-separated piece of the page URL, with anything from "?" onwards removed.
// userno - fifth "/"-separated piece of the page URL.
// bookNos / dates - id attributes and date texts of the items in the 'list-view' list.
// When the "next" link has a non-blank href, it is queued as a new request.
// NOTE(review): split("/")[5] and [4] throw ArrayIndexOutOfBoundsException for URLs with
// fewer segments - confirm only matching URLs reach this code.
String type = page.getUrl().toString().split("/")[5].split("\\?")[0]; String userno = page.getUrl().toString().split("/")[4]; List<String> bookNos = page.getHtml().xpath("//ul[@class='list-view']/li/@id").all(); List<String> dates = page.getHtml().xpath("//ul[@class='list-view']/li/div[@class='item-show']/div[@class='date']/text()").all(); String nextPage = page.getHtml().xpath("//span[@class='next']/a/@href").get(); if (!StringUtils.isBlank(nextPage)) page.addTargetRequest(new Request(nextPage));
@Override public List<String> process(File inItem) { List<String> outItems = null; try { BufferedReader in = new BufferedReader( new FileReader(inItem) ); String s; in.readLine();//pass first line s = in.readLine(); if (s != null) { Json json = new Json(s); outItems = json.jsonPath("$.data[*].[*]").all(); } in.close(); } catch (IOException e) { e.printStackTrace(); } return outItems; } }
@Override public void process(Page page) { if (page.getUrl().regex(URL_POST).match()) { page.putField("goodsName", page.getHtml().xpath("//div[@id='description']/h1/tidyText()")); if (page.getResultItems().get("goodsName") == null) { .xpath("//div[@id='colors']/ul/html()")); } else { page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
if(page.getUrl().regex("https://www\\.zhihu\\.com/search\\?type=people&q=[\\s\\S]+").match()){ List<String> requests=page.getHtml().xpath("//ul[@class='list users']/li/div/div[@class='body']/div[@class='line']").links().all(); for(int i=0;i<requests.size();i++){ requests.set(i,requests.get(i)+"/answers"); else if(page.getUrl().regex("https://www\\.zhihu\\.com/people/(.+)/answers").match()) String name = page.getHtml().xpath("//h1[@class='ProfileHeader-title']/span[@class='ProfileHeader-name']/text()").get(); String identity = page.getHtml().xpath("//*[@id='ProfileHeader']/div/div[2]/div/div[2]/div[1]/h1/span[2]/text()").get(); int infoItem_count = page.getHtml().xpath("//div[@class='ProfileHeader-infoItem']").nodes().size(); String sexString = page.getHtml().xpath("//button[@class='Button FollowButton Button--primary Button--blue']/text()").get(); String picUrl = page.getHtml().xpath("//div[@class='UserAvatar ProfileHeader-avatar']/img[@class='Avatar Avatar--large UserAvatar-inner']/@src").get(); user.setPicUrl(picUrl); String userFollowingUrl= page.getUrl().get();
/**
 * Processes a JSON response that is either an article list or a single article.
 *
 * <p>For URLs matching {@code LIST_URL}, every {@code _id} under {@code data} is
 * turned into an article API request. For any other URL the article's title and
 * content are stored as fields.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    String rawJson = page.getRawText();

    if (!page.getUrl().regex(LIST_URL).match()) {
        page.putField("title", new JsonPathSelector("$.data.title").select(rawJson));
        page.putField("content", new JsonPathSelector("$.data.content").select(rawJson));
        return;
    }

    List<String> articleIds = new JsonPathSelector("$.data[*]._id").selectList(rawJson);
    if (!CollectionUtils.isNotEmpty(articleIds)) {
        return;
    }
    for (String articleId : articleIds) {
        page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
    }
}
/**
 * Adds a URL to the requests to fetch.
 *
 * <p>Blank strings and the bare {@code "#"} are ignored. Any other value is passed
 * through {@code UrlUtils.canonicalizeUrl} together with this page's URL before
 * being queued.
 *
 * @param requestString requestString
 */
public void addTargetRequest(String requestString) {
    boolean ignorable = StringUtils.isBlank(requestString) || requestString.equals("#");
    if (!ignorable) {
        String canonicalUrl = UrlUtils.canonicalizeUrl(requestString, url.toString());
        targetRequests.add(new Request(canonicalUrl));
    }
}
/**
 * Queues every link on the page that matches any of the given patterns.
 *
 * <p>Only the matched portion of a link is queued, not the whole link. A link
 * matched by several patterns is queued once per matching pattern.
 *
 * @param page              the page whose links are inspected
 * @param urlRegionSelector restricts the links to a region of the page; {@code null}
 *                          means the whole page
 * @param urlPatterns       the patterns a link is tested against
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = (urlRegionSelector != null)
            ? page.getHtml().selectList(urlRegionSelector).links().all()
            : page.getHtml().links().all();

    for (String candidate : candidates) {
        for (Pattern pattern : urlPatterns) {
            Matcher match = pattern.matcher(candidate);
            if (!match.find()) {
                continue;
            }
            page.addTargetRequest(new Request(match.group(0)));
        }
    }
}
/**
 * Extracts article fields using the XPath expressions supplied by the model and
 * queues the links matching the model's target-link regex.
 *
 * <p>Fields are stored only when a non-empty title was found and either the title
 * is not the literal {@code "null"} and the request URL is not one of the model's
 * entry URLs, or the model is flagged as single. Links are queued in every case.
 *
 * @param page  the page being processed
 * @param model supplies the extraction expressions, entry URLs and link regex
 */
@Override
public void process(Page page, BaseModel model) {
    Html pageHtml = page.getHtml();
    String title = pageHtml.xpath(model.getTitleRegex()).get();
    String source = page.getRequest().getUrl();

    boolean shouldExtract = false;
    if (!StringUtils.isEmpty(title)) {
        // Evaluated before the single-page flag, in the same order as the combined condition.
        boolean regularArticle = !"null".equals(title)
                && !Arrays.asList(model.getEntryUrls()).contains(source);
        shouldExtract = regularArticle || model.isSingle;
    }

    if (shouldExtract) {
        page.putField("title", title);
        page.putField("source", source);
        page.putField("releaseDate", pageHtml.xpath(model.getReleaseDateRegex()).get());
        page.putField("author", pageHtml.xpath(model.getAuthorRegex()).get());
        page.putField("content", pageHtml.xpath(model.getContentRegex()).get());
        page.putField("tags", pageHtml.xpath(model.getTagRegex()).all());
        page.putField("description", pageHtml.xpath(model.getDescriptionRegex()).get());
        page.putField("keywords", pageHtml.xpath(model.getKeywordsRegex()).get());
    }

    String targetLinksRegex = model.getTargetLinksRegex();
    page.addTargetRequests(page.getHtml().links().regex(targetLinksRegex).all());
}
}
List<String> sevenDays = sevenStr.xpath("//ul[@class='t clearfix']/li").all(); List<String> result = new ArrayList<>(); Html temp = Html.create(day); StringBuffer stringBuffer = new StringBuffer(); stringBuffer.append(temp.xpath("//h1/text()").toString()); stringBuffer.append("," + temp.xpath("//p[@class='wea']/text()").toString()); stringBuffer.append("," + temp.xpath("//p[@class='tem']/allText()").toString()); List<String> windList = temp.xpath("//p[@class='win']/em/span").all(); String windStr = ","; if(windList !=null && windList.size() > 0){ stringBuffer.append("," + temp.xpath("//p[@class='win']/i/text()").toString());
// Fragment of a page-processing routine; the enclosing method is outside this view.
// Reads profile fields from the page: userno is the fifth "/"-separated piece of the URL;
// name, signature, introduction, location and pic come from the 'info', 'intro_display',
// 'user-info' and 'basic-info' elements; joinTime is the first digits-digits-digits match
// in the 'user-info' text.
// reading / hasread / wantread all query the same link text under the 'book' div and differ
// only in the regex applied to it; follwees and followers keep the first run of digits.
// NOTE(review): name calls .get().trim() - presumably get() returns null when the node is
// missing, which would throw NullPointerException; confirm.
String userno = page.getUrl().toString().split("/")[4]; String name = page.getHtml().xpath("//div[@class='info']/h1/text()").get().trim(); String signature = page.getHtml().xpath("//div[@class='info']/h1/div[@class='signature_display pl']/text()").get(); String introduction = page.getHtml().xpath("//span[@id='intro_display']/text()").get(); String location = page.getHtml().xpath("//div[@class='user-info']/a/text()").get(); String joinTime = page.getHtml().xpath("//div[@class='user-info']/div[@class='pl']/text()").regex("\\d+-\\d+-\\d+").get(); String pic = page.getHtml().xpath("//div[@class='basic-info']/img/@src").get(); String reading = page.getHtml().xpath("//div[@class='article']/div[@id='book']/h2/span[@class='pl']/a/text()").regex("\\d+本在读").get(); String hasread = page.getHtml().xpath("//div[@class='article']/div[@id='book']/h2/span[@class='pl']/a/text()").regex("\\d+本读过").get(); String wantread = page.getHtml().xpath("//div[@class='article']/div[@id='book']/h2/span[@class='pl']/a/text()").regex("\\d+本想读").get(); String follwees = page.getHtml().xpath(("//div[@id='friend']/h2/span[@class='pl']/a/text()")).regex("\\d+").get(); String followers = page.getHtml().xpath("//div[@class='aside']/p[@class='rev-link']/a/text()").regex("\\d+").get();
public void process(Page page) { Json json = page.getJson(); //System.out.println(json); page.putField(ZhihuPipeline.URL, page.getUrl()); page.putField(ZhihuPipeline.RESPONSE, json); String isEnd = json.jsonPath("$.paging.is_end").get(); if (!Boolean.parseBoolean(isEnd)) { page.addTargetRequest(json.jsonPath("$.paging.next").get()); } List<String> urlTokens = json.jsonPath("$.data[*].url_token").all(); List<String> urls = generateFolloweeUrls(urlTokens); page.addTargetRequests(urls); }
@Override public void process(Page page) { //最近7天天气 Selectable sevenStr = page.getHtml().xpath("//div[@id='7d']/ul[@class='t clearfix']"); //分时段天气 Selectable hourStr = page.getHtml().xpath("//div[@id='7d']/script"); //最近24小时整体情况 // Selectable t24Str = page.getHtml().xpath("//div[@class='left fl']/script"); WeatherWeather weather = new WeatherWeather(); weather.setHour(handleHourStr(hourStr));; List<String> list = handleSevenDays(sevenStr); if(list != null && list.size() == 7){ weather.setToday(list.get(0)); weather.setNextday(list.get(1)); weather.setNext2day(list.get(2)); weather.setNext3day(list.get(3)); weather.setNext4day(list.get(4)); weather.setNext5day(list.get(5)); weather.setNext6day(list.get(6)); } page.putField("weather", weather); page.putField("stationCode", page.getUrl().regex("(\\d+).shtml",1)); }
/**
 * Turns the content read from the given file into at most one {@code Document}.
 *
 * <p>The content is parsed as JSON and its {@code $.id} value is checked against
 * the duplicate remover; a document is produced only for ids not reported as
 * duplicates.
 *
 * @param inItem the file to read
 * @return {@code null} when the content is empty; otherwise a list that is empty
 *         for a duplicate id or holds the single new document
 */
@Override
public List<Document> process(File inItem) {
    String content = readMember(inItem);
    if (StringUtils.isEmpty(content)) {
        return null;
    }

    List<Document> result = new ArrayList<>(1);
    String id = new Json(content).jsonPath("$.id").get();
    boolean alreadySeen = duplicateRemover.isDuplicate(id);
    if (!alreadySeen) {
        result.add(new Document(id, content));
    }
    return result;
}