/**
 * Processes one downloaded response whose raw text is parsed as JSON.
 *
 * <p>When the page URL matches {@code LIST_URL}, every {@code _id} found under
 * {@code $.data[*]} is queued as an article request. Any other page has its
 * {@code $.data.title} and {@code $.data.content} stored as result fields.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    final String json = page.getRawText();

    if (!page.getUrl().regex(LIST_URL).match()) {
        // Not a list response: extract the article fields.
        page.putField("title", new JsonPathSelector("$.data.title").select(json));
        page.putField("content", new JsonPathSelector("$.data.content").select(json));
        return;
    }

    // List response: queue one detail request per article id.
    List<String> articleIds = new JsonPathSelector("$.data[*]._id").selectList(json);
    if (CollectionUtils.isNotEmpty(articleIds)) {
        for (String articleId : articleIds) {
            page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
        }
    }
}
@Override public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); //文章页 } else { page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("date", page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); } }
@Override public void process(Page page) { if (page.getUrl().regex(URL_POST).match()) { page.putField("goodsName", page.getHtml().xpath("//div[@id='description']/h1/tidyText()")); if (page.getResultItems().get("goodsName") == null) {
/**
 * Queues every link matching {@code COUNTRY_PAGE} or {@code COUNTRY_EFFECT_PAGE}
 * and, when the current page itself matches {@code COUNTRY_EFFECT_PAGE},
 * stores the heading text as the {@code countryName} result field.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Link discovery runs for every page, whatever its type.
    page.addTargetRequests(
            page.getHtml().links().regex(COUNTRY_PAGE).all());
    page.addTargetRequests(
            page.getHtml().links().regex(COUNTRY_EFFECT_PAGE).all());

    if (!page.getUrl().regex(COUNTRY_EFFECT_PAGE).match()) {
        return;
    }

    page.putField(
            "countryName",
            page.getHtml()
                    .xpath("//div[@class='bgGray5']/div/h3[@class='fl f30 lh30 mt10']/text()")
                    .toString());
}
public void process(Page page) { try { site.setUserAgent(AGENTS[new Random().nextInt(5)]); Thread.sleep(new Random().nextInt(20) * 100); } catch (Exception e) { e.printStackTrace(); } // 当前页面是图书列表页面 if (page.getUrl().regex("http(s)?://book\\.douban\\.com/tag/\\S+\\?+").match()) { processForBookList(page); } }
public void process(Page page) { try { site.setUserAgent(AGENTS[new Random().nextInt(5)]); // System.out.println(site.getUserAgent()); // Long a = System.currentTimeMillis(); Thread.sleep(new Random().nextInt(10)*100); // System.out.println("Sleep : " + (System.currentTimeMillis() - a)/1000.0); } catch (Exception e) { e.printStackTrace(); } // 当前页面是图书列表页面 if (page.getUrl().regex("http://www\\.douban\\.com/tag/\\S+/book*") .match()) { processForBookList(page); } // 当前页面是图书详细信息页面 else if (page.getUrl() .regex("http://book\\.douban\\.com/subject/\\S+/?from=tag_all") .match()) { processForBookInfo(page); } }
@Override public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()) { crawlList(page); } //主题帖第一页 if (page.getUrl().regex(URL_POST_1).match()) { //page.addTargetRequests(URLGeneratedUtil.generatePostURLs(URL_POST_1)); crawlPost(page); } //主题帖第一页以后 if (page.getUrl().regex(URL_POST).match()) { crawlPost(page); } //用户页 if (page.getUrl().regex(URL_USER).match()) { crawlUser(page); } }
if (!page.getUrl().regex(".*/huxing_.*").match()) { page.setSkip(true); return;
if(page.getUrl().regex("https://www\\.zhihu\\.com/search\\?type=people&q=[\\s\\S]+").match()){ List<String> requests=page.getHtml().xpath("//ul[@class='list users']/li/div/div[@class='body']/div[@class='line']").links().all(); for(int i=0;i<requests.size();i++){ else if(page.getUrl().regex("https://www\\.zhihu\\.com/people/(.+)/answers").match())
public void process(Page page) if(page.getUrl().regex("http(s)?://book.douban.com/subject/\\S+/comments/*").match())
/**
 * Handles a JSON response from the article API.
 *
 * <p>A response whose URL matches {@code LIST_URL} yields one follow-up
 * request per {@code _id} under {@code $.data[*]}. Every other response
 * contributes the {@code title} and {@code content} result fields.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    final boolean isListResponse = page.getUrl().regex(LIST_URL).match();

    if (isListResponse) {
        JsonPathSelector idSelector = new JsonPathSelector("$.data[*]._id");
        List<String> idList = idSelector.selectList(page.getRawText());
        if (CollectionUtils.isNotEmpty(idList)) {
            for (int i = 0; i < idList.size(); i++) {
                page.addTargetRequest("http://angularjs.cn/api/article/" + idList.get(i));
            }
        }
    } else {
        JsonPathSelector titleSelector = new JsonPathSelector("$.data.title");
        page.putField("title", titleSelector.select(page.getRawText()));

        JsonPathSelector contentSelector = new JsonPathSelector("$.data.content");
        page.putField("content", contentSelector.select(page.getRawText()));
    }
}
@Override public void process(Page page) { // 根据URL判断页面类型 if (page.getUrl().regex(ALBUM_URL).match()) { System.out.println("歌曲总数----->" + page.getHtml().xpath("//span[@id='playlist-track-count']/text()").toString()); // 爬取歌曲URl加入队列 page.addTargetRequests(page.getHtml().xpath("//div[@id=\"song-list-pre-cache\"]").links().regex(MUSIC_URL).all()); } else { String url = page.getUrl().toString(); Music music = new Music(); // 单独对AJAX请求获取评论数, 使用JSON解析返回结果 String songId = url.substring(url.indexOf("id=") + 3); int commentCount = getComment(page, songId, 0); // music 保存到数据库 music.setSongId(songId); music.setCommentCount(commentCount); music.setTitle(page.getHtml().xpath("//em[@class='f-ff2']/text()").toString()); music.setAuthor(page.getHtml().xpath("//p[@class='des s-fc4']/span/a/text()").toString()); music.setAlbum(page.getHtml().xpath("//p[@class='des s-fc4']/a/text()").toString()); music.setURL(url); //page.putField("music", music); mMusicService.addMusic(music); } }
@Override public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_POST).all()); } else { String title = page.getHtml().xpath("//*[@class=\"focusbox-title\"]/text()").toString(); List<String> zipList = page.getHtml().links().regex("https://www.leshe.us/wp-content/uploads/.+jpg").all(); page.putField("imgList", zipList); page.putField("title", title); } }
@Override public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); //文章页 } else { page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("date", page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); } }
@Override public void process(Page page) { if (page.getUrl().regex(URL_POST).match()) { page.putField("goodsName", page.getHtml().xpath("//div[@id='description']/h1/tidyText()")); if (page.getResultItems().get("goodsName") == null) {