/**
 * Handles a district page: stores a tab-separated "result" field made of the
 * province, the district and the zip code found on the page, then queues the
 * matching follow-up links with the same province/district extras.
 *
 * @param page the downloaded page; the "province" and "district" extras of its
 *             request are dereferenced without a null check
 */
private void processDistrict(Page page) {
    final Request current = page.getRequest();
    final String province = current.getExtra("province").toString();
    final String district = current.getExtra("district").toString();

    // Digits captured from the zip-code heading.
    final String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    final String[] columns = {province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    // Links of the form http://www.ip138.com/<six digits>, optionally with a trailing slash.
    final List<String> followUps =
            page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
    for (final String followUp : followUps) {
        final Request next = new Request(followUp)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(next);
    }
}
/**
 * Queues the bbstcon links of the Pictures board found in the page markup and
 * stores the "title" and "content" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Unquoted href values starting with bbstcon?board=Pictures&file=
    final String postLinkPattern = "<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)";
    page.addTargetRequests(page.getHtml().regex(postLinkPattern).all());

    final String titleXpath = "//div[@id='content']//h2/a";
    page.putField("title", page.getHtml().xpath(titleXpath));
    page.putField("content", page.getHtml().smartContent());
}
@Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); }
/**
 * Queues the "/post-free ... .shtml" links found in the page markup and stores
 * the "title" and "body" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Quoted href values that start with /post-free and end in .shtml.
    final String postLinkPattern = "<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}";
    page.addTargetRequests(page.getHtml().regex(postLinkPattern).all());

    final String titleXpath = "//div[@id='post_head']//span[@class='s_title']//b";
    page.putField("title", page.getHtml().xpath(titleXpath));
    page.putField("body", page.getHtml().smartContent());
}
/**
 * Queues the page links that match the "yewu" path pattern and stores the
 * "title" and "body" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    final String followPattern = ".*/yewu/.*";
    page.addTargetRequests(page.getHtml().links().regex(followPattern).all());

    // Title comes from the <title> element text, body from the <dd> elements.
    final String titlePattern = "<title>(.*)</title>";
    page.putField("title", page.getHtml().regex(titlePattern));
    page.putField("body", page.getHtml().xpath("//dd"));
}
/**
 * Queues the listing pages of user 88304 and the thread pages linked from the
 * current page on diaoyuweng.com. When the current page is itself a thread
 * page, stores its "title", "content", "date" and "id" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Pagination links of the user's thread listing.
    List<String> listingPages = page.getHtml().links()
            .regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)")
            .all();
    page.addTargetRequests(listingPages);

    // Links to individual threads. The dot before "html" is escaped so it only matches a literal dot.
    List<String> threadPages = page.getHtml().links()
            .regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)")
            .all();
    page.addTargetRequests(threadPages);

    // Listing URLs also contain "thread" (do=thread, type=thread), so a substring test cannot
    // tell the two page kinds apart. Match the thread URL itself and skip extraction otherwise;
    // this also prevents an id of "1000null" being stored for non-thread pages.
    String threadId = page.getUrl()
            .regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html")
            .toString();
    if (threadId == null) {
        return;
    }

    page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
    page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
    page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
    // The stored id is the thread number prefixed with "1000".
    page.putField("id", new PlainText("1000" + threadId));
}
/**
 * Handles a district page: stores a tab-separated "result" field made of the
 * province, the district and the zip code found on the page, then queues the
 * matching follow-up links with the same province/district extras.
 *
 * @param page the downloaded page; the "province" and "district" extras of its
 *             request are dereferenced without a null check
 */
private void processDistrict(Page page) {
    final Request current = page.getRequest();
    final String province = current.getExtra("province").toString();
    final String district = current.getExtra("district").toString();

    // Digits captured from the zip-code heading.
    final String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    final String[] columns = {province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    // Links of the form http://www.ip138.com/<six digits>, optionally with a trailing slash.
    final List<String> followUps =
            page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
    for (final String followUp : followUps) {
        final Request next = new Request(followUp)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(next);
    }
}
/**
 * Queues the "/post-free ... .shtml" links found in the page markup and stores
 * the "title" and "body" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Quoted href values that start with /post-free and end in .shtml.
    final String postLinkPattern = "<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}";
    page.addTargetRequests(page.getHtml().regex(postLinkPattern).all());

    final String titleXpath = "//div[@id='post_head']//span[@class='s_title']//b";
    page.putField("title", page.getHtml().xpath(titleXpath));
    page.putField("body", page.getHtml().smartContent());
}
/**
 * Queues the bbstcon links of the Pictures board found in the page markup and
 * stores the "title" and "content" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Unquoted href values starting with bbstcon?board=Pictures&file=
    final String postLinkPattern = "<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)";
    page.addTargetRequests(page.getHtml().regex(postLinkPattern).all());

    final String titleXpath = "//div[@id='content']//h2/a";
    page.putField("title", page.getHtml().xpath(titleXpath));
    page.putField("content", page.getHtml().smartContent());
}
@Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); }
/**
 * Queues the page links that match the "yewu" path pattern and stores the
 * "title" and "body" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    final String followPattern = ".*/yewu/.*";
    page.addTargetRequests(page.getHtml().links().regex(followPattern).all());

    // Title comes from the <title> element text, body from the <dd> elements.
    final String titlePattern = "<title>(.*)</title>";
    page.putField("title", page.getHtml().regex(titlePattern));
    page.putField("body", page.getHtml().xpath("//dd"));
}
/**
 * Queues the listing pages of user 88304 and the thread pages linked from the
 * current page on diaoyuweng.com. When the current page is itself a thread
 * page, stores its "title", "content", "date" and "id" fields.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Pagination links of the user's thread listing.
    List<String> listingPages = page.getHtml().links()
            .regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)")
            .all();
    page.addTargetRequests(listingPages);

    // Links to individual threads. The dot before "html" is escaped so it only matches a literal dot.
    List<String> threadPages = page.getHtml().links()
            .regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)")
            .all();
    page.addTargetRequests(threadPages);

    // Listing URLs also contain "thread" (do=thread, type=thread), so a substring test cannot
    // tell the two page kinds apart. Match the thread URL itself and skip extraction otherwise;
    // this also prevents an id of "1000null" being stored for non-thread pages.
    String threadId = page.getUrl()
            .regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html")
            .toString();
    if (threadId == null) {
        return;
    }

    page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
    page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
    page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
    // The stored id is the thread number prefixed with "1000".
    page.putField("id", new PlainText("1000" + threadId));
}