@Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); if (requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); }
/**
 * Queues linked question pages and keeps answers whose vote count reaches {@code voteNum}.
 *
 * <p>Links are taken from the item list and from the related-questions box. Each answer
 * block under {@code zh-question-answer-wrap} is then inspected; for every answer whose
 * vote count is at least {@code voteNum}, the fields {@code vote}, {@code content} and
 * {@code userid} are written (the same keys are used for every qualifying answer).
 * If no answer qualifies the page is marked as skipped.
 *
 * <p>Answers whose vote count is missing or not a plain integer are ignored instead of
 * aborting the whole page with a {@link NumberFormatException}.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for all three lookups.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            // No vote element in this block: nothing to compare against.
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // Non-numeric vote text; treat the answer as not qualifying.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
public void process(Page page) { Html html = page.getHtml(); List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); if(questionList != null && questionList.size() > 1) { //i=0是列名称,所以i从1开始 for( int i = 1 ; i < questionList.size(); i++) { System.out.println(questionList.get(i)); Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>"); String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); System.out.println(comment); String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); System.out.println(answerNum); String createTime = tempHtml.xpath("//td[3]/text()").toString(); System.out.println(createTime); /* Document doc = Jsoup.parse(questionList.get(i)); Html hmt = Html.create(questionList.get(i)) ; String str = hmt.links().toString(); String content = doc.getElementsByTag("a").text(); String ss = doc.text();*/ } } }
/**
 * Queues the picture-board thread links found in the page markup and stores the
 * page's title element and smart-extracted content.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    // Thread links are matched directly against the raw anchor markup.
    List<String> threadLinks =
            page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all();
    page.addTargetRequests(threadLinks);

    page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
    page.putField("content", page.getHtml().smartContent());
}
/**
 * Stores the repository name read from the page heading under the {@code reponame} field.
 *
 * @param page the page being processed
 * @return always {@link RequestMatcher.MatchOther#YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
    String repoName =
            page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("reponame", repoName);
    return RequestMatcher.MatchOther.YES;
}
/**
 * Queues every blog-post link of the yanghaoli.iteye.com blog and stores the page's
 * title element and smart-extracted content as strings.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    List<String> postLinks =
            page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all();
    page.addTargetRequests(postLinks);

    String title = page.getHtml().xpath("//title").toString();
    page.putField("title", title);

    String content = page.getHtml().smartContent().toString();
    page.putField("content", content);
}
@Override public void process(Page page) { List<String> requests = page.getHtml().links().regex(urlPattern).all(); //add urls to fetch page.addTargetRequests(requests); //extract by XPath page.putField("title", page.getHtml().xpath("//title")); page.putField("html", page.getHtml().toString()); //extract by Readability page.putField("content", page.getHtml().smartContent()); }
// NOTE(review): this snippet appears truncated -- `br` and `build` are not declared in the
// visible text and the braces are unbalanced, so part of the original method is missing.
// Visible behaviour: collects links matching "/xf/.*", reads region and name from the 3rd
// and 4th anchors of the pro_crum div, lists the table rows under houseList, and calls
// page.setSkip(true) when that row list is empty. The remaining statements wrap a row in
// <table>...</table> and read seven cells by position (building, unit, floor number,
// door number, area, house type, price).
@Override public void process(Page page) { List pagination = page.getHtml().links().regex("/xf/.*").all(); String region = page.getHtml().xpath("//div[@class=pro_crum]/a[3]/text()").toString(); String name = page.getHtml().xpath("//div[@class=pro_crum]/a[4]/text()").toString(); List<String> buildings = page.getHtml().xpath("//div[@id=houseList]/dl/dd/div[@class=lptabl]/table/tbody/tr").all(); if (CollectionUtils.isEmpty(buildings)) { page.setSkip(true); br.append("<table>").append(build).append("</table>"); Html html = new Html(br.toString()); String building = html.xpath("//tr/td[1]/text()").toString(); String unit = html.xpath("//tr/td[2]/text()").toString(); String floorNumber = html.xpath("//tr/td[3]/text()").toString(); String doorNumber = html.xpath("//tr/td[4]/text()").toString(); String area = html.xpath("//tr/td[5]/text()").toString(); String houseType = html.xpath("//tr/td[6]/text()").toString(); String price = html.xpath("//tr/td[7]/text()").toString();
/**
 * Queues minibook pages and records the PDF links found on the current page.
 *
 * <p>When the page has no PDF link its result items are marked as skipped.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    List<String> minibookLinks =
            page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all();
    page.addTargetRequests(minibookLinks);

    List<String> pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
    if (!CollectionUtils.isNotEmpty(pdfLinks)) {
        // Nothing to export for this page.
        page.getResultItems().setSkip(true);
        return;
    }
    page.putField("pdf", pdfLinks);
}
/**
 * Static factory that builds an {@code Html} from the given text.
 *
 * @param text the text passed to the {@code Html} constructor
 * @return a new {@code Html} instance
 */
public static Html create(String text) {
    final Html created = new Html(text);
    return created;
}
/**
 * Emits a tab-separated "province, district, zip code" result for a district page and
 * queues the six-digit ip138 links found on it, carrying province and district along
 * as request extras.
 *
 * @param page the district page being processed
 */
private void processDistrict(Page page) {
    final String province = page.getRequest().getExtra("province").toString();
    final String district = page.getRequest().getExtra("district").toString();
    final String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> zipLinks =
            page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
    for (String zipLink : zipLinks) {
        Request followUp = new Request(zipLink)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(followUp);
    }
}
/**
 * Queues every link (or link portion) that matches one of the given URL patterns.
 *
 * <p>Links are collected from the whole page when {@code urlRegionSelector} is
 * {@code null}, otherwise only from the region it selects. A link is tested against
 * every pattern, and one request is added per pattern that finds a match.
 *
 * @param page              the page being processed
 * @param urlRegionSelector selector restricting where links are collected, or {@code null}
 * @param urlPatterns       patterns a link must match to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : page.getHtml().selectList(urlRegionSelector).links().all();

    for (String candidate : candidates) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher urlMatcher = urlPattern.matcher(candidate);
            if (!urlMatcher.find()) {
                continue;
            }
            // Only the matched portion of the link is queued.
            String matchedUrl = urlMatcher.group(0);
            page.addTargetRequest(new Request(matchedUrl));
        }
    }
}
Html html = new Html(null) { Head head = new Head(this); Body body = new Body(this) { Blank blank = new Blank(this, "Hello World"); }; }; // prepends the doc type <!DOCTYPE html> html.setPrependDocType(true); System.out.println(html.toHtmlString());
// Text containing non-ASCII characters, placed inside the H1 below.
final String headingText =
        "spacial characters taken from an external resource like file/database : 女 学校 ä ö ü Ä";

// Root Html tag with a Head and a Body; the Body holds an H1 whose content is a NoTag.
Html html = new Html(null) {{
    new Head(this);
    new Body(this) {{
        new H1(this) {{
            new NoTag(this, headingText);
        }};
    }};
}};

// Render with an explicit UTF-8 charset before printing.
String rendered = html.toHtmlString(StandardCharsets.UTF_8);
System.out.println(rendered);
// Style applied to the main div: two CSS properties (AlignItems.CENTER and an aqua BackgroundColor).
Style mainDivStyle = new Style();
mainDivStyle.addCssProperties(
        AlignItems.CENTER,
        new BackgroundColor(CssColorName.AQUA.getColorName()));

// Root Html tag whose Body holds the styled main Div and a Footer.
Html html = new Html(null) {
    Body body = new Body(this) {
        Div mainDiv = new Div(this, mainDivStyle);
        Footer footer = new Footer(this);
    };
};

// Write the document to a file. try-with-resources closes the stream even when
// writing fails; the previous version never closed the FileOutputStream.
try (FileOutputStream out =
        new FileOutputStream("/home/ansgar/html_work/html-by-wffweb.html")) {
    html.toOutputStream(out);
    // To print to the console instead:
    // System.out.println(html.toHtmlString());
} catch (FileNotFoundException e) {
    e.printStackTrace();
} catch (IOException e) {
    e.printStackTrace();
}
// Build an Html component carrying the markup, then nest it: listitem > Listcell > Html.
Html htmlContent = new Html();
htmlContent.setContent("MY HTML STRING HERE");

Listcell cell = new Listcell();
cell.appendChild(htmlContent);
listitem.appendChild(cell);
// NOTE(review): this snippet appears truncated -- the anonymous Html subclass body and the
// closing braces are missing from the visible text. The visible statements create an Html
// with a null argument, call setPrependDocType(true) on it and return it.
public AbstractHtml render() { Html html = new Html(null) { html.setPrependDocType(true); return html;
private void processProvince(Page page) { //这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉 List<String> districts = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all(); Pattern pattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"",Pattern.DOTALL); for (String district : districts) { Matcher matcher = pattern.matcher(district); while (matcher.find()) { String title = matcher.group(1); String link = matcher.group(2); Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title); page.addTargetRequest(request); } } }