/**
 * Static factory that builds a {@link PlainText} from a single string.
 *
 * @param text the string passed to the {@code PlainText} constructor
 * @return the newly constructed instance
 */
public static PlainText create(String text) {
    PlainText wrapped = new PlainText(text);
    return wrapped;
}
/**
 * Splits this selectable into one node per source text.
 *
 * @return a list holding one {@link PlainText} for each string returned by
 *         {@code getSourceTexts()}, in iteration order
 */
@Override
public List<Selectable> nodes() {
    final int expectedSize = getSourceTexts().size();
    List<Selectable> perTextNodes = new ArrayList<Selectable>(expectedSize);
    for (String sourceText : getSourceTexts()) {
        Selectable node = PlainText.create(sourceText);
        perTextNodes.add(node);
    }
    return perTextNodes;
}
// Buffer handed to predictionOutput below via setBuffer.
StringBuffer predictionSB = new StringBuffer();
// No attribute range is selected; null is passed through to crossValidateModel unchanged.
Range attributesToShow = null;
// Boolean.TRUE replaces the deprecated `new Boolean(true)` boxing constructor:
// same value, but it reuses the shared constant instead of allocating a new object.
Boolean outputDistributions = Boolean.TRUE;
PlainText predictionOutput = new PlainText();
predictionOutput.setBuffer(predictionSB);
predictionOutput.setOutputDistribution(true);
Evaluation evaluation = new Evaluation(data);
// Cross-validate j48Model on data, passing the output object and the two options as trailing arguments.
evaluation.crossValidateModel(j48Model, data, numberOfFolds, randomNumber, predictionOutput, attributesToShow, outputDistributions);
/**
 * Applies {@code selector} to each input string and gathers the non-null results.
 *
 * @param selector selector invoked once per string
 * @param strings  input strings, processed in iteration order
 * @return a {@link PlainText} built from the collected results
 */
protected Selectable select(Selector selector, List<String> strings) {
    List<String> matched = new ArrayList<String>();
    for (String candidate : strings) {
        String selected = selector.select(candidate);
        if (selected == null) {
            // Nothing selected for this string; skip it.
            continue;
        }
        matched.add(selected);
    }
    return new PlainText(matched);
}
/**
 * Exposes every source text of this selectable as its own node.
 *
 * @return one {@link PlainText} per string from {@code getSourceTexts()}, in iteration order
 */
@Override
public List<Selectable> nodes() {
    final int capacity = getSourceTexts().size();
    List<Selectable> textNodes = new ArrayList<Selectable>(capacity);
    for (String text : getSourceTexts()) {
        Selectable single = PlainText.create(text);
        textNodes.add(single);
    }
    return textNodes;
}
/**
 * Applies {@code selector.selectList} to each input string and concatenates the results.
 *
 * @param selector selector invoked once per string
 * @param strings  input strings, processed in iteration order
 * @return a {@link PlainText} built from all collected results
 */
protected Selectable selectList(Selector selector, List<String> strings) {
    List<String> results = new ArrayList<String>();
    for (String string : strings) {
        List<String> result = selector.selectList(string);
        // addAll(null) throws NullPointerException; skip a null result instead of failing the whole selection.
        if (result != null) {
            results.addAll(result);
        }
    }
    return new PlainText(results);
}
/** * select elements * * @param elementSelector elementSelector * @return result */ protected Selectable selectElements(BaseElementSelector elementSelector) { ListIterator<Element> elementIterator = getElements().listIterator(); if (!elementSelector.hasAttribute()) { List<Element> resultElements = new ArrayList<Element>(); while (elementIterator.hasNext()) { Element element = checkElementAndConvert(elementIterator); List<Element> selectElements = elementSelector.selectElements(element); resultElements.addAll(selectElements); } return new HtmlNode(resultElements); } else { // has attribute, consider as plaintext List<String> resultStrings = new ArrayList<String>(); while (elementIterator.hasNext()) { Element element = checkElementAndConvert(elementIterator); List<String> selectList = elementSelector.selectList(element); resultStrings.addAll(selectList); } return new PlainText(resultStrings); } }
/**
 * Builds a {@link Page} from an HTTP response.
 *
 * <p>The response body is always stored as bytes. Unless the request is flagged as binary
 * content, the body is also decoded to text using {@code charset}, or the value from
 * {@code getHtmlCharset} when {@code charset} is null. Response headers are copied only
 * when {@code responseHeader} is set.
 *
 * @param request      the request that produced the response
 * @param charset      charset name used for decoding, or null to determine it from the response
 * @param httpResponse the HTTP response to convert
 * @param task         not used by this method
 * @return the populated page, marked as successfully downloaded
 * @throws IOException if the body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType;
    if (httpResponse.getEntity().getContentType() == null) {
        contentType = "";
    } else {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page page = new Page();
    page.setBytes(body);
    if (!request.isBinaryContent()) {
        String effectiveCharset = charset;
        if (effectiveCharset == null) {
            effectiveCharset = getHtmlCharset(contentType, body);
        }
        page.setCharset(effectiveCharset);
        String decoded = new String(body, effectiveCharset);
        page.setRawText(decoded);
    }
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    int status = httpResponse.getStatusLine().getStatusCode();
    page.setStatusCode(status);
    page.setDownloadSuccess(true);
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
@Override public Page download(Request request, Task task) { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } String content = getPage(request); if (content.contains("HTTP request failed")) { for (int i = 1; i <= getRetryNum(); i++) { content = getPage(request); if (!content.contains("HTTP request failed")) { break; } } if (content.contains("HTTP request failed")) { //when failed Page page = new Page(); page.setRequest(request); return page; } } Page page = new Page(); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(200); return page; }
// Populate the page from the fetched content: the raw text, an Html object built from the
// same content plus the request URL, the request URL wrapped as PlainText, and the request itself.
// The last statement hands webDriver back to webDriverPool via returnToPool.
page.setRawText(content); page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver);
/**
 * Processes a page from diaoyuweng.com.
 *
 * <p>Links matching the paged listing pattern and the thread pattern are queued as target
 * requests. When the page URL contains "thread", the fields "title", "content", "date" and
 * "id" are stored on the page.
 *
 * @param page the page to process
 */
@Override
public void process(Page page) {
    List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
    page.addTargetRequests(requests);
    // The dot before "html" is now escaped like the other dots in the pattern, so it only
    // matches a literal '.' rather than any character.
    requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
    page.addTargetRequests(requests);
    // NOTE(review): the listing URL above also contains "thread" (do=thread, type=thread),
    // so this branch presumably runs for listing pages too - confirm that is intended.
    if (page.getUrl().toString().contains("thread")) {
        page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
        page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
        page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
        // The id is the literal prefix "1000" followed by the thread number captured from the URL.
        page.putField("id", new PlainText("1000" + page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html").toString()));
    }
}
/**
 * Creates a {@link PlainText} from the given string.
 *
 * @param text the string handed to the {@code PlainText} constructor
 * @return a new {@code PlainText} instance
 */
public static PlainText create(String text) {
    PlainText instance = new PlainText(text);
    return instance;
}
/**
 * Runs {@code selector.selectList} on each input string and merges all results into one list.
 *
 * @param selector selector invoked once per string
 * @param strings  input strings, processed in iteration order
 * @return a {@link PlainText} built from the merged results
 */
protected Selectable selectList(Selector selector, List<String> strings) {
    List<String> results = new ArrayList<String>();
    for (String string : strings) {
        List<String> result = selector.selectList(string);
        // Guard against a null result: addAll(null) would throw NullPointerException.
        if (result != null) {
            results.addAll(result);
        }
    }
    return new PlainText(results);
}
/**
 * Runs {@code selector.select} on each input string, keeping only non-null results.
 *
 * @param selector selector invoked once per string
 * @param strings  input strings, processed in iteration order
 * @return a {@link PlainText} built from the kept results
 */
protected Selectable select(Selector selector, List<String> strings) {
    List<String> kept = new ArrayList<String>();
    for (String input : strings) {
        String picked = selector.select(input);
        if (picked == null) {
            continue;
        }
        kept.add(picked);
    }
    return new PlainText(kept);
}
/** * select elements * * @param elementSelector elementSelector * @return result */ protected Selectable selectElements(BaseElementSelector elementSelector) { ListIterator<Element> elementIterator = getElements().listIterator(); if (!elementSelector.hasAttribute()) { List<Element> resultElements = new ArrayList<Element>(); while (elementIterator.hasNext()) { Element element = checkElementAndConvert(elementIterator); List<Element> selectElements = elementSelector.selectElements(element); resultElements.addAll(selectElements); } return new HtmlNode(resultElements); } else { // has attribute, consider as plaintext List<String> resultStrings = new ArrayList<String>(); while (elementIterator.hasNext()) { Element element = checkElementAndConvert(elementIterator); List<String> selectList = elementSelector.selectList(element); resultStrings.addAll(selectList); } return new PlainText(resultStrings); } }
/**
 * Converts an HTTP response into a {@link Page}.
 *
 * <p>The body bytes are always stored. For non-binary requests the bytes are also decoded
 * into raw text, using {@code charset} or, when it is null, the result of
 * {@code getHtmlCharset}. Headers are attached only when {@code responseHeader} is set.
 *
 * @param request      the request that produced the response
 * @param charset      charset name used for decoding, or null to determine it from the response
 * @param httpResponse the HTTP response to convert
 * @param task         not used by this method
 * @return the populated page, marked as successfully downloaded
 * @throws IOException if the body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] payload = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String declaredType = "";
    if (httpResponse.getEntity().getContentType() != null) {
        declaredType = httpResponse.getEntity().getContentType().getValue();
    }

    Page result = new Page();
    result.setBytes(payload);
    if (!request.isBinaryContent()) {
        String decodeWith = (charset == null) ? getHtmlCharset(declaredType, payload) : charset;
        result.setCharset(decodeWith);
        result.setRawText(new String(payload, decodeWith));
    }
    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);
    if (responseHeader) {
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}
/**
 * Builds a {@link Page} from an HTTP response.
 *
 * <p>The text comes from {@code getContent(charset, httpResponse)}; the page also receives
 * the request URL wrapped as {@link PlainText}, the request itself, and the response status code.
 *
 * @param request      the request that produced the response
 * @param charset      charset passed through to {@code getContent}
 * @param httpResponse the HTTP response to convert
 * @param task         not used by this method
 * @return the populated page
 * @throws IOException declared by the signature; no visible statement throws it except
 *                     possibly {@code getContent}
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    String rawText = getContent(charset, httpResponse);

    Page result = new Page();
    result.setRawText(rawText);
    PlainText pageUrl = new PlainText(request.getUrl());
    result.setUrl(pageUrl);
    result.setRequest(request);
    int status = httpResponse.getStatusLine().getStatusCode();
    result.setStatusCode(status);
    return result;
}
/**
 * Converts an HTTP response into a {@link Page}.
 *
 * <p>Sets the raw text from {@code getContent(charset, httpResponse)}, the request URL
 * wrapped as {@link PlainText}, the request, and the status code from the response status line.
 *
 * @param request      the request that produced the response
 * @param charset      charset passed through to {@code getContent}
 * @param httpResponse the HTTP response to convert
 * @param task         not used by this method
 * @return the populated page
 * @throws IOException declared by the signature; no visible statement throws it except
 *                     possibly {@code getContent}
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    String body = getContent(charset, httpResponse);

    Page built = new Page();
    built.setRawText(body);
    built.setUrl(new PlainText(request.getUrl()));
    built.setRequest(request);
    int code = httpResponse.getStatusLine().getStatusCode();
    built.setStatusCode(code);
    return built;
}
@Override protected void parse(Seed seed, String result, GrabResult crawlResult) { if (result == null) { seed.retry(); return; } SipSoupPage sipSoupPage = new SipSoupPage(); sipSoupPage.setRawText(result); sipSoupPage.setUrl(new PlainText(seed.getData())); sipSoupPage.setRequest(CovertUtil.convertSeed(seed)); sipSoupPage.setStatusCode(200); pageProcessor.process(sipSoupPage); // new url List<Request> targetRequests = sipSoupPage.getTargetRequests(); for (Request request : targetRequests) { crawlResult.addSeed(CovertUtil.covertRequest(request)); } if (!sipSoupPage.getResultItems().isSkip()) { ResultItems resultItems = sipSoupPage.getResultItems(); crawlResult.addResult(JSONObject.toJSONString(resultItems.getAll())); } }
/**
 * Processes a page from diaoyuweng.com.
 *
 * <p>Queues links that match the paged listing pattern and the thread pattern as target
 * requests. When the page URL contains "thread", it stores the fields "title", "content",
 * "date" and "id" on the page.
 *
 * @param page the page to process
 */
@Override
public void process(Page page) {
    List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
    page.addTargetRequests(requests);
    // Escaped the dot before "html" so it matches only a literal '.', consistent with the
    // escaped dots elsewhere in the pattern.
    requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
    page.addTargetRequests(requests);
    // NOTE(review): listing URLs also contain "thread" (do=thread, type=thread), so this
    // branch presumably runs for them as well - confirm that is intended.
    if (page.getUrl().toString().contains("thread")) {
        page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
        page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
        page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
        // The id is the literal prefix "1000" followed by the thread number captured from the URL.
        page.putField("id", new PlainText("1000" + page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html").toString()));
    }
}