/**
 * Logs basic statistics about a fetched page and, when the page was parsed as
 * HTML, hands the page to {@code postgresDBService} for storage.
 *
 * <p>A {@link RuntimeException} thrown while storing is logged and not
 * rethrown, so a failed store does not propagate out of this method.
 *
 * @param page the fetched page to inspect and store
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // Parameterized logging: the message is only assembled when the level is enabled.
    logger.info("URL: {}", url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();

        logger.info("Text length: {}", text.length());
        logger.info("Html length: {}", html.length());
        logger.info("Number of outgoing links: {}", links.size());

        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            // Deliberate best-effort: record which page failed (with the stack trace) and carry on.
            logger.error("Storing failed for {}", url, e);
        }
    }
}
// Read three values from the parsed HTML page: the result of getText(), the result of
// getHtml(), and the set of outgoing WebURLs returned by getOutgoingUrls().
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();
/**
 * Logs identifying details of a visited page: its docid, URL and parent docid.
 * When the page's parse data is HTML, the text length, HTML length and number
 * of outgoing links are logged as well. A separator line is logged last.
 *
 * @param page the page that was fetched
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    int id = webUrl.getDocid();
    String address = webUrl.getURL();
    int parentId = webUrl.getParentDocid();

    logger.debug("Docid: {}", id);
    logger.info("URL: {}", address);
    logger.debug("Docid of parent page: {}", parentId);

    Object parsed = page.getParseData();
    if (parsed instanceof HtmlParseData) {
        HtmlParseData htmlData = (HtmlParseData) parsed;
        String plainText = htmlData.getText();
        String markup = htmlData.getHtml();
        Set<WebURL> outgoing = htmlData.getOutgoingUrls();

        logger.debug("Text length: {}", plainText.length());
        logger.debug("Html length: {}", markup.length());
        logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    // Visual separator between the log output of consecutive pages.
    logger.debug("=============");
}
}
/**
 * Reports on a single visited page through the logger.
 *
 * <p>The docid, URL and parent docid are always logged. For pages whose parse
 * data is an {@code HtmlParseData}, the lengths of the extracted text and raw
 * HTML and the count of outgoing links are logged too. The output ends with a
 * separator line.
 *
 * @param page the page that was fetched
 */
@Override
public void visit(Page page) {
    WebURL pageUrl = page.getWebURL();
    int pageDocid = pageUrl.getDocid();
    String location = pageUrl.getURL();
    int parentPageDocid = pageUrl.getParentDocid();

    logger.debug("Docid: {}", pageDocid);
    logger.info("URL: {}", location);
    logger.debug("Docid of parent page: {}", parentPageDocid);

    Object parseResult = page.getParseData();
    if (parseResult instanceof HtmlParseData) {
        HtmlParseData htmlResult = (HtmlParseData) parseResult;
        String extractedText = htmlResult.getText();
        String rawHtml = htmlResult.getHtml();
        Set<WebURL> outgoingUrls = htmlResult.getOutgoingUrls();

        logger.debug("Text length: {}", extractedText.length());
        logger.debug("Html length: {}", rawHtml.length());
        logger.debug("Number of outgoing links: {}", outgoingUrls.size());
    }

    // Marks the end of this page's log entries.
    logger.debug("=============");
}
}
// Read three values from the parsed HTML page: the result of getText(), the result of
// getHtml(), and the outgoing WebURLs returned by getOutgoingUrls().
// NOTE(review): links is declared as List<WebURL> here; whether getOutgoingUrls() returns a
// List or a Set presumably depends on the crawler4j version in use — confirm.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls();
public class MyCrawler extends WebCrawler { // Decide if you want to go to a webpage based on the URL @Override public boolean shouldVisit(WebURL url) { String href = url.getURL().toLowerCase(); return true; } @Override public void visit(Page page) { String url = page.getWebURL().getURL(); System.out.println("URL: " + url); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls(); } } }
// Capture the values exposed by htmlParseData in locals: getText() into text, getHtml() into
// html, and the Set of outgoing WebURLs from getOutgoingUrls() into links.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();
@Override public void visit(Page page) { logger.info("Visited: {}", page.getWebURL().getURL()); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData parseData = (HtmlParseData) page.getParseData(); Set<WebURL> links = parseData.getOutgoingUrls(); myCrawlStat.incTotalLinks(links.size()); try { myCrawlStat.incTotalTextSize(parseData.getText().getBytes("UTF-8").length); } catch (UnsupportedEncodingException ignored) { // Do nothing } } // We dump this crawler statistics after processing every 50 pages if ((myCrawlStat.getTotalProcessedPages() % 50) == 0) { dumpMyData(); } }
// Iterate over each WebURL returned by htmlParseData.getOutgoingUrls(); the loop body is not
// part of this fragment.
for (WebURL webURL : htmlParseData.getOutgoingUrls())