/**
 * Persists an HTML page through {@code insertKeyStatement}.
 *
 * <p>Pages whose parse data is not {@link HtmlParseData} are ignored. For HTML pages the
 * statement parameters are bound in order (1: html, 2: text, 3: page URL, 4: current time)
 * and the statement is executed.
 *
 * @param page the page to store
 * @throws RuntimeException wrapping the {@link SQLException} raised while binding or executing
 */
@Override
public void store(Page page) {
    // Nothing to persist unless the page was parsed as HTML.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }
    try {
        final HtmlParseData parsed = (HtmlParseData) page.getParseData();
        insertKeyStatement.setString(1, parsed.getHtml());
        insertKeyStatement.setString(2, parsed.getText());
        insertKeyStatement.setString(3, page.getWebURL().getURL());
        insertKeyStatement.setTimestamp(4, new Timestamp(System.currentTimeMillis()));
        insertKeyStatement.executeUpdate();
    } catch (SQLException sqlFailure) {
        // Log with the offending URL, then surface the failure to the caller unchecked.
        logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), sqlFailure);
        throw new RuntimeException(sqlFailure);
    }
}
/**
 * Logs basic facts about a visited page and, for HTML pages, hands the page to
 * {@code postgresDBService} for storage.
 *
 * <p>A {@link RuntimeException} thrown by the store call is logged and not rethrown.
 *
 * @param page the page that was fetched
 */
@Override
public void visit(Page page) {
    // Parameterized messages: same output as before, without eager string concatenation.
    logger.info("URL: {}", page.getWebURL().getURL());

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();

        logger.info("Text length: {}", htmlParseData.getText().length());
        logger.info("Html length: {}", htmlParseData.getHtml().length());
        logger.info("Number of outgoing links: {}", links.size());

        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            // Best effort: a failed store is reported here and goes no further.
            logger.error("Storing failed", e);
        }
    }
}
// Builds a LanguageIdentifier over the parsed text and records the language it reports on the page.
// NOTE(review): LanguageIdentifier and parsedData are declared outside this excerpt — confirm their origin.
LanguageIdentifier languageIdentifier = new LanguageIdentifier(parsedData.getText()); page.setLanguage(languageIdentifier.getLanguage());
/**
 * Downloads a single URL and logs what was obtained.
 *
 * <p>Warns when {@code download} returns {@code null} or when the page has no parse data; for
 * HTML parse data the title and the text/html lengths are logged at debug level. A separator
 * line is always logged at the end.
 *
 * @param url the URL to process
 */
public void processUrl(String url) {
    logger.debug("Processing: {}", url);

    final Page fetched = download(url);
    // Parse data is only consulted when the download produced a page.
    final ParseData parsed = (fetched == null) ? null : fetched.getParseData();

    if (fetched == null) {
        logger.warn("Couldn't fetch the content of the page.");
    } else if (parsed == null) {
        logger.warn("Couldn't parse the content of the page.");
    } else if (parsed instanceof HtmlParseData) {
        final HtmlParseData htmlData = (HtmlParseData) parsed;
        logger.debug("Title: {}", htmlData.getTitle());
        logger.debug("Text length: {}", htmlData.getText().length());
        logger.debug("Html length: {}", htmlData.getHtml().length());
    }

    logger.debug("==============");
}
static String wordToFind = "..."; public void visit(Page page) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); if(text.indexOf(wordToFind)!=-1) saveToDB(page.getWebURL().getURL()): } }
// Reads the text, the HTML and the set of outgoing URLs from htmlParseData into locals.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();
/**
 * Logs identifying details of a visited page and, for HTML pages, the sizes of its text,
 * its html and its set of outgoing links. A separator line is logged last.
 *
 * @param page the page that was fetched
 */
@Override
public void visit(Page page) {
    final WebURL webUrl = page.getWebURL();
    final int pageDocid = webUrl.getDocid();
    final String pageUrl = webUrl.getURL();
    final int parentId = webUrl.getParentDocid();

    logger.debug("Docid: {}", pageDocid);
    logger.info("URL: {}", pageUrl);
    logger.debug("Docid of parent page: {}", parentId);

    final boolean isHtml = page.getParseData() instanceof HtmlParseData;
    if (isHtml) {
        final HtmlParseData parsed = (HtmlParseData) page.getParseData();
        final String plainText = parsed.getText();
        final String markup = parsed.getHtml();
        final Set<WebURL> outgoing = parsed.getOutgoingUrls();

        logger.debug("Text length: {}", plainText.length());
        logger.debug("Html length: {}", markup.length());
        logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    logger.debug("=============");
}
} // closes an enclosing scope opened before this excerpt
/**
 * Reports a visited page to the log: docid, URL and parent docid for every page, plus
 * text length, html length and outgoing-link count when the parse data is HTML.
 * Ends by logging a separator line.
 *
 * @param page the page that was fetched
 */
@Override
public void visit(Page page) {
    WebURL source = page.getWebURL();
    int id = source.getDocid();
    String address = source.getURL();
    int parentId = source.getParentDocid();

    logger.debug("Docid: {}", id);
    logger.info("URL: {}", address);
    logger.debug("Docid of parent page: {}", parentId);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlData = (HtmlParseData) page.getParseData();
        String bodyText = htmlData.getText();
        String rawHtml = htmlData.getHtml();
        Set<WebURL> outLinks = htmlData.getOutgoingUrls();

        logger.debug("Text length: {}", bodyText.length());
        logger.debug("Html length: {}", rawHtml.length());
        logger.debug("Number of outgoing links: {}", outLinks.size());
    }

    logger.debug("=============");
}
} // closes an enclosing scope opened before this excerpt
// Reads the text, the HTML and the list of outgoing URLs from htmlParseData into locals.
// NOTE(review): other snippets in this file assign getOutgoingUrls() to Set<WebURL>; confirm which
// collection type the library version in use actually returns.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls();
public class MyCrawler extends WebCrawler { // Decide if you want to go to a webpage based on the URL @Override public boolean shouldVisit(WebURL url) { String href = url.getURL().toLowerCase(); return true; } @Override public void visit(Page page) { String url = page.getWebURL().getURL(); System.out.println("URL: " + url); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls(); } } }
// Reads the text, the HTML and the set of outgoing URLs from htmlParseData into locals.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();
@Override public void visit(Page page) { logger.info("Visited: {}", page.getWebURL().getURL()); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData parseData = (HtmlParseData) page.getParseData(); Set<WebURL> links = parseData.getOutgoingUrls(); myCrawlStat.incTotalLinks(links.size()); try { myCrawlStat.incTotalTextSize(parseData.getText().getBytes("UTF-8").length); } catch (UnsupportedEncodingException ignored) { // Do nothing } } // We dump this crawler statistics after processing every 50 pages if ((myCrawlStat.getTotalProcessedPages() % 50) == 0) { dumpMyData(); } }
// Builds a LanguageIdentifier over the parsed text and records the language it reports on the page.
// NOTE(review): LanguageIdentifier and parsedData are declared outside this excerpt — confirm their origin.
LanguageIdentifier languageIdentifier = new LanguageIdentifier(parsedData.getText()); page.setLanguage(languageIdentifier.getLanguage());