/**
 * Persists an HTML page through the prepared insert statement.
 *
 * <p>Pages whose parse data is not {@link HtmlParseData} are ignored. For HTML pages the
 * statement parameters are bound in order: (1) HTML, (2) text, (3) page URL, (4) the current
 * time as a {@link Timestamp}; the statement is then executed.
 *
 * @param page the page to store
 * @throws RuntimeException wrapping the {@link SQLException} when binding or executing fails
 */
@Override
public void store(Page page) {
    // Nothing to persist for non-HTML content.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    try {
        final HtmlParseData parsed = (HtmlParseData) page.getParseData();

        insertKeyStatement.setString(1, parsed.getHtml());
        insertKeyStatement.setString(2, parsed.getText());
        insertKeyStatement.setString(3, page.getWebURL().getURL());
        insertKeyStatement.setTimestamp(4, new Timestamp(System.currentTimeMillis()));

        insertKeyStatement.executeUpdate();
    } catch (SQLException sqlFailure) {
        // Log with the offending URL, then surface the failure to the caller unchecked.
        logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), sqlFailure);
        throw new RuntimeException(sqlFailure);
    }
}
/**
 * Logs the URL of a visited page and, for HTML pages, its content sizes before handing the
 * page to the database service for storage.
 *
 * <p>Storage is best effort: a {@link RuntimeException} thrown by the service is logged and
 * not propagated, so one failed insert does not abort the visit.
 *
 * @param page the page being visited
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // Parameterized logging: the message is only assembled when INFO is enabled.
    logger.info("URL: {}", url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();

        logger.info("Text length: {}", text.length());
        logger.info("Html length: {}", html.length());
        logger.info("Number of outgoing links: {}", links.size());

        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            // Deliberately swallowed after logging so the visit completes.
            logger.error("Storing failed", e);
        }
    }
}
/**
 * Downloads the given URL and logs what could be extracted from it.
 *
 * <p>A warning is logged when the download yields no page or the page carries no parse data.
 * For HTML parse data the title and the text/HTML lengths are logged at debug level; other
 * parse data types produce no extra output. A separator line is always logged last.
 *
 * @param url the address to download and inspect
 */
public void processUrl(String url) {
    logger.debug("Processing: {}", url);

    final Page fetched = download(url);
    if (fetched == null) {
        logger.warn("Couldn't fetch the content of the page.");
    } else {
        final ParseData parsed = fetched.getParseData();
        if (parsed == null) {
            logger.warn("Couldn't parse the content of the page.");
        } else if (parsed instanceof HtmlParseData) {
            final HtmlParseData htmlData = (HtmlParseData) parsed;
            logger.debug("Title: {}", htmlData.getTitle());
            logger.debug("Text length: {}", htmlData.getText().length());
            logger.debug("Html length: {}", htmlData.getHtml().length());
        }
    }

    logger.debug("==============");
}
// Narrow the page's parse data to HtmlParseData, then read its text, its HTML and its set of
// outgoing URLs into locals.
// NOTE(review): the cast is unguarded in this fragment - presumably an instanceof check
// precedes it in the enclosing method; confirm.
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();
/**
 * Logs identifying details of a page and, for HTML pages, the size of its content.
 *
 * <p>The doc id, URL and parent doc id are always logged. When the parse data is
 * {@link HtmlParseData}, the text length, HTML length and outgoing link count follow.
 * A separator line is logged last in every case.
 *
 * @param page the page to report on
 */
@Override
public void visit(Page page) {
    final WebURL location = page.getWebURL();
    final int pageId = location.getDocid();
    final String address = location.getURL();
    final int parentId = location.getParentDocid();

    logger.debug("Docid: {}", pageId);
    logger.info("URL: {}", address);
    logger.debug("Docid of parent page: {}", parentId);

    final Object parsed = page.getParseData();
    if (parsed instanceof HtmlParseData) {
        final HtmlParseData htmlData = (HtmlParseData) parsed;
        final String plainText = htmlData.getText();
        final String markup = htmlData.getHtml();
        final Set<WebURL> outgoing = htmlData.getOutgoingUrls();

        logger.debug("Text length: {}", plainText.length());
        logger.debug("Html length: {}", markup.length());
        logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    logger.debug("=============");
}
}
/**
 * Reports on a visited page through the logger.
 *
 * <p>Always logs the page's doc id, URL and parent doc id. If the page's parse data is
 * {@link HtmlParseData}, additionally logs the text length, the HTML length and the number
 * of outgoing links. Ends with a separator line.
 *
 * @param page the page to report on
 */
@Override
public void visit(Page page) {
    final WebURL webUrl = page.getWebURL();
    final int id = webUrl.getDocid();
    final String pageUrl = webUrl.getURL();
    final int parent = webUrl.getParentDocid();

    logger.debug("Docid: {}", id);
    logger.info("URL: {}", pageUrl);
    logger.debug("Docid of parent page: {}", parent);

    final Object data = page.getParseData();
    if (data instanceof HtmlParseData) {
        final HtmlParseData htmlData = (HtmlParseData) data;
        final String bodyText = htmlData.getText();
        final String bodyHtml = htmlData.getHtml();
        final Set<WebURL> outLinks = htmlData.getOutgoingUrls();

        logger.debug("Text length: {}", bodyText.length());
        logger.debug("Html length: {}", bodyHtml.length());
        logger.debug("Number of outgoing links: {}", outLinks.size());
    }

    logger.debug("=============");
}
}
// Read the HTML string exposed by the parse data into a local.
String html = htmlParseData.getHtml();
// Read the HTML string exposed by the parse data into a local.
String html = htmlParseData.getHtml();
// Narrow the page's parse data to HtmlParseData, then read its text, its HTML and its list of
// outgoing URLs into locals.
// NOTE(review): the cast is unguarded in this fragment - presumably an instanceof check
// precedes it in the enclosing method; confirm.
// NOTE(review): links is declared as List<WebURL>; this must match the return type of
// getOutgoingUrls() in the crawler library version in use - confirm.
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls();
/**
 * Matches single-line HTML that contains a {@code var flashvars} assignment followed by a
 * brace-delimited literal mentioning {@code language}; group 3 captures that literal.
 * Compiled once because the expression is expensive to build and is applied to every page.
 */
private static final Pattern FLASHVARS_PAGE =
        Pattern.compile("(.*)(var flashvars.*?)(\\{.*language.*?})(.*)");

/**
 * Matches single-line HTML that contains the {@code leastFactor(n)} / {@code go()} script.
 * NOTE(review): presumably a JavaScript challenge page served instead of the real content;
 * confirm against {@code RnkeyUtils.genRnKey}.
 */
private static final Pattern LEAST_FACTOR_PAGE = Pattern.compile(
        "(.*)(function leastFactor\\(n\\).*)(function go\\(\\) \\{ )(.*)(n=leastFactor\\(p\\);\\{)(.*?=)(.*?;)(.*)");

/**
 * Inspects a visited HTML page for an embedded {@code flashvars} JSON object and, when found,
 * saves a record built from it and optionally downloads the referenced video.
 *
 * <p>When the page instead matches the {@code leastFactor} script pattern, its HTML is passed
 * to {@code RnkeyUtils.genRnKey}. Pages whose parse data is not {@link HtmlParseData} are
 * skipped. Failures while parsing, saving or downloading are logged and not propagated.
 *
 * @param page the page being visited
 */
@Override
public void visit(Page page) {
    final String pageUrl = page.getWebURL().getURL();

    // NOTE(review): contains("") is true for every non-null URL, so this filter currently
    // admits all pages - it looks like a site-specific substring is missing; confirm.
    if (!pageUrl.contains("")) {
        return;
    }

    // Guard the cast: non-HTML (or unparsed) pages previously failed here with a
    // ClassCastException / NullPointerException.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    // Line feeds are stripped so that '.' in the patterns can span the whole document.
    final String html = ((HtmlParseData) page.getParseData()).getHtml().replace("\n", "");

    final Matcher flashvars = FLASHVARS_PAGE.matcher(html);
    if (flashvars.matches()) {
        logger.info("Find video url:[{}]", pageUrl);
        try {
            String viewKey = getEmbedKey(page.getWebURL());
            // matches() has already bound the groups; read group 3 directly rather than
            // re-running the expression through replaceAll("$3").
            String videoJson = flashvars.group(3);
            // readValue with the raw HashMap class cannot carry type arguments.
            @SuppressWarnings("unchecked")
            HashMap<String, Object> videoMap = objectMapper.readValue(videoJson, HashMap.class);
            PornRecord pornRecord =
                    new PornRecord(videoMap, viewKey, Properties.FILE_PATH, Properties.DOWNLOAD_VIDEO);
            pornRecordDao.save(pornRecord);
            if (Properties.DOWNLOAD_VIDEO) {
                downloadService.download(
                        myController.getConfig(), pornRecord.getVideoUrl(), new File(pornRecord.getFilePath()));
            }
        } catch (Exception e) {
            // Best effort per page: log and carry on with the crawl.
            logger.error("Download fail", e);
        }
    } else if (LEAST_FACTOR_PAGE.matcher(html).matches()) {
        try {
            RnkeyUtils.genRnKey(html);
        } catch (ScriptException e) {
            // Route through the logger instead of printStackTrace().
            logger.error("Generate rnkey fail", e);
        }
    }
}
// Narrow the page's parse data to HtmlParseData, then read its text, its HTML and its set of
// outgoing URLs into locals.
// NOTE(review): the cast is unguarded in this fragment - presumably an instanceof check
// precedes it in the enclosing method; confirm.
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();
public class MyCrawler extends WebCrawler { // Decide if you want to go to a webpage based on the URL @Override public boolean shouldVisit(WebURL url) { String href = url.getURL().toLowerCase(); return true; } @Override public void visit(Page page) { String url = page.getWebURL().getURL(); System.out.println("URL: " + url); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls(); } } }