/**
 * Logs basic size statistics for a visited page and hands HTML pages to
 * {@code postgresDBService} for storage.
 *
 * <p>Pages whose parse data is not {@code HtmlParseData} are only logged by URL. A
 * {@code RuntimeException} raised by the store call is logged and not rethrown, so a single
 * failed store does not escape this callback.
 *
 * @param page the page that was fetched and parsed
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    logger.info("URL: {}", url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();

        // Parameterized messages: same output as concatenation, but the string is only
        // built when the level is enabled.
        logger.info("Text length: {}", text.length());
        logger.info("Html length: {}", html.length());
        logger.info("Number of outgoing links: {}", links.size());

        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            // Best effort: record the failure and keep going.
            logger.error("Storing failed", e);
        }
    }
}
public HtmlParseData parse(Page page, String contextURL) throws ParseException { HtmlParseData parsedData = new HtmlParseData(); parsedData.setContentCharset(contentCharset); parsedData.setText(contentHandler.getBodyText().trim()); parsedData.setTitle(metadata.get(DublinCore.TITLE)); parsedData.setMetaTags(contentHandler.getMetaTags()); parsedData.setOutgoingUrls(outgoingUrls); parsedData.setHtml(new String(page.getContentData())); } else { parsedData.setHtml(new String(page.getContentData(), page.getContentCharset()));
// Copy the charset recorded on the parse result onto the page, then build a
// LanguageIdentifier from the parsed text and store the language it reports on the page.
page.setContentCharset(parsedData.getContentCharset()); LanguageIdentifier languageIdentifier = new LanguageIdentifier(parsedData.getText()); page.setLanguage(languageIdentifier.getLanguage());
/**
 * Persists one HTML page through the prepared {@code insertKeyStatement}.
 *
 * <p>Parameters are bound in order: HTML, extracted text, page URL, current timestamp. Pages
 * whose parse data is not {@code HtmlParseData} are ignored.
 *
 * @param page the page to store
 * @throws RuntimeException wrapping the {@code SQLException} if binding or executing the
 *     statement fails
 */
@Override
public void store(Page page) {
    // Nothing to persist for non-HTML content.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    try {
        HtmlParseData parsed = (HtmlParseData) page.getParseData();
        insertKeyStatement.setString(1, parsed.getHtml());
        insertKeyStatement.setString(2, parsed.getText());
        insertKeyStatement.setString(3, page.getWebURL().getURL());
        insertKeyStatement.setTimestamp(4, new Timestamp(System.currentTimeMillis()));
        insertKeyStatement.executeUpdate();
    } catch (SQLException sqlFailure) {
        logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), sqlFailure);
        throw new RuntimeException(sqlFailure);
    }
}
/**
 * Downloads a single URL and logs a short summary of the result.
 *
 * <p>Warns when the download yields no page or the page has no parse data. For HTML parse data
 * the title and the text/HTML lengths are logged at debug level; other parse data types are not
 * reported. A separator line is always logged at the end.
 *
 * @param url the address to download
 */
public void processUrl(String url) {
    logger.debug("Processing: {}", url);

    Page fetched = download(url);
    if (fetched == null) {
        logger.warn("Couldn't fetch the content of the page.");
    } else {
        ParseData parsed = fetched.getParseData();
        if (parsed == null) {
            logger.warn("Couldn't parse the content of the page.");
        } else if (parsed instanceof HtmlParseData) {
            HtmlParseData htmlData = (HtmlParseData) parsed;
            logger.debug("Title: {}", htmlData.getTitle());
            logger.debug("Text length: {}", htmlData.getText().length());
            logger.debug("Html length: {}", htmlData.getHtml().length());
        }
    }

    logger.debug("==============");
}
// Read the HTML string held by the parse data.
String html = htmlParseData.getHtml();
@Override public void visit(Page page) { logger.info("Visited: {}", page.getWebURL().getURL()); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData parseData = (HtmlParseData) page.getParseData(); Set<WebURL> links = parseData.getOutgoingUrls(); myCrawlStat.incTotalLinks(links.size()); try { myCrawlStat.incTotalTextSize(parseData.getText().getBytes("UTF-8").length); } catch (UnsupportedEncodingException ignored) { // Do nothing } } // We dump this crawler statistics after processing every 50 pages if ((myCrawlStat.getTotalProcessedPages() % 50) == 0) { dumpMyData(); } }
for (WebURL webURL : htmlParseData.getOutgoingUrls()) new Values(htmlParseData.getTitle(), new String(page.getContentData(), "UTF-8"), curURL.getURL(), date));
static String wordToFind = "..."; public void visit(Page page) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); if(text.indexOf(wordToFind)!=-1) saveToDB(page.getWebURL().getURL()): } }
// Read the HTML string held by the parse data.
String html = htmlParseData.getHtml();
public class MyCrawler extends WebCrawler { // Decide if you want to go to a webpage based on the URL @Override public boolean shouldVisit(WebURL url) { String href = url.getURL().toLowerCase(); return true; } @Override public void visit(Page page) { String url = page.getWebURL().getURL(); System.out.println("URL: " + url); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls(); } } }
/**
 * Inspects a visited page for an embedded {@code var flashvars} object and, when found, saves a
 * record built from it and optionally downloads the referenced video.
 *
 * <p>Newlines are stripped from the HTML before matching. If the flashvars pattern matches, the
 * third capture group (the brace-delimited object containing {@code language}) is parsed with
 * {@code objectMapper} and turned into a record that is saved via {@code pornRecordDao}; the
 * video is downloaded only when {@code Properties.DOWNLOAD_VIDEO} is set. Otherwise, if the page
 * matches the {@code leastFactor}/{@code go} script pattern, the HTML is passed to
 * {@code RnkeyUtils.genRnKey}. Failures in either branch are logged and not rethrown.
 *
 * <p>Pages whose parse data is not {@code HtmlParseData} are skipped.
 *
 * @param page the page that was fetched and parsed
 */
@Override
public void visit(Page page) {
    // NOTE(review): contains("") is true for every string, so this filter currently lets all
    // URLs through — confirm whether a real URL fragment was meant here.
    if (!page.getWebURL().getURL().contains("")) {
        return;
    }
    // Guard the cast: non-HTML parse data would otherwise raise ClassCastException.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    String html = ((HtmlParseData) page.getParseData()).getHtml().replaceAll("\n", "");
    Matcher htmlMatcher = Pattern.compile("(.*)(var flashvars.*?)(\\{.*language.*?})(.*)").matcher(html);

    if (htmlMatcher.matches()) {
        logger.info("Find video url:[{}]", page.getWebURL().getURL());
        try {
            String viewKey = getEmbedKey(page.getWebURL());
            // matches() succeeded on the whole input, so group 3 is exactly what
            // replaceAll("$3") used to produce.
            String videoJson = htmlMatcher.group(3);
            HashMap<String, Object> videoMap = objectMapper.readValue(videoJson, HashMap.class);
            PornRecord pornRecord = new PornRecord(videoMap, viewKey, Properties.FILE_PATH, Properties.DOWNLOAD_VIDEO);
            pornRecordDao.save(pornRecord);
            if (Properties.DOWNLOAD_VIDEO) {
                downloadService.download(myController.getConfig(), pornRecord.getVideoUrl(), new File(pornRecord.getFilePath()));
            }
        } catch (Exception e) {
            logger.error("Download fail", e);
        }
    } else if (Pattern.compile("(.*)(function leastFactor\\(n\\).*)(function go\\(\\) \\{ )(.*)(n=leastFactor\\(p\\);\\{)(.*?=)(.*?;)(.*)").matcher(html).matches()) {
        try {
            RnkeyUtils.genRnKey(html);
        } catch (ScriptException e) {
            // Route through the logger instead of printStackTrace so the failure is recorded
            // with the rest of the crawler's output.
            logger.error("Generate rnkey fail, url:[{}]", page.getWebURL().getURL(), e);
        }
    }
}
/**
 * Parses the raw content of a page into an {@code HtmlParseData}.
 *
 * <p>The content bytes are run through {@code htmlParser}; the resulting body text, title, meta
 * tags and outgoing URLs are copied onto the result together with the charset returned by
 * {@code chooseEncoding}. The HTML string itself is decoded with the page's own content charset
 * when one is set.
 *
 * @param page the page whose content bytes are parsed
 * @param contextURL passed to {@code getOutgoingUrls} when collecting outgoing links
 * @return the populated parse data
 * @throws ParseException if the parser fails or the page's charset name is not supported
 */
public HtmlParseData parse(Page page, String contextURL) throws ParseException {
    HtmlParseData parsedData = new HtmlParseData();

    HtmlContentHandler contentHandler = new HtmlContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
        htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
    } catch (Exception e) {
        // Trailing throwable keeps the stack trace in the log; the message text is unchanged.
        logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
        throw new ParseException();
    }

    String contentCharset = chooseEncoding(page, metadata);
    parsedData.setContentCharset(contentCharset);

    parsedData.setText(contentHandler.getBodyText().trim());
    parsedData.setTitle(metadata.get(DublinCore.TITLE));
    parsedData.setMetaTags(contentHandler.getMetaTags());

    Set<WebURL> outgoingUrls = getOutgoingUrls(contextURL, contentHandler, contentCharset);
    parsedData.setOutgoingUrls(outgoingUrls);

    try {
        if (page.getContentCharset() == null) {
            // No charset on the page: new String(byte[]) decodes with the JVM default charset.
            parsedData.setHtml(new String(page.getContentData()));
        } else {
            parsedData.setHtml(new String(page.getContentData(), page.getContentCharset()));
        }

        return parsedData;
    } catch (UnsupportedEncodingException e) {
        logger.error("error parsing the html: {}", page.getWebURL().getURL(), e);
        throw new ParseException();
    }
}
// Copy the charset recorded on the parse result onto the page, then build a
// LanguageIdentifier from the parsed text and store the language it reports on the page.
page.setContentCharset(parsedData.getContentCharset()); LanguageIdentifier languageIdentifier = new LanguageIdentifier(parsedData.getText()); page.setLanguage(languageIdentifier.getLanguage());
/**
 * Logs identifying details of a visited page and, for HTML pages, the sizes of its extracted
 * text, HTML and outgoing-link set.
 *
 * <p>The URL is logged at info level; everything else, including the closing separator line, is
 * logged at debug level.
 *
 * @param page the page that was fetched and parsed
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    int pageDocid = webUrl.getDocid();
    String pageUrl = webUrl.getURL();
    int parentId = webUrl.getParentDocid();

    logger.debug("Docid: {}", pageDocid);
    logger.info("URL: {}", pageUrl);
    logger.debug("Docid of parent page: {}", parentId);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parsed = (HtmlParseData) page.getParseData();
        String bodyText = parsed.getText();
        String rawHtml = parsed.getHtml();
        Set<WebURL> outgoing = parsed.getOutgoingUrls();

        logger.debug("Text length: {}", bodyText.length());
        logger.debug("Html length: {}", rawHtml.length());
        logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    logger.debug("=============");
}
}
/**
 * Reports a visited page to the log: its docid, URL and parent docid, followed by the text
 * length, HTML length and outgoing-link count when the page carries HTML parse data.
 *
 * <p>Only the URL is logged at info level; all other lines are debug-level, and a separator line
 * is always written last.
 *
 * @param page the page that was fetched and parsed
 */
@Override
public void visit(Page page) {
    final int id = page.getWebURL().getDocid();
    final String address = page.getWebURL().getURL();
    final int parentId = page.getWebURL().getParentDocid();

    logger.debug("Docid: {}", id);
    logger.info("URL: {}", address);
    logger.debug("Docid of parent page: {}", parentId);

    final boolean isHtml = page.getParseData() instanceof HtmlParseData;
    if (isHtml) {
        final HtmlParseData htmlData = (HtmlParseData) page.getParseData();
        final String pageText = htmlData.getText();
        final String pageHtml = htmlData.getHtml();
        final Set<WebURL> outgoingLinks = htmlData.getOutgoingUrls();

        logger.debug("Text length: {}", pageText.length());
        logger.debug("Html length: {}", pageHtml.length());
        logger.debug("Number of outgoing links: {}", outgoingLinks.size());
    }

    logger.debug("=============");
}
}
// Read the extracted text, the HTML string and the set of outgoing links from the parse data.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();
// Read the extracted text, the HTML string and the outgoing links from the parse data.
// NOTE(review): assumes getOutgoingUrls() returns a List in the library version in use — confirm.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); List<WebURL> links = htmlParseData.getOutgoingUrls();
// Read the extracted text, the HTML string and the set of outgoing links from the parse data.
String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Set<WebURL> links = htmlParseData.getOutgoingUrls();