/**
 * Extracts every URL referenced by this CSS content and records the result
 * as this parse data's outgoing links.
 *
 * @param referringPage the page this CSS was fetched from; its path and URL
 *                      are used to resolve relative links
 * @throws UnsupportedEncodingException if URL text cannot be decoded during extraction
 */
public void setOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
    this.setOutgoingUrls(parseOutgoingUrls(referringPage));
}
/**
 * Creates a Parser backed by the default Tika-based HTML parser.
 *
 * @param config  crawl configuration shared with the delegate parser
 * @param tldList top-level-domain list used when building extracted URLs
 * @throws IllegalAccessException if the delegate parser cannot be instantiated
 * @throws InstantiationException if the delegate parser cannot be instantiated
 */
public Parser(CrawlConfig config, TLDList tldList) throws IllegalAccessException, InstantiationException { this(config, new TikaHtmlParser(config, tldList), tldList); }
/**
 * Records an extracted link in the outgoing-URL list, remembering it as the
 * current URL so a later anchor-text callback can attach its anchor.
 */
private void addToOutgoingUrls(String href, String tag) {
    ExtractedUrlAnchorPair pair = new ExtractedUrlAnchorPair();
    pair.setHref(href);
    pair.setTag(tag);
    curUrl = pair;
    outgoingUrls.add(pair);
}
public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException { if (Util.hasBinaryContent(page.getContentType())) { // BINARY BinaryParseData parseData = new BinaryParseData(); if (config.isIncludeBinaryContentInCrawling()) { if (config.isProcessBinaryContentInCrawling()) { try { parseData.setBinaryContent(page.getContentData()); } catch (Exception e) { if (config.isHaltOnError()) { parseData.setHtml("<html></html>"); if (parseData.getHtml() == null) { throw new ParseException(); parseData.setOutgoingUrls(net.extractUrls(parseData.getHtml())); } else { throw new NotAllowedContentException(); CssParseData parseData = new CssParseData(); if (page.getContentCharset() == null) { parseData.setTextContent(new String(page.getContentData())); } else { parseData.setTextContent( new String(page.getContentData(), page.getContentCharset())); parseData.setOutgoingUrls(page.getWebURL()); page.setParseData(parseData); } catch (Exception e) {
/**
 * Called for every successfully fetched page: logs basic page statistics and,
 * for HTML pages, persists the page via the Postgres service.
 *
 * Uses SLF4J parameterized logging ({} placeholders) instead of string
 * concatenation so message assembly is skipped when INFO is disabled.
 *
 * @param page the fetched page to inspect and store
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    logger.info("URL: {}", url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        logger.info("Text length: {}", text.length());
        logger.info("Html length: {}", html.length());
        logger.info("Number of outgoing links: {}", links.size());
        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            // Storage failure must not abort the crawl; log and continue.
            logger.error("Storing failed", e);
        }
    }
}
public HtmlParseData parse(Page page, String contextURL) throws ParseException { HtmlParseData parsedData = new HtmlParseData(); HtmlContentHandler contentHandler = new HtmlContentHandler(); Metadata metadata = new Metadata(); String contentCharset = chooseEncoding(page, metadata); parsedData.setContentCharset(contentCharset); parsedData.setText(contentHandler.getBodyText().trim()); parsedData.setTitle(metadata.get(DublinCore.TITLE)); parsedData.setMetaTags(contentHandler.getMetaTags()); Set<WebURL> outgoingUrls = getOutgoingUrls(contextURL, contentHandler, contentCharset); parsedData.setOutgoingUrls(outgoingUrls); parsedData.setHtml(new String(page.getContentData())); } else { parsedData.setHtml(new String(page.getContentData(), page.getContentCharset()));
/**
 * Persists an HTML page (html, text, url, timestamp) via the prepared insert
 * statement. Pages without HTML parse data are silently skipped.
 *
 * @param page the crawled page to store
 * @throws RuntimeException wrapping any SQLException from the insert
 */
@Override
public void store(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
        try {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            insertKeyStatement.setString(1, htmlParseData.getHtml());
            insertKeyStatement.setString(2, htmlParseData.getText());
            insertKeyStatement.setString(3, page.getWebURL().getURL());
            // System.currentTimeMillis() avoids allocating a legacy java.util.Date
            // just to read its millis.
            insertKeyStatement.setTimestamp(4, new Timestamp(System.currentTimeMillis()));
            insertKeyStatement.executeUpdate();
        } catch (SQLException e) {
            logger.error("SQL Exception while storing webpage for url '{}'",
                         page.getWebURL().getURL(), e);
            throw new RuntimeException(e);
        }
    }
}
Set<WebURL> outgoingUrls = new HashSet<>(); String baseURL = contentHandler.getBaseUrl(); if (baseURL != null) { contextURL = baseURL; for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) { String href = urlAnchorPair.getHref(); if ((href == null) || href.trim().isEmpty()) { continue; webURL.setTldList(tldList); webURL.setURL(url); webURL.setTag(urlAnchorPair.getTag()); webURL.setAnchor(urlAnchorPair.getAnchor()); webURL.setAttributes(urlAnchorPair.getAttributes()); outgoingUrls.add(webURL); urlCount++;
parser.parse(page, curURL.getURL()); List<WebURL> toSchedule = new ArrayList<>(); int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling(); for (WebURL webURL : parseData.getOutgoingUrls()) { webURL.setParentDocid(curURL.getDocid()); webURL.setParentUrl(curURL.getURL()); page.getContentType().contains("html") && ((HtmlParseData)page.getParseData()) .getMetaTagValue("robots"). contains("noindex");
/**
 * Records an extracted link, its originating tag, and all of the tag's
 * attributes in the outgoing-URL list.
 *
 * @param href       the raw link value found in the tag
 * @param tag        the HTML tag the link was found on
 * @param attributes the SAX attributes of that tag
 */
private void addToOutgoingUrls(String href, String tag, Attributes attributes) {
    curUrl = new ExtractedUrlAnchorPair();
    curUrl.setHref(href);
    curUrl.setTag(tag);
    for (int i = 0; i < attributes.getLength(); i++) {
        // Fetch the value by index: Attributes.getValue(String) looks up by
        // qualified name, which returns null for namespaced attributes and can
        // pick the wrong entry when two attributes share a local name.
        curUrl.setAttribute(attributes.getLocalName(i), attributes.getValue(i));
    }
    outgoingUrls.add(curUrl);
}
/**
 * Builds the set of outgoing links for this CSS document: each URL found in
 * the CSS text is resolved relative to the referring page and converted to an
 * absolute WebURL.
 *
 * @param referringPage the page this stylesheet was referenced from
 * @return the resolved outgoing links (possibly empty, never null)
 * @throws UnsupportedEncodingException if URL extraction fails to decode text
 */
private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
    final String pagePath = referringPage.getPath();
    final String pageUrl = referringPage.getURL();
    Set<WebURL> result = new HashSet<>();
    for (String extracted : extractUrlInCssText(this.getTextContent())) {
        String relative = getLinkRelativeTo(pagePath, extracted);
        String absolute =
            getAbsoluteUrlFrom(URLCanonicalizer.getCanonicalURL(pageUrl), relative);
        WebURL outgoing = new WebURL();
        outgoing.setURL(absolute);
        result.add(outgoing);
    }
    return result;
}
this.parser = parser == null ? new Parser(config, tldList) : parser; this.robotstxtServer = robotstxtServer;
/**
 * SAX close-tag callback. Leaving the body clears the in-body flag; closing a
 * link element (a / area / link) finalizes the pending URL: its accumulated
 * anchor text is normalized (newlines/tabs to spaces, trimmed, capped at
 * MAX_ANCHOR_LENGTH with an ellipsis) and attached before the pending URL is
 * discarded.
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    Element element = HtmlFactory.getElement(localName);
    if (element == Element.BODY) {
        isWithinBodyElement = false;
        return;
    }
    boolean isLinkElement =
        (element == Element.A) || (element == Element.AREA) || (element == Element.LINK);
    if (!isLinkElement) {
        return;
    }
    anchorFlag = false;
    if (curUrl != null) {
        String anchor =
            anchorText.toString().replaceAll("\n", " ").replaceAll("\t", " ").trim();
        if (!anchor.isEmpty()) {
            if (anchor.length() > MAX_ANCHOR_LENGTH) {
                anchor = anchor.substring(0, MAX_ANCHOR_LENGTH) + "...";
            }
            curUrl.setTag(localName);
            curUrl.setAnchor(anchor);
        }
        // Reset the buffer for the next anchor regardless of whether it was used.
        anchorText.delete(0, anchorText.length());
    }
    curUrl = null;
}
/**
 * Resolves a CSS link against the path of the page that referenced it,
 * returning a root-relative path.
 *
 * Handles four cases: already root-relative ("/..."), protocol-relative
 * ("//host/..."), absolute ("http..."), and parent-relative ("../...");
 * anything else is treated as relative to the page's directory.
 *
 * @param pagePath the path component of the referring page's URL
 * @param linkUrl  the raw link text extracted from the CSS
 * @return a root-relative path for the link
 */
private static String getLinkRelativeTo(String pagePath, String linkUrl) {
    if (linkUrl.startsWith("/") && !linkUrl.startsWith("//")) {
        return linkUrl;
    }
    if (linkUrl.startsWith("//")) {
        // Fix: prepend "http:" (with the colon) so the protocol-relative URL
        // becomes well-formed; "http" + "//host" yields the malformed "http//host".
        linkUrl = "http:" + linkUrl;
    }
    if (linkUrl.startsWith("http")) {
        return getPathFromUrl(linkUrl);
    }
    if (linkUrl.startsWith("../")) {
        // Count the leading "../" segments and drop that many trailing
        // directories from the page path before appending the remainder.
        String[] parts = pagePath.split("/");
        int pos = linkUrl.lastIndexOf("../") + 3;
        int parents = pos / 3;
        long diff = parts.length - parents - 1;
        String absolute = "";
        for (int i = 0; i < diff; i++) {
            String dir = parts[i];
            if (!dir.isEmpty()) {
                absolute = absolute + "/" + dir;
            }
        }
        return absolute + "/" + linkUrl.substring(pos);
    }
    String root = getDirsFromUrl(pagePath);
    return root + linkUrl;
}
@Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { Element element = HtmlFactory.getElement(localName); if (href != null) { anchorFlag = true; addToOutgoingUrls(href, localName, attributes); addToOutgoingUrls(imgSrc, localName); String src = attributes.getValue("src"); if (src != null) { addToOutgoingUrls(src, localName); if (pos != -1) { metaRefresh = content.substring(pos + 4); addToOutgoingUrls(metaRefresh, localName); addToOutgoingUrls(metaLocation, localName);
/**
 * Joins a root-relative link path onto the domain of the given page URL,
 * inserting a slash separator when the path does not already start with one.
 *
 * @param pageUrl  the page URL whose scheme+domain is used
 * @param linkPath the (possibly slash-prefixed) path to append
 * @return the absolute URL for the link
 */
private static String getAbsoluteUrlFrom(String pageUrl, String linkPath) {
    String domain = getFullDomainFromUrl(pageUrl);
    return linkPath.startsWith("/") ? domain + linkPath : domain + "/" + linkPath;
}
public void setBinaryContent(byte[] data) throws TransformerConfigurationException, TikaException, SAXException, IOException { InputStream inputStream = new ByteArrayInputStream(data); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING); AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context); // Hacking the following line to remove Tika's inserted DocType this.html = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace( "http://www.w3.org/1999/xhtml", ""); } catch (TransformerConfigurationException | TikaException | SAXException | IOException | RuntimeException e) { throw e; } }
/**
 * Classes that extends WebCrawler should overwrite this function to tell the
 * crawler whether the given url should be crawled or not. The following
 * default implementation indicates that all urls should be included in the crawl
 * except those with a nofollow flag.
 *
 * @param url
 *            the url which we are interested to know whether it should be
 *            included in the crawl or not.
 * @param referringPage
 *            The Page in which this url was found.
 * @return if the url should be included in the crawl it returns true,
 *         otherwise false is returned.
 */
public boolean shouldVisit(Page referringPage, WebURL url) {
    if (!myController.getConfig().isRespectNoFollow()) {
        return true;
    }
    // Only trust the robots meta tag when the parse data really is HTML;
    // the old unguarded cast threw ClassCastException for binary/css parse
    // data on pages whose content type merely contained "html".
    boolean pageNoFollow = false;
    if (referringPage != null
        && referringPage.getContentType() != null
        && referringPage.getContentType().contains("html")
        && referringPage.getParseData() instanceof HtmlParseData) {
        String robots =
            ((HtmlParseData) referringPage.getParseData()).getMetaTagValue("robots");
        // Null-safe: a missing robots meta tag means "follow".
        pageNoFollow = (robots != null) && robots.contains("nofollow");
    }
    String rel = url.getAttribute("rel");
    boolean linkNoFollow = (rel != null) && rel.contains("nofollow");
    return !(pageNoFollow || linkNoFollow);
}
/**
 * Convenience constructor that builds the default parser from the config.
 * NOTE(review): delegates via the single-argument {@code Parser(config)},
 * which is marked deprecated elsewhere in this file — consider the
 * TLDList-aware overload. TODO confirm intended.
 *
 * @param config          crawl configuration
 * @param pageFetcher     fetcher used to download pages
 * @param robotstxtServer robots.txt policy server
 * @throws Exception if the parser or controller cannot be initialized
 */
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer) throws Exception { this(config, pageFetcher, new Parser(config), robotstxtServer); }
/**
 * Creates a Parser with a Tika HTML parser that has no TLD list.
 *
 * @deprecated use {@code Parser(CrawlConfig, TLDList)} so extracted URLs get
 *             proper top-level-domain handling; this overload passes a null
 *             TLD list to the delegate parser.
 * @param config crawl configuration shared with the delegate parser
 * @throws IllegalAccessException if the delegate parser cannot be instantiated
 * @throws InstantiationException if the delegate parser cannot be instantiated
 */
@Deprecated public Parser(CrawlConfig config) throws IllegalAccessException, InstantiationException { this(config, new TikaHtmlParser(config, null)); }