public HtmlParseData parse(Page page, String contextURL) throws ParseException { HtmlParseData parsedData = new HtmlParseData(); HtmlContentHandler contentHandler = new HtmlContentHandler(); Metadata metadata = new Metadata(); parsedData.setContentCharset(contentCharset); parsedData.setText(contentHandler.getBodyText().trim()); parsedData.setTitle(metadata.get(DublinCore.TITLE)); parsedData.setMetaTags(contentHandler.getMetaTags());
if (href != null) { anchorFlag = true; addToOutgoingUrls(href, localName, attributes); addToOutgoingUrls(imgSrc, localName); String src = attributes.getValue("src"); if (src != null) { addToOutgoingUrls(src, localName); if (pos != -1) { metaRefresh = content.substring(pos + 4); addToOutgoingUrls(metaRefresh, localName); addToOutgoingUrls(metaLocation, localName);
Set<WebURL> outgoingUrls = new HashSet<>(); String baseURL = contentHandler.getBaseUrl(); if (baseURL != null) { contextURL = baseURL; for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
HtmlParser htmlParser = new HtmlParser(); HtmlContentHandler contentHandler = new HtmlContentHandler(); // I presume the `Page page` is present in the scope InputStream inputStream = new ByteArrayInputStream(page.getContentData()); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); // and finally parse htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
/**
 * Parses the content of a fetched page into an {@link HtmlParseData}.
 *
 * <p>The bytes returned by {@code page.getContentData()} are run through {@code htmlParser}
 * with a fresh {@code HtmlContentHandler} and {@code Metadata}. The result is then populated
 * with the charset returned by {@code chooseEncoding}, the trimmed body text, the
 * {@code DublinCore.TITLE} metadata value, the handler's meta tags, the outgoing URLs computed
 * by {@code getOutgoingUrls}, and the raw HTML decoded with {@code page.getContentCharset()}
 * (or the JVM default charset when the page declares none).
 *
 * @param page the fetched page whose content data is parsed
 * @param contextURL forwarded to {@code getOutgoingUrls} together with the content handler
 * @return the populated parse data
 * @throws ParseException if the parser fails or the page's declared charset is not supported;
 *     the underlying exception is attached as the cause
 */
public HtmlParseData parse(Page page, String contextURL) throws ParseException {
    HtmlParseData parsedData = new HtmlParseData();
    HtmlContentHandler contentHandler = new HtmlContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
        htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
    } catch (Exception e) {
        // The trailing throwable is intended to get the stack trace logged, not just the
        // message (SLF4J-style loggers treat a final Throwable argument that way).
        logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
        // Only the no-arg constructor is known to exist here, so attach the cause afterwards.
        ParseException failure = new ParseException();
        failure.initCause(e);
        throw failure;
    }

    String contentCharset = chooseEncoding(page, metadata);
    parsedData.setContentCharset(contentCharset);
    parsedData.setText(contentHandler.getBodyText().trim());
    parsedData.setTitle(metadata.get(DublinCore.TITLE));
    parsedData.setMetaTags(contentHandler.getMetaTags());

    Set<WebURL> outgoingUrls = getOutgoingUrls(contextURL, contentHandler, contentCharset);
    parsedData.setOutgoingUrls(outgoingUrls);

    try {
        if (page.getContentCharset() == null) {
            // No declared charset: decode with the JVM default charset, as before.
            parsedData.setHtml(new String(page.getContentData()));
        } else {
            parsedData.setHtml(new String(page.getContentData(), page.getContentCharset()));
        }
        return parsedData;
    } catch (UnsupportedEncodingException e) {
        logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
        ParseException failure = new ParseException();
        failure.initCause(e);
        throw failure;
    }
}
private Set<WebURL> getOutgoingUrls(String contextURL, HtmlContentHandler contentHandler, String contentCharset) { Set<WebURL> outgoingUrls = new HashSet<>(); String baseURL = contentHandler.getBaseUrl(); if (baseURL != null) { contextURL = baseURL; for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
if (href != null) { anchorFlag = true; addToOutgoingUrls(href, localName, attributes); addToOutgoingUrls(imgSrc, localName); String src = attributes.getValue("src"); if (src != null) { addToOutgoingUrls(src, localName); if (pos != -1) { metaRefresh = content.substring(pos + 4); addToOutgoingUrls(metaRefresh, localName); addToOutgoingUrls(metaLocation, localName);