/**
 * Parses the raw content of {@code page} as HTML and collects the results into an
 * {@link HtmlParseData}.
 *
 * <p>The content bytes are fed to {@code htmlParser}; the resulting data carries the charset
 * returned by {@code chooseEncoding}, the trimmed body text, the {@code DublinCore.TITLE}
 * metadata value, the meta tags, the outgoing URLs resolved against {@code contextURL}, and the
 * page content decoded to a string.
 *
 * @param page the fetched page whose content bytes are parsed
 * @param contextURL the URL passed to {@code getOutgoingUrls} as the context for outgoing links
 * @return the populated parse result
 * @throws ParseException if the HTML parser fails, or if the charset declared by the page is
 *     not supported by this JVM
 */
public HtmlParseData parse(Page page, String contextURL) throws ParseException {
    HtmlParseData parsedData = new HtmlParseData();
    HtmlContentHandler contentHandler = new HtmlContentHandler();
    Metadata metadata = new Metadata();
    byte[] contentData = page.getContentData();

    try (InputStream inputStream = new ByteArrayInputStream(contentData)) {
        htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
    } catch (Exception e) {
        // Pass the exception as the trailing argument so the stack trace is logged too;
        // e.getMessage() alone can be null and hides where the parser failed.
        logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
        throw new ParseException();
    }

    String contentCharset = chooseEncoding(page, metadata);
    parsedData.setContentCharset(contentCharset);
    parsedData.setText(contentHandler.getBodyText().trim());
    parsedData.setTitle(metadata.get(DublinCore.TITLE));
    parsedData.setMetaTags(contentHandler.getMetaTags());

    Set<WebURL> outgoingUrls = getOutgoingUrls(contextURL, contentHandler, contentCharset);
    parsedData.setOutgoingUrls(outgoingUrls);

    // NOTE(review): the raw HTML is decoded with the page's own declared charset, not with the
    // charset chosen above -- confirm this is intentional.
    String declaredCharset = page.getContentCharset();
    try {
        if (declaredCharset == null) {
            // Decode with an explicit charset: new String(byte[]) uses the platform default
            // before Java 18, which makes the result depend on the host configuration.
            parsedData.setHtml(new String(contentData, java.nio.charset.StandardCharsets.UTF_8));
        } else {
            parsedData.setHtml(new String(contentData, declaredCharset));
        }
        return parsedData;
    } catch (UnsupportedEncodingException e) {
        logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
        throw new ParseException();
    }
}