/**
 * Creates a highlighter, optionally preconfigured for HTML extraction.
 *
 * <p>When {@code extractHTML} is {@code false} no configuration is applied
 * here. When it is {@code true}, highlight-only output is switched on and the
 * extra style sheet, the pre-highlight text and the post-highlight text are
 * all set to the empty string.
 *
 * @param extractHTML whether to apply the extraction configuration
 */
private HTMLHighlighter(final boolean extractHTML) {
    if (!extractHTML) {
        // Nothing to configure for a plain highlighting instance.
        return;
    }

    setOutputHighlightOnly(true);
    setExtraStyleSheet("");
    setPreHighlight("");
    setPostHighlight("");
}
private HTMLHighlighter(final boolean extractHTML) { if (extractHTML) { setOutputHighlightOnly(true); setExtraStyleSheet("\n<style type=\"text/css\">\n" + "A:before { content:' '; } \n" // + "A:after { content:' '; } \n" // + "SPAN:before { content:' '; } \n" // + "SPAN:after { content:' '; } \n" // + "</style>\n"); setPreHighlight(""); setPostHighlight(""); } }
private HTMLHighlighter(final boolean extractHTML) { if (extractHTML) { setOutputHighlightOnly(true); setExtraStyleSheet("\n<style type=\"text/css\">\n" + "A:before { content:' '; } \n" // + "A:after { content:' '; } \n" // + "SPAN:before { content:' '; } \n" // + "SPAN:after { content:' '; } \n" // + "</style>\n"); setPreHighlight(""); setPostHighlight(""); } }
private HTMLHighlighter(final boolean extractHTML) { if (extractHTML) { setOutputHighlightOnly(true); setExtraStyleSheet("\n<style type=\"text/css\">\n" + "A:before { content:' '; } \n" // + "A:after { content:' '; } \n" // + "SPAN:before { content:' '; } \n" // + "SPAN:after { content:' '; } \n" // + "</style>\n"); setPreHighlight(""); setPostHighlight(""); } }
/**
 * Extracts the article from an HTML document, keeping its basic HTML structure.
 *
 * <p>The document is parsed into a {@code TextDocument}, run through the given
 * extractor, and then rendered by an extracting {@code HTMLHighlighter} with
 * highlight-only output enabled. The rendered markup is finally passed through
 * {@code removeNotAllowedTags} together with {@code docUri}.
 *
 * @param htmlDoc   the HTML document to extract from; its input source is
 *                  requested twice (once for parsing, once for highlighting)
 * @param docUri    the URI of the document, handed to
 *                  {@code removeNotAllowedTags} (intended for resolving the
 *                  relative anchors in the document to absolute anchors)
 * @param extractor the boilerpipe extractor applied to the parsed document
 * @return the result of {@code removeNotAllowedTags} on the highlighted
 *         markup, or {@code null} if parsing, extraction or highlighting
 *         throws any exception
 */
public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) {
    final HTMLHighlighter highlighter = HTMLHighlighter.newExtractingInstance();
    highlighter.setOutputHighlightOnly(true);

    final String highlightedHtml;
    try {
        final TextDocument textDoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(textDoc);
        // A fresh input source is requested for the highlighting pass.
        highlightedHtml = highlighter.process(textDoc, htmlDoc.toInputSource());
    } catch (Exception ex) {
        // Any failure in the pipeline is reported to the caller as null.
        return null;
    }

    // Post-processing stays outside the try so its exceptions still propagate.
    return removeNotAllowedTags(highlightedHtml, docUri);
}