/**
 * Factory for a highlighter in highlighting mode: the full HTML text is
 * returned, with the extracted text portion highlighted.
 *
 * @return a new {@link HTMLHighlighter} constructed with {@code extractHTML == false}
 */
public static HTMLHighlighter newHighlightingInstance() {
    final HTMLHighlighter highlighter = new HTMLHighlighter(false);
    return highlighter;
}
private HTMLHighlighter(final boolean extractHTML) { if (extractHTML) { setOutputHighlightOnly(true); setExtraStyleSheet(""); setPreHighlight(""); setPostHighlight(""); } }
/**
 * Returns the article of a document with its basic HTML structure.
 *
 * <p>The document is parsed into a {@link TextDocument}, run through the given
 * extractor, rendered by an extracting {@link HTMLHighlighter} and finally
 * passed, together with {@code docUri}, through {@code removeNotAllowedTags}.
 *
 * @param htmlDoc
 *            the HTML document to process
 * @param docUri
 *            the URI of the document, for resolving the relative anchors in
 *            the document to absolute anchors
 * @param extractor
 *            the extractor applied to the parsed document
 * @return the result of {@code removeNotAllowedTags}, or {@code null} if
 *         parsing, extraction or highlighting threw an exception
 */
public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) {
    final HTMLHighlighter hh = HTMLHighlighter.newExtractingInstance();
    hh.setOutputHighlightOnly(true);

    final String text;
    try {
        final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(doc);
        // A second InputSource is requested for the highlighting pass
        // (presumably the first one is consumed by parsing -- TODO confirm).
        final InputSource is = htmlDoc.toInputSource();
        text = hh.process(doc, is);
    } catch (Exception ex) {
        // Existing contract: any failure in the pipeline is reported as null.
        return null;
    }
    return removeNotAllowedTags(text, docUri);
}
/**
 * Convenience overload taking the original HTML as a {@link String}. The
 * string is wrapped in a {@link StringReader}-backed {@link InputSource} and
 * handed to the {@code process(TextDocument, InputSource)} overload.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document.
 * @return whatever the delegated {@code process} call returns
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated call
 */
public String process(final TextDocument doc, final String origHTML)
        throws BoilerpipeProcessingException {
    final StringReader htmlReader = new StringReader(origHTML);
    final InputSource htmlSource = new InputSource(htmlReader);
    return process(doc, htmlSource);
}
/**
 * String-based variant of {@code process}: adapts the original HTML text to
 * an {@link InputSource} and delegates to the {@link InputSource} overload.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document, as text.
 * @return The highlighted HTML.
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated call
 */
public String process(final TextDocument doc, final String origHTML)
        throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(new StringReader(origHTML));
    final String highlighted = process(doc, source);
    return highlighted;
}
private HTMLHighlighter(final boolean extractHTML) { if (extractHTML) { setOutputHighlightOnly(true); setExtraStyleSheet("\n<style type=\"text/css\">\n" + "A:before { content:' '; } \n" // + "A:after { content:' '; } \n" // + "SPAN:before { content:' '; } \n" // + "SPAN:after { content:' '; } \n" // + "</style>\n"); setPreHighlight(""); setPostHighlight(""); } }
/**
 * Factory for a highlighter in extraction mode: only the extracted HTML
 * text, including enclosed markup, is returned.
 *
 * @return a new {@link HTMLHighlighter} constructed with {@code extractHTML == true}
 */
public static HTMLHighlighter newExtractingInstance() {
    final HTMLHighlighter extractor = new HTMLHighlighter(true);
    return extractor;
}
/**
 * Processes a {@link TextDocument} against the original HTML given as a
 * {@link String}, by wrapping the text in an {@link InputSource} and calling
 * the {@link InputSource}-based overload.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document.
 * @return The highlighted HTML.
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated call
 */
public String process(final TextDocument doc, final String origHTML)
        throws BoilerpipeProcessingException {
    final StringReader reader = new StringReader(origHTML);
    return process(doc, new InputSource(reader));
}
private HTMLHighlighter(final boolean extractHTML) { if (extractHTML) { setOutputHighlightOnly(true); setExtraStyleSheet("\n<style type=\"text/css\">\n" + "A:before { content:' '; } \n" // + "A:after { content:' '; } \n" // + "SPAN:before { content:' '; } \n" // + "SPAN:after { content:' '; } \n" // + "</style>\n"); setPreHighlight(""); setPostHighlight(""); } }
/**
 * Builds an {@link HTMLHighlighter} set up to return the full HTML text with
 * the extracted text portion highlighted.
 *
 * @return a new instance created with {@code extractHTML == false}
 */
public static HTMLHighlighter newHighlightingInstance() {
    final boolean extractHTML = false;
    return new HTMLHighlighter(extractHTML);
}
/**
 * Overload accepting the original HTML as text. Delegates to the
 * {@link InputSource}-based {@code process} after wrapping the text in a
 * {@link StringReader}.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document.
 * @return The highlighted HTML.
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated call
 */
public String process(final TextDocument doc, final String origHTML)
        throws BoilerpipeProcessingException {
    final InputSource originalHtml = new InputSource(new StringReader(origHTML));
    return process(doc, originalHtml);
}
private HTMLHighlighter(final boolean extractHTML) { if (extractHTML) { setOutputHighlightOnly(true); setExtraStyleSheet("\n<style type=\"text/css\">\n" + "A:before { content:' '; } \n" // + "A:after { content:' '; } \n" // + "SPAN:before { content:' '; } \n" // + "SPAN:after { content:' '; } \n" // + "</style>\n"); setPreHighlight(""); setPostHighlight(""); } }
/**
 * Creates an {@link HTMLHighlighter} for highlighting mode, i.e. one that
 * returns the full HTML text with the extracted text portion highlighted.
 *
 * @return a new {@link HTMLHighlighter} constructed with {@code false}
 */
public static HTMLHighlighter newHighlightingInstance() {
    final HTMLHighlighter instance = new HTMLHighlighter(false);
    return instance;
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the
 * retrieved HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} and then delegates to the
 * {@code process(TextDocument, InputSource)} overload.
 *
 * @param url
 *            the URL to fetch
 * @param extractor
 *            the extractor applied to the parsed document
 * @return the result of the delegated {@code process} call
 * @throws IOException
 *             if an I/O error occurs
 * @throws BoilerpipeProcessingException
 *             if processing fails
 * @throws SAXException
 *             if a SAX error occurs
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second InputSource is obtained from the fetched document for this pass.
    return process(textDoc, fetched.toInputSource());
}
/**
 * Builds an {@link HTMLHighlighter} set up to return only the extracted HTML
 * text, including enclosed markup.
 *
 * @return a new instance created with {@code extractHTML == true}
 */
public static HTMLHighlighter newExtractingInstance() {
    final boolean extractHTML = true;
    return new HTMLHighlighter(extractHTML);
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * @param url
 *            the URL to fetch
 * @param extractor
 *            the extractor applied to the parsed {@link TextDocument}
 * @return The highlighted HTML.
 * @throws IOException
 *             if an I/O error occurs
 * @throws BoilerpipeProcessingException
 *             if processing fails
 * @throws SAXException
 *             if a SAX error occurs
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument page = HTMLFetcher.fetch(url);

    final InputSource parseSource = page.toInputSource();
    final TextDocument parsed = new BoilerpipeSAXInput(parseSource).getTextDocument();
    extractor.process(parsed);

    // Requested only after extraction, as a separate InputSource for the second pass.
    final InputSource highlightSource = page.toInputSource();
    return process(parsed, highlightSource);
}
/**
 * Creates an {@link HTMLHighlighter} for extraction mode, i.e. one that
 * returns only the extracted HTML text, including enclosed markup.
 *
 * @return a new {@link HTMLHighlighter} constructed with {@code true}
 */
public static HTMLHighlighter newExtractingInstance() {
    final HTMLHighlighter instance = new HTMLHighlighter(true);
    return instance;
}
/**
 * Downloads the document at the given {@link URL} via {@link HTMLFetcher},
 * runs the specified {@link BoilerpipeExtractor} over its parsed
 * {@link TextDocument} and returns the outcome of the
 * {@code process(TextDocument, InputSource)} overload.
 *
 * @param url
 *            the URL to fetch
 * @param extractor
 *            the extractor to apply
 * @return The highlighted HTML.
 * @throws IOException
 *             if an I/O error occurs
 * @throws BoilerpipeProcessingException
 *             if processing fails
 * @throws SAXException
 *             if a SAX error occurs
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument source = HTMLFetcher.fetch(url);
    final TextDocument textDocument =
            new BoilerpipeSAXInput(source.toInputSource()).getTextDocument();

    extractor.process(textDocument);

    final String highlighted = process(textDocument, source.toInputSource());
    return highlighted;
}
/**
 * Static factory for highlighting mode. The returned instance is set up to
 * return the full HTML text, with the extracted text portion highlighted.
 *
 * @return a new {@link HTMLHighlighter} constructed with {@code extractHTML == false}
 */
public static HTMLHighlighter newHighlightingInstance() {
    final HTMLHighlighter highlightingInstance = new HTMLHighlighter(false);
    return highlightingInstance;
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * @param url
 *            the URL whose HTML is fetched
 * @param extractor
 *            the extractor run on the parsed {@link TextDocument}
 * @return The highlighted HTML.
 * @throws IOException
 *             if an I/O error occurs
 * @throws BoilerpipeProcessingException
 *             if processing fails
 * @throws SAXException
 *             if a SAX error occurs
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument html = HTMLFetcher.fetch(url);

    // First pass: parse into a TextDocument and let the extractor process it.
    final BoilerpipeSAXInput input = new BoilerpipeSAXInput(html.toInputSource());
    final TextDocument extracted = input.getTextDocument();
    extractor.process(extracted);

    // Second pass: hand the document plus a newly obtained InputSource to the overload.
    final InputSource original = html.toInputSource();
    return process(extracted, original);
}