/**
 * Builds the {@link TextDocument} by handing a newly created
 * {@link BoilerpipeHTMLParser} to the parser-accepting overload.
 *
 * @return the {@link TextDocument} returned by the delegated call
 * @throws BoilerpipeProcessingException propagated from the delegated call
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}
/**
 * Builds the {@link TextDocument} by handing a newly created
 * {@link BoilerpipeHTMLParser} to the parser-accepting overload.
 *
 * @return the {@link TextDocument} returned by the delegated call
 * @throws BoilerpipeProcessingException propagated from the delegated call
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}
/**
 * Builds the {@link TextDocument} by handing a newly created
 * {@link BoilerpipeHTMLParser} to the parser-accepting overload.
 *
 * @return the {@link TextDocument} returned by the delegated call
 * @throws BoilerpipeProcessingException propagated from the delegated call
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}
/**
 * Builds the {@link TextDocument} by handing a newly created
 * {@link BoilerpipeHTMLParser} to the parser-accepting overload.
 *
 * @return the {@link TextDocument} returned by the delegated call
 * @throws BoilerpipeProcessingException propagated from the delegated call
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}
/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed {@link InputSource},
 * turned into a {@link TextDocument} via {@link BoilerpipeSAXInput}, and then
 * passed to the {@link TextDocument}-accepting {@code getText} overload.
 *
 * @param html The HTML code as a String.
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(new StringReader(html));
    try {
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed {@link InputSource},
 * turned into a {@link TextDocument} via {@link BoilerpipeSAXInput}, and then
 * passed to the {@link TextDocument}-accepting {@code getText} overload.
 *
 * @param html The HTML code as a String.
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(new StringReader(html));
    try {
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed {@link InputSource},
 * turned into a {@link TextDocument} via {@link BoilerpipeSAXInput}, and then
 * passed to the {@link TextDocument}-accepting {@code getText} overload.
 *
 * @param html The HTML code as a String.
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(new StringReader(html));
    try {
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed {@link InputSource},
 * turned into a {@link TextDocument} via {@link BoilerpipeSAXInput}, and then
 * passed to the {@link TextDocument}-accepting {@code getText} overload.
 *
 * @param html The HTML code as a String.
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(new StringReader(html));
    try {
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Serializes the document's root element to an XML string, parses that string
 * into a {@link TextDocument} through {@link BoilerpipeSAXInput}, runs
 * {@code extractor.getText} on it and stores the result under the key
 * {@code "text"} via {@code addExtractedValue}.
 *
 * @param document the document whose root element is serialized and processed
 * @return {@code true} when every step completed; {@code false} when any
 *         {@link Exception} was thrown along the way
 */
@Override
public boolean processContent(Document document) {
    try {
        final String xml = new XMLOutputter().outputString(document.getRootElement());
        final InputSource source = new InputSource(new StringReader(xml));
        final TextDocument textDoc = new BoilerpipeSAXInput(source).getTextDocument();
        addExtractedValue("text", extractor.getText(textDoc));
        return true;
    } catch (Exception ignored) {
        // Any failure is reported to the caller only through the boolean result.
        return false;
    }
}
/**
 * Extracts text from the HTML readable through the given {@link InputSource}.
 *
 * <p>The source is turned into a {@link TextDocument} via
 * {@link BoilerpipeSAXInput} and passed to the {@link TextDocument}-accepting
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        final TextDocument parsed = saxInput.getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Extracts text from the HTML readable through the given {@link InputSource}.
 *
 * <p>The source is turned into a {@link TextDocument} via
 * {@link BoilerpipeSAXInput} and passed to the {@link TextDocument}-accepting
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        final TextDocument parsed = saxInput.getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Extracts text from the HTML readable through the given {@link InputSource}.
 *
 * <p>The source is turned into a {@link TextDocument} via
 * {@link BoilerpipeSAXInput} and passed to the {@link TextDocument}-accepting
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        final TextDocument parsed = saxInput.getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Extracts text from the HTML readable through the given {@link InputSource}.
 *
 * <p>The source is turned into a {@link TextDocument} via
 * {@link BoilerpipeSAXInput} and passed to the {@link TextDocument}-accepting
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return The extracted text.
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the document (the SAX exception is wrapped as the cause), or
 *         if the delegated extraction throws it
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        final TextDocument parsed = saxInput.getTextDocument();
        return getText(parsed);
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Parses the media (pictures, videos) out of the given HTML document.
 *
 * <p>The HTML is parsed into a {@link TextDocument}, the extractor is applied
 * to it, and the processed document together with a second
 * {@link InputSource} for the same HTML is handed to the
 * {@code process(TextDocument, InputSource)} overload.
 *
 * <p>This method is best-effort: if any step throws, the failure is absorbed
 * and an empty list is returned, so the result can always be iterated without
 * a {@code null} check.
 *
 * @param doc       HTML document to parse the media out of
 * @param extractor extractor to apply to the parsed document
 * @return list of extracted media; an empty list when processing failed, and
 *         otherwise whatever the delegated overload returns
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
    final HTMLDocument htmlDoc = new HTMLDocument(doc);
    try {
        final TextDocument tdoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(tdoc);
        // A separate InputSource is requested for the second pass; presumably
        // the first one has been consumed by the SAX parse above.
        final InputSource is = htmlDoc.toInputSource();
        return process(tdoc, is);
    } catch (Exception e) {
        // Honour the documented contract: signal "nothing extracted" with an
        // empty list rather than null.
        return new ArrayList<Media>();
    }
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the fetched
 * HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} to it and delegates to
 * {@code process(TextDocument, InputSource)}.
 *
 * @param url       the location of the HTML to fetch
 * @param extractor the extractor applied to the parsed document
 * @return the String produced by the delegated overload
 * @throws IOException                   declared by this method; presumably raised on fetch failure
 * @throws BoilerpipeProcessingException declared by this method; presumably raised by the extractor
 * @throws SAXException                  declared by this method; presumably raised on parse failure
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second InputSource over the same fetched HTML is passed on.
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the fetched
 * HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} to it and delegates to
 * {@code process(TextDocument, InputSource)}.
 *
 * @param url       the location of the HTML to fetch
 * @param extractor the extractor applied to the parsed document
 * @return The highlighted HTML.
 * @throws IOException                   declared by this method; presumably raised on fetch failure
 * @throws BoilerpipeProcessingException declared by this method; presumably raised by the extractor
 * @throws SAXException                  declared by this method; presumably raised on parse failure
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second InputSource over the same fetched HTML is passed on.
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the fetched
 * HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} to it and delegates to
 * {@code process(TextDocument, InputSource)}.
 *
 * @param url       the location of the HTML to fetch
 * @param extractor the extractor applied to the parsed document
 * @return The highlighted HTML.
 * @throws IOException                   declared by this method; presumably raised on fetch failure
 * @throws BoilerpipeProcessingException declared by this method; presumably raised by the extractor
 * @throws SAXException                  declared by this method; presumably raised on parse failure
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second InputSource over the same fetched HTML is passed on.
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the fetched
 * HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} to it and delegates to
 * {@code process(TextDocument, InputSource)}.
 *
 * @param url       the location of the HTML to fetch
 * @param extractor the extractor applied to the parsed document
 * @return A List of enclosed {@link Image}s
 * @throws IOException                   declared by this method; presumably raised on fetch failure
 * @throws BoilerpipeProcessingException declared by this method; presumably raised by the extractor
 * @throws SAXException                  declared by this method; presumably raised on parse failure
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second InputSource over the same fetched HTML is passed on.
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the fetched
 * HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} to it and delegates to
 * {@code process(TextDocument, InputSource)}.
 *
 * @param url
 *            the location of the HTML to fetch
 * @param extractor
 *            the extractor applied to the parsed document
 * @return The highlighted HTML.
 * @throws IOException
 *             declared by this method; presumably raised on fetch failure
 * @throws BoilerpipeProcessingException
 *             declared by this method; presumably raised by the extractor
 * @throws SAXException
 *             declared by this method; presumably raised on parse failure
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second InputSource over the same fetched HTML is passed on.
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the fetched
 * HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} to it and delegates to
 * {@code process(TextDocument, InputSource)}.
 *
 * @param url       the location of the HTML to fetch
 * @param extractor the extractor applied to the parsed document
 * @return A List of enclosed {@link Image}s
 * @throws IOException                   declared by this method; presumably raised on fetch failure
 * @throws BoilerpipeProcessingException declared by this method; presumably raised by the extractor
 * @throws SAXException                  declared by this method; presumably raised on parse failure
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second InputSource over the same fetched HTML is passed on.
    return process(textDoc, fetched.toInputSource());
}