/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and the work is delegated to
 * the {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and the work is delegated to
 * the {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and the work is delegated to
 * the {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and the work is delegated to
 * the {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Returns the text extracted from HTML markup supplied as a {@link String}.
 *
 * <p>The string is exposed through a {@link StringReader}, parsed with
 * {@code BoilerpipeSAXInput}, and the resulting text document is handed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    try {
        final StringReader reader = new StringReader(html);
        final InputSource source = new InputSource(reader);
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Returns the text extracted from HTML markup supplied as a {@link String}.
 *
 * <p>The string is exposed through a {@link StringReader}, parsed with
 * {@code BoilerpipeSAXInput}, and the resulting text document is handed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    try {
        final StringReader reader = new StringReader(html);
        final InputSource source = new InputSource(reader);
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Returns the text extracted from HTML markup supplied as a {@link String}.
 *
 * <p>The string is exposed through a {@link StringReader}, parsed with
 * {@code BoilerpipeSAXInput}, and the resulting text document is handed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    try {
        final StringReader reader = new StringReader(html);
        final InputSource source = new InputSource(reader);
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Returns the text extracted from HTML markup supplied as a {@link String}.
 *
 * <p>The string is exposed through a {@link StringReader}, parsed with
 * {@code BoilerpipeSAXInput}, and the resulting text document is handed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final String html) throws BoilerpipeProcessingException {
    try {
        final StringReader reader = new StringReader(html);
        final InputSource source = new InputSource(reader);
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Extracts text from the given document and records it as an extracted value.
 *
 * <p>The document's root element is serialized with an {@code XMLOutputter}, the
 * resulting markup is parsed through {@code BoilerpipeSAXInput}, and the text
 * produced by {@code extractor} is stored under the key {@code "text"}.
 *
 * @param document the document whose root element is serialized and processed
 * @return {@code true} if the text was extracted and stored; {@code false} if any
 *         exception was thrown along the way
 */
@Override
public boolean processContent(Document document) {
    try {
        final String serialized = new XMLOutputter().outputString(document.getRootElement());
        final InputSource source = new InputSource(new StringReader(serialized));
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
        addExtractedValue("text", extractor.getText(parsed));
        return true;
    } catch (Exception e) {
        // Every failure (serialization, parsing, extraction) is reported to the
        // caller solely through the boolean result; the cause is not propagated.
        return false;
    }
}
/**
 * Returns the text extracted from the HTML provided by the given {@link InputSource}.
 *
 * <p>The source is parsed with {@code BoilerpipeSAXInput} and the resulting text
 * document is handed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Returns the text extracted from the HTML provided by the given {@link InputSource}.
 *
 * <p>The source is parsed with {@code BoilerpipeSAXInput} and the resulting text
 * document is handed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Returns the text extracted from the HTML provided by the given {@link InputSource}.
 *
 * <p>The source is parsed with {@code BoilerpipeSAXInput} and the resulting text
 * document is handed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this overload is mainly intended for demonstration purposes. When
 * crawling the Web, consider using {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if an {@link IOException} is raised while
 *         fetching the page (the I/O exception is wrapped as the cause), or if
 *         the delegated extraction fails
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    final String extracted;
    try {
        extracted = getText(HTMLFetcher.fetch(url).toInputSource());
    } catch (IOException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
    return extracted;
}
/**
 * Returns the text extracted from the HTML provided by the given {@link InputSource}.
 *
 * <p>The source is parsed with {@code BoilerpipeSAXInput} and the resulting text
 * document is handed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if a {@link SAXException} is raised while
 *         building the text document (the SAX exception is wrapped as the cause)
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
}
/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this overload is mainly intended for demonstration purposes. When
 * crawling the Web, consider using {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if an {@link IOException} is raised while
 *         fetching the page (the I/O exception is wrapped as the cause), or if
 *         the delegated extraction fails
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    final String extracted;
    try {
        extracted = getText(HTMLFetcher.fetch(url).toInputSource());
    } catch (IOException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
    return extracted;
}
/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this overload is mainly intended for demonstration purposes. When
 * crawling the Web, consider using {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if an {@link IOException} is raised while
 *         fetching the page (the I/O exception is wrapped as the cause), or if
 *         the delegated extraction fails
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    final String extracted;
    try {
        extracted = getText(HTMLFetcher.fetch(url).toInputSource());
    } catch (IOException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
    return extracted;
}
/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this overload is mainly intended for demonstration purposes. When
 * crawling the Web, consider using {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if an {@link IOException} is raised while
 *         fetching the page (the I/O exception is wrapped as the cause), or if
 *         the delegated extraction fails
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    final String extracted;
    try {
        extracted = getText(HTMLFetcher.fetch(url).toInputSource());
    } catch (IOException cause) {
        throw new BoilerpipeProcessingException(cause);
    }
    return extracted;
}