de.l3s.boilerpipe.sax.BoilerpipeSAXInput.getTextDocument Java code examples

/**
 * Builds the {@link TextDocument} for this input using a newly created
 * {@link BoilerpipeHTMLParser} as the default HTML parser.
 *
 * @return the {@link TextDocument} returned by the parser-taking
 *         {@code getTextDocument} overload
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from the parser-taking overload (not visible here)
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

/**
 * Builds the {@link TextDocument} for this input using a newly created
 * {@link BoilerpipeHTMLParser} as the default HTML parser.
 *
 * @return the {@link TextDocument} returned by the parser-taking
 *         {@code getTextDocument} overload
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from the parser-taking overload (not visible here)
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

/**
 * Builds the {@link TextDocument} for this input using a newly created
 * {@link BoilerpipeHTMLParser} as the default HTML parser.
 *
 * @return the {@link TextDocument} returned by the parser-taking
 *         {@code getTextDocument} overload
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from the parser-taking overload (not visible here)
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

/**
 * Builds the {@link TextDocument} for this input using a newly created
 * {@link BoilerpipeHTMLParser} as the default HTML parser.
 *
 * @return the {@link TextDocument} returned by the parser-taking
 *         {@code getTextDocument} overload
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from the parser-taking overload (not visible here)
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed
 * {@link InputSource}, converted through {@link BoilerpipeSAXInput}, and the
 * value returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param html  The HTML code as a String.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final TextDocument document = new BoilerpipeSAXInput(source).getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed
 * {@link InputSource}, converted through {@link BoilerpipeSAXInput}, and the
 * value returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param html  The HTML code as a String.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final TextDocument document = new BoilerpipeSAXInput(source).getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed
 * {@link InputSource}, converted through {@link BoilerpipeSAXInput}, and the
 * value returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param html  The HTML code as a String.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final TextDocument document = new BoilerpipeSAXInput(source).getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Extracts text from HTML markup supplied as a String.
 *
 * <p>The markup is wrapped in a {@link StringReader}-backed
 * {@link InputSource}, converted through {@link BoilerpipeSAXInput}, and the
 * value returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param html  The HTML code as a String.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final TextDocument document = new BoilerpipeSAXInput(source).getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Serializes the root element of {@code document} with an
 * {@link XMLOutputter}, feeds the resulting markup through
 * {@link BoilerpipeSAXInput}, and stores the text returned by the extractor
 * under the key {@code "text"}.
 *
 * @param document the document whose root element is serialized and processed
 * @return {@code true} when the extracted value was added; {@code false} when
 *         any {@link Exception} was thrown along the way
 */
@Override
public boolean processContent(Document document) {
  try {
    final String serialized = new XMLOutputter().outputString(document.getRootElement());
    final InputSource source = new InputSource(new StringReader(serialized));
    final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
    addExtractedValue("text", extractor.getText(parsed));
    return true;
  } catch (Exception e) {
    // Failure is reported solely through the return value; the exception
    // itself is intentionally not propagated.
    return false;
  }
}

/**
 * Extracts text from the HTML code available from the given {@link InputSource}.
 *
 * <p>The source is converted through {@link BoilerpipeSAXInput}, and the value
 * returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    final TextDocument document = saxInput.getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Extracts text from the HTML code available from the given {@link InputSource}.
 *
 * <p>The source is converted through {@link BoilerpipeSAXInput}, and the value
 * returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    final TextDocument document = saxInput.getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Extracts text from the HTML code available from the given {@link InputSource}.
 *
 * <p>The source is converted through {@link BoilerpipeSAXInput}, and the value
 * returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    final TextDocument document = saxInput.getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Extracts text from the HTML code available from the given {@link InputSource}.
 *
 * <p>The source is converted through {@link BoilerpipeSAXInput}, and the value
 * returned by {@code getTextDocument()} is passed on to another
 * {@code getText} overload.
 *
 * @param is The InputSource containing the HTML
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if processing fails; a
 *         {@link SAXException} raised along the way is wrapped in this type
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    final TextDocument document = saxInput.getTextDocument();
    return getText(document);
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Parses the media (picture, video) out of the given document.
 *
 * <p>The document is parsed into a {@link TextDocument}, run through the
 * extractor, and then passed together with a second {@link InputSource} of
 * the same HTML to the two-argument {@code process} overload.
 *
 * @param doc document to parse the media out of
 * @param extractor extractor to use
 * @return the list produced by the two-argument {@code process} overload
 *         (per the original contract, of size 0 if no media is found), or
 *         {@code null} if any {@link Exception} is thrown during processing
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
  final HTMLDocument htmlDoc = new HTMLDocument(doc);
  try {
    final TextDocument textDoc =
        new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
    extractor.process(textDoc);
    // A second InputSource is requested for the follow-up pass, exactly as
    // the first one was obtained for parsing.
    return process(textDoc, htmlDoc.toInputSource());
  } catch (Exception e) {
    // Any failure is reported to the caller as null.
    return null;
  }
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the
 * retrieved HTML into a {@link TextDocument}, applies the given
 * {@link BoilerpipeExtractor} to it, and delegates to the two-argument
 * {@code process} overload with a second {@link InputSource} of the same HTML.
 *
 * @param url the URL to fetch
 * @param extractor the extractor applied to the parsed {@link TextDocument}
 * @return the String returned by the two-argument {@code process} overload
 * @throws IOException declared by this method; presumably raised while fetching
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         raised while parsing or extracting
 * @throws SAXException declared by this method; presumably raised by the SAX
 *         layer
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);
  // A second InputSource is requested for the follow-up pass.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and a second {@link InputSource} of
 * the original HTML document are handed to the two-argument {@code process}
 * overload.
 *
 * @param url the URL to fetch
 * @param extractor the extractor applied to the parsed {@link TextDocument}
 * @return The highlighted HTML.
 * @throws IOException declared by this method; presumably raised while fetching
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         raised while parsing or extracting
 * @throws SAXException declared by this method; presumably raised by the SAX
 *         layer
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);
  // A second InputSource is requested for the follow-up pass.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and a second {@link InputSource} of
 * the original HTML document are handed to the two-argument {@code process}
 * overload.
 *
 * @param url the URL to fetch
 * @param extractor the extractor applied to the parsed {@link TextDocument}
 * @return The highlighted HTML.
 * @throws IOException declared by this method; presumably raised while fetching
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         raised while parsing or extracting
 * @throws SAXException declared by this method; presumably raised by the SAX
 *         layer
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);
  // A second InputSource is requested for the follow-up pass.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and a second {@link InputSource} of
 * the original HTML document are handed to the two-argument {@code process}
 * overload.
 *
 * @param url the URL to fetch
 * @param extractor the extractor applied to the parsed {@link TextDocument}
 * @return A List of enclosed {@link Image}s
 * @throws IOException declared by this method; presumably raised while fetching
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         raised while parsing or extracting
 * @throws SAXException declared by this method; presumably raised by the SAX
 *         layer
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);
  // A second InputSource is requested for the follow-up pass.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and a second {@link InputSource} of
 * the original HTML document are handed to the two-argument {@code process}
 * overload.
 *
 * @param url
 *            The URL to fetch.
 * @param extractor
 *            The extractor applied to the parsed {@link TextDocument}.
 * @return The highlighted HTML.
 * @throws IOException declared by this method; presumably raised while fetching
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         raised while parsing or extracting
 * @throws SAXException declared by this method; presumably raised by the SAX
 *         layer
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);
  // A second InputSource is requested for the follow-up pass.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and a second {@link InputSource} of
 * the original HTML document are handed to the two-argument {@code process}
 * overload.
 *
 * @param url the URL to fetch
 * @param extractor the extractor applied to the parsed {@link TextDocument}
 * @return A List of enclosed {@link Image}s
 * @throws IOException declared by this method; presumably raised while fetching
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         raised while parsing or extracting
 * @throws SAXException declared by this method; presumably raised by the SAX
 *         layer
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);
  // A second InputSource is requested for the follow-up pass.
  return process(textDoc, fetched.toInputSource());
}

Javadoc

Retrieves the TextDocument using a default HTML parser.

Popular methods of BoilerpipeSAXInput

<init>
Creates a new instance of BoilerpipeSAXInput for the given InputSource.

Popular in Java

Reading from database using SQL prepared statement
notifyDataSetChanged (ArrayAdapter)
setContentView (Activity)
getExternalFilesDir (Context)
FileInputStream (java.io)
An input stream that reads bytes from a file. File file = ...finally if (in != null) in.clos
Enumeration (java.util)
A legacy iteration interface. New code should use Iterator instead. Iterator replaces the enumeration
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
HttpServletRequest (javax.servlet.http)
Extends the javax.servlet.ServletRequest interface to provide request information for HTTP servlets.
Point (java.awt)
A point representing a location in (x,y) coordinate space, specified in integer precision.
Modifier (javassist)
The Modifier class provides static methods and constants to decode class and member access modifiers
Top Sublime Text plugins

How to use the getTextDocument method in de.l3s.boilerpipe.sax.BoilerpipeSAXInput

Best Java code snippets using de.l3s.boilerpipe.sax.BoilerpipeSAXInput.getTextDocument (Showing top 20 results out of 315)

How to use the getTextDocument method in de.l3s.boilerpipe.sax.BoilerpipeSAXInput