de.l3s.boilerpipe.extractors.ExtractorBase.getText java code examples

/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and passed on to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated {@code getText(InputSource)} call fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(r);
  return getText(source);
}

/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and passed on to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated {@code getText(InputSource)} call fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(r);
  return getText(source);
}

/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and passed on to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated {@code getText(InputSource)} call fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(r);
  return getText(source);
}

/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and passed on to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated {@code getText(InputSource)} call fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(r);
  return getText(source);
}

/**
 * Returns the text extracted from HTML markup held in a String.
 *
 * <p>The markup is read through a {@link StringReader}, converted to a text
 * document by {@link BoilerpipeSAXInput}, and that document is passed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Returns the text extracted from HTML markup held in a String.
 *
 * <p>The markup is read through a {@link StringReader}, converted to a text
 * document by {@link BoilerpipeSAXInput}, and that document is passed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Returns the text extracted from HTML markup held in a String.
 *
 * <p>The markup is read through a {@link StringReader}, converted to a text
 * document by {@link BoilerpipeSAXInput}, and that document is passed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Returns the text extracted from HTML markup held in a String.
 *
 * <p>The markup is read through a {@link StringReader}, converted to a text
 * document by {@link BoilerpipeSAXInput}, and that document is passed to the
 * text-document overload of {@code getText}.
 *
 * @param html the HTML markup
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final String html)
    throws BoilerpipeProcessingException {
  final InputSource source = new InputSource(new StringReader(html));
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(source);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Serializes the document's root element to XML, runs the configured
 * extractor over the resulting text document, and stores the outcome under
 * the key {@code "text"}.
 *
 * @param document the document whose root element is processed
 * @return {@code true} when the extracted text was stored; {@code false}
 *         when any exception occurred along the way
 */
@Override
public boolean processContent(Document document) {
  try {
    final String serialized = new XMLOutputter().outputString(document.getRootElement());
    final InputSource source = new InputSource(new StringReader(serialized));
    final String extracted = extractor.getText(new BoilerpipeSAXInput(source).getTextDocument());
    addExtractedValue("text", extracted);
    return true;
  } catch (Exception ignored) {
    // Every failure is reported to the caller solely through the false
    // return value; the exception itself is discarded here.
    return false;
  }
}

/**
 * Returns the text extracted from the HTML supplied by an {@link InputSource}.
 *
 * <p>The source is converted to a text document by {@link BoilerpipeSAXInput},
 * and that document is passed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Returns the text extracted from the HTML supplied by an {@link InputSource}.
 *
 * <p>The source is converted to a text document by {@link BoilerpipeSAXInput},
 * and that document is passed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Returns the text extracted from the HTML supplied by an {@link InputSource}.
 *
 * <p>The source is converted to a text document by {@link BoilerpipeSAXInput},
 * and that document is passed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this method is meant mainly for showcase purposes. When crawling
 * the Web, consider {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link IOException} raised
 *         while fetching or during the delegated call
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  final String text;
  try {
    text = getText(HTMLFetcher.fetch(url).toInputSource());
  } catch (IOException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
  return text;
}

/**
 * Returns the text extracted from the HTML supplied by an {@link InputSource}.
 *
 * <p>The source is converted to a text document by {@link BoilerpipeSAXInput},
 * and that document is passed to the text-document overload of {@code getText}.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link SAXException} raised
 *         during the conversion or the delegated call
 */
public String getText(final InputSource is)
    throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    return getText(saxInput.getTextDocument());
  } catch (SAXException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this method is meant mainly for showcase purposes. When crawling
 * the Web, consider {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link IOException} raised
 *         while fetching or during the delegated call
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  final String text;
  try {
    text = getText(HTMLFetcher.fetch(url).toInputSource());
  } catch (IOException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
  return text;
}

/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this method is meant mainly for showcase purposes. When crawling
 * the Web, consider {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link IOException} raised
 *         while fetching or during the delegated call
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  final String text;
  try {
    text = getText(HTMLFetcher.fetch(url).toInputSource());
  } catch (IOException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
  return text;
}

/**
 * Fetches the HTML behind the given {@link URL} and returns the text extracted from it.
 *
 * <p>NOTE: this method is meant mainly for showcase purposes. When crawling
 * the Web, consider {@link #getText(InputSource)} instead.
 *
 * @param url the URL pointing to the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException wrapping any {@link IOException} raised
 *         while fetching or during the delegated call
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  final String text;
  try {
    text = getText(HTMLFetcher.fetch(url).toInputSource());
  } catch (IOException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
  return text;
}

Javadoc

Extracts text from the given TextDocument object.

Popular methods of ExtractorBase

process

Popular in Java

Running tasks concurrently on multiple threads
setRequestProperty (URLConnection)
setContentView (Activity)
onCreateOptionsMenu (Activity)
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal. A value is represented by an arbitrary-precision "un…
Socket (java.net)
Provides a client-side TCP socket.
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
DateTimeFormat (org.joda.time.format)
Factory that creates instances of DateTimeFormatter from patterns and styles. Datetime formatting i
GridBagLayout (java.awt)
The GridBagLayout class is a flexible layout manager that aligns components vertically and horizonta
JPanel (javax.swing)
Top plugins for WebStorm

How to use the getText method in de.l3s.boilerpipe.extractors.ExtractorBase

Best Java code snippets using de.l3s.boilerpipe.extractors.ExtractorBase.getText (Showing top 17 results out of 315)

How to use
getText
method
in
de.l3s.boilerpipe.extractors.ExtractorBase