de.l3s.boilerpipe.sax.HTMLHighlighter Java code examples

/**
 * Factory for an {@link HTMLHighlighter} in highlighting mode: the full HTML
 * text is returned, with the extracted text portion <b>highlighted</b>.
 *
 * @return a new instance constructed with {@code extractHTML == false}
 */
public static HTMLHighlighter newHighlightingInstance() {
  final HTMLHighlighter highlighter = new HTMLHighlighter(false);
  return highlighter;
}

private HTMLHighlighter(final boolean extractHTML) {
  if (extractHTML) {
    setOutputHighlightOnly(true);
    setExtraStyleSheet("");
    setPreHighlight("");
    setPostHighlight("");
  }
}

/**
 * Returns the article from a document with its basic HTML structure.
 *
 * <p>The document is parsed into a {@link TextDocument}, handed to the given
 * extractor, and then run through an extracting {@link HTMLHighlighter}. The
 * highlighter output is finally passed to {@code removeNotAllowedTags}.
 *
 * @param htmlDoc
 *            the HTML document to process; its input source is requested
 *            twice (once for parsing, once for highlighting)
 * @param docUri
 *            the URI of the document, forwarded to
 *            {@code removeNotAllowedTags} for resolving the relative anchors
 *            in the document to absolute anchors
 * @param extractor
 *            the extractor that is run on the parsed {@link TextDocument}
 * @return the result of {@code removeNotAllowedTags}, or {@code null} if
 *         parsing, extraction or highlighting threw any exception
 */
public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) {
  final HTMLHighlighter hh = HTMLHighlighter.newExtractingInstance();
  hh.setOutputHighlightOnly(true);
  final String text;
  try {
    final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
    extractor.process(doc);
    // A second input source is requested for the highlighting pass.
    text = hh.process(doc, htmlDoc.toInputSource());
  } catch (Exception ex) {
    // Best-effort contract: any failure is reported to the caller as null.
    return null;
  }
  return removeNotAllowedTags(text, docUri);
}

/**
 * Processes the given {@link TextDocument} together with the original HTML
 * text, supplied as a String.
 *
 * <p>The String is wrapped in a {@link StringReader}-backed
 * {@link InputSource} and handed to the {@code InputSource}-based overload.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document.
 * @return The value returned by the {@code InputSource}-based overload.
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated overload
 */
public String process(final TextDocument doc, final String origHTML)
    throws BoilerpipeProcessingException {
  final StringReader htmlReader = new StringReader(origHTML);
  final InputSource htmlSource = new InputSource(htmlReader);
  return process(doc, htmlSource);
}

/**
 * Processes the given {@link TextDocument} together with the original HTML
 * text, supplied as a String.
 *
 * <p>The String is wrapped in a {@link StringReader}-backed
 * {@link InputSource} and handed to the {@code InputSource}-based overload.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document.
 * @return The highlighted HTML, as returned by the delegated overload.
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated overload
 */
public String process(final TextDocument doc, final String origHTML)
    throws BoilerpipeProcessingException {
  final StringReader htmlReader = new StringReader(origHTML);
  final InputSource htmlSource = new InputSource(htmlReader);
  return process(doc, htmlSource);
}

private HTMLHighlighter(final boolean extractHTML) {
  if (extractHTML) {
    setOutputHighlightOnly(true);
    setExtraStyleSheet("\n<style type=\"text/css\">\n"
        + "A:before { content:' '; } \n" //
        + "A:after { content:' '; } \n" //
        + "SPAN:before { content:' '; } \n" //
        + "SPAN:after { content:' '; } \n" //
        + "</style>\n");
    setPreHighlight("");
    setPostHighlight("");
  }
}

/**
 * Factory for an {@link HTMLHighlighter} in extracting mode: only the
 * extracted HTML text, including enclosed markup, is returned.
 *
 * @return a new instance constructed with {@code extractHTML == true}
 */
public static HTMLHighlighter newExtractingInstance() {
  final HTMLHighlighter extractingHighlighter = new HTMLHighlighter(true);
  return extractingHighlighter;
}

/**
 * Processes the given {@link TextDocument} together with the original HTML
 * text, supplied as a String.
 *
 * <p>The String is wrapped in a {@link StringReader}-backed
 * {@link InputSource} and handed to the {@code InputSource}-based overload.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document.
 * @return The highlighted HTML, as returned by the delegated overload.
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated overload
 */
public String process(final TextDocument doc, final String origHTML)
    throws BoilerpipeProcessingException {
  final StringReader htmlReader = new StringReader(origHTML);
  final InputSource htmlSource = new InputSource(htmlReader);
  return process(doc, htmlSource);
}

private HTMLHighlighter(final boolean extractHTML) {
  if (extractHTML) {
    setOutputHighlightOnly(true);
    setExtraStyleSheet("\n<style type=\"text/css\">\n"
        + "A:before { content:' '; } \n" //
        + "A:after { content:' '; } \n" //
        + "SPAN:before { content:' '; } \n" //
        + "SPAN:after { content:' '; } \n" //
        + "</style>\n");
    setPreHighlight("");
    setPostHighlight("");
  }
}

/**
 * Factory for an {@link HTMLHighlighter} in highlighting mode: the full HTML
 * text is returned, with the extracted text portion <b>highlighted</b>.
 *
 * @return a new instance constructed with {@code extractHTML == false}
 */
public static HTMLHighlighter newHighlightingInstance() {
  final HTMLHighlighter highlighter = new HTMLHighlighter(false);
  return highlighter;
}

/**
 * Processes the given {@link TextDocument} together with the original HTML
 * text, supplied as a String.
 *
 * <p>The String is wrapped in a {@link StringReader}-backed
 * {@link InputSource} and handed to the {@code InputSource}-based overload.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document.
 * @return The highlighted HTML, as returned by the delegated overload.
 * @throws BoilerpipeProcessingException
 *             propagated from the delegated overload
 */
public String process(final TextDocument doc, final String origHTML)
    throws BoilerpipeProcessingException {
  final StringReader htmlReader = new StringReader(origHTML);
  final InputSource htmlSource = new InputSource(htmlReader);
  return process(doc, htmlSource);
}

private HTMLHighlighter(final boolean extractHTML) {
  if (extractHTML) {
    setOutputHighlightOnly(true);
    setExtraStyleSheet("\n<style type=\"text/css\">\n"
        + "A:before { content:' '; } \n" //
        + "A:after { content:' '; } \n" //
        + "SPAN:before { content:' '; } \n" //
        + "SPAN:after { content:' '; } \n" //
        + "</style>\n");
    setPreHighlight("");
    setPostHighlight("");
  }
}

/**
 * Factory for an {@link HTMLHighlighter} in highlighting mode: the full HTML
 * text is returned, with the extracted text portion <b>highlighted</b>.
 *
 * @return a new instance constructed with {@code extractHTML == false}
 */
public static HTMLHighlighter newHighlightingInstance() {
  final HTMLHighlighter highlighter = new HTMLHighlighter(false);
  return highlighter;
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * @param url
 *            The URL to fetch.
 * @param extractor
 *            The extractor that is run on the parsed {@link TextDocument}.
 * @return The value returned by the {@code InputSource}-based overload.
 * @throws IOException
 *             declared by this method; not raised directly in its body
 * @throws BoilerpipeProcessingException
 *             declared by this method; not raised directly in its body
 * @throws SAXException
 *             declared by this method; not raised directly in its body
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // First pass: parse the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Second pass: a second input source is requested for highlighting.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Factory for an {@link HTMLHighlighter} in extracting mode: only the
 * extracted HTML text, including enclosed markup, is returned.
 *
 * @return a new instance constructed with {@code extractHTML == true}
 */
public static HTMLHighlighter newExtractingInstance() {
  final HTMLHighlighter extractingHighlighter = new HTMLHighlighter(true);
  return extractingHighlighter;
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * @param url
 *            The URL to fetch.
 * @param extractor
 *            The extractor that is run on the parsed {@link TextDocument}.
 * @return The highlighted HTML, as returned by the {@code InputSource}-based
 *         overload.
 * @throws IOException
 *             declared by this method; not raised directly in its body
 * @throws BoilerpipeProcessingException
 *             declared by this method; not raised directly in its body
 * @throws SAXException
 *             declared by this method; not raised directly in its body
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // First pass: parse the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Second pass: a second input source is requested for highlighting.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Factory for an {@link HTMLHighlighter} in extracting mode: only the
 * extracted HTML text, including enclosed markup, is returned.
 *
 * @return a new instance constructed with {@code extractHTML == true}
 */
public static HTMLHighlighter newExtractingInstance() {
  final HTMLHighlighter extractingHighlighter = new HTMLHighlighter(true);
  return extractingHighlighter;
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * @param url
 *            The URL to fetch.
 * @param extractor
 *            The extractor that is run on the parsed {@link TextDocument}.
 * @return The highlighted HTML, as returned by the {@code InputSource}-based
 *         overload.
 * @throws IOException
 *             declared by this method; not raised directly in its body
 * @throws BoilerpipeProcessingException
 *             declared by this method; not raised directly in its body
 * @throws SAXException
 *             declared by this method; not raised directly in its body
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // First pass: parse the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Second pass: a second input source is requested for highlighting.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Factory for an {@link HTMLHighlighter} in highlighting mode: the full HTML
 * text is returned, with the extracted text portion <b>highlighted</b>.
 *
 * @return a new instance constructed with {@code extractHTML == false}
 */
public static HTMLHighlighter newHighlightingInstance() {
  final HTMLHighlighter highlighter = new HTMLHighlighter(false);
  return highlighter;
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * @param url
 *            The URL to fetch.
 * @param extractor
 *            The extractor that is run on the parsed {@link TextDocument}.
 * @return The highlighted HTML, as returned by the {@code InputSource}-based
 *         overload.
 * @throws IOException
 *             declared by this method; not raised directly in its body
 * @throws BoilerpipeProcessingException
 *             declared by this method; not raised directly in its body
 * @throws SAXException
 *             declared by this method; not raised directly in its body
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // First pass: parse the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Second pass: a second input source is requested for highlighting.
  return process(textDoc, fetched.toInputSource());
}

Javadoc

Highlights text blocks in an HTML document that have been marked as "content" in the corresponding TextDocument.

Most used methods

<init>
process
Fetches the given URL using HTMLFetcher and processes the retrieved HTML using the specified BoilerpipeExtractor.
setExtraStyleSheet
Sets the extra stylesheet definition that will be inserted in the HEAD element. To disable, set it t
setOutputHighlightOnly
Sets whether only HTML enclosed within highlighted content will be returned, or the whole HTML docum
setPostHighlight
Sets the string that will be inserted after any highlighted HTML block. To disable, set it to the em
setPreHighlight
Sets the string that will be inserted prior to any highlighted HTML block. To disable, set it to the
newExtractingInstance
Creates a new HTMLHighlighter, which is set-up to return only the extracted HTML text, including enclosed markup.

Popular in Java

Parsing JSON documents to java classes using gson
setRequestProperty (URLConnection)
scheduleAtFixedRate (Timer)
runOnUiThread (Activity)
URL (java.net)
A Uniform Resource Locator that identifies the location of an Internet resource as specified by RFC
Deque (java.util)
A linear collection that supports element insertion and removal at both ends. The name deque is shor
Vector (java.util)
Vector is an implementation of List, backed by an array and synchronized. All optional operations in
Callable (java.util.concurrent)
A task that returns a result and may throw an exception. Implementors define a single method with no
Executor (java.util.concurrent)
An object that executes submitted Runnable tasks. This interface provides a way of decoupling task s
XPath (javax.xml.xpath)
XPath provides access to the XPath evaluation environment and expressions. Evaluation of XPath Expr
Top Vim plugins

How to use HTMLHighlighter in de.l3s.boilerpipe.sax

Best Java code snippets using de.l3s.boilerpipe.sax.HTMLHighlighter (Showing top 20 results out of 315)

How to use
HTMLHighlighter
in
de.l3s.boilerpipe.sax