de.l3s.boilerpipe.sax.HTMLFetcher Java code examples

/**
 * Returns the article found at the given URL together with its basic HTML
 * structure.
 * <p>
 * The document is fetched with {@link HTMLFetcher}; the fetched document, the
 * URL converted to a URI, and the extractor are then handed to the
 * three-argument {@code process} overload.
 *
 * @param extractor the extractor passed on to the delegated {@code process} call.
 * @param url the location of the document to fetch.
 * @return the result of the delegated {@code process} call.
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 * @throws URISyntaxException if the URL cannot be converted to a URI.
 */
public String process(final BoilerpipeExtractor extractor, final URL url)
    throws IOException, BoilerpipeProcessingException, SAXException, URISyntaxException {
  // Fetch first, convert second: keeps the order in which failures can surface.
  final HTMLDocument fetchedDoc = HTMLFetcher.fetch(url);
  final java.net.URI location = url.toURI();
  return process(fetchedDoc, location, extractor);
}

/**
 * Fetches the HTML behind the given {@link URL} with {@link HTMLFetcher} and
 * returns the text that {@link #getText(InputSource)} extracts from it.
 * <p>
 * NOTE: meant mainly for showcase purposes. When crawling the Web, consider
 * calling {@link #getText(InputSource)} instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException  also used to wrap any
 *           {@link IOException} raised inside this method.
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  try {
    return getText(
        HTMLFetcher.fetch(url)
            .toInputSource());
  } catch (IOException fetchFailure) {
    // The signature exposes only BoilerpipeProcessingException, so wrap the I/O error.
    throw new BoilerpipeProcessingException(fetchFailure);
  }
}

/**
 * Downloads the HTML at the given {@link URL} through {@link HTMLFetcher} and
 * passes it to {@link #getText(InputSource)} for text extraction.
 * <p>
 * NOTE: this convenience method is mainly for showcase purposes. For Web
 * crawling, consider using {@link #getText(InputSource)} instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException  also thrown as a wrapper around an
 *           {@link IOException} raised inside this method.
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  try {
    return getText(
        HTMLFetcher.fetch(url)
            .toInputSource());
  } catch (IOException ioFailure) {
    // Translate the I/O error into the single exception type this method declares.
    throw new BoilerpipeProcessingException(ioFailure);
  }
}

/**
 * Extracts text from the HTML reachable at the given {@link URL}: the page is
 * fetched with {@link HTMLFetcher} and handed to
 * {@link #getText(InputSource)}.
 * <p>
 * NOTE: mainly intended for showcase purposes. If you are going to crawl the
 * Web, consider using {@link #getText(InputSource)} instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException  including when an
 *           {@link IOException} raised inside this method is wrapped.
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  try {
    return getText(
        HTMLFetcher.fetch(url)
            .toInputSource());
  } catch (IOException cause) {
    // Re-throw as the declared exception type, keeping the I/O error attached.
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Convenience overload that fetches the given {@link URL} with
 * {@link HTMLFetcher} and returns the text extracted by
 * {@link #getText(InputSource)}.
 * <p>
 * NOTE: mainly for showcase purposes. When crawling the Web, consider using
 * {@link #getText(InputSource)} instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException  also wraps any {@link IOException}
 *           raised inside this method.
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  try {
    return getText(
        HTMLFetcher.fetch(url)
            .toInputSource());
  } catch (IOException wrapped) {
    // Only BoilerpipeProcessingException is declared, so the I/O error is wrapped.
    throw new BoilerpipeProcessingException(wrapped);
  }
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, runs the supplied
 * {@link BoilerpipeExtractor} on the resulting {@link TextDocument}, and
 * delegates to the {@code process(TextDocument, InputSource)} overload.
 *
 * @param url the URL of the document to fetch.
 * @param extractor the extractor applied to the parsed document.
 * @return the result of the delegated {@code process} call.
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // Build the TextDocument from the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Ask the fetched document for a second InputSource to pass to the delegate.
  final InputSource originalHtml = fetched.toInputSource();
  return process(textDoc, originalHtml);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 * <p>
 * The processed {@link TextDocument} and a second {@link InputSource} over
 * the original HTML document are passed to the
 * {@code process(TextDocument, InputSource)} overload.
 *
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return The highlighted HTML.
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // Build the TextDocument from the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Ask the fetched document for a second InputSource to pass to the delegate.
  final InputSource originalHtml = fetched.toInputSource();
  return process(textDoc, originalHtml);
}

/**
 * Retrieves the HTML at the given {@link URL} with {@link HTMLFetcher},
 * applies the specified {@link BoilerpipeExtractor} to it, and delegates to
 * the {@code process(TextDocument, InputSource)} overload with the processed
 * {@link TextDocument} and the original HTML document.
 *
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return The highlighted HTML.
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument page = HTMLFetcher.fetch(url);

  // Step 1: turn the fetched HTML into a TextDocument and run the extractor on it.
  final BoilerpipeSAXInput parserInput = new BoilerpipeSAXInput(page.toInputSource());
  final TextDocument processedDoc = parserInput.getTextDocument();
  extractor.process(processedDoc);

  // Step 2: obtain another InputSource from the same page for the delegate call.
  final InputSource htmlSource = page.toInputSource();
  return process(processedDoc, htmlSource);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 * <p>
 * The processed {@link TextDocument} and a second {@link InputSource} over
 * the original HTML document are passed to the
 * {@code process(TextDocument, InputSource)} overload.
 *
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return A List of enclosed {@link Image}s
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // Build the TextDocument from the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Ask the fetched document for a second InputSource to pass to the delegate.
  final InputSource originalHtml = fetched.toInputSource();
  return process(textDoc, originalHtml);
}

/**
 * Retrieves the HTML at the given {@link URL} with {@link HTMLFetcher},
 * applies the specified {@link BoilerpipeExtractor} to it, and delegates to
 * the {@code process(TextDocument, InputSource)} overload with the processed
 * {@link TextDocument} and the original HTML document.
 *
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return A List of enclosed {@link Image}s
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument page = HTMLFetcher.fetch(url);

  // Step 1: turn the fetched HTML into a TextDocument and run the extractor on it.
  final BoilerpipeSAXInput parserInput = new BoilerpipeSAXInput(page.toInputSource());
  final TextDocument processedDoc = parserInput.getTextDocument();
  extractor.process(processedDoc);

  // Step 2: obtain another InputSource from the same page for the delegate call.
  final InputSource htmlSource = page.toInputSource();
  return process(processedDoc, htmlSource);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, processes the retrieved HTML using the specified
 * {@link BoilerpipeExtractor}, and delegates to the {@code process(TextDocument, InputSource)} overload.
 * 
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 * 
 * @return the List of {@link Media} returned by the delegated {@code process} call
 * @throws IOException declared by this method; see {@link HTMLFetcher}
 * @throws BoilerpipeProcessingException declared by this method
 * @throws SAXException declared by this method
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor) throws IOException,
    BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // Build the TextDocument from the fetched HTML and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Ask the fetched document for a second InputSource to pass to the delegate.
  final InputSource originalHtml = fetched.toInputSource();
  return process(textDoc, originalHtml);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 * <p>
 * The processed {@link TextDocument} and a second {@link InputSource} over
 * the original HTML document are passed to the
 * {@code process(TextDocument, InputSource)} overload.
 * 
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return The highlighted HTML.
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument source = HTMLFetcher.fetch(url);

  // Parse pass: produce the TextDocument and hand it to the extractor.
  final BoilerpipeSAXInput input = new BoilerpipeSAXInput(source.toInputSource());
  final TextDocument extracted = input.getTextDocument();
  extractor.process(extracted);

  // Delegate pass: a separate InputSource from the same fetched document.
  final InputSource rawHtml = source.toInputSource();
  return process(extracted, rawHtml);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 * <p>
 * The processed {@link TextDocument} and a second {@link InputSource} over
 * the original HTML document are passed to the
 * {@code process(TextDocument, InputSource)} overload.
 * 
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return A List of enclosed {@link Image}s
 * @throws IOException declared by this method; see {@link HTMLFetcher}.
 * @throws BoilerpipeProcessingException declared by this method.
 * @throws SAXException declared by this method.
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument source = HTMLFetcher.fetch(url);

  // Parse pass: produce the TextDocument and hand it to the extractor.
  final BoilerpipeSAXInput input = new BoilerpipeSAXInput(source.toInputSource());
  final TextDocument extracted = input.getTextDocument();
  extractor.process(extracted);

  // Delegate pass: a separate InputSource from the same fetched document.
  final InputSource rawHtml = source.toInputSource();
  return process(extracted, rawHtml);
}

/**
 * Retrieves the HTML at the given {@link URL} with {@link HTMLFetcher}, applies the specified
 * {@link BoilerpipeExtractor} to it, and delegates to the {@code process(TextDocument, InputSource)} overload.
 * 
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 * 
 * @return the List of {@link Media} returned by the delegated {@code process} call
 * @throws IOException declared by this method; see {@link HTMLFetcher}
 * @throws BoilerpipeProcessingException declared by this method
 * @throws SAXException declared by this method
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor) throws IOException,
    BoilerpipeProcessingException, SAXException {
  final HTMLDocument page = HTMLFetcher.fetch(url);

  // Step 1: turn the fetched HTML into a TextDocument and run the extractor on it.
  final BoilerpipeSAXInput parserInput = new BoilerpipeSAXInput(page.toInputSource());
  final TextDocument processedDoc = parserInput.getTextDocument();
  extractor.process(processedDoc);

  // Step 2: obtain another InputSource from the same page for the delegate call.
  final InputSource htmlSource = page.toInputSource();
  return process(processedDoc, htmlSource);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}, and
 * delegates to the {@code process(TextDocument, InputSource)} overload.
 *
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 *
 * @return the List of {@link Media} returned by the delegated {@code process} call
 * @throws IOException declared by this method; see {@link HTMLFetcher}
 * @throws BoilerpipeProcessingException declared by this method
 * @throws SAXException declared by this method
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument source = HTMLFetcher.fetch(url);

    // Parse pass: produce the TextDocument and hand it to the extractor.
    final BoilerpipeSAXInput input = new BoilerpipeSAXInput(source.toInputSource());
    final TextDocument extracted = input.getTextDocument();
    extractor.process(extracted);

    // Delegate pass: a separate InputSource from the same fetched document.
    final InputSource rawHtml = source.toInputSource();
    return process(extracted, rawHtml);
}

Javadoc

A very simple HTTP/HTML fetcher, really just for demo purposes.

Most used methods

fetch
Fetches the document at the given URL, using URLConnection.

Popular in Java

Creating JSON documents from java classes using gson
setScale (BigDecimal)
setContentView (Activity)
notifyDataSetChanged (ArrayAdapter)
OutputStream (java.io)
A writable sink for bytes. Most clients will use output streams that write data to the file system (
SocketTimeoutException (java.net)
This exception is thrown when a timeout expired on a socket read or accept operation.
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
Window (java.awt)
A Window object is a top-level window with no borders and no menubar. The default layout for a windo
BoxLayout (javax.swing)
JComboBox (javax.swing)
From CI to AI: The AI layer in your organization

How to use HTMLFetcher in de.l3s.boilerpipe.sax

Best Java code snippets using de.l3s.boilerpipe.sax.HTMLFetcher (Showing top 15 results out of 315)

How to use
HTMLFetcher
in
de.l3s.boilerpipe.sax