/**
 * Fetches the HTML document behind the given {@link URL} and delegates to the
 * {@code process(htmlDoc, uri, extractor)} overload, passing the URL converted
 * to a URI.
 *
 * @param extractor the {@link BoilerpipeExtractor} handed on to the delegate overload
 * @param url the URL to fetch; also converted via {@link URL#toURI()} for the delegate
 * @return whatever the delegate overload returns (per the original note: the
 *         article with its basic HTML structure)
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from the delegate overload
 * @throws SAXException propagated from the delegate overload
 * @throws URISyntaxException if the URL cannot be converted to a URI
 */
public String process(final BoilerpipeExtractor extractor, final URL url)
        throws IOException, BoilerpipeProcessingException, SAXException, URISyntaxException {
    // Fetch first, then convert the URL: same evaluation order as before.
    final HTMLDocument fetchedPage = HTMLFetcher.fetch(url);
    return process(fetchedPage, url.toURI(), extractor);
}
/**
 * Downloads the HTML referenced by the given {@link URL} and returns the text
 * extracted from it.
 * <p>
 * NOTE: intended mainly for showcase purposes. For crawling the Web, prefer
 * {@link #getText(InputSource)}.
 *
 * @param url the URL pointing to the HTML code
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails, or wrapping any
 *         {@link IOException} raised inside this method
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    try {
        final InputSource htmlSource = HTMLFetcher.fetch(url).toInputSource();
        return getText(htmlSource);
    } catch (IOException fetchFailure) {
        // Surface I/O problems through the method's single declared exception type.
        throw new BoilerpipeProcessingException(fetchFailure);
    }
}
/**
 * Downloads the HTML referenced by the given {@link URL} and returns the text
 * extracted from it.
 * <p>
 * NOTE: intended mainly for showcase purposes. For crawling the Web, prefer
 * {@link #getText(InputSource)}.
 *
 * @param url the URL pointing to the HTML code
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails, or wrapping any
 *         {@link IOException} raised inside this method
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    try {
        final InputSource htmlSource = HTMLFetcher.fetch(url).toInputSource();
        return getText(htmlSource);
    } catch (IOException fetchFailure) {
        // Surface I/O problems through the method's single declared exception type.
        throw new BoilerpipeProcessingException(fetchFailure);
    }
}
/**
 * Downloads the HTML referenced by the given {@link URL} and returns the text
 * extracted from it.
 * <p>
 * NOTE: intended mainly for showcase purposes. For crawling the Web, prefer
 * {@link #getText(InputSource)}.
 *
 * @param url the URL pointing to the HTML code
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails, or wrapping any
 *         {@link IOException} raised inside this method
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    try {
        final InputSource htmlSource = HTMLFetcher.fetch(url).toInputSource();
        return getText(htmlSource);
    } catch (IOException fetchFailure) {
        // Surface I/O problems through the method's single declared exception type.
        throw new BoilerpipeProcessingException(fetchFailure);
    }
}
/**
 * Downloads the HTML referenced by the given {@link URL} and returns the text
 * extracted from it.
 * <p>
 * NOTE: intended mainly for showcase purposes. For crawling the Web, prefer
 * {@link #getText(InputSource)}.
 *
 * @param url the URL pointing to the HTML code
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails, or wrapping any
 *         {@link IOException} raised inside this method
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
    try {
        final InputSource htmlSource = HTMLFetcher.fetch(url).toInputSource();
        return getText(htmlSource);
    } catch (IOException fetchFailure) {
        // Surface I/O problems through the method's single declared exception type.
        throw new BoilerpipeProcessingException(fetchFailure);
    }
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, runs the supplied
 * {@link BoilerpipeExtractor} over the parsed {@link TextDocument}, and passes
 * the processed document plus a second {@link InputSource} of the same HTML to
 * the {@code process(doc, is)} overload.
 *
 * @param url the URL of the document to fetch
 * @param extractor the extractor applied to the parsed document
 * @return the String produced by the {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // Pass 1: build the text document and let the extractor work on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Pass 2: a new InputSource is requested rather than reusing the first one
    // (presumably because the first was consumed during parsing -- confirm).
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url the URL of the document to fetch
 * @param extractor the extractor applied to the parsed document
 * @return the highlighted HTML, as returned by the {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // Pass 1: build the text document and let the extractor work on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Pass 2: a new InputSource is requested rather than reusing the first one
    // (presumably because the first was consumed during parsing -- confirm).
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url the URL of the document to fetch
 * @param extractor the extractor applied to the parsed document
 * @return the highlighted HTML, as returned by the {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // Pass 1: build the text document and let the extractor work on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Pass 2: a new InputSource is requested rather than reusing the first one
    // (presumably because the first was consumed during parsing -- confirm).
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url the URL of the document to fetch
 * @param extractor the extractor applied to the parsed document
 * @return the list of enclosed {@link Image}s, as returned by the
 *         {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument page = HTMLFetcher.fetch(url);

    // Pass 1: parse the page and run the extractor on the resulting document.
    final BoilerpipeSAXInput parserInput = new BoilerpipeSAXInput(page.toInputSource());
    final TextDocument processedDoc = parserInput.getTextDocument();
    extractor.process(processedDoc);

    // Pass 2: request another InputSource for the original HTML instead of
    // reusing the first (presumably already consumed -- confirm).
    return process(processedDoc, page.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url the URL of the document to fetch
 * @param extractor the extractor applied to the parsed document
 * @return the list of enclosed {@link Image}s, as returned by the
 *         {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument page = HTMLFetcher.fetch(url);

    // Pass 1: parse the page and run the extractor on the resulting document.
    final BoilerpipeSAXInput parserInput = new BoilerpipeSAXInput(page.toInputSource());
    final TextDocument processedDoc = parserInput.getTextDocument();
    extractor.process(processedDoc);

    // Pass 2: request another InputSource for the original HTML instead of
    // reusing the first (presumably already consumed -- confirm).
    return process(processedDoc, page.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 *
 * @return the list of {@link Media} returned by the {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument source = HTMLFetcher.fetch(url);

    // Pass 1: turn the fetched HTML into a TextDocument and apply the extractor.
    final BoilerpipeSAXInput saxReader = new BoilerpipeSAXInput(source.toInputSource());
    final TextDocument extracted = saxReader.getTextDocument();
    extractor.process(extracted);

    // Pass 2: obtain a second InputSource for the original HTML
    // (presumably the first one cannot be re-read -- confirm).
    return process(extracted, source.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return The highlighted HTML, as returned by the {@code process(doc, is)} overload.
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // Pass 1: build the text document and let the extractor work on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Pass 2: a new InputSource is requested rather than reusing the first one
    // (presumably because the first was consumed during parsing -- confirm).
    return process(textDoc, fetched.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url
 *            The URL of the document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return The list of enclosed {@link Image}s, as returned by the
 *         {@code process(doc, is)} overload.
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument page = HTMLFetcher.fetch(url);

    // Pass 1: parse the page and run the extractor on the resulting document.
    final BoilerpipeSAXInput parserInput = new BoilerpipeSAXInput(page.toInputSource());
    final TextDocument processedDoc = parserInput.getTextDocument();
    extractor.process(processedDoc);

    // Pass 2: request another InputSource for the original HTML instead of
    // reusing the first (presumably already consumed -- confirm).
    return process(processedDoc, page.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 *
 * @return the list of {@link Media} returned by the {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument source = HTMLFetcher.fetch(url);

    // Pass 1: turn the fetched HTML into a TextDocument and apply the extractor.
    final BoilerpipeSAXInput saxReader = new BoilerpipeSAXInput(source.toInputSource());
    final TextDocument extracted = saxReader.getTextDocument();
    extractor.process(extracted);

    // Pass 2: obtain a second InputSource for the original HTML
    // (presumably the first one cannot be re-read -- confirm).
    return process(extracted, source.toInputSource());
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}. The processed
 * {@link TextDocument} and a second {@link InputSource} of the original HTML
 * are then passed to the {@code process(doc, is)} overload.
 *
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 *
 * @return the list of {@link Media} returned by the {@code process(doc, is)} overload
 * @throws IOException declared by this method; propagated from fetching/processing
 * @throws BoilerpipeProcessingException propagated from parsing or extraction
 * @throws SAXException propagated from the underlying processing
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument source = HTMLFetcher.fetch(url);

    // Pass 1: turn the fetched HTML into a TextDocument and apply the extractor.
    final BoilerpipeSAXInput saxReader = new BoilerpipeSAXInput(source.toInputSource());
    final TextDocument extracted = saxReader.getTextDocument();
    extractor.process(extracted);

    // Pass 2: obtain a second InputSource for the original HTML
    // (presumably the first one cannot be re-read -- confirm).
    return process(extracted, source.toInputSource());
}