/**
 * {@inheritDoc}
 *
 * <p>This implementation hands the markup to the {@code ArticleExtractor}
 * instance and returns whatever text that extractor produces.
 */
@Override
public String extractTextFromHtml(String html) throws Exception {
    final ArticleExtractor extractor = ArticleExtractor.getInstance();
    return extractor.getText(html);
}
/**
 * Runs the article extractor, then the paragraph-splitting filter, then the
 * minimum-clause-words filter over {@code doc}.
 *
 * <p>All three stages are always executed, in that order; none is skipped
 * because an earlier one already reported a change.
 *
 * @param doc the document to process
 * @return {@code true} if at least one stage returned {@code true}
 * @throws BoilerpipeProcessingException if any stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    final boolean articleChanged = ArticleExtractor.INSTANCE.process(doc);
    final boolean paragraphsChanged = SplitParagraphBlocksFilter.INSTANCE.process(doc);
    final boolean clausesChanged = MinClauseWordsFilter.INSTANCE.process(doc);
    return articleChanged || paragraphsChanged || clausesChanged;
}
/**
 * Downloads the page at {@code pageUrl} and returns its extracted article text.
 *
 * <p>For URLs whose protocol starts with {@code "http"} the response code is
 * probed first. If the server answers 303 and supplies a {@code Location}
 * header, the redirect target is fetched with {@code fetchPageText}, run
 * through {@code ArticleExtractor} and returned with smart quotes replaced
 * (this path does not pass through Jsoup, as in the original code).
 * Otherwise the text is extracted directly from the URL, falling back to
 * {@code Feeds.fetchPageText} when that yields nothing, and the result is
 * reduced to the body text by Jsoup.
 *
 * @param pageUrl absolute URL of the page to read
 * @return the extracted text
 * @throws Exception if the URL is malformed, the page cannot be fetched, or
 *         extraction fails
 */
public static final String extractPageBodyText(String pageUrl) throws Exception {
    URL url = new URL(pageUrl);
    URLConnection conn = url.openConnection();
    if (url.getProtocol().startsWith("http")) {
        HttpURLConnection http = (HttpURLConnection) conn;
        try {
            if (http.getResponseCode() == 303) {
                String location = http.getHeaderField("Location");
                System.out.println(">> 303 Other : " + location);
                // A 303 without a usable Location cannot be followed; fall
                // through to the direct extraction below instead of fetching null.
                if (location != null && location.length() > 0) {
                    // Location may be relative; resolve it against the request URL.
                    String target = new URL(url, location).toString();
                    return Words.replaceSmartQuotes(ArticleExtractor.INSTANCE
                            .getText(fetchPageText(target)));
                }
            }
        } finally {
            // The probe connection is only used for its status line and headers;
            // release it so the socket is not left open.
            http.disconnect();
        }
    }
    String text = Words.replaceSmartQuotes(ArticleExtractor.INSTANCE
            .getText(new URL(pageUrl)));
    if (text == null || text.length() == 0) {
        // Direct extraction produced nothing; retry on the raw page text.
        text = Words.replaceSmartQuotes(ArticleExtractor.INSTANCE
                .getText(Feeds.fetchPageText(pageUrl)));
    }
    return Jsoup.parse(text).body().text();
}
/* * public static final String escapeHtml(String input) { if(input == null ||
/**
 * Registers the four built-in text extractors, each under its rule name and
 * the property constant it is paired with. Registration order is default,
 * article, largest-content, canola.
 */
private void loadDefaultRules() {
    addTextExtractor(
            "default-extractor",
            PAGE_CONTENT_DE_PROPERTY,
            DefaultExtractor.getInstance());
    addTextExtractor(
            "article-extractor",
            PAGE_CONTENT_AE_PROPERTY,
            ArticleExtractor.getInstance());
    addTextExtractor(
            "large-content-extractor",
            PAGE_CONTENT_LCE_PROPERTY,
            LargestContentExtractor.getInstance());
    addTextExtractor(
            "canola-extractor",
            PAGE_CONTENT_CE_PROPERTY,
            CanolaExtractor.getInstance());
}
/**
 * Applies three processing stages to {@code doc} in sequence: the article
 * extractor, the split-paragraph-blocks filter and the min-clause-words filter.
 *
 * <p>Every stage runs unconditionally; the results are OR-ed together.
 *
 * @param doc the document to process
 * @return {@code true} if any of the stages returned {@code true}
 * @throws BoilerpipeProcessingException if a stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    boolean changed = ArticleExtractor.INSTANCE.process(doc);
    // |= always evaluates its right-hand side, so no stage is skipped.
    changed |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
    changed |= MinClauseWordsFilter.INSTANCE.process(doc);
    return changed;
}
try { System.out.println("Extracting the text content of the URL..."); String text = ArticleExtractor.INSTANCE.getText(new InputStreamReader(url.openStream(), "UTF-8")); if (verbose) { System.out.println("URL text content:");
/**
 * Extracts text from an HTML string, trying two extractors in turn.
 *
 * <p>The input is first passed through {@code cleanHtml}. The
 * {@code NumWordsRulesExtractor} result is returned if it is non-empty;
 * otherwise the {@code ArticleExtractor} result is returned if it is
 * non-empty.
 *
 * @param text the HTML to extract from
 * @return the first non-empty extraction result, or {@code null} if both
 *         extractors produced nothing
 * @throws BoilerpipeProcessingException if an extractor fails
 */
private String extractTextFromHtml(String text) throws BoilerpipeProcessingException {
    final String cleaned = cleanHtml(text);

    final String byWordCount = NumWordsRulesExtractor.getInstance().getText(cleaned);
    if (byWordCount != null && !byWordCount.isEmpty()) {
        return byWordCount;
    }

    final String byArticle = ArticleExtractor.getInstance().getText(cleaned);
    return (byArticle == null || byArticle.isEmpty()) ? null : byArticle;
}
/**
 * Processes {@code doc} with the article extractor followed by the
 * paragraph-splitting and minimum-clause-words filters.
 *
 * <p>Each stage is invoked exactly once and in that order, whatever the
 * earlier stages returned.
 *
 * @param doc the document to process
 * @return {@code true} if one or more stages returned {@code true}
 * @throws BoilerpipeProcessingException if any stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    final boolean extracted = ArticleExtractor.INSTANCE.process(doc);
    final boolean split = SplitParagraphBlocksFilter.INSTANCE.process(doc);
    final boolean filtered = MinClauseWordsFilter.INSTANCE.process(doc);
    return extracted || split || filtered;
}
/**
 * Chains the article extractor, the split-paragraph-blocks filter and the
 * min-clause-words filter over {@code doc}.
 *
 * <p>The stages are not short-circuited: all three are run even when an
 * earlier one has already returned {@code true}.
 *
 * @param doc the document to process
 * @return {@code true} if any stage returned {@code true}
 * @throws BoilerpipeProcessingException if a stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    boolean anyChange = false;
    anyChange |= ArticleExtractor.INSTANCE.process(doc);
    anyChange |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
    anyChange |= MinClauseWordsFilter.INSTANCE.process(doc);
    return anyChange;
}