/**
 * {@inheritDoc}
 *
 * <p>This implementation hands the HTML to the instance returned by
 * {@code ArticleExtractor.getInstance()} and returns whatever its
 * {@code getText} call produces, without further processing.
 */
@Override
public String extractTextFromHtml(String html) throws Exception {
    final ArticleExtractor articleExtractor = ArticleExtractor.getInstance();
    return articleExtractor.getText(html);
}
/**
 * Extracts text from the given HTML, trying two extractors in a fixed order.
 *
 * <p>The input is first passed through {@code cleanHtml}. The cleaned markup is
 * then given to {@code NumWordsRulesExtractor}; if that yields a non-empty
 * string it is returned. Otherwise {@code ArticleExtractor} is tried on the
 * same cleaned markup.
 *
 * @param text the HTML to extract text from
 * @return the first non-empty extraction result, or {@code null} when both
 *         extractors return {@code null} or an empty string
 * @throws BoilerpipeProcessingException propagated from the calls made here
 */
private String extractTextFromHtml(String text) throws BoilerpipeProcessingException {
    final String cleaned = cleanHtml(text);

    // First attempt: the word-count rules extractor.
    final String byWordRules = NumWordsRulesExtractor.getInstance().getText(cleaned);
    final boolean wordRulesEmpty = byWordRules == null || byWordRules.length() == 0;
    if (!wordRulesEmpty) {
        return byWordRules;
    }

    // Fallback: the article extractor, run on the same cleaned input.
    final String byArticle = ArticleExtractor.getInstance().getText(cleaned);
    final boolean articleEmpty = byArticle == null || byArticle.length() == 0;

    // Callers receive null (not an empty string) when nothing usable was extracted.
    return articleEmpty ? null : byArticle;
}
/**
 * Registers the default set of text extractors.
 *
 * <p>Calls {@code addTextExtractor} four times, in this order: default,
 * article, largest-content and canola. Each call passes a name string, the
 * matching {@code PAGE_CONTENT_*_PROPERTY} constant and the extractor's
 * {@code getInstance()} result.
 */
private void loadDefaultRules() {
    addTextExtractor(
            "default-extractor",
            PAGE_CONTENT_DE_PROPERTY,
            DefaultExtractor.getInstance());

    addTextExtractor(
            "article-extractor",
            PAGE_CONTENT_AE_PROPERTY,
            ArticleExtractor.getInstance());

    addTextExtractor(
            "large-content-extractor",
            PAGE_CONTENT_LCE_PROPERTY,
            LargestContentExtractor.getInstance());

    addTextExtractor(
            "canola-extractor",
            PAGE_CONTENT_CE_PROPERTY,
            CanolaExtractor.getInstance());
}