/**
 * {@inheritDoc}
 */
@Override
public String extractTextFromHtml(String html) throws Exception {
    // Delegate to the extractor obtained from ArticleExtractor.getInstance().
    final String extracted = ArticleExtractor.getInstance().getText(html);
    return extracted;
}
/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and handed to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Extracts text from HTML, trying two extractors in turn.
 *
 * <p>The input is first passed through {@code cleanHtml}. The cleaned markup is given to
 * {@code NumWordsRulesExtractor}; if that yields a non-empty string it is returned.
 * Otherwise {@code ArticleExtractor} is tried on the same cleaned markup.
 *
 * @param text the HTML to extract from
 * @return the first non-empty extraction result, or {@code null} when both extractors
 *         return {@code null} or an empty string
 * @throws BoilerpipeProcessingException if an extractor fails
 */
private String extractTextFromHtml(String text) throws BoilerpipeProcessingException {
    final String cleaned = cleanHtml(text);

    // First attempt: word-count based rules.
    final String byWordRules = NumWordsRulesExtractor.getInstance().getText(cleaned);
    if (byWordRules != null && !byWordRules.isEmpty()) {
        return byWordRules;
    }

    // Fallback: article extractor; an empty result is reported as null.
    final String byArticle = ArticleExtractor.getInstance().getText(cleaned);
    return (byArticle == null || byArticle.isEmpty()) ? null : byArticle;
}
/**
 * Adds the four built-in text extractors through {@code addTextExtractor}, in a fixed order:
 * default, article, largest-content and canola. Each call passes an extractor name, the
 * matching {@code PAGE_CONTENT_*_PROPERTY} constant and the extractor instance.
 */
private void loadDefaultRules() {
    addTextExtractor(
            "default-extractor",
            PAGE_CONTENT_DE_PROPERTY,
            DefaultExtractor.getInstance());
    addTextExtractor(
            "article-extractor",
            PAGE_CONTENT_AE_PROPERTY,
            ArticleExtractor.getInstance());
    addTextExtractor(
            "large-content-extractor",
            PAGE_CONTENT_LCE_PROPERTY,
            LargestContentExtractor.getInstance());
    addTextExtractor(
            "canola-extractor",
            PAGE_CONTENT_CE_PROPERTY,
            CanolaExtractor.getInstance());
}
/**
 * Runs {@code process} on the given {@link TextDocument} and returns the document's
 * content afterwards.
 *
 * @param doc the document to process
 * @return the value of {@code doc.getContent()} after processing
 * @throws BoilerpipeProcessingException if processing the document fails
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    // The boolean result of process(...) is intentionally not used here.
    process(doc);
    final String content = doc.getContent();
    return content;
}
}
/**
 * Applies three processing stages to the document, in this order:
 * {@code ArticleExtractor}, {@code SplitParagraphBlocksFilter}, {@code MinClauseWordsFilter}.
 *
 * @param doc the document to process
 * @return {@code true} if at least one of the stages returned {@code true}
 * @throws BoilerpipeProcessingException if any stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // Non-short-circuit OR: every stage must run, whatever the earlier stages returned.
    boolean result = ArticleExtractor.INSTANCE.process(doc);
    result |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
    result |= MinClauseWordsFilter.INSTANCE.process(doc);
    return result;
}
/**
 * Extracts text from the given raw input using {@code DefaultExtractor}.
 *
 * @param rawText the raw input to extract text from
 * @return the extracted text, or {@code null} when {@code StringUtils.isEmpty(rawText)}
 *         holds or when extraction throws a {@code BoilerpipeProcessingException}
 */
protected String parse(String rawText) {
    if (StringUtils.isEmpty(rawText)) {
        return null;
    }

    try {
        return DefaultExtractor.INSTANCE.getText(rawText);
    } catch (BoilerpipeProcessingException e) {
        // Best effort: log the failure and report "no text" instead of propagating.
        LOGGER.error(e.getMessage(), e);
        return null;
    }
}
/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and handed to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Runs {@code process} on the given {@link TextDocument} and returns the document's
 * content afterwards.
 *
 * @param doc the document to process
 * @return the value of {@code doc.getContent()} after processing
 * @throws BoilerpipeProcessingException if processing the document fails
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    // The boolean result of process(...) is intentionally not used here.
    process(doc);
    final String content = doc.getContent();
    return content;
}
}
/**
 * Applies three processing stages to the document, in this order:
 * {@code ArticleExtractor}, {@code SplitParagraphBlocksFilter}, {@code MinClauseWordsFilter}.
 *
 * @param doc the document to process
 * @return {@code true} if at least one of the stages returned {@code true}
 * @throws BoilerpipeProcessingException if any stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // Non-short-circuit OR: every stage must run, whatever the earlier stages returned.
    boolean result = ArticleExtractor.INSTANCE.process(doc);
    result |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
    result |= MinClauseWordsFilter.INSTANCE.process(doc);
    return result;
}
/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and handed to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Runs {@code process} on the given {@link TextDocument} and returns the document's
 * content afterwards.
 *
 * @param doc the document to process
 * @return the value of {@code doc.getContent()} after processing
 * @throws BoilerpipeProcessingException if processing the document fails
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    // The boolean result of process(...) is intentionally not used here.
    process(doc);
    final String content = doc.getContent();
    return content;
}
}
/**
 * Applies three processing stages to the document, in this order:
 * {@code ArticleExtractor}, {@code SplitParagraphBlocksFilter}, {@code MinClauseWordsFilter}.
 *
 * @param doc the document to process
 * @return {@code true} if at least one of the stages returned {@code true}
 * @throws BoilerpipeProcessingException if any stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // Non-short-circuit OR: every stage must run, whatever the earlier stages returned.
    boolean result = ArticleExtractor.INSTANCE.process(doc);
    result |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
    result |= MinClauseWordsFilter.INSTANCE.process(doc);
    return result;
}
/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource} and handed to the
 * {@code getText(InputSource)} overload.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction fails
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
    final InputSource source = new InputSource(r);
    return getText(source);
}
/**
 * Runs {@code process} on the given {@link TextDocument} and returns the document's
 * content afterwards.
 *
 * @param doc the document to process
 * @return the value of {@code doc.getContent()} after processing
 * @throws BoilerpipeProcessingException if processing the document fails
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    // The boolean result of process(...) is intentionally not used here.
    process(doc);
    final String content = doc.getContent();
    return content;
}
}
/**
 * Applies three processing stages to the document, in this order:
 * {@code ArticleExtractor}, {@code SplitParagraphBlocksFilter}, {@code MinClauseWordsFilter}.
 *
 * @param doc the document to process
 * @return {@code true} if at least one of the stages returned {@code true}
 * @throws BoilerpipeProcessingException if any stage fails
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // Non-short-circuit OR: every stage must run, whatever the earlier stages returned.
    boolean result = ArticleExtractor.INSTANCE.process(doc);
    result |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
    result |= MinClauseWordsFilter.INSTANCE.process(doc);
    return result;
}
/**
 * Reads HTML from the supplied {@link InputSource} and returns the text extracted from it.
 *
 * <p>A {@code BoilerpipeSAXInput} is built over the source, its text document is obtained,
 * and that document is passed to the {@code getText} overload that accepts it.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails; a {@link SAXException} raised
 *         while reading the source is rethrown wrapped in this exception type
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException parseFailure) {
        // Keep the original exception as the cause.
        throw new BoilerpipeProcessingException(parseFailure);
    }
}
/**
 * Reads HTML from the supplied {@link InputSource} and returns the text extracted from it.
 *
 * <p>A {@code BoilerpipeSAXInput} is built over the source, its text document is obtained,
 * and that document is passed to the {@code getText} overload that accepts it.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails; a {@link SAXException} raised
 *         while reading the source is rethrown wrapped in this exception type
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException parseFailure) {
        // Keep the original exception as the cause.
        throw new BoilerpipeProcessingException(parseFailure);
    }
}
/**
 * Reads HTML from the supplied {@link InputSource} and returns the text extracted from it.
 *
 * <p>A {@code BoilerpipeSAXInput} is built over the source, its text document is obtained,
 * and that document is passed to the {@code getText} overload that accepts it.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails; a {@link SAXException} raised
 *         while reading the source is rethrown wrapped in this exception type
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException parseFailure) {
        // Keep the original exception as the cause.
        throw new BoilerpipeProcessingException(parseFailure);
    }
}
/**
 * Reads HTML from the supplied {@link InputSource} and returns the text extracted from it.
 *
 * <p>A {@code BoilerpipeSAXInput} is built over the source, its text document is obtained,
 * and that document is passed to the {@code getText} overload that accepts it.
 *
 * @param is the input source supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction fails; a {@link SAXException} raised
 *         while reading the source is rethrown wrapped in this exception type
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
    try {
        final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
        return getText(saxInput.getTextDocument());
    } catch (SAXException parseFailure) {
        // Keep the original exception as the cause.
        throw new BoilerpipeProcessingException(parseFailure);
    }
}