de.l3s.boilerpipe.extractors java code examples

/**
 * {@inheritDoc}
 *
 * <p>This implementation hands the HTML to the instance returned by
 * {@code ArticleExtractor.getInstance()} and returns whatever text it produces.
 */
@Override
public String extractTextFromHtml(String html) throws Exception {
  final String articleText = ArticleExtractor.getInstance().getText(html);
  return articleText;
}

/**
 * Reads HTML from the supplied {@link Reader} and returns the text extracted from it.
 *
 * <p>The reader is wrapped in an {@link InputSource}, which is then passed to the
 * {@code getText} overload that accepts an {@code InputSource}.
 *
 * @param r the reader supplying the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated extraction throws it
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource htmlSource = new InputSource(r);
  return getText(htmlSource);
}

/**
 * Extracts text from HTML, trying two extractors in turn.
 *
 * <p>The input is first passed through {@code cleanHtml}. The cleaned HTML is given to
 * {@code NumWordsRulesExtractor}; if that yields a non-empty string it is returned.
 * Otherwise {@code ArticleExtractor} is tried on the same cleaned HTML.
 *
 * @param text the HTML to extract text from
 * @return the first non-empty extraction result, or {@code null} if both extractors
 *     return {@code null} or an empty string
 * @throws BoilerpipeProcessingException if thrown by one of the calls made here
 */
private String extractTextFromHtml(String text) throws BoilerpipeProcessingException {
  final String cleanedHtml = cleanHtml(text);

  // First attempt: the word-count based extractor.
  final String byWordRules = NumWordsRulesExtractor.getInstance().getText(cleanedHtml);
  if (byWordRules != null && !byWordRules.isEmpty()) {
    return byWordRules;
  }

  // Fallback: the article extractor; callers get null when this is empty too.
  final String byArticle = ArticleExtractor.getInstance().getText(cleanedHtml);
  return (byArticle == null || byArticle.isEmpty()) ? null : byArticle;
}

/**
 * Registers the default extractors by calling {@code addTextExtractor} once each for
 * {@code DefaultExtractor}, {@code ArticleExtractor}, {@code LargestContentExtractor} and
 * {@code CanolaExtractor}, in that order. Each call passes a string identifier, one of the
 * {@code PAGE_CONTENT_*_PROPERTY} constants and the extractor's {@code getInstance()} result.
 */
private void loadDefaultRules() {
  addTextExtractor(
      "default-extractor", PAGE_CONTENT_DE_PROPERTY, DefaultExtractor.getInstance());
  addTextExtractor(
      "article-extractor", PAGE_CONTENT_AE_PROPERTY, ArticleExtractor.getInstance());
  addTextExtractor(
      "large-content-extractor",
      PAGE_CONTENT_LCE_PROPERTY,
      LargestContentExtractor.getInstance());
  addTextExtractor(
      "canola-extractor", PAGE_CONTENT_CE_PROPERTY, CanolaExtractor.getInstance());
}

  /**
   * Runs {@code process(doc)} on the given {@link TextDocument} and then returns the
   * document's content.
   *
   * @param doc the document to process and read the content from
   * @return the value of {@code doc.getContent()} after processing
   * @throws BoilerpipeProcessingException if thrown while processing the document
   */
  public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    // The boolean returned by process(doc) is not needed here; only the call matters.
    process(doc);
    final String content = doc.getContent();
    return content;
  }
}

/**
 * Applies {@code ArticleExtractor}, {@code SplitParagraphBlocksFilter} and
 * {@code MinClauseWordsFilter} to the document, in that order.
 *
 * @param doc the document to process
 * @return {@code true} if at least one of the three stages returned {@code true}
 * @throws BoilerpipeProcessingException if one of the stages throws it
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
  // Each stage must run regardless of earlier results, so the results are
  // combined with non-short-circuit OR.
  boolean anyStageTrue = ArticleExtractor.INSTANCE.process(doc);
  anyStageTrue |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
  anyStageTrue |= MinClauseWordsFilter.INSTANCE.process(doc);
  return anyStageTrue;
}

/**
 * Extracts text from {@code rawText} using {@code DefaultExtractor.INSTANCE}.
 *
 * @param rawText the raw input to extract text from
 * @return the extracted text, or {@code null} when {@code StringUtils.isEmpty(rawText)}
 *     is true or when extraction throws a {@code BoilerpipeProcessingException}
 *     (the exception is logged at error level and not rethrown)
 */
protected String parse(String rawText) {
  // Nothing to extract from empty input.
  if (StringUtils.isEmpty(rawText)) {
    return null;
  }

  try {
    return DefaultExtractor.INSTANCE.getText(rawText);
  } catch (BoilerpipeProcessingException extractionFailure) {
    // Best effort: report the failure and signal "no text" to the caller.
    LOGGER.error(extractionFailure.getMessage(), extractionFailure);
    return null;
  }
}

/**
 * Extracts text from HTML supplied through a {@link Reader}.
 *
 * <p>Delegates to the {@code getText} overload taking an {@link InputSource} built
 * from the reader.
 *
 * @param r the reader the HTML is read from
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated call throws it
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource wrappedReader = new InputSource(r);
  return getText(wrappedReader);
}

  /**
   * Processes the given {@link TextDocument} via {@code process(doc)} and returns the
   * content the document reports afterwards.
   *
   * @param doc the document to process
   * @return the result of {@code doc.getContent()} once processing has run
   * @throws BoilerpipeProcessingException if thrown while processing the document
   */
  public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc); // result flag is not used by this method
    final String extracted = doc.getContent();
    return extracted;
  }
}

/**
 * Runs three processing stages on the document, in this order:
 * {@code ArticleExtractor}, {@code SplitParagraphBlocksFilter}, {@code MinClauseWordsFilter}.
 *
 * @param doc the document to process
 * @return {@code true} if any stage returned {@code true}, otherwise {@code false}
 * @throws BoilerpipeProcessingException if one of the stages throws it
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
  // All three stages are evaluated before the results are combined, so a true
  // result from an earlier stage never skips a later one.
  final boolean fromArticle = ArticleExtractor.INSTANCE.process(doc);
  final boolean fromSplit = SplitParagraphBlocksFilter.INSTANCE.process(doc);
  final boolean fromMinClause = MinClauseWordsFilter.INSTANCE.process(doc);
  return fromArticle | fromSplit | fromMinClause;
}

/**
 * Returns the text extracted from the HTML that the given {@link Reader} provides.
 *
 * <p>The work is done by the {@code getText} overload for {@link InputSource}; this
 * method only wraps the reader.
 *
 * @param r the reader containing the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated call throws it
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource input = new InputSource(r);
  return getText(input);
}

  /**
   * Returns the content of the given {@link TextDocument} after passing it through
   * {@code process(doc)}.
   *
   * @param doc the document to process and read
   * @return what {@code doc.getContent()} returns after processing
   * @throws BoilerpipeProcessingException if thrown while processing the document
   */
  public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    // Called for its effect on doc; the returned boolean is discarded.
    process(doc);
    final String text = doc.getContent();
    return text;
  }
}

/**
 * Passes the document through {@code ArticleExtractor}, then
 * {@code SplitParagraphBlocksFilter}, then {@code MinClauseWordsFilter}.
 *
 * @param doc the document to process
 * @return {@code true} when at least one of the three calls returned {@code true}
 * @throws BoilerpipeProcessingException if one of the calls throws it
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
  boolean result = false;
  // Compound OR-assignment always evaluates its right-hand side, so every stage runs.
  result |= ArticleExtractor.INSTANCE.process(doc);
  result |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
  result |= MinClauseWordsFilter.INSTANCE.process(doc);
  return result;
}

/**
 * Extracts text from the HTML available through the given {@link Reader}.
 *
 * <p>Creates an {@link InputSource} around the reader and forwards it to the
 * {@code getText} overload that takes an {@code InputSource}.
 *
 * @param r the reader that yields the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if the delegated call throws it
 */
public String getText(final Reader r) throws BoilerpipeProcessingException {
  final InputSource readerSource = new InputSource(r);
  return getText(readerSource);
}

  /**
   * Invokes {@code process(doc)} on the given {@link TextDocument} and returns the
   * document's content.
   *
   * @param doc the document to process
   * @return the document content as given by {@code doc.getContent()}
   * @throws BoilerpipeProcessingException if thrown while processing the document
   */
  public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc); // boolean result is ignored here
    final String documentContent = doc.getContent();
    return documentContent;
  }
}

/**
 * Processes the document with {@code ArticleExtractor},
 * {@code SplitParagraphBlocksFilter} and {@code MinClauseWordsFilter}, in that order.
 *
 * @param doc the document to process
 * @return the logical OR of the three stages' boolean results
 * @throws BoilerpipeProcessingException if one of the stages throws it
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
  // Evaluate every stage first; combining afterwards keeps later stages from
  // being skipped when an earlier one already returned true.
  final boolean articleResult = ArticleExtractor.INSTANCE.process(doc);
  final boolean splitResult = SplitParagraphBlocksFilter.INSTANCE.process(doc);
  final boolean clauseResult = MinClauseWordsFilter.INSTANCE.process(doc);
  return articleResult | splitResult | clauseResult;
}

/**
 * Extracts text from the HTML provided by the given {@link InputSource}.
 *
 * <p>A {@code BoilerpipeSAXInput} is created from the source, its text document is
 * obtained, and that document is passed to the {@code getText} overload for documents.
 *
 * @param is the input source containing the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction throws it, or wrapping any
 *     {@link SAXException} raised inside this method
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(is);
    return getText(saxInput.getTextDocument());
  } catch (SAXException saxFailure) {
    // Surface SAX errors through the method's declared exception type, keeping the cause.
    throw new BoilerpipeProcessingException(saxFailure);
  }
}

/**
 * Returns the text extracted from the HTML behind the given {@link InputSource}.
 *
 * <p>The source is wrapped in a {@code BoilerpipeSAXInput}; the text document obtained
 * from it is forwarded to the document-based {@code getText} overload.
 *
 * @param is the input source to read the HTML from
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction throws it, or as a wrapper around
 *     a {@link SAXException} caught here
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput parsedInput = new BoilerpipeSAXInput(is);
    return getText(parsedInput.getTextDocument());
  } catch (SAXException parseError) {
    // The original SAX exception is kept as the cause.
    throw new BoilerpipeProcessingException(parseError);
  }
}

/**
 * Extracts text from HTML read through the given {@link InputSource}.
 *
 * <p>Builds a {@code BoilerpipeSAXInput} for the source, asks it for its text document
 * and delegates to the {@code getText} overload that accepts that document.
 *
 * @param is the input source holding the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction throws it; a {@link SAXException}
 *     caught here is rethrown wrapped in this type
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput htmlInput = new BoilerpipeSAXInput(is);
    return getText(htmlInput.getTextDocument());
  } catch (SAXException cause) {
    // Convert to the declared checked exception without losing the cause.
    throw new BoilerpipeProcessingException(cause);
  }
}

/**
 * Extracts text from the HTML that the given {@link InputSource} supplies.
 *
 * <p>The text document produced by a {@code BoilerpipeSAXInput} over the source is
 * handed to the document-based {@code getText} overload.
 *
 * @param is the input source with the HTML
 * @return the extracted text
 * @throws BoilerpipeProcessingException if extraction throws it, or wrapping a
 *     {@link SAXException} caught in this method
 */
public String getText(final InputSource is) throws BoilerpipeProcessingException {
  try {
    final BoilerpipeSAXInput sourceInput = new BoilerpipeSAXInput(is);
    return getText(sourceInput.getTextDocument());
  } catch (SAXException saxError) {
    // Rethrow as the declared exception type, preserving the SAX error as cause.
    throw new BoilerpipeProcessingException(saxError);
  }
}

How to use de.l3s.boilerpipe.extractors

Best Java code snippets using de.l3s.boilerpipe.extractors (Showing top 20 results out of 315)