/**
 * Decides whether {@code curr} counts as content, using the word counts and
 * link densities of {@code curr} and its two neighbours, and records the
 * decision on {@code curr}.
 *
 * <p>{@code curr} is flagged as content when any of the following holds:
 * <ul>
 * <li>its link density is above 0 and {@code next} has more than 11 words;</li>
 * <li>it has more than 19 words;</li>
 * <li>{@code next} has more than 6 words, {@code next} and {@code prev} both
 * have a link density of exactly 0, and at least one of these is true:
 * {@code curr} has more than 6 words, {@code prev} has more than 7 words,
 * {@code next} has more than 19 words.</li>
 * </ul>
 *
 * @param prev the previous block
 * @param curr the current block, which receives the content flag
 * @param next the next block
 * @return the value returned by {@code curr.setIsContent(...)} for the computed flag
 */
protected boolean classify(final TextBlock prev, final TextBlock curr, final TextBlock next) {
    final boolean isContent;
    if (curr.getLinkDensity() > 0 && next.getNumWords() > 11) {
        isContent = true;
    } else if (curr.getNumWords() > 19) {
        isContent = true;
    } else if (next.getNumWords() <= 6
            || next.getLinkDensity() != 0
            || prev.getLinkDensity() != 0) {
        // The neighbourhood rule requires a link-free prev/next and a
        // sufficiently long next block; without that nothing else can apply.
        isContent = false;
    } else {
        isContent = curr.getNumWords() > 6
                || prev.getNumWords() > 7
                || next.getNumWords() > 19;
    }
    return curr.setIsContent(isContent);
}
};
/**
 * Decides whether {@code curr} counts as content, using the word counts and
 * link densities of {@code curr} and its two neighbours, and records the
 * decision on {@code curr}.
 *
 * <p>{@code curr} is flagged as content when any of the following holds:
 * <ul>
 * <li>its link density is above 0 and {@code next} has more than 11 words;</li>
 * <li>it has more than 19 words;</li>
 * <li>{@code next} has more than 6 words, {@code next} and {@code prev} both
 * have a link density of exactly 0, and at least one of these is true:
 * {@code curr} has more than 6 words, {@code prev} has more than 7 words,
 * {@code next} has more than 19 words.</li>
 * </ul>
 *
 * @param prev the previous block
 * @param curr the current block, which receives the content flag
 * @param next the next block
 * @return the value returned by {@code curr.setIsContent(...)} for the computed flag
 */
protected boolean classify(final TextBlock prev, final TextBlock curr, final TextBlock next) {
    final boolean isContent;
    if (curr.getLinkDensity() > 0 && next.getNumWords() > 11) {
        isContent = true;
    } else if (curr.getNumWords() > 19) {
        isContent = true;
    } else if (next.getNumWords() <= 6
            || next.getLinkDensity() != 0
            || prev.getLinkDensity() != 0) {
        // The neighbourhood rule requires a link-free prev/next and a
        // sufficiently long next block; without that nothing else can apply.
        isContent = false;
    } else {
        isContent = curr.getNumWords() > 6
                || prev.getNumWords() > 7
                || next.getNumWords() > 19;
    }
    return curr.setIsContent(isContent);
}
};
/**
 * Decides whether {@code curr} counts as content, using the word counts and
 * link densities of {@code curr} and its two neighbours, and records the
 * decision on {@code curr}.
 *
 * <p>{@code curr} is flagged as content when any of the following holds:
 * <ul>
 * <li>its link density is above 0 and {@code next} has more than 11 words;</li>
 * <li>it has more than 19 words;</li>
 * <li>{@code next} has more than 6 words, {@code next} and {@code prev} both
 * have a link density of exactly 0, and at least one of these is true:
 * {@code curr} has more than 6 words, {@code prev} has more than 7 words,
 * {@code next} has more than 19 words.</li>
 * </ul>
 *
 * @param prev the previous block
 * @param curr the current block, which receives the content flag
 * @param next the next block
 * @return the value returned by {@code curr.setIsContent(...)} for the computed flag
 */
protected boolean classify(final TextBlock prev, final TextBlock curr, final TextBlock next) {
    final boolean isContent;
    if (curr.getLinkDensity() > 0 && next.getNumWords() > 11) {
        isContent = true;
    } else if (curr.getNumWords() > 19) {
        isContent = true;
    } else if (next.getNumWords() <= 6
            || next.getLinkDensity() != 0
            || prev.getLinkDensity() != 0) {
        // The neighbourhood rule requires a link-free prev/next and a
        // sufficiently long next block; without that nothing else can apply.
        isContent = false;
    } else {
        isContent = curr.getNumWords() > 6
                || prev.getNumWords() > 7
                || next.getNumWords() > 19;
    }
    return curr.setIsContent(isContent);
}
};
/**
 * Decides whether {@code curr} counts as content, using the word counts and
 * link densities of {@code curr} and its two neighbours, and records the
 * decision on {@code curr}.
 *
 * <p>{@code curr} is flagged as content when any of the following holds:
 * <ul>
 * <li>its link density is above 0 and {@code next} has more than 11 words;</li>
 * <li>it has more than 19 words;</li>
 * <li>{@code next} has more than 6 words, {@code next} and {@code prev} both
 * have a link density of exactly 0, and at least one of these is true:
 * {@code curr} has more than 6 words, {@code prev} has more than 7 words,
 * {@code next} has more than 19 words.</li>
 * </ul>
 *
 * @param prev the previous block
 * @param curr the current block, which receives the content flag
 * @param next the next block
 * @return the value returned by {@code curr.setIsContent(...)} for the computed flag
 */
protected boolean classify(final TextBlock prev, final TextBlock curr, final TextBlock next) {
    final boolean isContent;
    if (curr.getLinkDensity() > 0 && next.getNumWords() > 11) {
        isContent = true;
    } else if (curr.getNumWords() > 19) {
        isContent = true;
    } else if (next.getNumWords() <= 6
            || next.getLinkDensity() != 0
            || prev.getLinkDensity() != 0) {
        // The neighbourhood rule requires a link-free prev/next and a
        // sufficiently long next block; without that nothing else can apply.
        isContent = false;
    } else {
        isContent = curr.getNumWords() > 6
                || prev.getNumWords() > 7
                || next.getNumWords() > 19;
    }
    return curr.setIsContent(isContent);
}
};
/**
 * Returns the word count of a block, but only if the block's text density
 * reaches the given minimum.
 *
 * @param tb the block to inspect
 * @param minTextDensity inclusive lower bound on {@code tb.getTextDensity()}
 * @return {@code tb.getNumWords()} when the text density is at least
 *         {@code minTextDensity}, otherwise 0
 */
protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
    final boolean denseEnough = tb.getTextDensity() >= minTextDensity;
    return denseEnough ? tb.getNumWords() : 0;
}
/**
 * Returns the word count of a block, but only if the block's text density
 * reaches the given minimum.
 *
 * @param tb the block to inspect
 * @param minTextDensity inclusive lower bound on {@code tb.getTextDensity()}
 * @return {@code tb.getNumWords()} when the text density is at least
 *         {@code minTextDensity}, otherwise 0
 */
protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
    final boolean denseEnough = tb.getTextDensity() >= minTextDensity;
    return denseEnough ? tb.getNumWords() : 0;
}
/**
 * Returns the word count of a block, but only if the block's text density
 * reaches the given minimum.
 *
 * @param tb the block to inspect
 * @param minTextDensity inclusive lower bound on {@code tb.getTextDensity()}
 * @return {@code tb.getNumWords()} when the text density is at least
 *         {@code minTextDensity}, otherwise 0
 */
protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
    final boolean denseEnough = tb.getTextDensity() >= minTextDensity;
    return denseEnough ? tb.getNumWords() : 0;
}
/**
 * Returns the word count of a block, but only if the block's text density
 * reaches the given minimum.
 *
 * @param tb the block to inspect
 * @param minTextDensity inclusive lower bound on {@code tb.getTextDensity()}
 * @return {@code tb.getNumWords()} when the text density is at least
 *         {@code minTextDensity}, otherwise 0
 */
protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
    final boolean denseEnough = tb.getTextDensity() >= minTextDensity;
    return denseEnough ? tb.getNumWords() : 0;
}
/**
 * Accepts a block only when it has a link density of exactly 0 and more
 * than 6 words.
 *
 * @param tb the block to test
 * @return {@code true} if both requirements are met, {@code false} otherwise
 */
public boolean meetsCondition(TextBlock tb) {
    // Any non-zero link density disqualifies the block outright.
    if (tb.getLinkDensity() != 0) {
        return false;
    }
    return tb.getNumWords() > 6;
}
});
/**
 * Accepts a block only when it has a link density of exactly 0 and more
 * than 6 words.
 *
 * @param tb the block to test
 * @return {@code true} if both requirements are met, {@code false} otherwise
 */
public boolean meetsCondition(TextBlock tb) {
    // Any non-zero link density disqualifies the block outright.
    if (tb.getLinkDensity() != 0) {
        return false;
    }
    return tb.getNumWords() > 6;
}
});
/**
 * Accepts a block only when it has a link density of exactly 0 and more
 * than 6 words.
 *
 * @param tb the block to test
 * @return {@code true} if both requirements are met, {@code false} otherwise
 */
public boolean meetsCondition(TextBlock tb) {
    // Any non-zero link density disqualifies the block outright.
    if (tb.getLinkDensity() != 0) {
        return false;
    }
    return tb.getNumWords() > 6;
}
});
/** * Computes statistics on a given {@link TextDocument}. * * @param doc The {@link TextDocument}. * @param contentOnly if true then o */ public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) { for (TextBlock tb : doc.getTextBlocks()) { if (contentOnly && !tb.isContent()) { continue; } numWords += tb.getNumWords(); numBlocks++; } }
/** * Computes statistics on a given {@link TextDocument}. * * @param doc The {@link TextDocument}. * @param contentOnly if true then o */ public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) { for (TextBlock tb : doc.getTextBlocks()) { if (contentOnly && !tb.isContent()) { continue; } numWords += tb.getNumWords(); numBlocks++; } }
/** * Computes statistics on a given {@link TextDocument}. * * @param doc The {@link TextDocument}. * @param contentOnly if true then o */ public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) { for (TextBlock tb : doc.getTextBlocks()) { if (contentOnly && !tb.isContent()) { continue; } numWords += tb.getNumWords(); numBlocks++; } }
/** * Computes statistics on a given {@link TextDocument}. * * @param doc The {@link TextDocument}. * @param contentOnly if true then o */ public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) { for (TextBlock tb : doc.getTextBlocks()) { if (contentOnly && !tb.isContent()) { continue; } numWords += tb.getNumWords(); numBlocks++; } }
/**
 * Searches the short text blocks of a document (those with at most 10 words)
 * for matches of the patterns in {@code PATTERNS_SHORT}. Each match marks the
 * block as content and adds the {@code DefaultLabels.ARTICLE_METADATA} label
 * to it.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one pattern matched in at least one block,
 *         {@code false} otherwise
 * @throws BoilerpipeProcessingException declared by the signature; no
 *         statement in this body throws it directly
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    boolean anyMatch = false;
    for (TextBlock block : doc.getTextBlocks()) {
        // Blocks longer than 10 words are never examined.
        if (block.getNumWords() <= 10) {
            final String blockText = block.getText();
            // Every pattern is tried, even after an earlier one has matched.
            for (Pattern pattern : PATTERNS_SHORT) {
                final boolean found = pattern.matcher(blockText).find();
                if (found) {
                    anyMatch = true;
                    block.setIsContent(true);
                    block.addLabel(DefaultLabels.ARTICLE_METADATA);
                }
            }
        }
    }
    return anyMatch;
}
/**
 * Clears the content flag on every content block that has fewer than
 * {@code minWords} words. Blocks that are not content are left untouched.
 *
 * @param doc the document whose text blocks are filtered
 * @return {@code true} if at least one block had its content flag cleared,
 *         {@code false} otherwise
 * @throws BoilerpipeProcessingException declared by the signature; no
 *         statement in this body throws it directly
 */
public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
    boolean demotedAny = false;
    for (final TextBlock block : doc.getTextBlocks()) {
        // The word count is only checked for blocks currently marked as content.
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            demotedAny = true;
        }
    }
    return demotedAny;
}
}
/**
 * Clears the content flag on every content block that has fewer than
 * {@code minWords} words. Blocks that are not content are left untouched.
 *
 * @param doc the document whose text blocks are filtered
 * @return {@code true} if at least one block had its content flag cleared,
 *         {@code false} otherwise
 * @throws BoilerpipeProcessingException declared by the signature; no
 *         statement in this body throws it directly
 */
public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
    boolean demotedAny = false;
    for (final TextBlock block : doc.getTextBlocks()) {
        // The word count is only checked for blocks currently marked as content.
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            demotedAny = true;
        }
    }
    return demotedAny;
}
}
/**
 * Clears the content flag on every content block that has fewer than
 * {@code minWords} words. Blocks that are not content are left untouched.
 *
 * @param doc the document whose text blocks are filtered
 * @return {@code true} if at least one block had its content flag cleared,
 *         {@code false} otherwise
 * @throws BoilerpipeProcessingException declared by the signature; no
 *         statement in this body throws it directly
 */
public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
    boolean demotedAny = false;
    for (final TextBlock block : doc.getTextBlocks()) {
        // The word count is only checked for blocks currently marked as content.
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            demotedAny = true;
        }
    }
    return demotedAny;
}
}
/**
 * Clears the content flag on every content block that has fewer than
 * {@code minWords} words. Blocks that are not content are left untouched.
 *
 * @param doc the document whose text blocks are filtered
 * @return {@code true} if at least one block had its content flag cleared,
 *         {@code false} otherwise
 * @throws BoilerpipeProcessingException declared by the signature; no
 *         statement in this body throws it directly
 */
public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
    boolean demotedAny = false;
    for (final TextBlock block : doc.getTextBlocks()) {
        // The word count is only checked for blocks currently marked as content.
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            demotedAny = true;
        }
    }
    return demotedAny;
}
}