for (TextBlock block : td.getTextBlocks()) { if (block.isContent()) { BitSet bs = block.getContainedTextElements(); for (TextBlock block : td.getTextBlocks()) { if (block.isContent()) { delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
/**
 * Builds a new {@link TextDocument} from the given maximum length.
 *
 * @param maxLength value handed to the {@code TextDocument} constructor; the
 *                  usage comments below describe it as a character limit
 * @return the newly created document
 */
public static Document createTextDocument(int maxLength){
    final Document limited = new TextDocument(maxLength);
    return limited;
}

// ... (code elided in the original snippet)

// Example usage: one document per text field, each with its own limit.
Document document = createTextDocument(5); // limit to 5 chars
textField1.setDocument(document);
document = createTextDocument(10); // limit to 10 chars
textField2.setDocument(document);
/**
 * Runs {@code process(doc)} on the given {@link TextDocument} and then returns
 * the value of {@link TextDocument#getContent()}. Whatever {@code process}
 * returns is ignored.
 *
 * @param doc The {@link TextDocument} to process and read the text from.
 * @return The extracted text.
 * @throws BoilerpipeProcessingException propagated from {@code process(doc)}.
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc);
    final String extracted = doc.getContent();
    return extracted;
}
}
/**
 * Returns the {@link TextDocument}'s content, obtained by calling
 * {@code getText(true, false)}.
 *
 * @return The content text.
 */
public String getContent() {
    // NOTE(review): the meaning of the two flags is defined by getText, which is not visible here.
    final String contentText = getText(true, false);
    return contentText;
}
/**
 * Runs a fixed chain of processing stages over the given document.
 *
 * Every stage is invoked, in the order listed, regardless of what the earlier
 * stages returned; the individual results are combined with a logical OR.
 *
 * @param doc the document handed to every stage
 * @return {@code true} if at least one stage returned {@code true}
 * @throws BoilerpipeProcessingException propagated from any stage
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // "|=" never short-circuits, so each stage below always runs.
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY.process(doc);
    changed |= KeepLargestFulltextBlockFilter.INSTANCE.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    return changed;
}
}
/**
 * Prints the document's {@code debugString()} to {@code out} (declared outside
 * this excerpt) using {@code println}.
 *
 * @param doc the document whose debug representation is printed
 * @return always {@code false}
 * @throws BoilerpipeProcessingException declared by the signature; nothing in
 *         this body throws it explicitly
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    final String dump = doc.debugString();
    out.println(dump);
    return false;
}
}
/**
 * Returns the {@link TextDocument}'s content, obtained by calling
 * {@code getText(true, false)}.
 *
 * @return The content text.
 */
public String getContent() {
    // NOTE(review): the meaning of the two flags is defined by getText, which is not visible here.
    final String contentText = getText(true, false);
    return contentText;
}
/**
 * Runs a fixed chain of processing stages over the given document.
 *
 * Every stage is invoked, in the order listed, regardless of what the earlier
 * stages returned; the individual results are combined with a logical OR.
 *
 * @param doc the document handed to every stage
 * @return {@code true} if at least one stage returned {@code true}
 * @throws BoilerpipeProcessingException propagated from any stage
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // "|=" never short-circuits, so each stage below always runs.
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
    changed |= ListAtEndFilter.INSTANCE.process(doc);
    return changed;
}
}
/**
 * Prints the document's {@code debugString()} to {@code out} (declared outside
 * this excerpt) using {@code println}.
 *
 * @param doc the document whose debug representation is printed
 * @return always {@code false}
 * @throws BoilerpipeProcessingException declared by the signature; nothing in
 *         this body throws it explicitly
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    final String dump = doc.debugString();
    out.println(dump);
    return false;
}
}
/**
 * Builds a multi-line debug dump of the contained {@link TextBlock}s: the
 * {@code toString()} of each block returned by {@code getTextBlocks()},
 * each followed by a newline character.
 *
 * @return Debug information; an empty string when there are no blocks.
 */
public String debugString() {
    final StringBuilder dump = new StringBuilder();
    for (final TextBlock block : getTextBlocks()) {
        // toString() is called explicitly, exactly as before, rather than relying on append(Object).
        dump.append(block.toString()).append('\n');
    }
    return dump.toString();
}
/**
 * Creates a copy of this document: a new {@link TextDocument} built from the
 * same {@code title} and a fresh list holding the {@code clone()} of every
 * entry in {@code textBlocks}, in the same order.
 *
 * @return the copied document
 */
public TextDocument clone() {
    final List<TextBlock> copies = new ArrayList<TextBlock>(textBlocks.size());
    for (final TextBlock original : textBlocks) {
        final TextBlock copy = original.clone();
        copies.add(copy);
    }
    return new TextDocument(title, copies);
}
}
/**
 * Runs {@code process(doc)} on the given {@link TextDocument} and then returns
 * the value of {@link TextDocument#getContent()}. Whatever {@code process}
 * returns is ignored.
 *
 * @param doc The {@link TextDocument} to process and read the text from.
 * @return The extracted text.
 * @throws BoilerpipeProcessingException propagated from {@code process(doc)}.
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc);
    final String extracted = doc.getContent();
    return extracted;
}
}
/**
 * Returns the {@link TextDocument}'s content, obtained by calling
 * {@code getText(true, false)}.
 *
 * @return The content text.
 */
public String getContent() {
    // NOTE(review): the meaning of the two flags is defined by getText, which is not visible here.
    final String contentText = getText(true, false);
    return contentText;
}
/**
 * Runs a fixed chain of processing stages over the given document.
 *
 * Every stage is invoked, in the order listed, regardless of what the earlier
 * stages returned; the individual results are combined with a logical OR.
 *
 * @param doc the document handed to every stage
 * @return {@code true} if at least one stage returned {@code true}
 * @throws BoilerpipeProcessingException propagated from any stage
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // "|=" never short-circuits, so each stage below always runs.
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
    changed |= ListAtEndFilter.INSTANCE.process(doc);
    return changed;
}
}
/**
 * Prints the document's {@code debugString()} to {@code out} (declared outside
 * this excerpt) using {@code println}.
 *
 * @param doc the document whose debug representation is printed
 * @return always {@code false}
 * @throws BoilerpipeProcessingException declared by the signature; nothing in
 *         this body throws it explicitly
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    final String dump = doc.debugString();
    out.println(dump);
    return false;
}
}
/**
 * Removes every {@link TextBlock} whose {@code isContent()} is {@code false}
 * from the list returned by {@code doc.getTextBlocks()}. Removal goes through
 * the list's iterator, so that list is modified in place.
 *
 * @param doc the document whose text blocks are filtered
 * @return {@code true} if at least one block was removed, {@code false} otherwise
 * @throws BoilerpipeProcessingException declared by the signature; nothing in
 *         this body throws it explicitly
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    final Iterator<TextBlock> cursor = doc.getTextBlocks().iterator();
    boolean removedAny = false;
    while (cursor.hasNext()) {
        if (cursor.next().isContent()) {
            continue; // content blocks are kept
        }
        cursor.remove();
        removedAny = true;
    }
    return removedAny;
}
/**
 * Creates a copy of this document: a new {@link TextDocument} built from the
 * same {@code title} and a fresh list holding the {@code clone()} of every
 * entry in {@code textBlocks}, in the same order.
 *
 * @return the copied document
 */
public TextDocument clone() {
    final List<TextBlock> copies = new ArrayList<TextBlock>(textBlocks.size());
    for (final TextBlock original : textBlocks) {
        final TextBlock copy = original.clone();
        copies.add(copy);
    }
    return new TextDocument(title, copies);
}
}
/**
 * Runs {@code process(doc)} on the given {@link TextDocument} and then returns
 * the value of {@link TextDocument#getContent()}. Whatever {@code process}
 * returns is ignored.
 *
 * @param doc The {@link TextDocument} to process and read the text from.
 * @return The extracted text.
 * @throws BoilerpipeProcessingException propagated from {@code process(doc)}.
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc);
    final String extracted = doc.getContent();
    return extracted;
}
}
/**
 * Returns the {@link TextDocument}'s content, obtained by calling
 * {@code getText(true, false)}.
 *
 * @return The content text.
 */
public String getContent() {
    // NOTE(review): the meaning of the two flags is defined by getText, which is not visible here.
    final String contentText = getText(true, false);
    return contentText;
}
/**
 * Runs a fixed chain of processing stages over the given document.
 *
 * Every stage is invoked, in the order listed, regardless of what the earlier
 * stages returned; the individual results are combined with a logical OR.
 *
 * @param doc the document handed to every stage
 * @return {@code true} if at least one stage returned {@code true}
 * @throws BoilerpipeProcessingException propagated from any stage
 */
public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
    // "|=" never short-circuits, so each stage below always runs.
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
    changed |= ListAtEndFilter.INSTANCE.process(doc);
    return changed;
}
}