/**
 * Creates the annotator used to clean XML input.
 *
 * @param properties configuration handed unchanged to the {@code CleanXmlAnnotator} constructor
 * @return a newly constructed {@code CleanXmlAnnotator}
 */
public CleanXmlAnnotator cleanXML(Properties properties) {
  final CleanXmlAnnotator xmlCleaner = new CleanXmlAnnotator(properties);
  return xmlCleaner;
}
/**
 * Single-argument convenience overload of {@code process}.
 *
 * <p>Delegates to the two-argument overload, passing {@code null} as the first argument
 * and {@code tokens} as the second.
 *
 * @param tokens the tokens to process
 * @return the list returned by the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
  final List<CoreLabel> processed = process(null, tokens);
  return processed;
}
public void setDocAnnotationPatterns(String conf) { docAnnotationPatterns.clear(); // Patterns can only be tag attributes addAnnotationPatterns(docAnnotationPatterns, conf, true); }
xmlTagMatcher = toCaseInsensitivePattern(xmlElementsToProcess); if (StringUtils.isNullOrEmpty(sentenceEndingTags)) { sentenceEndingTagMatcher = null; } else { sentenceEndingTagMatcher = toCaseInsensitivePattern(sentenceEndingTags); dateTagMatcher = toCaseInsensitivePattern(dateTags); this.allowFlawedXml = allowFlawed; setSingleSentenceTagMatcher(singleSentenceTags); setDocIdTagMatcher(docIdTags); setDocTypeTagMatcher(docTypeTags); setDiscourseTags(utteranceTurnTags, speakerTags); setDocAnnotationPatterns(docAnnotations); setTokenAnnotationPatterns(tokenAnnotations); setSectionTagMatcher(sectionTags); setSectionAnnotationPatterns(sectionAnnotations); setQuoteTagMatcher(quoteTags); setSsplitDiscardTokensMatcher(ssplitDiscardTokens);
@Override public void annotate(Annotation annotation) { if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) { List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); if (DEBUG) { log.info("CleanXML: starting tokens: " + tokens); } List<CoreLabel> newTokens = process(annotation, tokens); // We assume that if someone is using this annotator, they don't // want the old tokens any more and get rid of them // redo the token indexes if xml tokens have been removed setTokenBeginTokenEnd(newTokens); annotation.set(CoreAnnotations.TokensAnnotation.class, newTokens); if (DEBUG) { log.info("CleanXML: ending tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class)); } } }
String ssplitDiscardTokens = properties.getProperty("clean.ssplitDiscardTokens"); CleanXmlAnnotator annotator = new CleanXmlAnnotator(xmlTags, sentenceEndingTags, dateTags, allowFlawed); annotator.setSingleSentenceTagMatcher(singleSentenceTags); annotator.setDocIdTagMatcher(docIdTags); annotator.setDocTypeTagMatcher(docTypeTags); annotator.setDiscourseTags(utteranceTurnTags, speakerTags); annotator.setDocAnnotationPatterns(docAnnotations); annotator.setTokenAnnotationPatterns(tokenAnnotations); annotator.setSectionTagMatcher(sectionTags); annotator.setSectionAnnotationPatterns(sectionAnnotations); annotator.setSsplitDiscardTokensMatcher(ssplitDiscardTokens); return annotator;
Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null); toAnnotate.removeAll(foundAnnotations); annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null); annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null); annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations); currentSpeaker = tokensToString(annotation, speakerTokens); MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size()); int i = 0; String str = tokensToString(annotation, docIdTokens).trim(); annotation.set(CoreAnnotations.DocIDAnnotation.class, str); String str = tokensToString(annotation, docDateTokens).trim(); annotation.set(CoreAnnotations.DocDateAnnotation.class, str); String str = tokensToString(annotation, docTypeTokens).trim(); annotation.set(CoreAnnotations.DocTypeAnnotation.class, str);
xmlTagMatcher = toCaseInsensitivePattern(xmlElementsToProcess); if (StringUtils.isNullOrEmpty(sentenceEndingTags)) { sentenceEndingTagMatcher = null; } else { sentenceEndingTagMatcher = toCaseInsensitivePattern(sentenceEndingTags); dateTagMatcher = toCaseInsensitivePattern(dateTags); this.allowFlawedXml = allowFlawed; setSingleSentenceTagMatcher(singleSentenceTags); setDocIdTagMatcher(docIdTags); setDocTypeTagMatcher(docTypeTags); setDiscourseTags(utteranceTurnTags, speakerTags); setDocAnnotationPatterns(docAnnotations); setTokenAnnotationPatterns(tokenAnnotations); setSectionTagMatcher(sectionTags); setSectionAnnotationPatterns(sectionAnnotations); setQuoteTagMatcher(quoteTags); setSsplitDiscardTokensMatcher(ssplitDiscardTokens);
Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null); toAnnotate.removeAll(foundAnnotations); annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null); if (sectionStartToken != null) { sectionStartToken.set(CoreAnnotations.SectionStartAnnotation.class, sectionAnnotations); annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null); annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations); currentSpeaker = tokensToString(annotation, speakerTokens); MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size()); int i = 0; String str = tokensToString(annotation, docIdTokens).trim(); annotation.set(CoreAnnotations.DocIDAnnotation.class, str); String str = tokensToString(annotation, docDateTokens).trim(); annotation.set(CoreAnnotations.DocDateAnnotation.class, str); String str = tokensToString(annotation, docTypeTokens).trim(); annotation.set(CoreAnnotations.DocTypeAnnotation.class, str);
@Override public void annotate(Annotation annotation) { if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) { List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); if (DEBUG) { log.info("CleanXML: starting tokens: " + tokens); } List<CoreLabel> newTokens = process(annotation, tokens); // We assume that if someone is using this annotator, they don't // want the old tokens any more and get rid of them // redo the token indexes if xml tokens have been removed setTokenBeginTokenEnd(newTokens); annotation.set(CoreAnnotations.TokensAnnotation.class, newTokens); if (DEBUG) { log.info("CleanXML: ending tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class)); } } }
/**
 * Single-argument convenience overload of {@code process}.
 *
 * <p>Delegates to the two-argument overload, passing {@code null} as the first argument
 * and {@code tokens} as the second.
 *
 * @param tokens the tokens to process
 * @return the list returned by the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
  final List<CoreLabel> processed = process(null, tokens);
  return processed;
}
Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null); toAnnotate.removeAll(foundAnnotations); annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null); annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null); annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations); currentSpeaker = tokensToString(annotation, speakerTokens); MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size()); int i = 0; String str = tokensToString(annotation, docIdTokens).trim(); annotation.set(CoreAnnotations.DocIDAnnotation.class, str); String str = tokensToString(annotation, docDateTokens).trim(); annotation.set(CoreAnnotations.DocDateAnnotation.class, str); String str = tokensToString(annotation, docTypeTokens).trim(); annotation.set(CoreAnnotations.DocTypeAnnotation.class, str);
public void setTokenAnnotationPatterns(String conf) { tokenAnnotationPatterns.clear(); // Patterns can only be tag attributes addAnnotationPatterns(tokenAnnotationPatterns, conf, true); }
/**
 * Creates the annotator used to clean XML input.
 *
 * @param properties configuration handed unchanged to the {@code CleanXmlAnnotator} constructor
 * @return a newly constructed {@code CleanXmlAnnotator}
 */
public CleanXmlAnnotator cleanXML(Properties properties) {
  final CleanXmlAnnotator xmlCleaner = new CleanXmlAnnotator(properties);
  return xmlCleaner;
}
/**
 * Single-argument convenience overload of {@code process}.
 *
 * <p>Delegates to the two-argument overload, passing {@code null} as the first argument
 * and {@code tokens} as the second.
 *
 * @param tokens the tokens to process
 * @return the list returned by the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
  final List<CoreLabel> processed = process(null, tokens);
  return processed;
}
/**
 * Replaces the current section annotation patterns with the ones described by {@code conf}.
 *
 * <p>The existing contents of {@code sectionAnnotationPatterns} are cleared first, then
 * {@code conf} is passed to {@code addAnnotationPatterns} together with that collection
 * and {@code false} as the final argument.
 *
 * @param conf pattern configuration, forwarded unchanged to {@code addAnnotationPatterns}
 */
public void setSectionAnnotationPatterns(String conf) {
  // Start from an empty collection so earlier settings do not linger.
  sectionAnnotationPatterns.clear();
  addAnnotationPatterns(
      sectionAnnotationPatterns,
      conf,
      false);
}
/**
 * Builds a {@code CleanXmlAnnotator} from the {@code clean.*} entries of {@code props}.
 *
 * <p>Properties read: {@code clean.xmltags}, {@code clean.sentenceendingtags},
 * {@code clean.datetags} (each falling back to the matching {@code CleanXmlAnnotator}
 * default constant) and {@code clean.allowflawedxml} (falling back to
 * {@code CleanXmlAnnotator.DEFAULT_ALLOW_FLAWS} when absent).
 *
 * @return the configured annotator
 */
public Annotator create() {
  final String xmlTags =
      props.getProperty("clean.xmltags", CleanXmlAnnotator.DEFAULT_XML_TAGS);
  final String sentenceEndingTags =
      props.getProperty("clean.sentenceendingtags", CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS);
  final String dateTags =
      props.getProperty("clean.datetags", CleanXmlAnnotator.DEFAULT_DATE_TAGS);

  // Only an explicitly supplied value overrides the default flaw tolerance.
  final String allowFlawedString = props.getProperty("clean.allowflawedxml");
  final boolean allowFlawed = (allowFlawedString == null)
      ? CleanXmlAnnotator.DEFAULT_ALLOW_FLAWS
      : Boolean.parseBoolean(allowFlawedString);

  return new CleanXmlAnnotator(xmlTags, sentenceEndingTags, dateTags, allowFlawed);
}
// Closes the enclosing construct that is opened outside this excerpt.
});
/**
 * Single-argument convenience overload of {@code process}.
 *
 * <p>Delegates to the two-argument overload, passing {@code tokens} as the first argument
 * and {@code null} as the second.
 *
 * @param tokens the tokens to process
 * @return the list returned by the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
  final List<CoreLabel> processed = process(tokens, null);
  return processed;
}
public void setDocAnnotationPatterns(String conf) { docAnnotationPatterns.clear(); // Patterns can only be tag attributes addAnnotationPatterns(docAnnotationPatterns, conf, true); }
public void annotate(Annotation annotation) { if (annotation.has(TokensAnnotation.class)) { List<CoreLabel> tokens = annotation.get(TokensAnnotation.class); List<CoreLabel> dateTokens = new ArrayList<CoreLabel>(); List<CoreLabel> newTokens = process(tokens, dateTokens); // We assume that if someone is using this annotator, they don't // want the old tokens any more and get rid of them annotation.set(TokensAnnotation.class, newTokens); // if the doc date was found, save it. it is used by SUTime (inside the "ner" annotator) if(dateTokens.size() > 0){ StringBuffer os = new StringBuffer(); boolean first = true; for (CoreLabel t : dateTokens) { if (!first) os.append(" "); os.append(t.word()); first = false; } //System.err.println("DOC DATE IS: " + os.toString()); annotation.set(DocDateAnnotation.class, os.toString()); } } }