edu.stanford.nlp.pipeline.CleanXmlAnnotator Java code examples

/**
 * Builds an annotator that cleans XML input.
 *
 * @param properties configuration handed unchanged to the {@code CleanXmlAnnotator} constructor
 * @return a newly constructed {@code CleanXmlAnnotator}
 */
public CleanXmlAnnotator cleanXML(Properties properties) {
 final CleanXmlAnnotator annotator = new CleanXmlAnnotator(properties);
 return annotator;
}

/**
 * Convenience overload: delegates to the two-argument {@code process}, passing
 * {@code null} as its first argument and {@code tokens} as its second.
 *
 * @param tokens the tokens to process
 * @return the list produced by the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
 final List<CoreLabel> processed = process(null, tokens);
 return processed;
}

/**
 * Discards the current document annotation patterns and re-populates them from
 * {@code conf} via {@code addAnnotationPatterns}.
 *
 * @param conf pattern configuration string, forwarded to {@code addAnnotationPatterns}
 */
public void setDocAnnotationPatterns(String conf) {
 // Patterns can only be tag attributes
 final boolean attributesOnly = true;
 docAnnotationPatterns.clear();
 addAnnotationPatterns(docAnnotationPatterns, conf, attributesOnly);
}

 xmlTagMatcher = toCaseInsensitivePattern(xmlElementsToProcess);
 if (StringUtils.isNullOrEmpty(sentenceEndingTags)) {
  sentenceEndingTagMatcher = null;
 } else {
  sentenceEndingTagMatcher = toCaseInsensitivePattern(sentenceEndingTags);
dateTagMatcher = toCaseInsensitivePattern(dateTags);
this.allowFlawedXml = allowFlawed;
setSingleSentenceTagMatcher(singleSentenceTags);
setDocIdTagMatcher(docIdTags);
setDocTypeTagMatcher(docTypeTags);
setDiscourseTags(utteranceTurnTags, speakerTags);
setDocAnnotationPatterns(docAnnotations);
setTokenAnnotationPatterns(tokenAnnotations);
setSectionTagMatcher(sectionTags);
setSectionAnnotationPatterns(sectionAnnotations);
setQuoteTagMatcher(quoteTags);
setSsplitDiscardTokensMatcher(ssplitDiscardTokens);

/**
 * Runs {@code process} over the tokens of {@code annotation} and stores the result back
 * under {@code TokensAnnotation}. An annotation without that key is left untouched.
 *
 * @param annotation the annotation whose token list is replaced
 */
@Override
public void annotate(Annotation annotation) {
 if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
  return;
 }
 final List<CoreLabel> originalTokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
 if (DEBUG) {
  log.info("CleanXML: starting tokens: " + originalTokens);
 }
 final List<CoreLabel> cleanedTokens = process(annotation, originalTokens);
 // Anyone using this annotator is assumed not to want the old tokens any more,
 // so they are replaced. The token indexes are redone first, in case xml tokens
 // have been removed.
 setTokenBeginTokenEnd(cleanedTokens);
 annotation.set(CoreAnnotations.TokensAnnotation.class, cleanedTokens);
 if (DEBUG) {
  log.info("CleanXML: ending tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
 }
}

String ssplitDiscardTokens =
    properties.getProperty("clean.ssplitDiscardTokens");
CleanXmlAnnotator annotator = new CleanXmlAnnotator(xmlTags,
  sentenceEndingTags,
  dateTags,
  allowFlawed);
annotator.setSingleSentenceTagMatcher(singleSentenceTags);
annotator.setDocIdTagMatcher(docIdTags);
annotator.setDocTypeTagMatcher(docTypeTags);
annotator.setDiscourseTags(utteranceTurnTags, speakerTags);
annotator.setDocAnnotationPatterns(docAnnotations);
annotator.setTokenAnnotationPatterns(tokenAnnotations);
annotator.setSectionTagMatcher(sectionTags);
annotator.setSectionAnnotationPatterns(sectionAnnotations);
annotator.setSsplitDiscardTokensMatcher(ssplitDiscardTokens);
return annotator;

Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null);
toAnnotate.removeAll(foundAnnotations);
  annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations);
 currentSpeaker = tokensToString(annotation, speakerTokens);
 MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size());
 int i = 0;
String str = tokensToString(annotation, docIdTokens).trim();
annotation.set(CoreAnnotations.DocIDAnnotation.class, str);
String str = tokensToString(annotation, docDateTokens).trim();
annotation.set(CoreAnnotations.DocDateAnnotation.class, str);
String str = tokensToString(annotation, docTypeTokens).trim();
annotation.set(CoreAnnotations.DocTypeAnnotation.class, str);

 xmlTagMatcher = toCaseInsensitivePattern(xmlElementsToProcess);
 if (StringUtils.isNullOrEmpty(sentenceEndingTags)) {
  sentenceEndingTagMatcher = null;
 } else {
  sentenceEndingTagMatcher = toCaseInsensitivePattern(sentenceEndingTags);
dateTagMatcher = toCaseInsensitivePattern(dateTags);
this.allowFlawedXml = allowFlawed;
setSingleSentenceTagMatcher(singleSentenceTags);
setDocIdTagMatcher(docIdTags);
setDocTypeTagMatcher(docTypeTags);
setDiscourseTags(utteranceTurnTags, speakerTags);
setDocAnnotationPatterns(docAnnotations);
setTokenAnnotationPatterns(tokenAnnotations);
setSectionTagMatcher(sectionTags);
setSectionAnnotationPatterns(sectionAnnotations);
setQuoteTagMatcher(quoteTags);
setSsplitDiscardTokensMatcher(ssplitDiscardTokens);

Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null);
toAnnotate.removeAll(foundAnnotations);
 annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
 if (sectionStartToken != null) {
  sectionStartToken.set(CoreAnnotations.SectionStartAnnotation.class, sectionAnnotations);
annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations);
 currentSpeaker = tokensToString(annotation, speakerTokens);
 MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size());
 int i = 0;
String str = tokensToString(annotation, docIdTokens).trim();
annotation.set(CoreAnnotations.DocIDAnnotation.class, str);
String str = tokensToString(annotation, docDateTokens).trim();
annotation.set(CoreAnnotations.DocDateAnnotation.class, str);
String str = tokensToString(annotation, docTypeTokens).trim();
annotation.set(CoreAnnotations.DocTypeAnnotation.class, str);

/**
 * Replaces the {@code TokensAnnotation} of {@code annotation} with the output of
 * {@code process}. Does nothing when the annotation carries no token list.
 *
 * @param annotation the annotation to clean in place
 */
@Override
public void annotate(Annotation annotation) {
 final boolean hasTokens = annotation.containsKey(CoreAnnotations.TokensAnnotation.class);
 if (!hasTokens) {
  return;
 }
 final List<CoreLabel> before = annotation.get(CoreAnnotations.TokensAnnotation.class);
 if (DEBUG) {
  log.info("CleanXML: starting tokens: " + before);
 }
 final List<CoreLabel> after = process(annotation, before);
 // The old tokens are assumed to be unwanted once this annotator is used, so they
 // are dropped; token indexes are redone in case xml tokens have been removed.
 setTokenBeginTokenEnd(after);
 annotation.set(CoreAnnotations.TokensAnnotation.class, after);
 if (DEBUG) {
  log.info("CleanXML: ending tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
 }
}

/**
 * Single-argument entry point: forwards to the two-argument {@code process} with
 * {@code null} in the first position.
 *
 * @param tokens the tokens to process
 * @return the result of the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
 final List<CoreLabel> result = process(null, tokens);
 return result;
}

Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null);
toAnnotate.removeAll(foundAnnotations);
  annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations);
 currentSpeaker = tokensToString(annotation, speakerTokens);
 MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size());
 int i = 0;
String str = tokensToString(annotation, docIdTokens).trim();
annotation.set(CoreAnnotations.DocIDAnnotation.class, str);
String str = tokensToString(annotation, docDateTokens).trim();
annotation.set(CoreAnnotations.DocDateAnnotation.class, str);
String str = tokensToString(annotation, docTypeTokens).trim();
annotation.set(CoreAnnotations.DocTypeAnnotation.class, str);

/**
 * Discards the current token annotation patterns and re-populates them from
 * {@code conf} via {@code addAnnotationPatterns}.
 *
 * @param conf pattern configuration string, forwarded to {@code addAnnotationPatterns}
 */
public void setTokenAnnotationPatterns(String conf) {
 // Patterns can only be tag attributes
 final boolean attributesOnly = true;
 tokenAnnotationPatterns.clear();
 addAnnotationPatterns(tokenAnnotationPatterns, conf, attributesOnly);
}

/**
 * Creates an annotator for cleaning XML input.
 *
 * @param properties settings passed straight to the {@code CleanXmlAnnotator} constructor
 * @return the new {@code CleanXmlAnnotator} instance
 */
public CleanXmlAnnotator cleanXML(Properties properties) {
 final CleanXmlAnnotator xmlCleaner = new CleanXmlAnnotator(properties);
 return xmlCleaner;
}

/**
 * Shorthand for calling the two-argument {@code process} with a {@code null} first
 * argument.
 *
 * @param tokens the tokens to process
 * @return the list returned by the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
 final List<CoreLabel> output = process(null, tokens);
 return output;
}

/**
 * Discards the current section annotation patterns and re-populates them from
 * {@code conf} via {@code addAnnotationPatterns}.
 *
 * @param conf pattern configuration string, forwarded to {@code addAnnotationPatterns}
 */
public void setSectionAnnotationPatterns(String conf) {
 // NOTE(review): presumably the "tag attributes only" switch, disabled here -
 // confirm against the addAnnotationPatterns signature.
 final boolean attributesOnly = false;
 sectionAnnotationPatterns.clear();
 addAnnotationPatterns(sectionAnnotationPatterns, conf, attributesOnly);
}

 /**
  * Builds a {@code CleanXmlAnnotator} from the "clean.*" entries of {@code props}.
  * Each entry that is absent falls back to the matching {@code CleanXmlAnnotator}
  * default constant.
  *
  * @return the configured annotator
  */
 public Annotator create() {
  final String xmlTags = props.getProperty(
    "clean.xmltags", CleanXmlAnnotator.DEFAULT_XML_TAGS);
  final String sentenceEndingTags = props.getProperty(
    "clean.sentenceendingtags", CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS);
  // This entry is read without a default string; the constant applies when it is unset.
  final String allowFlawedString = props.getProperty("clean.allowflawedxml");
  final boolean allowFlawed = (allowFlawedString == null)
    ? CleanXmlAnnotator.DEFAULT_ALLOW_FLAWS
    : Boolean.parseBoolean(allowFlawedString);
  final String dateTags = props.getProperty(
    "clean.datetags", CleanXmlAnnotator.DEFAULT_DATE_TAGS);
  return new CleanXmlAnnotator(xmlTags, sentenceEndingTags, dateTags, allowFlawed);
 }
});

/**
 * Convenience overload: delegates to the two-argument {@code process}, passing
 * {@code tokens} first and {@code null} second.
 *
 * @param tokens the tokens to process
 * @return the list produced by the two-argument {@code process}
 */
public List<CoreLabel> process(List<CoreLabel> tokens) {
 final List<CoreLabel> processed = process(tokens, null);
 return processed;
}

/**
 * Clears the document annotation patterns, then rebuilds them from {@code conf}
 * by calling {@code addAnnotationPatterns}.
 *
 * @param conf pattern configuration string, forwarded to {@code addAnnotationPatterns}
 */
public void setDocAnnotationPatterns(String conf) {
 // Patterns can only be tag attributes
 final boolean tagAttributesOnly = true;
 docAnnotationPatterns.clear();
 addAnnotationPatterns(docAnnotationPatterns, conf, tagAttributesOnly);
}

/**
 * Replaces the tokens of {@code annotation} with the output of {@code process}.
 * If the date-token list handed to {@code process} comes back non-empty, the words
 * of those tokens are joined with single spaces and stored under
 * {@code DocDateAnnotation}. An annotation without {@code TokensAnnotation} is left
 * untouched.
 *
 * @param annotation the annotation to clean in place
 */
public void annotate(Annotation annotation) {
 if (!annotation.has(TokensAnnotation.class)) {
  return;
 }
 List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
 // Handed to process(); checked afterwards for any date tokens.
 List<CoreLabel> dateTokens = new ArrayList<CoreLabel>();
 List<CoreLabel> newTokens = process(tokens, dateTokens);
 // We assume that if someone is using this annotator, they don't
 // want the old tokens any more and get rid of them
 annotation.set(TokensAnnotation.class, newTokens);
 // if the doc date was found, save it. it is used by SUTime (inside the "ner" annotator)
 if (!dateTokens.isEmpty()) {
  // Method-local buffer, never shared: StringBuilder avoids StringBuffer's synchronization.
  StringBuilder docDate = new StringBuilder();
  boolean first = true;
  for (CoreLabel t : dateTokens) {
   if (!first) {
    docDate.append(" ");
   }
   docDate.append(t.word());
   first = false;
  }
  annotation.set(DocDateAnnotation.class, docDate.toString());
 }
}

Javadoc

An annotator which removes all XML tags (as identified by the tokenizer) and possibly selectively keeps the text between them. Can also add sentence-ending markers depending on the XML tag. Note that the removal of tags is done by a finite state tokenizer. Thus, this works for simple, typical XML, or equally for similar SGML or XML tags, but will not work on arbitrarily complicated XML.

Most used methods

<init>
process
addAnnotationPatterns
annotateWithTag
Updates a CoreMap with attributes (or text context) from a tag.
setDiscourseTags
setDocAnnotationPatterns
setDocIdTagMatcher
setDocTypeTagMatcher
setSectionAnnotationPatterns
setSectionTagMatcher
setSingleSentenceTagMatcher
setSsplitDiscardTokensMatcher

Popular in Java

Running tasks concurrently on multiple threads
setRequestProperty (URLConnection)
startActivity (Activity)
notifyDataSetChanged (ArrayAdapter)
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal. A value is represented by an arbitrary-precision "un
Selector (java.nio.channels)
A controller for the selection of SelectableChannel objects. Selectable channels can be registered w
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
Collectors (java.util.stream)
BasicDataSource (org.apache.commons.dbcp)
Basic implementation of javax.sql.DataSource that is configured via JavaBeans properties. This is no
Best plugins for Eclipse

How to use CleanXmlAnnotator in edu.stanford.nlp.pipeline

Best Java code snippets using edu.stanford.nlp.pipeline.CleanXmlAnnotator (Showing top 20 results out of 315)

How to use CleanXmlAnnotator in edu.stanford.nlp.pipeline