/**
 * Checks whether the annotations already present in the CAS indicate good-quality text.
 * <p>
 * The text is accepted when it holds at least {@code minTokQty} {@link Token} annotations,
 * at least one {@link POS} annotation whose value starts with {@code "NN"} and at least one
 * whose value starts with {@code "VB"}. POS annotations without a value are ignored.
 * This method only reads annotations; it does not add any.
 *
 * @param jcas An input CAS whose POS and Token annotations are inspected.
 * @param minTokQty The minimum number of tokens present to be considered good.
 * @return true if the CAS contains a high-quality text.
 * @throws AnalysisEngineProcessException declared in the signature; nothing in this body
 *         throws it explicitly.
 */
public static boolean checkCAS(JCas jcas, int minTokQty) throws AnalysisEngineProcessException {
    boolean hasNoun = false;
    boolean hasVerb = false;

    for (POS p : JCasUtil.select(jcas, POS.class)) {
        String tag = p.getPosValue();
        if (tag == null) {
            // A POS annotation without a value can be neither a noun nor a verb.
            continue;
        }
        if (tag.startsWith("NN")) {
            hasNoun = true;
        }
        if (tag.startsWith("VB")) {
            hasVerb = true;
        }
        if (hasNoun && hasVerb) {
            // Both requirements are met; the remaining tags cannot change the result.
            break;
        }
    }

    Collection<Token> toks = JCasUtil.select(jcas, Token.class);
    return toks.size() >= minTokQty && hasNoun && hasVerb;
}
}
/**
 * Convenience accessor for the value of the {@link POS} annotation linked to this token.
 *
 * @return the POS value, or {@code null} if no {@link POS} annotation is linked to this token.
 */
public String getPosValue() {
    POS linked = getPos();
    if (linked == null) {
        return null;
    }
    return linked.getPosValue();
}
/**
 * Derives the outcome for a target from the first {@link POS} annotation it covers.
 * <p>
 * When {@code useCoarseGrained} is set, the outcome is the simple class name of that
 * annotation; otherwise it is the annotation's POS value.
 *
 * @param jcas the CAS holding the annotations
 * @param target the target whose covered POS annotations are looked up
 * @return the outcome string for the target
 * @throws IndexOutOfBoundsException if the target covers no POS annotation
 */
public String getTextClassificationOutcome(JCas jcas, TextClassificationTarget target) {
    POS firstPos = JCasUtil.selectCovered(jcas, POS.class, target).get(0);
    return useCoarseGrained
            ? firstPos.getClass().getSimpleName()
            : firstPos.getPosValue();
}
/**
 * Uses the value of the first {@link POS} annotation covered by the target as its outcome.
 *
 * @param jcas the CAS holding the annotations
 * @param aTarget the target whose covered POS annotations are looked up
 * @return the POS value of the first covered POS annotation
 * @throws IndexOutOfBoundsException if the target covers no POS annotation
 */
public String getTextClassificationOutcome(JCas jcas, TextClassificationTarget aTarget) {
    List<POS> coveredTags = JCasUtil.selectCovered(jcas, POS.class, aTarget);
    POS leading = coveredTags.get(0);
    return leading.getPosValue();
}
private Set<List<String>> getPosNGrams(List<POS> pos) { Set<List<String>> ngrams = new HashSet<List<String>>(); for (int i = 0; i < pos.size() - (n - 1); i++) { // Generate n-gram at index i List<String> ngram = new ArrayList<String>(); for (int k = 0; k < n; k++) { String token = pos.get(i + k).getPosValue(); ngram.add(token); } // Add ngrams.add(ngram); } return ngrams; }
/**
 * Returns the value of the {@link POS} annotation linked to this token, if any.
 *
 * @return the POS value, or {@code null} when this token has no linked {@link POS} annotation.
 */
public String getPosValue() {
    String value = null;
    POS pos = getPos();
    if (pos != null) {
        value = pos.getPosValue();
    }
    return value;
}
/**
 * Uses the value of the first {@link POS} annotation covered by the unit as its outcome,
 * with every space character replaced by an underscore.
 *
 * @param jcas the CAS holding the annotations
 * @param unit the target whose covered POS annotations are looked up
 * @return the POS value of the first covered POS annotation, spaces turned into {@code _}
 * @throws IndexOutOfBoundsException if the unit covers no POS annotation
 */
public String getTextClassificationOutcome(JCas jcas, TextClassificationTarget unit) {
    List<POS> coveredTags = JCasUtil.selectCovered(jcas, POS.class, unit);
    String rawTag = coveredTags.get(0).getPosValue();
    return rawTag.replace(' ', '_');
}
private Set<List<String>> getPosNGrams(List<POS> pos) { Set<List<String>> ngrams = new HashSet<List<String>>(); for (int i = 0; i < pos.size() - (n - 1); i++) { // Generate n-gram at index i List<String> ngram = new ArrayList<String>(); for (int k = 0; k < n; k++) { String token = pos.get(i + k).getPosValue(); ngram.add(token); } // Add ngrams.add(ngram); } return ngrams; }
private Set<List<String>> getPosNGrams(List<POS> pos) { Set<List<String>> ngrams = new HashSet<List<String>>(); for (int i = 0; i < pos.size() - (n - 1); i++) { // Generate n-gram at index i List<String> ngram = new ArrayList<String>(); for (int k = 0; k < n; k++) { String token = pos.get(i + k).getPosValue(); ngram.add(token); } // Add ngrams.add(ngram); } return ngrams; }
/**
 * Counts POS n-grams over a document, sentence by sentence.
 * <p>
 * For each {@link Sentence}, the covered {@link POS} annotations are turned into labels and
 * every n-gram produced by {@link NGramStringListIterable} for the range
 * {@code minN}..{@code maxN} is joined with {@code NGRAM_GLUE} and counted.
 *
 * @param jcas the CAS holding Sentence and POS annotations
 * @param minN lower n-gram size bound passed to the n-gram iterable
 * @param maxN upper n-gram size bound passed to the n-gram iterable
 * @param useCanonical if true, label each POS by its simple class name instead of its POS value
 * @return the frequency distribution of joined POS n-grams
 */
public static FrequencyDistribution<String> getDocumentPosNgrams(JCas jcas, int minN, int maxN,
        boolean useCanonical) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();

    for (Sentence sentence : select(jcas, Sentence.class)) {
        List<POS> sentenceTags = JCasUtil.selectCovered(jcas, POS.class, sentence);

        // One label per POS annotation, in the order selectCovered returned them.
        String[] labels = new String[sentenceTags.size()];
        int slot = 0;
        for (POS tag : sentenceTags) {
            labels[slot++] = useCanonical ? tag.getClass().getSimpleName() : tag.getPosValue();
        }

        for (List<String> ngram : new NGramStringListIterable(labels, minN, maxN)) {
            counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return counts;
}
/**
 * Counts POS n-grams over the {@link POS} annotations covered by a single annotation.
 * <p>
 * Each n-gram produced by {@link NGramStringListIterable} for the range
 * {@code minN}..{@code maxN} is joined with {@code NGRAM_GLUE} and counted.
 *
 * @param jcas the CAS holding the POS annotations
 * @param focus the annotation whose covered POS annotations are used
 * @param useCanonical if true, label each POS by its simple class name instead of its POS value
 * @param minN lower n-gram size bound passed to the n-gram iterable
 * @param maxN upper n-gram size bound passed to the n-gram iterable
 * @return the frequency distribution of joined POS n-grams
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus,
        boolean useCanonical, int minN, int maxN) {
    List<String> labels = new ArrayList<String>();
    for (POS tag : selectCovered(jcas, POS.class, focus)) {
        String label;
        if (!useCanonical) {
            label = tag.getPosValue();
        } else {
            label = tag.getClass().getSimpleName();
        }
        labels.add(label);
    }
    String[] labelArray = labels.toArray(new String[0]);

    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (List<String> ngram : new NGramStringListIterable(labelArray, minN, maxN)) {
        String joined = StringUtils.join(ngram, NGRAM_GLUE);
        counts.inc(joined);
    }
    return counts;
}
/**
 * Counts how often each lemma/POS combination occurs among the tokens of a CAS.
 * <p>
 * The map key is the lemma value (with every {@code #} escaped as {@code \#}), followed by
 * {@code " ### "} and the token's POS value.
 *
 * @param text the CAS whose Token annotations are counted
 * @return a map from lemma/POS key to its number of occurrences
 * @throws NullPointerException if a token has no lemma, no lemma value, or no POS annotation
 */
protected Map<String, Integer> countTokenPoses(JCas text) {
    Map<String, Integer> counts = new HashMap<String, Integer>();

    for (Iterator<Annotation> it = text.getAnnotationIndex(Token.type).iterator(); it.hasNext();) {
        Token token = (Token) it.next();
        String escapedLemma = token.getLemma().getValue().replace("#", "\\#");
        String key = escapedLemma + " ### " + token.getPos().getPosValue();

        // Null values are never stored, so null here means the key has not been seen yet.
        Integer previous = counts.get(key);
        counts.put(key, previous == null ? 1 : previous + 1);
    }
    return counts;
}
/**
 * Collects the lower-cased lemma and the POS value of every {@link Token} in the CAS and
 * delegates to {@code getSubstitutions(tokens, postags)}.
 * <p>
 * A token is skipped, with a message on {@code System.err}, when it has no lemma, no lemma
 * value, or no POS annotation. Both lists are only appended to together, so entry
 * {@code i} of the lemma list always belongs to entry {@code i} of the POS list.
 * A POS annotation without a value contributes {@code null} to the POS list.
 *
 * @param jcas the CAS whose tokens are read
 * @return the result of {@code getSubstitutions(tokens, postags)} for the collected lists
 */
public List<String> getSubstitutions(JCas jcas) {
    List<String> tokens = new ArrayList<String>();
    List<String> postags = new ArrayList<String>();

    for (Token t : JCasUtil.select(jcas, Token.class)) {
        String lemma = t.getLemma() != null ? t.getLemma().getValue() : null;
        if (lemma == null) {
            System.err.println("Couldn't read lemma value for token \"" + t.getCoveredText() + "\"");
            continue;
        }
        if (t.getPos() == null) {
            // Skip the token entirely; adding only the lemma would misalign the two lists.
            System.err.println("Couldn't read POS for token \"" + t.getCoveredText() + "\"");
            continue;
        }
        tokens.add(lemma.toLowerCase());
        postags.add(t.getPos().getPosValue());
    }

    return getSubstitutions(tokens, postags);
}
/**
 * Collects the lower-cased lemma and the POS value of every {@link Token} covered by the
 * given annotation and delegates to {@code getSubstitutions(tokens, postags)}.
 * <p>
 * A token is skipped, with a message on {@code System.err}, when it has no lemma, no lemma
 * value, or no POS annotation. Both lists are only appended to together, so entry
 * {@code i} of the lemma list always belongs to entry {@code i} of the POS list.
 * A POS annotation without a value contributes {@code null} to the POS list.
 *
 * @param jcas the CAS whose tokens are read
 * @param coveringAnnotation the annotation restricting which tokens are considered
 * @return the result of {@code getSubstitutions(tokens, postags)} for the collected lists
 */
public List<String> getSubstitutions(JCas jcas, Annotation coveringAnnotation) {
    List<String> tokens = new ArrayList<String>();
    List<String> postags = new ArrayList<String>();

    for (Token t : JCasUtil.selectCovered(jcas, Token.class, coveringAnnotation)) {
        String lemma = t.getLemma() != null ? t.getLemma().getValue() : null;
        if (lemma == null) {
            System.err.println("Couldn't read lemma value for token \"" + t.getCoveredText() + "\"");
            continue;
        }
        if (t.getPos() == null) {
            // Skip the token entirely; adding only the lemma would misalign the two lists.
            System.err.println("Couldn't read POS for token \"" + t.getCoveredText() + "\"");
            continue;
        }
        tokens.add(lemma.toLowerCase());
        postags.add(t.getPos().getPosValue());
    }

    return getSubstitutions(tokens, postags);
}
/**
 * Tests whether a feature structure carries the given POS value.
 * <p>
 * A {@link POS} is checked directly; for a {@link Token}, its linked POS annotation is
 * checked. Any other feature structure, a token without a POS annotation, or a POS
 * annotation without a value yields {@code false}.
 *
 * @param fs the feature structure to inspect
 * @param posValue the POS value to compare against
 * @return {@code true} if the POS value found for {@code fs} equals {@code posValue}
 */
@SuppressWarnings("unused")
private boolean hasPos(FeatureStructure fs, String posValue) {
    POS pos = null;
    if (fs instanceof POS) {
        pos = (POS) fs;
    } else if (fs instanceof Token) {
        pos = ((Token) fs).getPos();
    }
    if (pos == null) {
        return false;
    }

    String actual = pos.getPosValue();
    return actual != null && actual.equals(posValue);
}
/**
 * Reports tokens that lack a POS annotation or whose POS annotation has no value.
 * <p>
 * All "no POS" messages are appended first, followed by all "no POS value" messages.
 *
 * @param aJCas the CAS whose tokens are inspected
 * @param aMessages the list that receives one ERROR message per offending token
 * @return {@code true} if {@code aMessages} contains at least one ERROR-level message
 *         after the check, including messages that were already in the list
 */
@Override
public boolean check(JCas aJCas, List<Message> aMessages) {
    // First pass: tokens with no POS annotation at all.
    for (Token token : select(aJCas, Token.class)) {
        if (token.getPos() == null) {
            String text = String.format("Token has no POS: %s [%d..%d]",
                    token.getType().getName(), token.getBegin(), token.getEnd());
            aMessages.add(new Message(this, ERROR, text));
        }
    }

    // Second pass: tokens whose POS annotation exists but carries no value.
    for (Token token : select(aJCas, Token.class)) {
        if (token.getPos() != null && token.getPos().getPosValue() == null) {
            String text = String.format("Token has no POS value: %s [%d..%d]",
                    token.getType().getName(), token.getBegin(), token.getEnd());
            aMessages.add(new Message(this, ERROR, text));
        }
    }

    for (Message message : aMessages) {
        if (message.level == ERROR) {
            return true;
        }
    }
    return false;
}
@Override public POSSample produce(JCas aJCas) { // Process present sentences Sentence sentence = sentences.next(); // Block on next call to read if (!sentences.hasNext()) { documentComplete(); } List<String> words = new ArrayList<>(); List<String> tags = new ArrayList<>(); for (Token t : selectCovered(Token.class, sentence)) { words.add(t.getText()); if (t.getPos() == null) { throw new IllegalStateException("Token [" + t.getText() + "] has no POS"); } tags.add(t.getPos().getPosValue()); } return new POSSample(words, tags); } }
/**
 * Indexes the dependency annotations of a CAS as a map from dependent to governor.
 * <p>
 * Each token is described as {@code begin + " ### " + lemma + " ### " + POS value}, with
 * every {@code #} in the lemma escaped as {@code \#}. The key is the description of the
 * dependent; the value is the dependency type, then {@code " ## "}, then the description
 * of the governor.
 *
 * @param text the CAS whose Dependency annotations are indexed
 * @return the map from dependent description to "type ## governor description"
 * @throws NullPointerException if a dependent or governor has no lemma, no lemma value,
 *         or no POS annotation
 */
protected Map<String, String> indexLemmaDepTree(JCas text) {
    Map<String, String> index = new HashMap<String, String>();
    for (Dependency dep : JCasUtil.select(text, Dependency.class)) {
        Token dependent = dep.getDependent();
        Token governor = dep.getGovernor();

        String key = describeToken(dependent);
        String value = dep.getDependencyType() + " ## " + describeToken(governor);
        index.put(key, value);
    }
    return index;
}

/** Renders a token as {@code begin ### escaped-lemma ### POS value}. */
private static String describeToken(Token token) {
    String escapedLemma = token.getLemma().getValue().replace("#", "\\#");
    return token.getBegin() + " ### " + escapedLemma + " ### " + token.getPos().getPosValue();
}
}
private static NodeInfo buildNodeInfo(JCas jcas, Token tokenAnno, int serial) throws CasTreeConverterException, UnsupportedPosTagStringException { String word = tokenAnno.getCoveredText(); String lemma = tokenAnno.getLemma().getValue(); String pos = tokenAnno.getPos().getPosValue(); // We rely on the fact the NamedEntity enum values have the same names as the ones // specified in the DKPro mapping (e.g. PERSON, ORGANIZATION) eu.excitementproject.eop.common.representation.parse.representation.basic.NamedEntity namedEntity=null; List<NamedEntity> namedEntities = JCasUtil.selectCovered(NamedEntity.class, tokenAnno); switch (namedEntities.size()) { case 0: break; // if no NER - ignore and move on case 1: namedEntity = eu.excitementproject.eop.common.representation.parse.representation.basic.NamedEntity.valueOf(namedEntities.get(0).getValue()); break; default: throw new CasTreeConverterException(String.format("Got %d NamedEntity annotations for token %s", namedEntities.size(), tokenAnno)); } return new DefaultNodeInfo(word, lemma, serial, namedEntity, new DefaultSyntacticInfo(new PennPartOfSpeech(pos))); }
/**
 * Converts a {@link Token} into a {@link CoreLabel}.
 * <p>
 * The label receives the token's covered text as original text, the token's text as word,
 * and its begin/end offsets. The lemma is the token's lemma value when a lemma annotation
 * exists, otherwise the token's text. The tag is only set when the token has a POS
 * annotation.
 *
 * @param aToken the token to convert
 * @return the populated {@link CoreLabel}
 */
public static CoreLabel tokenToWord(Token aToken) {
    CoreLabel label = new CoreLabel();

    label.setOriginalText(aToken.getCoveredText());
    label.setWord(aToken.getText());
    label.setBeginPosition(aToken.getBegin());
    label.setEndPosition(aToken.getEnd());

    // Fall back to the token text when no lemma annotation is attached.
    String lemma = aToken.getLemma() != null ? aToken.getLemma().getValue() : aToken.getText();
    label.setLemma(lemma);

    if (aToken.getPos() != null) {
        String tag = aToken.getPos().getPosValue();
        label.setTag(tag);
    }

    return label;
}