/**
 * Attaches a part-of-speech annotation to every token, built from the prediction that sits at
 * the same list position as the token.
 *
 * @param aJCas
 *            the CAS from which tokens and predictions are read and in which the POS
 *            annotations are created
 */
private void annotateTaggingResultsLinkToTokens(JCas aJCas) {
    List<Token> tokenList = getTokens(aJCas);
    List<TextClassificationOutcome> predictions = getPredictions(aJCas);

    int index = 0;
    while (index < tokenList.size()) {
        Token current = tokenList.get(index);
        String label = predictions.get(index).getOutcome();

        // The POS annotation spans exactly the token it is attached to.
        current.setPos(createPartOfSpeechAnnotationFromOutcome(aJCas, current.getBegin(),
                current.getEnd(), label));
        index++;
    }
}
/**
 * Renders a token as its covered text followed by its begin and end offsets in square brackets.
 *
 * @param t
 *            the token to render
 * @return a string of the form {@code text[begin:end]}
 */
private static String outToken(Token t) {
    Object text = t.getCoveredText();
    Object begin = t.getBegin();
    Object end = t.getEnd();
    return String.format("%s[%s:%s]", text, begin, end);
}
/**
 * Converts a token into a {@code CoreLabel}.
 * <p>
 * The label receives the covered text as original text, the token text as word, and the token's
 * offsets. The lemma is the token's lemma value if a lemma is attached, otherwise the token
 * text. The tag is only set when the token has a POS attached.
 *
 * @param aToken
 *            the token to convert
 * @return the newly created label
 */
public static CoreLabel tokenToWord(Token aToken) {
    CoreLabel label = new CoreLabel();
    label.setOriginalText(aToken.getCoveredText());
    label.setWord(aToken.getText());
    label.setBeginPosition(aToken.getBegin());
    label.setEndPosition(aToken.getEnd());

    // Fall back to the token text when no lemma annotation is attached.
    String lemmaText = aToken.getLemma() == null
            ? aToken.getText()
            : aToken.getLemma().getValue();
    label.setLemma(lemmaText);

    if (aToken.getPos() == null) {
        return label;
    }
    label.setTag(aToken.getPos().getPosValue());
    return label;
}
/**
 * Returns the text of this token. If a form value is available (see {@link TokenForm}), it is
 * used; otherwise the covered text is returned.
 *
 * @return the form value if it is not {@code null}, else the covered text.
 */
public String getText() {
    String formValue = getFormValue();
    if (formValue == null) {
        return getCoveredText();
    }
    return formValue;
}
/**
 * Creates a token over the given span and adds it to the CAS indexes.
 *
 * @param aJCas
 *            the CAS in which the token is created
 * @param begin
 *            begin offset of the token
 * @param end
 *            end offset of the token
 */
protected void setToken(JCas aJCas, int begin, int end) {
    new Token(aJCas, begin, end).addToIndexes();
}
/**
 * Looks up the token whose begin and end offsets exactly match the requested span. The first
 * match in list order wins.
 *
 * @param aTokens
 *            the candidate tokens (e.g. those of a sentence)
 * @param aBegin
 *            the required begin offset
 * @param aEnd
 *            the required end offset
 * @return the matching token
 * @throws IllegalStateException
 *             if no token in the list has exactly these offsets
 */
private Token getToken(List<Token> aTokens, int aBegin, int aEnd) {
    for (Token candidate : aTokens) {
        if (aBegin != candidate.getBegin()) {
            continue;
        }
        if (aEnd != candidate.getEnd()) {
            continue;
        }
        return candidate;
    }
    throw new IllegalStateException("Token not found");
}
private void annotateTokenWithTag(JCas aJCas, String aToken, String aTag, int aCurrPosInText) { if (readToken) { // Token Token token = new Token(aJCas, aCurrPosInText, aToken.length() + aCurrPosInText); token.addToIndexes(); if (readPOS) { // Tag Type posTag = posMappingProvider.getTagType(aTag); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); pos.setPosValue(aTag); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); // Set the POS for the Token token.setPos(pos); } } } }
// Excerpt from the interior of a method whose header is not visible here.
// First part: store the text and POS value of token t at position i of toks/tags, then advance i.
// Second part: create a Lemma annotation covering t with the value lemmas[n], add it to the
// indexes, attach it to t, then advance n.
// NOTE(review): i and n are advanced independently; presumably these two parts belong to
// different loops in the enclosing code - confirm against the full method.
toks[i] = t.getText(); tags[i] = t.getPosValue(); i++; Lemma lemmaAnno = new Lemma(aJCas, t.getBegin(), t.getEnd()); lemmaAnno.setValue(lemmas[n]); lemmaAnno.addToIndexes(); t.setLemma(lemmaAnno); n++;
private void convertLemma(JCas aJCas, TextCorpus aCorpusData, Map<String, Token> aTokens) { if (aCorpusData.getLemmasLayer() == null) { return; } for (int i = 0; i < aCorpusData.getLemmasLayer().size(); i++) { eu.clarin.weblicht.wlfxb.tc.api.Token[] lemmaTokens = aCorpusData.getLemmasLayer() .getTokens(aCorpusData.getLemmasLayer().getLemma(i)); String value = aCorpusData.getLemmasLayer().getLemma(i).getString(); Lemma outLemma = new Lemma(aJCas); outLemma.setBegin(aTokens.get(lemmaTokens[0].getID()).getBegin()); outLemma.setEnd(aTokens.get(lemmaTokens[0].getID()).getEnd()); outLemma.setValue(value); outLemma.addToIndexes(); // Set the lemma to the token aTokens.get(lemmaTokens[0].getID()).setLemma(outLemma); } }
/**
 * Re-creates all paragraph and token annotations of the source CAS in the target CAS, using the
 * same offsets. Only the spans are copied.
 *
 * @param source
 *            the CAS to read the annotations from
 * @param target
 *            the CAS to create the annotations in
 * @throws IllegalArgumentException
 *             if the two CASes do not have the same document text
 */
private static void copyParagraphAndTokenAnnotations(JCas source, JCas target) {
    boolean sameText = source.getDocumentText().equals(target.getDocumentText());
    if (!sameText) {
        throw new IllegalArgumentException("Source and target have different content");
    }

    for (Paragraph original : JCasUtil.select(source, Paragraph.class)) {
        Paragraph copy = new Paragraph(target);
        copy.setBegin(original.getBegin());
        copy.setEnd(original.getEnd());
        copy.addToIndexes();
    }

    for (Token original : JCasUtil.select(source, Token.class)) {
        Token copy = new Token(target);
        copy.setBegin(original.getBegin());
        copy.setEnd(original.getEnd());
        copy.addToIndexes();
    }
}
/**
 * Reports every token that has no POS attached and every token whose POS has no value.
 * <p>
 * All "no POS" messages are added before any "no POS value" message.
 *
 * @param aJCas
 *            the CAS whose tokens are inspected
 * @param aMessages
 *            the list that receives one error message per offending token
 * @return {@code true} if the message list contains at least one message of level ERROR after
 *         the check (including messages that were already in the list)
 */
@Override
public boolean check(JCas aJCas, List<Message> aMessages) {
    // First pass: tokens that carry no POS annotation at all.
    for (Token token : select(aJCas, Token.class)) {
        if (token.getPos() != null) {
            continue;
        }
        String text = String.format("Token has no POS: %s [%d..%d]",
                token.getType().getName(), token.getBegin(), token.getEnd());
        aMessages.add(new Message(this, ERROR, text));
    }

    // Second pass: tokens whose POS annotation exists but has no value.
    for (Token token : select(aJCas, Token.class)) {
        if (token.getPos() == null || token.getPos().getPosValue() != null) {
            continue;
        }
        String text = String.format("Token has no POS value: %s [%d..%d]",
                token.getType().getName(), token.getBegin(), token.getEnd());
        aMessages.add(new Message(this, ERROR, text));
    }

    for (Message message : aMessages) {
        if (message.level == ERROR) {
            return true;
        }
    }
    return false;
}
@Override public void token(Token aToken, String aPos, String aLemma) { synchronized (cas) { // Add the Part of Speech if (writePos && aPos != null) { Type posTag = posMappingProvider.getTagType(aPos); POS posAnno = (POS) cas.createAnnotation(posTag, aToken.getBegin(), aToken.getEnd()); posAnno.setPosValue(aPos.intern()); POSUtils.assignCoarseValue(posAnno); aToken.setPos(posAnno); pos[count.get()] = posAnno; } // Add the lemma if (writeLemma && aLemma != null) { Lemma lemmaAnno = new Lemma(aJCas, aToken.getBegin(), aToken.getEnd()); lemmaAnno.setValue(aLemma.intern()); aToken.setLemma(lemmaAnno); lemma[count.get()] = lemmaAnno; } count.getAndIncrement(); } } });
// Excerpt from the interior of a method whose header is not visible here.
// Creates four annotations over the span of t1 - a Lemma ("lemma1"), a MorphologicalFeatures
// (value "morph", tense "tense1"), a POS ("pos1") and a Stem ("stem1") - adds each of them to
// the indexes and attaches each to t1 via the corresponding setter.
Lemma l1 = new Lemma(jcas, t1.getBegin(), t1.getEnd()); l1.setValue("lemma1"); l1.addToIndexes(); t1.setLemma(l1); MorphologicalFeatures m1 = new MorphologicalFeatures(jcas, t1.getBegin(), t1.getEnd()); m1.setValue("morph"); m1.setTense("tense1"); m1.addToIndexes(); t1.setMorph(m1); POS p1 = new POS(jcas, t1.getBegin(), t1.getEnd()); p1.setPosValue("pos1"); p1.addToIndexes(); t1.setPos(p1); Stem s1 = new Stem(jcas, t1.getBegin(), t1.getEnd()); s1.setValue("stem1"); s1.addToIndexes(); t1.setStem(s1);
/**
 * Collects the lower-cased lemma and the POS value of every token in the CAS and delegates to
 * {@code getSubstitutions(List, List)}.
 * <p>
 * A token is skipped entirely (with a note on {@code System.err}) if it has no lemma, no lemma
 * value or no POS. The two lists passed on therefore always have the same length and stay
 * aligned position by position.
 *
 * @param jcas
 *            the CAS whose tokens are read
 * @return the result of {@code getSubstitutions(tokens, postags)}
 */
public List<String> getSubstitutions(JCas jcas) {
    List<String> tokens = new ArrayList<String>();
    List<String> postags = new ArrayList<String>();

    for (Token t : JCasUtil.select(jcas, Token.class)) {
        // Check explicitly instead of catching NullPointerException: with the catch-based
        // approach a token with a lemma but without a POS was added to "tokens" only, which
        // shifted every later entry of the two lists against each other.
        if (t.getLemma() == null || t.getLemma().getValue() == null) {
            System.err.println("Couldn't read lemma value for token \"" + t.getCoveredText()
                    + "\"");
            continue;
        }
        if (t.getPos() == null) {
            System.err.println("Couldn't read POS value for token \"" + t.getCoveredText()
                    + "\"");
            continue;
        }

        tokens.add(t.getLemma().getValue().toLowerCase());
        postags.add(t.getPos().getPosValue());
    }

    return getSubstitutions(tokens, postags);
}
/**
 * Creates a token for every split of every compound in the CAS.
 * <p>
 * For each compound, the first token covered by the compound's span is looked up. Each split
 * returned by {@code getSplitsWithoutMorpheme(compoundSplitLevel)} then yields a new indexed
 * token over the split's span which carries the POS of that first covered token.
 *
 * @param aJCas
 *            the CAS to process
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method
 */
@Override
public void process(final JCas aJCas) throws AnalysisEngineProcessException {
    for (Compound compound : JCasUtil.select(aJCas, Compound.class)) {
        final Token coveredToken = JCasUtil
                .selectCovered(aJCas, Token.class, compound.getBegin(), compound.getEnd())
                .get(0);

        for (Split part : compound.getSplitsWithoutMorpheme(compoundSplitLevel)) {
            Token partToken = new Token(aJCas);
            partToken.setBegin(part.getBegin());
            partToken.setEnd(part.getEnd());
            // Each part inherits the POS of the token covered by the whole compound.
            partToken.setPos(coveredToken.getPos());
            partToken.addToIndexes();
        }
    }
}
protected Map<String, String> indexDepTree(JCas text) { Map<String, String> depTree = new HashMap<String, String>(); // format: key: 1 ### word ### pos; value: dep_rel ## 2 ### word ### pos // escape: .replace("#", "\\#") // depTree.put("1 ### The ### Det", "DET ## 2 ### dog ### N"); // depTree.put("2 ### dog ### N", "SUBJ ## 3 ### chases ### V"); // depTree.put("3 ### chases ### V", "ROOT ## 0 ### NULL ### NULL"); // depTree.put("4 ### The ### Det", "DET ## 5 ### cat ### N"); // depTree.put("5 ### cat ### N", "OBJ ## 3 ### chases ### V"); for (Dependency dep : JCasUtil.select(text, Dependency.class)) { Token child = dep.getDependent(); Token parent = dep.getGovernor(); depTree.put(child.getBegin() + " ### " + child.getCoveredText().replace("#", "\\#") + " ### " + child.getPos().getPosValue(), dep.getDependencyType() + " ## " + parent.getBegin() + " ### " + parent.getCoveredText().replace("#", "\\#") + " ### " + parent.getPos().getPosValue()); } return depTree; }
/**
 * Removes every token whose covered text is not contained in {@code vocab} from the CAS
 * indexes, together with the POS annotation attached to it (if any), and counts the removals
 * in {@code droppedVocabulary}.
 * <p>
 * Does nothing if {@code embedding} is {@code null}.
 *
 * @param aJCas
 *            the CAS to filter
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    if (embedding == null) {
        return;
    }

    // Iterate over a snapshot: the collection returned by select() is backed by the CAS
    // index, and removing annotations from the index while iterating over it can fail with a
    // ConcurrentModificationException or skip elements.
    Collection<Token> select = new java.util.ArrayList<Token>(
            JCasUtil.select(aJCas, Token.class));

    for (Token t : select) {
        if (vocab.contains(t.getCoveredText())) {
            continue;
        }

        POS pos = t.getPos();
        if (pos != null) {
            pos.removeFromIndexes();
            t.setPos(null);
        }
        t.removeFromIndexes();
        droppedVocabulary++;
    }
}
/**
 * Builds a map describing the dependency relations found in the CAS, based on the lemma values
 * of the tokens.
 * <p>
 * One entry is written per dependency. Any {@code #} in a lemma value is escaped as
 * {@code \#}.
 * <ul>
 * <li>key: {@code <dependent begin> ### <dependent lemma> ### <dependent pos>}</li>
 * <li>value: {@code <dependency type> ## <governor begin> ### <governor lemma> ###
 * <governor pos>}</li>
 * </ul>
 *
 * @param text
 *            the CAS whose dependencies are read
 * @return the map from dependent description to relation and governor description
 */
protected Map<String, String> indexLemmaDepTree(JCas text) {
    Map<String, String> depTree = new HashMap<String, String>();

    for (Dependency dep : JCasUtil.select(text, Dependency.class)) {
        Token child = dep.getDependent();
        Token parent = dep.getGovernor();

        String key = child.getBegin() + " ### "
                + child.getLemma().getValue().replace("#", "\\#") + " ### "
                + child.getPos().getPosValue();

        String value = dep.getDependencyType() + " ## " + parent.getBegin() + " ### "
                + parent.getLemma().getValue().replace("#", "\\#") + " ### "
                + parent.getPos().getPosValue();

        depTree.put(key, value);
    }

    return depTree;
}
}
/**
 * Returns the covered text of the token as its base form.
 *
 * @param token
 *            the token to read
 * @return the token's covered text
 */
@Override
public String getTokenBaseForm(Token token) {
    String baseForm = token.getCoveredText();
    return baseForm;
}
// Excerpt from the interior of a method whose header is not visible here.
// Creates a token over [begin, end) in jcas1 and adds it to the indexes, then creates a Lemma
// starting at the same begin offset and attaches it to the token.
// NOTE(review): within this excerpt the lemma's end offset is never set and the lemma is not
// added to the indexes - confirm whether the code following this excerpt does so.
Token tokenAnnot = new Token(jcas1); tokenAnnot.setBegin(begin); tokenAnnot.setEnd(end); tokenAnnot.addToIndexes(); Lemma lemmaAnnot = new Lemma(jcas1); lemmaAnnot.setBegin(begin); tokenAnnot.setLemma(lemmaAnnot);