/**
 * Returns the value of the lemma linked to this token.
 *
 * @return the lemma value if there is a {@link Lemma} annotation linked to this token,
 *         {@code null} otherwise.
 */
public String getLemmaValue()
{
    Lemma linkedLemma = getLemma();
    if (linkedLemma == null) {
        // No lemma annotation is attached to this token.
        return null;
    }
    return linkedLemma.getValue();
}
/**
 * Returns the value of the lemma linked to this token.
 *
 * @return the lemma value if there is a {@link Lemma} annotation linked to this token,
 *         {@code null} otherwise.
 */
public String getLemmaValue()
{
    Lemma linkedLemma = getLemma();
    if (linkedLemma == null) {
        // No lemma annotation is attached to this token.
        return null;
    }
    return linkedLemma.getValue();
}
/**
 * Returns the base form of a token, which here is the value of its lemma.
 * The token and its lemma are dereferenced without a null check.
 *
 * @param token the token to read the lemma from
 * @return the value of the token's lemma
 */
@Override
public String getTokenBaseForm(Token token)
{
    String baseForm = token.getLemma().getValue();
    return baseForm;
}
/**
 * Returns the base form of a token, which here is the value of its lemma.
 * The token and its lemma are dereferenced without a null check.
 *
 * @param token the token to read the lemma from
 * @return the value of the token's lemma
 */
@Override
public String getTokenBaseForm(Token token)
{
    String baseForm = token.getLemma().getValue();
    return baseForm;
}
/**
 * Looks up the semantic tag registered for the lemma value of a single token.
 *
 * @param token the token whose lemma value is used as the lookup key
 * @return the entry of {@code keySemanticTagMap} for the lemma value, or
 *         {@code "UNKNOWN"} when the map has no such key
 * @throws ResourceAccessException wrapping any exception raised while reading the lemma
 *         or querying the map
 */
@Override
public String getSemanticTag(Token token)
    throws ResourceAccessException
{
    try {
        String key = token.getLemma().getValue();
        if (!keySemanticTagMap.containsKey(key)) {
            return "UNKNOWN";
        }
        return keySemanticTagMap.get(key);
    }
    catch (Exception e) {
        throw new ResourceAccessException(e);
    }
}
/**
 * Looks up the semantic tag registered for the lemma value of a single token.
 *
 * @param token the token whose lemma value is used as the lookup key
 * @return the entry of {@code keySemanticTagMap} for the lemma value, or
 *         {@code "UNKNOWN"} when the map has no such key
 * @throws ResourceAccessException wrapping any exception raised while reading the lemma
 *         or querying the map
 */
@Override
public String getSemanticTag(Token token)
    throws ResourceAccessException
{
    try {
        String key = token.getLemma().getValue();
        if (!keySemanticTagMap.containsKey(key)) {
            return "UNKNOWN";
        }
        return keySemanticTagMap.get(key);
    }
    catch (Exception e) {
        throw new ResourceAccessException(e);
    }
}
/**
 * Looks up the semantic tag registered for a multi-token expression. The lookup key is
 * the lemma values of the tokens joined by single spaces, in list order.
 *
 * @param tokens the tokens forming the expression
 * @return the entry of {@code keySemanticTagMap} for the joined lemma string, or
 *         {@code "UNKNOWN"} when the map has no such key
 * @throws ResourceAccessException wrapping any exception raised while querying the map
 *         (the lemma values are collected before the guarded section)
 */
@Override
public String getSemanticTag(List<Token> tokens)
    throws ResourceAccessException
{
    List<String> lemmaValues = new ArrayList<String>();
    for (Token current : tokens) {
        lemmaValues.add(current.getLemma().getValue());
    }
    String key = StringUtils.join(lemmaValues, " ");

    try {
        if (!keySemanticTagMap.containsKey(key)) {
            return "UNKNOWN";
        }
        return keySemanticTagMap.get(key);
    }
    catch (Exception e) {
        throw new ResourceAccessException(e);
    }
}
/** * Produce token lemma, return the original string if the lemma is null; * converts the string to lower case. * * @param tok * @return */ public String getTokenLemma(Token tok) { Lemma l = tok.getLemma(); // For some weird reason, Clear NLP lemma is sometimes NULL return (l!=null) ? l.getValue() : tok.getCoveredText().toLowerCase(); }
/**
 * Looks up the semantic tag registered for a multi-token expression. The lookup key is
 * the lemma values of the tokens joined by single spaces, in list order.
 *
 * @param tokens the tokens forming the expression
 * @return the entry of {@code keySemanticTagMap} for the joined lemma string, or
 *         {@code "UNKNOWN"} when the map has no such key
 * @throws ResourceAccessException wrapping any exception raised while querying the map
 *         (the lemma values are collected before the guarded section)
 */
@Override
public String getSemanticTag(List<Token> tokens)
    throws ResourceAccessException
{
    List<String> lemmaValues = new ArrayList<String>();
    for (Token current : tokens) {
        lemmaValues.add(current.getLemma().getValue());
    }
    String key = StringUtils.join(lemmaValues, " ");

    try {
        if (!keySemanticTagMap.containsKey(key)) {
            return "UNKNOWN";
        }
        return keySemanticTagMap.get(key);
    }
    catch (Exception e) {
        throw new ResourceAccessException(e);
    }
}
/** * Get a phrase from a list of consecutive tokens * @param tokens The list of tokens * @param start The start token index * @param end The end token index * @param supportLemma The current lexical resources needs right and left lemmas * rather than surface words * @return The phrase containing the tokens from start to end */ private String getPhrase(List<Token> tokens, int start, int end, boolean supportLemma) { StringBuilder phrase = new StringBuilder(); for (int tokenIndex = start; tokenIndex < end + 1; ++tokenIndex) { phrase.append(supportLemma ? tokens.get(tokenIndex).getLemma().getValue() : tokens.get(tokenIndex).getCoveredText()); phrase.append(" "); } // Remove last space if (phrase.length() > 0) { phrase.deleteCharAt(phrase.length() - 1); } return phrase.toString(); }
/**
 * Counts how often each lemma/POS combination occurs among the tokens of a CAS.
 * The map key has the form {@code <lemma> ### <pos>}, where every {@code #} inside the
 * lemma value is escaped as {@code \#}.
 *
 * @param text the CAS whose {@code Token} annotations are counted
 * @return a map from lemma/POS key to its number of occurrences
 */
protected Map<String, Integer> countTokenPoses(JCas text)
{
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (Iterator<Annotation> it = text.getAnnotationIndex(Token.type).iterator(); it.hasNext();) {
        Token token = (Token) it.next();
        String escapedLemma = token.getLemma().getValue().replace("#", "\\#");
        String key = escapedLemma + " ### " + token.getPos().getPosValue();

        Integer seenSoFar = counts.get(key);
        counts.put(key, seenSoFar == null ? 1 : seenSoFar + 1);
    }
    return counts;
}
/**
 * Collects the lower-cased lemma value and the POS value of every token in the CAS and
 * delegates to {@code getSubstitutions(tokens, postags)}.
 * <p>
 * The two lists are kept strictly parallel: a token contributes to both lists or to
 * neither. Tokens without a lemma, without a lemma value, or without a POS annotation
 * are skipped and reported on {@code System.err}.
 *
 * @param jcas the CAS to read the tokens from
 * @return the result of {@code getSubstitutions(tokens, postags)} for the collected lists
 */
public List<String> getSubstitutions(JCas jcas)
{
    List<String> tokens = new ArrayList<String>();
    List<String> postags = new ArrayList<String>();

    for (Token t : JCasUtil.select(jcas, Token.class)) {
        // Check up front instead of catching NullPointerException: with the catch, a
        // token that had a lemma but no POS was already added to 'tokens' before the
        // failure, leaving the two lists misaligned.
        if (t.getLemma() == null || t.getLemma().getValue() == null || t.getPos() == null) {
            // Same message as before; it is also used when only the POS is missing.
            System.err.println("Couldn't read lemma value for token \"" + t.getCoveredText()
                    + "\"");
            continue;
        }
        tokens.add(t.getLemma().getValue().toLowerCase());
        postags.add(t.getPos().getPosValue());
    }

    return getSubstitutions(tokens, postags);
}
/**
 * Collects the lower-cased lemma value and the POS value of every token selected by
 * {@code JCasUtil.selectCovered} for the given annotation and delegates to
 * {@code getSubstitutions(tokens, postags)}.
 * <p>
 * The two lists are kept strictly parallel: a token contributes to both lists or to
 * neither. Tokens without a lemma, without a lemma value, or without a POS annotation
 * are skipped and reported on {@code System.err}.
 *
 * @param jcas the CAS to read the tokens from
 * @param coveringAnnotation the annotation whose covered tokens are used
 * @return the result of {@code getSubstitutions(tokens, postags)} for the collected lists
 */
public List<String> getSubstitutions(JCas jcas, Annotation coveringAnnotation)
{
    List<String> tokens = new ArrayList<String>();
    List<String> postags = new ArrayList<String>();

    for (Token t : JCasUtil.selectCovered(jcas, Token.class, coveringAnnotation)) {
        // Check up front instead of catching NullPointerException: with the catch, a
        // token that had a lemma but no POS was already added to 'tokens' before the
        // failure, leaving the two lists misaligned.
        if (t.getLemma() == null || t.getLemma().getValue() == null || t.getPos() == null) {
            // Same message as before; it is also used when only the POS is missing.
            System.err.println("Couldn't read lemma value for token \"" + t.getCoveredText()
                    + "\"");
            continue;
        }
        tokens.add(t.getLemma().getValue().toLowerCase());
        postags.add(t.getPos().getPosValue());
    }

    return getSubstitutions(tokens, postags);
}
/**
 * Adds the lower-cased lemma value of every token covered by the given chunk to the
 * given set.
 * <p>
 * The lower-cased form is always added. The set already ignores duplicates, and testing
 * for the raw (not lower-cased) lemma beforehand could skip a lemma whose mixed-case
 * form happened to be in the set. Tokens without a lemma or lemma value are skipped.
 *
 * @param view
 *            Current JCas view
 * @param curChunk
 *            Current chunk annotation; tokens between its begin and end offsets are used
 * @param tokenSet
 *            Set of token lemmas to add to (modified in place)
 */
private void createLemmaSet(JCas view, Annotation curChunk, HashSet<String> tokenSet)
{
    for (Token t : JCasUtil.selectCovered(view, Token.class, curChunk.getBegin(),
            curChunk.getEnd())) {
        if (t.getLemma() == null || t.getLemma().getValue() == null) {
            // Nothing to contribute for a token that has no lemma.
            continue;
        }
        tokenSet.add(t.getLemma().getValue().toLowerCase());
    }
}
/**
 * Indexes the dependency relations of a CAS by their dependent token.
 * <p>
 * Key: {@code <begin> ### <lemma> ### <pos>} of the dependent token.<br>
 * Value: {@code <dependencyType> ## <begin> ### <lemma> ### <pos>} of the governor token.<br>
 * Every {@code #} inside a lemma value is escaped as {@code \#}.
 *
 * @param text the CAS whose {@code Dependency} annotations are indexed
 * @return the map from dependent description to relation/governor description
 */
protected Map<String, String> indexLemmaDepTree(JCas text)
{
    Map<String, String> index = new HashMap<String, String>();
    for (Dependency relation : JCasUtil.select(text, Dependency.class)) {
        Token dependent = relation.getDependent();
        Token governor = relation.getGovernor();

        String dependentKey = dependent.getBegin() + " ### "
                + dependent.getLemma().getValue().replace("#", "\\#") + " ### "
                + dependent.getPos().getPosValue();
        String governorValue = relation.getDependencyType() + " ## " + governor.getBegin()
                + " ### " + governor.getLemma().getValue().replace("#", "\\#") + " ### "
                + governor.getPos().getPosValue();

        index.put(dependentKey, governorValue);
    }
    return index;
}
} // closes the enclosing type, whose opening lies outside this excerpt
private static NodeInfo buildNodeInfo(JCas jcas, Token tokenAnno, int serial) throws CasTreeConverterException, UnsupportedPosTagStringException { String word = tokenAnno.getCoveredText(); String lemma = tokenAnno.getLemma().getValue(); String pos = tokenAnno.getPos().getPosValue(); // We rely on the fact the NamedEntity enum values have the same names as the ones // specified in the DKPro mapping (e.g. PERSON, ORGANIZATION) eu.excitementproject.eop.common.representation.parse.representation.basic.NamedEntity namedEntity=null; List<NamedEntity> namedEntities = JCasUtil.selectCovered(NamedEntity.class, tokenAnno); switch (namedEntities.size()) { case 0: break; // if no NER - ignore and move on case 1: namedEntity = eu.excitementproject.eop.common.representation.parse.representation.basic.NamedEntity.valueOf(namedEntities.get(0).getValue()); break; default: throw new CasTreeConverterException(String.format("Got %d NamedEntity annotations for token %s", namedEntities.size(), tokenAnno)); } return new DefaultNodeInfo(word, lemma, serial, namedEntity, new DefaultSyntacticInfo(new PennPartOfSpeech(pos))); }
private void writeLemmas(JCas aJCas, TextCorpus aTextCorpus, Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) { if (!JCasUtil.exists(aJCas, Lemma.class)) { // Do nothing if there are no lemmas in the CAS getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: empty"); return; } // Tokens layer must already exist TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); // create lemma annotation layer LemmasLayer lemmasLayer = aTextCorpus.createLemmasLayer(); getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: created"); int j = 0; for (Token coveredToken : select(aJCas, Token.class)) { Lemma lemma = coveredToken.getLemma(); if (lemma != null && lemmasLayer != null) { String lemmaValue = coveredToken.getLemma().getValue(); lemmasLayer.addLemma(lemmaValue, tokensLayer.getToken(j)); } j++; } }
private void writeLemmas(JCas aJCas, TextCorpus aTextCorpus, Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) { if (!JCasUtil.exists(aJCas, Lemma.class)) { // Do nothing if there are no lemmas in the CAS getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: empty"); return; } // Tokens layer must already exist TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); // create lemma annotation layer LemmasLayer lemmasLayer = aTextCorpus.createLemmasLayer(); getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: created"); int j = 0; for (Token coveredToken : select(aJCas, Token.class)) { Lemma lemma = coveredToken.getLemma(); if (lemma != null && lemmasLayer != null) { String lemmaValue = coveredToken.getLemma().getValue(); lemmasLayer.addLemma(lemmaValue, tokensLayer.getToken(j)); } j++; } }
/**
 * Converts a token into a {@code CoreLabel}, copying the covered text, the token text,
 * the begin/end offsets, the lemma and the POS tag.
 * <p>
 * If the token has no lemma, the token text is used as lemma. If the token has no POS,
 * no tag is set on the label.
 *
 * @param aToken the token to convert
 * @return the populated label
 */
public static CoreLabel tokenToWord(Token aToken)
{
    CoreLabel label = new CoreLabel();
    label.setOriginalText(aToken.getCoveredText());
    label.setWord(aToken.getText());
    label.setBeginPosition(aToken.getBegin());
    label.setEndPosition(aToken.getEnd());

    boolean hasLemma = aToken.getLemma() != null;
    label.setLemma(hasLemma ? aToken.getLemma().getValue() : aToken.getText());

    boolean hasPos = aToken.getPos() != null;
    if (hasPos) {
        label.setTag(aToken.getPos().getPosValue());
    }

    return label;
}
/**
 * Checks, by means of {@code assert} statements, that the annotations in the CAS are
 * linked consistently:
 * every lemma, stem, morphological-features and POS annotation must be the one referenced
 * by the token selected at the same offsets, and every dependency must have the same
 * begin and end as its dependent token.
 * <p>
 * The checks only take effect when assertions are enabled; the token lookups are
 * performed either way.
 *
 * @param aJCas the CAS to check
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas aJCas)
    throws AnalysisEngineProcessException
{
    // Lemma <-> token
    for (Lemma lemma : select(aJCas, Lemma.class)) {
        Token owner = selectSingleAt(aJCas, Token.class, lemma.getBegin(), lemma.getEnd());
        assert lemma == owner.getLemma();
    }

    // Stem <-> token
    for (Stem stem : select(aJCas, Stem.class)) {
        Token owner = selectSingleAt(aJCas, Token.class, stem.getBegin(), stem.getEnd());
        assert stem == owner.getStem();
    }

    // Morphological features <-> token
    for (MorphologicalFeatures features : select(aJCas, MorphologicalFeatures.class)) {
        Token owner = selectSingleAt(aJCas, Token.class, features.getBegin(),
                features.getEnd());
        assert features == owner.getMorph();
    }

    // POS <-> token
    for (POS tag : select(aJCas, POS.class)) {
        Token owner = selectSingleAt(aJCas, Token.class, tag.getBegin(), tag.getEnd());
        assert tag == owner.getPos();
    }

    // Dependency offsets must equal those of the dependent token
    for (Dependency relation : select(aJCas, Dependency.class)) {
        assert relation.getDependent().getBegin() == relation.getBegin();
        assert relation.getDependent().getEnd() == relation.getEnd();
    }
}
} // closes the enclosing type, whose opening lies outside this excerpt