@Override public String basicCategory(String category) { String basicCat; if (leaveGF) { basicCat = stripGF(category); } else { basicCat = super.basicCategory(category); } // log.info("NPLP stripping " + category + " with leaveGF = " + leaveGF + " gives " + basicCat); return basicCat; }
/**
 * Returns a GrammaticalStructureFactory suitable for this language/treebank.
 * (Intended to be overridden in subclasses.)
 * <p>
 * This implementation does not consult {@code puncFilt}; it simply forwards
 * to the no-argument {@code grammaticalStructureFactory()}.
 *
 * @param puncFilt A punctuation filter; ignored by this implementation
 * @return Whatever the no-argument {@code grammaticalStructureFactory()} returns
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory(Predicate<String> puncFilt) {
  // The filter is deliberately unused here: delegate to the default factory.
  GrammaticalStructureFactory defaultFactory = grammaticalStructureFactory();
  return defaultFactory;
}
/**
 * Returns the tokenizer factory for this language pack.
 * <p>
 * If the {@code tf} field is non-null it is returned as-is; otherwise the
 * superclass's tokenizer factory is used.
 *
 * @return {@code tf} when it is set, else the superclass's factory
 */
@Override
public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
  if (tf == null) {
    // Nothing configured on this instance: fall back to the inherited factory.
    return super.getTokenizerFactory();
  }
  return tf;
}
LexicalizedParser lexParser = (LexicalizedParser) pd; for (String tag : lexParser.tagIndex) { String t = lp.basicCategory(tag); int gfIdx = t.indexOf(lp.getGfCharacter()); if (gfIdx > 0) { posTags.add(lp.basicCategory(t)); String t = lp.basicCategory(tag); int gfIdx = t.indexOf(lp.getGfCharacter()); if (gfIdx > 0) { gsf = lp.grammaticalStructureFactory(lp.punctuationWordRejectFilter(), lp.typedDependencyHeadFinder());
for (int leng = category.length(); i < leng; i++) { char ch = category.charAt(i); if (isLabelAnnotationIntroducingCharacter(ch)) { if (i == 0) { sawAtZero = true;
/**
 * {@inheritDoc}
 * <p>
 * This implementation builds a new {@code TreeTokenizerFactory} around the
 * value returned by {@code treeReaderFactory()} on every call.
 */
@Override
public TokenizerFactory<Tree> treeTokenizerFactory() {
  TokenizerFactory<Tree> factory = new TreeTokenizerFactory(treeReaderFactory());
  return factory;
}
/**
 * Returns the punctuation tags that EVALB-style evaluation should ignore
 * for this treebank/language.
 * <p>
 * Traditionally, EVALB has ignored only a subset of the punctuation tags of
 * the English Penn Treebank (quotes, period, comma, colon, etc., but not
 * brackets). This implementation makes no such distinction and returns
 * exactly what {@code punctuationTags()} returns.
 *
 * @return The array returned by {@code punctuationTags()}
 */
@Override
public String[] evalBIgnoredPunctuationTags() {
  String[] ignoredTags = punctuationTags();
  return ignoredTags;
}
/**
 * Returns the first (perhaps unique) start symbol of the treebank.
 *
 * @return The first element of {@code startSymbols()}, or {@code null} if
 *         that array is {@code null} or empty
 */
@Override
public String startSymbol() {
  final String[] symbols = startSymbols();
  if (symbols != null && symbols.length > 0) {
    return symbols[0];
  }
  return null;
}
/**
 * Says whether a character is one of the annotation introducing characters.
 * <p>
 * The check is a linear scan over the array returned by
 * {@code labelAnnotationIntroducingCharacters()}.
 *
 * @param ch The character to check
 * @return {@code true} if {@code ch} occurs in that array, {@code false} otherwise
 */
@Override
public boolean isLabelAnnotationIntroducingCharacter(char ch) {
  final char[] introducers = labelAnnotationIntroducingCharacters();
  for (int k = 0; k < introducers.length; k++) {
    if (introducers[k] == ch) {
      return true;
    }
  }
  return false;
}
/**
 * Returns the basic syntactic category of a label.
 * <p>
 * The label is cut at the index reported by
 * {@code postBasicCategoryIndex(category)}; everything from that index on is
 * discarded. In essence this truncates material following one of the
 * {@code labelAnnotationIntroducingCharacters()}, with special handling for
 * labels that themselves begin with such a character:
 * (i) a leading character from that set is never a cut point (e.g. '-' or
 * '=' as a token), and (ii) when the label starts with one, a second
 * instance of the same character is also kept (to cope with bracket tokens
 * such as '-LRB-', '-RCB-', etc.).
 *
 * @param category The whole String name of the label; may be {@code null}
 * @return The basic category, or {@code null} if {@code category} is {@code null}
 */
@Override
public String basicCategory(String category) {
  if (category != null) {
    final int end = postBasicCategoryIndex(category);
    return category.substring(0, end);
  }
  return null;
}
/**
 * Returns the syntactic category and 'function' of a label.
 * <p>
 * This normally means dropping numeric coindexation (which shows
 * coreference, etc.) while keeping functional information such as Penn
 * Treebank functional tags or ICE phrasal functions, perhaps in the form
 * {@code category-function}.
 * <p>
 * This implementation repeatedly truncates the label at the index given by
 * {@code lastIndexOfNumericTag} until that method reports a negative index,
 * on the assumption that non-numeric tags are functional tags.
 *
 * @param category The whole String name of the label; may be {@code null}
 * @return The label with trailing numeric tags removed, or {@code null} if
 *         {@code category} is {@code null}
 */
@Override
public String categoryAndFunction(String category) {
  if (category == null) {
    return null;
  }
  String stripped = category;
  // Peel numeric tags off the end one at a time until none remain.
  for (int cut = lastIndexOfNumericTag(stripped); cut >= 0; cut = lastIndexOfNumericTag(stripped)) {
    stripped = stripped.substring(0, cut);
  }
  return stripped;
}
if (isLabelAnnotationIntroducingCharacter(category.charAt(i))) { boolean onlyDigitsFollow = false; for (int j = i + 1; j < category.length(); j++) {
/**
 * {@inheritDoc}
 * <p>
 * Each call creates a fresh {@code TreeTokenizerFactory} wrapping the result
 * of {@code treeReaderFactory()}.
 */
@Override
public TokenizerFactory<Tree> treeTokenizerFactory() {
  TokenizerFactory<Tree> wrapped = new TreeTokenizerFactory(treeReaderFactory());
  return wrapped;
}
/**
 * Returns the punctuation tags that EVALB-style evaluation should ignore
 * for this treebank/language.
 * <p>
 * Traditionally, EVALB has ignored only a subset of the punctuation tags of
 * the English Penn Treebank (quotes, period, comma, colon, etc., but not
 * brackets). This implementation does not narrow the set: it returns
 * exactly what {@code punctuationTags()} returns.
 *
 * @return The array returned by {@code punctuationTags()}
 */
public String[] evalBIgnoredPunctuationTags() {
  String[] allPunctuationTags = punctuationTags();
  return allPunctuationTags;
}
/**
 * Returns the first (perhaps unique) start symbol of the treebank.
 *
 * @return The first element of {@code startSymbols()}, or {@code null} if
 *         that array is {@code null} or has no elements
 */
public String startSymbol() {
  final String[] candidates = startSymbols();
  final boolean hasSymbol = candidates != null && candidates.length != 0;
  if (hasSymbol) {
    return candidates[0];
  }
  return null;
}
/**
 * Says whether a character is one of the annotation introducing characters.
 * <p>
 * The array from {@code labelAnnotationIntroducingCharacters()} is fetched
 * once and scanned from the start; the scan stops at the first match.
 *
 * @param ch The character to check
 * @return {@code true} if {@code ch} is found in that array, {@code false} otherwise
 */
@Override
public boolean isLabelAnnotationIntroducingCharacter(char ch) {
  final char[] cutChars = labelAnnotationIntroducingCharacters();
  boolean found = false;
  for (int pos = 0; pos < cutChars.length && !found; pos++) {
    found = (cutChars[pos] == ch);
  }
  return found;
}
/**
 * Returns the basic syntactic category of a label.
 * <p>
 * The result is the prefix of the label ending just before the index given
 * by {@code postBasicCategoryIndex(category)}. Broadly, this drops whatever
 * follows one of the {@code labelAnnotationIntroducingCharacters()}, with
 * special cases for labels that begin with such a character:
 * (i) the first character is never treated as a cut point (e.g. '-' or '='
 * as a token), and (ii) if the label starts with one, a second instance of
 * the same character is also retained (to cope with bracket tokens such as
 * '-LRB-', '-RCB-', etc.).
 *
 * @param category The whole String name of the label; may be {@code null}
 * @return The basic category, or {@code null} if {@code category} is {@code null}
 */
@Override
public String basicCategory(String category) {
  if (category == null) {
    return null;
  }
  final int cutIndex = postBasicCategoryIndex(category);
  final String basic = category.substring(0, cutIndex);
  return basic;
}
/**
 * Returns the syntactic category and 'function' of a label.
 * <p>
 * This normally means removing numeric coindexation (which shows
 * coreference, etc.) while retaining functional information such as Penn
 * Treebank functional tags or ICE phrasal functions, perhaps in the form
 * {@code category-function}.
 * <p>
 * This implementation keeps cutting the label at the index reported by
 * {@code lastIndexOfNumericTag} for as long as that index is non-negative,
 * assuming that non-numeric tags are functional tags.
 *
 * @param category The whole String name of the label; may be {@code null}
 * @return The label with trailing numeric tags removed, or {@code null} if
 *         {@code category} is {@code null}
 */
@Override
public String categoryAndFunction(String category) {
  if (category == null) {
    return null;
  }
  String remaining = category;
  while (true) {
    final int tagStart = lastIndexOfNumericTag(remaining);
    if (tagStart < 0) {
      // No numeric tag left at the end of the label.
      return remaining;
    }
    remaining = remaining.substring(0, tagStart);
  }
}
/**
 * Returns a GrammaticalStructureFactory suitable for this language/treebank.
 * (Intended to be overridden in subclasses.)
 * <p>
 * This implementation consults neither argument; it simply forwards to the
 * no-argument {@code grammaticalStructureFactory()}.
 *
 * @param puncFilt A punctuation filter; ignored by this implementation
 * @param typedDependencyHeadFinder A head finder; ignored by this implementation
 * @return Whatever the no-argument {@code grammaticalStructureFactory()} returns
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory(Predicate<String> puncFilt, HeadFinder typedDependencyHeadFinder) {
  // Both arguments are deliberately unused here: delegate to the default factory.
  GrammaticalStructureFactory defaultFactory = grammaticalStructureFactory();
  return defaultFactory;
}
/**
 * Computes the basic category of a label, optionally post-processing it
 * with {@code stripGF}.
 * <p>
 * The superclass's {@code basicCategory} is always applied first. Unless
 * {@code leaveGF} is set, its result is then passed through {@code stripGF}.
 *
 * @param category The whole String name of the label
 * @return The superclass's basic category, run through {@code stripGF}
 *         when {@code leaveGF} is {@code false}
 */
@Override
public String basicCategory(String category) {
  final String inherited = super.basicCategory(category);
  if (leaveGF) {
    return inherited;
  }
  return stripGF(inherited);
}