/** * Adds the tagging with count to the data structures in this Lexicon. */ protected void addTagging(boolean seen, IntTaggedWord itw, double count) { if (seen) { seenCounter.incrementCount(itw, count); if (itw.tag() == nullTag) { words.add(itw); } else if (itw.word() == nullWord) { tags.add(itw); } else { // rules.add(itw); } } else { uwModel.addTagging(seen, itw, count); // if (itw.tag() == nullTag) { // sigs.add(itw); // } } }
String sig = lex.getUnknownWordModel().getSignature(args[i], i-3); System.out.println(args[i] + " is an unknown word. Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init": "non-init") + " is: " + sig); impos.clear(); List<String> lis = new ArrayList<>(tagIndex.objectsList());
double totalUnseen = uwModel.unSeenCounter().getCount(NULL_ITW); double c_Tunseen = uwModel.unSeenCounter().getCount(temp); p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0], word); if (DEBUG_LEXICON_SCORE) log.info("With useSignatureForKnownSmoothing, P(T|U) is " + p_T_U + " rather than " + (c_Tunseen / totalUnseen)); } else { pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0], word); } else { double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0], word); double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0], word); pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T))/3);
System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel()); System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));
/**
 * Writes out data from this Object to the Writer w, one entry per line.
 *
 * <p>The output consists of three sections, in this order:
 * <ol>
 *   <li>for every key of {@code seenCounter}: its lexical entry, the literal
 *       {@code " SEEN "}, and its count;</li>
 *   <li>for every key of the unknown word model's unseen counter: its lexical
 *       entry, the literal {@code " UNSEEN "}, and its count;</li>
 *   <li>one line {@code smooth[i] = value} per element of {@code smooth}.</li>
 * </ol>
 *
 * <p>The writer is flushed but not closed; closing it is left to the caller.
 *
 * @param w the destination writer
 * @throws IOException if any write to {@code w} failed
 */
@Override
public void writeData(Writer w) throws IOException {
  PrintWriter out = new PrintWriter(w);

  for (IntTaggedWord itw : seenCounter.keySet()) {
    out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " SEEN " + seenCounter.getCount(itw));
  }

  for (IntTaggedWord itw : getUnknownWordModel().unSeenCounter().keySet()) {
    out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " UNSEEN "
        + getUnknownWordModel().unSeenCounter().getCount(itw));
  }

  for (int i = 0; i < smooth.length; i++) {
    out.println("smooth[" + i + "] = " + smooth[i]);
  }

  // PrintWriter never throws on a failed write; it only sets an internal error
  // flag. checkError() flushes the stream and reports that flag, so a failure
  // is surfaced here instead of silently producing truncated output.
  if (out.checkError()) {
    throw new IOException("Error while writing lexicon data");
  }
}
tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true)); BaseLexicon lex = new BaseLexicon(); lex.getUnknownWordModel().setUnknownLevel(Integer.parseInt(args[2])); lex.train(tb); System.out.println("done."); String sig = lex.getUnknownWordModel().getSignature(args[i], i-3); System.out.println(args[i] + " is an unknown word. Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init": "non-init") + " is: " + sig); Set<String> tags = ErasureUtils.uncheckedCast(numb.objects()); impos.clear();
@Override public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) { double c_W = seenCounter.getCount(iTW); boolean seen = (c_W > 0.0); if (seen) { return super.score(iTW, loc, word, featureSpec); } else { float score; // if (useMaxentUnknownWordModel) { // score = cml.score(iTW, 0); // } else { score = this.getUnknownWordModel().score(iTW, loc, 0.0, 0.0, 0.0, word); // ChineseUnknownWordModel doesn't use the final three params // } return score; } } }
/** * Trains this lexicon on the Collection of trees. */ public void train(TaggedWord tw, int loc, double weight) { IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = NULL_ITW; if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.getCount(iW) < 2) { // it's an entirely unknown word int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word)); IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag); IntTaggedWord iS = new IntTaggedWord(s, nullTag); unSeenCounter.incrementCount(iTS, weight); unSeenCounter.incrementCount(iT, weight); unSeenCounter.incrementCount(iS, weight); unSeenCounter.incrementCount(i, weight); } } }
unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(word.value(), posId++));
System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel()); System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));
Lexicon.UNKNOWN_WORD + ", " + unkWord + ')'); if (DEBUG_LEXICON) log.info("unSeenCounter is: " + uwModel.unSeenCounter()); if (DEBUG_LEXICON) log.info("Train.openClassTypesThreshold is " + trainOptions.openClassTypesThreshold); for (IntTaggedWord iT : tags) { if (DEBUG_LEXICON) log.info("Entry for " + iT + " is " + uwModel.unSeenCounter().getCount(iT)); double types = uwModel.unSeenCounter().getCount(iT); if (types > trainOptions.openClassTypesThreshold) {
private double probWordTag(String word, int loc, int wordId, int tagId) { double cW = wordTag.totalCount(wordId); double cWT = wordTag.getCount(wordId, tagId); // p_L double p_W = cW / wordTag.totalCount(); // p_T double cTseen = tagCounter.getCount(tagId); double p_T = cTseen / tagCounter.totalCount(); // p_T_L double p_W_T = 0.0; if (cW > 0.0) { // Seen lemma double p_T_W = 0.0; if (cW > 100.0 && cWT > 0.0) { p_T_W = cWT / cW; } else { double cTunseen = wordTagUnseen.getCount(tagId); // TODO p_T_U is 0? double p_T_U = cTunseen / wordTagUnseen.totalCount(); p_T_W = (cWT + smooth[1]*p_T_U) / (cW + smooth[1]); } p_W_T = p_T_W * p_W / p_T; } else { // Unseen word. Score based on the word signature (of the surface form) IntTaggedWord iTW = new IntTaggedWord(wordId, tagId); double c_T = tagCounter.getCount(tagId); p_W_T = Math.exp(getUnknownWordModel().score(iTW, loc, c_T, tagCounter.totalCount(), smooth[0], word)); } return p_W_T; }
/** * Trains this lexicon on the Collection of trees. */ public void train(TaggedWord tw, int loc, double weight) { IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = NULL_ITW; if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.getCount(iW) < 2) { // it's an entirely unknown word int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word)); IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag); IntTaggedWord iS = new IntTaggedWord(s, nullTag); unSeenCounter.incrementCount(iTS, weight); unSeenCounter.incrementCount(iT, weight); unSeenCounter.incrementCount(iS, weight); unSeenCounter.incrementCount(i, weight); } } }
@Override public void train(TaggedWord tw, int loc, double weight) { if (useGT) { unknownGTTrainer.train(tw, weight); } // scan data String word = tw.word(); String subString = model.getSignature(word, loc); Label tag = new Tag(tw.tag()); if ( ! c.containsKey(tag)) { c.put(tag, new ClassicCounter<>()); } c.get(tag).incrementCount(subString, weight); tc.incrementCount(tag, weight); seenEnd.add(subString); String tagStr = tw.tag(); IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.ANY, wordIndex, tagIndex); seenCounter.incrementCount(iW, weight); if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.getCount(iW) < 2) { IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.ANY, tagStr, wordIndex, tagIndex); unSeenCounter.incrementCount(iT, weight); unSeenCounter.incrementCount(NULL_ITW, weight); } } }
double totalUnseen = uwModel.unSeenCounter().getCount(iTW); iTW.tag = tag; double c_Tunseen = uwModel.unSeenCounter().getCount(iTW); iTW.word = word; p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0]); if (DEBUG_LEXICON_SCORE) System.err.println("With useSignatureForKnownSmoothing, P(T|U) is " + p_T_U + " rather than " + (c_Tunseen / totalUnseen)); } else { pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0]); } else { double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0]); double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0]); pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T))/3);
System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel()); System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));
String sig = lex.getUnknownWordModel().getSignature(args[i], i-3); System.out.println(args[i] + " is an unknown word. Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init": "non-init") + " is: " + sig); impos.clear(); List<String> lis = new ArrayList<String>(tagIndex.objectsList());
if (iTW.word() == nullWord) { double types = uwModel.unSeenCounter().getCount(iTW); if (types > trainOptions.openClassTypesThreshold) { IntTaggedWord iTU = new IntTaggedWord(unkWord, iTW.tag);
@Override public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) { double c_W = seenCounter.getCount(iTW); boolean seen = (c_W > 0.0); if (seen) { return super.score(iTW, loc, word, featureSpec); } else { float score; // if (useMaxentUnknownWordModel) { // score = cml.score(iTW, 0); // } else { score = this.getUnknownWordModel().score(iTW, loc, 0.0, 0.0, 0.0, word); // ChineseUnknownWordModel doesn't use the final three params // } return score; } } }
int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word)); if (DOCUMENT_UNKNOWNS) {