/**
 * Looks up the log probability stored for the bigram (word1, word2).
 *
 * The bigram buffer for {@code word1} is fetched, searched for
 * {@code word2}, and the matching entry's probability ID is used as an
 * index into row 1 of {@code ngramProbTable}.
 *
 * NOTE(review): no null check is made on the search result, so a bigram
 * that is not present in the buffer ends in a NullPointerException.
 *
 * @param word1
 *            the first word of the bigram
 * @param word2
 *            the second word of the bigram
 * @return the log probability
 */
private float getBigramProb(int word1, int word2) {
    final NGramProbability match = getBigramBuffer(word1).findNGram(word2);
    return ngramProbTable[1][match.getProbabilityID()];
}
/**
 * Binary-searches this buffer for the entry whose word ID equals
 * {@code nthWordID}.
 *
 * NOTE(review): the search window is [0, getNumberNGrams() - 1) with an
 * exclusive upper bound, so the entry at the final index is never
 * compared. Presumably that slot is a sentinel - confirm against the
 * code that builds the buffer.
 *
 * @param nthWordID the ID of the nth word
 * @return the NGramProbability of the matching entry, or {@code null}
 *         when no compared entry carries that word ID
 */
public NGramProbability findNGram(int nthWordID) {
    int low = 0;
    int high = getNumberNGrams() - 1;
    while (low < high) {
        final int probe = (low + high) / 2;
        final int probeWordID = getWordID(probe);
        if (probeWordID == nthWordID) {
            return getNGramProbability(probe);
        }
        if (probeWordID < nthWordID) {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    return null;
}
/**
 * Binary-searches this buffer for the entry whose word ID equals
 * {@code nthWordID} and reports that entry's probability ID.
 *
 * The search window is [0, getNumberNGrams()), i.e. every entry is a
 * candidate.
 *
 * @param nthWordID the ID of the nth word
 * @return the probability ID of the matching entry, or -1 when no entry
 *         carries that word ID
 */
public int findProbabilityID(int nthWordID) {
    int low = 0;
    int high = getNumberNGrams();
    while (low < high) {
        final int probe = (low + high) / 2;
        final int probeWordID = getWordID(probe);
        if (probeWordID == nthWordID) {
            return getProbabilityID(probe);
        }
        if (probeWordID < nthWordID) {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    return -1;
}
/** * Returns the NGramProbability of the nth follower. * * @param nthFollower which follower * @return the NGramProbability of the nth follower */ public int getProbabilityID(int nthFollower) { int nthPosition = 0; nthPosition = nthFollower * LargeNGramModel.BYTES_PER_NGRAM * ((is32bits) ? 4 : 2); setPosition(nthPosition + ((is32bits) ? 4 : 2)); // to skip the word ID return readBytesAsInt(); }
/**
 * Binary-searches this buffer for the position of the entry whose word
 * ID equals {@code nthWordID}.
 *
 * The search window is [0, getNumberNGrams() - 1) with an exclusive
 * upper bound. When no compared entry matches, the value returned is
 * simply the last midpoint that was examined (or -1 if the loop never
 * ran), so callers cannot distinguish a hit from a miss by the result
 * alone.
 *
 * @param nthWordID the ID of the nth word
 * @return the index of the matching entry, or the last probed index /
 *         -1 as described above
 */
public int findNGramIndex(int nthWordID) {
    int low = 0;
    int high = getNumberNGrams() - 1;
    int probe = -1;
    while (low < high) {
        probe = (low + high) / 2;
        final int probeWordID = getWordID(probe);
        if (probeWordID == nthWordID) {
            return probe;
        }
        if (probeWordID < nthWordID) {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    return probe;
}
int lastWordId = getWordID(ws.getWord(ws.size() - 1)); nMinus1Buffer = getNGramBuffer(ws.getOldest()); int index = nMinus1Buffer.findNGramIndex(lastWordId); int firstNMinus1GramEntry = nMinus1Buffer.getFirstNGramEntry(); firstCurrentNGramEntry = getFirstNGramEntry( nMinus1Buffer.getNGramProbability(index), firstNMinus1GramEntry, orderBuffer); int firstNextNGramEntry = getFirstNGramEntry( nMinus1Buffer.getNGramProbability(index + 1), firstNMinus1GramEntry, orderBuffer); numberNGrams = firstNextNGramEntry - firstCurrentNGramEntry; firstCurrentNGramEntry); } else { currentBuffer = new NGramBuffer(buffer, numberNGrams, loader.getBigEndian(), is32bits(), orderBuffer, firstCurrentNGramEntry);
double ugbackoff = logMath.logToLinear(logugbackoff); for (int j = 0; j < bigram.getNumberNGrams(); j++) { int wordID = bigram.getWordID(j); NGramProbability bgProb = bigram.getNGramProbability(j); continue; for (int j = 0; j < bigram.getNumberNGrams(); j++) { float smearTerm; NGramProbability bgProb = bigram.getNGramProbability(j); float logbgbackoff = ngramBackoffTable[2][bgProb.getBackoffID()]; double bgbackoff = logMath.logToLinear(logbgbackoff); int k = bigram.getWordID(j); NGramBuffer trigram = loadTrigramBuffer(i, k); double bg_numerator = 0; double bg_denominator = 0; for (int l = 0; l < trigram.getNumberNGrams(); l++) { int m = trigram.getWordID(l); float logtgprob = ngramProbTable[2][trigram .getProbabilityID(l)]; double tgprob = logMath.logToLinear(logtgprob); float logbgprob = getBigramProb(k, m);
/** Clears the various N-gram caches. */ private void clearCache() { for (int i = 0; i < loadedBigramBuffers.length; i++) { NGramBuffer buffer = loadedBigramBuffers[i]; if (buffer != null) { if (!buffer.getUsed()) loadedBigramBuffers[i] = null; // free the BigramBuffer else buffer.setUsed(false); } } loadedBigramBuffers = new NGramBuffer[unigrams.length]; for (int i = 2; i <= loader.getMaxDepth(); i++) { loadedNGramBuffers[i - 1] = new HashMap<WordSequence, NGramBuffer>(); } logger.info("LM Cache Size: " + ngramProbCache.size() + " Hits: " + ngramHits + " Misses: " + ngramMisses); if (clearCacheAfterUtterance) { ngramProbCache = new LRUCache<WordSequence, Float>(ngramCacheSize); } }
int lastWordId = getWordID(ws.getWord(ws.size() - 1)); nMinus1Buffer = getNGramBuffer(ws.getOldest()); int index = nMinus1Buffer.findNGramIndex(lastWordId); int firstNMinus1GramEntry = nMinus1Buffer.getFirstNGramEntry(); firstCurrentNGramEntry = getFirstNGramEntry( nMinus1Buffer.getNGramProbability(index), firstNMinus1GramEntry, orderBuffer); int firstNextNGramEntry = getFirstNGramEntry( nMinus1Buffer.getNGramProbability(index + 1), firstNMinus1GramEntry, orderBuffer); numberNGrams = firstNextNGramEntry - firstCurrentNGramEntry; firstCurrentNGramEntry); } else { currentBuffer = new NGramBuffer(buffer, numberNGrams, loader.getBigEndian(), is32bits(), orderBuffer, firstCurrentNGramEntry);
double ugbackoff = logMath.logToLinear(logugbackoff); for (int j = 0; j < bigram.getNumberNGrams(); j++) { int wordID = bigram.getWordID(j); NGramProbability bgProb = bigram.getNGramProbability(j); continue; for (int j = 0; j < bigram.getNumberNGrams(); j++) { float smearTerm; NGramProbability bgProb = bigram.getNGramProbability(j); float logbgbackoff = ngramBackoffTable[2][bgProb.getBackoffID()]; double bgbackoff = logMath.logToLinear(logbgbackoff); int k = bigram.getWordID(j); NGramBuffer trigram = loadTrigramBuffer(i, k); double bg_numerator = 0; double bg_denominator = 0; for (int l = 0; l < trigram.getNumberNGrams(); l++) { int m = trigram.getWordID(l); float logtgprob = ngramProbTable[2][trigram .getProbabilityID(l)]; double tgprob = logMath.logToLinear(logtgprob); float logbgprob = getBigramProb(k, m);
/**
 * Reads the word ID of the nth follower, which is taken from the very
 * start of that follower's entry.
 *
 * The size of one entry is derived as
 * {@code buffer.length / numberNGrams}; the read position is set to
 * {@code nthFollower} times that size before reading.
 *
 * @param nthFollower starts from 0 to (numberFollowers - 1).
 * @return the word ID
 */
public final int getWordID(int nthFollower) {
    final int entrySize = buffer.length / numberNGrams;
    setPosition(nthFollower * entrySize);
    return readBytesAsInt();
}
out.writeInt(bigram.getNumberNGrams()); for (int j = 0; j < bigram.getNumberNGrams(); j++) { int k = bigram.getWordID(j); Float smearTerm = getSmearTerm(i, k); out.writeInt(k);
/** Clears the various N-gram caches. */ private void clearCache() { for (int i = 0; i < loadedBigramBuffers.length; i++) { NGramBuffer buffer = loadedBigramBuffers[i]; if (buffer != null) { if (!buffer.getUsed()) loadedBigramBuffers[i] = null; // free the BigramBuffer else buffer.setUsed(false); } } loadedBigramBuffers = new NGramBuffer[unigrams.length]; for (int i = 2; i <= loader.getMaxDepth(); i++) { loadedNGramBuffers[i - 1] = new HashMap<WordSequence, NGramBuffer>(); } logger.info("LM Cache Size: " + ngramProbCache.size() + " Hits: " + ngramHits + " Misses: " + ngramMisses); if (clearCacheAfterUtterance) { ngramProbCache = new LRUCache<WordSequence, Float>(ngramCacheSize); } }
/**
 * Builds the NGramProbability record of the nth follower.
 *
 * The read position is set to
 * {@code nthFollower * BYTES_PER_NGRAM * width} (width is 4 when
 * {@code is32bits} is set, 2 otherwise) and four consecutive values are
 * read in this order: word ID, probability ID, backoff ID, first-NGram
 * entry.
 *
 * @param nthFollower which follower
 * @return the NGramProbability of the nth follower
 */
public NGramProbability getNGramProbability(int nthFollower) {
    final int fieldWidth = is32bits ? 4 : 2;
    setPosition(nthFollower * LargeNGramModel.BYTES_PER_NGRAM * fieldWidth);

    // The four reads below must stay in this order; each advances the cursor.
    final int wordID = readBytesAsInt();
    final int probID = readBytesAsInt();
    final int backoffID = readBytesAsInt();
    final int firstNGram = readBytesAsInt();

    return new NGramProbability(nthFollower, wordID, probID, backoffID, firstNGram);
}
}
NGramBuffer bigram = getBigramBuffer(i); if (bigram.getNumberNGrams() != numBigrams) { in.close(); throw new IOException("Bad ngrams for unigram " + i + " Found " + numBigrams + " expected " + bigram.getNumberNGrams()); int k = bigram.getWordID(j); putSmearTerm(i, k, in.readFloat());
/**
 * Binary-searches this buffer for the entry whose word ID equals
 * {@code nthWordID}.
 *
 * NOTE(review): the search window is [0, getNumberNGrams() - 1) with an
 * exclusive upper bound, so the entry at the final index is never
 * compared. Presumably that slot is a sentinel - confirm against the
 * code that builds the buffer.
 *
 * @param nthWordID the ID of the nth word
 * @return the NGramProbability of the matching entry, or {@code null}
 *         when no compared entry carries that word ID
 */
public NGramProbability findNGram(int nthWordID) {
    int low = 0;
    int high = getNumberNGrams() - 1;
    while (low < high) {
        final int probe = (low + high) / 2;
        final int probeWordID = getWordID(probe);
        if (probeWordID == nthWordID) {
            return getNGramProbability(probe);
        }
        if (probeWordID < nthWordID) {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    return null;
}
/**
 * Binary-searches this buffer for the entry whose word ID equals
 * {@code nthWordID} and reports that entry's probability ID.
 *
 * The search window is [0, getNumberNGrams()), i.e. every entry is a
 * candidate.
 *
 * @param nthWordID the ID of the nth word
 * @return the probability ID of the matching entry, or -1 when no entry
 *         carries that word ID
 */
public int findProbabilityID(int nthWordID) {
    int low = 0;
    int high = getNumberNGrams();
    while (low < high) {
        final int probe = (low + high) / 2;
        final int probeWordID = getWordID(probe);
        if (probeWordID == nthWordID) {
            return getProbabilityID(probe);
        }
        if (probeWordID < nthWordID) {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    return -1;
}
/**
 * Finds or loads the NGram probability of the given NGram.
 *
 * The buffer for the sequence's history ({@code getOldest()}) is taken
 * from the per-order cache {@code loadedNGramBuffers[size - 1]}; on a
 * cache miss it is fetched through {@code getNGramBuffer} and, if one is
 * returned, stored in the cache. The buffer is then searched for the ID
 * of the sequence's last word.
 *
 * @param wordSequence
 *            the NGram to load
 * @return a NGramProbability of the given NGram, or {@code null} when no
 *         buffer is available for the history or the buffer search
 *         yields nothing
 */
private NGramProbability findNGram(WordSequence wordSequence) {
    final int order = wordSequence.size();
    final WordSequence history = wordSequence.getOldest();

    NGramBuffer followers = loadedNGramBuffers[order - 1].get(history);
    if (followers == null) {
        followers = getNGramBuffer(history);
        if (followers == null) {
            return null;
        }
        // Remember the freshly loaded buffer for later lookups.
        loadedNGramBuffers[order - 1].put(history, followers);
    }

    final int lastWordID = getWordID(wordSequence.getWord(order - 1));
    return followers.findNGram(lastWordID);
}
/** * Returns the NGramProbability of the nth follower. * * @param nthFollower which follower * @return the NGramProbability of the nth follower */ public int getProbabilityID(int nthFollower) { int nthPosition = 0; nthPosition = nthFollower * LargeNGramModel.BYTES_PER_NGRAM * ((is32bits) ? 4 : 2); setPosition(nthPosition + ((is32bits) ? 4 : 2)); // to skip the word ID return readBytesAsInt(); }
/**
 * Binary-searches this buffer for the position of the entry whose word
 * ID equals {@code nthWordID}.
 *
 * The search window is [0, getNumberNGrams() - 1) with an exclusive
 * upper bound. When no compared entry matches, the value returned is
 * simply the last midpoint that was examined (or -1 if the loop never
 * ran), so callers cannot distinguish a hit from a miss by the result
 * alone.
 *
 * @param nthWordID the ID of the nth word
 * @return the index of the matching entry, or the last probed index /
 *         -1 as described above
 */
public int findNGramIndex(int nthWordID) {
    int low = 0;
    int high = getNumberNGrams() - 1;
    int probe = -1;
    while (low < high) {
        probe = (low + high) / 2;
        final int probeWordID = getWordID(probe);
        if (probeWordID == nthWordID) {
            return probe;
        }
        if (probeWordID < nthWordID) {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    return probe;
}