/**
 * Reads a Google n-gram directory at {@code path} and serializes the resulting
 * language model to {@code outFile} as a berkeleylm binary.
 *
 * @param path    directory containing Google-format n-gram count files
 * @param outFile destination path for the binary LM
 */
private static void run(String path, String outFile) {
    // Progress tracking goes to the console for command-line use.
    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));

    Logger.startTrack("Reading Lm File " + path + " . . . ");
    final NgramLanguageModel<String> lm = LmReaders.readLmFromGoogleNgramDir(path, true);
    Logger.endTrack();

    Logger.startTrack("Writing to file " + outFile + " . . . ");
    LmReaders.writeLmBinary(lm, outFile);
    Logger.endTrack();
}
}
/** Closes the current logging track on the global logger instance. */
public static void endTrack() {
    i().endTrack();
}
// NOTE(review): fragment — braces here are unbalanced; the try/while opened below is
// closed outside this excerpt. Reads n-gram lines from `reader`, logging progress every
// 100k lines. When an order's section ends, it logs the count, closes the track,
// notifies the callback, and advances to the next order; returns early once
// currentNGramLength exceeds maxOrder. The trailing endTrack/handleNgramOrderFinished
// pair presumably belongs to a later section of the same method — confirm against the
// full source.
Logger.startTrack("Reading 1-grams"); try { while (reader.ready()) { if (currLine % 100000 == 0) Logger.logs("Read " + currLine + " lines"); currLine++; final String line = reader.readLine(); Logger.logs(currentNGramCount + " " + currentNGramLength + "-gram read."); Logger.endTrack(); callback.handleNgramOrderFinished(currentNGramLength); currentNGramLength++; if (currentNGramLength > maxOrder) return; currentNGramCount = 0; Logger.startTrack("Reading " + currentNGramLength + "-grams"); Logger.endTrack(); callback.handleNgramOrderFinished(currentNGramLength);
/** Finishes value collection and reports how many distinct counts were seen. */
@Override
public void cleanup() {
    Logger.startTrack("Cleaning up values");
    final int uniqueCounts = valueCounter.size();
    Logger.logss("Found " + uniqueCounts + " unique counts");
    Logger.endTrack();
}
/**
 * Streams the raw-text lines in {@code lineIterator} through the n-gram counter.
 *
 * @param callback receives each counted n-gram
 */
private void readFromFiles(final LmReaderCallback<LongRef> callback) {
    Logger.startTrack("Reading in ngrams from raw text");
    countNgrams(lineIterator, callback);
    Logger.endTrack();
}
// NOTE(review): fragment — braces are unbalanced and the same for-loop header over
// ngrams.getNgramsForOrder(ngramOrder) appears twice with the same variable name
// `entry`, so intermediate code has been elided. Intent appears to be: count entries
// for the current order, then iterate again to emit Kneser-Ney probabilities, skipping
// high-order entries whose token count is below opts.kneserNeyMinCounts[ngramOrder].
// Confirm structure against the full source before editing.
@Override public void parse(ArpaLmReaderCallback<ProbBackoffPair> callback) { Logger.startTrack("Writing Kneser-Ney probabilities"); Logger.startTrack("Counting counts for order " + ngramOrder); long numNgrams = 0; //ngrams.getNumNgrams(ngramOrder); for (final Entry<KneserNeyCounts> entry : ngrams.getNgramsForOrder(ngramOrder)) { Logger.endTrack(); callback.handleNgramOrderStarted(ngramOrder + 1); Logger.logss("On order " + (ngramOrder + 1)); int linenum = 0; for (final Entry<KneserNeyCounts> entry : ngrams.getNgramsForOrder(ngramOrder)) { if (linenum++ % 10000 == 0) Logger.logs("Writing line " + linenum); final long relevantCount = entry.value.tokenCounts; if (ngramOrder >= lmOrder - 2 && relevantCount < opts.kneserNeyMinCounts[ngramOrder]) continue; Logger.endTrack();
// NOTE(review): fragment — unbalanced braces; the `else if (ngramFiles.length == 0)`
// branch appears to warn and fall through into the file-reading loop, which strongly
// suggests intervening code was elided rather than that the warning and the loop share
// a branch. Reads each matching n-gram file line by line with progress logging every
// 10k lines, then closes both tracks and reports the finished order via the callback.
// Confirm brace structure against the full source.
addToIndexer(wordIndexer, sortedVocabPath); } else if (ngramFiles.length == 0) { Logger.warn("Did not find any files matching expected regex " + regex); Logger.startTrack("Reading ngrams of order " + (ngramOrder_ + 1)); for (final File ngramFile_ : ngramFiles) { final File ngramFile = ngramFile_; Logger.startTrack("Reading ngrams from file " + ngramFile); try { int k = 0; for (String line : Iterators.able(IOUtils.lineIterator(ngramFile.getPath()))) { if (k % 10000 == 0) Logger.logs("Line " + k); k++; line = line.trim(); Logger.endTrack(); Logger.endTrack(); callback.handleNgramOrderFinished(++ngramOrder);
/**
 * Emits the ARPA {@code \data\} header listing the n-gram count for each order.
 *
 * @param numNGrams per-order n-gram counts, element 0 holding the 1-gram count
 */
@Override
public void initWithLengths(List<Long> numNGrams) {
    Logger.startTrack("Writing ARPA");
    out.println();
    out.println("\\data\\");
    int order = 1;
    for (final long count : numNGrams) {
        out.println("ngram " + order + "=" + count);
        order++;
    }
    out.println();
}
/**
 * Total number of n-grams across all orders.
 *
 * @return the summed size of every per-order map, as a {@code long}
 */
public long size() {
    long size = 0;
    for (NgramsForOrderIterableWrapper<W, V> map : ngramsForOrder) {
        size += map.size();
    }
    // Warn for callers that still treat the result as an int (e.g. via Map.size()).
    if (size > Integer.MAX_VALUE)
        Logger.warn(NgramMapWrapper.class.getSimpleName() + " doesn't like maps with size greater than Integer.MAX_VALUE");
    // Fix: previously returned "(int) size", silently truncating to a wrong (possibly
    // negative) value for maps above 2^31 entries even though the declared return type
    // is long.
    return size;
}
}
/**
 * Command line: {@code [-g] [vocabFile] binaryFile [textFile ...]}. With {@code -g} the
 * model is read as a Google binary and a vocab file is required. Remaining arguments
 * are text files to score ("-" for stdin when none are given). The log probability is
 * printed to stdout; all other output goes to stderr.
 */
public static void main(final String[] argv) throws FileNotFoundException, IOException {
    int pos = 0;
    if (pos >= argv.length) usage();
    boolean isGoogleBinary = false;
    if (argv[pos].equals("-g")) {
        isGoogleBinary = true;
        pos++;
    }
    if (pos >= argv.length) usage();
    String vocabFile = null;
    if (isGoogleBinary) {
        vocabFile = argv[pos++];
    }
    if (pos >= argv.length) usage();
    final String binaryFile = argv[pos++];
    List<String> files = Arrays.asList(Arrays.copyOfRange(argv, pos, argv.length));
    if (files.isEmpty()) files = Collections.singletonList("-");
    // Route all logging to stderr so stdout carries only the probability value.
    Logger.setGlobalLogger(new Logger.SystemLogger(System.err, System.err));
    final NgramLanguageModel<String> lm = readBinary(isGoogleBinary, vocabFile, binaryFile);
    final double prob = computeProb(files, lm);
    System.err.print("Log probability of text is: ");
    System.out.println(prob);
}
/**
 * Logs compression statistics for one chunk and updates the running totals.
 *
 * @param uncompressedSize    number of entries before compression
 * @param compressedLongArray compressed representation (64 bits per element)
 * @param keyBits             bits spent on keys for this chunk
 * @param valueBits           bits spent on values for this chunk
 */
private void logCompressionInfo(final long uncompressedSize, final LongArray compressedLongArray, final long keyBits, final long valueBits) {
    final double keyAvg = (double) keyBits / uncompressedSize;
    Logger.logss("Key bits " + keyAvg);
    final double valueAvg = (double) valueBits / uncompressedSize;
    Logger.logss("Value bits " + valueAvg);
    final double avg = 64 * (double) compressedLongArray.size() / uncompressedSize;
    Logger.logss("Compressed bits " + avg);
    totalKeyBitsFinal += keyBits;
    totalValueBitsFinal += valueBits;
    totalBitsFinal += compressedLongArray.size();
    totalSizeFinal += uncompressedSize;
    // Fix: the two lines below previously used integer division, truncating the logged
    // averages; cast to double for consistency with the per-chunk averages and the
    // "Total bits" line.
    Logger.logss("Total key bits " + (double) totalKeyBitsFinal / totalSizeFinal);
    Logger.logss("Total value bits " + (double) totalValueBitsFinal / totalSizeFinal);
    Logger.logss("Total bits " + 64.0 * totalBitsFinal / totalSizeFinal);
}
/**
 * Splits each line on single spaces, brackets it with start/end symbols, and emits
 * every n-gram up to {@code lmOrder} to the callback, finishing with a cleanup call.
 * (Previous javadoc listed parameters not in this signature; corrected here.)
 *
 * @param allLinesIterator raw text, one sentence per line
 * @param callback         receives each n-gram as (sent, startPos, endPos)
 */
private void countNgrams(final Iterable<String> allLinesIterator, final LmReaderCallback<Object> callback) {
    long lineCount = 0;
    for (final String line : allLinesIterator) {
        if (lineCount % 10000 == 0) Logger.logs("On line " + lineCount);
        lineCount++;
        final String[] words = line.split(" ");
        // sent holds indexer ids: [<s>, w_0 .. w_{n-1}, </s>]
        final int[] sent = new int[words.length + 2];
        sent[0] = wordIndexer.getOrAddIndex(wordIndexer.getStartSymbol());
        sent[sent.length - 1] = wordIndexer.getOrAddIndex(wordIndexer.getEndSymbol());
        for (int w = 0; w < words.length; ++w) {
            sent[w + 1] = wordIndexer.getOrAddIndexFromString(words[w]);
        }
        // Emit every n-gram of each order whose left edge stays inside the sentence.
        for (int order = 0; order < lmOrder; ++order) {
            for (int end = 0; end < sent.length; ++end) {
                final int start = end - order;
                if (start < 0) continue;
                callback.call(sent, start, end + 1, null, line);
            }
        }
    }
    callback.cleanup();
}
/**
 * Deserializes a binary language model from disk.
 *
 * @param isGoogleBinary whether {@code binaryFile} is in Google n-gram binary format
 * @param vocabFile      vocabulary file; used only for the Google format
 * @param binaryFile     path to the serialized model
 * @return the loaded language model
 */
private static NgramLanguageModel<String> readBinary(boolean isGoogleBinary, String vocabFile, String binaryFile) {
    final NgramLanguageModel<String> lm;
    if (isGoogleBinary) {
        Logger.startTrack("Reading Google Binary " + binaryFile + " with vocab " + vocabFile);
        lm = LmReaders.readGoogleLmBinary(binaryFile, vocabFile);
    } else {
        Logger.startTrack("Reading LM Binary " + binaryFile);
        lm = LmReaders.readLmBinary(binaryFile);
    }
    Logger.endTrack();
    return lm;
}
/**
 * Builds the value indexer from observed values in decreasing-count order, then drops
 * the counter so only the compact indexer is retained.
 */
@Override
public void cleanup() {
    Logger.startTrack("Cleaning up values");
    valueIndexer = new Indexer<V>();
    for (final Entry<V, Double> entry : valueCounter.getEntriesSortedByDecreasingCount()) {
        valueIndexer.add(entry.getKey());
    }
    Logger.logss("Found " + valueIndexer.size() + " unique counts");
    valueCounter = null; // release the counter; no longer needed
    Logger.endTrack();
}
// NOTE(review): fragment — unbalanced braces; code between the header output and the
// per-order loop has been elided. BUG (cannot fix in this fragment): the progress log
// prints `line`, which is initialized to 0 and never incremented, while `linenum` is
// the counter actually advanced — every progress message reads "Writing line 0".
// Compare the parallel loop elsewhere in this file, which logs linenum.
Logger.startTrack("Writing ARPA"); out.println(); out.println("\\data\\"); out.println("\\" + (ngramOrder + 1) + "-grams:"); int line = 0; Logger.logss("On order " + (ngramOrder + 1)); int linenum = 0; for (Entry<KneserNeyCounts> entry : ngrams.getNgramsForOrder(ngramOrder)) { if (linenum++ % 10000 == 0) Logger.logs("Writing line " + line); if (ngramOrder >= lmOrder - 2 && entry.value.tokenCounts < opts.kneserNeyMinCounts[ngramOrder]) continue; final String ngramString = StrUtils.join(WordIndexer.StaticMethods.toList(wordIndexer, entry.key)); Logger.endTrack();
/**
 * Inserts one n-gram into the backing map, warning at most 10 times when an insert
 * fails (map.put returning a negative value).
 *
 * @param ngram    word ids containing the n-gram
 * @param startPos start index (inclusive) within {@code ngram}
 * @param endPos   end index (exclusive) within {@code ngram}
 * @param v        value to associate with the n-gram
 * @param words    original input line, used only in the warning message
 */
@Override
public void call(final int[] ngram, int startPos, int endPos, final V v, final String words) {
    final long add = map.put(ngram, startPos, endPos, v);
    if (add < 0) {
        // warnCount == -1 marks warnings as permanently disabled.
        if (warnCount >= 0 && warnCount < 10) {
            // Fix: corrected typos in the warning text ("probabcly" -> "probably",
            // "suff" -> "suffix").
            Logger.warn("Could not add line " + words
                + "\nThis is probably because the prefix or suffix of the n-grams was not already in the map. This will be fixed in an upcoming release.");
            warnCount++;
        }
        // Fix: was "warnCount > 10", which is never true because warnCount stops
        // incrementing at 10 — the -1 sentinel was never actually set.
        if (warnCount >= 10) warnCount = -1;
    }
}
/**
 * Logs compression statistics for one chunk and updates the running totals.
 *
 * @param uncompressedSize    number of entries before compression
 * @param compressedLongArray compressed representation (64 bits per element)
 * @param keyBits             bits spent on keys for this chunk
 * @param valueBits           bits spent on values for this chunk
 */
private void logCompressionInfo(final long uncompressedSize, final LongArray compressedLongArray, final long keyBits, final long valueBits) {
    final double keyAvg = (double) keyBits / uncompressedSize;
    Logger.logss("Key bits " + keyAvg);
    final double valueAvg = (double) valueBits / uncompressedSize;
    Logger.logss("Value bits " + valueAvg);
    final double avg = 64 * (double) compressedLongArray.size() / uncompressedSize;
    Logger.logss("Compressed bits " + avg);
    totalKeyBitsFinal += keyBits;
    totalValueBitsFinal += valueBits;
    totalBitsFinal += compressedLongArray.size();
    totalSizeFinal += uncompressedSize;
    // Fix: the two lines below previously used integer division, truncating the logged
    // averages; cast to double for consistency with the per-chunk averages and the
    // "Total bits" line.
    Logger.logss("Total key bits " + (double) totalKeyBitsFinal / totalSizeFinal);
    Logger.logss("Total value bits " + (double) totalValueBitsFinal / totalSizeFinal);
    Logger.logss("Total bits " + 64.0 * totalBitsFinal / totalSizeFinal);
}
// NOTE(review): fragment — three statements from inside a line-reading loop (matches
// the body of countNgrams elsewhere in this file): log progress every 10k lines,
// advance the counter, and split the line on single spaces into words.
if (numLines % 10000 == 0) Logger.logs("On line " + numLines); numLines++; final String[] words = line.split(" ");