/**
 * Sums the log probability that the language model assigns to every line of
 * the given files.
 * <p>
 * Each line is trimmed and split on runs of whitespace, and the resulting
 * word list is scored with {@code lm.getLogProb}. A file name of {@code "-"}
 * reads from standard input; a name ending in {@code ".gz"} is wrapped in a
 * {@link GZIPInputStream}.
 *
 * @param files
 *            names of the files to score ({@code "-"} for standard input)
 * @param lm
 *            language model used to score each line
 * @return the sum of the per-line log probabilities over all files
 * @throws IOException
 *             if a file cannot be read, decompressed or closed
 * @throws FileNotFoundException
 *             if a named file cannot be opened
 */
private static double computeProb(List<String> files, NgramLanguageModel<String> lm) throws IOException, FileNotFoundException {
    double logProb = 0.0;
    for (String file : files) {
        Logger.startTrack("Scoring file " + file + "; current log probability is " + logProb);
        final boolean isStdin = file.equals("-");
        InputStream is = null;
        try {
            if (isStdin) {
                is = System.in;
            } else {
                is = new FileInputStream(file);
                if (file.endsWith(".gz")) {
                    // If this constructor throws, 'is' still refers to the raw file
                    // stream, so the finally block below releases it.
                    is = new GZIPInputStream(is);
                }
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(is)));
            for (String line : Iterators.able(IOUtils.lineIterator(reader))) {
                List<String> words = Arrays.asList(line.trim().split("\\s+"));
                logProb += lm.getLogProb(words);
            }
        } finally {
            // Release the file handle for every file we opened ourselves; standard
            // input is owned by the JVM and must stay open.
            if (!isStdin && is != null) {
                is.close();
            }
        }
        Logger.endTrack();
    }
    return logProb;
}
/**
 * Hands {@code lineIterator} and the given callback to {@code countNgrams},
 * wrapped in a logging track.
 *
 * @param callback
 *            callback forwarded unchanged to {@code countNgrams}
 */
private void readFromFiles(final LmReaderCallback<LongRef> callback) {
    final String trackName = "Reading in ngrams from raw text";
    Logger.startTrack(trackName);
    countNgrams(lineIterator, callback);
    Logger.endTrack();
}
/**
 * Loads a language model from a binary file, logging the read as a track.
 *
 * @param isGoogleBinary
 *            when {@code true} the model is read with
 *            {@code LmReaders.readGoogleLmBinary} (which also needs the vocab
 *            file); otherwise with {@code LmReaders.readLmBinary}
 * @param vocabFile
 *            vocabulary file; only used when {@code isGoogleBinary} is true
 * @param binaryFile
 *            path of the binary to read
 * @return the model returned by the selected reader
 */
private static NgramLanguageModel<String> readBinary(boolean isGoogleBinary, String vocabFile, String binaryFile) {
    final NgramLanguageModel<String> model;
    if (!isGoogleBinary) {
        Logger.startTrack("Reading LM Binary " + binaryFile);
        model = LmReaders.readLmBinary(binaryFile);
    } else {
        Logger.startTrack("Reading Google Binary " + binaryFile + " with vocab " + vocabFile);
        model = LmReaders.readGoogleLmBinary(binaryFile, vocabFile);
    }
    // Both branches open exactly one track, closed here.
    Logger.endTrack();
    return model;
}
/**
 * Reads a language model with {@code LmReaders.readLmFromGoogleNgramDir} and
 * writes it out with {@code LmReaders.writeLmBinary}, logging each phase as a
 * track on a console logger.
 *
 * @param path
 *            location passed to {@code readLmFromGoogleNgramDir}
 * @param outFile
 *            destination passed to {@code writeLmBinary}
 */
private static void run(String path, String outFile) {
    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));

    // Phase 1: load the model.
    final String readTrack = "Reading Lm File " + path + " . . . ";
    Logger.startTrack(readTrack);
    final NgramLanguageModel<String> model = LmReaders.readLmFromGoogleNgramDir(path, true);
    Logger.endTrack();

    // Phase 2: write the binary.
    final String writeTrack = "Writing to file " + outFile + " . . . ";
    Logger.startTrack(writeTrack);
    LmReaders.writeLmBinary(model, outFile);
    Logger.endTrack();
}
}
/**
 * Logs how many entries {@code valueCounter} holds, inside a
 * "Cleaning up values" track.
 */
@Override
public void cleanup() {
    Logger.startTrack("Cleaning up values");
    final String summary = "Found " + valueCounter.size() + " unique counts";
    Logger.logss(summary);
    Logger.endTrack();
}
/**
 * Iterates over the lines of {@code file} and passes them, together with the
 * callback, to {@code countPhrases}. The work is wrapped in a logging track.
 *
 * @param callback
 *            callback forwarded unchanged to {@code countPhrases}
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised inside the try block
 */
private void readFromFiles(final LmReaderCallback<PhraseTableCounts> callback) {
    Logger.startTrack("Reading from file " + file);
    try {
        final Iterable<String> lines = Iterators.able(IOUtils.lineIterator(file));
        countPhrases(lines, callback);
    } catch (final IOException ioFailure) {
        // Surface I/O problems as unchecked, keeping the original cause.
        throw new RuntimeException(ioFailure);
    }
    Logger.endTrack();
}
/**
 * Obtains a line iterable over {@code inputFiles} via {@code getLineIterator}
 * and passes it, with the callback, to {@code countNgrams}. The work is
 * wrapped in a logging track.
 *
 * @param callback
 *            callback forwarded unchanged to {@code countNgrams}
 */
private void readFromFiles(final LmReaderCallback<Object> callback) {
    Logger.startTrack("Reading from files " + inputFiles);
    final Iterable<String> lines = getLineIterator(inputFiles);
    countNgrams(lines, callback);
    Logger.endTrack();
}
/**
 * Reads {@code file} line by line and feeds the lines to {@code countPhrases}
 * along with the supplied callback, inside a logging track.
 *
 * @param callback
 *            callback handed on to {@code countPhrases}
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised inside the try block
 */
private void readFromFiles(final LmReaderCallback<PhraseTableCounts> callback) {
    final String trackName = "Reading from file " + file;
    Logger.startTrack(trackName);
    try {
        final Iterable<String> fileLines = Iterators.able(IOUtils.lineIterator(file));
        countPhrases(fileLines, callback);
    } catch (final IOException cause) {
        // Rethrow unchecked; the IOException is preserved as the cause.
        throw new RuntimeException(cause);
    }
    Logger.endTrack();
}
/**
 * First pass over the file collects some statistics which help with memory
 * allocation. Creates a {@code FirstPassCallback}, lets the reader parse into
 * it, and returns it.
 *
 * @param <V>
 *            value type handled by the reader and the callback
 * @param arpaLmReader
 *            reader whose {@code parse} method is driven with the new callback
 * @param reverse
 *            flag passed to the {@code FirstPassCallback} constructor
 * @return the callback after the reader has parsed into it
 */
private static <V extends Comparable<V>> FirstPassCallback<V> firstPassCommon(final LmReader<V, ? super FirstPassCallback<V>> arpaLmReader, final boolean reverse) {
    Logger.startTrack("Pass 1 of 2");
    final FirstPassCallback<V> firstPass = new FirstPassCallback<V>(reverse);
    arpaLmReader.parse(firstPass);
    Logger.endTrack();
    return firstPass;
}
/**
 * Command-line entry point: reads a language model from the location in
 * {@code argv[0]} and writes it as a binary to the file in {@code argv[1]}.
 * {@code usage()} is invoked unless exactly two arguments are given.
 *
 * @param argv
 *            input location followed by output file
 */
public static void main(final String[] argv) {
    if (argv.length != 2) {
        usage();
    }
    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));

    final String googleDir = argv[0];
    Logger.startTrack("Reading Lm File " + googleDir + " . . . ");
    final NgramLanguageModel<String> model = LmReaders.readLmFromGoogleNgramDir(googleDir, true, false);
    Logger.endTrack();

    final String outFile = argv[1];
    Logger.startTrack("Writing to file " + outFile + " . . . ");
    LmReaders.writeLmBinary(model, outFile);
    Logger.endTrack();
}
}
/**
 * First pass over the file collects some statistics which help with memory
 * allocation. Creates a {@code FirstPassCallback}, lets the reader parse into
 * it, and returns it.
 *
 * @param <V>
 *            value type handled by the reader and the callback
 * @param arpaLmReader
 *            reader whose {@code parse} method is driven with the new callback
 * @param reverse
 *            flag passed to the {@code FirstPassCallback} constructor
 * @return the callback after the reader has parsed into it
 */
private static <V extends LongRepresentable<V>> FirstPassCallback<V> firstPassCommon(final LmReader<V, ? super FirstPassCallback<V>> arpaLmReader, final boolean reverse) {
    Logger.startTrack("Counting values");
    final FirstPassCallback<V> countingCallback = new FirstPassCallback<V>(reverse);
    arpaLmReader.parse(countingCallback);
    Logger.endTrack();
    return countingCallback;
}
/**
 * Command-line entry point: reads a language model from the location in
 * {@code argv[0]} and writes it as a binary to the file in {@code argv[1]}.
 * {@code usage()} is invoked unless exactly two arguments are given.
 *
 * @param argv
 *            input location followed by output file
 */
public static void main(final String[] argv) {
    if (argv.length != 2) {
        usage();
    }
    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));

    final String googleDir = argv[0];
    Logger.startTrack("Reading Lm File " + googleDir + " . . . ");
    final NgramLanguageModel<String> model = LmReaders.readLmFromGoogleNgramDir(googleDir, true);
    Logger.endTrack();

    final String outFile = argv[1];
    Logger.startTrack("Writing to file " + outFile + " . . . ");
    LmReaders.writeLmBinary(model, outFile);
    Logger.endTrack();
}
}
/**
 * Command-line entry point: reads a language model from the location in
 * {@code argv[0]} and serializes its n-gram map to the file in
 * {@code argv[1]}. {@code usage()} is invoked unless exactly two arguments
 * are given.
 *
 * @param argv
 *            input location followed by output file
 */
public static void main(final String[] argv) {
    if (argv.length != 2) {
        usage();
    }
    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
    Logger.startTrack("Reading Lm File " + argv[0] + " . . . ");
    // The input is the first argument, matching the log message above. Reading
    // from argv[1] would use the output path as the input.
    final String lmFile = argv[0];
    final StupidBackoffLm<String> lm = (StupidBackoffLm<String>) LmReaders.readLmFromGoogleNgramDir(lmFile, true, false);
    Logger.endTrack();
    final String outFile = argv[1];
    Logger.startTrack("Writing to file " + outFile + " . . . ");
    IOUtils.writeObjFileHard(outFile, lm.getNgramMap());
    Logger.endTrack();
}
}
/**
 * Command-line entry point: reads a language model from the location in
 * {@code argv[0]} and serializes its n-gram map to the file in
 * {@code argv[1]}. {@code usage()} is invoked unless exactly two arguments
 * are given.
 *
 * @param argv
 *            input location followed by output file
 */
public static void main(final String[] argv) {
    if (argv.length != 2) {
        usage();
    }
    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
    Logger.startTrack("Reading Lm File " + argv[0] + " . . . ");
    // The input is the first argument, matching the log message above. Reading
    // from argv[1] would use the output path as the input.
    final String lmFile = argv[0];
    final StupidBackoffLm<String> lm = LmReaders.readLmFromGoogleNgramDir(lmFile, true);
    Logger.endTrack();
    final String outFile = argv[1];
    Logger.startTrack("Writing to file " + outFile + " . . . ");
    IOUtils.writeObjFileHard(outFile, lm.getNgramMap());
    Logger.endTrack();
}
}
/**
 * Replaces {@code valueIndexer} with a fresh indexer filled with the keys of
 * {@code valueCounter}, visited in the order given by
 * {@code getEntriesSortedByDecreasingCount()}. Logs the resulting size and
 * then drops the reference to {@code valueCounter}.
 */
@Override
public void cleanup() {
    Logger.startTrack("Cleaning up values");

    valueIndexer = new Indexer<V>();
    for (final Entry<V, Double> countedValue : valueCounter.getEntriesSortedByDecreasingCount()) {
        final V value = countedValue.getKey();
        valueIndexer.add(value);
    }

    final String summary = "Found " + valueIndexer.size() + " unique counts";
    Logger.logss(summary);

    // The counter is not used again in this method; clear the reference.
    valueCounter = null;
    Logger.endTrack();
}
/**
 * Command-line entry point: builds a Kneser-Ney language model from text
 * files via {@code LmReaders.createKneserNeyLmFromTextFiles}.
 * <p>
 * {@code argv[0]} is the model order, {@code argv[1]} the output file, and
 * any further arguments are input files. When no input file is given the
 * single entry {@code "-"} is used instead. {@code usage()} is invoked when
 * fewer than two arguments are supplied.
 *
 * @param argv
 *            order, output file, then zero or more input files
 */
public static void main(final String[] argv) {
    if (argv.length < 2) {
        usage();
    }
    final int lmOrder = Integer.parseInt(argv[0]);
    final String outputFile = argv[1];

    // Everything after the first two arguments is an input file.
    final List<String> inputFiles = new ArrayList<String>();
    int argIndex = 2;
    while (argIndex < argv.length) {
        inputFiles.add(argv[argIndex]);
        argIndex++;
    }
    if (inputFiles.isEmpty()) {
        inputFiles.add("-");
    }

    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
    Logger.startTrack("Reading text files " + inputFiles + " and writing to file " + outputFile);

    final StringWordIndexer indexer = new StringWordIndexer();
    indexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
    indexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
    indexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);

    LmReaders.createKneserNeyLmFromTextFiles(inputFiles, indexer, lmOrder, new File(outputFile), new ConfigOptions());
    Logger.endTrack();
}
/**
 * Command-line entry point: builds a Kneser-Ney language model from text
 * files via {@code LmReaders.createKneserNeyLmFromTextFiles}.
 * <p>
 * {@code argv[0]} is the model order, {@code argv[1]} the output file, and
 * every further argument is an input file. {@code usage()} is invoked when
 * fewer than three arguments are supplied.
 *
 * @param argv
 *            order, output file, then one or more input files
 */
public static void main(final String[] argv) {
    if (argv.length < 3) {
        usage();
    }
    int lmOrder = Integer.parseInt(argv[0]);
    String outputFile = argv[1];

    // Everything after the first two arguments is an input file.
    List<File> inputFiles = new ArrayList<File>();
    int argIndex = 2;
    while (argIndex < argv.length) {
        inputFiles.add(new File(argv[argIndex]));
        argIndex++;
    }

    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
    Logger.startTrack("Reading text files " + inputFiles + " and writing to file " + outputFile);

    final StringWordIndexer indexer = new StringWordIndexer();
    indexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
    indexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
    indexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);

    LmReaders.createKneserNeyLmFromTextFiles(inputFiles, indexer, lmOrder, new File(outputFile), new ConfigOptions());
    Logger.endTrack();
}
/**
 * Loads a language model through {@code LmReaders.readLmFromGoogleNgramDir}
 * and stores it through {@code LmReaders.writeLmBinary}. Each of the two
 * steps is logged as its own track on a console logger.
 *
 * @param path
 *            location passed to {@code readLmFromGoogleNgramDir}
 * @param outFile
 *            destination passed to {@code writeLmBinary}
 */
private static void run(String path, String outFile) {
    Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));

    final String readingMessage = "Reading Lm File " + path + " . . . ";
    Logger.startTrack(readingMessage);
    final NgramLanguageModel<String> loadedLm = LmReaders.readLmFromGoogleNgramDir(path, true);
    Logger.endTrack();

    final String writingMessage = "Writing to file " + outFile + " . . . ";
    Logger.startTrack(writingMessage);
    LmReaders.writeLmBinary(loadedLm, outFile);
    Logger.endTrack();
}
}
public LmValueContainer(final Indexer<V> countIndexer, final int valueRadix, final boolean storePrefixIndexes) { this.valueRadix = valueRadix; valueCoder = new VariableLengthBitCompressor(valueRadix); this.countIndexer = countIndexer; this.storePrefixIndexes = storePrefixIndexes; if (storePrefixIndexes) contextOffsets = new LongArray[6]; valueRanks = new LongArray[6]; countIndexer.getIndex(getDefaultVal()); countIndexer.trim(); countIndexer.lock(); wordWidth = CustomWidthArray.numBitsNeeded(countIndexer.size()); Logger.startTrack("Storing count indices using " + wordWidth + " bits."); storeCounts(); Logger.endTrack(); }
/**
 * Parses the ARPA file and reports its contents to the given callback.
 * <p>
 * Opens {@code file}, reads the header to obtain the n-gram counts, passes
 * them to {@code callback.initWithLengths}, then parses the n-grams. After the
 * logging track is closed the callback's {@code cleanup} is invoked, and the
 * start, end and unknown symbols are set on {@code wordIndexer}.
 *
 * @param callback_
 *            callback that receives the parsed data; stored in
 *            {@code this.callback}
 */
@Override
public void parse(final ArpaLmReaderCallback<ProbBackoffPair> callback_) {
    this.callback = callback_;
    this.reader = IOUtils.openInHard(file);

    Logger.startTrack("Parsing ARPA language model file");
    final List<Long> ngramCounts = parseHeader();
    callback.initWithLengths(ngramCounts);
    parseNGrams();
    Logger.endTrack();

    callback.cleanup();

    // Each special symbol is looked up (or added) by its string form, and the
    // indexer's own word for that index is then set as the symbol.
    wordIndexer.setStartSymbol(
        wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(START_SYMBOL)));
    wordIndexer.setEndSymbol(
        wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(END_SYMBOL)));
    wordIndexer.setUnkSymbol(
        wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(UNK_SYMBOL)));
}