/**
 * CLI entry point: trains a CPLSA model on a corpus and writes the model out.
 * Usage errors are reported by {@code opts.verify}, after which we simply return.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    // Option order matters: CLIOpts appears to consume positional arguments in call order.
    final File corpus = opts.roFile("corpus[.gz|bz2]", "The corpus file");
    final double alpha = opts.doubleValue("alpha", -1, "The alpha parameter");
    final double beta = opts.doubleValue("beta", 0.01, "The beta parameter");
    final int W = opts.intValue("W", "The number of distinct tokens");
    final int J = opts.intValue("J", "The number of documents (per language)");
    final int K = opts.intValue("K", "The number of topics");
    final int N = opts.intValue("N", "The number of iterations to perform");
    final File outFile = opts.woFile("output", "The file to write the SVD to");
    if (!opts.verify(CPLSATrain.class)) {
        return;
    }
    // -1 is the "not supplied" sentinel: default alpha to 2/K.
    final double resolvedAlpha = (alpha == -1.0) ? 2.0 / K : alpha;
    if (resolvedAlpha < 0 || beta < 0) {
        throw new IllegalArgumentException("Alpha and beta cannot be negative");
    }
    System.err.println("Preparing corpus");
    final CPLSATrain train = new CPLSATrain(corpus, J, W, K, resolvedAlpha, beta);
    train.solve(N, 1e-12, true);
    System.err.println("Writing model");
    train.writeModel(CLIOpts.openOutputAsMaybeZipped(outFile));
}
/**
 * Returns an iterator over the ints stored in {@code file}, transparently
 * un-zipping if needed (via {@code CLIOpts.openInputAsMaybeZipped}).
 *
 * @throws java.io.UncheckedIOException if the file cannot be opened
 */
@Override
public IntIterator iterator() {
    try {
        return new DataInputStreamAsIntIterator(CLIOpts.openInputAsMaybeZipped(file));
    } catch (IOException x) {
        // UncheckedIOException is the idiomatic wrapper for IOException in an
        // Iterable; it preserves the cause and, being a RuntimeException
        // subclass, remains compatible with existing catch clauses.
        throw new java.io.UncheckedIOException(x);
    }
}
/**
 * CLI entry point: parses model-compilation options and delegates to
 * {@code compile(...)}. Option-declaration order is significant and preserved.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    // Assemble the help text listing every available BetaLM method.
    final StringBuilder betaHelp = new StringBuilder("The BetaLM method: ");
    for (BetaLMImpl.Method m : BetaLMImpl.Method.values()) {
        betaHelp.append(m.name()).append(" ");
    }
    final BetaLMImpl.Method betaMethod = opts.enumOptional("b", BetaLMImpl.Method.class, null, betaHelp.toString());
    final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST, "The type of source: SIMPLE, FIRST or SECOND");
    final Smoothing smoothing = opts.enumOptional("smooth", Smoothing.class, Smoothing.NONE, "The type of smoothing: NONE, ADD_ALPHA, GOOD_TURING, KNESER_NEY");
    final File queryFile = opts.roFile("f", "The query file (ontology)", null);
    final double smoothness = opts.doubleValue("s", 1.0, "The selective smoothing parameter");
    final double alpha = opts.doubleValue("a", 0.0, "The minimal smoothing parameter");
    final int salience = opts.intValue("salience", "The salience (filtering on query document)", -1);
    final int stopWordCount = opts.intValue("stop", "The number of stop words to ignore", 150);
    final boolean writeDocs = opts.flag("writeDocs", "Write documents in corpus with ranking");
    final File inFile = opts.roFile("corpus[.gz|.bz2]", "The corpus file");
    final int N = opts.nonNegIntValue("N", "The largest n-gram to calculate");
    final File wordMapFile = opts.roFile("wordMap", "The word map file");
    final File freqFile = opts.roFile("freqs", "The frequency file for the corpus");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(CompileModel.class)) {
        return;
    }
    compile(wordMapFile, freqFile, stopWordCount, betaMethod, inFile, sourceType,
            queryFile, smoothness, salience, alpha, N, writeDocs, out);
}
/**
 * CLI entry point: reads a history list and prints the histogram via
 * {@code hist(...)}.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File histFile = opts.roFile("history", "The history list");
    final boolean future = opts.flag("future", "Future history format");
    final PrintStream out = opts.outFileOrStdout();
    // Bail out (usage already printed) when the arguments do not validate.
    if (!opts.verify(Hist.class)) {
        return;
    }
    hist(histFile, future, out);
}
/**
 * CLI entry point for corpus n-gram counting.
 *
 * NOTE(review): the actual counting call is commented out below, so as written
 * this only creates the (empty) output writers. The writers are never closed
 * here — presumably doCount was expected to flush/close them; confirm before
 * re-enabling.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST, "The type of source: SIMPLE, FIRST or SECOND");
    final File corpus = opts.roFile("corpus[.gz|.bz2]", "The corpus");
    final int N = opts.nonNegIntValue("N", "The largest n-gram to count for");
    final File out = opts.woFile("out", "The files to write to");
    if (!opts.verify(DoCount.class)) {
        return;
    }
    // Slots [0..N) hold per-order writers; slots [N..2N) hold the ".h" variants.
    // NOTE(review): outs[N] is never assigned (the i == 0 case is skipped), so
    // it stays null — verify that downstream code never indexes slot N.
    final PrintWriter[] outs = new PrintWriter[N * 2];
    for (int i = 0; i < N; i++) {
        outs[i] = new PrintWriter(out.getName() + "." + i);
        if (i != 0) {
            outs[i + N] = new PrintWriter(out.getName() + ".h" + i);
        }
    }
    // Disabled counting pass — kept for reference; re-enable deliberately.
    // doCount(corpus, N, outs, new BetaSimFunction() {
    //     @Override
    //     public double score(Vector<Integer> document) {
    //         return 1.0;
    //     }
    // }, sourceType, 0);
}

// Shared fixed-precision formatter for count output (9 decimal places).
private static final DecimalFormat df = new DecimalFormat("0.000000000");
/**
 * CLI entry point: computes the log-2 perplexity of a document under an ARPA
 * language model and reports it on stderr.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File doc = opts.roFile("document", "The document on which to estimate the perplexity");
    final File lmFile = opts.roFile("lm", "The language model file");
    if (!opts.verify(Perplexity.class)) {
        return;
    }
    final ARPALM lm = new ARPALM(lmFile);
    // try-with-resources: the original leaked the Scanner's file handle.
    try (Scanner scanner = new Scanner(doc)) {
        final double perplexity = calculatePerplexity(scanner, lm);
        System.err.println("Log2 Perplexity=" + perplexity);
    }
}

// log10(2): conversion factor between base-10 and base-2 logarithms.
private static final double LOG_10_2 = 0.3010299956639812;
/**
 * CLI entry point: counts uniqueness in a file via {@code uniq(...)}.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File file = opts.roFile("file", "The file to count uniqueness among");
    final PrintStream out = opts.outFileOrStdout();
    // Usage errors are already reported by verify; nothing more to do here.
    if (!opts.verify(Uniq.class)) {
        return;
    }
    uniq(file, out);
}
/**
 * CLI entry point: trains an LDA topic model and writes it to disk.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final double alphaOpt = opts.doubleValue("alpha", -1, "The alpha parameter");
    final double beta = opts.doubleValue("beta", 0.01, "The beta parameter");
    final File corpus = opts.roFile("corpus[.gz|.bz2]", "The corpus file");
    final int W = opts.intValue("W", "The number of distinct tokens");
    final int J = opts.intValue("J", "The number of documents (per language)");
    final int K = opts.intValue("K", "The number of topics");
    final int N = opts.intValue("N", "The number of iterations");
    final File outFile = opts.woFile("model[.gz|.bz2]", "The file to write the model to");
    if (!opts.verify(LDATrain.class)) {
        return;
    }
    // -1 is the "unset" sentinel; the default alpha scales with topic count.
    final double alpha = (alphaOpt == -1.0) ? 2.0 / K : alphaOpt;
    if (alpha < 0 || beta < 0) {
        throw new IllegalArgumentException("Alpha and beta cannot be negative");
    }
    final LDATrain ldaTrain = new LDATrain(corpus, K, J, W, alpha, beta);
    ldaTrain.train(N);
    ldaTrain.writeModel(outFile);
}
}
/**
 * CLI entry point: reads a count file, computes the count-of-counts vector and
 * the mean, and prints both.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File countedFile = opts.roFile("countFile", "The file with counts");
    final int H = opts.nonNegIntValue("H", "The maximum count of count to store");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(COCAndMean.class)) {
        return;
    }
    // try-with-resources: the original never closed the Scanner.
    try (Scanner in = new Scanner(countedFile)) {
        final Data data = calculate(in, H);
        out.println(Arrays.toString(data.CoC));
        out.println(data.mean);
    }
}
/**
 * Copies the ints of {@code corpus} into a temp file, interleaving a zero
 * after each value (an empty assignment slot), and wraps the result in an
 * {@link AssignmentBuffer}.
 *
 * @param corpus the (possibly zipped) corpus of ints
 * @return a buffer backed by the interleaved temp file
 * @throws IOException if the corpus cannot be read or the temp file written
 */
public static AssignmentBuffer interleavedFrom(File corpus) throws IOException {
    final File tmpFile = File.createTempFile("assign", ".buf");
    tmpFile.deleteOnExit();
    // try-with-resources: the original leaked both streams on an I/O error.
    try (DataInputStream data = new DataInputStream(CLIOpts.openInputAsMaybeZipped(corpus));
         DataOutputStream out = new DataOutputStream(new FileOutputStream(tmpFile))) {
        // Detect end-of-stream via EOFException alone, the pattern every other
        // corpus reader in this code base uses. The original also gated on
        // available() > 0, which is unreliable for compressed streams and
        // could truncate the copy early.
        while (true) {
            try {
                final int i = data.readInt();
                out.writeInt(i);
                out.writeInt(0); // empty assignment slot for this token
            } catch (EOFException x) {
                break;
            }
        }
        out.flush();
    }
    // The channel is deliberately left open: ownership passes to the buffer.
    return new AssignmentBuffer(new RandomAccessFile(tmpFile, "rw").getChannel(), 4194304, tmpFile.length());
}
/**
 * CLI entry point: computes salient n-grams against a reference ontology and
 * writes those above the threshold to a binary list
 * (record format: length, token ints, salience double).
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST, "The corpus type");
    final File refFile = opts.roFile("reference", "The reference ontology");
    final File corpusFile = opts.roFile("corpus", "The corpus file");
    final File wordMapFile = opts.roFile("wordMap", "The word map");
    final int N = opts.intValue("N", "The maximal n-gram to consider");
    final double thresh = opts.doubleValue("threshold", "The threshold of salience to filter at");
    final File outFile = opts.woFile("out", "The file to write the salient n-gram list to");
    if (!opts.verify(MostSalient.class)) {
        return;
    }
    final int W = WordMap.calcW(wordMapFile);
    final String[] wordMap = WordMap.inverseFromFile(wordMapFile, W, true);
    final Object2DoubleMap<NGram> salientNGrams = mostSalientNGrams(refFile, corpusFile, N, sourceType);
    final DataOutputStream out = new DataOutputStream(CLIOpts.openOutputAsMaybeZipped(outFile));
    for (Object2DoubleMap.Entry<NGram> entry : salientNGrams.object2DoubleEntrySet()) {
        final double score = entry.getDoubleValue();
        if (score <= thresh) {
            continue; // below threshold: skip (guard clause)
        }
        final int[] tokens = entry.getKey().ngram;
        out.writeInt(tokens.length);
        for (final int token : tokens) {
            out.writeInt(token);
        }
        out.writeDouble(score);
    }
    out.flush();
    out.close();
}
}
/**
 * CLI entry point: filters a corpus by a salient n-gram list.
 *
 * NOTE(review): this block appears truncated/garbled in this view — the brace
 * opened by the verify guard is never closed, making the statements after
 * {@code return;} unreachable as written. Confirm against the original file
 * before editing.
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File corpusFile = opts.roFile("corpus", "The corpus");
    final File salientFile = opts.roFile("salientList", "The salient list");
    final File wordMapFile = opts.roFile("wordMap", "The wordmap");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(FilterBySalientList.class)) {
        return;
    // NOTE(review): truncation point — closing brace missing from here on.
    final DataInputStream in = new DataInputStream(CLIOpts.openInputAsMaybeZipped(salientFile));
    final Object2DoubleMap<NGram> ngrams = new Object2DoubleRBTreeMap<NGram>();
    int N = 0;
/**
 * Loads a salience metric from a binary file of records
 * (length n, n token ints, salience double), reading until end-of-file.
 *
 * @param salienceFile the (possibly zipped) salience list
 * @return the metric over all n-grams read; N is the largest n-gram order seen
 * @throws IOException if the file cannot be opened or read
 */
public static SalienceMetric fromFile(final File salienceFile) throws IOException {
    final Object2DoubleMap<NGram> ngrams = new Object2DoubleRBTreeMap<NGram>();
    int N = 0;
    // try-with-resources: the original only closed the stream on the success
    // path, leaking it when readInt/readDouble threw a non-EOF IOException.
    try (DataInputStream in = new DataInputStream(CLIOpts.openInputAsMaybeZipped(salienceFile))) {
        while (true) {
            try {
                final int n = in.readInt();
                N = Math.max(N, n);
                final int[] ng = new int[n];
                for (int i = 0; i < n; i++) {
                    ng[i] = in.readInt();
                }
                ngrams.put(new NGram(ng), in.readDouble());
            } catch (EOFException x) {
                // EOF is the normal termination condition for this format.
                break;
            }
        }
    }
    return new SalienceMetric(ngrams, N);
}
/**
 * CLI entry point for an alpha/sigma grid search.
 *
 * NOTE(review): this block is truncated in this view — it ends inside the
 * verify guard; the remainder of the method is not visible. Also note the typo
 * "largetst" in the N option help text (left untouched here because it is a
 * runtime string).
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File inFile = opts.roFile("trainFile", "The training corpus");
    final File queryFile = opts.roFile("queryFile", "The file to adapt to");
    final File freqFile = opts.roFile("freqFile", "The frequency file");
    final File wordMapFile = opts.roFile("wordMap", "The word map");
    final int N = opts.nonNegIntValue("N", "The largetst n-gram to consider");
    final File testDoc = opts.roFile("test.txt", "The test document to evaluate on");
    if (!opts.verify(AlphaSigmaGrid.class)) {
        return;
/**
 * Computes the running mean of {@code function.scoreNGrams} over the documents
 * of a corpus. Documents are delimited by a 0 token; which documents are
 * scored depends on {@code sourceType} (FIRST = odd-indexed, SECOND =
 * even-indexed, SIMPLE = all — as the parity tests below show).
 *
 * @param function   the similarity function to average
 * @param corpus     the (possibly zipped) int-encoded corpus
 * @param sourceType which documents of each pair to score
 * @param W          vocabulary size, forwarded to the scorer
 * @return the mean score over the selected documents (0.0 for an empty corpus)
 * @throws IOException if the corpus cannot be read
 */
public static double calcBetaMean(BetaSimFunction function, File corpus, SourceType sourceType, int W) throws IOException {
    double mean = 0.0;
    int docsRead = 0;
    final IntArrayList doc = new IntArrayList();
    // try-with-resources: the original never closed the stream.
    try (DataInputStream in = new DataInputStream(CLIOpts.openInputAsMaybeZipped(corpus))) {
        while (true) {
            try {
                final int w = in.readInt();
                if (w == 0) { // 0 terminates the current document
                    if ((docsRead % 2 == 1 && sourceType == SourceType.FIRST)
                            || (docsRead % 2 == 0 && sourceType == SourceType.SECOND)
                            || sourceType == SourceType.SIMPLE) {
                        final double score = function.scoreNGrams(doc, W);
                        docsRead++;
                        // Incremental mean update: mean_n = score/n + ((n-1)/n)*mean_{n-1}.
                        mean = score / docsRead + ((double) (docsRead - 1) / (double) docsRead) * mean;
                        doc.clear();
                    } else {
                        docsRead++;
                        doc.clear();
                    }
                } else {
                    doc.add(w);
                }
            } catch (EOFException x) {
                break; // normal end of corpus
            }
        }
    }
    return mean;
}
/**
 * Counts n-grams up to order {@code N} in the corpus, weighting by
 * {@code beta} — presumably; the body is cut off in this view, so the exact
 * behavior cannot be confirmed here.
 *
 * NOTE(review): truncated block — the {@code while (true)} body and the
 * method's closing brace are not visible.
 */
public static void doCount(final File corpus, final int N, final PrintWriter[] outs, BetaSimFunction beta, SourceType sourceType, int W) throws IOException {
    int read = 0, docNo = 0;
    final DataInputStream in = new DataInputStream(CLIOpts.openInputAsMaybeZipped(corpus));
    // Two buffers, apparently one per language side of a document pair — TODO confirm.
    final IntArrayList[] doc = {new IntArrayList(), new IntArrayList()};
    while (true) {
/**
 * Filters corpus documents by their salience against {@code ngrams}.
 *
 * NOTE(review): truncated block — only the local declarations are visible;
 * the reading loop and closing brace are cut off in this view.
 */
private static void filterBySalients(File corpusFile, Object2DoubleMap<NGram> ngrams, PrintStream out, String[] wordMap, int N, SourceType sourceType) throws IOException {
    // Paired buffers and salience accumulators, presumably one per language
    // side of a document pair — TODO confirm against the full method body.
    final IntList doc[] = {new IntList(), new IntList()};
    final double[] salience = {0.0, 0.0};
    final DataInputStream in = new DataInputStream(CLIOpts.openInputAsMaybeZipped(corpusFile));
// NOTE(review): this span begins and ends mid-method (an if/else over how the
// query is loaded); the enclosing definition is not visible, so the code is
// left byte-identical and only annotated.
// Branch 1 (apparently when salience filtering is requested — confirm the
// missing condition above): load the query, compute the salient token set,
// and restrict the query vector to it.
final SparseIntArray binQuery2 = SparseIntArray.fromBinary(CLIOpts.openInputAsMaybeZipped(queryFile), Integer.MAX_VALUE);
final IntSet salient = MostSalient.mostSalient(queryFile, inFile, W, salience, sourceType);
DoCount.queryFile = salient; // NOTE(review): mutates DoCount's static state — side effect to confirm
binQuery = MostSalient.filter(salient, binQuery2);
} else {
    // Branch 2: load the query vector unfiltered.
    binQuery = SparseIntArray.fromBinary(CLIOpts.openInputAsMaybeZipped(queryFile), Integer.MAX_VALUE);
    DoCount.queryFile = readQueryDoc(queryFile);
public static Object2DoubleMap<NGram> countInReference(final File reference, final int N) throws IOException { final Object2DoubleMap<NGram> counts = new Object2DoubleRBTreeMap<NGram>(); final DataInputStream in = new DataInputStream(CLIOpts.openInputAsMaybeZipped(reference)); final NGramCarousel carousel = new NGramCarousel(N); int docNo = 0; while (true) { try { final int w = in.readInt(); if (w != 0) { //if (inDoc(docNo, sourceType)) { carousel.offer(w); for (int n = 1; n <= carousel.maxNGram(); n++) { final NGram ngram = carousel.ngram(n); if (counts.containsKey(ngram)) { counts.put(ngram, counts.getDouble(ngram) + 1.0); } else { counts.put(ngram, 1.0); } } //} } else { carousel.reset(); docNo++; } } catch (EOFException x) { break; } } return counts; }
/**
 * Counts corpus occurrences of the n-grams in {@code referenceNGrams} —
 * presumably restricted to documents selected by {@code sourceType}; the body
 * is cut off in this view, so this cannot be confirmed here.
 *
 * NOTE(review): truncated block — only the local declarations are visible;
 * the reading loop and closing brace are missing from this view.
 */
public static Object2DoubleMap<NGram> countInCorpus(final File reference, final int N, final ObjectSet<NGram> referenceNGrams, SourceType sourceType) throws IOException {
    final Object2DoubleMap<NGram> counts = new Object2DoubleRBTreeMap<NGram>();
    final DataInputStream in = new DataInputStream(CLIOpts.openInputAsMaybeZipped(reference));
    final NGramCarousel carousel = new NGramCarousel(N);
    int docNo = 0;