/**
 * Entry point: loads an ARPA language model and a document, computes the
 * document's log-2 perplexity, and prints it to stderr.
 *
 * @param args positional arguments: the document file and the LM file
 * @throws Exception if either file cannot be opened or parsed
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File doc = opts.roFile("document", "The document on which to estimate the perplexity");
    final File lmFile = opts.roFile("lm", "The language model file");
    if (!opts.verify(Perplexity.class)) {
        return;
    }
    final ARPALM lm = new ARPALM(lmFile);
    // FIX: try-with-resources closes the Scanner (and its file handle) even
    // if calculatePerplexity throws; the original never closed it.
    try (final Scanner scanner = new Scanner(doc)) {
        final double perplexity = calculatePerplexity(scanner, lm);
        System.err.println("Log2 Perplexity=" + (perplexity));
    }
}

// log10(2); presumably used elsewhere in this class to convert base-10
// logs to base-2 — TODO confirm (not referenced in this method).
private static final double LOG_10_2 = 0.3010299956639812;
/**
 * Entry point: reads a count file, computes the count-of-counts (up to H)
 * and the mean, and prints both to the chosen output.
 *
 * @param args positional arguments: the count file and H, plus optional output
 * @throws Exception if the count file cannot be read
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File countedFile = opts.roFile("countFile", "The file with counts");
    final int H = opts.nonNegIntValue("H", "The maximum count of count to store");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(COCAndMean.class)) {
        return;
    }
    // FIX: close the Scanner when done; the original leaked the file handle.
    // (out is deliberately left open — it may be System.out.)
    try (final Scanner in = new Scanner(countedFile)) {
        final Data data = calculate(in, H);
        out.println(Arrays.toString(data.CoC));
        out.println(data.mean);
    }
}
// NOTE(review): this snippet appears truncated — the method body is cut off
// after the option declarations (no verify() call, no body, no closing brace
// visible here). Positional options declared: trainFile (training corpus),
// queryFile (document to adapt to), freqFile, wordMap, N (largest n-gram;
// "largetst" typo is in the help string), and test.txt (evaluation document).
// Left byte-identical: rewriting an incomplete definition is unsafe.
public static void main(String[] args) throws Exception { final CLIOpts opts = new CLIOpts(args); final File inFile = opts.roFile("trainFile", "The training corpus"); final File queryFile = opts.roFile("queryFile", "The file to adapt to"); final File freqFile = opts.roFile("freqFile", "The frequency file"); final File wordMapFile = opts.roFile("wordMap", "The word map"); final int N = opts.nonNegIntValue("N", "The largetst n-gram to consider"); final File testDoc = opts.roFile("test.txt", "The test document to evaluate on");
public static void main(String[] args) throws Exception { final CLIOpts opts = new CLIOpts(args); final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST, "The type of source: SIMPLE, FIRST or SECOND"); final File corpus = opts.roFile("corpus[.gz|.bz2]", "The corpus"); final int N = opts.nonNegIntValue("N", "The largest n-gram to count for"); final File out = opts.woFile("out", "The files to write to"); if (!opts.verify(DoCount.class)) { return; } final PrintWriter[] outs = new PrintWriter[N * 2]; for (int i = 0; i < N; i++) { outs[i] = new PrintWriter(out.getName() + "." + i); if (i != 0) { outs[i + N] = new PrintWriter(out.getName() + ".h" + i); } } // doCount(corpus, N, outs, new BetaSimFunction() { // @Override // public double score(Vector<Integer> document) { // return 1.0; // } // }, sourceType, 0); } private static final DecimalFormat df = new DecimalFormat("0.000000000");
/**
 * Entry point: runs uniq(...) over the given file, writing the result to a
 * file or to stdout.
 *
 * @param args positional argument: the input file, plus optional output
 * @throws Exception if the input cannot be read
 */
public static void main(String[] args) throws Exception {
    final CLIOpts options = new CLIOpts(args);
    final File input = options.roFile("file", "The file to count uniqueness among");
    final PrintStream sink = options.outFileOrStdout();
    if (!options.verify(Uniq.class)) {
        return;
    }
    uniq(input, sink);
}
/**
 * Entry point: parses all BetaLM compilation options and delegates to
 * compile(...). Option declarations are kept in their original order, since
 * positional arguments are consumed in declaration order.
 *
 * @param args command-line arguments (flags, options, and positional files)
 * @throws Exception if any input file cannot be read
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    // Build the help text listing every available BetaLM method,
    // each followed by a single space.
    final StringBuilder methodHelp = new StringBuilder("The BetaLM method: ");
    for (BetaLMImpl.Method m : BetaLMImpl.Method.values()) {
        methodHelp.append(m.name() + " ");
    }
    final BetaLMImpl.Method betaMethod = opts.enumOptional("b", BetaLMImpl.Method.class, null, methodHelp.toString());
    final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST, "The type of source: SIMPLE, FIRST or SECOND");
    final Smoothing smoothing = opts.enumOptional("smooth", Smoothing.class, Smoothing.NONE, "The type of smoothing: NONE, ADD_ALPHA, GOOD_TURING, KNESER_NEY");
    final File queryFile = opts.roFile("f", "The query file (ontology)", null);
    final double smoothness = opts.doubleValue("s", 1.0, "The selective smoothing parameter");
    final double alpha = opts.doubleValue("a", 0.0, "The minimal smoothing parameter");
    final int salience = opts.intValue("salience", "The salience (filtering on query document)", -1);
    final int stopWordCount = opts.intValue("stop", "The number of stop words to ignore", 150);
    final boolean writeDocs = opts.flag("writeDocs", "Write documents in corpus with ranking");
    final File inFile = opts.roFile("corpus[.gz|.bz2]", "The corpus file");
    final int N = opts.nonNegIntValue("N", "The largest n-gram to calculate");
    final File wordMapFile = opts.roFile("wordMap", "The word map file");
    final File freqFile = opts.roFile("freqs", "The frequency file for the corpus");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(CompileModel.class)) {
        return;
    }
    compile(wordMapFile, freqFile, stopWordCount, betaMethod, inFile, sourceType,
            queryFile, smoothness, salience, alpha, N, writeDocs, out);
}
/**
 * Entry point: computes the most salient n-grams of a corpus relative to a
 * reference ontology and writes those above the threshold to a binary file
 * (length, token ids, score — all via DataOutputStream).
 *
 * @param args positional arguments: reference, corpus, wordMap, N, threshold, out
 * @throws Exception if any input cannot be read or the output cannot be written
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST, "The corpus type");
    final File refFile = opts.roFile("reference", "The reference ontology");
    final File corpusFile = opts.roFile("corpus", "The corpus file");
    final File wordMapFile = opts.roFile("wordMap", "The word map");
    final int N = opts.intValue("N", "The maximal n-gram to consider");
    final double thresh = opts.doubleValue("threshold", "The threshold of salience to filter at");
    final File outFile = opts.woFile("out", "The file to write the salient n-gram list to");
    if (!opts.verify(MostSalient.class)) {
        return;
    }
    final int W = WordMap.calcW(wordMapFile);
    // NOTE(review): wordMap is computed but never used in this method; kept
    // because inverseFromFile performs I/O whose failure behavior callers
    // may rely on — confirm before removing.
    final String[] wordMap = WordMap.inverseFromFile(wordMapFile, W, true);
    final Object2DoubleMap<NGram> salientNGrams = mostSalientNGrams(refFile, corpusFile, N, sourceType);
    // FIX: try-with-resources guarantees the stream is closed (and thereby
    // flushed) even if a write throws; the original leaked it on exceptions.
    try (final DataOutputStream out = new DataOutputStream(CLIOpts.openOutputAsMaybeZipped(outFile))) {
        for (Object2DoubleMap.Entry<NGram> e : salientNGrams.object2DoubleEntrySet()) {
            if (e.getDoubleValue() > thresh) {
                final NGram ng = e.getKey();
                out.writeInt(ng.ngram.length);
                for (int i = 0; i < ng.ngram.length; i++) {
                    out.writeInt(ng.ngram[i]);
                }
                out.writeDouble(e.getDoubleValue());
            }
        }
    }
}
}
/**
 * Entry point: prints a histogram of the given history list, optionally in
 * the "future history" format.
 *
 * @param args positional argument: the history file; flag: -future
 * @throws Exception if the history file cannot be read
 */
public static void main(String[] args) throws Exception {
    final CLIOpts options = new CLIOpts(args);
    final File historyFile = options.roFile("history", "The history list");
    final boolean futureFormat = options.flag("future", "Future history format");
    final PrintStream sink = options.outFileOrStdout();
    if (!options.verify(Hist.class)) {
        return;
    }
    hist(historyFile, futureFormat, sink);
}
/**
 * Entry point: trains a cross-lingual PLSA model on the corpus and writes
 * the resulting model file. alpha defaults to 2/K when not given (the -1
 * sentinel is the documented default, so the exact == compare is safe).
 *
 * @param args corpus, hyper-parameters, dimensions, iteration count, output
 * @throws Exception if the corpus cannot be read or the model not written
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File corpus = opts.roFile("corpus[.gz|bz2]", "The corpus file");
    double alpha = opts.doubleValue("alpha", -1, "The alpha parameter");
    final double beta = opts.doubleValue("beta", 0.01, "The beta parameter");
    final int W = opts.intValue("W", "The number of distinct tokens");
    final int J = opts.intValue("J", "The number of documents (per language)");
    final int K = opts.intValue("K", "The number of topics");
    final int N = opts.intValue("N", "The number of iterations to perform");
    final File outFile = opts.woFile("output", "The file to write the SVD to");
    if (!opts.verify(CPLSATrain.class)) {
        return;
    }
    // -1 means "unset": fall back to the 2/K heuristic.
    if (alpha == -1.0) {
        alpha = 2.0 / K;
    }
    if (alpha < 0 || beta < 0) {
        throw new IllegalArgumentException("Alpha and beta cannot be negative");
    }
    System.err.println("Preparing corpus");
    final CPLSATrain trainer = new CPLSATrain(corpus, J, W, K, alpha, beta);
    trainer.solve(N, 1e-12, true);
    System.err.println("Writing model");
    trainer.writeModel(CLIOpts.openOutputAsMaybeZipped(outFile));
}
/**
 * Entry point: trains an LDA topic model on the corpus and writes the model
 * file. alpha defaults to 2/K when not given (the -1 sentinel is the
 * documented default, so the exact == compare is safe).
 *
 * @param args hyper-parameters, corpus, dimensions, iteration count, output
 * @throws Exception if the corpus cannot be read or the model not written
 */
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    double alpha = opts.doubleValue("alpha", -1, "The alpha parameter");
    final double beta = opts.doubleValue("beta", 0.01, "The beta parameter");
    final File corpus = opts.roFile("corpus[.gz|.bz2]", "The corpus file");
    final int W = opts.intValue("W", "The number of distinct tokens");
    final int J = opts.intValue("J", "The number of documents (per language)");
    final int K = opts.intValue("K", "The number of topics");
    final int N = opts.intValue("N", "The number of iterations");
    final File outFile = opts.woFile("model[.gz|.bz2]", "The file to write the model to");
    if (!opts.verify(LDATrain.class)) {
        return;
    }
    // -1 means "unset": fall back to the 2/K heuristic.
    if (alpha == -1.0) {
        alpha = 2.0 / K;
    }
    if (alpha < 0 || beta < 0) {
        throw new IllegalArgumentException("Alpha and beta cannot be negative");
    }
    final LDATrain trainer = new LDATrain(corpus, K, J, W, alpha, beta);
    trainer.train(N);
    trainer.writeModel(outFile);
}
}