public static void compile(final File wordMapFile, final File freqFile, final File inFile,
        final SourceType sourceType, final File queryFile, final double smoothness,
        final int salience, final double alpha, final int N, final boolean writeDocs,
        final PrintStream out) throws IOException {
    // Convenience overload: default stop word count (150) and no beta method.
    // All other arguments are forwarded rather than replaced by constants.
    compile(wordMapFile, freqFile, 150, null, inFile, sourceType, queryFile,
            smoothness, salience, alpha, N, writeDocs, out);
}
@Override public void run() { clearNow(); }
@Override
public int compare(Integer o1, Integer o2) {
    return compare(o1.intValue(), o2.intValue());
}
});
    final StopWordList stopwords = stopWordList(freqFile, stopWordCount);
    final double mean = CalculateBetaMean.calcBetaMean(
            makeFunction(betaMethod, inFile, W, sourceType, queryFile, stopwords,
                    smoothness, 0.0, 1.0, salience),
            inFile, sourceType, W);
    System.err.println("mean=" + mean);
    beta = makeFunction(betaMethod, inFile, W, sourceType, queryFile, stopwords,
            smoothness, alpha, mean, salience);
} else {
    DoCount.queryFile = readQueryDoc(queryFile);
    beta = new DefaultBetaFunction();
}
final File[] countFiles = initCountFiles(N);
if (System.getProperty("writeDocs") != null || writeDocs) {
    DoCount.wordMap = words;
}
DoCount.doCount(inFile, N, openFiles(countFiles), beta, sourceType, W);
if (System.getProperty("writeDocs") != null || writeDocs) {
    DoCount.foreignDocRanking.flush();
}
// Declarations and loop headers below are reconstructed from their usage in
// this excerpt; the statements inside them are unchanged.
final File[] sortedFiles = new File[countFiles.length];
for (int i = 0; i < countFiles.length; i++) {
    if (countFiles[i] != null) {
        System.out.println(countFiles[i].getPath());
        sortedFiles[i] = sort(countFiles[i]);
    }
}
final File[] uniqFiles = new File[N];
for (int i = 0; i < N; i++) {
    final File uniqFile = new File(countFiles[i].getPath() + "uniq");
    DeleteFileOnExit.add(uniqFile);
    Uniq.uniq(sortedFiles[i], new PrintStream(uniqFile));
    uniqFiles[i] = uniqFile;
}
for (int i = 1; i < N; i++) {
    final File divHistFile = new File(countFiles[i - 1].getPath() + "divHist");
    // ... (history division step continues here)
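// A minimal, JDK-only sketch of the sort-then-uniq stage above. The real code
// streams through temp files via sort(...) and Uniq.uniq(...); the stream
// pipeline here only illustrates the intended effect (names are illustrative).
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class SortUniqSketch {
    public static void main(String[] args) {
        final List<String> countLines = Arrays.asList("b 2", "a 1", "b 2", "a 3");
        final List<String> sortedUnique = countLines.stream()
                .sorted()    // corresponds to sortedFiles[i] = sort(countFiles[i])
                .distinct()  // corresponds to Uniq.uniq(sortedFiles[i], ...)
                .collect(Collectors.toList());
        System.out.println(sortedUnique); // [a 1, a 3, b 2]
    }
}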
private static BetaSimFunction makeFunction(BetaLMImpl.Method betaMethod, File inFile, int W,
        SourceType sourceType, File queryFile, StopWordList stopwords, double smoothness,
        double alpha, double mean, int salience) throws IOException {
    if (betaMethod != null) {
        final PrecomputedValues precomp;
        if (PrecomputedValues.isNecessary(betaMethod)) {
            System.err.println("Methodology requires pre-scan of the corpus");
            precomp = PrecomputedValues.precompute(inFile, W, sourceType);
        } else {
            precomp = null;
        }
        final SparseIntArray binQuery;
        if (salience >= 0) { // guard reconstructed: salience defaults to -1 (disabled)
            final SparseIntArray binQuery2 = SparseIntArray.fromBinary(
                    CLIOpts.openInputAsMaybeZipped(queryFile), Integer.MAX_VALUE);
            final IntSet salient = MostSalient.mostSalient(queryFile, inFile, W, salience, sourceType);
            DoCount.queryFile = salient;
            binQuery = MostSalient.filter(salient, binQuery2);
        } else {
            binQuery = SparseIntArray.fromBinary(
                    CLIOpts.openInputAsMaybeZipped(queryFile), Integer.MAX_VALUE);
            DoCount.queryFile = readQueryDoc(queryFile);
        }
        if (smoothness == 1.0) { // guard reconstructed: smoothness defaults to 1.0 (no smoothing)
            return betaSimFunction(betaMethod, binQuery, precomp, stopwords);
        } else {
            return Metrics.smoothed(betaSimFunction(betaMethod, binQuery, precomp, stopwords),
                    smoothness, alpha * mean);
        }
    } else {
        return new DefaultBetaFunction();
    }
}
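// What Metrics.smoothed plausibly does with (smoothness, alpha * mean): a linear
// interpolation between the raw similarity and a constant floor. This is a
// sketch under that assumption, not the project's actual definition.
import java.util.function.DoubleUnaryOperator;

public class SmoothedBetaSketch {
    static DoubleUnaryOperator smoothed(DoubleUnaryOperator beta, double smoothness, double floor) {
        // smoothness = 1.0 reproduces beta exactly, matching the unsmoothed branch above.
        return x -> smoothness * beta.applyAsDouble(x) + (1.0 - smoothness) * floor;
    }

    public static void main(String[] args) {
        final DoubleUnaryOperator beta = x -> x * x; // stand-in similarity function
        final DoubleUnaryOperator s = smoothed(beta, 0.8, 0.05);
        System.out.println(s.applyAsDouble(0.5)); // 0.8 * 0.25 + 0.2 * 0.05 = 0.21 (up to floating point)
    }
}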
CompileModel.compile(wordMapFile, freqFile, 150, BetaLMImpl.Method.COS_SIM, inFile,
        SourceType.FIRST, queryFile, sigma, -1, alpha, N, false, out2);
DeleteFileOnExit.clearNow();
final ARPALM lm = new ARPALM(tmpFile);
// Note: the cast must wrap the product; "(int) alpha * 10" truncates alpha first.
perplexity[(int) (alpha * 10)][(int) sigma] = Perplexity.calculatePerplexity(scanner, lm);
System.err.println("alpha=" + alpha + ";sigma=" + sigma
        + ";perplexity=" + perplexity[(int) (alpha * 10)][(int) sigma]);
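// Why the index cast is bracketed as (int)(alpha * 10): casting first truncates
// every alpha below 1.0 to row 0. Self-contained demonstration:
public class CastPrecedenceCheck {
    public static void main(String[] args) {
        final double alpha = 0.3;
        System.out.println((int) alpha * 10);   // 0 -- truncates alpha, then multiplies
        System.out.println((int) (alpha * 10)); // 3 -- multiplies, then truncates
    }
}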
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);

    StringBuilder betalmString = new StringBuilder("The BetaLM method: ");
    for (BetaLMImpl.Method method : BetaLMImpl.Method.values()) {
        betalmString.append(method.name()).append(" ");
    }
    final BetaLMImpl.Method betaMethod = opts.enumOptional("b", BetaLMImpl.Method.class, null,
            betalmString.toString());
    final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST,
            "The type of source: SIMPLE, FIRST or SECOND");
    final Smoothing smoothing = opts.enumOptional("smooth", Smoothing.class, Smoothing.NONE,
            "The type of smoothing: NONE, ADD_ALPHA, GOOD_TURING, KNESER_NEY");
    final File queryFile = opts.roFile("f", "The query file (ontology)", null);
    final double smoothness = opts.doubleValue("s", 1.0, "The selective smoothing parameter");
    final double alpha = opts.doubleValue("a", 0.0, "The minimal smoothing parameter");
    final int salience = opts.intValue("salience", "The salience (filtering on query document)", -1);
    final int stopWordCount = opts.intValue("stop", "The number of stop words to ignore", 150);
    final boolean writeDocs = opts.flag("writeDocs", "Write documents in corpus with ranking");
    final File inFile = opts.roFile("corpus[.gz|.bz2]", "The corpus file");
    final int N = opts.nonNegIntValue("N", "The largest n-gram to calculate");
    final File wordMapFile = opts.roFile("wordMap", "The word map file");
    final File freqFile = opts.roFile("freqs", "The frequency file for the corpus");
    final PrintStream out = opts.outFileOrStdout();

    if (!opts.verify(CompileModel.class)) {
        return;
    }
    compile(wordMapFile, freqFile, stopWordCount, betaMethod, inFile, sourceType, queryFile,
            smoothness, salience, alpha, N, writeDocs, out);
}
final String value;
if (future) {
    value = futureValue(line);
} else {
    value = historyValue(line);
}
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File doc = opts.roFile("document", "The document on which to estimate the perplexity");
    final File lmFile = opts.roFile("lm", "The language model file");
    if (!opts.verify(Perplexity.class)) {
        return;
    }
    final ARPALM lm = new ARPALM(lmFile);
    final Scanner scanner = new Scanner(doc);
    final double perplexity = calculatePerplexity(scanner, lm);
    System.err.println("Log2 Perplexity=" + perplexity);
}

private static final double LOG_10_2 = 0.3010299956639812; // log10(2)
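// ARPA models store base-10 log probabilities, so dividing by LOG_10_2 converts
// them to base 2 before averaging. A sketch of the conversion (the averaging
// inside calculatePerplexity is assumed, not shown in this excerpt):
public class Log2ConversionSketch {
    private static final double LOG_10_2 = 0.3010299956639812; // log10(2)

    public static void main(String[] args) {
        final double log10Prob = -0.6020599913279624; // log10(0.25)
        final double log2Prob = log10Prob / LOG_10_2; // log2(0.25) = -2.0
        System.out.println(log2Prob);
        System.out.println(Math.pow(2.0, -log2Prob)); // per-token perplexity = 4.0
    }
}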
public static Object2DoubleMap<NGram> mostSalientNGrams(final File reference, final File corpus,
        int N, SourceType sourceType) throws IOException {
    final Object2DoubleMap<NGram> referenceCounts = countInReference(reference, N);
    final Object2DoubleMap<NGram> corpusCounts =
            countInCorpus(corpus, N, referenceCounts.keySet(), sourceType);
    // Salience of an n-gram is its reference frequency relative to its corpus frequency.
    final Object2DoubleMap<NGram> salience = new Object2DoubleRBTreeMap<NGram>();
    for (Object2DoubleMap.Entry<NGram> e : referenceCounts.object2DoubleEntrySet()) {
        if (corpusCounts.containsKey(e.getKey())) {
            salience.put(e.getKey(), e.getDoubleValue() / corpusCounts.getDouble(e.getKey()));
        }
    }
    referenceCounts.clear();
    corpusCounts.clear();
    // Re-sort by descending salience, breaking ties by the n-gram's natural order
    // so that the comparator never reports two distinct keys as equal.
    final Object2DoubleMap<NGram> rankedSalience =
            new Object2DoubleRBTreeMap<NGram>(new Comparator<NGram>() {
        @Override
        public int compare(NGram o1, NGram o2) {
            final double salience1 = salience.getDouble(o1);
            final double salience2 = salience.getDouble(o2);
            if (salience1 < salience2) {
                return +1;
            } else if (salience1 > salience2) {
                return -1;
            } else {
                return o1.compareTo(o2);
            }
        }
    });
    rankedSalience.putAll(salience);
    return rankedSalience;
}
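// The ratio-and-rerank pattern above, isolated with JDK collections and toy
// data (names and values are illustrative only):
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

public class SalienceRankingSketch {
    public static void main(String[] args) {
        final Map<String, Double> reference = Map.of("ontology", 5.0, "the", 100.0);
        final Map<String, Double> corpus = Map.of("ontology", 2.0, "the", 10000.0);
        final Map<String, Double> salience = new HashMap<>();
        for (Map.Entry<String, Double> e : reference.entrySet()) {
            final Double c = corpus.get(e.getKey());
            if (c != null) {
                salience.put(e.getKey(), e.getValue() / c);
            }
        }
        // Descending score, ties broken by key, mirroring the comparator above.
        final TreeMap<String, Double> ranked = new TreeMap<>(
                Comparator.<String>comparingDouble(k -> -salience.get(k))
                        .thenComparing(Comparator.naturalOrder()));
        ranked.putAll(salience);
        System.out.println(ranked); // {ontology=2.5, the=0.01}
    }
}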
private static File[] initCountFiles(final int N) throws IOException {
    // Slots 0..N-1 hold the n-gram count files; slots N+1..2N-1 hold the
    // corresponding history files (there is no history file for unigrams,
    // so slot N is left null).
    final File[] countFiles = new File[N * 2];
    for (int n = 0; n < N; n++) {
        final File countFile = File.createTempFile("counts", "." + n);
        DeleteFileOnExit.add(countFile);
        countFiles[n] = countFile;
        if (n > 0) {
            final File histFile = File.createTempFile("counts", ".h" + n);
            DeleteFileOnExit.add(histFile);
            countFiles[n + N] = histFile;
        }
    }
    return countFiles;
}
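// A quick check of the slot layout produced by initCountFiles for N = 3:
// slot n holds the (n+1)-gram counts, slot n + N the matching history file,
// and slot N stays null because unigrams have no history.
public class CountFileLayoutCheck {
    public static void main(String[] args) {
        final int N = 3;
        final String[] slots = new String[N * 2];
        for (int n = 0; n < N; n++) {
            slots[n] = "counts." + n;
            if (n > 0) {
                slots[n + N] = "counts.h" + n;
            }
        }
        System.out.println(java.util.Arrays.toString(slots));
        // [counts.0, counts.1, counts.2, null, counts.h1, counts.h2]
    }
}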
@Override
public double scoreNGrams(IntList document, int W) {
    // Score the document's word-frequency histogram rather than the raw sequence.
    return score(SparseIntArray.histogram(document.toIntArray(), W));
}
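// What the histogram step reduces a document to: word id -> frequency over a
// W-sized vocabulary. Sketch with a plain array; SparseIntArray presumably
// stores the same mapping sparsely.
public class HistogramSketch {
    public static void main(String[] args) {
        final int W = 5;
        final int[] document = {1, 3, 3, 4, 1, 1}; // word ids in document order
        final int[] histogram = new int[W];
        for (int w : document) {
            histogram[w]++;
        }
        System.out.println(java.util.Arrays.toString(histogram)); // [0, 3, 0, 2, 1]
    }
}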
public static Data calculate(Scanner in, int H) {
    final Data data = new Data(H);
    int n = 0;
    while (in.hasNextLine()) {
        final String line = in.nextLine();
        final String[] parts = line.split(" ");
        if (parts.length < 3) {
            System.err.println(line);
            continue;
        }
        final int count = Integer.parseInt(parts[parts.length - 1]);
        if (count <= H) {
            data.CoC[count - 1]++; // count-of-counts histogram, 1-indexed counts
        }
        final double value = Double.parseDouble(parts[parts.length - 2]);
        n++;
        // Incremental mean: mean_n = value/n + mean_{n-1} * (n-1)/n.
        data.mean = value / n + data.mean * ((double) (n - 1) / (double) n);
    }
    return data;
}
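// The update "mean = value/n + mean*(n-1)/n" is the standard incremental mean,
// equivalent to summing and dividing once at the end. A quick check:
public class RunningMeanCheck {
    public static void main(String[] args) {
        final double[] values = {2.0, 4.0, 9.0};
        double mean = 0.0;
        int n = 0;
        for (double value : values) {
            n++;
            mean = value / n + mean * ((double) (n - 1) / n);
        }
        System.out.println(mean); // 5.0, i.e. (2 + 4 + 9) / 3
    }
}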
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File countedFile = opts.roFile("countFile", "The file with counts");
    final int H = opts.nonNegIntValue("H", "The maximum count of counts to store");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(COCAndMean.class)) {
        return;
    }
    final Scanner in = new Scanner(countedFile);
    final Data data = calculate(in, H);
    out.println(Arrays.toString(data.CoC));
    out.println(data.mean);
}
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File file = opts.roFile("file", "The file to count uniqueness among");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(Uniq.class)) {
        return;
    }
    uniq(file, out);
}
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final File histFile = opts.roFile("history", "The history list");
    final boolean future = opts.flag("future", "Future history format");
    final PrintStream out = opts.outFileOrStdout();
    if (!opts.verify(Hist.class)) {
        return;
    }
    hist(histFile, future, out);
}
if (docNo % 2 == 1 && sourceType != SIMPLE) {
    if (sourceType == FIRST) {
        processDoc(doc[0], doc[1], N, outs, beta, W);
    } else /* if (sourceType == INTERLEAVED_USE_SECOND) */ {
        processDoc(doc[1], doc[0], N, outs, beta, W);
    }
} else if (sourceType == SIMPLE) { // branch reconstructed: SIMPLE scores each document against itself
    processDoc(doc[docNo % 2], doc[docNo % 2], N, outs, beta, W);
}
doc[docNo % 2].clear();
public static IntSet mostSalient(final File reference, final File corpus, int W, int topN,
        SourceType sourceType) throws IOException {
    final PrecomputedValues precomp1 = PrecomputedValues.precompute(reference, W, SourceType.SIMPLE);
    final PrecomputedValues precomp2 = PrecomputedValues.precompute(corpus, W, sourceType);
    final double[] salience = new double[W];
    // Sorted set whose minimum is the least salient of the current top-N words;
    // ties are broken by word index so distinct words never compare equal.
    final IntRBTreeSet topNWords = new IntRBTreeSet(new IntComparator() {
        @Override
        public int compare(int i, int i1) {
            return salience[i] < salience[i1] ? -1 : (salience[i] > salience[i1] ? 1 : i - i1);
        }

        @Override
        public int compare(Integer o1, Integer o2) {
            return compare(o1.intValue(), o2.intValue());
        }
    });
    for (int w = 0; w < W; w++) {
        final double val = precomp1.mu.value(w);
        final double val2 = precomp2.mu.value(w);
        if (val != 0.0 && val2 != 0.0) {
            salience[w] = val / val2;
            if (topNWords.size() < topN) {
                topNWords.add(w);
            } else if (salience[w] > salience[topNWords.firstInt()]) {
                // Evict the current minimum and insert the better-scoring word.
                topNWords.remove(topNWords.firstInt());
                topNWords.add(w);
            }
        }
    }
    return topNWords;
}
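// The bounded top-N selection above, restated with a plain TreeSet and toy
// scores (indices stand in for word ids; data is illustrative):
import java.util.Comparator;
import java.util.TreeSet;

public class TopNSelectionSketch {
    public static void main(String[] args) {
        final double[] score = {0.2, 3.1, 1.5, 0.9, 2.7};
        final int topN = 3;
        final TreeSet<Integer> top = new TreeSet<>(
                Comparator.<Integer>comparingDouble(i -> score[i])
                        .thenComparingInt(i -> i)); // ties broken by index
        for (int w = 0; w < score.length; w++) {
            if (top.size() < topN) {
                top.add(w);
            } else if (score[w] > score[top.first()]) {
                top.pollFirst(); // evict the current minimum
                top.add(w);
            }
        }
        System.out.println(top); // [2, 4, 1] -- the three highest-scoring indices
    }
}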
public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);
    final SourceType sourceType = opts.enumOptional("t", SourceType.class, SourceType.FIRST,
            "The corpus type");
    final File refFile = opts.roFile("reference", "The reference ontology");
    final File corpusFile = opts.roFile("corpus", "The corpus file");
    final File wordMapFile = opts.roFile("wordMap", "The word map");
    final int N = opts.intValue("N", "The maximal n-gram to consider");
    final double thresh = opts.doubleValue("threshold", "The threshold of salience to filter at");
    final File outFile = opts.woFile("out", "The file to write the salient n-gram list to");
    if (!opts.verify(MostSalient.class)) {
        return;
    }
    final int W = WordMap.calcW(wordMapFile);
    final String[] wordMap = WordMap.inverseFromFile(wordMapFile, W, true);
    final Object2DoubleMap<NGram> salientNGrams = mostSalientNGrams(refFile, corpusFile, N, sourceType);
    final DataOutputStream out = new DataOutputStream(CLIOpts.openOutputAsMaybeZipped(outFile));
    for (Object2DoubleMap.Entry<NGram> e : salientNGrams.object2DoubleEntrySet()) {
        if (e.getDoubleValue() > thresh) {
            final NGram ng = e.getKey();
            out.writeInt(ng.ngram.length);
            for (int i = 0; i < ng.ngram.length; i++) {
                out.writeInt(ng.ngram[i]);
            }
            out.writeDouble(e.getDoubleValue());
        }
    }
    out.flush();
    out.close();
}
}
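// Each record written above is (int length, length ints, double salience).
// A matching reader sketch for that format, assuming an unzipped output file
// (the file name is illustrative):
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

public class SalientNGramReaderSketch {
    public static void main(String[] args) throws IOException {
        try (DataInputStream in = new DataInputStream(new FileInputStream("salient.bin"))) {
            while (in.available() > 0) {
                final int length = in.readInt();
                final int[] ngram = new int[length];
                for (int i = 0; i < length; i++) {
                    ngram[i] = in.readInt();
                }
                final double salience = in.readDouble();
                System.out.println(java.util.Arrays.toString(ngram) + " -> " + salience);
            }
        }
    }
}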
final int w = in.readInt();
if (w != 0) {
    if (inDoc(docNo, sourceType)) {
        carousel.offer(w);
        for (int n = 1; n <= carousel.maxNGram(); n++) {
            // ... (n-gram emission continues here)