/**
 * Command-line entry point for training an LDA model with hyperparameter
 * optimization.
 *
 * <p>Usage: {@code <training instances file> [numTopics] [testing instances file]}
 * <ul>
 *   <li>args[0] — serialized {@code InstanceList} of training documents (required)</li>
 *   <li>args[1] — number of topics (optional, default 200)</li>
 *   <li>args[2] — serialized {@code InstanceList} of held-out documents (optional)</li>
 * </ul>
 *
 * @throws IOException if an instance list cannot be read from disk
 */
public static void main (String[] args) throws IOException {
	// Fail with a usage message rather than an opaque
	// ArrayIndexOutOfBoundsException when no training file is given.
	if (args.length < 1) {
		System.err.println ("Usage: LDAHyper <training instances> [numTopics] [testing instances]");
		System.exit (1);
	}
	InstanceList training = InstanceList.load (new File(args[0]));
	int numTopics = args.length > 1 ? Integer.parseInt(args[1]) : 200;
	// NOTE(review): `testing` is loaded but never used in this method —
	// presumably intended for held-out evaluation; confirm or remove.
	InstanceList testing = args.length > 2 ? InstanceList.load (new File(args[2])) : null;
	// 50.0 = total alpha (summed over topics), 0.01 = per-type beta.
	LDAHyper lda = new LDAHyper (numTopics, 50.0, 0.01);
	lda.printLogLikelihood = true;
	// Show top 7 words per topic every 50 iterations.
	lda.setTopicDisplay (50, 7);
	lda.addInstances (training);
	lda.estimate ();
}
// NOTE(review): this line is an out-of-context fragment — several statements
// from the interior of the Gibbs-sampling estimate loop (per-iteration
// diagnostics, periodic state dump, cached-coefficient update, and the
// per-document sampling call) collapsed onto one line. It references
// variables that are not defined here (wordsPerTopic, testing, topic, di,
// stateFilename, iterationsSoFar, burninPeriod, saveSampleInterval, ...),
// so it cannot compile standalone. TODO: restore the enclosing method body
// (loop structure and braces) before making any behavioral edit.
printTopWords (System.out, wordsPerTopic, false); double el = empiricalLikelihood(1000, testing); double ll = modelLogLikelihood(); double mi = topicLabelMutualInformation(); System.out.println(ll + "\t" + el + "\t" + mi); this.printState(new File(stateFilename + '.' + iterationsSoFar)); cachedCoefficients[topic] = alpha[topic] / (tokensPerTopic[topic] + betaSum); clearHistograms(); FeatureSequence tokenSequence = (FeatureSequence) data.get(di).instance.getData(); LabelSequence topicSequence = (LabelSequence) data.get(di).topicSequence; sampleTopicsForOneDoc (tokenSequence, topicSequence, iterationsSoFar >= burninPeriod && iterationsSoFar % saveSampleInterval == 0, true); if (printLogLikelihood) System.out.println (modelLogLikelihood());
// NOTE(review): truncated fragment of topicXMLReportPhrases(PrintStream, int)
// — the method is cut off mid-body (unbalanced braces) and references locals
// that are never declared in the visible text (doclen, feature, topic,
// prevtopic, sb, withBigrams, probs, type, ti, pout, titles). It appears to
// scan each document's token/topic sequences to collect multi-word phrases
// per topic and emit an XML report of top terms — TODO confirm against the
// complete original before editing; do not attempt to compile as-is.
public void topicXMLReportPhrases (PrintStream out, int numWords) { int numTopics = this.getNumTopics(); gnu.trove.TObjectIntHashMap<String>[] phrases = new gnu.trove.TObjectIntHashMap[numTopics]; Alphabet alphabet = this.getAlphabet(); for (int di = 0; di < this.getData().size(); di++) { LDAHyper.Topication t = this.getData().get(di); Instance instance = t.instance; FeatureSequence fvs = (FeatureSequence) instance.getData(); for (int pi = 0; pi < doclen; pi++) { feature = fvs.getIndexAtPosition(pi); topic = this.getData().get(di).topicSequence.getIndexAtPosition(pi); if (topic == prevtopic && (!withBigrams || ((FeatureSequenceWithBigrams)fvs).getBiIndexAtPosition(pi) != -1)) { if (sb == null) probs[type] = this.getCountFeatureTopic(type, ti) / (double)this.getCountTokensPerTopic(ti); RankedFeatureVector rfv = new RankedFeatureVector (alphabet, probs); for (int ri = 0; ri < numWords; ri++) { int fi = rfv.getIndexAtRank(ri); pout.println (" <term weight=\""+probs[fi]+"\" count=\""+this.getCountFeatureTopic(fi,ti)+"\">"+alphabet.lookupObject(fi)+ "</term>"); if (ri < 20) // consider top 20 individual words as candidate titles titles.add(alphabet.lookupObject(fi), this.getCountFeatureTopic(fi,ti));
public void addInstances (InstanceList training) { initializeForTypes (training.getDataAlphabet()); ArrayList<LabelSequence> topicSequences = new ArrayList<LabelSequence>(); for (Instance instance : training) { LabelSequence topicSequence = new LabelSequence(topicAlphabet, new int[instanceLength(instance)]); if (false) // This method not yet obeying its last "false" argument, and must be for this to work sampleTopicsForOneDoc((FeatureSequence)instance.getData(), topicSequence, false, false); else { Randoms r = new Randoms(); int[] topics = topicSequence.getFeatures(); for (int i = 0; i < topics.length; i++) topics[i] = r.nextInt(numTopics); } topicSequences.add (topicSequence); } addInstances (training, topicSequences); }
/**
 * Runs model estimation for the default number of iterations
 * ({@code numIterations}); convenience overload of {@code estimate(int)}.
 *
 * @throws IOException if propagated from {@code estimate(int)}
 */
public void estimate () throws IOException { estimate (numIterations); }
// NOTE(review): this line is an out-of-context fragment — several statements
// from the interior of the Gibbs-sampling estimate loop (per-iteration
// diagnostics, periodic state dump, cached-coefficient update, and the
// per-document sampling call) collapsed onto one line. It references
// variables that are not defined here (wordsPerTopic, testing, topic, di,
// stateFilename, iterationsSoFar, burninPeriod, saveSampleInterval, ...),
// so it cannot compile standalone. TODO: restore the enclosing method body
// (loop structure and braces) before making any behavioral edit.
printTopWords (System.out, wordsPerTopic, false); double el = empiricalLikelihood(1000, testing); double ll = modelLogLikelihood(); double mi = topicLabelMutualInformation(); System.out.println(ll + "\t" + el + "\t" + mi); this.printState(new File(stateFilename + '.' + iterationsSoFar)); cachedCoefficients[topic] = alpha[topic] / (tokensPerTopic[topic] + betaSum); clearHistograms(); FeatureSequence tokenSequence = (FeatureSequence) data.get(di).instance.getData(); LabelSequence topicSequence = (LabelSequence) data.get(di).topicSequence; sampleTopicsForOneDoc (tokenSequence, topicSequence, iterationsSoFar >= burninPeriod && iterationsSoFar % saveSampleInterval == 0, true); if (printLogLikelihood) System.out.println (modelLogLikelihood());
// NOTE(review): truncated fragment of topicXMLReportPhrases(PrintStream, int)
// — the method is cut off mid-body (unbalanced braces) and references locals
// that are never declared in the visible text (doclen, feature, topic,
// prevtopic, sb, withBigrams, probs, type, ti, pout, titles). It appears to
// scan each document's token/topic sequences to collect multi-word phrases
// per topic and emit an XML report of top terms — TODO confirm against the
// complete original before editing; do not attempt to compile as-is.
public void topicXMLReportPhrases (PrintStream out, int numWords) { int numTopics = this.getNumTopics(); gnu.trove.TObjectIntHashMap<String>[] phrases = new gnu.trove.TObjectIntHashMap[numTopics]; Alphabet alphabet = this.getAlphabet(); for (int di = 0; di < this.getData().size(); di++) { LDAHyper.Topication t = this.getData().get(di); Instance instance = t.instance; FeatureSequence fvs = (FeatureSequence) instance.getData(); for (int pi = 0; pi < doclen; pi++) { feature = fvs.getIndexAtPosition(pi); topic = this.getData().get(di).topicSequence.getIndexAtPosition(pi); if (topic == prevtopic && (!withBigrams || ((FeatureSequenceWithBigrams)fvs).getBiIndexAtPosition(pi) != -1)) { if (sb == null) probs[type] = this.getCountFeatureTopic(type, ti) / (double)this.getCountTokensPerTopic(ti); RankedFeatureVector rfv = new RankedFeatureVector (alphabet, probs); for (int ri = 0; ri < numWords; ri++) { int fi = rfv.getIndexAtRank(ri); pout.println (" <term weight=\""+probs[fi]+"\" count=\""+this.getCountFeatureTopic(fi,ti)+"\">"+alphabet.lookupObject(fi)+ "</term>"); if (ri < 20) // consider top 20 individual words as candidate titles titles.add(alphabet.lookupObject(fi), this.getCountFeatureTopic(fi,ti));
public void addInstances (InstanceList training) { initializeForTypes (training.getDataAlphabet()); ArrayList<LabelSequence> topicSequences = new ArrayList<LabelSequence>(); for (Instance instance : training) { LabelSequence topicSequence = new LabelSequence(topicAlphabet, new int[instanceLength(instance)]); if (false) // This method not yet obeying its last "false" argument, and must be for this to work sampleTopicsForOneDoc((FeatureSequence)instance.getData(), topicSequence, false, false); else { Randoms r = new Randoms(); int[] topics = topicSequence.getFeatures(); for (int i = 0; i < topics.length; i++) topics[i] = r.nextInt(numTopics); } topicSequences.add (topicSequence); } addInstances (training, topicSequences); }
/**
 * Runs model estimation for the default number of iterations
 * ({@code numIterations}); convenience overload of {@code estimate(int)}.
 *
 * @throws IOException if propagated from {@code estimate(int)}
 */
public void estimate () throws IOException { estimate (numIterations); }
// NOTE(review): this line is an out-of-context fragment — several statements
// from the interior of the Gibbs-sampling estimate loop (per-iteration
// diagnostics, periodic state dump, cached-coefficient update, and the
// per-document sampling call) collapsed onto one line. It references
// variables that are not defined here (wordsPerTopic, testing, topic, di,
// stateFilename, iterationsSoFar, burninPeriod, saveSampleInterval, ...),
// so it cannot compile standalone. TODO: restore the enclosing method body
// (loop structure and braces) before making any behavioral edit.
printTopWords (System.out, wordsPerTopic, false); double el = empiricalLikelihood(1000, testing); double ll = modelLogLikelihood(); double mi = topicLabelMutualInformation(); System.out.println(ll + "\t" + el + "\t" + mi); this.printState(new File(stateFilename + '.' + iterationsSoFar)); cachedCoefficients[topic] = alpha[topic] / (tokensPerTopic[topic] + betaSum); clearHistograms(); FeatureSequence tokenSequence = (FeatureSequence) data.get(di).instance.getData(); LabelSequence topicSequence = (LabelSequence) data.get(di).topicSequence; sampleTopicsForOneDoc (tokenSequence, topicSequence, iterationsSoFar >= burninPeriod && iterationsSoFar % saveSampleInterval == 0, true); if (printLogLikelihood) System.out.println (modelLogLikelihood());
// NOTE(review): truncated fragment of topicXMLReportPhrases(PrintStream, int)
// — the method is cut off mid-body (unbalanced braces) and references locals
// that are never declared in the visible text (doclen, feature, topic,
// prevtopic, sb, withBigrams, probs, type, ti, pout, titles). It appears to
// scan each document's token/topic sequences to collect multi-word phrases
// per topic and emit an XML report of top terms — TODO confirm against the
// complete original before editing; do not attempt to compile as-is.
public void topicXMLReportPhrases (PrintStream out, int numWords) { int numTopics = this.getNumTopics(); gnu.trove.TObjectIntHashMap<String>[] phrases = new gnu.trove.TObjectIntHashMap[numTopics]; Alphabet alphabet = this.getAlphabet(); for (int di = 0; di < this.getData().size(); di++) { LDAHyper.Topication t = this.getData().get(di); Instance instance = t.instance; FeatureSequence fvs = (FeatureSequence) instance.getData(); for (int pi = 0; pi < doclen; pi++) { feature = fvs.getIndexAtPosition(pi); topic = this.getData().get(di).topicSequence.getIndexAtPosition(pi); if (topic == prevtopic && (!withBigrams || ((FeatureSequenceWithBigrams)fvs).getBiIndexAtPosition(pi) != -1)) { if (sb == null) probs[type] = this.getCountFeatureTopic(type, ti) / (double)this.getCountTokensPerTopic(ti); RankedFeatureVector rfv = new RankedFeatureVector (alphabet, probs); for (int ri = 0; ri < numWords; ri++) { int fi = rfv.getIndexAtRank(ri); pout.println (" <term weight=\""+probs[fi]+"\" count=\""+this.getCountFeatureTopic(fi,ti)+"\">"+alphabet.lookupObject(fi)+ "</term>"); if (ri < 20) // consider top 20 individual words as candidate titles titles.add(alphabet.lookupObject(fi), this.getCountFeatureTopic(fi,ti));
/**
 * Command-line entry point for training an LDA model with hyperparameter
 * optimization.
 *
 * <p>Usage: {@code <training instances file> [numTopics] [testing instances file]}
 * <ul>
 *   <li>args[0] — serialized {@code InstanceList} of training documents (required)</li>
 *   <li>args[1] — number of topics (optional, default 200)</li>
 *   <li>args[2] — serialized {@code InstanceList} of held-out documents (optional)</li>
 * </ul>
 *
 * @throws IOException if an instance list cannot be read from disk
 */
public static void main (String[] args) throws IOException {
	// Fail with a usage message rather than an opaque
	// ArrayIndexOutOfBoundsException when no training file is given.
	if (args.length < 1) {
		System.err.println ("Usage: LDAHyper <training instances> [numTopics] [testing instances]");
		System.exit (1);
	}
	InstanceList training = InstanceList.load (new File(args[0]));
	int numTopics = args.length > 1 ? Integer.parseInt(args[1]) : 200;
	// NOTE(review): `testing` is loaded but never used in this method —
	// presumably intended for held-out evaluation; confirm or remove.
	InstanceList testing = args.length > 2 ? InstanceList.load (new File(args[2])) : null;
	// 50.0 = total alpha (summed over topics), 0.01 = per-type beta.
	LDAHyper lda = new LDAHyper (numTopics, 50.0, 0.01);
	lda.printLogLikelihood = true;
	// Show top 7 words per topic every 50 iterations.
	lda.setTopicDisplay (50, 7);
	lda.addInstances (training);
	lda.estimate ();
}
public void addInstances (InstanceList training) { initializeForTypes (training.getDataAlphabet()); ArrayList<LabelSequence> topicSequences = new ArrayList<LabelSequence>(); for (Instance instance : training) { LabelSequence topicSequence = new LabelSequence(topicAlphabet, new int[instanceLength(instance)]); if (false) // This method not yet obeying its last "false" argument, and must be for this to work sampleTopicsForOneDoc((FeatureSequence)instance.getData(), topicSequence, false, false); else { Randoms r = new Randoms(); int[] topics = topicSequence.getFeatures(); for (int i = 0; i < topics.length; i++) topics[i] = r.nextInt(numTopics); } topicSequences.add (topicSequence); } addInstances (training, topicSequences); }
/**
 * Runs model estimation for the default number of iterations
 * ({@code numIterations}); convenience overload of {@code estimate(int)}.
 *
 * @throws IOException if propagated from {@code estimate(int)}
 */
public void estimate () throws IOException { estimate (numIterations); }
/**
 * Command-line entry point for training an LDA model with hyperparameter
 * optimization.
 *
 * <p>Usage: {@code <training instances file> [numTopics] [testing instances file]}
 * <ul>
 *   <li>args[0] — serialized {@code InstanceList} of training documents (required)</li>
 *   <li>args[1] — number of topics (optional, default 200)</li>
 *   <li>args[2] — serialized {@code InstanceList} of held-out documents (optional)</li>
 * </ul>
 *
 * @throws IOException if an instance list cannot be read from disk
 */
public static void main (String[] args) throws IOException {
	// Fail with a usage message rather than an opaque
	// ArrayIndexOutOfBoundsException when no training file is given.
	if (args.length < 1) {
		System.err.println ("Usage: LDAHyper <training instances> [numTopics] [testing instances]");
		System.exit (1);
	}
	InstanceList training = InstanceList.load (new File(args[0]));
	int numTopics = args.length > 1 ? Integer.parseInt(args[1]) : 200;
	// NOTE(review): `testing` is loaded but never used in this method —
	// presumably intended for held-out evaluation; confirm or remove.
	InstanceList testing = args.length > 2 ? InstanceList.load (new File(args[2])) : null;
	// 50.0 = total alpha (summed over topics), 0.01 = per-type beta.
	LDAHyper lda = new LDAHyper (numTopics, 50.0, 0.01);
	lda.printLogLikelihood = true;
	// Show top 7 words per topic every 50 iterations.
	lda.setTopicDisplay (50, 7);
	lda.addInstances (training);
	lda.estimate ();
}