for (int topic = 0; topic < numTopics; topic++) { sortedTopics[topic] = new IDSorter(topic, topic); sortedTopics[topic].set(topic, (float) topicCounts[topic] / totalLength); if (sortedTopics[i].getWeight() < threshold) { break; } pw.print (sortedTopics[i].getID() + " " + sortedTopics[i].getWeight() + " ");
// For every topic, print one line holding the topic index, that topic's
// probability for document docID, and up to five entries taken from the
// front of the topic's sorted word set, each shown as "word (weight) ".
for (int topic = 0; topic < numTopics; topic++) {
    Iterator<IDSorter> wordCursor = topicSortedWords.get(topic).iterator();

    // A fresh formatter per topic so that each line starts empty.
    out = new Formatter(new StringBuilder(), Locale.US);
    out.format("%d\t%.3f\t", topic, model.getTopicProbabilities(docID)[topic]);

    for (int shown = 0; wordCursor.hasNext() && shown < 5; shown++) {
        IDSorter entry = wordCursor.next();
        out.format("%s (%.3f) ", dataAlphabet.lookupObject(entry.getID()), entry.getWeight());
    }

    System.out.println(out);
}
System.out.println("\n");
/**
 * Builds a one-line-per-topic text summary of the leading words of each topic.
 *
 * <p>Every line has the form {@code <topic>\t<tokensPerTopic[topic]>\t<word> <word> ... }:
 * each word is followed by a single space and the line ends with a newline.
 * Words are listed in the order produced by {@code Arrays.sort} on the
 * {@code IDSorter} entries built from {@code typeTopicCounts} (presumably
 * largest count first; confirm against {@code IDSorter.compareTo}).
 *
 * @param numWords the maximum number of words to list per topic; if it is
 *                 larger than the number of word types, all types are listed
 * @return the formatted summary for all topics
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Clamp so that asking for more words than there are types cannot index
    // past the end of sortedWords.
    int limit = Math.min(numWords, numTypes);

    for (int topic = 0; topic < numTopics; topic++) {
        // The array is reused for every topic; each slot is overwritten here.
        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type][topic]);
        }
        Arrays.sort(sortedWords);

        output.append(topic).append('\t').append(tokensPerTopic[topic]).append('\t');
        for (int i = 0; i < limit; i++) {
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/**
 * Prints the words whose weight vectors have the highest-ranked cosine
 * similarity to {@code targetVector}.
 *
 * <p>The squared length of the target vector is printed first. Then, for up to
 * ten entries from the front of the sorted similarity list, one line is
 * printed with the similarity, the word id and the vocabulary entry,
 * separated by tabs. The ranking relies on the natural order of
 * {@code IDSorter} (presumably descending weight; confirm against
 * {@code IDSorter.compareTo}).
 *
 * <p>Note: a target or word vector whose squared length is zero makes its
 * normalizer infinite, so the resulting similarity is NaN.
 *
 * @param targetVector the query vector; its first {@code numColumns} elements are read
 */
public void findClosest(double[] targetVector) {
    IDSorter[] sortedWords = new IDSorter[numWords];

    double targetSquaredSum = 0.0;
    for (int col = 0; col < numColumns; col++) {
        targetSquaredSum += targetVector[col] * targetVector[col];
    }
    double targetNormalizer = 1.0 / Math.sqrt(targetSquaredSum);

    System.out.println(targetSquaredSum);

    for (int word = 0; word < numWords; word++) {
        // Each word's vector starts at word * stride in the flat weights array.
        int offset = word * stride;

        double wordSquaredSum = 0.0;
        for (int col = 0; col < numColumns; col++) {
            wordSquaredSum += weights[offset + col] * weights[offset + col];
        }
        double wordNormalizer = 1.0 / Math.sqrt(wordSquaredSum);

        // Dot product of the two length-normalized vectors, i.e. cosine similarity.
        double innerProduct = 0.0;
        for (int col = 0; col < numColumns; col++) {
            innerProduct += targetNormalizer * targetVector[col] * wordNormalizer * weights[offset + col];
        }

        sortedWords[word] = new IDSorter(word, innerProduct);
    }

    Arrays.sort(sortedWords);

    // Report at most ten entries, but never more than there are words, so a
    // small vocabulary cannot cause an out-of-bounds read.
    int limit = Math.min(10, numWords);
    for (int i = 0; i < limit; i++) {
        System.out.format("%f\t%d\t%s\n", sortedWords[i].getWeight(), sortedWords[i].getID(), vocabulary.lookupObject(sortedWords[i].getID()));
    }
}
/**
 * Writes an XML report of the leading words of every topic.
 *
 * <p>The document has a {@code <topicModel>} root with one {@code <topic>}
 * element per topic (attributes {@code id}, {@code alpha}, {@code totalTokens})
 * and, inside it, one {@code <word rank='n'>} element per reported word.
 * Words are taken in iteration order from the per-topic sets returned by
 * {@code getSortedWords()}.
 *
 * @param out      the writer that receives the report; it is neither flushed nor closed here
 * @param numWords the maximum number of words to report per topic
 */
public void topicXMLReport (PrintWriter out, int numWords) {
    ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords();

    out.println("<?xml version='1.0' ?>");
    out.println("<topicModel>");

    for (int topic = 0; topic < numTopics; topic++) {
        out.println(" <topic id='" + topic + "' alpha='" + alpha[topic] + "' totalTokens='" + tokensPerTopic[topic] + "'>");

        int word = 1;
        Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator();
        while (iterator.hasNext() && word <= numWords) {
            IDSorter info = iterator.next();

            // Vocabulary entries are arbitrary text; escape the characters that
            // are markup in XML element content so the report stays well-formed.
            // '&' must be replaced first so the other entities are not re-escaped.
            String text = String.valueOf(alphabet.lookupObject(info.getID()))
                    .replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");

            out.println(" <word rank='" + word + "'>" + text + "</word>");
            word++;
        }

        out.println(" </topic>");
    }

    out.println("</topicModel>");
}
for (int position=0; position < limit; position++) { IDSorter info = iterator.next(); double probability = info.getWeight() / tokensPerTopic[topic]; cumulativeProbability += probability; formatter.format("<word rank='%d' count='%.0f' prob='%.5f' cumulative='%.5f' docs='%d'", position+1, info.getWeight(), probability, cumulativeProbability, matrix[position][position]);
/**
 * Builds one {@code IDSorter} per word type, weighted by that type's entry
 * for the given topic in {@code typeTopicCounts}, and returns them sorted
 * by the natural order of {@code IDSorter}.
 *
 * @param topic the topic whose per-type values are looked up
 * @return a newly allocated, sorted array with {@code numTypes} elements
 */
public IDSorter[] getSortedTopicWords(int topic) {
    IDSorter[] entries = new IDSorter[numTypes];

    int typeIndex = 0;
    while (typeIndex < numTypes) {
        entries[typeIndex] = new IDSorter(typeIndex, typeTopicCounts[typeIndex].get(topic));
        typeIndex++;
    }

    Arrays.sort(entries);
    return entries;
}
/**
 * Writes an XML report of the leading words of every topic.
 *
 * <p>The document has a {@code <topicModel>} root with one {@code <topic>}
 * element per topic (attributes {@code id}, {@code alpha}, {@code totalTokens})
 * and, inside it, one {@code <word>} element per reported word carrying its
 * {@code rank} and {@code count} (the entry's weight). Words are taken in
 * iteration order from the per-topic sets returned by {@code getSortedWords()}.
 *
 * @param out      the writer that receives the report; it is neither flushed nor closed here
 * @param numWords the maximum number of words to report per topic
 */
public void topicXMLReport (PrintWriter out, int numWords) {
    ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords();

    out.println("<?xml version='1.0' ?>");
    out.println("<topicModel>");

    for (int topic = 0; topic < numTopics; topic++) {
        out.println(" <topic id='" + topic + "' alpha='" + alpha[topic] + "' totalTokens='" + tokensPerTopic[topic] + "'>");

        int rank = 1;
        Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator();
        while (iterator.hasNext() && rank <= numWords) {
            IDSorter info = iterator.next();

            // Vocabulary entries are arbitrary text; escape the characters that
            // are markup in XML element content so the report stays well-formed.
            // '&' must be replaced first so the other entities are not re-escaped.
            String text = String.valueOf(alphabet.lookupObject(info.getID()))
                    .replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");

            out.println(" <word rank='" + rank + "' count='" + info.getWeight() + "'>" + text + "</word>");
            rank++;
        }

        out.println(" </topic>");
    }

    out.println("</topicModel>");
}
/**
 * Prints the words whose weight vectors have the highest-ranked cosine
 * similarity to {@code targetVector}.
 *
 * <p>The squared length of the target vector is printed first. Then, for up to
 * ten entries from the front of the sorted similarity list, one line is
 * printed with the similarity, the word id and the vocabulary entry,
 * separated by tabs. The ranking relies on the natural order of
 * {@code IDSorter} (presumably descending weight; confirm against
 * {@code IDSorter.compareTo}).
 *
 * <p>Note: a target or word vector whose squared length is zero makes its
 * normalizer infinite, so the resulting similarity is NaN.
 *
 * @param targetVector the query vector; its first {@code numColumns} elements are read
 */
public void findClosest(double[] targetVector) {
    IDSorter[] sortedWords = new IDSorter[numWords];

    double targetSquaredSum = 0.0;
    for (int col = 0; col < numColumns; col++) {
        targetSquaredSum += targetVector[col] * targetVector[col];
    }
    double targetNormalizer = 1.0 / Math.sqrt(targetSquaredSum);

    System.out.println(targetSquaredSum);

    for (int word = 0; word < numWords; word++) {
        // Each word's vector starts at word * stride in the flat weights array.
        int offset = word * stride;

        double wordSquaredSum = 0.0;
        for (int col = 0; col < numColumns; col++) {
            wordSquaredSum += weights[offset + col] * weights[offset + col];
        }
        double wordNormalizer = 1.0 / Math.sqrt(wordSquaredSum);

        // Dot product of the two length-normalized vectors, i.e. cosine similarity.
        double innerProduct = 0.0;
        for (int col = 0; col < numColumns; col++) {
            innerProduct += targetNormalizer * targetVector[col] * wordNormalizer * weights[offset + col];
        }

        sortedWords[word] = new IDSorter(word, innerProduct);
    }

    Arrays.sort(sortedWords);

    // Report at most ten entries, but never more than there are words, so a
    // small vocabulary cannot cause an out-of-bounds read.
    int limit = Math.min(10, numWords);
    for (int i = 0; i < limit; i++) {
        System.out.format("%f\t%d\t%s\n", sortedWords[i].getWeight(), sortedWords[i].getID(), vocabulary.lookupObject(sortedWords[i].getID()));
    }
}
/**
 * Builds a one-line-per-topic text summary of the leading words of each topic.
 *
 * <p>Every line has the form {@code <topic>\t<tokensPerTopic[topic]>\t<word> <word> ... }:
 * each word is followed by a single space and the line ends with a newline.
 * Words are listed in the order produced by {@code Arrays.sort} on the
 * {@code IDSorter} entries built from {@code typeTopicCounts} (presumably
 * largest count first; confirm against {@code IDSorter.compareTo}).
 *
 * @param numWords the maximum number of words to list per topic; if it is
 *                 larger than the number of word types, all types are listed
 * @return the formatted summary for all topics
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Clamp so that asking for more words than there are types cannot index
    // past the end of sortedWords.
    int limit = Math.min(numWords, numTypes);

    for (int topic = 0; topic < numTopics; topic++) {
        // The array is reused for every topic; each slot is overwritten here.
        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type][topic]);
        }
        Arrays.sort(sortedWords);

        output.append(topic).append('\t').append(tokensPerTopic[topic]).append('\t');
        for (int i = 0; i < limit; i++) {
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/** Return an array (one element for each topic) of arrays of words, which * are the most probable words for that topic in descending order. These * are returned as Objects, but will probably be Strings. * * @param numWords The maximum length of each topic's array of words (may be less). */ public Object[][] getTopWords(int numWords) { ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords(); Object[][] result = new Object[ numTopics ][]; for (int topic = 0; topic < numTopics; topic++) { TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic); // How many words should we report? Some topics may have fewer than // the default number of words with non-zero weight. int limit = numWords; if (sortedWords.size() < numWords) { limit = sortedWords.size(); } result[topic] = new Object[limit]; Iterator<IDSorter> iterator = sortedWords.iterator(); for (int i=0; i < limit; i++) { IDSorter info = iterator.next(); result[topic][i] = alphabet.lookupObject(info.getID()); } } return result; }
for (int position=0; position < limit; position++) { IDSorter info = iterator.next(); double probability = info.getWeight() / tokensPerTopic[topic]; cumulativeProbability += probability; formatter.format("<word rank='%d' count='%.0f' prob='%.5f' cumulative='%.5f' docs='%d'", position+1, info.getWeight(), probability, cumulativeProbability, matrix[position][position]);
/**
 * Builds one {@code IDSorter} per word type, weighted by that type's entry
 * for the given topic in {@code typeTopicCounts}, and returns them sorted
 * by the natural order of {@code IDSorter}.
 *
 * @param topic the topic whose per-type values are looked up
 * @return a newly allocated, sorted array with {@code numTypes} elements
 */
public IDSorter[] getSortedTopicWords(int topic) {
    IDSorter[] entries = new IDSorter[numTypes];

    int typeIndex = 0;
    while (typeIndex < numTypes) {
        entries[typeIndex] = new IDSorter(typeIndex, typeTopicCounts[typeIndex].get(topic));
        typeIndex++;
    }

    Arrays.sort(entries);
    return entries;
}
for (int topic = 0; topic < numTopics; topic++) { sortedTopics[topic] = new IDSorter(topic, topic); sortedTopics[topic].set(topic, (float) topicCounts[topic] / totalLength); if (sortedTopics[i].getWeight() < threshold) { break; } pw.print (sortedTopics[i].getID() + " " + sortedTopics[i].getWeight() + " ");
/**
 * Writes an XML report of the leading words of every topic.
 *
 * <p>The document has a {@code <topicModel>} root with one {@code <topic>}
 * element per topic (attributes {@code id}, {@code alpha}, {@code totalTokens})
 * and, inside it, one {@code <word>} element per reported word carrying its
 * {@code rank} and {@code count} (the entry's weight). Words are taken in
 * iteration order from the per-topic sets returned by {@code getSortedWords()}.
 *
 * @param out      the writer that receives the report; it is neither flushed nor closed here
 * @param numWords the maximum number of words to report per topic
 */
public void topicXMLReport (PrintWriter out, int numWords) {
    ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords();

    out.println("<?xml version='1.0' ?>");
    out.println("<topicModel>");

    for (int topic = 0; topic < numTopics; topic++) {
        out.println(" <topic id='" + topic + "' alpha='" + alpha[topic] + "' totalTokens='" + tokensPerTopic[topic] + "'>");

        int rank = 1;
        Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator();
        while (iterator.hasNext() && rank <= numWords) {
            IDSorter info = iterator.next();

            // Vocabulary entries are arbitrary text; escape the characters that
            // are markup in XML element content so the report stays well-formed.
            // '&' must be replaced first so the other entities are not re-escaped.
            String text = String.valueOf(alphabet.lookupObject(info.getID()))
                    .replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");

            out.println(" <word rank='" + rank + "' count='" + info.getWeight() + "'>" + text + "</word>");
            rank++;
        }

        out.println(" </topic>");
    }

    out.println("</topicModel>");
}
/**
 * Returns a space-separated list of the leading word types, ordered by
 * sorting {@code IDSorter} entries built from {@code typeCounts}
 * (presumably largest count first; confirm against {@code IDSorter.compareTo}).
 *
 * <p>Each entry is followed by a single space. With {@code withWeight} set,
 * an entry is written as {@code word:weight}; otherwise just {@code word}.
 * Words are looked up in the data alphabet of {@code instances}.
 *
 * @param numWords   the maximum number of words to list; if it is larger than
 *                   the number of word types, all types are listed
 * @param withWeight whether to append {@code :weight} to every word
 * @return the formatted word list
 */
public String getTopWords(int numWords, boolean withWeight) {
    IDSorter[] sortedTypes = new IDSorter[numTypes];
    for (int type = 0; type < numTypes; type++) {
        sortedTypes[type] = new IDSorter(type, typeCounts[type]);
    }
    Arrays.sort(sortedTypes);

    Alphabet alphabet = instances.getDataAlphabet();
    StringBuilder out = new StringBuilder();

    // Clamp so that asking for more words than there are types cannot index
    // past the end of sortedTypes.
    int limit = Math.min(numWords, numTypes);

    for (int i = 0; i < limit; i++) {
        out.append(alphabet.lookupObject(sortedTypes[i].getID()));
        if (withWeight) {
            out.append(':').append(sortedTypes[i].getWeight());
        }
        out.append(' ');
    }

    return out.toString();
}
/**
 * Builds a one-line-per-topic text summary of the leading words of each topic.
 *
 * <p>Every line has the form {@code <topic>\t<tokensPerTopic[topic]>\t<word> <word> ... }:
 * each word is followed by a single space and the line ends with a newline.
 * Words are listed in the order produced by {@code Arrays.sort} on the
 * {@code IDSorter} entries built from {@code typeTopicCounts} (presumably
 * largest count first; confirm against {@code IDSorter.compareTo}).
 *
 * @param numWords the maximum number of words to list per topic; if it is
 *                 larger than the number of word types, all types are listed
 * @return the formatted summary for all topics
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Clamp so that asking for more words than there are types cannot index
    // past the end of sortedWords.
    int limit = Math.min(numWords, numTypes);

    for (int topic = 0; topic < numTopics; topic++) {
        // The array is reused for every topic; each slot is overwritten here.
        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type][topic]);
        }
        Arrays.sort(sortedWords);

        output.append(topic).append('\t').append(tokensPerTopic[topic]).append('\t');
        for (int i = 0; i < limit; i++) {
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/** Return an array (one element for each topic) of arrays of words, which * are the most probable words for that topic in descending order. These * are returned as Objects, but will probably be Strings. * * @param numWords The maximum length of each topic's array of words (may be less). */ public Object[][] getTopWords(int numWords) { ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords(); Object[][] result = new Object[ numTopics ][]; for (int topic = 0; topic < numTopics; topic++) { TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic); // How many words should we report? Some topics may have fewer than // the default number of words with non-zero weight. int limit = numWords; if (sortedWords.size() < numWords) { limit = sortedWords.size(); } result[topic] = new Object[limit]; Iterator<IDSorter> iterator = sortedWords.iterator(); for (int i=0; i < limit; i++) { IDSorter info = iterator.next(); result[topic][i] = alphabet.lookupObject(info.getID()); } } return result; }
for (int position=0; position < limit; position++) { IDSorter info = iterator.next(); double probability = info.getWeight() / tokensPerTopic[topic]; cumulativeProbability += probability; formatter.format("<word rank='%d' count='%.0f' prob='%.5f' cumulative='%.5f' docs='%d'", position+1, info.getWeight(), probability, cumulativeProbability, matrix[position][position]);
/**
 * Builds one {@code IDSorter} per word type, weighted by that type's entry
 * for the given topic in {@code typeTopicCounts}, and returns them sorted
 * by the natural order of {@code IDSorter}.
 *
 * @param topic the topic whose per-type values are looked up
 * @return a newly allocated, sorted array with {@code numTypes} elements
 */
public IDSorter[] getSortedTopicWords(int topic) {
    IDSorter[] entries = new IDSorter[numTypes];

    int typeIndex = 0;
    while (typeIndex < numTypes) {
        entries[typeIndex] = new IDSorter(typeIndex, typeTopicCounts[typeIndex].get(topic));
        typeIndex++;
    }

    Arrays.sort(entries);
    return entries;
}