// Fragment of a larger method body (its header is not visible here).
// For each topic index below numTopics: create a fresh Formatter over a StringBuilder
// with Locale.US, write the topic index and model.getTopicProbabilities(docID)[topic]
// (three decimals), then append at most five "word (weight)" pairs taken in iteration
// order from topicSortedWords.get(topic), and print the formatted line to stdout.
// The final println("\n") writes a newline character followed by the line terminator.
// NOTE(review): `out`, `docID`, `model`, `dataAlphabet` and `topicSortedWords` are
// declared outside this fragment -- confirm their types against the enclosing method.
for (int topic = 0; topic < numTopics; topic++) { Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator(); out = new Formatter(new StringBuilder(), Locale.US); out.format("%d\t%.3f\t", topic, model.getTopicProbabilities(docID)[topic]); int rank = 0; while (iterator.hasNext() && rank < 5) { IDSorter idCountPair = iterator.next(); out.format("%s (%.3f) ", dataAlphabet.lookupObject(idCountPair.getID()), idCountPair.getWeight()); rank++; } System.out.println(out); } System.out.println("\n");
/**
 * Writes an XML report of the leading words of every topic.
 *
 * <p>The document consists of a {@code <topicModel>} root containing one
 * {@code <topic>} element per topic (attributes {@code id}, {@code alpha},
 * {@code totalTokens}), each holding up to {@code numWords} {@code <word>}
 * elements in the iteration order of the sets returned by
 * {@code getSortedWords()}.
 *
 * @param out      writer that receives the report; it is neither flushed nor closed here
 * @param numWords maximum number of words written per topic; values below 1 write none
 */
public void topicXMLReport (PrintWriter out, int numWords) {
    ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords();

    out.println("<?xml version='1.0' ?>");
    out.println("<topicModel>");

    for (int topic = 0; topic < numTopics; topic++) {
        out.println(" <topic id='" + topic + "' alpha='" + alpha[topic]
                    + "' totalTokens='" + tokensPerTopic[topic] + "'>");

        int rank = 1;
        Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator();
        while (iterator.hasNext() && rank <= numWords) {
            IDSorter info = iterator.next();

            // Words are arbitrary text: escape the characters that would otherwise
            // be parsed as markup so the report stays well-formed XML.
            // '&' must be replaced first so the entities added below are not re-escaped.
            String word = String.valueOf(alphabet.lookupObject(info.getID()))
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;");

            out.println(" <word rank='" + rank + "' count='" + info.getWeight() + "'>"
                        + word + "</word>");
            rank++;
        }

        out.println(" </topic>");
    }

    out.println("</topicModel>");
}
/**
 * Writes an XML report of the leading words of every topic.
 *
 * <p>The document consists of a {@code <topicModel>} root containing one
 * {@code <topic>} element per topic (attributes {@code id}, {@code alpha},
 * {@code totalTokens}), each holding up to {@code numWords} {@code <word>}
 * elements in the iteration order of the sets returned by
 * {@code getSortedWords()}.
 *
 * @param out      writer that receives the report; it is neither flushed nor closed here
 * @param numWords maximum number of words written per topic; values below 1 write none
 */
public void topicXMLReport (PrintWriter out, int numWords) {
    ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords();

    out.println("<?xml version='1.0' ?>");
    out.println("<topicModel>");

    for (int topic = 0; topic < numTopics; topic++) {
        out.println(" <topic id='" + topic + "' alpha='" + alpha[topic]
                    + "' totalTokens='" + tokensPerTopic[topic] + "'>");

        int rank = 1;
        Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator();
        while (iterator.hasNext() && rank <= numWords) {
            IDSorter info = iterator.next();

            // Words are arbitrary text: escape the characters that would otherwise
            // be parsed as markup so the report stays well-formed XML.
            // '&' must be replaced first so the entities added below are not re-escaped.
            String word = String.valueOf(alphabet.lookupObject(info.getID()))
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;");

            out.println(" <word rank='" + rank + "' count='" + info.getWeight() + "'>"
                        + word + "</word>");
            rank++;
        }

        out.println(" </topic>");
    }

    out.println("</topicModel>");
}
/**
 * Builds a text summary with one line per non-empty topic.
 *
 * <p>Each line holds the topic index, the topic's entry in {@code labelAlphabet},
 * and its token count, each followed by a tab, then up to {@code numWords}
 * space-terminated words. Topics whose token count is zero are skipped.
 *
 * @param numWords maximum number of words listed per topic; values larger than
 *                 the number of types are clamped instead of overrunning the array
 * @return the summary, one newline-terminated line per reported topic
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();

    // Reused across topics; every slot is overwritten before each sort.
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedWords.length);

    for (int topic = 0; topic < numTopics; topic++) {
        if (tokensPerTopic[topic] == 0) {
            continue;
        }

        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type][topic]);
        }
        // NOTE(review): relies on IDSorter's natural ordering putting the largest
        // weights first -- confirm against IDSorter.compareTo.
        Arrays.sort(sortedWords);

        output.append(topic).append('\t')
              .append(labelAlphabet.lookupObject(topic)).append('\t')
              .append(tokensPerTopic[topic]).append('\t');

        for (int i = 0; i < limit; i++) {
            // Stop at the first word that does not occur in this topic.
            if (sortedWords[i].getWeight() == 0) {
                break;
            }
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/**
 * Builds a text summary with one line per non-empty topic.
 *
 * <p>Each line holds the topic index, the topic's entry in {@code labelAlphabet},
 * and its token count, each followed by a tab, then up to {@code numWords}
 * space-terminated words. Topics whose token count is zero are skipped.
 *
 * @param numWords maximum number of words listed per topic; values larger than
 *                 the number of types are clamped instead of overrunning the array
 * @return the summary, one newline-terminated line per reported topic
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();

    // Reused across topics; every slot is overwritten before each sort.
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedWords.length);

    for (int topic = 0; topic < numTopics; topic++) {
        if (tokensPerTopic[topic] == 0) {
            continue;
        }

        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type][topic]);
        }
        // NOTE(review): relies on IDSorter's natural ordering putting the largest
        // weights first -- confirm against IDSorter.compareTo.
        Arrays.sort(sortedWords);

        output.append(topic).append('\t')
              .append(labelAlphabet.lookupObject(topic)).append('\t')
              .append(tokensPerTopic[topic]).append('\t');

        for (int i = 0; i < limit; i++) {
            // Stop at the first word that does not occur in this topic.
            if (sortedWords[i].getWeight() == 0) {
                break;
            }
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/**
 * Builds a text summary with one line per non-empty topic.
 *
 * <p>Each line holds the topic index, the topic's entry in {@code labelAlphabet},
 * and its token count, each followed by a tab, then up to {@code numWords}
 * space-terminated words. Topics whose token count is zero are skipped.
 *
 * @param numWords maximum number of words listed per topic; values larger than
 *                 the number of types are clamped instead of overrunning the array
 * @return the summary, one newline-terminated line per reported topic
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();

    // Reused across topics; every slot is overwritten before each sort.
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedWords.length);

    for (int topic = 0; topic < numTopics; topic++) {
        if (tokensPerTopic[topic] == 0) {
            continue;
        }

        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type][topic]);
        }
        // NOTE(review): relies on IDSorter's natural ordering putting the largest
        // weights first -- confirm against IDSorter.compareTo.
        Arrays.sort(sortedWords);

        output.append(topic).append('\t')
              .append(labelAlphabet.lookupObject(topic)).append('\t')
              .append(tokensPerTopic[topic]).append('\t');

        for (int i = 0; i < limit; i++) {
            // Stop at the first word that does not occur in this topic.
            if (sortedWords[i].getWeight() == 0) {
                break;
            }
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/**
 * Prints, for every topic, the leading documents from the per-topic rankings
 * returned by {@code getTopicDocuments(10.0)}, one line per document in the
 * form {@code topic doc name proportion}, preceded by a single header line.
 *
 * @param out A print writer
 * @param max Print at most this number of top documents per topic; a negative
 *            value never matches the stop condition, so all documents are printed
 */
public void printTopicDocuments (PrintWriter out, int max) {
    out.println("#topic doc name proportion ...");

    // NOTE(review): the meaning of the 10.0 argument is defined by
    // getTopicDocuments, which is not visible here -- confirm before changing it.
    ArrayList<TreeSet<IDSorter>> topicSortedDocuments = getTopicDocuments(10.0);

    for (int topic = 0; topic < numTopics; topic++) {
        TreeSet<IDSorter> sortedDocuments = topicSortedDocuments.get(topic);

        int i = 0;
        for (IDSorter sorter : sortedDocuments) {
            if (i == max) {
                break;
            }

            int doc = sorter.getID();
            double proportion = sorter.getWeight();

            // The instance name is not guaranteed to be a String, so render it
            // with toString() rather than casting, which would throw
            // ClassCastException for any other name type.
            Object nameObject = data.get(doc).instance.getName();
            String name = (nameObject == null) ? "no-name" : nameObject.toString();

            out.format("%d %d %s %f\n", topic, doc, name, proportion);
            i++;
        }
    }
}
/**
 * Prints, for every topic, the leading documents from the per-topic rankings
 * returned by {@code getTopicDocuments(10.0)}, one line per document in the
 * form {@code topic doc name proportion}, preceded by a single header line.
 *
 * @param out A print writer
 * @param max Print at most this number of top documents per topic; a negative
 *            value never matches the stop condition, so all documents are printed
 */
public void printTopicDocuments (PrintWriter out, int max) {
    out.println("#topic doc name proportion ...");

    // NOTE(review): the meaning of the 10.0 argument is defined by
    // getTopicDocuments, which is not visible here -- confirm before changing it.
    ArrayList<TreeSet<IDSorter>> topicSortedDocuments = getTopicDocuments(10.0);

    for (int topic = 0; topic < numTopics; topic++) {
        TreeSet<IDSorter> sortedDocuments = topicSortedDocuments.get(topic);

        int i = 0;
        for (IDSorter sorter : sortedDocuments) {
            if (i == max) {
                break;
            }

            int doc = sorter.getID();
            double proportion = sorter.getWeight();

            // The instance name is not guaranteed to be a String, so render it
            // with toString() rather than casting, which would throw
            // ClassCastException for any other name type.
            Object nameObject = data.get(doc).instance.getName();
            String name = (nameObject == null) ? "no-name" : nameObject.toString();

            out.format("%d %d %s %f\n", topic, doc, name, proportion);
            i++;
        }
    }
}
/**
 * Scores every word against {@code targetVector} and prints the first ten
 * entries of the sorted result to standard output.
 *
 * <p>The score is the inner product of the two vectors after each has been
 * scaled by the reciprocal of its Euclidean norm. Each word's vector is read
 * from {@code weights} starting at {@code word * stride}. The squared norm of
 * the target is printed first, on its own line.
 *
 * <p>A vector whose squared sum is zero yields a non-finite normalizer and
 * therefore a non-finite score; that case is not special-cased here.
 *
 * @param targetVector the query vector; its first {@code numColumns} entries are read
 */
public void findClosest(double[] targetVector) {
    IDSorter[] sortedWords = new IDSorter[numWords];

    double targetSquaredSum = 0.0;
    for (int col = 0; col < numColumns; col++) {
        targetSquaredSum += targetVector[col] * targetVector[col];
    }
    double targetNormalizer = 1.0 / Math.sqrt(targetSquaredSum);

    System.out.println(targetSquaredSum);

    for (int word = 0; word < numWords; word++) {
        // The word's norm must be complete before any product term is scaled,
        // hence two passes over the columns.
        double wordSquaredSum = 0.0;
        for (int col = 0; col < numColumns; col++) {
            wordSquaredSum += weights[word * stride + col] * weights[word * stride + col];
        }
        double wordNormalizer = 1.0 / Math.sqrt(wordSquaredSum);

        double innerProduct = 0.0;
        for (int col = 0; col < numColumns; col++) {
            innerProduct += targetNormalizer * targetVector[col] * wordNormalizer * weights[word * stride + col];
        }

        sortedWords[word] = new IDSorter(word, innerProduct);
    }

    // NOTE(review): assumes IDSorter's natural ordering puts the highest
    // scores first -- confirm against IDSorter.compareTo.
    Arrays.sort(sortedWords);

    // Print at most ten results; a vocabulary smaller than ten words must not
    // overrun the array.
    int numToPrint = Math.min(10, sortedWords.length);
    for (int i = 0; i < numToPrint; i++) {
        System.out.format("%f\t%d\t%s\n", sortedWords[i].getWeight(),
                          sortedWords[i].getID(), vocabulary.lookupObject(sortedWords[i].getID()));
    }
}
/**
 * Builds a text summary with one line per topic key in {@code docsPerTopic}.
 *
 * <p>Each line holds the topic key and its entry in {@code tokensPerTopic},
 * each followed by a tab, then up to {@code numWords} space-terminated words.
 * Listing stops early at the first word whose weight is below 1.0.
 *
 * @param numWords maximum number of words listed per topic; values larger than
 *                 the number of types are clamped instead of overrunning the array
 * @return the summary, one newline-terminated line per topic
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();

    // Reused across topics; every slot is overwritten before each sort.
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedWords.length);

    for (int topic : docsPerTopic.keys()) {
        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type].get(topic));
        }
        // NOTE(review): relies on IDSorter's natural ordering putting the largest
        // weights first -- confirm against IDSorter.compareTo.
        Arrays.sort(sortedWords);

        output.append(topic).append('\t')
              .append(tokensPerTopic.get(topic)).append('\t');

        for (int i = 0; i < limit; i++) {
            // Stop at the first word with a weight below one.
            if (sortedWords[i].getWeight() < 1.0) {
                break;
            }
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/**
 * Returns the leading words by {@code typeCounts}, separated by spaces.
 *
 * <p>Every type is wrapped in an {@code IDSorter} with its count, the array is
 * sorted, and the first {@code numWords} entries are looked up in the data
 * alphabet of {@code instances}. Each entry is followed by a single space.
 *
 * @param numWords   maximum number of words to return; values larger than the
 *                   number of types are clamped instead of overrunning the array
 * @param withWeight when true, each word is written as {@code word:weight}
 * @return the space-terminated list of words, empty if {@code numWords} is not positive
 */
public String getTopWords(int numWords, boolean withWeight) {
    IDSorter[] sortedTypes = new IDSorter[numTypes];
    for (int type = 0; type < numTypes; type++) {
        sortedTypes[type] = new IDSorter(type, typeCounts[type]);
    }
    // NOTE(review): relies on IDSorter's natural ordering putting the largest
    // weights first -- confirm against IDSorter.compareTo.
    Arrays.sort(sortedTypes);

    Alphabet alphabet = instances.getDataAlphabet();

    // The buffer never leaves this method, so no synchronization is needed.
    StringBuilder out = new StringBuilder();

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedTypes.length);
    for (int i = 0; i < limit; i++) {
        out.append(alphabet.lookupObject(sortedTypes[i].getID()));
        if (withWeight) {
            out.append(':').append(sortedTypes[i].getWeight());
        }
        out.append(' ');
    }

    return out.toString();
}
/**
 * Returns the leading words by {@code typeCounts}, separated by spaces.
 *
 * <p>Every type is wrapped in an {@code IDSorter} with its count, the array is
 * sorted, and the first {@code numWords} entries are looked up in the data
 * alphabet of {@code instances}. Each entry is followed by a single space.
 *
 * @param numWords   maximum number of words to return; values larger than the
 *                   number of types are clamped instead of overrunning the array
 * @param withWeight when true, each word is written as {@code word:weight}
 * @return the space-terminated list of words, empty if {@code numWords} is not positive
 */
public String getTopWords(int numWords, boolean withWeight) {
    IDSorter[] sortedTypes = new IDSorter[numTypes];
    for (int type = 0; type < numTypes; type++) {
        sortedTypes[type] = new IDSorter(type, typeCounts[type]);
    }
    // NOTE(review): relies on IDSorter's natural ordering putting the largest
    // weights first -- confirm against IDSorter.compareTo.
    Arrays.sort(sortedTypes);

    Alphabet alphabet = instances.getDataAlphabet();

    // The buffer never leaves this method, so no synchronization is needed.
    StringBuilder out = new StringBuilder();

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedTypes.length);
    for (int i = 0; i < limit; i++) {
        out.append(alphabet.lookupObject(sortedTypes[i].getID()));
        if (withWeight) {
            out.append(':').append(sortedTypes[i].getWeight());
        }
        out.append(' ');
    }

    return out.toString();
}
/**
 * Builds a text summary with one line per topic key in {@code docsPerTopic}.
 *
 * <p>Each line holds the topic key and its entry in {@code tokensPerTopic},
 * each followed by a tab, then up to {@code numWords} space-terminated words.
 * Listing stops early at the first word whose weight is below 1.0.
 *
 * @param numWords maximum number of words listed per topic; values larger than
 *                 the number of types are clamped instead of overrunning the array
 * @return the summary, one newline-terminated line per topic
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();

    // Reused across topics; every slot is overwritten before each sort.
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedWords.length);

    for (int topic : docsPerTopic.keys()) {
        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type].get(topic));
        }
        // NOTE(review): relies on IDSorter's natural ordering putting the largest
        // weights first -- confirm against IDSorter.compareTo.
        Arrays.sort(sortedWords);

        output.append(topic).append('\t')
              .append(tokensPerTopic.get(topic)).append('\t');

        for (int i = 0; i < limit; i++) {
            // Stop at the first word with a weight below one.
            if (sortedWords[i].getWeight() < 1.0) {
                break;
            }
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/**
 * Returns the leading words by {@code typeCounts}, separated by spaces.
 *
 * <p>Every type is wrapped in an {@code IDSorter} with its count, the array is
 * sorted, and the first {@code numWords} entries are looked up in the data
 * alphabet of {@code instances}. Each entry is followed by a single space.
 *
 * @param numWords   maximum number of words to return; values larger than the
 *                   number of types are clamped instead of overrunning the array
 * @param withWeight when true, each word is written as {@code word:weight}
 * @return the space-terminated list of words, empty if {@code numWords} is not positive
 */
public String getTopWords(int numWords, boolean withWeight) {
    IDSorter[] sortedTypes = new IDSorter[numTypes];
    for (int type = 0; type < numTypes; type++) {
        sortedTypes[type] = new IDSorter(type, typeCounts[type]);
    }
    // NOTE(review): relies on IDSorter's natural ordering putting the largest
    // weights first -- confirm against IDSorter.compareTo.
    Arrays.sort(sortedTypes);

    Alphabet alphabet = instances.getDataAlphabet();

    // The buffer never leaves this method, so no synchronization is needed.
    StringBuilder out = new StringBuilder();

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedTypes.length);
    for (int i = 0; i < limit; i++) {
        out.append(alphabet.lookupObject(sortedTypes[i].getID()));
        if (withWeight) {
            out.append(':').append(sortedTypes[i].getWeight());
        }
        out.append(' ');
    }

    return out.toString();
}
/**
 * Builds a text summary with one line per topic key in {@code docsPerTopic}.
 *
 * <p>Each line holds the topic key and its entry in {@code tokensPerTopic},
 * each followed by a tab, then up to {@code numWords} space-terminated words.
 * Listing stops early at the first word whose weight is below 1.0.
 *
 * @param numWords maximum number of words listed per topic; values larger than
 *                 the number of types are clamped instead of overrunning the array
 * @return the summary, one newline-terminated line per topic
 */
public String topWords (int numWords) {
    StringBuilder output = new StringBuilder();

    // Reused across topics; every slot is overwritten before each sort.
    IDSorter[] sortedWords = new IDSorter[numTypes];

    // Never index past the array, even if the caller asks for more words than exist.
    int limit = Math.min(numWords, sortedWords.length);

    for (int topic : docsPerTopic.keys()) {
        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, typeTopicCounts[type].get(topic));
        }
        // NOTE(review): relies on IDSorter's natural ordering putting the largest
        // weights first -- confirm against IDSorter.compareTo.
        Arrays.sort(sortedWords);

        output.append(topic).append('\t')
              .append(tokensPerTopic.get(topic)).append('\t');

        for (int i = 0; i < limit; i++) {
            // Stop at the first word with a weight below one.
            if (sortedWords[i].getWeight() < 1.0) {
                break;
            }
            output.append(alphabet.lookupObject(sortedWords[i].getID())).append(' ');
        }
        output.append('\n');
    }

    return output.toString();
}
/**
 * Computes, for every topic, the reciprocal of the sum of squared word
 * probabilities, stored under the score name {@code "eff_num_words"}.
 *
 * <p>A word's probability is its weight in {@code topicSortedWords} divided by
 * the topic's entry in {@code model.tokensPerTopic}. When that sum is zero
 * (for example, a topic with no words) the division yields a non-finite score;
 * this is left as-is.
 *
 * @return a {@code TopicScores} holding one score per topic
 */
public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        double sumSquaredProbabilities = 0.0;

        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);
        for (IDSorter info : sortedWords) {
            // getWeight() is a floating-point value, so this is not integer division.
            double probability = info.getWeight() / tokensPerTopic[topic];
            sumSquaredProbabilities += probability * probability;
        }

        scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
}
/**
 * Computes, for every topic, the reciprocal of the sum of squared word
 * probabilities, stored under the score name {@code "eff_num_words"}.
 *
 * <p>A word's probability is its weight in {@code topicSortedWords} divided by
 * the topic's entry in {@code model.tokensPerTopic}. When that sum is zero
 * (for example, a topic with no words) the division yields a non-finite score;
 * this is left as-is.
 *
 * @return a {@code TopicScores} holding one score per topic
 */
public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        double sumSquaredProbabilities = 0.0;

        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);
        for (IDSorter info : sortedWords) {
            // getWeight() is a floating-point value, so this is not integer division.
            double probability = info.getWeight() / tokensPerTopic[topic];
            sumSquaredProbabilities += probability * probability;
        }

        scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
}
/**
 * Computes, for every topic, the reciprocal of the sum of squared word
 * probabilities, stored under the score name {@code "eff_num_words"}.
 *
 * <p>A word's probability is its weight in {@code topicSortedWords} divided by
 * the topic's entry in {@code model.tokensPerTopic}. When that sum is zero
 * (for example, a topic with no words) the division yields a non-finite score;
 * this is left as-is.
 *
 * @return a {@code TopicScores} holding one score per topic
 */
public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        double sumSquaredProbabilities = 0.0;

        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);
        for (IDSorter info : sortedWords) {
            // getWeight() is a floating-point value, so this is not integer division.
            double probability = info.getWeight() / tokensPerTopic[topic];
            sumSquaredProbabilities += probability * probability;
        }

        scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
}
/**
 * Computes, for every topic, the sum over its words of
 * {@code p * log(p * numTypes)}, where {@code p} is the word's weight divided
 * by the topic's entry in {@code model.tokensPerTopic} and {@code numTypes} is
 * the alphabet size. Results are stored under the score name
 * {@code "uniform_dist"}.
 *
 * <p>The per-word terms of the first {@code numTopWords} words, in the
 * iteration order of {@code topicSortedWords}, are also recorded as word scores.
 *
 * @return a {@code TopicScores} holding one topic score and up to
 *         {@code numTopWords} word scores per topic
 */
public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {
        double topicScore = 0.0;
        int position = 0;

        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);
        for (IDSorter info : sortedWords) {
            double count = info.getWeight();

            // A zero count would evaluate 0 * log(0) = 0 * -Infinity = NaN and
            // poison the whole topic sum; such a term contributes nothing
            // (the usual 0 * log 0 = 0 convention).
            double score = 0.0;
            if (count != 0.0) {
                score = (count / tokensPerTopic[topic])
                      * Math.log( (count * numTypes) / tokensPerTopic[topic] );
            }

            if (position < numTopWords) {
                scores.setTopicWordScore(topic, position, score);
            }

            topicScore += score;
            position++;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
/**
 * Computes, for every topic, the sum over its words of
 * {@code p * log(p * numTypes)}, where {@code p} is the word's weight divided
 * by the topic's entry in {@code model.tokensPerTopic} and {@code numTypes} is
 * the alphabet size. Results are stored under the score name
 * {@code "uniform_dist"}.
 *
 * <p>The per-word terms of the first {@code numTopWords} words, in the
 * iteration order of {@code topicSortedWords}, are also recorded as word scores.
 *
 * @return a {@code TopicScores} holding one topic score and up to
 *         {@code numTopWords} word scores per topic
 */
public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {
        double topicScore = 0.0;
        int position = 0;

        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);
        for (IDSorter info : sortedWords) {
            double count = info.getWeight();

            // A zero count would evaluate 0 * log(0) = 0 * -Infinity = NaN and
            // poison the whole topic sum; such a term contributes nothing
            // (the usual 0 * log 0 = 0 convention).
            double score = 0.0;
            if (count != 0.0) {
                score = (count / tokensPerTopic[topic])
                      * Math.log( (count * numTypes) / tokensPerTopic[topic] );
            }

            if (position < numTopWords) {
                scores.setTopicWordScore(topic, position, score);
            }

            topicScore += score;
            position++;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}