public MemoryEfficientNB(DocumentCollection docs, FeatureMap _map, int _classesN) {
    allocateSpace(_map, _classesN);
    for (int i = 0; i < docs.docs.size(); i++) {
        // logger.info("Learning - document " + i + " of " + docs.docs.size());
        onlineLearning(docs.docs.elementAt(i));
    }
}
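// Illustrative usage sketch (hypothetical variables `trainDocs`, `testDoc`,
// and `numClasses`; the individual calls appear in the snippets below):
//
//   FeatureMap map = new FeatureMap();
//   map.addDocs(trainDocs, 5, false); // 5 = min word count, per minWordsAppearenceCount below
//   MemoryEfficientNB nb = new MemoryEfficientNB(trainDocs, map, numClasses);
//   int predicted = nb.classify(testDoc, -1); // -1 threshold = always commit to a class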
public double getAcc(DocumentCollection test) {
    double correct = 0;
    for (int i = 0; i < test.docs.size(); i++) {
        Document doc = test.docs.elementAt(i);
        if (this.classify(doc, -1) == doc.classID)
            correct++;
    }
    return correct / test.docs.size();
}
public String getExtendedFeatures(Document d) {
    double[] conf = getPredictionConfidence(d);
    StringBuilder res = new StringBuilder();
    for (int i = 0; i < classesN; i++)
        if (conf[i] > 0) // skip classes whose normalized confidence underflowed to zero
            res.append(methodName).append(i).append("(").append(conf[i]).append(") ");
    return res.toString();
}
FeatureMap map = new FeatureMap();
map.addDocs(train, minWordsAppearenceCount, false);
MemoryEfficientNB nb = new MemoryEfficientNB(train, map, classesNum);
acc += nb.getAcc(test) / ((double) folds.length);
for (int j = 0; j < test.docs.size(); j++) {
    total++;
    Document d = test.docs.elementAt(j);
    int label = nb.classify(d, thres);
    if (label > -1) {
        // the classifier committed to a label at this threshold
        recall++;
        // (snippet truncated here; the committed label is presumably
        // compared against d.classID before the blocks close)
    }
}
public double[] getPredictionConfidence(Document doc) {
    double[] classLLProbs = new double[classesN];
    int[] activeFeats = doc.getActiveFid(map);
    for (int i = 0; i < classesN; i++) {
        classLLProbs[i] = Math.log(getPrior(i));
        for (int activeFeat : activeFeats)
            classLLProbs[i] += Math.log(getFidProb(activeFeat, i));
    }
    int maxClass = 0;
    for (int i = 0; i < classesN; i++)
        if (classLLProbs[i] > classLLProbs[maxClass])
            maxClass = i;
    // All the log-likelihoods are negative, so we select the maximum LL;
    // e.g. if the LLs were -2001, -2002, -2003, we choose -2001. We then
    // subtract it from every LL before exponentiating (i.e. multiply every
    // probability by e^2001), which keeps the exponentials in a range where
    // floating-point math is precise.
    double denom = 0;
    double[] res = new double[classesN];
    for (int i = 0; i < classesN; i++) {
        res[i] = Math.exp(classLLProbs[i] - classLLProbs[maxClass]);
        denom += res[i];
    }
    for (int i = 0; i < classesN; i++)
        res[i] = res[i] / denom;
    return res;
}
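// Illustrative sketch (not from the original source): the same max-shift
// normalization on hard-coded log-likelihoods, showing why it avoids
// underflow. Math.exp(-2001) underflows to 0.0 as a double, but shifting
// by the maximum first yields exact, well-scaled ratios.
public class LogSumExpDemo {
    public static void main(String[] args) {
        double[] ll = {-2001, -2002, -2003}; // the log-likelihoods from the comment above
        double max = ll[0];
        for (double v : ll)
            if (v > max) max = v;
        double denom = 0;
        double[] p = new double[ll.length];
        for (int i = 0; i < ll.length; i++) {
            p[i] = Math.exp(ll[i] - max); // e^0, e^-1, e^-2: no underflow
            denom += p[i];
        }
        for (int i = 0; i < ll.length; i++)
            System.out.printf("P(class %d) = %.4f%n", i, p[i] / denom);
        // prints approximately 0.6652, 0.2447, 0.0900
    }
}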
public void weightedOnlineLearning(int[] activeFeatures, double weight, int classID) {
    sampleSize += weight;
    classCounts[classID] += weight;
    fidCount += weight * activeFeatures.length;
    for (int activeFeature : activeFeatures) {
        weights[classID] += weight; // accumulated once per active feature, so this tracks the class's weighted token mass
        wordCounts[activeFeature] += weight;
        updateFidCounts(activeFeature, classID, weight);
    }
}
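// Plausible relationship (assumption, not shown in this snippet): the
// unweighted onlineLearning(Document) used by the constructor above is
// presumably this same update with weight 1.0, e.g.
//
//   weightedOnlineLearning(doc.getActiveFid(map), 1.0, doc.classID);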
public Hashtable<String, Integer> getTopPmiWords(int maxWordsPerClass, double confThres, int minAppThres) {
    Hashtable<String, Integer> coolWords = new Hashtable<>();
    for (int i = 0; i < classesN; i++) {
        CharacteristicWords words = this.getTopPmiWords(i, maxWordsPerClass, confThres, minAppThres);
        logger.info(words.toString());
        for (int j = 0; j < words.topWords.size(); j++)
            if (!coolWords.containsKey(words.topWords.elementAt(j)))
                coolWords.put(words.topWords.elementAt(j), 1);
    }
    return coolWords;
}
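// Background note (the standard PMI definition; the per-class overload
// getTopPmiWords(int, ...) is not shown in this snippet): pointwise mutual
// information between a word w and a class c is
//
//   pmi(w, c) = log( P(w, c) / (P(w) * P(c)) ) = log( P(w | c) / P(w) ),
//
// so each class's "top PMI words" are the words whose probability under
// that class most exceeds their overall corpus probability.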
public MemoryEfficientNB(FeatureMap _map, int _classesN) {
    allocateSpace(_map, _classesN);
}
public static void initTopicClassifier(String pathToTopicData, String[] fileNames, String[] _labelnames) {
    map = new FeatureMap();
    labelnames = new String[_labelnames.length + 1];
    labelnames[0] = "UNKNOWN";
    for (int i = 0; i < _labelnames.length; i++)
        labelnames[1 + i] = _labelnames[i];
    DocumentCollection docs = new DocumentCollection();
    StopWords stops = new StopWords(pathToStopWords);
    for (int i = 0; i < fileNames.length; i++)
        docs.addDocuments(pathToTopicData + "/" + fileNames[i], i, stops, false,
                "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~");
    map.addDocs(docs, 20, false);
    NfoldCrossvalidation cv = new NfoldCrossvalidation(docs, 5);
    cv.printNfoldCorrssvalidationNbAcc(fileNames.length, -1, 20);
    // System.exit(0);
    nb = new MemoryEfficientNB(docs, map, fileNames.length);
}
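// Illustrative call (hypothetical paths and file names, not from the original
// source; assumes invocation from the defining class, whose name is not shown
// in this snippet):
//
//   String[] files  = {"sports.txt", "politics.txt", "science.txt"};
//   String[] labels = {"SPORTS", "POLITICS", "SCIENCE"};
//   initTopicClassifier("/path/to/topicData", files, labels);
//
// After this call the static field `nb` holds a model trained on all topics;
// labelnames[0] is reserved for "UNKNOWN", so model class i maps to
// labelnames[i + 1].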