/**
 * Collects the training portion for one cross-validation round: the union of
 * every fold except the one identified by {@code foldId}.
 *
 * @param foldId index of the fold held out as the test set
 * @return a new DocumentCollection containing the documents of all other folds
 */
public DocumentCollection getTrain(int foldId) {
    DocumentCollection train = new DocumentCollection();
    for (int fold = 0; fold < folds.length; fold++) {
        if (fold == foldId) {
            continue; // skip the held-out test fold
        }
        train.addDocuments(folds[fold].docs);
    }
    return train;
}
public DocumentCollection getRandomSubset(int setSize) { // logger.info("Building random sample"); DocumentCollection res = new DocumentCollection(); if (setSize > remainingCapacity) { System.err .println("Error-requested random subset size exceeds the available set capacity"); System.exit(0); } while (res.docs.size() < setSize) { int i = (int) (Math.random() * ((double) mask.length)); if (i == mask.length) i--; if (isAvailable(i)) { mask[i] = true; remainingCapacity--; res.docs.add(docs.elementAt(i)); } } // logger.info("Done building random sample"); return res; }
public DocumentCollection getRandomSubset(int setSize) { // logger.info("Building random sample"); DocumentCollection res = new DocumentCollection(); if (setSize > remainingCapacity) { System.err .println("Error-requested random subset size exceeds the available set capacity"); System.exit(0); } while (res.docs.size() < setSize) { int i = (int) (Math.random() * ((double) mask.length)); if (i == mask.length) i--; if (isAvailable(i)) { mask[i] = true; remainingCapacity--; res.docs.add(docs.elementAt(i)); } } // logger.info("Done building random sample"); return res; }
/**
 * Builds the training split for cross-validation by concatenating the
 * documents of every fold whose index differs from {@code foldId}.
 *
 * @param foldId index of the excluded (test) fold
 * @return a fresh DocumentCollection with the remaining folds' documents
 */
public DocumentCollection getTrain(int foldId) {
    DocumentCollection result = new DocumentCollection();
    int foldCount = folds.length;
    for (int idx = 0; idx < foldCount; idx++) {
        boolean heldOut = (idx == foldId);
        if (!heldOut) {
            result.addDocuments(folds[idx].docs);
        }
    }
    return result;
}
public DocumentCollection getRandomSubset(int setSize) { // logger.info("Building random sample"); DocumentCollection res = new DocumentCollection(); if (setSize > remainingCapacity) { System.err .println("Error-requested random subset size exceeds the available set capacity"); System.exit(0); } while (res.docs.size() < setSize) { int i = (int) (Math.random() * ((double) mask.length)); if (i == mask.length) i--; if (isAvailable(i)) { mask[i] = true; remainingCapacity--; res.docs.add(docs.elementAt(i)); } } // logger.info("Done building random sample"); return res; }
/**
 * Returns the union of all folds except {@code foldId} — the training split
 * for one round of n-fold cross-validation.
 *
 * @param foldId index of the fold to leave out
 * @return a new DocumentCollection aggregating every other fold
 */
public DocumentCollection getTrain(int foldId) {
    DocumentCollection training = new DocumentCollection();
    int f = 0;
    while (f < folds.length) {
        if (f != foldId) {
            training.addDocuments(folds[f].docs);
        }
        f++;
    }
    return training;
}
/**
 * Draws a class-balanced random sample: {@code numSamplesPerClass} previously
 * unused documents from each of {@code classesNum} classes, marking every
 * selected document as used.
 *
 * @param classesNum number of distinct class IDs (assumed to be 0..classesNum-1
 *     in {@code Document.classID} — TODO confirm with callers)
 * @param numSamplesPerClass documents to draw per class
 * @return a new DocumentCollection with classesNum * numSamplesPerClass documents
 * @throws IllegalArgumentException if any class has fewer than
 *     {@code numSamplesPerClass} available documents
 */
public DocumentCollection getBalancedRandomSubset(int classesNum, int numSamplesPerClass) {
    logger.info("Building random sample");
    // Count how many unused documents each class still has.
    int[] availableCounts = new int[classesNum];
    for (int i = 0; i < this.docs.size(); i++) {
        if (isAvailable(i)) {
            availableCounts[docs.elementAt(i).classID]++;
        }
    }
    for (int i = 0; i < classesNum; i++) {
        if (availableCounts[i] < numSamplesPerClass) {
            // Fixed: formerly printed a garbled message to stderr and called
            // System.exit(0), killing the JVM while reporting success.
            throw new IllegalArgumentException(
                    "Cannot build a balanced sample - not enough elements for one of the classes");
        }
        // Re-purpose the array as the per-class remaining quota.
        availableCounts[i] = numSamplesPerClass;
    }
    DocumentCollection res = new DocumentCollection();
    while (res.docs.size() < classesNum * numSamplesPerClass) {
        // Math.random() is in [0.0, 1.0), so the cast index is always
        // < mask.length; the original "if (i == mask.length) i--" was dead code.
        int i = (int) (Math.random() * mask.length);
        if (isAvailable(i) && availableCounts[docs.elementAt(i).classID] > 0) {
            mask[i] = true;
            availableCounts[docs.elementAt(i).classID]--;
            remainingCapacity--;
            res.docs.add(docs.elementAt(i));
        }
    }
    logger.info("Done building random sample");
    return res;
}
public static void initTopicClassifier(String pathToTopicData, String[] fileNames, String[] _labelnames) { map = new FeatureMap(); labelnames = new String[_labelnames.length + 1]; labelnames[0] = "UNKNOWN"; for (int i = 0; i < _labelnames.length; i++) labelnames[1 + i] = _labelnames[i]; DocumentCollection docs = new DocumentCollection(); StopWords stops = new StopWords(pathToStopWords); for (int i = 0; i < fileNames.length; i++) docs.addDocuments(pathToTopicData + "/" + fileNames[i], i, stops, false, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"); map.addDocs(docs, 20, false); NfoldCrossvalidation cv = new NfoldCrossvalidation(docs, 5); cv.printNfoldCorrssvalidationNbAcc(fileNames.length, -1, 20); // System.exit(0); nb = new MemoryEfficientNB(docs, map, fileNames.length); }
/**
 * Draws a class-balanced random sample: {@code numSamplesPerClass} previously
 * unused documents from each of {@code classesNum} classes, marking every
 * selected document as used.
 *
 * @param classesNum number of distinct class IDs (assumed to be 0..classesNum-1
 *     in {@code Document.classID} — TODO confirm with callers)
 * @param numSamplesPerClass documents to draw per class
 * @return a new DocumentCollection with classesNum * numSamplesPerClass documents
 * @throws IllegalArgumentException if any class has fewer than
 *     {@code numSamplesPerClass} available documents
 */
public DocumentCollection getBalancedRandomSubset(int classesNum, int numSamplesPerClass) {
    logger.info("Building random sample");
    // Count how many unused documents each class still has.
    int[] availableCounts = new int[classesNum];
    for (int i = 0; i < this.docs.size(); i++) {
        if (isAvailable(i)) {
            availableCounts[docs.elementAt(i).classID]++;
        }
    }
    for (int i = 0; i < classesNum; i++) {
        if (availableCounts[i] < numSamplesPerClass) {
            // Fixed: formerly printed a garbled message to stderr and called
            // System.exit(0), killing the JVM while reporting success.
            throw new IllegalArgumentException(
                    "Cannot build a balanced sample - not enough elements for one of the classes");
        }
        // Re-purpose the array as the per-class remaining quota.
        availableCounts[i] = numSamplesPerClass;
    }
    DocumentCollection res = new DocumentCollection();
    while (res.docs.size() < classesNum * numSamplesPerClass) {
        // Math.random() is in [0.0, 1.0), so the cast index is always
        // < mask.length; the original "if (i == mask.length) i--" was dead code.
        int i = (int) (Math.random() * mask.length);
        if (isAvailable(i) && availableCounts[docs.elementAt(i).classID] > 0) {
            mask[i] = true;
            availableCounts[docs.elementAt(i).classID]--;
            remainingCapacity--;
            res.docs.add(docs.elementAt(i));
        }
    }
    logger.info("Done building random sample");
    return res;
}
public static void initTopicClassifier(String pathToTopicData, String[] fileNames, String[] _labelnames) { map = new FeatureMap(); labelnames = new String[_labelnames.length + 1]; labelnames[0] = "UNKNOWN"; for (int i = 0; i < _labelnames.length; i++) labelnames[1 + i] = _labelnames[i]; DocumentCollection docs = new DocumentCollection(); StopWords stops = new StopWords(pathToStopWords); for (int i = 0; i < fileNames.length; i++) docs.addDocuments(pathToTopicData + "/" + fileNames[i], i, stops, false, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"); map.addDocs(docs, 20, false); NfoldCrossvalidation cv = new NfoldCrossvalidation(docs, 5); cv.printNfoldCorrssvalidationNbAcc(fileNames.length, -1, 20); // System.exit(0); nb = new MemoryEfficientNB(docs, map, fileNames.length); }
/**
 * Draws a class-balanced random sample: {@code numSamplesPerClass} previously
 * unused documents from each of {@code classesNum} classes, marking every
 * selected document as used.
 *
 * @param classesNum number of distinct class IDs (assumed to be 0..classesNum-1
 *     in {@code Document.classID} — TODO confirm with callers)
 * @param numSamplesPerClass documents to draw per class
 * @return a new DocumentCollection with classesNum * numSamplesPerClass documents
 * @throws IllegalArgumentException if any class has fewer than
 *     {@code numSamplesPerClass} available documents
 */
public DocumentCollection getBalancedRandomSubset(int classesNum, int numSamplesPerClass) {
    logger.info("Building random sample");
    // Count how many unused documents each class still has.
    int[] availableCounts = new int[classesNum];
    for (int i = 0; i < this.docs.size(); i++) {
        if (isAvailable(i)) {
            availableCounts[docs.elementAt(i).classID]++;
        }
    }
    for (int i = 0; i < classesNum; i++) {
        if (availableCounts[i] < numSamplesPerClass) {
            // Fixed: formerly printed a garbled message to stderr and called
            // System.exit(0), killing the JVM while reporting success.
            throw new IllegalArgumentException(
                    "Cannot build a balanced sample - not enough elements for one of the classes");
        }
        // Re-purpose the array as the per-class remaining quota.
        availableCounts[i] = numSamplesPerClass;
    }
    DocumentCollection res = new DocumentCollection();
    while (res.docs.size() < classesNum * numSamplesPerClass) {
        // Math.random() is in [0.0, 1.0), so the cast index is always
        // < mask.length; the original "if (i == mask.length) i--" was dead code.
        int i = (int) (Math.random() * mask.length);
        if (isAvailable(i) && availableCounts[docs.elementAt(i).classID] > 0) {
            mask[i] = true;
            availableCounts[docs.elementAt(i).classID]--;
            remainingCapacity--;
            res.docs.add(docs.elementAt(i));
        }
    }
    logger.info("Done building random sample");
    return res;
}
public static void initTopicClassifier(String pathToTopicData, String[] fileNames, String[] _labelnames) { map = new FeatureMap(); labelnames = new String[_labelnames.length + 1]; labelnames[0] = "UNKNOWN"; for (int i = 0; i < _labelnames.length; i++) labelnames[1 + i] = _labelnames[i]; DocumentCollection docs = new DocumentCollection(); StopWords stops = new StopWords(pathToStopWords); for (int i = 0; i < fileNames.length; i++) docs.addDocuments(pathToTopicData + "/" + fileNames[i], i, stops, false, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"); map.addDocs(docs, 20, false); NfoldCrossvalidation cv = new NfoldCrossvalidation(docs, 5); cv.printNfoldCorrssvalidationNbAcc(fileNames.length, -1, 20); // System.exit(0); nb = new MemoryEfficientNB(docs, map, fileNames.length); }