/**
 * Reads lines from {@code in} until one yields a non-empty token list, and wraps it
 * in a {@link Document} with the given initial class id.
 *
 * @param initClassID class id assigned to the returned document
 * @return the next non-empty document, or {@code null} when the input is exhausted
 */
public Document nextDoc(int initClassID) {
    Vector<String> tokens;
    do {
        tokens = in.readLineTokens(tokenizationDelimiters);
        // Optionally drop the leading token of each line (presumably a per-line label
        // such as a class name — TODO confirm against the input format).
        if (discardFirstToken && tokens != null && tokens.size() > 0) tokens.removeElementAt(0);
        // Matches the original behavior: the filter is invoked even when tokens is null.
        if (stops != null) tokens = stops.filterStopWords(tokens);
        // A non-empty line becomes the next document; empty lines are skipped.
        if (tokens != null && tokens.size() > 0) return new Document(tokens, initClassID);
    } while (tokens != null);
    // readLineTokens returned null: end of input.
    return null;
}
}
public static void initTopicClassifier(String pathToTopicData, String[] fileNames, String[] _labelnames) { map = new FeatureMap(); labelnames = new String[_labelnames.length + 1]; labelnames[0] = "UNKNOWN"; for (int i = 0; i < _labelnames.length; i++) labelnames[1 + i] = _labelnames[i]; DocumentCollection docs = new DocumentCollection(); StopWords stops = new StopWords(pathToStopWords); for (int i = 0; i < fileNames.length; i++) docs.addDocuments(pathToTopicData + "/" + fileNames[i], i, stops, false, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"); map.addDocs(docs, 20, false); NfoldCrossvalidation cv = new NfoldCrossvalidation(docs, 5); cv.printNfoldCorrssvalidationNbAcc(fileNames.length, -1, 20); // System.exit(0); nb = new MemoryEfficientNB(docs, map, fileNames.length); }
public static void initTopicClassifier(String pathToTopicData, String[] fileNames, String[] _labelnames) { map = new FeatureMap(); labelnames = new String[_labelnames.length + 1]; labelnames[0] = "UNKNOWN"; for (int i = 0; i < _labelnames.length; i++) labelnames[1 + i] = _labelnames[i]; DocumentCollection docs = new DocumentCollection(); StopWords stops = new StopWords(pathToStopWords); for (int i = 0; i < fileNames.length; i++) docs.addDocuments(pathToTopicData + "/" + fileNames[i], i, stops, false, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"); map.addDocs(docs, 20, false); NfoldCrossvalidation cv = new NfoldCrossvalidation(docs, 5); cv.printNfoldCorrssvalidationNbAcc(fileNames.length, -1, 20); // System.exit(0); nb = new MemoryEfficientNB(docs, map, fileNames.length); }
/**
 * Reads lines from {@code in} until one yields a non-empty token list, and wraps it
 * in a {@link Document} with the given initial class id.
 *
 * @param initClassID class id assigned to the returned document
 * @return the next non-empty document, or {@code null} when the input is exhausted
 */
public Document nextDoc(int initClassID) {
    Vector<String> tokens;
    do {
        tokens = in.readLineTokens(tokenizationDelimiters);
        // Optionally drop the leading token of each line (presumably a per-line label
        // such as a class name — TODO confirm against the input format).
        if (discardFirstToken && tokens != null && tokens.size() > 0) tokens.removeElementAt(0);
        // Matches the original behavior: the filter is invoked even when tokens is null.
        if (stops != null) tokens = stops.filterStopWords(tokens);
        // A non-empty line becomes the next document; empty lines are skipped.
        if (tokens != null && tokens.size() > 0) return new Document(tokens, initClassID);
    } while (tokens != null);
    // readLineTokens returned null: end of input.
    return null;
}
}
public static void initTopicClassifier(String pathToTopicData, String[] fileNames, String[] _labelnames) { map = new FeatureMap(); labelnames = new String[_labelnames.length + 1]; labelnames[0] = "UNKNOWN"; for (int i = 0; i < _labelnames.length; i++) labelnames[1 + i] = _labelnames[i]; DocumentCollection docs = new DocumentCollection(); StopWords stops = new StopWords(pathToStopWords); for (int i = 0; i < fileNames.length; i++) docs.addDocuments(pathToTopicData + "/" + fileNames[i], i, stops, false, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"); map.addDocs(docs, 20, false); NfoldCrossvalidation cv = new NfoldCrossvalidation(docs, 5); cv.printNfoldCorrssvalidationNbAcc(fileNames.length, -1, 20); // System.exit(0); nb = new MemoryEfficientNB(docs, map, fileNames.length); }
/**
 * Reads lines from {@code in} until one yields a non-empty token list, and wraps it
 * in a {@link Document} with the given initial class id.
 *
 * @param initClassID class id assigned to the returned document
 * @return the next non-empty document, or {@code null} when the input is exhausted
 */
public Document nextDoc(int initClassID) {
    Vector<String> tokens;
    do {
        tokens = in.readLineTokens(tokenizationDelimiters);
        // Optionally drop the leading token of each line (presumably a per-line label
        // such as a class name — TODO confirm against the input format).
        if (discardFirstToken && tokens != null && tokens.size() > 0) tokens.removeElementAt(0);
        // Matches the original behavior: the filter is invoked even when tokens is null.
        if (stops != null) tokens = stops.filterStopWords(tokens);
        // A non-empty line becomes the next document; empty lines are skipped.
        if (tokens != null && tokens.size() > 0) return new Document(tokens, initClassID);
    } while (tokens != null);
    // readLineTokens returned null: end of input.
    return null;
}
}
/**
 * Loads one document per non-empty line of {@code filename} into this collection,
 * assigning every document the same class id.
 *
 * @param filename               file to read, one document per line
 * @param classID                class id assigned to every document read
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param discardFirstToken      if true, drop the first token of each line
 *                               (presumably a per-line label — TODO confirm)
 * @param tokenizationDelimiters characters that separate tokens
 */
public void addDocuments(String filename, int classID, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    Vector<String> words = in.readLineTokens(tokenizationDelimiters);
    if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
    if (stops != null) words = stops.filterStopWords(words);
    while (words != null) {
        // BUG FIX: was (words.size() >= 0), which is always true and therefore added
        // an empty Document for every blank/fully-filtered line. Skip empty lines,
        // consistent with nextDoc() and addFolder().
        if (words.size() > 0) docs.addElement(new Document(words, classID));
        words = in.readLineTokens(tokenizationDelimiters);
        if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
        if (stops != null) words = stops.filterStopWords(words);
    }
    // NOTE(review): 'in' is never closed — if InFile exposes a close(), it should be
    // called here; confirm against the InFile API.
}
/**
 * Loads one document per non-empty line of {@code filename} into this collection,
 * assigning every document the same class id.
 *
 * @param filename               file to read, one document per line
 * @param classID                class id assigned to every document read
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param discardFirstToken      if true, drop the first token of each line
 *                               (presumably a per-line label — TODO confirm)
 * @param tokenizationDelimiters characters that separate tokens
 */
public void addDocuments(String filename, int classID, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    Vector<String> words = in.readLineTokens(tokenizationDelimiters);
    if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
    if (stops != null) words = stops.filterStopWords(words);
    while (words != null) {
        // BUG FIX: was (words.size() >= 0), which is always true and therefore added
        // an empty Document for every blank/fully-filtered line. Skip empty lines,
        // consistent with nextDoc() and addFolder().
        if (words.size() > 0) docs.addElement(new Document(words, classID));
        words = in.readLineTokens(tokenizationDelimiters);
        if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
        if (stops != null) words = stops.filterStopWords(words);
    }
    // NOTE(review): 'in' is never closed — if InFile exposes a close(), it should be
    // called here; confirm against the InFile API.
}
/**
 * Loads one document per non-empty line of {@code filename} into this collection,
 * assigning every document the same class id.
 *
 * @param filename               file to read, one document per line
 * @param classID                class id assigned to every document read
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param discardFirstToken      if true, drop the first token of each line
 *                               (presumably a per-line label — TODO confirm)
 * @param tokenizationDelimiters characters that separate tokens
 */
public void addDocuments(String filename, int classID, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    Vector<String> words = in.readLineTokens(tokenizationDelimiters);
    if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
    if (stops != null) words = stops.filterStopWords(words);
    while (words != null) {
        // BUG FIX: was (words.size() >= 0), which is always true and therefore added
        // an empty Document for every blank/fully-filtered line. Skip empty lines,
        // consistent with nextDoc() and addFolder().
        if (words.size() > 0) docs.addElement(new Document(words, classID));
        words = in.readLineTokens(tokenizationDelimiters);
        if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
        if (stops != null) words = stops.filterStopWords(words);
    }
    // NOTE(review): 'in' is never closed — if InFile exposes a close(), it should be
    // called here; confirm against the InFile API.
}
/**
 * Loads every file in {@code path} as one document (all lines concatenated) with the
 * given class id.
 *
 * @param path                   directory whose files are read, one document per file
 * @param classID                class id assigned to every document read
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param discardFirstToken      if true, drop the first token of each line
 *                               (presumably a per-line label — TODO confirm)
 * @param tokenizationDelimiters characters that separate tokens
 * @throws IllegalArgumentException if {@code path} cannot be listed as a directory
 */
public void addFolder(String path, int classID, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    String[] files = (new File(path)).list();
    // ROBUSTNESS: File.list() returns null when 'path' is not a readable directory;
    // fail with a descriptive message instead of an opaque NPE in the for-each below.
    if (files == null) throw new IllegalArgumentException("Cannot list directory: " + path);
    for (String file : files) {
        InFile in = new InFile(path + "/" + file);
        // All lines of the file are accumulated into a single document.
        Vector<String> allWords = new Vector<>();
        Vector<String> words = in.readLineTokens(tokenizationDelimiters);
        if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
        if (stops != null) words = stops.filterStopWords(words);
        while (words != null) {
            for (int j = 0; j < words.size(); j++) allWords.addElement(words.elementAt(j));
            words = in.readLineTokens(tokenizationDelimiters);
            if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
            if (stops != null) words = stops.filterStopWords(words);
        }
        // NOTE(review): 'in' is never closed — if InFile exposes a close(), it should
        // be called here; confirm against the InFile API.
        docs.addElement(new Document(allWords, classID));
    }
}
/**
 * Loads every file in {@code path} as one document (all lines concatenated) with the
 * given class id.
 *
 * @param path                   directory whose files are read, one document per file
 * @param classID                class id assigned to every document read
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param discardFirstToken      if true, drop the first token of each line
 *                               (presumably a per-line label — TODO confirm)
 * @param tokenizationDelimiters characters that separate tokens
 * @throws IllegalArgumentException if {@code path} cannot be listed as a directory
 */
public void addFolder(String path, int classID, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    String[] files = (new File(path)).list();
    // ROBUSTNESS: File.list() returns null when 'path' is not a readable directory;
    // fail with a descriptive message instead of an opaque NPE in the for-each below.
    if (files == null) throw new IllegalArgumentException("Cannot list directory: " + path);
    for (String file : files) {
        InFile in = new InFile(path + "/" + file);
        // All lines of the file are accumulated into a single document.
        Vector<String> allWords = new Vector<>();
        Vector<String> words = in.readLineTokens(tokenizationDelimiters);
        if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
        if (stops != null) words = stops.filterStopWords(words);
        while (words != null) {
            for (int j = 0; j < words.size(); j++) allWords.addElement(words.elementAt(j));
            words = in.readLineTokens(tokenizationDelimiters);
            if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
            if (stops != null) words = stops.filterStopWords(words);
        }
        // NOTE(review): 'in' is never closed — if InFile exposes a close(), it should
        // be called here; confirm against the InFile API.
        docs.addElement(new Document(allWords, classID));
    }
}
/**
 * Loads every file in {@code path} as one document (all lines concatenated) with the
 * given class id.
 *
 * @param path                   directory whose files are read, one document per file
 * @param classID                class id assigned to every document read
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param discardFirstToken      if true, drop the first token of each line
 *                               (presumably a per-line label — TODO confirm)
 * @param tokenizationDelimiters characters that separate tokens
 * @throws IllegalArgumentException if {@code path} cannot be listed as a directory
 */
public void addFolder(String path, int classID, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    String[] files = (new File(path)).list();
    // ROBUSTNESS: File.list() returns null when 'path' is not a readable directory;
    // fail with a descriptive message instead of an opaque NPE in the for-each below.
    if (files == null) throw new IllegalArgumentException("Cannot list directory: " + path);
    for (String file : files) {
        InFile in = new InFile(path + "/" + file);
        // All lines of the file are accumulated into a single document.
        Vector<String> allWords = new Vector<>();
        Vector<String> words = in.readLineTokens(tokenizationDelimiters);
        if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
        if (stops != null) words = stops.filterStopWords(words);
        while (words != null) {
            for (int j = 0; j < words.size(); j++) allWords.addElement(words.elementAt(j));
            words = in.readLineTokens(tokenizationDelimiters);
            if ((discardFirstToken) && (words != null) && (words.size() > 0)) words.removeElementAt(0);
            if (stops != null) words = stops.filterStopWords(words);
        }
        // NOTE(review): 'in' is never closed — if InFile exposes a close(), it should
        // be called here; confirm against the InFile API.
        docs.addElement(new Document(allWords, classID));
    }
}
/**
 * Builds a document from an entire file: every line is tokenized, optionally
 * stop-word filtered, and all resulting tokens are concatenated into {@code words}.
 *
 * @param filename               file whose full contents form this document
 * @param _classID               class id stored on this document
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param tokenizationDelimiters characters that separate tokens
 */
public Document(String filename, int _classID, StopWords stops, String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    this.classID = _classID;
    words = new Vector<>();
    for (Vector<String> line = in.readLineTokens(tokenizationDelimiters); line != null; line = in.readLineTokens(tokenizationDelimiters)) {
        if (stops != null) line = stops.filterStopWords(line);
        // Accumulate the line's surviving tokens into the document body.
        words.addAll(line);
    }
    // Release the slack capacity of the backing array.
    words.trimToSize();
}
/**
 * Builds a document from an entire file: every line is tokenized, optionally
 * stop-word filtered, and all resulting tokens are concatenated into {@code words}.
 *
 * @param filename               file whose full contents form this document
 * @param _classID               class id stored on this document
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param tokenizationDelimiters characters that separate tokens
 */
public Document(String filename, int _classID, StopWords stops, String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    this.classID = _classID;
    words = new Vector<>();
    for (Vector<String> line = in.readLineTokens(tokenizationDelimiters); line != null; line = in.readLineTokens(tokenizationDelimiters)) {
        if (stops != null) line = stops.filterStopWords(line);
        // Accumulate the line's surviving tokens into the document body.
        words.addAll(line);
    }
    // Release the slack capacity of the backing array.
    words.trimToSize();
}
/**
 * Builds a document from an entire file: every line is tokenized, optionally
 * stop-word filtered, and all resulting tokens are concatenated into {@code words}.
 *
 * @param filename               file whose full contents form this document
 * @param _classID               class id stored on this document
 * @param stops                  stop-word filter, or {@code null} to skip filtering
 * @param tokenizationDelimiters characters that separate tokens
 */
public Document(String filename, int _classID, StopWords stops, String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    this.classID = _classID;
    words = new Vector<>();
    for (Vector<String> line = in.readLineTokens(tokenizationDelimiters); line != null; line = in.readLineTokens(tokenizationDelimiters)) {
        if (stops != null) line = stops.filterStopWords(line);
        // Accumulate the line's surviving tokens into the document body.
        words.addAll(line);
    }
    // Release the slack capacity of the backing array.
    words.trimToSize();
}