/**
 * Builds unigram statistics from every document in the given collection.
 *
 * <p>Stores the repetition flag, then hands each document in {@code docs.docs} to
 * {@code addDoc} in order, logging once before and once after the pass.
 *
 * @param docs collection whose documents are processed
 * @param _countRepsWithinDocs value stored in {@code countRepsWithinDocs}; {@code addDoc}
 *        consults it to decide whether a word repeated inside one document is forwarded
 *        to {@code addWord} every time or only on its first occurrence
 */
public UnigramStatistics(DocumentCollection docs, boolean _countRepsWithinDocs) {
    countRepsWithinDocs = _countRepsWithinDocs;

    logger.info("Building unigram statistics");
    for (Document document : docs.docs) {
        addDoc(document);
    }
    logger.info("Done building unigram statistics");
}
/**
 * Forwards the words of one document to {@code addWord}.
 *
 * <p>When {@code countRepsWithinDocs} is set, every occurrence of a word is forwarded.
 * Otherwise a word is forwarded only the first time it appears in this document.
 *
 * @param doc document whose {@code words} are visited in order
 */
public void addDoc(Document doc) {
    // Per-document record of the words already forwarded; only the keys matter.
    Hashtable<String, Boolean> seenInDoc = new Hashtable<>();

    for (String word : doc.words) {
        // Repeats are skipped only when in-document repetitions are not counted.
        if (!countRepsWithinDocs && seenInDoc.containsKey(word)) {
            continue;
        }
        addWord(word);
        seenInDoc.put(word, true);
    }
}
/**
 * Extends the feature map with the sufficiently frequent words of a document collection.
 *
 * <p>A {@code UnigramStatistics} is built over {@code docs}; every word whose count in
 * {@code stat.wordCounts} is at least {@code appearanceThres} receives the next free
 * feature id ({@code dim}) and is recorded in both {@code wordToFid} and
 * {@code fidToWord}.
 *
 * <p>Words that already have a feature id keep it. Without this check, a second call
 * would overwrite the {@code wordToFid} entry while leaving the old id behind in
 * {@code fidToWord}, so the two maps would no longer be inverses of each other.
 *
 * @param docs collection to gather word counts from
 * @param appearanceThres minimum count a word needs in order to become a feature
 * @param countRepsWithinDoc passed through to the {@code UnigramStatistics} constructor
 */
public void addDocs(DocumentCollection docs, int appearanceThres, boolean countRepsWithinDoc) {
    UnigramStatistics stat = new UnigramStatistics(docs, countRepsWithinDoc);

    for (String w : stat.wordCounts.keySet()) {
        // Never hand out a second id for a word that is already mapped.
        if (wordToFid.containsKey(w)) {
            continue;
        }
        if (stat.wordCounts.get(w) >= appearanceThres) {
            wordToFid.put(w, dim);
            fidToWord.put(dim, w);
            dim++;
        }
    }

    logger.info("Done building a feature map, the dimension is: " + dim);
}
/**
 * Extends the feature map with the sufficiently frequent words of a document collection.
 *
 * <p>A {@code UnigramStatistics} is built over {@code docs}; every word whose count in
 * {@code stat.wordCounts} is at least {@code appearanceThres} receives the next free
 * feature id ({@code dim}) and is recorded in both {@code wordToFid} and
 * {@code fidToWord}.
 *
 * <p>Words that already have a feature id keep it. Without this check, a second call
 * would overwrite the {@code wordToFid} entry while leaving the old id behind in
 * {@code fidToWord}, so the two maps would no longer be inverses of each other.
 *
 * @param docs collection to gather word counts from
 * @param appearanceThres minimum count a word needs in order to become a feature
 * @param countRepsWithinDoc passed through to the {@code UnigramStatistics} constructor
 */
public void addDocs(DocumentCollection docs, int appearanceThres, boolean countRepsWithinDoc) {
    UnigramStatistics stat = new UnigramStatistics(docs, countRepsWithinDoc);

    for (String w : stat.wordCounts.keySet()) {
        // Never hand out a second id for a word that is already mapped.
        if (wordToFid.containsKey(w)) {
            continue;
        }
        if (stat.wordCounts.get(w) >= appearanceThres) {
            wordToFid.put(w, dim);
            fidToWord.put(dim, w);
            dim++;
        }
    }

    logger.info("Done building a feature map, the dimension is: " + dim);
}
/**
 * Forwards the words of one document to {@code addWord}.
 *
 * <p>When {@code countRepsWithinDocs} is set, every occurrence of a word is forwarded.
 * Otherwise a word is forwarded only the first time it appears in this document.
 *
 * @param doc document whose {@code words} are visited in order
 */
public void addDoc(Document doc) {
    // Per-document record of the words already forwarded; only the keys matter.
    Hashtable<String, Boolean> seenInDoc = new Hashtable<>();

    for (String word : doc.words) {
        // Repeats are skipped only when in-document repetitions are not counted.
        if (!countRepsWithinDocs && seenInDoc.containsKey(word)) {
            continue;
        }
        addWord(word);
        seenInDoc.put(word, true);
    }
}
/**
 * Builds unigram statistics from every document in the given collection.
 *
 * <p>Stores the repetition flag, then hands each document in {@code docs.docs} to
 * {@code addDoc} in order, logging once before and once after the pass.
 *
 * @param docs collection whose documents are processed
 * @param _countRepsWithinDocs value stored in {@code countRepsWithinDocs}; {@code addDoc}
 *        consults it to decide whether a word repeated inside one document is forwarded
 *        to {@code addWord} every time or only on its first occurrence
 */
public UnigramStatistics(DocumentCollection docs, boolean _countRepsWithinDocs) {
    countRepsWithinDocs = _countRepsWithinDocs;

    logger.info("Building unigram statistics");
    for (Document document : docs.docs) {
        addDoc(document);
    }
    logger.info("Done building unigram statistics");
}
/**
 * Extends the feature map with the sufficiently frequent words of a document collection.
 *
 * <p>A {@code UnigramStatistics} is built over {@code docs}; every word whose count in
 * {@code stat.wordCounts} is at least {@code appearanceThres} receives the next free
 * feature id ({@code dim}) and is recorded in both {@code wordToFid} and
 * {@code fidToWord}.
 *
 * <p>Words that already have a feature id keep it. Without this check, a second call
 * would overwrite the {@code wordToFid} entry while leaving the old id behind in
 * {@code fidToWord}, so the two maps would no longer be inverses of each other.
 *
 * @param docs collection to gather word counts from
 * @param appearanceThres minimum count a word needs in order to become a feature
 * @param countRepsWithinDoc passed through to the {@code UnigramStatistics} constructor
 */
public void addDocs(DocumentCollection docs, int appearanceThres, boolean countRepsWithinDoc) {
    UnigramStatistics stat = new UnigramStatistics(docs, countRepsWithinDoc);

    for (String w : stat.wordCounts.keySet()) {
        // Never hand out a second id for a word that is already mapped.
        if (wordToFid.containsKey(w)) {
            continue;
        }
        if (stat.wordCounts.get(w) >= appearanceThres) {
            wordToFid.put(w, dim);
            fidToWord.put(dim, w);
            dim++;
        }
    }

    logger.info("Done building a feature map, the dimension is: " + dim);
}
/**
 * Forwards the words of one document to {@code addWord}.
 *
 * <p>When {@code countRepsWithinDocs} is set, every occurrence of a word is forwarded.
 * Otherwise a word is forwarded only the first time it appears in this document.
 *
 * @param doc document whose {@code words} are visited in order
 */
public void addDoc(Document doc) {
    // Per-document record of the words already forwarded; only the keys matter.
    Hashtable<String, Boolean> seenInDoc = new Hashtable<>();

    for (String word : doc.words) {
        // Repeats are skipped only when in-document repetitions are not counted.
        if (!countRepsWithinDocs && seenInDoc.containsKey(word)) {
            continue;
        }
        addWord(word);
        seenInDoc.put(word, true);
    }
}
/**
 * Builds unigram statistics from every document in the given collection.
 *
 * <p>Stores the repetition flag, then hands each document in {@code docs.docs} to
 * {@code addDoc} in order, logging once before and once after the pass.
 *
 * @param docs collection whose documents are processed
 * @param _countRepsWithinDocs value stored in {@code countRepsWithinDocs}; {@code addDoc}
 *        consults it to decide whether a word repeated inside one document is forwarded
 *        to {@code addWord} every time or only on its first occurrence
 */
public UnigramStatistics(DocumentCollection docs, boolean _countRepsWithinDocs) {
    countRepsWithinDocs = _countRepsWithinDocs;

    logger.info("Building unigram statistics");
    for (Document document : docs.docs) {
        addDoc(document);
    }
    logger.info("Done building unigram statistics");
}
/**
 * Builds unigram statistics from a text file, restricted to words known to a feature map.
 *
 * <p>The file is read through {@code InFile.readLineTokens("\n\t ")} until that call
 * returns {@code null}. Each token that is a key of {@code map.wordToFid} is passed to
 * {@code addWord}; all other tokens are ignored.
 *
 * <p>The reader is closed in a {@code finally} block so the file is released even when
 * reading or {@code addWord} throws.
 *
 * @param filename path handed to the {@code InFile} constructor
 * @param map feature map whose {@code wordToFid} keys decide which tokens are counted
 */
public UnigramStatistics(String filename, FeatureMap map) {
    InFile in = new InFile(filename);
    try {
        Vector<String> tokens = in.readLineTokens("\n\t ");
        while (tokens != null) {
            for (String token : tokens) {
                if (map.wordToFid.containsKey(token)) {
                    addWord(token);
                }
            }
            tokens = in.readLineTokens("\n\t ");
        }
    } finally {
        in.close();
    }
}
/**
 * Builds unigram statistics from a text file, restricted to words known to a feature map.
 *
 * <p>The file is read through {@code InFile.readLineTokens("\n\t ")} until that call
 * returns {@code null}. Each token that is a key of {@code map.wordToFid} is passed to
 * {@code addWord}; all other tokens are ignored.
 *
 * <p>The reader is closed in a {@code finally} block so the file is released even when
 * reading or {@code addWord} throws.
 *
 * @param filename path handed to the {@code InFile} constructor
 * @param map feature map whose {@code wordToFid} keys decide which tokens are counted
 */
public UnigramStatistics(String filename, FeatureMap map) {
    InFile in = new InFile(filename);
    try {
        Vector<String> tokens = in.readLineTokens("\n\t ");
        while (tokens != null) {
            for (String token : tokens) {
                if (map.wordToFid.containsKey(token)) {
                    addWord(token);
                }
            }
            tokens = in.readLineTokens("\n\t ");
        }
    } finally {
        in.close();
    }
}
/**
 * Builds unigram statistics from a text file, restricted to words known to a feature map.
 *
 * <p>The file is read through {@code InFile.readLineTokens("\n\t ")} until that call
 * returns {@code null}. Each token that is a key of {@code map.wordToFid} is passed to
 * {@code addWord}; all other tokens are ignored.
 *
 * <p>The reader is closed in a {@code finally} block so the file is released even when
 * reading or {@code addWord} throws.
 *
 * @param filename path handed to the {@code InFile} constructor
 * @param map feature map whose {@code wordToFid} keys decide which tokens are counted
 */
public UnigramStatistics(String filename, FeatureMap map) {
    InFile in = new InFile(filename);
    try {
        Vector<String> tokens = in.readLineTokens("\n\t ");
        while (tokens != null) {
            for (String token : tokens) {
                if (map.wordToFid.containsKey(token)) {
                    addWord(token);
                }
            }
            tokens = in.readLineTokens("\n\t ");
        }
    } finally {
        in.close();
    }
}