private void createIdf(Vocabulary voc, int l) { long begin = System.currentTimeMillis(); logger.info("creating idf matrix..."); Iidf = new float[voc.entrySet().size()]; //logger.debug("Iidf.size: " + Iidf.length); // iterates over the types Iterator it = voc.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); String term = (String) me.getKey(); Vocabulary.TermFrequency tf = (Vocabulary.TermFrequency) me.getValue(); int index = termIndex.get(term); Iidf[index] = log2((float) l / tf.get()); //logger.info(index + ": " + l + "/"+ tf.get() + " = " + Iidf[index]); } // end while //for (int i=0;i<Iidf.length;i++) // logger.info(i + " " + Iidf[i]); long end = System.currentTimeMillis(); logger.info("took " + (end - begin) + " ms"); } // end createIdf
/**
 * Logs the term-frequency distribution of this vocabulary: one line per
 * frequency 1..10, followed by a single bucket for frequencies above 10,
 * each annotated with the cumulative fraction of the vocabulary covered.
 *
 * @throws IOException declared for interface compatibility
 */
public void stat() throws IOException {
    // freqCount[1..10] count terms with that exact frequency;
    // freqCount[0] doubles as the "frequency > 10" overflow bucket
    int[] freqCount = new int[11];
    for (Object k : keySet()) {
        int freq = get((String) k);
        int bucket = (freq <= 10) ? freq : 0;
        freqCount[bucket]++;
    }
    double cumulative = 0;
    for (int f = 1; f < freqCount.length; f++) {
        cumulative += (double) freqCount[f] / size();
        logger.info("F(" + f + ")=" + freqCount[f] + " (" + cumulative + ")");
    }
    cumulative += (double) freqCount[0] / size();
    logger.info("F(freq>10)=" + freqCount[0] + " (" + cumulative + ")");
} // end stat
// NOTE(review): this fragment appears truncated/garbled by extraction: braces are
// unbalanced, `token`, `freq`, `term`, and `j` are never declared in the visible
// span, `documentVocabulary.add(token)` is duplicated, and the statements after
// `return;` would be unreachable. Recover the original method body before editing.
// Intent (as far as visible): build a per-document vocabulary, skipping stopwords,
// then fold its terms into `corpusVocabulary`.
private void addDocument(String[] array) throws IOException { Vocabulary documentVocabulary = new Vocabulary(); if (stopwordSet.size() == 0) { documentVocabulary.add(token); } else if (!stopwordSet.contains(token)) { documentVocabulary.add(token); documentVocabulary.add(token); if (documentVocabulary.size() == 0) { return; int size = documentVocabulary.entrySet().size(); Iterator<String> it = documentVocabulary.keySet().iterator(); while (it.hasNext()) { freq = documentVocabulary.get(term); corpusVocabulary.add(term); j++;
// NOTE(review): truncated/garbled fragment, same shape as the private addDocument
// variant above in intent: unbalanced braces, undeclared `token`/`term`/`j`,
// duplicated add(token), and unreachable code after `return;`. This variant logs
// each added token at debug level and iterates entrySet() instead of keySet().
// Recover the complete original method before making changes.
protected void addDocument(String[] array) throws IOException { Vocabulary documentVocabulary = new Vocabulary(); if (stopwordSet.size() == 0) { logger.debug("1 adding " + token); documentVocabulary.add(token); } else if (!stopwordSet.contains(token)) { logger.debug("2 adding " + token); documentVocabulary.add(token); documentVocabulary.add(token); if (documentVocabulary.size() == 0) { return; int size = documentVocabulary.entrySet().size(); Iterator it = documentVocabulary.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); corpusVocabulary.add(term); j++;
// NOTE(review): garbled fragment from a line-counting progress loop. The orphaned
// expression `+ " total s, voc size:" + corpusVocabulary.size());` after the first
// logger.info call is not attached to any statement, and the closing `}` before
// `else if` has no visible opening brace. Appears to log progress every N lines
// and checkpoint the vocabulary (UTF-8) every 500 lines. Recover the original
// loop before editing; the BufferedWriter created here is presumably closed by
// Vocabulary.write — TODO confirm, otherwise it leaks.
corpusVocabulary = new Vocabulary(); logger.info( lineCount + "\t" + ((double) (end.getTime() - start1.getTime()) / 1000) + " total s (" + end + "), voc size:" + corpusVocabulary.size() + ", term index size:" + termIndex.size() + ", totalKW: " + totalKW); + " total s, voc size:" + corpusVocabulary.size()); start = new Date(); } else if ((lineCount % 500) == 0) { corpusVocabulary.write(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dfFile), "UTF-8")));
// Reload the document-frequency vocabulary from disk (UTF-8) and rebuild the
// idf vector over l documents. NOTE(review): `dfFile` and `l` come from the
// enclosing (not visible) scope; the reader is presumably closed inside
// Vocabulary.read — TODO confirm.
Vocabulary voc = new Vocabulary();
voc.read(new InputStreamReader(new FileInputStream(dfFile), "UTF-8"));
createIdf(voc, l);
/**
 * Closes the readers.
 *
 * NOTE(review): every persistence call in the body is commented out, so this is
 * currently a deliberate no-op; the {@code throws IOException} clause is kept so
 * callers' error handling remains compatible if the writes are re-enabled.
 */
public void close() throws IOException {
    // termIndex.write(new FileWriter(rowFile));
    // documentIndex.write(new FileWriter(colFile));
    // matrixWriter.close();
    // corpusVocabulary.write(new FileWriter(dfFile));
} // end close
/** * Constructs a reader. */ public TermDocumentMatrixBuilder(String matrixName, File stopwordFile, File keywordFile) throws IOException { totalKW = 0; keywordSet = new TermSet(); keywordSet.read(new FileReader(keywordFile)); logger.info("keyword to be indexed: " + keywordSet.size()); stopwordSet = new TermSet(); stopwordSet.read(new FileReader(stopwordFile)); logger.info(stopwordFile + "(" + stopwordSet.size() + ")"); lengthFreq = new int[101]; columnCount = 0; matrixFile = new File(matrixName + "-matrix"); rowFile = new File(matrixName + "-row"); colFile = new File(matrixName + "-col"); dfFile = new File(matrixName + "-df"); termIndex = new Index(); documentIndex = new Index(); matrixWriter = new SparseBinaryMatrixFileWriter(matrixFile); corpusVocabulary = new Vocabulary(); } // end constructor
// Reload the document-frequency vocabulary from disk (UTF-8) and rebuild the
// idf vector over l documents. NOTE(review): `dfFile` and `l` come from the
// enclosing (not visible) scope; the reader is presumably closed inside
// Vocabulary.read — TODO confirm.
Vocabulary voc = new Vocabulary();
voc.read(new InputStreamReader(new FileInputStream(dfFile), "UTF-8"));
createIdf(voc, l);
// Reload the document-frequency vocabulary from disk (UTF-8) and rebuild the
// idf vector over l documents. NOTE(review): `dfFile` and `l` come from the
// enclosing (not visible) scope; the reader is presumably closed inside
// Vocabulary.read — TODO confirm.
Vocabulary voc = new Vocabulary();
voc.read(new InputStreamReader(new FileInputStream(dfFile), "UTF-8"));
createIdf(voc, l);
private void createIdf(Vocabulary voc, int l) { long begin = System.currentTimeMillis(); logger.info("creating idf vector..."); Iidf = new float[voc.entrySet().size()]; //logger.debug("Iidf.size: " + Iidf.length); // iterates over the types Iterator it = voc.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); String term = (String) me.getKey(); Vocabulary.TermFrequency tf = (Vocabulary.TermFrequency) me.getValue(); int index = termIndex.get(term); Iidf[index] = (float) log2((double) l / tf.get()); //logger.info(index + ": " + l + "/"+ tf.get() + " = " + Iidf[index]); } // end while //for (int i=0;i<Iidf.length;i++) // logger.info(i + " " + Iidf[i]); long end = System.currentTimeMillis(); logger.info("took " + (end - begin) + " ms"); } // end createIdf
private void createIdf(Vocabulary voc, int l) { long begin = System.currentTimeMillis(); logger.info("creating idf matrix..."); Iidf = new double[voc.entrySet().size()]; //logger.debug("Iidf.size: " + Iidf.length); // iterates over the types Iterator it = voc.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); String term = (String) me.getKey(); Vocabulary.TermFrequency tf = (Vocabulary.TermFrequency) me.getValue(); int index = termIndex.get(term); Iidf[index] = log2((double) l / tf.get()); //logger.info(index + ": " + l + "/"+ tf.get() + " = " + Iidf[index]); } // end while //for (int i=0;i<Iidf.length;i++) // logger.info(i + " " + Iidf[i]); long end = System.currentTimeMillis(); logger.info("took " + (end - begin) + " ms"); } // end createIdf