/**
 * Looks up the position of a term in the VSM.
 *
 * @param term the term to look up
 * @return the value the term index holds for {@code term}
 * @throws TermNotFoundException declared by this method; presumably raised by
 *         the underlying index lookup for an unknown term (TODO confirm, the
 *         lookup implementation is not visible here)
 */
public int termIndex(String term) throws TermNotFoundException {
    final int position = termIndex.get(term);
    return position;
} // end termIndex
/**
 * Returns an iterator over the items held by the term index.
 *
 * @return an iterator obtained from the term index's item set
 */
public Iterator<String> terms() {
    final Iterator<String> termIterator = termIndex.itemSet().iterator();
    return termIterator;
} // end terms
/**
 * Returns the size reported by the term index.
 *
 * @return the term index size
 */
public int termCount() {
    final int count = termIndex.size();
    return count;
} // end termCount
// Load both indexes: the term index is read from rowFile and the document
// index from colFile, each through a reader that decodes the bytes as UTF-8.
// documentNumber (and the local l) is then set to the number of items in the
// document index, and that count is logged.
// NOTE(review): the FileInputStream/InputStreamReader objects created here are
// not closed in this fragment - confirm that Index.read closes its reader.
termIndex = new Index(); termIndex.read(new InputStreamReader(new FileInputStream(rowFile), "UTF-8")); documentIndex = new Index(); documentIndex.read(new InputStreamReader(new FileInputStream(colFile), "UTF-8")); int l = documentNumber = documentIndex.itemSet().size(); logger.info(l + " documents");
// Declares the "-df" file derived from matrixName and starts from empty term
// and document indexes. When saveMatrix is set, a sparse binary matrix writer
// is opened on matrixFile (the closing brace of that if is not on this line).
// The log entry reports lineCount, the difference between end and start1
// divided by 1000 (labelled "total s"), the corpus vocabulary size, the term
// index size and totalKW. Finally the term index is written to rowFile and the
// document index to colFile, both through buffered writers encoding as UTF-8.
// NOTE(review): the writers are not closed in this fragment - confirm that
// Index.write flushes and closes them.
File dfFile = new File(matrixName + "-df"); termIndex = new Index(); documentIndex = new Index(); if (saveMatrix) { matrixWriter = new SparseBinaryMatrixFileWriter(matrixFile); logger.info( lineCount + "\t" + ((double) (end.getTime() - start1.getTime()) / 1000) + " total s (" + end + "), voc size:" + corpusVocabulary.size() + ", term index size:" + termIndex.size() + ", totalKW: " + totalKW); termIndex.write(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(rowFile), "UTF-8"))); documentIndex.write(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(colFile), "UTF-8")));
/** * Constructs a reader. */ public TermDocumentMatrixBuilder(String matrixName, File stopwordFile, File keywordFile) throws IOException { totalKW = 0; keywordSet = new TermSet(); keywordSet.read(new FileReader(keywordFile)); logger.info("keyword to be indexed: " + keywordSet.size()); stopwordSet = new TermSet(); stopwordSet.read(new FileReader(stopwordFile)); logger.info(stopwordFile + "(" + stopwordSet.size() + ")"); lengthFreq = new int[101]; columnCount = 0; matrixFile = new File(matrixName + "-matrix"); rowFile = new File(matrixName + "-row"); colFile = new File(matrixName + "-col"); dfFile = new File(matrixName + "-df"); termIndex = new Index(); documentIndex = new Index(); matrixWriter = new SparseBinaryMatrixFileWriter(matrixFile); corpusVocabulary = new Vocabulary(); } // end constructor
// Load both indexes: the term index is read from rowFile and the document
// index from colFile, each through a reader that decodes the bytes as UTF-8.
// documentNumber (and the local l) is then set to the number of items in the
// document index, and that count is logged.
// NOTE(review): the FileInputStream/InputStreamReader objects created here are
// not closed in this fragment - confirm that Index.read closes its reader.
termIndex = new Index(); termIndex.read(new InputStreamReader(new FileInputStream(rowFile), "UTF-8")); documentIndex = new Index(); documentIndex.read(new InputStreamReader(new FileInputStream(colFile), "UTF-8")); int l = documentNumber = documentIndex.itemSet().size(); logger.info(l + " documents");
// Load both indexes: the term index is read from rowFile and the document
// index from colFile, each through a reader that decodes the bytes as UTF-8.
// documentNumber (and the local l) is then set to the number of items in the
// document index, and that count is logged.
// NOTE(review): the FileInputStream/InputStreamReader objects created here are
// not closed in this fragment - confirm that Index.read closes its reader.
termIndex = new Index(); termIndex.read(new InputStreamReader(new FileInputStream(rowFile), "UTF-8")); documentIndex = new Index(); documentIndex.read(new InputStreamReader(new FileInputStream(colFile), "UTF-8")); int l = documentNumber = documentIndex.itemSet().size(); logger.info(l + " documents");
/**
 * Returns the idf stored for the specified term.
 *
 * @param term the term whose idf is requested
 * @return the idf entry at the term's index, or 0 when the term index
 *         reports -1 for {@code term}
 */
public float getIdf(String term) {
    final int position = termIndex.get(term);
    if (position != -1) {
        return Iidf[position];
    }
    // The index reported -1 for this term: fall back to 0.
    return 0;
} // end getIdf
/**
 * Returns an iterator over the items held by the document index.
 *
 * @return an iterator obtained from the document index's item set
 */
public Iterator<String> documents() {
    final Iterator<String> documentIterator = documentIndex.itemSet().iterator();
    return documentIterator;
} // end documents
/**
 * Returns the size reported by the term index.
 *
 * @return the term index size
 */
public int termCount() {
    final int count = termIndex.size();
    return count;
} // end termCount
/**
 * Returns the idf stored for the specified term.
 *
 * @param term the term whose idf is requested
 * @return the idf entry at the term's index, or 0 when the term index
 *         reports -1 for {@code term}
 */
public double getIdf(String term) {
    final int position = termIndex.get(term);
    if (position != -1) {
        return Iidf[position];
    }
    // The index reported -1 for this term: fall back to 0.
    return 0;
} // end getIdf
/**
 * Returns an iterator over the items held by the document index.
 *
 * @return an iterator obtained from the document index's item set
 */
public Iterator<String> documents() {
    final Iterator<String> documentIterator = documentIndex.itemSet().iterator();
    return documentIterator;
} // end documents
/**
 * Returns the size reported by the term index.
 *
 * @return the term index size
 */
public int termCount() {
    final int count = termIndex.size();
    return count;
} // end termCount
/**
 * Returns the idf stored for the specified term.
 *
 * @param term the term whose idf is requested
 * @return the idf entry at the term's index, or 0 when the term index
 *         reports -1 for {@code term}
 */
public float getIdf(String term) {
    final int position = termIndex.get(term);
    if (position != -1) {
        return Iidf[position];
    }
    // The index reported -1 for this term: fall back to 0.
    return 0;
} // end getIdf
/**
 * Returns an iterator over the items held by the term index.
 *
 * @return an iterator obtained from the term index's item set
 */
public Iterator<String> terms() {
    final Iterator<String> termIterator = termIndex.itemSet().iterator();
    return termIterator;
} // end terms
/**
 * Looks up the position of a term in the VSM.
 *
 * @param term the term to look up
 * @return the value the term index holds for {@code term}
 * @throws TermNotFoundException declared by this method; presumably raised by
 *         the underlying index lookup for an unknown term (TODO confirm, the
 *         lookup implementation is not visible here)
 */
public int termIndex(String term) throws TermNotFoundException {
    final int position = termIndex.get(term);
    return position;
} // end termIndex