private void createIdf(Vocabulary voc, int l) { long begin = System.currentTimeMillis(); logger.info("creating idf matrix..."); Iidf = new float[voc.entrySet().size()]; //logger.debug("Iidf.size: " + Iidf.length); // iterates over the types Iterator it = voc.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); String term = (String) me.getKey(); Vocabulary.TermFrequency tf = (Vocabulary.TermFrequency) me.getValue(); int index = termIndex.get(term); Iidf[index] = log2((float) l / tf.get()); //logger.info(index + ": " + l + "/"+ tf.get() + " = " + Iidf[index]); } // end while //for (int i=0;i<Iidf.length;i++) // logger.info(i + " " + Iidf[i]); long end = System.currentTimeMillis(); logger.info("took " + (end - begin) + " ms"); } // end createIdf
/**
 * Logs the term-frequency distribution of this vocabulary: one line per
 * frequency 1..10, followed by a single bucket for frequencies above 10,
 * each annotated with the cumulative fraction of the vocabulary covered.
 *
 * @throws IOException declared for interface compatibility
 */
public void stat() throws IOException {
    // freqCount[1..10] count terms with that exact frequency;
    // freqCount[0] doubles as the "frequency > 10" overflow bucket
    int[] freqCount = new int[11];
    for (Object k : keySet()) {
        int freq = get((String) k);
        int bucket = (freq <= 10) ? freq : 0;
        freqCount[bucket]++;
    }
    double cumulative = 0;
    for (int f = 1; f < freqCount.length; f++) {
        cumulative += (double) freqCount[f] / size();
        logger.info("F(" + f + ")=" + freqCount[f] + " (" + cumulative + ")");
    }
    cumulative += (double) freqCount[0] / size();
    logger.info("F(freq>10)=" + freqCount[0] + " (" + cumulative + ")");
} // end stat
// NOTE(review): this fragment appears truncated/garbled by extraction: braces are
// unbalanced, `token`, `freq`, `term`, and `j` are never declared in the visible
// span, `documentVocabulary.add(token)` is duplicated, and the statements after
// `return;` would be unreachable. Recover the original method body before editing.
// Intent (as far as visible): build a per-document vocabulary, skipping stopwords,
// then fold its terms into `corpusVocabulary`.
private void addDocument(String[] array) throws IOException { Vocabulary documentVocabulary = new Vocabulary(); if (stopwordSet.size() == 0) { documentVocabulary.add(token); } else if (!stopwordSet.contains(token)) { documentVocabulary.add(token); documentVocabulary.add(token); if (documentVocabulary.size() == 0) { return; int size = documentVocabulary.entrySet().size(); Iterator<String> it = documentVocabulary.keySet().iterator(); while (it.hasNext()) { freq = documentVocabulary.get(term); corpusVocabulary.add(term); j++;
// NOTE(review): truncated/garbled fragment, same shape as the private addDocument
// variant above in intent: unbalanced braces, undeclared `token`/`term`/`j`,
// duplicated add(token), and unreachable code after `return;`. This variant logs
// each added token at debug level and iterates entrySet() instead of keySet().
// Recover the complete original method before making changes.
protected void addDocument(String[] array) throws IOException { Vocabulary documentVocabulary = new Vocabulary(); if (stopwordSet.size() == 0) { logger.debug("1 adding " + token); documentVocabulary.add(token); } else if (!stopwordSet.contains(token)) { logger.debug("2 adding " + token); documentVocabulary.add(token); documentVocabulary.add(token); if (documentVocabulary.size() == 0) { return; int size = documentVocabulary.entrySet().size(); Iterator it = documentVocabulary.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); corpusVocabulary.add(term); j++;
// NOTE(review): garbled fragment from a line-counting progress loop. The orphaned
// expression `+ " total s, voc size:" + corpusVocabulary.size());` after the first
// logger.info call is not attached to any statement, and the closing `}` before
// `else if` has no visible opening brace. Appears to log progress every N lines
// and checkpoint the vocabulary (UTF-8) every 500 lines. Recover the original
// loop before editing; the BufferedWriter created here is presumably closed by
// Vocabulary.write — TODO confirm, otherwise it leaks.
corpusVocabulary = new Vocabulary(); logger.info( lineCount + "\t" + ((double) (end.getTime() - start1.getTime()) / 1000) + " total s (" + end + "), voc size:" + corpusVocabulary.size() + ", term index size:" + termIndex.size() + ", totalKW: " + totalKW); + " total s, voc size:" + corpusVocabulary.size()); start = new Date(); } else if ((lineCount % 500) == 0) { corpusVocabulary.write(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dfFile), "UTF-8")));
// Reload the document-frequency vocabulary from disk (UTF-8) and rebuild the
// idf vector over l documents. NOTE(review): `dfFile` and `l` come from the
// enclosing (not visible) scope; the reader is presumably closed inside
// Vocabulary.read — TODO confirm.
Vocabulary voc = new Vocabulary();
voc.read(new InputStreamReader(new FileInputStream(dfFile), "UTF-8"));
createIdf(voc, l);
/**
 * Closes the readers.
 *
 * NOTE(review): every persistence call in the body is commented out, so this is
 * currently a deliberate no-op; the {@code throws IOException} clause is kept so
 * callers' error handling remains compatible if the writes are re-enabled.
 */
public void close() throws IOException {
    // termIndex.write(new FileWriter(rowFile));
    // documentIndex.write(new FileWriter(colFile));
    // matrixWriter.close();
    // corpusVocabulary.write(new FileWriter(dfFile));
} // end close
/** * Constructs a reader. */ public TermDocumentMatrixBuilder(String matrixName, File stopwordFile, File keywordFile) throws IOException { totalKW = 0; keywordSet = new TermSet(); keywordSet.read(new FileReader(keywordFile)); logger.info("keyword to be indexed: " + keywordSet.size()); stopwordSet = new TermSet(); stopwordSet.read(new FileReader(stopwordFile)); logger.info(stopwordFile + "(" + stopwordSet.size() + ")"); lengthFreq = new int[101]; columnCount = 0; matrixFile = new File(matrixName + "-matrix"); rowFile = new File(matrixName + "-row"); colFile = new File(matrixName + "-col"); dfFile = new File(matrixName + "-df"); termIndex = new Index(); documentIndex = new Index(); matrixWriter = new SparseBinaryMatrixFileWriter(matrixFile); corpusVocabulary = new Vocabulary(); } // end constructor
// Reload the document-frequency vocabulary from disk (UTF-8) and rebuild the
// idf vector over l documents. NOTE(review): `dfFile` and `l` come from the
// enclosing (not visible) scope; the reader is presumably closed inside
// Vocabulary.read — TODO confirm.
Vocabulary voc = new Vocabulary();
voc.read(new InputStreamReader(new FileInputStream(dfFile), "UTF-8"));
createIdf(voc, l);
// Reload the document-frequency vocabulary from disk (UTF-8) and rebuild the
// idf vector over l documents. NOTE(review): `dfFile` and `l` come from the
// enclosing (not visible) scope; the reader is presumably closed inside
// Vocabulary.read — TODO confirm.
Vocabulary voc = new Vocabulary();
voc.read(new InputStreamReader(new FileInputStream(dfFile), "UTF-8"));
createIdf(voc, l);
private void createIdf(Vocabulary voc, int l) { long begin = System.currentTimeMillis(); logger.info("creating idf vector..."); Iidf = new float[voc.entrySet().size()]; //logger.debug("Iidf.size: " + Iidf.length); // iterates over the types Iterator it = voc.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); String term = (String) me.getKey(); Vocabulary.TermFrequency tf = (Vocabulary.TermFrequency) me.getValue(); int index = termIndex.get(term); Iidf[index] = (float) log2((double) l / tf.get()); //logger.info(index + ": " + l + "/"+ tf.get() + " = " + Iidf[index]); } // end while //for (int i=0;i<Iidf.length;i++) // logger.info(i + " " + Iidf[i]); long end = System.currentTimeMillis(); logger.info("took " + (end - begin) + " ms"); } // end createIdf
private void createIdf(Vocabulary voc, int l) { long begin = System.currentTimeMillis(); logger.info("creating idf matrix..."); Iidf = new double[voc.entrySet().size()]; //logger.debug("Iidf.size: " + Iidf.length); // iterates over the types Iterator it = voc.entrySet().iterator(); while (it.hasNext()) { Map.Entry me = (Map.Entry) it.next(); String term = (String) me.getKey(); Vocabulary.TermFrequency tf = (Vocabulary.TermFrequency) me.getValue(); int index = termIndex.get(term); Iidf[index] = log2((double) l / tf.get()); //logger.info(index + ": " + l + "/"+ tf.get() + " = " + Iidf[index]); } // end while //for (int i=0;i<Iidf.length;i++) // logger.info(i + " " + Iidf[i]); long end = System.currentTimeMillis(); logger.info("took " + (end - begin) + " ms"); } // end createIdf