/**
 * Completes the aggregation phase by consolidating the accumulated
 * dictionary entries.
 *
 * @throws Exception if dictionary finalization fails
 */
@Override
public void finalizeAggregation() throws Exception {
  finalizeDictionary();
}
@Override public DictionaryBuilder aggregate(DictionaryBuilder toAgg) throws Exception { Map<String, int[]>[] toAggDicts = toAgg.getDictionaries(false); if (toAggDicts.length != m_dictsPerClass.length) { throw new Exception("Number of dictionaries from the builder to " + "be aggregated does not match our number of dictionaries"); } // we assume that the order of class values is consistent for (int i = 0; i < toAggDicts.length; i++) { Map<String, int[]> toAggDictForClass = toAggDicts[i]; for (Map.Entry<String, int[]> e : toAggDictForClass.entrySet()) { int[] ourCounts = m_dictsPerClass[i].get(e.getKey()); if (ourCounts == null) { ourCounts = new int[2]; m_dictsPerClass[i].put(e.getKey(), ourCounts); } ourCounts[0] += e.getValue()[0]; // word count ourCounts[1] += e.getValue()[1]; // doc count } } m_count += toAgg.m_count; m_docLengthSum += toAgg.m_docLengthSum; return this; }
throws Exception { if (m_vectorizer.readyToVectorize() && inputFormat.equalHeaders(m_vectorizer.getInputFormat())) { return m_vectorizer.getVectorizedFormat(); m_vectorizer.reset(); m_vectorizer.setup(inputFormat); m_vectorizer.loadDictionary(m_dictionarySource); } else if (m_textDictionarySource != null) { m_vectorizer.loadDictionary(m_textDictionarySource); } else { + "' does not seem to exist!"); m_vectorizer.loadDictionary(dictFile, !m_dictionaryIsBinary); return m_vectorizer.getVectorizedFormat();
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input instance
 *          structure (any instances contained in the object are ignored -
 *          only the structure is required).
 * @return true if the outputFormat may be collected immediately
 * @throws Exception if the input format can't be set successfully
 */
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  super.setInputFormat(instanceInfo);

  // start from a clean slate before (re)configuring the builder
  m_dictionaryBuilder.reset();
  // emit dictionary entries in sorted order; no normalization is applied
  // while the dictionary itself is being constructed
  m_dictionaryBuilder.setSortDictionary(true);
  m_dictionaryBuilder.setNormalize(false);
  m_dictionaryBuilder.setup(instanceInfo);

  // the output format is not known until the dictionary has been built,
  // so it cannot be collected immediately
  return false;
}
public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testSaveLoadDictionaryPlainTextNoNormalize() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); StringWriter sw = new StringWriter(); builder.saveDictionary(sw); StringReader sr = new StringReader(sw.toString()); DictionaryBuilder builder2 = new DictionaryBuilder(); builder2.setup(structure); builder2.loadDictionary(sr); // just returns the loaded dictionary Map<String, int[]> consolidated = builder2.finalizeDictionary(); assertEquals(2, consolidated.size()); }
public void testVectorizeInstanceWordCountsNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setOutputWordCounts(true); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // "the" occurs twice in the first index and "over" once assertEquals(2, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
String value = Utils.getOption('R', options); if (value.length() != 0) { setSelectedRange(value); } else { setSelectedRange("first-last"); setInvertSelection(Utils.getFlag('V', options)); setAttributeNamePrefix(value); } else { setAttributeNamePrefix(""); setWordsToKeep(Integer.valueOf(value).intValue()); } else { setWordsToKeep(1000); setPeriodicPruning(Integer.parseInt(value)); } else { setPeriodicPruning(-1); setMinTermFreq(Integer.valueOf(value).intValue()); } else { setMinTermFreq(1); setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options));
/**
 * Checks that a finalized builder can report its vectorized output format
 * and that the format contains the expected number of attributes.
 */
public void testGetVectorizedStructureNoClass() throws Exception {
  Instances data1 = getData1();
  Instances structure = new Instances(data1, 0);

  DictionaryBuilder builder = new DictionaryBuilder();
  builder.setMinTermFreq(2);
  builder.setup(structure);
  for (int i = 0; i < data1.numInstances(); i++) {
    builder.processInstance(data1.instance(i));
  }
  builder.finalizeDictionary();

  Instances format = builder.getVectorizedFormat();
  // assertNotNull is the idiomatic null check and yields a clearer
  // failure message than assertTrue on a null comparison
  assertNotNull(format);
  assertEquals(2, format.numAttributes());
}
setWriteMode(WRITE); m_dictionaryBuilder.reset(); try { m_dictionaryBuilder.setup(getInstances()); } catch (Exception ex) { throw new IOException(ex); m_dictionaryBuilder.processInstance(getInstances().instance(i)); m_dictionaryBuilder.finalizeDictionary(); } catch (Exception ex) { throw new IOException(ex); m_dictionaryBuilder.saveDictionary(System.out); setWriteMode(WAIT); return; m_dictionaryBuilder.saveDictionary(m_binaryStream); } else { m_dictionaryBuilder.saveDictionary(getWriter());
m_dictionaryBuilder.setPeriodicPruning(pruneRate); m_dictionaryBuilder.processInstance(toProcess); m_dictionaryBuilder.finalizeDictionary(); setOutputFormat(m_dictionaryBuilder.getVectorizedFormat()); m_dictionaryBuilder.setNormalize(m_filterType != FILTER_NONE); Instances converted = m_dictionaryBuilder.vectorizeBatch( getInputFormat(), m_filterType != FILTER_NONE); m_dictionaryBuilder.saveDictionary(m_dictionaryFile, !m_dictionaryIsBinary);
/**
 * Builds two dictionaries on separate datasets, aggregates one into the
 * other, and verifies the size of the consolidated dictionary.
 */
public void testAggregateDictionaries() throws Exception {
  Instances firstData = getData1();
  Instances secondData = getData4();

  // first builder over dataset one
  DictionaryBuilder first = new DictionaryBuilder();
  first.setMinTermFreq(1);
  first.setup(new Instances(firstData, 0));
  for (int j = 0; j < firstData.numInstances(); j++) {
    first.processInstance(firstData.instance(j));
  }

  // second builder over dataset four
  DictionaryBuilder second = new DictionaryBuilder();
  second.setMinTermFreq(1);
  second.setup(new Instances(secondData, 0));
  for (int j = 0; j < secondData.numInstances(); j++) {
    second.processInstance(secondData.instance(j));
  }

  // merge the second builder's counts into the first
  first = first.aggregate(second);
  first.finalizeAggregation();

  Map<String, int[]> consolidated = first.finalizeDictionary();
  assertEquals(17, consolidated.size());
}
/**
 * Set whether output instances should contain actual word counts rather
 * than a 0/1 indicator of word presence.
 *
 * @param outputWordCounts true if word counts should be output.
 */
public void setOutputWordCounts(boolean outputWordCounts) {
  // delegate to the underlying dictionary builder
  m_dictionaryBuilder.setOutputWordCounts(outputWordCounts);
}
/**
 * Set the minimum term frequency — terms occurring fewer times than this
 * threshold are candidates for pruning from the dictionary.
 *
 * @param newMinTermFreq The new MinTermFreq value.
 */
public void setMinTermFreq(int newMinTermFreq) {
  // delegate to the underlying dictionary builder
  m_dictionaryBuilder.setMinTermFreq(newMinTermFreq);
}
/**
 * Load a dictionary from the named file.
 *
 * @param filename the file to load from
 * @param plainText true if the dictionary is in text format
 * @throws IOException if a problem occurs
 */
public void loadDictionary(String filename, boolean plainText)
  throws IOException {
  File dictFile = new File(filename);
  loadDictionary(dictFile, plainText);
}
/**
 * Save the dictionary to the named file.
 *
 * @param filename the file to save to
 * @param plainText true if the dictionary should be saved in text format
 * @throws IOException if a problem occurs
 */
public void saveDictionary(String filename, boolean plainText)
  throws IOException {
  File dictFile = new File(filename);
  saveDictionary(dictFile, plainText);
}
m_outputFormat = getVectorizedFormat();
/**
 * Set whether the word frequencies for a document (instance) should be
 * normalized to the average length of the documents seen during dictionary
 * construction.
 *
 * @param normalize true to normalize word frequencies.
 */
@OptionMetadata(displayName = "Normalize word frequencies",
  description = "Whether to normalize to average length of documents seen "
    + "during dictionary construction", commandLineParamName = "N",
  commandLineParamSynopsis = "-N", commandLineParamIsFlag = true,
  displayOrder = 9)
public void setNormalizeDocLength(boolean normalize) {
  // delegate to the underlying vectorizer
  m_vectorizer.setNormalize(normalize);
}
public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testSaveLoadDictionaryPlainTextNoNormalize() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); StringWriter sw = new StringWriter(); builder.saveDictionary(sw); StringReader sr = new StringReader(sw.toString()); DictionaryBuilder builder2 = new DictionaryBuilder(); builder2.setup(structure); builder2.loadDictionary(sr); // just returns the loaded dictionary Map<String, int[]> consolidated = builder2.finalizeDictionary(); assertEquals(2, consolidated.size()); }