/**
 * Completes the aggregation phase by consolidating the accumulated
 * dictionary entries.
 *
 * @throws Exception if dictionary finalization fails
 */
@Override
public void finalizeAggregation() throws Exception {
  finalizeDictionary();
}
@Override public DictionaryBuilder aggregate(DictionaryBuilder toAgg) throws Exception { Map<String, int[]>[] toAggDicts = toAgg.getDictionaries(false); if (toAggDicts.length != m_dictsPerClass.length) { throw new Exception("Number of dictionaries from the builder to " + "be aggregated does not match our number of dictionaries"); } // we assume that the order of class values is consistent for (int i = 0; i < toAggDicts.length; i++) { Map<String, int[]> toAggDictForClass = toAggDicts[i]; for (Map.Entry<String, int[]> e : toAggDictForClass.entrySet()) { int[] ourCounts = m_dictsPerClass[i].get(e.getKey()); if (ourCounts == null) { ourCounts = new int[2]; m_dictsPerClass[i].put(e.getKey(), ourCounts); } ourCounts[0] += e.getValue()[0]; // word count ourCounts[1] += e.getValue()[1]; // doc count } } m_count += toAgg.m_count; m_docLengthSum += toAgg.m_docLengthSum; return this; }
throws Exception { if (m_vectorizer.readyToVectorize() && inputFormat.equalHeaders(m_vectorizer.getInputFormat())) { return m_vectorizer.getVectorizedFormat(); m_vectorizer.reset(); m_vectorizer.setup(inputFormat); m_vectorizer.loadDictionary(m_dictionarySource); } else if (m_textDictionarySource != null) { m_vectorizer.loadDictionary(m_textDictionarySource); } else { + "' does not seem to exist!"); m_vectorizer.loadDictionary(dictFile, !m_dictionaryIsBinary); return m_vectorizer.getVectorizedFormat();
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input instance
 *          structure (any instances contained in the object are ignored -
 *          only the structure is required).
 * @return true if the outputFormat may be collected immediately
 * @throws Exception if the input format can't be set successfully
 */
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  super.setInputFormat(instanceInfo);

  // start from a clean slate before (re)configuring the builder
  m_dictionaryBuilder.reset();
  // emit dictionary entries in sorted order; no normalization is applied
  // while the dictionary itself is being constructed
  m_dictionaryBuilder.setSortDictionary(true);
  m_dictionaryBuilder.setNormalize(false);
  m_dictionaryBuilder.setup(instanceInfo);

  // the output format is not known until the dictionary has been built,
  // so it cannot be collected immediately
  return false;
}
public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testSaveLoadDictionaryPlainTextNoNormalize() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); StringWriter sw = new StringWriter(); builder.saveDictionary(sw); StringReader sr = new StringReader(sw.toString()); DictionaryBuilder builder2 = new DictionaryBuilder(); builder2.setup(structure); builder2.loadDictionary(sr); // just returns the loaded dictionary Map<String, int[]> consolidated = builder2.finalizeDictionary(); assertEquals(2, consolidated.size()); }
public void testVectorizeInstanceWordCountsNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setOutputWordCounts(true); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // "the" occurs twice in the first index and "over" once assertEquals(2, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
String value = Utils.getOption('R', options); if (value.length() != 0) { setSelectedRange(value); } else { setSelectedRange("first-last"); setInvertSelection(Utils.getFlag('V', options)); setAttributeNamePrefix(value); } else { setAttributeNamePrefix(""); setWordsToKeep(Integer.valueOf(value).intValue()); } else { setWordsToKeep(1000); setPeriodicPruning(Integer.parseInt(value)); } else { setPeriodicPruning(-1); setMinTermFreq(Integer.valueOf(value).intValue()); } else { setMinTermFreq(1); setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options));
/**
 * Checks that a finalized builder can report its vectorized output format
 * and that the format contains the expected number of attributes.
 */
public void testGetVectorizedStructureNoClass() throws Exception {
  Instances data1 = getData1();
  Instances structure = new Instances(data1, 0);

  DictionaryBuilder builder = new DictionaryBuilder();
  builder.setMinTermFreq(2);
  builder.setup(structure);
  for (int i = 0; i < data1.numInstances(); i++) {
    builder.processInstance(data1.instance(i));
  }
  builder.finalizeDictionary();

  Instances format = builder.getVectorizedFormat();
  // assertNotNull is the idiomatic null check and yields a clearer
  // failure message than assertTrue on a null comparison
  assertNotNull(format);
  assertEquals(2, format.numAttributes());
}
setWriteMode(WRITE); m_dictionaryBuilder.reset(); try { m_dictionaryBuilder.setup(getInstances()); } catch (Exception ex) { throw new IOException(ex); m_dictionaryBuilder.processInstance(getInstances().instance(i)); m_dictionaryBuilder.finalizeDictionary(); } catch (Exception ex) { throw new IOException(ex); m_dictionaryBuilder.saveDictionary(System.out); setWriteMode(WAIT); return; m_dictionaryBuilder.saveDictionary(m_binaryStream); } else { m_dictionaryBuilder.saveDictionary(getWriter());
m_dictionaryBuilder.setPeriodicPruning(pruneRate); m_dictionaryBuilder.processInstance(toProcess); m_dictionaryBuilder.finalizeDictionary(); setOutputFormat(m_dictionaryBuilder.getVectorizedFormat()); m_dictionaryBuilder.setNormalize(m_filterType != FILTER_NONE); Instances converted = m_dictionaryBuilder.vectorizeBatch( getInputFormat(), m_filterType != FILTER_NONE); m_dictionaryBuilder.saveDictionary(m_dictionaryFile, !m_dictionaryIsBinary);
/**
 * Builds two dictionaries on separate datasets, aggregates one into the
 * other, and verifies the size of the consolidated dictionary.
 */
public void testAggregateDictionaries() throws Exception {
  Instances firstData = getData1();
  Instances secondData = getData4();

  // first builder over dataset one
  DictionaryBuilder first = new DictionaryBuilder();
  first.setMinTermFreq(1);
  first.setup(new Instances(firstData, 0));
  for (int j = 0; j < firstData.numInstances(); j++) {
    first.processInstance(firstData.instance(j));
  }

  // second builder over dataset four
  DictionaryBuilder second = new DictionaryBuilder();
  second.setMinTermFreq(1);
  second.setup(new Instances(secondData, 0));
  for (int j = 0; j < secondData.numInstances(); j++) {
    second.processInstance(secondData.instance(j));
  }

  // merge the second builder's counts into the first
  first = first.aggregate(second);
  first.finalizeAggregation();

  Map<String, int[]> consolidated = first.finalizeDictionary();
  assertEquals(17, consolidated.size());
}
/**
 * Set whether output instances should contain actual word counts rather
 * than a 0/1 indicator of word presence.
 *
 * @param outputWordCounts true if word counts should be output.
 */
public void setOutputWordCounts(boolean outputWordCounts) {
  // delegate to the underlying dictionary builder
  m_dictionaryBuilder.setOutputWordCounts(outputWordCounts);
}
/**
 * Set the minimum term frequency — terms occurring fewer times than this
 * threshold are candidates for pruning from the dictionary.
 *
 * @param newMinTermFreq The new MinTermFreq value.
 */
public void setMinTermFreq(int newMinTermFreq) {
  // delegate to the underlying dictionary builder
  m_dictionaryBuilder.setMinTermFreq(newMinTermFreq);
}
/**
 * Load a dictionary from the named file.
 *
 * @param filename the file to load from
 * @param plainText true if the dictionary is in text format
 * @throws IOException if a problem occurs
 */
public void loadDictionary(String filename, boolean plainText)
  throws IOException {
  File dictFile = new File(filename);
  loadDictionary(dictFile, plainText);
}
/**
 * Save the dictionary to the named file.
 *
 * @param filename the file to save to
 * @param plainText true if the dictionary should be saved in text format
 * @throws IOException if a problem occurs
 */
public void saveDictionary(String filename, boolean plainText)
  throws IOException {
  File dictFile = new File(filename);
  saveDictionary(dictFile, plainText);
}
m_outputFormat = getVectorizedFormat();
/**
 * Set whether the word frequencies for a document (instance) should be
 * normalized to the average length of the documents seen during dictionary
 * construction.
 *
 * @param normalize true to normalize word frequencies.
 */
@OptionMetadata(displayName = "Normalize word frequencies",
  description = "Whether to normalize to average length of documents seen "
    + "during dictionary construction", commandLineParamName = "N",
  commandLineParamSynopsis = "-N", commandLineParamIsFlag = true,
  displayOrder = 9)
public void setNormalizeDocLength(boolean normalize) {
  // delegate to the underlying vectorizer
  m_vectorizer.setNormalize(normalize);
}
public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testSaveLoadDictionaryPlainTextNoNormalize() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); StringWriter sw = new StringWriter(); builder.saveDictionary(sw); StringReader sr = new StringReader(sw.toString()); DictionaryBuilder builder2 = new DictionaryBuilder(); builder2.setup(structure); builder2.loadDictionary(sr); // just returns the loaded dictionary Map<String, int[]> consolidated = builder2.finalizeDictionary(); assertEquals(2, consolidated.size()); }