// Stemmer used by the word-vector filter below. Weka's Snowball stemmer names
// are lowercase ("english", "porter", ...); the capitalised "English" is not a
// recognised name, so the lowercase form is used here.
SnowballStemmer stemmer = new SnowballStemmer();
stemmer.setStemmer("english");

// Word-vector filter constructed with 1000 (presumably the number of words to
// keep -- confirm against the constructor's documentation).
StringToWordVector STWfilter = new StringToWordVector(1000);
STWfilter.setUseStoplist(true);
STWfilter.setIDFTransform(true);
STWfilter.setTFTransform(true);
STWfilter.setNormalizeDocLength(
    new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));
STWfilter.setOutputWordCounts(true);
STWfilter.setStemmer(stemmer);

// The input format is set last, after every option has been configured.
STWfilter.setInputFormat(train);
// Default-constructed filter with the words-to-keep limit raised to the
// largest int value.
StringToWordVector filter = new StringToWordVector();
filter.setWordsToKeep(Integer.MAX_VALUE);
/** * Input an instance for filtering. Filter requires all training instances be * read before producing output. * * @param instance the input instance. * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException if no input structure has been defined. */ @Override public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (isFirstBatchDone()) { Instance inst = m_dictionaryBuilder.vectorizeInstance(instance); push(inst, false); // No need to copy return true; } else { bufferInput(instance); return false; } }
/**
 * Command-line entry point used for testing this class.
 *
 * @param argv should contain arguments to the filter: use -h for help
 */
public static void main(String[] argv) {
  StringToWordVector filter = new StringToWordVector();
  runFilter(filter, argv);
}
}
// Default-configured word-vector filter, initialised with the structure of
// 'data'.
StringToWordVector stwv = new StringToWordVector();
stwv.setInputFormat(data);

// Run the filter over 'data'; a failure is printed and 'fsource' is left
// unassigned by this snippet.
try {
  fsource = Filter.useFilter(data, stwv);
} catch (Exception e) {
  e.printStackTrace();
}
setSelectedRange(value); } else { setSelectedRange("first-last"); setInvertSelection(Utils.getFlag('V', options)); setAttributeNamePrefix(value); } else { setAttributeNamePrefix(""); setWordsToKeep(Integer.valueOf(value).intValue()); } else { setWordsToKeep(1000); setPeriodicPruning(Double.parseDouble(value)); } else { setPeriodicPruning(-1); setMinTermFreq(Integer.valueOf(value).intValue()); } else { setMinTermFreq(1); setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options)); setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));
/**
 * Signals that the current batch of input is complete.
 *
 * From the visible statements: throws if no input format is set; when the
 * first batch is not yet done, feeds the instances of the input format to
 * m_dictionaryBuilder, sets the output format from the builder's vectorized
 * format, vectorizes the batch and pushes the converted instances; then
 * flushes the buffered input.
 *
 * NOTE(review): this excerpt has elided lines (unbalanced braces and a
 * dangling "* getInputFormat().numInstances());" expression), so the exact
 * nesting of the statements below cannot be confirmed from here.
 *
 * @return true if numPendingOutput() is non-zero.
 * @throws IllegalStateException if getInputFormat() returns null.
 */
public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); if (!isFirstBatchDone()) { * getInputFormat().numInstances()); m_dictionaryBuilder.setPeriodicPruning(pruneRate); for (int i = 0; i < getInputFormat().numInstances(); i++) { Instance toProcess = getInputFormat().instance(i); m_dictionaryBuilder.processInstance(toProcess); setOutputFormat(m_dictionaryBuilder.getVectorizedFormat()); Instances converted = m_dictionaryBuilder.vectorizeBatch( getInputFormat(), m_filterType != FILTER_NONE); push(converted.instance(i), false); flushInput(); return (numPendingOutput() != 0);
/**
 * Round-trips the options of a default StringToWordVector through
 * joinOptions and splitOptions and checks that they come back unchanged.
 *
 * @see Utils#splitOptions(String)
 * @see Utils#joinOptions(String[])
 */
public void testSplittingAndJoining() {
  try {
    String[] original = new StringToWordVector().getOptions();
    String[] reparsed = Utils.splitOptions(Utils.joinOptions(original));

    assertEquals("Same number of options", original.length, reparsed.length);

    // Compare position by position; the failure message is 1-based.
    for (int pos = 0; pos < original.length; pos++) {
      if (original[pos].equals(reparsed[pos])) {
        continue;
      }
      fail("Option " + (pos + 1) + " differs");
    }
  } catch (Exception e) {
    fail("Exception: " + e);
  }
}
/**
 * Creates an example StringToWordVector built with its no-argument
 * constructor.
 */
public Filter getFilter() {
  return new StringToWordVector();
}
// Snowball stemmer selected by the name "english".
SnowballStemmer stemmer = new SnowballStemmer();
stemmer.setStemmer("english");

// Word-vector filter that uses the stemmer configured above.
StringToWordVector filter = new StringToWordVector();
filter.setStemmer(stemmer);
// Word-vector filter whose stopwords are set from the file "filename".
StringToWordVector filter = new StringToWordVector();
filter.setStopwords(new File("filename"));
public void testWordsToKeep() { ((StringToWordVector)m_Filter).setWordsToKeep(3); Instances result = useFilter(); // Number of instances shouldn't change assertEquals(m_Instances.numInstances(), result.numInstances()); // Number of attributes will be minus 2 string attributes plus // the word attributes (aiming for 3 -- could be higher in the case of ties) assertEquals(m_Instances.numAttributes() - 2 + 3, result.numAttributes()); }
setSelectedRange(value); } else { setSelectedRange("first-last"); setInvertSelection(Utils.getFlag('V', options)); setAttributeNamePrefix(value); } else { setAttributeNamePrefix(""); setWordsToKeep(Integer.valueOf(value).intValue()); } else { setWordsToKeep(1000); setPeriodicPruning(Double.parseDouble(value)); } else { setPeriodicPruning(-1); setMinTermFreq(Integer.valueOf(value).intValue()); } else { setMinTermFreq(1); setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options)); setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));
/**
 * Signals that the current batch of input is complete.
 *
 * From the visible statements: throws if no input format is set; when the
 * first batch is not yet done, feeds the instances of the input format to
 * m_dictionaryBuilder, sets the output format from the builder's vectorized
 * format, vectorizes the batch and pushes the converted instances; then
 * flushes the buffered input.
 *
 * NOTE(review): this excerpt has elided lines (unbalanced braces and a
 * dangling "* getInputFormat().numInstances());" expression), so the exact
 * nesting of the statements below cannot be confirmed from here.
 *
 * @return true if numPendingOutput() is non-zero.
 * @throws IllegalStateException if getInputFormat() returns null.
 */
public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); if (!isFirstBatchDone()) { * getInputFormat().numInstances()); m_dictionaryBuilder.setPeriodicPruning(pruneRate); for (int i = 0; i < getInputFormat().numInstances(); i++) { Instance toProcess = getInputFormat().instance(i); m_dictionaryBuilder.processInstance(toProcess); setOutputFormat(m_dictionaryBuilder.getVectorizedFormat()); Instances converted = m_dictionaryBuilder.vectorizeBatch( getInputFormat(), m_filterType != FILTER_NONE); push(converted.instance(i), false); flushInput(); return (numPendingOutput() != 0);
// Apply a default-configured word-vector filter to 'htmlInst'. On any
// failure the message is written to stderr and the enclosing method returns.
try {
  StringToWordVector filter = new StringToWordVector();
  filter.setInputFormat(htmlInst);
  Instances dataFiltered = Filter.useFilter(htmlInst, filter);
} catch (Exception e) {
  System.err.println("Exception caught during formatting: " + e.getMessage());
  return;
}
/**
 * Round-trips the options of a default StringToWordVector through
 * joinOptions and splitOptions and checks that they come back unchanged.
 *
 * @see Utils#splitOptions(String)
 * @see Utils#joinOptions(String[])
 */
public void testSplittingAndJoining() {
  try {
    String[] original = new StringToWordVector().getOptions();
    String[] reparsed = Utils.splitOptions(Utils.joinOptions(original));

    assertEquals("Same number of options", original.length, reparsed.length);

    // Compare position by position; the failure message is 1-based.
    for (int pos = 0; pos < original.length; pos++) {
      if (original[pos].equals(reparsed[pos])) {
        continue;
      }
      fail("Option " + (pos + 1) + " differs");
    }
  } catch (Exception e) {
    fail("Exception: " + e);
  }
}
/**
 * Creates an example StringToWordVector built with its no-argument
 * constructor.
 */
public Filter getFilter() {
  return new StringToWordVector();
}
/**
 * Command-line entry point used for testing this class.
 *
 * @param argv should contain arguments to the filter: use -h for help
 */
public static void main(String[] argv) {
  StringToWordVector filter = new StringToWordVector();
  runFilter(filter, argv);
}
}
public void testWordsToKeep() { ((StringToWordVector)m_Filter).setWordsToKeep(3); Instances result = useFilter(); // Number of instances shouldn't change assertEquals(m_Instances.numInstances(), result.numInstances()); // Number of attributes will be minus 2 string attributes plus // the word attributes (aiming for 3 -- could be higher in the case of ties) assertEquals(m_Instances.numAttributes() - 2 + 3, result.numAttributes()); }
// Replace 'dataset' with the output of a default-configured word-vector
// filter that was initialised with the dataset's own structure.
StringToWordVector filter = new StringToWordVector();
filter.setInputFormat(dataset);
dataset = Filter.useFilter(dataset, filter);