public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testPruneMinFreq() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(1); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // min freq of 1 should keep all terms assertEquals(15, consolidated.size()); }
public void testFinalizeDictionaryNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary assertEquals(2, consolidated.size()); }
public void testPruneMinFreq() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(1); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // min freq of 1 should keep all terms assertEquals(15, consolidated.size()); }
public void testFinalizeDictionaryNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary assertEquals(2, consolidated.size()); }
public void testTypicalNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testTypicalNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testFinalizeDictionaryNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary // according to the default min freq of 2 assertEquals(2, consolidated.size()); }
public void testTypicalNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testFinalizeDictionaryNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary // according to the default min freq of 2 assertEquals(2, consolidated.size()); }
public void testTypicalNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testGetVectorizedStructureNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instances format = builder.getVectorizedFormat(); assertTrue(format != null); assertEquals(4, format.numAttributes()); }
public void testGetVectorizedStructureNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instances format = builder.getVectorizedFormat(); assertTrue(format != null); assertEquals(4, format.numAttributes()); }
public void testGetVectorizedStructureNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instances format = builder.getVectorizedFormat(); assertTrue(format != null); assertEquals(2, format.numAttributes()); }
public void testGetVectorizedStructureNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instances format = builder.getVectorizedFormat(); assertTrue(format != null); assertEquals(2, format.numAttributes()); }
public void testVectorizeInstanceWordPresenceNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // values of the two attributes should be 1 (presence indicators) assertEquals(1, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
public void testVectorizeInstanceWordPresenceNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // values of the two attributes should be 1 (presence indicators) assertEquals(1, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
public void testVectorizeInstanceWordCountsNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setOutputWordCounts(true); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // "the" occurs twice in the first index and "over" once assertEquals(2, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
public void testVectorizeInstanceWordCountsNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setOutputWordCounts(true); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // "the" occurs twice in the first index and "over" once assertEquals(2, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }