private Pipe buildPipe() { Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S"); int[] sizes = {1,2}; ArrayList pipeList = new ArrayList(); pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list pipeList.add(new TokenSequenceNGramsDelim(sizes, " ")); pipeList.add(new TokenSequence2FeatureSequence()); return new SerialPipes(pipeList); }
/**
 * Runs already-tokenized documents through a two-stage pipeline and collects the result.
 *
 * <p>The pipeline applies {@code TokenSequenceRemoveStopwords} (two-boolean constructor, both
 * {@code false}) followed by {@code TokenSequence2FeatureSequence}. The input list is wrapped
 * in an {@code ArrayIterator} and fed to {@code InstanceList.addThruPipe}.
 *
 * @param data the token sequences to process, one list element per document
 * @return the {@code InstanceList} the documents were added to
 */
public InstanceList malletPreprocess(List<TokenSequence> data) {
    // Stage order: stopword removal first, then conversion to a feature sequence.
    ArrayList<Pipe> stages = new ArrayList<>();
    stages.add(new TokenSequenceRemoveStopwords(false, false));
    stages.add(new TokenSequence2FeatureSequence());
    SerialPipes pipeline = new SerialPipes(stages);

    InstanceList processed = new InstanceList(pipeline);
    processed.addThruPipe(new ArrayIterator(data));
    return processed;
}
(Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false), (Pipe) new TokenSequence2FeatureSequence() });
(Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false), (Pipe) new TokenSequence2FeatureSequence() });
(Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false), (Pipe) new TokenSequence2FeatureSequence() });
/**
 * Builds a six-stage pipeline and pushes the output of a {@code FileIterator} through it.
 *
 * <p>The method makes no assertions; it only constructs the objects and calls
 * {@code addThruPipe}. The iterator is created for the path {@code "foo/bar"} with a
 * {@code null} second argument and a pattern that captures the text before the first
 * {@code '/'}.
 */
public void testThree () {
    // Stages are listed in the order SerialPipes receives them.
    Pipe[] stages = new Pipe[] {
        new Target2Label (),
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequenceRemoveStopwords (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    SerialPipes pipeline = new SerialPipes (stages);
    InstanceList instances = new InstanceList (pipeline);

    File root = new File("foo/bar");
    // NOTE(review): presumably used by FileIterator to derive each instance's target from
    // its path — confirm against the FileIterator documentation.
    Pattern leadingSegment = Pattern.compile("^([^/]*)/");
    Iterator<Instance> source = new FileIterator (root, null, leadingSegment);

    instances.addThruPipe (source);
}
/**
 * Builds a six-stage pipeline and pushes the output of a {@code FileIterator} through it.
 *
 * <p>The method makes no assertions; it only constructs the objects and calls
 * {@code addThruPipe}. The iterator is created for the path {@code "foo/bar"} with a
 * {@code null} second argument and a pattern that captures the text before the first
 * {@code '/'}.
 */
public void testThree () {
    // Stages are listed in the order SerialPipes receives them.
    Pipe[] stages = new Pipe[] {
        new Target2Label (),
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequenceRemoveStopwords (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    SerialPipes pipeline = new SerialPipes (stages);
    InstanceList instances = new InstanceList (pipeline);

    File root = new File("foo/bar");
    // NOTE(review): presumably used by FileIterator to derive each instance's target from
    // its path — confirm against the FileIterator documentation.
    Pattern leadingSegment = Pattern.compile("^([^/]*)/");
    Iterator<Instance> source = new FileIterator (root, null, leadingSegment);

    instances.addThruPipe (source);
}
// Appends two stages to pipeList: a stopword-removal pipe, then the pipe that converts the
// token sequence to a feature sequence.
// The stopword pipe is constructed from the file "stoplists/en.txt" and the encoding name "UTF-8".
// NOTE(review): the three trailing booleans are presumably includeDefault, caseSensitive and
// markDeletions (all false here) — confirm against the TokenSequenceRemoveStopwords Javadoc.
pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplists/en.txt"), "UTF-8", false, false, false) ); pipeList.add( new TokenSequence2FeatureSequence() );
// Appends two stages to pipeList: a stopword-removal pipe, then the pipe that converts the
// token sequence to a feature sequence.
// The stopword pipe is constructed from the file "stoplists/en.txt" and the encoding name "UTF-8".
// NOTE(review): the three trailing booleans are presumably includeDefault, caseSensitive and
// markDeletions (all false here) — confirm against the TokenSequenceRemoveStopwords Javadoc.
pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplists/en.txt"), "UTF-8", false, false, false) ); pipeList.add( new TokenSequence2FeatureSequence() );
// Appends two stages to pipeList: a stopword-removal pipe, then the pipe that converts the
// token sequence to a feature sequence.
// The stopword pipe is constructed from the file "stoplists/en.txt" and the encoding name "UTF-8".
// NOTE(review): the three trailing booleans are presumably includeDefault, caseSensitive and
// markDeletions (all false here) — confirm against the TokenSequenceRemoveStopwords Javadoc.
pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplists/en.txt"), "UTF-8", false, false, false) ); pipeList.add( new TokenSequence2FeatureSequence() );
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value);
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value);
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value);
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value);
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value);
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value);
new CharSequence2TokenSequence (), // Data String -> TokenSequence new TokenSequenceLowercase (), // TokenSequence words lowercased new TokenSequenceRemoveStopwords (),// Remove stopwords from sequence new TokenSequence2FeatureSequence(),// Replace each Token with a feature index new FeatureSequence2FeatureVector(),// Collapse word order into a "feature vector"
new CharSequence2TokenSequence (), // Data String -> TokenSequence new TokenSequenceLowercase (), // TokenSequence words lowercased new TokenSequenceRemoveStopwords (),// Remove stopwords from sequence new TokenSequence2FeatureSequence(),// Replace each Token with a feature index new FeatureSequence2FeatureVector(),// Collapse word order into a "feature vector"
new CharSequence2TokenSequence (), // Data String -> TokenSequence new TokenSequenceLowercase (), // TokenSequence words lowercased new TokenSequenceRemoveStopwords (),// Remove stopwords from sequence new TokenSequence2FeatureSequence(),// Replace each Token with a feature index new FeatureSequence2FeatureVector(),// Collapse word order into a "feature vector"