/**
 * Load a stoplist from a file.
 *
 * @param stoplistFile   the file containing whitespace-separated stopwords
 * @param encoding       the character encoding of the stoplist file (eg UTF-8)
 * @param includeDefault whether to also include the standard MALLET English stoplist
 * @param caseSensitive  whether stopword matching is case sensitive
 * @param markDeletions  whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault, boolean caseSensitive, boolean markDeletions) {
    // Seed with the default English list only when requested; otherwise start empty.
    if (includeDefault) {
        stoplist = newDefaultStopList();
    } else {
        stoplist = new HashSet<String>();
    }
    // Null guard for consistency with addStopWords(File), which tolerates a null file.
    if (stoplistFile != null) {
        addStopWords(fileToStringArray(stoplistFile, encoding));
    }
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
/** Remove the whitespace-separated tokens in file "wordlist" from the stoplist. */
public TokenSequenceRemoveStopwords removeStopWords (File wordlist) {
    // Null guard mirrors addStopWords(File): a null file is a no-op.
    if (wordlist != null)
        this.removeStopWords (fileToStringArray(wordlist, null));
    return this;
}
/** Add the whitespace-separated tokens found in file "wordlist" to the stoplist. */
public TokenSequenceRemoveStopwords addStopWords (File wordlist) {
    if (wordlist == null) {
        return this; // nothing to load
    }
    this.addStopWords (fileToStringArray(wordlist, null));
    return this;
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
private Pipe buildPipe() { Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S"); int[] sizes = {1,2}; ArrayList pipeList = new ArrayList(); pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list pipeList.add(new TokenSequenceNGramsDelim(sizes, " ")); pipeList.add(new TokenSequence2FeatureSequence()); return new SerialPipes(pipeList); }
/**
 * Create a stopword filter backed by the default English stoplist.
 *
 * @param caseSensitive whether stopword matching is case sensitive
 * @param markDeletions whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords (boolean caseSensitive, boolean markDeletions) {
    this.stoplist = newDefaultStopList();
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
/**
 * Run the MALLET preprocessing pipeline over the given token sequences:
 * stopword removal followed by conversion to feature sequences.
 *
 * @param data the token sequences to preprocess
 * @return an InstanceList holding one instance per input sequence
 */
public InstanceList malletPreprocess(List<TokenSequence> data) {
    ArrayList<Pipe> pipes = new ArrayList<>();
    pipes.add(new TokenSequenceRemoveStopwords(false, false));
    pipes.add(new TokenSequence2FeatureSequence());
    InstanceList instances = new InstanceList(new SerialPipes(pipes));
    instances.addThruPipe(new ArrayIterator(data));
    return instances;
}
/** Add whitespace-separated tokens from file "wordlist" to the stoplist; a null file is a no-op. */
public TokenSequenceRemoveStopwords addStopWords (File wordlist) {
    if (wordlist != null) {
        addStopWords(fileToStringArray(wordlist, null));
    }
    return this;
}
/**
 * Construct a filter that uses the standard MALLET English stoplist.
 *
 * @param caseSensitive whether stopword matching is case sensitive
 * @param markDeletions whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords (boolean caseSensitive, boolean markDeletions) {
    stoplist = newDefaultStopList();
    this.markDeletions = markDeletions;
    this.caseSensitive = caseSensitive;
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
(Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false), (Pipe) new TokenSequence2FeatureSequence() });
/**
 * Load a stoplist from a file.
 *
 * @param stoplistFile   the file containing whitespace-separated stopwords
 * @param encoding       the character encoding of the stoplist file (eg UTF-8)
 * @param includeDefault whether to also include the standard MALLET English stoplist
 * @param caseSensitive  whether stopword matching is case sensitive
 * @param markDeletions  whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault, boolean caseSensitive, boolean markDeletions) {
    // Seed with the default English list only when requested; otherwise start empty.
    if (includeDefault) {
        stoplist = newDefaultStopList();
    } else {
        stoplist = new HashSet<String>();
    }
    // Null guard for consistency with addStopWords(File), which tolerates a null file.
    if (stoplistFile != null) {
        addStopWords(fileToStringArray(stoplistFile, encoding));
    }
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
/** Remove the whitespace-separated tokens in file "wordlist" from the stoplist. */
public TokenSequenceRemoveStopwords removeStopWords (File wordlist) {
    // Null guard mirrors addStopWords(File): a null file is a no-op.
    if (wordlist != null)
        this.removeStopWords (fileToStringArray(wordlist, null));
    return this;
}
/** Append the whitespace-separated tokens of file "wordlist" to the stoplist (null file is ignored). */
public TokenSequenceRemoveStopwords addStopWords (File wordlist) {
    if (wordlist == null) {
        return this;
    }
    this.addStopWords (fileToStringArray(wordlist, null));
    return this;
}
/**
 * Create a filter using the default English stoplist.
 *
 * @param caseSensitive whether stopword matching is case sensitive
 */
public TokenSequenceRemoveStopwords (boolean caseSensitive) {
    this.caseSensitive = caseSensitive;
    stoplist = newDefaultStopList();
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
(Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false), (Pipe) new TokenSequence2FeatureSequence() });
/**
 * Load a stoplist from a file.
 *
 * @param stoplistFile   the file containing whitespace-separated stopwords
 * @param encoding       the character encoding of the stoplist file (eg UTF-8)
 * @param includeDefault whether to also include the standard MALLET English stoplist
 * @param caseSensitive  whether stopword matching is case sensitive
 * @param markDeletions  whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault, boolean caseSensitive, boolean markDeletions) {
    // Seed with the default English list only when requested; otherwise start empty.
    if (includeDefault) {
        stoplist = newDefaultStopList();
    } else {
        stoplist = new HashSet<String>();
    }
    // Null guard for consistency with addStopWords(File), which tolerates a null file.
    if (stoplistFile != null) {
        addStopWords(fileToStringArray(stoplistFile, encoding));
    }
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
/** Remove the whitespace-separated tokens in file "wordlist" from the stoplist. */
public TokenSequenceRemoveStopwords removeStopWords (File wordlist) {
    // Null guard mirrors addStopWords(File): a null file is a no-op.
    if (wordlist != null)
        this.removeStopWords (fileToStringArray(wordlist, null));
    return this;
}