/**
 * Load a stoplist from a file.
 *
 * @param stoplistFile   the file containing whitespace-separated stopwords
 * @param encoding       the character encoding of the stoplist file (eg UTF-8)
 * @param includeDefault whether to also include the standard MALLET English stoplist
 * @param caseSensitive  whether stopword matching is case sensitive
 * @param markDeletions  whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault, boolean caseSensitive, boolean markDeletions) {
    // Seed with the default English list only when requested; otherwise start empty.
    if (includeDefault) {
        stoplist = newDefaultStopList();
    } else {
        stoplist = new HashSet<String>();
    }
    // Null guard for consistency with addStopWords(File), which tolerates a null file.
    if (stoplistFile != null) {
        addStopWords(fileToStringArray(stoplistFile, encoding));
    }
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
/** Remove the whitespace-separated tokens in file "wordlist" from the stoplist. */
public TokenSequenceRemoveStopwords removeStopWords (File wordlist) {
    // Null guard mirrors addStopWords(File): a null file is a no-op.
    if (wordlist != null)
        this.removeStopWords (fileToStringArray(wordlist, null));
    return this;
}
/** Add the whitespace-separated tokens found in file "wordlist" to the stoplist. */
public TokenSequenceRemoveStopwords addStopWords (File wordlist) {
    if (wordlist == null) {
        return this; // nothing to load
    }
    this.addStopWords (fileToStringArray(wordlist, null));
    return this;
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
private Pipe buildPipe() { Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S"); int[] sizes = {1,2}; ArrayList pipeList = new ArrayList(); pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list pipeList.add(new TokenSequenceNGramsDelim(sizes, " ")); pipeList.add(new TokenSequence2FeatureSequence()); return new SerialPipes(pipeList); }
/**
 * Create a stopword filter backed by the default English stoplist.
 *
 * @param caseSensitive whether stopword matching is case sensitive
 * @param markDeletions whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords (boolean caseSensitive, boolean markDeletions) {
    this.stoplist = newDefaultStopList();
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
/**
 * Run the MALLET preprocessing pipeline over the given token sequences:
 * stopword removal followed by conversion to feature sequences.
 *
 * @param data the token sequences to preprocess
 * @return an InstanceList holding one instance per input sequence
 */
public InstanceList malletPreprocess(List<TokenSequence> data) {
    ArrayList<Pipe> pipes = new ArrayList<>();
    pipes.add(new TokenSequenceRemoveStopwords(false, false));
    pipes.add(new TokenSequence2FeatureSequence());
    InstanceList instances = new InstanceList(new SerialPipes(pipes));
    instances.addThruPipe(new ArrayIterator(data));
    return instances;
}
/** Add whitespace-separated tokens from file "wordlist" to the stoplist; a null file is a no-op. */
public TokenSequenceRemoveStopwords addStopWords (File wordlist) {
    if (wordlist != null) {
        addStopWords(fileToStringArray(wordlist, null));
    }
    return this;
}
/**
 * Construct a filter that uses the standard MALLET English stoplist.
 *
 * @param caseSensitive whether stopword matching is case sensitive
 * @param markDeletions whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords (boolean caseSensitive, boolean markDeletions) {
    stoplist = newDefaultStopList();
    this.markDeletions = markDeletions;
    this.caseSensitive = caseSensitive;
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
(Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false), (Pipe) new TokenSequence2FeatureSequence() });
/**
 * Load a stoplist from a file.
 *
 * @param stoplistFile   the file containing whitespace-separated stopwords
 * @param encoding       the character encoding of the stoplist file (eg UTF-8)
 * @param includeDefault whether to also include the standard MALLET English stoplist
 * @param caseSensitive  whether stopword matching is case sensitive
 * @param markDeletions  whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault, boolean caseSensitive, boolean markDeletions) {
    // Seed with the default English list only when requested; otherwise start empty.
    if (includeDefault) {
        stoplist = newDefaultStopList();
    } else {
        stoplist = new HashSet<String>();
    }
    // Null guard for consistency with addStopWords(File), which tolerates a null file.
    if (stoplistFile != null) {
        addStopWords(fileToStringArray(stoplistFile, encoding));
    }
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
/** Remove the whitespace-separated tokens in file "wordlist" from the stoplist. */
public TokenSequenceRemoveStopwords removeStopWords (File wordlist) {
    // Null guard mirrors addStopWords(File): a null file is a no-op.
    if (wordlist != null)
        this.removeStopWords (fileToStringArray(wordlist, null));
    return this;
}
/** Append the whitespace-separated tokens of file "wordlist" to the stoplist (null file is ignored). */
public TokenSequenceRemoveStopwords addStopWords (File wordlist) {
    if (wordlist == null) {
        return this;
    }
    this.addStopWords (fileToStringArray(wordlist, null));
    return this;
}
/**
 * Create a filter using the default English stoplist.
 *
 * @param caseSensitive whether stopword matching is case sensitive
 */
public TokenSequenceRemoveStopwords (boolean caseSensitive) {
    this.caseSensitive = caseSensitive;
    stoplist = newDefaultStopList();
}
new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, // don't include default list stopwordFilter.addStopWords(extraStopwordsFile.value); new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value); stopwordFilter.addStopWords(extraStopwordsFile.value);
(Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false), (Pipe) new TokenSequence2FeatureSequence() });
/**
 * Load a stoplist from a file.
 *
 * @param stoplistFile   the file containing whitespace-separated stopwords
 * @param encoding       the character encoding of the stoplist file (eg UTF-8)
 * @param includeDefault whether to also include the standard MALLET English stoplist
 * @param caseSensitive  whether stopword matching is case sensitive
 * @param markDeletions  whether removed tokens are marked rather than silently dropped
 */
public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault, boolean caseSensitive, boolean markDeletions) {
    // Seed with the default English list only when requested; otherwise start empty.
    if (includeDefault) {
        stoplist = newDefaultStopList();
    } else {
        stoplist = new HashSet<String>();
    }
    // Null guard for consistency with addStopWords(File), which tolerates a null file.
    if (stoplistFile != null) {
        addStopWords(fileToStringArray(stoplistFile, encoding));
    }
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
}
/** Remove the whitespace-separated tokens in file "wordlist" from the stoplist. */
public TokenSequenceRemoveStopwords removeStopWords (File wordlist) {
    // Null guard mirrors addStopWords(File): a null file is a no-op.
    if (wordlist != null)
        this.removeStopWords (fileToStringArray(wordlist, null));
    return this;
}