static List<Pipe> getPipes() { List<Pipe> pipes = newArrayList(); pipes.add(new Target2Label()); pipes.add(new MyInput2RegexTokens()); // pipes.add(new PrintInputAndTarget()); pipes.add(new TokenSequence2FeatureSequence()); pipes.add(new FeatureSequence2FeatureVector()); return pipes; }
private Pipe buildPipe() { Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S"); int[] sizes = {1,2}; ArrayList pipeList = new ArrayList(); pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list pipeList.add(new TokenSequenceNGramsDelim(sizes, " ")); pipeList.add(new TokenSequence2FeatureSequence()); return new SerialPipes(pipeList); }
/**
 * Constructs a topic model pipe: lowercases the input, tokenizes it into
 * runs of letters (optionally containing internal punctuation), removes the
 * given stopwords, and maps tokens into the given alphabet.
 *
 * @param stopwords words to be removed from the token stream
 * @param alphabet data alphabet to use for the feature sequence
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
/**
 * Constructs a topic model pipe: lowercases the input, tokenizes it into
 * runs of letters (optionally containing internal punctuation), removes the
 * given stopwords, and maps tokens into the given alphabet.
 *
 * @param stopwords words to be removed from the token stream
 * @param alphabet data alphabet to use for the feature sequence
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
public TrainHMM(String trainingFilename, String testingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); pipes.add(new SimpleTaggerSentence2TokenSequence()); pipes.add(new TokenSequence2FeatureSequence()); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true)); HMM hmm = new HMM(pipe, null); hmm.addStatesForLabelsConnectedAsIn(trainingInstances); //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances); HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm); TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training"); TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing"); trainer.train(trainingInstances, 10); trainingEvaluator.evaluate(trainer); testingEvaluator.evaluate(trainer); }
public TrainHMM(String trainingFilename, String testingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); pipes.add(new SimpleTaggerSentence2TokenSequence()); pipes.add(new TokenSequence2FeatureSequence()); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true)); HMM hmm = new HMM(pipe, null); hmm.addStatesForLabelsConnectedAsIn(trainingInstances); //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances); HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm); TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training"); TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing"); trainer.train(trainingInstances, 10); trainingEvaluator.evaluate(trainer); testingEvaluator.evaluate(trainer); }
public TrainHMM(String trainingFilename, String testingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); pipes.add(new SimpleTaggerSentence2TokenSequence()); pipes.add(new TokenSequence2FeatureSequence()); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true)); HMM hmm = new HMM(pipe, null); hmm.addStatesForLabelsConnectedAsIn(trainingInstances); //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances); HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm); TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training"); TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing"); trainer.train(trainingInstances, 10); trainingEvaluator.evaluate(trainer); testingEvaluator.evaluate(trainer); }
/**
 * Runs the given token sequences through a stop-word-removal and
 * feature-sequence pipe and collects the resulting instances.
 *
 * @param data token sequences to preprocess
 * @return the piped instances
 */
public InstanceList malletPreprocess(List<TokenSequence> data) {
    ArrayList<Pipe> stages = new ArrayList<>();
    stages.add(new TokenSequenceRemoveStopwords(false, false));
    stages.add(new TokenSequence2FeatureSequence());
    InstanceList result = new InstanceList(new SerialPipes(stages));
    result.addThruPipe(new ArrayIterator(data));
    return result;
}
/**
 * Constructs a classifier pipe that first applies the given pipe, then
 * lowercases, tokenizes (runs of letters optionally containing internal
 * punctuation), removes the given stopwords, and converts tokens to a
 * feature vector.
 *
 * @param pipe initial pipe applied before the text-processing stages
 * @param stopwords words to be removed from the token stream
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Constructs a classifier pipe that first applies the given pipe, then
 * lowercases, tokenizes (runs of letters optionally containing internal
 * punctuation), removes the given stopwords, and converts tokens to a
 * feature vector.
 *
 * @param pipe initial pipe applied before the text-processing stages
 * @param stopwords words to be removed from the token stream
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Creates the default text pipe: tokenize, lowercase the tokens, convert
 * them to a feature sequence, then to a feature vector.
 *
 * @return the assembled serial pipe
 */
public Pipe createPipe() {
    Pipe[] stages = new Pipe[] {
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    return new SerialPipes(stages);
}
/**
 * Creates the default text pipe: tokenize, lowercase the tokens, convert
 * them to a feature sequence, then to a feature vector.
 *
 * @return the assembled serial pipe
 */
public Pipe createPipe() {
    Pipe[] stages = new Pipe[] {
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    return new SerialPipes(stages);
}
/**
 * Checks that training on a disk-paged instance list yields the same test
 * accuracy (within 0.01) as training on an ordinary in-memory list.
 */
public void testRandomTrained() {
    Pipe[] stages = new Pipe[] {
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector(),
        new Target2Label()
    };
    Pipe pipe = new SerialPipes(stages);
    double inMemoryAccuracy = testRandomTrainedOn(new InstanceList(pipe));
    double pagedAccuracy = testRandomTrainedOn(new PagedInstanceList(pipe, 700, 200, new File(".")));
    assertEquals(inMemoryAccuracy, pagedAccuracy, 0.01);
}
/**
 * Checks that training on a disk-paged instance list yields the same test
 * accuracy (within 0.01) as training on an ordinary in-memory list.
 */
public void testRandomTrained() {
    Pipe[] stages = new Pipe[] {
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector(),
        new Target2Label()
    };
    Pipe pipe = new SerialPipes(stages);
    double inMemoryAccuracy = testRandomTrainedOn(new InstanceList(pipe));
    double pagedAccuracy = testRandomTrainedOn(new PagedInstanceList(pipe, 700, 200, new File(".")));
    assertEquals(inMemoryAccuracy, pagedAccuracy, 0.01);
}
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
/**
 * Smoke test: builds a full text pipeline and feeds it files found under
 * foo/bar, deriving the target label from the leading path component.
 */
public void testThree() {
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequenceRemoveStopwords(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    Iterator<Instance> source =
        new FileIterator(new File("foo/bar"), null, Pattern.compile("^([^/]*)/"));
    instances.addThruPipe(source);
}
/**
 * Smoke test: builds a full text pipeline and feeds it files found under
 * foo/bar, deriving the target label from the leading path component.
 */
public void testThree() {
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequenceRemoveStopwords(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    Iterator<Instance> source =
        new FileIterator(new File("foo/bar"), null, Pattern.compile("^([^/]*)/"));
    instances.addThruPipe(source);
}
/**
 * Trains naive Bayes on two tiny labeled corpora ("africa" and "asia") and
 * checks that a fresh africa-themed sentence is classified as "africa".
 */
public void testStringTrained() {
    String[] africaTraining = {
        "on the plains of africa the lions roar",
        "in swahili ngoma means to dance",
        "nelson mandela became president of south africa",
        "the saraha dessert is expanding"};
    String[] asiaTraining = {
        "panda bears eat bamboo",
        "china's one child policy has resulted in a surplus of boys",
        "tigers live in the jungle"};
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()};
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    instances.addThruPipe(new ArrayIterator(africaTraining, "africa"));
    instances.addThruPipe(new ArrayIterator(asiaTraining, "asia"));
    Classifier classifier = new NaiveBayesTrainer().train(instances);
    Classification result = classifier.classify("nelson mandela never eats lions");
    LabelAlphabet labels = (LabelAlphabet) instances.getTargetAlphabet();
    assertTrue(result.getLabeling().getBestLabel() == labels.lookupLabel("africa"));
}
/**
 * Trains naive Bayes on two tiny labeled corpora ("africa" and "asia") and
 * checks that a fresh africa-themed sentence is classified as "africa".
 */
public void testStringTrained() {
    String[] africaTraining = {
        "on the plains of africa the lions roar",
        "in swahili ngoma means to dance",
        "nelson mandela became president of south africa",
        "the saraha dessert is expanding"};
    String[] asiaTraining = {
        "panda bears eat bamboo",
        "china's one child policy has resulted in a surplus of boys",
        "tigers live in the jungle"};
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()};
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    instances.addThruPipe(new ArrayIterator(africaTraining, "africa"));
    instances.addThruPipe(new ArrayIterator(asiaTraining, "asia"));
    Classifier classifier = new NaiveBayesTrainer().train(instances);
    Classification result = classifier.classify("nelson mandela never eats lions");
    LabelAlphabet labels = (LabelAlphabet) instances.getTargetAlphabet();
    assertTrue(result.getLabeling().getBestLabel() == labels.lookupLabel("africa"));
}