cc.mallet.pipe.Pipe java code examples

/**
 * Creates a CRF over the given input and output pipes.
 * Both pipes are handed to the superclass constructor; the input and output
 * alphabets of this CRF are then read from the input pipe.
 *
 * @param inputPipe  pipe whose data and target alphabets are adopted; it is
 *                   dereferenced here, so it must not be null
 * @param outputPipe pipe forwarded to the superclass constructor
 */
public CRF (Pipe inputPipe, Pipe outputPipe)
{
  super (inputPipe, outputPipe);
  this.inputAlphabet = inputPipe.getDataAlphabet();
  // NOTE(review): the output alphabet is taken from inputPipe's *target*
  // alphabet, not from outputPipe -- presumably intentional; confirm.
  this.outputAlphabet = inputPipe.getTargetAlphabet();
  //inputAlphabet.stopGrowth();
}

/**
 * Wraps the given iterator with every pipe in {@code pipes}, in list order.
 * Each pipe receives the iterator produced by the pipe before it; the first
 * pipe receives {@code source}.
 *
 * @param source the iterator to feed into the first pipe
 * @return the iterator produced by the last pipe, or {@code source} itself
 *         when there are no pipes
 */
public Iterator<Instance> newIteratorFrom (Iterator<Instance> source)
{
  Iterator<Instance> chained = source;
  // With zero pipes the loop body never runs and the source is returned as-is.
  for (int stage = 0; stage < pipes.size(); stage++) {
    chained = pipes.get(stage).newIteratorFrom(chained);
  }
  return chained;
}

/**
 * Creates a pipe whose data and target alphabets are the ones supplied.
 * The alphabets are installed through the superclass setters.
 *
 * @param data  alphabet passed to {@code setDataAlphabet}
 * @param label alphabet passed to {@code setTargetAlphabet}
 */
public METrainerDummyPipe(final Alphabet data, final Alphabet label) {
  super.setDataAlphabet(data);
  super.setTargetAlphabet(label);
}

/**
 * Walks the pipes in order, telling each one which data and target alphabets
 * the pipe before it reported (null for the first pipe), and then records the
 * alphabets reported by the last pipe as this object's own.
 * With no pipes, both alphabets end up null.
 */
private void resolveAlphabets ()
{
  Alphabet upstreamData = null;
  Alphabet upstreamTarget = null;

  for (Pipe stage : pipes) {
    // Notify first, then read back: the stage's alphabet is queried only
    // after it has been told about its predecessor's.
    stage.preceedingPipeDataAlphabetNotification(upstreamData);
    upstreamData = stage.getDataAlphabet();

    stage.preceedingPipeTargetAlphabetNotification(upstreamTarget);
    upstreamTarget = stage.getTargetAlphabet();
  }

  dataAlphabet = upstreamData;
  targetAlphabet = upstreamTarget;
}

/**
 * Replaces the sequence stored in the instance's data field with the best
 * output sequence computed for it.
 * The instance is first passed through the input pipe and finally through the
 * output pipe; either step is skipped when the corresponding pipe is null.
 *
 * @param instance carrier whose data field holds the input sequence
 * @return the instance after the optional output pipe has been applied
 */
public Instance transduce (Instance instance)
{
  Instance current = instance;
  if (inputPipe != null) {
    current = inputPipe.instanceFrom(current);
  }

  // TODO Use MaxLatticeFactory instead of hardcoding
  Sequence inputSequence = (Sequence) current.getData();
  MaxLatticeDefault lattice = new MaxLatticeDefault(this, inputSequence);
  current.setData(lattice.bestOutputSequence());

  if (outputPipe != null) {
    current = outputPipe.instanceFrom(current);
  }
  return current;
}

  p.getTargetAlphabet().lookupIndex(defaultOption.value);
  p.setTargetProcessing(true);
  trainingData = new InstanceList(p);
  trainingData.addThruPipe(new LineGroupIterator(trainingFile,
                          Pattern.compile("^\\s*$"), true));
  logger.info("Number of features in training data: "+p.getDataAlphabet().size());
  p.setTargetProcessing(true);
  testData = new InstanceList(p);
  testData.addThruPipe(new LineGroupIterator(testFile,
    p.setTargetProcessing(false);
    testData = new InstanceList(p);
    testData.addThruPipe(
                          Pattern.compile("^\\s*$"), true));
logger.info ("Number of predicates: "+p.getDataAlphabet().size());
if (p.isTargetProcessing()) {
  Alphabet targets = p.getTargetAlphabet();
  StringBuffer buf = new StringBuffer("Labels:");
  for (int i = 0; i < targets.size(); i++)

/**
 * Checks that concatenating a StupidPipe with an already-used tagger pipe
 * yields a pipe exposing the very same data alphabet object as the tagger pipe.
 */
public void testConcatenatePipes ()
{
  Pipe first = new StupidPipe ();
  Pipe tagger = new SimpleTagger.SimpleTaggerSentence2FeatureVectorSequence ();

  // Run one instance through the tagger pipe so that its dictionary is initialized.
  Instance seed = new Instance (data, null, null, null);
  tagger.instanceFrom (seed);
  assertEquals (3, tagger.getDataAlphabet ().size ());

  Pipe combined = PipeUtils.concatenatePipes (first, tagger);
  Alphabet combinedDict = combined.getDataAlphabet ();
  assertEquals (3, combinedDict.size ());
  // Reference identity is what is being checked here, not equals().
  assertTrue (combinedDict == tagger.getDataAlphabet ());
}

/**
 * Imports every file listed in {@code inputFiles.value} into the instance
 * store named by {@code outputDatabase.value}.
 * Each file is read line by line with a three-group, tab-separated pattern,
 * pushed through the given pipes (wrapped in a {@code SerialPipes}), and the
 * resulting instances are handed to the store. After all files are processed
 * the pipe's data and target alphabets are saved and the store is cleaned up.
 *
 * @param pipes the pipes to apply, in order, to every imported instance
 * @throws Exception if a file cannot be opened or read, or if the store fails
 */
public static void writeInstanceList(ArrayList<Pipe> pipes) throws Exception {
  Pipe serialPipe = new SerialPipes(pipes);

  DBInstanceStore saver = new DBInstanceStore(outputDatabase.value);
  for (String filename: inputFiles.value) {
    logger.info("importing from " + filename);
    FileReader fileReader = new FileReader(filename);
    try {
      CsvIterator reader = new CsvIterator(fileReader,
                         "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1);

      // NOTE(review): assumes saveInstances drains the iterator before it
      // returns, so the file is no longer needed afterwards -- confirm.
      saver.saveInstances(serialPipe.newIteratorFrom(reader));
    }
    finally {
      // Release the file handle even when reading or saving fails; previously
      // one reader per input file was left open.
      fileReader.close();
    }
  }

  saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet());
  saver.cleanup();
}

logger.info("Testing vectors loaded from " + testFile.value);
if (!testFileIlist.getPipe().alphabetsMatch(trainingFileIlist.getPipe())) {
  throw new RuntimeException( 
      trainingFileIlist.getPipe().getDataAlphabet() + "\n" 
          + testFileIlist.getPipe().getDataAlphabet() + "\n" 
          + trainingFileIlist.getPipe().getTargetAlphabet() + "\n" 
          + testFileIlist.getPipe().getTargetAlphabet() + "\n"
          + "Training and testing alphabets don't match!\n");
validationFileIlist = InstanceList.load (new File(validationFile.value));
logger.info("validation vectors loaded from " + validationFile.value);
if (!validationFileIlist.getPipe().alphabetsMatch(trainingFileIlist.getPipe())) {
  throw new RuntimeException( 
      trainingFileIlist.getPipe().getDataAlphabet() + "\n" 
          + validationFileIlist.getPipe().getDataAlphabet() + "\n" 
          + trainingFileIlist.getPipe().getTargetAlphabet() + "\n" 
          + validationFileIlist.getPipe().getTargetAlphabet() + "\n"
          + "Training and validation alphabets don't match!\n");

/**
 * Applies the target-processing flag to this pipe via the superclass and then
 * to every pipe held in {@code pipes}.
 *
 * @param lookForAndProcessTarget the flag value forwarded to the superclass
 *        and to each contained pipe
 */
public void setTargetProcessing (boolean lookForAndProcessTarget)
{
  super.setTargetProcessing (lookForAndProcessTarget);
  for (Pipe child : pipes) {
    child.setTargetProcessing (lookForAndProcessTarget);
  }
}

/** Returns the <code>Alphabet</code> that maps data features to integers.
 * When none is stored here yet and a pipe is present, the pipe's data
 * alphabet is adopted and remembered. May return null. */
public Alphabet getDataAlphabet ()
{
  if (dataAlphabet == null) {
    if (pipe != null) {
      dataAlphabet = pipe.getDataAlphabet ();
    }
  }
  // Sanity check: a pipe that reports a data alphabet must report this one.
  assert !(pipe != null
      && pipe.getDataAlphabet () != null
      && pipe.getDataAlphabet () != dataAlphabet);
  return dataAlphabet;
}

/** Returns the <code>Alphabet</code> that maps target output labels to
 * integers. When none is stored here yet and a pipe is present, the pipe's
 * target alphabet is adopted and remembered. May return null. */
public Alphabet getTargetAlphabet ()
{
  if (targetAlphabet == null) {
    if (pipe != null) {
      targetAlphabet = pipe.getTargetAlphabet ();
    }
  }
  // Sanity check: a pipe that reports a target alphabet must report this one.
  assert !(pipe != null
      && pipe.getTargetAlphabet () != null
      && pipe.getTargetAlphabet () != targetAlphabet);
  return targetAlphabet;
}

/**
 * Replaces the sequence stored in the instance's data field with the best
 * output sequence computed for it.
 * The instance is first passed through the input pipe and finally through the
 * output pipe; either step is skipped when the corresponding pipe is null.
 *
 * @param instance carrier whose data field holds the input sequence
 * @return the instance after the optional output pipe has been applied
 */
public Instance transduce (Instance instance)
{
  Instance current = instance;
  if (inputPipe != null) {
    current = inputPipe.instanceFrom(current);
  }

  // TODO Use MaxLatticeFactory instead of hardcoding
  Sequence inputSequence = (Sequence) current.getData();
  MaxLatticeDefault lattice = new MaxLatticeDefault(this, inputSequence);
  current.setData(lattice.bestOutputSequence());

  if (outputPipe != null) {
    current = outputPipe.instanceFrom(current);
  }
  return current;
}

  p.getTargetAlphabet().lookupIndex(defaultOption.value);
  p.setTargetProcessing(true);
  trainingData = new InstanceList(p);
  trainingData.addThruPipe(new LineGroupIterator(trainingFile,
                          Pattern.compile("^\\s*$"), true));
  logger.info("Number of features in training data: "+p.getDataAlphabet().size());
  p.setTargetProcessing(true);
  testData = new InstanceList(p);
  testData.addThruPipe(new LineGroupIterator(testFile,
    p.setTargetProcessing(false);
    testData = new InstanceList(p);
    testData.addThruPipe(
                          Pattern.compile("^\\s*$"), true));
logger.info ("Number of predicates: "+p.getDataAlphabet().size());
if (p.isTargetProcessing()) {
  Alphabet targets = p.getTargetAlphabet();
  StringBuffer buf = new StringBuffer("Labels:");
  for (int i = 0; i < targets.size(); i++)

/**
 * Checks that concatenating a StupidPipe with an already-used tagger pipe
 * yields a pipe exposing the very same data alphabet object as the tagger pipe.
 */
public void testConcatenatePipes ()
{
  Pipe first = new StupidPipe ();
  Pipe tagger = new SimpleTagger.SimpleTaggerSentence2FeatureVectorSequence ();

  // Run one instance through the tagger pipe so that its dictionary is initialized.
  Instance seed = new Instance (data, null, null, null);
  tagger.instanceFrom (seed);
  assertEquals (3, tagger.getDataAlphabet ().size ());

  Pipe combined = PipeUtils.concatenatePipes (first, tagger);
  Alphabet combinedDict = combined.getDataAlphabet ();
  assertEquals (3, combinedDict.size ());
  // Reference identity is what is being checked here, not equals().
  assertTrue (combinedDict == tagger.getDataAlphabet ());
}

/**
 * Imports every file listed in {@code inputFiles.value} into the instance
 * store named by {@code outputDatabase.value}.
 * Each file is read line by line with a three-group, tab-separated pattern,
 * pushed through the given pipes (wrapped in a {@code SerialPipes}), and the
 * resulting instances are handed to the store. After all files are processed
 * the pipe's data and target alphabets are saved and the store is cleaned up.
 *
 * @param pipes the pipes to apply, in order, to every imported instance
 * @throws Exception if a file cannot be opened or read, or if the store fails
 */
public static void writeInstanceList(ArrayList<Pipe> pipes) throws Exception {
  Pipe serialPipe = new SerialPipes(pipes);

  DBInstanceStore saver = new DBInstanceStore(outputDatabase.value);
  for (String filename: inputFiles.value) {
    logger.info("importing from " + filename);
    FileReader fileReader = new FileReader(filename);
    try {
      CsvIterator reader = new CsvIterator(fileReader,
                         "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1);

      // NOTE(review): assumes saveInstances drains the iterator before it
      // returns, so the file is no longer needed afterwards -- confirm.
      saver.saveInstances(serialPipe.newIteratorFrom(reader));
    }
    finally {
      // Release the file handle even when reading or saving fails; previously
      // one reader per input file was left open.
      fileReader.close();
    }
  }

  saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet());
  saver.cleanup();
}

/**
 * Walks the pipes in order, telling each one which data and target alphabets
 * the pipe before it reported (null for the first pipe), and then records the
 * alphabets reported by the last pipe as this object's own.
 * With no pipes, both alphabets end up null.
 */
private void resolveAlphabets ()
{
  Alphabet upstreamData = null;
  Alphabet upstreamTarget = null;

  for (Pipe stage : pipes) {
    // Notify first, then read back: the stage's alphabet is queried only
    // after it has been told about its predecessor's.
    stage.preceedingPipeDataAlphabetNotification(upstreamData);
    upstreamData = stage.getDataAlphabet();

    stage.preceedingPipeTargetAlphabetNotification(upstreamTarget);
    upstreamTarget = stage.getTargetAlphabet();
  }

  dataAlphabet = upstreamData;
  targetAlphabet = upstreamTarget;
}

logger.info("Testing vectors loaded from " + testFile.value);
if (!testFileIlist.getPipe().alphabetsMatch(trainingFileIlist.getPipe())) {
  throw new RuntimeException( 
      trainingFileIlist.getPipe().getDataAlphabet() + "\n" 
          + testFileIlist.getPipe().getDataAlphabet() + "\n" 
          + trainingFileIlist.getPipe().getTargetAlphabet() + "\n" 
          + testFileIlist.getPipe().getTargetAlphabet() + "\n"
          + "Training and testing alphabets don't match!\n");
validationFileIlist = InstanceList.load (new File(validationFile.value));
logger.info("validation vectors loaded from " + validationFile.value);
if (!validationFileIlist.getPipe().alphabetsMatch(trainingFileIlist.getPipe())) {
  throw new RuntimeException( 
      trainingFileIlist.getPipe().getDataAlphabet() + "\n" 
          + validationFileIlist.getPipe().getDataAlphabet() + "\n" 
          + trainingFileIlist.getPipe().getTargetAlphabet() + "\n" 
          + validationFileIlist.getPipe().getTargetAlphabet() + "\n"
          + "Training and validation alphabets don't match!\n");

/**
 * Applies the target-processing flag to this pipe via the superclass and then
 * to every pipe held in {@code pipes}.
 *
 * @param lookForAndProcessTarget the flag value forwarded to the superclass
 *        and to each contained pipe
 */
public void setTargetProcessing (boolean lookForAndProcessTarget)
{
  super.setTargetProcessing (lookForAndProcessTarget);
  for (Pipe child : pipes) {
    child.setTargetProcessing (lookForAndProcessTarget);
  }
}

/**
 * Loads a gzip-compressed, Java-serialized CRF from the given stream and
 * installs it as the current model.
 * After deserialization the data alphabet of the model's input pipe is frozen
 * with {@code stopGrowth()}. The {@code model} and {@code trained} fields are
 * updated only once both steps have succeeded, so a failed load leaves the
 * previous state untouched. The stream is closed in all cases.
 *
 * NOTE(review): this uses Java-native deserialization; only feed it model
 * files from a trusted source.
 *
 * @param is stream positioned at the start of the gzip data; closed on return
 * @throws IOException if the stream is not valid gzip data or cannot be read
 * @throws ClassNotFoundException if a serialized class cannot be resolved
 */
public void readModel(InputStream is) throws IOException, ClassNotFoundException {
  final GZIPInputStream gin = new GZIPInputStream(is);
  try {
    final ObjectInputStream ois = new ObjectInputStream(gin);
    final CRF loaded = (CRF) ois.readObject();
    loaded.getInputPipe().getDataAlphabet().stopGrowth();
    model = loaded;
    trained = true;
  } finally {
    // Closing the gzip stream also closes the wrapped input stream, and now
    // happens on the failure path too (previously only on success).
    gin.close();
  }
}

Javadoc

The abstract superclass of all Pipes, which transform one data type to another. Pipes are most often used for feature extraction.

Although Pipe does not have any "abstract methods", in order to use a Pipe subclass you must override either the pipe method or the newIteratorFrom method. The former is appropriate when the pipe's processing of an Instance is strictly one-to-one. For every Instance coming in, there is exactly one Instance coming out. The latter is appropriate when the pipe's processing may result in more or fewer Instances than arrive through its source iterator.

A pipe operates on a cc.mallet.types.Instance, which is a carrier of data. A pipe reads from and writes to fields in the Instance when it is requested to process the instance. It is up to the pipe which fields in the Instance it reads from and writes to, but usually a pipe will read its input from and write its output to the "data" field of an instance.

A pipe doesn't have any direct notion of input or output - it merely modifies instances that are handed to it. A set of helper classes, which implement the interface Iterator, iterate over commonly encountered input data structures and feed the elements of these data structures to a pipe as instances.

A pipe is frequently used in conjunction with a cc.mallet.types.InstanceList. As instances are added to the list, they are processed by the pipe associated with the instance list and the processed Instance is kept in the list.

In one common usage, a cc.mallet.pipe.iterator.FileIterator is given a list of directories to operate over. The FileIterator walks through each directory, creating an instance for each file and putting the data from the file in the data field of the instance. The directory of the file is stored in the target field of the instance. The FileIterator feeds instances to an InstanceList, which processes the instances through its associated pipe and keeps the results.

Pipes can be hierarchically composed. In a typical usage, a SerialPipes instance is created, which holds other pipes in an ordered list. Piping an instance through a SerialPipes instance means piping the instance through each of the child pipes in sequence.

A pipe holds two separate Alphabets: one for the symbols (feature names) encountered in the data fields of the instances processed through the pipe, and one for the symbols (e.g. class labels) encountered in the target fields.

Most used methods

instanceFrom
getDataAlphabet
newIteratorFrom
Given an InstanceIterator, return a new InstanceIterator whose instances have also been processed by
setDataAlphabet
setTargetAlphabet
setTargetProcessing
Set whether input is taken from target field of instance during processing. If argument is false, do
getTargetAlphabet
alphabetsMatch
getInstanceId
instancesFrom
A convenience method that will pull all instances from source through this pipe, and return the resu
isTargetProcessing
Return true iff this pipe expects and processes information in the target slot.
pipe
Really this should be 'protected', but isn't for historical reasons.

Popular in Java

Running tasks concurrently on multiple threads
requestLocationUpdates (LocationManager)
notifyDataSetChanged (ArrayAdapter)
setScale (BigDecimal)
ObjectMapper (com.fasterxml.jackson.databind)
ObjectMapper provides functionality for reading and writing JSON, either to and from basic POJOs (Pl
File (java.io)
An "abstract" representation of a file system entity identified by a pathname. The pathname may be a
InputStreamReader (java.io)
A class for turning a byte stream into a character stream. Data read from the source input stream is
Proxy (java.net)
This class represents proxy server settings. A created instance of Proxy stores a type and an addres
Map (java.util)
A Map is a data structure consisting of a set of keys and values in which each key is mapped to a si
UUID (java.util)
UUID is an immutable representation of a 128-bit universally unique identifier (UUID). There are mul
Top 12 Jupyter Notebook extensions

How to use Pipe in cc.mallet.pipe

Best Java code snippets using cc.mallet.pipe.Pipe (Showing top 20 results out of 315)

How to use
Pipe
in
cc.mallet.pipe