static List<Pipe> getPipes() { List<Pipe> pipes = newArrayList(); pipes.add(new Target2Label()); pipes.add(new MyInput2RegexTokens()); // pipes.add(new PrintInputAndTarget()); pipes.add(new TokenSequence2FeatureSequence()); pipes.add(new FeatureSequence2FeatureVector()); return pipes; }
private Pipe buildPipe() { Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S"); int[] sizes = {1,2}; ArrayList pipeList = new ArrayList(); pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list pipeList.add(new TokenSequenceNGramsDelim(sizes, " ")); pipeList.add(new TokenSequence2FeatureSequence()); return new SerialPipes(pipeList); }
/**
 * Constructs a topic model pipe: lowercases the input, tokenizes it into
 * runs of letters (optionally containing internal punctuation), removes the
 * given stopwords, and maps tokens into the given alphabet.
 *
 * @param stopwords words to be removed from the token stream
 * @param alphabet data alphabet to use for the feature sequence
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
/**
 * Constructs a topic model pipe: lowercases the input, tokenizes it into
 * runs of letters (optionally containing internal punctuation), removes the
 * given stopwords, and maps tokens into the given alphabet.
 *
 * @param stopwords words to be removed from the token stream
 * @param alphabet data alphabet to use for the feature sequence
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
public TrainHMM(String trainingFilename, String testingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); pipes.add(new SimpleTaggerSentence2TokenSequence()); pipes.add(new TokenSequence2FeatureSequence()); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true)); HMM hmm = new HMM(pipe, null); hmm.addStatesForLabelsConnectedAsIn(trainingInstances); //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances); HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm); TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training"); TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing"); trainer.train(trainingInstances, 10); trainingEvaluator.evaluate(trainer); testingEvaluator.evaluate(trainer); }
public TrainHMM(String trainingFilename, String testingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); pipes.add(new SimpleTaggerSentence2TokenSequence()); pipes.add(new TokenSequence2FeatureSequence()); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true)); HMM hmm = new HMM(pipe, null); hmm.addStatesForLabelsConnectedAsIn(trainingInstances); //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances); HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm); TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training"); TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing"); trainer.train(trainingInstances, 10); trainingEvaluator.evaluate(trainer); testingEvaluator.evaluate(trainer); }
public TrainHMM(String trainingFilename, String testingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); pipes.add(new SimpleTaggerSentence2TokenSequence()); pipes.add(new TokenSequence2FeatureSequence()); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true)); HMM hmm = new HMM(pipe, null); hmm.addStatesForLabelsConnectedAsIn(trainingInstances); //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances); HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm); TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training"); TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing"); trainer.train(trainingInstances, 10); trainingEvaluator.evaluate(trainer); testingEvaluator.evaluate(trainer); }
/**
 * Runs the given token sequences through a stop-word-removal and
 * feature-sequence pipe and collects the resulting instances.
 *
 * @param data token sequences to preprocess
 * @return the piped instances
 */
public InstanceList malletPreprocess(List<TokenSequence> data) {
    ArrayList<Pipe> stages = new ArrayList<>();
    stages.add(new TokenSequenceRemoveStopwords(false, false));
    stages.add(new TokenSequence2FeatureSequence());
    InstanceList result = new InstanceList(new SerialPipes(stages));
    result.addThruPipe(new ArrayIterator(data));
    return result;
}
/**
 * Constructs a classifier pipe that first applies the given pipe, then
 * lowercases, tokenizes (runs of letters optionally containing internal
 * punctuation), removes the given stopwords, and converts tokens to a
 * feature vector.
 *
 * @param pipe initial pipe applied before the text-processing stages
 * @param stopwords words to be removed from the token stream
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Constructs a classifier pipe that first applies the given pipe, then
 * lowercases, tokenizes (runs of letters optionally containing internal
 * punctuation), removes the given stopwords, and converts tokens to a
 * feature vector.
 *
 * @param pipe initial pipe applied before the text-processing stages
 * @param stopwords words to be removed from the token stream
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Creates the default text pipe: tokenize, lowercase the tokens, convert
 * them to a feature sequence, then to a feature vector.
 *
 * @return the assembled serial pipe
 */
public Pipe createPipe() {
    Pipe[] stages = new Pipe[] {
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    return new SerialPipes(stages);
}
/**
 * Creates the default text pipe: tokenize, lowercase the tokens, convert
 * them to a feature sequence, then to a feature vector.
 *
 * @return the assembled serial pipe
 */
public Pipe createPipe() {
    Pipe[] stages = new Pipe[] {
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    return new SerialPipes(stages);
}
/**
 * Checks that training on a disk-paged instance list yields the same test
 * accuracy (within 0.01) as training on an ordinary in-memory list.
 */
public void testRandomTrained() {
    Pipe[] stages = new Pipe[] {
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector(),
        new Target2Label()
    };
    Pipe pipe = new SerialPipes(stages);
    double inMemoryAccuracy = testRandomTrainedOn(new InstanceList(pipe));
    double pagedAccuracy = testRandomTrainedOn(new PagedInstanceList(pipe, 700, 200, new File(".")));
    assertEquals(inMemoryAccuracy, pagedAccuracy, 0.01);
}
/**
 * Checks that training on a disk-paged instance list yields the same test
 * accuracy (within 0.01) as training on an ordinary in-memory list.
 */
public void testRandomTrained() {
    Pipe[] stages = new Pipe[] {
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector(),
        new Target2Label()
    };
    Pipe pipe = new SerialPipes(stages);
    double inMemoryAccuracy = testRandomTrainedOn(new InstanceList(pipe));
    double pagedAccuracy = testRandomTrainedOn(new PagedInstanceList(pipe, 700, 200, new File(".")));
    assertEquals(inMemoryAccuracy, pagedAccuracy, 0.01);
}
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
/**
 * Smoke test: builds a full text pipeline and feeds it files found under
 * foo/bar, deriving the target label from the leading path component.
 */
public void testThree() {
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequenceRemoveStopwords(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    Iterator<Instance> source =
        new FileIterator(new File("foo/bar"), null, Pattern.compile("^([^/]*)/"));
    instances.addThruPipe(source);
}
/**
 * Smoke test: builds a full text pipeline and feeds it files found under
 * foo/bar, deriving the target label from the leading path component.
 */
public void testThree() {
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequenceLowercase(),
        new TokenSequenceRemoveStopwords(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    };
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    Iterator<Instance> source =
        new FileIterator(new File("foo/bar"), null, Pattern.compile("^([^/]*)/"));
    instances.addThruPipe(source);
}
/**
 * Trains naive Bayes on two tiny labeled corpora ("africa" and "asia") and
 * checks that a fresh africa-themed sentence is classified as "africa".
 */
public void testStringTrained() {
    String[] africaTraining = {
        "on the plains of africa the lions roar",
        "in swahili ngoma means to dance",
        "nelson mandela became president of south africa",
        "the saraha dessert is expanding"};
    String[] asiaTraining = {
        "panda bears eat bamboo",
        "china's one child policy has resulted in a surplus of boys",
        "tigers live in the jungle"};
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()};
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    instances.addThruPipe(new ArrayIterator(africaTraining, "africa"));
    instances.addThruPipe(new ArrayIterator(asiaTraining, "asia"));
    Classifier classifier = new NaiveBayesTrainer().train(instances);
    Classification result = classifier.classify("nelson mandela never eats lions");
    LabelAlphabet labels = (LabelAlphabet) instances.getTargetAlphabet();
    assertTrue(result.getLabeling().getBestLabel() == labels.lookupLabel("africa"));
}
/**
 * Trains naive Bayes on two tiny labeled corpora ("africa" and "asia") and
 * checks that a fresh africa-themed sentence is classified as "africa".
 */
public void testStringTrained() {
    String[] africaTraining = {
        "on the plains of africa the lions roar",
        "in swahili ngoma means to dance",
        "nelson mandela became president of south africa",
        "the saraha dessert is expanding"};
    String[] asiaTraining = {
        "panda bears eat bamboo",
        "china's one child policy has resulted in a surplus of boys",
        "tigers live in the jungle"};
    Pipe[] stages = new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()};
    InstanceList instances = new InstanceList(new SerialPipes(stages));
    instances.addThruPipe(new ArrayIterator(africaTraining, "africa"));
    instances.addThruPipe(new ArrayIterator(asiaTraining, "asia"));
    Classifier classifier = new NaiveBayesTrainer().train(instances);
    Classification result = classifier.classify("nelson mandela never eats lions");
    LabelAlphabet labels = (LabelAlphabet) instances.getTargetAlphabet();
    assertTrue(result.getLabeling().getBestLabel() == labels.lookupLabel("africa"));
}