public static List<Pipe> getPipes() throws Exception { List<Pipe> pipes = newArrayList(); pipes.add(new Jcas2TokenSequence()); pipes.add(new Target2LabelSequence()); // more piiiiipes addAllGoodPipes(pipes); pipes.add(new FeatureWindow(window, window)); // for debugging pipes.add(new PrintInputAndTarget()); pipes.add(new TokenSequence2FeatureVectorSequence()); return pipes; }
public Instance pipe (Instance carrier) { carrier.setData(new FeatureVectorSequence ((Alphabet)getDataAlphabet(), (TokenSequence)carrier.getData(), binary, augmentable, growAlphabet)); return carrier; }
public TestCRFPipe(String trainingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); PrintWriter out = new PrintWriter("test.out"); int[][] conjunctions = new int[3][]; conjunctions[0] = new int[] { -1 }; conjunctions[1] = new int[] { 1 }; conjunctions[2] = new int[] { -2, -1 }; pipes.add(new SimpleTaggerSentence2TokenSequence()); //pipes.add(new FeaturesInWindow("PREV-", -1, 1)); //pipes.add(new FeaturesInWindow("NEXT-", 1, 2)); pipes.add(new OffsetConjunctions(conjunctions)); pipes.add(new TokenTextCharSuffix("C1=", 1)); pipes.add(new TokenTextCharSuffix("C2=", 2)); pipes.add(new TokenTextCharSuffix("C3=", 3)); pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*"))); pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*"))); pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*"))); pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*"))); pipes.add(new TokenFirstPosition("FIRSTTOKEN")); pipes.add(new TokenSequence2FeatureVectorSequence()); pipes.add(new SequencePrintingPipe(out)); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); out.close(); }
public Instance pipe (Instance carrier) { carrier.setData(new FeatureVectorSequence ((Alphabet)getDataAlphabet(), (TokenSequence)carrier.getData(), binary, augmentable, growAlphabet)); return carrier; }
public TestCRFPipe(String trainingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); PrintWriter out = new PrintWriter("test.out"); int[][] conjunctions = new int[3][]; conjunctions[0] = new int[] { -1 }; conjunctions[1] = new int[] { 1 }; conjunctions[2] = new int[] { -2, -1 }; pipes.add(new SimpleTaggerSentence2TokenSequence()); //pipes.add(new FeaturesInWindow("PREV-", -1, 1)); //pipes.add(new FeaturesInWindow("NEXT-", 1, 2)); pipes.add(new OffsetConjunctions(conjunctions)); pipes.add(new TokenTextCharSuffix("C1=", 1)); pipes.add(new TokenTextCharSuffix("C2=", 2)); pipes.add(new TokenTextCharSuffix("C3=", 3)); pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*"))); pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*"))); pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*"))); pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*"))); pipes.add(new TokenFirstPosition("FIRSTTOKEN")); pipes.add(new TokenSequence2FeatureVectorSequence()); pipes.add(new SequencePrintingPipe(out)); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); out.close(); }
public Instance pipe (Instance carrier) { carrier.setData(new FeatureVectorSequence ((Alphabet)getDataAlphabet(), (TokenSequence)carrier.getData(), binary, augmentable, growAlphabet)); return carrier; }
public TestCRFPipe(String trainingFilename) throws IOException { ArrayList<Pipe> pipes = new ArrayList<Pipe>(); PrintWriter out = new PrintWriter("test.out"); int[][] conjunctions = new int[3][]; conjunctions[0] = new int[] { -1 }; conjunctions[1] = new int[] { 1 }; conjunctions[2] = new int[] { -2, -1 }; pipes.add(new SimpleTaggerSentence2TokenSequence()); //pipes.add(new FeaturesInWindow("PREV-", -1, 1)); //pipes.add(new FeaturesInWindow("NEXT-", 1, 2)); pipes.add(new OffsetConjunctions(conjunctions)); pipes.add(new TokenTextCharSuffix("C1=", 1)); pipes.add(new TokenTextCharSuffix("C2=", 2)); pipes.add(new TokenTextCharSuffix("C3=", 3)); pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*"))); pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*"))); pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*"))); pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*"))); pipes.add(new TokenFirstPosition("FIRSTTOKEN")); pipes.add(new TokenSequence2FeatureVectorSequence()); pipes.add(new SequencePrintingPipe(out)); Pipe pipe = new SerialPipes(pipes); InstanceList trainingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); out.close(); }
pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*"))); pipes.add(new TokenFirstPosition("FIRSTTOKEN")); pipes.add(new TokenSequence2FeatureVectorSequence());
new OffsetConjunctions(new int[][] { { -1 }, { 1 } }), new TokenSequence2FeatureVectorSequence(true, true) }); final InstanceList instList = new InstanceList(myPipe);
pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*"))); pipes.add(new TokenFirstPosition("FIRSTTOKEN")); pipes.add(new TokenSequence2FeatureVectorSequence());
pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*"))); pipes.add(new TokenFirstPosition("FIRSTTOKEN")); pipes.add(new TokenSequence2FeatureVectorSequence());
.compile(TextUtil.EXCLAMATION_QUESTION_MARK_REGEX))); pipes.add(new OffsetConjunctions(new int[][] { { -1 }, { 1 } })); pipes.add(new TokenSequence2FeatureVectorSequence(targetAlphabet)); SerialPipes serialPipes = new SerialPipes(pipes); serialPipes.setDataAlphabet(dataAlphabet);
new TokenSequence2FeatureVectorSequence (true, true), });
new TokenSequence2FeatureVectorSequence (true, true), });
new TokenSequence2FeatureVectorSequence (true, true), });
pipeParam.add(new TokenSequence2FeatureVectorSequence(true, true));
pipeParam.add(new TokenSequence2FeatureVectorSequence(true, true));
pipeParam.add(new TokenSequence2FeatureVectorSequence(true, true));
private static Pipe makePipe() { Alphabet alpha = new Alphabet(); JointInputToTokenSequence inputPipe = new JointInputToTokenSequence(alpha, new LabelAlphabet(), new LabelAlphabet()); return new SerialPipes(ImmutableList.of( inputPipe, new TokenSequenceLowercase(), // make all lowercase new NeighborTokenFeature(true, makeNeighbors()), // grab neighboring graphemes new NeighborShapeFeature(true, makeShapeNeighs()), new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, true) )); }
new OffsetConjunctions (new int[][]{{-1},{1}}), new PrintTokenSequenceFeatures(), new TokenSequence2FeatureVectorSequence (true, true) });