public static List<Pipe> getPipes() throws Exception { List<Pipe> pipes = newArrayList(); pipes.add(new Jcas2TokenSequence()); pipes.add(new Target2LabelSequence()); // more piiiiipes addAllGoodPipes(pipes); pipes.add(new FeatureWindow(window, window)); // for debugging pipes.add(new PrintInputAndTarget()); pipes.add(new TokenSequence2FeatureVectorSequence()); return pipes; }
/**
 * Builds a feature-extraction pipeline and runs the gzip-compressed training
 * file through it.
 *
 * <p>A {@code PrintWriter} on the file {@code test.out} is handed to the final
 * {@code SequencePrintingPipe} stage (presumably the sink for the printed
 * sequences -- confirm against that pipe). Instances are read from
 * {@code trainingFilename} as line groups delimited by lines matching
 * {@code ^\s*$}.
 *
 * <p>The writer and the input streams are released in {@code finally} blocks so
 * that a failure while reading or piping does not leak file handles.
 *
 * @param trainingFilename path of the gzip-compressed training data
 * @throws IOException if {@code test.out} cannot be created or the training
 *         file cannot be opened, read, or closed
 */
public TestCRFPipe(String trainingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();

    PrintWriter out = new PrintWriter("test.out");
    try {
        int[][] conjunctions = new int[3][];
        conjunctions[0] = new int[] { -1 };
        conjunctions[1] = new int[] { 1 };
        conjunctions[2] = new int[] { -2, -1 };

        pipes.add(new SimpleTaggerSentence2TokenSequence());
        //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
        //pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
        pipes.add(new OffsetConjunctions(conjunctions));
        pipes.add(new TokenTextCharSuffix("C1=", 1));
        pipes.add(new TokenTextCharSuffix("C2=", 2));
        pipes.add(new TokenTextCharSuffix("C3=", 3));
        pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
        pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
        pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
        pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
        pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
        pipes.add(new TokenSequence2FeatureVectorSequence());
        pipes.add(new SequencePrintingPipe(out));

        Pipe pipe = new SerialPipes(pipes);
        InstanceList trainingInstances = new InstanceList(pipe);

        // Open the raw file first so it is still closed if the gzip header is invalid.
        FileInputStream fileIn = new FileInputStream(trainingFilename);
        try {
            // NOTE(review): decodes with the platform default charset, as the original did.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(fileIn)));
            try {
                trainingInstances.addThruPipe(
                        new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
            } finally {
                reader.close();
            }
        } finally {
            fileIn.close();
        }
    } finally {
        out.close();
    }
}
/**
 * Builds a feature-extraction pipeline and runs the gzip-compressed training
 * file through it.
 *
 * <p>A {@code PrintWriter} on the file {@code test.out} is handed to the final
 * {@code SequencePrintingPipe} stage (presumably the sink for the printed
 * sequences -- confirm against that pipe). Instances are read from
 * {@code trainingFilename} as line groups delimited by lines matching
 * {@code ^\s*$}.
 *
 * <p>The writer and the input streams are released in {@code finally} blocks so
 * that a failure while reading or piping does not leak file handles.
 *
 * @param trainingFilename path of the gzip-compressed training data
 * @throws IOException if {@code test.out} cannot be created or the training
 *         file cannot be opened, read, or closed
 */
public TestCRFPipe(String trainingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();

    PrintWriter out = new PrintWriter("test.out");
    try {
        int[][] conjunctions = new int[3][];
        conjunctions[0] = new int[] { -1 };
        conjunctions[1] = new int[] { 1 };
        conjunctions[2] = new int[] { -2, -1 };

        pipes.add(new SimpleTaggerSentence2TokenSequence());
        //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
        //pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
        pipes.add(new OffsetConjunctions(conjunctions));
        pipes.add(new TokenTextCharSuffix("C1=", 1));
        pipes.add(new TokenTextCharSuffix("C2=", 2));
        pipes.add(new TokenTextCharSuffix("C3=", 3));
        pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
        pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
        pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
        pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
        pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
        pipes.add(new TokenSequence2FeatureVectorSequence());
        pipes.add(new SequencePrintingPipe(out));

        Pipe pipe = new SerialPipes(pipes);
        InstanceList trainingInstances = new InstanceList(pipe);

        // Open the raw file first so it is still closed if the gzip header is invalid.
        FileInputStream fileIn = new FileInputStream(trainingFilename);
        try {
            // NOTE(review): decodes with the platform default charset, as the original did.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(fileIn)));
            try {
                trainingInstances.addThruPipe(
                        new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
            } finally {
                reader.close();
            }
        } finally {
            fileIn.close();
        }
    } finally {
        out.close();
    }
}
/**
 * Builds a feature-extraction pipeline and runs the gzip-compressed training
 * file through it.
 *
 * <p>A {@code PrintWriter} on the file {@code test.out} is handed to the final
 * {@code SequencePrintingPipe} stage (presumably the sink for the printed
 * sequences -- confirm against that pipe). Instances are read from
 * {@code trainingFilename} as line groups delimited by lines matching
 * {@code ^\s*$}.
 *
 * <p>The writer and the input streams are released in {@code finally} blocks so
 * that a failure while reading or piping does not leak file handles.
 *
 * @param trainingFilename path of the gzip-compressed training data
 * @throws IOException if {@code test.out} cannot be created or the training
 *         file cannot be opened, read, or closed
 */
public TestCRFPipe(String trainingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();

    PrintWriter out = new PrintWriter("test.out");
    try {
        int[][] conjunctions = new int[3][];
        conjunctions[0] = new int[] { -1 };
        conjunctions[1] = new int[] { 1 };
        conjunctions[2] = new int[] { -2, -1 };

        pipes.add(new SimpleTaggerSentence2TokenSequence());
        //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
        //pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
        pipes.add(new OffsetConjunctions(conjunctions));
        pipes.add(new TokenTextCharSuffix("C1=", 1));
        pipes.add(new TokenTextCharSuffix("C2=", 2));
        pipes.add(new TokenTextCharSuffix("C3=", 3));
        pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
        pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
        pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
        pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
        pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
        pipes.add(new TokenSequence2FeatureVectorSequence());
        pipes.add(new SequencePrintingPipe(out));

        Pipe pipe = new SerialPipes(pipes);
        InstanceList trainingInstances = new InstanceList(pipe);

        // Open the raw file first so it is still closed if the gzip header is invalid.
        FileInputStream fileIn = new FileInputStream(trainingFilename);
        try {
            // NOTE(review): decodes with the platform default charset, as the original did.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(fileIn)));
            try {
                trainingInstances.addThruPipe(
                        new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
            } finally {
                reader.close();
            }
        } finally {
            fileIn.close();
        }
    } finally {
        out.close();
    }
}
new TokenSequence2FeatureVectorSequence (true, true), });
new TokenSequence2FeatureVectorSequence (true, true), });
new TokenSequence2FeatureVectorSequence (true, true), });
private static Pipe makePipe() { Alphabet alpha = new Alphabet(); JointInputToTokenSequence inputPipe = new JointInputToTokenSequence(alpha, new LabelAlphabet(), new LabelAlphabet()); return new SerialPipes(ImmutableList.of( inputPipe, new TokenSequenceLowercase(), // make all lowercase new NeighborTokenFeature(true, makeNeighbors()), // grab neighboring graphemes new NeighborShapeFeature(true, makeShapeNeighs()), new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, true) )); }
private static Pipe makePipe() { Alphabet alpha = new Alphabet(); Target2LabelSequence labelPipe = new Target2LabelSequence(); LabelAlphabet labelAlpha = (LabelAlphabet) labelPipe.getTargetAlphabet(); return new SerialPipes(ImmutableList.of( new StringListToTokenSequence(alpha, labelAlpha), // convert to token sequence new TokenSequenceLowercase(), // make all lowercase new NeighborTokenFeature(true, makeNeighbors()), // grab neighboring graphemes new NeighborShapeFeature(true, makeShapeNeighs()), new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, true), labelPipe, new LabelSequenceToLabelsAssignment(alpha, labelAlpha) )); }
private Pipe makePipe() { Alphabet alpha = new Alphabet(); Target2Label labelPipe = new Target2Label(); LabelAlphabet labelAlpha = (LabelAlphabet) labelPipe.getTargetAlphabet(); return new SerialPipes(ImmutableList.of( new AlignToStressPipe(alpha, labelAlpha, ImmutableList.<StressFeature>of() ), // convert to token sequence new TokenSequenceLowercase(), // make all lowercase new NeighborTokenFeature(true, makeNeighbors()), // grab neighboring graphemes new SurroundingTokenFeature(false), new SurroundingTokenFeature(true), new NeighborShapeFeature(true, makeShapeNeighs()), new LeadingTrailingFeature(), new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, false), labelPipe )); }
private Pipe makePipe() { Alphabet alpha = new Alphabet(); Target2LabelSequence labelPipe = new Target2LabelSequence(); LabelAlphabet labelAlpha = (LabelAlphabet) labelPipe.getTargetAlphabet(); return new SerialPipes(ImmutableList.of( new StringListToTokenSequence(alpha, labelAlpha), // convert to token sequence new TokenSequenceLowercase(), // make all lowercase new NeighborTokenFeature(true, makeNeighbors()), // grab neighboring graphemes new SurroundingTokenFeature(false), // new SurroundingTokenFeature(true), new NeighborShapeFeature(true, makeShapeNeighs()), new LeadingTrailingFeature(), new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, true), labelPipe )); }
private Pipe makePipe() { Alphabet alpha = new Alphabet(); Target2LabelSequence labelPipe = new Target2LabelSequence(); LabelAlphabet labelAlpha = (LabelAlphabet) labelPipe.getTargetAlphabet(); return new SerialPipes(ImmutableList.of( new StringListToTokenSequence(alpha, labelAlpha), // convert to token sequence new TokenSequenceLowercase(), // make all lowercase new NeighborTokenFeature(true, makeNeighbors()), // grab neighboring graphemes new SurroundingTokenFeature(false), new SurroundingTokenFeature(true), new NeighborShapeFeature(true, makeShapeNeighs()), new LeadingTrailingFeature(), new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, false), labelPipe )); }
private Pipe makePipe() { Alphabet alpha = new Alphabet(); Target2LabelSequence labelPipe = new Target2LabelSequence(); LabelAlphabet labelAlpha = (LabelAlphabet) labelPipe.getTargetAlphabet(); return new SerialPipes(ImmutableList.of( new StringListToTokenSequence(alpha, labelAlpha), // convert to token sequence new TokenSequenceLowercase(), // make all lowercase new NeighborTokenFeature(true, makeNeighbors()), // grab neighboring graphemes new SurroundingTokenFeature(false), new SurroundingTokenFeature(true), new NeighborShapeFeature(true, makeShapeNeighs()), new LeadingTrailingFeature(), new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, false), labelPipe )); }
/**
 * Checks the optimizable value of two CRFs built over the {@code toy} data.
 *
 * <p>The first CRF gets its states from {@code addStatesForLabelsConnectedAsIn}
 * plus {@code addStartState}; the second is built with
 * {@code addOrderNStates} using order {@code 1} and {@code "A"} as the fourth
 * argument. Each value is compared to a fixed constant with tolerance
 * {@code 1e-4}.
 */
public void testStartState() {
    Pattern dataAndTarget = Pattern.compile("^(\\S+) (.*)");
    Pipe[] stages = {
        new LineGroupString2TokenSequence(),
        new TokenSequenceMatchDataAndTarget(dataAndTarget, 2, 1),
        new TokenSequenceParseFeatureString(false),
        new TokenText(),
        new TokenSequence2FeatureVectorSequence(true, false),
        new Target2LabelSequence(),
        new PrintInputAndTarget(),
    };
    Pipe pipeline = new SerialPipes(stages);

    InstanceList instances = new InstanceList(pipeline);
    instances.addThruPipe(
            new LineGroupIterator(new StringReader(toy), Pattern.compile("\n"), true));

    // First model: states connected as in the data, plus a start state.
    CRF connectedCrf = new CRF(pipeline, null);
    connectedCrf.print();
    connectedCrf.addStatesForLabelsConnectedAsIn(instances);
    connectedCrf.addStartState();
    CRFTrainerByLabelLikelihood connectedTrainer = new CRFTrainerByLabelLikelihood(connectedCrf);
    Optimizable.ByGradientValue connectedObjective = connectedTrainer.getOptimizableCRF(instances);
    assertEquals(-1.3862, connectedObjective.getValue(), 1e-4);

    // Second model: order-N states.
    CRF orderNCrf = new CRF(pipeline, null);
    orderNCrf.addOrderNStates(instances, new int[] { 1 }, null, "A", null, null, false);
    orderNCrf.print();
    CRFTrainerByLabelLikelihood orderNTrainer = new CRFTrainerByLabelLikelihood(orderNCrf);
    Optimizable.ByGradientValue orderNObjective = orderNTrainer.getOptimizableCRF(instances);
    assertEquals(-3.09104245335831, orderNObjective.getValue(), 1e-4);
}
/**
 * Checks the optimizable value of two CRFs built over the {@code toy} data.
 *
 * <p>The first CRF gets its states from {@code addStatesForLabelsConnectedAsIn}
 * plus {@code addStartState}; the second is built with
 * {@code addOrderNStates} using order {@code 1} and {@code "A"} as the fourth
 * argument. Each value is compared to a fixed constant with tolerance
 * {@code 1e-4}.
 */
public void testStartState() {
    Pattern dataAndTarget = Pattern.compile("^(\\S+) (.*)");
    Pipe[] stages = {
        new LineGroupString2TokenSequence(),
        new TokenSequenceMatchDataAndTarget(dataAndTarget, 2, 1),
        new TokenSequenceParseFeatureString(false),
        new TokenText(),
        new TokenSequence2FeatureVectorSequence(true, false),
        new Target2LabelSequence(),
        new PrintInputAndTarget(),
    };
    Pipe pipeline = new SerialPipes(stages);

    InstanceList instances = new InstanceList(pipeline);
    instances.addThruPipe(
            new LineGroupIterator(new StringReader(toy), Pattern.compile("\n"), true));

    // First model: states connected as in the data, plus a start state.
    CRF connectedCrf = new CRF(pipeline, null);
    connectedCrf.print();
    connectedCrf.addStatesForLabelsConnectedAsIn(instances);
    connectedCrf.addStartState();
    CRFTrainerByLabelLikelihood connectedTrainer = new CRFTrainerByLabelLikelihood(connectedCrf);
    Optimizable.ByGradientValue connectedObjective = connectedTrainer.getOptimizableCRF(instances);
    assertEquals(-1.3862, connectedObjective.getValue(), 1e-4);

    // Second model: order-N states.
    CRF orderNCrf = new CRF(pipeline, null);
    orderNCrf.addOrderNStates(instances, new int[] { 1 }, null, "A", null, null, false);
    orderNCrf.print();
    CRFTrainerByLabelLikelihood orderNTrainer = new CRFTrainerByLabelLikelihood(orderNCrf);
    Optimizable.ByGradientValue orderNObjective = orderNTrainer.getOptimizableCRF(instances);
    assertEquals(-3.09104245335831, orderNObjective.getValue(), 1e-4);
}
private Pipe makePipe() { Alphabet alpha = new Alphabet(); Target2LabelSequence labelPipe = new Target2LabelSequence(); LabelAlphabet labelAlpha = (LabelAlphabet) labelPipe.getTargetAlphabet(); return new SerialPipes(ImmutableList.of( new SWordConverterPipe(), new StringListToTokenSequence(alpha, labelAlpha), // convert to token sequence new TokenSequenceLowercase(), // make all lowercase new PhoneNeighborPipe(true, makeNeighbors()), // grab neighboring graphemes new PhoneClassPipe(true, makeClassNeighbors()), new VowelNeighborPipe(), // new SurroundingTokenFeature(false), // new SurroundingTokenFeature(true), // new NeighborShapeFeature(true, makeShapeNeighs()), new IsFirstPipe(), new ThisPhoneClassPipe(), // new AppendEndPipe(), // right before TS2F to get text set, last not to mess w neighbors new TokenSequenceToFeature(), // convert the strings in the text to features new TokenSequence2FeatureVectorSequence(alpha, true, false), labelPipe )); }
/**
 * Builds a CRF over the single instance {@code "ABCDE"}, sets parameter
 * {@code i} of its optimizable to the value {@code i}, and prints the CRF.
 *
 * <p>Contains no assertions; it only exercises {@code CRF.print()}.
 */
public void testPrint() {
    Pipe[] stages = {
        new CharSequence2TokenSequence("."),
        new TokenText(),
        new TestCRFTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(),
    };
    Pipe pipeline = new SerialPipes(stages);

    InstanceList single = new InstanceList(pipeline);
    String[] inputs = { "ABCDE", };
    single.addThruPipe(new ArrayIterator(inputs));

    CRF model = new CRF(pipeline, null);
    model.addFullyConnectedStatesForThreeQuarterLabels(single);
    CRFTrainerByLabelLikelihood trainer = new CRFTrainerByLabelLikelihood(model);
    model.setWeightsDimensionAsIn(single, false);
    Optimizable objective = trainer.getOptimizableCRF(single);

    // Give every parameter its own index as its value.
    double[] weights = new double[objective.getNumParameters()];
    int k = 0;
    while (k < weights.length) {
        weights[k] = k;
        k++;
    }
    objective.setParameters(weights);

    model.print();
}
/**
 * Builds a CRF over the single instance {@code "ABCDE"}, sets parameter
 * {@code i} of its optimizable to the value {@code i}, and prints the CRF.
 *
 * <p>Contains no assertions; it only exercises {@code CRF.print()}.
 */
public void testPrint() {
    Pipe[] stages = {
        new CharSequence2TokenSequence("."),
        new TokenText(),
        new TestCRFTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(),
    };
    Pipe pipeline = new SerialPipes(stages);

    InstanceList single = new InstanceList(pipeline);
    String[] inputs = { "ABCDE", };
    single.addThruPipe(new ArrayIterator(inputs));

    CRF model = new CRF(pipeline, null);
    model.addFullyConnectedStatesForThreeQuarterLabels(single);
    CRFTrainerByLabelLikelihood trainer = new CRFTrainerByLabelLikelihood(model);
    model.setWeightsDimensionAsIn(single, false);
    Optimizable objective = trainer.getOptimizableCRF(single);

    // Give every parameter its own index as its value.
    double[] weights = new double[objective.getNumParameters()];
    int k = 0;
    while (k < weights.length) {
        weights[k] = k;
        k++;
    }
    objective.setParameters(weights);

    model.print();
}
/**
 * Builds a MEMM over the single instance {@code "ABCDE"}, sets parameter
 * {@code i} of its optimizable to the value {@code i}, and prints the model.
 *
 * <p>Contains no assertions. The {@code disabled} name prefix presumably keeps
 * a name-based test runner from picking it up -- confirm against the runner in
 * use.
 */
public void disabledtestPrint () {
    Pipe[] stages = {
        new CharSequence2TokenSequence("."),
        new TokenText(),
        new TestMEMM.TestMEMMTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(),
    };
    Pipe pipeline = new SerialPipes(stages);

    InstanceList single = new InstanceList(pipeline);
    String[] inputs = { "ABCDE", };
    single.addThruPipe(new ArrayIterator(inputs));

    MEMM model = new MEMM(pipeline, null);
    model.addFullyConnectedStatesForLabels();
    model.setWeightsDimensionAsIn(single);
    MEMMTrainer trainer = new MEMMTrainer(model);
    MEMMTrainer.MEMMOptimizableByLabelLikelihood objective = trainer.getOptimizableMEMM(single);

    // Give every parameter its own index as its value.
    double[] weights = new double[objective.getNumParameters()];
    int k = 0;
    while (k < weights.length) {
        weights[k] = k;
        k++;
    }
    objective.setParameters(weights);

    model.print();
}
/**
 * Builds a MEMM over the single instance {@code "ABCDE"}, sets parameter
 * {@code i} of its optimizable to the value {@code i}, and prints the model.
 *
 * <p>Contains no assertions. The {@code disabled} name prefix presumably keeps
 * a name-based test runner from picking it up -- confirm against the runner in
 * use.
 */
public void disabledtestPrint () {
    Pipe[] stages = {
        new CharSequence2TokenSequence("."),
        new TokenText(),
        new TestMEMM.TestMEMMTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(),
    };
    Pipe pipeline = new SerialPipes(stages);

    InstanceList single = new InstanceList(pipeline);
    String[] inputs = { "ABCDE", };
    single.addThruPipe(new ArrayIterator(inputs));

    MEMM model = new MEMM(pipeline, null);
    model.addFullyConnectedStatesForLabels();
    model.setWeightsDimensionAsIn(single);
    MEMMTrainer trainer = new MEMMTrainer(model);
    MEMMTrainer.MEMMOptimizableByLabelLikelihood objective = trainer.getOptimizableMEMM(single);

    // Give every parameter its own index as its value.
    double[] weights = new double[objective.getNumParameters()];
    int k = 0;
    while (k < weights.length) {
        weights[k] = k;
        k++;
    }
    objective.setParameters(weights);

    model.print();
}