private Pipe buildPipe() { Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S"); int[] sizes = {1,2}; ArrayList pipeList = new ArrayList(); pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list pipeList.add(new TokenSequenceNGramsDelim(sizes, " ")); pipeList.add(new TokenSequence2FeatureSequence()); return new SerialPipes(pipeList); }
/**
 * Construct topic model pipe with given stopwords and alphabet.
 *
 * <p>Input is lowercased, tokenized, stripped of the given stopwords and
 * converted to a feature sequence backed by {@code alphabet}.
 *
 * @param stopwords to be removed
 * @param alphabet alphabet handed to the feature-sequence stage
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            // Token: a letter, one or more letters/punctuation marks, then a letter.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
/**
 * Construct topic model pipe with given stopwords and alphabet.
 *
 * <p>Input is lowercased, tokenized, stripped of the given stopwords and
 * converted to a feature sequence backed by {@code alphabet}.
 *
 * @param stopwords to be removed
 * @param alphabet alphabet handed to the feature-sequence stage
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            // Token: a letter, one or more letters/punctuation marks, then a letter.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
/**
 * Construct classifier pipe from an initial pipe and the given stopwords.
 *
 * <p>The initial pipe runs first; its output is then lowercased, tokenized,
 * stripped of the given stopwords, and converted to a feature sequence and
 * finally a feature vector.
 *
 * @param pipe initial pipe, placed at the head of the chain
 * @param stopwords to be removed
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            // Token: a letter, one or more letters/punctuation marks, then a letter.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Command-line driver: for every path in {@code args}, runs the file through
 * {@code Input2CharSequence} and a {@code CharSequence2TokenSequence} built
 * on a default {@code CharSequenceLexer}, then prints a separator, the path
 * and the resulting token sequence to standard output.
 *
 * <p>Any exception is printed to standard output and its stack trace to
 * standard error; processing stops at the first failure.
 *
 * @param args paths of the files to tokenize
 */
public static void main (String[] args) {
    try {
        for (String path : args) {
            Instance input = new Instance (new File (path), null, null, null);
            Pipe[] stages = {
                new Input2CharSequence (),
                new CharSequence2TokenSequence (new CharSequenceLexer ())
            };
            SerialPipes pipeline = new SerialPipes (stages);
            SingleInstanceIterator source = new SingleInstanceIterator (input);
            Instance processed = pipeline.newIteratorFrom (source).next ();
            TokenSequence tokens = (TokenSequence) processed.getData ();

            System.out.println ("===");
            System.out.println (path);
            System.out.println (tokens.toString ());
        }
    } catch (Exception failure) {
        System.out.println (failure);
        failure.printStackTrace ();
    }
}
/**
 * Command-line driver: for every path in {@code args}, runs the file through
 * {@code Input2CharSequence} and a {@code CharSequence2TokenSequence} built
 * on a default {@code CharSequenceLexer}, then prints a separator, the path
 * and the resulting token sequence to standard output.
 *
 * <p>Any exception is printed to standard output and its stack trace to
 * standard error; processing stops at the first failure.
 *
 * @param args paths of the files to tokenize
 */
public static void main (String[] args) {
    try {
        for (String path : args) {
            Instance input = new Instance (new File (path), null, null, null);
            Pipe[] stages = {
                new Input2CharSequence (),
                new CharSequence2TokenSequence (new CharSequenceLexer ())
            };
            SerialPipes pipeline = new SerialPipes (stages);
            SingleInstanceIterator source = new SingleInstanceIterator (input);
            Instance processed = pipeline.newIteratorFrom (source).next ();
            TokenSequence tokens = (TokenSequence) processed.getData ();

            System.out.println ("===");
            System.out.println (path);
            System.out.println (tokens.toString ());
        }
    } catch (Exception failure) {
        System.out.println (failure);
        failure.printStackTrace ();
    }
}
/**
 * Command-line driver: for every path in {@code args}, runs the file through
 * {@code Input2CharSequence} and a {@code CharSequence2TokenSequence} built
 * on a default {@code CharSequenceLexer}, then prints a separator, the path
 * and the resulting token sequence to standard output.
 *
 * <p>Any exception is printed to standard output and its stack trace to
 * standard error; processing stops at the first failure.
 *
 * @param args paths of the files to tokenize
 */
public static void main (String[] args) {
    try {
        for (String path : args) {
            Instance input = new Instance (new File (path), null, null, null);
            Pipe[] stages = {
                new Input2CharSequence (),
                new CharSequence2TokenSequence (new CharSequenceLexer ())
            };
            SerialPipes pipeline = new SerialPipes (stages);
            SingleInstanceIterator source = new SingleInstanceIterator (input);
            Instance processed = pipeline.newIteratorFrom (source).next ();
            TokenSequence tokens = (TokenSequence) processed.getData ();

            System.out.println ("===");
            System.out.println (path);
            System.out.println (tokens.toString ());
        }
    } catch (Exception failure) {
        System.out.println (failure);
        failure.printStackTrace ();
    }
}
/**
 * Assembles a serial pipe of four stages, applied in order:
 * {@code CharSequence2TokenSequence} (default construction),
 * {@code TokenSequenceLowercase}, {@code TokenSequence2FeatureSequence}
 * and {@code FeatureSequence2FeatureVector}.
 *
 * @return a new {@code SerialPipes} wrapping those stages
 */
public Pipe createPipe () {
    Pipe[] stages = {
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    return new SerialPipes (stages);
}
/**
 * Construct classifier pipe from an initial pipe and the given stopwords.
 *
 * <p>The initial pipe runs first; its output is then lowercased, tokenized,
 * stripped of the given stopwords, and converted to a feature sequence and
 * finally a feature vector.
 *
 * @param pipe initial pipe, placed at the head of the chain
 * @param stopwords to be removed
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            // Token: a letter, one or more letters/punctuation marks, then a letter.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Assembles a serial pipe of four stages, applied in order:
 * {@code CharSequence2TokenSequence} (default construction),
 * {@code TokenSequenceLowercase}, {@code TokenSequence2FeatureSequence}
 * and {@code FeatureSequence2FeatureVector}.
 *
 * @return a new {@code SerialPipes} wrapping those stages
 */
public Pipe createPipe () {
    Pipe[] stages = {
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    return new SerialPipes (stages);
}
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
/**
 * Builds an {@code InstanceList} over a six-stage serial pipe
 * ({@code Target2Label} through {@code FeatureSequence2FeatureVector}) and
 * feeds it from a {@code FileIterator} rooted at {@code foo/bar}, using the
 * pattern {@code ^([^/]*)/} as the iterator's third argument.
 */
public void testThree () {
    Pipe[] stages = {
        new Target2Label (),
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequenceRemoveStopwords (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    SerialPipes pipeline = new SerialPipes (stages);
    InstanceList il = new InstanceList (pipeline);

    File root = new File ("foo/bar");
    Pattern firstPathSegment = Pattern.compile ("^([^/]*)/");
    Iterator<Instance> pi = new FileIterator (root, null, firstPathSegment);
    il.addThruPipe (pi);
}
/**
 * Builds an {@code InstanceList} over a six-stage serial pipe
 * ({@code Target2Label} through {@code FeatureSequence2FeatureVector}) and
 * feeds it from a {@code FileIterator} rooted at {@code foo/bar}, using the
 * pattern {@code ^([^/]*)/} as the iterator's third argument.
 */
public void testThree () {
    Pipe[] stages = {
        new Target2Label (),
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequenceRemoveStopwords (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    SerialPipes pipeline = new SerialPipes (stages);
    InstanceList il = new InstanceList (pipeline);

    File root = new File ("foo/bar");
    Pattern firstPathSegment = Pattern.compile ("^([^/]*)/");
    Iterator<Instance> pi = new FileIterator (root, null, firstPathSegment);
    il.addThruPipe (pi);
}
public void testSpacePipe () { Pipe p = new SerialPipes (new Pipe[] { new CharSequence2TokenSequence ("."), new TokenSequenceLowercase (), new TestCRF.TestCRFTokenSequenceRemoveSpaces (), new TokenText (), new OffsetConjunctions (false, new int[][] {{0}, {1},{-1,0},{0,1}, {-2,-1,0}, {0,1,2}, {-3,-2,-1}, {1,2,3}, }), new PrintInputAndTarget(), }); // Print to a string ByteArrayOutputStream out = new ByteArrayOutputStream (); PrintStream oldOut = System.out; System.setOut (new PrintStream (out)); InstanceList lst = new InstanceList (p); lst.addThruPipe (new ArrayIterator (new String[] { TestCRF.data[0], TestCRF.data[1], })); System.setOut (oldOut); assertEquals (spacePipeOutput, out.toString()); }
public static Pipe makeSpacePredictionPipe () { Pipe p = new SerialPipes(new Pipe[]{ new CharSequence2TokenSequence("."), new TokenSequenceLowercase(), new TestMEMMTokenSequenceRemoveSpaces(), new TokenText(), new OffsetConjunctions(true, new int[][]{//{0}, /*{1},{-1,0},{0,1}, */ {1}, {-1, 0}, {0, 1}, // {-2, -1, 0}, {0, 1, 2}, {-3, -2, -1}, {1, 2, 3}, //{-2,-1}, {-1,0}, {0,1}, {1,2}, //{-3,-2,-1}, {-2,-1,0}, {-1,0,1}, {0,1,2}, {1,2,3}, }), // new PrintInputAndTarget(), new TokenSequence2FeatureVectorSequence() }); return p; }
public static Pipe makeSpacePredictionPipe () { Pipe p = new SerialPipes(new Pipe[]{ new CharSequence2TokenSequence("."), new TokenSequenceLowercase(), new TestMEMMTokenSequenceRemoveSpaces(), new TokenText(), new OffsetConjunctions(true, new int[][]{//{0}, /*{1},{-1,0},{0,1}, */ {1}, {-1, 0}, {0, 1}, // {-2, -1, 0}, {0, 1, 2}, {-3, -2, -1}, {1, 2, 3}, //{-2,-1}, {-1,0}, {0,1}, {1,2}, //{-3,-2,-1}, {-2,-1,0}, {-1,0,1}, {0,1,2}, {1,2,3}, }), // new PrintInputAndTarget(), new TokenSequence2FeatureVectorSequence() }); return p; }
/**
 * Pushes {@code data} through a pipe that tokenizes, lowercases, applies five
 * {@code RegexMatches} features (vowel, firsthalf, secondhalf, length2,
 * length3), prints the input and splits the token sequence into token
 * instances, then checks that exactly 19 instances were produced.
 */
public void testTwo () {
    Pipe[] stages = {
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new RegexMatches ("vowel", Pattern.compile ("[aeiou]")),
        new RegexMatches ("firsthalf", Pattern.compile ("[a-m]")),
        new RegexMatches ("secondhalf", Pattern.compile ("[n-z]")),
        new RegexMatches ("length2", Pattern.compile ("..")),
        new RegexMatches ("length3", Pattern.compile ("...")),
        new PrintInput (),
        new TokenSequence2TokenInstances ()
    };
    Pipe p = new SerialPipes (stages);

    InstanceList ilist = new InstanceList (p);
    ilist.addThruPipe (new StringArrayIterator (data));

    // Checked twice on purpose: the Java assert (only active with -ea) carries
    // the actual size in its message, the JUnit assertion always runs.
    assert (ilist.size() == 19) : "list size = " + ilist.size();
    assertTrue (ilist.size() == 19);
}
/**
 * Pushes {@code data} through a pipe that tokenizes, lowercases, applies five
 * {@code RegexMatches} features (vowel, firsthalf, secondhalf, length2,
 * length3), prints the input and splits the token sequence into token
 * instances, then checks that exactly 19 instances were produced.
 */
public void testTwo () {
    Pipe[] stages = {
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new RegexMatches ("vowel", Pattern.compile ("[aeiou]")),
        new RegexMatches ("firsthalf", Pattern.compile ("[a-m]")),
        new RegexMatches ("secondhalf", Pattern.compile ("[n-z]")),
        new RegexMatches ("length2", Pattern.compile ("..")),
        new RegexMatches ("length3", Pattern.compile ("...")),
        new PrintInput (),
        new TokenSequence2TokenInstances ()
    };
    Pipe p = new SerialPipes (stages);

    InstanceList ilist = new InstanceList (p);
    ilist.addThruPipe (new StringArrayIterator (data));

    // Checked twice on purpose: the Java assert (only active with -ea) carries
    // the actual size in its message, the JUnit assertion always runs.
    assert (ilist.size() == 19) : "list size = " + ilist.size();
    assertTrue (ilist.size() == 19);
}
/**
 * Builds a small CRF over the single string {@code "ABCDE"}, sets every
 * parameter of its optimizable to that parameter's own index, and calls
 * {@code crf.print()}.
 */
public void testPrint() {
    Pipe[] stages = {
        // The pattern "." makes every single character its own token.
        new CharSequence2TokenSequence("."),
        new TokenText(),
        new TestCRFTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(),
    };
    Pipe p = new SerialPipes(stages);

    InstanceList one = new InstanceList(p);
    String[] data = { "ABCDE" };
    one.addThruPipe(new ArrayIterator(data));

    CRF crf = new CRF(p, null);
    crf.addFullyConnectedStatesForThreeQuarterLabels(one);
    CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood(crf);
    crf.setWeightsDimensionAsIn(one, false);
    Optimizable mcrf = crft.getOptimizableCRF(one);

    // Parameter k receives the value k, giving each one a distinct value.
    double[] params = new double[mcrf.getNumParameters()];
    int k = 0;
    while (k < params.length) {
        params[k] = k;
        k++;
    }
    mcrf.setParameters(params);

    crf.print();
}
/**
 * Builds a small CRF over the single string {@code "ABCDE"}, sets every
 * parameter of its optimizable to that parameter's own index, and calls
 * {@code crf.print()}.
 */
public void testPrint() {
    Pipe[] stages = {
        // The pattern "." makes every single character its own token.
        new CharSequence2TokenSequence("."),
        new TokenText(),
        new TestCRFTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(),
    };
    Pipe p = new SerialPipes(stages);

    InstanceList one = new InstanceList(p);
    String[] data = { "ABCDE" };
    one.addThruPipe(new ArrayIterator(data));

    CRF crf = new CRF(p, null);
    crf.addFullyConnectedStatesForThreeQuarterLabels(one);
    CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood(crf);
    crf.setWeightsDimensionAsIn(one, false);
    Optimizable mcrf = crft.getOptimizableCRF(one);

    // Parameter k receives the value k, giving each one a distinct value.
    double[] params = new double[mcrf.getNumParameters()];
    int k = 0;
    while (k < params.length) {
        params[k] = k;
        k++;
    }
    mcrf.setParameters(params);

    crf.print();
}