private Pipe buildPipe() { Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S"); int[] sizes = {1,2}; ArrayList pipeList = new ArrayList(); pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list pipeList.add(new TokenSequenceNGramsDelim(sizes, " ")); pipeList.add(new TokenSequence2FeatureSequence()); return new SerialPipes(pipeList); }
/**
 * Construct topic model pipe with given stopwords and alphabet.
 *
 * <p>Hands four pipes to the superclass constructor, in order: {@code CharSequenceLowercase},
 * {@code CharSequence2TokenSequence}, {@code RemoveStopwords} and
 * {@code TokenSequence2FeatureSequence}.
 *
 * @param stopwords to be removed; passed to {@code RemoveStopwords}
 * @param alphabet to use; passed to {@code TokenSequence2FeatureSequence}
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            // Token regex: a letter, then one or more letters or punctuation marks, then a
            // letter -- so every token is at least three characters long.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
/**
 * Construct topic model pipe with given stopwords and alphabet.
 *
 * <p>Hands four pipes to the superclass constructor, in order: {@code CharSequenceLowercase},
 * {@code CharSequence2TokenSequence}, {@code RemoveStopwords} and
 * {@code TokenSequence2FeatureSequence}.
 *
 * @param stopwords to be removed; passed to {@code RemoveStopwords}
 * @param alphabet to use; passed to {@code TokenSequence2FeatureSequence}
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            // Token regex: a letter, then one or more letters or punctuation marks, then a
            // letter -- so every token is at least three characters long.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
}
/**
 * Construct classifier pipe from an initial pipe and the given stopwords.
 *
 * <p>Hands six pipes to the superclass constructor, in order: the supplied {@code pipe},
 * {@code CharSequenceLowercase}, {@code CharSequence2TokenSequence}, {@code RemoveStopwords},
 * {@code TokenSequence2FeatureSequence} and {@code FeatureSequence2FeatureVector}.
 *
 * @param pipe initial pipe, placed first in the sequence
 * @param stopwords to be removed; passed to {@code RemoveStopwords}
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            // Token regex: a letter, then one or more letters or punctuation marks, then a
            // letter -- so every token is at least three characters long.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Command-line driver. For every argument, wraps it in a {@code File}-backed {@code Instance},
 * sends that instance through {@code Input2CharSequence} followed by a
 * {@code CharSequence2TokenSequence} built on a {@code CharSequenceLexer}, and prints a
 * {@code ===} separator, the argument itself and the resulting token sequence to standard output.
 *
 * <p>Any exception stops the loop; it is printed to standard output and its stack trace is dumped.
 *
 * @param args paths of the files to process
 */
public static void main (String[] args) {
    try {
        for (String path : args) {
            Instance input = new Instance (new File (path), null, null, null);

            // A fresh two-stage pipeline is built for each argument.
            Pipe[] stages = {
                new Input2CharSequence (),
                new CharSequence2TokenSequence (new CharSequenceLexer ())
            };
            SerialPipes pipeline = new SerialPipes (stages);

            Instance output = pipeline.newIteratorFrom (new SingleInstanceIterator (input)).next ();
            TokenSequence tokens = (TokenSequence) output.getData ();

            System.out.println ("===");
            System.out.println (path);
            System.out.println (tokens.toString ());
        }
    } catch (Exception e) {
        System.out.println (e);
        e.printStackTrace ();
    }
}
/**
 * Command-line driver. For every argument, wraps it in a {@code File}-backed {@code Instance},
 * sends that instance through {@code Input2CharSequence} followed by a
 * {@code CharSequence2TokenSequence} built on a {@code CharSequenceLexer}, and prints a
 * {@code ===} separator, the argument itself and the resulting token sequence to standard output.
 *
 * <p>Any exception stops the loop; it is printed to standard output and its stack trace is dumped.
 *
 * @param args paths of the files to process
 */
public static void main (String[] args) {
    try {
        for (String path : args) {
            Instance input = new Instance (new File (path), null, null, null);

            // A fresh two-stage pipeline is built for each argument.
            Pipe[] stages = {
                new Input2CharSequence (),
                new CharSequence2TokenSequence (new CharSequenceLexer ())
            };
            SerialPipes pipeline = new SerialPipes (stages);

            Instance output = pipeline.newIteratorFrom (new SingleInstanceIterator (input)).next ();
            TokenSequence tokens = (TokenSequence) output.getData ();

            System.out.println ("===");
            System.out.println (path);
            System.out.println (tokens.toString ());
        }
    } catch (Exception e) {
        System.out.println (e);
        e.printStackTrace ();
    }
}
/**
 * Command-line driver. For every argument, wraps it in a {@code File}-backed {@code Instance},
 * sends that instance through {@code Input2CharSequence} followed by a
 * {@code CharSequence2TokenSequence} built on a {@code CharSequenceLexer}, and prints a
 * {@code ===} separator, the argument itself and the resulting token sequence to standard output.
 *
 * <p>Any exception stops the loop; it is printed to standard output and its stack trace is dumped.
 *
 * @param args paths of the files to process
 */
public static void main (String[] args) {
    try {
        for (String path : args) {
            Instance input = new Instance (new File (path), null, null, null);

            // A fresh two-stage pipeline is built for each argument.
            Pipe[] stages = {
                new Input2CharSequence (),
                new CharSequence2TokenSequence (new CharSequenceLexer ())
            };
            SerialPipes pipeline = new SerialPipes (stages);

            Instance output = pipeline.newIteratorFrom (new SingleInstanceIterator (input)).next ();
            TokenSequence tokens = (TokenSequence) output.getData ();

            System.out.println ("===");
            System.out.println (path);
            System.out.println (tokens.toString ());
        }
    } catch (Exception e) {
        System.out.println (e);
        e.printStackTrace ();
    }
}
/**
 * Creates the processing pipe: a {@code SerialPipes} over four stages, in order
 * {@code CharSequence2TokenSequence}, {@code TokenSequenceLowercase},
 * {@code TokenSequence2FeatureSequence} and {@code FeatureSequence2FeatureVector},
 * each built with its no-argument constructor.
 *
 * @return the newly built serial pipe
 */
public Pipe createPipe () {
    Pipe[] stages = {
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    return new SerialPipes (stages);
}
/**
 * Construct classifier pipe from an initial pipe and the given stopwords.
 *
 * <p>Hands six pipes to the superclass constructor, in order: the supplied {@code pipe},
 * {@code CharSequenceLowercase}, {@code CharSequence2TokenSequence}, {@code RemoveStopwords},
 * {@code TokenSequence2FeatureSequence} and {@code FeatureSequence2FeatureVector}.
 *
 * @param pipe initial pipe, placed first in the sequence
 * @param stopwords to be removed; passed to {@code RemoveStopwords}
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            // Token regex: a letter, then one or more letters or punctuation marks, then a
            // letter -- so every token is at least three characters long.
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
}
/**
 * Creates the processing pipe: a {@code SerialPipes} over four stages, in order
 * {@code CharSequence2TokenSequence}, {@code TokenSequenceLowercase},
 * {@code TokenSequence2FeatureSequence} and {@code FeatureSequence2FeatureVector},
 * each built with its no-argument constructor.
 *
 * @return the newly built serial pipe
 */
public Pipe createPipe () {
    Pipe[] stages = {
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    return new SerialPipes (stages);
}
new SerialPipes (new Pipe[] { (Pipe) new TargetStringToFeatures(), (Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false),
new SerialPipes (new Pipe[] { (Pipe) new TargetStringToFeatures(), (Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false),
new SerialPipes (new Pipe[] { (Pipe) new TargetStringToFeatures(), (Pipe) new CharSequence2TokenSequence(), (Pipe) new TokenSequenceLowercase(), (Pipe) new TokenSequenceRemoveStopwords(false, false),
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
public static void main (String[] args) throws Exception { DBInstanceStore saver = new DBInstanceStore(args[0]); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Read data from File objects pipeList.add(new Input2CharSequence("UTF-8")); // Regular expression for what constitutes a token. // This pattern includes Unicode letters, Unicode numbers, // and the underscore character. Alternatives: // "\\S+" (anything not whitespace) // "\\w+" ( A-Z, a-z, 0-9, _ ) // "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers OR // a group of only punctuation marks) Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]*\\p{L}"); // Tokenize raw strings pipeList.add(new CharSequence2TokenSequence(tokenPattern)); pipeList.add(new TokenSequence2FeatureSequence()); CsvIterator reader = new CsvIterator(new FileReader(new File(args[1])), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1); Pipe serialPipe = new SerialPipes(pipeList); Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader); saver.saveInstances(iterator); saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet()); saver.cleanup(); }
/**
 * Builds an {@code InstanceList} on a six-stage {@code SerialPipes} ({@code Target2Label},
 * {@code CharSequence2TokenSequence}, {@code TokenSequenceLowercase},
 * {@code TokenSequenceRemoveStopwords}, {@code TokenSequence2FeatureSequence},
 * {@code FeatureSequence2FeatureVector}) and feeds it a {@code FileIterator} created for the
 * path {@code foo/bar}. The test makes no assertions; it only checks that this runs.
 */
public void testThree () {
    Pipe[] stages = {
        new Target2Label (),
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequenceRemoveStopwords (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    InstanceList instances = new InstanceList (new SerialPipes (stages));

    File root = new File ("foo/bar");
    // Captures everything before the first '/' of the matched string.
    Pattern pathPattern = Pattern.compile ("^([^/]*)/");
    Iterator<Instance> files = new FileIterator (root, null, pathPattern);

    instances.addThruPipe (files);
}
/**
 * Builds an {@code InstanceList} on a six-stage {@code SerialPipes} ({@code Target2Label},
 * {@code CharSequence2TokenSequence}, {@code TokenSequenceLowercase},
 * {@code TokenSequenceRemoveStopwords}, {@code TokenSequence2FeatureSequence},
 * {@code FeatureSequence2FeatureVector}) and feeds it a {@code FileIterator} created for the
 * path {@code foo/bar}. The test makes no assertions; it only checks that this runs.
 */
public void testThree () {
    Pipe[] stages = {
        new Target2Label (),
        new CharSequence2TokenSequence (),
        new TokenSequenceLowercase (),
        new TokenSequenceRemoveStopwords (),
        new TokenSequence2FeatureSequence (),
        new FeatureSequence2FeatureVector ()
    };
    InstanceList instances = new InstanceList (new SerialPipes (stages));

    File root = new File ("foo/bar");
    // Captures everything before the first '/' of the matched string.
    Pattern pathPattern = Pattern.compile ("^([^/]*)/");
    Iterator<Instance> files = new FileIterator (root, null, pathPattern);

    instances.addThruPipe (files);
}
public void testSpacePipe () { Pipe p = new SerialPipes (new Pipe[] { new CharSequence2TokenSequence ("."), new TokenSequenceLowercase (), new TestCRF.TestCRFTokenSequenceRemoveSpaces (), new TokenText (), new OffsetConjunctions (false, new int[][] {{0}, {1},{-1,0},{0,1}, {-2,-1,0}, {0,1,2}, {-3,-2,-1}, {1,2,3}, }), new PrintInputAndTarget(), }); // Print to a string ByteArrayOutputStream out = new ByteArrayOutputStream (); PrintStream oldOut = System.out; System.setOut (new PrintStream (out)); InstanceList lst = new InstanceList (p); lst.addThruPipe (new ArrayIterator (new String[] { TestCRF.data[0], TestCRF.data[1], })); System.setOut (oldOut); assertEquals (spacePipeOutput, out.toString()); }
public void testSpacePipe () { Pipe p = new SerialPipes (new Pipe[] { new CharSequence2TokenSequence ("."), new TokenSequenceLowercase (), new TestCRF.TestCRFTokenSequenceRemoveSpaces (), new TokenText (), new OffsetConjunctions (false, new int[][] {{0}, {1},{-1,0},{0,1}, {-2,-1,0}, {0,1,2}, {-3,-2,-1}, {1,2,3}, }), new PrintInputAndTarget(), }); // Print to a string ByteArrayOutputStream out = new ByteArrayOutputStream (); PrintStream oldOut = System.out; System.setOut (new PrintStream (out)); InstanceList lst = new InstanceList (p); lst.addThruPipe (new ArrayIterator (new String[] { TestCRF.data[0], TestCRF.data[1], })); System.setOut (oldOut); assertEquals (spacePipeOutput, out.toString()); }