/**
 * Creates a {@link TokenSample} stream: parses are reduced to POS samples
 * and then detokenized back into token samples.
 *
 * @param args command line style arguments; parsed into {@link Parameters}
 * @return a stream of token samples derived from the parse stream
 */
public ObjectStream<TokenSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  // NOTE(review): the filter below uses WordTagSampleStreamFactory.Parameters
  // while this factory parses its own Parameters above — looks like a
  // copy-paste from another factory; confirm the intended parameter class.
  ObjectStream<Parse> parseSampleStream = StreamFactoryRegistry.getFactory(Parse.class,
      StreamFactoryRegistry.DEFAULT_FORMAT).create(
      ArgumentParser.filter(args, WordTagSampleStreamFactory.Parameters.class));

  return new POSToTokenSampleStream(createDetokenizer(params),
      new ParseToPOSSampleStream(parseSampleStream));
}
/**
 * Creates a {@link SentenceSample} stream: parses are reduced to POS samples
 * which are then detokenized and grouped into sentence sample documents.
 *
 * @param args command line style arguments; parsed into {@link Parameters}
 * @return a stream of sentence samples derived from the parse stream
 */
public ObjectStream<SentenceSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  ObjectStream<Parse> parses =
      StreamFactoryRegistry.getFactory(Parse.class, StreamFactoryRegistry.DEFAULT_FORMAT)
          .create(ArgumentParser.filter(args, ParseSampleStreamFactory.Parameters.class));

  // 30 POS samples are combined into one sentence sample document.
  return new POSToSentenceSampleStream(createDetokenizer(params),
      new ParseToPOSSampleStream(parses), 30);
}
/** Registers this factory for the "parse" format under {@link TokenSample}. */
public static void registerFactory() {
  ParseToTokenSampleStreamFactory factory = new ParseToTokenSampleStreamFactory();
  StreamFactoryRegistry.registerFactory(TokenSample.class, "parse", factory);
}
}
/**
 * Creates a {@link NameSample} stream over the OntoNotes corpus.
 *
 * @param args command line style arguments; parsed into
 *     {@link OntoNotesFormatParameters}
 * @return a stream of name samples read from the ".name" annotation files
 */
public ObjectStream<NameSample> create(String[] args) {
  OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class);

  // Recursively walk the OntoNotes directory, descending into sub-directories
  // and accepting only the ".name" annotation files.
  ObjectStream<File> documents = new DirectorySampleStream(
      new File(params.getOntoNotesDir()),
      file -> file.isDirectory() || (file.isFile() && file.getName().endsWith(".name")),
      true);

  return new OntoNotesNameSampleStream(
      new FileToStringSampleStream(documents, StandardCharsets.UTF_8));
}
/**
 * Creates a {@link POSSample} stream over the OntoNotes English corpus by
 * reading all ".parse" files, splitting them into lines and reducing the
 * parses to POS samples.
 *
 * @return a stream of POS samples
 * @throws IOException if the corpus files cannot be read
 */
private static ObjectStream<POSSample> createPOSSampleStream() throws IOException {
  // Recursive walk, accepting directories (to descend) and ".parse" files.
  ObjectStream<File> documents = new DirectorySampleStream(
      new File(getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
      file -> file.isDirectory() || (file.isFile() && file.getName().endsWith(".parse")),
      true);

  return new ParseToPOSSampleStream(new OntoNotesParseSampleStream(
      new DocumentToLineStream(
          new FileToStringSampleStream(documents, StandardCharsets.UTF_8))));
}
/**
 * Verifies that {@link FileToStringSampleStream} returns the content of the
 * files found in the wrapped directory stream, one file per {@code read()}.
 */
@Test
public void readFileTest() throws IOException {
  final String sentence1 = "This is a sentence.";
  final String sentence2 = "This is another sentence.";
  List<String> sentences = Arrays.asList(sentence1, sentence2);

  DirectorySampleStream directorySampleStream =
      new DirectorySampleStream(directory.getRoot(), null, false);

  // Write the fixture files with an explicit charset: the two-argument
  // FileUtils.writeStringToFile overload is deprecated and implicitly uses
  // the platform default charset. Passing Charset.defaultCharset() keeps the
  // written bytes identical while matching the charset used to read back.
  File tempFile1 = directory.newFile();
  FileUtils.writeStringToFile(tempFile1, sentence1, Charset.defaultCharset());
  File tempFile2 = directory.newFile();
  FileUtils.writeStringToFile(tempFile2, sentence2, Charset.defaultCharset());

  try (FileToStringSampleStream stream =
      new FileToStringSampleStream(directorySampleStream, Charset.defaultCharset())) {
    // Directory iteration order is not guaranteed, so only assert membership.
    String read = stream.read();
    Assert.assertTrue(sentences.contains(read));
    read = stream.read();
    Assert.assertTrue(sentences.contains(read));
  }
}
public SentenceSample read() throws IOException { List<String[]> sentences = new ArrayList<>(); T posSample; int chunks = 0; while ((posSample = samples.read()) != null && chunks < chunkSize) { sentences.add(toSentence(posSample)); chunks++; } if (sentences.size() > 0) { return new SentenceSample(detokenizer, sentences.toArray(new String[sentences.size()][])); } else if (posSample != null) { return read(); // filter out empty line } return null; // last sample was read } }
/**
 * Creates a {@link POSSample} stream by reducing the parses produced by the
 * wrapped parse sample stream factory to their part-of-speech tags.
 *
 * @param args command line style arguments, forwarded to the parse factory
 * @return a stream of POS samples
 */
public ObjectStream<POSSample> create(String[] args) {
  return new ParseToPOSSampleStream(parseSampleStreamFactory.create(args));
}
/**
 * Creates a {@link TokenSample} stream from AD formatted name samples by
 * detokenizing them.
 *
 * @param args command line style arguments; parsed into {@link Parameters}
 * @return a stream of token samples
 */
public ObjectStream<TokenSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  ObjectStream<NameSample> nameSamples =
      StreamFactoryRegistry.getFactory(NameSample.class, "ad")
          .create(ArgumentParser.filter(args, ADNameSampleStreamFactory.Parameters.class));

  return new NameToTokenSampleStream(createDetokenizer(params), nameSamples);
}
}
/**
 * Reads the next file from the underlying stream and returns its raw bytes.
 *
 * @return the file content, or {@code null} once the stream is exhausted
 * @throws IOException if the file cannot be read
 */
public byte[] read() throws IOException {
  File sampleFile = samples.read();
  return sampleFile == null ? null : readFile(sampleFile);
}
}
/** Registers this factory for the "parse" format under {@link POSSample}. */
public static void registerFactory() {
  ParseToPOSSampleStreamFactory factory = new ParseToPOSSampleStreamFactory();
  StreamFactoryRegistry.registerFactory(POSSample.class, "parse", factory);
}
}
/** Registers this factory for the "parse" format under {@link SentenceSample}. */
public static void registerFactory() {
  ParseToSentenceSampleStreamFactory factory = new ParseToSentenceSampleStreamFactory();
  StreamFactoryRegistry.registerFactory(SentenceSample.class, "parse", factory);
}
}
/**
 * Reads the next file from the underlying stream and returns its content
 * decoded with the configured encoding.
 *
 * @return the decoded file content, or {@code null} once exhausted
 * @throws IOException if the file cannot be read
 */
@Override
public String read() throws IOException {
  File sampleFile = samples.read();
  return sampleFile == null ? null : readFile(sampleFile, encoding);
}
/**
 * Creates a {@link Parse} stream over the Constit corpus: every file in the
 * data directory (non-recursive, no filter) is read as raw bytes and parsed.
 *
 * @param args command line style arguments; parsed into {@link Parameters}
 * @return a stream of parses
 */
public ObjectStream<Parse> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  ObjectStream<File> files = new DirectorySampleStream(params.getData(), null, false);
  return new ConstitParseSampleStream(new FileToByteArraySampleStream(files));
}
/**
 * Creates a {@link SentenceSample} stream from CoNLL-X POS samples by
 * detokenizing and grouping them into sentence sample documents.
 *
 * @param args command line style arguments; parsed into {@link Parameters}
 * @return a stream of sentence samples
 */
public ObjectStream<SentenceSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  ObjectStream<POSSample> posSamples = StreamFactoryRegistry
      .getFactory(POSSample.class, ConllXPOSSampleStreamFactory.CONLLX_FORMAT)
      .create(ArgumentParser.filter(args, ConllXPOSSampleStreamFactory.Parameters.class));

  // 30 POS samples are combined into one sentence sample document.
  return new POSToSentenceSampleStream(createDetokenizer(params), posSamples, 30);
}
}
public ObjectStream<Parse> create(String[] args) { OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class); ObjectStream<File> documentStream = new DirectorySampleStream(new File( params.getOntoNotesDir()), file -> { if (file.isFile()) { return file.getName().endsWith(".parse"); } return file.isDirectory(); }, true); // We need file to line here ... and that is probably best doen with the plain text stream // lets copy it over here, refactor it, and then at some point we replace the current version // with the refactored version return new OntoNotesParseSampleStream(new DocumentToLineStream(new FileToStringSampleStream( documentStream, StandardCharsets.UTF_8))); }
/**
 * Creates a {@link POSSample} stream by reducing full parses to their
 * part-of-speech tags.
 *
 * @param args command line style arguments
 * @return a stream of POS samples derived from the parse stream
 */
public ObjectStream<POSSample> create(String[] args) {
  // Validate the arguments up front; the parsed parameter object itself is
  // not needed here (previously it was stored in an unused local variable).
  ArgumentParser.parse(args, ParseSampleStreamFactory.Parameters.class);

  ObjectStream<Parse> parseSampleStream = StreamFactoryRegistry.getFactory(Parse.class,
      StreamFactoryRegistry.DEFAULT_FORMAT).create(
      ArgumentParser.filter(args, ParseSampleStreamFactory.Parameters.class));

  return new ParseToPOSSampleStream(parseSampleStream);
}
/**
 * Creates a {@link NameSample} stream over the OntoNotes English corpus by
 * reading all ".name" annotation files.
 *
 * @return a stream of name samples
 * @throws IOException if the corpus files cannot be read
 */
private static ObjectStream<NameSample> createNameSampleStream() throws IOException {
  // Recursive walk, accepting directories (to descend) and ".name" files.
  ObjectStream<File> documents = new DirectorySampleStream(
      new File(getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
      file -> file.isDirectory() || (file.isFile() && file.getName().endsWith(".name")),
      true);

  return new OntoNotesNameSampleStream(
      new FileToStringSampleStream(documents, StandardCharsets.UTF_8));
}
/**
 * Creates a {@link Parse} stream over the OntoNotes English corpus by reading
 * all ".parse" files and splitting them into lines before parsing.
 *
 * @return a stream of parses
 * @throws IOException if the corpus files cannot be read
 */
private static ObjectStream<Parse> createParseSampleStream() throws IOException {
  // Recursive walk, accepting directories (to descend) and ".parse" files.
  ObjectStream<File> documents = new DirectorySampleStream(
      new File(getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
      file -> file.isDirectory() || (file.isFile() && file.getName().endsWith(".parse")),
      true);

  return new OntoNotesParseSampleStream(new DocumentToLineStream(
      new FileToStringSampleStream(documents, StandardCharsets.UTF_8)));
}
/**
 * Creates a {@link NameSample} stream over MUC corpus documents: every
 * ".sgm" file in the data directory is read as UTF-8 and tokenized with the
 * tokenizer model given in the parameters.
 *
 * @param args command line style arguments; parsed into {@link Parameters}
 * @return a stream of name samples
 */
public ObjectStream<NameSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
  Tokenizer tokenizer = new TokenizerME(tokenizerModel);

  // Non-recursive scan for SGML documents; the extension check is made
  // case-insensitive by lower-casing the file name first.
  ObjectStream<String> mucDocs = new FileToStringSampleStream(
      new DirectorySampleStream(params.getData(),
          file -> StringUtil.toLowerCase(file.getName()).endsWith(".sgm"), false),
      StandardCharsets.UTF_8);

  return new MucNameSampleStream(tokenizer, mucDocs);
}