/**
 * Builds a {@link NameSample} stream from the OntoNotes directory named in the arguments.
 *
 * <p>Regular files are accepted only when their name ends with {@code .name}; any other
 * entry is accepted only when it is a directory. The selected files are read as UTF-8.
 *
 * @param args command line arguments, parsed as {@code OntoNotesFormatParameters}
 * @return the name sample stream over the selected files
 */
public ObjectStream<NameSample> create(String[] args) {
  OntoNotesFormatParameters params =
      ArgumentParser.parse(args, OntoNotesFormatParameters.class);
  File ontoNotesRoot = new File(params.getOntoNotesDir());

  // The trailing 'true' presumably enables recursive scanning, which is why
  // directories have to pass the filter as well (confirm in DirectorySampleStream).
  ObjectStream<File> documentStream = new DirectorySampleStream(
      ontoNotesRoot,
      candidate -> candidate.isFile()
          ? candidate.getName().endsWith(".name")
          : candidate.isDirectory(),
      true);

  return new OntoNotesNameSampleStream(
      new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8));
}
/**
 * Reads the next file from the underlying stream and returns its content.
 *
 * @return the result of {@code readFile} for the next file, or {@code null} once the
 *     underlying stream returns {@code null}
 * @throws IOException declared by the signature; may come from the underlying stream or
 *     from {@code readFile}
 */
@Override
public String read() throws IOException {
  File nextFile = samples.read();
  return nextFile == null ? null : readFile(nextFile, encoding);
}
/**
 * Writes two files with different sentences into the temporary directory and checks that
 * two consecutive reads return both sentences, in either order.
 *
 * @throws IOException if creating, writing or reading the temporary files fails
 */
@Test
public void readFileTest() throws IOException {
  final String sentence1 = "This is a sentence.";
  final String sentence2 = "This is another sentence.";
  List<String> sentences = Arrays.asList(sentence1, sentence2);

  DirectorySampleStream directorySampleStream =
      new DirectorySampleStream(directory.getRoot(), null, false);

  File tempFile1 = directory.newFile();
  FileUtils.writeStringToFile(tempFile1, sentence1);

  File tempFile2 = directory.newFile();
  FileUtils.writeStringToFile(tempFile2, sentence2);

  try (FileToStringSampleStream stream =
      new FileToStringSampleStream(directorySampleStream, Charset.defaultCharset())) {

    String first = stream.read();
    Assert.assertTrue(sentences.contains(first));

    String second = stream.read();
    Assert.assertTrue(sentences.contains(second));

    // Membership alone would also pass if the same file were returned twice;
    // the two files hold different text, so the two reads must differ.
    Assert.assertFalse(first.equals(second));
  }
}
public ObjectStream<Parse> create(String[] args) { OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class); ObjectStream<File> documentStream = new DirectorySampleStream(new File( params.getOntoNotesDir()), file -> { if (file.isFile()) { return file.getName().endsWith(".parse"); } return file.isDirectory(); }, true); // We need file to line here ... and that is probably best doen with the plain text stream // lets copy it over here, refactor it, and then at some point we replace the current version // with the refactored version return new OntoNotesParseSampleStream(new DocumentToLineStream(new FileToStringSampleStream( documentStream, StandardCharsets.UTF_8))); }
/**
 * Reads the next file from the underlying stream and returns its content.
 *
 * @return the result of {@code readFile} for the next file, or {@code null} once the
 *     underlying stream returns {@code null}
 * @throws IOException declared by the signature; may come from the underlying stream or
 *     from {@code readFile}
 */
@Override
public String read() throws IOException {
  File nextFile = samples.read();
  return nextFile == null ? null : readFile(nextFile, encoding);
}
/**
 * Creates a {@link NameSample} stream over the English OntoNotes 4 data located below
 * the directory returned by {@code getOpennlpDataDir()}.
 *
 * <p>Regular files are accepted only when their name ends with {@code .name}; any other
 * entry is accepted only when it is a directory. Files are read as UTF-8.
 *
 * @return the name sample stream
 * @throws IOException declared by the signature
 */
private static ObjectStream<NameSample> createNameSampleStream() throws IOException {
  File englishData = new File(getOpennlpDataDir(), "ontonotes4/data/files/data/english");

  ObjectStream<File> documentStream = new DirectorySampleStream(
      englishData,
      candidate -> candidate.isFile()
          ? candidate.getName().endsWith(".name")
          : candidate.isDirectory(),
      true);

  FileToStringSampleStream documents =
      new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8);
  return new OntoNotesNameSampleStream(documents);
}
/**
 * Reads the next file from the underlying stream and returns its content.
 *
 * @return the result of {@code readFile} for the next file, or {@code null} once the
 *     underlying stream returns {@code null}
 * @throws IOException declared by the signature; may come from the underlying stream or
 *     from {@code readFile}
 */
@Override
public String read() throws IOException {
  File nextFile = samples.read();
  return nextFile == null ? null : readFile(nextFile, encoding);
}
/**
 * Creates a {@link Parse} stream over the English OntoNotes 4 data located below the
 * directory returned by {@code getOpennlpDataDir()}.
 *
 * <p>Regular files are accepted only when their name ends with {@code .parse}; any other
 * entry is accepted only when it is a directory. Files are read as UTF-8 and passed
 * through a {@code DocumentToLineStream} before parsing.
 *
 * @return the parse sample stream
 * @throws IOException declared by the signature
 */
private static ObjectStream<Parse> createParseSampleStream() throws IOException {
  File englishData = new File(getOpennlpDataDir(), "ontonotes4/data/files/data/english");

  ObjectStream<File> documentStream = new DirectorySampleStream(
      englishData,
      candidate -> candidate.isFile()
          ? candidate.getName().endsWith(".parse")
          : candidate.isDirectory(),
      true);

  FileToStringSampleStream documents =
      new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8);
  DocumentToLineStream lines = new DocumentToLineStream(documents);

  return new OntoNotesParseSampleStream(lines);
}
/**
 * Creates a {@link POSSample} stream over the English OntoNotes 4 data located below the
 * directory returned by {@code getOpennlpDataDir()}.
 *
 * <p>The {@code .parse} files are read as UTF-8, passed through a
 * {@code DocumentToLineStream}, turned into parses, and finally wrapped in a
 * {@code ParseToPOSSampleStream}.
 *
 * @return the POS sample stream
 * @throws IOException declared by the signature
 */
private static ObjectStream<POSSample> createPOSSampleStream() throws IOException {
  File englishData = new File(getOpennlpDataDir(), "ontonotes4/data/files/data/english");

  // Regular files must end in ".parse"; other entries pass only if they are directories.
  ObjectStream<File> documentStream = new DirectorySampleStream(
      englishData,
      candidate -> candidate.isFile()
          ? candidate.getName().endsWith(".parse")
          : candidate.isDirectory(),
      true);

  FileToStringSampleStream documents =
      new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8);
  DocumentToLineStream lines = new DocumentToLineStream(documents);
  OntoNotesParseSampleStream parses = new OntoNotesParseSampleStream(lines);

  return new ParseToPOSSampleStream(parses);
}
/**
 * Builds a {@link NameSample} stream from MUC documents.
 *
 * <p>The tokenizer model named in the arguments is loaded and wrapped in a
 * {@code TokenizerME}. Files in the data location whose lower-cased name ends with
 * {@code .sgm} are read as UTF-8 and handed to a {@code MucNameSampleStream}.
 *
 * @param args command line arguments, parsed as {@code Parameters}
 * @return the name sample stream over the selected documents
 */
public ObjectStream<NameSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  TokenizerModel tokenizerModel =
      new TokenizerModelLoader().load(params.getTokenizerModel());
  Tokenizer tokenizer = new TokenizerME(tokenizerModel);

  // The name is lower-cased before the suffix check, so the match ignores case.
  DirectorySampleStream sgmFiles = new DirectorySampleStream(
      params.getData(),
      candidate -> StringUtil.toLowerCase(candidate.getName()).endsWith(".sgm"),
      false);
  ObjectStream<String> mucDocStream =
      new FileToStringSampleStream(sgmFiles, StandardCharsets.UTF_8);

  return new MucNameSampleStream(tokenizer, mucDocStream);
}
/**
 * Builds a {@link NameSample} stream from the OntoNotes directory named in the arguments.
 *
 * <p>Regular files are accepted only when their name ends with {@code .name}; any other
 * entry is accepted only when it is a directory. The selected files are read as UTF-8.
 *
 * @param args command line arguments, parsed as {@code OntoNotesFormatParameters}
 * @return the name sample stream over the selected files
 */
public ObjectStream<NameSample> create(String[] args) {
  OntoNotesFormatParameters params =
      ArgumentParser.parse(args, OntoNotesFormatParameters.class);
  File ontoNotesRoot = new File(params.getOntoNotesDir());

  // The trailing 'true' presumably enables recursive scanning, which is why
  // directories have to pass the filter as well (confirm in DirectorySampleStream).
  ObjectStream<File> documentStream = new DirectorySampleStream(
      ontoNotesRoot,
      candidate -> candidate.isFile()
          ? candidate.getName().endsWith(".name")
          : candidate.isDirectory(),
      true);

  return new OntoNotesNameSampleStream(
      new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8));
}
/**
 * Builds a {@link NameSample} stream from the OntoNotes directory named in the arguments.
 *
 * <p>Regular files are accepted only when their name ends with {@code .name}; any other
 * entry is accepted only when it is a directory. The selected files are read as UTF-8.
 *
 * @param args command line arguments, parsed as {@code OntoNotesFormatParameters}
 * @return the name sample stream over the selected files
 */
public ObjectStream<NameSample> create(String[] args) {
  OntoNotesFormatParameters params =
      ArgumentParser.parse(args, OntoNotesFormatParameters.class);
  File ontoNotesRoot = new File(params.getOntoNotesDir());

  // The trailing 'true' presumably enables recursive scanning, which is why
  // directories have to pass the filter as well (confirm in DirectorySampleStream).
  ObjectStream<File> documentStream = new DirectorySampleStream(
      ontoNotesRoot,
      candidate -> candidate.isFile()
          ? candidate.getName().endsWith(".name")
          : candidate.isDirectory(),
      true);

  return new OntoNotesNameSampleStream(
      new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8));
}
public ObjectStream<Parse> create(String[] args) { OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class); ObjectStream<File> documentStream = new DirectorySampleStream(new File( params.getOntoNotesDir()), file -> { if (file.isFile()) { return file.getName().endsWith(".parse"); } return file.isDirectory(); }, true); // We need file to line here ... and that is probably best doen with the plain text stream // lets copy it over here, refactor it, and then at some point we replace the current version // with the refactored version return new OntoNotesParseSampleStream(new DocumentToLineStream(new FileToStringSampleStream( documentStream, StandardCharsets.UTF_8))); }
public ObjectStream<Parse> create(String[] args) { OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class); ObjectStream<File> documentStream = new DirectorySampleStream(new File( params.getOntoNotesDir()), file -> { if (file.isFile()) { return file.getName().endsWith(".parse"); } return file.isDirectory(); }, true); // We need file to line here ... and that is probably best doen with the plain text stream // lets copy it over here, refactor it, and then at some point we replace the current version // with the refactored version return new OntoNotesParseSampleStream(new DocumentToLineStream(new FileToStringSampleStream( documentStream, StandardCharsets.UTF_8))); }
/**
 * Builds a {@link NameSample} stream from MUC documents.
 *
 * <p>The tokenizer model named in the arguments is loaded and wrapped in a
 * {@code TokenizerME}. Files in the data location whose lower-cased name ends with
 * {@code .sgm} are read as UTF-8 and handed to a {@code MucNameSampleStream}.
 *
 * @param args command line arguments, parsed as {@code Parameters}
 * @return the name sample stream over the selected documents
 */
public ObjectStream<NameSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  TokenizerModel tokenizerModel =
      new TokenizerModelLoader().load(params.getTokenizerModel());
  Tokenizer tokenizer = new TokenizerME(tokenizerModel);

  // The name is lower-cased before the suffix check, so the match ignores case.
  DirectorySampleStream sgmFiles = new DirectorySampleStream(
      params.getData(),
      candidate -> StringUtil.toLowerCase(candidate.getName()).endsWith(".sgm"),
      false);
  ObjectStream<String> mucDocStream =
      new FileToStringSampleStream(sgmFiles, StandardCharsets.UTF_8);

  return new MucNameSampleStream(tokenizer, mucDocStream);
}
/**
 * Builds a {@link NameSample} stream from MUC documents.
 *
 * <p>The tokenizer model named in the arguments is loaded and wrapped in a
 * {@code TokenizerME}. Files in the data location whose lower-cased name ends with
 * {@code .sgm} are read as UTF-8 and handed to a {@code MucNameSampleStream}.
 *
 * @param args command line arguments, parsed as {@code Parameters}
 * @return the name sample stream over the selected documents
 */
public ObjectStream<NameSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  TokenizerModel tokenizerModel =
      new TokenizerModelLoader().load(params.getTokenizerModel());
  Tokenizer tokenizer = new TokenizerME(tokenizerModel);

  // The name is lower-cased before the suffix check, so the match ignores case.
  DirectorySampleStream sgmFiles = new DirectorySampleStream(
      params.getData(),
      candidate -> StringUtil.toLowerCase(candidate.getName()).endsWith(".sgm"),
      false);
  ObjectStream<String> mucDocStream =
      new FileToStringSampleStream(sgmFiles, StandardCharsets.UTF_8);

  return new MucNameSampleStream(tokenizer, mucDocStream);
}
Tokenizer tokenizer = new TokenizerME(tokenizerModel); ObjectStream<String> mucDocStream = new FileToStringSampleStream( new DirectorySampleStream(params.getData(), new FileFilter() {
Tokenizer tokenizer = new TokenizerME(tokenizerModel); ObjectStream<String> mucDocStream = new FileToStringSampleStream( new DirectorySampleStream(params.getData(), new FileFilter() {