/**
 * Creates a new {@code ADChunkSampleStream} on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 */
public ADChunkSampleStream(ObjectStream<String> lineStream) {
  ADSentenceStream sentences = new ADSentenceStream(lineStream);
  this.adSentenceStream = sentences;
}
/**
 * Creates a new {@link NameSample} stream on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param splitHyphenatedTokens
 *          if true hyphenated tokens will be separated, e.g.
 *          {@code "carros-monstro"} becomes {@code "carros" "-" "monstro"}
 */
public ADNameSampleStream(ObjectStream<String> lineStream,
    boolean splitHyphenatedTokens) {
  this.splitHyphenatedTokens = splitHyphenatedTokens;
  this.adSentenceStream = new ADSentenceStream(lineStream);
}
/**
 * Creates a new {@link POSSample} stream on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param expandME
 *          if true will expand the multiword expressions: each word of the
 *          expression will have the POS tag that was attributed to the
 *          expression plus the prefix B- or I- (CONLL convention)
 * @param includeFeatures
 *          if true will combine the POS tag with the feature tags
 */
public ADPOSSampleStream(ObjectStream<String> lineStream, boolean expandME,
    boolean includeFeatures) {
  this.expandME = expandME;
  this.isIncludeFeatures = includeFeatures;
  this.adSentenceStream = new ADSentenceStream(lineStream);
}
/**
 * Creates a new {@link SentenceSample} stream on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param includeHeadlines
 *          if true will output the sentences marked as news headlines
 */
public ADSentenceSampleStream(ObjectStream<String> lineStream,
    boolean includeHeadlines) {
  this.adSentenceStream = new ADSentenceStream(lineStream);
  // Sort a private copy: sorting Factory.ptEosCharacters directly would
  // reorder the shared array in place for every other user of it.
  ptEosCharacters = Factory.ptEosCharacters.clone();
  Arrays.sort(ptEosCharacters);
  this.isIncludeTitles = includeHeadlines;
}
/**
 * Creates a new {@code ADChunkSampleStream} that reads the corpus lines
 * through a {@link PlainTextByLineStream} built from the given factory.
 *
 * @param in
 *          the {@link InputStreamFactory} providing the corpus data
 * @param charsetName
 *          the name of the charset used to read the corpus
 * @throws IOException
 *           if creating the underlying streams fails
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised for
 *           {@code charsetName}
 */
public ADChunkSampleStream(InputStreamFactory in, String charsetName)
    throws IOException {
  try {
    this.adSentenceStream = new ADSentenceStream(
        new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // charsetName is supplied by the caller (it is not fixed to UTF-8), so
    // this branch is reachable; report which charset was rejected.
    throw new IllegalStateException("Unsupported charset: " + charsetName, e);
  }
}
/**
 * Creates a new {@link NameSample} stream that reads the corpus lines through
 * a {@link PlainTextByLineStream} built from the given factory.
 *
 * @param in
 *          the {@link InputStreamFactory} providing the corpus data
 * @param charsetName
 *          the charset of the Arvores Deitadas Corpus
 * @param splitHyphenatedTokens
 *          if true hyphenated tokens will be separated, e.g.
 *          {@code "carros-monstro"} becomes {@code "carros" "-" "monstro"}
 * @throws IOException
 *           if creating the underlying streams fails
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised for
 *           {@code charsetName}
 * @deprecated this constructor is annotated {@code @Deprecated}.
 *             NOTE(review): the replacement is presumably the constructor
 *             taking an {@code ObjectStream<String>} - confirm.
 */
@Deprecated
public ADNameSampleStream(InputStreamFactory in, String charsetName,
    boolean splitHyphenatedTokens) throws IOException {
  try {
    this.adSentenceStream = new ADSentenceStream(
        new PlainTextByLineStream(in, charsetName));
    this.splitHyphenatedTokens = splitHyphenatedTokens;
  } catch (UnsupportedEncodingException e) {
    // charsetName is supplied by the caller (it is not fixed to UTF-8), so
    // this branch is reachable; report which charset was rejected.
    throw new IllegalStateException("Unsupported charset: " + charsetName, e);
  }
}
/**
 * Creates a new {@link SentenceSample} stream that reads the corpus lines
 * through a {@link PlainTextByLineStream} built from the given factory.
 *
 * @param in
 *          the {@link InputStreamFactory} providing the corpus data
 * @param charsetName
 *          the charset to use while reading the corpus
 * @param includeHeadlines
 *          if true will output the sentences marked as news headlines
 * @throws IOException
 *           if creating the underlying streams fails
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised for
 *           {@code charsetName}
 */
public ADSentenceSampleStream(InputStreamFactory in, String charsetName,
    boolean includeHeadlines) throws IOException {
  try {
    this.adSentenceStream = new ADSentenceStream(
        new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // charsetName is supplied by the caller (it is not fixed to UTF-8), so
    // this branch is reachable; report which charset was rejected.
    throw new IllegalStateException("Unsupported charset: " + charsetName, e);
  }
  // Sort a private copy: sorting Factory.ptEosCharacters directly would
  // reorder the shared array in place for every other user of it.
  ptEosCharacters = Factory.ptEosCharacters.clone();
  Arrays.sort(ptEosCharacters);
  this.isIncludeTitles = includeHeadlines;
}
/**
 * Creates a new {@link POSSample} stream that reads the corpus lines through
 * a {@link PlainTextByLineStream} built from the given factory.
 *
 * @param in
 *          the {@link InputStreamFactory} providing the corpus data
 * @param charsetName
 *          the charset of the Arvores Deitadas Corpus
 * @param expandME
 *          if true will expand the multiword expressions: each word of the
 *          expression will have the POS tag that was attributed to the
 *          expression plus the prefix B- or I- (CONLL convention)
 * @param includeFeatures
 *          if true will combine the POS tag with the feature tags
 * @throws IOException
 *           if creating the underlying streams fails
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised for
 *           {@code charsetName}
 */
public ADPOSSampleStream(InputStreamFactory in, String charsetName,
    boolean expandME, boolean includeFeatures) throws IOException {
  try {
    this.adSentenceStream = new ADSentenceStream(
        new PlainTextByLineStream(in, charsetName));
    this.expandME = expandME;
    this.isIncludeFeatures = includeFeatures;
  } catch (UnsupportedEncodingException e) {
    // charsetName is supplied by the caller (it is not fixed to UTF-8), so
    // this branch is reachable; report which charset was rejected.
    throw new IllegalStateException("Unsupported charset: " + charsetName, e);
  }
}
/**
 * Opens the {@code /opennlp/tools/formats/ad.sample} resource, resolved
 * relative to {@code ADParagraphStreamTest}, as an {@link ADSentenceStream}
 * whose lines are read as UTF-8.
 *
 * @return a new sentence stream over the sample resource
 * @throws IOException
 *           if creating the underlying streams fails
 */
private static ADSentenceStream openData() throws IOException {
  InputStreamFactory sampleResource = new ResourceAsStreamFactory(
      ADParagraphStreamTest.class, "/opennlp/tools/formats/ad.sample");
  PlainTextByLineStream sampleLines =
      new PlainTextByLineStream(sampleResource, "UTF-8");
  return new ADSentenceStream(sampleLines);
}
}
/**
 * Creates a new {@code ADChunkSampleStream} on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 */
public ADChunkSampleStream(ObjectStream<String> lineStream) {
  ADSentenceStream sentences = new ADSentenceStream(lineStream);
  this.adSentenceStream = sentences;
}
/**
 * Creates a new {@code ADChunk2SampleStream} on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 */
public ADChunk2SampleStream(ObjectStream<String> lineStream) {
  ADSentenceStream sentences = new ADSentenceStream(lineStream);
  this.adSentenceStream = sentences;
}
/**
 * Creates a new {@code ADChunkSampleStream} on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 */
public ADChunkSampleStream(ObjectStream<String> lineStream) {
  ADSentenceStream sentences = new ADSentenceStream(lineStream);
  this.adSentenceStream = sentences;
}
/**
 * Creates a new {@code ADChunk2SampleStream} on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 */
public ADChunk2SampleStream(ObjectStream<String> lineStream) {
  ADSentenceStream sentences = new ADSentenceStream(lineStream);
  this.adSentenceStream = sentences;
}
/**
 * Creates a new {@code ADFeaturizerSampleStream} on top of a line stream,
 * i.e. an {@code ObjectStream<String>}, which could be a
 * {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param expandME
 *          flag stored in this stream's {@code expandME} field.
 *          NOTE(review): presumably controls expansion of multiword
 *          expressions - confirm against the code that reads the field.
 */
public ADFeaturizerSampleStream(ObjectStream<String> lineStream,
    boolean expandME) {
  this.adSentenceStream = new ADSentenceStream(lineStream);
  this.expandME = expandME;
}
/**
 * Creates a new {@code ADFeaturizerSampleStream} on top of a line stream,
 * i.e. an {@code ObjectStream<String>}, which could be a
 * {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param expandME
 *          flag stored in this stream's {@code expandME} field.
 *          NOTE(review): presumably controls expansion of multiword
 *          expressions - confirm against the code that reads the field.
 */
public ADFeaturizerSampleStream(ObjectStream<String> lineStream,
    boolean expandME) {
  this.adSentenceStream = new ADSentenceStream(lineStream);
  this.expandME = expandME;
}
/**
 * Creates a new {@link NameSample} stream on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param tags
 *          the tags we are looking for, or null for all
 */
public ADContractionNameSampleStream(ObjectStream<String> lineStream,
    Set<String> tags) {
  this.tags = tags;
  this.adSentenceStream = new ADSentenceStream(lineStream);
}
/**
 * Creates a new {@link NameSample} stream on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param splitHyphenatedTokens
 *          if true hyphenated tokens will be separated, e.g.
 *          {@code "carros-monstro"} becomes {@code "carros" "-" "monstro"}
 */
public ADNameSampleStream(ObjectStream<String> lineStream,
    boolean splitHyphenatedTokens) {
  this.splitHyphenatedTokens = splitHyphenatedTokens;
  this.adSentenceStream = new ADSentenceStream(lineStream);
}
/**
 * Creates a new {@link SentenceSample} stream on top of a line stream, i.e. an
 * {@code ObjectStream<String>}, which could be a {@link PlainTextByLineStream}.
 *
 * @param lineStream
 *          a stream of lines as {@link String}; it is wrapped in an
 *          {@link ADSentenceStream}
 * @param includeHeadlines
 *          if true will output the sentences marked as news headlines
 */
public ADSentenceSampleStream(ObjectStream<String> lineStream,
    boolean includeHeadlines) {
  this.adSentenceStream = new ADSentenceStream(lineStream);
  // Sort a private copy: sorting Factory.ptEosCharacters directly would
  // reorder the shared array in place for every other user of it.
  ptEosCharacters = Factory.ptEosCharacters.clone();
  Arrays.sort(ptEosCharacters);
  this.isIncludeTitles = includeHeadlines;
}
/**
 * Creates a new {@code ADChunkSampleStream} that reads the corpus lines
 * through a {@link PlainTextByLineStream} built from the given factory.
 *
 * @param in
 *          the {@link InputStreamFactory} providing the corpus data
 * @param charsetName
 *          the name of the charset used to read the corpus
 * @throws IOException
 *           if creating the underlying streams fails
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised for
 *           {@code charsetName}
 */
public ADChunkSampleStream(InputStreamFactory in, String charsetName)
    throws IOException {
  try {
    this.adSentenceStream = new ADSentenceStream(
        new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // charsetName is supplied by the caller (it is not fixed to UTF-8), so
    // this branch is reachable; report which charset was rejected.
    throw new IllegalStateException("Unsupported charset: " + charsetName, e);
  }
}
/**
 * Creates a new {@code ADChunkSampleStream} that reads the corpus lines
 * through a {@link PlainTextByLineStream} built from the given factory.
 *
 * @param in
 *          the {@link InputStreamFactory} providing the corpus data
 * @param charsetName
 *          the name of the charset used to read the corpus
 * @throws IOException
 *           if creating the underlying streams fails
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised for
 *           {@code charsetName}
 */
public ADChunkSampleStream(InputStreamFactory in, String charsetName)
    throws IOException {
  try {
    this.adSentenceStream = new ADSentenceStream(
        new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // charsetName is supplied by the caller (it is not fixed to UTF-8), so
    // this branch is reachable; report which charset was rejected.
    throw new IllegalStateException("Unsupported charset: " + charsetName, e);
  }
}