/**
 * Generate a {@link StringSequenceGenerator} that directly returns Strings
 * instead of {@link LexicalPhrase}s.
 *
 * @return a {@link StringSequenceGenerator} instance
 * @throws IOException if a stopwords file is specified but cannot be read
 */
public StringSequenceGenerator buildStringSequenceGenerator() throws IOException {
    /* delegate construction to the StringSequenceGenerator(Builder) constructor */
    return new StringSequenceGenerator(this);
}
}
/**
 * Create a {@link StringSequenceGenerator} that wraps a {@link PhraseSequenceGenerator}
 * built from the given builder.
 *
 * @param builder a configured {@link PhraseSequenceGenerator.Builder}
 * @throws IOException if a stopwords file is specified but cannot be read
 */
protected StringSequenceGenerator(PhraseSequenceGenerator.Builder builder) throws IOException {
    psg = builder.build();
}
/**
 * Generate a {@link PhraseSequenceGenerator}
 *
 * @return a {@link PhraseSequenceGenerator} instance
 * @throws IOException if a stopwords file is specified but cannot be read
 */
public PhraseSequenceGenerator build() throws IOException {
    return new PhraseSequenceGenerator(this);
}
/**
 * Initialize this component: configure a {@link StringSequenceGenerator} from the
 * UIMA parameters (feature path, number filtering, stopwords, covering type).
 *
 * @param context the {@link UimaContext}
 * @throws ResourceInitializationException if the stopwords file cannot be read
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    try {
        /* assemble the generator configuration step by step */
        PhraseSequenceGenerator.Builder builder = new PhraseSequenceGenerator.Builder();
        builder = builder.featurePath(featurePath);
        builder = builder.filterRegex(numberRegex);
        builder = builder.filterRegexReplacement(NUMBER_REPLACEMENT);
        builder = builder.stopwordsFile(stopwordsFile);
        builder = builder.stopwordsReplacement(STOPWORD_REPLACEMENT);
        builder = builder.coveringType(coveringType);
        sequenceGenerator = builder.buildStringSequenceGenerator();
    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    }
}
/**
 * Initialize this component: load the serialized MALLET {@link ParallelTopicModel},
 * derive the topic inferencer and alphabet pipe from it, and configure a
 * {@link StringSequenceGenerator} for token extraction.
 *
 * @param context the {@link UimaContext}
 * @throws ResourceInitializationException if the model cannot be read or the
 *         stopwords file cannot be read
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    ParallelTopicModel model;
    try {
        getLogger().info("Loading model file " + modelLocation);
        model = ParallelTopicModel.read(modelLocation);
        if (maxTopicAssignments <= 0) {
            /* Default to one tenth of the available topics, but at least one.
             * A plain numTopics / 10 would be 0 for models with fewer than
             * 10 topics, which would leave the "unset" sentinel in place. */
            maxTopicAssignments = Math.max(1, model.getNumTopics() / 10);
        }
    } catch (Exception e) {
        /* ParallelTopicModel.read declares a broad Exception */
        throw new ResourceInitializationException(e);
    }
    getLogger().info("Model loaded.");

    inferencer = model.getInferencer();
    malletPipe = new TokenSequence2FeatureSequence(model.getAlphabet());

    try {
        sequenceGenerator = new PhraseSequenceGenerator.Builder()
                .featurePath(tokenFeaturePath)
                .minTokenLength(minTokenLength)
                .lowercase(lowercase)
                .buildStringSequenceGenerator();
    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    }
}
/**
 * Set the stopwords file location from a String path. A null or empty string
 * unsets any previously configured stopwords file.
 *
 * @param stopwordsFile path to the stopwords file; may be null or empty
 * @return this {@link Builder}
 * @throws MalformedURLException if the path cannot be converted to a URL
 */
public Builder stopwordsFile(String stopwordsFile) throws MalformedURLException {
    if (stopwordsFile == null || stopwordsFile.isEmpty()) {
        /* The File overload treats null as "no stopwords"; mirror that here
         * instead of throwing an NPE on the isEmpty() call. */
        this.stopwordsFile = Optional.empty();
        return this;
    } else {
        return stopwordsFile(new File(stopwordsFile));
    }
}
/**
 * Generate a list of {@link LexicalPhrase} sequences where each list element represents
 * phrases extracted from the covering types, e.g. a sentence. If no covering type was
 * defined, the list contains one element representing the whole document.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<LexicalPhrase[]> tokenSequences(JCas aJCas) throws FeaturePathException {
    /* character mode takes precedence over annotation-based extraction */
    if (useCharacters) {
        return characterSequences(aJCas);
    }
    return annotationSequences(aJCas);
}
/**
 * Write each token sequence (e.g. sentence) of the document as one line of text,
 * tokens joined by {@code TOKEN_SEPARATOR}, encoded with the target encoding.
 *
 * @param aJCas the {@link JCas} to process
 * @throws AnalysisEngineProcessException on feature path or I/O failures
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    try {
        OutputStream outputStream = getOutputStream(aJCas, extension);
        /* one output line per token sequence */
        for (String[] tokens : sequenceGenerator.tokenSequences(aJCas)) {
            for (int i = 0; i < tokens.length; i++) {
                /* prefix every token but the first with the separator */
                String chunk = (i == 0) ? tokens[i] : TOKEN_SEPARATOR + tokens[i];
                outputStream.write(chunk.getBytes(targetEncoding));
            }
            outputStream.write(System.lineSeparator().getBytes(targetEncoding));
        }
    } catch (FeaturePathException | IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
/**
 * Set the stopwords file location from a {@link File}. Passing null unsets any
 * previously configured stopwords location.
 *
 * @param stopwordsFile the stopwords file, may be null
 * @return this {@link Builder}
 * @throws MalformedURLException if the file cannot be converted to a URL
 */
public Builder stopwordsFile(File stopwordsFile) throws MalformedURLException {
    /* a null file clears the stopwords URL */
    URL stopwordsUrl = null;
    if (stopwordsFile != null) {
        stopwordsUrl = stopwordsFile.toURI().toURL();
    }
    return stopwordsURL(stopwordsUrl);
}
/**
 * Generate a list of String sequences.
 *
 * @param aJCas
 *            the {@link JCas} to generate sequences from.
 * @return a list of string arrays.
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<String[]> tokenSequences(JCas aJCas) throws FeaturePathException {
    /* extract phrase sequences first, then flatten each one to plain strings */
    List<LexicalPhrase[]> phraseSequences = psg.tokenSequences(aJCas);
    return phraseSequences.stream()
            .map(phrases -> phrases2String(phrases))
            .collect(Collectors.toList());
}
/**
 * Extract a list of {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * If {@link #coveringTypeName} is set, a dedicated array is extracted for each covering
 * annotation; otherwise the result holds a single array for the whole document.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> annotationSequences(JCas aJCas) throws FeaturePathException {
    List<LexicalPhrase[]> sequences = new ArrayList<>();

    if (!coveringTypeName.isPresent()) {
        /* no covering type: a single sequence spanning the whole document */
        sequences.add(annotationSequence(aJCas, Optional.empty()));
        return sequences;
    }

    Type type = FeaturePathUtils.getType(aJCas.getTypeSystem(), coveringTypeName.get());
    /* one sequence per covering annotation (e.g. per sentence) */
    for (AnnotationFS annotation : CasUtil.select(aJCas.getCas(), type)) {
        sequences.add(annotationSequence(aJCas, Optional.of(annotation)));
    }
    return sequences;
}
/**
 * Set the stopwords file location from a String path. A null or empty string
 * unsets any previously configured stopwords file.
 *
 * @param stopwordsFile path to the stopwords file; may be null or empty
 * @return this {@link Builder}
 * @throws MalformedURLException if the path cannot be converted to a URL
 */
public Builder stopwordsFile(String stopwordsFile) throws MalformedURLException {
    if (stopwordsFile == null || stopwordsFile.isEmpty()) {
        /* The File overload treats null as "no stopwords"; mirror that here
         * instead of throwing an NPE on the isEmpty() call. */
        this.stopwordsFile = Optional.empty();
        return this;
    } else {
        return stopwordsFile(new File(stopwordsFile));
    }
}
/**
 * Generate a list of {@link LexicalPhrase} sequences where each list element represents
 * phrases extracted from the covering types, e.g. a sentence. If no covering type was
 * defined, the list contains one element representing the whole document.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<LexicalPhrase[]> tokenSequences(JCas aJCas) throws FeaturePathException {
    /* character mode takes precedence over annotation-based extraction */
    if (useCharacters) {
        return characterSequences(aJCas);
    }
    return annotationSequences(aJCas);
}
/**
 * Convert each token sequence of the document into a MALLET {@link Instance}
 * (labeled with {@code NONE_LABEL} and the document's id/URI) and feed it
 * through the instance list's pipe.
 *
 * @param aJCas the {@link JCas} to process
 * @throws AnalysisEngineProcessException if the feature path cannot be created
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    DocumentMetaData metadata = DocumentMetaData.get(aJCas);
    try {
        /* one MALLET instance per token sequence */
        for (String[] tokens : sequenceGenerator.tokenSequences(aJCas)) {
            TokenSequence tokenSequence = new TokenSequence(tokens);
            Instance instance = new Instance(tokenSequence, NONE_LABEL,
                    metadata.getDocumentId(), metadata.getDocumentUri());
            instanceList.addThruPipe(instance);
        }
    } catch (FeaturePathException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
/**
 * Generate a {@link StringSequenceGenerator} that directly returns Strings
 * instead of {@link LexicalPhrase}s.
 *
 * @return a {@link StringSequenceGenerator} instance
 * @throws IOException if a stopwords file is specified but cannot be read
 */
public StringSequenceGenerator buildStringSequenceGenerator() throws IOException {
    /* delegate construction to the StringSequenceGenerator(Builder) constructor */
    return new StringSequenceGenerator(this);
}
}
/**
 * Create a {@link StringSequenceGenerator} that wraps a {@link PhraseSequenceGenerator}
 * built from the given builder.
 *
 * @param builder a configured {@link PhraseSequenceGenerator.Builder}
 * @throws IOException if a stopwords file is specified but cannot be read
 */
protected StringSequenceGenerator(PhraseSequenceGenerator.Builder builder) throws IOException {
    psg = builder.build();
}
/**
 * Generate a {@link PhraseSequenceGenerator}
 *
 * @return a {@link PhraseSequenceGenerator} instance
 * @throws IOException if a stopwords file is specified but cannot be read
 */
public PhraseSequenceGenerator build() throws IOException {
    return new PhraseSequenceGenerator(this);
}
/**
 * Set the stopwords file location from a {@link File}. Passing null unsets any
 * previously configured stopwords location.
 *
 * @param stopwordsFile the stopwords file, may be null
 * @return this {@link Builder}
 * @throws MalformedURLException if the file cannot be converted to a URL
 */
public Builder stopwordsFile(File stopwordsFile) throws MalformedURLException {
    /* a null file clears the stopwords URL */
    URL stopwordsUrl = null;
    if (stopwordsFile != null) {
        stopwordsUrl = stopwordsFile.toURI().toURL();
    }
    return stopwordsURL(stopwordsUrl);
}
/**
 * Generate a list of String sequences.
 *
 * @param aJCas
 *            the {@link JCas} to generate sequences from.
 * @return a list of string arrays.
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<String[]> tokenSequences(JCas aJCas) throws FeaturePathException {
    /* extract phrase sequences first, then flatten each one to plain strings */
    List<LexicalPhrase[]> phraseSequences = psg.tokenSequences(aJCas);
    return phraseSequences.stream()
            .map(phrases -> phrases2String(phrases))
            .collect(Collectors.toList());
}
/**
 * Extract a list of {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * If {@link #coveringTypeName} is set, a dedicated array is extracted for each covering
 * annotation; otherwise the result holds a single array for the whole document.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> annotationSequences(JCas aJCas) throws FeaturePathException {
    List<LexicalPhrase[]> sequences = new ArrayList<>();

    if (!coveringTypeName.isPresent()) {
        /* no covering type: a single sequence spanning the whole document */
        sequences.add(annotationSequence(aJCas, Optional.empty()));
        return sequences;
    }

    Type type = FeaturePathUtils.getType(aJCas.getTypeSystem(), coveringTypeName.get());
    /* one sequence per covering annotation (e.g. per sentence) */
    for (AnnotationFS annotation : CasUtil.select(aJCas.getCas(), type)) {
        sequences.add(annotationSequence(aJCas, Optional.of(annotation)));
    }
    return sequences;
}