/**
 * Create a new {@link PhraseSequenceGenerator}, handing this object to its constructor.
 *
 * @return the newly constructed {@link PhraseSequenceGenerator}
 * @throws IOException
 *             if a stopwords file is specified but cannot be read
 */
public PhraseSequenceGenerator build()
        throws IOException
{
    PhraseSequenceGenerator generator = new PhraseSequenceGenerator(this);
    return generator;
}
/**
 * Produce the {@link LexicalPhrase} sequences for a document. Each list element holds the
 * phrases extracted from one covering annotation, e.g. a sentence; when no covering type was
 * defined, the list holds a single element for the whole document.
 * <p>
 * The work is delegated to {@code characterSequences} when {@code useCharacters} is set and to
 * {@code annotationSequences} otherwise.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<LexicalPhrase[]> tokenSequences(JCas aJCas)
        throws FeaturePathException
{
    if (useCharacters) {
        return characterSequences(aJCas);
    }
    else {
        return annotationSequences(aJCas);
    }
}
/**
 * Produce the token sequences of a document as string arrays. The phrase sequences are
 * obtained from {@code psg} and each one is converted with {@code phrases2String}.
 *
 * @param aJCas
 *            the {@link JCas} to generate sequences from.
 * @return a list of string arrays, one per phrase sequence.
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<String[]> tokenSequences(JCas aJCas)
        throws FeaturePathException
{
    return psg.tokenSequences(aJCas)
            .stream()
            .map(phrases -> phrases2String(phrases))
            .collect(Collectors.toList());
}
/**
 * Build character-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * If {@link #coveringTypeName} is set, one array is created per covering annotation from its
 * covered text and begin position. Otherwise the result is a single-element list created from
 * the document text and position 0.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> characterSequences(JCas aJCas)
        throws FeaturePathException
{
    /* no covering type: the whole document forms the only sequence */
    if (!coveringTypeName.isPresent()) {
        return Collections.singletonList(
                characterSequence(aJCas, aJCas.getDocumentText(), 0));
    }

    Type type = FeaturePathUtils.getType(aJCas.getTypeSystem(), coveringTypeName.get());
    return CasUtil.select(aJCas.getCas(), type).stream()
            .map(span -> characterSequence(aJCas, span.getCoveredText(), span.getBegin()))
            .collect(Collectors.toList());
}
/**
 * Build annotation-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * If {@link #coveringTypeName} is set, one array is created for every annotation of that type.
 * Otherwise the result contains exactly one array, created without a covering annotation.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> annotationSequences(JCas aJCas)
        throws FeaturePathException
{
    List<LexicalPhrase[]> sequences = new ArrayList<>();

    if (!coveringTypeName.isPresent()) {
        /* a single sequence for the whole document */
        sequences.add(annotationSequence(aJCas, Optional.empty()));
        return sequences;
    }

    Type type = FeaturePathUtils.getType(aJCas.getTypeSystem(), coveringTypeName.get());
    /* one sequence per covering annotation */
    for (AnnotationFS span : CasUtil.select(aJCas.getCas(), type)) {
        sequences.add(annotationSequence(aJCas, Optional.of(span)));
    }
    return sequences;
}
/**
 * Convert the phrase sequences of a document into string arrays. Every sequence returned by
 * {@code psg} is mapped through {@code phrases2String}.
 *
 * @param aJCas
 *            the {@link JCas} to generate sequences from.
 * @return a list of string arrays.
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<String[]> tokenSequences(JCas aJCas)
        throws FeaturePathException
{
    List<String[]> stringSequences = psg.tokenSequences(aJCas).stream()
            .map(this::phrases2String)
            .collect(Collectors.toList());
    return stringSequences;
}
/**
 * Extract character-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * With {@link #coveringTypeName} set, each covering annotation contributes one array, created
 * from its covered text and begin position. Without it, the only array is created from the
 * document text and position 0.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> characterSequences(JCas aJCas)
        throws FeaturePathException
{
    final List<LexicalPhrase[]> result;
    if (coveringTypeName.isPresent()) {
        Type coveringAnnotationType = FeaturePathUtils.getType(aJCas.getTypeSystem(),
                coveringTypeName.get());
        result = CasUtil.select(aJCas.getCas(), coveringAnnotationType).stream()
                .map(anno -> characterSequence(aJCas, anno.getCoveredText(), anno.getBegin()))
                .collect(Collectors.toList());
    }
    else {
        result = Collections.singletonList(
                characterSequence(aJCas, aJCas.getDocumentText(), 0));
    }
    return result;
}
/**
 * Extract annotation-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * With {@link #coveringTypeName} set, every annotation of that type contributes one array.
 * Without it, the result holds a single array created without a covering annotation.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> annotationSequences(JCas aJCas)
        throws FeaturePathException
{
    List<LexicalPhrase[]> result = new ArrayList<>();
    if (!coveringTypeName.isPresent()) {
        /* whole document as the only token sequence */
        result.add(annotationSequence(aJCas, Optional.empty()));
    }
    else {
        Type coveringAnnotationType = FeaturePathUtils.getType(aJCas.getTypeSystem(),
                coveringTypeName.get());
        /* walk through the covering annotations */
        for (AnnotationFS anno : CasUtil.select(aJCas.getCas(), coveringAnnotationType)) {
            result.add(annotationSequence(aJCas, Optional.of(anno)));
        }
    }
    return result;
}
/**
 * Generate the {@link LexicalPhrase} sequences of a document. Each element of the returned
 * list represents the phrases extracted from one covering annotation, e.g. a sentence. If no
 * covering type was defined, the list contains one element representing the whole document.
 * <p>
 * {@code useCharacters} selects between {@code characterSequences} and
 * {@code annotationSequences}.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<LexicalPhrase[]> tokenSequences(JCas aJCas)
        throws FeaturePathException
{
    if (useCharacters) {
        return characterSequences(aJCas);
    }
    return annotationSequences(aJCas);
}
/**
 * Construct a {@link PhraseSequenceGenerator} from this object.
 *
 * @return a new {@link PhraseSequenceGenerator} instance
 * @throws IOException
 *             if a stopwords file is specified but cannot be read
 */
public PhraseSequenceGenerator build()
        throws IOException
{
    PhraseSequenceGenerator built = new PhraseSequenceGenerator(this);
    return built;
}