de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator Java code examples

/**
 * Create a new {@link PhraseSequenceGenerator} from the settings held by this builder.
 *
 * @return the newly constructed {@link PhraseSequenceGenerator}
 * @throws IOException if a stopwords file is specified but cannot be read
 */
public PhraseSequenceGenerator build()
    throws IOException
{
  PhraseSequenceGenerator generator = new PhraseSequenceGenerator(this);
  return generator;
}

/**
 * Produce the {@link LexicalPhrase} sequences for a document. Each list element holds the
 * phrases extracted from one covering annotation (e.g. a sentence); when no covering type was
 * defined, the list contains a single element representing the whole document.
 * <p>
 * Delegates to the character-based extraction when {@code useCharacters} is set, and to the
 * annotation-based extraction otherwise.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<LexicalPhrase[]> tokenSequences(JCas aJCas)
    throws FeaturePathException
{
  if (useCharacters) {
    return characterSequences(aJCas);
  }
  return annotationSequences(aJCas);
}

/**
 * Produce the token sequences of a document as plain strings.
 * <p>
 * The phrase sequences are obtained from the wrapped phrase sequence generator and each
 * phrase array is converted with {@code phrases2String}.
 *
 * @param aJCas
 *            the {@link JCas} to generate sequences from.
 * @return a list of string arrays, one per phrase sequence.
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<String[]> tokenSequences(JCas aJCas)
    throws FeaturePathException
{
  List<LexicalPhrase[]> phraseSequences = psg.tokenSequences(aJCas);
  return phraseSequences.stream()
      .map(phrases -> phrases2String(phrases))
      .collect(Collectors.toList());
}

/**
 * Build character-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * Without a {@link #coveringTypeName}, the whole document text yields a single array
 * (offset 0). Otherwise one array is produced per covering annotation, built from the
 * annotation's covered text and its begin offset.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> characterSequences(JCas aJCas)
    throws FeaturePathException
{
  /* no covering type: the document as a whole is the only sequence */
  if (!coveringTypeName.isPresent()) {
    return Collections.singletonList(characterSequence(aJCas, aJCas.getDocumentText(), 0));
  }

  Type coveringType = FeaturePathUtils
      .getType(aJCas.getTypeSystem(), coveringTypeName.get());
  List<LexicalPhrase[]> sequences = new ArrayList<>();
  for (AnnotationFS covering : CasUtil.select(aJCas.getCas(), coveringType)) {
    sequences.add(
        characterSequence(aJCas, covering.getCoveredText(), covering.getBegin()));
  }
  return sequences;
}

/**
 * Build annotation-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * Without a {@link #coveringTypeName}, the result holds exactly one array covering the whole
 * document. Otherwise it holds one array per covering annotation.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> annotationSequences(JCas aJCas)
    throws FeaturePathException
{
  List<LexicalPhrase[]> sequences = new ArrayList<>();

  if (!coveringTypeName.isPresent()) {
    /* no covering type: one sequence for the entire document */
    sequences.add(annotationSequence(aJCas, Optional.empty()));
    return sequences;
  }

  Type coveringType = FeaturePathUtils
      .getType(aJCas.getTypeSystem(), coveringTypeName.get());
  /* one sequence per covering annotation */
  for (AnnotationFS coveringAnnotation : CasUtil.select(aJCas.getCas(), coveringType)) {
    sequences.add(annotationSequence(aJCas, Optional.of(coveringAnnotation)));
  }
  return sequences;
}

/**
 * Produce the token sequences of a document as plain strings.
 * <p>
 * The phrase sequences are obtained from the wrapped phrase sequence generator and each
 * phrase array is converted with {@code phrases2String}.
 *
 * @param aJCas
 *            the {@link JCas} to generate sequences from.
 * @return a list of string arrays, one per phrase sequence.
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<String[]> tokenSequences(JCas aJCas)
    throws FeaturePathException
{
  List<LexicalPhrase[]> phraseSequences = psg.tokenSequences(aJCas);
  return phraseSequences.stream()
      .map(phrases -> phrases2String(phrases))
      .collect(Collectors.toList());
}

/**
 * Build character-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * Without a {@link #coveringTypeName}, the whole document text yields a single array
 * (offset 0). Otherwise one array is produced per covering annotation, built from the
 * annotation's covered text and its begin offset.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> characterSequences(JCas aJCas)
    throws FeaturePathException
{
  /* no covering type: the document as a whole is the only sequence */
  if (!coveringTypeName.isPresent()) {
    return Collections.singletonList(characterSequence(aJCas, aJCas.getDocumentText(), 0));
  }

  Type coveringType = FeaturePathUtils
      .getType(aJCas.getTypeSystem(), coveringTypeName.get());
  List<LexicalPhrase[]> sequences = new ArrayList<>();
  for (AnnotationFS covering : CasUtil.select(aJCas.getCas(), coveringType)) {
    sequences.add(
        characterSequence(aJCas, covering.getCoveredText(), covering.getBegin()));
  }
  return sequences;
}

/**
 * Build annotation-based {@link LexicalPhrase} arrays from the {@link JCas}.
 * <p>
 * Without a {@link #coveringTypeName}, the result holds exactly one array covering the whole
 * document. Otherwise it holds one array per covering annotation.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
private List<LexicalPhrase[]> annotationSequences(JCas aJCas)
    throws FeaturePathException
{
  List<LexicalPhrase[]> sequences = new ArrayList<>();

  if (!coveringTypeName.isPresent()) {
    /* no covering type: one sequence for the entire document */
    sequences.add(annotationSequence(aJCas, Optional.empty()));
    return sequences;
  }

  Type coveringType = FeaturePathUtils
      .getType(aJCas.getTypeSystem(), coveringTypeName.get());
  /* one sequence per covering annotation */
  for (AnnotationFS coveringAnnotation : CasUtil.select(aJCas.getCas(), coveringType)) {
    sequences.add(annotationSequence(aJCas, Optional.of(coveringAnnotation)));
  }
  return sequences;
}

/**
 * Produce the {@link LexicalPhrase} sequences for a document. Each list element holds the
 * phrases extracted from one covering annotation (e.g. a sentence); when no covering type was
 * defined, the list contains a single element representing the whole document.
 * <p>
 * Delegates to the character-based extraction when {@code useCharacters} is set, and to the
 * annotation-based extraction otherwise.
 *
 * @param aJCas
 *            a {@link JCas}
 * @return a list of {@link LexicalPhrase} arrays
 * @throws FeaturePathException
 *             if there was a problem creating the feature path.
 */
public List<LexicalPhrase[]> tokenSequences(JCas aJCas)
    throws FeaturePathException
{
  if (useCharacters) {
    return characterSequences(aJCas);
  }
  return annotationSequences(aJCas);
}

/**
 * Create a new {@link PhraseSequenceGenerator} from the settings held by this builder.
 *
 * @return the newly constructed {@link PhraseSequenceGenerator}
 * @throws IOException if a stopwords file is specified but cannot be read
 */
public PhraseSequenceGenerator build()
    throws IOException
{
  PhraseSequenceGenerator generator = new PhraseSequenceGenerator(this);
  return generator;
}

Javadoc

Generate sequences of phrases with optional stopword/regex-based filtering, and lowercasing. Filtered tokens are added as LexicalPhrases with empty text or a replacement of the text, if Builder#stopwordReplacement and/or Builder#filterRegexReplacement were set.

Initialize with Builder#build().

When strings instead of LexicalPhrases should be output, use Builder#buildStringSequenceGenerator().

Most used methods

<init>
annotationSequence
Generate an array of LexicalPhrases from features (e.g. tokens or lemmas) covered by an annotation (
annotationSequences
Extract a list of LexicalPhrase arrays from the JCas. If #coveringTypeName is set, a dedicated array
characterSequence
Generate a sequence of LexicalPhrases based on characters. Whitespaces are replaced by #WHITESPACE_C
characterSequences
Extract a list of LexicalPhrase arrays from the JCas. If #coveringTypeName is set, a dedicated array
tokenSequences
Generate a list of LexicalPhrase sequences where each list element represents phrases extracted from

Popular in Java

Updating database using SQL prepared statement
getApplicationContext (Context)
scheduleAtFixedRate (Timer)
getSupportFragmentManager (FragmentActivity)
FileNotFoundException (java.io)
Thrown when a file specified by a program cannot be found.
Hashtable (java.util)
A plug-in replacement for JDK1.5 java.util.Hashtable. This version is based on org.cliffc.high_scale
TimeZone (java.util)
TimeZone represents a time zone offset, and also figures out daylight savings. Typically, you get a
Vector (java.util)
Vector is an implementation of List, backed by an array and synchronized. All optional operations in
ReentrantLock (java.util.concurrent.locks)
A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
LoggerFactory (org.slf4j)
The LoggerFactory is a utility class producing Loggers for various logging APIs, most notably for lo
CodeWhisperer alternatives

How to use PhraseSequenceGenerator in de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator

Best Java code snippets using de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator (Showing top 10 results out of 315)

How to use
PhraseSequenceGenerator
in
de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator