/** Returns {@code true} while there are more elements to iterate over. */
public boolean hasNext() {
  return index < size();
}
/**
 * Finds the position of the first occurrence of {@code token} in {@code sentence}.
 *
 * @param sentence the token sequence to scan
 * @param token the token to look for
 * @return the index of the first match, or {@code -1} if the token is absent
 */
private static int indexOf(StringList sentence, String token) {
  int position = -1;
  for (int i = 0; position < 0 && i < sentence.size(); i++) {
    if (token.equals(sentence.getToken(i))) {
      position = i;
    }
  }
  return position;
}
/**
 * Gets the (n-1)-gram formed by dropping the last token of the given ngram.
 *
 * @param ngram an ngram
 * @return the leading (n-1)-gram, or {@code null} if the input has a single token
 */
public static StringList getNMinusOneTokenFirst(StringList ngram) {
  String[] head = new String[ngram.size() - 1];
  for (int i = 0; i < head.length; i++) {
    head[i] = ngram.getToken(i);
  }
  return head.length == 0 ? null : new StringList(head);
}
/**
 * Gets the (n-1)-gram formed by dropping the first token of the given ngram.
 *
 * @param ngram an ngram
 * @return the trailing (n-1)-gram, or {@code null} if the input has a single token
 */
public static StringList getNMinusOneTokenLast(StringList ngram) {
  String[] tail = new String[ngram.size() - 1];
  for (int i = 0; i < tail.length; i++) {
    tail[i] = ngram.getToken(i + 1);
  }
  return tail.length == 0 ? null : new StringList(tail);
}
/** * Initializes the current instance with the given * {@link StringList} {@link Iterator}. * * @param tokenLists */ public Index(Iterator<StringList> tokenLists) { while (tokenLists.hasNext()) { StringList tokens = tokenLists.next(); for (int i = 0; i < tokens.size(); i++) { this.tokens.add(tokens.getToken(i)); } } }
/**
 * Predicts the most probable continuation of the given token prefix by scoring
 * every stored ngram appended to the prefix and keeping the highest-probability one.
 *
 * @param tokens the prefix tokens
 * @return a copy of the most probable next ngram's tokens, or {@code null}
 *         if no candidate scored above negative infinity
 */
@Override
public String[] predictNextTokens(String... tokens) {
  double maxProb = Double.NEGATIVE_INFINITY;
  String[] token = null;

  for (StringList ngram : this) {
    // candidate sequence = supplied prefix followed by this ngram's tokens
    String[] sequence = new String[tokens.length + ngram.size()];
    // bulk-copy the prefix instead of a manual element loop
    System.arraycopy(tokens, 0, sequence, 0, tokens.length);
    for (int i = 0; i < ngram.size(); i++) {
      sequence[tokens.length + i] = ngram.getToken(i);
    }

    double v = calculateProbability(sequence);
    if (v > maxProb) {
      maxProb = v;
      token = new String[ngram.size()];
      for (int i = 0; i < ngram.size(); i++) {
        token[i] = ngram.getToken(i);
      }
    }
  }
  return token;
}
/**
 * Compares two token lists element-wise, ignoring the case of the tokens.
 *
 * <p>Note: case-insensitive comparison can behave unexpectedly under some locales.
 *
 * @param tokens the token list to compare against
 * @return {@code true} if both lists have the same size and all tokens are
 *         equal ignoring case, otherwise {@code false}
 */
public boolean compareToIgnoreCase(StringList tokens) {
  if (size() != tokens.size()) {
    return false;
  }
  for (int i = 0; i < size(); i++) {
    if (getToken(i).compareToIgnoreCase(tokens.getToken(i)) != 0) {
      return false;
    }
  }
  return true;
}
/**
 * Predicts the most probable continuation of the given token prefix: each stored
 * ngram is appended to the prefix, scored, and the highest-scoring ngram is returned.
 *
 * @param tokens the prefix tokens
 * @return the most probable next ngram, or {@code null} if none scored
 *         above negative infinity
 */
@Override
public StringList predictNextTokens(StringList tokens) {
  StringList best = null;
  double bestProb = Double.NEGATIVE_INFINITY;
  int prefixLength = tokens.size();

  for (StringList ngram : this) {
    // build the candidate: prefix tokens followed by this ngram's tokens
    String[] candidate = new String[prefixLength + ngram.size()];
    for (int i = 0; i < prefixLength; i++) {
      candidate[i] = tokens.getToken(i);
    }
    for (int i = 0; i < ngram.size(); i++) {
      candidate[prefixLength + i] = ngram.getToken(i);
    }

    double probability = calculateProbability(new StringList(candidate));
    if (probability > bestProb) {
      bestProb = probability;
      best = ngram;
    }
  }
  return best;
}
/**
 * Counts how many sentences contain the given ngram.
 *
 * <p>NOTE(review): only the FIRST occurrence of the ngram's head token in each
 * sentence is probed, so a match starting at a later position is missed and each
 * sentence contributes at most one count — kept as-is to preserve behavior;
 * confirm whether callers rely on this before changing it.
 *
 * @param ngram the ngram to count
 * @param sentences the sentences to search
 * @return the number of sentences whose first head-token occurrence starts the ngram
 */
private static Double count(StringList ngram, Iterable<StringList> sentences) {
  // primitive accumulator avoids re-boxing a Double on every increment; box once on return
  int count = 0;
  for (StringList sentence : sentences) {
    int idx0 = indexOf(sentence, ngram.getToken(0));
    if (idx0 >= 0 && sentence.size() >= idx0 + ngram.size()) {
      boolean match = true;
      // stop comparing as soon as one token differs
      for (int i = 1; match && i < ngram.size(); i++) {
        match = sentence.getToken(idx0 + i).equals(ngram.getToken(i));
      }
      if (match) {
        count++;
      }
    }
  }
  return (double) count;
}
/**
 * Gets the ngrams of dimension {@code size} of a certain input sequence of tokens.
 *
 * @param sequence a sequence of tokens
 * @param size the size of the resulting ngrams; {@code -1} (or any size at least
 *             the sequence length) returns the whole sequence as the only ngram
 * @return all the possible ngrams of the given size derivable from the input sequence
 */
public static Collection<StringList> getNGrams(StringList sequence, int size) {
  Collection<StringList> ngrams = new LinkedList<>();
  if (size == -1 || size >= sequence.size()) {
    ngrams.add(sequence);
  } else {
    for (int i = 0; i <= sequence.size() - size; i++) {
      // fresh array per window so every StringList is backed by its own data,
      // even if the StringList constructor keeps the array reference
      String[] window = new String[size];
      for (int j = 0; j < size; j++) {
        window[j] = sequence.getToken(i + j);
      }
      ngrams.add(new StringList(window));
    }
  }
  return ngrams;
}
/**
 * Calculates the maximum-likelihood probability of a unigram in a vocabulary:
 * the unigram's count divided by the total number of tokens in the set.
 *
 * <p>NOTE(review): an empty vocabulary makes the divisor zero (NaN result) —
 * presumably callers always pass a non-empty set; verify.
 *
 * @param word the only word in the unigram
 * @param set the vocabulary
 * @return the maximum likelihood probability
 */
public static double calculateUnigramMLProbability(String word, Collection<StringList> set) {
  double totalTokens = 0d;
  for (StringList sentence : set) {
    totalTokens += sentence.size();
  }
  return count(new StringList(word), set) / totalTokens;
}
/**
 * Generates the context for a document using character ngrams.
 *
 * @param document document to extract context from
 * @return the generated context
 */
@Override
public String[] getContext(CharSequence document) {
  NGramModel charNgrams = new NGramModel();
  charNgrams.add(normalizer.normalize(document), minLength, maxLength);

  Collection<String> context = new ArrayList<>();
  for (StringList ngram : charNgrams) {
    if (ngram.size() > 0) {
      context.add(ngram.getToken(0));
    }
  }
  return context.toArray(new String[0]);
}
}
/**
 * Adds lower-cased character-ngram features for the token at {@code index}.
 *
 * @param features the feature list to append to
 * @param tokens the tokens of the current sequence
 * @param index the position of the token to featurize
 * @param preds the previous predictions (unused here)
 */
public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {
  NGramModel ngrams = new NGramModel();
  ngrams.add(tokens[index], minLength, maxLength);

  for (StringList ngram : ngrams) {
    if (ngram.size() > 0) {
      features.add("ng=" + StringUtil.toLowerCase(ngram.getToken(0)));
    }
  }
}
}
private void init(InputStream in) throws IOException { DictionaryEntryPersistor.create(in, entry -> { String operationString = entry.getAttributes().getValue("operation"); StringList word = entry.getTokens(); if (word.size() != 1) throw new InvalidFormatException("Each entry must have exactly one token! " + word); // parse operation Operation operation = Operation.parse(operationString); if (operation == null) throw new InvalidFormatException("Unknown operation type: " + operationString); operationTable.put(word.getToken(0), operation); }); }
private double stupidBackoff(StringList ngram) { int count = getCount(ngram); StringList nMinusOneToken = NGramUtils.getNMinusOneTokenFirst(ngram); if (nMinusOneToken == null || nMinusOneToken.size() == 0) { return (double) count / (double) size(); } else if (count > 0) { double countM1 = getCount(nMinusOneToken); if (countM1 == 0d) { countM1 = size(); // to avoid Infinite if n-1grams do not exist } return (double) count / countM1; } else { return 0.4 * stupidBackoff(NGramUtils.getNMinusOneTokenLast(ngram)); } }
/**
 * Generates the context for a document: the superclass context plus
 * token-ngram (sizes 1..3) features over the normalized, tokenized text.
 *
 * @param document document to extract context from
 * @return the generated context
 */
@Override
public String[] getContext(CharSequence document) {
  // typed list instead of a raw ArrayList
  List<String> context = new ArrayList<>(Arrays.asList(super.getContext(document)));

  CharSequence normalized = this.normalizer.normalize(document);
  String[] words = SimpleTokenizer.INSTANCE.tokenize(normalized.toString());

  if (words.length > 0) {
    NGramModel tokenNgramModel = new NGramModel();
    tokenNgramModel.add(new StringList(words), 1, 3);
    // enhanced-for over the model replaces the raw Iterator + cast
    for (StringList tokenList : tokenNgramModel) {
      if (tokenList.size() > 0) {
        context.add("tg=" + tokenList.toString());
      }
    }
  }
  return context.toArray(new String[0]);
}
}
/**
 * Tests {@link StringList#getToken(int)}: tokens are retrievable by position.
 */
@Test
public void testGetToken() {
  StringList tokens = new StringList("a", "b");

  Assert.assertEquals(2, tokens.size());
  Assert.assertEquals("a", tokens.getToken(0));
  Assert.assertEquals("b", tokens.getToken(1));
}