/**
 * Calculates the cosine similarity between the character-frequency vectors
 * of the first two strings supplied. Each distinct character is one
 * dimension and its weight is the number of times it occurs in the string.
 * Any strings after the second are ignored.
 *
 * @param strings the strings to compare; only {@code strings[0]} and
 *                {@code strings[1]} are used
 * @return the cosine similarity of the two character-count vectors, or
 *         {@code 0} when the array is {@code null}, holds fewer than two
 *         strings, either of the first two strings is {@code null}, or
 *         either of them is empty (the similarity is undefined there and
 *         would otherwise come out as NaN)
 */
public static double stringSimilarity(String... strings) {
    // Two non-null strings are required; indexing strings[1] blindly would throw.
    if (strings == null || strings.length < 2 || strings[0] == null || strings[1] == null) {
        return 0;
    }

    Counter<String> firstCounts = new Counter<>();
    Counter<String> secondCounts = new Counter<>();
    for (char c : strings[0].toCharArray()) {
        firstCounts.incrementCount(String.valueOf(c), 1.0f);
    }
    for (char c : strings[1].toCharArray()) {
        secondCounts.incrementCount(String.valueOf(c), 1.0f);
    }

    Set<String> firstKeys = firstCounts.keySet();
    Set<String> secondKeys = secondCounts.keySet();

    double scalar = 0;
    double norm1 = 0;
    double norm2 = 0;
    for (String key : firstKeys) {
        double count = firstCounts.getCount(key);
        norm1 += count * count;
        // Only characters present in both strings contribute to the dot product.
        if (secondKeys.contains(key)) {
            scalar += count * secondCounts.getCount(key);
        }
    }
    for (String key : secondKeys) {
        double count = secondCounts.getCount(key);
        norm2 += count * count;
    }

    double denominator = Math.sqrt(norm1 * norm2);
    // An empty string has a zero-length vector; avoid 0/0 = NaN.
    if (denominator == 0) {
        return 0;
    }
    return scalar / denominator;
}
/**
 * Finds the first/second pair carrying the largest count across all inner
 * counters held in {@code maps}.
 *
 * @return the pair built from the outer key and that key's inner
 *         {@code argMax()}, or {@code null} if {@code maps} has no entries
 */
public Pair<F, S> argMax() {
    double bestCount = -Double.MAX_VALUE;
    Pair<F, S> bestPair = null;

    for (Map.Entry<F, Counter<S>> entry : maps.entrySet()) {
        final Counter<S> inner = entry.getValue();
        final S candidate = inner.argMax();
        final double candidateCount = inner.getCount(candidate);

        // The very first entry is always accepted, whatever its count.
        if (bestPair == null || candidateCount > bestCount) {
            bestPair = new Pair<F, S>(entry.getKey(), candidate);
            bestCount = candidateCount;
        }
    }

    return bestPair;
}
/**
 * Returns the share of the total count that belongs to the given element,
 * i.e. {@code getCount(element) / totalCount()}.
 *
 * @param element the element to look up
 * @return the element's count divided by the total count
 * @throws IllegalStateException if the total count is zero or negative
 */
public double getProbability(T element) {
    final double total = totalCount();
    if (total <= 0.0) {
        throw new IllegalStateException("Can't calculate probability with empty counter");
    }

    final double count = getCount(element);
    return count / total;
}
/**
 * Adds every count held by another counter to this one: for each key of
 * {@code other}, this counter's count for that key is incremented by the
 * count {@code other} reports for it.
 *
 * @param other the counter whose counts are added to this counter
 * @param <T2>  element type of the other counter, a subtype of {@code T}
 */
public <T2 extends T> void incrementAll(Counter<T2> other) {
    for (T2 key : other.keySet()) {
        incrementCount(key, other.getCount(key));
    }
}
/**
 * Normalizes this counter: every count is divided by the total recorded in
 * {@code totalCount} at the moment this method is entered, after which the
 * totals are rebuilt.
 * <p>
 * The divisor is read once, before any count is rewritten, so that every
 * key is scaled by the same value even if {@code setCount} adjusts the
 * running total as a side effect.
 * <p>
 * NOTE(review): a total of zero produces NaN/Infinity counts, exactly as
 * before; confirm whether callers need a guard.
 */
public void normalize() {
    final double total = totalCount.get();
    for (T key : keySet()) {
        setCount(key, getCount(key) / total);
    }

    rebuildTotals();
}
public void sortColumnsByWordLikelihoodIncluded(final int column) { final Counter<String> counter = new Counter<>(); List<String> col = getColumn(column); StringTokenizer tokenizer = new StringTokenizer(s); while (tokenizer.hasMoreTokens()) { counter.incrementCount(tokenizer.nextToken(), 1.0f); if (counter.totalCount() <= 0.0) { log.warn("Unable to calculate probability; nothing found"); return; counter.incrementAll(counter.keySet(), 1.0f); Set<String> remove = new HashSet<>(); for (String key : counter.keySet()) if (key.length() < 2 || key.matches("[a-z]+")) remove.add(key); for (String key : remove) counter.removeKey(key); counter.dropElementsBelowThreshold(4.0f); final double totalCount = counter.totalCount();
/**
 * Looks up the count stored for a first/second pair.
 *
 * @param first  the outer key
 * @param second the inner key
 * @return the inner counter's count for {@code second}, or {@code 0.0} when
 *         no inner counter is registered for {@code first}
 */
public double getCount(F first, S second) {
    final Counter<S> inner = maps.get(first);
    return inner == null ? 0.0 : inner.getCount(second);
}
/**
 * Builds a new grid containing the rows of this grid whose value in the
 * given column is still present in the occurrence counter after
 * {@code dropElementsBelowThreshold(2.0f)} has been applied (presumably the
 * values that occur at least twice; verify against {@code Counter}).
 *
 * @param column index of the column to inspect; validated through
 *               {@code checkInvalidColumn}
 * @return a new {@code StringGrid} (same separator and column count) holding
 *         the matching rows in iteration order
 */
public StringGrid getRowsWithDuplicateValuesInColumn(int column) {
    checkInvalidColumn(column);

    StringGrid duplicates = new StringGrid(sep, numColumns);

    // Tally how many times each value appears in the column.
    Counter<String> occurrences = new Counter<>();
    for (String value : getColumn(column)) {
        occurrences.incrementCount(value, 1.0f);
    }
    occurrences.dropElementsBelowThreshold(2.0f);

    // One set lookup per row instead of scanning every remaining key per row.
    Set<String> repeated = occurrences.keySet();
    for (List<String> row : this) {
        if (repeated.contains(row.get(column))) {
            duplicates.addRow(row);
        }
    }

    return duplicates;
}
Counter<Character> counter = new Counter<>(); for (int j = 0; j < line.length(); j++) { counter.incrementCount(line.charAt(j), 1.0f); if (counter.getCount('"') > 1) { String[] split = StringUtils.splitOnCharWithQuoting(line, sep.charAt(0), '"', '\\'); add(new ArrayList<>(Arrays.asList(split)));
/**
 * Adds {@code inc} to the count stored for a first/second pair. If no inner
 * counter exists yet for {@code first}, a new one is created and registered
 * in {@code maps} before the increment is applied.
 *
 * @param first  the outer key
 * @param second the inner key
 * @param inc    the amount to add
 */
public void incrementCount(F first, S second, double inc) {
    Counter<S> inner = maps.get(first);

    // Lazily create the inner counter on first use of this outer key.
    if (inner == null) {
        inner = new Counter<S>();
        maps.put(first, inner);
    }

    inner.incrementCount(second, inc);
}
/**
 * Loads every entry of the map into a fresh counter (key as element, value
 * as increment) and returns whatever that counter's {@code argMax()} yields.
 *
 * @param map keys with their integer weights
 * @return the result of {@code argMax()} on the populated counter
 */
private String maximalValue(Map<String, Integer> map) {
    final Counter<String> tally = new Counter<>();

    for (Map.Entry<String, Integer> weighted : map.entrySet()) {
        final String key = weighted.getKey();
        final Integer weight = weighted.getValue();
        tally.incrementCount(key, weight);
    }

    return tally.argMax();
}
/**
 * Scores every word of the vocabulary by the cosine similarity between the
 * supplied vector and the word's encoded vector, keeps the top {@code k}
 * entries of the resulting counter and returns its sorted key set.
 *
 * @param v the query vector
 * @param k how many entries to keep
 * @return the keys remaining after {@code keepTopNElements(k)}, as returned
 *         by {@code keySetSorted()}
 */
public Collection<String> getNearestNeighbours(INDArray v, int k) {
    final Counter<String> similarities = new Counter<>();

    for (Object entry : vec.vocab().words()) {
        final String candidate = (String) entry;
        final INDArray candidateVector = encode(candidate);
        similarities.incrementCount(candidate, Transforms.cosineSim(v, candidateVector));
    }

    similarities.keepTopNElements(k);
    return similarities.keySetSorted();
}
/**
 * Increments the count of each element in the collection by the same
 * amount. An element appearing several times in the collection is
 * incremented once per appearance.
 *
 * @param elements the elements to increment
 * @param inc      the amount added for each element
 */
public void incrementAll(Collection<T> elements, double inc) {
    for (T item : elements) {
        incrementCount(item, inc);
    }
}
/**
 * Adds {@code by} to the word's frequency and, once that frequency has
 * reached {@code minWordFrequency}, appends the word to {@code vocabWords}
 * unless it is already listed there.
 *
 * @param word the word whose frequency is updated
 * @param by   the amount to add
 */
@Override
public void incrementCount(String word, double by) {
    wordFrequencies.incrementCount(word, by);

    final boolean frequentEnough = wordFrequencies.getCount(word) >= minWordFrequency;
    if (frequentEnough && vocabWords.indexOf(word) < 0) {
        vocabWords.add(word);
    }
}
/**
 * Reduces this counter to at most {@code N} elements. The current contents
 * are captured through {@code asPriorityQueue()}, the counter is cleared,
 * and the first {@code N} pairs polled from that queue are re-inserted
 * (presumably highest counts first; verify the queue's ordering).
 *
 * @param N the maximum number of elements to keep; zero or a negative value
 *          leaves the counter empty
 */
public void keepTopNElements(int N) {
    final PriorityQueue<Pair<T, Double>> ranked = asPriorityQueue();
    clear();

    int kept = 0;
    while (kept < N) {
        final Pair<T, Double> top = ranked.poll();
        if (top == null) {
            // Queue exhausted: fewer than N elements were available.
            break;
        }
        incrementCount(top.getFirst(), top.getSecond());
        kept++;
    }
}
/**
 * Sets the count stored for a first/second pair. If no inner counter exists
 * yet for {@code first}, a new one is created and registered in
 * {@code maps} before the value is set.
 *
 * @param first  the outer key
 * @param second the inner key
 * @param value  the count to store
 * @return whatever the inner counter's {@code setCount(second, value)} returns
 */
public double setCount(F first, S second, double value) {
    Counter<S> inner = maps.get(first);

    // Lazily create the inner counter on first use of this outer key.
    if (inner == null) {
        inner = new Counter<S>();
        maps.put(first, inner);
    }

    return inner.setCount(second, value);
}
/**
 * Positions the inner iterator on an outer key that still has at least one
 * inner element, advancing the outer iterator as needed.
 * <p>
 * Outer keys whose inner counter has an empty key set are skipped, so a
 * {@code true} result guarantees that {@code innerIt.hasNext()} holds.
 * Advancing only once would report {@code true} for an empty inner counter.
 *
 * @return {@code true} if {@code innerIt} has a next element (with
 *         {@code curKey} set to the matching outer key), {@code false} once
 *         the outer iterator is exhausted
 */
private boolean hasInside() {
    while (innerIt == null || !innerIt.hasNext()) {
        if (!outerIt.hasNext()) {
            return false;
        }
        curKey = outerIt.next();
        innerIt = getCounter(curKey).keySet().iterator();
    }
    return true;
}
/**
 * Returns the value stored for the word in {@code docFrequencies}. The
 * stored count is returned as-is; no inversion or logarithm is applied in
 * this method.
 *
 * @param word the word to look up
 * @return {@code docFrequencies.getCount(word)}
 */
@Override
public double idf(String word) {
    final double documentFrequency = docFrequencies.getCount(word);
    return documentFrequency;
}