/**
 * Picks the entry of {@code fields} that has the smallest Levenshtein distance to {@code value}.
 * When several entries share the smallest distance, the one encountered first while iterating
 * the set is kept.
 *
 * @param fields candidate field names to choose from
 * @param value  the string each candidate is compared against
 * @return the closest candidate, or {@code null} when {@code fields} is empty
 */
public String findBestMatchField(Set<String> fields, String value) {
    String closest = null;
    int closestDistance = Integer.MAX_VALUE;
    for (String candidate : fields) {
        int distance = StringUtils.getLevenshteinDistance(value, candidate);
        if (distance >= closestDistance) {
            // Not strictly better than what we already have: keep the earlier candidate.
            continue;
        }
        closestDistance = distance;
        closest = candidate;
    }
    return closest;
}
/**
 * Levenshtein distance (LD) is a measure of the similarity between two strings, which we will refer to as the source
 * string (s) and the target string (t). The distance is the number of deletions, insertions, or substitutions
 * required to transform s into t.
 *
 * @param metaA metadata of the first value; not consulted by this method
 * @param dataA first value; compared through its {@code toString()} form
 * @param metaB metadata of the second value; not consulted by this method
 * @param dataB second value; compared through its {@code toString()} form
 * @return the Levenshtein distance between the two string forms, or {@code null} when either value is {@code null}
 */
public static Long getLevenshtein_Distance( ValueMetaInterface metaA, Object dataA, ValueMetaInterface metaB,
  Object dataB ) {
  if ( dataA == null || dataB == null ) {
    return null;
  }
  // Long.valueOf replaces the deprecated Long(long) constructor.
  return Long.valueOf( (long) StringUtils.getLevenshteinDistance( dataA.toString(), dataB.toString() ) );
}
break; default: cdistance = StringUtils.getLevenshteinDistance( usecacheValue, uselookupvalue ); break;
/**
 * Checks whether the Levenshtein distance of the two strings is at most {@code maxDistance}.
 * The bound is inclusive: a distance exactly equal to {@code maxDistance} is accepted.
 *
 * @param title1      the first string
 * @param title2      the second string
 * @param maxDistance the largest distance that still counts as "within distance"
 * @return {@code true} when the distance is less than or equal to {@code maxDistance}
 */
public static boolean inLevenshteinDistance(String title1, String title2, int maxDistance) {
    int distance = org.apache.commons.lang.StringUtils.getLevenshteinDistance(title1, title2);
    return distance <= maxDistance;
}
/**
 * Treats two objects as matching when the Levenshtein distance between their
 * {@code toString()} forms is 3 or less.
 *
 * @param first  the first object; must not be {@code null}
 * @param second the second object; must not be {@code null}
 * @return {@code true} when the string forms differ by at most 3 edits
 */
@Override
public boolean test(final Object first, final Object second) {
    final String secondText = second.toString();
    final String firstText = first.toString();
    return StringUtils.getLevenshteinDistance(secondText, firstText) <= 3;
}
@VisibleForTesting protected boolean areVerySimilarStrings(String expected, String actual) { // cut complexity when the strings length difference is bigger than the accepted threshold return (Math.abs(expected.length() - actual.length()) <= MAX_STRING_DISTANCE) && StringUtils.getLevenshteinDistance(expected, actual) < MAX_STRING_DISTANCE; }
@VisibleForTesting protected boolean areVerySimilarStrings(String expected, String actual) { // cut complexity when the strings length difference is bigger than the accepted threshold return (Math.abs(expected.length() - actual.length()) <= MAX_STRING_DISTANCE) && StringUtils.getLevenshteinDistance(expected, actual) < MAX_STRING_DISTANCE; }
/**
 * Two abstracts are regarded probably same
 * if their levenshtein distance is less than a configured percentage
 * ({@code MAX_DISTANCE_PERCENT}) of the length of the shorter of the two texts.
 *
 * @param other the merger to compare this one with
 * @return {@code false} when {@code other} is not an {@code MCRAbstractMerger}; otherwise whether
 *         the two texts are close enough. When at least one text is empty, only two empty texts
 *         are regarded as the same.
 */
@Override
public boolean isProbablySameAs(MCRMerger other) {
    if (!(other instanceof MCRAbstractMerger)) {
        return false;
    }

    String textOther = ((MCRAbstractMerger) other).text;
    int length = Math.min(text.length(), textOther.length());
    int distance = StringUtils.getLevenshteinDistance(text, textOther);
    if (length == 0) {
        // The percentage below would divide by zero; with an empty text the
        // distance is 0 only when both texts are empty.
        return distance == 0;
    }
    return (distance * 100 / length) < MAX_DISTANCE_PERCENT;
}
}
/**
 * Orders two candidates by how close their lower-cased names are to {@code fileName},
 * measured by Levenshtein distance. {@code fileName} is used as given; it is not
 * lower-cased here.
 *
 * @param a        the first candidate
 * @param b        the second candidate
 * @param fileName the name both candidates are compared against
 * @return a negative value when {@code a} is closer, a positive value when {@code b} is closer,
 *         {@code 0} when both are equally distant
 */
private int compareName(FileOrRendition a, FileOrRendition b, String fileName) {
    String nameOfA = a.getName().toLowerCase();
    int distanceOfA = StringUtils.getLevenshteinDistance(nameOfA, fileName);

    String nameOfB = b.getName().toLowerCase();
    int distanceOfB = StringUtils.getLevenshteinDistance(nameOfB, fileName);

    return Integer.compare(distanceOfA, distanceOfB);
}
/**
 * Wraps {@code delegate} in an {@code OmitSimilarItems} consumer for strings.
 * Two strings count as similar when both are non-null and their Levenshtein distance
 * is at most {@code maxLevenshteinDistance}. The skip callback handed to
 * {@code OmitSimilarItems} forwards a " ... N similar lines omitted ..." message to
 * {@code delegate}.
 *
 * @param maxLevenshteinDistance largest edit distance at which two strings still count as similar
 * @param delegate               consumer receiving the forwarded lines and the omission messages
 * @return the wrapping consumer
 */
public static Consumer<String> forStrings(int maxLevenshteinDistance, Consumer<String> delegate) {
    BiPredicate<String, String> isSimilar = (left, right) ->
        left != null
            && right != null
            && StringUtils.getLevenshteinDistance(left, right) <= maxLevenshteinDistance;

    return new OmitSimilarItems<>(
        delegate,
        (skipped) -> delegate.accept(" ... " + skipped + " similar lines omitted ..."),
        isSimilar
    );
}
}
/**
 * Compares two strings (after {@link StringUtils#trim(String) trimming})
 * by using the Levenshtein edit distance of the two strings.
 * The result is not the number of changes but the similarity
 * <code>1-(changes/maxStringSizeAfterTrim)</code>.<p>
 * If either string is empty after trimming, <code>0</code> is returned.
 * @param s1 the first string
 * @param s2 the second string
 * @return the similarity derived from the edit distance
 * @throws IllegalArgumentException if any of the two passed strings is NULL
 */
public static double levenshtein(String s1, String s2) {
    if (s1 == null || s2 == null) {
        throw new IllegalArgumentException("NONE of the parsed String MUST BE NULL!");
    }
    final String left = StringUtils.trim(s1);
    final String right = StringUtils.trim(s2);
    if (left.isEmpty() || right.isEmpty()) {
        return 0;
    }
    final double changes = getLevenshteinDistance(left, right);
    final double longest = Math.max(left.length(), right.length());
    return 1.0 - (changes / longest);
}
}
/**
 * Decides whether two strings are clones of each other by comparing their Levenshtein
 * distance with the threshold computed by {@code getThreshold} for the given coefficient.
 *
 * @param str1
 *            the first string.
 * @param str2
 *            the second string.
 * @param thresholdCoef
 *            the threshold coefficient: must be between 0.0-1.0.
 * @return true if the Levenshtein distance is lower than or equal to the computed threshold.
 * @throws IllegalArgumentException
 *             if {@code thresholdCoef} is outside 0.0-1.0 or is NaN.
 */
public static boolean isClone(String str1, String str2, double thresholdCoef) {
    LOGGER.info("Calculating the Edit Distance with threshold-coef: " + thresholdCoef);
    // Negated in-range test: NaN fails every comparison, so it is rejected here as well.
    if (!(thresholdCoef >= 0.0 && thresholdCoef <= 1.0)) {
        throw new IllegalArgumentException(
                "Threshold Coefficient must be between 0.0 and 1.0!");
    }
    return StringUtils.getLevenshteinDistance(str1, str2) <= getThreshold(str1, str2, thresholdCoef);
}
}
/**
 * Checks whether {@code text} is similar to at least one of {@code fields}.
 * Both sides are normalized first: every character that is not a letter or one of the
 * digits 0-9 is removed, then the result is lower-cased with {@link Locale#ENGLISH}.
 * A field matches when one normalized string is reported as a subsequence of the other
 * by {@code abbreviationDirectory}, or when both are longer than 20 characters and their
 * Levenshtein distance is at most 5.
 *
 * @param text   the text to check
 * @param fields the fields to compare the text against
 * @return {@code true} when there is nothing to compare ({@code text} is null, or
 *         {@code fields} is null or empty) or when some field matches; {@code false} otherwise
 */
private boolean passesSimilarity(String text, final String[] fields) {
    if (text == null || fields == null || fields.length == 0) {
        return true;
    }

    final String cleanText = text.replaceAll("[^\\p{L}0-9]++", "").toLowerCase(Locale.ENGLISH).trim();
    for (final String rawField : fields) {
        final String cleanField = rawField.replaceAll("[^\\p{L}0-9]++", "").toLowerCase(Locale.ENGLISH).trim();

        final boolean abbreviationMatch = abbreviationDirectory.isSubsequence(cleanText, cleanField)
                || abbreviationDirectory.isSubsequence(cleanField, cleanText);
        if (abbreviationMatch) {
            return true;
        }

        // Edit distance is only trusted for reasonably long strings.
        final boolean bothLong = cleanText.length() > 20 && cleanField.length() > 20;
        if (bothLong && StringUtils.getLevenshteinDistance(cleanField, cleanText) <= 5) {
            return true;
        }
    }
    return false;
}
/**
 * Similarity metric insensitive for: - order of words - punctuation -
 * accents - whitespace - case.
 * <p>
 * Both inputs are tokenized with {@code tokenizeAndFilterAC}, the words are sorted and
 * joined with single spaces, and the Levenshtein distance of the two joined strings is
 * scaled by the length of the longer one.
 *
 * @param a the first string; must not be {@code null}
 * @param b the second string; must not be {@code null}
 * @return from 0 - no similarity to 1 - very very similar; 1 is also returned when both
 *         inputs yield no words at all
 */
public static double orderLenientSimilarity(String a, String b) {
    Preconditions.checkArgument(a != null);
    Preconditions.checkArgument(b != null);

    List<String> wordsOfA = tokenizeAndFilterAC(a);
    List<String> wordsOfB = tokenizeAndFilterAC(b);
    Collections.sort(wordsOfA);
    Collections.sort(wordsOfB);

    String orderedWordsOfA = StringUtils.join(wordsOfA, " ");
    String orderedWordsOfB = StringUtils.join(wordsOfB, " ");

    int len = Math.max(orderedWordsOfA.length(), orderedWordsOfB.length());
    if (len == 0) {
        // Both joined strings are empty, hence equal; dividing 0 by 0.0 would yield NaN.
        return 1.0;
    }
    return (len - StringUtils.getLevenshteinDistance(orderedWordsOfA, orderedWordsOfB)) / (double) len;
}
/**
 * Checks whether {@code text} is similar to at least one of {@code fields}.
 * Both sides are normalized first: every character that is not a letter or one of the
 * digits 0-9 is removed, then the result is lower-cased with {@link Locale#ENGLISH}.
 * A field matches when {@code AbbreviationDirectory.checkIfSubsequence} reports one
 * normalized string as a subsequence of the other, or when both are longer than
 * 20 characters and their Levenshtein distance is at most 5.
 *
 * @param text   the text to check
 * @param fields the fields to compare the text against
 * @return {@code true} when there is nothing to compare ({@code text} is null, or
 *         {@code fields} is null or empty) or when some field matches; {@code false} otherwise
 */
private boolean passesSimilarity(String text, final List<String> fields) {
    if (text == null || fields == null || fields.isEmpty()) {
        return true;
    }

    final String cleanText = text.replaceAll("[^\\p{L}0-9]++", "").toLowerCase(Locale.ENGLISH).trim();
    for (final String rawField : fields) {
        final String cleanField = rawField.replaceAll("[^\\p{L}0-9]++", "").toLowerCase(Locale.ENGLISH).trim();

        final boolean abbreviationMatch = AbbreviationDirectory.checkIfSubsequence(cleanText, cleanField)
                || AbbreviationDirectory.checkIfSubsequence(cleanField, cleanText);
        if (abbreviationMatch) {
            return true;
        }

        // Edit distance is only trusted for reasonably long strings.
        final boolean bothLong = cleanText.length() > 20 && cleanField.length() > 20;
        if (bothLong && StringUtils.getLevenshteinDistance(cleanField, cleanText) <= 5) {
            return true;
        }
    }
    return false;
}
/**
 * Lenient Levenshtein distance, insensitive for: - punctuation - accents -
 * whitespaces - case.
 * <p>
 * Both inputs are passed through {@code filterAPWC} before the distance is computed.
 *
 * @param a the first string
 * @param b the second string
 * @return the Levenshtein distance between the two filtered strings
 */
public static int punctuationLenientDistance(String a, String b) {
    final String filteredA = filterAPWC(a);
    final String filteredB = filterAPWC(b);
    return StringUtils.getLevenshteinDistance(filteredA, filteredB);
}
/**
 * Reports whether two strings are at most one edit apart (Levenshtein distance 0 or 1).
 * A string that {@code Strings.isEmpty} reports as empty is never similar to anything.
 *
 * @param s0 the first string
 * @param s1 the second string
 * @return {@code true} when neither string is empty and the distance is at most 1
 */
public boolean isSimilar(String s0, String s1) {
    final boolean bothPresent = !Strings.isEmpty(s0) && !Strings.isEmpty(s1);
    return bothPresent && StringUtils.getLevenshteinDistance(s0, s1) <= 1;
}
/**
 * Lenient Levenshtein distance, insensitive for: - accents - whitespaces -
 * case.
 * <p>
 * Both inputs are passed through {@code filterAWC} before the distance is computed.
 *
 * @param a the first string
 * @param b the second string
 * @return the Levenshtein distance between the two filtered strings
 */
public static int lenientDistance(String a, String b) {
    final String filteredA = filterAWC(a);
    final String filteredB = filterAWC(b);
    return StringUtils.getLevenshteinDistance(filteredA, filteredB);
}
public Suggestion apply(ModelPath available) { int distance = StringUtils.getLevenshteinDistance(unavailable.toString(), available.toString()); boolean suggest = distance <= Math.min(3, unavailable.toString().length() / 2); if (suggest) { return new Suggestion(distance, available); } else { // avoid excess creation of Suggestion objects return null; } } });
/**
 * Checks whether {@code text} is similar to at least one of {@code fields}.
 * Both sides are run through {@code normalizeText} first. A field matches when
 * {@code abbreviationDirectory} reports one normalized string as a subsequence of the
 * other, or when both are longer than 20 characters and their Levenshtein distance is
 * at most 5.
 *
 * @param text   the text to check
 * @param fields the fields to compare the text against
 * @return {@code true} when there is nothing to compare ({@code text} is null, or
 *         {@code fields} is null or empty) or when some field matches; {@code false} otherwise
 */
private boolean passesSimilarity(String text, String[] fields) {
    if (text == null || fields == null || fields.length == 0) {
        return true;
    }

    final String cleanText = normalizeText(text);
    for (String field : fields) {
        final String cleanField = normalizeText(field);

        final boolean abbreviationMatch = abbreviationDirectory.isSubsequence(cleanText, cleanField)
                || abbreviationDirectory.isSubsequence(cleanField, cleanText);
        if (abbreviationMatch) {
            return true;
        }

        // Edit distance is only trusted for reasonably long strings.
        final boolean bothLong = cleanText.length() > 20 && cleanField.length() > 20;
        if (bothLong && StringUtils.getLevenshteinDistance(cleanField, cleanText) <= 5) {
            return true;
        }
    }
    return false;
}