public String key(String s, Object... o) { if (s == null || o != null && o.length > 0) { throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter"); } s = s.trim(); // first off, remove whitespace around the string s = s.toLowerCase(); // then lowercase it s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars String[] frags = StringUtils.split(s); // split by whitespace TreeSet<String> set = new TreeSet<>(); Collections.addAll(set, frags); StringBuilder b = new StringBuilder(); Iterator<String> i = set.iterator(); while (i.hasNext()) { // join ordered fragments back together b.append(i.next()); if (i.hasNext()) { b.append(' '); } } return asciify(b.toString()); // find ASCII equivalent to characters }
/**
 * Builds a new string by passing every character of {@code s} through
 * {@code translate}, in order, and concatenating the results.
 *
 * @param s the string to convert
 * @return the concatenation of the translated characters
 */
protected String asciify(String s) {
    StringBuilder out = new StringBuilder();
    for (int idx = 0; idx < s.length(); idx++) {
        out.append(translate(s.charAt(idx)));
    }
    return out.toString();
}
/**
 * Groups the given strings by their fingerprint key.
 *
 * <p>Each string is keyed with a {@code FingerPrintKeyer}; under that key this
 * cluster stores a map from each distinct original string to the number of
 * times it occurs in {@code list}.
 *
 * @param list the strings to cluster
 */
public StringCluster(List<String> list) {
    // One keyer serves the whole list; there is no need to allocate one per element.
    FingerPrintKeyer keyer = new FingerPrintKeyer();
    // Iterate directly rather than by index so non-random-access lists stay linear.
    for (String s : list) {
        String key = keyer.key(s);
        Map<String, Integer> m;
        if (containsKey(key)) {
            m = get(key);
        } else {
            m = new TreeMap<>();
            put(key, m);
        }
        // Counts are never stored as null, so a null lookup means "not seen yet".
        Integer count = m.get(s);
        m.put(s, count == null ? 1 : count + 1);
    }
}
/** * Deduplicate based on the column clustering signature * @param column */ public void dedupeByCluster(int column) { StringCluster cluster = clusterColumn(column); System.out.println(cluster.get("family mcdonalds restaurant")); System.out.println(cluster.get("family mcdonalds restaurants")); List<Map<String, Integer>> list2 = cluster.getClusters(); for (int i = 0; i < list2.size(); i++) { if (list2.get(i).size() > 1) { System.out.println(list2.get(i)); } } FingerPrintKeyer keyer = new FingerPrintKeyer(); Set<Integer> alreadyDeDupped = new HashSet<>(); for (int i = 0; i < size(); i++) { String key = keyer.key(get(i).get(column)); Map<String, Integer> map = cluster.get(key); if (map != null && map.size() > 1) { List<Integer> list = filterRowsByColumn(column, map.keySet()); //deduplication to do if (list.size() > 1) modifyRows(alreadyDeDupped, column, list, map); } } }