/**
 * Prepares this instance for building a trie dictionary of strings.
 *
 * <p>Remembers {@code baseId} and creates a fresh {@code TrieDictionaryBuilder}
 * backed by a {@code StringBytesConverter}. The {@code info} and {@code hdfsDir}
 * arguments are not used by this implementation.
 *
 * @param info    dictionary description (ignored here)
 * @param baseId  base id stored on this instance
 * @param hdfsDir HDFS directory (ignored here)
 * @throws IOException declared by the overridden method; nothing in this body throws it
 */
@Override
public void init(DictionaryInfo info, int baseId, String hdfsDir) throws IOException {
    this.baseId = baseId;

    StringBytesConverter stringConverter = new StringBytesConverter();
    this.builder = new TrieDictionaryBuilder(stringConverter);
}
/**
 * Prepares this instance for building a trie dictionary forest of strings.
 *
 * <p>Creates a fresh {@code TrieDictionaryForestBuilder} from a
 * {@code StringBytesConverter} and the given {@code baseId}. The {@code info}
 * and {@code hdfsDir} arguments are not used by this implementation.
 *
 * @param info    dictionary description (ignored here)
 * @param baseId  base id handed to the forest builder
 * @param hdfsDir HDFS directory (ignored here)
 * @throws IOException declared by the overridden method; nothing in this body throws it
 */
@Override
public void init(DictionaryInfo info, int baseId, String hdfsDir) throws IOException {
    StringBytesConverter stringConverter = new StringBytesConverter();
    builder = new TrieDictionaryForestBuilder(stringConverter, baseId);
}
/**
 * Creates a forest builder for strings and feeds it every value of {@code strs}
 * in iteration order.
 *
 * @param strs   values to add to the builder
 * @param baseId base id passed to the builder's constructor
 * @return the populated (not yet built) forest builder
 */
public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(converter, baseId);
    for (String value : strs) {
        forestBuilder.addValue(value);
    }
    return forestBuilder;
}
/**
 * Creates a trie dictionary builder for strings and feeds it every value of
 * {@code str} in iteration order.
 *
 * @param str values to add to the builder
 * @return the populated (not yet built) trie builder
 */
private static TrieDictionaryBuilder<String> newDictBuilder(Iterable<String> str) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryBuilder<String> trieBuilder = new TrieDictionaryBuilder<String>(converter);
    for (String value : str) {
        trieBuilder.addValue(value);
    }
    return trieBuilder;
}
/**
 * Creates a forest builder for strings, applies the requested maximum trie tree
 * size, and drains {@code strs} into it.
 *
 * @param strs     iterator supplying the values; it is consumed completely
 * @param baseId   base id passed to the builder's constructor
 * @param treeSize value passed to {@code setMaxTrieTreeSize}
 * @return the populated (not yet built) forest builder
 */
public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterator<String> strs, int baseId, int treeSize) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(converter, baseId);
    forestBuilder.setMaxTrieTreeSize(treeSize);
    while (strs.hasNext()) {
        String value = strs.next();
        forestBuilder.addValue(value);
    }
    return forestBuilder;
}
public synchronized void init() throws IOException { this.store = new GlobalDictHDFSStore(baseDir); store.prepareForWrite(workingDir, isAppendDictGlobal); Long[] versions = store.listAllVersions(); if (versions.length == 0 || !isAppendDictGlobal) { // build dict for the first time this.maxId = 0; this.maxValueLength = 0; this.nValues = 0; this.bytesConverter = new StringBytesConverter(); } else { // append values to last version GlobalDictMetadata metadata = store.getMetadata(versions[versions.length - 1]); this.maxId = metadata.maxId; this.maxValueLength = metadata.maxValueLength; this.nValues = metadata.nValues; this.bytesConverter = metadata.bytesConverter; this.sliceFileMap = new TreeMap<>(metadata.sliceFileMap); } }
/**
 * Creates a forest builder for strings, applies the requested maximum trie tree
 * size, and adds every value of {@code strs} in iteration order.
 *
 * @param strs     values to add to the builder
 * @param baseId   base id passed to the builder's constructor
 * @param treeSize value passed to {@code setMaxTrieTreeSize}
 * @return the populated (not yet built) forest builder
 */
public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId, int treeSize) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(converter, baseId);
    forestBuilder.setMaxTrieTreeSize(treeSize);
    for (String value : strs) {
        forestBuilder.addValue(value);
    }
    return forestBuilder;
}
/**
 * Loads the category-grouping names fixture, sorts it in byte order and runs the
 * shared string dictionary check with no "not found" list.
 *
 * @throws Exception if the fixture cannot be opened or read, or the check fails
 */
@Test
public void categoryNamesTest() throws Exception {
    ArrayList<String> str;
    // try-with-resources: release the file handle even when loading throws
    try (InputStream is = new FileInputStream("src/test/resources/dict/dw_category_grouping_names.dat")) {
        str = loadStrings(is);
    }
    Collections.sort(str, new ByteComparator<String>(new StringBytesConverter()));
    testStringDictionary(str, null);
}
/**
 * Collects the strings produced by a {@code RandomStrings(count)} source into a
 * list, sorts the list in byte order, passes it to {@code evaluateDataSize} and
 * returns it.
 *
 * @param count argument passed to the {@code RandomStrings} constructor
 * @return the byte-order-sorted test data
 */
private ArrayList<String> getTestData(int count) {
    RandomStrings source = new RandomStrings(count);
    ArrayList<String> testData = new ArrayList<>();
    for (Iterator<String> it = source.iterator(); it.hasNext();) {
        testData.add(it.next());
    }

    ByteComparator<String> byteOrder = new ByteComparator<String>(new StringBytesConverter());
    Collections.sort(testData, byteOrder);
    evaluateDataSize(testData);
    return testData;
}
/**
 * Builds a trie dictionary containing the given strings.
 *
 * @param strs values to add, in the collection's iteration order
 * @return the dictionary produced by {@code build(0)}
 */
@SuppressWarnings("rawtypes")
private static Dictionary strsToDict(Collection<String> strs) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryBuilder<String> trieBuilder = new TrieDictionaryBuilder<>(converter);
    for (String value : strs) {
        trieBuilder.addValue(value);
    }
    return trieBuilder.build(0);
}
/**
 * Loads the English words fixture, sorts it in byte order and runs the shared
 * string dictionary check with no "not found" list.
 *
 * @throws Exception if the fixture cannot be opened or read, or the check fails
 */
@Test
public void englishWordsTest() throws Exception {
    ArrayList<String> str;
    // try-with-resources: release the file handle even when loading throws
    try (InputStream is = new FileInputStream("src/test/resources/dict/english-words.80 (scowl-2015.05.18).txt")) {
        str = loadStrings(is);
    }
    Collections.sort(str, new ByteComparator<String>(new StringBytesConverter()));
    testStringDictionary(str, null);
}
/**
 * Runs the shared string dictionary check with a small set of words sharing the
 * "pa" prefix, plus a list of strings passed as the "not found" argument.
 */
@Test
public void notFoundTest() {
    ArrayList<String> present = new ArrayList<String>();
    for (String word : new String[] { "part", "par", "partition", "party", "parties", "paint" }) {
        present.add(word);
    }
    Collections.sort(present, new ByteComparator<String>(new StringBytesConverter()));

    ArrayList<String> absent = new ArrayList<String>();
    for (String word : new String[] { "", "p", "pa", "pb", "parti", "partz", "partyz" }) {
        absent.add(word);
    }

    testStringDictionary(present, absent);
}
/**
 * Test fixture: generates a string data set and builds two dictionaries from it,
 * one with {@code TrieDictionaryBuilder} (stored in {@code oldDict}) and one with
 * {@code TrieDictionaryForestBuilder} (stored in {@code newDict}), then prints
 * how many trees the forest ended up with.
 */
@Before
public void before() {
    int dataSize = 100 * 10000;
    TrieDictionaryBuilder<String> trieBuilder = new TrieDictionaryBuilder<>(new StringBytesConverter());
    // NOTE(review): the meaning of the trailing (0, 5) constructor arguments is not visible here — confirm
    TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), 0, 5);

    this.rawData = genStringDataSet(dataSize);
    for (String value : this.rawData) {
        trieBuilder.addValue(value);
        forestBuilder.addValue(value);
    }

    this.oldDict = trieBuilder.build(0);
    this.newDict = forestBuilder.build();

    TrieDictionaryForest<String> forest = (TrieDictionaryForest<String>) newDict;
    System.out.println("new dict split tree size : " + forest.getTrees().size());
}
@Test public void dictionaryContainTest() { ArrayList<String> str = new ArrayList<String>(); str.add("part"); str.add("part"); // meant to be dup str.add("par"); str.add("partition"); str.add("party"); str.add("parties"); str.add("paint"); Collections.sort(str, new ByteComparator<String>(new StringBytesConverter())); int baseId = new Random().nextInt(100); TrieDictionaryForestBuilder<String> b = newDictBuilder(str, baseId); TrieDictionaryForest<String> dict = b.build(); str.add("py"); Collections.sort(str, new ByteComparator<String>(new StringBytesConverter())); b = newDictBuilder(str, baseId); baseId = new Random().nextInt(100); TrieDictionaryForest<String> dict2 = b.build(); assertEquals(true, dict2.contains(dict)); assertEquals(false, dict.contains(dict2)); }
/**
 * Reads the "multi_file_empty_file" fixture directory through
 * {@code SortedColumnDFSFile} and checks that the enumerated values equal the
 * byte-order-sorted contents returned by {@code readAllFiles}.
 *
 * @throws Exception if reading the fixture fails
 */
@Test
public void testEmptyFile() throws Exception {
    String dirPath = "src/test/resources/multi_file_empty_file";
    ArrayList<String> correctAnswer = readAllFiles(dirPath);
    Collections.sort(correctAnswer, new ByteComparator<String>(new StringBytesConverter()));
    System.out.println("correct answer:" + correctAnswer);

    SortedColumnDFSFile column = new SortedColumnDFSFile(qualify(dirPath + "/"), DataType.getType("varchar"));
    IDictionaryValueEnumerator e = new TableColumnValueEnumerator(column.getReader(), -1);
    ArrayList<String> output = new ArrayList<>();
    while (e.moveNext()) {
        output.add(new String(e.current()));
    }

    System.out.println(correctAnswer.size());
    // assertEquals reports both sizes on failure, unlike assertTrue(a == b)
    assertEquals(correctAnswer.size(), output.size());
    for (int i = 0; i < correctAnswer.size(); i++) {
        assertEquals(correctAnswer.get(i), output.get(i));
    }
}
/**
 * Reads the "multi_file_str" fixture directory through
 * {@code SortedColumnDFSFile} and checks that the enumerated values equal the
 * byte-order-sorted contents returned by {@code readAllFiles}.
 *
 * @throws Exception if reading the fixture fails
 */
@Test
public void testReadStringMultiFile() throws Exception {
    String dirPath = "src/test/resources/multi_file_str";
    ArrayList<String> correctAnswer = readAllFiles(dirPath);
    Collections.sort(correctAnswer, new ByteComparator<String>(new StringBytesConverter()));

    SortedColumnDFSFile column = new SortedColumnDFSFile(qualify(dirPath + "/"), DataType.getType("varchar"));
    IDictionaryValueEnumerator e = new TableColumnValueEnumerator(column.getReader(), -1);
    ArrayList<String> output = new ArrayList<>();
    while (e.moveNext()) {
        output.add(new String(e.current()));
    }

    System.out.println(correctAnswer.size());
    // assertEquals reports both sizes on failure, unlike assertTrue(a == b)
    assertEquals(correctAnswer.size(), output.size());
    for (int i = 0; i < correctAnswer.size(); i++) {
        assertEquals(correctAnswer.get(i), output.get(i));
    }
}
@Test //one string one tree public void testMultiTree() { ArrayList<String> strs = new ArrayList<String>(); strs.add("part"); strs.add("par"); strs.add("partition"); strs.add("party"); strs.add("parties"); strs.add("paint"); strs.add("一二三"); //Chinese test strs.add("四五六"); strs.add(""); Collections.sort(strs, new ByteComparator<String>(new StringBytesConverter())); int baseId = 5; int maxTreeSize = 0; TrieDictionaryForestBuilder<String> builder = newDictBuilder(strs, baseId, maxTreeSize); TrieDictionaryForest<String> dict = builder.build(); dict.dump(System.out); assertEquals(strs.size(), dict.getTrees().size()); int expectId = baseId; for (String s : strs) { assertEquals(expectId, dict.getIdFromValue(s)); expectId++; } assertSameBehaviorAsTrie(dict, strs, baseId); }
@Test public void emptyDictTest() throws Exception { TrieDictionaryForestBuilder<String> b = new TrieDictionaryForestBuilder<String>(new StringBytesConverter()); TrieDictionaryForest<String> dict = b.build(); try { int id = dict.getIdFromValue("123", 0); fail("id should not exist"); } catch (IllegalArgumentException e) { //right } try { String value = dict.getValueFromIdImpl(123); fail("value should not exist"); } catch (IllegalArgumentException e) { //right } }
/**
 * Builds a small trie dictionary of ten fixed names.
 *
 * @return the dictionary produced by {@code build(0)}
 */
private static Dictionary newDictionaryOfString() {
    String[] names = { "Dong", "George", "Jason", "Kejia", "Luke", "Mahone", "Qianhao", "Shaofeng", "Xu", "Yang" };

    TrieDictionaryBuilder<String> trieBuilder = new TrieDictionaryBuilder<>(new StringBytesConverter());
    for (String name : names) {
        trieBuilder.addValue(name);
    }
    return trieBuilder.build(0);
}
/**
 * Builds a plain trie dictionary from the non-null entries of {@code strs} and
 * asserts that the forest reports the same max id, min id, size, id size and
 * value size.
 *
 * @param dict   forest dictionary under test
 * @param strs   values used to build the reference trie; null entries are skipped
 * @param baseId base id passed to the reference trie's {@code build}
 */
private void assertSameBehaviorAsTrie(TrieDictionaryForest<String> dict, ArrayList<String> strs, int baseId) {
    TrieDictionaryBuilder<String> referenceBuilder = new TrieDictionaryBuilder<>(new StringBytesConverter());
    for (String value : strs) {
        if (value == null) {
            continue;
        }
        referenceBuilder.addValue(value);
    }
    TrieDictionary<String> reference = referenceBuilder.build(baseId);

    assertEquals(reference.getMaxId(), dict.getMaxId());
    assertEquals(reference.getMinId(), dict.getMinId());
    assertEquals(reference.getSize(), dict.getSize());
    assertEquals(reference.getSizeOfId(), dict.getSizeOfId());
    assertEquals(reference.getSizeOfValue(), dict.getSizeOfValue());
}