/**
 * Creates the context generator used by this component.
 *
 * <p>The result is a {@code DefaultLanguageDetectorContextGenerator} constructed with the
 * bounds 1 and 3 and the shared instances of the emoji, URL, Twitter, number and shrink
 * character-sequence normalizers, passed in that order.
 *
 * @return a newly constructed context generator
 */
public LanguageDetectorContextGenerator getContextGenerator() {
  // NOTE(review): presumably the minimum and maximum n-gram length - confirm against
  // the DefaultLanguageDetectorContextGenerator constructor.
  final int minLength = 1;
  final int maxLength = 3;

  return new DefaultLanguageDetectorContextGenerator(
      minLength,
      maxLength,
      EmojiCharSequenceNormalizer.getInstance(),
      UrlCharSequenceNormalizer.getInstance(),
      TwitterCharSequenceNormalizer.getInstance(),
      NumberCharSequenceNormalizer.getInstance(),
      ShrinkCharSequenceNormalizer.getInstance());
}
/**
 * Produces the features for a document: everything the superclass generates, followed by
 * token n-gram features.
 *
 * <p>The document is normalized with this generator's {@code normalizer}, split into tokens
 * with {@code SimpleTokenizer}, and the tokens are fed to an {@code NGramModel} with the
 * bounds 1 and 3. Every non-empty entry of that model is appended to the result as
 * {@code "tg=" + entry.toString()}.
 *
 * @param document the text to extract features from
 * @return the superclass features followed by the {@code "tg="}-prefixed token n-gram features
 */
@Override
public String[] getContext(CharSequence document) {
  // Start from a mutable copy of the superclass features (Arrays.asList alone is fixed-size).
  List<String> context = new ArrayList<String>(Arrays.asList(super.getContext(document)));

  // Keep the parameter untouched; tokenize a normalized copy instead.
  CharSequence normalized = this.normalizer.normalize(document);
  String[] words = SimpleTokenizer.INSTANCE.tokenize(normalized.toString());

  // Token n-grams are only built when the tokenizer produced at least one token.
  if (words.length > 0) {
    NGramModel tokenNgramModel = new NGramModel();
    // NOTE(review): 1 and 3 are presumably the min/max n-gram length in tokens - confirm
    // against NGramModel.add.
    tokenNgramModel.add(new StringList(words), 1, 3);

    Iterator<StringList> tokenNgrams = tokenNgramModel.iterator();
    while (tokenNgrams.hasNext()) {
      StringList tokenList = tokenNgrams.next();
      if (tokenList.size() > 0) {
        context.add("tg=" + tokenList.toString());
      }
    }
  }

  return context.toArray(new String[context.size()]);
}
}
/**
 * Checks the features a {@code DefaultLanguageDetectorContextGenerator(1, 3)} extracts from
 * the 12-character sample {@code "abcde fghijk"}.
 *
 * <p>Exactly 33 features are expected (consistent with 12 + 11 + 10 character n-grams of
 * length one to three), and the result must contain a few sample n-grams, including ones
 * that span the space between the two words.
 */
@Test
public void extractContext() throws Exception {
  LanguageDetectorContextGenerator generator = new DefaultLanguageDetectorContextGenerator(1, 3);

  String[] extracted = generator.getContext("abcde fghijk");
  Collection<String> features = Arrays.asList(extracted);

  Assert.assertEquals(33, features.size());

  // Sample n-grams that must be present, checked in the same order as before.
  String[] expectedSamples = {"ab", "abc", "e f", " fg"};
  for (String sample : expectedSamples) {
    Assert.assertTrue(features.contains(sample));
  }
}
}
/**
 * Creates the context generator used by this component.
 *
 * <p>The result is a {@code DefaultLanguageDetectorContextGenerator} constructed with the
 * bounds 1 and 3 and the shared instances of the emoji, URL, Twitter, number and shrink
 * character-sequence normalizers, passed in that order.
 *
 * @return a newly constructed context generator
 */
public LanguageDetectorContextGenerator getContextGenerator() {
  // NOTE(review): presumably the minimum and maximum n-gram length - confirm against
  // the DefaultLanguageDetectorContextGenerator constructor.
  final int minLength = 1;
  final int maxLength = 3;

  return new DefaultLanguageDetectorContextGenerator(
      minLength,
      maxLength,
      EmojiCharSequenceNormalizer.getInstance(),
      UrlCharSequenceNormalizer.getInstance(),
      TwitterCharSequenceNormalizer.getInstance(),
      NumberCharSequenceNormalizer.getInstance(),
      ShrinkCharSequenceNormalizer.getInstance());
}
/**
 * Creates the context generator used by this component.
 *
 * <p>The result is a {@code DefaultLanguageDetectorContextGenerator} constructed with the
 * bounds 1 and 3 and the shared instances of the emoji, URL, Twitter, number and shrink
 * character-sequence normalizers, passed in that order.
 *
 * @return a newly constructed context generator
 */
public LanguageDetectorContextGenerator getContextGenerator() {
  // NOTE(review): presumably the minimum and maximum n-gram length - confirm against
  // the DefaultLanguageDetectorContextGenerator constructor.
  final int minLength = 1;
  final int maxLength = 3;

  return new DefaultLanguageDetectorContextGenerator(
      minLength,
      maxLength,
      EmojiCharSequenceNormalizer.getInstance(),
      UrlCharSequenceNormalizer.getInstance(),
      TwitterCharSequenceNormalizer.getInstance(),
      NumberCharSequenceNormalizer.getInstance(),
      ShrinkCharSequenceNormalizer.getInstance());
}