@Override public ITokenizer getTokenizer(LanguageCode language) { switch (language) { case CHINESE_SIMPLIFIED: return ChineseTokenizerFactory.createTokenizer(); /* * We use our own analyzer for Arabic. Lucene's version has special * support for Nonspacing-Mark characters (see * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we * have them included as letters in the parser. */ case ARABIC: // Intentional fall-through. default: return new ExtendedWhitespaceTokenizer(); } }