/**
 * Creates a language identifier that only inspects a prefix of the input text.
 *
 * @param maxLength the maximum number of characters that will be considered - can help
 *                  with performance. Don't use values below 100, as this would decrease
 *                  accuracy.
 * @throws IllegalArgumentException if {@code maxLength} is less than 10
 * @since 4.2
 */
public LanguageIdentifier(int maxLength) {
  if (maxLength < 10) {
    throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + maxLength);
  }
  this.maxLength = maxLength;
  try {
    List<LanguageProfile> profiles = loadProfiles(getLanguageCodes());
    // Detector tuned with a confidence floor and a short-text cutoff (see the constants).
    languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
        .minimalConfidence(MINIMAL_CONFIDENCE)
        .shortTextAlgorithm(SHORT_ALGO_THRESHOLD)
        .withProfiles(profiles)
        .build();
    // Strip URLs, minority scripts, and e-mail signatures before detection.
    textObjectFactory = new TextObjectFactoryBuilder()
        .maxTextLength(10000)
        .withTextFilter(UrlTextFilter.getInstance())
        .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3))
        .withTextFilter(new RemoveEMailSignatureFilter())
        .build();
  } catch (IOException e) {
    // UncheckedIOException extends RuntimeException, so existing callers are unaffected,
    // and the I/O nature of the failure stays visible; the cause is preserved.
    throw new java.io.UncheckedIOException("Could not set up language identifier", e);
  }
}
private static com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles, Map<String, Float> languageProbabilities) { // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which // means you can often get 0 probabilities. So we pick a very short length for this limit. LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()) .shortTextAlgorithm(30) .withProfiles(languageProfiles); if (languageProbabilities != null) { Map<LdLocale, Double> languageWeights = new HashMap<>(languageProbabilities.size()); for (String language : languageProbabilities.keySet()) { Double priority = (double)languageProbabilities.get(language); languageWeights.put(LdLocale.fromString(language), priority); } builder.languagePriorities(languageWeights); } return builder.build(); }
/** Reads every language profile found under {@code path} and rebuilds the shared detector state. */
public static void loadModels(Path path) throws IOException {
    List<LanguageProfile> profiles = new LanguageProfileReader().readAll(path.toFile());
    languageProfiles = profiles;
    detector = LanguageDetectorBuilder
            .create(NgramExtractors.standard())
            .withProfiles(profiles)
            .build();
    textObjectFactory = buildTextObjectFactory();
}
/** Loads the library's built-in language profiles and rebuilds the shared detector state. */
public static void loadBuiltInModels() throws IOException {
    languageProfiles = new LanguageProfileReader().readAllBuiltIn();
    LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .withProfiles(languageProfiles);
    detector = builder.build();
    textObjectFactory = buildTextObjectFactory();
}
private com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles) { // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which // means you can often get 0 probabilities. So we pick a very short length for this limit. LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()) .shortTextAlgorithm(30) .withProfiles(languageProfiles); if (languageProbabilities != null) { Map<LdLocale, Double> languageWeights = new HashMap<>(languageProbabilities.size()); for (String language : languageProbabilities.keySet()) { Double priority = (double)languageProbabilities.get(language); languageWeights.put(LdLocale.fromString(language), priority); } builder.languagePriorities(languageWeights); } return builder.build(); }
private static com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles, Map<String, Float> languageProbabilities) { // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which // means you can often get 0 probabilities. So we pick a very short length for this limit. LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()) .shortTextAlgorithm(30) .withProfiles(languageProfiles); if (languageProbabilities != null) { Map<LdLocale, Double> languageWeights = new HashMap<>(languageProbabilities.size()); for (String language : languageProbabilities.keySet()) { Double priority = (double)languageProbabilities.get(language); languageWeights.put(LdLocale.fromString(language), priority); } builder.languagePriorities(languageWeights); } return builder.build(); }
/**
 * Lazily builds the shared detector over the profiles of all {@link Language} values.
 * On a read failure the error is logged and the detector stays {@code null}
 * (best-effort behavior kept intact); callers may therefore receive {@code null}.
 */
private static LanguageDetector getLanguageDetector() {
    if (languageDetector != null) {
        return languageDetector;
    }
    try {
        List<String> codes = new ArrayList<>();
        for (Language lg : Language.values()) {
            codes.add(lg.getAbrev().toLowerCase());
        }
        List<LanguageProfile> profiles = new LanguageProfileReader().read(codes);
        languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(profiles)
                .build();
    } catch (IOException e) {
        LOG.error("Translator.getLanguageDetector", e);
    }
    return languageDetector;
}
// Detector built once over this instance's already-loaded language profiles,
// using the library's standard n-gram extraction.
final LanguageDetector languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
        .withProfiles(this.languageProfiles)
        .build();
/**
 * Detects the most probable language of {@code text}, lazily building the detector from
 * the built-in profiles on first use.
 *
 * @return the ISO language code of the top candidate, or "N/A" when nothing was detected
 * @throws IOException if the built-in profiles cannot be read
 */
public static String detectLanguage(String text) throws IOException {
    if (languageDetector == null) {
        List<LanguageProfile> builtIn = new LanguageProfileReader().readAllBuiltIn();
        languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .shortTextAlgorithm(0)
                .withProfiles(builtIn)
                .build();
    }
    List<DetectedLanguage> candidates = languageDetector.getProbabilities(text);
    if (candidates.isEmpty()) {
        return "N/A";
    }
    return candidates.get(0).getLocale().getLanguage();
}
// Standard-n-gram detector over the profiles held by this instance; built eagerly
// at field-initialization time.
final LanguageDetector languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
        .withProfiles(this.languageProfiles)
        .build();
static private void initOptimaize() { synchronized (initGuard) { if ((textObjectFactory != null) && (languageDetector != null)) return; // origin: https://github.com/optimaize/language-detector // load all languages: List<LanguageProfile> languageProfiles; try { languageProfiles = new LanguageProfileReader().readAllBuiltIn(); } catch (IOException e) { throw new UncheckedIOException(e); } //build language detector: languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .withProfiles(languageProfiles) .build(); //create a text object factory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); } }
/**
 * Using all language profiles from the given directory.
 *
 * @return a detector configured from the "alpha", "directory" and "seed" parameters
 * @throws IOException if the profile directory cannot be read
 */
private LanguageDetector makeDetector() throws IOException {
    String profileDirectory = requireParamString("directory") + "/";
    double alpha = getParamDouble("alpha", DEFAULT_ALPHA);
    Optional<Long> seed = Optional.fromNullable(getParamLongOrNull("seed"));
    List<LanguageProfile> profiles = new LanguageProfileReader().readAll(new File(profileDirectory));
    return LanguageDetectorBuilder.create(NgramExtractors.standard())
            .alpha(alpha)
            .seed(seed)
            .shortTextAlgorithm(50)
            .withProfiles(profiles)
            .build();
}
/**
 * Using all language profiles from the given directory.
 *
 * @return a detector built over every profile in the configured directory
 * @throws IOException if the profiles cannot be read
 */
private LanguageDetector makeDetector() throws IOException {
    double smoothingAlpha = getParamDouble("alpha", DEFAULT_ALPHA);
    String dir = requireParamString("directory") + "/";
    Optional<Long> randomSeed = Optional.fromNullable(getParamLongOrNull("seed"));
    List<LanguageProfile> allProfiles = new LanguageProfileReader().readAll(new File(dir));
    LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .alpha(smoothingAlpha)
            .seed(randomSeed)
            .shortTextAlgorithm(50)
            .withProfiles(allProfiles);
    return builder.build();
}
/**
 * Create a new DocumentFactory instance. Use this only if you need multiple instances!
 * Otherwise, getInstance() will return a singleton object that you can use.
 */
public DocumentFactory() {
    sentenceSplitter = new TreeMap<>();
    plainTokenizer = new TreeMap<>();
    newlineTokenizer = new TreeMap<>();
    // OpenNLP models for English and German sentence splitting and tokenization.
    loadSentenceSplitter(LANG_EN, Resource.fromJAR("openNLP/en-sent.bin"));
    loadTokenizer(LANG_EN, Resource.fromJAR("openNLP/en-token.bin"));
    loadSentenceSplitter(LANG_DE, Resource.fromJAR("openNLP/de-sent.bin"));
    loadTokenizer(LANG_DE, Resource.fromJAR("openNLP/de-token.bin"));
    try {
        //load all languages:
        List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
        //build language detector:
        languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(languageProfiles)
                .build();
        //create a text object factory
        textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
    } catch (IOException ex) {
        // Log the exception itself, not just a message — the stack trace was being lost.
        log.error("Could not load language profiles", ex);
    }
}
/**
 * Lazily builds and returns the shared detector over all built-in language profiles,
 * using double-checked locking on {@code detectorInstance}.
 *
 * NOTE(review): double-checked locking is only safe if {@code detectorInstance} is declared
 * {@code volatile}; the field declaration is not visible here — confirm it is.
 */
public static LanguageDetector getLanguageDetector() {
    if (detectorInstance == null) {
        synchronized (OptimaizeLanguageFilter.class) {
            if (detectorInstance == null) {
                try {
                    // shortTextAlgorithm(0) disables the short-text code path entirely.
                    detectorInstance = LanguageDetectorBuilder.create(NgramExtractors.standard())
                            .shortTextAlgorithm(0)
                            .withProfiles(new LanguageProfileReader().readAllBuiltIn())
                            .build();
                } catch (IOException e) {
                    throw new RuntimeIOException(e);
                }
            }
        }
    }
    return detectorInstance;
}
public void initialize() { if (initialized) { return; } LOG.info("Initializing Language Detector ..."); try { List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn(); //build language detector: languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .withProfiles(languageProfiles) .build(); //create a text object factory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); initialized = true; } catch (IOException ex) { initialized = false; LOG.error("Error while initializing Language Detector", ex); } }
/** * @param maxLength the maximum number of characters that will be considered - can help * with performance. Don't use values below 100, as this would decrease * accuracy. * @throws IllegalArgumentException if {@code maxLength} is less than 10 * @since 4.2 */ public LanguageIdentifier(int maxLength) { if (maxLength < 10) { throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + maxLength); } this.maxLength = maxLength; try { List<LanguageProfile> profiles = loadProfiles(getLanguageCodes()); languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .minimalConfidence(MINIMAL_CONFIDENCE) .shortTextAlgorithm(SHORT_ALGO_THRESHOLD) .withProfiles(profiles) .build(); textObjectFactory = new TextObjectFactoryBuilder() .maxTextLength(10000) .withTextFilter(UrlTextFilter.getInstance()) .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3)) .withTextFilter(new RemoveEMailSignatureFilter()) .build(); } catch (IOException e) { throw new RuntimeException("Could not set up language identifier", e); } }