/** * @param maxLength the maximum number of characters that will be considered - can help * with performance. Don't use values below 100, as this would decrease * accuracy. * @throws IllegalArgumentException if {@code maxLength} is less than 10 * @since 4.2 */ public LanguageIdentifier(int maxLength) { if (maxLength < 10) { throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + maxLength); } this.maxLength = maxLength; try { List<LanguageProfile> profiles = loadProfiles(getLanguageCodes()); languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .minimalConfidence(MINIMAL_CONFIDENCE) .shortTextAlgorithm(SHORT_ALGO_THRESHOLD) .withProfiles(profiles) .build(); textObjectFactory = new TextObjectFactoryBuilder() .maxTextLength(10000) .withTextFilter(UrlTextFilter.getInstance()) .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3)) .withTextFilter(new RemoveEMailSignatureFilter()) .build(); } catch (IOException e) { throw new RuntimeException("Could not set up language identifier", e); } }
/**
 * Builds a {@link TextObjectFactory} whose filter chain consists of
 * {@code TikasUrlTextFilter} followed by a minority-script filter with threshold 0.3,
 * limited to {@code MAX_TEXT_LENGTH}.
 *
 * @return the newly created factory
 */
private static TextObjectFactory buildTextObjectFactory() {
  List<TextFilter> filterChain = new ArrayList<>();
  filterChain.add(TikasUrlTextFilter.getInstance());
  filterChain.add(RemoveMinorityScriptsTextFilter.forThreshold(0.3));

  MultiTextFilter combinedFilter = new MultiTextFilter(filterChain);
  return new TextObjectFactory(combinedFilter, MAX_TEXT_LENGTH);
}
/**
 * Wraps the given string in a text object via the shared factory and hands it to the detector.
 *
 * @param s the text to examine
 * @return whatever the detector reports for the text
 */
public static Optional<LdLocale> detect(String s) {
  TextObject textObject = textObjectFactory.forText(s);
  return detector.detect(textObject);
}
/**
 * Creates a factory from a builder on which no options are set (no text filters registered,
 * no maximum text length configured here).
 *
 * @return the newly built factory
 */
public static TextObjectFactory forIndexingCleanText() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
  return builder.build();
}
/**
 * Creates the factory from this builder's current settings: the collected text filters
 * are wrapped in a single {@link MultiTextFilter} and passed along with the maximum text length.
 *
 * @return a new {@link TextObjectFactory}
 */
public TextObjectFactory build() {
  MultiTextFilter combinedFilter = new MultiTextFilter(textFilters);
  return new TextObjectFactory(combinedFilter, maxTextLength);
}
/**
 * Creates a filter that removes the text of any script whose share of the content,
 * relative to the most used script, is below the given fraction.
 *
 * <p>Example: Latin 10%, Cyrillic 80%, Common 10% (punctuation and the like).
 * The 10 is put in relation to the 80.
 *
 * @param threshold 0-1, suggested value is 0.3. Scripts below the threshold are removed;
 *        scripts exactly at the threshold remain.
 * @return a new filter instance for the given threshold
 */
public static RemoveMinorityScriptsTextFilter forThreshold(double threshold) {
  RemoveMinorityScriptsTextFilter filter = new RemoveMinorityScriptsTextFilter(threshold);
  return filter;
}
/**
 * Creates a fresh {@link TextObject} configured with this factory's text filter
 * and maximum text length.
 *
 * @return the new text object
 */
public TextObject create() {
  TextObject created = new TextObject(textFilter, maxTextLength);
  return created;
}
/**
 * Creates a factory with a maximum text length of 10000 and two text filters,
 * registered in this order: {@code UrlTextFilter}, then a minority-script filter
 * with threshold 0.3.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forDetectingOnLargeText() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder()
      .maxTextLength(10000)
      .withTextFilter(UrlTextFilter.getInstance())
      .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return builder.build();
}
/**
 * Wraps the given string in a text object via the shared factory and asks the detector
 * for its list of detected languages.
 *
 * @param s the text to examine
 * @return the list returned by the detector for the text
 */
public static List<DetectedLanguage> getProbabilities(String s) {
  return detector.getProbabilities(textObjectFactory.forText(s));
}
/**
 * Creates a factory from a builder on which no options are set (no text filters registered,
 * no maximum text length configured here).
 *
 * @return the newly built factory
 */
public static TextObjectFactory forDetectingShortCleanText() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
  return builder.build();
}
/**
 * Creates the factory from this builder's current settings: the collected text filters
 * are wrapped in a single {@link MultiTextFilter} and passed along with the maximum text length.
 *
 * @return a new {@link TextObjectFactory}
 */
public TextObjectFactory build() {
  MultiTextFilter combinedFilter = new MultiTextFilter(textFilters);
  return new TextObjectFactory(combinedFilter, maxTextLength);
}
/**
 * Creates a filter that removes the text of any script whose share of the content,
 * relative to the most used script, is below the given fraction.
 *
 * <p>Example: Latin 10%, Cyrillic 80%, Common 10% (punctuation and the like).
 * The 10 is put in relation to the 80.
 *
 * @param threshold 0-1, suggested value is 0.3. Scripts below the threshold are removed;
 *        scripts exactly at the threshold remain.
 * @return a new filter instance for the given threshold
 */
public static RemoveMinorityScriptsTextFilter forThreshold(double threshold) {
  RemoveMinorityScriptsTextFilter filter = new RemoveMinorityScriptsTextFilter(threshold);
  return filter;
}
/**
 * Creates a factory with two text filters, registered in this order:
 * {@code UrlTextFilter}, then a minority-script filter with threshold 0.3.
 * No maximum text length is configured here.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forIndexing() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder()
      .withTextFilter(UrlTextFilter.getInstance())
      .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return builder.build();
}
/**
 * Creates a factory from a builder on which no options are set (no text filters registered,
 * no maximum text length configured here).
 *
 * @return the newly built factory
 */
public static TextObjectFactory forIndexingCleanText() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
  return builder.build();
}
/**
 * Creates a factory with two text filters, registered in this order:
 * {@code UrlTextFilter}, then a minority-script filter with threshold 0.3.
 * No maximum text length is configured here.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forIndexing() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder()
      .withTextFilter(UrlTextFilter.getInstance())
      .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return builder.build();
}
/**
 * Creates a factory from a builder on which no options are set (no text filters registered,
 * no maximum text length configured here).
 *
 * @return the newly built factory
 */
public static TextObjectFactory forDetectingShortCleanText() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
  return builder.build();
}
/**
 * Creates a factory with a maximum text length of 10000 and two text filters,
 * registered in this order: {@code UrlTextFilter}, then a minority-script filter
 * with threshold 0.3.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forDetectingOnLargeText() {
  TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder()
      .maxTextLength(10000)
      .withTextFilter(UrlTextFilter.getInstance())
      .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return builder.build();
}