/**
 * Builds a {@link RegexNameFinder} that relies solely on the selected built-in regexes.
 *
 * @param defaults the OpenNLP default regexes to enable; must not be {@code null}
 * @return a {@link RegexNameFinder} constructed from the result of
 *         {@code defaultsToMap(defaults)}
 * @throws NullPointerException if {@code defaults} is {@code null}
 */
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
    DEFAULT_REGEX_NAME_FINDER... defaults) {
    // requireNonNull hands back its argument, so the check and the conversion chain together.
    final Map<String, Pattern[]> regexesByType =
        defaultsToMap(Objects.requireNonNull(defaults, "defaults must not be null"));
    return new RegexNameFinder(regexesByType);
}
/**
 * Finds matches directly in raw text, so the caller does not need to tokenize first.
 * The returned spans carry character indices rather than word (token) indices.
 *
 * @param text the untokenized text to search
 * @return the spans produced by {@code getAnnotations(text)}
 */
public Span[] find(String text) {
    final Span[] characterSpans = getAnnotations(text);
    return characterSpans;
}
@Test public void testFindMatchingPatternWithoutMatchingTokenBounds() { Pattern testPattern = Pattern.compile("[0-8] year"); // does match "0 year" String[] sentence = new String[]{"a", "80", "year", "c"}; Pattern[] patterns = new Pattern[]{testPattern}; Map<String, Pattern[]> regexMap = new HashMap<>(); String type = "testtype"; regexMap.put(type, patterns); RegexNameFinder finder = new RegexNameFinder(regexMap); Span[] result = finder.find(sentence); Assert.assertTrue(result.length == 0); } }
/**
 * Expects the finder to report a "URL" span covering token 13 of the shared sample text.
 */
@Test
public void testURL() throws Exception {
    final String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
    final List<Span> foundSpans = Arrays.asList(regexNameFinder.find(tokens));

    final Span expectedUrl = new Span(13, 14, "URL");

    Assert.assertTrue(foundSpans.contains(expectedUrl));
    Assert.assertEquals("https://www.google.com", tokens[expectedUrl.getStart()]);
}
/**
 * Expects the finder to report a "PHONE_NUM" span covering token 9 of the shared sample text.
 */
@Test
public void testPhoneNumber() throws Exception {
    final String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
    final List<Span> foundSpans = Arrays.asList(regexNameFinder.find(tokens));

    final Span expectedPhone = new Span(9, 10, "PHONE_NUM");

    Assert.assertTrue(foundSpans.contains(expectedPhone));
    Assert.assertEquals("123-234-5678", tokens[expectedPhone.getStart()]);
}
/**
 * Builds a {@link RegexNameFinder} from selected built-in regexes combined with
 * regexes supplied by external configuration.
 *
 * @param config   a map whose key is a type and whose value is a {@code Pattern[]};
 *                 when a key clashes with a default key, the config entry wins
 * @param defaults the OpenNLP default regexes to include; may be {@code null},
 *                 in which case only {@code config} is used
 * @return a {@link RegexNameFinder} backed by the merged map
 * @throws NullPointerException if {@code config} is {@code null}
 */
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
    Map<String, Pattern[]> config, DEFAULT_REGEX_NAME_FINDER... defaults) {
    Objects.requireNonNull(config, "config must not be null");

    // Begin with the selected defaults (or an empty map), then overlay the configuration
    // so that config entries replace defaults sharing the same key.
    final Map<String, Pattern[]> merged =
        (defaults == null) ? new HashMap<String, Pattern[]>() : defaultsToMap(defaults);
    merged.putAll(config);

    return new RegexNameFinder(merged);
}
/**
 * Checks that a pattern equal to one token yields exactly one span covering that token.
 */
@Test
public void testFindSingleTokenPattern() {
    Pattern testPattern = Pattern.compile("test");
    String[] sentence = new String[]{"a", "test", "b", "c"};

    Pattern[] patterns = new Pattern[]{testPattern};
    Map<String, Pattern[]> regexMap = new HashMap<>();
    String type = "testtype";
    regexMap.put(type, patterns);

    RegexNameFinder finder = new RegexNameFinder(regexMap);
    Span[] result = finder.find(sentence);

    // assertEquals reports expected vs. actual on failure, unlike assertTrue(a == b).
    Assert.assertEquals(1, result.length);
    Assert.assertEquals(1, result[0].getStart());
    Assert.assertEquals(2, result[0].getEnd());
}
/**
 * Searches untokenized text, removing the need for a tokenization step.
 * The resulting spans use character indices instead of word (token) indices.
 *
 * @param text the raw text to search
 * @return the spans returned by {@code getAnnotations(text)}
 */
public Span[] find(String text) {
    final Span[] annotations = getAnnotations(text);
    return annotations;
}
/**
 * Expects the finder to report an "EMAIL" span covering token 3 of the shared sample text.
 */
@Test
public void testEmail() throws Exception {
    final String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
    final List<Span> foundSpans = Arrays.asList(regexNameFinder.find(tokens));

    Assert.assertTrue(foundSpans.contains(new Span(3, 4, "EMAIL")));

    final Span expectedEmail = new Span(3, 4, "EMAIL");
    Assert.assertEquals("opennlp@gmail.com", tokens[expectedEmail.getStart()]);
}
/**
 * Creates a {@link RegexNameFinder} that uses only the specified default regexes.
 *
 * @param defaults the OpenNLP default regexes to enable; must not be {@code null}
 * @return a {@link RegexNameFinder} built from {@code defaultsToMap(defaults)}
 * @throws NullPointerException if {@code defaults} is {@code null}
 */
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
    DEFAULT_REGEX_NAME_FINDER... defaults) {
    // The null check returns its argument, which feeds straight into the conversion.
    final Map<String, Pattern[]> selectedRegexes =
        defaultsToMap(Objects.requireNonNull(defaults, "defaults must not be null"));
    return new RegexNameFinder(selectedRegexes);
}
/**
 * Checks that a pattern spanning two adjacent tokens ("80 year") yields a single
 * span over tokens [1, 3) carrying the configured type.
 */
@Test
public void testFindTokenizdPattern() {
    Pattern testPattern = Pattern.compile("[0-9]+ year");
    String[] sentence = new String[]{"a", "80", "year", "b", "c"};

    Pattern[] patterns = new Pattern[]{testPattern};
    Map<String, Pattern[]> regexMap = new HashMap<>();
    String type = "match";
    regexMap.put(type, patterns);

    RegexNameFinder finder = new RegexNameFinder(regexMap);
    Span[] result = finder.find(sentence);

    // assertEquals reports expected vs. actual on failure, unlike assertTrue(a == b).
    Assert.assertEquals(1, result.length);
    Assert.assertEquals(1, result[0].getStart());
    Assert.assertEquals(3, result[0].getEnd());
    Assert.assertEquals("match", result[0].getType());
}
/**
 * Runs the finder over plain text without requiring prior tokenization.
 * Spans in the result are expressed in character indices, not word (token) indices.
 *
 * @param text the text to search, not tokenized
 * @return whatever {@code getAnnotations(text)} returns
 */
public Span[] find(String text) {
    final Span[] spans = getAnnotations(text);
    return spans;
}
/**
 * Expects two "MGRS" spans in the shared sample text: one over token 18 and one
 * over tokens [20, 24).
 */
@Test
public void testMgrs() throws Exception {
    final String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
    final List<Span> foundSpans = Arrays.asList(regexNameFinder.find(tokens));

    final Span compactMgrs = new Span(18, 19, "MGRS");
    final Span spacedMgrs = new Span(20, 24, "MGRS");

    Assert.assertTrue(foundSpans.contains(compactMgrs));
    Assert.assertTrue(foundSpans.contains(spacedMgrs));

    Assert.assertEquals("11SKU528111".toLowerCase(), tokens[compactMgrs.getStart()]);
    Assert.assertEquals("11S", tokens[spacedMgrs.getStart()]);
}
}
/**
 * Returns a {@link RegexNameFinder} restricted to the given default regexes.
 *
 * @param defaults the OpenNLP default regexes to enable; must not be {@code null}
 * @return a {@link RegexNameFinder} wrapping the map from {@code defaultsToMap(defaults)}
 * @throws NullPointerException if {@code defaults} is {@code null}
 */
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
    DEFAULT_REGEX_NAME_FINDER... defaults) {
    // Validate and convert in one expression; requireNonNull returns the checked array.
    final Map<String, Pattern[]> defaultRegexes =
        defaultsToMap(Objects.requireNonNull(defaults, "defaults must not be null"));
    return new RegexNameFinder(defaultRegexes);
}
/**
 * Expects two "DEGREES_MIN_SEC_LAT_LON" spans in the shared sample text: one over
 * tokens [22, 24) and one over tokens [35, 41).
 */
@Test
public void testLatLong() throws Exception {
    final String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
    final List<Span> foundSpans = Arrays.asList(regexNameFinder.find(tokens));

    final Span firstLatLong = new Span(22, 24, "DEGREES_MIN_SEC_LAT_LON");
    final Span secondLatLong = new Span(35, 41, "DEGREES_MIN_SEC_LAT_LON");

    Assert.assertTrue(foundSpans.contains(firstLatLong));
    Assert.assertTrue(foundSpans.contains(secondLatLong));

    Assert.assertEquals("528", tokens[firstLatLong.getStart()]);
    Assert.assertEquals("45", tokens[secondLatLong.getStart()]);
}
/**
 * Combines selected default regexes with externally configured regexes and wraps
 * the result in a {@link RegexNameFinder}.
 *
 * @param config   a map from type to {@code Pattern[]}; entries here override
 *                 defaults that use the same key
 * @param defaults the OpenNLP default regexes to include; a {@code null} value
 *                 means no defaults are added
 * @return a {@link RegexNameFinder} built from the combined map
 * @throws NullPointerException if {@code config} is {@code null}
 */
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
    Map<String, Pattern[]> config, DEFAULT_REGEX_NAME_FINDER... defaults) {
    Objects.requireNonNull(config, "config must not be null");

    final Map<String, Pattern[]> combined;
    if (defaults == null) {
        combined = new HashMap<>();
    } else {
        combined = defaultsToMap(defaults);
    }
    // putAll runs last so that configuration entries win over defaults on key clashes.
    combined.putAll(config);

    return new RegexNameFinder(combined);
}
/**
 * Creates a {@link RegexNameFinder} that uses both the chosen default regexes and
 * regexes taken from external configuration.
 *
 * @param config   type-to-{@code Pattern[]} map; on a key clash with a default,
 *                 the config value is the one kept
 * @param defaults the OpenNLP default regexes to include; may be {@code null}
 * @return a {@link RegexNameFinder} over the merged regex map
 * @throws NullPointerException if {@code config} is {@code null}
 */
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
    Map<String, Pattern[]> config, DEFAULT_REGEX_NAME_FINDER... defaults) {
    Objects.requireNonNull(config, "config must not be null");

    // Defaults (when given) form the base layer; config is applied on top of them.
    final Map<String, Pattern[]> regexesByType =
        (defaults != null) ? defaultsToMap(defaults) : new HashMap<String, Pattern[]>();
    regexesByType.putAll(config);

    return new RegexNameFinder(regexesByType);
}