public NameSample read() throws IOException { String token = samples.read(); boolean isClearAdaptiveData = false; // An empty line indicates the begin of a new article // for which the adaptive data in the feature generators // must be cleared while (token != null && token.trim().length() == 0) { isClearAdaptiveData = true; token = samples.read(); } if (token != null) { return NameSample.parse(token, isClearAdaptiveData); } else { return null; } } }
/**
 * Verifies that parsing is rejected when the name type contains a
 * {@code >} character.
 *
 * @throws Exception the test passes only if the parse call throws an {@link IOException}
 */
@Test(expected = IOException.class)
public void testTypeWithInvalidChar2() throws Exception {
  String taggedTokens = "<START:abc>a> token <END>";
  NameSample.parse(taggedTokens, false);
}
/**
 * Verifies that parsing is rejected when the name type contains a space.
 *
 * @throws Exception the test passes only if the parse call throws an {@link IOException}
 */
@Test(expected = IOException.class)
public void testTypeWithSpace() throws Exception {
  String taggedTokens = "<START:abc a> token <END>";
  NameSample.parse(taggedTokens, false);
}
/**
 * Verifies that parsing is rejected when a name span is opened inside
 * another name span that has not been closed yet.
 *
 * @throws Exception the test passes only if the parse call throws an {@link IOException}
 */
@Test(expected = IOException.class)
public void testNestedNameSpans() throws Exception {
  String taggedTokens = "<START:Person> <START:Location> Kennedy <END> City <END>";
  NameSample.parse(taggedTokens, false);
}
/**
 * Verifies that parsing is rejected when the name type contains a line
 * break.
 *
 * @throws Exception the test passes only if the parse call throws an {@link IOException}
 */
@Test(expected = IOException.class)
public void testTypeWithNewLine() throws Exception {
  String taggedTokens = "<START:abc\na> token <END>";
  NameSample.parse(taggedTokens, false);
}
/**
 * Verifies that parsing is rejected when the name type contains a
 * {@code :} character.
 *
 * @throws Exception the test passes only if the parse call throws an {@link IOException}
 */
@Test(expected = IOException.class)
public void testTypeWithInvalidChar1() throws Exception {
  String taggedTokens = "<START:abc:a> token <END>";
  NameSample.parse(taggedTokens, false);
}
/**
 * Verifies that parsing is rejected when the start tag carries a type
 * separator but the type itself is empty.
 *
 * @throws Exception the test passes only if the parse call throws an {@link IOException}
 */
@Test(expected = IOException.class)
public void testMissingType() throws Exception {
  String taggedTokens = "<START:> token <END>";
  NameSample.parse(taggedTokens, false);
}
public static NameSample parse(String taggedTokens, String defaultType, boolean isClearAdaptiveData) throws IOException { // TODO: Should throw another exception, and then convert it into an IOException in the stream String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens); List<String> tokenList = new ArrayList<>(parts.length); List<Span> nameList = new ArrayList<>(); String nameType = defaultType; int startIndex = -1; int wordIndex = 0; // we check if at least one name has the a type. If no one has, we will // leave the NameType property of NameSample null. boolean catchingName = false; for (int pi = 0; pi < parts.length; pi++) { Matcher startMatcher = START_TAG_PATTERN.matcher(parts[pi]); if (startMatcher.matches()) { if (catchingName) { throw new IOException("Found unexpected annotation" + " while handling a name sequence: " + errorTokenWithContext(parts, pi)); } catchingName = true; startIndex = wordIndex; String nameTypeFromSample = startMatcher.group(2); if (nameTypeFromSample != null) { if (nameTypeFromSample.length() == 0) { throw new IOException("Missing a name type: " + errorTokenWithContext(parts, pi));
// Parse the tagged line into a NameSample and append it to `document`.
// The second argument (false) is the clear-adaptive-data flag, so no reset is requested here.
// NOTE(review): `document` is presumably the list of samples of the current document — confirm against the enclosing code.
document.add(NameSample.parse(line, false));
/**
 * Tests that an additional space between two tokens is treated like a
 * single space, i.e. it does not produce an extra (empty) token.
 *
 * <p>The input holds eight non-tag tokens: {@code M . K . Schwitters ?
 * Heartfield ?}. The separator after the first {@code ?} is two spaces wide.
 *
 * @throws Exception if the sample cannot be parsed
 */
@Test
public void testParseWithAdditionalSpace() throws Exception {
  // The doubled space is the scenario under test. It is concatenated as its
  // own literal so that whitespace normalization of this source file cannot
  // silently collapse it into a single space.
  String line = "<START> M . K . <END> <START> Schwitters <END> ?"
      + "  "
      + "<START> Heartfield <END> ?";

  NameSample test = NameSample.parse(line, false);

  Assert.assertEquals(8, test.getSentence().length);
}
/**
 * Round-trip check for a NameSample that carries name types: the string
 * representation of the simple typed sample must match the expected tagged
 * text, and parsing that tagged text must yield a sample equal to the
 * simple typed sample.
 */
@Test
public void testWithTypesToString() throws Exception {
  String expectedText = "<START:Location> U . S . <END> President <START:Person>"
      + " Barack Obama <END> "
      + "is considering sending additional American forces to <START:Location> Afghanistan <END> .";
  String taggedInput = "<START:Location> U . S . <END> "
      + "President <START:Person> Barack Obama <END> is considering sending "
      + "additional American forces to <START:Location> Afghanistan <END> .";

  // Sample -> string.
  Assert.assertEquals(expectedText, createSimpleNameSample(true).toString());

  // String -> sample.
  NameSample reparsed = NameSample.parse(taggedInput, false);
  Assert.assertEquals(createSimpleNameSample(true), reparsed);
}
/**
 * Verifies that name types containing special characters such as
 * {@code -}, {@code _}, {@code /}, {@code ;}, {@code .}, {@code ,},
 * {@code &}, {@code %} and {@code $} are accepted and preserved verbatim.
 */
@Test
public void testTypeWithSpecialChars() throws Exception {
  String taggedTokens = "<START:type-1> U . S . <END> "
      + "President <START:type_2> Barack Obama <END> is considering sending "
      + "additional American forces to <START:type_3-/;.,&%$> Afghanistan <END> .";
  String[] expectedTypes = {"type-1", "type_2", "type_3-/;.,&%$"};

  NameSample parsedSample = NameSample.parse(taggedTokens, false);

  Assert.assertEquals(expectedTypes.length, parsedSample.getNames().length);
  for (int i = 0; i < expectedTypes.length; i++) {
    Assert.assertEquals(expectedTypes[i], parsedSample.getNames()[i].getType());
  }
}
public NameSample read() throws IOException { String token = samples.read(); boolean isClearAdaptiveData = false; // An empty line indicates the begin of a new article // for which the adaptive data in the feature generators // must be cleared while (token != null && token.trim().length() == 0) { isClearAdaptiveData = true; token = samples.read(); } if (token != null) { return NameSample.parse(token, isClearAdaptiveData); } else { return null; } } }
public NameSample read() throws IOException { String token = samples.read(); boolean isClearAdaptiveData = false; // An empty line indicates the begin of a new article // for which the adaptive data in the feature generators // must be cleared while (token != null && token.trim().length() == 0) { isClearAdaptiveData = true; token = samples.read(); } if (token != null) { return NameSample.parse(token, isClearAdaptiveData); } else { return null; } } }
public static NameSample parse(String taggedTokens, String defaultType, boolean isClearAdaptiveData) throws IOException { // TODO: Should throw another exception, and then convert it into an IOException in the stream String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens); List<String> tokenList = new ArrayList<>(parts.length); List<Span> nameList = new ArrayList<>(); String nameType = defaultType; int startIndex = -1; int wordIndex = 0; // we check if at least one name has the a type. If no one has, we will // leave the NameType property of NameSample null. boolean catchingName = false; for (int pi = 0; pi < parts.length; pi++) { Matcher startMatcher = START_TAG_PATTERN.matcher(parts[pi]); if (startMatcher.matches()) { if (catchingName) { throw new IOException("Found unexpected annotation" + " while handling a name sequence: " + errorTokenWithContext(parts, pi)); } catchingName = true; startIndex = wordIndex; String nameTypeFromSample = startMatcher.group(2); if (nameTypeFromSample != null) { if (nameTypeFromSample.length() == 0) { throw new IOException("Missing a name type: " + errorTokenWithContext(parts, pi));
public static NameSample parse(String taggedTokens, String defaultType, boolean isClearAdaptiveData) throws IOException { // TODO: Should throw another exception, and then convert it into an IOException in the stream String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens); List<String> tokenList = new ArrayList<>(parts.length); List<Span> nameList = new ArrayList<>(); String nameType = defaultType; int startIndex = -1; int wordIndex = 0; // we check if at least one name has the a type. If no one has, we will // leave the NameType property of NameSample null. boolean catchingName = false; for (int pi = 0; pi < parts.length; pi++) { Matcher startMatcher = START_TAG_PATTERN.matcher(parts[pi]); if (startMatcher.matches()) { if (catchingName) { throw new IOException("Found unexpected annotation" + " while handling a name sequence: " + errorTokenWithContext(parts, pi)); } catchingName = true; startIndex = wordIndex; String nameTypeFromSample = startMatcher.group(2); if (nameTypeFromSample != null) { if (nameTypeFromSample.length() == 0) { throw new IOException("Missing a name type: " + errorTokenWithContext(parts, pi));
// Parse the tagged line into a NameSample and append it to `document`.
// The second argument (false) is the clear-adaptive-data flag, so no reset is requested here.
// NOTE(review): `document` is presumably the list of samples of the current document — confirm against the enclosing code.
document.add(NameSample.parse(line, false));
// Parse the tagged line into a NameSample and append it to `document`.
// The second argument (false) is the clear-adaptive-data flag, so no reset is requested here.
// NOTE(review): `document` is presumably the list of samples of the current document — confirm against the enclosing code.
document.add(NameSample.parse(line, false));