/**
 * Creates a new immutable span that reuses the boundaries and type of an
 * existing span but carries a different probability.
 *
 * @param span the span whose start, end and type are copied; its own
 *             probability (missing or known to be wrong) is ignored
 * @param prob the probability to attach to the new span
 */
public Span(Span span, double prob) {
  this(span.getStart(), span.getEnd(), span.getType(), prob);
}
/**
 * Initializes a new immutable span as a copy of an existing span with both
 * boundaries shifted by the given offset.
 *
 * @param span   the span to shift; type and probability are kept as-is
 * @param offset the number of positions to add to start and end
 */
public Span(Span span, int offset) {
  this(span.getStart() + offset, span.getEnd() + offset, span.getType(), span.getProb());
}
/**
 * Generates a human readable string of the form {@code [start..end)} with
 * the type appended when one is present.
 */
@Override
public String toString() {
  String repr = "[" + getStart() + ".." + getEnd() + ")";
  if (getType() != null) {
    repr = repr + " " + getType();
  }
  return repr;
}
/**
 * Reads the next {@link NameSample} from the underlying stream and, as a
 * side effect, updates the sentence count, token count and per-type name
 * counters.
 *
 * @return the next sample, or {@code null} when the stream is exhausted
 * @throws IOException if reading from the underlying stream fails
 */
@Override
public NameSample read() throws IOException {
  NameSample sample = samples.read();
  if (sample != null) {
    sentenceCount++;
    tokenCount += sample.getSentence().length;
    for (Span nameSpan : sample.getNames()) {
      // merge() replaces the verbose get / null-check / put increment idiom.
      nameCounters.merge(nameSpan.getType(), 1, Integer::sum);
    }
  }
  return sample;
}
/**
 * Generates a hash code of the current span from its start, end and type.
 */
@Override
public int hashCode() {
  // Value-identical to Objects.hash(getStart(), getEnd(), getType()):
  // 31-based accumulation over the same three components, so equals() and
  // hashCode() stay consistent.
  int result = 1;
  result = 31 * result + getStart();
  result = 31 * result + getEnd();
  result = 31 * result + (getType() == null ? 0 : getType().hashCode());
  return result;
}
public String[] encode(Span[] names, int length) { String[] outcomes = new String[length]; for (int i = 0; i < outcomes.length; i++) { outcomes[i] = BioCodec.OTHER; } for (Span name : names) { if (name.getType() == null) { outcomes[name.getStart()] = "default" + "-" + BioCodec.START; } else { outcomes[name.getStart()] = name.getType() + "-" + BioCodec.START; } // now iterate from begin + 1 till end for (int i = name.getStart() + 1; i < name.getEnd(); i++) { if (name.getType() == null) { outcomes[i] = "default" + "-" + BioCodec.CONTINUE; } else { outcomes[i] = name.getType() + "-" + BioCodec.CONTINUE; } } } return outcomes; }
/**
 * Scores a reference/prediction pair that did not match exactly. Spans
 * present in both sets are counted as true positives, spans only in the
 * reference as false negatives, and spans only in the prediction as false
 * positives.
 */
public void missclassified(T reference, T prediction) {
  samples++;

  Set<Span> expected = new HashSet<>(Arrays.asList(asSpanArray(reference)));
  Set<Span> actual = new HashSet<>(Arrays.asList(asSpanArray(prediction)));

  for (Span span : expected) {
    if (actual.contains(span)) {
      addTruePositive(span.getType());
    } else {
      addFalseNegative(span.getType());
    }
  }

  for (Span span : actual) {
    if (!expected.contains(span)) {
      addFalsePositive(span.getType());
    }
  }
}
/**
 * Handles a closing MUC element: a closing name element completes the most
 * recently opened name span at the current token position, and a closing
 * content element flushes the collected tokens and names as a
 * {@link NameSample}.
 */
@Override
public void endElement(String name) {
  if (NAME_ELEMENT_NAMES.contains(name)) {
    // Close the most recently opened name at the current token count.
    Span openName = incompleteNames.pop();
    names.add(new Span(openName.getStart(), text.size(), openName.getType()));
  }

  if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
    storedSamples.add(new NameSample(text.toArray(new String[text.size()]),
        names.toArray(new Span[names.size()]), isClearAdaptiveData));

    // Only the first sample after a document boundary clears adaptive data.
    if (isClearAdaptiveData) {
      isClearAdaptiveData = false;
    }

    text.clear();
    names.clear();
    isInsideContentElement = false;
  }
}
}
/** * Compares the specified span to the current span. */ public int compareTo(Span s) { if (getStart() < s.getStart()) { return -1; } else if (getStart() == s.getStart()) { if (getEnd() > s.getEnd()) { return -1; } else if (getEnd() < s.getEnd()) { return 1; } else { // compare the type if (getType() == null && s.getType() == null) { return 0; } else if (getType() != null && s.getType() != null) { // use type lexicography order return getType().compareTo(s.getType()); } else if (getType() != null) { return -1; } return 1; } } else { return 1; } }
public void correctlyClassified(T reference, T prediction) { samples++; // add all true positives! Span[] spans = asSpanArray(reference); for (Span span : spans) { addTruePositive(span.getType()); } }
/**
 * Checks if the specified object is a span with the same boundaries and
 * the same (possibly null) type.
 */
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (!(o instanceof Span)) {
    return false;
  }
  Span other = (Span) o;
  return getStart() == other.getStart()
      && getEnd() == other.getEnd()
      && Objects.equals(getType(), other.getType());
}
/**
 * Reads the next sample and removes every name whose type is not in the
 * configured set of accepted types.
 *
 * @return the filtered sample, or {@code null} when the stream is exhausted
 * @throws IOException if reading from the underlying stream fails
 */
public NameSample read() throws IOException {
  NameSample sample = samples.read();
  if (sample == null) {
    return null;
  }

  List<Span> keptNames = new ArrayList<>();
  for (Span name : sample.getNames()) {
    if (types.contains(name.getType())) {
      keptNames.add(name);
    }
  }

  return new NameSample(sample.getId(), sample.getSentence(),
      keptNames.toArray(new Span[keptNames.size()]), null,
      sample.isClearAdaptiveDataSet());
}
}
/**
 * Returns a copy of this span with leading and trailing whitespace removed.
 *
 * @param text the text this span's offsets refer to
 *
 * @return the trimmed span, or this same object if there was nothing to
 *         trim; a span consisting only of whitespace collapses to an empty
 *         span positioned at the original start offset
 */
public Span trim(CharSequence text) {
  int trimmedStart = getStart();
  while (trimmedStart < getEnd() && StringUtil.isWhitespace(text.charAt(trimmedStart))) {
    trimmedStart++;
  }

  int trimmedEnd = getEnd();
  while (trimmedEnd > getStart() && StringUtil.isWhitespace(text.charAt(trimmedEnd - 1))) {
    trimmedEnd--;
  }

  if (trimmedStart == getStart() && trimmedEnd == getEnd()) {
    return this;
  }
  if (trimmedStart > trimmedEnd) {
    // All whitespace; collapse to an empty span at the start offset.
    return new Span(getStart(), getStart(), getType());
  }
  return new Span(trimmedStart, trimmedEnd, getType());
}
/**
 * Runs the name finder over the Leipzig test sentences and verifies that a
 * digest over all detected names matches the expected hash.
 *
 * @param model        the model under test
 * @param expectedHash the expected digest over the concatenated
 *                     (type, start, end) values of every detected name
 */
private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash)
    throws Exception {
  MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
  TokenNameFinder nameFinder = new NameFinderME(model);

  try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {
    LeipzigTestSample sample;
    while ((sample = lines.read()) != null) {
      for (Span name : nameFinder.find(sample.getText())) {
        // Fold each detected name into the digest; concatenation order
        // (type, start, end) must stay fixed for the hash to match.
        String key = name.getType() + name.getStart() + name.getEnd();
        digest.update(key.getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
}
/**
 * Checks that name types containing special characters are parsed.
 */
@Test
public void testTypeWithSpecialChars() throws Exception {
  NameSample parsedSample = NameSample.parse(
      "<START:type-1> U . S . <END> "
          + "President <START:type_2> Barack Obama <END> is considering sending "
          + "additional American forces to <START:type_3-/;.,&%$> Afghanistan <END> .",
      false);

  Span[] names = parsedSample.getNames();

  Assert.assertEquals(3, names.length);
  Assert.assertEquals("type-1", names[0].getType());
  Assert.assertEquals("type_2", names[1].getType());
  Assert.assertEquals("type_3-/;.,&%$", names[2].getType());
}
/**
 * Verifies that a regex spanning multiple tokens is found as a single
 * typed span over the matching token range.
 */
@Test
public void testFindTokenizdPattern() {
  Pattern testPattern = Pattern.compile("[0-9]+ year");
  String[] sentence = new String[]{"a", "80", "year", "b", "c"};

  Map<String, Pattern[]> regexMap = new HashMap<>();
  String type = "match";
  regexMap.put(type, new Pattern[]{testPattern});

  RegexNameFinder finder = new RegexNameFinder(regexMap);
  Span[] result = finder.find(sentence);

  // assertEquals reports expected vs. actual values on failure, unlike
  // assertTrue over a comparison expression.
  Assert.assertEquals(1, result.length);
  Assert.assertEquals(1, result[0].getStart());
  Assert.assertEquals(3, result[0].getEnd());
  Assert.assertEquals(type, result[0].getType());
}
/**
 * Filters the sample stream on a single type and checks that only names of
 * that type survive.
 */
@Test
public void testSingleFilter() throws IOException {
  filter = new NameSampleTypeFilter(new String[] {organization}, sampleStream(text));

  NameSample filtered = filter.read();

  Assert.assertEquals(1, filtered.getNames().length);
  Assert.assertEquals(organization, filtered.getNames()[0].getType());
}
/** * Evaluates the given reference {@link NameSample} object. * * This is done by finding the names with the * {@link TokenNameFinder} in the sentence from the reference * {@link NameSample}. The found names are then used to * calculate and update the scores. * * @param reference the reference {@link NameSample}. * * @return the predicted {@link NameSample}. */ @Override protected NameSample processSample(NameSample reference) { if (reference.isClearAdaptiveDataSet()) { nameFinder.clearAdaptiveData(); } Span[] predictedNames = nameFinder.find(reference.getSentence()); Span[] references = reference.getNames(); // OPENNLP-396 When evaluating with a file in the old format // the type of the span is null, but must be set to default to match // the output of the name finder. for (int i = 0; i < references.length; i++) { if (references[i].getType() == null) { references[i] = new Span(references[i].getStart(), references[i].getEnd(), "default"); } } fmeasure.updateScores(references, predictedNames); return new NameSample(reference.getSentence(), predictedNames, reference.isClearAdaptiveDataSet()); }
/** * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithEntitiesWithTypes() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithEntitiesWithTypes.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT"); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("eng", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = "NATO United States Barack Obama".split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 1, "organization"), names1[0]); // NATO Assert.assertEquals(new Span(1, 3, "location"), names1[1]); // United States Assert.assertEquals("person", names1[2].getType()); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); }
/** * Train NamefinderME using OnlyWithNamesWithTypes.train. * The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithNamesWithTypes() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("eng", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " + "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 2, "person"), names1[0]); Assert.assertEquals(new Span(2, 4, "person"), names1[1]); Assert.assertEquals(new Span(4, 6, "person"), names1[2]); Assert.assertEquals("person", names1[2].getType()); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); }