/**
 * Extracts the sentence tokens from the given sample.
 *
 * @param sample the sample whose sentence is requested
 * @return the array returned by {@code sample.getSentence()}
 */
@Override
protected String[] toSentence(POSSample sample) {
  final String[] tokens = sample.getSentence();
  return tokens;
}
} // closes the enclosing type, whose header lies outside this view
@Override public String toString() { StringBuilder result = new StringBuilder(); for (int i = 0; i < getSentence().length; i++) { result.append(getSentence()[i]); result.append('_'); result.append(getTags()[i]); result.append(' '); } if (result.length() > 0) { // get rid of last space result.setLength(result.length() - 1); } return result.toString(); }
/**
 * Computes a hash code from the content of the sentence and tag arrays.
 *
 * @return the combined hash of both arrays
 */
@Override
public int hashCode() {
  final int sentenceHash = Arrays.hashCode(getSentence());
  final int tagsHash = Arrays.hashCode(getTags());
  return Objects.hash(sentenceHash, tagsHash);
}
/**
 * Compares this sample with another object. Two samples are equal when
 * their sentence arrays and their tag arrays are element-wise equal.
 *
 * @param obj the object to compare with
 * @return {@code true} if {@code obj} is this instance or a
 *         {@link POSSample} with equal sentence and tags
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }
  if (!(obj instanceof POSSample)) {
    // Identity was already ruled out above, so nothing else can match.
    return false;
  }

  POSSample other = (POSSample) obj;
  return Arrays.equals(getSentence(), other.getSentence())
      && Arrays.equals(getTags(), other.getTags());
}
} // closes the enclosing type, whose header lies outside this view
/**
 * Reads the next {@link POSSample} from the underlying stream and
 * converts it into a {@link TokenSample} built from the sample's
 * sentence and the configured detokenizer.
 *
 * @return the converted sample, or {@code null} when the underlying
 *         stream returns {@code null}
 * @throws IOException if reading from the underlying stream fails
 */
public TokenSample read() throws IOException {
  POSSample posSample = samples.read();
  if (posSample == null) {
    return null;
  }
  return new TokenSample(detokenizer, posSample.getSentence());
}
} // closes the enclosing type, whose header lies outside this view
/**
 * Generates the events for one sample by handing its sentence, tags and
 * additional context to {@code generateEvents} together with the
 * context generator {@code cg}.
 *
 * @param sample the sample to turn into events
 * @return an iterator over the generated events
 */
@Override
protected Iterator<Event> createEvents(POSSample sample) {
  return generateEvents(
      sample.getSentence(),
      sample.getTags(),
      sample.getAddictionalContext(),
      cg).iterator();
}
/**
 * Reports a misclassified sample by forwarding the reference and
 * predicted tags, both samples, and the reference sentence to
 * {@code printError}.
 *
 * @param reference the expected sample
 * @param prediction the sample produced by the tagger
 */
@Override
public void missclassified(POSSample reference, POSSample prediction) {
  String[] expectedTags = reference.getTags();
  String[] predictedTags = prediction.getTags();
  printError(expectedTags, predictedTags, reference, prediction,
      reference.getSentence());
}
/**
 * Tags the sentence of the reference {@link POSSample} with the
 * {@link POSTagger} and scores the outcome.
 *
 * For every reference tag, {@code 1} is added to the word accuracy when
 * the predicted tag at the same position is equal, and {@code 0}
 * otherwise.
 *
 * @param reference the reference {@link POSSample}.
 *
 * @return a new {@link POSSample} holding the reference sentence and
 *         the predicted tags.
 */
@Override
protected POSSample processSample(POSSample reference) {
  String[] predictedTags =
      tagger.tag(reference.getSentence(), reference.getAddictionalContext());
  String[] referenceTags = reference.getTags();

  for (int pos = 0; pos < referenceTags.length; pos++) {
    wordAccuracy.add(referenceTags[pos].equals(predictedTags[pos]) ? 1 : 0);
  }

  return new POSSample(reference.getSentence(), predictedTags);
}
/**
 * Adds one reference/prediction pair to the statistics object returned
 * by {@code getStats()}.
 *
 * @param reference the expected sample; supplies the sentence and the
 *        reference tags
 * @param prediction the predicted sample; supplies the predicted tags
 */
private void statsAdd(POSSample reference, POSSample prediction) {
  String[] tokens = reference.getSentence();
  String[] expectedTags = reference.getTags();
  String[] predictedTags = prediction.getTags();
  getStats().add(tokens, expectedTags, predictedTags);
}
/**
 * Re-tags the source sample of the given sequence with a tagger built
 * from {@code model} and regenerates its events from the new tags.
 *
 * @param sequence the sequence whose source {@link POSSample} is re-tagged
 * @param model the model wrapped in a temporary {@link POSModel}
 * @return an array sized to the sentence length that receives the
 *         generated events
 */
@SuppressWarnings("unchecked")
public Event[] updateContext(Sequence sequence, AbstractModel model) {
  Sequence<POSSample> pss = sequence;
  POSSample source = pss.getSource();

  POSTagger tagger = new POSTaggerME(
      new POSModel("x-unspecified", model, null, new POSTaggerFactory()));

  String[] sentence = source.getSentence();
  Object[] ac = source.getAddictionalContext();
  // NOTE(review): the additional context is not passed to the tagger
  // here, only to the event generation below - confirm this is intended.
  String[] tags = tagger.tag(source.getSentence());

  Event[] events = new Event[sentence.length];
  POSSampleEventStream.generateEvents(sentence, tags, ac, pcg).toArray(events);
  return events;
}
/**
 * Builds a dictionary from the sentences of all samples in the stream.
 *
 * Every non-empty sentence is added to an {@link NGramModel} with a
 * minimum and maximum length of 1; the model is then cut off at
 * {@code cutoff} and converted to a dictionary.
 *
 * @param samples the stream to read the samples from
 * @param cutoff the lower bound passed to {@code NGramModel.cutoff}
 * @return the dictionary created via {@code toDictionary(true)}
 * @throws IOException if reading from {@code samples} fails
 */
public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
    throws IOException {
  NGramModel ngramModel = new NGramModel();

  for (POSSample sample = samples.read(); sample != null; sample = samples.read()) {
    String[] words = sample.getSentence();
    if (words.length > 0) {
      ngramModel.add(new StringList(words), 1, 1);
    }
  }

  ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
  return ngramModel.toDictionary(true);
}
@Override public Sequence read() throws IOException { POSSample sample = psi.read(); if (sample != null) { String[] sentence = sample.getSentence(); String[] tags = sample.getTags(); Event[] events = new Event[sentence.length]; for (int i = 0; i < sentence.length; i++) { // it is safe to pass the tags as previous tags because // the context generator does not look for non predicted tags String[] context = pcg.getContext(i, sentence, tags, null); events[i] = new Event(tags[i], context); } Sequence<POSSample> sequence = new Sequence<POSSample>(events,sample); return sequence; } return null; }
/**
 * Tests if it can parse an empty token.
 *
 * Parses {@code "the_DT _NNS"} and expects the second token to be the
 * empty string.
 */
@Test
public void testParseEmtpyToken() throws InvalidFormatException {
  String sentence = "the_DT _NNS";
  POSSample sample = POSSample.parse(sentence);

  // assertEquals takes (expected, actual); the arguments used to be
  // swapped, which produces misleading failure messages.
  Assert.assertEquals("", sample.getSentence()[1]);
}
@Test public void testPOSSampleSerDe() throws IOException { POSSample posSample = createGoldSample(); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); ObjectOutput out = new ObjectOutputStream(byteArrayOutputStream); out.writeObject(posSample); out.flush(); byte[] bytes = byteArrayOutputStream.toByteArray(); ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes); ObjectInput objectInput = new ObjectInputStream(byteArrayInputStream); POSSample deSerializedPOSSample = null; try { deSerializedPOSSample = (POSSample) objectInput.readObject(); } catch (ClassNotFoundException e) { // do nothing } Assert.assertNotNull(deSerializedPOSSample); Assert.assertArrayEquals(posSample.getAddictionalContext(), deSerializedPOSSample.getAddictionalContext()); Assert.assertArrayEquals(posSample.getSentence(), deSerializedPOSSample.getSentence()); Assert.assertArrayEquals(posSample.getTags(), deSerializedPOSSample.getTags()); }
/**
 * Tags and chunks every line of the line-wise test stream and feeds the
 * UTF-8 bytes of each produced chunk label into a message digest. The
 * resulting digest must match the recorded expected value.
 *
 * @throws Exception if a model cannot be loaded or the stream fails
 */
@Test
public void evalChunkerModel() throws Exception {
  MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

  File posModelFile = new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin");
  POSTagger tagger = new POSTaggerME(new POSModel(posModelFile));

  File chunkerModelFile = new File(getOpennlpDataDir(), "models-sf/en-chunker.bin");
  Chunker chunker = new ChunkerME(new ChunkerModel(chunkerModelFile));

  try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {
    for (LeipzigTestSample line = lines.read(); line != null; line = lines.read()) {
      POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText()));

      String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags());
      for (String chunk : chunks) {
        digest.update(chunk.getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  BigInteger expected = new BigInteger("226003515785585284478071030961407561943");
  BigInteger actual = new BigInteger(1, digest.digest());
  Assert.assertEquals(expected, actual);
}
/**
 * Tests if it can parse an empty {@link String}.
 *
 * The parsed sample is expected to contain no tokens and no tags.
 */
@Test
public void testParseEmptyString() throws InvalidFormatException {
  String sentence = "";
  POSSample sample = POSSample.parse(sentence);

  // assertEquals takes (expected, actual); the arguments used to be
  // swapped, which produces misleading failure messages.
  Assert.assertEquals(0, sample.getSentence().length);
  Assert.assertEquals(0, sample.getTags().length);
}
@Test public void testExpandME() throws IOException { // add one sentence with expandME = true try (ADPOSSampleStream stream = new ADPOSSampleStream( new PlainTextByLineStream(new ResourceAsStreamFactory( ADParagraphStreamTest.class, "/opennlp/tools/formats/ad.sample"), StandardCharsets.UTF_8), true, false)) { POSSample sample = stream.read(); Assert.assertEquals(27, sample.getSentence().length); Assert.assertEquals("Inicia", sample.getSentence()[0]); Assert.assertEquals("v-fin", sample.getTags()[0]); Assert.assertEquals("em", sample.getSentence()[1]); Assert.assertEquals("prp", sample.getTags()[1]); Assert.assertEquals("o", sample.getSentence()[2]); Assert.assertEquals("art", sample.getTags()[2]); Assert.assertEquals("Porto", sample.getSentence()[9]); Assert.assertEquals("B-prop", sample.getTags()[9]); Assert.assertEquals("Poesia", sample.getSentence()[10]); Assert.assertEquals("I-prop", sample.getTags()[10]); } }
@Test public void testSimple() throws IOException { // add one sentence with expandME = includeFeats = false try (ADPOSSampleStream stream = new ADPOSSampleStream( new PlainTextByLineStream(new ResourceAsStreamFactory( ADParagraphStreamTest.class, "/opennlp/tools/formats/ad.sample"), StandardCharsets.UTF_8), false, false)) { POSSample sample = stream.read(); Assert.assertEquals(23, sample.getSentence().length); Assert.assertEquals("Inicia", sample.getSentence()[0]); Assert.assertEquals("v-fin", sample.getTags()[0]); Assert.assertEquals("em", sample.getSentence()[1]); Assert.assertEquals("prp", sample.getTags()[1]); Assert.assertEquals("o", sample.getSentence()[2]); Assert.assertEquals("art", sample.getTags()[2]); Assert.assertEquals("Porto_Poesia", sample.getSentence()[9]); Assert.assertEquals("prop", sample.getTags()[9]); } }
@Test public void testIncludeFeats() throws IOException { // add one sentence with includeFeats = true try (ADPOSSampleStream stream = new ADPOSSampleStream( new PlainTextByLineStream(new ResourceAsStreamFactory( ADParagraphStreamTest.class, "/opennlp/tools/formats/ad.sample"), StandardCharsets.UTF_8), false, true)) { POSSample sample = stream.read(); Assert.assertEquals(23, sample.getSentence().length); Assert.assertEquals("Inicia", sample.getSentence()[0]); Assert.assertEquals("v-fin=PR=3S=IND=VFIN", sample.getTags()[0]); Assert.assertEquals("em", sample.getSentence()[1]); Assert.assertEquals("prp", sample.getTags()[1]); Assert.assertEquals("o", sample.getSentence()[2]); Assert.assertEquals("art=DET=M=S", sample.getTags()[2]); Assert.assertEquals("Porto_Poesia", sample.getSentence()[9]); Assert.assertEquals("prop=M=S", sample.getTags()[9]); } }
new WordTagSampleStream(new CollectionObjectStream<>(sampleString))) { POSSample sample = stream.read(); String[] words = sample.getSentence();