/**
 * Parses a sentence given as whitespace separated {@code token_tag} pairs.
 *
 * Each pair is split at its last underscore: the part in front of it becomes
 * the token, the part behind it becomes the tag.
 *
 * @param sentenceString the sentence in {@code token_tag} format.
 *
 * @return a {@link POSSample} built from the extracted tokens and tags.
 *
 * @throws InvalidFormatException if a pair does not contain an underscore.
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
  String[] pairs = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

  String[] words = new String[pairs.length];
  String[] posTags = new String[pairs.length];

  int index = 0;
  for (String pair : pairs) {
    int separator = pair.lastIndexOf("_");

    if (separator == -1) {
      throw new InvalidFormatException("Cannot find \"_\" inside token '" + pair + "'!");
    }

    words[index] = pair.substring(0, separator);
    posTags[index] = pair.substring(separator + 1);
    index++;
  }

  return new POSSample(words, posTags);
}
/**
 * Compares this sample with another object.
 *
 * Two samples are considered equal when their sentence arrays and their tag
 * arrays are equal element by element. Nothing else is compared.
 *
 * @param obj the object to compare with, may be {@code null}.
 *
 * @return {@code true} if {@code obj} is this instance or a {@link POSSample}
 *     with equal sentence and tags, {@code false} otherwise.
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }

  // Anything that is not a POSSample (including null) can never be equal.
  if (!(obj instanceof POSSample)) {
    return false;
  }

  POSSample other = (POSSample) obj;

  if (!Arrays.equals(getSentence(), other.getSentence())) {
    return false;
  }

  return Arrays.equals(getTags(), other.getTags());
}
}
/**
 * Tests if it can parse an empty {@link String}.
 *
 * The resulting sample is expected to contain neither tokens nor tags.
 */
@Test
public void testParseEmptyString() throws InvalidFormatException {
  String sentence = "";

  POSSample sample = POSSample.parse(sentence);

  // JUnit expects (expected, actual); the reverse order produces misleading
  // failure messages.
  Assert.assertEquals(0, sample.getSentence().length);
  Assert.assertEquals(0, sample.getTags().length);
}
/**
 * Creates the events for one {@link POSSample}.
 *
 * The tokens, tags and additional context of the sample are handed to
 * {@code generateEvents} together with the context generator {@code cg}.
 *
 * @param sample the sample to create events for.
 *
 * @return an iterator over the generated events.
 */
@Override
protected Iterator<Event> createEvents(POSSample sample) {
  String[] tokens = sample.getSentence();
  String[] posTags = sample.getTags();
  Object[] additionalContext = sample.getAddictionalContext();

  return generateEvents(tokens, posTags, additionalContext, cg).iterator();
}
/**
 * Evaluates the given reference {@link POSSample} object.
 *
 * The sentence of the reference sample (and its additional context) is
 * tagged with the {@link POSTagger}. Every reference tag is then compared
 * with the predicted tag at the same position, and the word accuracy score
 * receives a 1 for a match and a 0 for a mismatch.
 *
 * @param reference the reference {@link POSSample}.
 *
 * @return the predicted {@link POSSample}.
 */
@Override
protected POSSample processSample(POSSample reference) {
  String[] predictedTags =
      tagger.tag(reference.getSentence(), reference.getAddictionalContext());
  String[] referenceTags = reference.getTags();

  for (int i = 0; i < referenceTags.length; i++) {
    boolean correct = referenceTags[i].equals(predictedTags[i]);
    wordAccuracy.add(correct ? 1 : 0);
  }

  return new POSSample(reference.getSentence(), predictedTags);
}
/**
 * Tags and chunks every line of the line-wise test stream and compares a
 * digest over all produced chunk labels with a fixed expected value.
 */
@Test
public void evalChunkerModel() throws Exception {
  MessageDigest checksum = MessageDigest.getInstance(HASH_ALGORITHM);

  POSModel posModel = new POSModel(
      new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"));
  POSTagger posTagger = new POSTaggerME(posModel);

  ChunkerModel chunkerModel = new ChunkerModel(
      new File(getOpennlpDataDir(), "models-sf/en-chunker.bin"));
  Chunker chunkerME = new ChunkerME(chunkerModel);

  try (ObjectStream<LeipzigTestSample> samples = createLineWiseStream()) {
    for (LeipzigTestSample current = samples.read(); current != null;
        current = samples.read()) {

      POSSample tagged = new POSSample(current.getText(), posTagger.tag(current.getText()));

      String[] chunkLabels = chunkerME.chunk(tagged.getSentence(), tagged.getTags());

      // Feed the chunk labels into the digest in the order they were produced.
      for (int i = 0; i < chunkLabels.length; i++) {
        checksum.update(chunkLabels[i].getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  BigInteger expected = new BigInteger("226003515785585284478071030961407561943");
  BigInteger actual = new BigInteger(1, checksum.digest());

  Assert.assertEquals(expected, actual);
}
/**
 * Extracts the sentence from a {@link POSSample}.
 *
 * @param sample the sample to read the sentence from.
 *
 * @return the tokens returned by {@link POSSample#getSentence()}.
 */
@Override
protected String[] toSentence(POSSample sample) {
  final String[] tokens = sample.getSentence();
  return tokens;
}
}
String[] tags = tagger.tag(whitespaceTokenizerLine); POSSample sample = new POSSample(whitespaceTokenizerLine, tags); String words[] = sample.getSentence();
/**
 * Creates the events for a sequence using tags predicted by the given model.
 *
 * A {@link POSTaggerME} is built around {@code model} and used to tag the
 * sentence of the sequence's source {@link POSSample}. The sentence, the
 * predicted tags and the sample's additional context are then turned into
 * events via {@code POSSampleEventStream.generateEvents}.
 *
 * @param sequence the sequence whose source is a {@link POSSample}.
 * @param model the model used to predict the tags.
 *
 * @return an array with one slot per token of the sentence, filled from the
 *     generated events.
 */
@SuppressWarnings("unchecked")
public Event[] updateContext(Sequence sequence, AbstractModel model) {
  Sequence<POSSample> typedSequence = sequence;

  POSModel posModel = new POSModel("x-unspecified", model, null, new POSTaggerFactory());
  POSTagger tagger = new POSTaggerME(posModel);

  POSSample source = typedSequence.getSource();
  String[] sentence = source.getSentence();
  Object[] additionalContext = source.getAddictionalContext();
  String[] predictedTags = tagger.tag(source.getSentence());

  // The generated list is copied into the pre-sized array and that array
  // itself is what gets returned.
  Event[] events = new Event[sentence.length];
  POSSampleEventStream.generateEvents(sentence, predictedTags, additionalContext, pcg)
      .toArray(events);

  return events;
}
/**
 * Tests if it can parse an empty token.
 *
 * The second pair of the input consists only of an underscore followed by a
 * tag, so the token at index 1 is expected to be the empty string.
 */
@Test
public void testParseEmtpyToken() throws InvalidFormatException {
  String sentence = "the_DT _NNS";

  POSSample sample = POSSample.parse(sentence);

  // JUnit expects (expected, actual); the reverse order produces misleading
  // failure messages.
  Assert.assertEquals("", sample.getSentence()[1]);
}
/**
 * Tests if it can parse a valid token_tag sentence.
 *
 * The string representation of the parsed sample must be identical to the
 * input it was parsed from.
 */
@Test
public void testParse() throws InvalidFormatException {
  String expected =
      "the_DT stories_NNS about_IN well-heeled_JJ "
          + "communities_NNS and_CC developers_NNS";

  String actual = POSSample.parse(expected).toString();

  Assert.assertEquals(expected, actual);
}
/**
 * Creates a {@link POSSample} by parsing a fixed token_tag sentence.
 *
 * @return the parsed sample.
 *
 * @throws InvalidFormatException if {@link POSSample#parse(String)} rejects
 *     the sentence.
 */
public static POSSample createGoldSample() throws InvalidFormatException {
  return POSSample.parse(
      "the_DT stories_NNS about_IN well-heeled_JJ "
          + "communities_NNS and_CC developers_NNS");
}
/**
 * Returns the tags of the {@code sample} held by the enclosing object.
 *
 * The passed sentence is not inspected.
 *
 * @param sentence the tokens to tag; ignored by this implementation.
 *
 * @return the tags of {@code sample}.
 */
public String[] tag(String[] sentence) {
  String[] sampleTags = sample.getTags();
  return sampleTags;
}
/**
 * Creates a sample from tokens, tags and optional additional context.
 *
 * The token and tag lists are wrapped in unmodifiable views, after which
 * {@code checkArguments()} is invoked. The additional context, when present,
 * is copied row by row into a new two-dimensional array.
 *
 * @param sentence the tokens of the sentence.
 * @param tags the tags of the sentence.
 * @param additionalContext the additional context, or {@code null} for none.
 */
public POSSample(List<String> sentence, List<String> tags, String[][] additionalContext) {
  this.sentence = Collections.unmodifiableList(sentence);
  this.tags = Collections.unmodifiableList(tags);

  checkArguments();

  String[][] contextCopy = null;

  if (additionalContext != null) {
    contextCopy = new String[additionalContext.length][];

    int row = 0;
    for (String[] original : additionalContext) {
      // Each row gets its own array so later changes to the caller's rows
      // are not visible through this sample.
      contextCopy[row++] = original.clone();
    }
  }

  this.additionalContext = contextCopy;
}
/**
 * Evaluates the given reference {@link POSSample} object.
 *
 * The {@link POSTagger} tags the sentence of the reference sample, taking
 * the sample's additional context into account. Each predicted tag is
 * compared with the reference tag at the same index; a match adds 1 to the
 * word accuracy score, a mismatch adds 0.
 *
 * @param reference the reference {@link POSSample}.
 *
 * @return the predicted {@link POSSample}.
 */
@Override
protected POSSample processSample(POSSample reference) {
  String[] predictedTags =
      tagger.tag(reference.getSentence(), reference.getAddictionalContext());
  String[] expectedTags = reference.getTags();

  int position = 0;
  while (position < expectedTags.length) {
    int hit = expectedTags[position].equals(predictedTags[position]) ? 1 : 0;
    wordAccuracy.add(hit);
    position++;
  }

  return new POSSample(reference.getSentence(), predictedTags);
}
@Test public void testPOSSampleSerDe() throws IOException { POSSample posSample = createGoldSample(); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); ObjectOutput out = new ObjectOutputStream(byteArrayOutputStream); out.writeObject(posSample); out.flush(); byte[] bytes = byteArrayOutputStream.toByteArray(); ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes); ObjectInput objectInput = new ObjectInputStream(byteArrayInputStream); POSSample deSerializedPOSSample = null; try { deSerializedPOSSample = (POSSample) objectInput.readObject(); } catch (ClassNotFoundException e) { // do nothing } Assert.assertNotNull(deSerializedPOSSample); Assert.assertArrayEquals(posSample.getAddictionalContext(), deSerializedPOSSample.getAddictionalContext()); Assert.assertArrayEquals(posSample.getSentence(), deSerializedPOSSample.getSentence()); Assert.assertArrayEquals(posSample.getTags(), deSerializedPOSSample.getTags()); }