public String[] lemmatize(String[] toks, String[] tags) { try { LemmaSample predsSample = mSampleStream.read(); // checks if the streams are sync for (int i = 0; i < toks.length; i++) { if (!toks[i].equals(predsSample.getTokens()[i]) || !tags[i].equals(predsSample.getTags()[i])) { throw new RuntimeException("The streams are not sync!" + "\n expected sentence: " + Arrays.toString(toks) + "\n expected tags: " + Arrays.toString(tags) + "\n predicted sentence: " + Arrays.toString(predsSample.getTokens()) + "\n predicted tags: " + Arrays.toString(predsSample.getTags())); } } return predsSample.getLemmas(); } catch (IOException e) { throw new RuntimeException(e); } }
/**
 * Checks the evaluator results against the results got using the conlleval,
 * available at http://www.cnts.ua.ac.be/conll2000/chunking/output.html but
 * containing lemmas instead of chunks.
 *
 * @throws IOException if the test data file cannot be read
 */
@Test
public void testEvaluator() throws IOException {
  // Both streams read the same file: it carries expected and predicted lemmas
  // in separate columns, selected by the boolean flag passed to the stream.
  String inPredicted = "opennlp/tools/lemmatizer/output.txt";
  String inExpected = "opennlp/tools/lemmatizer/output.txt";
  String encoding = "UTF-8";

  DummyLemmaSampleStream predictedSample = new DummyLemmaSampleStream(
      new PlainTextByLineStream(
          new MockInputStreamFactory(new File(inPredicted)), encoding), true);

  DummyLemmaSampleStream expectedSample = new DummyLemmaSampleStream(
      new PlainTextByLineStream(
          new MockInputStreamFactory(new File(inExpected)), encoding), false);

  Lemmatizer dummyLemmatizer = new DummyLemmatizer(predictedSample);

  OutputStream stream = new ByteArrayOutputStream();
  LemmatizerEvaluationMonitor listener = new LemmaEvaluationErrorListener(stream);
  LemmatizerEvaluator evaluator = new LemmatizerEvaluator(dummyLemmatizer, listener);

  evaluator.evaluate(expectedSample);

  Assert.assertEquals(0.9877049180327869, evaluator.getWordAccuracy(), DELTA);
  // The error listener must have written at least one report. The original
  // assertNotSame(length, 0) compared boxed Integer references (identity, not
  // value) with reversed expected/actual order; assert the value directly.
  Assert.assertTrue("Expected error listener output to be non-empty",
      stream.toString().length() > 0);
}