/**
 * Runs the sentence detector over the document of the given sample, scores the
 * detected spans against the sample's own spans and returns a new sample that
 * carries the detected spans.
 *
 * @param sample the reference sample
 * @return a sample with the same document text and the (trimmed) predicted spans
 */
@Override
protected SentenceSample processSample(SentenceSample sample) {
  String document = sample.getDocument();

  // Both span sets go through trimSpans before they are compared.
  Span[] predictions = trimSpans(document, sentenceDetector.sentPosDetect(document));
  Span[] references = trimSpans(document, sample.getSentences());

  fmeasure.updateScores(references, predictions);

  return new SentenceSample(document, predictions);
}
/**
 * Two samples are equal when their document texts are equal and their sentence
 * span arrays are element-wise equal.
 *
 * @param obj the object to compare with
 * @return {@code true} if {@code obj} is a {@code SentenceSample} with the same
 *         document and sentence spans
 */
@Override
public boolean equals(Object obj) {
  if (obj == this) {
    return true;
  }
  if (!(obj instanceof SentenceSample)) {
    return false;
  }

  SentenceSample other = (SentenceSample) obj;
  if (!getDocument().equals(other.getDocument())) {
    return false;
  }
  return Arrays.equals(getSentences(), other.getSentences());
}
} // end of enclosing type (declared outside this excerpt)
/**
 * Builds one sample out of up to 25 consecutive lines of the underlying stream.
 * Each line is trimmed, appended to the document text and followed by a single
 * space; the span of a sentence covers the trimmed line only.
 *
 * @return the next sample, or {@code null} if no line could be read
 * @throws IOException if reading from the underlying stream fails
 */
public SentenceSample read() throws IOException {
  StringBuilder sentencesString = new StringBuilder();
  List<Span> sentenceSpans = new LinkedList<>();

  int linesTaken = 0;
  String sentence;
  while (linesTaken < 25 && (sentence = samples.read()) != null) {
    int begin = sentencesString.length();
    sentencesString.append(sentence.trim());
    sentenceSpans.add(new Span(begin, sentencesString.length()));

    // separator between sentences; not part of any span
    sentencesString.append(' ');
    linesTaken++;
  }

  if (sentenceSpans.isEmpty()) {
    return null;
  }

  Span[] spans = sentenceSpans.toArray(new Span[0]);
  return new SentenceSample(sentencesString.toString(), spans);
}
} // end of enclosing type (declared outside this excerpt)
/**
 * Ignores the given text and answers with the sentence spans of the
 * {@code sample} this object has in scope.
 *
 * @param s the text to split (unused)
 * @return the spans reported by {@code sample.getSentences()}
 */
public Span[] sentPosDetect(String s) {
  Span[] knownSpans = sample.getSentences();
  return knownSpans;
}
/**
 * Reads every sample from the {@code ad.sample} test resource, echoes each
 * document to standard output followed by a {@code <fim>} marker line, and
 * collects the samples in {@code samples}.
 *
 * @throws IOException if the resource cannot be read
 */
@Before
public void setup() throws IOException {
  InputStreamFactory in = new ResourceAsStreamFactory(
      ADSentenceSampleStreamTest.class, "/opennlp/tools/formats/ad.sample");

  try (ADSentenceSampleStream stream = new ADSentenceSampleStream(
      new PlainTextByLineStream(in, StandardCharsets.UTF_8), true)) {

    // a null sample ends the loop
    for (SentenceSample sample = stream.read(); sample != null; sample = stream.read()) {
      System.out.println(sample.getDocument());
      System.out.println("<fim>");
      samples.add(sample);
    }
  }
}
/**
 * Forwards a misclassified sample to {@code printError}, passing the reference
 * and predicted sentence spans, both samples, and the reference document text.
 *
 * @param reference the sample holding the expected sentence spans
 * @param prediction the sample holding the predicted sentence spans
 */
@Override
public void missclassified(SentenceSample reference, SentenceSample prediction) {
  Span[] expectedSpans = reference.getSentences();
  Span[] predictedSpans = prediction.getSentences();

  printError(expectedSpans, predictedSpans, reference, prediction,
      reference.getDocument());
}
/**
 * Concatenates the text comments of up to {@code sentencesPerSample} sentences
 * into one document, separated by single spaces, and records the span of each
 * sentence within that document.
 *
 * @return the next sample, or {@code null} if the document text stayed empty
 * @throws IOException if reading from the underlying stream fails
 */
@Override
public SentenceSample read() throws IOException {
  StringBuilder documentText = new StringBuilder();
  List<Span> sentenceSpans = new ArrayList<>();

  int consumed = 0;
  ConlluSentence sentence;
  while (consumed < sentencesPerSample && (sentence = samples.read()) != null) {
    int startIndex = documentText.length();
    documentText.append(sentence.getTextComment());

    // the span ends before the separator that is appended next
    sentenceSpans.add(new Span(startIndex, documentText.length()));
    documentText.append(' ');
    consumed++;
  }

  if (documentText.length() == 0) {
    return null;
  }

  // drop the separator that follows the last sentence
  documentText.setLength(documentText.length() - 1);

  Span[] spans = sentenceSpans.toArray(new Span[0]);
  return new SentenceSample(documentText, spans);
}
} // end of enclosing type (declared outside this excerpt)
/**
 * Reads the next sample from the wrapped stream and rebuilds its document:
 * every sentence is cut out of the original text, split on whitespace,
 * detokenized, and appended to a new document followed by a single space.
 * The returned spans refer to the rebuilt document.
 *
 * @return the rebuilt sample, or {@code null} if the wrapped stream returned
 *         {@code null}
 * @throws IOException if reading from the wrapped stream fails
 */
@Override
public SentenceSample read() throws IOException {
  SentenceSample sample = samples.read();
  if (sample == null) {
    return null;
  }

  // Phase 1: extract the covered text of every sentence.
  List<String> sentenceTexts = new ArrayList<>();
  for (Span sentenceSpan : sample.getSentences()) {
    int from = sentenceSpan.getStart();
    int to = sentenceSpan.getEnd();
    sentenceTexts.add(sample.getDocument().substring(from, to));
  }

  // Phase 2: detokenize each sentence and lay the results out one after another.
  StringBuilder documentText = new StringBuilder();
  List<Span> newSentenceSpans = new ArrayList<>();
  for (String sentenceText : sentenceTexts) {
    String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sentenceText);

    int begin = documentText.length();
    documentText.append(detokenizer.detokenize(tokens, null));
    int end = documentText.length();

    newSentenceSpans.add(new Span(begin, end));
    documentText.append(' ');
  }

  Span[] spans = newSentenceSpans.toArray(new Span[0]);
  return new SentenceSample(documentText, spans);
}
} // end of enclosing type (declared outside this excerpt)
/**
 * Combines the document with the hash of the sentence span array.
 *
 * @return a hash code derived from the document and the sentence spans
 */
@Override
public int hashCode() {
  // Same value Objects.hash(document, spansHash) produces: that method hashes
  // its arguments as an Object[] via Arrays.hashCode.
  Object[] components = {getDocument(), Arrays.hashCode(getSentences())};
  return Arrays.hashCode(components);
}
/**
 * Builds one sample from consecutive non-empty lines of the underlying stream.
 * An empty line terminates the current sample. Each line is trimmed, passed
 * through {@code replaceNewLineEscapeTags}, appended to the document text and
 * followed by a single space; the span of a sentence excludes that space.
 *
 * <p>Empty lines encountered before the first sentence of a sample are skipped.
 * Without this, a leading empty line or two consecutive empty lines would make
 * this method return {@code null} — which signals end of stream — although
 * more lines are still available.
 *
 * @return the next sample, or {@code null} once the underlying stream is
 *         exhausted without yielding another sentence
 * @throws IOException if reading from the underlying stream fails
 */
public SentenceSample read() throws IOException {
  StringBuilder sentencesString = new StringBuilder();
  List<Span> sentenceSpans = new LinkedList<>();

  String sentence;
  while ((sentence = samples.read()) != null) {
    if (sentence.isEmpty()) {
      if (sentenceSpans.isEmpty()) {
        // separator before any content: keep looking for the next document
        continue;
      }
      // separator after content: the current document is complete
      break;
    }

    int begin = sentencesString.length();
    sentence = sentence.trim();
    sentence = replaceNewLineEscapeTags(sentence);
    sentencesString.append(sentence);
    int end = sentencesString.length();
    sentenceSpans.add(new Span(begin, end));

    // separator between sentences; not part of any span
    sentencesString.append(' ');
  }

  if (sentenceSpans.size() > 0) {
    return new SentenceSample(sentencesString.toString(),
        sentenceSpans.toArray(new Span[sentenceSpans.size()]));
  }

  return null;
}
} // end of enclosing type (declared outside this excerpt)
/**
 * Verifies that a sample hands back the document text and the sentence spans
 * it was constructed with.
 */
@Test
public void testRetrievingContent() {
  SentenceSample sample =
      new SentenceSample("1. 2.", new Span(0, 2), new Span(3, 5));

  Assert.assertEquals("1. 2.", sample.getDocument());

  Span[] sentences = sample.getSentences();
  Assert.assertEquals(new Span(0, 2), sentences[0]);
  Assert.assertEquals(new Span(3, 5), sentences[1]);
}
/**
 * Creates one event per candidate position the scanner reports inside each
 * sentence of the sample. Within a sentence, the last candidate gets the
 * {@code SPLIT} outcome and every earlier one {@code NO_SPLIT}. The context of
 * an event is computed on the whole document at the candidate's document
 * offset.
 *
 * @param sample the sample to turn into events
 * @return an iterator over the created events
 */
@Override
protected Iterator<Event> createEvents(SentenceSample sample) {
  Collection<Event> events = new ArrayList<>();

  for (Span sentenceSpan : sample.getSentences()) {
    String sentenceString =
        sentenceSpan.getCoveredText(sample.getDocument()).toString();

    Iterator<Integer> positions = scanner.getPositions(sentenceString).iterator();
    while (positions.hasNext()) {
      int candidate = positions.next();

      // only the final candidate of a sentence marks a split
      String type = positions.hasNext()
          ? SentenceDetectorME.NO_SPLIT
          : SentenceDetectorME.SPLIT;

      // candidate is relative to the sentence; shift it to a document offset
      int documentOffset = sentenceSpan.getStart() + candidate;
      events.add(new Event(type, cg.getContext(sample.getDocument(), documentOffset)));
    }
  }

  return events.iterator();
}
} // end of enclosing type (declared outside this excerpt)
@Override public SentenceSample read() throws IOException { StringBuilder sentencesString = new StringBuilder(); List<Span> sentenceSpans = new LinkedList<>(); while (sentenceIt.hasNext()) { IrishSentenceBankDocument.IrishSentenceBankSentence sentence = sentenceIt.next(); int begin = sentencesString.length(); if (sentence.getOriginal() != null) { sentencesString.append(sentence.getOriginal()); } sentenceSpans.add(new Span(begin, sentencesString.length())); sentencesString.append(' '); } // end of stream is reached, indicate that with null return value if (sentenceSpans.size() == 0) { return null; } return new SentenceSample(sentencesString.toString(), sentenceSpans.toArray(new Span[sentenceSpans.size()])); }
/**
 * Evaluates one sample: detects sentences in its document, updates the
 * F-measure with the trimmed reference and predicted spans, and returns the
 * prediction as a new sample.
 *
 * @param sample the reference sample
 * @return a sample with the same document text and the (trimmed) predicted spans
 */
@Override
protected SentenceSample processSample(SentenceSample sample) {
  Span[] detected = sentenceDetector.sentPosDetect(sample.getDocument());

  Span[] predictions = trimSpans(sample.getDocument(), detected);
  Span[] references = trimSpans(sample.getDocument(), sample.getSentences());
  fmeasure.updateScores(references, predictions);

  SentenceSample predicted = new SentenceSample(sample.getDocument(), predictions);
  return predicted;
}
/**
 * Checks the first collected sample: it has a document, three sentences, and
 * the first two sentences cover the spans [0, 119) and [120, 180).
 *
 * @throws IOException declared by the test signature
 */
@Test
public void testSentences() throws IOException {
  SentenceSample first = samples.get(0);
  Assert.assertNotNull(first.getDocument());

  Span[] sentences = first.getSentences();
  Assert.assertEquals(3, sentences.length);
  Assert.assertEquals(new Span(0, 119), sentences[0]);
  Assert.assertEquals(new Span(120, 180), sentences[1]);
}
@Override public SentenceSample read() throws IOException { StringBuilder sentencesString = new StringBuilder(); List<Span> sentenceSpans = new LinkedList<>(); for (int i = 0; sentenceIt.hasNext() && i < 25 ; i++) { LetsmtDocument.LetsmtSentence sentence = sentenceIt.next(); int begin = sentencesString.length(); if (sentence.getTokens() != null) { sentencesString.append(String.join(" ", sentence.getTokens())); } else if (sentence.getNonTokenizedText() != null) { sentencesString.append(sentence.getNonTokenizedText()); } sentenceSpans.add(new Span(begin, sentencesString.length())); sentencesString.append(' '); } // end of stream is reached, indicate that with null return value if (sentenceSpans.size() == 0) { return null; } return new SentenceSample(sentencesString.toString(), sentenceSpans.toArray(new Span[sentenceSpans.size()])); }
/**
 * Scores the sentence detector on a single sample and returns what it
 * predicted.
 *
 * @param sample the reference sample
 * @return a sample with the same document text and the (trimmed) predicted spans
 */
@Override
protected SentenceSample processSample(SentenceSample sample) {
  final String text = sample.getDocument();

  // predicted spans, trimmed the same way as the reference spans below
  final Span[] predictions = trimSpans(text, sentenceDetector.sentPosDetect(text));

  // reference spans taken from the sample itself
  final Span[] references = trimSpans(text, sample.getSentences());

  fmeasure.updateScores(references, predictions);
  return new SentenceSample(text, predictions);
}
@Test public void testSentenceSampleSerDe() throws IOException { SentenceSample sentenceSample = createGoldSample(); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); ObjectOutput out = new ObjectOutputStream(byteArrayOutputStream); out.writeObject(sentenceSample); out.flush(); byte[] bytes = byteArrayOutputStream.toByteArray(); ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes); ObjectInput objectInput = new ObjectInputStream(byteArrayInputStream); SentenceSample deSerializedSentenceSample = null; try { deSerializedSentenceSample = (SentenceSample) objectInput.readObject(); } catch (ClassNotFoundException e) { // do nothing } Assert.assertNotNull(deSerializedSentenceSample); Assert.assertEquals(sentenceSample.getDocument(), deSerializedSentenceSample.getDocument()); Assert.assertArrayEquals(sentenceSample.getSentences(), deSerializedSentenceSample.getSentences()); }
public SentenceSample read() throws IOException { List<String[]> sentences = new ArrayList<>(); T posSample; int chunks = 0; while ((posSample = samples.read()) != null && chunks < chunkSize) { sentences.add(toSentence(posSample)); chunks++; } if (sentences.size() > 0) { return new SentenceSample(detokenizer, sentences.toArray(new String[sentences.size()][])); } else if (posSample != null) { return read(); // filter out empty line } return null; // last sample was read } }