@Override public SentenceSample read() throws IOException { StringBuilder sentencesString = new StringBuilder(); List<Span> sentenceSpans = new LinkedList<>(); for (int i = 0; sentenceIt.hasNext() && i < 25 ; i++) { LetsmtDocument.LetsmtSentence sentence = sentenceIt.next(); int begin = sentencesString.length(); if (sentence.getTokens() != null) { sentencesString.append(String.join(" ", sentence.getTokens())); } else if (sentence.getNonTokenizedText() != null) { sentencesString.append(sentence.getNonTokenizedText()); } sentenceSpans.add(new Span(begin, sentencesString.length())); sentencesString.append(' '); } // end of stream is reached, indicate that with null return value if (sentenceSpans.size() == 0) { return null; } return new SentenceSample(sentencesString.toString(), sentenceSpans.toArray(new Span[sentenceSpans.size()])); }
@Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); // Note: // words are optional in sentences, if there are no words just the chars have to be captured switch (qName) { case "w": tokens.add(chars.toString().trim()); chars.setLength(0); break; // TODO: The sentence should contain the id, so it can be tracked back to the // place it came from case "s": LetsmtSentence sentence = new LetsmtSentence(); if (tokens.size() > 0) { sentence.tokens = tokens.toArray(new String[tokens.size()]); tokens = new ArrayList<>(); } else { sentence.nonTokenizedText = chars.toString().trim(); } sentences.add(sentence); chars.setLength(0); } } }
@Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); // Note: // words are optional in sentences, if there are no words just the chars have to be captured switch (qName) { case "w": tokens.add(chars.toString().trim()); chars.setLength(0); break; // TODO: The sentence should contain the id, so it can be tracked back to the // place it came from case "s": LetsmtSentence sentence = new LetsmtSentence(); if (tokens.size() > 0) { sentence.tokens = tokens.toArray(new String[tokens.size()]); tokens = new ArrayList<>(); } else { sentence.nonTokenizedText = chars.toString().trim(); } sentences.add(sentence); chars.setLength(0); } } }
@Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); // Note: // words are optional in sentences, if there are no words just the chars have to be captured switch (qName) { case "w": tokens.add(chars.toString().trim()); chars.setLength(0); break; // TODO: The sentence should contain the id, so it can be tracked back to the // place it came from case "s": LetsmtSentence sentence = new LetsmtSentence(); if (tokens.size() > 0) { sentence.tokens = tokens.toArray(new String[tokens.size()]); tokens = new ArrayList<>(); } else { sentence.nonTokenizedText = chars.toString().trim(); } sentences.add(sentence); chars.setLength(0); } } }
@Override public SentenceSample read() throws IOException { StringBuilder sentencesString = new StringBuilder(); List<Span> sentenceSpans = new LinkedList<>(); for (int i = 0; sentenceIt.hasNext() && i < 25 ; i++) { LetsmtDocument.LetsmtSentence sentence = sentenceIt.next(); int begin = sentencesString.length(); if (sentence.getTokens() != null) { sentencesString.append(String.join(" ", sentence.getTokens())); } else if (sentence.getNonTokenizedText() != null) { sentencesString.append(sentence.getNonTokenizedText()); } sentenceSpans.add(new Span(begin, sentencesString.length())); sentencesString.append(' '); } // end of stream is reached, indicate that with null return value if (sentenceSpans.size() == 0) { return null; } return new SentenceSample(sentencesString.toString(), sentenceSpans.toArray(new Span[sentenceSpans.size()])); }
@Override public SentenceSample read() throws IOException { StringBuilder sentencesString = new StringBuilder(); List<Span> sentenceSpans = new LinkedList<>(); for (int i = 0; sentenceIt.hasNext() && i < 25 ; i++) { LetsmtDocument.LetsmtSentence sentence = sentenceIt.next(); int begin = sentencesString.length(); if (sentence.getTokens() != null) { sentencesString.append(String.join(" ", sentence.getTokens())); } else if (sentence.getNonTokenizedText() != null) { sentencesString.append(sentence.getNonTokenizedText()); } sentenceSpans.add(new Span(begin, sentencesString.length())); sentencesString.append(' '); } // end of stream is reached, indicate that with null return value if (sentenceSpans.size() == 0) { return null; } return new SentenceSample(sentencesString.toString(), sentenceSpans.toArray(new Span[sentenceSpans.size()])); }
Assert.assertNull(sent1.getNonTokenizedText()); "products", "." }, sent1.getTokens()); Assert.assertNull(sent2.getNonTokenizedText()); "below", "." }, sent2.getTokens());