/**
 * Creates a sentence stream that reads from the given document.
 *
 * @param source the document whose sentences this stream iterates over
 */
LetsmtSentenceStream(LetsmtDocument source) {
  this.source = source;

  // reset() needs the source assigned above, so it has to run last.
  reset();
}
/**
 * Parses the letsmt XML stored in the given file.
 *
 * @param file the file to read the XML from
 * @return the document produced by parsing the file's content
 * @throws IOException if the file cannot be opened or read, or if parsing fails
 */
static LetsmtDocument parse(File file) throws IOException {
  // try-with-resources closes the file stream whether or not parsing succeeds.
  try (InputStream fileIn = new FileInputStream(file)) {
    LetsmtDocument document = parse(fileIn);
    return document;
  }
}
}
/**
 * Restarts the stream by taking a fresh iterator over the sentences of the
 * source document.
 */
@Override
public void reset() {
  this.sentenceIt = this.source.getSentences().iterator();
}
}
CmdLineUtil.checkInputFile("Data", params.getData()); letsmtDoc = LetsmtDocument.parse(params.getData()); } catch (IOException ex) { CmdLineUtil.handleCreateObjectStreamError(ex); ObjectStream<SentenceSample> samples = new LetsmtSentenceStream(letsmtDoc); if (params.getDetokenizer() != null) { try { Detokenizer detokenizer = new DictionaryDetokenizer( new DetokenizationDictionary(params.getDetokenizer())); samples = new DetokenizeSentenceSampleStream(detokenizer, samples); } catch (IOException e) { throw new TerminateToolException(-1, "Failed to load detokenizer rules!", e);
try (InputStream letsmtXmlIn = LetsmtDocumentTest.class.getResourceAsStream("letsmt-with-words.xml");) { LetsmtDocument doc = LetsmtDocument.parse(letsmtXmlIn); List<LetsmtDocument.LetsmtSentence> sents = doc.getSentences(); Assert.assertNull(sent1.getNonTokenizedText()); "products", "." }, sent1.getTokens()); Assert.assertNull(sent2.getNonTokenizedText()); "below", "." }, sent2.getTokens());
@Override public SentenceSample read() throws IOException { StringBuilder sentencesString = new StringBuilder(); List<Span> sentenceSpans = new LinkedList<>(); for (int i = 0; sentenceIt.hasNext() && i < 25 ; i++) { LetsmtDocument.LetsmtSentence sentence = sentenceIt.next(); int begin = sentencesString.length(); if (sentence.getTokens() != null) { sentencesString.append(String.join(" ", sentence.getTokens())); } else if (sentence.getNonTokenizedText() != null) { sentencesString.append(sentence.getNonTokenizedText()); } sentenceSpans.add(new Span(begin, sentencesString.length())); sentencesString.append(' '); } // end of stream is reached, indicate that with null return value if (sentenceSpans.size() == 0) { return null; } return new SentenceSample(sentencesString.toString(), sentenceSpans.toArray(new Span[sentenceSpans.size()])); }
/**
 * Parses letsmt XML from the given stream with a SAX parser.
 * <p>
 * The stream is not closed by this method.
 *
 * @param letsmtXmlIn the stream to read the XML from
 * @return a document holding the sentences collected by the content handler
 * @throws IOException if reading fails, or wrapping the {@link SAXException}
 *     raised when the XML cannot be parsed
 */
static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException {
  // NOTE(review): whether DTDs/external entities are disabled depends on
  // XmlUtil.createSaxParser(), which is not visible here - confirm.
  SAXParser parser = XmlUtil.createSaxParser();

  try {
    XMLReader reader = parser.getXMLReader();

    LetsmtDocumentHandler handler = new LetsmtDocumentHandler();
    reader.setContentHandler(handler);

    InputSource xmlSource = new InputSource(letsmtXmlIn);
    reader.parse(xmlSource);

    return new LetsmtDocument(handler.sentences);
  } catch (SAXException e) {
    // Callers only deal with IOException; the SAX failure is kept as the cause.
    throw new IOException("Failed to parse letsmt xml!", e);
  }
}
/**
 * Registers a {@code LetsmtSentenceStreamFactory} with the
 * {@code StreamFactoryRegistry} for {@code SentenceSample} streams under the
 * format name {@code "letsmt"}.
 */
public static void registerFactory() {
  LetsmtSentenceStreamFactory factory =
      new LetsmtSentenceStreamFactory(LetsmtSentenceStreamFactory.Parameters.class);

  StreamFactoryRegistry.registerFactory(SentenceSample.class, "letsmt", factory);
}
@Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); // Note: // words are optional in sentences, if there are no words just the chars have to be captured switch (qName) { case "w": tokens.add(chars.toString().trim()); chars.setLength(0); break; // TODO: The sentence should contain the id, so it can be tracked back to the // place it came from case "s": LetsmtSentence sentence = new LetsmtSentence(); if (tokens.size() > 0) { sentence.tokens = tokens.toArray(new String[tokens.size()]); tokens = new ArrayList<>(); } else { sentence.nonTokenizedText = chars.toString().trim(); } sentences.add(sentence); chars.setLength(0); } } }
CmdLineUtil.checkInputFile("Data", params.getData()); letsmtDoc = LetsmtDocument.parse(params.getData()); } catch (IOException ex) { CmdLineUtil.handleCreateObjectStreamError(ex); ObjectStream<SentenceSample> samples = new LetsmtSentenceStream(letsmtDoc); if (params.getDetokenizer() != null) { try { Detokenizer detokenizer = new DictionaryDetokenizer( new DetokenizationDictionary(params.getDetokenizer())); samples = new DetokenizeSentenceSampleStream(detokenizer, samples); } catch (IOException e) { throw new TerminateToolException(-1, "Failed to load detokenizer rules!", e);
/**
 * Parses letsmt XML from the given stream with a SAX parser.
 * <p>
 * The stream is not closed by this method.
 *
 * @param letsmtXmlIn the stream to read the XML from
 * @return a document holding the sentences collected by the content handler
 * @throws IOException if reading fails, or wrapping the {@link SAXException}
 *     raised when the XML cannot be parsed
 */
static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException {
  // NOTE(review): whether DTDs/external entities are disabled depends on
  // XmlUtil.createSaxParser(), which is not visible here - confirm.
  SAXParser parser = XmlUtil.createSaxParser();

  try {
    XMLReader reader = parser.getXMLReader();

    LetsmtDocumentHandler handler = new LetsmtDocumentHandler();
    reader.setContentHandler(handler);

    InputSource xmlSource = new InputSource(letsmtXmlIn);
    reader.parse(xmlSource);

    return new LetsmtDocument(handler.sentences);
  } catch (SAXException e) {
    // Callers only deal with IOException; the SAX failure is kept as the cause.
    throw new IOException("Failed to parse letsmt xml!", e);
  }
}
/**
 * Parses the letsmt XML stored in the given file.
 *
 * @param file the file to read the XML from
 * @return the document produced by parsing the file's content
 * @throws IOException if the file cannot be opened or read, or if parsing fails
 */
static LetsmtDocument parse(File file) throws IOException {
  // try-with-resources closes the file stream whether or not parsing succeeds.
  try (InputStream fileIn = new FileInputStream(file)) {
    LetsmtDocument document = parse(fileIn);
    return document;
  }
}
}
/**
 * Restarts the stream by taking a fresh iterator over the sentences of the
 * source document.
 */
@Override
public void reset() {
  this.sentenceIt = this.source.getSentences().iterator();
}
}
/**
 * Creates a sentence stream that reads from the given document.
 *
 * @param source the document whose sentences this stream iterates over
 */
LetsmtSentenceStream(LetsmtDocument source) {
  this.source = source;

  // reset() needs the source assigned above, so it has to run last.
  reset();
}
/**
 * Registers a {@code LetsmtSentenceStreamFactory} with the
 * {@code StreamFactoryRegistry} for {@code SentenceSample} streams under the
 * format name {@code "letsmt"}.
 */
public static void registerFactory() {
  LetsmtSentenceStreamFactory factory =
      new LetsmtSentenceStreamFactory(LetsmtSentenceStreamFactory.Parameters.class);

  StreamFactoryRegistry.registerFactory(SentenceSample.class, "letsmt", factory);
}
/**
 * Parses letsmt XML from the given stream with a SAX parser.
 * <p>
 * The stream is not closed by this method.
 *
 * @param letsmtXmlIn the stream to read the XML from
 * @return a document holding the sentences collected by the content handler
 * @throws IOException if reading fails, or wrapping the {@link SAXException}
 *     raised when the XML cannot be parsed
 */
static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException {
  // NOTE(review): whether DTDs/external entities are disabled depends on
  // XmlUtil.createSaxParser(), which is not visible here - confirm.
  SAXParser parser = XmlUtil.createSaxParser();

  try {
    XMLReader reader = parser.getXMLReader();

    LetsmtDocumentHandler handler = new LetsmtDocumentHandler();
    reader.setContentHandler(handler);

    InputSource xmlSource = new InputSource(letsmtXmlIn);
    reader.parse(xmlSource);

    return new LetsmtDocument(handler.sentences);
  } catch (SAXException e) {
    // Callers only deal with IOException; the SAX failure is kept as the cause.
    throw new IOException("Failed to parse letsmt xml!", e);
  }
}
/**
 * Parses the letsmt XML stored in the given file.
 *
 * @param file the file to read the XML from
 * @return the document produced by parsing the file's content
 * @throws IOException if the file cannot be opened or read, or if parsing fails
 */
static LetsmtDocument parse(File file) throws IOException {
  // try-with-resources closes the file stream whether or not parsing succeeds.
  try (InputStream fileIn = new FileInputStream(file)) {
    LetsmtDocument document = parse(fileIn);
    return document;
  }
}
}
/**
 * Restarts the stream by taking a fresh iterator over the sentences of the
 * source document.
 */
@Override
public void reset() {
  this.sentenceIt = this.source.getSentences().iterator();
}
}
/**
 * Creates a sentence stream that reads from the given document.
 *
 * @param source the document whose sentences this stream iterates over
 */
LetsmtSentenceStream(LetsmtDocument source) {
  this.source = source;

  // reset() needs the source assigned above, so it has to run last.
  reset();
}
/**
 * Registers a {@code LetsmtSentenceStreamFactory} with the
 * {@code StreamFactoryRegistry} for {@code SentenceSample} streams under the
 * format name {@code "letsmt"}.
 */
public static void registerFactory() {
  LetsmtSentenceStreamFactory factory =
      new LetsmtSentenceStreamFactory(LetsmtSentenceStreamFactory.Parameters.class);

  StreamFactoryRegistry.registerFactory(SentenceSample.class, "letsmt", factory);
}