/** The actual implementation of a tokenized sentence constructor */ protected Sentence(Function<String, Document> doc, List<String> tokens, Properties props) { this(doc.apply(StringUtils.join(tokens.stream().map(x -> x.replace(' ', 'ߝ' /* some random character */)), " ")), props); // Clean up whitespace for (int i = 0; i < impl.getTokenCount(); ++i) { this.impl.getTokenBuilder(i).setWord(this.impl.getTokenBuilder(i).getWord().replace('ߝ', ' ')); this.impl.getTokenBuilder(i).setValue(this.impl.getTokenBuilder(i).getValue().replace('ߝ', ' ')); this.tokensBuilders.get(i).setWord(this.tokensBuilders.get(i).getWord().replace('ߝ', ' ')); this.tokensBuilders.get(i).setValue(this.tokensBuilders.get(i).getValue().replace('ߝ', ' ')); } }
/** * A funky little helper method to interpret each token of the sentence as an HTML string, and translate it back to text. * Note that this is <b>in place</b>. */ public void unescapeHTML() { // Change in the protobuf for (int i = 0; i < sentence.length(); ++i) { CoreNLPProtos.Token.Builder token = sentence.rawToken(i); token.setWord(StringUtils.unescapeHtml3(token.getWord())); token.setLemma(StringUtils.unescapeHtml3(token.getLemma())); } // Change in the annotation CoreMap cm = sentence.document.asAnnotation().get(CoreAnnotations.SentencesAnnotation.class).get(sentence.sentenceIndex()); for (CoreLabel token : cm.get(CoreAnnotations.TokensAnnotation.class)) { token.setWord(StringUtils.unescapeHtml3(token.word())); token.setLemma(StringUtils.unescapeHtml3(token.lemma())); } } }
/** The actual implementation of a tokenized sentence constructor */ protected Sentence(Function<String, Document> doc, List<String> tokens, Properties props) { this(doc.apply(StringUtils.join(tokens.stream().map(x -> x.replace(' ', 'ߝ' /* some random character */)), " ")), props); // Clean up whitespace for (int i = 0; i < impl.getTokenCount(); ++i) { this.impl.getTokenBuilder(i).setWord(this.impl.getTokenBuilder(i).getWord().replace('ߝ', ' ')); this.impl.getTokenBuilder(i).setValue(this.impl.getTokenBuilder(i).getValue().replace('ߝ', ' ')); this.tokensBuilders.get(i).setWord(this.tokensBuilders.get(i).getWord().replace('ߝ', ' ')); this.tokensBuilders.get(i).setValue(this.tokensBuilders.get(i).getValue().replace('ߝ', ' ')); } }
/** * A funky little helper method to interpret each token of the sentence as an HTML string, and translate it back to text. * Note that this is <b>in place</b>. */ public void unescapeHTML() { // Change in the protobuf for (int i = 0; i < sentence.length(); ++i) { CoreNLPProtos.Token.Builder token = sentence.rawToken(i); token.setWord(StringUtils.unescapeHtml3(token.getWord())); token.setLemma(StringUtils.unescapeHtml3(token.getLemma())); } // Change in the annotation CoreMap cm = sentence.document.asAnnotation().get(CoreAnnotations.SentencesAnnotation.class).get(sentence.sentenceIndex()); for (CoreLabel token : cm.get(CoreAnnotations.TokensAnnotation.class)) { token.setWord(StringUtils.unescapeHtml3(token.word())); token.setLemma(StringUtils.unescapeHtml3(token.lemma())); } } }