@Override public boolean keepArgument(JCas jCas) { List<Sentence> sentences = new ArrayList<>(JCasUtil.select(jCas, Sentence.class)); // remove one-sentence arguments if (sentences.size() == 1) { return false; } for (Sentence s : sentences) { if (s.getCoveredText().length() > MAX_SENTENCE_LENGTH) { return false; } } return true; } }
/**
 * Pairs every tree root of the last conversion with the covered text of its sentence.
 * <p>
 * <B>NOTE:</B> Must be called only after one of the conversion methods was called.
 *
 * @return a mapping between a root, and the text of the tree's sentence. This is an ordered map,
 *         ordered by the order of sentences in the text.
 * @throws CasTreeConverterException if no conversion result is stored yet, or if the stored
 *         root list and sentence list differ in size
 */
public LinkedHashMap<BasicNode, String> getTreesToSentences() throws CasTreeConverterException {
    boolean conversionAvailable = lastRootList != null && lastSentenceList != null;
    if (!conversionAvailable) {
        throw new CasTreeConverterException("getTreesToSentences() called before a conversion method was called.");
    }

    int rootCount = lastRootList.size();
    int sentenceCount = lastSentenceList.size();
    if (rootCount != sentenceCount) {
        throw new CasTreeConverterException("Internal error - lastRootList(size=" + rootCount + ") and lastSentenceList(size=" + sentenceCount + ") are in different sizes.");
    }

    // Both lists have the same length, so they can be walked in lock-step.
    LinkedHashMap<BasicNode, String> treeToText = new LinkedHashMap<BasicNode, String>(rootCount);
    Iterator<Sentence> sentenceCursor = lastSentenceList.iterator();
    for (BasicNode root : lastRootList) {
        treeToText.put(root, sentenceCursor.next().getCoveredText());
    }
    return treeToText;
}
public static List<HITSentence> extractSentences(StandaloneArgument argument) throws IOException { // extract sentences List<HITSentence> result = new ArrayList<>(); ArrayList<Sentence> sentences = new ArrayList<>( JCasUtil.select(argument.getJCas(), Sentence.class)); for (int i = 0; i < sentences.size(); i++) { Sentence sentence = sentences.get(i); HITSentence s = new HITSentence(); // position s.position = i; // create unique id by combining argument id and sentence position s.sentenceId = StandaloneArgument.getSentenceID(argument, s.position); s.text = sentence.getCoveredText(); result.add(s); } return result; }
private Collection<? extends HITSentence> extractSentencesForReasons( StandaloneArgument argument) throws IOException { // extract sentences List<HITSentenceReason> result = new ArrayList<>(); ArrayList<Sentence> sentences = new ArrayList<>( JCasUtil.select(argument.getJCas(), Sentence.class)); for (int i = 0; i < sentences.size(); i++) { Sentence sentence = sentences.get(i); HITSentenceReason s = new HITSentenceReason(); // position s.position = i; // create unique id by combining argument id and sentence position s.sentenceId = StandaloneArgument.getSentenceID(argument, s.position); s.text = sentence.getCoveredText(); // find out whether this sentence is already covered by a claim List<Claim> coveringClaims = JCasUtil.selectCovering(Claim.class, sentence); s.disabled = !coveringClaims.isEmpty(); // there can't be any claims at the moment! if (s.disabled) { throw new IllegalStateException("No claim annotations are allowed at this point"); } result.add(s); } return result; }
/**
 * Registers an {@code AnnotationUnit} for every token of every sentence in the CAS.
 * <p>
 * Each unit is appended to {@code units} and mapped in {@code unitsLineNumber} to the address
 * {@code "<sentence number>-<token number>"}, both numbers starting at 1. The unit of the first
 * token of a sentence is additionally mapped to the sentence text in {@code sentenceUnits}.
 *
 * @param aJCas the CAS providing the {@code Sentence} and {@code Token} annotations
 */
private void setTokenSentenceAddress(JCas aJCas) {
    int sentenceNumber = 0;
    for (Sentence currentSentence : select(aJCas, Sentence.class)) {
        sentenceNumber++;

        int tokenNumber = 0;
        for (Token currentToken : selectCovered(Token.class, currentSentence)) {
            tokenNumber++;

            AnnotationUnit tokenUnit = new AnnotationUnit(currentToken.getBegin(),
                    currentToken.getEnd(), false, currentToken.getCoveredText());
            units.add(tokenUnit);

            // only the first token of a sentence carries the sentence text
            boolean firstTokenOfSentence = tokenNumber == 1;
            if (firstTokenOfSentence) {
                sentenceUnits.put(tokenUnit, currentSentence.getCoveredText());
            }

            unitsLineNumber.put(tokenUnit, sentenceNumber + "-" + tokenNumber);
        }
    }
}
/**
 * Registers an {@code AnnotationUnit} for every token of every sentence in the CAS.
 * <p>
 * Each unit is appended to {@code units} and mapped in {@code unitsLineNumber} to the address
 * {@code "<sentence number>-<token number>"}, both numbers starting at 1. The unit of the first
 * token of a sentence is additionally mapped to the sentence text in {@code sentenceUnits}.
 *
 * @param aJCas the CAS providing the {@code Sentence} and {@code Token} annotations
 */
private void setTokenSentenceAddress(JCas aJCas) {
    int sentenceNumber = 0;
    for (Sentence currentSentence : select(aJCas, Sentence.class)) {
        sentenceNumber++;

        int tokenNumber = 0;
        for (Token currentToken : selectCovered(Token.class, currentSentence)) {
            tokenNumber++;

            AnnotationUnit tokenUnit = new AnnotationUnit(currentToken.getBegin(),
                    currentToken.getEnd(), false, currentToken.getCoveredText());
            units.add(tokenUnit);

            // only the first token of a sentence carries the sentence text
            boolean firstTokenOfSentence = tokenNumber == 1;
            if (firstTokenOfSentence) {
                sentenceUnits.put(tokenUnit, currentSentence.getCoveredText());
            }

            unitsLineNumber.put(tokenUnit, sentenceNumber + "-" + tokenNumber);
        }
    }
}
/**
 * Registers an {@code AnnotationUnit} for every token of every sentence in the CAS.
 * <p>
 * Each unit is appended to {@code units} and mapped in {@code unitsLineNumber} to the address
 * {@code "<sentence number>-<token number>"}, both numbers starting at 1. The unit of the first
 * token of a sentence is additionally mapped to the sentence text in {@code sentenceUnits}.
 *
 * @param aJCas the CAS providing the {@code Sentence} and {@code Token} annotations
 */
private void setTokenSentenceAddress(JCas aJCas) {
    int sentenceNumber = 0;
    for (Sentence currentSentence : select(aJCas, Sentence.class)) {
        sentenceNumber++;

        int tokenNumber = 0;
        for (Token currentToken : selectCovered(Token.class, currentSentence)) {
            tokenNumber++;

            AnnotationUnit tokenUnit = new AnnotationUnit(currentToken.getBegin(),
                    currentToken.getEnd(), false, currentToken.getCoveredText());
            units.add(tokenUnit);

            // only the first token of a sentence carries the sentence text
            boolean firstTokenOfSentence = tokenNumber == 1;
            if (firstTokenOfSentence) {
                sentenceUnits.put(tokenUnit, currentSentence.getCoveredText());
            }

            unitsLineNumber.put(tokenUnit, sentenceNumber + "-" + tokenNumber);
        }
    }
}
/**
 * Writes one sentence to the given output.
 * <p>
 * The covered text of the sentence is split at {@code LINE_BREAK}; each resulting piece is
 * emitted as {@code PREFIX_TEXT}, the escaped piece, and a {@code LINE_BREAK}. After that every
 * token and each of its sub-tokens is written through the matching {@code write} overload, each
 * followed by a {@code LINE_BREAK}.
 *
 * @param aOut the writer receiving the output
 * @param aSentence the sentence to serialize
 */
public void write(PrintWriter aOut, TsvSentence aSentence) {
    String sentenceText = aSentence.getUimaSentence().getCoveredText();

    // sentence text header, one prefixed line per text line
    for (String textLine : splitPreserveAllTokens(sentenceText, LINE_BREAK)) {
        aOut.print(PREFIX_TEXT);
        aOut.print(escapeText(textLine));
        aOut.print(LINE_BREAK);
    }

    // tokens, each immediately followed by its sub-tokens
    for (TsvToken tsvToken : aSentence.getTokens()) {
        write(aOut, tsvToken);
        aOut.write(LINE_BREAK);

        for (TsvSubToken tsvSubToken : tsvToken.getSubTokens()) {
            write(aOut, tsvSubToken);
            aOut.write(LINE_BREAK);
        }
    }
}
/**
 * Writes one sentence to the given output.
 * <p>
 * The covered text of the sentence is split at {@code LINE_BREAK}; each resulting piece is
 * emitted as {@code PREFIX_TEXT}, the escaped piece, and a {@code LINE_BREAK}. After that every
 * token and each of its sub-tokens is written through the matching {@code write} overload, each
 * followed by a {@code LINE_BREAK}.
 *
 * @param aOut the writer receiving the output
 * @param aSentence the sentence to serialize
 */
public void write(PrintWriter aOut, TsvSentence aSentence) {
    String sentenceText = aSentence.getUimaSentence().getCoveredText();

    // sentence text header, one prefixed line per text line
    for (String textLine : splitPreserveAllTokens(sentenceText, LINE_BREAK)) {
        aOut.print(PREFIX_TEXT);
        aOut.print(escapeText(textLine));
        aOut.print(LINE_BREAK);
    }

    // tokens, each immediately followed by its sub-tokens
    for (TsvToken tsvToken : aSentence.getTokens()) {
        write(aOut, tsvToken);
        aOut.write(LINE_BREAK);

        for (TsvSubToken tsvSubToken : tsvToken.getSubTokens()) {
            write(aOut, tsvSubToken);
            aOut.write(LINE_BREAK);
        }
    }
}
// Pass the sentence's covered text to innerTool, run its tokenizer, and read back the token strings.
// The same sentence text and those token strings are then handed to DockedTokenFinder.find.
// NOTE(review): presumably find() aligns each token string with its position in the sentence text;
// the meaning of the two boolean flags is not visible here - confirm against DockedTokenFinder.
innerTool.setSentence(sentenceAnno.getCoveredText()); innerTool.tokenize(); tokenStrings = innerTool.getTokenizedSentence(); SortedMap<Integer, DockedToken> dockedTokens = DockedTokenFinder.find(sentenceAnno.getCoveredText(), tokenStrings, false, true);
/**
 * Adds {@code Token} annotations to the CAS for every {@code Sentence} it contains.
 * <p>
 * Each sentence's covered text is segmented with a word {@link BreakIterator} for
 * {@link Locale#US}. Every segment between two consecutive boundaries is passed through
 * {@code trim}; segments that {@code isEmpty} reports as empty afterwards are skipped. The
 * remaining spans are shifted by the sentence's begin offset and added to the indexes as tokens.
 *
 * @param aJCas the CAS that is read for sentences and receives the new tokens
 */
public static void tokenize(JCas aJCas) {
    BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
    for (Sentence s : select(aJCas, Sentence.class)) {
        // Fetch the sentence text and offset once per sentence instead of once per boundary;
        // neither changes while tokens are being added.
        String sentenceText = s.getCoveredText();
        int sentenceBegin = s.getBegin();

        bi.setText(sentenceText);
        int last = bi.first();
        int cur = bi.next();
        while (cur != BreakIterator.DONE) {
            // span is relative to the sentence text; trim() adjusts it in place
            int[] span = new int[] { last, cur };
            trim(sentenceText, span);
            if (!isEmpty(span[0], span[1])) {
                // convert sentence-relative offsets to document offsets
                Token seg = new Token(aJCas, span[0] + sentenceBegin, span[1] + sentenceBegin);
                seg.addToIndexes(aJCas);
            }
            last = cur;
            cur = bi.next();
        }
    }
}
/**
 * Adds {@code Token} annotations to the CAS for every {@code Sentence} it contains.
 * <p>
 * Each sentence's covered text is segmented with a word {@link BreakIterator} for
 * {@link Locale#US}. Every segment between two consecutive boundaries is passed through
 * {@code trim}; segments that {@code isEmpty} reports as empty afterwards are skipped. The
 * remaining spans are shifted by the sentence's begin offset and added to the indexes as tokens.
 *
 * @param aJCas the CAS that is read for sentences and receives the new tokens
 */
public static void tokenize(JCas aJCas) {
    BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
    for (Sentence s : select(aJCas, Sentence.class)) {
        // Fetch the sentence text and offset once per sentence instead of once per boundary;
        // neither changes while tokens are being added.
        String sentenceText = s.getCoveredText();
        int sentenceBegin = s.getBegin();

        bi.setText(sentenceText);
        int last = bi.first();
        int cur = bi.next();
        while (cur != BreakIterator.DONE) {
            // span is relative to the sentence text; trim() adjusts it in place
            int[] span = new int[] { last, cur };
            trim(sentenceText, span);
            if (!isEmpty(span[0], span[1])) {
                // convert sentence-relative offsets to document offsets
                Token seg = new Token(aJCas, span[0] + sentenceBegin, span[1] + sentenceBegin);
                seg.addToIndexes(aJCas);
            }
            last = cur;
            cur = bi.next();
        }
    }
}
// Case-insensitive scan: the sentence text and selectedText are both lower-cased, and the loop
// visits every match position i of selectedText, resuming the search selectedText.length()
// characters after the previous match (so matches do not overlap).
// NOTE(review): if selectedText is empty, indexOf keeps returning the same index and i never
// advances, so the loop ends only if its body breaks out - confirm callers never pass "".
// NOTE(review): toLowerCase() without a Locale uses the default locale - confirm this is intended.
String sentenceText = sentence.getCoveredText().toLowerCase(); for (int i = -1; (i = sentenceText.indexOf(selectedText.toLowerCase(), i)) != -1; i = i + selectedText.length()) {
/**
 * Reads the TEI files matched by the configured pattern and checks, for every document, the
 * annotation counts and the text of the first sentence. Currently ignored.
 *
 * @throws Exception if the reader cannot be created or reading fails
 */
@Test
@Ignore("No TEI yet to opensource ")
public void testTeiReader()
    throws Exception
{
    String expectedFirstSentence = "70 I DAG.";

    CollectionReaderDescription teiReader = createReaderDescription(
            TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, "classpath:/local/",
            TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" });

    for (JCas document : new JCasIterable(teiReader)) {
        DocumentMetaData metaData = DocumentMetaData.get(document);
        String documentText = document.getDocumentText();

        // report id, size and language of the document that was read
        System.out.printf("%s - %d%n", metaData.getDocumentId(), documentText.length());
        System.out.println(document.getDocumentLanguage());

        assertEquals(2235, JCasUtil.select(document, Token.class).size());
        assertEquals(745, JCasUtil.select(document, POS.class).size());
        assertEquals(745, JCasUtil.select(document, Lemma.class).size());
        assertEquals(0, JCasUtil.select(document, NamedEntity.class).size());
        assertEquals(30, JCasUtil.select(document, Sentence.class).size());

        Sentence leadingSentence = JCasUtil.select(document, Sentence.class).iterator().next();
        assertEquals(expectedFirstSentence, leadingSentence.getCoveredText());
    }
}
}
// Case-insensitive scan: the sentence text and selectedText are both lower-cased, and the loop
// visits every match position i of selectedText, resuming the search selectedText.length()
// characters after the previous match (so matches do not overlap).
// NOTE(review): if selectedText is empty, indexOf keeps returning the same index and i never
// advances, so the loop ends only if its body breaks out - confirm callers never pass "".
// NOTE(review): toLowerCase() without a Locale uses the default locale - confirm this is intended.
String sentenceText = sentence.getCoveredText().toLowerCase(); for (int i = -1; (i = sentenceText.indexOf(selectedText.toLowerCase(), i)) != -1; i = i + selectedText.length()) {
// Case-insensitive scan: the sentence text and selectedText are both lower-cased, and the loop
// visits every match position i of selectedText, resuming the search selectedText.length()
// characters after the previous match (so matches do not overlap).
// NOTE(review): if selectedText is empty, indexOf keeps returning the same index and i never
// advances, so the loop ends only if its body breaks out - confirm callers never pass "".
// NOTE(review): toLowerCase() without a Locale uses the default locale - confirm this is intended.
String sentenceText = sentence.getCoveredText().toLowerCase(); for (int i = -1; (i = sentenceText.indexOf(selectedText.toLowerCase(), i)) != -1; i = i + selectedText.length()) {
/**
 * Reads the Brown test corpus, verifies annotation counts and the first sentence of the first
 * document, and checks that exactly three documents are produced.
 *
 * @throws Exception if the reader cannot be created or reading fails
 */
@Test
public void brownReaderTest()
    throws Exception
{
    String expectedFirstSentence = "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . ";

    CollectionReader brownReader = createCollectionReader(
            BrownCorpusReader.class,
            BrownCorpusReader.PARAM_PATH, "src/test/resources/test_corpora/brown/",
            BrownCorpusReader.PARAM_PATTERNS,
            new String[] { ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.xml" });

    int documentsRead = 0;
    for (JCas document : new JCasIterable(brownReader)) {
        // detailed expectations exist only for the first document
        boolean firstDocument = documentsRead == 0;
        documentsRead++;
        if (!firstDocument) {
            continue;
        }

        assertEquals(2239, JCasUtil.select(document, Token.class).size());
        assertEquals(2239, JCasUtil.select(document, POS.class).size());
        assertEquals(98, JCasUtil.select(document, Sentence.class).size());
        assertEquals(expectedFirstSentence,
                JCasUtil.select(document, Sentence.class).iterator().next().getCoveredText());
    }

    assertEquals(3, documentsRead);
}
/**
 * Segments one zone of the document into sentences and tokens.
 * <p>
 * When sentence writing is enabled, the sentence model is run on the zone text and a sentence is
 * created for each detected span. When token writing is enabled, the token model is run on the
 * covered text of every {@code Sentence} lying inside the zone and a token is created for each
 * resulting span. Model spans are relative to the text they were computed on, so they are
 * shifted by the zone begin or the sentence begin respectively.
 *
 * @param aJCas the CAS receiving the annotations
 * @param aText the text of the zone
 * @param aZoneBegin the offset of the zone within the document
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void process(JCas aJCas, String aText, int aZoneBegin)
    throws AnalysisEngineProcessException
{
    if (isWriteSentence()) {
        for (Span sentenceSpan : sentenceModelProvider.getResource().sentPosDetect(aText)) {
            int begin = sentenceSpan.getStart() + aZoneBegin;
            int end = sentenceSpan.getEnd() + aZoneBegin;
            createSentence(aJCas, begin, end);
        }
    }

    if (isWriteToken()) {
        int zoneEnd = aZoneBegin + aText.length();
        for (Sentence zoneSentence : selectCovered(aJCas, Sentence.class, aZoneBegin, zoneEnd)) {
            Span[] tokenSpans = tokenModelProvider.getResource()
                    .tokenizePos(zoneSentence.getCoveredText());
            for (Span tokenSpan : tokenSpans) {
                int begin = tokenSpan.getStart() + zoneSentence.getBegin();
                int end = tokenSpan.getEnd() + zoneSentence.getBegin();
                createToken(aJCas, begin, end);
            }
        }
    }
}
}
/**
 * Reads the Tiger test corpus, verifies annotation counts, first sentence and document title of
 * the first document, and checks that exactly twenty documents are produced.
 *
 * @throws Exception if the reader cannot be created or reading fails
 */
@Test
public void tigerTest()
    throws Exception
{
    String expectedFirstSentence = "`` Ross Perot wäre vielleicht ein prächtiger Diktator '' ";

    CollectionReader tigerReader = createCollectionReader(
            TigerCorpusReader.class,
            TigerCorpusReader.PARAM_FILE, "src/test/resources/test_corpora/tiger/tiger.txt");

    int documentsRead = 0;
    for (JCas document : new JCasIterable(tigerReader)) {
        // detailed expectations exist only for the first document
        boolean firstDocument = documentsRead == 0;
        documentsRead++;
        if (!firstDocument) {
            continue;
        }

        assertEquals(9, JCasUtil.select(document, Token.class).size());
        assertEquals(9, JCasUtil.select(document, Lemma.class).size());
        assertEquals(9, JCasUtil.select(document, POS.class).size());
        assertEquals(1, JCasUtil.select(document, Sentence.class).size());
        assertEquals(expectedFirstSentence,
                JCasUtil.select(document, Sentence.class).iterator().next().getCoveredText());
        assertEquals("Sentence 1", DocumentMetaData.get(document).getDocumentTitle());
    }

    assertEquals(20, documentsRead);
}
}
/**
 * Reads the Wacky (DEWAC) test corpus, prints the text of every document, verifies annotation
 * counts, first sentence and document title of the first document, and checks that exactly four
 * documents are produced.
 *
 * @throws Exception if the reader cannot be created or reading fails
 */
@Test
public void wackyTest()
    throws Exception
{
    String expectedFirstSentence = "Nikita ( La Femme Nikita ) Dieser Episodenführer wurde von September 1998 bis Mai 1999 von Konstantin C.W. Volkmann geschrieben und im Mai 2000 von Stefan Börzel übernommen . ";

    CollectionReader wackyReader = createCollectionReader(
            WackyCorpusReader.class,
            WackyCorpusReader.PARAM_PATH, "src/test/resources/test_corpora/wacky/",
            WackyCorpusReader.PARAM_LANGUAGE_EDITION, WackyLanguageEdition.DEWAC.name());

    int documentsRead = 0;
    for (JCas document : new JCasIterable(wackyReader)) {
        System.out.println(document.getDocumentText());

        // detailed expectations exist only for the first document
        boolean firstDocument = documentsRead == 0;
        documentsRead++;
        if (!firstDocument) {
            continue;
        }

        assertEquals(11406, JCasUtil.select(document, Token.class).size());
        assertEquals(11406, JCasUtil.select(document, Lemma.class).size());
        assertEquals(11406, JCasUtil.select(document, POS.class).size());
        assertEquals(717, JCasUtil.select(document, Sentence.class).size());
        assertEquals(expectedFirstSentence,
                JCasUtil.select(document, Sentence.class).iterator().next().getCoveredText());
        assertEquals("\"http://www.epguides.de/nikita.htm\"",
                DocumentMetaData.get(document).getDocumentTitle());
    }

    assertEquals(4, documentsRead);
}