/**
 * Queues one insert row per annotation of type {@code annotationClass} found in the CAS.
 *
 * <p>Parameter 1 of the prepared statement is the document id; parameters 2..n+1 are the
 * values obtained by invoking {@code annotationMethods.get(i)} on the annotation. The batch
 * is executed each time the running {@code inserted} counter reaches a multiple of 1000;
 * rows still pending after the loop are not executed by this method.
 *
 * <p>Any failure is logged together with the document id and then swallowed, so this
 * method does not propagate errors to the caller.
 *
 * @param jCas the CAS whose annotations are written
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    final int pmId = getHeaderIntDocId(jCas);
    try {
        for (Annotation annotation : select(jCas, annotationClass)) {
            // First column always carries the document id.
            preparedStatement.setInt(1, pmId);

            int fieldIdx = 0;
            while (fieldIdx < annotationFields.length) {
                // Annotation fields start at column 2, right after the document id.
                final int column = fieldIdx + 2;
                try {
                    final Object fieldValue = annotationMethods.get(fieldIdx).invoke(annotation);
                    preparedStatement.setObject(column, fieldValue);
                } catch (Exception cause) {
                    throw new Exception("Could not assign field " + column, cause);
                }
                fieldIdx++;
            }

            preparedStatement.addBatch();
            inserted++;
            if (inserted % 1000 == 0) {
                preparedStatement.executeBatch();
            }
        }
    } catch (Exception failure) {
        LOG.error("could not insert " + pmId, failure);
    }
}
/**
 * Debug helper that prints to stdout a short preview of the document text starting at
 * {@code begin}.
 *
 * <ul>
 * <li>{@code begin < 0}: prints a "-1" marker line with the document id.</li>
 * <li>{@code 0 <= begin < text length}: prints up to 50 characters of text starting at
 * {@code begin}, prefixed with the description, document id and offset.</li>
 * <li>otherwise (offset past the end, or no document text): prints nothing.</li>
 * </ul>
 *
 * @param desc  label printed in front of the preview
 * @param cas   CAS providing the document text and document id
 * @param begin start offset of the section in the document text
 */
private void printSectionStart(String desc, JCas cas, int begin) {
    final int previewLength = 50;
    String str = cas.getDocumentText();
    if (begin < 0) {
        System.out.println("AAAA - " + desc + " - -1 - "
                + BlueCasUtil.getHeaderDocId(cas));
        return;
    }
    // No text at all, or offset beyond the text: nothing to preview.
    if (str == null || begin >= str.length()) {
        return;
    }
    // substring's end index is exclusive, so clamp to length() (not length() - 1),
    // otherwise the last character of the document is dropped from the preview.
    int end = Math.min(begin + previewLength, str.length());
    System.out.println("AAAA - " + desc + " (" + BlueCasUtil.getHeaderDocId(cas)
            + ":" + begin + "): " + str.substring(begin, end));
}
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { int pmid = BlueCasUtil.getHeaderIntDocId(jCas); if (!BlueCasUtil.isEmptyText(jCas)) { // System.out.println("indexing:: " + pmid); Document doc = new Document(); doc.add(new IntField(PMID_FIELD, pmid, Store.YES)); doc.add(new TextField(CONTENT_FIELD, jCas.getDocumentText(), Store.YES)); doc.add(new TextField(TITLE_FIELD, getTitle(jCas), Store.YES)); try { indexWriter.addDocument(doc); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } }
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { String title = getTitle(jCas); String text = jCas.getDocumentText(); // add title to text if too small if (text.length() < minTextLenght && title.length() > 0) { text = title + " " + text; } // only detect if text is long enough if (text != null && text.length() > minTextLenght) { // TODO maybe cut if text too long --> slower try { jCas.setDocumentLanguage(detect(text)); } catch (LangDetectException e) { LOG.warn("error detecting language for {}, {}", getHeaderDocId(jCas), e); } } }
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { int pmId = getHeaderIntDocId(jCas); try { if (PubmedArticleEntity.findFirst(PUBMED_ID + " = ?", pmId) == null) { PubmedArticleEntity a = new PubmedArticleEntity(); a.set(PUBMED_ID, pmId); a.set(ABSTRACT, jCas.getDocumentText()); try { Date date = JCasUtil.selectSingle(jCas, Date.class); a.set(PUBLISHED_DATE, date.getYear() + "-" + date.getMonth() + "-" + date.getDay()); } catch (Exception e) {// nope LOG.warn("could not add date to " + pmId, e); } a.set(TITLE, StringUtils.snippetizeAtSpace(getTitle(jCas), 510)); a.saveIt(); inserted++; if (processed++ % 10000 == 0) LOG.debug("processed {}\tinserted {}", processed, inserted); } } catch (Exception e) { LOG.error("could not insert " + pmId, e); } }
public void process(JCas jCas) throws AnalysisEngineProcessException { if (BlueCasUtil.isEmptyText(jCas)) return; .selectCovered(jCas.getCas(), s); + "' in pmid " + getHeaderDocId(jCas), e);
for (Annotation filteredAnnotOccurrence : select(jCas, filteredAnnotationClass)) { boolean overlap = BlueCasUtil.distance(protectedOccurrence, filteredAnnotOccurrence) == -1; if (overlap) {
/**
 * Runs {@code classify} on the covered text of every {@code DocumentBlock} in the CAS and
 * sets the block label to {@code SECTION_REFERENCES_ENTRY} when the returned label equals
 * {@code LABEL_INSIDE}.
 *
 * <p>Any exception stops the loop for this CAS; it is logged as a warning together with
 * the document id and is not propagated.
 *
 * @param jCas the CAS whose document blocks are classified
 */
public void process(JCas jCas) throws AnalysisEngineProcessException {
    try {
        for (DocumentBlock candidate : select(jCas, DocumentBlock.class)) {
            final String predicted = classify(candidate.getCoveredText());
            if (!predicted.equals(LABEL_INSIDE)) {
                continue; // other labels leave the block untouched
            }
            candidate.setLabel(SECTION_REFERENCES_ENTRY);
        }
    } catch (Exception failure) {
        final int pmId = BlueCasUtil.getHeaderIntDocId(jCas);
        LOG.warn("could not perform inference on " + pmId, failure);
    }
}
Cooccurrence[] array = coocs.toArray(new Cooccurrence[coocs.size()]); Preconditions.checkArgument(predictedLabels.size() == coocs.size(), "pmid" + getHeaderDocId(jCas) + " should have same # of elems, but was: coocs=" + coocs.size() + " and predictedLabels="
/**
 * Logs, at debug level, one tab-separated line describing the species found in the CAS.
 *
 * <p>The line starts with the document id and the family name of the single
 * {@code DocumentSpecies} annotation, followed by one pair per {@code LinnaeusSpecies}
 * annotation: its most probable species id and its covered text. Tabs inside the covered
 * text are replaced by spaces so they cannot break the column layout.
 *
 * @param jCas the CAS to report on
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    final int pmId = getHeaderIntDocId(jCas);
    final DocumentSpecies docSpecies = selectSingle(jCas, DocumentSpecies.class);

    final StringBuilder line = new StringBuilder(pmId + "\t" + docSpecies.getFamilyName());
    for (LinnaeusSpecies mention : select(jCas, LinnaeusSpecies.class)) {
        final String idColumn = "\t" + mention.getMostProbableSpeciesId();
        final String textColumn = "\t" + mention.getCoveredText().replace('\t', ' ');
        line.append(idColumn).append(textColumn);
    }

    LOG.debug(line.toString());
}
public static void expandAbbreviations(JCas jCas) { String pmId = getHeaderDocId(jCas); // otherwise was very slow Map<Abbreviation, List<Annotation>> cache = newHashMap(); List<Abbreviation> tmp = newLinkedList(select(jCas, Abbreviation.class)); for (Abbreviation abrev : tmp) { Annotation reference = abrev.getTextReference(); if (reference != null && reference instanceof Abbreviation) { Abbreviation aRef = (Abbreviation) reference; List<Annotation> covereds; if (cache.containsKey(aRef)) covereds = cache.get(aRef); else { covereds = getCovered(jCas, aRef, pmId); cache.put(aRef, covereds); } // copy them to the other abbreviation short-forms for (Annotation covered : covereds) { Annotation clone = (Annotation) covered.clone(); clone.setBegin(abrev.getBegin()); clone.setEnd(abrev.getEnd()); clone.addToIndexes(jCas); if (!clone.getCoveredText().equals(aRef.getCoveredText())) LOG.warn("'{}' not matching2 '{}' in " + pmId, clone.getCoveredText(), aRef.getCoveredText()); } } } }
public void process(JCas jCas) throws AnalysisEngineProcessException { int pmId = getHeaderIntDocId(jCas);
List<SentenceExample> retSentences = newArrayList(); String pmId = getHeaderDocId(jCas); int sentenceId = 0;
public void process(JCas jCas) throws AnalysisEngineProcessException { int pmId = getHeaderIntDocId(jCas);
LOG.debug("Wordnet exception while processing >" + t.getCoveredText() + "< [" + t.getBegin() + ":" + t.getEnd() + "] from doc " + getHeaderDocId(jCas));
getHeaderIntDocId(jCas) + ""); BasicDBObject updateCommands = new BasicDBObject(); updateCommands.put("$set", dbLists);
public void process(JCas jCas) throws AnalysisEngineProcessException { int pmId = BlueCasUtil.getHeaderIntDocId(jCas);