@Override public void process(JCas jCas) throws AnalysisEngineProcessException { String title = getTitle(jCas); String text = jCas.getDocumentText(); // add title to text if too small if (text.length() < minTextLenght && title.length() > 0) { text = title + " " + text; } // only detect if text is long enough if (text != null && text.length() > minTextLenght) { // TODO maybe cut if text too long --> slower try { jCas.setDocumentLanguage(detect(text)); } catch (LangDetectException e) { LOG.warn("error detecting language for {}, {}", getHeaderDocId(jCas), e); } } }
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { int pmid = BlueCasUtil.getHeaderIntDocId(jCas); if (!BlueCasUtil.isEmptyText(jCas)) { // System.out.println("indexing:: " + pmid); Document doc = new Document(); doc.add(new IntField(PMID_FIELD, pmid, Store.YES)); doc.add(new TextField(CONTENT_FIELD, jCas.getDocumentText(), Store.YES)); doc.add(new TextField(TITLE_FIELD, getTitle(jCas), Store.YES)); try { indexWriter.addDocument(doc); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } }
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { int pmId = getHeaderIntDocId(jCas); try { if (PubmedArticleEntity.findFirst(PUBMED_ID + " = ?", pmId) == null) { PubmedArticleEntity a = new PubmedArticleEntity(); a.set(PUBMED_ID, pmId); a.set(ABSTRACT, jCas.getDocumentText()); try { Date date = JCasUtil.selectSingle(jCas, Date.class); a.set(PUBLISHED_DATE, date.getYear() + "-" + date.getMonth() + "-" + date.getDay()); } catch (Exception e) {// nope LOG.warn("could not add date to " + pmId, e); } a.set(TITLE, StringUtils.snippetizeAtSpace(getTitle(jCas), 510)); a.saveIt(); inserted++; if (processed++ % 10000 == 0) LOG.debug("processed {}\tinserted {}", processed, inserted); } } catch (Exception e) { LOG.error("could not insert " + pmId, e); } }