@Override public void process(JCas jCas) throws AnalysisEngineProcessException { Header header = selectSingle(jCas, Header.class); File pdfFile = new File(header.getSource()); checkFileExists(pdfFile); LOG.debug("extracting {}", pdfFile.getName()); try { PDFTextStream pdf; if (pdfFile.getName().endsWith("zip")) { InputStream is = unzipUniqueFileAsStream(pdfFile); pdf = new PDFTextStream(is, removeExtension(pdfFile.getName())); } else { pdf = new PDFTextStream(pdfFile); } BlockHandler blueHandler = new BlockHandler(); pdf.pipe(blueHandler); pdf.close(); PdfCollectionReader.extractText(jCas, blueHandler.getDoc(), header.getDocId(), expandAbbrevs); if (extractTables) PdfCollectionReader .extractTables(tableExtractor, pdfFile, jCas); // if (extractReferences) // extractReferences(f, jcas); } catch (Throwable t) { LOG.error("error extracting " + header.getSource(), t); // throw new AnalysisEngineProcessException(e); } }
@Override public void getNext(JCas jcas) throws IOException, CollectionException { File f = fileIterator.next(); Header header = new Header(jcas); // .* removes the tmp part header.setDocId(f.getName().replaceAll("\\.pdf.*", "")); header.setSource(f.getAbsolutePath()); header.addToIndexes(); PDFTextStream pdf = new PDFTextStream(f); BlockHandler blueHandler = new BlockHandler(); pdf.pipe(blueHandler); pdf.close(); extractText(jcas, blueHandler.getDoc(), header.getDocId(), expandAbbrevs); if (extractTables) extractTables(tableExtractor, f, jcas); // printHtml(jcas, new File("target/" + header.getDocId() + ".html")); }