/**
 * Collects the text of the document's page tree into a single string.
 *
 * <p>The text is accumulated by {@code extractText(...)} into a local buffer.
 * {@code close()} is always invoked afterwards, even when extraction fails.
 *
 * @return the content accumulated in the buffer by {@code extractText(...)}
 * @throws IOException declared by this method; propagated from the calls it makes
 */
public String extractPlainText() throws IOException {
    StringBuilder plainText = new StringBuilder();
    try {
        extractText(getPDDocument(getDocument()).getPageTree(), plainText);
    }
    finally {
        // Release the document regardless of the extraction outcome.
        close();
    }
    return plainText.toString();
}
/**
 * Try to open the provided file with JPod and wrap it in a loader.
 *
 * <p>Failures to open or parse the file are logged as warnings and reported
 * to the caller as a {@code null} result rather than as an exception.
 *
 * @param imgPath the provided (PDF) input file.
 * @return proper (JPod) loader or null if failed
 */
private static Loader getJPodLoader (Path imgPath) {
    logger.debug("getJPodLoader {}", imgPath);

    final PDDocument document;

    try {
        document = PDDocument.createFromLocator(new FileLocator(imgPath.toFile()));
    } catch (IOException ex) {
        logger.warn("Error opening pdf file " + imgPath, ex);
        return null;
    } catch (COSLoadException ex) {
        logger.warn("Invalid pdf file " + imgPath, ex);
        return null;
    }

    if (document == null) {
        return null;
    }

    // The loader is given the number of pages in the page tree as its image count.
    return new JPodLoader(document, document.getPageTree().getCount());
}
// Fetch the page at index (id - 1) of the page tree; id is presumably 1-based -- confirm against the caller.
PDPage page = doc.getPageTree().getPageAt(id - 1);
// Crop box of that page, converted to a normalized rectangle.
Rectangle2D rect = page.getCropBox().toNormalizedRectangle();
// Rotation value stored on the page (presumably in degrees -- TODO confirm).
int rotation = page.getRotate();
private Integer getPageFromCOSArray(COSArray destination) { // DOCEAR: fallback if no entry was found if (destination == null) { return 1; } Iterator<?> it = destination.iterator(); while (it.hasNext()) { COSObject o = (COSObject) it.next(); if (o.isIndirect()) { // the page is indirect referenced o.dereference(); } PDPage page = getDocument().getPageTree().getFirstPage(); while (page != null) { if (page.cosGetObject().equals(o)) { return page.getNodeIndex() + 1; } page = page.getNextPage(); } } return null; }
String title = null; try { PDPage page = getPDDocument(getDocument()).getPageTree().getFirstPage(); if (page.isPage()) { try {
/**
 * Best-effort computation of {@code uniqueHash} from the images of a page.
 *
 * <p>The first page of the document is used, or the following page when the
 * first page has no content entries. If text extraction on that page yields
 * no entry, the page's images are run through a
 * {@link UniqueImageHashExtractor} and its result is stored in
 * {@code uniqueHash}. If text is found, {@code uniqueHash} is left untouched.
 *
 * <p>{@code close()} is always invoked before this method returns.
 *
 * @throws IOException declared by this method; propagated from opening or
 *         closing the document
 */
private void onlyHashExtraction() throws IOException {
    try {
        PDPage page = getPDDocument(getDocument()).getPageTree().getFirstPage();
        // getFirstPage() may return null (e.g. a document without pages);
        // there is nothing to hash in that case.
        if (page == null || !page.isPage()) {
            return;
        }
        try {
            if (!page.cosGetContents().basicIterator().hasNext()) {
                // First page has no content entries: fall back to the next page.
                page = page.getNextPage();
                if (page == null) {
                    // No further page to inspect.
                    return;
                }
            }
            TreeMap<PdfTextEntity, StringBuilder> map = tryTextExtraction(page);
            Entry<PdfTextEntity, StringBuilder> entry = map.firstEntry();
            if (entry == null) {
                // No text on the page: derive the hash from its images instead.
                UniqueImageHashExtractor handler = new UniqueImageHashExtractor();
                tryImageExtraction(page, handler);
                uniqueHash = handler.getUniqueHash();
            }
        }
        catch (Exception ignored) {
            // Deliberately best-effort: any extraction failure leaves
            // uniqueHash unchanged instead of failing the caller.
        }
    }
    finally {
        close();
    }
}