/**
 * Attempts to open the given PDF file through JPod.
 *
 * @param imgPath path of the (PDF) input file
 * @return a JPod-backed loader, or null if the document could not be opened
 */
private static Loader getJPodLoader (Path imgPath)
{
    logger.debug("getJPodLoader {}", imgPath);

    final PDDocument doc;

    try {
        doc = PDDocument.createFromLocator(new FileLocator(imgPath.toFile()));
    } catch (IOException ex) {
        // The file could not be read at all
        logger.warn("Error opening pdf file " + imgPath, ex);

        return null;
    } catch (COSLoadException ex) {
        // The file was read but is not a loadable PDF
        logger.warn("Invalid pdf file " + imgPath, ex);

        return null;
    }

    if (doc == null) {
        return null;
    }

    // The number of images handed to the loader is the page count of the document
    return new JPodLoader(doc, doc.getPageTree().getCount());
}
/**
 * Recursively walks a page tree and appends the text content of every page
 * to the given buffer, in the order the kids are listed.
 *
 * @param pageTree the page tree (or sub-tree) to traverse
 * @param sb       the buffer receiving the extracted text
 */
private void extractText(PDPageTree pageTree, StringBuilder sb) {
    Iterator<?> kids = pageTree.getKids().iterator();
    while (kids.hasNext()) {
        PDPageNode kid = (PDPageNode) kids.next();

        // Intermediate nodes are sub-trees: descend into them.
        if (!kid.isPage()) {
            extractText((PDPageTree) kid, sb);
            continue;
        }

        try {
            CSTextExtractor textExtractor = new CSTextExtractor();
            PDPage leaf = (PDPage) kid;

            // Set up the device transform for this page before interpreting it.
            AffineTransform deviceTx = new AffineTransform();
            PDFGeometryTools.adjustTransform(deviceTx, leaf);
            textExtractor.setDeviceTransform(deviceTx);

            // Run the page's content stream through the extractor.
            new CSDeviceBasedInterpreter(null, textExtractor)
                    .process(leaf.getContentStream(), leaf.getResources());

            sb.append(textExtractor.getContent());
        } catch (CSException e) {
            // A failing page is reported and skipped; traversal continues.
            e.printStackTrace();
        }
    }
}
// Fetch the page at position (id - 1) in the document's page tree, then read its
// crop box (as a normalized rectangle) and its rotation value.
// NOTE(review): the decrement suggests `id` is 1-based while getPageAt is 0-based — confirm with the caller.
PDPage page = doc.getPageTree().getPageAt(id - 1); Rectangle2D rect = page.getCropBox().toNormalizedRectangle(); int rotation = page.getRotate();
private Integer getPageFromCOSArray(COSArray destination) { // DOCEAR: fallback if no entry was found if (destination == null) { return 1; } Iterator<?> it = destination.iterator(); while (it.hasNext()) { COSObject o = (COSObject) it.next(); if (o.isIndirect()) { // the page is indirect referenced o.dereference(); } PDPage page = getDocument().getPageTree().getFirstPage(); while (page != null) { if (page.cosGetObject().equals(o)) { return page.getNodeIndex() + 1; } page = page.getNextPage(); } } return null; }
String title = null; try { PDPage page = getPDDocument(getDocument()).getPageTree().getFirstPage(); if (page.isPage()) { try {
/**
 * Runs the extraction on a single page for the sake of the unique hash only.
 *
 * <p>The first page is used unless its contents are empty, in which case the
 * following page is used instead. Text extraction is attempted first; if it
 * yields no entry, image extraction is run and {@code uniqueHash} is taken
 * from the image handler.
 * NOTE(review): when text IS found, nothing is assigned here — presumably
 * {@code tryTextExtraction} records the hash itself; confirm.
 *
 * <p>Extraction is best-effort: failures while processing the page are
 * ignored. A document without a usable page is a no-op. {@code close()} is
 * always called before returning.
 *
 * @throws IOException declared for callers; see {@code getPDDocument} / {@code close}
 */
private void onlyHashExtraction() throws IOException {
    try {
        PDPage page = getPDDocument(getDocument()).getPageTree().getFirstPage();
        // An empty page tree has no first page: nothing to hash.
        if (page == null || !page.isPage()) {
            return;
        }

        try {
            // Skip a first page that has no content entries.
            if (!page.cosGetContents().basicIterator().hasNext()) {
                page = page.getNextPage();
            }
            // The skipped page was the last one: nothing left to examine.
            if (page == null) {
                return;
            }

            TreeMap<PdfTextEntity, StringBuilder> map = tryTextExtraction(page);
            Entry<PdfTextEntity, StringBuilder> entry = map.firstEntry();
            if (entry == null) {
                // No text on the page: fall back to an image-based hash.
                UniqueImageHashExtractor handler = new UniqueImageHashExtractor();
                tryImageExtraction(page, handler);
                uniqueHash = handler.getUniqueHash();
            }
        } catch (Exception ignored) {
            // Deliberately best-effort: a page that cannot be processed simply
            // leaves the hash as it was.
        }
    } finally {
        close();
    }
}