public class PDFReader{ public static void main(String args[]) { PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File("C:/my.pdf"); try { PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(5); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
/**
 * Processes one page: starts it through the superclass, runs
 * {@code detectAnglesAndProcessPage}, and always ends the page through the
 * superclass, even when the earlier steps fail.
 *
 * @param page the page to process
 * @throws IOException if {@code handleCatchableIOE} rethrows, or if
 *                     {@code super.endPage} fails
 */
@Override
public void processPage(PDPage page) throws IOException {
    try {
        super.startPage(page);
        detectAnglesAndProcessPage(page);
    } catch (IOException ioe) {
        // Let the shared handler decide whether this failure is fatal.
        handleCatchableIOE(ioe);
    } finally {
        // The page is ended regardless of how the steps above finished.
        super.endPage(page);
    }
}
/**
 * Delegates page processing to the superclass and routes any
 * {@link IOException} it raises through {@code handleCatchableIOE}.
 *
 * @param page the page to process
 * @throws IOException if {@code handleCatchableIOE} rethrows
 */
@Override
public void processPage(PDPage page) throws IOException {
    try {
        super.processPage(page);
    } catch (IOException ioe) {
        handleCatchableIOE(ioe);
    }
}
/**
 * Copies this configuration's text-extraction settings onto the given
 * {@code PDF2XHTML}: sort-by-position, word separator, the optional
 * character/spacing tolerances, and duplicate-text suppression.
 *
 * @param pdf2XHTML the instance to configure
 */
public void configure(PDF2XHTML pdf2XHTML) {
    pdf2XHTML.setSortByPosition(getSortByPosition());

    // Auto-space on => single-space separator; off => no separator at all.
    pdf2XHTML.setWordSeparator(getEnableAutoSpace() ? " " : "");

    // Tolerances are only applied when they have been set.
    if (getAverageCharTolerance() != null) {
        pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
    }
    if (getSpacingTolerance() != null) {
        pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
    }

    pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}
/**
 * Returns {@code value} run through a {@code PDFEncodedStringDecoder} when
 * {@code PDFEncodedStringDecoder.shouldDecode} says it needs decoding;
 * otherwise returns {@code value} unchanged.
 *
 * @param value the raw string
 * @return the decoded string, or {@code value} itself when no decoding applies
 */
private String decode(String value) {
    if (!PDFEncodedStringDecoder.shouldDecode(value)) {
        return value;
    }
    return new PDFEncodedStringDecoder().decode(value);
}
/**
 * Processes one page by starting it, running OCR on the current page and
 * then ending it.
 * <p>
 * A {@code TikaException} or {@code SAXException} is rethrown wrapped in an
 * {@code IOExceptionWithCause}; an {@link IOException} is handed to
 * {@code handleCatchableIOE}.
 *
 * @param pdPage the page to process
 * @throws IOException wrapping a {@code TikaException}/{@code SAXException},
 *                     or if {@code handleCatchableIOE} rethrows
 */
@Override
public void processPage(PDPage pdPage) throws IOException {
    try {
        startPage(pdPage);
        doOCROnCurrentPage();
        endPage(pdPage);
    } catch (TikaException | SAXException cause) {
        throw new IOExceptionWithCause(cause);
    } catch (IOException ioe) {
        handleCatchableIOE(ioe);
    }
}
/**
 * Forwards the inline-image extraction flag to {@code defaultConfig}.
 *
 * @param extractInlineImages value passed to
 *                            {@code defaultConfig.setExtractInlineImages}
 */
@Field
void setExtractInlineImages(boolean extractInlineImages) {
    defaultConfig.setExtractInlineImages(extractInlineImages);
}
/**
 * Installs a new {@code AccessChecker}, built from the given flag, on
 * {@code defaultConfig}.
 *
 * @param allowExtractionForAccessibility flag handed to the
 *                                        {@code AccessChecker} constructor
 */
@Field
void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) {
    AccessChecker checker = new AccessChecker(allowExtractionForAccessibility);
    defaultConfig.setAccessChecker(checker);
}
/**
 * If true, the parser should try to remove duplicated
 * text over the same region. This is needed for some
 * PDFs that achieve bolding by re-writing the same
 * text in the same area. Note that this can
 * slow down extraction substantially (PDFBOX-956) and
 * sometimes remove characters that were not in fact
 * duplicated (PDFBOX-1155). By default this is disabled.
 *
 * @param v {@code true} to suppress duplicate overlapping text
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setSuppressDuplicateOverlappingText(boolean v) {
    defaultConfig.setSuppressDuplicateOverlappingText(v);
}
/**
 * If true (the default), text in annotations will be
 * extracted.
 *
 * @param v {@code true} to extract annotation text
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setExtractAnnotationText(boolean v) {
    defaultConfig.setExtractAnnotationText(v);
}
/**
 * If true (the default), the parser should estimate
 * where spaces should be inserted between words. For
 * many PDFs this is necessary as they do not include
 * explicit whitespace characters.
 *
 * @param v {@code true} to enable automatic space insertion
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setEnableAutoSpace(boolean v) {
    defaultConfig.setEnableAutoSpace(v);
}
/**
 * If true, text in annotations will be extracted.
 *
 * @return the value reported by {@code defaultConfig.getExtractAnnotationText()}
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getExtractAnnotationText() {
    return defaultConfig.getExtractAnnotationText();
}
/**
 * Creates a configuration by handing the stream to {@code init}.
 * <p>
 * Properties are loaded from the stream, after which an attempt is made to
 * close it. If an IOException occurs it is silently swallowed and the
 * defaults are used instead.
 *
 * @param is stream from which the properties are read
 */
public PDFParserConfig(InputStream is) {
    init(is);
}
/**
 * Sets, from its string form, the image type used to render the page
 * image for OCR. The string is converted with {@code parseImageType}.
 *
 * @param ocrImageTypeString textual name of the image type
 * @see #setOcrImageType(ImageType)
 */
public void setOcrImageType(String ocrImageTypeString) {
    this.ocrImageType = parseImageType(ocrImageTypeString);
}
/**
 * Returns the auto-space setting held by {@code defaultConfig}.
 *
 * @return the value reported by {@code defaultConfig.getEnableAutoSpace()}
 * @see #setEnableAutoSpace(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getEnableAutoSpace() {
    return defaultConfig.getEnableAutoSpace();
}
/**
 * Returns the sort-by-position setting held by {@code defaultConfig}.
 *
 * @return the value reported by {@code defaultConfig.getSortByPosition()}
 * @see #setSortByPosition(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getSortByPosition() {
    return defaultConfig.getSortByPosition();
}
/**
 * Returns the duplicate-overlapping-text suppression setting held by
 * {@code defaultConfig}.
 *
 * @return the value reported by
 *         {@code defaultConfig.getSuppressDuplicateOverlappingText()}
 * @see #setSuppressDuplicateOverlappingText(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getSuppressDuplicateOverlappingText() {
    return defaultConfig.getSuppressDuplicateOverlappingText();
}
/**
 * Adds {@code value}, passed through {@code decode}, to {@code metadata}
 * under {@code name}. Does nothing when {@code value} is {@code null}.
 *
 * @param metadata the metadata object to add to
 * @param name     the key under which the value is added
 * @param value    the raw value; ignored when {@code null}
 */
private void addMetadata(Metadata metadata, String name, String value) {
    if (value == null) {
        return;
    }
    metadata.add(name, decode(value));
}
/**
 * Forwards the OCR image type, given as a string, to {@code defaultConfig}.
 *
 * @param imageType value passed to {@code defaultConfig.setOcrImageType}
 */
@Field
public void setOcrImageType(String imageType) {
    defaultConfig.setOcrImageType(imageType);
}
/**
 * Forwards the extract-actions flag to {@code defaultConfig}.
 *
 * @param extractActions value passed to {@code defaultConfig.setExtractActions}
 */
@Field
void setExtractActions(boolean extractActions) {
    defaultConfig.setExtractActions(extractActions);
}