org.apache.tika.parser.pdf Java code examples

 /**
  * Minimal PDFBox example: parses a PDF from a fixed path, extracts the text of
  * pages 1 through 5 and prints it to standard output.
  *
  * <p>All resources opened here (the file stream and the parsed document) are
  * released before {@code main} returns, whether or not extraction succeeds.
  */
 public class PDFReader {
  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
    File file = new File("C:/my.pdf");
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    // try-with-resources guarantees the file handle is released.
    try (FileInputStream in = new FileInputStream(file)) {
      PDFParser parser = new PDFParser(in);
      parser.parse();
      cosDoc = parser.getDocument();
      PDFTextStripper pdfStripper = new PDFTextStripper();
      pdDoc = new PDDocument(cosDoc);
      pdfStripper.setStartPage(1);
      pdfStripper.setEndPage(5);
      String parsedText = pdfStripper.getText(pdDoc);
      System.out.println(parsedText);
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        // Closing the PDDocument is expected to release its COSDocument as well;
        // fall back to closing the COSDocument directly if wrapping never happened.
        if (pdDoc != null) {
          pdDoc.close();
        } else if (cosDoc != null) {
          cosDoc.close();
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
 }

/**
 * Processes one page by calling {@code super.startPage}, then
 * {@code detectAnglesAndProcessPage}, and finally {@code super.endPage}.
 *
 * @param page the page to process
 * @throws IOException declared by the overridden method; an IOException raised
 *     inside the try block is first handed to {@code handleCatchableIOE}
 */
@Override
public void processPage(PDPage page) throws IOException {
  try {
    super.startPage(page);
    detectAnglesAndProcessPage(page);
  } catch (IOException e) {
    // Hand the failure to handleCatchableIOE (defined outside this snippet)
    // instead of rethrowing it here.
    handleCatchableIOE(e);
  } finally {
    // Runs whether or not the steps above threw.
    super.endPage(page);
  }
}

/**
 * Delegates page processing to the superclass, routing any IOException it
 * raises through {@code handleCatchableIOE}.
 *
 * @param page the page to process
 * @throws IOException declared by the overridden method
 */
@Override
public void processPage(PDPage page) throws IOException {
  try {
    super.processPage(page);
  } catch (IOException e) {
    // The handler (defined outside this snippet) decides what happens to the exception.
    handleCatchableIOE(e);
  }
}

/**
 * Applies the current settings, as returned by the getters in scope, to the
 * given pdf2XHTML.
 *
 * <p>The word separator is a single space when auto-spacing is enabled and the
 * empty string otherwise. The two tolerance values are only applied when their
 * getter returns a non-null value.
 *
 * @param pdf2XHTML the instance to configure
 */
public void configure(PDF2XHTML pdf2XHTML) {
  pdf2XHTML.setSortByPosition(getSortByPosition());

  // Auto-spacing on: one space between words; off: no separator at all.
  pdf2XHTML.setWordSeparator(getEnableAutoSpace() ? " " : "");

  // Optional tolerances: a null value leaves pdf2XHTML's own setting untouched.
  if (null != getAverageCharTolerance()) {
    pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
  }
  if (null != getSpacingTolerance()) {
    pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
  }

  pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}

/**
 * Returns the decoded form of {@code value} when
 * {@code PDFEncodedStringDecoder.shouldDecode} accepts it; otherwise returns
 * {@code value} unchanged.
 *
 * @param value the string to inspect
 * @return the decoded string, or the input itself when no decoding applies
 */
private String decode(String value) {
  if (!PDFEncodedStringDecoder.shouldDecode(value)) {
    return value;
  }
  return new PDFEncodedStringDecoder().decode(value);
}

/**
 * Processes one page by calling {@code startPage}, {@code doOCROnCurrentPage}
 * and {@code endPage} in that order.
 *
 * <p>NOTE(review): {@code endPage} sits inside the try block, so it is skipped
 * when an earlier call throws. Confirm that this is intended.
 *
 * @param pdPage the page to process
 * @throws IOException an {@code IOExceptionWithCause} wrapping any
 *     TikaException or SAXException raised inside the try block
 */
@Override
public void processPage(PDPage pdPage) throws IOException {
  try {
    startPage(pdPage);
    doOCROnCurrentPage();
    endPage(pdPage);
  } catch (TikaException|SAXException e) {
    // Wrap non-IO failures so they fit the declared IOException signature.
    throw new IOExceptionWithCause(e);
  } catch (IOException e) {
    // The handler (defined outside this snippet) decides what happens to the exception.
    handleCatchableIOE(e);
  }
}

/**
 * Forwards the inline-image extraction flag to {@code defaultConfig}.
 *
 * @param extractInlineImages value passed through unchanged
 */
@Field
void setExtractInlineImages(boolean extractInlineImages) {
  defaultConfig.setExtractInlineImages(extractInlineImages);
}

/**
 * Installs a new {@code AccessChecker}, built from the given flag, on
 * {@code defaultConfig}.
 *
 * @param allowExtractionForAccessibility value handed to the AccessChecker constructor
 */
@Field
void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) {
  defaultConfig.setAccessChecker(new AccessChecker(allowExtractionForAccessibility));
}

/**
 * If true, the parser should try to remove duplicated
 * text over the same region.  This is needed for some
 * PDFs that achieve bolding by re-writing the same
 * text in the same area.  Note that this can
 * slow down extraction substantially (PDFBOX-956) and
 * sometimes remove characters that were not in fact
 * duplicated (PDFBOX-1155).  By default this is disabled.
 *
 * @param v {@code true} to suppress duplicated overlapping text
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setSuppressDuplicateOverlappingText(boolean v) {
  defaultConfig.setSuppressDuplicateOverlappingText(v);
}

/**
 * If true (the default), text in annotations will be
 * extracted.
 *
 * @param v {@code true} to extract annotation text
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setExtractAnnotationText(boolean v) {
  defaultConfig.setExtractAnnotationText(v);
}

/**
 * If true (the default), the parser should estimate
 * where spaces should be inserted between words.  For
 * many PDFs this is necessary as they do not include
 * explicit whitespace characters.
 *
 * @param v {@code true} to enable automatic space insertion
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setEnableAutoSpace(boolean v) {
  defaultConfig.setEnableAutoSpace(v);
}

/**
 * If true, text in annotations will be extracted.
 *
 * @return the annotation-text flag currently held by {@code defaultConfig}
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getExtractAnnotationText() {
  return defaultConfig.getExtractAnnotationText();
}

/**
 * Loads properties from InputStream and then tries to close InputStream.
 * If there is an IOException, this silently swallows the exception
 * and goes back to the default.
 *
 * @param is stream to load the properties from; handed straight to {@code init}
 */
public PDFParserConfig(InputStream is) {
  init(is);
}

/**
 * Image type used to render the page image for OCR.
 *
 * @param ocrImageTypeString textual image type; converted with
 *     {@code parseImageType} before being stored
 * @see #setOcrImageType(ImageType)
 */
public void setOcrImageType(String ocrImageTypeString) {
  this.ocrImageType = parseImageType(ocrImageTypeString);
}

/**
 * Returns the auto-space flag currently held by {@code defaultConfig}.
 *
 * @return the current auto-space setting
 * @see #setEnableAutoSpace(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getEnableAutoSpace() {
  return defaultConfig.getEnableAutoSpace();
}

/**
 * Returns the sort-by-position flag currently held by {@code defaultConfig}.
 *
 * @return the current sort-by-position setting
 * @see #setSortByPosition(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getSortByPosition() {
  return defaultConfig.getSortByPosition();
}

/**
 * Returns the duplicate-overlapping-text suppression flag currently held by
 * {@code defaultConfig}.
 *
 * @return the current suppression setting
 * @see #setSuppressDuplicateOverlappingText(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getSuppressDuplicateOverlappingText() {
  return defaultConfig.getSuppressDuplicateOverlappingText();
}

/**
 * Adds {@code value}, after passing it through {@code decode}, to
 * {@code metadata} under {@code name}. A null value is ignored.
 *
 * @param metadata the metadata object to add to
 * @param name the metadata key
 * @param value the raw value, may be null
 */
private void addMetadata(Metadata metadata, String name, String value) {
  if (value == null) {
    return;
  }
  String decoded = decode(value);
  metadata.add(name, decoded);
}

/**
 * Forwards the OCR image type string to {@code defaultConfig}.
 *
 * @param imageType image type name, passed through unchanged
 */
@Field
public void setOcrImageType(String imageType) {
  defaultConfig.setOcrImageType(imageType);
}

/**
 * Forwards the extract-actions flag to {@code defaultConfig}.
 *
 * @param extractActions value passed through unchanged
 */
@Field
void setExtractActions(boolean extractActions) {
  defaultConfig.setExtractActions(extractActions);
}

Most used classes

PDFParser
PDF parser. This parser can also process encrypted PDF documents if the required password is given a…
PDFParserConfig
Config for PDFParser. This allows parameters to be set programmatically: 1. Calls to PDFParser, i.e. …
AccessChecker
Checks whether or not a document allows extraction generally or extraction for accessibility only.
PDF2XHTML
Utility class that overrides the PDFTextStripper functionality to produce a semi-structured XHTML SA…
PDFEncodedStringDecoder
In fairly rare cases, a PDF's XMP will contain a string that has incorrectly been encoded with PDFEn…

How to use org.apache.tika.parser.pdf

Best Java code snippets using org.apache.tika.parser.pdf (Showing top 20 results out of 315)