/**
 * Forwards the "extract unique inline images only" flag to {@code defaultConfig}.
 *
 * @param extractUniqueInlineImagesOnly value handed unchanged to
 *        {@code defaultConfig.setExtractUniqueInlineImagesOnly(boolean)}
 */
@Field
void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
    defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly);
}
getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages()));
// Look up the "extractUniqueInlineImagesOnly" property and apply it through the setter.
// NOTE(review): the second argument is presumably the fallback used when the property is
// absent, so the current value would be kept — confirm against getBooleanProp.
setExtractUniqueInlineImagesOnly( getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly()));
/**
 * Forwards the "extract unique inline images only" flag to {@code defaultConfig}.
 *
 * @param extractUniqueInlineImagesOnly value handed unchanged to
 *        {@code defaultConfig.setExtractUniqueInlineImagesOnly(boolean)}
 */
@Field
void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
    defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly);
}
/**
 * Forwards the "extract unique inline images only" flag to {@code defaultConfig}.
 *
 * @param extractUniqueInlineImagesOnly value handed unchanged to
 *        {@code defaultConfig.setExtractUniqueInlineImagesOnly(boolean)}
 */
@Field
void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
    defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly);
}
// Pass the flag through to pdfParserConfig.
// NOTE(review): the (Boolean) cast assumes the value really is a Boolean; any other type
// would throw ClassCastException, and a null would presumably fail on unboxing if the
// setter takes a primitive boolean — confirm the value is validated before this line.
pdfParserConfig.setExtractUniqueInlineImagesOnly((Boolean) extractUniqueInlineImagesOnly);
getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages()));
// Look up the "extractUniqueInlineImagesOnly" property and apply it through the setter.
// NOTE(review): the second argument is presumably the fallback used when the property is
// absent, so the current value would be kept — confirm against getBooleanProp.
setExtractUniqueInlineImagesOnly( getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly()));
/** * Create a new extractor, which will OCR images by default if Tesseract is available locally, extract inline * images from PDF files and OCR them and use PDFBox's non-sequential PDF parser. */ public Extractor() { // Calculate the SHA256 digest by default. setDigestAlgorithms(DigestAlgorithm.SHA256); // Run OCR on images contained within PDFs and not on pages. pdfConfig.setExtractInlineImages(true); pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); // By default, only the object IDs are used for determining uniqueness. // In scanned documents under test from the Panama registry, different embedded images had the same ID, leading to incomplete OCRing when uniqueness detection was turned on. pdfConfig.setExtractUniqueInlineImagesOnly(false); // Set a long OCR timeout by default, because Tika's is too short. setOcrTimeout(Duration.ofDays(1)); ocrConfig.setEnableImageProcessing(0); // See TIKA-2167. Image processing causes OCR to fail. // English text recognition by default. ocrConfig.setLanguage("eng"); }
getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages()));
// Look up the "extractUniqueInlineImagesOnly" property and apply it through the setter.
// NOTE(review): the second argument is presumably the fallback used when the property is
// absent, so the current value would be kept — confirm against getBooleanProp.
setExtractUniqueInlineImagesOnly( getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly()));
Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); TesseractOCRConfig config = new TesseractOCRConfig(); config.setTesseractPath(tPath); PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true); pdfConfig.setExtractUniqueInlineImagesOnly(false); // set to false if pdf contains multiple images. ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); parseContext.set(PDFParserConfig.class, pdfConfig); //need to add this to make sure recursive parsing happens! parseContext.set(Parser.class, parser);