/**
 * Lazily builds the shared {@code parser} the first time it is needed.
 *
 * <p>Does nothing when {@code parser} is already set. Otherwise an
 * {@link AutoDetectParser} is assembled from a {@link DefaultParser} and a
 * {@link PDFParser}, in that order, and stored in {@code parser}.
 *
 * <p>When {@code fs.isPdfOcr()} is true and {@code ExternalParser.check("tesseract")}
 * succeeds, the PDF parser's OCR strategy is set to {@code "ocr_and_text"}; when the
 * check fails only a debug message is logged. When {@code fs.isPdfOcr()} is false the
 * default parser is built with {@code TesseractOCRParser.class} passed as its third
 * constructor argument (presumably the list of parsers to exclude; confirm against
 * the Tika {@code DefaultParser} API).
 *
 * @param fs settings object queried through {@code isPdfOcr()}
 */
private static void initParser(Fs fs) {
    // Guard clause: nothing to do once the parser has been created.
    if (parser != null) {
        return;
    }

    PDFParser pdf = new PDFParser();
    DefaultParser fallback;

    if (fs.isPdfOcr()) {
        logger.debug("OCR is activated for PDF documents");
        if (ExternalParser.check("tesseract")) {
            pdf.setOcrStrategy("ocr_and_text");
        } else {
            logger.debug("But Tesseract is not installed so we won't run OCR.");
        }
        fallback = new DefaultParser();
    } else {
        logger.debug("OCR is disabled. Even though it's detected, it must be disabled explicitly");
        fallback = new DefaultParser(
                MediaTypeRegistry.getDefaultRegistry(),
                new ServiceLoader(),
                Collections.singletonList(TesseractOCRParser.class));
    }

    // Order matters to the caller of AutoDetectParser: default parser first, PDF parser second.
    parser = new AutoDetectParser(new Parser[] {fallback, pdf});
}
/**
 * Creates the shared {@code parser} on first use; later calls are no-ops.
 *
 * <p>The resulting {@link AutoDetectParser} wraps two delegates: a
 * {@link DefaultParser} followed by a {@link PDFParser}.
 *
 * <ul>
 *   <li>If {@code fs.isPdfOcr()} returns true, a plain {@code DefaultParser} is used and,
 *       provided {@code ExternalParser.check("tesseract")} returns true, the PDF parser
 *       is switched to the {@code "ocr_and_text"} OCR strategy. If the check fails, the
 *       strategy is left untouched and a debug message is logged.</li>
 *   <li>Otherwise the {@code DefaultParser} is constructed from the default media type
 *       registry, a new {@code ServiceLoader}, and a singleton list holding
 *       {@code TesseractOCRParser.class} (NOTE(review): looks like the exclusion list;
 *       verify against the Tika API).</li>
 * </ul>
 *
 * @param fs settings object; only {@code isPdfOcr()} is read here
 */
private static void initParser(Fs fs) {
    if (parser == null) {
        PDFParser pdfDelegate = new PDFParser();
        boolean ocrRequested = fs.isPdfOcr();

        DefaultParser genericDelegate;
        if (!ocrRequested) {
            logger.debug("OCR is disabled. Even though it's detected, it must be disabled explicitly");
            genericDelegate = new DefaultParser(
                    MediaTypeRegistry.getDefaultRegistry(),
                    new ServiceLoader(),
                    Collections.singletonList(TesseractOCRParser.class));
        } else {
            logger.debug("OCR is activated for PDF documents");
            if (!ExternalParser.check("tesseract")) {
                logger.debug("But Tesseract is not installed so we won't run OCR.");
            } else {
                pdfDelegate.setOcrStrategy("ocr_and_text");
            }
            genericDelegate = new DefaultParser();
        }

        // Delegates are handed over in the same order as before: generic first, PDF second.
        Parser[] delegates = {genericDelegate, pdfDelegate};
        parser = new AutoDetectParser(delegates);
    }
}