public class PDFReader{ public static void main(String args[]) { PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File("C:/my.pdf"); try { PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(5); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
/**
 * Downloads the PDF at {@code url} and returns the metadata Tika's
 * {@code PDFParser} extracts from it.
 *
 * @param url location of the PDF document
 * @return the metadata populated by the parser
 * @throws IOException if the URL cannot be opened or read
 * @throws SAXException if the parser reports a SAX error
 * @throws TikaException if the document cannot be parsed
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // Close the network stream whether or not parsing succeeds.
    try (java.io.InputStream in = url.openStream()) {
        // Only the metadata is returned, so the body text is discarded. A
        // default BodyContentHandler would buffer it and fail once its write
        // limit is exceeded on large documents.
        parser.parse(in, new org.xml.sax.helpers.DefaultHandler(), met, new ParseContext());
    }
    return met;
}
/**
 * Runs the document through {@code GrobidRESTParser} and then through
 * {@code PDFParser}, both feeding the same handler and metadata.
 *
 * <p>The stream is wrapped in a {@code TikaInputStream} so that a {@code File}
 * can be obtained for it; that file's path is handed to the GROBID parser and
 * the file is re-read for the PDF parser.
 *
 * @param stream   the document to parse; not closed by this method
 * @param handler  receives the content events of both parsers
 * @param metadata populated by both parsers
 * @param context  parse context passed through to both parsers
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if a parser reports a SAX error
 * @throws TikaException if a parser fails or temporary resources cannot be disposed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // Close the second read of the file as soon as the PDF parser is done.
        try (InputStream pdfStream = new FileInputStream(tmpFile)) {
            parser.parse(pdfStream, handler, metadata, context);
        }
    } finally {
        // Releases whatever was registered with tmp while obtaining the file.
        tmp.dispose();
    }
}
}
// Opens the file named by fileName, parses it with PDFParser and stores the
// parser's document in cosDoc. parser and cosDoc are declared outside this excerpt.
// NOTE(review): the try block continues past this excerpt, and the FileInputStream
// is not closed in the visible part — confirm the remainder releases it and cosDoc.
File file = new File(fileName); try { parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument();
/**
 * Builds the shared {@code parser} on first use; later calls are no-ops.
 *
 * <p>The result is an {@code AutoDetectParser} over a {@code DefaultParser}
 * and a {@code PDFParser}. When {@code fs.isPdfOcr()} is true and the
 * {@code tesseract} check succeeds, the PDF parser's OCR strategy is set to
 * {@code ocr_and_text}. When it is false, the {@code DefaultParser} is built
 * with {@code TesseractOCRParser} passed in its third constructor argument
 * (presumably an exclusion list, as the log message suggests).
 *
 * @param fs settings consulted for the PDF OCR flag
 */
private static void initParser(Fs fs) {
    if (parser != null) {
        return;
    }

    PDFParser pdfParser = new PDFParser();
    DefaultParser defaultParser;

    if (!fs.isPdfOcr()) {
        logger.debug("OCR is disabled. Even though it's detected, it must be disabled explicitly");
        defaultParser = new DefaultParser(
                MediaTypeRegistry.getDefaultRegistry(),
                new ServiceLoader(),
                Collections.singletonList(TesseractOCRParser.class));
    } else {
        logger.debug("OCR is activated for PDF documents");
        if (!ExternalParser.check("tesseract")) {
            logger.debug("But Tesseract is not installed so we won't run OCR.");
        } else {
            pdfParser.setOcrStrategy("ocr_and_text");
        }
        defaultParser = new DefaultParser();
    }

    // Same ordering as before: the default parser first, then the PDF parser.
    Parser[] delegates = new Parser[] {defaultParser, pdfParser};
    parser = new AutoDetectParser(delegates);
}
// Loads test.pdf with pdf2json and prints its page count.
var PDFParser = require('pdf2json');
var pdfParser = new PDFParser();

pdfParser.on('pdfParser_dataReady', function(data) {
  // Walk the payload defensively: any missing level yields a falsy value
  // instead of a TypeError.
  var numPages = data && data.PDFJS && data.PDFJS.pdfDocument && data.PDFJS.pdfDocument.numPages;
  console.log('Number of pages:', numPages);
});

// Report parse failures instead of dropping them silently.
pdfParser.on('pdfParser_dataError', function(err) {
  console.error('Failed to parse PDF:', err && err.parserError ? err.parserError : err);
});

pdfParser.loadPDF('test.pdf');
/**
 * Supplies the parser used by this class: a new {@code PDFParser} on every call.
 *
 * @return a freshly created PDF parser
 */
@Override
protected Parser getParser() {
    final Parser pdfParser = new PDFParser();
    return pdfParser;
}
}
/**
 * Creates the parser for this class.
 *
 * @return a new {@code PDFParser}; no instance is cached between calls
 */
@Override
protected Parser getParser() {
    Parser created = new PDFParser();
    return created;
}
}
var fs = require("fs");

// https://github.com/modesty/pdf2json
var PDFParser = require("./node_modules/pdf2json/PDFParser");

// NOTE(review): the second constructor argument presumably enables raw-text
// mode, which getRawTextContent() relies on — confirm against the pdf2json docs.
var pdfParser = new PDFParser(this, 1);

pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));

// Once parsing has finished, dump the parser and write the raw text to ./content.txt.
pdfParser.on("pdfParser_dataReady", pdfData => {
    console.log(pdfParser);
    // fs.writeFile needs a completion callback; without one current Node.js
    // throws, and a failed write would otherwise go unnoticed.
    fs.writeFile("./content.txt", pdfParser.getRawTextContent(), err => {
        if (err) {
            console.error(err);
        }
    });
});

// NOTE(review): nothing in this snippet calls pdfParser.loadPDF(...), so neither
// event fires until a PDF is loaded elsewhere.
/**
 * Extracts text from the given stream by delegating to the two-argument
 * {@code extractText} with a newly created {@code PDFParser}.
 *
 * @param in the stream to extract text from
 * @return whatever the delegated {@code extractText} call returns
 * @throws Exception if the delegated extraction fails
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in) throws Exception {
    PDFParser pdfParser = new PDFParser();
    return extractText(in, pdfParser);
}
}
/**
 * Downloads a sample PDF, extracts its text with PDFBox and asserts that a
 * known sentence is present.
 *
 * @throws Exception if the download, parsing or text extraction fails
 */
public void ReadPDF() throws Exception {
    URL testUrl = new URL("http://www.axmag.com/download/pdfurl-guide.pdf");
    // The network stream is closed even when parsing or the assertion fails.
    try (BufferedInputStream pdfStream = new BufferedInputStream(testUrl.openStream())) {
        PDFParser parser = new PDFParser(pdfStream);
        parser.parse();
        PDDocument document = parser.getPDDocument();
        try {
            String text = new PDFTextStripper().getText(document);
            Assert.assertTrue(text.contains("Open the setting.xml, you can see it is like this"));
        } finally {
            // Release the parsed document whether or not the assertion passes.
            document.close();
        }
    }
}
/**
 * Creates the {@code tika} facade from a name-based detector and a
 * {@code PDFParser}. The detector is given a single rule: names matching
 * {@code .*\.pdf} (case-insensitive) map to {@code MediaType.application("pdf")}.
 */
private PdfParser() {
    Pattern pdfFileName = Pattern.compile(".*\\.pdf", Pattern.CASE_INSENSITIVE);

    Map<Pattern, MediaType> patterns = new HashMap<Pattern, MediaType>();
    patterns.put(pdfFileName, MediaType.application("pdf"));

    tika = TikaFactory.newTika(new NameDetector(patterns), new PDFParser());
}
// Extracts the plain text of sample.pdf with Tika's PDFParser and prints it.
InputStream input = new FileInputStream("sample.pdf");
ContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
Metadata metadata = new Metadata();
try {
    new PDFParser().parse(input, handler, metadata, new ParseContext());
} finally {
    // Release the file handle whether or not parsing succeeded.
    input.close();
}
String plainText = handler.toString();
System.out.println(plainText);
// Parses somefile.pdf with PDFParser after pointing the parser at the directory
// named by tempDirectoryPath (via setTempDirectory), then obtains the PDDocument.
// NOTE(review): neither fin nor document is closed in this excerpt — presumably
// the code that follows uses document and closes it when done; confirm.
File in = new File("somefile.pdf"); InputStream fin = new FileInputStream(in); PDFParser parser = new PDFParser(fin); parser.setTempDirectory(new File(tempDirectoryPath)); parser.parse(); PDDocument document = parser.getPDDocument();
// Parses the PDF at resourceLocation with Tika's PDFParser and prints its
// title, author and body text to out.
InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
    parser.parse(input, textHandler, metadata);
} finally {
    // Previously the stream stayed open whenever parse() threw.
    input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());
// Reads the PDF at resourceLocation through Tika's PDFParser, then writes the
// "title" and "Author" metadata values and the extracted text to out.
InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
    parser.parse(input, textHandler, metadata);
} finally {
    // Close in finally so a parse failure cannot leak the file handle.
    input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());
/**
 * Parses {@code pdfFile} with PDFBox and, when the document reports itself as
 * encrypted, opens its protection with the fixed password {@code "password"}.
 * The document and the file stream are released before the method returns.
 *
 * @param pdfFile the PDF file to open
 * @throws Exception if the file cannot be read, parsed or decrypted
 */
private void openPDFDoc(final File pdfFile) throws Exception {
    try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(pdfFile))) {
        PDFParser parser = new PDFParser(in);
        parser.parse();
        PDDocument document = parser.getPDDocument();
        try {
            if (document.isEncrypted()) {
                document.openProtection(new StandardDecryptionMaterial("password"));
            }
        } finally {
            // The document is not handed to anyone, so it must be closed here.
            document.close();
        }
    }
}
/**
 * Parses the document twice into the same handler and metadata: first with
 * {@code GrobidRESTParser}, given the path of a file obtained for the stream,
 * then with {@code PDFParser}, given a new stream over that same file.
 *
 * @param stream   the document to parse; not closed by this method
 * @param handler  receives the content events of both parsers
 * @param metadata populated by both parsers
 * @param context  parse context passed through to both parsers
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if a parser reports a SAX error
 * @throws TikaException if a parser fails or temporary resources cannot be disposed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // The file is re-opened for the PDF parser; close that handle promptly.
        try (InputStream pdfStream = new FileInputStream(tmpFile)) {
            parser.parse(pdfStream, handler, metadata, context);
        }
    } finally {
        // Dispose of anything registered with tmp while obtaining the file.
        tmp.dispose();
    }
}
}
/**
 * Feeds the document to {@code GrobidRESTParser} (by file path) and then to
 * {@code PDFParser} (by stream), with both writing to the supplied handler
 * and metadata.
 *
 * @param stream   the document to parse; not closed by this method
 * @param handler  receives the content events of both parsers
 * @param metadata populated by both parsers
 * @param context  parse context passed through to both parsers
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if a parser reports a SAX error
 * @throws TikaException if a parser fails or temporary resources cannot be disposed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // Scope the second read of the file so its handle is always closed.
        try (InputStream pdfStream = new FileInputStream(tmpFile)) {
            parser.parse(pdfStream, handler, metadata, context);
        }
    } finally {
        // Clean up anything registered with tmp, on success and on failure.
        tmp.dispose();
    }
}
}
// Prints the text of owgr49f2013.pdf, extracted in non-position-sorted order
// with "###" between words.
// The stripper is configured first so that a failure while creating it cannot
// leave an already parsed document open.
PDFTextStripper stripper = new PDFTextStripper("UTF-8");
stripper.setSortByPosition(false);
stripper.setWordSeparator("###");

PDFParser pdfParser = new PDFParser(new FileInputStream("c:\\temp\\owgr49f2013.pdf"));
pdfParser.parse();
PDDocument pdDocument = pdfParser.getPDDocument();
try {
    System.out.println(stripper.getText(pdDocument));
} finally {
    // Release the parsed document once its text has been printed.
    pdDocument.close();
}