Refine search
public class PDFReader{ public static void main(String args[]) { PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File("C:/my.pdf"); try { PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(5); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
addMetadata(metadata, property, pdfBoxBaseline); List<String> items = getXMPBagOrSeqList(dc, property.getName()); if (items == null) { if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { addMetadata(metadata, property, pdfBoxBaseline); addMetadata(metadata, property, item); addMetadata(metadata, property, pdfBoxBaseline);
/**
 * Adds {@code value}, after passing it through {@code decode(...)}, to the
 * metadata under {@code name}. A {@code null} value is ignored.
 *
 * @param metadata the metadata object to add to
 * @param name     the metadata key
 * @param value    the raw value; may be {@code null}
 */
private void addMetadata(Metadata metadata, String name, String value) {
    if (value == null) {
        return;
    }
    String decoded = decode(value);
    metadata.add(name, decoded);
}
/**
 * Obtains a file for the incoming stream via {@code TikaInputStream.getFile()},
 * passes that file's absolute path to {@link GrobidRESTParser}, and then parses
 * the same file with {@link PDFParser}.
 *
 * @param stream   the document stream; the caller remains responsible for closing it
 * @param handler  receives the SAX events produced by both parsers
 * @param metadata populated by both parsers
 * @param context  parse context forwarded to both parsers
 * @throws IOException   if the stream or the temporary file cannot be read
 * @throws SAXException  if the handler fails
 * @throws TikaException if either parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Closing the TemporaryResources releases whatever getFile() registered with it.
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // The second pass needs its own stream over the file; close it once parsing is done.
        try (InputStream fileStream = new FileInputStream(tmpFile)) {
            parser.parse(fileStream, handler, metadata, context);
        }
    }
}
}
PDFParser pps = new PDFParser(new FileInputStream(Filepath);
// Parses "somefile.pdf" with PDFParser, pointing the parser at tempDirectoryPath
// (defined outside this snippet) as its temp directory, then obtains the PDDocument.
// NOTE(review): neither `fin` nor `document` is closed within this snippet — confirm
// that the surrounding code closes both.
File in = new File("somefile.pdf"); InputStream fin = new FileInputStream(in); PDFParser parser = new PDFParser(fin); parser.setTempDirectory(new File(tempDirectoryPath)); parser.parse(); PDDocument document = parser.getPDDocument();
psStream = new FileInputStream(filename); } catch (FileNotFoundException ffne) { try { PDFParser parser = new PDFParser(psStream); PrinterJob job1 = PrinterJob.getPrinterJob(); job1.setPrintService(services[count]); psStream.close();
// Load test.pdf with pdf2json and log its page count once the data is ready.
var PDFParser = require('pdf2json');
var pdfParser = new PDFParser();

pdfParser.on('pdfParser_dataReady', function(data) {
    // Walk data.PDFJS.pdfDocument.numPages, stopping at the first falsy link.
    var pdfjs = data.PDFJS;
    var pdfDocument = pdfjs && pdfjs.pdfDocument;
    var pageCount = pdfDocument && pdfDocument.numPages;
    console.log('Number of pages:', pageCount);
});
// pdfParser.on('pdfParser_dataError', _.bind(_onPFBinDataError, self));

pdfParser.loadPDF('test.pdf');
/**
 * Downloads a sample PDF over HTTP, extracts its text with PDFBox and asserts
 * that the text contains a known sentence.
 *
 * @throws Exception if the download, parsing or text extraction fails
 */
public void ReadPDF() throws Exception {
    URL testUrl = new URL("http://www.axmag.com/download/pdfurl-guide.pdf");
    // try-with-resources closes the network stream even when parsing or the assertion fails.
    try (BufferedInputStream testFile = new BufferedInputStream(testUrl.openStream())) {
        PDFParser testPdf = new PDFParser(testFile);
        testPdf.parse();
        try {
            String testText = new PDFTextStripper().getText(testPdf.getPDDocument());
            Assert.assertTrue(testText.contains("Open the setting.xml, you can see it is like this"));
        } finally {
            // Release the parsed document once the text has been checked.
            testPdf.getPDDocument().close();
        }
    }
}
/**
 * Lazily builds the shared {@code parser}: an {@link AutoDetectParser} over a
 * {@link DefaultParser} and a {@link PDFParser}. Does nothing when
 * {@code parser} is already set.
 *
 * <p>When {@code fs.isPdfOcr()} is true and {@code ExternalParser.check("tesseract")}
 * succeeds, the PDF parser's OCR strategy is set to {@code "ocr_and_text"}.
 * When PDF OCR is off, the default parser is constructed with
 * {@link TesseractOCRParser} passed as the single entry of its class list.
 *
 * @param fs settings object queried for {@code isPdfOcr()}
 */
private static void initParser(Fs fs) {
    if (parser != null) {
        return;
    }

    PDFParser pdfParser = new PDFParser();
    DefaultParser defaultParser;

    if (!fs.isPdfOcr()) {
        logger.debug("OCR is disabled. Even though it's detected, it must be disabled explicitly");
        defaultParser = new DefaultParser(
                MediaTypeRegistry.getDefaultRegistry(),
                new ServiceLoader(),
                Collections.singletonList(TesseractOCRParser.class));
    } else {
        logger.debug("OCR is activated for PDF documents");
        boolean tesseractAvailable = ExternalParser.check("tesseract");
        if (tesseractAvailable) {
            pdfParser.setOcrStrategy("ocr_and_text");
        } else {
            logger.debug("But Tesseract is not installed so we won't run OCR.");
        }
        defaultParser = new DefaultParser();
    }

    Parser[] delegates = new Parser[] {defaultParser, pdfParser};
    parser = new AutoDetectParser(delegates);
}
try { TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly(); if (localConfig.getMaxMainMemoryBytes() >= 0) { extractMetadata(pdfDocument, metadata, context); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle()); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor()); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator()); addMetadata(metadata, Office.KEYWORDS, info.getKeywords()); addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords()); addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject()); addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getKeywords()); addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getSubject()); addMetadata(metadata, OfficeOpenXMLCore.SUBJECT, info.getSubject()); addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped()); Calendar created = info.getCreationDate(); addMetadata(metadata, PDF.DOC_INFO_CREATED, created); addMetadata(metadata, TikaCoreProperties.CREATED, created); Calendar modified = info.getModificationDate(); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified); String name = key.getName();
/**
 * Parses the stream with {@code pdfParser} (sort-by-position enabled) through a
 * {@link PageContentHandler} built from {@code filterPatterns}, then turns the
 * handler's improved content into one {@link Page} per map entry.
 *
 * @param fs             the PDF input stream
 * @param filterPatterns patterns handed to the {@link PageContentHandler}
 * @return the pages, in the iteration order of the handler's content map
 * @throws Exception if parsing fails
 */
@Override
public List<Page> parse(InputStream fs, List<String> filterPatterns) throws Exception {
    PageContentHandler handler = new PageContentHandler(filterPatterns);
    pdfParser.setSortByPosition(true);
    pdfParser.parse(fs, handler, new Metadata(), new ParseContext());

    List<Page> pages = new ArrayList<>();
    for (Map.Entry<Integer, List<String>> entry : handler.getImprovedContent().entrySet()) {
        Page page = new Page(entry.getKey());
        for (String paragraph : entry.getValue()) {
            page.getParagraphs().add(paragraph);
        }
        pages.add(page);
    }
    return pages;
}
}
/** * Used when processing custom metadata entries, as PDFBox won't do * the conversion for us in the way it does for the standard ones */ private void addMetadata(Metadata metadata, String name, COSBase value) { if (value instanceof COSArray) { for (Object v : ((COSArray) value).toList()) { addMetadata(metadata, name, ((COSBase) v)); } } else if (value instanceof COSString) { addMetadata(metadata, name, ((COSString) value).getString()); } // Avoid calling COSDictionary#toString, since it can lead to infinite // recursion. See TIKA-1038 and PDFBOX-1835. else if (value != null && !(value instanceof COSDictionary)) { addMetadata(metadata, name, value.toString()); } }
// Parse the PDF at resourceLocation with Tika's PDFParser, then print the
// "title" and "Author" metadata values and the extracted body text.
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
// try-with-resources closes the file even when parse(...) throws.
try (InputStream input = new FileInputStream(new File(resourceLocation))) {
    parser.parse(input, textHandler, metadata);
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());
// Opens the file at Filepath and hands the stream to a new PDFParser.
// NOTE(review): the FileInputStream is created inline and never assigned, so it
// cannot be closed from this statement — confirm the parser or caller closes it.
PDFParser pps = new PDFParser(new FileInputStream(Filepath));
/**
 * Parses {@code pdfFile} and, if the resulting document is encrypted, opens its
 * protection with the password {@code "password"}. The document is not returned
 * or stored, so it and the input stream are closed before the method returns.
 *
 * @param pdfFile the PDF file to open
 * @throws Exception if the file cannot be read, parsed or decrypted
 */
private void openPDFDoc(final File pdfFile) throws Exception {
    // try-with-resources releases the file handle on every exit path.
    try (BufferedInputStream input = new BufferedInputStream(new FileInputStream(pdfFile))) {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        PDDocument originalPdfDoc = parser.getPDDocument();
        try {
            if (originalPdfDoc.isEncrypted()) {
                originalPdfDoc.openProtection(new StandardDecryptionMaterial("password"));
            }
        } finally {
            originalPdfDoc.close();
        }
    }
}
/**
 * Returns a newly constructed {@link PDFParser} on every call.
 *
 * @return a fresh PDF parser instance
 */
@Override
protected Parser getParser() {
    Parser pdfParser = new PDFParser();
    return pdfParser;
}
}
import java.io.InputStream;
import java.net.URL;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Command-line example that downloads a PDF from a fixed URL, parses it with
 * Tika's {@link PDFParser} and prints the extracted body text.
 */
public class URLReader {

    /**
     * Opens the URL, parses the PDF and prints the handler's text to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the download or the parse fails
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://website.com/document.pdf");
        ContentHandler contenthandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        // The stream must be opened from the URL; try-with-resources closes it after parsing.
        try (InputStream is = url.openStream()) {
            pdfparser.parse(is, contenthandler, metadata, new ParseContext());
        }
        System.out.println(contenthandler.toString());
    }
}