Tabnine Logo
PDFParser.parse
Code Index | Add Tabnine to your IDE (free)

How to use the parse method in org.apache.tika.parser.pdf.PDFParser

Best Java code snippets using org.apache.tika.parser.pdf.PDFParser.parse (Showing top 20 results out of 315)

origin: stackoverflow.com

 /**
  * Command-line example that prints the text of pages 1 to 5 of a PDF file.
  *
  * <p>The file defaults to {@code C:/my.pdf}; a different path may be passed as the
  * first program argument.
  */
 public class PDFReader {
   /**
    * Parses the PDF, extracts the text of pages 1-5 and writes it to standard output.
    *
    * @param args optional; {@code args[0]} overrides the default input path
    */
   public static void main(String args[]) {
     File file = new File(args.length > 0 ? args[0] : "C:/my.pdf");
     // try-with-resources guarantees the file handle is released even when the
     // parser constructor or parse() fails.
     try (FileInputStream in = new FileInputStream(file)) {
       PDFParser parser = new PDFParser(in);
       parser.parse();
       COSDocument cosDoc = parser.getDocument();
       PDDocument pdDoc = new PDDocument(cosDoc);
       try {
         PDFTextStripper pdfStripper = new PDFTextStripper();
         pdfStripper.setStartPage(1);
         pdfStripper.setEndPage(5);
         String parsedText = pdfStripper.getText(pdDoc);
         System.out.println(parsedText);
       } finally {
         // The document was previously never closed.
         pdDoc.close();
       }
     } catch (IOException e) {
       e.printStackTrace();
     }
   }
 }
origin: apache/tika

/**
 * Parses the PDF found at the given URL and returns the metadata object that was
 * passed to the parser.
 *
 * @param url location of the PDF document
 * @return the {@link Metadata} instance handed to {@code PDFParser.parse}
 * @throws IOException if the URL cannot be opened or read
 * @throws SAXException propagated from the parser
 * @throws TikaException propagated from the parser
 */
public static Metadata getMet(URL url) throws IOException, SAXException,
    TikaException {
  Metadata met = new Metadata();
  PDFParser parser = new PDFParser();
  // The URL stream was previously opened inline and never closed.
  try (InputStream in = url.openStream()) {
    parser.parse(in, new BodyContentHandler(), met, new ParseContext());
  }
  return met;
}
origin: apache/tika

 /**
  * Spools the incoming stream to a file, runs the GROBID REST parser on that file,
  * and then runs the PDF parser on the same file with the same handler, metadata
  * and context.
  *
  * <p>The caller's {@code stream} is not closed here; only the resources this
  * method creates itself are released.
  *
  * @param stream document content
  * @param handler receives the parse events from both parsers
  * @param metadata passed to both parsers
  * @param context passed to both parsers
  * @throws IOException if the stream or the spooled file cannot be read
  * @throws SAXException propagated from the parsers
  * @throws TikaException propagated from the parsers
  */
 public void parse(InputStream stream, ContentHandler handler,
   Metadata metadata, ParseContext context) throws IOException,
   SAXException, TikaException {
  // Closing the TemporaryResources releases whatever temporary file was registered
  // with it while spooling; it was previously created inline and never disposed.
  try (TemporaryResources tmp = new TemporaryResources()) {
   TikaInputStream tis = TikaInputStream.get(stream, tmp);
   File tmpFile = tis.getFile();

   GrobidRESTParser grobidParser = new GrobidRESTParser();
   grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

   PDFParser parser = new PDFParser();
   // The file stream was previously opened inline and never closed.
   try (InputStream pdfStream = new FileInputStream(tmpFile)) {
    parser.parse(pdfStream, handler, metadata, context);
   }
  }
 }
}
origin: stackoverflow.com

try {
  parser = new PDFParser(new FileInputStream(file));
  parser.parse();
  cosDoc = parser.getDocument();
  pdfStripper = new PDFTextStripper();
origin: stackoverflow.com

/**
 * Downloads a sample PDF, extracts its text and asserts that it contains a known
 * sentence.
 *
 * @throws Exception if the download, parsing or text extraction fails
 */
public void ReadPDF() throws Exception {
 URL testUrl = new URL("http://www.axmag.com/download/pdfurl-guide.pdf");
 // Release the network stream even when parsing fails.
 try (BufferedInputStream testFile = new BufferedInputStream(testUrl.openStream())) {
  PDFParser testPdf = new PDFParser(testFile);
  testPdf.parse();
  try {
   String testText = new PDFTextStripper().getText(testPdf.getPDDocument());
   Assert.assertTrue(testText.contains("Open the setting.xml, you can see it is like this"));
  } finally {
   // The parsed document was previously never closed; closing the parser's
   // underlying document releases it whether or not the assertion passes.
   testPdf.getDocument().close();
  }
 }
 }
origin: stackoverflow.com

 // Open the PDF file as a stream and hand it to the parser.
 File in = new File("somefile.pdf");
InputStream fin = new FileInputStream(in);
PDFParser parser = new PDFParser(fin);
// tempDirectoryPath is defined outside this snippet.
// NOTE(review): presumably the directory the parser uses for scratch data - confirm.
parser.setTempDirectory(new File(tempDirectoryPath));
// parse() must run before the document can be obtained from the parser.
parser.parse();
// NOTE(review): nothing in this snippet closes the document; the code that
// consumes it should do so.
PDDocument document = parser.getPDDocument();
origin: stackoverflow.com

 // Parse the PDF at resourceLocation, then print selected metadata and the body text.
 InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
  parser.parse(input, textHandler, metadata);
} finally {
  // Close the stream even when parsing throws; previously a failure leaked it.
  input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());
origin: stackoverflow.com

 // Parse the PDF at resourceLocation, then print selected metadata and the body text.
 InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
  parser.parse(input, textHandler, metadata);
} finally {
  // Close the stream even when parsing throws; previously a failure leaked it.
  input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());
origin: stackoverflow.com

import java.io.InputStream;
import java.net.URL;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Command-line example that downloads a PDF from a fixed URL, parses it with the
 * Tika PDF parser and prints the extracted body text.
 */
public class URLReader {
  /**
   * Opens the URL, parses the PDF stream and prints the handler's text.
   *
   * @param args unused
   * @throws Exception if the URL cannot be opened or the document cannot be parsed
   */
  public static void main(String[] args) throws Exception {

    URL url = new URL("http://website.com/document.pdf");
    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    PDFParser pdfparser = new PDFParser();
    // The original passed an undeclared variable `is`; the stream has to be opened
    // from the URL, and is closed again once parsing is done.
    try (InputStream is = url.openStream()) {
      pdfparser.parse(is, contenthandler, metadata, new ParseContext());
    }

    System.out.println(contenthandler.toString());
  }
}
origin: stackoverflow.com

/**
 * Parses the given PDF file and, if the document is encrypted, opens its protection
 * with the fixed password {@code "password"}.
 *
 * <p>The document is not returned or stored, so it is closed before the method
 * returns.
 *
 * @param pdfFile the PDF file to open
 * @throws Exception if the file cannot be read, parsed or decrypted
 */
private void openPDFDoc(final File pdfFile) throws Exception {
   // Release the file handle even when the parser constructor or parse() fails.
   try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(pdfFile))) {
     PDFParser parser = new PDFParser(in);
     parser.parse();
     PDDocument originalPdfDoc = parser.getPDDocument();
     try {
       if (originalPdfDoc.isEncrypted()) {
         originalPdfDoc.openProtection(new StandardDecryptionMaterial("password"));
       }
     } finally {
       // Previously the document was never closed.
       originalPdfDoc.close();
     }
   }
 }
origin: org.apache.tika/tika-parsers

 /**
  * Spools the incoming stream to a file, runs the GROBID REST parser on that file,
  * and then runs the PDF parser on the same file with the same handler, metadata
  * and context.
  *
  * <p>The caller's {@code stream} is not closed here; only the resources this
  * method creates itself are released.
  *
  * @param stream document content
  * @param handler receives the parse events from both parsers
  * @param metadata passed to both parsers
  * @param context passed to both parsers
  * @throws IOException if the stream or the spooled file cannot be read
  * @throws SAXException propagated from the parsers
  * @throws TikaException propagated from the parsers
  */
 public void parse(InputStream stream, ContentHandler handler,
   Metadata metadata, ParseContext context) throws IOException,
   SAXException, TikaException {
  // Closing the TemporaryResources releases whatever temporary file was registered
  // with it while spooling; it was previously created inline and never disposed.
  try (TemporaryResources tmp = new TemporaryResources()) {
   TikaInputStream tis = TikaInputStream.get(stream, tmp);
   File tmpFile = tis.getFile();

   GrobidRESTParser grobidParser = new GrobidRESTParser();
   grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

   PDFParser parser = new PDFParser();
   // The file stream was previously opened inline and never closed.
   try (InputStream pdfStream = new FileInputStream(tmpFile)) {
    parser.parse(pdfStream, handler, metadata, context);
   }
  }
 }
}
origin: com.github.lafa.tikaNoExternal/tika-external

 /**
  * Spools the incoming stream to a file, runs the GROBID REST parser on that file,
  * and then runs the PDF parser on the same file with the same handler, metadata
  * and context.
  *
  * <p>The caller's {@code stream} is not closed here; only the resources this
  * method creates itself are released.
  *
  * @param stream document content
  * @param handler receives the parse events from both parsers
  * @param metadata passed to both parsers
  * @param context passed to both parsers
  * @throws IOException if the stream or the spooled file cannot be read
  * @throws SAXException propagated from the parsers
  * @throws TikaException propagated from the parsers
  */
 public void parse(InputStream stream, ContentHandler handler,
   Metadata metadata, ParseContext context) throws IOException,
   SAXException, TikaException {
  // Closing the TemporaryResources releases whatever temporary file was registered
  // with it while spooling; it was previously created inline and never disposed.
  try (TemporaryResources tmp = new TemporaryResources()) {
   TikaInputStream tis = TikaInputStream.get(stream, tmp);
   File tmpFile = tis.getFile();

   GrobidRESTParser grobidParser = new GrobidRESTParser();
   grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

   PDFParser parser = new PDFParser();
   // The file stream was previously opened inline and never closed.
   try (InputStream pdfStream = new FileInputStream(tmpFile)) {
    parser.parse(pdfStream, handler, metadata, context);
   }
  }
 }
}
origin: stackoverflow.com

 // Parse the PDF and print its text with "###" written between words.
 PDFParser pdfParser = new PDFParser(new FileInputStream("c:\\temp\\owgr49f2013.pdf"));
pdfParser.parse();
PDDocument pdDocument = pdfParser.getPDDocument();
try {
  PDFTextStripper stripper = new PDFTextStripper("UTF-8");
  stripper.setSortByPosition(false);
  stripper.setWordSeparator("###");
  System.out.println(stripper.getText(pdDocument));
} finally {
  // The document was previously never closed once the text had been printed.
  pdDocument.close();
}
origin: stackoverflow.com

parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
origin: graphaware/neo4j-nlp

  /**
   * Parses the given stream with the shared {@code pdfParser} (position sorting
   * switched on) and converts the handler's content map into {@link Page} objects.
   *
   * <p>One page is created per map entry, in the map's iteration order: the key is
   * passed to the {@code Page} constructor and the entry's strings become the
   * page's paragraphs.
   *
   * @param fs the document stream to parse
   * @param filterPatterns patterns handed to the {@code PageContentHandler}
   * @return the pages built from the handler's content
   * @throws Exception if parsing fails
   */
  @Override
  public List<Page> parse(InputStream fs, List<String> filterPatterns) throws Exception {
    PageContentHandler handler = new PageContentHandler(filterPatterns);
    pdfParser.setSortByPosition(true);
    pdfParser.parse(fs, handler, new Metadata(), new ParseContext());

    List<Page> pages = new ArrayList<>();
    for (Map.Entry<Integer, List<String>> entry : handler.getImprovedContent().entrySet()) {
      Page page = new Page(entry.getKey());
      page.getParagraphs().addAll(entry.getValue());
      pages.add(page);
    }
    return pages;
  }
}
origin: stackoverflow.com

parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
origin: stackoverflow.com

parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
origin: stackoverflow.com

parser.parse();
origin: stackoverflow.com

 /**
  * Extracts the text of a PDF file.
  *
  * <p>This is a best-effort helper: any failure is reported on standard error and
  * {@code null} is returned instead of throwing.
  *
  * @param file the PDF file to read
  * @return the extracted text, or {@code null} if the file could not be read or parsed
  */
 public static String pdftoText(File file) {
  String parsedText = null;
  PDDocument pdDoc = null;
  COSDocument cosDoc = null;
  // try-with-resources releases the file handle even when the parser constructor
  // or parse() fails.
  try (FileInputStream in = new FileInputStream(file)) {
    PDFParser parser = new PDFParser(in);
    parser.parse();
    cosDoc = parser.getDocument();
    PDFTextStripper pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    parsedText = pdfStripper.getText(pdDoc);
  } catch (Exception e) {
    // Keep the return-null contract, but do not hide the failure completely.
    System.err.println("Could not extract text from " + file + ": " + e);
  } finally {
    // Close each document in its own try so that a failure closing one does not
    // prevent the attempt to close the other.
    if (cosDoc != null) {
      try {
        cosDoc.close();
      } catch (Exception ignored) {
        // Nothing useful can be done about a failed close here.
      }
    }
    if (pdDoc != null) {
      try {
        pdDoc.close();
      } catch (Exception ignored) {
        // Nothing useful can be done about a failed close here.
      }
    }
  }
  return parsedText;
 }
origin: apache/ofbiz-framework

  /**
   * Posts to the "Fop" screen, checks that the response is a PDF, extracts its text
   * and verifies that the text contains no FreeMarker template error.
   *
   * @throws Exception if the request or the PDF parsing fails
   */
  public void testFopMacroLibrary() throws Exception {
    String screentextUrl = screenUrl.concat("Fop");
    HttpClient http = initHttpClient();
    http.setUrl(screentextUrl.concat(authentificationQuery));
    //FIXME need to check that the stream is an application/pdf that does not contain an ftl stack trace
    String screenOutString = "";
    // try-with-resources closes the response stream even when one of the assertions
    // below fails; a null resource is simply skipped on close.
    try (InputStream screenInputStream = http.postStream()) {
      assertNotNull("Response failed from ofbiz", screenInputStream);
      assertEquals("Response contentType isn't good : " + http.getResponseContentType(), "application/pdf;charset=UTF-8", http.getResponseContentType());

      BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
      Metadata metadata = new Metadata();
      new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext());
      screenOutString = handler.toString();
    }
    //Test if a ftl macro error is present
    assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail", screenOutString.contains("FreeMarker template error:"));
  }
}
org.apache.tika.parser.pdf.PDFParser.parse

Popular methods of PDFParser

  • <init>
  • addMetadata
  • decode
  • extractDublinCoreListItems
    This tries to read a list from a particular property in XMPSchemaDublinCore. If it can't find the in
  • extractMetadata
  • extractMultilingualItems
    Try to extract all multilingual items from the XMPSchema This relies on the property having a valid
  • getPassword
  • getXMPBagOrSeqList
    As of this writing, XMPSchema can contain bags or sequence lists for some attributes...despite stand
  • handleXFAOnly
  • loadDOM
  • shouldHandleXFAOnly
  • getDocument
  • shouldHandleXFAOnly,
  • getDocument,
  • getPDDocument,
  • setInitializableProblemHandler,
  • setOcrStrategy,
  • setSortByPosition,
  • setTempDirectory

Popular in Java

  • Reactive rest calls using spring rest template
  • findViewById (Activity)
  • scheduleAtFixedRate (ScheduledExecutorService)
  • getOriginalFilename (MultipartFile)
    Return the original filename in the client's filesystem. This may contain path information depending
  • FileInputStream (java.io)
    An input stream that reads bytes from a file. File file = ...finally if (in != null) in.clos
  • OutputStream (java.io)
    A writable sink for bytes. Most clients will use output streams that write data to the file system (
  • InetAddress (java.net)
    An Internet Protocol (IP) address. This can be either an IPv4 address or an IPv6 address, and in pra
  • SocketTimeoutException (java.net)
    This exception is thrown when a timeout expired on a socket read or accept operation.
  • Format (java.text)
    The base class for all formats. This is an abstract base class which specifies the protocol for clas
  • TreeMap (java.util)
    Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
  • From CI to AI: The AI layer in your organization
Tabnine Logo
  • Products

    Search for Java code | Search for JavaScript code
  • IDE Plugins

    IntelliJ IDEA | WebStorm | Visual Studio | Android Studio | Eclipse | Visual Studio Code | PyCharm | Sublime Text | PhpStorm | Vim | GoLand | RubyMine | Emacs | Jupyter Notebook | Jupyter Lab | Rider | DataGrip | AppCode
  • Company

    About Us | Contact Us | Careers
  • Resources

    FAQ | Blog | Tabnine Academy | Terms of use | Privacy policy | Java Code Index | Javascript Code Index
Get Tabnine for your IDE now