/**
 * Parses the resource at {@code url} with a {@link RecursiveParserWrapper} and writes
 * the metadata list collected by the handler to {@code output} as JSON.
 *
 * @param url    location of the document to parse
 * @param output stream that receives the JSON; it is flushed but not closed here
 * @throws IOException   if reading the input or writing the JSON fails
 * @throws SAXException  if raised during the parse
 * @throws TikaException if raised during the parse
 */
private void handleRecursiveJson(URL url, OutputStream output)
        throws IOException, SAXException, TikaException {
    final Metadata containerMetadata = new Metadata();
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parser);
    // -1 is passed as the handler's second constructor argument, as in the other call sites
    final RecursiveParserWrapperHandler collector =
            new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1);

    try (InputStream stream = TikaInputStream.get(url, containerMetadata)) {
        recursiveParser.parse(stream, collector, containerMetadata, context);
    }

    JsonMetadataList.setPrettyPrinting(prettyPrint);
    final Writer jsonOut = getOutputWriter(output, encoding);
    try {
        JsonMetadataList.toJson(collector.getMetadataList(), jsonOut);
    } finally {
        // Flush even when serialization fails; the underlying stream is left open.
        jsonOut.flush();
    }
}
/**
 * Called when the main (container) document has been parsed. After delegating to the
 * superclass and to {@code addContent}, a clone of the metadata is inserted at index 0
 * of the metadata list.
 *
 * @param contentHandler content handler used on the main document
 * @param metadata       metadata from the main document
 * @throws SAXException if raised while finishing the document
 */
@Override
public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
    super.endDocument(contentHandler, metadata);
    addContent(contentHandler, metadata);
    // Clone only after addContent has run, then put the container's entry first.
    final Metadata containerSnapshot = ParserUtils.cloneMetadata(metadata);
    metadataList.add(0, containerSnapshot);
}
} else if (object instanceof RecursiveParserWrapperHandler) { resources.add(new RecursiveMetadataContentHandlerResource((RecursiveParserWrapperHandler) object)); object = new RecursiveMetadataContentHandlerProxy(n, ((RecursiveParserWrapperHandler)object).getContentHandlerFactory()); } else if (object instanceof ContentHandler && ! (object instanceof AbstractRecursiveParserWrapperHandler)) {
/**
 * Clears the last parser state by replacing it with a new {@code ParserState} built
 * around a new {@link RecursiveParserWrapperHandler}.
 *
 * @deprecated use a {@link org.apache.tika.sax.RecursiveParserWrapperHandler} instead
 * @throws IllegalStateException if there is no last parse state to clear; presumably
 *          this is the case when a {@link RecursiveParserWrapperHandler} was passed to
 *          {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
 */
@Deprecated
public void reset() {
    if (lastParseState == null) {
        throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead");
    }
    lastParseState = new ParserState(
            new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources));
}
/**
 * Returns the metadata list held by the handler of the last parse state.
 * <p>
 * The first element in the returned list represents the data from the outer
 * container file. There is no guarantee about the ordering of the list after that.
 *
 * @deprecated use a {@link RecursiveParserWrapperHandler} instead
 *
 * @return list of Metadata objects that were gathered during the parse
 * @throws IllegalStateException if you've used a {@link RecursiveParserWrapperHandler} in your last
 *          call to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
 */
@Deprecated
public List<Metadata> getMetadata() {
    if (lastParseState == null) {
        throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead");
    }
    final RecursiveParserWrapperHandler legacyHandler =
            (RecursiveParserWrapperHandler) lastParseState.recursiveParserWrapperHandler;
    return legacyHandler.getMetadataList();
}
parserState = new ParserState((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler); } else { parserState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources)); lastParseState = parserState;
/**
 * Returns the metadata list held by the handler of the last parse state.
 * <p>
 * The first element in the returned list represents the data from the outer
 * container file. There is no guarantee about the ordering of the list after that.
 *
 * @deprecated use a {@link RecursiveParserWrapperHandler} instead
 *
 * @return list of Metadata objects that were gathered during the parse
 * @throws IllegalStateException if you've used a {@link RecursiveParserWrapperHandler} in your last
 *          call to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
 */
@Deprecated
public List<Metadata> getMetadata() {
    if (lastParseState == null) {
        throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead");
    }
    final RecursiveParserWrapperHandler legacyHandler =
            (RecursiveParserWrapperHandler) lastParseState.recursiveParserWrapperHandler;
    return legacyHandler.getMetadataList();
}
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); ParseContext context = new ParseContext(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1); try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { wrapper.parse(stream, handler, metadata, context); return handler.getMetadataList();
/**
 * Clears the last parser state by replacing it with a new {@code ParserState} built
 * around a new {@link RecursiveParserWrapperHandler}.
 *
 * @deprecated use a {@link org.apache.tika.sax.RecursiveParserWrapperHandler} instead
 * @throws IllegalStateException if there is no last parse state to clear; presumably
 *          this is the case when a {@link RecursiveParserWrapperHandler} was passed to
 *          {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
 */
@Deprecated
public void reset() {
    if (lastParseState == null) {
        throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead");
    }
    lastParseState = new ParserState(
            new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources));
}
/**
 * Called after an embedded document has been parsed. After delegating to the
 * superclass and to {@code addContent}, a clone of the metadata is appended to the
 * end of the metadata list.
 *
 * @param contentHandler local content handler used on the embedded document
 * @param metadata       metadata from the embedded document
 * @throws SAXException if raised while finishing the embedded document
 */
@Override
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
        throws SAXException {
    super.endEmbeddedDocument(contentHandler, metadata);
    addContent(contentHandler, metadata);
    // Clone only after addContent has run, so the stored copy reflects that call.
    final Metadata embeddedSnapshot = ParserUtils.cloneMetadata(metadata);
    metadataList.add(embeddedSnapshot);
}
} else if (object instanceof RecursiveParserWrapperHandler) { resources.add(new RecursiveMetadataContentHandlerResource((RecursiveParserWrapperHandler) object)); object = new RecursiveMetadataContentHandlerProxy(n, ((RecursiveParserWrapperHandler)object).getContentHandlerFactory()); } else if (object instanceof ContentHandler && ! (object instanceof AbstractRecursiveParserWrapperHandler)) {
List<Metadata> metadataList = null; Metadata containerMetadata = fileResource.getMetadata(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, -1); try { parse(fileResource.getResourceId(), parser, is, handler, thrown = t; } finally { metadataList = handler.getMetadataList(); IOUtils.closeQuietly(is);
parserState = new ParserState((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler); } else { parserState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources)); lastParseState = parserState;
/**
 * Called when the main (container) document has been parsed. After delegating to the
 * superclass and to {@code addContent}, a clone of the metadata is inserted at index 0
 * of the metadata list.
 *
 * @param contentHandler content handler used on the main document
 * @param metadata       metadata from the main document
 * @throws SAXException if raised while finishing the document
 */
@Override
public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
    super.endDocument(contentHandler, metadata);
    addContent(contentHandler, metadata);
    // Clone only after addContent has run, then put the container's entry first.
    final Metadata containerSnapshot = ParserUtils.cloneMetadata(metadata);
    metadataList.add(0, containerSnapshot);
}
private static List<Metadata> getRecursiveMetadata(InputStream is, Parser parser, ParseContext parseContext) throws Exception { //different from parent TikaTest in that this extracts text. //can't extract xhtml because "tmp" file names wind up in //content's metadata and they'll differ by file. parseContext = new ParseContext(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), -1); parser.parse(is, handler, new Metadata(), parseContext); return handler.getMetadataList(); }
/**
 * Called after an embedded document has been parsed. After delegating to the
 * superclass and to {@code addContent}, a clone of the metadata is appended to the
 * end of the metadata list.
 *
 * @param contentHandler local content handler used on the embedded document
 * @param metadata       metadata from the embedded document
 * @throws SAXException if raised while finishing the embedded document
 */
@Override
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
        throws SAXException {
    super.endEmbeddedDocument(contentHandler, metadata);
    addContent(contentHandler, metadata);
    // Clone only after addContent has run, so the stored copy reflects that call.
    final Metadata embeddedSnapshot = ParserUtils.cloneMetadata(metadata);
    metadataList.add(embeddedSnapshot);
}
/**
 * Parses {@code is} with an {@link AutoDetectParser} wrapped in a
 * {@link RecursiveParserWrapper}, collecting XML content, and returns the metadata
 * list gathered by the handler.
 *
 * @param is                stream to parse; not closed by this method
 * @param context           parse context handed to the wrapper
 * @param metadata          metadata object handed to the wrapper
 * @param suppressException when {@code true}, an exception from the parse is swallowed
 *                          and whatever the handler gathered so far is returned
 * @return the metadata list gathered by the handler
 * @throws Exception the parse failure, when {@code suppressException} is {@code false}
 */
protected List<Metadata> getRecursiveMetadata(InputStream is, ParseContext context,
                                              Metadata metadata, boolean suppressException)
        throws Exception {
    final Parser autoDetect = new AutoDetectParser();
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect);
    final BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    final RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    try {
        recursiveParser.parse(is, collector, metadata, context);
    } catch (Exception parseFailure) {
        if (!suppressException) {
            throw parseFailure;
        }
        // Suppressed on request: fall through and return the partial results.
    }
    return collector.getMetadataList();
}
/**
 * Parses the test document {@code "/test-documents/" + filePath} with
 * {@code parserToWrap} wrapped in a {@link RecursiveParserWrapper}, collecting XML
 * content, and returns the metadata list gathered by the handler.
 *
 * @param filePath     path of the document relative to {@code /test-documents/}
 * @param parserToWrap parser to wrap for the recursive parse
 * @param parseContext parse context handed to the wrapper
 * @return the metadata list gathered by the handler
 * @throws Exception if opening or parsing the resource fails
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
                                              ParseContext parseContext) throws Exception {
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    final BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    final RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    try (InputStream stream = getResourceAsStream("/test-documents/" + filePath)) {
        recursiveParser.parse(stream, collector, new Metadata(), parseContext);
    }
    return collector.getMetadataList();
}
new RecursiveParserWrapperHandler( new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1), StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true); JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), jsonBuffer); setText(json, jsonBuffer.toString());
/**
 * Parses the test document {@code "/test-documents/" + filePath} with
 * {@code parserToWrap} wrapped in a {@link RecursiveParserWrapper}, using content
 * handlers of the requested type, and returns the metadata list gathered by the handler.
 * A new {@link Metadata} and a new {@link ParseContext} are used for the parse.
 *
 * @param filePath     path of the document relative to {@code /test-documents/}
 * @param parserToWrap parser to wrap for the recursive parse
 * @param handlerType  handler type passed to the {@link BasicContentHandlerFactory}
 * @return the metadata list gathered by the handler
 * @throws Exception if opening or parsing the resource fails
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
                                              BasicContentHandlerFactory.HANDLER_TYPE handlerType)
        throws Exception {
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    final BasicContentHandlerFactory typedFactory = new BasicContentHandlerFactory(handlerType, -1);
    final RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(typedFactory);

    try (InputStream stream = getResourceAsStream("/test-documents/" + filePath)) {
        recursiveParser.parse(stream, collector, new Metadata(), new ParseContext());
    }
    return collector.getMetadataList();
}