/**
 * Translates an output type into the corresponding
 * {@link BasicContentHandlerFactory.HANDLER_TYPE} and wraps it in a new factory.
 *
 * <p>HTML, XML and TEXT map to the handler type of the same name, TEXT_MAIN maps to
 * BODY, and everything else (including METADATA) maps to IGNORE. The second
 * constructor argument is always -1.
 *
 * @param type the requested output type; {@code equals} is invoked on it, so it must
 *             not be {@code null}
 * @return a new {@link BasicContentHandlerFactory} for the resolved handler type
 */
private ContentHandlerFactory getContentHandlerFactory(OutputType type) {
    final BasicContentHandlerFactory.HANDLER_TYPE resolved;
    if (type.equals(HTML)) {
        resolved = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
    } else if (type.equals(XML)) {
        resolved = BasicContentHandlerFactory.HANDLER_TYPE.XML;
    } else if (type.equals(TEXT)) {
        resolved = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
    } else if (type.equals(TEXT_MAIN)) {
        resolved = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
    } else {
        // METADATA and any unlisted output type produce no content handler output.
        resolved = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
    }
    return new BasicContentHandlerFactory(resolved, -1);
}
private void usage() {
/**
 * Creates a content handler that writes to {@code os} using the named character encoding.
 *
 * <p>The name is resolved with {@link Charset#forName(String)} and the call is then
 * delegated to the {@code Charset}-based overload. {@code Charset.forName} signals a
 * {@code null}, malformed or unsupported name with unchecked
 * {@link IllegalArgumentException} subclasses; those are translated here into the
 * checked exception this method declares, so callers that handle
 * {@link UnsupportedEncodingException} see every bad-encoding case.
 *
 * @param os       the stream handed to the {@code Charset}-based overload
 * @param encoding the charset name to resolve
 * @return the handler produced by the {@code Charset}-based overload
 * @throws UnsupportedEncodingException if {@code encoding} is {@code null}, is not a
 *         legal charset name, or names a charset that is not available
 */
@Override
public ContentHandler getNewContentHandler(OutputStream os, String encoding)
        throws UnsupportedEncodingException {
    final Charset charset;
    try {
        charset = Charset.forName(encoding);
    } catch (IllegalArgumentException e) {
        // Only the name lookup is guarded, so an IllegalArgumentException raised by the
        // delegate itself is never mislabelled as an encoding problem.
        UnsupportedEncodingException translated =
                new UnsupportedEncodingException(String.valueOf(encoding));
        translated.initCause(e);
        throw translated;
    }
    return getNewContentHandler(os, charset);
}
@Override public ContentHandlerFactory build(Node node, Map<String, String> runtimeAttributes) { Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes); BasicContentHandlerFactory.HANDLER_TYPE type = null; String handlerTypeString = attributes.get("basicHandlerType"); type = BasicContentHandlerFactory.parseHandlerType(handlerTypeString, BasicContentHandlerFactory.HANDLER_TYPE.TEXT); int writeLimit = -1; String writeLimitString = attributes.get("writeLimit"); if (writeLimitString != null) { try { writeLimit = Integer.parseInt(attributes.get("writeLimit")); } catch (NumberFormatException e) { //swallow and default to -1 //TODO: should we throw a RuntimeException? } } return new BasicContentHandlerFactory(type, writeLimit); }
@Test public void testIgnore() throws Exception { Parser p = new MockParser(OVER_DEFAULT); ContentHandler handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1).getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); //unfortunatley, the DefaultHandler does not return "", assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); //tests that no write limit exception is thrown p = new MockParser(100); handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5).getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); }
sb.append("json"); } else if (contentHandlerFactory instanceof BasicContentHandlerFactory) { appendSuffix(((BasicContentHandlerFactory) contentHandlerFactory).getType(), sb);
BasicContentHandlerFactory.HANDLER_TYPE.HTML; ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof ToXMLContentHandler); p.parse(null, handler, null, null); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler);
private static List<Metadata> getRecursiveMetadata(InputStream is, Parser parser, ParseContext parseContext) throws Exception { //different from parent TikaTest in that this extracts text. //can't extract xhtml because "tmp" file names wind up in //content's metadata and they'll differ by file. parseContext = new ParseContext(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), -1); parser.parse(is, handler, new Metadata(), parseContext); return handler.getMetadataList(); }
BasicContentHandlerFactory.HANDLER_TYPE.BODY; ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof BodyContentHandler); assertWriteLimitReached(p, (BodyContentHandler)handler); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof BodyContentHandler); p.parse(null, handler, null, null); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler);
private MetadataList parseMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info, String handlerTypeName) throws Exception { final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); Parser parser = TikaResource.createParser(); // TODO: parameterize choice of max chars/max embedded attachments RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); TikaResource.fillMetadata(parser, metadata, context, httpHeaders); // no need to add parser to parse recursively TikaResource.fillParseContext(context, httpHeaders, null); TikaResource.logRequest(LOG, info, metadata); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(type, -1), -1); TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context); /* We used to have this non-functional bit of code...refactor to add it back and make it work? new LanguageHandler() { public void endDocument() { metadata.set("language", getLanguage().getLanguage()); } }, */ return new MetadataList(handler.getMetadataList()); }
/**
 * Creates a content handler that writes to {@code os} using the named character encoding.
 *
 * <p>The name is resolved with {@link Charset#forName(String)} and the call is then
 * delegated to the {@code Charset}-based overload. {@code Charset.forName} signals a
 * {@code null}, malformed or unsupported name with unchecked
 * {@link IllegalArgumentException} subclasses; those are translated here into the
 * checked exception this method declares, so callers that handle
 * {@link UnsupportedEncodingException} see every bad-encoding case.
 *
 * @param os       the stream handed to the {@code Charset}-based overload
 * @param encoding the charset name to resolve
 * @return the handler produced by the {@code Charset}-based overload
 * @throws UnsupportedEncodingException if {@code encoding} is {@code null}, is not a
 *         legal charset name, or names a charset that is not available
 */
@Override
public ContentHandler getNewContentHandler(OutputStream os, String encoding)
        throws UnsupportedEncodingException {
    final Charset charset;
    try {
        charset = Charset.forName(encoding);
    } catch (IllegalArgumentException e) {
        // Only the name lookup is guarded, so an IllegalArgumentException raised by the
        // delegate itself is never mislabelled as an encoding problem.
        UnsupportedEncodingException translated =
                new UnsupportedEncodingException(String.valueOf(encoding));
        translated.initCause(e);
        throw translated;
    }
    return getNewContentHandler(os, charset);
}
/**
 * Parses {@code is} with an {@link AutoDetectParser} wrapped in a
 * {@link RecursiveParserWrapper} and returns the metadata list collected by a handler
 * that uses XML content handlers.
 *
 * @param is                the stream to parse
 * @param context           the parse context passed to the wrapper
 * @param metadata          the metadata object passed to the wrapper
 * @param suppressException when {@code true}, an exception thrown while parsing is
 *                          dropped and whatever the handler holds is returned; when
 *                          {@code false}, the exception is rethrown
 * @return the handler's metadata list
 * @throws Exception the parse failure, unless {@code suppressException} is set
 */
protected List<Metadata> getRecursiveMetadata(InputStream is, ParseContext context,
                                              Metadata metadata,
                                              boolean suppressException) throws Exception {
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    try {
        recursiveParser.parse(is, collector, metadata, context);
    } catch (Exception parseFailure) {
        // Propagate unless the caller explicitly asked for failures to be ignored.
        if (!suppressException) {
            throw parseFailure;
        }
    }
    return collector.getMetadataList();
}
BasicContentHandlerFactory.HANDLER_TYPE.HTML; ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof ToHTMLContentHandler); p.parse(null, handler, null, null); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler);
/**
 * Parses the test resource {@code "/test-documents/" + filePath} with
 * {@code parserToWrap} wrapped in a {@link RecursiveParserWrapper}, collecting content
 * through XML content handlers.
 *
 * @param filePath     resource name relative to {@code /test-documents/}
 * @param parserToWrap the parser handed to the {@link RecursiveParserWrapper}
 * @param parseContext the parse context passed to the wrapper
 * @return the metadata list held by the handler after parsing
 * @throws Exception whatever opening the resource or parsing throws
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
                                              ParseContext parseContext) throws Exception {
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    final String resourcePath = "/test-documents/" + filePath;
    try (InputStream stream = getResourceAsStream(resourcePath)) {
        recursiveParser.parse(stream, collector, new Metadata(), parseContext);
    }
    return collector.getMetadataList();
}
BasicContentHandlerFactory.HANDLER_TYPE.TEXT; ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof ToTextContentHandler); p.parse(null, handler, null, null); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler);
RecursiveParserWrapperHandler recursiveParserWrapperHandler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1), -1);
/**
 * Parses the test resource {@code "/test-documents/" + filePath} with
 * {@code parserToWrap} wrapped in a {@link RecursiveParserWrapper}, using content
 * handlers of the requested type. A fresh {@link Metadata} and a fresh
 * {@link ParseContext} are supplied for the parse.
 *
 * @param filePath     resource name relative to {@code /test-documents/}
 * @param parserToWrap the parser handed to the {@link RecursiveParserWrapper}
 * @param handlerType  the handler type given to the {@link BasicContentHandlerFactory}
 * @return the metadata list held by the handler after parsing
 * @throws Exception whatever opening the resource or parsing throws
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
                                              BasicContentHandlerFactory.HANDLER_TYPE handlerType)
        throws Exception {
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(handlerType, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(handlerFactory);

    final String resourcePath = "/test-documents/" + filePath;
    try (InputStream stream = getResourceAsStream(resourcePath)) {
        recursiveParser.parse(stream, collector, new Metadata(), new ParseContext());
    }
    return collector.getMetadataList();
}
/**
 * Parses the test resource {@code "/test-documents/" + filePath} with an
 * {@link AutoDetectParser} wrapped in a {@link RecursiveParserWrapper}, collecting
 * content through XML content handlers.
 *
 * @param filePath resource name relative to {@code /test-documents/}
 * @param context  the parse context passed to the wrapper
 * @return the metadata list held by the handler after parsing
 * @throws Exception whatever opening the resource or parsing throws
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context)
        throws Exception {
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    final String resourcePath = "/test-documents/" + filePath;
    try (InputStream stream = getResourceAsStream(resourcePath)) {
        recursiveParser.parse(stream, collector, new Metadata(), context);
    }
    return collector.getMetadataList();
}
/**
 * Verifies that {@code EmbeddedDocumentUtil.tryToFindExistingLeafParser} can locate the
 * {@code TXTParser} when the parser registered in the {@link ParseContext} is a
 * {@link RecursiveParserWrapper} around the parser produced by
 * {@code DigestingAutoDetectParserFactory}.
 */
@Test
public void testDoublyDecorated() {
    Parser digestingParser =
            new DigestingAutoDetectParserFactory().getParser(TikaConfig.getDefaultConfig());
    BasicContentHandlerFactory textFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(digestingParser, textFactory);

    ParseContext ctx = new ParseContext();
    ctx.set(Parser.class, wrapper);

    Parser leaf = EmbeddedDocumentUtil.tryToFindExistingLeafParser(
            org.apache.tika.parser.txt.TXTParser.class, ctx);

    assertNotNull(leaf);
    assertEquals(org.apache.tika.parser.txt.TXTParser.class, leaf.getClass());
}
}
SAXException, TikaException { Parser p = new AutoDetectParser(); ContentHandlerFactory factory = new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig())); RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer( queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), mockOSFactory);