/**
 * Example of wiring a {@link CompositeParser} by hand.
 *
 * <p>Registers an {@code HtmlParser} under {@code text/html} and an
 * {@code XMLParser} under {@code application/xml}, installs a
 * {@code TXTParser} as the fallback, then parses an empty in-memory stream
 * whose metadata declares the content type {@code text/html}.
 *
 * @throws Exception if building the parser or parsing the stream fails
 */
public static void useCompositeParser() throws Exception {
    // Media type -> parser registrations handed to the composite parser.
    Map<MediaType, Parser> registry = new HashMap<MediaType, Parser>();
    registry.put(MediaType.parse("text/html"), new HtmlParser());
    registry.put(MediaType.parse("application/xml"), new XMLParser());

    CompositeParser composite = new CompositeParser();
    composite.setParsers(registry);
    composite.setFallback(new TXTParser());

    // Declare the document's content type up front via the metadata.
    Metadata docMetadata = new Metadata();
    docMetadata.set(Metadata.CONTENT_TYPE, "text/html");

    // Zero-length input and a no-op SAX handler are enough for the demonstration.
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    ContentHandler sink = new DefaultHandler();
    ParseContext parseContext = new ParseContext();

    composite.parse(emptyInput, sink, docMetadata, parseContext);
}
// Run each parser against an empty stream and check the metadata keys
// that are expected to read "True" afterwards.
// NOTE(review): the same `metadata` instance is passed to every call below, so a
// key written by an earlier parse may still be present when a later assertion
// runs — confirm the assertions are not satisfied by stale values.

// First round: canonical parser, then alias parser.
canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Canonical"));

alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Alias"));

// Second round: the same two parsers and the same checks, repeated.
canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Canonical"));

alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Alias"));

// Finally the `both` parser; only "BMP" and "Alias" are asserted here
// ("Canonical" is not checked).
both.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Alias"));
super.parse(tis, sch, metadata, context); } catch (SAXException e) {
super.parse(tis, sch, metadata, context); } catch (final SAXException e) {
super.parse(tis, sch, metadata, context); } catch (final SAXException e) {
super.parse(tis, sch, metadata, context); } catch (final SAXException e) {
compositeParser.parse(tis, sch, metadata, context); } catch (SAXException e) {
public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp); // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata); metadata.set(Metadata.CONTENT_TYPE, type.toString()); // TIKA-216: Zip bomb prevention SecureContentHandler sch = handler != null ? new SecureContentHandler(handler, tis) : null; try { // Parse the document super.parse(tis, sch, metadata, context); } catch (SAXException e) { // Convert zip bomb exceptions to TikaExceptions sch.throwIfCauseOf(e); throw e; } catch (final Error error) { throw new TikaErrorException("[TikaErrorException] " + error.toString() + "; message=" + error.getMessage() + "; stackTace=" + error.getStackTrace()); } } finally { tmp.dispose(); } }
super.parse(tis, sch, metadata, context); } catch (final SAXException e) {
/**
 * Parses the given stream and returns its HTML rendering together with the
 * converted metadata.
 *
 * <p>The {@code source} value is stored in the metadata under
 * {@code TikaMetadataKeys.RESOURCE_NAME_KEY} before parsing. The generated
 * HTML is passed through {@code postProcess} before being wrapped in a
 * {@code DefaultExtraction}.
 *
 * @param stream the content to parse
 * @param source value recorded as the resource name in the metadata
 * @return the extraction holding the post-processed HTML and converted metadata
 * @throws ExtractionException if parsing fails with an {@code IOException},
 *         {@code SAXException}, {@code TikaException} or
 *         {@code NullPointerException}; the original exception is the cause
 */
@Override
public Extraction parse(final InputStream stream, final String source) throws ExtractionException {
    final Metadata meta = new Metadata();
    meta.set(TikaMetadataKeys.RESOURCE_NAME_KEY, source);

    final ToHTMLContentHandler htmlSink = new ToHTMLContentHandler();
    try {
        // Build the handler chain inside the try block so that a failure while
        // constructing it is wrapped the same way as a failure while parsing.
        final NoHeadTagInBodyContentHandler bodyFilter = new NoHeadTagInBodyContentHandler(htmlSink);
        final XHTMLContentHandler xhtml = new XHTMLContentHandler(bodyFilter, meta);
        parser.parse(stream, xhtml, meta, context);
    } catch (IOException | SAXException | TikaException | NullPointerException failure) {
        throw new ExtractionException("Failed to parse stream", failure);
    }

    final String html = postProcess(meta, htmlSink.toString());
    return new DefaultExtraction(html, convertMetadata(meta));
}
super.parse(tis, sch, metadata, context); } catch (SAXException e) {