// Fragment (the enclosing method and the end of the try block are not visible in this excerpt):
// starts from MediaType.OCTET_STREAM and, when `type` is a byte[], asks `detector` to detect the
// media type from an in-memory stream over those bytes.
// NOTE(review): as shown, the statements after the first `return` are unreachable; presumably
// lines were elided from this excerpt. Confirm against the full source.
Metadata metadata = new Metadata(); MediaType mediaType = MediaType.OCTET_STREAM; try { if ( type instanceof byte[] ) { ByteArrayInputStream bais = new ByteArrayInputStream( ( byte[] ) type ); mediaType = detector.detect( bais, metadata ); return mediaType.toString(); fileMetadata.put( AssetUtils.CONTENT_TYPE, mediaType.toString() ); return mediaType.toString();
/**
 * Detects the media type of the given stream, using the resource name as a detection hint.
 *
 * <p>The stream is wrapped in a {@link BufferedInputStream}, and that wrapper (and therefore
 * the supplied stream) is closed before this method returns.
 *
 * @param is   stream whose content type should be detected
 * @param name resource name, recorded under {@code Metadata.RESOURCE_NAME_KEY}
 * @return the detected media type as a string, or {@code WILDCARD} when detection fails with
 *         an {@link IOException}
 */
public static String probeContentType(final InputStream is, final String name) {
    try (InputStream stream = new BufferedInputStream(is)) {
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        return getDefaultMimeTypes().detect(stream, metadata).toString();
    } catch (IOException e) {
        // The exception is passed as the trailing argument WITHOUT its own placeholder so the
        // logging backend records it as the throwable (with stack trace) instead of leaving a
        // dangling "{}" or flattening it to its toString().
        LOGGER.warn("Couldn't detect the media type of attachment {}", name, e);
        return WILDCARD;
    }
}
// Fragment (the enclosing method and the end of the catch block are cut off in this excerpt):
// seeds the metadata with the page's content type, then runs htmlParser over an in-memory
// stream of the page's content data.
// NOTE(review): the statements shown after `catch (Exception e) {` copy the title and meta tags
// into parsedData; the excerpt appears to have dropped the catch body and its closing brace.
// Confirm against the full source.
Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, page.getContentType()); try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) { htmlParser.parse(inputStream, contentHandler, metadata, parseContext); } catch (Exception e) { parsedData.setTitle(metadata.get(DublinCore.TITLE)); parsedData.setMetaTags(contentHandler.getMetaTags());
@Override public void process(final InputStream stream) throws IOException { try (final InputStream in = new BufferedInputStream(stream)) { TikaInputStream tikaStream = TikaInputStream.get(in); Metadata metadata = new Metadata(); if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename); } // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); } } });
// Fragment (several closing braces and the remainder of the method are missing from this excerpt):
// parses sourceStream with autoDetectParser into a Metadata object and closes the Tika stream in
// a finally block; the visible remainder iterates metadata.names(), skips keys that do not match
// the optional metadataKeyFilter pattern, and accumulates values in a StringBuilder, using ", "
// as a separator for multi-valued keys.
// NOTE(review): prefix, maxAttribs and maxAttribLen are not used in the visible part, and the
// Map return value is built outside this excerpt.
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata); } finally { tikaInputStream.close(); final Pattern metadataKeyFilter = metadataKeyFilterRef.get(); final StringBuilder dataBuilder = new StringBuilder(); for (final String key : metadata.names()) { if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) { continue; if (metadata.isMultiValued(key)) { for (String val : metadata.getValues(key)) { if (dataBuilder.length() > 1) { dataBuilder.append(", "); dataBuilder.append(metadata.get(key));
/**
 * Demonstrates the basic parse call on an {@code AutoDetectParser}: an empty in-memory stream
 * is parsed into a {@link DefaultHandler} with fresh metadata and parse context.
 *
 * @throws Exception if the parse call fails
 */
public static void useAutoDetectParser() throws Exception {
    final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    final ContentHandler sink = new DefaultHandler();
    final Metadata meta = new Metadata();
    final ParseContext parseContext = new ParseContext();

    final Parser autoDetect = new AutoDetectParser();
    autoDetect.parse(emptyInput, sink, meta, parseContext);
}
/**
 * Parses an empty stream through a {@code ForkParser} wrapping {@code ForkTestParser} and
 * checks both the extracted text and the reported content type.
 */
@Test
public void testHelloWorld() throws Exception {
    final ClassLoader loader = ForkParserTest.class.getClassLoader();
    try (ForkParser forked = new ForkParser(loader, new ForkTestParser())) {
        final Metadata meta = new Metadata();
        final ContentHandler body = new BodyContentHandler();
        final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);

        forked.parse(emptyInput, body, meta, new ParseContext());

        final String extracted = body.toString().trim();
        assertEquals("Hello, World!", extracted);
        assertEquals("text/plain", meta.get(Metadata.CONTENT_TYPE));
    }
}
private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, ParseContext context) throws TikaException {// throws IOException InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 try { stream = new ByteArrayInputStream(byteObject); htmlParser.parse(stream, handler, metadata, context); } catch (SAXException e) { throw new RuntimeException(e); } catch (IOException e) { // Pushback overflow from tagsoup } }
public void setBinaryContent(byte[] data) throws TransformerConfigurationException, TikaException, SAXException, IOException { InputStream inputStream = new ByteArrayInputStream(data); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING); AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context); // Hacking the following line to remove Tika's inserted DocType this.html = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace( "http://www.w3.org/1999/xhtml", ""); } catch (TransformerConfigurationException | TikaException | SAXException | IOException | RuntimeException e) { throw e; } }
// Fragment (branches and the end of the method are cut off in this excerpt):
// reads the part's Content-Type from its metadata and looks up an existing leaf parser for it
// (HTML and plain text are the visible comparisons), then passes the part's bytes to
// handleEmbedded and to parser.parse with a body-only embedded content handler.
// NOTE(review): as shown, the HtmlParser lookup is immediately overwritten by the RTFParser
// lookup, and handleEmbedded and parser.parse both appear unconditional; presumably
// else-if/else branches were dropped from the excerpt. Confirm against the full source.
private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException { String contentType = part.metadata.get(Metadata.CONTENT_TYPE); Parser parser = null; if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) { parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext); parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext); } else if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(contentType)) { parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(TXTParser.class, parseContext); try (TikaInputStream tis = TikaInputStream.get(part.bytes)) { handleEmbedded(tis, part.metadata); parser.parse( new ByteArrayInputStream(part.bytes), new EmbeddedContentHandler(new BodyContentHandler(handler)), new Metadata(), parseContext ); } catch (SAXException | TikaException e) {
/**
 * Demonstrates a {@code CompositeParser} configured by media type: HTML and XML each get a
 * dedicated parser, with a plain-text parser as the fallback. An empty stream declared as
 * {@code text/html} is then parsed.
 *
 * @throws Exception if the parse call fails
 */
public static void useCompositeParser() throws Exception {
    final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    final ContentHandler sink = new DefaultHandler();
    final ParseContext parseContext = new ParseContext();

    // Media type -> parser routing table
    final Map<MediaType, Parser> byType = new HashMap<MediaType, Parser>();
    byType.put(MediaType.parse("text/html"), new HtmlParser());
    byType.put(MediaType.parse("application/xml"), new XMLParser());

    final CompositeParser composite = new CompositeParser();
    composite.setParsers(byType);
    composite.setFallback(new TXTParser());

    // Declare the content type up front so the composite can route the document.
    final Metadata meta = new Metadata();
    meta.set(Metadata.CONTENT_TYPE, "text/html");

    composite.parse(emptyInput, sink, meta, parseContext);
}
// Fragment of a test (the declarations of p, context, metadata, handler and usedParsers are not
// visible here): performs the same parse of five fixed bytes three times, each time checking
// that the handler captured "Fell back!" and inspecting the "X-Parsed-By" metadata values; the
// second round also checks the embedded-exception and embedded-parser metadata entries.
// NOTE(review): `p` is not reassigned in the visible text although the expected parser counts
// differ between rounds; presumably the lines that rebuild the parser were dropped from the
// excerpt. Confirm against the full source.
metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues("X-Parsed-By"); assertEquals(1, usedParsers.length); assertEquals(DummyParser.class.getName(), usedParsers[0]); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues("X-Parsed-By"); assertEquals(2, usedParsers.length); assertEquals(ErrorParser.class.getName(), usedParsers[0]); assertNotNull(metadata.get(ParserUtils.EMBEDDED_EXCEPTION)); assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER)); assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER)); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues("X-Parsed-By"); assertEquals(2, usedParsers.length);
// Fragment (cut mid-call; the surrounding method is not visible): the visible tail converts a
// value to a string and, when isRichText(c) holds, parses it as UTF-8 HTML with htmlParser and
// forwards the extracted text to `handler`.
// NOTE(review): the handleEmbeddedResource(...) call is truncated in this excerpt, and the body
// of the SAXException catch is not shown.
byte[] bytes = (byte[])obj; handleEmbeddedResource( TikaInputStream.get(bytes), String v = toString(obj, c.getType()); if (isRichText(c)) { BodyContentHandler h = new BodyContentHandler(); Metadata m = new Metadata(); m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); try { htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), h, m, parseContext); handler.characters(h.toString()); } catch (SAXException e) {
// Fragment of a test (starts mid-statement; the construction of `p` begins outside this
// excerpt): checks that p reports exactly two supported types including text/plain, then parses
// the same five fixed bytes twice, expecting "Fell back!" the first time and an empty handler
// the second time.
// NOTE(review): nothing visible changes between the two parse calls, so presumably the lines
// that alter the metadata or parser were dropped from the excerpt. Confirm against the full source.
MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); ParseContext context = new ParseContext(); BodyContentHandler handler; Metadata metadata; Set<MediaType> types = p.getSupportedTypes(context); assertEquals(2, types.size()); assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN)); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("", handler.toString());
public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them // to the underlying Handler. PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } String[] numbers = metadata.getValues("phonenumbers"); Collections.addAll(phoneNumbers, numbers); } }
// Parse a single MP3 file with Tika's auto-detecting parser, then print the extracted body
// text and two date-related metadata fields.
Parser parser = new AutoDetectParser();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// try-with-resources guarantees the file handle is released even if parsing fails.
try (InputStream is = new FileInputStream("/home/rahul/Music/03 - I Like Your Music.mp3")) {
    parser.parse(is, handler, metadata, new ParseContext());
}
// Use a distinct name for the extracted text: redeclaring `handler` as a String in the same
// scope is a duplicate-variable compile error.
String handlerText = handler.toString();
System.out.println("Handler data: " + handlerText);
System.out.println(metadata.get(Metadata.CREATION_DATE));
System.out.println(metadata.get(Metadata.LAST_MODIFIED));
/**
 * Extracts text from an outer document together with all documents embedded in it.
 * The essential step is registering a {@link Parser} in the {@link ParseContext}.
 *
 * @return content, including from embedded documents
 * @throws IOException   propagated from the parser or from closing the stream
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    final AutoDetectParser recursiveParser = new AutoDetectParser();
    final BodyContentHandler text = new BodyContentHandler();
    final Metadata meta = new Metadata();

    // Register the parser itself in the context so it is available for embedded documents.
    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, recursiveParser);

    try (InputStream in = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        recursiveParser.parse(in, text, meta, parseContext);
        final String extracted = text.toString();
        return extracted;
    }
}
/**
 * Parses the given stream with {@code parser}, using a {@code MyEmbeddedDocumentExtractor}
 * constructed with {@code outputDir} to handle embedded documents.
 *
 * @param is        document stream to parse
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   propagated from the parser
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
    final Metadata meta = new Metadata();
    final ContentHandler body = new BodyContentHandler(-1);

    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);
    // The extractor receives the context after the parser has been registered in it.
    final EmbeddedDocumentExtractor embeddedExtractor =
            new MyEmbeddedDocumentExtractor(outputDir, parseContext);
    parseContext.set(EmbeddedDocumentExtractor.class, embeddedExtractor);

    parser.parse(is, body, meta, parseContext);
}
/**
 * Creates a reader for the text content of the given binary stream.
 * <p>
 * Delegates to the four-argument constructor with a new {@link AutoDetectParser}, a new
 * {@link Metadata} and a new {@link ParseContext}, then registers {@code parser} in
 * {@code context} under {@code Parser.class}.
 *
 * @param stream binary stream
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream) throws IOException { this(new AutoDetectParser(), stream, new Metadata(), new ParseContext()); context.set(Parser.class, parser); }
/**
 * Example of extracting the plain text of the contents.
 * Only the "body" part of the document is returned.
 *
 * @return the text collected by the body content handler
 * @throws IOException   propagated from the parser or from closing the stream
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    final BodyContentHandler bodyText = new BodyContentHandler();
    final AutoDetectParser autoDetect = new AutoDetectParser();
    final Metadata meta = new Metadata();

    try (InputStream in = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoDetect.parse(in, bodyText, meta);
        final String plainText = bodyText.toString();
        return plainText;
    }
}