org.apache.tika.metadata.Metadata java code examples

Refine search

Metadata metadata = new Metadata();
MediaType mediaType = MediaType.OCTET_STREAM;
try {
  if ( type instanceof byte[] ) {
    ByteArrayInputStream bais = new ByteArrayInputStream( ( byte[] ) type );
    mediaType = detector.detect( bais, metadata );
    return mediaType.toString();
  fileMetadata.put( AssetUtils.CONTENT_TYPE, mediaType.toString() );
return mediaType.toString();

/**
 * Detects the media type of an attachment from its content and its name.
 *
 * <p>The supplied stream is wrapped in a {@link BufferedInputStream}, which is closed
 * (together with the wrapped stream) before this method returns.
 *
 * @param is   attachment content
 * @param name attachment name; stored in the detection metadata under
 *             {@code Metadata.RESOURCE_NAME_KEY}
 * @return the detected media type rendered as a string, or {@code WILDCARD} when an
 *         {@link IOException} occurs during detection
 */
public static String probeContentType(final InputStream is, final String name) {
  try (InputStream buffered = new BufferedInputStream(is)) {
    final Metadata hints = new Metadata();
    hints.set(Metadata.RESOURCE_NAME_KEY, name);
    final String detected = getDefaultMimeTypes().detect(buffered, hints).toString();
    return detected;
  } catch (IOException e) {
    // Detection is best-effort: log the failure and fall back to the wildcard type.
    LOGGER.warn("Couldn't detect the media type of attachment {} {}", name, e);
    return WILDCARD;
  }
}

Metadata metadata = new Metadata();
  metadata.add(Metadata.CONTENT_TYPE, page.getContentType());
try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
  htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
} catch (Exception e) {
parsedData.setTitle(metadata.get(DublinCore.TITLE));
parsedData.setMetaTags(contentHandler.getMetaTags());

  /**
   * Detects the MIME type of the supplied content and stores its string form in
   * {@code mimeTypeRef}.
   *
   * <p>When a filename is available and the {@code USE_FILENAME_IN_DETECTION} property is
   * enabled, the filename is added to the detection metadata as the resource name.
   *
   * @param stream content to inspect
   * @throws IOException if reading the content or closing a stream fails
   */
  @Override
  public void process(final InputStream stream) throws IOException {
    // The TikaInputStream is declared as a resource so that it is closed too, not just the
    // stream it wraps; closing it is how Tika releases anything it allocated while detecting
    // (see the TikaInputStream documentation).
    try (final InputStream in = new BufferedInputStream(stream);
        final TikaInputStream tikaStream = TikaInputStream.get(in)) {
      final Metadata metadata = new Metadata();
      if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
      }
      // Get mime type
      final MediaType mediatype = detector.detect(tikaStream, metadata);
      mimeTypeRef.set(mediatype.toString());
    }
  }
});

private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs,
                    Integer maxAttribLen) throws IOException, TikaException, SAXException {
  final Metadata metadata = new Metadata();
  final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream);
  try {
    autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
  } finally {
    tikaInputStream.close();
  final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
  final StringBuilder dataBuilder = new StringBuilder();
  for (final String key : metadata.names()) {
    if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
      continue;
    if (metadata.isMultiValued(key)) {
      for (String val : metadata.getValues(key)) {
        if (dataBuilder.length() > 1) {
          dataBuilder.append(", ");
      dataBuilder.append(metadata.get(key));

/**
 * Demonstrates the basic {@link AutoDetectParser} call: an empty in-memory stream is parsed
 * with a {@link DefaultHandler}, a fresh {@link Metadata} and a fresh {@link ParseContext}.
 *
 * @throws Exception if the parse call fails
 */
public static void useAutoDetectParser() throws Exception {
  Parser autoDetect = new AutoDetectParser();
  Metadata meta = new Metadata();
  ContentHandler sink = new DefaultHandler();
  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  autoDetect.parse(emptyInput, sink, meta, new ParseContext());
}

/**
 * Parses an empty stream through a {@link ForkParser} that wraps a {@code ForkTestParser}
 * and checks both the extracted text and the reported content type.
 */
@Test
public void testHelloWorld() throws Exception {
  ClassLoader loader = ForkParserTest.class.getClassLoader();
  try (ForkParser forked = new ForkParser(loader, new ForkTestParser())) {
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    forked.parse(emptyInput, body, meta, new ParseContext());

    String text = body.toString().trim();
    assertEquals("Hello, World!", text);
    assertEquals("text/plain", meta.get(Metadata.CONTENT_TYPE));
  }
}

/**
 * Parses an in-memory page with the given parser, forwarding body content to {@code xhtml}.
 *
 * <p>The target handler is wrapped in a {@link BodyContentHandler} and then an
 * {@link EmbeddedContentHandler} before being handed to the parser. Parsing is best-effort
 * with respect to I/O problems: an {@link IOException} is deliberately ignored (the original
 * author attributes it to a pushback overflow in tagsoup), so the page may be only partially
 * forwarded in that case.
 *
 * @param byteObject raw bytes of the page
 * @param htmlParser parser used to process the bytes
 * @param xhtml      handler that receives the body content
 * @param context    parse context passed through to the parser
 * @throws TikaException    if the parser reports a Tika-level failure
 * @throws RuntimeException wrapping any {@link SAXException} raised while parsing
 */
private void parsePage(byte[] byteObject, Parser htmlParser,
            ContentHandler xhtml, ParseContext context) throws TikaException {
  Metadata metadata = new Metadata();
  ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
  // try-with-resources replaces the nullable, never-closed stream local.
  try (InputStream stream = new ByteArrayInputStream(byteObject)) {
    htmlParser.parse(stream, handler, metadata, context);
  } catch (SAXException e) {
    throw new RuntimeException(e);
  } catch (IOException ignored) {
    // Pushback overflow from tagsoup
  }
}

/**
 * Parses the given binary content and stores the serialized result in {@code html}.
 *
 * <p>The bytes are parsed by {@code AUTO_DETECT_PARSER} into a {@link TransformerHandler}
 * obtained from {@code getTransformerHandler}, which writes to an in-memory buffer. The
 * buffer is decoded with {@code DEFAULT_ENCODING}, and every occurrence of the XHTML
 * namespace URI is removed from the text before it is stored.
 *
 * @param data raw bytes of the document
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws TikaException if parsing fails
 * @throws SAXException  if the handler rejects the generated events
 * @throws IOException   if reading or decoding the content fails
 */
public void setBinaryContent(byte[] data)
      throws TransformerConfigurationException, TikaException, SAXException, IOException {
  // No catch block: the previous one only rethrew what it caught, so exceptions propagate
  // exactly as before. try-with-resources is kept solely to close the input stream.
  try (InputStream inputStream = new ByteArrayInputStream(data)) {
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    TransformerHandler handler =
      getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
    AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context);
    // Hack: strip the XHTML namespace URI that ends up in the serialized output.
    this.html = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace(
      "http://www.w3.org/1999/xhtml", "");
  }
}

private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
  String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
  Parser parser = null;
  if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
    parser =
        EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
    parser =
        EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
  } else if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(contentType)) {
    parser =
        EmbeddedDocumentUtil.tryToFindExistingLeafParser(TXTParser.class, parseContext);
    try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
      handleEmbedded(tis, part.metadata);
      parser.parse(
          new ByteArrayInputStream(part.bytes),
          new EmbeddedContentHandler(new BodyContentHandler(handler)),
          new Metadata(), parseContext
      );
    } catch (SAXException | TikaException e) {

/**
 * Demonstrates a hand-assembled {@link CompositeParser}.
 *
 * <p>An {@link HtmlParser} is registered for {@code text/html}, an {@link XMLParser} for
 * {@code application/xml}, and a {@link TXTParser} is set as the fallback. The metadata
 * declares the content type {@code text/html} before an empty stream is parsed.
 *
 * @throws Exception if the parse call fails
 */
public static void useCompositeParser() throws Exception {
  Map<MediaType, Parser> byType = new HashMap<>();
  byType.put(MediaType.parse("text/html"), new HtmlParser());
  byType.put(MediaType.parse("application/xml"), new XMLParser());

  CompositeParser composite = new CompositeParser();
  composite.setParsers(byType);
  composite.setFallback(new TXTParser());

  Metadata meta = new Metadata();
  meta.set(Metadata.CONTENT_TYPE, "text/html");

  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  composite.parse(emptyInput, new DefaultHandler(), meta, new ParseContext());
}

metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(1, usedParsers.length);
assertEquals(DummyParser.class.getName(), usedParsers[0]);
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(2, usedParsers.length);
assertEquals(ErrorParser.class.getName(), usedParsers[0]);
assertNotNull(metadata.get(ParserUtils.EMBEDDED_EXCEPTION));
assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER));
assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER));
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(2, usedParsers.length);

  byte[] bytes = (byte[])obj;
  handleEmbeddedResource(
      TikaInputStream.get(bytes),
String v = toString(obj, c.getType());
if (isRichText(c)) {
  BodyContentHandler h = new BodyContentHandler();
  Metadata m = new Metadata();
  m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
  try {
    htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
        h,
        m, parseContext);
    handler.characters(h.toString());
  } catch (SAXException e) {

    MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;
Set<MediaType> types = p.getSupportedTypes(context);
assertEquals(2, types.size());
assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("", handler.toString());

  /**
   * Parses the file at {@code path} and appends every value stored under the
   * {@code "phonenumbers"} metadata key to {@code phoneNumbers}.
   *
   * @param path file to parse
   * @throws Exception if the file cannot be opened or the parse call fails
   */
  public static void process(Path path) throws Exception {
    Metadata meta = new Metadata();
    // The PhoneExtractingContentHandler will examine any characters for phone numbers
    // before passing them to the underlying Handler.
    PhoneExtractingContentHandler phoneHandler =
        new PhoneExtractingContentHandler(new BodyContentHandler(), meta);
    Parser autoDetect = new AutoDetectParser();
    try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
      autoDetect.parse(in, phoneHandler, meta, new ParseContext());
    }
    Collections.addAll(phoneNumbers, meta.getValues("phonenumbers"));
  }
}

 // Parse an MP3 file with the auto-detecting parser, then print the extracted text and
 // two date-related metadata values.
 Parser parser = new AutoDetectParser();
 BodyContentHandler handler = new BodyContentHandler();
 Metadata metadata = new Metadata();

 // try-with-resources closes the file even when parsing throws.
 try (InputStream is = new FileInputStream("/home/rahul/Music/03 - I Like Your Music.mp3")) {
   parser.parse(is, handler, metadata, new ParseContext());
 }

 // Stored under a distinct name: reusing "handler" for the String redeclares the variable
 // above and does not compile.
 String content = handler.toString();
 System.out.println("Handler data: " + content);
 System.out.println(metadata.get(Metadata.CREATION_DATE));
 System.out.println(metadata.get(Metadata.LAST_MODIFIED));

/**
 * Extracts content from the outer document and from all embedded documents.
 *
 * <p>The key step is registering a {@link Parser} in the {@link ParseContext}; here the
 * same {@link AutoDetectParser} that parses the outer document is registered.
 *
 * @return content, including from embedded documents
 * @throws IOException   propagated from the parse call or from closing the resource stream
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
  AutoDetectParser recursing = new AutoDetectParser();
  ParseContext ctx = new ParseContext();
  ctx.set(Parser.class, recursing);

  BodyContentHandler text = new BodyContentHandler();
  try (InputStream in = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
    recursing.parse(in, text, new Metadata(), ctx);
  }
  return text.toString();
}

/**
 * Parses the given stream with a custom embedded-document extractor installed.
 *
 * <p>The parse context carries both the {@code parser} field (under {@code Parser.class})
 * and a {@code MyEmbeddedDocumentExtractor} built from {@code outputDir} and that same
 * context. Text goes to a {@link BodyContentHandler} created with a write limit of
 * {@code -1} and is not returned by this method.
 *
 * @param is        document to parse
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 * @throws IOException   propagated from the parse call
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
  ParseContext parseContext = new ParseContext();
  // The parser must be registered before the extractor is built, since the extractor
  // receives this context in its constructor.
  parseContext.set(Parser.class, parser);
  EmbeddedDocumentExtractor embeddedExtractor =
      new MyEmbeddedDocumentExtractor(outputDir, parseContext);
  parseContext.set(EmbeddedDocumentExtractor.class, embeddedExtractor);

  ContentHandler unlimitedBody = new BodyContentHandler(-1);
  parser.parse(is, unlimitedBody, new Metadata(), parseContext);
}

/**
 * Creates a reader for the text content of the given binary stream.
 * <p>
 * Delegates to the four-argument constructor with a new {@link AutoDetectParser},
 * a new {@link Metadata} and a new {@link ParseContext}, and then registers
 * {@code parser} in {@code context} under {@code Parser.class}.
 *
 * @param stream binary stream
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream) throws IOException {
  this(new AutoDetectParser(), stream, new Metadata(), new ParseContext());
  // Done after delegation because the context only exists once this(...) has run.
  context.set(Parser.class, parser);
}

/**
 * Example of extracting the plain text of a document.
 * Only the "body" part of the document is returned.
 *
 * @return the text collected by the {@link BodyContentHandler}
 * @throws IOException   propagated from the parse call or from closing the resource stream
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
  AutoDetectParser autoDetect = new AutoDetectParser();
  BodyContentHandler body = new BodyContentHandler();
  try (InputStream in = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
    autoDetect.parse(in, body, new Metadata());
  }
  return body.toString();
}

Javadoc

A multi-valued metadata container.

Most used methods

<init>
Constructs a new, empty metadata.
set
Sets the values of the identified metadata property.
get
Returns the value (if any) of the identified metadata property.
add
Add a metadata property/value mapping. Add the specified value to the list of values associated to the specified metadata name.
names
Returns an array of the names contained in the metadata.
getValues
Get the values associated to a metadata name.
isMultiValued
Returns true if named value is multivalued.
toString
size
Returns the number of metadata names in this metadata.
getDate
Returns the value of the identified Date based metadata property.
getInt
Returns the value of the identified Integer based metadata property.
remove
Remove a metadata and all its associated values.

Popular in Java

Reading from database using SQL prepared statement
getApplicationContext (Context)
setRequestProperty (URLConnection)
compareTo (BigDecimal)
SocketTimeoutException (java.net)
This exception is thrown when a timeout expired on a socket read or accept operation.
Set (java.util)
A Set is a data structure which does not allow duplicate elements.
Stack (java.util)
Stack is a Last-In/First-Out(LIFO) data structure which represents a stack of objects. It enables u
JarFile (java.util.jar)
JarFile is used to read jar entries and their associated data from jar files.
Font (java.awt)
The Font class represents fonts, which are used to render text in a visible way. A font provides the
JTable (javax.swing)
Best IntelliJ plugins

How to use Metadata in org.apache.tika.metadata

Best Java code snippets using org.apache.tika.metadata.Metadata (Showing top 20 results out of 1,305)

Refine search

How to use
Metadata
in
org.apache.tika.metadata