private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { dataBuilder.append(metadata.get(key));
/**
 * Parses a directory listing read from standard input with a
 * {@code DirListParser} and prints two figures taken from the resulting
 * metadata: the number of {@code "Filename"} values and the
 * {@code "NumExecutables"} value.
 *
 * @param args command-line arguments; not used
 * @throws IOException   if reading the input fails
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if the listing cannot be parsed
 */
public static void main(String[] args) throws IOException, SAXException, TikaException {
    final DirListParser listingParser = new DirListParser();
    final Metadata listingMetadata = new Metadata();
    listingParser.parse(System.in, new BodyContentHandler(), listingMetadata);

    final int fileCount = listingMetadata.getValues("Filename").length;
    System.out.println("Num files: " + fileCount);

    final String executableCount = listingMetadata.get("NumExecutables");
    System.out.println("Num executables: " + executableCount);
}
@Test public void testSomeParams() throws Exception { //test that a parameterized parser can read a config file //with only some changes to the initial values Metadata md = getMetadata("TIKA-1986-some-parameters.xml"); assertEquals("-6.0", md.get("xdouble")); assertEquals("testparamval", md.get("testparam")); assertEquals("false", md.get("xbool")); }
/**
 * Test for the <code>get(String)</code> method: an unknown name yields
 * {@code null}, and once values are added the first one added is the one
 * returned.
 */
@Test
public void testGet() {
    final String name = "a-name";
    final Metadata metadata = new Metadata();

    // Nothing has been stored under the name yet.
    assertNull(metadata.get(name));

    metadata.add(name, "value-1");
    assertEquals("value-1", metadata.get(name));

    // Adding a second value must not change what get() returns.
    metadata.add(name, "value-2");
    assertEquals("value-1", metadata.get(name));
}
@Test public void testTextBasic() throws IOException { ExtractReader extractReader = new ExtractReader(); List<Metadata> metadataList = extractReader.loadExtract(testTxtFile); assertEquals(1, metadataList.size()); Metadata m = metadataList.get(0); assertEquals(1, m.getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length); assertEquals("the quick brown fox fox fox jumped over the lazy lazy dog\n", m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); //test that the mime is inferred from the file extension assertEquals("application/msword", m.get(Metadata.CONTENT_TYPE)); }
private static void assertExtractEquals(Extract extractA, Extract extractB) { //this currently only checks the basics //might want to add more checks assertEquals("number of embedded files", extractA.metadataList.size(), extractB.metadataList.size()); for (int i = 0; i < extractA.metadataList.size(); i++) { assertEquals("number of metadata elements in attachment: " + i, extractA.metadataList.get(i).size(), extractB.metadataList.get(i).size()); assertEquals("content in attachment: " + i, extractA.metadataList.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT), extractB.metadataList.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } }
// Installs an empty media-type-to-parser map on the parser, records the file's name in the
// metadata under RESOURCE_NAME_KEY, parses the stream with a plain DefaultHandler, and then
// prints the Content-Type value found in the metadata after parsing.
// NOTE(review): within this fragment stream.close() is not reached if parse(...) throws;
// confirm whether surrounding code (not visible here) closes the stream on failure.
parser.setParsers(new HashMap<MediaType, Parser>()); Metadata metadata = new Metadata(); metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName()); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); stream.close(); String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE); System.out.println(mimeType);
// Parses a fixed MP3 file with an AutoDetectParser, then prints the text collected by the
// body content handler followed by the CREATION_DATE and LAST_MODIFIED metadata values.
InputStream is = new FileInputStream("/home/rahul/Music/03 - I Like Your Music.mp3");
Parser parser = new AutoDetectParser();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try {
    parser.parse(is, handler, metadata, new ParseContext());
} finally {
    // Release the file handle whether or not parsing succeeded.
    is.close();
}

// The extracted text needs its own name: redeclaring "handler" as a String in the same
// scope as the BodyContentHandler does not compile.
String handlerText = handler.toString();
System.out.println("Handler data: " + handlerText);
System.out.println(metadata.get(Metadata.CREATION_DATE));
System.out.println(metadata.get(Metadata.LAST_MODIFIED));
private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException { String contentType = part.metadata.get(Metadata.CONTENT_TYPE); Parser parser = null; if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) { parser.parse( new ByteArrayInputStream(part.bytes), new EmbeddedContentHandler(new BodyContentHandler(handler)), new Metadata(), parseContext ); } catch (SAXException | TikaException e) {
String name = md.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name != null && name.length() > 0) { setTitle("Apache Tika: " + name); -1); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true);
/**
 * Builds a {@code TrecDocument} for a file from its extracted text and two
 * metadata values: the resource name and the creation date.
 *
 * @param file the file to extract text and metadata from
 * @return a document built from the resource name, the extracted text and
 *         the creation date
 * @throws FileNotFoundException if {@code file} cannot be opened
 * @throws IOException           if reading the file fails
 * @throws TikaException         if text extraction fails
 */
public TrecDocument summarize(File file) throws FileNotFoundException, IOException, TikaException {
    final Tika facade = new Tika();
    final Metadata extracted = new Metadata();

    // NOTE(review): the stream is not closed in this method; presumably parseToString
    // closes it - confirm against the Tika facade Javadoc.
    final FileInputStream source = new FileInputStream(file);
    final String text = facade.parseToString(source, extracted);

    // NOTE(review): nothing in this method stores a resource name in the metadata before
    // parsing - confirm that the value read below is actually populated.
    final String resourceName = extracted.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    return new TrecDocument(resourceName, text, extracted.getDate(TikaCoreProperties.CREATED));
}
/**
 * Loads metadata for the decorated parameterized config and checks that
 * every key in the {@code expcted} map (declared outside this method) has
 * the expected value in that metadata.
 */
@Test
public void testConfigurableParserTypesDecorated() throws Exception {
    final Metadata parsed = getMetadata("TIKA-1986-parameterized-decorated.xml");
    for (Map.Entry<String, String> expectation : expcted.entrySet()) {
        final String failureMessage = "mismatch for " + expectation.getKey();
        final String wanted = expectation.getValue();
        final String found = parsed.get(expectation.getKey());
        assertEquals(failureMessage, wanted, found);
    }
}
/**
 * Opens (and immediately closes) a {@code TikaInputStream} for the
 * {@code test.txt} resource and checks that doing so recorded the resource
 * name and the content length in the supplied metadata.
 */
@Test
public void testGetMetadata() throws Exception {
    final URL resource = TikaInputStreamTest.class.getResource("test.txt");
    final Metadata recorded = new Metadata();
    TikaInputStream.get(resource, recorded).close();

    final String resourceName = recorded.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    assertEquals("test.txt", resourceName);

    // The expected length is the size of the resource file on disk.
    final long sizeOnDisk = Files.size(Paths.get(resource.toURI()));
    assertEquals(Long.toString(sizeOnDisk), recorded.get(Metadata.CONTENT_LENGTH));
}
/**
 * Loads the JSON extract three times, once per reader configuration, and
 * checks the number of metadata objects and the content of each:
 * <ul>
 *   <li>default reader: two objects, "fox" in the first and "attachment" in
 *       the second;</li>
 *   <li>{@code FIRST_ONLY}: one object containing "fox" but not
 *       "attachment";</li>
 *   <li>{@code CONCATENATE_CONTENT_INTO_FIRST}: one object containing both
 *       "fox" and "attachment".</li>
 * </ul>
 */
@Test
public void testBasic() throws Exception {
    // Default reader.
    ExtractReader reader = new ExtractReader();
    List<Metadata> extracted = reader.loadExtract(testJsonFile);
    assertEquals(2, extracted.size());

    Metadata first = extracted.get(0);
    Metadata second = extracted.get(1);
    assertEquals(1, first.getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
    assertEquals(1, second.getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
    assertContains("fox", first.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
    assertContains("attachment", second.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));

    // Reader configured with FIRST_ONLY.
    reader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY);
    extracted = reader.loadExtract(testJsonFile);
    assertEquals(1, extracted.size());

    first = extracted.get(0);
    assertEquals(1, first.getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
    assertContains("fox", first.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
    assertNotContained("attachment", first.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));

    // Reader configured with CONCATENATE_CONTENT_INTO_FIRST.
    reader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST);
    extracted = reader.loadExtract(testJsonFile);
    assertEquals(1, extracted.size());

    first = extracted.get(0);
    assertEquals(1, first.getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
    assertContains("fox", first.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
    assertContains("attachment", first.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
}
/**
 * Waits for the server to start, PUTs the recursive test document to the
 * metadata endpoint asking for JSON, and checks the returned metadata list:
 * its size, the application recorded for the first entry, and a phrase in
 * the content of the seventh entry.
 */
private void testBaseline() throws Exception {
    awaitServerStartup();

    final WebClient client = WebClient
            .create(endPoint + META_PATH)
            .accept("application/json");
    final Response response =
            client.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));

    final InputStream entity = (InputStream) response.getEntity();
    final Reader json = new InputStreamReader(entity, UTF_8);
    final List<Metadata> results = JsonMetadataList.fromJson(json);

    assertEquals(12, results.size());

    final Metadata container = results.get(0);
    assertEquals("Microsoft Office Word", container.get(OfficeOpenXMLExtended.APPLICATION));

    final Metadata seventh = results.get(6);
    assertContains("plundered our seas", seventh.get("X-TIKA:content"));
}
// Closing brace kept from the original line; presumably it ends the enclosing class.
}
public static void main(String[] args) { String fileLocation = "C:/desktopname/songs/song.mp3"; try { InputStream input = new FileInputStream(new File(fileLocation)); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); Parser parser = new Mp3Parser(); ParseContext parseCtx = new ParseContext(); parser.parse(input, handler, metadata, parseCtx); input.close(); // List all metadata String[] metadataNames = metadata.names(); for(String name : metadataNames){ System.out.println(name + ": " + metadata.get(name)); } // Retrieve the necessary info from metadata // Names - title, xmpDM:artist etc. - mentioned below may differ based System.out.println("----------------------------------------------"); System.out.println("Title: " + metadata.get("title")); System.out.println("Artists: " + metadata.get("xmpDM:artist")); System.out.println("Composer : "+metadata.get("xmpDM:composer")); System.out.println("Genre : "+metadata.get("xmpDM:genre")); System.out.println("Album : "+metadata.get("xmpDM:album")); } catch (Exception e) { } }
public class tikaExample { public static void main(String[] args) throws SAXException, TikaException { InputStream is = null; try { is = new BufferedInputStream(new FileInputStream(new File("/home/rahul/Downloads/darknet5.doc"))); Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(is, handler, metadata, new ParseContext()); System.out.println("creation date "+metadata.get(Metadata.CREATION_DATE)); System.out.println("last modify date "+metadata.get(Metadata.LAST_MODIFIED)); } catch (IOException e) { e.printStackTrace(); }
/**
 * Runs {@code parse} on the stream with a fresh {@code Metadata} instance
 * and turns each resulting metadata entry into a {@code RecognisedObject}:
 * the entry name is passed as the first and third constructor arguments,
 * the second is always {@code "eng"}, and the entry value parsed as a
 * double is the confidence.
 *
 * <p>The {@code metadata} argument is not referenced in this method body.
 *
 * @return one recognised object per metadata name produced by the parse
 * @throws NumberFormatException if a metadata value is not a parsable double
 */
@Override
public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final Metadata scores = new Metadata();
    parse(stream, handler, scores, context);

    final List<RecognisedObject> recognised = new ArrayList<>();
    final String[] labels = scores.names();
    for (int i = 0; i < labels.length; i++) {
        final String label = labels[i];
        final double confidence = Double.parseDouble(scores.get(label));
        recognised.add(new RecognisedObject(label, "eng", label, confidence));
    }
    return recognised;
}
// Closing brace kept from the original line; presumably it ends the enclosing class.
}
// Creates a body content handler and an empty Metadata object, then reads "Content-Type"
// from that metadata into contentType (declared outside this fragment).
// NOTE(review): nothing in this fragment populates the metadata or uses the handler before
// the lookup - confirm whether a parse call belongs between these statements.
BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); contentType = metadata.get("Content-Type");