public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them // to the underlying Handler. PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } String[] numbers = metadata.getValues("phonenumbers"); Collections.addAll(phoneNumbers, numbers); } }
/**
 * Example of extracting the plain text of the contents.
 * Only the "body" part of the document is returned.
 *
 * @return the text accumulated by the body handler after parsing the
 *         {@code test.doc} class-path resource
 * @throws IOException if reading or closing the resource stream fails
 * @throws SAXException propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    BodyContentHandler bodyText = new BodyContentHandler();
    AutoDetectParser autoDetect = new AutoDetectParser();
    Metadata ignoredMetadata = new Metadata();

    try (InputStream in = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoDetect.parse(in, bodyText, ignoredMetadata);
    }
    return bodyText.toString();
}
String v = toString(obj, c.getType()); if (isRichText(c)) { BodyContentHandler h = new BodyContentHandler(); Metadata m = new Metadata(); m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); try { htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), h, m, parseContext); handler.characters(h.toString()); } catch (SAXException e) {
// Feeds the same five-byte input to parser `p` three times, each time with a
// fresh Metadata and BodyContentHandler but the same ParseContext, and checks
// that the handler text is "Fell back!" on every run.
ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;

// First run: additionally verifies that exactly one "X-Parsed-By" value was
// recorded and that it is DummyParser's class name.
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(1, usedParsers.length);
assertEquals(DummyParser.class.getName(), usedParsers[0]);

// Second run: "X-Parsed-By" is read but not asserted on within this excerpt.
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");

// Third run: same as the second.
// NOTE(review): assertions on usedParsers for runs two and three may have been
// elided from this excerpt — confirm against the full test.
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); ParseContext context = new ParseContext(); BodyContentHandler handler; Metadata metadata; Set<MediaType> types = p.getSupportedTypes(context); assertEquals(2, types.size()); assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN)); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("", handler.toString());
ByteArrayInputStream bais = new ByteArrayInputStream(content); org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata(); if (StringUtils.isNotBlank(httpCT)) { md.set(org.apache.tika.metadata.Metadata.CONTENT_TYPE, httpCT); md.set(org.apache.tika.metadata.Metadata.RESOURCE_NAME_KEY, _url.getFile()); } catch (MalformedURLException e1) { ContentHandler textHandler = new BodyContentHandler(-1); TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, tika.getParser()); parseContext.set(HtmlMapper.class, (HtmlMapper) HTMLMapperClass.newInstance()); } catch (Exception e) { tika.getParser().parse(bais, teeHandler, md, parseContext); text = textHandler.toString(); } catch (Throwable e) {
// Copy the message's display sender and recipients into the metadata; the
// sender is stored both as CREATOR and as MESSAGE_FROM.
String from = msg.getDisplayFrom();
metadata.set(TikaCoreProperties.CREATOR, from);
metadata.set(Metadata.MESSAGE_FROM, from);
metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

// Parse `data` with a new HtmlParser, forwarding body content to `xhtml`.
// A throwaway Metadata and ParseContext are used for this nested parse.
HtmlParser htmlParser = new HtmlParser();
htmlParser.parse(
        new ByteArrayInputStream(data),
        new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
        new Metadata(), new ParseContext()
);
doneBody = true;

// Parse the RTF chunk's bytes with a new RTFParser into the same `xhtml` sink.
// NOTE(review): as shown, both the HTML and RTF bodies are parsed
// unconditionally; the guarding conditions were presumably elided from this
// excerpt — confirm against the full method.
RTFParser rtfParser = new RTFParser();
rtfParser.parse(
        new ByteArrayInputStream(rtf.getData()),
        new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
        new Metadata(), new ParseContext());
doneBody = true;
metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass())); } catch (ChunkNotFoundException e){} metadata.set(TikaCoreProperties.TITLE, subject); metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic()); htmlParser = new HtmlParser(); htmlParser.parse( new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext ); rtfParser = new RTFParser(); rtfParser.parse( new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext); doneBody = true;
/**
 * Parses a fixed sample {@code .msg} file and prints each metadata name with
 * its value, sorted by name.
 *
 * @param args ignored
 * @throws Exception if the file cannot be opened or the parse call fails
 */
public static void main(String[] args) throws Exception {
    File file = new File("/Users/jason/docstore/example_received_regular.msg");

    AutoDetectParser parser = new AutoDetectParser();
    // Per the Tika Javadoc, a write limit of -1 disables the character limit.
    BodyContentHandler handler = new BodyContentHandler(-1);
    Metadata tikaMetadata = new Metadata();

    // try-with-resources: the stream was previously never closed, leaking the
    // file handle (and any temporary resources the TikaInputStream holds).
    try (InputStream input = TikaInputStream.get(file, tikaMetadata)) {
        parser.parse(input, handler, tikaMetadata, new ParseContext());
    }

    String[] names = tikaMetadata.names();
    Arrays.sort(names);
    for (String name : names) {
        System.out.println(name + ": " + tikaMetadata.get(name));
    }
}
// Parses a fixed MP3 file with an auto-detecting parser, then prints the text
// collected by the body handler and two date-related metadata values.
InputStream is = new FileInputStream("/home/rahul/Music/03 - I Like Your Music.mp3");
Parser parser = new AutoDetectParser();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try {
    parser.parse(is, handler, metadata, new ParseContext());
} finally {
    // Release the file handle even when parsing fails; it was never closed before.
    is.close();
}

// Named handlerText because redeclaring `handler` as a String in the same
// scope (as the original did) does not compile.
String handlerText = handler.toString();
System.out.println("Handler data: " + handlerText);
System.out.println(metadata.get(Metadata.CREATION_DATE));
System.out.println(metadata.get(Metadata.LAST_MODIFIED));
@Override public Single<Map<String, String>> getMetadata(InputStream ins) { return Single.create(sub -> { Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { parser.parse(ins, handler, metadata, context); Map<String, String> map = new HashMap<>(); String[] metadataNames = metadata.names(); for (String name : metadataNames) { map.put(name, metadata.get(name)); } sub.onSuccess(map); } catch (Exception e) { sub.onError(e); } // ins.close(); }); }
return; //we need not to process plain text! final ParseContext context = new ParseContext(); context.set(Parser.class,parser); Set<MediaType> supproted = parser.getSupportedTypes(context); if(supproted.contains(plainMediaType)) { final InputStream in; in = mtas.in; final Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString()); metadata.set(Metadata.CONTENT_ENCODING, charset); final ContentHandler textHandler = new BodyContentHandler( //only the Body new PlainTextHandler(plainTextWriter, false,skipLinebreaks)); //skip ignoreable final ToXMLContentHandler xhtmlHandler;
/**
 * Parses standard input with a {@code DirListParser} and prints two figures
 * taken from the resulting metadata: the number of {@code "Filename"} values
 * and the value stored under {@code "NumExecutables"}.
 *
 * @param args ignored
 * @throws IOException propagated from the parse call
 * @throws SAXException propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public static void main(String[] args) throws IOException, SAXException, TikaException {
    DirListParser dirListParser = new DirListParser();
    Metadata listing = new Metadata();
    BodyContentHandler discardedBody = new BodyContentHandler();

    dirListParser.parse(System.in, discardedBody, listing);

    int fileCount = listing.getValues("Filename").length;
    System.out.println("Num files: " + fileCount);
    System.out.println("Num executables: " + listing.get("NumExecutables"));
}
// Parse `in`, sending body text to `writer` and collecting document metadata.
BodyContentHandler handler = new BodyContentHandler(writer);
Metadata meta = new Metadata();
ParseContext context = new ParseContext();
try {
    parser.parse(in, handler, meta, context);
} finally {
    // Close in `finally` so the stream is released even when parsing throws;
    // previously a failed parse skipped the close call.
    in.close();
}

// Fold selected metadata fields (title, keywords, subject, author) into the
// extraction result items.
combineContentItem(meta.get(DublinCore.TITLE), I_CmsExtractionResult.ITEM_TITLE, content, contentItems);
combineContentItem(meta.get(MSOffice.KEYWORDS), I_CmsExtractionResult.ITEM_KEYWORDS, content, contentItems);
combineContentItem(meta.get(DublinCore.SUBJECT), I_CmsExtractionResult.ITEM_SUBJECT, content, contentItems);
combineContentItem(meta.get(MSOffice.AUTHOR), I_CmsExtractionResult.ITEM_AUTHOR, content, contentItems);
BodyContentHandler ch = new BodyContentHandler(woh); Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType()); if (blob.getName() != null) { metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName()); ParseContext parseContext = new ParseContext(); tikaParser.parse(is, ch, metadata, parseContext); } catch (Throwable t) { if (woh.isWriteLimitReached(t)) { String text = ch.toString(); if (text.length() > 0) { result.add(text);
/**
 * Parses the content at {@code url} with a {@code PDFParser} and returns the
 * metadata populated by the parse. The body text is collected into a handler
 * that is then discarded.
 *
 * @param url location of the document to parse
 * @return the metadata filled in during parsing
 * @throws IOException if the URL stream cannot be opened, read, or closed
 * @throws SAXException propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // try-with-resources: the URL stream was previously opened inline and
    // never closed, leaking the underlying connection.
    try (java.io.InputStream stream = url.openStream()) {
        parser.parse(stream, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
/**
 * Parses {@code stream} and returns its body text together with its metadata.
 *
 * <p>Body text is captured by a {@code BodyContentHandler} constructed with
 * {@code MAX_CHARACTERS}, wrapped in a {@code BoilerpipeContentHandler} that
 * uses {@code KeepEverythingExtractor}. Metadata is seeded via
 * {@code createMetadata(fileName, contentType)}.
 *
 * @param stream the content to parse
 * @param fileName passed to {@code createMetadata}
 * @param contentType passed to {@code createMetadata}
 * @return the parsed text and a name-to-value metadata map, or {@code null}
 *         when parsing fails with an I/O, SAX, or Tika exception (the failure
 *         is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipeHandler, metadata, parseContext);

        String[] propertyNames = metadata.names();
        Map<String, String> properties = new HashMap<String, String>();
        for (int i = 0; i < propertyNames.length; i++) {
            String key = propertyNames[i];
            properties.put(key, metadata.get(key));
        }

        String bodyText = bodyHandler.toString();
        return new ParsedData(bodyText, properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
// Record the file name in the metadata, then print the type the registry
// detects from that metadata alone (the stream argument to detect is null).
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");

// Set the content type to `type` before parsing.
// NOTE(review): presumably a hint for the parser — confirm how `type` was obtained.
metadata.set(Metadata.CONTENT_TYPE, type.toString());

// Parse the stream, collecting body text in `handler`.
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, new ParseContext());
// Create a parse context and register `parser` in it under Parser.class.
final ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, parser);

// First parse: body text goes to `writer`, metadata to `metadata`.
parser.parse(in, new BodyContentHandler(writer), metadata, parseContext);

// Second parse: body text goes to a new StringWriter, metadata to `metadata2`;
// `content` is set from the normalized writer contents.
// NOTE(review): all three parse calls read from the same `in`. As shown, the
// later calls would only see whatever the first call left unread; the excerpt
// has presumably dropped branching or stream re-opening — confirm against the
// full method.
final StringWriter writer2 = new StringWriter( initialBufferSize);
parser.parse(in, new BodyContentHandler(writer2), metadata2, parseContext);
content = normalizeContent(writer2);

// Third parse: same pattern with `writer3` and `metadata3`; overwrites `content`.
final StringWriter writer3 = new StringWriter( initialBufferSize);
parser.parse(in, new BodyContentHandler(writer3), metadata3, parseContext);
content = normalizeContent(writer3);

// Copy every value of every metadata name from the first parse into
// `extractData`, visiting names in sorted order.
final String[] names = metadata.names();
Arrays.sort(names);
for (final String name : names) { extractData.putValues(name, metadata.getValues(name));
xhtml.startDocument(); ContentHandler childHandler = new EmbeddedContentHandler( new BodyContentHandler(xhtml)); type = type.trim(); metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals("metadata.xml")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".opf")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") || entry.getName().endsWith(".xhtml")) { content.parse(zip, childHandler, metadata, context);