Refine search
/**
 * Detects the media type of an attachment from its content and its name.
 * <p>
 * The given stream is wrapped in a {@link BufferedInputStream} that is closed
 * before this method returns, which also closes the caller's stream.
 *
 * @param is   the attachment content
 * @param name the attachment name, passed to the detector as the resource name
 * @return the detected media type as a string, or {@code WILDCARD} if the
 *         stream could not be read
 */
public static String probeContentType(final InputStream is, final String name) {
    try (InputStream stream = new BufferedInputStream(is)) {
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        return getDefaultMimeTypes().detect(stream, metadata).toString();
    } catch (IOException e) {
        // Exactly one placeholder for the name: the exception is passed as the
        // trailing argument so the logger treats it as the throwable to report
        // instead of trying to fit it into a second placeholder.
        LOGGER.warn("Couldn't detect the media type of attachment {}", name, e);
        return WILDCARD;
    }
}
/**
 * Tracks whether the parser is currently inside a binary element.
 * <p>
 * When the element's local name equals {@code ELEMENT_BINARY}, the buffered
 * binary data is cleared and a fresh {@link Metadata} is prepared from the
 * element's id and content-type attributes. For any other element only the
 * binary-mode flag is updated.
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
        throws SAXException {
    binaryMode = ELEMENT_BINARY.equals(localName);
    if (!binaryMode) {
        return;
    }

    // A new binary element starts: discard whatever was buffered before.
    binaryData.setLength(0);
    metadata = new Metadata();

    final String resourceName = attributes.getValue(ATTRIBUTE_ID);
    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, resourceName);

    final String contentType = attributes.getValue(ATTRIBUTE_CONTENT_TYPE);
    metadata.set(Metadata.CONTENT_TYPE, contentType);
}
/**
 * Builds a {@link Metadata} instance for a document with the given name.
 * The resource name is only recorded when the name is neither
 * <code>null</code> nor empty.
 *
 * @param name resource name (or <code>null</code>)
 * @return metadata instance
 */
private static Metadata getMetadata(String name) {
    final Metadata result = new Metadata();
    final boolean hasName = name != null && !name.isEmpty();
    if (hasName) {
        result.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
    }
    return result;
}
/**
 * Builds the metadata for a single entry and, when the entry has a name,
 * emits an empty {@code <div class="embedded" id="...">} placeholder for it.
 *
 * @param name       entry name; backslashes are replaced with forward slashes
 *                   before use (may be {@code null} or empty)
 * @param createAt   creation date, or {@code null} if unknown
 * @param modifiedAt modification date, or {@code null} if unknown
 * @param size       entry size, or {@code null} if unknown
 * @param xhtml      handler that receives the placeholder element
 * @return the metadata collected for the entry
 */
protected static Metadata handleEntryMetadata(
        String name, Date createAt, Date modifiedAt, Long size, XHTMLContentHandler xhtml)
        throws SAXException, IOException, TikaException {
    final Metadata entryMetadata = new Metadata();

    if (createAt != null) {
        entryMetadata.set(TikaCoreProperties.CREATED, createAt);
    }
    if (modifiedAt != null) {
        entryMetadata.set(TikaCoreProperties.MODIFIED, modifiedAt);
    }
    if (size != null) {
        entryMetadata.set(Metadata.CONTENT_LENGTH, size.toString());
    }

    // Unnamed entries get no resource name, no placeholder and no relationship id.
    if (name == null || name.isEmpty()) {
        return entryMetadata;
    }

    final String normalizedName = name.replace('\\', '/');
    entryMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, normalizedName);

    final AttributesImpl divAttributes = new AttributesImpl();
    divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
    divAttributes.addAttribute("", "id", "id", "CDATA", normalizedName);
    xhtml.startElement("div", divAttributes);
    xhtml.endElement("div");

    entryMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, normalizedName);
    return entryMetadata;
}
protected void handleClob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex, ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException { Clob clob = resultSet.getClob(columnIndex); if (resultSet.wasNull()) { return; } boolean truncated = clob.length() > Integer.MAX_VALUE || clob.length() > maxClobLength; int readSize = (clob.length() < maxClobLength ? (int) clob.length() : maxClobLength); Metadata m = new Metadata(); m.set(Database.TABLE_NAME, tableName); m.set(Database.COLUMN_NAME, columnName); m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum)); m.set(Database.PREFIX + "IS_CLOB", "true"); m.set(Database.PREFIX + "CLOB_LENGTH", Long.toString(clob.length())); m.set(Database.PREFIX + "IS_CLOB_TRUNCATED", Boolean.toString(truncated)); m.set(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-8"); m.set(Metadata.CONTENT_LENGTH, Integer.toString(readSize)); m.set(TikaCoreProperties.RESOURCE_NAME_KEY, //just in case something screwy is going on with the column name FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + ".txt"))); //is there a more efficient way to go from a Reader to an InputStream? String s = clob.getSubString(0, readSize); if (embeddedDocumentUtil.shouldParseEmbedded(m)) { embeddedDocumentUtil.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true); } }
/** * Constructor * * @param inputRoot the input root for the file * @param fullPath the full path to the file * @throws IllegalArgumentException if the fullPath is not * a child of inputRoot */ public FSFileResource(Path inputRoot, Path fullPath) { this.fullPath = fullPath; this.metadata = new Metadata(); //child path must actually be a child assert(fullPath.toAbsolutePath().startsWith(inputRoot.toAbsolutePath())); this.relativePath = inputRoot.relativize(fullPath).toString(); //need to set these now so that the filter can determine //whether or not to crawl this file metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullPath.getFileName().toString()); long sz = -1; try { sz = Files.size(fullPath); } catch (IOException e) { //swallow //not existent file will be handled downstream } metadata.set(Metadata.CONTENT_LENGTH, Long.toString(sz)); metadata.set(FSProperties.FS_REL_PATH, relativePath); metadata.set(FileResource.FILE_EXTENSION, getExtension(fullPath)); }
private List<Metadata> generateListFromTextFile(Reader reader, FileSuffixes fileSuffixes) throws IOException { List<Metadata> metadataList = new ArrayList<>(); String content = IOUtils.toString(reader); Metadata m = new Metadata(); m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content); if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) { m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName()); } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) { m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName()); } //Let's hope the file name has a suffix that can //be used to determine the mime. Could be wrong or missing, //but better than nothing. m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileSuffixes.originalFileName); MediaType mimeType = tikaConfig.getMimeRepository().detect(null, m); if (mimeType != null) { m.set(Metadata.CONTENT_TYPE, mimeType.toString()); } metadataList.add(m); return metadataList; }
/**
 * Detects the media type of a document from its content stream and its name.
 * <p>
 * The name is recorded as the resource name in a fresh {@link Metadata}
 * object and detection is delegated to the stream-and-metadata overload
 * of this method.
 * <p>
 * If the document stream supports the
 * {@link InputStream#markSupported() mark feature}, then the stream is
 * marked and reset to the original position before this method returns.
 * Only a limited number of bytes are read from the stream.
 * <p>
 * The given document stream is <em>not</em> closed by this method.
 *
 * @since Apache Tika 0.9
 * @param stream the document stream
 * @param name document name
 * @return detected media type
 * @throws IOException if the stream can not be read
 */
public String detect(InputStream stream, String name) throws IOException {
    final Metadata nameHint = new Metadata();
    nameHint.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
    return detect(stream, nameHint);
}
/**
 * Describes an embedded resource with whatever details are available and
 * passes it to the embedded document handler. The resource stream is always
 * closed before this method returns, whether or not parsing succeeded.
 *
 * @param resource       stream of the embedded resource; closed by this method
 * @param filename       file name of the resource, or {@code null}
 * @param relationshipID relationship id of the resource, or {@code null}
 * @param storageClassID storage class id of the resource, or {@code null}
 * @param mediaType      media type of the resource, or {@code null}
 * @param xhtml          handler that receives the embedded content
 * @param outputHtml     passed through to the embedded parser
 */
protected void handleEmbeddedResource(TikaInputStream resource, String filename,
                                      String relationshipID, ClassID storageClassID,
                                      String mediaType, XHTMLContentHandler xhtml,
                                      boolean outputHtml)
        throws IOException, SAXException, TikaException {
    try {
        final Metadata embeddedMetadata = new Metadata();

        // Only record the properties that were actually supplied.
        if (filename != null) {
            embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
            embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
        }
        if (relationshipID != null) {
            embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, relationshipID);
        }
        if (storageClassID != null) {
            embeddedMetadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
                    storageClassID.toString());
        }
        if (mediaType != null) {
            embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
        }

        final boolean wanted = embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata);
        if (wanted) {
            embeddedDocumentUtil.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
        }
    } finally {
        resource.close();
    }
}
/**
 * Handles the end of a canvas or path element; all other elements are ignored.
 * <p>
 * Closing a canvas pops the canvas stack when it is not empty. Closing a
 * path that referenced an image marks that image's metadata as an inline
 * embedded resource (creating the metadata if none is registered yet) and
 * records the original location on drive unless one is already present.
 * The per-path state is cleared whenever a path element ends.
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (CANVAS.equals(localName)) {
        if (!canvasStack.isEmpty()) {
            canvasStack.pop();
        }
        return;
    }
    if (!PATH.equals(localName)) {
        return;
    }

    //this assumes that there cannot be a path within a path
    //not sure if this is true or if we need to track path depth
    if (imageSourcePathInZip != null) {
        Metadata imageInfo = embeddedInfos.get(imageSourcePathInZip);
        if (imageInfo == null) {
            imageInfo = new Metadata();
        }
        // Never overwrite an original resource name that is already recorded.
        if (originalLocationOnDrive != null
                && imageInfo.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME) == null) {
            imageInfo.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalLocationOnDrive);
        }
        imageInfo.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
        embeddedInfos.put(imageSourcePathInZip, imageInfo);
    }

    //reset
    imageSourcePathInZip = null;
    originalLocationOnDrive = null;
}
@Override
public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc) throws IOException { final Metadata md = new Metadata(); md.set(HttpHeaders.CONTENT_TYPE, contentType); try { // Add body text. doc.add(text(fieldName, tika.parseToString(in, md), false)); } catch (final IOException e) { log.warn("Failed to index an attachment.", e); return; } catch (final TikaException e) { log.warn("Failed to parse an attachment.", e); return; } // Add DC attributes. addDublinCoreAttributes(md, doc); }
private void handleEmbedded() throws SAXException { if (rawBytes != null) { try (TikaInputStream is = TikaInputStream.get(rawBytes)) { Metadata metadata = new Metadata(); if (pictName != null) { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pictName); } if (pictSource != null) { metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource); } if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) { embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, false); } } catch (IOException e) { //log } } //reset pictName = null; pictSource = null; rawBytes = null; } }
/**
 * Passes an in-memory embedded resource to the embedded document extractor.
 *
 * @param name              resource name, or {@code null} if unknown
 * @param type              content type, or {@code null} if unknown
 * @param contents          raw bytes of the embedded resource
 * @param embeddedExtractor extractor that decides whether to parse the
 *                          resource and performs the parsing
 * @param handler           handler that receives the embedded content,
 *                          wrapped in an {@code EmbeddedContentHandler}
 * @throws IOException   if the resource cannot be read or its stream
 *                       cannot be closed
 * @throws SAXException  if the handler fails
 * @throws TikaException declared for callers; not thrown directly here
 */
private void handleEmbedded(String name, String type, byte[] contents,
                            EmbeddedDocumentExtractor embeddedExtractor,
                            ContentHandler handler)
        throws IOException, SAXException, TikaException {
    Metadata metadata = new Metadata();
    if (name != null) {
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
    }
    if (type != null) {
        metadata.set(Metadata.CONTENT_TYPE, type);
    }

    if (embeddedExtractor.shouldParseEmbedded(metadata)) {
        // Close the TikaInputStream when done so that anything it holds on to
        // while being parsed is released, even if parsing fails.
        try (TikaInputStream tis = TikaInputStream.get(contents)) {
            embeddedExtractor.parseEmbedded(
                    tis, new EmbeddedContentHandler(handler), metadata, false);
        }
    }
}
}
/**
 * Indexes a file as a single document whose fields are the file's metadata:
 * a fixed set of Dublin Core values set here plus whatever the parser adds.
 * Every metadata value becomes one stored text field named after its key.
 *
 * @param file the file to parse and index
 * @throws Exception if the file cannot be read, parsed or indexed
 */
public void indexWithDublinCore(File file) throws Exception {
    Metadata met = new Metadata();
    met.add(TikaCoreProperties.CREATOR, "Manning");
    met.add(TikaCoreProperties.CREATOR, "Tika in Action");
    met.set(TikaCoreProperties.CREATED, new Date());
    met.set(TikaCoreProperties.FORMAT, tika.detect(file));
    met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
    met.add(TikaCoreProperties.SUBJECT, "File");
    met.add(TikaCoreProperties.SUBJECT, "Indexing");
    met.add(TikaCoreProperties.SUBJECT, "Metadata");
    met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(),
            "public", "private"), "public");

    try (InputStream is = new FileInputStream(file)) {
        tika.parse(is, met);
        Document document = new Document();
        for (String key : met.names()) {
            String[] values = met.getValues(key);
            for (String val : values) {
                document.add(new TextField(key, val, Store.YES));
            }
        }
        // Add the document exactly once, after all fields are collected.
        // Adding it inside the key loop indexed one partially filled copy
        // of the same document per metadata key.
        writer.addDocument(document);
    }
}
}
/** * Handles an embedded file in the document */ protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) throws SAXException, IOException { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); // Get the name String name = part.getPartName().getName(); metadata.set( TikaCoreProperties.RESOURCE_NAME_KEY, name.substring(name.lastIndexOf('/') + 1)); // Get the content type metadata.set( Metadata.CONTENT_TYPE, part.getContentType()); // Call the recursing handler if (embeddedExtractor.shouldParseEmbedded(metadata)) { try(TikaInputStream tis = TikaInputStream.get(part.getInputStream())) { embeddedExtractor.parseEmbedded( tis, new EmbeddedContentHandler(handler), metadata, false); } } }
/**
 * Emits a placeholder {@code div} for every thumbnail part of the package
 * and offers each thumbnail to the embedded extractor.
 * <p>
 * Thumbnail handling is best effort: any exception stops the processing of
 * the remaining thumbnails and is otherwise ignored. Each thumbnail stream
 * is closed even when processing it fails.
 *
 * @param handler handler that receives the placeholder elements and the
 *                embedded thumbnail content
 */
private void handleThumbnail(ContentHandler handler) {
    try {
        OPCPackage opcPackage = extractor.getPackage();
        for (PackageRelationship rel :
                opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
            PackagePart tPart = opcPackage.getPart(rel);
            // try-with-resources: the part stream must be released even if
            // the handler or the embedded parser throws.
            try (InputStream tStream = tPart.getInputStream()) {
                Metadata thumbnailMetadata = new Metadata();
                String thumbName = tPart.getPartName().getName();
                thumbnailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, thumbName);

                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded");
                attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName);
                handler.startElement(XHTML, "div", "div", attributes);
                handler.endElement(XHTML, "div", "div");

                thumbnailMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, thumbName);
                thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType());
                thumbnailMetadata.set(TikaCoreProperties.TITLE, tPart.getPartName().getName());

                if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                    // Close the wrapper too, so whatever it holds on to while
                    // being parsed is released.
                    try (TikaInputStream tis = TikaInputStream.get(tStream)) {
                        embeddedExtractor.parseEmbedded(tis,
                                new EmbeddedContentHandler(handler), thumbnailMetadata, false);
                    }
                }
            }
        }
    } catch (Exception ignored) {
        // Deliberately ignored: a broken thumbnail must not fail the parse
        // of the containing document.
    }
}
/**
 * Example: builds a {@link CompositeParser} that routes {@code text/html}
 * to an HTML parser and {@code application/xml} to an XML parser, with a
 * plain-text parser as fallback, then parses an empty stream declared
 * as {@code text/html}.
 *
 * @throws Exception if parsing fails
 */
public static void useCompositeParser() throws Exception {
    final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    final ContentHandler sink = new DefaultHandler();
    final ParseContext parseContext = new ParseContext();

    // Media type -> parser routing table.
    final Map<MediaType, Parser> routing = new HashMap<>();
    routing.put(MediaType.parse("text/html"), new HtmlParser());
    routing.put(MediaType.parse("application/xml"), new XMLParser());

    final CompositeParser composite = new CompositeParser();
    composite.setParsers(routing);
    composite.setFallback(new TXTParser());

    final Metadata declaredType = new Metadata();
    declaredType.set(Metadata.CONTENT_TYPE, "text/html");

    composite.parse(emptyInput, sink, declaredType, parseContext);
}
/**
 * Runs 1000 round trips of a date through {@link Metadata}: a time slightly
 * in the future is formatted as a string, stored as the creation date, read
 * back as a {@link Date} and compared with the original. The two must differ
 * by less than two seconds.
 *
 * @return always {@code 1}
 */
@Override
public Integer call() throws Exception {
    for (int i = 0; i < 1000; i++) {
        Metadata m = new Metadata();
        long start = System.currentTimeMillis();
        start += random.nextInt(1000000);
        Date now = new Date(start);
        DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
        // The time zone must be set BEFORE formatting. 'Z' is a quoted literal
        // in the pattern, so the formatter has to produce UTC fields itself;
        // otherwise the string carries local time labelled as "Z" and the
        // round trip is off by the JVM's zone offset.
        // NOTE(review): assumes Metadata.getDate reads a trailing "Z" as UTC - confirm.
        df.setTimeZone(TimeZone.getTimeZone("UTC"));
        m.set(TikaCoreProperties.CREATED, df.format(now));
        assertTrue(Math.abs(now.getTime()
                - m.getDate(TikaCoreProperties.CREATED).getTime()) < 2000);
    }
    return 1;
}
}