// Minimal usage sketch: create a Tika facade and ask it for the media type of a file.
// The "..." stands in for however the File is obtained; the line is illustrative and
// is not compilable as written.
Tika tika = new Tika(); File file = ... String mimeType = tika.detect(file);
protected void getMimeType(InputStream inputStream, String fileName, StaticAsset newAsset) { Tika tika = new Tika(); String tikaMimeType = tika.detect(fileName); if (tikaMimeType == null) { try { tikaMimeType = tika.detect(inputStream); } catch (IOException e) { //if tika can't resolve, don't throw exception } } if (tikaMimeType != null) { newAsset.setMimeType(tikaMimeType); } }
/**
 * Detects a document's media type from its file name alone, relying on
 * known file name extensions.
 * <p>
 * A URL or a full file path may be passed as well; only the file name
 * portion of such a string takes part in the detection.
 *
 * @param name the file name of the document
 * @return detected media type
 */
public String detect(String name) {
    // No content is available here, so the stream argument is deliberately null.
    final InputStream noContent = null;
    try {
        return detect(noContent, name);
    } catch (IOException e) {
        throw new IllegalStateException("Unexpected IOException", e);
    }
}
@Override public String probeContentType(Path path) throws IOException { // Try to detect based on the file name only for efficiency String fileNameDetect = tika.detect(path.toString()); if(!fileNameDetect.equals(MimeTypes.OCTET_STREAM)) { return fileNameDetect; } // Then check the file content if necessary String fileContentDetect = tika.detect(path); if(!fileContentDetect.equals(MimeTypes.OCTET_STREAM)) { return fileContentDetect; } // Specification says to return null if we could not // conclusively determine the file type return null; }
/**
 * Prints the detected media type of every file named on the command line,
 * one {@code name: type} line per argument.
 *
 * @param args paths of the files to inspect
 * @throws Exception if detection fails for any of the files
 */
public static void main(String[] args) throws Exception {
    Tika tika = new Tika();
    for (int i = 0; i < args.length; i++) {
        String name = args[i];
        String detected = tika.detect(new File(name));
        System.out.println(name + ": " + detected);
    }
}
} // closes the enclosing class, whose declaration is not visible here
/**
 * Detects the media type of a document from the content of its stream.
 * <p>
 * When the stream supports the
 * {@link InputStream#markSupported() mark feature}, it is marked and then
 * reset to its original position before this method returns. Only a
 * limited number of bytes are read from the stream.
 * <p>
 * This method does <em>not</em> close the given stream.
 *
 * @param stream the document stream
 * @return detected media type
 * @throws IOException if the stream can not be read
 */
public String detect(InputStream stream) throws IOException {
    // No name or type hints are available, so start from empty metadata.
    Metadata noHints = new Metadata();
    return detect(stream, noHints);
}
/**
 * Detects the media type of a document from its first few bytes.
 * <p>
 * At least a few kilobytes of document data are needed for best results.
 * When more than just the document prefix is available, the other
 * detect() methods are better alternatives.
 *
 * @since Apache Tika 0.9
 * @param prefix first few bytes of the document
 * @return detected media type
 */
public String detect(byte[] prefix) {
    try (InputStream prefixStream = TikaInputStream.get(prefix)) {
        return detect(prefixStream);
    } catch (IOException e) {
        throw new IllegalStateException("Unexpected IOException", e);
    }
}
/**
 * Example: builds a Tika facade from a custom MIME type definition file and
 * uses it to detect the type of a sample file name.
 *
 * @return the media type detected for the sample file name
 * @throws Exception if the type definitions cannot be loaded
 */
public static String customMimeInfo() throws Exception {
    URL definitions = new URL("file:///path/to/prescription-type.xml");
    Tika tika = new Tika(MimeTypesFactory.create(definitions));
    return tika.detect("/path/to/prescription.xpd");
}
/**
 * Example: detects the media type of {@code name} using a Tika facade backed
 * by the MIME type definitions loaded from a fixed configuration path.
 *
 * @param name the file name to run detection on
 * @return the detected media type
 * @throws Exception if the type definitions cannot be loaded
 */
public static String detectWithCustomConfig(String name) throws Exception {
    final String configPath = "/org/apache/tika/mime/tika-mimetypes.xml";
    return new Tika(MimeTypesFactory.create(configPath)).detect(name);
}
/**
 * Find the Mime Content Type of a document stored in the given file.
 * Returns application/octet-stream if no better match is found.
 * <p>
 * Detection is delegated to a {@link Tika} facade built around this
 * instance; the detected type name is then resolved with {@code forName}.
 *
 * @deprecated Use {@link Tika#detect(File)} instead
 * @param file file to analyze
 * @return the Mime Content Type of the specified document
 * @throws MimeTypeException if the type can't be detected
 * @throws IOException if the file can't be read
 */
@Deprecated
public MimeType getMimeType(File file) throws MimeTypeException, IOException {
    return forName(new Tika(this).detect(file));
}
/**
 * Detects the media type of the file at the given path, based on the
 * document content and a potential known file extension.
 * <p>
 * To detect the type without actually accessing the file, use the
 * {@link #detect(String)} method instead.
 *
 * @param path the path of the file
 * @return detected media type
 * @throws IOException if the file can not be read
 */
public String detect(Path path) throws IOException {
    final Metadata hints = new Metadata();
    // The stream is opened only for detection and is closed before returning.
    try (InputStream content = TikaInputStream.get(path, hints)) {
        return detect(content, hints);
    }
}
/**
 * Detects the media type of the resource at the given URL, based on the
 * document content and a potential known file extension included in the URL.
 * <p>
 * To detect the type without actually accessing the URL, use the
 * {@link #detect(String)} method instead.
 *
 * @param url the URL of the resource
 * @return detected media type
 * @throws IOException if the resource can not be read
 */
public String detect(URL url) throws IOException {
    final Metadata hints = new Metadata();
    // The stream is opened only for detection and is closed before returning.
    try (InputStream content = TikaInputStream.get(url, hints)) {
        return detect(content, hints);
    }
}
/**
 * Detects the media type of the given file, based on the document content
 * and a potential known file extension.
 * <p>
 * To detect the type without actually accessing the file, use the
 * {@link #detect(String)} method instead.
 *
 * @param file the file
 * @return detected media type
 * @throws IOException if the file can not be read
 * @see #detect(Path)
 */
public String detect(File file) throws IOException {
    final Metadata hints = new Metadata();
    // The stream is opened only for detection and is closed before returning.
    try (@SuppressWarnings("deprecation")
         InputStream content = TikaInputStream.get(file, hints)) {
        return detect(content, hints);
    }
}
private static void benchmark(File file) throws Exception { if (file.isHidden()) { // ignore } else if (file.isFile()) { try (InputStream input = new FileInputStream(file)) { byte[] content = IOUtils.toByteArray(input); String type = tika.detect(new ByteArrayInputStream(content)); long start = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { tika.detect(new ByteArrayInputStream(content)); } System.out.printf( Locale.ROOT, "%6dns per Tika.detect(%s) = %s%n", System.currentTimeMillis() - start, file, type); } } else if (file.isDirectory()) { for (File child : file.listFiles()) { benchmark(child); } } }
/**
 * Example: combines a custom MIME type database with an
 * {@code EncryptedPrescriptionDetector} in a composite detector and uses it
 * to detect the type of a sample file name.
 *
 * @return the media type detected for the sample file name
 * @throws Exception if the type definitions cannot be loaded
 */
public static String customCompositeDetector() throws Exception {
    URL definitions = new URL("file:///path/to/prescription-type.xml");
    MimeTypes typeDatabase = MimeTypesFactory.create(definitions);
    CompositeDetector detectors =
            new CompositeDetector(typeDatabase, new EncryptedPrescriptionDetector());
    return new Tika(detectors).detect("/path/to/tmp/prescription.xpd");
}
/**
 * Example: detects the media type of {@code name} with a composite detector
 * that consults a custom metadata-driven detector ahead of the detector
 * loaded from the MIME type definitions.
 *
 * @param name the file name to run detection on
 * @return the detected media type
 * @throws Exception if the type definitions cannot be loaded
 */
public static String detectWithCustomDetector(String name) throws Exception {
    Detector defaults = MimeTypesFactory.create("/org/apache/tika/mime/tika-mimetypes.xml");

    // Reports the type named in the "my-custom-type-override" metadata entry,
    // or the generic binary type when that entry is absent.
    Detector override = new Detector() {
        private static final long serialVersionUID = -5420638839201540749L;

        public MediaType detect(InputStream input, Metadata metadata) {
            String forced = metadata.get("my-custom-type-override");
            return forced == null ? MediaType.OCTET_STREAM : MediaType.parse(forced);
        }
    };

    return new Tika(new CompositeDetector(override, defaults)).detect(name);
}
} // closes the enclosing class, whose declaration is not visible here
/**
 * Checks that text starting with a byte order mark is detected as plain
 * text when encoded as UTF-16LE, UTF-16BE and UTF-8.
 */
@Test
public void testByteOrderMark() throws Exception {
    final String plainText = MediaType.TEXT_PLAIN.toString();
    final String sample = "\ufefftest";

    byte[] utf16le = sample.getBytes(UTF_16LE);
    assertEquals(plainText, tika.detect(new ByteArrayInputStream(utf16le), new Metadata()));

    byte[] utf16be = sample.getBytes(UTF_16BE);
    assertEquals(plainText, tika.detect(new ByteArrayInputStream(utf16be), new Metadata()));

    byte[] utf8 = sample.getBytes(UTF_8);
    assertEquals(plainText, tika.detect(new ByteArrayInputStream(utf8), new Metadata()));
}
private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException { assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); } try { Metadata metadata = new Metadata(); // String mime = this.proDetector.detect(in, metadata).toString(); String mime = tika.detect(in, metadata).toString(); assertEquals( urlOrFileName + " is not properly detected: detected.", expected, mime); // Add resource name and test again metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); // mime = this.proDetector.detect(in, metadata).toString(); mime = tika.detect(in, metadata).toString(); assertEquals(urlOrFileName + " is not properly detected after adding resource name.", expected, mime); } finally { in.close(); } }
/**
 * Verifies that content wrapped in XML comment delimiters, but which is
 * not actually XML (javascript files, for example), is detected as plain text.
 *
 * @see <a
 * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
 */
@Test
public void testNotXML() throws IOException {
    byte[] commentOnly = "<!-- test -->".getBytes(UTF_8);
    String detected = tika.detect(new ByteArrayInputStream(commentOnly), new Metadata());
    assertEquals(MediaType.TEXT_PLAIN.toString(), detected);
}
/**
 * Verifies type detection of empty documents: with no hints the result is
 * the generic binary type, while a resource name hint or a content type
 * hint each yield plain text.
 *
 * @see <a
 * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
 */
@Test
public void testEmptyDocument() throws IOException {
    final byte[] nothing = new byte[0];
    final String plainText = MediaType.TEXT_PLAIN.toString();

    // No hints at all
    assertEquals(MediaType.OCTET_STREAM.toString(),
            tika.detect(new ByteArrayInputStream(nothing), new Metadata()));

    // Hint by resource name
    Metadata namehint = new Metadata();
    namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
    assertEquals(plainText, tika.detect(new ByteArrayInputStream(nothing), namehint));

    // Hint by declared content type
    Metadata typehint = new Metadata();
    typehint.set(Metadata.CONTENT_TYPE, "text/plain");
    assertEquals(plainText, tika.detect(new ByteArrayInputStream(nothing), typehint));
}