Refine search
@Override public void process(final InputStream stream) throws IOException { try (final InputStream in = new BufferedInputStream(stream)) { TikaInputStream tikaStream = TikaInputStream.get(in); Metadata metadata = new Metadata(); if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename); } // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); } } });
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
/**
 * Wraps the given byte array in a TikaInputStream, using a newly created
 * {@link Metadata} instance.
 * <p>
 * The returned stream must always be closed explicitly, because the data
 * may in some cases end up being written to a temporary file.
 *
 * @param data input data
 * @return a TikaInputStream instance
 */
public static TikaInputStream get(byte[] data) {
    final Metadata freshMetadata = new Metadata();
    return get(data, freshMetadata);
}
/**
 * Opens the file at the given path as a TikaInputStream, using a newly
 * created {@link Metadata} instance.
 * <p>
 * The returned stream must always be closed explicitly so that open file
 * handles are not leaked.
 *
 * @param path input file
 * @return a TikaInputStream instance
 * @throws IOException if an I/O error occurs
 */
public static TikaInputStream get(Path path) throws IOException {
    final Metadata freshMetadata = new Metadata();
    return get(path, freshMetadata);
}
/**
 * Opens the resource at the given URI as a TikaInputStream, using a newly
 * created {@link Metadata} instance.
 * <p>
 * The returned stream must always be closed explicitly, because the
 * resource may in some cases end up being written to a temporary file.
 *
 * @param uri resource URI
 * @return a TikaInputStream instance
 * @throws IOException if the resource can not be accessed
 */
public static TikaInputStream get(URI uri) throws IOException {
    final Metadata freshMetadata = new Metadata();
    return get(uri, freshMetadata);
}
/**
 * Opens the resource at the given URL as a TikaInputStream, using a newly
 * created {@link Metadata} instance.
 * <p>
 * The returned stream must always be closed explicitly, because the
 * resource may in some cases end up being written to a temporary file.
 *
 * @param url resource URL
 * @return a TikaInputStream instance
 * @throws IOException if the resource can not be accessed
 */
public static TikaInputStream get(URL url) throws IOException {
    final Metadata freshMetadata = new Metadata();
    return get(url, freshMetadata);
}
/**
 * Wraps the given database BLOB in a TikaInputStream, using a newly
 * created {@link Metadata} instance.
 * <p>
 * The result set that contains the BLOB may need to stay open until the
 * returned TikaInputStream has been processed and closed. The returned
 * stream must always be closed explicitly, because the blob data may in
 * some cases end up being written to a temporary file.
 *
 * @param blob database BLOB
 * @return a TikaInputStream instance
 * @throws SQLException if BLOB data can not be accessed
 */
public static TikaInputStream get(Blob blob) throws SQLException {
    final Metadata freshMetadata = new Metadata();
    return get(blob, freshMetadata);
}
/**
 * Opens the given file as a TikaInputStream, using a newly created
 * {@link Metadata} instance.
 * <p>
 * The returned stream must always be closed explicitly so that open file
 * handles are not leaked.
 *
 * @param file input file
 * @return a TikaInputStream instance
 * @throws FileNotFoundException if the file does not exist
 * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed
 *             or modified to throw an IOException.
 */
@Deprecated
public static TikaInputStream get(File file) throws FileNotFoundException {
    final Metadata freshMetadata = new Metadata();
    return get(file, freshMetadata);
}
/**
 * Parses the resource at the given URL and returns the extracted text
 * content as a reader.
 * <p>
 * The URL is opened through {@link TikaInputStream#get(URL, Metadata)} and
 * the resulting stream, together with the same metadata object, is passed
 * on to {@code parse(InputStream, Metadata)}.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
    final Metadata urlMetadata = new Metadata();
    // Declared as InputStream so the same parse overload is selected.
    final InputStream source = TikaInputStream.get(url, urlMetadata);
    final Reader extracted = parse(source, urlMetadata);
    return extracted;
}
/**
 * Detects the media type of the file at the given path, based on the
 * document content and a potential known file extension.
 * <p>
 * Use the {@link #detect(String)} method when you want to detect the
 * type of the document without actually accessing the file.
 *
 * @param path the path of the file
 * @return detected media type
 * @throws IOException if the file can not be read
 */
public String detect(Path path) throws IOException {
    final Metadata pathMetadata = new Metadata();
    final String mediaType;
    // The stream is closed as soon as detection has finished.
    try (InputStream source = TikaInputStream.get(path, pathMetadata)) {
        mediaType = detect(source, pathMetadata);
    }
    return mediaType;
}
/**
 * Detects the media type of the resource at the given URL, based on the
 * document content and a potential known file extension included in the
 * URL.
 * <p>
 * Use the {@link #detect(String)} method when you want to detect the
 * type of the document without actually accessing the URL.
 *
 * @param url the URL of the resource
 * @return detected media type
 * @throws IOException if the resource can not be read
 */
public String detect(URL url) throws IOException {
    final Metadata urlMetadata = new Metadata();
    final String mediaType;
    // The stream is closed as soon as detection has finished.
    try (InputStream source = TikaInputStream.get(url, urlMetadata)) {
        mediaType = detect(source, urlMetadata);
    }
    return mediaType;
}
/**
 * Detects the media type of the given file, based on the document content
 * and a potential known file extension.
 * <p>
 * Use the {@link #detect(String)} method when you want to detect the
 * type of the document without actually accessing the file.
 *
 * @param file the file
 * @return detected media type
 * @throws IOException if the file can not be read
 * @see #detect(Path)
 */
public String detect(File file) throws IOException {
    final Metadata fileMetadata = new Metadata();
    final String mediaType;
    // The File-based factory is deprecated, hence the local suppression.
    try (@SuppressWarnings("deprecation")
         InputStream source = TikaInputStream.get(file, fileMetadata)) {
        mediaType = detect(source, fileMetadata);
    }
    return mediaType;
}
/**
 * Parses the resource at the given URL and returns the extracted text
 * content as a string.
 * <p>
 * To avoid unpredictable excess memory use, the returned string contains
 * only up to {@link #getMaxStringLength()} first characters extracted
 * from the input document. Use the {@link #setMaxStringLength(int)}
 * method to adjust this limitation.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read
 * @throws TikaException if the resource can not be parsed
 */
public String parseToString(URL url) throws IOException, TikaException {
    final Metadata urlMetadata = new Metadata();
    // Declared as InputStream so the same parseToString overload is selected.
    final InputStream source = TikaInputStream.get(url, urlMetadata);
    final String text = parseToString(source, urlMetadata);
    return text;
}
/**
 * Parses the file at the given path and returns the extracted text
 * content as a string.
 * <p>
 * To avoid unpredictable excess memory use, the returned string contains
 * only up to {@link #getMaxStringLength()} first characters extracted
 * from the input document. Use the {@link #setMaxStringLength(int)}
 * method to adjust this limitation.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public String parseToString(Path path) throws IOException, TikaException {
    final Metadata pathMetadata = new Metadata();
    // Declared as InputStream so the same parseToString overload is selected.
    final InputStream source = TikaInputStream.get(path, pathMetadata);
    final String text = parseToString(source, pathMetadata);
    return text;
}
/**
 * Parses the given file and returns the extracted text content as a
 * string.
 * <p>
 * To avoid unpredictable excess memory use, the returned string contains
 * only up to {@link #getMaxStringLength()} first characters extracted
 * from the input document. Use the {@link #setMaxStringLength(int)}
 * method to adjust this limitation.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 * @see #parseToString(Path)
 */
public String parseToString(File file) throws IOException, TikaException {
    final Metadata fileMetadata = new Metadata();
    // The File-based factory is deprecated, hence the local suppression.
    @SuppressWarnings("deprecation")
    final InputStream source = TikaInputStream.get(file, fileMetadata);
    final String text = parseToString(source, fileMetadata);
    return text;
}
/**
 * Parses an in-memory document and returns the text collected by the
 * content handler.
 * <p>
 * The bytes are wrapped in a {@link TikaInputStream} that is closed once
 * parsing has finished, whether it succeeds or fails, so that any
 * temporary file backing the stream is released.
 *
 * @param file raw bytes of the document to parse
 * @return the text written to the handler's buffer during parsing
 * @throws Exception if parsing fails or the stream can not be closed
 */
public static String handleStreamContent(byte[] file) throws Exception {
    Metadata md = new Metadata();
    StringWriter textBuffer = new StringWriter();
    ContentHandler handler = new TeeContentHandler(
            getTextContentHandler(textBuffer)
    );
    // try-with-resources guarantees the TikaInputStream is always closed.
    try (TikaInputStream input = TikaInputStream.get(file, md)) {
        parser.parse(input, handler, md, context);
    }
    return textBuffer.toString();
}
/**
 * Parses the named file with an {@link AutoDetectParser}, feeding it
 * through a {@link TikaInputStream}.
 * <p>
 * The stream is opened with the {@code Path}-based factory rather than the
 * deprecated {@code File}-based one, and is closed when parsing completes.
 * The extracted events go to a {@link DefaultHandler}, so nothing is
 * returned.
 *
 * @param filename name of the file to parse
 * @throws Exception if the file can not be opened or parsed
 */
public static void parseTikaInputStream(String filename) throws Exception {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // Fully qualified so that no additional import is required in this file.
    try (InputStream stream = TikaInputStream.get(java.nio.file.Paths.get(filename))) {
        parser.parse(stream, handler, metadata, context);
    }
}
/**
 * Passes the given bytes to the embedded document extractor, provided the
 * extractor agrees to parse a document described by empty metadata.
 *
 * @param data raw bytes of the embedded document
 * @param embeddedDocumentExtractor extractor that decides on and performs the parse
 * @param handler handler that is wrapped in an {@link EmbeddedContentHandler}
 * @throws TikaException if the embedded document can not be parsed
 * @throws SAXException if the handler fails while receiving content
 */
private static void handleEmbedded(byte[] data, EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                   ContentHandler handler) throws TikaException, SAXException {
    try (InputStream is = TikaInputStream.get(data)) {
        Metadata embeddedMetadata = new Metadata();
        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }
        ContentHandler embeddedHandler = new EmbeddedContentHandler(handler);
        embeddedDocumentExtractor.parseEmbedded(is, embeddedHandler, embeddedMetadata, false);
    } catch (IOException ignored) {
        // I/O failures are not propagated; the signature exposes only
        // TikaException and SAXException.
        // NOTE(review): swallowing looks intentional (best-effort) - confirm.
    }
}
}