Apache Tika usage examples (collected excerpts)
/**
 * Parses the given file with Tika and packages the result as a TrecDocument.
 *
 * @param file the file to summarize
 * @return a TrecDocument carrying the resource name, the extracted text
 *         content and the creation date taken from the parsed metadata
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public TrecDocument summarize(File file) throws FileNotFoundException, IOException, TikaException {
    Tika tika = new Tika();
    Metadata met = new Metadata();
    // BUG FIX: the original leaked the FileInputStream when parseToString
    // threw before consuming it; try-with-resources guarantees the close.
    try (FileInputStream stream = new FileInputStream(file)) {
        String contents = tika.parseToString(stream, met);
        return new TrecDocument(met.get(TikaCoreProperties.RESOURCE_NAME_KEY),
                contents,
                met.getDate(TikaCoreProperties.CREATED));
    }
}
/**
 * Detects the media type of the file at the given path. Detection uses the
 * document content plus any known file extension carried by the path.
 * <p>
 * Use the {@link #detect(String)} method when you want to detect the
 * type of the document without actually accessing the file.
 *
 * @param path the path of the file
 * @return detected media type
 * @throws IOException if the file can not be read
 */
public String detect(Path path) throws IOException {
    Metadata hints = new Metadata();
    // TikaInputStream.get pre-populates the metadata (e.g. file name) from the path.
    try (InputStream in = TikaInputStream.get(path, hints)) {
        return detect(in, hints);
    }
}
/**
 * Parses the file at the given path and returns a reader over the
 * extracted text content.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
    // Delegate to the metadata-aware overload with a throwaway container.
    return parse(path, new Metadata());
}
public static void main(String[] args) throws Exception { // Create a Tika instance with the default configuration Tika tika = new Tika(); // Parse all given files and print out the extracted // text content for (String file : args) { String text = tika.parseToString(new File(file)); System.out.print(text); } } }
/**
 * Parses the file at the given path and returns the extracted text content.
 * <p>
 * To avoid unpredictable excess memory use, the returned string contains
 * only up to {@link #getMaxStringLength()} first characters extracted
 * from the input document. Use the {@link #setMaxStringLength(int)}
 * method to adjust this limitation.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public String parseToString(Path path) throws IOException, TikaException {
    Metadata hints = new Metadata();
    // The stream is handed off to the overload; per the documented contract of
    // parseToString(InputStream) that family of methods closes the stream itself.
    return parseToString(TikaInputStream.get(path, hints), hints);
}
/**
 * Detects the media type of the given document. The type detection is
 * based on the content of the given document stream and the name of the
 * document.
 * <p>
 * If the document stream supports the
 * {@link InputStream#markSupported() mark feature}, then the stream is
 * marked and reset to the original position before this method returns.
 * Only a limited number of bytes are read from the stream.
 * <p>
 * The given document stream is <em>not</em> closed by this method.
 *
 * @since Apache Tika 0.9
 * @param stream the document stream
 * @param name document name
 * @return detected media type
 * @throws IOException if the stream can not be read
 */
public String detect(InputStream stream, String name) throws IOException {
    // Carry the document name as a detection hint alongside the content.
    Metadata hints = new Metadata();
    hints.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
    return detect(stream, hints);
}
/**
 * Parses the given file with Tika, seeds the metadata with Dublin Core
 * attributes, and indexes every metadata key/value pair as a stored
 * Lucene text field in a single document.
 *
 * @param file the file to parse and index
 * @throws Exception if the file cannot be read, parsed or indexed
 */
public void indexWithDublinCore(File file) throws Exception {
    Metadata met = new Metadata();
    met.add(TikaCoreProperties.CREATOR, "Manning");
    met.add(TikaCoreProperties.CREATOR, "Tika in Action");
    met.set(TikaCoreProperties.CREATED, new Date());
    met.set(TikaCoreProperties.FORMAT, tika.detect(file));
    met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
    met.add(TikaCoreProperties.SUBJECT, "File");
    met.add(TikaCoreProperties.SUBJECT, "Indexing");
    met.add(TikaCoreProperties.SUBJECT, "Metadata");
    // NOTE: "externalClosedChoise" is the (misspelled) factory name this Tika
    // API version ships with; kept as-is to match the project dependency.
    met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(),
            "public", "private"), "public");
    try (InputStream is = new FileInputStream(file)) {
        tika.parse(is, met);
        Document document = new Document();
        for (String key : met.names()) {
            for (String val : met.getValues(key)) {
                document.add(new TextField(key, val, Store.YES));
            }
        }
        // BUG FIX: addDocument was inside the key loop, so the (growing)
        // document was written to the index once per metadata key. Add the
        // fully built document exactly once.
        writer.addDocument(document);
    }
}
}
Tika tika = new Tika();
Metadata metadata = new Metadata();
// Use the non-deprecated property key, consistent with the other examples.
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "myfile.name");
// BUG FIX: the metadata carrying the resource-name hint was never handed to
// Tika — parseToString(File) builds its own Metadata internally, silently
// discarding the hint. Pass the populated metadata explicitly instead
// (this overload consumes and closes the stream).
String text = tika.parseToString(new FileInputStream(new File("myfile.name")), metadata);
@Test
public void testInitializableParser() throws Exception {
    URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
    // BUG FIX: a bare `assert` is silently skipped unless the JVM runs with
    // -ea; assertNotNull always fails loudly when the config file is missing
    // (and matches the style used by the other tests in this suite).
    assertNotNull(configFileUrl);
    TikaConfig config = new TikaConfig(configFileUrl);
    Tika tika = new Tika(config);
    Metadata md = new Metadata();
    // The dummy parser is expected to sum byte values into SUM_FIELD.
    tika.parse(TikaInputStream.get("someString".getBytes(StandardCharsets.ISO_8859_1)), md);
    assertEquals("5", md.get(DummyInitializableParser.SUM_FIELD));
}
}
/**
 * Parses the resource at the given URL and returns a reader over the
 * extracted text content.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
    Metadata hints = new Metadata();
    // TikaInputStream.get pre-populates the metadata from the URL.
    return parse(TikaInputStream.get(url, hints), hints);
}
/**
 * Detects the media type of the given document. The type detection is
 * based on the content of the given document stream.
 * <p>
 * If the document stream supports the
 * {@link InputStream#markSupported() mark feature}, then the stream is
 * marked and reset to the original position before this method returns.
 * Only a limited number of bytes are read from the stream.
 * <p>
 * The given document stream is <em>not</em> closed by this method.
 *
 * @param stream the document stream
 * @return detected media type
 * @throws IOException if the stream can not be read
 */
public String detect(InputStream stream) throws IOException {
    // No name hint available: detect from content alone.
    return detect(stream, new Metadata());
}
/**
 * Parses the given file with Tika and indexes every extracted metadata
 * key/value pair as a stored Lucene text field in a single document.
 *
 * @param file the file whose metadata should be indexed
 * @throws Exception if the file cannot be read, parsed or indexed
 */
public void indexContentSpecificMet(File file) throws Exception {
    Metadata met = new Metadata();
    try (InputStream is = new FileInputStream(file)) {
        tika.parse(is, met);
        Document document = new Document();
        for (String key : met.names()) {
            for (String val : met.getValues(key)) {
                document.add(new TextField(key, val, Store.YES));
            }
        }
        // BUG FIX: addDocument was inside the key loop, indexing the partially
        // built document once per metadata key; add it exactly once instead.
        writer.addDocument(document);
    }
}
/**
 * Parses an attachment stream with Tika and adds its text body plus Dublin
 * Core attributes to the given Lucene document. Parse failures are logged
 * and swallowed (best effort): the document is then left without body text
 * or Dublin Core fields.
 *
 * @param in the attachment content stream
 * @param contentType MIME type hint passed to Tika via the Content-Type header
 * @param fieldName name of the document field receiving the body text
 * @param doc the document being built
 * @throws IOException declared for callers, but parse-time IOExceptions are
 *         caught and logged below rather than propagated
 */
public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc) throws IOException {
    final Metadata md = new Metadata();
    // Hand the declared content type to Tika as a parsing hint.
    md.set(HttpHeaders.CONTENT_TYPE, contentType);
    try {
        // Add body text.
        // NOTE(review): doc.add/text(...) sit inside the try on purpose — an
        // IOException from either path is treated the same as a read failure.
        doc.add(text(fieldName, tika.parseToString(in, md), false));
    } catch (final IOException e) {
        // Best effort: skip this attachment entirely on read failure.
        log.warn("Failed to index an attachment.", e);
        return;
    } catch (final TikaException e) {
        // Best effort: skip this attachment entirely on parse failure.
        log.warn("Failed to parse an attachment.", e);
        return;
    }
    // Add DC attributes.
    addDublinCoreAttributes(md, doc);
}
/**
 * Parses the given document and returns the extracted text content.
 * The given input stream is closed by this method.
 * <p>
 * To avoid unpredictable excess memory use, the returned string contains
 * only up to {@link #getMaxStringLength()} first characters extracted
 * from the input document. Use the {@link #setMaxStringLength(int)}
 * method to adjust this limitation.
 * <p>
 * <strong>NOTE:</strong> Unlike most other Tika methods that take an
 * {@link InputStream}, this method will close the given stream for
 * you as a convenience. With other methods you are still responsible
 * for closing the stream or a wrapper instance returned by Tika.
 *
 * @param stream the document to be parsed
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
 */
public String parseToString(InputStream stream) throws IOException, TikaException {
    // Delegate with an empty metadata container; no hints are available here.
    return parseToString(stream, new Metadata());
}
// NOTE(review): this excerpt is garbled by extraction — the fragment
// `+ tikaMet.names().length + "]");` below is detached from whatever log
// statement it belonged to, and the final LOG.fine call is truncated
// mid-expression. Reconstruct against the original source before reuse.
Tika tika = new Tika();
tika.parse(is, tikaMet); // extract metadata
tikaMet.add("content", tika.parseToString(file)); // extract content
+ tikaMet.names().length + "]");
for (String key : tikaMet.names()) {
    met.addMetadata(key, StringEscapeUtils.escapeXml(tikaMet.get(key)));
    LOG.fine("Added tika met key [" + key + "] with value ["
/**
 * Loads the named Tika config from test resources, parses the config file's
 * own stream with a Tika built from it, and returns the resulting metadata.
 *
 * @param name file name under /org/apache/tika/config/
 * @return metadata populated by parsing the config resource itself
 */
private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
    URL url = this.getClass().getResource("/org/apache/tika/config/" + name);
    assertNotNull("couldn't find: " + name, url);
    Tika tika = new Tika(new TikaConfig(url));
    Metadata metadata = new Metadata();
    // NOTE(review): the reader returned by parse() is discarded, so parsing
    // may still be in flight when metadata is returned — confirm upstream
    // that this test only relies on eagerly populated fields.
    tika.parse(url.openStream(), metadata);
    return metadata;
}
}
// NOTE(review): garbled/incomplete excerpt — the braces do not balance and
// the enclosing method is not visible here; restore from the original source
// before relying on it.
if (MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE))) {
    // Plain text: decode the raw stream directly, assuming UTF-8.
    reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
} else {
    // Non-text: let the secondary parser extract a text reader first.
    reader = secondaryParser.parse(inputStream);
    double predictAuthorAge = getAgePredictorClient().predictAge(IOUtils.toString(reader));
    metadata.add(MD_KEY_ESTIMATED_AGE, Double.toString(predictAuthorAge) );
/**
 * Parses the file at the given path and returns a reader over the
 * extracted text content.
 * <p>
 * Metadata information extracted from the document is returned in
 * the supplied metadata instance.
 *
 * @param path the path of the file to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path, Metadata metadata) throws IOException {
    // TikaInputStream.get seeds the caller's metadata from the path before parsing.
    return parse(TikaInputStream.get(path, metadata), metadata);
}