/**
 * Example: obtains a {@link Reader} over the text of {@code example.doc}
 * and copies it to standard output in chunks of at most 1000 characters.
 *
 * @throws Exception propagated from parsing or reading the document
 */
public static void parseToReaderExample() throws Exception {
    File source = new File("example.doc");
    try (Reader text = new Tika().parse(source)) {
        char[] chunk = new char[1000];
        // read() reports -1 once the end of the extracted text is reached.
        for (int count = text.read(chunk); count != -1; count = text.read(chunk)) {
            System.out.append(CharBuffer.wrap(chunk, 0, count));
        }
    }
}
/**
 * Extracts the text content of the file located at the given path.
 * <p>
 * Convenience overload: forwards to the two-argument variant with a
 * newly created {@link Metadata} instance that is not returned to the caller.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
    Metadata scratchMetadata = new Metadata();
    return parse(path, scratchMetadata);
}
/**
 * Extracts the text content of the given file.
 * <p>
 * Convenience overload: forwards to the two-argument variant with a
 * newly created {@link Metadata} instance that is not returned to the caller.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file) throws IOException {
    Metadata scratchMetadata = new Metadata();
    return parse(file, scratchMetadata);
}
/**
 * Extracts the text content of the given document stream.
 * <p>
 * Ownership of the stream passes to the returned reader: the stream and
 * any associated resources will be closed at or before the time when
 * the {@link Reader#close()} method is called.
 * <p>
 * Convenience overload: forwards to the two-argument variant with a
 * newly created {@link Metadata} instance that is not returned to the caller.
 *
 * @param stream the document to be parsed
 * @return extracted text content
 * @throws IOException if the document can not be read or parsed
 */
public Reader parse(InputStream stream) throws IOException {
    Metadata scratchMetadata = new Metadata();
    return parse(stream, scratchMetadata);
}
/**
 * Indexes the given file as a single document with two fields:
 * a stored {@code filename} field holding the file's name and a
 * {@code fulltext} field fed from the text reader produced by {@code tika}.
 *
 * @param file the file to be indexed
 * @throws Exception propagated from parsing the file or writing to the index
 */
public void indexDocument(File file) throws Exception {
    // The reader is closed once the document has been handed to the writer.
    try (Reader extractedText = tika.parse(file)) {
        Document entry = new Document();
        TextField nameField = new TextField("filename", file.getName(), Store.YES);
        entry.add(nameField);
        TextField textField = new TextField("fulltext", extractedText);
        entry.add(textField);
        writer.addDocument(entry);
    }
}
}
/**
 * Extracts the text content of the file located at the given path and
 * reports document metadata through the supplied metadata instance.
 * <p>
 * The file is opened via {@code TikaInputStream.get(path, metadata)} and
 * the resulting stream is handed to the stream-based overload.
 *
 * @param path     the path of the file to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path, Metadata metadata) throws IOException {
    // Declared as InputStream so the stream-based overload is selected.
    final InputStream source = TikaInputStream.get(path, metadata);
    return parse(source, metadata);
}
/**
 * Extracts the text content of the given file and reports document
 * metadata through the supplied metadata instance.
 * <p>
 * The file is opened via {@code TikaInputStream.get(file, metadata)} and
 * the resulting stream is handed to the stream-based overload.
 *
 * @param file     the file to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file, Metadata metadata) throws IOException {
    // Declared as InputStream so the stream-based overload is selected;
    // the deprecation warning for this factory call is suppressed locally.
    @SuppressWarnings("deprecation")
    final InputStream source = TikaInputStream.get(file, metadata);
    return parse(source, metadata);
}
/**
 * Extracts the text content of the resource found at the given URL.
 * <p>
 * A newly created {@link Metadata} instance is used both when opening
 * the resource and when parsing it; it is not returned to the caller.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
    final Metadata scratchMetadata = new Metadata();
    // Declared as InputStream so the stream-based overload is selected.
    final InputStream source = TikaInputStream.get(url, scratchMetadata);
    return parse(source, scratchMetadata);
}
/**
 * Parses the given file with {@code tika} and indexes the metadata it
 * reports as a single document: every value of every metadata key becomes
 * one stored {@link TextField} named after that key.
 *
 * @param file the file whose metadata is to be indexed
 * @throws Exception propagated from opening or parsing the file, or from
 *                   writing to the index
 */
public void indexContentSpecificMet(File file) throws Exception {
    Metadata met = new Metadata();
    try (InputStream is = new FileInputStream(file)) {
        // Only the metadata side effect is used here; the return value of
        // parse is intentionally not consumed.
        tika.parse(is, met);
        Document document = new Document();
        for (String key : met.names()) {
            for (String val : met.getValues(key)) {
                document.add(new TextField(key, val, Store.YES));
            }
        }
        // Add the document exactly once, after all fields are collected.
        // Previously this call sat inside the per-key loop, which added the
        // same, still-growing document once for every metadata key.
        writer.addDocument(document);
    }
}
.equals(metadata.get(Metadata.CONTENT_TYPE)) ? new InputStreamReader(inputStream, StandardCharsets.UTF_8) : secondaryParser.parse(inputStream);
reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8); } else { reader = secondaryParser.parse(inputStream);
/**
 * Loads the named resource from {@code /org/apache/tika/config/}, builds a
 * {@link Tika} instance configured from it, then parses that same resource
 * with the instance and returns the metadata populated by the parse.
 *
 * @param name resource name, relative to {@code /org/apache/tika/config/}
 * @return the metadata instance passed to the parse call
 * @throws TikaException declared by the signature; presumably raised when
 *                       the configuration can not be loaded
 * @throws IOException   declared by the signature; presumably raised when
 *                       the resource can not be read
 * @throws SAXException  declared by the signature; presumably raised when
 *                       the configuration XML is malformed
 */
private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
    URL configUrl = this.getClass().getResource("/org/apache/tika/config/"+name);
    assertNotNull("couldn't find: "+name, configUrl);
    Tika configuredTika = new Tika(new TikaConfig(configUrl));
    Metadata extracted = new Metadata();
    // NOTE(review): the value returned by parse is discarded and the opened
    // stream is not closed here; confirm that is intended.
    configuredTika.parse(configUrl.openStream(), extracted);
    return extracted;
}
}
/**
 * Indexes the given file's metadata as a single document, after seeding the
 * metadata with a fixed set of descriptive properties (creators, creation
 * date, detected format, source URL, subjects and a closed-choice rights
 * value) and then letting {@code tika} parse the file into the same
 * metadata instance. Every value of every metadata key becomes one stored
 * {@link TextField} named after that key.
 *
 * @param file the file to be described and indexed
 * @throws Exception propagated from type detection, opening or parsing the
 *                   file, or from writing to the index
 */
public void indexWithDublinCore(File file) throws Exception {
    Metadata met = new Metadata();
    met.add(TikaCoreProperties.CREATOR, "Manning");
    met.add(TikaCoreProperties.CREATOR, "Tika in Action");
    met.set(TikaCoreProperties.CREATED, new Date());
    met.set(TikaCoreProperties.FORMAT, tika.detect(file));
    met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
    met.add(TikaCoreProperties.SUBJECT, "File");
    met.add(TikaCoreProperties.SUBJECT, "Indexing");
    met.add(TikaCoreProperties.SUBJECT, "Metadata");
    met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(), "public", "private"),
            "public");
    try (InputStream is = new FileInputStream(file)) {
        // Only the metadata side effect is used here; the return value of
        // parse is intentionally not consumed.
        tika.parse(is, met);
        Document document = new Document();
        for (String key : met.names()) {
            for (String val : met.getValues(key)) {
                document.add(new TextField(key, val, Store.YES));
            }
        }
        // Add the document exactly once, after all fields are collected.
        // Previously this call sat inside the per-key loop, which added the
        // same, still-growing document once for every metadata key.
        writer.addDocument(document);
    }
}
}
/**
 * Builds a {@link Tika} instance from the configuration resource named by
 * {@code TIKA_CFG_FILE}, parses a small in-memory input with it, and checks
 * that the metadata field {@code DummyInitializableParser.SUM_FIELD} holds
 * {@code "5"}.
 */
@Test
public void testInitializableParser() throws Exception {
    URL cfgLocation = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
    assert cfgLocation != null;
    Tika configuredTika = new Tika(new TikaConfig(cfgLocation));
    Metadata collected = new Metadata();
    byte[] payload = "someString".getBytes(StandardCharsets.ISO_8859_1);
    TikaInputStream input = TikaInputStream.get(payload);
    configuredTika.parse(input, collected);
    assertEquals("5", collected.get(DummyInitializableParser.SUM_FIELD));
}
}
/**
 * Extracts the text content of the file located at the given path.
 * <p>
 * Convenience overload: forwards to the two-argument variant with a
 * newly created {@link Metadata} instance that is not returned to the caller.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
    Metadata scratchMetadata = new Metadata();
    return parse(path, scratchMetadata);
}
/**
 * Extracts the text content of the given file.
 * <p>
 * Convenience overload: forwards to the two-argument variant with a
 * newly created {@link Metadata} instance that is not returned to the caller.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file) throws IOException {
    Metadata scratchMetadata = new Metadata();
    return parse(file, scratchMetadata);
}
/**
 * Extracts the text content of the file located at the given path.
 * <p>
 * Convenience overload: forwards to the two-argument variant with a
 * newly created {@link Metadata} instance that is not returned to the caller.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
    Metadata scratchMetadata = new Metadata();
    return parse(path, scratchMetadata);
}
/**
 * Extracts the text content of the given file.
 * <p>
 * Convenience overload: forwards to the two-argument variant with a
 * newly created {@link Metadata} instance that is not returned to the caller.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file) throws IOException {
    Metadata scratchMetadata = new Metadata();
    return parse(file, scratchMetadata);
}
/**
 * Extracts the text content of the resource found at the given URL.
 * <p>
 * A newly created {@link Metadata} instance is used both when opening
 * the resource and when parsing it; it is not returned to the caller.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
    final Metadata scratchMetadata = new Metadata();
    // Declared as InputStream so the stream-based overload is selected.
    final InputStream source = TikaInputStream.get(url, scratchMetadata);
    return parse(source, scratchMetadata);
}
/**
 * Extracts the text content of the resource found at the given URL.
 * <p>
 * A newly created {@link Metadata} instance is used both when opening
 * the resource and when parsing it; it is not returned to the caller.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
    final Metadata scratchMetadata = new Metadata();
    // Declared as InputStream so the stream-based overload is selected.
    final InputStream source = TikaInputStream.get(url, scratchMetadata);
    return parse(source, scratchMetadata);
}