public static void main(String[] args) throws Exception { // Create a Tika instance with the default configuration Tika tika = new Tika(); // Parse all given files and print out the extracted // text content for (String file : args) { String text = tika.parseToString(new File(file)); System.out.print(text); } } }
/**
 * Extracts the text of {@code example.doc}, echoes it to standard output
 * and hands it back to the caller.
 *
 * @return the text extracted from {@code example.doc}
 * @throws Exception if the file can not be read or parsed
 */
public static String parseToStringExample() throws Exception {
    final File source = new File("example.doc");
    final Tika extractor = new Tika();
    final String text = extractor.parseToString(source);
    System.out.print(text);
    return text;
}
/**
 * Demonstrates {@code Tika.parseToString}: the bundled {@code test.doc}
 * resource is parsed and whatever text it contains is returned.
 * <p>
 * Note: {@code Tika.parseToString()} extracts content from the outer
 * container document as well as from any embedded/attached documents.
 *
 * @return the text found in the resource
 */
public String parseToStringExample() throws IOException, SAXException, TikaException {
    final Tika extractor = new Tika();
    try (InputStream input = ParsingExample.class.getResourceAsStream("test.doc")) {
        final String text = extractor.parseToString(input);
        return text;
    }
}
/**
 * Adds one file to the index: its name goes into the stored
 * {@code filename} field and its extracted text into the unstored
 * {@code fulltext} field.
 *
 * @param file the file to index
 * @throws Exception if the file can not be parsed or the document can not be added
 */
public void indexDocument(File file) throws Exception {
    Document entry = new Document();

    TextField nameField = new TextField("filename", file.getName(), Store.YES);
    entry.add(nameField);

    TextField bodyField = new TextField("fulltext", tika.parseToString(file), Store.NO);
    entry.add(bodyField);

    writer.addDocument(entry);
}
} // closes a scope opened before this excerpt
/**
 * Extracts and returns the text content of the given document. This
 * method closes the supplied input stream.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 * <p>
 * <strong>NOTE:</strong> Most other Tika methods that accept an
 * {@link InputStream} leave closing the stream (or a wrapper instance
 * returned by Tika) to the caller. This method is the exception: it
 * closes the given stream for you as a convenience.
 *
 * @param stream the document to be parsed
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
 */
public String parseToString(InputStream stream) throws IOException, TikaException {
    // Start from empty metadata; this overload takes none from the caller.
    Metadata blankMetadata = new Metadata();
    return parseToString(stream, blankMetadata);
}
/**
 * Extracts and returns the text content of the file at the given path.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public String parseToString(Path path) throws IOException, TikaException {
    // The same Metadata instance is given to the stream factory and to the parse call.
    Metadata meta = new Metadata();
    return parseToString(TikaInputStream.get(path, meta), meta);
}
/**
 * Extracts and returns the text content of the resource at the given URL.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read
 * @throws TikaException if the resource can not be parsed
 */
public String parseToString(URL url) throws IOException, TikaException {
    // The same Metadata instance is given to the stream factory and to the parse call.
    Metadata meta = new Metadata();
    return parseToString(TikaInputStream.get(url, meta), meta);
}
public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc) throws IOException { final Metadata md = new Metadata(); md.set(HttpHeaders.CONTENT_TYPE, contentType); try { // Add body text. doc.add(text(fieldName, tika.parseToString(in, md), false)); } catch (final IOException e) { log.warn("Failed to index an attachment.", e); return; } catch (final TikaException e) { log.warn("Failed to parse an attachment.", e); return; } // Add DC attributes. addDublinCoreAttributes(md, doc); }
/**
 * Extracts and returns the text content of the given file.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 * @see #parseToString(Path)
 */
public String parseToString(File file) throws IOException, TikaException {
    Metadata meta = new Metadata();
    // The warning suppression is scoped to this one declaration.
    @SuppressWarnings("deprecation")
    InputStream input = TikaInputStream.get(file, meta);
    return parseToString(input, meta);
}
/**
 * Parses a file and packages its name, extracted text and creation date
 * into a {@code TrecDocument}.
 *
 * @param file the file to summarize
 * @return a document built from the file's name, its extracted text and the
 *         {@code CREATED} date found in the parse metadata
 * @throws FileNotFoundException if the file can not be opened
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public TrecDocument summarize(File file) throws FileNotFoundException, IOException, TikaException {
    Tika tika = new Tika();
    Metadata met = new Metadata();
    // A plain FileInputStream carries no file name, so nothing would populate
    // RESOURCE_NAME_KEY and the summary would be built with a null name.
    // Record the name up front so it is available when the result is assembled.
    met.set(TikaCoreProperties.RESOURCE_NAME_KEY, file.getName());
    // The stream is handed straight to parseToString; Tika documents that its
    // InputStream-based parseToString closes the stream it is given.
    String contents = tika.parseToString(new FileInputStream(file), met);
    return new TrecDocument(met.get(TikaCoreProperties.RESOURCE_NAME_KEY), contents,
            met.getDate(TikaCoreProperties.CREATED));
}
/**
 * Asserts that two files yield exactly the same extracted text.
 *
 * @param expected the reference file
 * @param result the file under test
 * @throws IOException if either file can not be read
 * @throws TikaException if either file can not be parsed
 */
private void compareXlsx(File expected, File result) throws IOException, TikaException {
    Tika extractor = new Tika();
    // The expected file is parsed first, then the result file.
    assertEquals(extractor.parseToString(expected), extractor.parseToString(result));
}
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers</artifactId>
    <version>1.13</version>
    <scope>test</scope>
</dependency>
private String doParse(final InputStream inputStream) { try { // tika parseToString already closes the inputStream return tika.parseToString(inputStream); } catch (TikaException e) { throw new IllegalStateException("Unexpected TikaException processing failure", e); } catch (IOException e) { throw new IllegalStateException("Unexpected IOException processing failure", e); } }
public String parseToStringExample() throws IOException, SAXException, TikaException { Tika tika = new Tika(); try (InputStream stream = ParsingExample.class.getResourceAsStream("test.pdf")) { return tika.parseToString(stream); // This should return you the pdf's text } }
// Minimal usage sketch: extract the plain text of one file through the Tika facade.
// The "..." is a placeholder in the snippet; supply the File to be parsed.
File inputFile = ...;
Tika tika = new Tika();
// Holds the text that parseToString extracted from inputFile.
String extractedText = tika.parseToString(inputFile);
/**
 * Extracts and returns the text content of the resource at the given URL.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read
 * @throws TikaException if the resource can not be parsed
 */
public String parseToString(URL url) throws IOException, TikaException {
    // The same Metadata instance is given to the stream factory and to the parse call.
    Metadata meta = new Metadata();
    return parseToString(TikaInputStream.get(url, meta), meta);
}
Tika tika = new Tika(); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, "myfile.name"); String text = tika.parseToString(new File("myfile.name"));
/**
 * Extracts and returns the text content of the file at the given path.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public String parseToString(Path path) throws IOException, TikaException {
    // The same Metadata instance is given to the stream factory and to the parse call.
    Metadata meta = new Metadata();
    return parseToString(TikaInputStream.get(path, meta), meta);
}
/**
 * Extracts and returns the text content of the file at the given path.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public String parseToString(Path path) throws IOException, TikaException {
    // The same Metadata instance is given to the stream factory and to the parse call.
    Metadata meta = new Metadata();
    return parseToString(TikaInputStream.get(path, meta), meta);
}
/**
 * Extracts and returns the text content of the given file.
 * <p>
 * The returned string is capped at the first
 * {@link #getMaxStringLength()} characters extracted from the document so
 * that memory use stays predictable; call {@link #setMaxStringLength(int)}
 * to change that cap.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 * @see #parseToString(Path)
 */
public String parseToString(File file) throws IOException, TikaException {
    Metadata meta = new Metadata();
    // The warning suppression is scoped to this one declaration.
    @SuppressWarnings("deprecation")
    InputStream input = TikaInputStream.get(file, meta);
    return parseToString(input, meta);
}
/**
 * Extracts the text of the attachment named {@code this.filename} from the
 * given document and returns it after passing it through
 * {@code StringUtils.lowerCase}.
 * <p>
 * This is best-effort: any {@link Throwable} raised while fetching or
 * parsing the attachment is logged as a warning and {@code null} is
 * returned instead.
 *
 * @param doc the document holding the attachment
 * @param context the context used to open the attachment's content stream
 * @return the lower-cased attachment text, or {@code null} on failure
 */
private String getContentAsText(XWikiDocument doc, XWikiContext context) {
    try {
        XWikiAttachment attachment = doc.getAttachment(this.filename);

        LOGGER.debug("Start parsing attachement [{}] in document [{}]", this.filename,
                doc.getDocumentReference());

        Tika extractor = new Tika();
        // The attachment's file name is recorded in the metadata handed to the parser.
        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, this.filename);

        String rawText = extractor.parseToString(attachment.getContentInputStream(context), meta);
        return StringUtils.lowerCase(rawText);
    } catch (Throwable ex) {
        LOGGER.warn("error getting content of attachment [{}] for document [{}]",
                new Object[] {this.filename, doc.getDocumentReference(), ex});
    }
    // Reached only when the extraction above failed.
    return null;
}
} // closes a scope opened before this excerpt