/**
 * Example of extracting the plain text of the contents.
 * Will return only the "body" part of the document.
 *
 * @return the plain-text body of the bundled "test.doc" resource
 * @throws IOException   if the resource is missing or cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    // Default BodyContentHandler caps output at 100k characters — fine for the small test file.
    BodyContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        if (stream == null) {
            // Fail fast with a clear message instead of an opaque NPE inside the parser.
            throw new IOException("resource not found: test.doc");
        }
        parser.parse(stream, handler, metadata);
        return handler.toString();
    }
}
// Fragment: opens the bundled "test.doc" resource and extracts its plain-text body.
// NOTE(review): getResourceAsStream may return null if the resource is missing,
// which would surface as an NPE inside parse() — confirm the resource is packaged.
try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {
    parser.parse(stream, handler, metadata);
    return handler.toString();
/**
 * This example shows how to extract content from the outer document and all
 * embedded documents. The key is to specify a {@link Parser} in the {@link ParseContext}.
 *
 * @return content, including from embedded documents
 * @throws IOException   if the resource is missing or cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // The Parser registered in the context is used to recurse into embedded documents.
    context.set(Parser.class, parser);
    try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        if (stream == null) {
            // Fail fast with a clear message instead of an opaque NPE inside the parser.
            throw new IOException("resource not found: test_recursive_embedded.docx");
        }
        parser.parse(stream, handler, metadata, context);
        return handler.toString();
    }
}
/**
 * If you don't want content from embedded documents, send in a
 * {@link org.apache.tika.parser.ParseContext} that contains an {@link EmptyParser}.
 *
 * @return the content of the outer file only
 * @throws IOException   if the resource is missing or cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseNoEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    // EmptyParser discards embedded documents, so only the outer document's text is extracted.
    parseContext.set(Parser.class, new EmptyParser());
    try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        if (stream == null) {
            // Fail fast with a clear message instead of an opaque NPE inside the parser.
            throw new IOException("resource not found: test_recursive_embedded.docx");
        }
        parser.parse(stream, handler, metadata, parseContext);
        return handler.toString();
    }
}
// Fragment: parses five unrecognizable bytes; the context's fallback parser writes "Fell back!".
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
// Second parse of the same bytes expects empty output — presumably the parser/context was
// reconfigured between these calls so no fallback fires; NOTE(review): confirm surrounding setup.
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("", handler.toString());
// Fragment (truncated mid-statement): tail of a parse(...) call taking handler h,
// metadata m and the shared parseContext.
h, m, parseContext);
// Copies the text collected by h into the outer handler.
// NOTE(review): characters(String) suggests `handler` is an XHTMLContentHandler or similar
// decorator — the plain SAX ContentHandler.characters takes (char[], int, int); verify the type.
handler.characters(h.toString());
} catch (SAXException e) {
// Fragment: every parse of the unrecognizable bytes routes to the fallback parser,
// which emits "Fell back!". A fresh handler per call shows the output never accumulates.
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
// Fragment: first parse triggers only the first fallback parser ("Fell back 1!").
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!", handler.toString());
// Subsequent parses expect output from BOTH fallback parsers even though each call uses a
// fresh handler — presumably the parser under test fans out to all registered fallbacks
// after the first failure; NOTE(review): confirm against the surrounding test setup.
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
// Fetch a remote document over HTTP, extract its plain text with Tika, and write it to a file.
HttpGet httpget = new HttpGet("http://url.here");
HttpEntity entity = null;
HttpClient client = new DefaultHttpClient();
HttpResponse response = client.execute(httpget);
entity = response.getEntity();
if (entity != null) {
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    Parser parser = new AutoDetectParser();
    // try-with-resources guarantees the response stream is released even if parsing
    // throws — the original leaked `instream` on any failure.
    try (InputStream instream = entity.getContent()) {
        parser.parse(instream, handler, metadata, new ParseContext());
    }
    String plainText = handler.toString();
    // try-with-resources flushes and closes the writer even if write() throws — the
    // original leaked the FileWriter on failure.
    // NOTE(review): FileWriter uses the platform default charset; consider UTF-8.
    try (FileWriter writer = new FileWriter("/scratch/cache/output.txt")) {
        writer.write(plainText);
    }
    System.out.println("done");
}
// Extracts the plain-text body of the stream `is` into `plainText` using the ambient
// parser and metadata objects.
BodyContentHandler bch = new BodyContentHandler();
// Default handler caps output at 100k characters — NOTE(review): confirm inputs stay under it.
parser.parse(is, bch, metadata, new ParseContext());
String plainText = bch.toString();
/**
 * Parses the bundled "test.pdf" resource and returns its plain-text body.
 *
 * @return the extracted text content
 * @throws IOException   if the resource cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseExample() throws IOException, SAXException, TikaException {
    AutoDetectParser pdfParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    BodyContentHandler textHandler = new BodyContentHandler();
    try (InputStream resourceStream = ParsingExample.class.getResourceAsStream("test.pdf")) {
        pdfParser.parse(resourceStream, textHandler, docMetadata);
        return textHandler.toString();
    }
}
public String fetchPageAutoDetectParser(final String url ){ String fetchURL = addHttp(url); String pageContent = null; URLConnection connection; try { log.info("fetch url auto detect parser " + url); connection = new URL(fetchURL).openConnection(); connection.setReadTimeout(DEFAULT_TIMEOUT); //parse method parameters Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); //parsing the file parser.parse(connection.getInputStream(), handler, metadata, context); pageContent = handler.toString(); } catch (Exception e) { log.info(e.getMessage() + "\n" + e); } return pageContent; }
// Parses the stream `is` to plain text (auto-detecting the document type) into `text`.
BodyContentHandler handler = new BodyContentHandler();
AutoDetectParser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
try {
    parser.parse(is, handler, metadata);
    text = handler.toString();
} catch(TikaException te) {
    // NOTE(review): only TikaException is caught, and it is merely printed — on parse
    // failure `text` keeps its previous value; IOException/SAXException propagate.
    System.out.println(te.toString());
} finally {
    // Always release the input stream, even when parsing fails.
    is.close();
}
/**
 * Extract data from MS Word DOC/DOCX file to text.
 *
 * @param path location of the Word document on disk
 * @return the plain-text body of the document
 * @throws DataflowException if the file cannot be read or parsed
 */
public static String extractWordFile(Path path) throws DataflowException {
    AutoDetectParser wordParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    BodyContentHandler textHandler = new BodyContentHandler();
    try (FileInputStream docStream = new FileInputStream(path.toString())) {
        wordParser.parse(docStream, textHandler, docMetadata);
        return textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // Wrap all read/parse failures in the pipeline's checked exception, preserving the cause.
        throw new DataflowException(e);
    }
}
/**
 * Parses the given stream through a Boilerpipe keep-everything handler and collects
 * both the extracted text and every Tika metadata property.
 *
 * @param stream      raw document bytes
 * @param fileName    original file name (used when building metadata)
 * @param contentType declared content type (used when building metadata)
 * @return the parsed text plus a metadata map, or {@code null} when extraction fails
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler handler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler textHandler =
            new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    try {
        parser.parse(stream, textHandler, metadata, new ParseContext());
        // Copy every metadata property into a plain map for the caller.
        Map<String, String> properties = new HashMap<>();
        for (String key : metadata.names()) {
            properties.put(key, metadata.get(key));
        }
        return new ParsedData(handler.toString(), properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata metadata = new Metadata();
        AutoDetectParser autoParser = new AutoDetectParser();
        autoParser.parse(stream, textHandler, metadata, new ParseContext());

        // Keep only the text above the tearline marker, when one is present.
        String fullContent = textHandler.toString();
        Matcher tearline = tearlinePattern.matcher(fullContent);
        String retained = tearline.find() ? fullContent.substring(0, tearline.start()) : fullContent;
        jCas.setDocumentText(removeBoilerplate(retained).trim());

        // Surface every Tika metadata property as document metadata.
        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        AutoDetectParser autoParser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        autoParser.parse(stream, textHandler, metadata, new ParseContext());

        String fullContent = textHandler.toString();
        // Truncate at the tearline marker when one is present, then strip boilerplate.
        Matcher tearlineMatch = tearlinePattern.matcher(fullContent);
        String body = tearlineMatch.find()
                ? fullContent.substring(0, tearlineMatch.start())
                : fullContent;
        jCas.setDocumentText(removeBoilerplate(body).trim());

        // Surface every Tika metadata property as document metadata.
        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
public void testFopMacroLibrary() throws Exception { String screentextUrl = screenUrl.concat("Fop"); HttpClient http = initHttpClient(); http.setUrl(screentextUrl.concat(authentificationQuery)); //FIXME need to check if the stream is an application-pdf that don't contains ftl stack trace InputStream screenInputStream = http.postStream(); assertNotNull("Response failed from ofbiz", screenInputStream); assertEquals("Response contentType isn't good : " + http.getResponseContentType(), "application/pdf;charset=UTF-8", http.getResponseContentType()); String screenOutString = ""; try { BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); Metadata metadata = new Metadata(); new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext()); screenOutString = handler.toString(); } finally { screenInputStream.close(); } //Test if a ftl macro error is present assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail", screenOutString.contains("FreeMarker template error:")); } }
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler contentHandler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata docMetadata = new Metadata();
        AutoDetectParser detectingParser = new AutoDetectParser();
        detectingParser.parse(stream, contentHandler, docMetadata, new ParseContext());

        jCas.setDocumentText(contentHandler.toString());
        // Surface every Tika metadata property as document metadata.
        for (String key : docMetadata.names()) {
            addMetadata(jCas, key, docMetadata.get(key));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        // Fall back to placeholder text only when nothing at all was extracted.
        if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
}
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata meta = new Metadata();
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
        parser.parse(stream, bodyHandler, meta, new ParseContext());

        jCas.setDocumentText(bodyHandler.toString());
        // Surface every Tika metadata property as document metadata.
        for (String propertyName : meta.names()) {
            addMetadata(jCas, propertyName, meta.get(propertyName));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        // Fall back to placeholder text only when nothing at all was extracted.
        if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
}