public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them // to the underlying Handler. PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } String[] numbers = metadata.getValues("phonenumbers"); Collections.addAll(phoneNumbers, numbers); } }
private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, ParseContext context) throws TikaException {// throws IOException InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 try { stream = new ByteArrayInputStream(byteObject); htmlParser.parse(stream, handler, metadata, context); } catch (SAXException e) { throw new RuntimeException(e); } catch (IOException e) { // Pushback overflow from tagsoup } }
/**
 * Parses standard input with a {@code DirListParser} and prints the number of
 * {@code "Filename"} metadata values and the {@code "NumExecutables"} metadata value.
 *
 * @param args unused
 */
public static void main(String[] args) throws IOException, SAXException, TikaException {
    Metadata listingMetadata = new Metadata();
    DirListParser dirListParser = new DirListParser();
    dirListParser.parse(System.in, new BodyContentHandler(), listingMetadata);

    int fileCount = listingMetadata.getValues("Filename").length;
    System.out.println("Num files: " + fileCount);

    String executableCount = listingMetadata.get("NumExecutables");
    System.out.println("Num executables: " + executableCount);
}
/**
 * Parses an empty stream through a ForkParser wrapping a ForkTestParser and checks the
 * produced text and the reported content type.
 */
@Test
public void testHelloWorld() throws Exception {
    ClassLoader loader = ForkParserTest.class.getClassLoader();
    try (ForkParser forkParser = new ForkParser(loader, new ForkTestParser())) {
        Metadata parsedMetadata = new Metadata();
        ContentHandler body = new BodyContentHandler();
        InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
        ParseContext parseContext = new ParseContext();

        forkParser.parse(emptyInput, body, parsedMetadata, parseContext);

        String text = body.toString().trim();
        assertEquals("Hello, World!", text);
        assertEquals("text/plain", parsedMetadata.get(Metadata.CONTENT_TYPE));
    }
}
/**
 * Parses an empty input with an AutoDetectParser, sending events both to a
 * BodyContentHandler that writes to {@code filename} and to a LinkContentHandler.
 *
 * @param filename path of the file the body content is written to
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static void testTeeContentHandler(String filename) throws Exception {
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    Metadata parseMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    Parser autoDetect = new AutoDetectParser();
    LinkContentHandler links = new LinkContentHandler();

    File target = new File(filename);
    try (OutputStream fileOut = new FileOutputStream(target)) {
        ContentHandler bodyToFile = new BodyContentHandler(fileOut);
        ContentHandler tee = new TeeContentHandler(bodyToFile, links);
        autoDetect.parse(emptyInput, tee, parseMetadata, parseContext);
    }
}
// Parse a hard-coded mp3 file with an AutoDetectParser, then print the extracted body text
// and the creation / last-modified metadata values.
Parser parser = new AutoDetectParser();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// try-with-resources so the file handle is released even when parsing throws.
try (InputStream is = new FileInputStream("/home/rahul/Music/03 - I Like Your Music.mp3")) {
    parser.parse(is, handler, metadata, new ParseContext());
}
// Named differently from `handler`: redeclaring `handler` as a String in the same scope
// does not compile.
String handlerText = handler.toString();
System.out.println("Handler data: " + handlerText);
System.out.println(metadata.get(Metadata.CREATION_DATE));
System.out.println(metadata.get(Metadata.LAST_MODIFIED));
/**
 * Handles an inline body part: reads the part's content type from its metadata and, when it
 * equals the text/html media type (case-insensitive), passes the part's bytes to
 * {@code parser} with a BodyContentHandler wrapped in an EmbeddedContentHandler.
 *
 * NOTE(review): as shown, {@code parser} is still null when {@code parse(...)} is called, the
 * trailing {@code catch} has no visible matching {@code try}, and the method body is not
 * closed. The snippet appears truncated/elided; confirm against the full source.
 */
private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException { String contentType = part.metadata.get(Metadata.CONTENT_TYPE); Parser parser = null; if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) { parser.parse( new ByteArrayInputStream(part.bytes), new EmbeddedContentHandler(new BodyContentHandler(handler)), new Metadata(), parseContext ); } catch (SAXException | TikaException e) {
/**
 * Parses {@code filename} with an AutoDetectParser built from {@code tikaConfig} and returns
 * the text collected by a BodyContentHandler.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata   metadata object passed to both the input stream factory and the parser
 * @return the string form of the body content handler after parsing
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");

    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // Close the stream when done (also on failure) so the underlying file is released;
    // previously it was left open.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
// Registers `parser` in a new ParseContext under Parser.class, then calls parser.parse three
// times, each with its own writer-backed BodyContentHandler and metadata object. The results of
// the second and third calls are passed through normalizeContent(...) and assigned to `content`,
// so the third assignment overwrites the second.
// NOTE(review): all three calls read from the same `in` stream — presumably these were separate
// branches in the full source; confirm before relying on this sequence.
final ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); parser.parse(in, new BodyContentHandler(writer), metadata, parseContext); final StringWriter writer2 = new StringWriter( initialBufferSize); parser.parse(in, new BodyContentHandler(writer2), metadata2, parseContext); content = normalizeContent(writer2); final StringWriter writer3 = new StringWriter( initialBufferSize); parser.parse(in, new BodyContentHandler(writer3), metadata3, parseContext); content = normalizeContent(writer3);
/**
 * Parses {@code stream} and returns the extracted text together with a copy of the
 * resulting metadata as a name-to-value map.
 *
 * @param stream      content to parse
 * @param fileName    passed to {@code createMetadata}
 * @param contentType passed to {@code createMetadata}
 * @return the parsed data, or {@code null} if the parser throws an IOException,
 *         SAXException or TikaException (the failure is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipeHandler, metadata, parseContext);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }

    // Flatten the metadata into a plain map, one value per property name.
    Map<String, String> properties = new HashMap<>();
    for (String name : metadata.names()) {
        properties.put(name, metadata.get(name));
    }
    return new ParsedData(bodyHandler.toString(), properties);
}
/**
 * Parses an empty stream through a ForkParser wrapping a
 * ForkTestParser.ForkTestParserAccessingPackage and checks the produced text and the
 * reported content type.
 */
@Test
public void testPackageCanBeAccessed() throws Exception {
    ClassLoader loader = ForkParserTest.class.getClassLoader();
    try (ForkParser forkParser =
                 new ForkParser(loader, new ForkTestParser.ForkTestParserAccessingPackage())) {
        Metadata parsedMetadata = new Metadata();
        ContentHandler body = new BodyContentHandler();
        InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
        ParseContext parseContext = new ParseContext();

        forkParser.parse(emptyInput, body, parsedMetadata, parseContext);

        String text = body.toString().trim();
        assertEquals("Hello, World!", text);
        assertEquals("text/plain", parsedMetadata.get(Metadata.CONTENT_TYPE));
    }
}
/**
 * Parses the content at {@code url} with a PDFParser and returns the metadata it populated.
 *
 * @param url location of the document to parse
 * @return the metadata object passed to the parser
 * @throws IOException   if the URL cannot be opened or read
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if the parser fails
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // Close the URL stream when parsing finishes or fails; previously it was never closed.
    try (java.io.InputStream stream = url.openStream()) {
        parser.parse(stream, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
/**
 * Example that parses a hard-coded .doc file with an AutoDetectParser and prints the values
 * stored under Metadata.CREATION_DATE and Metadata.LAST_MODIFIED. An IOException is reported
 * via printStackTrace().
 *
 * NOTE(review): the visible portion never closes {@code is}, and the snippet ends right after
 * the catch block, before main and the class are closed. The remainder (possibly a finally
 * clause) is outside this view; confirm before changing.
 */
public class tikaExample { public static void main(String[] args) throws SAXException, TikaException { InputStream is = null; try { is = new BufferedInputStream(new FileInputStream(new File("/home/rahul/Downloads/darknet5.doc"))); Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(is, handler, metadata, new ParseContext()); System.out.println("creation date "+metadata.get(Metadata.CREATION_DATE)); System.out.println("last modify date "+metadata.get(Metadata.LAST_MODIFIED)); } catch (IOException e) { e.printStackTrace(); }
/**
 * Example of pulling the plain text out of a document.
 * Only the "body" part of the document is returned.
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    Metadata docMetadata = new Metadata();
    AutoDetectParser autoParser = new AutoDetectParser();
    BodyContentHandler bodyText = new BodyContentHandler();

    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(docStream, bodyText, docMetadata);
    }
    return bodyText.toString();
}
// Create a fresh metadata object and body handler, then read the "Content-Type" entry.
// NOTE(review): nothing visible populates `metadata` before the lookup — presumably a parse
// call was elided from this snippet; confirm against the full source.
Metadata metadata = new Metadata();
BodyContentHandler handler = new BodyContentHandler();
contentType = metadata.get("Content-Type");
public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); // The StandardsExtractingContentHandler will examine any characters for // standard references before passing them // to the underlying Handler. StandardsExtractingContentHandler handler = new StandardsExtractingContentHandler(new BodyContentHandler(-1), metadata); handler.setThreshold(0.75); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } String[] references = metadata.getValues(StandardsExtractingContentHandler.STANDARD_REFERENCES); Collections.addAll(standardReferences, references); } }
// Record the content type in the metadata before parsing, then parse the stream into a
// fresh body handler.
ContentHandler handler = new BodyContentHandler();
metadata.set(Metadata.CONTENT_TYPE, type.toString());
parser.parse(stream, handler, metadata, new ParseContext());
// Registers `parser` in a new ParseContext under Parser.class, then calls parser.parse three
// times, each with its own writer-backed BodyContentHandler and metadata object.
// NOTE(review): all three calls read from the same `in` stream — presumably these were separate
// branches in the full source; confirm before relying on this sequence.
final ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); parser.parse( in, new BodyContentHandler(writer), metadata, parseContext); parser.parse( in, new BodyContentHandler(writer2), metadata2, parseContext); parser.parse( in, new BodyContentHandler(writer3), metadata3, parseContext);
// Builds a BodyContentHandler that writes through a UTF-8 OutputStreamWriter into an in-memory
// ByteArrayOutputStream, registers a VisalloParserConfig in the ParseContext under
// PDFParserConfig.class, then passes each metadata name and value to LOGGER.debug.
// NOTE(review): no parse call is visible between the setup and the logging loop, and the loop
// body is cut off — the snippet looks elided; confirm against the full source.
ByteArrayOutputStream output = new ByteArrayOutputStream(); OutputStreamWriter writer = new OutputStreamWriter(output, "UTF-8"); ContentHandler handler = new BodyContentHandler(writer); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, new VisalloParserConfig()); LOGGER.debug("metadata"); for (String metadataName : metadata.names()) { LOGGER.debug(" %s: %s", metadataName, metadata.get(metadataName));
/**
 * Runs ten consecutive parses of an empty stream through a single ForkParser, sharing one
 * ParseContext, and checks the produced text each time.
 */
@Test
public void testSerialParsing() throws Exception {
    ClassLoader loader = ForkParserTest.class.getClassLoader();
    try (ForkParser forkParser = new ForkParser(loader, new ForkTestParser())) {
        ParseContext sharedContext = new ParseContext();
        for (int run = 0; run < 10; run++) {
            ContentHandler body = new BodyContentHandler();
            InputStream emptyInput = new ByteArrayInputStream(new byte[0]);

            forkParser.parse(emptyInput, body, new Metadata(), sharedContext);

            String text = body.toString().trim();
            assertEquals("Hello, World!", text);
        }
    }
}