Refine search
/**
 * Creates the parser wrapper and prepares the state it parses with.
 *
 * <p>The constructor stores the supplied configuration and TLD list, creates
 * a new {@code HtmlParser}, and creates a {@code ParseContext} in which an
 * {@code AllTagMapper} instance is registered under {@code HtmlMapper.class}.
 *
 * @param config crawl configuration kept by this parser
 * @param tldList top-level-domain list kept by this parser
 * @throws InstantiationException declared for backward compatibility with
 *         existing callers; this body no longer instantiates reflectively
 * @throws IllegalAccessException declared for backward compatibility with
 *         existing callers; this body no longer instantiates reflectively
 */
public TikaHtmlParser(CrawlConfig config, TLDList tldList)
        throws InstantiationException, IllegalAccessException {
    this.config = config;
    this.tldList = tldList;

    htmlParser = new HtmlParser();
    parseContext = new ParseContext();

    // Class.newInstance() is deprecated (Java 9+) because it rethrows any
    // checked exception raised by the constructor without declaring it.
    // Direct construction is checked by the compiler instead; it relies on the
    // same accessible no-argument constructor the reflective call required.
    parseContext.set(HtmlMapper.class, new AllTagMapper());
}
/**
 * Parses a document with a newly created context.
 *
 * <p>The context created here has this parser registered under
 * {@code Parser.class}; the call is then forwarded to the four-argument
 * {@code parse} overload.
 *
 * @param stream the document stream to parse
 * @param handler receiver of the parse events
 * @param metadata document metadata passed through to the delegate call
 * @throws IOException propagated from the delegate call
 * @throws SAXException propagated from the delegate call
 * @throws TikaException propagated from the delegate call
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
        throws IOException, SAXException, TikaException {
    final ParseContext selfContext = new ParseContext();
    selfContext.set(Parser.class, this);
    this.parse(stream, handler, metadata, selfContext);
}
/**
 * Parses the given document and returns a reader over its extracted text.
 * <p>
 * Input metadata such as a file name or a content type hint may be supplied
 * in the given metadata instance; metadata extracted from the document is
 * returned in that same instance.
 * <p>
 * The returned reader takes over responsibility for closing the given
 * stream: the stream and any associated resources are closed at or before
 * the time {@link Reader#close()} is called.
 *
 * @param stream the document to be parsed
 * @param metadata where the document's metadata will be populated
 * @return reader over the extracted text content
 * @throws IOException if the document can not be read or parsed
 */
public Reader parse(InputStream stream, Metadata metadata) throws IOException {
    final ParseContext readerContext = new ParseContext();
    // Register the configured parser under Parser.class in the new context.
    readerContext.set(Parser.class, parser);

    final Reader textReader = new ParsingReader(parser, stream, metadata, readerContext);
    return textReader;
}
/**
 * Creates a reader over the text content of a binary stream.
 *
 * <p>Delegates to the four-argument constructor with a new
 * {@code AutoDetectParser}, empty {@code Metadata} and a new
 * {@code ParseContext}, then registers the parser in that context under
 * {@code Parser.class}.
 *
 * @param stream binary stream
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream) throws IOException {
    this(
            new AutoDetectParser(),
            stream,
            new Metadata(),
            new ParseContext());
    this.context.set(Parser.class, this.parser);
}
/**
 * Creates a reader over the text content of a named binary stream.
 *
 * <p>Delegates to the four-argument constructor with a new
 * {@code AutoDetectParser}, the metadata produced by
 * {@code getMetadata(name)} and a new {@code ParseContext}, then registers
 * the parser in that context under {@code Parser.class}.
 *
 * @param stream binary stream
 * @param name document name
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream, String name) throws IOException {
    this(
            new AutoDetectParser(),
            stream,
            getMetadata(name),
            new ParseContext());
    this.context.set(Parser.class, this.parser);
}
/**
 * Parses an empty byte stream with an {@code AutoDetectParser} while
 * {@code Locale.ENGLISH} is registered in the parse context under
 * {@code Locale.class}.
 *
 * @throws Exception if the parse call fails
 */
public static void testLocale() throws Exception {
    final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    final ContentHandler sink = new DefaultHandler();
    final Metadata meta = new Metadata();
    final Parser autoDetect = new AutoDetectParser();

    // The locale travels to the parser through the context.
    final ParseContext localeContext = new ParseContext();
    localeContext.set(Locale.class, Locale.ENGLISH);

    autoDetect.parse(emptyInput, sink, meta, localeContext);
}
/**
 * Parses the given stream with a context that carries both the parser and a
 * {@code MyEmbeddedDocumentExtractor} built for the output directory.
 *
 * @param is stream of the document to parse
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException propagated from the parse call
 * @throws TikaException propagated from the parse call
 * @throws IOException propagated from the parse call
 */
public void extract(InputStream is, Path outputDir)
        throws SAXException, TikaException, IOException {
    final ParseContext parseContext = new ParseContext();

    // Register the parser first: the extractor below is handed this same
    // context at construction time.
    parseContext.set(Parser.class, parser);
    parseContext.set(
            EmbeddedDocumentExtractor.class,
            new MyEmbeddedDocumentExtractor(outputDir, parseContext));

    // NOTE(review): -1 presumably disables the handler's write limit — confirm
    // against the BodyContentHandler documentation.
    final ContentHandler bodyHandler = new BodyContentHandler(-1);

    parser.parse(is, bodyHandler, new Metadata(), parseContext);
}
/**
 * Parses an empty byte stream with an {@code AutoDetectParser} while an
 * {@code IdentityHtmlMapper} is registered in the parse context under
 * {@code HtmlMapper.class}.
 *
 * @throws Exception if the parse call fails
 */
public static void testHtmlMapper() throws Exception {
    final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    final ContentHandler sink = new DefaultHandler();
    final Metadata meta = new Metadata();
    final Parser autoDetect = new AutoDetectParser();

    // The HTML mapper travels to the parser through the context.
    final ParseContext mapperContext = new ParseContext();
    mapperContext.set(HtmlMapper.class, new IdentityHtmlMapper());

    autoDetect.parse(emptyInput, sink, meta, mapperContext);
}
public static void testCompositeDocument() throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); Parser parser = new AutoDetectParser(); ParseContext context = new ParseContext(); context.set(Parser.class, new ParserDecorator(parser) { private static final long serialVersionUID = 4424210691523343833L; @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // custom processing of the component document } }); parser.parse(stream, handler, metadata, context); } }
/**
 * Parses the given stream with a context in which a {@code RecursiveParser},
 * built from the supplied extractor and handler, is registered under
 * {@code Parser.class}.
 *
 * @param stream the container document to parse
 * @param recurseExtractor extractor passed to the {@code RecursiveParser}
 * @param handler resource handler passed to the {@code RecursiveParser}
 * @throws IOException propagated from the parse call
 * @throws TikaException propagated from the parse call, or wrapping a
 *         {@code SAXException} raised by it
 */
public void extract(
        TikaInputStream stream,
        ContainerExtractor recurseExtractor,
        EmbeddedResourceHandler handler)
        throws IOException, TikaException {
    final Parser recursive = new RecursiveParser(recurseExtractor, handler);
    final ParseContext recursionContext = new ParseContext();
    recursionContext.set(Parser.class, recursive);

    try {
        parser.parse(stream, new DefaultHandler(), new Metadata(), recursionContext);
    } catch (SAXException sax) {
        // SAXException is not in this method's signature; wrap it, keeping the cause.
        throw new TikaException("Unexpected SAX exception", sax);
    }
}
public boolean findInFile(String query, Path path) { InterruptingContentHandler handler = new InterruptingContentHandler(query); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); context.set(Parser.class, tika.getParser()); try (InputStream is = new BufferedInputStream(Files.newInputStream(path))) { tika.getParser().parse(is, handler, metadata, context); } catch (QueryMatchedException e) { return true; } catch (SAXException | TikaException | IOException e) { // something went wrong with parsing... e.printStackTrace(); } return false; }
/**
 * Example: extract content from the outer document and from all embedded
 * documents. The key is to register a {@link Parser} in the
 * {@link ParseContext}.
 *
 * @return content, including content from embedded documents
 * @throws IOException propagated from reading or parsing the resource
 * @throws SAXException propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    final AutoDetectParser autoDetect = new AutoDetectParser();
    final BodyContentHandler body = new BodyContentHandler();
    final Metadata meta = new Metadata();

    // Register the parser itself in the context (see the note above).
    final ParseContext recursingContext = new ParseContext();
    recursingContext.set(Parser.class, autoDetect);

    try (InputStream docx =
            ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetect.parse(docx, body, meta, recursingContext);
        final String extracted = body.toString();
        return extracted;
    }
}
/**
 * Example: if you don't want content from embedded documents, pass in a
 * {@link org.apache.tika.parser.ParseContext} that contains an
 * {@link EmptyParser}.
 *
 * @return the content of the file
 * @throws IOException propagated from reading or parsing the resource
 * @throws SAXException propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public String parseNoEmbeddedExample() throws IOException, SAXException, TikaException {
    final AutoDetectParser autoDetect = new AutoDetectParser();
    final BodyContentHandler body = new BodyContentHandler();
    final Metadata meta = new Metadata();

    // An EmptyParser, not the real parser, is registered under Parser.class.
    final ParseContext outerOnlyContext = new ParseContext();
    outerOnlyContext.set(Parser.class, new EmptyParser());

    try (InputStream docx =
            ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetect.parse(docx, body, meta, outerOnlyContext);
        final String extracted = body.toString();
        return extracted;
    }
}
) throws Exception { Metadata metadata = new Metadata(); ParseContext pc = new ParseContext(); MutableInt count = new MutableInt(); pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files)); TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc);
new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, parser); parser.parse( stream, new BodyContentHandler(handler), metadata, context);
new WriteOutContentHandler(maxLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, parser); parser.parse( stream, new BodyContentHandler(handler), metadata, context);
public TikaGUI(Parser parser) { super("Apache Tika"); setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); addMenuBar(); cards = new JPanel(layout); addWelcomeCard(cards, "welcome"); metadata = addCard(cards, "text/plain", "metadata"); html = addCard(cards, "text/html", "html"); text = addCard(cards, "text/plain", "text"); textMain = addCard(cards, "text/plain", "main"); xml = addCard(cards, "text/plain", "xhtml"); json = addCard(cards, "text/plain", "json"); add(cards); layout.show(cards, "welcome"); setPreferredSize(new Dimension(640, 480)); pack(); this.context = new ParseContext(); this.parser = parser; this.imageParser = new ImageSavingParser(parser); this.context.set(DocumentSelector.class, new ImageDocumentSelector()); this.context.set(Parser.class, imageParser); }
/**
 * Checks that {@code EmbeddedDocumentUtil.tryToFindExistingLeafParser} returns
 * a non-null parser whose class is exactly {@code TXTParser} when an
 * {@code AutoDetectParser} is registered in the context.
 */
@Test
public void testSimple() {
    final Class<org.apache.tika.parser.txt.TXTParser> leafType =
            org.apache.tika.parser.txt.TXTParser.class;

    final Parser autoDetect = new AutoDetectParser();
    final ParseContext lookupContext = new ParseContext();
    lookupContext.set(Parser.class, autoDetect);

    final Parser located =
            EmbeddedDocumentUtil.tryToFindExistingLeafParser(leafType, lookupContext);

    assertNotNull(located);
    assertEquals(leafType, located.getClass());
}
@Test public void testExecuteExecutor() throws Exception { TikaConfig config = TikaConfig.getDefaultConfig(); ParseContext context = new ParseContext(); context.set(ExecutorService.class, config.getExecutorService()); Future result = ConcurrentUtils.execute(context, new Runnable() { @Override public void run() { //Do nothing } }); assertNull(result.get()); }
@Override public boolean processFileResource(FileResource fileResource) { ParseContext context = new ParseContext(); if (parseRecursively) { context.set(Parser.class, parser);