/**
 * Creates a new instance and registers {@code AUTO_DETECT_PARSER} in the
 * parse context under the {@code Parser} key.
 */
public BinaryParseData() {
    context.set(Parser.class, AUTO_DETECT_PARSER);
}
/**
 * Creates an HTML parser wrapper bound to the given crawl configuration and
 * TLD list, and prepares a parse context in which an {@code AllTagMapper} is
 * registered as the {@code HtmlMapper}.
 *
 * @param config crawl configuration stored on this instance
 * @param tldList top-level-domain list stored on this instance
 * @throws InstantiationException no longer raised by this constructor; still
 *         declared so existing callers that catch it keep compiling
 * @throws IllegalAccessException no longer raised by this constructor; still
 *         declared so existing callers that catch it keep compiling
 */
public TikaHtmlParser(CrawlConfig config, TLDList tldList)
        throws InstantiationException, IllegalAccessException {
    this.config = config;
    this.tldList = tldList;
    htmlParser = new HtmlParser();
    parseContext = new ParseContext();
    // Class.newInstance() is deprecated since Java 9 and rethrows any checked
    // exception from the nullary constructor without declaring it; invoking
    // the constructor directly creates the same object without reflection.
    parseContext.set(HtmlMapper.class, new AllTagMapper());
}
// Parse "myfile.html" with HtmlParser and read the handler's collected output.
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// try-with-resources closes the file stream even when parse(...) throws;
// previously the stream was never closed.
try (InputStream input = new FileInputStream("myfile.html")) {
    new HtmlParser().parse(input, handler, metadata, new ParseContext());
}
String plainText = handler.toString();
/**
 * Creates a reader over the text content of a binary stream, using the given
 * name as the document name. A new {@code AutoDetectParser} and a new
 * {@code ParseContext} are passed to the delegated constructor.
 *
 * @param stream binary stream
 * @param name document name
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream, String name) throws IOException {
    this(new AutoDetectParser(), stream, getMetadata(name), new ParseContext());
    // Register this reader's parser in its context under the Parser key.
    context.set(Parser.class, parser);
}
/**
 * Parses the stream using a newly created {@code ParseContext} in which this
 * parser is registered under the {@code Parser} key, then delegates to the
 * four-argument {@code parse} overload.
 *
 * @param stream document stream to parse
 * @param handler receiver of the parse events
 * @param metadata document metadata passed through to the delegate call
 * @throws IOException propagated from the delegated parse call
 * @throws SAXException propagated from the delegated parse call
 * @throws TikaException propagated from the delegated parse call
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
        throws IOException, SAXException, TikaException {
    ParseContext selfContext = new ParseContext();
    selfContext.set(Parser.class, this);
    parse(stream, handler, metadata, selfContext);
}
/**
 * Returns the component parsers, resolved against a newly created, empty
 * {@code ParseContext}.
 *
 * @return component parsers, keyed by media type
 */
public Map<MediaType, Parser> getParsers() {
    ParseContext emptyContext = new ParseContext();
    return getParsers(emptyContext);
}
public void setBinaryContent(byte[] data) throws TransformerConfigurationException, TikaException, SAXException, IOException { InputStream inputStream = new ByteArrayInputStream(data); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING); AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context); // Hacking the following line to remove Tika's inserted DocType this.html = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace( "http://www.w3.org/1999/xhtml", ""); } catch (TransformerConfigurationException | TikaException | SAXException | IOException | RuntimeException e) { throw e; } }
/**
 * Scheduling hook: compiles the configured metadata key filter (or clears it
 * when the property is null or empty) and creates a new auto-detect parser.
 *
 * @param context process context from which {@code METADATA_KEY_FILTER} is read
 */
@SuppressWarnings("unused")
@OnScheduled
public void onScheduled(ProcessContext context) {
    final String filterExpression = context.getProperty(METADATA_KEY_FILTER).getValue();
    final boolean hasFilter = filterExpression != null && !filterExpression.isEmpty();

    // A missing or empty expression clears any previously stored pattern.
    metadataKeyFilterRef.set(hasFilter ? Pattern.compile(filterExpression) : null);

    autoDetectParser = new AutoDetectParser();
}
/**
 * Builds an {@code AutoDetectParser} for the given configuration, wrapped in a
 * {@code DigestingParser} when a digester is set.
 *
 * @param config configuration handed to the auto-detect parser
 * @return the auto-detect parser itself when {@code digester} is {@code null},
 *         otherwise a digesting parser around it
 */
@Override
public Parser getParser(TikaConfig config) {
    Parser autoDetect = new AutoDetectParser(config);
    return digester == null ? autoDetect : new DigestingParser(autoDetect, digester);
}
/**
 * Ensures the context holds an {@link OfficeParserConfig}. The value looked up
 * from the context, with {@code defaultOfficeParserConfig} as the fallback
 * passed to the lookup, is stored back into the context.
 *
 * @param parseContext context to read the config from and write it back to
 */
public void configure(ParseContext parseContext) {
    parseContext.set(
            OfficeParserConfig.class,
            parseContext.get(OfficeParserConfig.class, defaultOfficeParserConfig));
}
/**
 * Reports the media types supported by the delegate parser obtained for the
 * given context.
 *
 * @param context parse context used both to look up the delegate and to query it
 * @return the delegate parser's supported types
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    Parser delegate = getDelegateParser(context);
    return delegate.getSupportedTypes(context);
}
/**
 * Reports the media types supported by the wrapped parser.
 *
 * @param context parse context passed through to the wrapped parser
 * @return the wrapped parser's supported types
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    Parser wrapped = getWrappedParser();
    return wrapped.getSupportedTypes(context);
}
/**
 * Forwards the call to the decorated parser. Subclasses that add decoration
 * should override this method and call <code>super.getSupportedTypes()</code>
 * to reach the decorated parser.
 *
 * @param context parse context passed through to the decorated parser
 * @return the decorated parser's supported types
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    Set<MediaType> supported = parser.getSupportedTypes(context);
    return supported;
}
/**
 * Resolves the encoding detector to use: the one supplied through the
 * {@code ParseContext} if present, otherwise the detector returned by the
 * no-argument {@code getEncodingDetector()}.
 *
 * @param parseContext context that may carry an {@code EncodingDetector}
 * @return the context's detector when non-null, else the fallback detector
 */
protected EncodingDetector getEncodingDetector(ParseContext parseContext) {
    EncodingDetector override = parseContext.get(EncodingDetector.class);
    return override != null ? override : getEncodingDetector();
}
final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata); } finally { tikaInputStream.close();
/**
 * Applies the encoding detector to the given parser, descending through
 * composite parsers and parser decorators. A parser matching none of the
 * three handled types is left untouched.
 *
 * @param p parser to configure
 * @param encodingDetector detector to install
 */
private static void setEncodingDetector(Parser p, EncodingDetector encodingDetector) {
    // The checks run in this fixed order; the first matching type wins.
    if (p instanceof AbstractEncodingDetectorParser) {
        AbstractEncodingDetectorParser target = (AbstractEncodingDetectorParser) p;
        target.setEncodingDetector(encodingDetector);
        return;
    }
    if (p instanceof CompositeParser) {
        CompositeParser composite = (CompositeParser) p;
        for (Parser component : composite.getAllComponentParsers()) {
            setEncodingDetector(component, encodingDetector);
        }
        return;
    }
    if (p instanceof ParserDecorator) {
        ParserDecorator decorator = (ParserDecorator) p;
        setEncodingDetector(decorator.getWrappedParser(), encodingDetector);
    }
}
/**
 * Looks up the password for this file from the password provider, passing it
 * {@code parentMetadata}.
 *
 * @return the provider's password, or {@code null} when no provider is set
 *         (meaning no / default password should be used)
 */
protected String getPassword() {
    if (passwordProvider == null) {
        return null;
    }
    return passwordProvider.getPassword(parentMetadata);
}
/**
 * Creates a parser over the default parsers obtained from the given service
 * loader and encoding detector, and remembers the loader on this instance.
 *
 * @param registry media type registry passed to the superclass
 * @param loader service loader used to obtain the default parsers
 * @param excludeParsers parser classes passed to the superclass as exclusions
 * @param encodingDetector detector passed along when obtaining the default parsers
 */
public DefaultParser(
        MediaTypeRegistry registry,
        ServiceLoader loader,
        Collection<Class<? extends Parser>> excludeParsers,
        EncodingDetector encodingDetector) {
    super(registry, getDefaultParsers(loader, encodingDetector), excludeParsers);
    this.loader = loader;
}
/**
 * Forwards the call to the decorated parser. Subclasses that add decoration
 * should override this method and call <code>super.parse()</code> to reach
 * the decorated parser.
 *
 * @param stream document stream to parse
 * @param handler receiver of the parse events
 * @param metadata document metadata
 * @param context parse context
 * @throws IOException propagated from the decorated parser
 * @throws SAXException propagated from the decorated parser
 * @throws TikaException propagated from the decorated parser
 */
public void parse(
        InputStream stream,
        ContentHandler handler,
        Metadata metadata,
        ParseContext context)
        throws IOException, SAXException, TikaException {
    parser.parse(stream, handler, metadata, context);
}
/**
 * Creates a Tika facade that uses the given detector instance together with
 * the default parser configuration and the default Translator. The parser is
 * a new {@code AutoDetectParser} built around the same detector.
 *
 * @since Apache Tika 0.8
 * @param detector type detector
 */
public Tika(Detector detector) {
    this(detector, new AutoDetectParser(detector));
}