/**
 * Parses an XML stream and loads a {@link Param} from its root element.
 *
 * @param stream XML input to parse
 * @param <T> value type of the returned parameter
 * @return the parameter produced by {@code load(Node)} for the document's root element
 * @throws SAXException if the stream cannot be parsed as XML
 * @throws IOException if reading the stream fails
 * @throws TikaException if a document builder cannot be obtained or loading fails
 */
public static <T> Param<T> load(InputStream stream)
        throws SAXException, IOException, TikaException {
    DocumentBuilder db = XMLReaderUtils.getDocumentBuilder();
    Document document = db.parse(stream);
    // Use the root element rather than the first child: a leading comment,
    // processing instruction or doctype would otherwise be handed to load(Node).
    return load(document.getDocumentElement());
}
/**
 * Feeds {@code html}, encoded as UTF-8, through {@code XMLReaderUtils.parseSAX}
 * using an {@link XHTMLContentTagHandler}.
 *
 * @param html markup to parse
 * @param uppercaseTagsOfInterest tag names passed on to the handler
 * @return a {@link ContentTags} built from the handler's string form and the
 *         tag map that was handed to the handler
 * @throws TikaException if thrown by the SAX parsing utility
 * @throws IOException if reading the in-memory stream fails
 * @throws SAXException if the markup cannot be parsed
 */
public static ContentTags parseXML(String html, Set<String> uppercaseTagsOfInterest)
        throws TikaException, IOException, SAXException {
    Map<String, Integer> tagMap = new HashMap<>();
    XHTMLContentTagHandler handler =
            new XHTMLContentTagHandler(uppercaseTagsOfInterest, tagMap);

    byte[] utf8Bytes = html.getBytes(StandardCharsets.UTF_8);
    ByteArrayInputStream input = new ByteArrayInputStream(utf8Bytes);
    XMLReaderUtils.parseSAX(input, handler, EMPTY_PARSE_CONTEXT);

    String content = handler.toString();
    return new ContentTags(content, tagMap);
}
/**
 * Creates a configuration from the DOM returned by
 * {@code XMLReaderUtils.buildDOM(path)}, delegating to another constructor
 * of this class.
 *
 * @param path location of the configuration to read
 * @throws TikaException declared by the DOM-building call and the delegate constructor
 * @throws IOException declared by the DOM-building call and the delegate constructor
 * @throws SAXException declared by the DOM-building call and the delegate constructor
 */
public TikaConfig(Path path) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(path)); } public TikaConfig(Path path, ServiceLoader loader)
/**
 * Serializes this object as XML to the given stream.
 * A new document with a single {@code param} root element is created, the
 * element is populated via {@code save(Element)}, and the element is then
 * written out with a transformer from {@link XMLReaderUtils}.
 *
 * @param stream destination for the serialized XML
 * @throws TransformerException if the transformation to the stream fails
 * @throws TikaException if the builder or transformer cannot be obtained
 */
public void save(OutputStream stream) throws TransformerException, TikaException {
    Document document = XMLReaderUtils.getDocumentBuilder().newDocument();

    Element root = document.createElement("param");
    document.appendChild(root);
    save(root);

    Transformer serializer = XMLReaderUtils.getTransformer();
    DOMSource source = new DOMSource(root);
    StreamResult target = new StreamResult(stream);
    serializer.transform(source, target);
}
/**
 * Parses the resource at the given URI into a DOM {@link Document}, using a
 * builder acquired from the pool. The builder is released again whether or
 * not parsing succeeds.
 *
 * @since Apache Tika 1.19.1
 * @param uriString URI of the content to parse
 * @return the parsed document
 * @throws TikaException if thrown while acquiring the pooled builder
 * @throws IOException if reading the content fails
 * @throws SAXException if the content cannot be parsed
 */
public static Document buildDOM(String uriString)
        throws TikaException, IOException, SAXException {
    final PoolDOMBuilder pooled = acquireDOMBuilder();
    final Document parsed;
    try {
        parsed = pooled.getDocumentBuilder().parse(uriString);
    } finally {
        releaseDOMBuilder(pooled);
    }
    return parsed;
}
private void updateXMLReaderUtils(Element element) throws TikaException { Element child = getChild(element, "xml-reader-utils"); if (child == null) { return; } String attr = child.getAttribute("maxEntityExpansions"); if (attr != null) { XMLReaderUtils.setMaxEntityExpansions(Integer.parseInt(attr)); } //make sure to call this after set entity expansions attr = child.getAttribute("poolSize"); if (attr != null) { XMLReaderUtils.setPoolSize(Integer.parseInt(attr)); } }
@Test public void testXMLReaderUtils() throws Exception { //pool size may have been reset already by an //earlier test. Can't test for default here. assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, XMLReaderUtils.getMaxEntityExpansions()); //make sure that detection on this file actually works with //default expansions assertEquals("application/rdf+xml", detect("test-difficult-rdf1.xml", TikaConfig.getDefaultConfig()).toString()); TikaConfig tikaConfig = getConfig("TIKA-2732-xmlreaderutils.xml"); try { assertEquals(33, XMLReaderUtils.getPoolSize()); assertEquals(5, XMLReaderUtils.getMaxEntityExpansions()); //make sure that there's actually a change in behavior assertEquals("text/plain", detect("test-difficult-rdf1.xml", tikaConfig).toString()); } finally { XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS); XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE); } }
for (int i = 0; i < poolSize; i++) { try { SAX_PARSERS.offer(buildPoolParser(generation, getSAXParserFactory().newSAXParser())); } catch (SAXException|ParserConfigurationException e) { throw new TikaException("problem creating sax parser", e); for (int i = 0; i < poolSize; i++) { DOM_BUILDERS.offer( new PoolDOMBuilder(POOL_GENERATION.get(), getDocumentBuilder()));
if (waiting > 3000) { setPoolSize(POOL_SIZE);
/**
 * Returns the StAX input factory registered in this parsing context, or,
 * when none has been registered, the factory supplied by
 * {@link XMLReaderUtils#getXMLInputFactory()}.
 *
 * @since Apache Tika 1.13
 * @return StAX input factory
 */
public XMLInputFactory getXMLInputFactory() {
    XMLInputFactory configured = get(XMLInputFactory.class);
    return configured != null ? configured : XMLReaderUtils.getXMLInputFactory();
}
/**
 * Parses the stream with a SAX parser. A {@link SAXParser} supplied through
 * the context takes precedence; otherwise a parser is borrowed from the pool
 * and handed back once parsing has finished, successfully or not.
 *
 * @since Apache Tika 1.19
 * @param is InputStream to parse
 * @param contentHandler handler that receives the SAX events
 * @param context context that may carry a user specified {@link SAXParser}
 * @throws TikaException if thrown while acquiring a pooled parser
 * @throws IOException if reading the stream fails
 * @throws SAXException if the content cannot be parsed
 */
public static void parseSAX(InputStream is, DefaultHandler contentHandler,
                            ParseContext context)
        throws TikaException, IOException, SAXException {
    SAXParser userParser = context.get(SAXParser.class);
    if (userParser != null) {
        // caller-owned parser: nothing to give back to the pool
        userParser.parse(is, contentHandler);
        return;
    }

    PoolSAXParser borrowed = acquireSAXParser();
    SAXParser pooledParser = borrowed.getSAXParser();
    try {
        pooledParser.parse(is, contentHandler);
    } finally {
        releaseParser(borrowed);
    }
}
for (int i = 0; i < poolSize; i++) { try { SAX_PARSERS.offer(buildPoolParser(generation, getSAXParserFactory().newSAXParser())); } catch (SAXException|ParserConfigurationException e) { throw new TikaException("problem creating sax parser", e); for (int i = 0; i < poolSize; i++) { DOM_BUILDERS.offer( new PoolDOMBuilder(POOL_GENERATION.get(), getDocumentBuilder()));
/**
 * Writes this object to the given stream as XML.
 * The output consists of a {@code param} element that is created in a fresh
 * document, filled in by {@code save(Element)}, and serialized with a
 * transformer obtained from {@link XMLReaderUtils}.
 *
 * @param stream stream that receives the XML
 * @throws TransformerException if writing the element to the stream fails
 * @throws TikaException if the builder or transformer cannot be obtained
 */
public void save(OutputStream stream) throws TransformerException, TikaException {
    DocumentBuilder documentBuilder = XMLReaderUtils.getDocumentBuilder();
    Document owner = documentBuilder.newDocument();

    Element paramElement = owner.createElement("param");
    owner.appendChild(paramElement);
    save(paramElement);

    XMLReaderUtils.getTransformer()
            .transform(new DOMSource(paramElement), new StreamResult(stream));
}
private void updateXMLReaderUtils(Element element) throws TikaException { Element child = getChild(element, "xml-reader-utils"); if (child == null) { return; } String attr = child.getAttribute("maxEntityExpansions"); if (attr != null) { XMLReaderUtils.setMaxEntityExpansions(Integer.parseInt(attr)); } //make sure to call this after set entity expansions attr = child.getAttribute("poolSize"); if (attr != null) { XMLReaderUtils.setPoolSize(Integer.parseInt(attr)); } }
/**
 * Parses the given stream into a DOM {@link Document}, using a builder
 * acquired from the pool. The builder is released again whether or not
 * parsing succeeds.
 *
 * @since Apache Tika 1.19.1
 * @param is stream containing the content to parse
 * @return the parsed document
 * @throws TikaException if thrown while acquiring the pooled builder
 * @throws IOException if reading the stream fails
 * @throws SAXException if the content cannot be parsed
 */
public static Document buildDOM(InputStream is)
        throws TikaException, IOException, SAXException {
    final PoolDOMBuilder pooled = acquireDOMBuilder();
    try {
        Document parsed = pooled.getDocumentBuilder().parse(is);
        return parsed;
    } finally {
        releaseDOMBuilder(pooled);
    }
}
if (waiting > 3000) { setPoolSize(POOL_SIZE);
public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException { InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs); XMLInputFactory factory = XMLReaderUtils.getXMLInputFactory(); XMLStreamReader reader = factory.createXMLStreamReader(is);
/**
 * Runs a SAX parse over the stream. If the context carries a user specified
 * {@link SAXParser}, that parser is used; if not, one is taken from the pool
 * and returned to it after the parse, whether it succeeded or failed.
 *
 * @since Apache Tika 1.19
 * @param is InputStream to parse
 * @param contentHandler handler that receives the SAX events
 * @param context context that may carry a user specified {@link SAXParser}
 * @throws TikaException if thrown while acquiring a pooled parser
 * @throws IOException if reading the stream fails
 * @throws SAXException if the content cannot be parsed
 */
public static void parseSAX(InputStream is, DefaultHandler contentHandler,
                            ParseContext context)
        throws TikaException, IOException, SAXException {
    SAXParser fromContext = context.get(SAXParser.class);
    if (fromContext != null) {
        // user supplied parser: the pool is not involved
        fromContext.parse(is, contentHandler);
        return;
    }

    PoolSAXParser pooled = acquireSAXParser();
    SAXParser parser = pooled.getSAXParser();
    try {
        parser.parse(is, contentHandler);
    } finally {
        releaseParser(pooled);
    }
}
/**
 * Parses the stream as XML via {@code XMLReaderUtils.parseSAX}.
 * The stream is wrapped in a {@link CloseShieldInputStream} and the handler
 * in an {@link OfflineContentHandler} before being passed on.
 *
 * @param stream content to parse
 * @param handler receiver of the SAX events
 * @param metadata not used by this method
 * @param context parse context forwarded to the SAX utility
 * @throws IOException if reading the stream fails
 * @throws SAXException if the content cannot be parsed
 * @throws TikaException if thrown by the SAX parsing utility
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    CloseShieldInputStream shielded = new CloseShieldInputStream(stream);
    OfflineContentHandler offline = new OfflineContentHandler(handler);
    XMLReaderUtils.parseSAX(shielded, offline, context);
}
/**
 * Creates a configuration from the DOM returned by
 * {@code XMLReaderUtils.buildDOM(path)} together with the given loader,
 * delegating to another constructor of this class.
 *
 * @param path location of the configuration to read
 * @param loader service loader passed through to the delegate constructor
 * @throws TikaException declared by the DOM-building call and the delegate constructor
 * @throws IOException declared by the DOM-building call and the delegate constructor
 * @throws SAXException declared by the DOM-building call and the delegate constructor
 */
public TikaConfig(Path path, ServiceLoader loader) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(path), loader); }