/**
 * SAX-parses the supplied markup and packages the result as a {@link ContentTags}.
 *
 * <p>The string is encoded as UTF-8 and fed to {@code XMLReaderUtils.parseSAX} with an
 * {@code XHTMLContentTagHandler}. The handler is constructed with the tags of interest and a
 * fresh map; that same map, together with the handler's {@code toString()} value, forms the
 * returned object.
 *
 * @param html                    markup to parse
 * @param uppercaseTagsOfInterest tag names handed to the handler (presumably upper-cased by
 *                                the caller, as the parameter name suggests)
 * @return the handler's string form plus the tag map the handler was given
 * @throws TikaException declared by the underlying SAX parse
 * @throws IOException   declared by the underlying SAX parse
 * @throws SAXException  declared by the underlying SAX parse
 */
public static ContentTags parseXML(String html, Set<String> uppercaseTagsOfInterest)
        throws TikaException, IOException, SAXException {
    Map<String, Integer> tagMap = new HashMap<>();
    XHTMLContentTagHandler tagHandler =
            new XHTMLContentTagHandler(uppercaseTagsOfInterest, tagMap);

    byte[] utf8Markup = html.getBytes(StandardCharsets.UTF_8);
    XMLReaderUtils.parseSAX(new ByteArrayInputStream(utf8Markup), tagHandler, EMPTY_PARSE_CONTEXT);

    String extractedText = tagHandler.toString();
    return new ContentTags(extractedText, tagMap);
}
/**
 * Runs the stream through a {@code ContentTypeHandler} and returns the media type the
 * handler ended up with.
 *
 * <p>This is best-effort: a {@link SecurityException} is propagated, but every other failure
 * during parsing is discarded and whatever the handler holds at that point is returned.
 *
 * @param is stream to parse
 * @return the handler's {@code mediaType} field after the parse attempt
 */
private static MediaType parseContentTypes(InputStream is) {
    ContentTypeHandler typeHandler = new ContentTypeHandler();
    try {
        ParseContext freshContext = new ParseContext();
        XMLReaderUtils.parseSAX(is, typeHandler, freshContext);
    } catch (SecurityException securityProblem) {
        // Security failures must never be hidden from the caller.
        throw securityProblem;
    } catch (Exception ignored) {
        // Deliberately swallowed: fall through and report what the handler saw so far.
    }
    return typeHandler.mediaType;
}
/**
 * SAX-parses the stream, forwarding events to the given handler.
 *
 * <p>The stream is wrapped in a {@code CloseShieldInputStream} and the handler in an
 * {@code OfflineContentHandler} before both are passed to {@code XMLReaderUtils.parseSAX}.
 * The {@code metadata} argument is not used by this method.
 *
 * @param stream   document stream
 * @param handler  receiver of the SAX events
 * @param metadata unused here
 * @param context  parse context passed through to the SAX utilities
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    InputStream shielded = new CloseShieldInputStream(stream);
    OfflineContentHandler offline = new OfflineContentHandler(handler);
    XMLReaderUtils.parseSAX(shielded, offline, context);
}
XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(
/**
 * Parses the stream with an {@code ExtractorHandler} and returns the root element the
 * handler recorded.
 *
 * <p>Any exception raised while parsing is discarded; the handler's {@code rootElement}
 * field is returned as it stands after the attempt.
 *
 * @param stream document stream; it is wrapped in a {@code CloseShieldInputStream} before
 *               being handed to the parser
 * @return the handler's {@code rootElement} value
 * @since Apache Tika 0.9
 */
public QName extractRootElement(InputStream stream) {
    ExtractorHandler rootCapture = new ExtractorHandler();
    try {
        InputStream shielded = new CloseShieldInputStream(stream);
        OfflineContentHandler offline = new OfflineContentHandler(rootCapture);
        XMLReaderUtils.parseSAX(shielded, offline, EMPTY_CONTEXT);
    } catch (Exception ignored) {
        // Intentionally ignored: the result is whatever the handler captured before failing.
    }
    return rootCapture.rootElement;
}
/**
 * SAX-parses the stream with a new {@code StylesStripper} wrapped in an
 * {@code OfflineContentHandler}.
 *
 * @param parseContext context passed to {@code XMLReaderUtils.parseSAX}
 * @param stream       stream to parse; passed to the parser as-is
 */
private void onDocumentLoad(ParseContext parseContext, InputStream stream)
        throws TikaException, IOException, SAXException {
    StylesStripper stripper = new StylesStripper();
    OfflineContentHandler offline = new OfflineContentHandler(stripper);
    XMLReaderUtils.parseSAX(stream, offline, parseContext);
}
/**
 * SAX-parses one sheet stream and records whether the sheet is protected.
 *
 * <p>The sheet is parsed with an {@code XSSFSheetXMLHandler} wrapped in an
 * {@code XSSFSheetInterestingPartsCapturer}. After parsing, if the capturer's
 * {@code hasProtection} flag is set, {@code TikaCoreProperties.PROTECTED} is set to
 * {@code "true"} on the metadata.
 *
 * <p>The sheet stream is always closed by this method, including when parsing fails.
 *
 * @param sheetContentsExtractor receiver of the sheet contents
 * @param comments               comments table passed to the sheet handler
 * @param styles                 styles table passed to the sheet handler
 * @param strings                shared strings table passed to the sheet handler
 * @param sheetInputStream       sheet XML stream; closed before this method returns or throws
 * @throws IOException      on read or close failure
 * @throws SAXException     on XML parse failure
 * @throws RuntimeException wrapping a {@code TikaException} from the SAX utilities; the
 *                          original exception is kept as the cause
 */
public void processSheet(SheetContentsHandler sheetContentsExtractor, CommentsTable comments,
                         StylesTable styles, ReadOnlySharedStringsTable strings,
                         InputStream sheetInputStream) throws IOException, SAXException {
    try {
        XSSFSheetInterestingPartsCapturer handler = new XSSFSheetInterestingPartsCapturer(
                new XSSFSheetXMLHandler(styles, comments, strings, sheetContentsExtractor,
                        formatter, false));
        // try-with-resources so the stream is released on the failure path as well.
        try (InputStream in = sheetInputStream) {
            XMLReaderUtils.parseSAX(in, handler, parseContext);
        }
        if (handler.hasProtection) {
            metadata.set(TikaCoreProperties.PROTECTED, "true");
        }
    } catch (TikaException e) {
        // Keep the original exception as the cause so its stack trace is not lost.
        throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
    }
}
/**
 * SAX-parses the stream through a chain of content handlers.
 *
 * <p>The chain, outermost first, is {@code OfflineContentHandler} →
 * {@code NSNormalizerContentHandler} → {@code OpenDocumentElementMappingContentHandler}
 * (built from {@code handler} and {@code MAPPINGS}). The stream is wrapped in a
 * {@code CloseShieldInputStream}. The {@code metadata} argument is not used by this method.
 *
 * @param stream   document stream
 * @param handler  final receiver of the mapped events
 * @param metadata unused here
 * @param context  parse context passed to the SAX utilities
 */
void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata,
                   ParseContext context) throws IOException, SAXException, TikaException {
    DefaultHandler elementMapper =
            new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
    InputStream shielded = new CloseShieldInputStream(stream);
    NSNormalizerContentHandler normalizer = new NSNormalizerContentHandler(elementMapper);
    XMLReaderUtils.parseSAX(shielded, new OfflineContentHandler(normalizer), context);
}
private void handleDocumentRef(String docRef) throws SAXException { //docRef is a path to a FixedDocumentSequence document, // e.g. /Documents/1/FixedDoc.fdoc //relative root is /Documents/1 ..need this Pages... String relativeRoot = null; int i = docRef.lastIndexOf("/"); if (i > 0) { relativeRoot = docRef.substring(0, i); } else { relativeRoot = ""; } String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef); if (pkg instanceof ZipPackage) { try (InputStream stream = getZipStream(zipPath, pkg)) { XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( new PageContentPartHandler(relativeRoot, xhtml))), context); } catch (IOException | TikaException e) { throw new SAXException(new TikaException("IOException trying to read: " + docRef)); } } else { throw new SAXException(new TikaException("Package must be ZipPackage")); } }
/**
 * Emits the hOCR stream into the XHTML output inside a {@code <div class="ocr">} element.
 *
 * <p>The stream is SAX-parsed with a {@code HOCRPassThroughHandler} wrapped in an
 * {@code OfflineContentHandler}. When no parse context is supplied, a new empty one is used.
 *
 * @param is           hOCR stream to parse
 * @param parseContext context for the SAX utilities; may be {@code null}
 * @param xhtml        output handler that receives the wrapping div and the parsed content
 */
private void extractHOCROutput(InputStream is, ParseContext parseContext,
                               XHTMLContentHandler xhtml)
        throws TikaException, IOException, SAXException {
    ParseContext effectiveContext = (parseContext != null) ? parseContext : new ParseContext();

    xhtml.startElement("div", "class", "ocr");
    HOCRPassThroughHandler passThrough = new HOCRPassThroughHandler(xhtml);
    XMLReaderUtils.parseSAX(is, new OfflineContentHandler(passThrough), effectiveContext);
    xhtml.endElement("div");
}
new OfflineContentHandler(handler)); try { XMLReaderUtils.parseSAX( stream, new TeeContentHandler( tagged, new MetaHandler(metadata)), context);
/**
 * Opens the package part addressed by the relationship and SAX-parses it with a
 * {@code FixedDocSeqHandler}.
 *
 * <p>The part's stream is closed by the try-with-resources block; the parser only sees a
 * {@code CloseShieldInputStream} wrapper around it.
 *
 * @param packageRelationship relationship identifying the part to read from {@code pkg}
 * @param xhtml               output handler given to the {@code FixedDocSeqHandler}
 */
private void handleDocuments(PackageRelationship packageRelationship,
                             XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    try (InputStream partStream = pkg.getPart(packageRelationship).getInputStream()) {
        InputStream shielded = new CloseShieldInputStream(partStream);
        EmbeddedContentHandler embedded =
                new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml));
        XMLReaderUtils.parseSAX(shielded, new OfflineContentHandler(embedded), context);
    }
}
PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship); try (InputStream stream = relatedPartPart.getInputStream()) { XMLReaderUtils.parseSAX(stream, new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)), context);
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { //set OfficeParserConfig if the user hasn't specified one configure(context); final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( new Word2006MLDocHandler(xhtml, metadata, context))), context); } catch (SAXException e) { throw new TikaException("XML parse error", e); } xhtml.endDocument(); } }
/**
 * Parses an XML stream into XHTML output.
 *
 * <p>The content type is set on the metadata first. The SAX events then go through the
 * handler returned by {@code getContentHandler}, which is given a
 * {@code TaggedContentHandler} wrapping the XHTML handler. On a {@link SAXException} the
 * tagged handler's {@code throwIfCauseOf} is consulted first (presumably to rethrow failures
 * that came from the downstream handler unchanged); otherwise the exception is wrapped in a
 * {@code TikaException}. {@code endDocument()} runs in all cases.
 *
 * @param stream   document stream; wrapped in a {@code CloseShieldInputStream} for parsing
 * @param handler  receiver of the XHTML output
 * @param metadata document metadata
 * @param context  parse context
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
    try {
        InputStream shielded = new CloseShieldInputStream(stream);
        EmbeddedContentHandler embedded =
                new EmbeddedContentHandler(getContentHandler(tagged, metadata, context));
        XMLReaderUtils.parseSAX(shielded, new OfflineContentHandler(embedded), context);
    } catch (SAXException parseFailure) {
        tagged.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        xhtml.endDocument();
    }
}
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // TODO Auto-generated method stub final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); TaggedContentHandler tagged = new TaggedContentHandler(handler); try { XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( getContentHandler(tagged, metadata, context))), context); } catch (SAXException e) { tagged.throwIfCauseOf(e); throw new TikaException("XML parse error", e); } finally { xhtml.endElement("p"); xhtml.endDocument(); } }
/**
 * SAX-parses one package part and writes its text to the XHTML output.
 *
 * <p>The part's linked relationships are loaded first and passed to an
 * {@code OOXMLWordAndPowerPointTextHandler}, which writes through an
 * {@code OOXMLTikaBodyPartHandler}. A {@code TikaException} or {@link IOException} raised
 * while reading or parsing is not propagated: its stack trace is added to the metadata under
 * {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING}.
 *
 * @param packagePart part to read
 * @param styles      styles shim passed to the body part handler
 * @param listManager list manager passed to the body part handler
 * @param xhtml       output handler
 * @throws IOException  declared; may come from loading the linked relationships, which
 *                      happens outside the try block
 * @throws SAXException on XML parse failure
 */
private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
                        XWPFListManager listManager, XHTMLContentHandler xhtml)
        throws IOException, SAXException {
    Map<String, String> linked = loadLinkedRelationships(packagePart, true, metadata);

    try (InputStream partStream = packagePart.getInputStream()) {
        InputStream shielded = new CloseShieldInputStream(partStream);
        OOXMLTikaBodyPartHandler bodyHandler =
                new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config);
        OOXMLWordAndPowerPointTextHandler textHandler = new OOXMLWordAndPowerPointTextHandler(
                bodyHandler,
                linked,
                config.getIncludeShapeBasedContent(),
                config.getConcatenatePhoneticRuns());
        XMLReaderUtils.parseSAX(
                shielded,
                new OfflineContentHandler(new EmbeddedContentHandler(textHandler)),
                context);
    } catch (TikaException | IOException failure) {
        // Recorded as a warning instead of being thrown.
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(failure));
    }
}
/**
 * Parses an XML stream, wrapping the output in a single {@code <p>} element.
 *
 * <p>If the metadata has no content type yet, it is set to {@code application/xml}. The
 * XHTML handler opens the document and the paragraph. SAX events go through the handler
 * returned by {@code getContentHandler}, which is given a {@code TaggedContentHandler}
 * wrapping the caller's raw {@code handler}. On a {@link SAXException} the tagged handler's
 * {@code throwIfCauseOf} is consulted first; otherwise the exception is wrapped in a
 * {@code TikaException}. The paragraph and document are closed in all cases.
 *
 * @param stream   document stream; wrapped in a {@code CloseShieldInputStream} for parsing
 * @param handler  receiver of the output
 * @param metadata document metadata; may receive a default content type
 * @param context  parse context
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    boolean contentTypeMissing = metadata.get(Metadata.CONTENT_TYPE) == null;
    if (contentTypeMissing) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("p");

    TaggedContentHandler tagged = new TaggedContentHandler(handler);
    try {
        InputStream shielded = new CloseShieldInputStream(stream);
        EmbeddedContentHandler embedded =
                new EmbeddedContentHandler(getContentHandler(tagged, metadata, context));
        XMLReaderUtils.parseSAX(shielded, new OfflineContentHandler(embedded), context);
    } catch (SAXException parseFailure) {
        tagged.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new XSLFCommentAuthorHandler()),
XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler(