/**
 * Example of extracting only the document body, serialized as markup
 * into a string; the head part is left out.
 *
 * @return the string form of the handler after parsing "test.doc"
 * @throws IOException   if the resource stream cannot be read or closed
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if the parser fails on the document
 */
public String parseBodyToHTML() throws IOException, SAXException, TikaException {
    // The XML serializer is wrapped so that only body content reaches it.
    ContentHandler bodyHandler = new BodyContentHandler(new ToXMLContentHandler());
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();

    try (InputStream input = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(input, bodyHandler, docMetadata);
    }
    return bodyHandler.toString();
}
/**
 * Ends an element. If the start tag is still open it is closed with
 * '>' first; for elements listed in EMPTY_ELEMENTS nothing further is
 * written (no end tag), otherwise the superclass writes the end tag.
 *
 * NOTE(review): the EMPTY_ELEMENTS path bypasses super.endElement;
 * confirm that no bookkeeping done there is needed for such elements.
 *
 * @throws SAXException if the output could not be written
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (!inStartElement) {
        super.endElement(uri, localName, qName);
        return;
    }

    write('>');
    inStartElement = false;

    if (EMPTY_ELEMENTS.contains(localName)) {
        namespaces.clear();
    } else {
        super.endElement(uri, localName, qName);
    }
}
String uri, String localName, String qName, Attributes atts) throws SAXException { lazyCloseStartElement(); write('<'); write(currentElement.getQName(uri, localName)); write(' '); write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i))); write('='); write('"'); char[] ch = atts.getValue(i).toCharArray(); writeEscaped(ch, 0, ch.length, true); write('"'); write(' '); write("xmlns"); String prefix = entry.getValue(); if (prefix.length() > 0) { write(':'); write(prefix); write('='); write('"'); char[] ch = entry.getKey().toCharArray(); writeEscaped(ch, 0, ch.length, true); write('"');
/**
 * Handles character data: closes a still-open start tag if needed,
 * then writes the given slice escaped as element content (not as an
 * attribute value).
 *
 * @param ch     character array
 * @param start  index of the first character to write
 * @param length number of characters to write
 * @throws SAXException if the characters could not be written
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    lazyCloseStartElement();
    final int end = start + length;
    writeEscaped(ch, start, end, false);
}
/**
 * Writes the pending '>' when a start element is flagged as still open
 * and clears that flag; does nothing otherwise.
 *
 * @throws SAXException if the character could not be written
 */
private void lazyCloseStartElement() throws SAXException {
    if (!inStartElement) {
        return;
    }
    write('>');
    inStartElement = false;
}
/**
 * Writes a slice of a character array, replacing XML meta characters
 * with entity references. '<', '>' and '&' are always replaced; '"' is
 * replaced only when escaping an attribute value.
 *
 * @param ch        character array
 * @param from      start position in the array (inclusive)
 * @param to        end position in the array (exclusive)
 * @param attribute whether the characters should be escaped as
 *                  an attribute value or normal character content
 * @throws SAXException if the characters could not be written
 */
private void writeEscaped(char[] ch, int from, int to, boolean attribute) throws SAXException {
    // Start of the run of plain characters that has not been written yet.
    int pending = from;
    int cursor = from;
    while (cursor < to) {
        final String entity;
        switch (ch[cursor]) {
            case '<':
                entity = "lt";
                break;
            case '>':
                entity = "gt";
                break;
            case '&':
                entity = "amp";
                break;
            case '"':
                entity = attribute ? "quot" : null;
                break;
            default:
                entity = null;
                break;
        }
        if (entity == null) {
            cursor++;
        } else {
            // Flush the plain run plus the entity, then resume after it.
            cursor = writeCharsAndEntity(ch, pending, cursor, entity);
            pending = cursor;
        }
    }
    // Whatever plain characters remain after the last entity.
    super.characters(ch, pending, to - pending);
}
/**
 * Handles character data: closes a still-open start tag if needed,
 * then writes the given slice escaped as element content (not as an
 * attribute value).
 *
 * @param ch     character array
 * @param start  index of the first character to write
 * @param length number of characters to write
 * @throws SAXException if the characters could not be written
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    lazyCloseStartElement();
    final int end = start + length;
    writeEscaped(ch, start, end, false);
}
/**
 * Writes a slice of a character array, replacing XML meta characters
 * with entity references. '<', '>' and '&' are always replaced; '"' is
 * replaced only when escaping an attribute value.
 *
 * @param ch        character array
 * @param from      start position in the array (inclusive)
 * @param to        end position in the array (exclusive)
 * @param attribute whether the characters should be escaped as
 *                  an attribute value or normal character content
 * @throws SAXException if the characters could not be written
 */
private void writeEscaped(char[] ch, int from, int to, boolean attribute) throws SAXException {
    // Start of the run of plain characters that has not been written yet.
    int pending = from;
    int cursor = from;
    while (cursor < to) {
        final String entity;
        switch (ch[cursor]) {
            case '<':
                entity = "lt";
                break;
            case '>':
                entity = "gt";
                break;
            case '&':
                entity = "amp";
                break;
            case '"':
                entity = attribute ? "quot" : null;
                break;
            default:
                entity = null;
                break;
        }
        if (entity == null) {
            cursor++;
        } else {
            // Flush the plain run plus the entity, then resume after it.
            cursor = writeCharsAndEntity(ch, pending, cursor, entity);
            pending = cursor;
        }
    }
    // Whatever plain characters remain after the last entity.
    super.characters(ch, pending, to - pending);
}
/**
 * Example of extracting a document's contents as markup, returned
 * as a string.
 *
 * @return the string form of the handler after parsing "test.doc"
 * @throws IOException   if the resource stream cannot be read or closed
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if the parser fails on the document
 */
public String parseToHTML() throws IOException, SAXException, TikaException {
    ContentHandler xmlHandler = new ToXMLContentHandler();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();

    try (InputStream input = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(input, xmlHandler, docMetadata);
    }
    return xmlHandler.toString();
}
String uri, String localName, String qName, Attributes atts) throws SAXException { lazyCloseStartElement(); write('<'); write(currentElement.getQName(uri, localName)); write(' '); write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i))); write('='); write('"'); char[] ch = atts.getValue(i).toCharArray(); writeEscaped(ch, 0, ch.length, true); write('"'); write(' '); write("xmlns"); String prefix = entry.getValue(); if (prefix.length() > 0) { write(':'); write(prefix); write('='); write('"'); char[] ch = entry.getKey().toCharArray(); writeEscaped(ch, 0, ch.length, true); write('"');
/**
 * Handles character data: closes a still-open start tag if needed,
 * then writes the given slice escaped as element content (not as an
 * attribute value).
 *
 * @param ch     character array
 * @param start  index of the first character to write
 * @param length number of characters to write
 * @throws SAXException if the characters could not be written
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    lazyCloseStartElement();
    final int end = start + length;
    writeEscaped(ch, start, end, false);
}
@Override public void endElement(String uri, String localName, String qName) throws SAXException { if (inStartElement) { write(" />"); inStartElement = false; } else { write("</"); write(qName); write('>'); } namespaces.clear(); // Reset the position in the tree, to avoid endless stack overflow // chains (see TIKA-1070) currentElement = currentElement.parent; }
/**
 * Ends an element. If the start tag is still open it is closed with
 * '>' first; for elements listed in EMPTY_ELEMENTS nothing further is
 * written (no end tag), otherwise the superclass writes the end tag.
 *
 * NOTE(review): the EMPTY_ELEMENTS path bypasses super.endElement;
 * confirm that no bookkeeping done there is needed for such elements.
 *
 * @throws SAXException if the output could not be written
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (!inStartElement) {
        super.endElement(uri, localName, qName);
        return;
    }

    write('>');
    inStartElement = false;

    if (EMPTY_ELEMENTS.contains(localName)) {
        namespaces.clear();
    } else {
        super.endElement(uri, localName, qName);
    }
}
/**
 * Writes a slice of a character array, replacing XML meta characters
 * with entity references. '<', '>' and '&' are always replaced; '"' is
 * replaced only when escaping an attribute value.
 *
 * @param ch        character array
 * @param from      start position in the array (inclusive)
 * @param to        end position in the array (exclusive)
 * @param attribute whether the characters should be escaped as
 *                  an attribute value or normal character content
 * @throws SAXException if the characters could not be written
 */
private void writeEscaped(char[] ch, int from, int to, boolean attribute) throws SAXException {
    // Start of the run of plain characters that has not been written yet.
    int pending = from;
    int cursor = from;
    while (cursor < to) {
        final String entity;
        switch (ch[cursor]) {
            case '<':
                entity = "lt";
                break;
            case '>':
                entity = "gt";
                break;
            case '&':
                entity = "amp";
                break;
            case '"':
                entity = attribute ? "quot" : null;
                break;
            default:
                entity = null;
                break;
        }
        if (entity == null) {
            cursor++;
        } else {
            // Flush the plain run plus the entity, then resume after it.
            cursor = writeCharsAndEntity(ch, pending, cursor, entity);
            pending = cursor;
        }
    }
    // Whatever plain characters remain after the last entity.
    super.characters(ch, pending, to - pending);
}
/** * Example of extracting just one part of the document's body, * as HTML as a string, excluding the rest */ public String parseOnePartToHTML() throws IOException, SAXException, TikaException { // Only get things under html -> body -> div (class=header) XPathParser xhtmlParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML); Matcher divContentMatcher = xhtmlParser.parse("/xhtml:html/xhtml:body/xhtml:div/descendant::node()"); ContentHandler handler = new MatchingContentHandler( new ToXMLContentHandler(), divContentMatcher); AutoDetectParser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc")) { parser.parse(stream, handler, metadata); return handler.toString(); } }
String uri, String localName, String qName, Attributes atts) throws SAXException { lazyCloseStartElement(); write('<'); write(currentElement.getQName(uri, localName)); write(' '); write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i))); write('='); write('"'); char[] ch = atts.getValue(i).toCharArray(); writeEscaped(ch, 0, ch.length, true); write('"'); write(' '); write("xmlns"); String prefix = entry.getValue(); if (prefix.length() > 0) { write(':'); write(prefix); write('='); write('"'); char[] ch = entry.getKey().toCharArray(); writeEscaped(ch, 0, ch.length, true); write('"');
/**
 * Writes the characters in [from, to) unchanged, followed by an entity
 * reference of the form {@code &entity;}.
 *
 * @param ch     character array
 * @param from   start position in the array (inclusive)
 * @param to     end position in the array (exclusive)
 * @param entity entity code, written between '&' and ';'
 * @return {@code to + 1}, the position after the written characters
 *         plus the one character the entity stands for
 * @throws SAXException if the characters could not be written
 */
private int writeCharsAndEntity(char[] ch, int from, int to, String entity) throws SAXException {
    final int literalLength = to - from;
    super.characters(ch, from, literalLength);

    // Entity reference: &name;
    write('&');
    write(entity);
    write(';');

    final int next = to + 1;
    return next;
}
/**
 * Ends an element. If the start tag is still open it is closed with
 * '>' first; for elements listed in EMPTY_ELEMENTS nothing further is
 * written (no end tag), otherwise the superclass writes the end tag.
 *
 * NOTE(review): the EMPTY_ELEMENTS path bypasses super.endElement;
 * confirm that no bookkeeping done there is needed for such elements.
 *
 * @throws SAXException if the output could not be written
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (!inStartElement) {
        super.endElement(uri, localName, qName);
        return;
    }

    write('>');
    inStartElement = false;

    if (EMPTY_ELEMENTS.contains(localName)) {
        namespaces.clear();
    } else {
        super.endElement(uri, localName, qName);
    }
}
// Set-up: a ToXMLContentHandler, a SafeContentHandler wrapping it, and a fresh AttributesImpl.
ContentHandler toXML = new ToXMLContentHandler(); SafeContentHandler handler = new SafeContentHandler(toXML); AttributesImpl attributes = new AttributesImpl();