/**
 * Builds a handler that writes content to the supplied character stream,
 * up to the given write limit. The stream is wrapped in a
 * {@link ToTextContentHandler} and passed, together with the limit, to the
 * handler-based constructor of this class.
 *
 * @since Apache Tika 0.10
 * @param writer character stream that receives the content
 * @param writeLimit write limit
 */
public WriteOutContentHandler(Writer writer, int writeLimit) {
    this(
            new ToTextContentHandler(writer),
            writeLimit);
}
/**
 * Handles ignorable whitespace as if it were ordinary character data: the
 * call is passed on unchanged to {@link #characters(char[], int, int)}.
 *
 * @param ch character array holding the whitespace
 * @param start start position in the array
 * @param length number of characters to use from the array
 * @throws SAXException if {@link #characters(char[], int, int)} throws it
 */
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    this.characters(ch, start, length);
}
/**
 * Forwards the start-element event to the superclass, then increments the
 * counter kept in {@code tags} when the upper-cased qualified name is one
 * of the tags of interest.
 *
 * @param uri namespace URI, passed through to the superclass
 * @param localName local name, passed through to the superclass
 * @param qName qualified name; {@code null} is treated as the empty string
 * @param atts element attributes, passed through to the superclass
 * @throws SAXException if the superclass call throws it
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes atts)
        throws SAXException {
    super.startElement(uri, localName, qName, atts);

    // Upper-casing uses a fixed locale so the lookup key does not depend
    // on the platform default.
    final String tagName = (qName != null) ? qName.toUpperCase(Locale.ENGLISH) : "";
    if (!uppercaseTagsOfInterest.contains(tagName)) {
        return;
    }

    // First occurrence starts at one; later occurrences add one.
    final Integer seenSoFar = tags.get(tagName);
    final Integer updated = (seenSoFar == null) ? 1 : seenSoFar + 1;
    tags.put(tagName, updated);
}
}
@Override public void endElement(String uri, String localName, String qName) throws SAXException { // if(skipLinebreakes & addedText){ // characters(LINEBREAK, 0, 1); // addedText = false; // } super.endElement(uri, localName, qName); } }
new OutputStreamWriter(os, charset)), writeLimit); case TEXT: return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); case HTML: return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit); return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit); default: return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); return new BodyContentHandler(new OutputStreamWriter(os, charset)); case TEXT: return new ToTextContentHandler(os, charset.name()); case HTML: return new ToHTMLContentHandler(os, charset.name()); return new ToXMLContentHandler(os, charset.name()); default: return new ToTextContentHandler(os, charset.name());
/**
 * Emits a single character unchanged by handing it to the superclass
 * {@code characters} method as a one-element array.
 *
 * @param ch character to be written
 * @throws SAXException if the character could not be written
 */
protected void write(char ch) throws SAXException {
    final char[] single = { ch };
    super.characters(single, 0, single.length);
}
switch(type) { case TEXT: return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); case HTML: return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit); return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit); default: return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); return new ToTextContentHandler(); case HTML: return new ToHTMLContentHandler(); return new ToXMLContentHandler(); default: return new ToTextContentHandler();
/**
 * Emits every character of the given string unchanged by handing the
 * string's character array to the superclass {@code characters} method.
 *
 * @param string string of characters to be written
 * @throws SAXException if the character string could not be written
 */
protected void write(String string) throws SAXException {
    final char[] chars = string.toCharArray();
    super.characters(chars, 0, chars.length);
}
/**
 * Runs the shared assertion helpers against {@link ToTextContentHandler}.
 * Every helper call receives its own freshly created handler.
 * NOTE(review): the first argument of each helper is presumably the
 * expected output; confirm against the helper definitions.
 */
@Test
public void testToTextContentHandler() throws Exception {
    final String nothing = "";
    final String text = "content";

    // Document start and character data.
    assertStartDocument(nothing, new ToTextContentHandler());
    assertCharacters(text, new ToTextContentHandler());
    assertCharacterEscaping("<&\">", new ToTextContentHandler());
    assertIgnorableWhitespace(" \t\r\n", new ToTextContentHandler());

    // Empty elements, with and without attributes.
    assertEmptyElement(nothing, new ToTextContentHandler());
    assertEmptyElementWithAttributes(nothing, new ToTextContentHandler());
    assertEmptyElementWithAttributeEscaping(nothing, new ToTextContentHandler());

    // Elements with content.
    assertElement(text, new ToTextContentHandler());
    assertElementWithAttributes(text, new ToTextContentHandler());
}
/**
 * Emits the characters in {@code ch[from..to)} unchanged and then appends
 * the entity reference {@code &entity;}.
 *
 * @param ch character array
 * @param from start position in the array
 * @param to end position in the array (exclusive)
 * @param entity entity code, written between {@code &} and {@code ;}
 * @return {@code to + 1}, the next position in the array after the
 *         characters plus the one character the entity stands for
 * @throws SAXException if the characters could not be written
 */
private int writeCharsAndEntity(char[] ch, int from, int to, String entity)
        throws SAXException {
    final int pending = to - from;
    super.characters(ch, from, pending);

    write('&');
    write(entity);
    write(';');

    final int next = to + 1;
    return next;
}
/**
 * Builds a handler that writes content to the supplied character stream,
 * up to the given write limit. The stream is wrapped in a
 * {@link ToTextContentHandler} and passed, together with the limit, to the
 * handler-based constructor of this class.
 *
 * @since Apache Tika 0.10
 * @param writer character stream that receives the content
 * @param writeLimit write limit
 */
public WriteOutContentHandler(Writer writer, int writeLimit) {
    this(
            new ToTextContentHandler(writer),
            writeLimit);
}
/**
 * Emits {@code ch[from..to)} with XML meta characters replaced by entity
 * references: {@code <}, {@code >} and {@code &} always, and {@code "}
 * only when escaping an attribute value. Runs of characters that need no
 * escaping are written in one piece.
 *
 * @param ch character array
 * @param from start position in the array
 * @param to end position in the array (exclusive)
 * @param attribute whether the characters should be escaped as
 *        an attribute value or normal character content
 * @throws SAXException if the characters could not be written
 */
private void writeEscaped(char[] ch, int from, int to, boolean attribute)
        throws SAXException {
    int segmentStart = from; // first character not yet written
    int pos = from;
    while (pos < to) {
        String entity = null;
        switch (ch[pos]) {
            case '<':
                entity = "lt";
                break;
            case '>':
                entity = "gt";
                break;
            case '&':
                entity = "amp";
                break;
            case '"':
                // Quotes only need escaping inside attribute values.
                if (attribute) {
                    entity = "quot";
                }
                break;
            default:
                break;
        }

        if (entity == null) {
            pos++;
        } else {
            // Flush the pending run, write the entity, and resume after
            // the escaped character.
            pos = writeCharsAndEntity(ch, segmentStart, pos, entity);
            segmentStart = pos;
        }
    }
    // Flush whatever follows the last escaped character.
    super.characters(ch, segmentStart, to - segmentStart);
}
/**
 * Builds a handler that writes content to the supplied character stream,
 * up to the given write limit. The stream is wrapped in a
 * {@link ToTextContentHandler} and passed, together with the limit, to the
 * handler-based constructor of this class.
 *
 * @since Apache Tika 0.10
 * @param writer character stream that receives the content
 * @param writeLimit write limit
 */
public WriteOutContentHandler(Writer writer, int writeLimit) {
    this(
            new ToTextContentHandler(writer),
            writeLimit);
}
/**
 * Handles ignorable whitespace as if it were ordinary character data: the
 * call is passed on unchanged to {@link #characters(char[], int, int)}.
 *
 * @param ch character array holding the whitespace
 * @param start start position in the array
 * @param length number of characters to use from the array
 * @throws SAXException if {@link #characters(char[], int, int)} throws it
 */
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    this.characters(ch, start, length);
}
/**
 * Extracts content and metadata from the input stream, using the given
 * media type as a hint. Delegates to the three-argument {@code extract}
 * overload with a newly created {@link ToTextContentHandler}.
 *
 * @param in input stream to extract the content and metadata from
 * @param mt JAX-RS MediaType of the stream content
 * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
 */
public TikaContent extract(final InputStream in, javax.ws.rs.core.MediaType mt) {
    final ToTextContentHandler textHandler = new ToTextContentHandler();
    return extract(in, textHandler, mt);
}
/**
 * Emits a single character unchanged by handing it to the superclass
 * {@code characters} method as a one-element array.
 *
 * @param ch character to be written
 * @throws SAXException if the character could not be written
 */
protected void write(char ch) throws SAXException {
    final char[] single = { ch };
    super.characters(single, 0, single.length);
}
/**
 * Extracts content and metadata from the input stream, using the given
 * media type as a hint. Delegates to the three-argument {@code extract}
 * overload with a newly created {@link ToTextContentHandler}.
 *
 * @param in input stream to extract the content and metadata from
 * @param mt JAX-RS MediaType of the stream content
 * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
 */
public TikaContent extract(final InputStream in, javax.ws.rs.core.MediaType mt) {
    final ToTextContentHandler textHandler = new ToTextContentHandler();
    return extract(in, textHandler, mt);
}
/**
 * Handles ignorable whitespace as if it were ordinary character data: the
 * call is passed on unchanged to {@link #characters(char[], int, int)}.
 *
 * @param ch character array holding the whitespace
 * @param start start position in the array
 * @param length number of characters to use from the array
 * @throws SAXException if {@link #characters(char[], int, int)} throws it
 */
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    this.characters(ch, start, length);
}
switch(type) { case TEXT: return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); case HTML: return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit); return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit); default: return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); return new ToTextContentHandler(); case HTML: return new ToHTMLContentHandler(); return new ToXMLContentHandler(); default: return new ToTextContentHandler();
/**
 * Forwards ignorable whitespace to the superclass {@code characters}
 * method, but only when whitespace is not being skipped and the
 * {@code addedText} flag is set. After forwarding, the flag is cleared.
 *
 * @param ch character array holding the whitespace
 * @param start start position in the array
 * @param length number of characters to use from the array
 * @throws SAXException if the superclass call throws it
 */
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    // Otherwise the whitespace is ignored.
    if (skipWhitespaces || !addedText) {
        return;
    }
    super.characters(ch, start, length);
    addedText = false;
}

@Override