/**
 * Creates a content handler that writes XHTML body character events to
 * the given writer.
 * <p>
 * The writer is wrapped in a new {@link WriteOutContentHandler}, which is
 * then handed to another constructor of this class.
 *
 * @param writer writer that the wrapping {@link WriteOutContentHandler} is built around
 */
public BoilerpipeContentHandler(Writer writer) {
    // Pure delegation: all setup happens in the constructor that accepts the wrapped handler.
    this(new WriteOutContentHandler(writer));
}
static String extractText(FsSettings fsSettings, int indexedChars, InputStream stream, Metadata metadata) throws IOException, TikaException { initTika(fsSettings.getFs()); WriteOutContentHandler handler = new WriteOutContentHandler(indexedChars); try { parser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); }
/**
 * Forwards the element start to the superclass, then injects a short textual
 * placeholder for two kinds of elements:
 * <ul>
 *   <li>{@code img} elements carrying an {@code alt} attribute produce
 *       {@code [image: <alt>]};</li>
 *   <li>{@code a} elements carrying a {@code name} attribute produce
 *       {@code [bookmark: <name>]}.</li>
 * </ul>
 * The placeholder is emitted through {@code characters(...)}.
 *
 * @throws SAXException if the superclass call or the character emission fails
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
        throws SAXException {
    super.startElement(uri, localName, qName, attributes);

    if ("img".equals(localName)) {
        final String altText = attributes.getValue("alt");
        if (altText != null) {
            final char[] imageNote = ("[image: " + altText + ']').toCharArray();
            characters(imageNote, 0, imageNote.length);
        }
    }

    if ("a".equals(localName)) {
        final String anchorName = attributes.getValue("name");
        if (anchorName != null) {
            final char[] bookmarkNote = ("[bookmark: " + anchorName + ']').toCharArray();
            characters(bookmarkNote, 0, bookmarkNote.length);
        }
    }
}
}
final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength); parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context); } catch (Throwable e) { if (!contentHandler.isWriteLimitReached(e)) { throw e; } else {
@Test public void testLimit() throws Exception { //TIKA-2668 - java 11-ea Parser p = new MockParser(); WriteOutContentHandler handler = new WriteOutContentHandler(15); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); Parser[] parsers = new Parser[1]; parsers[0] = p; Parser autoDetectParser = new AutoDetectParser(parsers); try (InputStream is = getResourceAsStream("/test-documents/example.xml")) { autoDetectParser.parse(is, handler, metadata, parseContext); } catch (Exception e) { tryToFindIllegalStateException(e); } assertEquals("hello wo", handler.toString().trim()); }
if (!writeOutContentHandler.isWriteLimitReached(e)) { log.debug("Failed to extract text from a binary property." + " This is a fairly common case, and nothing to" if (!writeOutContentHandler.isWriteLimitReached(t)) { log.debug("Failed to extract text from a binary property." + " This is a fairly common case, and nothing to" value.discard(); setExtractedText(writeOutContentHandler.toString());
/**
 * Checks whether the given exception (or any of its root causes) was
 * thrown by this handler as a signal of reaching the write limit.
 * <p>
 * The cause chain is walked iteratively and at most a fixed number of links
 * are inspected, so a {@code null} argument or a cyclic cause chain yields
 * {@code false} instead of a {@link NullPointerException} or a
 * {@link StackOverflowError}. The first {@code WriteLimitReachedException}
 * found decides the result: its tag is compared with this handler's tag and
 * deeper causes are not examined.
 *
 * @since Apache Tika 0.7
 * @param t throwable, may be {@code null}
 * @return <code>true</code> if the write limit was reached,
 *         <code>false</code> otherwise
 */
public boolean isWriteLimitReached(Throwable t) {
    // Upper bound on inspected links; protects against cause chains that loop back on themselves.
    final int maxCauseDepth = 100;
    Throwable current = t;
    for (int depth = 0; current != null && depth < maxCauseDepth; depth++) {
        if (current instanceof WriteLimitReachedException) {
            // Only a signal carrying this handler's own tag counts as "our" limit.
            return tag.equals(((WriteLimitReachedException) current).tag);
        }
        current = current.getCause();
    }
    return false;
}
WriteOutContentHandler woh = new WriteOutContentHandler(500 * 1000); // 500K limit (Tika default: 100K) BodyContentHandler ch = new BodyContentHandler(woh); if (woh.isWriteLimitReached(t)) {
if (!writeOutContentHandler.isWriteLimitReached(e)) { log.debug("Failed to extract text from a binary property." + " This is a fairly common case, and nothing to" if (!writeOutContentHandler.isWriteLimitReached(t)) { log.debug("Failed to extract text from a binary property." + " This is a fairly common case, and nothing to" value.discard(); setExtractedText(writeOutContentHandler.toString());
/**
 * Runs the parser against the handler (all other arguments {@code null})
 * and asserts that the parse was aborted by a {@link SAXException} that the
 * handler recognises as its write-limit signal.
 *
 * @param p       parser to invoke
 * @param handler handler expected to report that its write limit was reached
 * @throws Exception any parse failure that is not the handler's write-limit signal
 */
private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) throws Exception {
    boolean limitSignalled = false;
    try {
        p.parse(null, handler, null, null);
    } catch (SAXException saxFailure) {
        if (handler.isWriteLimitReached(saxFailure)) {
            limitSignalled = true;
        } else {
            // Unrelated SAX failure: let the test fail with the original exception.
            throw saxFailure;
        }
    }
    assertTrue("WriteLimitReached", limitSignalled);
}
//TODO: is there a better way than to repeat this with diff signature?
/**
 * Creates a content handler that writes XHTML body character events to
 * the given writer.
 * <p>
 * The writer is wrapped in a new {@link WriteOutContentHandler}, which is
 * then handed to another constructor of this class.
 *
 * @param writer writer that the wrapping {@link WriteOutContentHandler} is built around
 */
public BodyContentHandler(Writer writer) {
    // Pure delegation: all setup happens in the constructor that accepts the wrapped handler.
    this(new WriteOutContentHandler(writer));
}
throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxLength); try { ParseContext context = new ParseContext(); stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { stream.close(); return handler.toString();
/**
 * Checks whether the given exception (or any of its root causes) was
 * thrown by this handler as a signal of reaching the write limit.
 * <p>
 * The cause chain is walked iteratively and at most a fixed number of links
 * are inspected, so a {@code null} argument or a cyclic cause chain yields
 * {@code false} instead of a {@link NullPointerException} or a
 * {@link StackOverflowError}. The first {@code WriteLimitReachedException}
 * found decides the result: its tag is compared with this handler's tag and
 * deeper causes are not examined.
 *
 * @since Apache Tika 0.7
 * @param t throwable, may be {@code null}
 * @return <code>true</code> if the write limit was reached,
 *         <code>false</code> otherwise
 */
public boolean isWriteLimitReached(Throwable t) {
    // Upper bound on inspected links; protects against cause chains that loop back on themselves.
    final int maxCauseDepth = 100;
    Throwable current = t;
    for (int depth = 0; current != null && depth < maxCauseDepth; depth++) {
        if (current instanceof WriteLimitReachedException) {
            // Only a signal carrying this handler's own tag counts as "our" limit.
            return tag.equals(((WriteLimitReachedException) current).tag);
        }
        current = current.getCause();
    }
    return false;
}
/**
 * Forwards the element start to the superclass, then injects a short textual
 * placeholder for two kinds of elements:
 * <ul>
 *   <li>{@code img} elements carrying an {@code alt} attribute produce
 *       {@code [image: <alt>]};</li>
 *   <li>{@code a} elements carrying a {@code name} attribute produce
 *       {@code [bookmark: <name>]}.</li>
 * </ul>
 * The placeholder is emitted through {@code characters(...)}.
 *
 * @throws SAXException if the superclass call or the character emission fails
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
        throws SAXException {
    super.startElement(uri, localName, qName, attributes);

    if ("img".equals(localName)) {
        final String altText = attributes.getValue("alt");
        if (altText != null) {
            final char[] imageNote = ("[image: " + altText + ']').toCharArray();
            characters(imageNote, 0, imageNote.length);
        }
    }

    if ("a".equals(localName)) {
        final String anchorName = attributes.getValue("name");
        if (anchorName != null) {
            final char[] bookmarkNote = ("[bookmark: " + anchorName + ']').toCharArray();
            characters(bookmarkNote, 0, bookmarkNote.length);
        }
    }
}
}
/**
 * Creates a content handler that writes XHTML body character events to
 * an internal string buffer. The contents of the buffer can be retrieved
 * using the {@link #toString()} method.
 * <p>
 * The internal string buffer is bounded at 100k characters. If this write
 * limit is reached, then a {@link SAXException} is thrown.
 * <p>
 * Implemented by handing a default-constructed
 * {@link WriteOutContentHandler} to another constructor of this class.
 */
public BodyContentHandler() {
    // Pure delegation: the buffer comes from the no-argument WriteOutContentHandler.
    this(new WriteOutContentHandler());
}
throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { stream.close(); return handler.toString();
/**
 * Checks whether the given exception (or any of its root causes) was
 * thrown by this handler as a signal of reaching the write limit.
 * <p>
 * The cause chain is walked iteratively and at most a fixed number of links
 * are inspected, so a {@code null} argument or a cyclic cause chain yields
 * {@code false} instead of a {@link NullPointerException} or a
 * {@link StackOverflowError}. The first {@code WriteLimitReachedException}
 * found decides the result: its tag is compared with this handler's tag and
 * deeper causes are not examined.
 *
 * @since Apache Tika 0.7
 * @param t throwable, may be {@code null}
 * @return <code>true</code> if the write limit was reached,
 *         <code>false</code> otherwise
 */
public boolean isWriteLimitReached(Throwable t) {
    // Upper bound on inspected links; protects against cause chains that loop back on themselves.
    final int maxCauseDepth = 100;
    Throwable current = t;
    for (int depth = 0; current != null && depth < maxCauseDepth; depth++) {
        if (current instanceof WriteLimitReachedException) {
            // Only a signal carrying this handler's own tag counts as "our" limit.
            return tag.equals(((WriteLimitReachedException) current).tag);
        }
        current = current.getCause();
    }
    return false;
}
/**
 * Forwards the element start to the superclass, then injects a short textual
 * placeholder for two kinds of elements:
 * <ul>
 *   <li>{@code img} elements carrying an {@code alt} attribute produce
 *       {@code [image: <alt>]};</li>
 *   <li>{@code a} elements carrying a {@code name} attribute produce
 *       {@code [bookmark: <name>]}.</li>
 * </ul>
 * The placeholder is emitted through {@code characters(...)}.
 *
 * @throws SAXException if the superclass call or the character emission fails
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
        throws SAXException {
    super.startElement(uri, localName, qName, attributes);

    if ("img".equals(localName)) {
        final String altText = attributes.getValue("alt");
        if (altText != null) {
            final char[] imageNote = ("[image: " + altText + ']').toCharArray();
            characters(imageNote, 0, imageNote.length);
        }
    }

    if ("a".equals(localName)) {
        final String anchorName = attributes.getValue("name");
        if (anchorName != null) {
            final char[] bookmarkNote = ("[bookmark: " + anchorName + ']').toCharArray();
            characters(bookmarkNote, 0, bookmarkNote.length);
        }
    }
}
}
/**
 * Creates a content handler that writes XHTML body character events to
 * the given output stream using the default encoding.
 * <p>
 * The stream is wrapped in a new {@link WriteOutContentHandler}, which is
 * then handed to another constructor of this class.
 *
 * @param stream output stream that the wrapping {@link WriteOutContentHandler} is built around
 */
public BodyContentHandler(OutputStream stream) {
    // Pure delegation: all setup happens in the constructor that accepts the wrapped handler.
    this(new WriteOutContentHandler(stream));
}
private String parseStringValue(Blob v, Metadata metadata) { WriteOutContentHandler handler = new WriteOutContentHandler(); try { InputStream stream = v.getNewStream(); try { parser.parse(stream, handler, metadata, new ParseContext()); } finally { stream.close(); } } catch (LinkageError e) { // Capture and ignore errors caused by extraction libraries // not being present. This is equivalent to disabling // selected media types in configuration, so we can simply // ignore these errors. } catch (Throwable t) { // Capture and report any other full text extraction problems. // The special STOP exception is used for normal termination. if (!handler.isWriteLimitReached(t)) { log.debug("Failed to extract text from a binary property: " + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", t); return "TextExtractionError"; } } return handler.toString(); }