/**
 * Emits an empty XHTML document to the given handler. The input stream is
 * not read by this method.
 *
 * @param stream   the document stream (not consumed here)
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata handed to the XHTML content handler
 * @param context  the parse context (not used here)
 * @throws SAXException if the SAX events could not be handled
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws SAXException {
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
}
protected static void parse(OldExcelExtractor extractor, XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException { // Get the whole text, as a single string String text = extractor.getText(); // Split and output xhtml.startDocument(); String line; BufferedReader reader = new BufferedReader(new StringReader(text)); while ((line = reader.readLine()) != null) { xhtml.startElement("p"); xhtml.characters(line); xhtml.endElement("p"); } xhtml.endDocument(); }
/**
 * Writes the given output string to the handler as a single {@code <p>}
 * element inside an XHTML document.
 *
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata handed to the XHTML content handler
 * @param output   the text to emit
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException  if reading the encoded text fails
 */
private void processOutput(ContentHandler handler, Metadata metadata, String output)
        throws SAXException, IOException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    // Built outside the try so that a failure here does not reach the finally block
    InputStream encoded = new ByteArrayInputStream(output.getBytes(UTF_8));
    try (Reader source = new InputStreamReader(encoded, UTF_8)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] chunk = new char[1024];
        int count;
        while ((count = source.read(chunk)) != -1) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
    } finally {
        // The document is ended even when the body above fails
        xhtml.endDocument();
    }
}
/**
 * Copies the contents of the given stream, decoded as UTF-8, to the given
 * XHTML content handler as a single {@code <p>} element. The work is done
 * on the calling thread, and the stream is closed when the method leaves
 * the reading block.
 *
 * @param stream input stream to read the text from
 * @param xhtml  XHTML content handler
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException  if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
        throws SAXException, IOException {
    try (Reader source = new InputStreamReader(stream, UTF_8)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] chunk = new char[1024];
        int count;
        while ((count = source.read(chunk)) != -1) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
/** * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler, Metadata, ParseContext) */ public void getXHTML( ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException, XmlException, IOException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); buildXHTML(xhtml); // Now do any embedded parts handleEmbeddedParts(handler, metadata); // thumbnail handleThumbnail(handler); xhtml.endDocument(); }
private void extractDocumentArea(WPPrefixArea prefixArea, WPInputStream in, XHTMLContentHandler xhtml) throws SAXException, IOException { // Move to offset (for some reason skip() did not work). for (int i = 0; i < prefixArea.getDocAreaPointer(); i++) { in.readWPByte(); } xhtml.startDocument(); getDocumentAreaExtractor(prefixArea).extract(in, xhtml); xhtml.endDocument(); }
/**
 * Reads the whole stream into memory, opens it as an in-memory NetCDF
 * file, passes that file to {@code unravelStringMet} together with the
 * metadata object, and finally emits an empty XHTML document.
 *
 * @param stream   the document stream; copied completely into memory
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata to populate; its resource name, when present,
 *                 is used as the name of the in-memory file
 * @param context  the parse context (not used here)
 * @throws IOException   if copying the stream fails
 * @throws SAXException  if the XHTML SAX events could not be handled
 * @throws TikaException if opening, reading or closing the NetCDF file
 *                       fails with an {@link IOException}
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    IOUtils.copy(stream, os);

    String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    if (name == null) {
        name = "";
    }

    try {
        NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
        try {
            unravelStringMet(ncFile, null, metadata);
        } finally {
            // Release the file even when metadata extraction fails;
            // previously the handle was never closed.
            ncFile.close();
        }
    } catch (IOException e) {
        throw new TikaException("HDF parse error", e);
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
/**
 * Emits the given content as a single {@code <div>} element inside an
 * XHTML document.
 *
 * @param content
 *            the text to write
 * @param xhtml
 *            XHTML content handler receiving the output
 * @throws SAXException
 *             if the XHTML SAX events could not be handled
 */
private void extractOutput(String content, XHTMLContentHandler xhtml)
        throws SAXException {
    xhtml.startDocument();

    // The whole content goes into one <div>
    xhtml.startElement("div");
    xhtml.characters(content);
    xhtml.endElement("div");

    xhtml.endDocument();
}
}
public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // We only do metadata, for now XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); // What kind is it? byte[] first4 = new byte[4]; IOUtils.readFully(stream, first4); if (first4[0] == (byte)'M' && first4[1] == (byte)'Z') { parsePE(xhtml, metadata, stream, first4); } else if (first4[0] == (byte)0x7f && first4[1] == (byte)'E' && first4[2] == (byte)'L' && first4[3] == (byte)'F') { parseELF(xhtml, metadata, stream, first4); } // Finish everything xhtml.endDocument(); }
/**
 * Starts and immediately ends a document on {@code out}; the visible
 * statements do not read from {@code in}.
 *
 * NOTE(review): the method body is not terminated in this excerpt, so the
 * remainder of the method is presumably cut off here - confirm against the
 * full source.
 */
private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException { out.startDocument(); out.endDocument();
/*
 * Wraps the handler in an XHTMLContentHandler, starts the document, builds
 * a child handler (an EmbeddedContentHandler around a BodyContentHandler
 * that writes into the XHTML handler) and ends the document.
 *
 * NOTE(review): childHandler is not used by the visible statements;
 * presumably the code that consumes it was elided from this excerpt -
 * confirm against the full source.
 */
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); ContentHandler childHandler = new EmbeddedContentHandler( new BodyContentHandler(xhtml)); xhtml.endDocument();
/**
 * Checks that the text of two {@code li} items inside a {@code menu}
 * element comes out as two separate words.
 */
@Test
public void testWhitespaceWithMenus() throws Exception {
    xhtml.startDocument();
    xhtml.startElement("menu");
    for (String item : new String[] {"one", "two"}) {
        xhtml.element("li", item);
    }
    xhtml.endElement("menu");
    xhtml.endDocument();

    String[] tokens = getRealWords(output.toString());
    assertEquals(2, tokens.length);
    assertEquals("one", tokens[0]);
    assertEquals("two", tokens[1]);
}
/**
 * Sets the content type to {@code QP_9} when the metadata has none, then
 * runs a {@code QPWTextExtractor} over the stream inside an XHTML document.
 *
 * @param stream   the document stream
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata to read the content type from and pass to the extractor
 * @param context  the parse context (not used here)
 */
@Override
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Keep a content type that is already set in the metadata
    if (metadata.get(Metadata.CONTENT_TYPE) == null) {
        metadata.set(Metadata.CONTENT_TYPE, QP_9.toString());
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    new QPWTextExtractor().extract(stream, document, metadata);
    document.endDocument();
}
}
/**
 * Verifies that the text of {@code option} elements is separated by
 * whitespace in the text output.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-394">TIKA-394</a>
 */
@Test
public void testWhitespaceWithOptions() throws Exception {
    xhtml.startDocument();
    xhtml.startElement("form");
    xhtml.startElement("select");
    for (String label : new String[] {"opt1", "opt2"}) {
        xhtml.element("option", label);
    }
    xhtml.endElement("select");
    xhtml.endElement("form");
    xhtml.endDocument();

    String[] tokens = output.toString().split("\\s+");
    assertEquals(2, tokens.length);
    assertEquals("opt1", tokens[0]);
    assertEquals("opt2", tokens[1]);
}
public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { HashMap<String,String> properties = this.loadProperties(stream); this.setMetadata(metadata, properties); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); // TODO: put body content here xhtml.startElement("p"); String body = clean(properties.get("body")); if (body != null) xhtml.characters(body); xhtml.endElement("p"); xhtml.endDocument(); }
/**
 * Sets the content type to {@code HELLO_MIME_TYPE} and the
 * {@code "Hello"} metadata entry to {@code "World"}, then emits an empty
 * XHTML document. The input stream is not read.
 *
 * @param stream   the document stream (not consumed here)
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata to populate
 * @param context  the parse context (not used here)
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE);
    metadata.set("Hello", "World");

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
/*
 * Records charsetName as the content encoding in the metadata, then starts
 * and immediately ends an XHTML document on the handler.
 *
 * NOTE(review): no content is written between startDocument() and
 * endDocument() here; presumably the body-producing statements were elided
 * from this excerpt - confirm against the full source.
 */
metadata.set(Metadata.CONTENT_ENCODING, charsetName); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.endDocument();
/*
 * Starts the document, ends a "tr" and a "table" element, and ends the
 * document.
 *
 * NOTE(review): the visible statements contain no matching startElement
 * calls for "tr" or "table"; presumably the statements that open them were
 * elided from this excerpt - confirm against the full source.
 */
xhtml.startDocument(); xhtml.endElement("tr"); xhtml.endElement("table"); xhtml.endDocument();
/**
 * Adds every entry of this object's own metadata map to the supplied
 * metadata, then writes {@code xmlText} (when not null) as character
 * content of an XHTML document. The input stream is not read.
 *
 * @param stream   the document stream (not consumed here)
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata to add the stored entries to
 * @param context  the parse context (not used here)
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    for (Entry<String,String> stored : this.metadata.entrySet()) {
        metadata.add(stored.getKey(), stored.getValue());
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    if (xmlText != null) {
        char[] chars = xmlText.toCharArray();
        document.characters(chars, 0, chars.length);
    }
    document.endDocument();
}
private void extractContent(XHTMLContentHandler xhtmlContentHandler, DefaultMetadata defaultMetadata) throws SAXException{ xhtmlContentHandler.startDocument(); xhtmlContentHandler.newline(); xhtmlContentHandler.endDocument();