/**
 * Example of extracting the plain text of the contents.
 * Will return only the "body" part of the document.
 *
 * @return the plain-text body of the bundled "test.doc" resource
 * @throws IOException   if the resource is missing or cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    // Default BodyContentHandler caps output at 100k characters — fine for the small test file.
    BodyContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        if (stream == null) {
            // Fail fast with a clear message instead of an opaque NPE inside the parser.
            throw new IOException("resource not found: test.doc");
        }
        parser.parse(stream, handler, metadata);
        return handler.toString();
    }
}
// Fragment: opens the bundled "test.doc" resource and extracts its plain-text body.
// NOTE(review): getResourceAsStream may return null if the resource is missing,
// which would surface as an NPE inside parse() — confirm the resource is packaged.
try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {
    parser.parse(stream, handler, metadata);
    return handler.toString();
/**
 * This example shows how to extract content from the outer document and all
 * embedded documents. The key is to specify a {@link Parser} in the {@link ParseContext}.
 *
 * @return content, including from embedded documents
 * @throws IOException   if the resource is missing or cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // The Parser registered in the context is used to recurse into embedded documents.
    context.set(Parser.class, parser);
    try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        if (stream == null) {
            // Fail fast with a clear message instead of an opaque NPE inside the parser.
            throw new IOException("resource not found: test_recursive_embedded.docx");
        }
        parser.parse(stream, handler, metadata, context);
        return handler.toString();
    }
}
/**
 * If you don't want content from embedded documents, send in a
 * {@link org.apache.tika.parser.ParseContext} that contains an {@link EmptyParser}.
 *
 * @return the content of the outer file only
 * @throws IOException   if the resource is missing or cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseNoEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    // EmptyParser discards embedded documents, so only the outer document's text is extracted.
    parseContext.set(Parser.class, new EmptyParser());
    try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        if (stream == null) {
            // Fail fast with a clear message instead of an opaque NPE inside the parser.
            throw new IOException("resource not found: test_recursive_embedded.docx");
        }
        parser.parse(stream, handler, metadata, parseContext);
        return handler.toString();
    }
}
// Fragment: parses five unrecognizable bytes; the context's fallback parser writes "Fell back!".
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
// Second parse of the same bytes expects empty output — presumably the parser/context was
// reconfigured between these calls so no fallback fires; NOTE(review): confirm surrounding setup.
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("", handler.toString());
// Fragment (truncated mid-statement): tail of a parse(...) call taking handler h,
// metadata m and the shared parseContext.
h, m, parseContext);
// Copies the text collected by h into the outer handler.
// NOTE(review): characters(String) suggests `handler` is an XHTMLContentHandler or similar
// decorator — the plain SAX ContentHandler.characters takes (char[], int, int); verify the type.
handler.characters(h.toString());
} catch (SAXException e) {
// Fragment: every parse of the unrecognizable bytes routes to the fallback parser,
// which emits "Fell back!". A fresh handler per call shows the output never accumulates.
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
// Fragment: first parse triggers only the first fallback parser ("Fell back 1!").
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!", handler.toString());
// Subsequent parses expect output from BOTH fallback parsers even though each call uses a
// fresh handler — presumably the parser under test fans out to all registered fallbacks
// after the first failure; NOTE(review): confirm against the surrounding test setup.
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
// Fetch a remote document over HTTP, extract its plain text with Tika, and write it to a file.
HttpGet httpget = new HttpGet("http://url.here");
HttpEntity entity = null;
HttpClient client = new DefaultHttpClient();
HttpResponse response = client.execute(httpget);
entity = response.getEntity();
if (entity != null) {
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    Parser parser = new AutoDetectParser();
    // try-with-resources guarantees the response stream is released even if parsing
    // throws — the original leaked `instream` on any failure.
    try (InputStream instream = entity.getContent()) {
        parser.parse(instream, handler, metadata, new ParseContext());
    }
    String plainText = handler.toString();
    // try-with-resources flushes and closes the writer even if write() throws — the
    // original leaked the FileWriter on failure.
    // NOTE(review): FileWriter uses the platform default charset; consider UTF-8.
    try (FileWriter writer = new FileWriter("/scratch/cache/output.txt")) {
        writer.write(plainText);
    }
    System.out.println("done");
}
// Extracts the plain-text body of the stream `is` into `plainText` using the ambient
// parser and metadata objects.
BodyContentHandler bch = new BodyContentHandler();
// Default handler caps output at 100k characters — NOTE(review): confirm inputs stay under it.
parser.parse(is, bch, metadata, new ParseContext());
String plainText = bch.toString();
/**
 * Parses the bundled "test.pdf" resource and returns its plain-text body.
 *
 * @return the extracted text content
 * @throws IOException   if the resource cannot be read
 * @throws SAXException  if the SAX content handler fails
 * @throws TikaException if Tika cannot parse the document
 */
public String parseExample() throws IOException, SAXException, TikaException {
    AutoDetectParser pdfParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    BodyContentHandler textHandler = new BodyContentHandler();
    try (InputStream resourceStream = ParsingExample.class.getResourceAsStream("test.pdf")) {
        pdfParser.parse(resourceStream, textHandler, docMetadata);
        return textHandler.toString();
    }
}
public String fetchPageAutoDetectParser(final String url ){ String fetchURL = addHttp(url); String pageContent = null; URLConnection connection; try { log.info("fetch url auto detect parser " + url); connection = new URL(fetchURL).openConnection(); connection.setReadTimeout(DEFAULT_TIMEOUT); //parse method parameters Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); //parsing the file parser.parse(connection.getInputStream(), handler, metadata, context); pageContent = handler.toString(); } catch (Exception e) { log.info(e.getMessage() + "\n" + e); } return pageContent; }
// Parses the stream `is` to plain text (auto-detecting the document type) into `text`.
BodyContentHandler handler = new BodyContentHandler();
AutoDetectParser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
try {
    parser.parse(is, handler, metadata);
    text = handler.toString();
} catch(TikaException te) {
    // NOTE(review): only TikaException is caught, and it is merely printed — on parse
    // failure `text` keeps its previous value; IOException/SAXException propagate.
    System.out.println(te.toString());
} finally {
    // Always release the input stream, even when parsing fails.
    is.close();
}
/**
 * Extract data from MS Word DOC/DOCX file to text.
 *
 * @param path location of the Word document on disk
 * @return the plain-text body of the document
 * @throws DataflowException if the file cannot be read or parsed
 */
public static String extractWordFile(Path path) throws DataflowException {
    AutoDetectParser wordParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    BodyContentHandler textHandler = new BodyContentHandler();
    try (FileInputStream docStream = new FileInputStream(path.toString())) {
        wordParser.parse(docStream, textHandler, docMetadata);
        return textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // Wrap all read/parse failures in the pipeline's checked exception, preserving the cause.
        throw new DataflowException(e);
    }
}
/**
 * Parses the given stream through a Boilerpipe keep-everything handler and collects
 * both the extracted text and every Tika metadata property.
 *
 * @param stream      raw document bytes
 * @param fileName    original file name (used when building metadata)
 * @param contentType declared content type (used when building metadata)
 * @return the parsed text plus a metadata map, or {@code null} when extraction fails
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler handler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler textHandler =
            new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    try {
        parser.parse(stream, textHandler, metadata, new ParseContext());
        // Copy every metadata property into a plain map for the caller.
        Map<String, String> properties = new HashMap<>();
        for (String key : metadata.names()) {
            properties.put(key, metadata.get(key));
        }
        return new ParsedData(handler.toString(), properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata metadata = new Metadata();
        AutoDetectParser autoParser = new AutoDetectParser();
        autoParser.parse(stream, textHandler, metadata, new ParseContext());

        // Keep only the text above the tearline marker, when one is present.
        String fullContent = textHandler.toString();
        Matcher tearline = tearlinePattern.matcher(fullContent);
        String retained = tearline.find() ? fullContent.substring(0, tearline.start()) : fullContent;
        jCas.setDocumentText(removeBoilerplate(retained).trim());

        // Surface every Tika metadata property as document metadata.
        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        AutoDetectParser autoParser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        autoParser.parse(stream, textHandler, metadata, new ParseContext());

        String fullContent = textHandler.toString();
        // Truncate at the tearline marker when one is present, then strip boilerplate.
        Matcher tearlineMatch = tearlinePattern.matcher(fullContent);
        String body = tearlineMatch.find()
                ? fullContent.substring(0, tearlineMatch.start())
                : fullContent;
        jCas.setDocumentText(removeBoilerplate(body).trim());

        // Surface every Tika metadata property as document metadata.
        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
public void testFopMacroLibrary() throws Exception { String screentextUrl = screenUrl.concat("Fop"); HttpClient http = initHttpClient(); http.setUrl(screentextUrl.concat(authentificationQuery)); //FIXME need to check if the stream is an application-pdf that don't contains ftl stack trace InputStream screenInputStream = http.postStream(); assertNotNull("Response failed from ofbiz", screenInputStream); assertEquals("Response contentType isn't good : " + http.getResponseContentType(), "application/pdf;charset=UTF-8", http.getResponseContentType()); String screenOutString = ""; try { BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); Metadata metadata = new Metadata(); new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext()); screenOutString = handler.toString(); } finally { screenInputStream.close(); } //Test if a ftl macro error is present assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail", screenOutString.contains("FreeMarker template error:")); } }
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler contentHandler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata docMetadata = new Metadata();
        AutoDetectParser detectingParser = new AutoDetectParser();
        detectingParser.parse(stream, contentHandler, docMetadata, new ParseContext());

        jCas.setDocumentText(contentHandler.toString());
        // Surface every Tika metadata property as document metadata.
        for (String key : docMetadata.names()) {
            addMetadata(jCas, key, docMetadata.get(key));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        // Fall back to placeholder text only when nothing at all was extracted.
        if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
}
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata meta = new Metadata();
        // Unlimited handler size so large documents are not truncated.
        BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
        parser.parse(stream, bodyHandler, meta, new ParseContext());

        jCas.setDocumentText(bodyHandler.toString());
        // Surface every Tika metadata property as document metadata.
        for (String propertyName : meta.names()) {
            addMetadata(jCas, propertyName, meta.get(propertyName));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        // Fall back to placeholder text only when nothing at all was extracted.
        if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
}