org.apache.tika.parser.ParseContext.set java code examples

Refine search

ParseContext.<init>

/**
 * Creates the parser wrapper: stores the crawl configuration and TLD list,
 * instantiates the underlying {@link HtmlParser}, and prepares a
 * {@link ParseContext} in which an {@link AllTagMapper} is registered as the
 * {@link HtmlMapper} implementation.
 *
 * @param config  crawl configuration kept for later use by this parser
 * @param tldList top-level-domain list kept for later use by this parser
 * @throws InstantiationException declared for backward compatibility with
 *         existing callers; the body no longer instantiates reflectively
 * @throws IllegalAccessException declared for backward compatibility with
 *         existing callers; the body no longer instantiates reflectively
 */
public TikaHtmlParser(CrawlConfig config, TLDList tldList) throws InstantiationException, IllegalAccessException {
  this.config = config;
  this.tldList = tldList;
  htmlParser = new HtmlParser();
  parseContext = new ParseContext();
  // Class.newInstance() is deprecated (Java 9+) and bypasses compile-time
  // exception checking; constructing the mapper directly is equivalent and safer.
  parseContext.set(HtmlMapper.class, new AllTagMapper());
}

/**
 * Convenience overload that parses with a freshly created context.
 * The new {@link ParseContext} has this instance registered under
 * {@link Parser} before the four-argument {@code parse} is invoked.
 *
 * @param stream   the document to be parsed
 * @param handler  receiver of the SAX events produced by parsing
 * @param metadata document metadata passed through to the delegate call
 * @throws IOException   propagated from the delegate call
 * @throws SAXException  propagated from the delegate call
 * @throws TikaException propagated from the delegate call
 */
public void parse(
    InputStream stream, ContentHandler handler, Metadata metadata)
    throws IOException, SAXException, TikaException {
  final ParseContext selfContext = new ParseContext();
  selfContext.set(Parser.class, this);
  parse(stream, handler, metadata, selfContext);
}

/**
 * Parses the given document and exposes its extracted text as a reader.
 * Hints such as a file name or content type may be supplied through the
 * given metadata instance, and metadata extracted from the document is
 * written back into that same instance.
 * <p>
 * Ownership of the stream passes to the returned reader: the stream and
 * any associated resources are closed no later than the moment
 * {@link Reader#close()} is called on the result.
 *
 * @param stream the document to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the document can not be read or parsed
 */
public Reader parse(InputStream stream, Metadata metadata)
    throws IOException {
  final ParseContext parseContext = new ParseContext();
  // Register the configured parser under the Parser key of the context.
  parseContext.set(Parser.class, parser);
  final Reader textReader =
      new ParsingReader(parser, stream, metadata, parseContext);
  return textReader;
}

/**
 * Builds a reader over the text content of a binary stream, using a new
 * {@link AutoDetectParser}, empty {@link Metadata} and a new
 * {@link ParseContext}. After delegation, the parser is registered in the
 * context under {@link Parser}.
 *
 * @param stream binary stream
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream) throws IOException {
  this(
      new AutoDetectParser(),
      stream,
      new Metadata(),
      new ParseContext());
  // Registration happens after delegation because this(...) must come first.
  context.set(Parser.class, parser);
}

/**
 * Builds a reader over the text content of a named binary stream, using a
 * new {@link AutoDetectParser}, metadata obtained from
 * {@code getMetadata(name)} and a new {@link ParseContext}. After
 * delegation, the parser is registered in the context under {@link Parser}.
 *
 * @param stream binary stream
 * @param name document name
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream, String name) throws IOException {
  this(
      new AutoDetectParser(),
      stream,
      getMetadata(name),
      new ParseContext());
  // Registration happens after delegation because this(...) must come first.
  context.set(Parser.class, parser);
}

/**
 * Example: passes a {@link Locale} to the parser through the
 * {@link ParseContext}. An empty byte stream is parsed with an
 * {@link AutoDetectParser} while {@link Locale#ENGLISH} is registered
 * in the context.
 *
 * @throws Exception if parsing fails
 */
public static void testLocale() throws Exception {
  final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  final ContentHandler sink = new DefaultHandler();
  final Metadata meta = new Metadata();
  final Parser autoDetect = new AutoDetectParser();

  final ParseContext localeContext = new ParseContext();
  localeContext.set(Locale.class, Locale.ENGLISH);

  autoDetect.parse(emptyInput, sink, meta, localeContext);
}

/**
 * Parses the given stream with a context that carries both the parser and
 * a {@code MyEmbeddedDocumentExtractor} built from the output directory and
 * that same context.
 *
 * @param is        stream of the document to parse
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 * @throws IOException   propagated from the parse call
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
  final Metadata meta = new Metadata();
  final ParseContext parseContext = new ParseContext();
  // NOTE(review): -1 presumably disables the handler's write limit - confirm.
  final ContentHandler bodySink = new BodyContentHandler(-1);

  // The parser is registered before the extractor is built, because the
  // extractor receives this context in its constructor.
  parseContext.set(Parser.class, parser);
  parseContext.set(
      EmbeddedDocumentExtractor.class,
      new MyEmbeddedDocumentExtractor(outputDir, parseContext));

  parser.parse(is, bodySink, meta, parseContext);
}

/**
 * Example: supplies a custom {@link HtmlMapper} through the
 * {@link ParseContext}. An empty byte stream is parsed with an
 * {@link AutoDetectParser} while an {@link IdentityHtmlMapper} is
 * registered in the context.
 *
 * @throws Exception if parsing fails
 */
public static void testHtmlMapper() throws Exception {
  final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  final ContentHandler sink = new DefaultHandler();
  final Metadata meta = new Metadata();
  final Parser autoDetect = new AutoDetectParser();

  final ParseContext mapperContext = new ParseContext();
  mapperContext.set(HtmlMapper.class, new IdentityHtmlMapper());

  autoDetect.parse(emptyInput, sink, meta, mapperContext);
}

  /**
   * Example: registers a {@link ParserDecorator} around the auto-detect
   * parser under the {@link Parser} key of the context. The decorator's
   * {@code parse} override is a placeholder with an empty body.
   *
   * @throws Exception if parsing fails
   */
  public static void testCompositeDocument() throws Exception {
    final InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    final ContentHandler sink = new DefaultHandler();
    final Metadata meta = new Metadata();
    final Parser autoDetect = new AutoDetectParser();
    final ParseContext compositeContext = new ParseContext();

    final ParserDecorator componentParser = new ParserDecorator(autoDetect) {
      private static final long serialVersionUID = 4424210691523343833L;

      @Override
      public void parse(InputStream stream, ContentHandler handler,
               Metadata metadata, ParseContext context)
          throws IOException, SAXException, TikaException {
        // custom processing of the component document
      }
    };
    compositeContext.set(Parser.class, componentParser);

    autoDetect.parse(emptyInput, sink, meta, compositeContext);
  }
}

/**
 * Parses the given stream with a context whose {@link Parser} entry is a
 * {@code RecursiveParser} built from the supplied extractor and handler.
 * The SAX events and metadata of the parse go to throwaway instances.
 *
 * @param stream           the container document to parse
 * @param recurseExtractor extractor handed to the recursive parser
 * @param handler          resource handler handed to the recursive parser
 * @throws IOException   propagated from the parse call
 * @throws TikaException propagated from the parse call, or wrapping a
 *                       {@link SAXException} raised during parsing
 */
public void extract(
    TikaInputStream stream, ContainerExtractor recurseExtractor,
    EmbeddedResourceHandler handler)
    throws IOException, TikaException {
  final ParseContext recursionContext = new ParseContext();
  final RecursiveParser embeddedParser =
      new RecursiveParser(recurseExtractor, handler);
  recursionContext.set(Parser.class, embeddedParser);

  try {
    parser.parse(stream, new DefaultHandler(), new Metadata(), recursionContext);
  } catch (SAXException saxFailure) {
    // Callers only expect IOException/TikaException, so wrap the SAX error.
    throw new TikaException("Unexpected SAX exception", saxFailure);
  }
}

/**
 * Parses the file at {@code path} with an {@code InterruptingContentHandler}
 * built from {@code query} and reports whether the parse was cut short by
 * a {@code QueryMatchedException}.
 *
 * @param query text handed to the content handler
 * @param path  file to parse
 * @return {@code true} if a {@code QueryMatchedException} was caught;
 *         {@code false} if parsing finished or failed with another error
 */
public boolean findInFile(String query, Path path) {
  final InterruptingContentHandler matchHandler = new InterruptingContentHandler(query);
  final Metadata meta = new Metadata();
  final ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, tika.getParser());

  boolean matched = false;
  try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
    tika.getParser().parse(in, matchHandler, meta, parseContext);
  } catch (QueryMatchedException hit) {
    // NOTE(review): presumably raised by the handler once the query is seen - confirm.
    matched = true;
  } catch (SAXException | TikaException | IOException failure) {
    // something went wrong with parsing...
    failure.printStackTrace();
  }
  return matched;
}

/**
 * Demonstrates extracting content from the outer document together with
 * all embedded documents. The essential step is registering a
 * {@link Parser} in the {@link ParseContext}; here the auto-detect parser
 * itself is registered.
 *
 * @return content, including from embedded documents
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
  final AutoDetectParser autoDetect = new AutoDetectParser();
  final BodyContentHandler textSink = new BodyContentHandler();
  final Metadata meta = new Metadata();

  final ParseContext recursingContext = new ParseContext();
  recursingContext.set(Parser.class, autoDetect);

  try (InputStream docStream =
      ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
    autoDetect.parse(docStream, textSink, meta, recursingContext);
    return textSink.toString();
  }
}

/**
 * Demonstrates skipping embedded documents: pass a
 * {@link org.apache.tika.parser.ParseContext} whose {@link Parser} entry
 * is an {@link EmptyParser}.
 *
 * @return The content of a file.
 */
public String parseNoEmbeddedExample() throws IOException, SAXException, TikaException {
  final AutoDetectParser autoDetect = new AutoDetectParser();
  final BodyContentHandler textSink = new BodyContentHandler();
  final Metadata meta = new Metadata();

  final ParseContext nonRecursingContext = new ParseContext();
  nonRecursingContext.set(Parser.class, new EmptyParser());

  try (InputStream docStream =
      ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
    autoDetect.parse(docStream, textSink, meta, nonRecursingContext);
    return textSink.toString();
  }
}

) throws Exception {
  Metadata metadata = new Metadata();
  ParseContext pc = new ParseContext();
  MutableInt count = new MutableInt();
  pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));
  TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc);

  new WriteOutContentHandler(maxStringLength);
try {
  ParseContext context = new ParseContext();
  context.set(Parser.class, parser);
  parser.parse(
      stream, new BodyContentHandler(handler), metadata, context);

  new WriteOutContentHandler(maxLength);
try {
  ParseContext context = new ParseContext();
  context.set(Parser.class, parser);
  parser.parse(
         stream, new BodyContentHandler(handler), metadata, context);

/**
 * Builds the main window: adds the menu bar and a panel of named cards,
 * shows the "welcome" card, then stores the parser and prepares the
 * {@link ParseContext} held by this instance.
 *
 * @param parser parser stored in this instance and wrapped by an
 *               {@code ImageSavingParser}
 */
public TikaGUI(Parser parser) {
  // "Apache Tika" is passed to the superclass constructor (presumably the window title).
  super("Apache Tika");
  setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
  addMenuBar();
  // One card per output view; the last argument is the card's name.
  cards = new JPanel(layout);
  addWelcomeCard(cards, "welcome");
  metadata = addCard(cards, "text/plain", "metadata");
  html = addCard(cards, "text/html", "html");
  text = addCard(cards, "text/plain", "text");
  textMain = addCard(cards, "text/plain", "main");
  xml = addCard(cards, "text/plain", "xhtml");
  json = addCard(cards, "text/plain", "json");
  add(cards);
  // Start on the welcome card.
  layout.show(cards, "welcome");
  setPreferredSize(new Dimension(640, 480));
  pack();
  this.context = new ParseContext();
  this.parser = parser;
  // The Parser entry of the context is the image-saving wrapper, not the raw parser.
  this.imageParser = new ImageSavingParser(parser);
  this.context.set(DocumentSelector.class, new ImageDocumentSelector());
  this.context.set(Parser.class, imageParser);
}

/**
 * Verifies that a TXTParser leaf can be located through a context whose
 * {@link Parser} entry is an {@link AutoDetectParser}.
 */
@Test
public void testSimple() {
  final Parser autoDetect = new AutoDetectParser();
  final ParseContext lookupContext = new ParseContext();
  lookupContext.set(Parser.class, autoDetect);

  final Parser leaf = EmbeddedDocumentUtil.tryToFindExistingLeafParser(
      org.apache.tika.parser.txt.TXTParser.class, lookupContext);

  assertNotNull(leaf);
  assertEquals(org.apache.tika.parser.txt.TXTParser.class, leaf.getClass());
}

/**
 * Verifies that a no-op task submitted through {@code ConcurrentUtils}
 * completes with a {@code null} result when the context carries the
 * executor service from the default {@link TikaConfig}.
 */
@Test
public void testExecuteExecutor() throws Exception {
  final TikaConfig defaults = TikaConfig.getDefaultConfig();
  final ParseContext executorContext = new ParseContext();
  executorContext.set(ExecutorService.class, defaults.getExecutorService());

  final Runnable noOp = new Runnable() {
    @Override
    public void run() {
      //Do nothing
    }
  };
  final Future<?> outcome = ConcurrentUtils.execute(executorContext, noOp);

  assertNull(outcome.get());
}

@Override
public boolean processFileResource(FileResource fileResource) {
  ParseContext context = new ParseContext();
  if (parseRecursively) {
    context.set(Parser.class, parser);

Javadoc

Adds the given value to the context as an implementation of the given interface.

Popular methods of ParseContext

<init>
get
Returns the object in this context that implements the given interface, or the given default value if such an object is not found.
getDocumentBuilder
Returns the DOM builder specified in this parsing context. If a builder is not explicitly specified,
getSAXParser
Returns the SAX parser specified in this parsing context. If a parser is not explicitly specified, t
getXMLInputFactory
Returns the StAX input factory specified in this parsing context. If a factory is not explicitly spe
getDocumentBuilderFactory
Returns the DOM builder factory specified in this parsing context. If a factory is not explicitly sp
getSAXParserFactory
Returns the SAX parser factory specified in this parsing context. If a factory is not explicitly spe
getXMLReader
Returns the XMLReader specified in this parsing context. If a reader is not explicitly specified, th
setVersion
tryToSetSAXFeatureOnDOMFactory
tryToSetStaxProperty
tryToSetXercesManager

Popular in Java

Reading from database using SQL prepared statement
getContentResolver (Context)
scheduleAtFixedRate (Timer)
findViewById (Activity)
HashSet (java.util)
HashSet is an implementation of a Set. All optional operations (adding and removing) are supported.
Queue (java.util)
A collection designed for holding elements prior to processing. Besides basic java.util.Collection o
CountDownLatch (java.util.concurrent)
A synchronization aid that allows one or more threads to wait until a set of operations being perfor
Cipher (javax.crypto)
This class provides access to implementations of cryptographic ciphers for encryption and decryption
LogFactory (org.apache.commons.logging)
Factory for creating Log instances, with discovery and configuration features similar to that employ
Notification (javax.management)
Top PhpStorm plugins

How to use the set method in org.apache.tika.parser.ParseContext

Best Java code snippets using org.apache.tika.parser.ParseContext.set (Showing top 20 results out of 333)

Refine search

How to use the set method in org.apache.tika.parser.ParseContext