uk.gov.dstl.baleen.uima.BaleenContentExtractor.processStream java code examples

/**
 * Checks that extracting {@code tearline/1.docx} with a {@link TearlineContentExtractor} leaves
 * at least one {@link Metadata} annotation in the JCas.
 */
@Test
public void testMetadata() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
 // the file name, so the resource is still found when the build directory contains spaces.
 File f = new File(getClass().getResource("tearline/1.docx").toURI());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
  assertFalse(JCasUtil.select(jCas, Metadata.class).isEmpty());
 }
 contentExtractor.destroy();
}

 /**
  * Checks that a {@link TikaContentExtractor} given {@code corrupt.docx} sets the document text
  * to {@link TikaContentExtractor#CORRUPT_FILE_TEXT}.
  */
 @Test
 public void testTikaCorruptFile() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
  // the file name, so the resource is still found when the build directory contains spaces.
  File f = new File(getClass().getResource("corrupt.docx").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
  }
  contentExtractor.destroy();

  assertEquals(TikaContentExtractor.CORRUPT_FILE_TEXT, jCas.getDocumentText());
 }
}

/**
 * Checks that a {@link TikaContentExtractor} given {@code test.txt} produces the document text
 * {@code "Hello World\n"} and exactly four {@link Metadata} annotations.
 */
@Test
public void testTikaText() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TikaContentExtractor();
 // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
 // the file name, so the resource is still found when the build directory contains spaces.
 File f = new File(getClass().getResource("test.txt").toURI());
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
 }
 contentExtractor.destroy();
 assertEquals("Hello World\n", jCas.getDocumentText());
 assertEquals(4, JCasUtil.select(jCas, Metadata.class).size());
}

 /**
  * Checks that a {@link CsvContentExtractor} configured with content column {@code "20"} (while
  * only four column names are supplied) throws an {@link IOException} when processing {@code
  * test.csv}.
  */
 @Test
 public void testNotEnoughCols() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new CsvContentExtractor();

  // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
  // the file name, so the resource is still found when the build directory contains spaces.
  File f = new File(getClass().getResource("test.csv").toURI());

  Map<String, Object> config = new HashMap<>();
  config.put(CsvContentExtractor.PARAM_SEPARATOR, ",");
  config.put(CsvContentExtractor.PARAM_CONTENT_COLUMN, "20");
  config.put(CsvContentExtractor.PARAM_COLUMNS, Arrays.asList("id", "test1", "", "test3"));

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), config);
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
   fail("Expected error not thrown");
  } catch (IOException expected) {
   // This error is expected: reaching here is the passing path of the test.
  }
  contentExtractor.destroy();
 }
}

/**
 * Checks that a {@link TikaContentExtractor} given {@code wrappingLines.docx} emits each
 * paragraph as a single line of document text, even where the source sentence is long enough
 * to wrap.
 */
@Test
public void testTikaWrappingDocx() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TikaContentExtractor();
 // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
 // the file name, so the resource is still found when the build directory contains spaces.
 File f = new File(getClass().getResource("wrappingLines.docx").toURI());
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
 }
 contentExtractor.destroy();
 assertEquals(
   "Test Document\nThis is my test document, which has a sentence that is long enough to wrap over two lines but we want it to appear as a single line when we extract the content.\nThis is a second paragraph. This is a third sentence, but still the second paragraph. Super-cali-fragi-listic-expi-alo-docious.\n",
   jCas.getDocumentText());
}

/**
 * Initializes a {@link TestStructureContentExtractor} with the {@code MetaTags} content mapper
 * and checks that exactly one {@link Metadata} entry with key {@code baleen:content-mappers}
 * has a value mentioning {@code MetaTags}.
 */
@Test
public void testInitializingMapper() throws UIMAException, IOException {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor extractor = new TestStructureContentExtractor();

 Map<String, Object> settings = new HashMap<>();
 settings.put("contentMappers", new String[] {"MetaTags"});
 extractor.initialize(new CustomResourceSpecifier_impl(), settings);

 extractor.processStream(null, "source", jCas);

 // Count the metadata entries that record the configured mapper.
 long matches = 0;
 for (Metadata md : JCasUtil.select(jCas, Metadata.class)) {
  if (md.getKey().equals("baleen:content-mappers") && md.getValue().contains("MetaTags")) {
   matches++;
  }
 }
 assertEquals(1, matches);
}

/**
 * Checks that a {@link TearlineContentExtractor} given {@code tearline/notearline.docx} yields
 * the document text {@code "This document has no tearline."}.
 */
@Test
public void testNoTearline() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
 // the file name, so the resource is still found when the build directory contains spaces.
 File f = new File(getClass().getResource("tearline/notearline.docx").toURI());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
  assertEquals("This document has no tearline.", jCas.getDocumentText());
  jCas.reset();
 }
 contentExtractor.destroy();
}

/**
 * Initializes a {@link TestStructureContentExtractor} with the {@code RemoveEmptyText} content
 * manipulator and checks that exactly one {@link Metadata} entry with key {@code
 * baleen:content-manipulators} has a value mentioning {@code RemoveEmptyText}.
 */
@Test
public void testInitializingManipulator() throws UIMAException, IOException {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor extractor = new TestStructureContentExtractor();

 Map<String, Object> settings = new HashMap<>();
 settings.put("contentManipulators", new String[] {"RemoveEmptyText"});
 extractor.initialize(new CustomResourceSpecifier_impl(), settings);

 extractor.processStream(null, "source", jCas);

 // Narrow by key first, then by value, and count what is left.
 long matches =
   JCasUtil.select(jCas, Metadata.class)
     .stream()
     .filter(md -> md.getKey().equals("baleen:content-manipulators"))
     .filter(md -> md.getValue().contains("RemoveEmptyText"))
     .count();
 assertEquals(1, matches);
}

/**
 * Checks that a {@link TearlineContentExtractor} configured with the boilerplate pattern {@code
 * [aeiou]} yields {@code "Ths dcmnt hs n trln."} for {@code tearline/notearline.docx}.
 */
@Test
public void testBoilerplate() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 Map<String, Object> params = new HashMap<>();
 params.put("boilerplate", new String[] {"[aeiou]"});
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);
 // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
 // the file name, so the resource is still found when the build directory contains spaces.
 File f = new File(getClass().getResource("tearline/notearline.docx").toURI());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
  assertEquals("Ths dcmnt hs n trln.", jCas.getDocumentText());
  jCas.reset();
 }
 contentExtractor.destroy();
}

 /**
  * Checks that a {@link TearlineContentExtractor} configured with the tearline {@code "Customer
  * Form:"} yields {@code "This is the first tearline."} for {@code
  * tearline/customtearline.docx}.
  */
 @Test
 public void testCustomTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> params = new HashMap<>();
  params.put("tearline", "Customer Form:");

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);

  // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
  // the file name, so the resource is still found when the build directory contains spaces.
  File f = new File(getClass().getResource("tearline/customtearline.docx").toURI());

  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
   assertEquals("This is the first tearline.", jCas.getDocumentText());

   jCas.reset();
  }
  contentExtractor.destroy();
 }
}

/**
 * Runs a single {@link TearlineContentExtractor} over six sample documents (docx, doc and pdf)
 * and checks that each one yields {@code "This is the first tearline."}. The JCas is reset
 * after every file so the next document starts clean.
 */
@Test
public void testTearline() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 String[] files = new String[] {"1.docx", "2.docx", "3.docx", "4.docx", "5.doc", "6.pdf"};
 for (String file : files) {
  // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
  // the file name, so the resource is still found when the build directory contains spaces.
  File f = new File(getClass().getResource("tearline/" + file).toURI());
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
   assertEquals("This is the first tearline.", jCas.getDocumentText());
   jCas.reset();
  }
 }
 contentExtractor.destroy();
}

 /**
  * Initializes a {@link TestStructureContentExtractor} with {@link
  * StructureContentExtractor#FIELD_EXTRACT_TEXT_BLOCKS} set to {@code "false"} and checks that
  * the document text is still produced while no {@link Text} annotations are present.
  */
 @Test
 public void testDisableTextBlocks() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> settings = new HashMap<>();
  settings.put(StructureContentExtractor.FIELD_EXTRACT_TEXT_BLOCKS, "false");

  BaleenContentExtractor extractor = new TestStructureContentExtractor();
  extractor.initialize(new CustomResourceSpecifier_impl(), settings);
  extractor.processStream(null, "source", jCas);

  assertEquals("Title\nExample", jCas.getDocumentText());

  Collection<Text> textBlocks = JCasUtil.select(jCas, Text.class);
  assertTrue(textBlocks.isEmpty());
 }
}

/**
 * Runs a {@link TestStructureContentExtractor} with default settings and checks the document
 * text, that exactly one {@link Paragraph} spanning offsets 6 to 13 is created, and that three
 * {@link Metadata} entries have keys starting with {@code baleen:content-}.
 */
@Test
public void test() throws UIMAException, IOException {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 contentExtractor.processStream(null, "source", jCas);
 assertEquals("Title\nExample", jCas.getDocumentText());
 Collection<Paragraph> select = JCasUtil.select(jCas, Paragraph.class);
 // assertEquals takes (expected, actual); keeping that order makes failure messages read
 // correctly ("expected:<1> but was:<n>").
 assertEquals(1, select.size());
 Paragraph p = select.iterator().next();
 assertEquals(6, p.getBegin());
 assertEquals(13, p.getEnd());
 List<Metadata> contentMeta =
   JCasUtil.select(jCas, Metadata.class)
     .stream()
     .filter(m -> m.getKey().startsWith("baleen:content-"))
     .collect(Collectors.toList());
 assertEquals(3, contentMeta.size());
}

/**
 * Runs a {@link TestStructureContentExtractor} with default settings and checks that the
 * document text is produced and at least one {@link Text} annotation exists.
 */
@Test
public void testTextBlocksEnabled() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();

 BaleenContentExtractor extractor = new TestStructureContentExtractor();
 extractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 extractor.processStream(null, "source", jCas);

 assertEquals("Title\nExample", jCas.getDocumentText());

 Collection<Text> textBlocks = JCasUtil.select(jCas, Text.class);
 assertTrue(!textBlocks.isEmpty());
}

/**
 * Checks that a {@link TikaContentExtractor} given {@code test.docx} produces the expected
 * two-line document text and 44 {@link Metadata} annotations, including {@code Page-Count}
 * ({@code "1"}) and {@code meta:author} ({@code "James Baker"}).
 */
@Test
public void testTikaWord() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TikaContentExtractor();
 // toURI() decodes percent-escapes (e.g. %20 for a space) that URL.getPath() would leave in
 // the file name, so the resource is still found when the build directory contains spaces.
 File f = new File(getClass().getResource("test.docx").toURI());
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
 }
 contentExtractor.destroy();
 assertEquals(
   "Test Document\nThis is a simple test document, with a title and a single sentence.\n",
   jCas.getDocumentText());
 Collection<Metadata> metadata = JCasUtil.select(jCas, Metadata.class);
 assertEquals(44, metadata.size());
 // Flatten to key -> value so individual entries can be looked up by name.
 Map<String, String> metadataMap = new HashMap<>();
 for (Metadata md : metadata) {
  metadataMap.put(md.getKey(), md.getValue());
 }
 assertTrue(metadataMap.containsKey("Page-Count"));
 assertEquals("1", metadataMap.get("Page-Count"));
 assertTrue(metadataMap.containsKey("meta:author"));
 assertEquals("James Baker", metadataMap.get("meta:author"));
}

contentExtractor.processStream(is, f.getPath(), jCas);

Popular methods of BaleenContentExtractor

addToJCasIndex
Add an annotation to the JCas index, notifying UimaMonitor of the fact we have done so
createMonitor
createSupport
destroy
doDestroy
Called when the content extractor has finished and is closing down. Any open resources, for example,
doInitialize
Called when the content extractor is being initialized. Any required resources, for example, should
doProcessStream
Called when the content extractor is being asked to process an inputstream and extract the content.
getSupport
Gets the UimaSupport object associated with this ContentExtractor, for instance to log errors.
getUimaContext
initialize

Popular in Java

Running tasks concurrently on multiple threads
getExternalFilesDir (Context)
setScale (BigDecimal)
getResourceAsStream (ClassLoader)
InputStreamReader (java.io)
A class for turning a byte stream into a character stream. Data read from the source input stream is
URL (java.net)
A Uniform Resource Locator that identifies the location of an Internet resource as specified by RFC
Date (java.util)
A specific moment in time, with millisecond precision. Values typically come from System#currentTime
Random (java.util)
This class provides methods that return pseudo-random values. It is dangerous to seed Random with the
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
Get (org.apache.hadoop.hbase.client)
Used to perform Get operations on a single row. To get everything for a row, instantiate a Get objec
CodeWhisperer alternatives

How to use the processStream method in uk.gov.dstl.baleen.uima.BaleenContentExtractor

Best Java code snippets using uk.gov.dstl.baleen.uima.BaleenContentExtractor.processStream (Showing top 16 results out of 315)

How to use the processStream method in uk.gov.dstl.baleen.uima.BaleenContentExtractor