/**
 * Runs the {@code TearlineContentExtractor} over the {@code tearline/1.docx} test resource and
 * checks that at least one {@code Metadata} annotation ends up in the JCas.
 *
 * @throws Exception if the extractor cannot be initialised or the resource cannot be processed
 */
@Test
public void testMetadata() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  try {
    File f = new File(getClass().getResource("tearline/1.docx").getPath());
    try (InputStream is = new FileInputStream(f)) {
      contentExtractor.processStream(is, f.getPath(), jCas);
      assertFalse(JCasUtil.select(jCas, Metadata.class).isEmpty());
    }
  } finally {
    // Release the extractor even when processing or the assertion above fails.
    contentExtractor.destroy();
  }
}
/**
 * Feeds the {@code corrupt.docx} test resource through the {@code TikaContentExtractor} and
 * expects the document text to equal {@code TikaContentExtractor.CORRUPT_FILE_TEXT}.
 *
 * @throws Exception if the extractor cannot be initialised or the stream cannot be processed
 */
@Test
public void testTikaCorruptFile() throws Exception {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TikaContentExtractor();

  File corruptFile = new File(getClass().getResource("corrupt.docx").getPath());
  extractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  try (InputStream stream = new FileInputStream(corruptFile)) {
    extractor.processStream(stream, corruptFile.getPath(), cas);
  }
  extractor.destroy();

  assertEquals(TikaContentExtractor.CORRUPT_FILE_TEXT, cas.getDocumentText());
}
} // closes the enclosing scope that was opened outside this block
/**
 * Processes the {@code test.txt} resource with the {@code TikaContentExtractor} and checks both
 * the extracted text and the number of {@code Metadata} annotations created.
 *
 * @throws Exception if the extractor cannot be initialised or the stream cannot be processed
 */
@Test
public void testTikaText() throws Exception {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TikaContentExtractor();

  File textFile = new File(getClass().getResource("test.txt").getPath());
  extractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  try (InputStream stream = new FileInputStream(textFile)) {
    extractor.processStream(stream, textFile.getPath(), cas);
  }
  extractor.destroy();

  assertEquals("Hello World\n", cas.getDocumentText());

  // Exactly four metadata entries are expected for this plain-text resource.
  assertEquals(4, JCasUtil.select(cas, Metadata.class).size());
}
@Test public void testNotEnoughCols() throws Exception { JCas jCas = JCasSingleton.getJCasInstance(); BaleenContentExtractor contentExtractor = new CsvContentExtractor(); File f = new File(getClass().getResource("test.csv").getPath()); Map<String, Object> config = new HashMap<>(); config.put(CsvContentExtractor.PARAM_SEPARATOR, ","); config.put(CsvContentExtractor.PARAM_CONTENT_COLUMN, "20"); config.put(CsvContentExtractor.PARAM_COLUMNS, Arrays.asList("id", "test1", "", "test3")); contentExtractor.initialize(new CustomResourceSpecifier_impl(), config); try (InputStream is = new FileInputStream(f); ) { contentExtractor.processStream(is, f.getPath(), jCas); fail("Expected error not thrown"); } catch (IOException ioe) { // This error is expected } contentExtractor.destroy(); } }
/**
 * Processes {@code wrappingLines.docx} with the {@code TikaContentExtractor} and checks that the
 * extracted text consists of three newline-terminated lines: a title and two paragraphs.
 *
 * @throws Exception if the extractor cannot be initialised or the stream cannot be processed
 */
@Test
public void testTikaWrappingDocx() throws Exception {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TikaContentExtractor();

  File document = new File(getClass().getResource("wrappingLines.docx").getPath());
  extractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  try (InputStream stream = new FileInputStream(document)) {
    extractor.processStream(stream, document.getPath(), cas);
  }
  extractor.destroy();

  // One source line per expected output line; the concatenation is a compile-time constant.
  String expectedText =
      "Test Document\n"
          + "This is my test document, which has a sentence that is long enough to wrap over two lines but we want it to appear as a single line when we extract the content.\n"
          + "This is a second paragraph. This is a third sentence, but still the second paragraph. Super-cali-fragi-listic-expi-alo-docious.\n";
  assertEquals(expectedText, cas.getDocumentText());
}
/**
 * Initialises the {@code TestStructureContentExtractor} with the {@code contentMappers}
 * parameter set to {@code MetaTags} and checks that exactly one {@code Metadata} annotation has
 * the key {@code baleen:content-mappers} and a value containing {@code MetaTags}.
 *
 * @throws UIMAException if the JCas or the extractor cannot be set up
 * @throws IOException if processing fails
 */
@Test
public void testInitializingMapper() throws UIMAException, IOException {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TestStructureContentExtractor();

  Map<String, Object> settings = new HashMap<>();
  settings.put("contentMappers", new String[] {"MetaTags"});
  extractor.initialize(new CustomResourceSpecifier_impl(), settings);

  extractor.processStream(null, "source", cas);

  // Count the metadata entries that record the configured mapper.
  long matches = 0;
  for (Metadata entry : JCasUtil.select(cas, Metadata.class)) {
    if (entry.getKey().equals("baleen:content-mappers")
        && entry.getValue().contains("MetaTags")) {
      matches++;
    }
  }

  assertEquals(1, matches);
}
/**
 * Processes {@code tearline/notearline.docx} with a default-configured
 * {@code TearlineContentExtractor} and expects the document text
 * "This document has no tearline.".
 *
 * @throws Exception if the extractor cannot be initialised or the resource cannot be processed
 */
@Test
public void testNoTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  try {
    File f = new File(getClass().getResource("tearline/notearline.docx").getPath());
    try (InputStream is = new FileInputStream(f)) {
      contentExtractor.processStream(is, f.getPath(), jCas);
      assertEquals("This document has no tearline.", jCas.getDocumentText());
    } finally {
      // Reset the JCas even when the assertion fails, so its content is not left behind.
      jCas.reset();
    }
  } finally {
    // Release the extractor on every path, including failures.
    contentExtractor.destroy();
  }
}
/**
 * Initialises the {@code TestStructureContentExtractor} with the {@code contentManipulators}
 * parameter set to {@code RemoveEmptyText} and checks that exactly one {@code Metadata}
 * annotation has the key {@code baleen:content-manipulators} and a value containing
 * {@code RemoveEmptyText}.
 *
 * @throws UIMAException if the JCas or the extractor cannot be set up
 * @throws IOException if processing fails
 */
@Test
public void testInitializingManipulator() throws UIMAException, IOException {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TestStructureContentExtractor();

  Map<String, Object> settings = new HashMap<>();
  settings.put("contentManipulators", new String[] {"RemoveEmptyText"});
  extractor.initialize(new CustomResourceSpecifier_impl(), settings);

  extractor.processStream(null, "source", cas);

  // Count the metadata entries that record the configured manipulator.
  long matches = 0;
  for (Metadata entry : JCasUtil.select(cas, Metadata.class)) {
    boolean keyMatches = entry.getKey().equals("baleen:content-manipulators");
    if (keyMatches && entry.getValue().contains("RemoveEmptyText")) {
      matches++;
    }
  }

  assertEquals(1, matches);
}
/**
 * Configures the {@code TearlineContentExtractor} with the {@code boilerplate} entry
 * {@code [aeiou]}, processes {@code tearline/notearline.docx}, and expects the document text
 * "Ths dcmnt hs n trln.".
 *
 * @throws Exception if the extractor cannot be initialised or the resource cannot be processed
 */
@Test
public void testBoilerplate() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> params = new HashMap<>();
  params.put("boilerplate", new String[] {"[aeiou]"});

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);

  try {
    File f = new File(getClass().getResource("tearline/notearline.docx").getPath());
    try (InputStream is = new FileInputStream(f)) {
      contentExtractor.processStream(is, f.getPath(), jCas);
      assertEquals("Ths dcmnt hs n trln.", jCas.getDocumentText());
    } finally {
      // Reset the JCas even when the assertion fails, so its content is not left behind.
      jCas.reset();
    }
  } finally {
    // Release the extractor on every path, including failures.
    contentExtractor.destroy();
  }
}
/**
 * Configures the {@code TearlineContentExtractor} with the {@code tearline} parameter
 * "Customer Form:", processes {@code tearline/customtearline.docx}, and expects the document
 * text "This is the first tearline.".
 *
 * @throws Exception if the extractor cannot be initialised or the resource cannot be processed
 */
@Test
public void testCustomTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> params = new HashMap<>();
  params.put("tearline", "Customer Form:");

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);

  try {
    File f = new File(getClass().getResource("tearline/customtearline.docx").getPath());
    try (InputStream is = new FileInputStream(f)) {
      contentExtractor.processStream(is, f.getPath(), jCas);
      assertEquals("This is the first tearline.", jCas.getDocumentText());
    } finally {
      // Reset the JCas even when the assertion fails, so its content is not left behind.
      jCas.reset();
    }
  } finally {
    // Release the extractor on every path, including failures.
    contentExtractor.destroy();
  }
}
} // closes the enclosing scope that was opened outside this block
/**
 * Processes each of the six {@code tearline/} test resources (four .docx, one .doc, one .pdf)
 * with a default-configured {@code TearlineContentExtractor} and expects every one to yield the
 * document text "This is the first tearline.". The JCas is reset between files.
 *
 * @throws Exception if the extractor cannot be initialised or a resource cannot be processed
 */
@Test
public void testTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  String[] files = new String[] {"1.docx", "2.docx", "3.docx", "4.docx", "5.doc", "6.pdf"};

  try {
    for (String file : files) {
      File f = new File(getClass().getResource("tearline/" + file).getPath());
      try (InputStream is = new FileInputStream(f)) {
        contentExtractor.processStream(is, f.getPath(), jCas);
        assertEquals("This is the first tearline.", jCas.getDocumentText());
      } finally {
        // Always clear the JCas before the next file (or before leaving on failure).
        jCas.reset();
      }
    }
  } finally {
    // Release the extractor on every path, including a failed assertion inside the loop.
    contentExtractor.destroy();
  }
}
/**
 * Initialises the {@code TestStructureContentExtractor} with
 * {@code StructureContentExtractor.FIELD_EXTRACT_TEXT_BLOCKS} set to "false" and checks that the
 * document text is still "Title\nExample" while no {@code Text} annotations are created.
 *
 * @throws Exception if the extractor cannot be initialised or processing fails
 */
@Test
public void testDisableTextBlocks() throws Exception {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TestStructureContentExtractor();

  Map<String, Object> settings = new HashMap<>();
  settings.put(StructureContentExtractor.FIELD_EXTRACT_TEXT_BLOCKS, "false");
  extractor.initialize(new CustomResourceSpecifier_impl(), settings);

  extractor.processStream(null, "source", cas);

  assertEquals("Title\nExample", cas.getDocumentText());

  Collection<Text> textBlocks = JCasUtil.select(cas, Text.class);
  assertTrue(textBlocks.isEmpty());
}
} // closes the enclosing scope that was opened outside this block
/**
 * Runs the {@code TestStructureContentExtractor} with default settings and checks the extracted
 * text, the single {@code Paragraph} annotation and the number of {@code Metadata} annotations
 * whose key starts with {@code baleen:content-}.
 *
 * @throws UIMAException if the JCas or the extractor cannot be set up
 * @throws IOException if processing fails
 */
@Test
public void test() throws UIMAException, IOException {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  contentExtractor.processStream(null, "source", jCas);

  assertEquals("Title\nExample", jCas.getDocumentText());

  // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
  Collection<Paragraph> select = JCasUtil.select(jCas, Paragraph.class);
  assertEquals(1, select.size());

  // Offsets 6..13 of "Title\nExample" span the word "Example".
  Paragraph p = select.iterator().next();
  assertEquals(6, p.getBegin());
  assertEquals(13, p.getEnd());

  List<Metadata> contentMeta =
      JCasUtil.select(jCas, Metadata.class)
          .stream()
          .filter(m -> m.getKey().startsWith("baleen:content-"))
          .collect(Collectors.toList());
  assertEquals(3, contentMeta.size());
}
/**
 * Runs the {@code TestStructureContentExtractor} with an empty configuration and checks that the
 * document text is "Title\nExample" and that at least one {@code Text} annotation exists.
 *
 * @throws Exception if the extractor cannot be initialised or processing fails
 */
@Test
public void testTextBlocksEnabled() throws Exception {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TestStructureContentExtractor();
  extractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  extractor.processStream(null, "source", cas);

  assertEquals("Title\nExample", cas.getDocumentText());

  Collection<Text> textBlocks = JCasUtil.select(cas, Text.class);
  boolean hasTextBlocks = !textBlocks.isEmpty();
  assertTrue(hasTextBlocks);
}
/**
 * Processes {@code test.docx} with the {@code TikaContentExtractor} and checks the extracted
 * text, the total number of {@code Metadata} annotations, and the values recorded under the
 * {@code Page-Count} and {@code meta:author} keys.
 *
 * @throws Exception if the extractor cannot be initialised or the stream cannot be processed
 */
@Test
public void testTikaWord() throws Exception {
  JCas cas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor extractor = new TikaContentExtractor();

  File wordFile = new File(getClass().getResource("test.docx").getPath());
  extractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  try (InputStream stream = new FileInputStream(wordFile)) {
    extractor.processStream(stream, wordFile.getPath(), cas);
  }
  extractor.destroy();

  assertEquals(
      "Test Document\nThis is a simple test document, with a title and a single sentence.\n",
      cas.getDocumentText());

  Collection<Metadata> annotations = JCasUtil.select(cas, Metadata.class);
  assertEquals(44, annotations.size());

  // Index the annotations by key; a later entry with the same key overwrites an earlier one.
  Map<String, String> valuesByKey = new HashMap<>();
  for (Metadata annotation : annotations) {
    valuesByKey.put(annotation.getKey(), annotation.getValue());
  }

  assertTrue(valuesByKey.containsKey("Page-Count"));
  assertEquals("1", valuesByKey.get("Page-Count"));

  assertTrue(valuesByKey.containsKey("meta:author"));
  assertEquals("James Baker", valuesByKey.get("meta:author"));
}
contentExtractor.processStream(is, f.getPath(), jCas);