/**
 * Builds a {@link Metadata} annotation from a key/value pair and adds it to the JCas index.
 *
 * @param jCas The JCas object the annotation is created against
 * @param name The metadata key
 * @param value The metadata value
 * @return the annotation that was indexed, or {@code null} when {@code Strings.isNullOrEmpty}
 *     rejects either the key or the value (nothing is indexed in that case)
 */
protected Metadata addMetadata(JCas jCas, String name, String value) {
  // Nothing to record unless both the key and the value are present.
  if (Strings.isNullOrEmpty(name) || Strings.isNullOrEmpty(value)) {
    return null;
  }

  Metadata metadata = new Metadata(jCas);
  metadata.setKey(name);
  metadata.setValue(value);
  addToJCasIndex(metadata);
  return metadata;
}
}
/**
 * Runs the superclass initialisation, then creates the monitor and support objects for this
 * pipeline and hands control to {@code doInitialize}, bracketed by monitor start/finish calls.
 *
 * @param specifier the resource specifier, passed through to the superclass
 * @param additionalParams extra parameters, passed to the superclass and to {@code doInitialize}
 * @return whatever the superclass {@code initialize} returned
 * @throws ResourceInitializationException if initialisation fails
 */
@Override
public final boolean initialize(ResourceSpecifier specifier, Map<String, Object> additionalParams)
    throws ResourceInitializationException {
  final String functionName = "initialize";

  final boolean initialized = super.initialize(specifier, additionalParams);

  final UimaContext uimaContext = getUimaContext();
  final String pipeline = UimaUtils.getPipelineName(uimaContext);
  monitor = createMonitor(pipeline);
  support = createSupport(pipeline, uimaContext);

  monitor.startFunction(functionName);
  doInitialize(uimaContext, additionalParams);
  monitor.finishFunction(functionName);

  return initialized;
}
/**
 * Tears the component down by delegating to {@code doDestroy}, reporting the start and the end of
 * the "destroy" function to the monitor.
 */
@Override
public final void destroy() {
  final String functionName = "destroy";

  monitor.startFunction(functionName);
  doDestroy();
  monitor.finishFunction(functionName);
}
/**
 * Creates the monitor and support objects for the pipeline named by the context, then delegates
 * to {@code doInitialize}, bracketed by monitor start/finish calls.
 *
 * @param context the UIMA context used to look up the pipeline name
 * @param params initialisation parameters, passed to {@code doInitialize}
 * @throws ResourceInitializationException if initialisation fails
 */
@Override
public final void initialize(UimaContext context, Map<String, Object> params)
    throws ResourceInitializationException {
  final String functionName = "initialize";

  final String pipeline = UimaUtils.getPipelineName(context);
  monitor = createMonitor(pipeline);
  support = createSupport(pipeline, context);

  monitor.startFunction(functionName);
  doInitialize(context, params);
  monitor.finishFunction(functionName);
}
/**
 * Processes the tearline sample document and checks that at least one {@link Metadata}
 * annotation ends up in the JCas.
 */
@Test
public void testMetadata() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  try {
    // URL.getPath() leaves percent-escapes (e.g. %20) in place, so the file cannot be opened when
    // the project lives in a directory containing spaces; going through a URI decodes them.
    File f = new File(getClass().getResource("tearline/1.docx").toURI());

    try (InputStream is = new FileInputStream(f)) {
      contentExtractor.processStream(is, f.getPath(), jCas);
      assertFalse(JCasUtil.select(jCas, Metadata.class).isEmpty());
    }
  } finally {
    // Release the extractor even when processing or the assertion fails.
    contentExtractor.destroy();
  }
}
/**
 * Initialises the extractor with the "MetaTags" content mapper and checks that exactly one
 * "baleen:content-mappers" metadata annotation mentioning it is produced.
 */
@Test
public void testInitializingMapper() throws UIMAException, IOException {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> settings = new HashMap<>();
  settings.put("contentMappers", new String[] {"MetaTags"});

  BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), settings);
  contentExtractor.processStream(null, "source", jCas);

  // Count the metadata entries that record the configured mapper.
  long matches = 0;
  for (Metadata metadata : JCasUtil.select(jCas, Metadata.class)) {
    if (metadata.getKey().equals("baleen:content-mappers")
        && metadata.getValue().contains("MetaTags")) {
      matches++;
    }
  }

  assertEquals(1, matches);
}
/**
 * Adds one or more annotations to the JCas index by handing them to the support object returned
 * from {@code getSupport()} (which, per the original documentation, also notifies UimaMonitor).
 *
 * @param annotations Annotation(s) to add
 */
protected void addToJCasIndex(Annotation... annotations) {
  this.getSupport().add(annotations);
}
/**
 * Processes a single input stream by delegating to {@code doProcessStream}, reporting the start
 * and the end of the "process" function to the monitor and then persisting the monitor's counts.
 *
 * @param stream the content to process
 * @param source identifier of where the content came from
 * @param jCas the JCas handed to {@code doProcessStream}
 * @throws IOException if {@code doProcessStream} fails to read the stream
 */
@Override
public final void processStream(InputStream stream, String source, JCas jCas)
    throws IOException {
  final String functionName = "process";

  monitor.startFunction(functionName);
  doProcessStream(stream, source, jCas);
  monitor.finishFunction(functionName);

  monitor.persistCounts();
}
@Test public void testInitializingManipulatorAsMapper() throws UIMAException, IOException { BaleenContentExtractor contentExtractor = new TestStructureContentExtractor(); Map<String, Object> params = new HashMap<>(); params.put( "contentMappers", new String[] {"uk.gov.dstl.baleen.contentmanipulators.HeaderAndFooterRemover"}); contentExtractor.initialize(new CustomResourceSpecifier_impl(), params); // TODO Could test its not actually used here... }
/**
 * Feeds a corrupt .docx file through the Tika extractor and checks that the document text is set
 * to {@code TikaContentExtractor.CORRUPT_FILE_TEXT}.
 */
@Test
public void testTikaCorruptFile() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // URL.getPath() leaves percent-escapes (e.g. %20) in place, so the file cannot be opened when
  // the project lives in a directory containing spaces; going through a URI decodes them.
  File f = new File(getClass().getResource("corrupt.docx").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
    // Release the extractor even when processing fails.
    contentExtractor.destroy();
  }

  assertEquals(TikaContentExtractor.CORRUPT_FILE_TEXT, jCas.getDocumentText());
}
}
/**
 * Initialises the extractor with the "RemoveEmptyText" content manipulator and checks that exactly
 * one "baleen:content-manipulators" metadata annotation mentioning it is produced.
 */
@Test
public void testInitializingManipulator() throws UIMAException, IOException {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> settings = new HashMap<>();
  settings.put("contentManipulators", new String[] {"RemoveEmptyText"});

  BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), settings);
  contentExtractor.processStream(null, "source", jCas);

  // Count the metadata entries that record the configured manipulator.
  long matches = 0;
  for (Metadata metadata : JCasUtil.select(jCas, Metadata.class)) {
    if (metadata.getKey().equals("baleen:content-manipulators")
        && metadata.getValue().contains("RemoveEmptyText")) {
      matches++;
    }
  }

  assertEquals(1, matches);
}
/**
 * Adds a collection of annotations to the JCas index by handing them to the support object
 * returned from {@code getSupport()} (which, per the original documentation, also notifies
 * UimaMonitor).
 *
 * @param annotations Annotation(s) to add
 */
protected void addToJCasIndex(Collection<? extends Annotation> annotations) {
  this.getSupport().add(annotations);
}
/**
 * Processes a single input stream by delegating to {@code doProcessStream}, reporting the start
 * and the end of the "process" function to the monitor and then persisting the monitor's counts.
 *
 * @param stream the content to process
 * @param source identifier of where the content came from
 * @param jCas the JCas handed to {@code doProcessStream}
 * @throws IOException if {@code doProcessStream} fails to read the stream
 */
@Override
public final void processStream(InputStream stream, String source, JCas jCas)
    throws IOException {
  final String functionName = "process";

  monitor.startFunction(functionName);
  doProcessStream(stream, source, jCas);
  monitor.finishFunction(functionName);

  monitor.persistCounts();
}
/**
 * Checks that initialisation is rejected with a {@link ResourceInitializationException} when the
 * "contentMappers" parameter names a mapper that does not exist.
 */
@Test(expected = ResourceInitializationException.class)
public void testInitializingBadMapper() throws UIMAException, IOException {
  String[] mapperNames = {"DoesNotExist"};

  Map<String, Object> settings = new HashMap<>();
  settings.put("contentMappers", mapperNames);

  BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), settings);
}
/**
 * Feeds a plain-text file through the Tika extractor and checks both the extracted document text
 * and the number of {@link Metadata} annotations produced.
 */
@Test
public void testTikaText() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // URL.getPath() leaves percent-escapes (e.g. %20) in place, so the file cannot be opened when
  // the project lives in a directory containing spaces; going through a URI decodes them.
  File f = new File(getClass().getResource("test.txt").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
    // Release the extractor even when processing fails.
    contentExtractor.destroy();
  }

  assertEquals("Hello World\n", jCas.getDocumentText());
  assertEquals(4, JCasUtil.select(jCas, Metadata.class).size());
}
/**
 * Initialises the extractor with text-block extraction switched off and checks that the document
 * text is still produced while no {@link Text} annotations are created.
 */
@Test
public void testDisableTextBlocks() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> settings = new HashMap<>();
  settings.put(StructureContentExtractor.FIELD_EXTRACT_TEXT_BLOCKS, "false");

  BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), settings);
  contentExtractor.processStream(null, "source", jCas);

  assertEquals("Title\nExample", jCas.getDocumentText());

  Collection<Text> textBlocks = JCasUtil.select(jCas, Text.class);
  assertTrue(textBlocks.isEmpty());
}
}
/**
 * Adds one or more annotations to the JCas index by handing them to the support object returned
 * from {@code getSupport()} (which, per the original documentation, also notifies UimaMonitor).
 *
 * @param annotations Annotation(s) to add
 */
protected void addToJCasIndex(Annotation... annotations) {
  this.getSupport().add(annotations);
}
/**
 * Tears the component down by delegating to {@code doDestroy}, reporting the start and the end of
 * the "destroy" function to the monitor.
 */
@Override
public final void destroy() {
  final String functionName = "destroy";

  monitor.startFunction(functionName);
  doDestroy();
  monitor.finishFunction(functionName);
}
/**
 * Builds a {@link Metadata} annotation from a key/value pair and adds it to the JCas index.
 *
 * @param jCas The JCas object the annotation is created against
 * @param name The metadata key
 * @param value The metadata value
 * @return the annotation that was indexed, or {@code null} when {@code Strings.isNullOrEmpty}
 *     rejects either the key or the value (nothing is indexed in that case)
 */
protected Metadata addMetadata(JCas jCas, String name, String value) {
  // Nothing to record unless both the key and the value are present.
  if (Strings.isNullOrEmpty(name) || Strings.isNullOrEmpty(value)) {
    return null;
  }

  Metadata metadata = new Metadata(jCas);
  metadata.setKey(name);
  metadata.setValue(value);
  addToJCasIndex(metadata);
  return metadata;
}
}
/**
 * Feeds a .docx file containing visually wrapped lines through the Tika extractor and checks that
 * each paragraph is extracted as a single line of text.
 */
@Test
public void testTikaWrappingDocx() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // URL.getPath() leaves percent-escapes (e.g. %20) in place, so the file cannot be opened when
  // the project lives in a directory containing spaces; going through a URI decodes them.
  File f = new File(getClass().getResource("wrappingLines.docx").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
    // Release the extractor even when processing fails.
    contentExtractor.destroy();
  }

  assertEquals(
      "Test Document\nThis is my test document, which has a sentence that is long enough to wrap over two lines but we want it to appear as a single line when we extract the content.\nThis is a second paragraph. This is a third sentence, but still the second paragraph. Super-cali-fragi-listic-expi-alo-docious.\n",
      jCas.getDocumentText());
}