/**
 * Parses {@code file} with Tika and packages the result as a {@link TrecDocument}.
 *
 * <p>The document is built from three pieces: the value stored under
 * {@code TikaCoreProperties.RESOURCE_NAME_KEY} in the metadata, the extracted text,
 * and the {@code TikaCoreProperties.CREATED} date read from the same metadata.
 *
 * <p>NOTE(review): nothing in this method sets {@code RESOURCE_NAME_KEY} on the
 * metadata before parsing; confirm the parser populates it, otherwise the first
 * constructor argument will be {@code null}.
 *
 * @param file the file to extract text and metadata from
 * @return a document holding the resource name, extracted text and creation date
 * @throws FileNotFoundException if {@code file} cannot be opened for reading
 * @throws IOException if reading the file fails
 * @throws TikaException if Tika fails to parse the content
 */
public TrecDocument summarize(File file) throws FileNotFoundException, IOException, TikaException {
    final Tika extractor = new Tika();
    final Metadata metadata = new Metadata();
    final FileInputStream input = new FileInputStream(file);

    // Parsing fills `metadata` as a side effect; the getters below read what it collected.
    final String text = extractor.parseToString(input, metadata);
    final String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);

    return new TrecDocument(resourceName, text, metadata.getDate(TikaCoreProperties.CREATED));
}
/**
 * Writes 1000 randomly offset timestamps into fresh {@code Metadata} instances and
 * checks that each one reads back, via {@code getDate}, within two seconds of the
 * instant that was written.
 *
 * @return always {@code 1} once every iteration has passed its assertion
 * @throws Exception if an assertion fails or metadata access throws
 */
@Override
public Integer call() throws Exception {
    // The formatter is local to this invocation, so it is never shared between
    // threads; building it once avoids 1000 identical allocations.
    DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
    // The pattern appends a literal 'Z', so the formatter has to be switched to UTC
    // BEFORE format() is called. Setting the zone afterwards (as the code previously
    // did) renders local wall-clock time labelled as UTC, which shifts the value by
    // the JVM's zone offset and breaks the round-trip check on non-UTC machines.
    df.setTimeZone(TimeZone.getTimeZone("UTC"));

    for (int i = 0; i < 1000; i++) {
        Metadata m = new Metadata();

        // Pick an instant up to 1,000,000 ms after "now".
        long start = System.currentTimeMillis();
        start += random.nextInt(1000000);
        Date now = new Date(start);

        m.set(TikaCoreProperties.CREATED, df.format(now));

        // The pattern drops milliseconds, hence the tolerance rather than equality.
        assertTrue(Math.abs(now.getTime() - m.getDate(TikaCoreProperties.CREATED).getTime()) < 2000);
    }
    return 1;
}
}
assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(null, meta.getDate(TikaCoreProperties.CREATED)); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(null, meta.getInt(Metadata.BITS_PER_SAMPLE)); assertEquals(null, meta.getInt(TikaCoreProperties.CREATED)); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(12*hour, meta.getDate(TikaCoreProperties.CREATED).getTime()); assertEquals(12*hour, meta.getDate(TikaCoreProperties.CREATED).getTime());
/**
 * Stores the first available value among {@code tikaProperty} in {@code props}
 * under the key {@code jcrDCProp}.
 *
 * <p>Properties are tried in array order; the first one with a non-null value wins
 * and the method returns immediately. If none has a value, {@code props} is left
 * untouched.
 *
 * <p>For {@code DublinCore.DATE}, {@code MSOffice.LAST_SAVED} and
 * {@code MSOffice.CREATION_DATE} the stored value is the {@code toString()} of the
 * parsed date. When the value cannot be parsed as a date, the raw string is stored
 * instead of failing.
 *
 * @param metadata     source of the property values
 * @param props        target the chosen value is put into
 * @param jcrDCProp    key under which the value is stored
 * @param tikaProperty candidate properties, in priority order
 */
private void convertProperty(Metadata metadata, Properties props, QName jcrDCProp, Property[] tikaProperty) {
    for (Property property : tikaProperty) {
        String value = (String) metadata.get(property);
        if (value == null) {
            continue;
        }

        boolean isDateProperty = property.equals(DublinCore.DATE)
                || property.equals(MSOffice.LAST_SAVED)
                || property.equals(MSOffice.CREATION_DATE);
        if (isDateProperty) {
            // getDate() yields null for a value it cannot parse; dereferencing it
            // unconditionally used to throw a NullPointerException here.
            java.util.Date parsed = metadata.getDate(property);
            if (parsed != null) {
                value = parsed.toString();
            }
        }

        props.put(jcrDCProp, value);
        return;
    }
}
}
private void transformDate(final String name, final ValueConsumer consumer) throws IOException { final Date date = metadata.getDate(dateProperties.get(name)); Instant instant = null; if (null != date) { instant = date.toInstant(); } else { // Try some other formats. for (DateTimeFormatter format: dateFormats) { final TemporalAccessor accessor; try { accessor = format.parseBest(metadata.get(name), Instant::from, LocalDateTime::from); } catch (final DateTimeParseException e) { continue; } if (accessor instanceof Instant) { instant = (Instant) accessor; } else if (accessor instanceof LocalDateTime) { // Default to UTC for dates with not time zone. instant = ((LocalDateTime) accessor).toInstant(ZoneOffset.UTC); } break; } } if (null != instant) { consumer.accept(fields.forMetadataISODate(name), instant.toString()); } else { throw new IOException(String.format("Unable to parse date \"%s\" from field " + "\"%s\" for ISO 8601 formatting.", metadata.get(name), name)); } }
Date date = dummy.getDate(DublinCore.DATE); //access parseDate(..) if(date != null){ //now use the Clerezza Literal factory object = lf.createTypedLiteral(date);