final Pattern metadataKeyFilter = metadataKeyFilterRef.get(); final StringBuilder dataBuilder = new StringBuilder(); for (final String key : metadata.names()) { if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) { continue;
/**
 * Supplies the metadata keys in the order in which they are to be serialized.
 * This implementation sorts them by the natural {@code String} ordering; override
 * it to impose a custom order or to filter keys out.
 *
 * @param metadata metadata from which to take the keys
 * @return the keys, in serialization order
 */
protected String[] getNames(Metadata metadata) {
    final String[] sortedKeys = metadata.names();
    Arrays.sort(sortedKeys);
    return sortedKeys;
}
}
/**
 * Returns the metadata keys ordered by a {@link PrettyMetadataKeyComparator}.
 *
 * @param m metadata from which to take the keys
 * @return the keys of {@code m}, sorted with the comparator
 */
@Override
public String[] getNames(Metadata m) {
    final String[] orderedKeys = m.names();
    final PrettyMetadataKeyComparator keyOrder = new PrettyMetadataKeyComparator();
    Arrays.sort(orderedKeys, keyOrder);
    return orderedKeys;
}
}
/**
 * Counts every value held by a metadata object, summed over all of its keys.
 *
 * @param m metadata to inspect; may be {@code null}
 * @return the total number of values, or 0 when {@code m} is {@code null}
 */
int countMetadataValues(Metadata m) {
    int total = 0;
    if (m != null) {
        final String[] keys = m.names();
        for (int k = 0; k < keys.length; k++) {
            total += m.getValues(keys[k]).length;
        }
    }
    return total;
}
/**
 * At the end of the document, emits the metadata with its keys in sorted order,
 * flushes the writer and records in {@code metOutput} that the metadata was written.
 */
@Override
public void endDocument() {
    final String[] sortedKeys = metadata.names();
    Arrays.sort(sortedKeys);
    outputMetadata(sortedKeys);
    writer.flush();
    metOutput = true;
}
@Override @SuppressWarnings("resource") public void writeTo(Metadata metadata, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream) throws IOException, WebApplicationException { if (metadata.names().length != 1) { throw new WebApplicationException("Metadata object must only have one entry!"); } Writer writer = new OutputStreamWriter(entityStream, UTF_8); for (String name : metadata.names()) { writer.write(metadata.get(name)); } // Don't close, just flush the stream writer.flush(); } }
/**
 * Renders every name/value pair as {@code name=value}, with pairs separated by a
 * single space. A name that has several values appears once per value.
 *
 * @return the rendered pairs; an empty string when there is nothing to render
 */
@Override
public String toString() {
    // The buffer never leaves this method, so the unsynchronized StringBuilder
    // is sufficient; StringBuffer's locking bought nothing here.
    StringBuilder buf = new StringBuilder();
    for (String name : names()) {
        for (String value : _getValues(name)) {
            if (buf.length() > 0) {
                buf.append(" ");
            }
            buf.append(name).append("=").append(value);
        }
    }
    return buf.toString();
}
/**
 * Looks for an entry of {@code metadataListB} that shares a digest value with
 * {@code metadata}.
 *
 * Only keys starting with {@code DIGEST_KEY_PREFIX} are considered. Keys are tried in
 * the order returned by {@code metadata.names()}; for each key the list is scanned from
 * the front, and the first entry holding an equal value under the same key wins.
 *
 * @param metadata      metadata whose digest values are looked up
 * @param metadataListB candidates to compare against
 * @return index into {@code metadataListB} of the first match, or -1 if none matches
 */
private int findMatchingDigests(Metadata metadata, List<Metadata> metadataListB) {
    for (String n : metadata.names()) {
        if (!n.startsWith(DIGEST_KEY_PREFIX)) {
            continue;
        }
        String digestA = metadata.get(n);
        if (digestA == null) {
            // A missing digest can never equal anything; skip scanning the list.
            continue;
        }
        for (int i = 0; i < metadataListB.size(); i++) {
            if (digestA.equals(metadataListB.get(i).get(n))) {
                return i;
            }
        }
    }
    return -1;
}
/**
 * Writes a metadata object as CSV, one row per key: the key in the first column
 * followed by each of its values.
 *
 * The CSV writer (and with it {@code outputStream}) is closed when this method
 * returns, including when a write fails part-way through.
 *
 * @param metadata     metadata to serialize
 * @param outputStream stream that receives the UTF-8 encoded CSV
 * @throws IOException if writing or closing fails
 */
public static void metadataToCsv(Metadata metadata, OutputStream outputStream) throws IOException {
    // try-with-resources guarantees close() even if a write throws.
    try (CSVWriter writer = new CSVWriter(new OutputStreamWriter(outputStream, UTF_8))) {
        for (String name : metadata.names()) {
            String[] values = metadata.getValues(name);
            // Row layout: [name, value0, value1, ...]
            String[] row = new String[values.length + 1];
            row[0] = name;
            System.arraycopy(values, 0, row, 1, values.length);
            writer.writeNext(row);
        }
    }
}
@Override @SuppressWarnings("resource") public void writeTo(Metadata metadata, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream) throws IOException, WebApplicationException { CSVWriter writer = new CSVWriter(new OutputStreamWriter(entityStream, UTF_8)); for (String name : metadata.names()) { String[] values = metadata.getValues(name); ArrayList<String> list = new ArrayList<String>(values.length + 1); list.add(name); list.addAll(Arrays.asList(values)); writer.writeNext(list.toArray(values)); } // Don't close, just flush the stream writer.flush(); } }
/**
 * Parses a file with Tika and indexes its metadata as one document: every
 * metadata value becomes a stored {@code TextField} named after its key.
 *
 * @param file file to parse and index
 * @throws Exception if opening, parsing or indexing the file fails
 */
public void indexContentSpecificMet(File file) throws Exception {
    Metadata met = new Metadata();
    try (InputStream is = new FileInputStream(file)) {
        tika.parse(is, met);
        Document document = new Document();
        for (String key : met.names()) {
            for (String val : met.getValues(key)) {
                document.add(new TextField(key, val, Store.YES));
            }
        }
        // Add the document exactly once, after all fields are collected. Previously this
        // call sat inside the per-key loop, so the same growing document was indexed
        // once per metadata key.
        writer.addDocument(document);
    }
}
/**
 * Runs {@code parse} into a fresh {@link Metadata} instance and turns each resulting
 * key into a {@link RecognisedObject}. The key is passed as both the first and third
 * constructor argument, {@code "eng"} as the second, and the key's value, parsed as a
 * double, as the fourth.
 *
 * The {@code metadata} argument is not read or written by this method itself.
 *
 * @return one recognised object per key produced by the parse
 * @throws NumberFormatException if a value is not a parseable double
 */
@Override
public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final Metadata parsed = new Metadata();
    parse(stream, handler, parsed, context);

    final List<RecognisedObject> recognised = new ArrayList<>();
    for (String label : parsed.names()) {
        final String rawConfidence = parsed.get(label);
        final double confidence = Double.parseDouble(rawConfidence);
        recognised.add(new RecognisedObject(label, "eng", label, confidence));
    }
    return recognised;
}
}
/**
 * Prints every name/value pair of a metadata object to standard output,
 * one {@code name : value} line per value.
 *
 * @param metadata metadata to dump
 */
public static void debug(Metadata metadata) {
    for (String key : metadata.names()) {
        final String[] values = metadata.getValues(key);
        for (int v = 0; v < values.length; v++) {
            System.out.println(key + " : " + values[v]);
        }
    }
}
}
/**
 * Prints every name/value pair of each metadata object in a list to standard output.
 * Each line is prefixed with the zero-based position of the object in iteration order:
 * {@code position: name : value}.
 *
 * @param list metadata objects to dump
 */
public static void debug(List<Metadata> list) {
    int position = 0;
    for (Metadata entry : list) {
        final String prefix = position + ": ";
        for (String key : entry.names()) {
            for (String value : entry.getValues(key)) {
                System.out.println(prefix + key + " : " + value);
            }
        }
        position++;
    }
}
/**
 * Parses an in-memory document and returns its metadata as a map from each
 * metadata name to the value returned by {@code Metadata.get(name)}.
 *
 * @param file raw bytes of the document to parse
 * @return a new map with one entry per metadata name
 * @throws Exception if parsing fails or the input stream cannot be closed
 */
public static Map<String, String> handleStreamMetaDate(byte[] file) throws Exception {
    Map<String, String> meta = new HashMap<>();
    Metadata md = new Metadata();
    StringWriter textBuffer = new StringWriter();
    ContentHandler handler = new TeeContentHandler(
            getTextContentHandler(textBuffer)
    );
    // Close the TikaInputStream once parsing is done; it was previously left open.
    try (TikaInputStream input = TikaInputStream.get(file, md)) {
        parser.parse(input, handler, md, context);
    }
    // No sorting needed: the result is a HashMap, which does not keep insertion order.
    for (String name : md.names()) {
        meta.put(name, md.get(name));
    }
    return meta;
}
/**
 * Builds a UTF-8 text dump of a metadata object: one {@code name : value} line per
 * value, followed by a {@code CONTENT:} section holding the handler's string form,
 * unless the handler's class is exactly {@link DefaultHandler}.
 *
 * @param contentHandler handler whose {@code toString()} supplies the content section
 * @param metadata       metadata to dump
 * @return the dump encoded as UTF-8 bytes
 */
private byte[] toString(ContentHandler contentHandler, Metadata metadata) {
    final StringBuilder text = new StringBuilder();
    for (String key : metadata.names()) {
        for (String value : metadata.getValues(key)) {
            text.append(key);
            text.append(" : ");
            text.append(value);
            text.append("\n");
        }
    }

    final boolean isPlainDefaultHandler = contentHandler.getClass().equals(DefaultHandler.class);
    if (!isPlainDefaultHandler) {
        text.append("\n")
            .append("CONTENT: " + contentHandler.toString())
            .append("\n\n");
    }
    return text.toString().getBytes(StandardCharsets.UTF_8);
}
}
/**
 * Posts a file as multipart form data to the REST endpoint at
 * {@code restHostUrlStr + GROBID_PROCESSHEADER_PATH}, parses the XML response with
 * {@code TEIDOMParser}, and adds each resulting entry to {@code metadata} under the
 * key {@code "grobid:header_" + key}.
 *
 * Failures while reading or parsing the response are logged and otherwise ignored.
 *
 * @param filePath path of the file to send
 * @param handler  not used by this method
 * @param metadata receives the extracted header entries
 * @param context  passed through to the TEI parser
 * @throws FileNotFoundException if {@code filePath} cannot be opened for reading
 */
public void parse(String filePath, ContentHandler handler, Metadata metadata, ParseContext context) throws FileNotFoundException {
    File pdfFile = new File(filePath);
    ContentDisposition cd = new ContentDisposition(
            "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
    FileInputStream pdfStream = new FileInputStream(pdfFile);
    try {
        Attachment att = new Attachment("input", pdfStream, cd);
        MultipartBody body = new MultipartBody(att);

        Response response = WebClient
                .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
                .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
                .post(body);

        try {
            String resp = response.readEntity(String.class);
            Metadata teiMet = new TEIDOMParser().parse(resp, context);
            for (String key : teiMet.names()) {
                metadata.add("grobid:header_" + key, teiMet.get(key));
            }
        } catch (Exception e) {
            LOG.warn("Couldn't read response", e);
        }
    } finally {
        // Release the file handle whether or not the request succeeded; it was
        // previously never closed. A failure to close is logged, not propagated,
        // so the declared exception contract stays unchanged.
        try {
            pdfStream.close();
        } catch (java.io.IOException e) {
            LOG.warn("Couldn't close input file", e);
        }
    }
}
/**
 * Test for the <code>names</code> method: a new instance has no names, and each
 * added distinct name increases the count by one.
 */
@Test
public void testNames() {
    Metadata meta = new Metadata();
    assertEquals(0, meta.names().length);

    meta.add("name-one", "value");
    String[] afterFirstAdd = meta.names();
    assertEquals(1, afterFirstAdd.length);
    assertEquals("name-one", afterFirstAdd[0]);

    meta.add("name-two", "value");
    String[] afterSecondAdd = meta.names();
    assertEquals(2, afterSecondAdd.length);
}
/**
 * Creates a new Metadata object holding the same names and values as the given one.
 * Names reported as multi-valued are copied value by value with {@code add};
 * all other names are copied with a single {@code set}.
 *
 * @param m metadata to copy
 * @return a new, independent Metadata instance with the copied entries
 */
public static Metadata cloneMetadata(Metadata m) {
    final Metadata copy = new Metadata();
    for (String key : m.names()) {
        if (m.isMultiValued(key)) {
            for (String value : m.getValues(key)) {
                copy.add(key, value);
            }
        } else {
            copy.set(key, m.get(key));
        }
    }
    return copy;
}
/**
 * PUTs the test document to the metadata endpoint for the creator field, asking for
 * JSON, and checks that the response is OK and contains exactly one entry: the
 * expected creator.
 *
 * NOTE(review): the body is produced by {@code copy(stream, 12000)}; judging by the
 * test name this presumably sends only part of the document - confirm against
 * {@code copy}.
 */
@Test
public void testGetField_Author_JSON_Partial_Found() throws Exception {
    final InputStream doc = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
    final String creatorUrl = endPoint + META_PATH + "/" + TikaCoreProperties.CREATOR.getName();

    final Response response = WebClient.create(creatorUrl)
            .type("application/msword")
            .accept(MediaType.APPLICATION_JSON)
            .put(copy(doc, 12000));
    Assert.assertEquals(Response.Status.OK.getStatusCode(), response.getStatus());

    final InputStream entity = (InputStream) response.getEntity();
    final Metadata metadata = JsonMetadata.fromJson(new InputStreamReader(entity, UTF_8));
    assertEquals("Maxim Valyanskiy", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals(1, metadata.names().length);
}