/**
 * Asks the dictionary for the key of the supplied URI and hands the URI back unchanged.
 * The key itself is discarded; the call is made only for its effect on the dictionary
 * (presumably assigning a key when none exists yet - confirm against Dictionary.keyFor).
 *
 * @param uri
 *            the URI to look up
 * @return the very same {@code uri} instance
 * @throws IllegalStateException
 *             if the dictionary lookup fails with an {@code IOException}
 */
private URI ensureInDictionary(final URI uri) {
    try {
        this.dictionary.keyFor(uri);
    } catch (final IOException ex) {
        throw new IllegalStateException("Cannot access dictionary: " + ex.getMessage(), ex);
    }
    return uri;
}
/**
 * Creates a serializer whose URI dictionary is obtained from
 * {@code Dictionary.createHadoopDictionary} for the given file name.
 *
 * @param fileName
 *            the name passed to {@code Dictionary.createHadoopDictionary}
 * @throws IOException
 *             if creating the dictionary fails
 */
public SerializerAvro(final String fileName) throws IOException {
    // The dictionary comes first: it is the only step that can fail with an IOException.
    this.dictionary = Dictionary.createHadoopDictionary(URI.class, fileName);

    // Shared factories supplied by the Data utility class.
    this.factory = Data.getValueFactory();
    this.datatypeFactory = Data.getDatatypeFactory();
}
/**
 * Turns an Avro record back into a resource identifier, choosing the decoding by the
 * record's schema.
 * <ul>
 * <li>{@code Schemas.COMPRESSED_IDENTIFIER}: field 0 is an {@code Integer} key that is
 * resolved through the dictionary;</li>
 * <li>{@code Schemas.PLAIN_IDENTIFIER}: field 0 is text; a leading {@code "_:"} marks a blank
 * node whose ID is the remainder, anything else is taken as a URI string.</li>
 * </ul>
 *
 * @param record
 *            the encoded identifier
 * @return the decoded resource
 * @throws IllegalStateException
 *             if the dictionary lookup fails with an {@code IOException}
 * @throws IllegalArgumentException
 *             if the record has neither of the two supported schemas
 */
private Resource decodeIdentifier(final GenericRecord record) {
    final Schema schema = record.getSchema();

    if (schema.equals(Schemas.COMPRESSED_IDENTIFIER)) {
        try {
            final Integer key = (Integer) record.get(0);
            return this.dictionary.objectFor(key);
        } catch (final IOException ex) {
            throw new IllegalStateException("Cannot access dictionary: " + ex.getMessage(), ex);
        }
    }

    if (schema.equals(Schemas.PLAIN_IDENTIFIER)) {
        final String text = record.get(0).toString();
        if (!text.startsWith("_:")) {
            return this.factory.createURI(text);
        }
        // Strip the two-character "_:" marker to obtain the blank node ID.
        return this.factory.createBNode(text.substring(2));
    }

    throw new IllegalArgumentException("Unsupported encoded identifier: " + record);
}
/**
 * Serializes every record of the stream to {@code <file>.gz}, using a local URI dictionary
 * stored in {@code <file>.dict}, and logs progress along the way.
 * <p>
 * A progress line is logged every 1000 handled records and once more when the handler
 * receives a {@code null} record (presumably the end-of-sequence marker of the stream API -
 * confirm against {@code Handler}). Both the record stream and the output stream are closed
 * quietly when the method returns, whether normally or exceptionally.
 *
 * @param records
 *            the records to write; closed by this method
 * @param file
 *            the file whose absolute path is used as base name for the {@code .gz} and
 *            {@code .dict} outputs
 * @throws IOException
 *             if the dictionary or the output stream cannot be created
 */
private static void writeBinary(final Stream<Record> records, final File file)
        throws IOException {
    final String base = file.getAbsolutePath();
    final Dictionary<URI> dictionary = Dictionary.createLocalDictionary(URI.class,
            new File(base + ".dict"));
    final Serializer serializer = new Serializer(false, dictionary, Statements.VALUE_FACTORY);
    final CountingOutputStream stream = new CountingOutputStream(IO.write(base + ".gz"));
    try {
        records.toHandler(new Handler<Record>() {

            // Number of records serialized so far (named so that it does not shadow the
            // enclosing method's 'records' parameter).
            private int count = 0;

            @Override
            public void handle(final Record record) throws Throwable {
                if (record == null || this.count > 0 && this.count % 1000 == 0) {
                    final long bytes = stream.getCount();
                    // A null record can arrive before any record has been written (empty
                    // input); guard the average to avoid an integer division by zero.
                    final long bytesPerRecord = this.count == 0 ? 0L : bytes / this.count;
                    LOGGER.info("{} records, {} bytes processed ({} bytes/record)",
                            this.count, bytes, bytesPerRecord);
                }
                if (record != null) {
                    serializer.toStream(stream, record);
                    ++this.count;
                }
            }

        });
    } finally {
        IO.closeQuietly(records);
        // NOTE(review): closing the output quietly hides failures raised while closing the
        // stream - consider surfacing them on the success path.
        IO.closeQuietly(stream);
    }
}
public byte[] compressURI(final URI uri) { Preconditions.checkNotNull(uri); try { final ByteArrayOutputStream stream = new ByteArrayOutputStream(); final Encoder encoder = EncoderFactory.get().directBinaryEncoder(stream, null); final DatumWriter<Object> writer = new GenericDatumWriter<Object>( Schemas.COMPRESSED_IDENTIFIER); this.dictionary.keyFor(uri); // ensure a compressed version of URI is available final Object generic = encodeIdentifier(uri); writer.write(generic, encoder); return stream.toByteArray(); } catch (final IOException ex) { throw new Error("Unexpected exception (!): " + ex.getMessage(), ex); } }
/**
 * Encodes a resource identifier as a generic Avro record.
 * <p>
 * A URI for which the dictionary returns a non-null key is emitted as a
 * {@code Schemas.COMPRESSED_IDENTIFIER} record holding that key. Every other identifier is
 * emitted as a {@code Schemas.PLAIN_IDENTIFIER} record holding a string: {@code "_:"}
 * followed by the ID for blank nodes, the identifier's string value otherwise.
 *
 * @param identifier
 *            the identifier to encode
 * @return the record built by {@code SerializerAvro.newGenericRecord}
 * @throws IllegalStateException
 *             if the dictionary lookup fails with an {@code IOException}
 */
private Object encodeIdentifier(final Resource identifier) {
    if (identifier instanceof URI) {
        final Integer key;
        try {
            key = this.dictionary.keyFor((URI) identifier, false);
        } catch (final IOException ex) {
            throw new IllegalStateException("Cannot access dictionary: " + ex.getMessage(), ex);
        }
        if (key != null) {
            return SerializerAvro.newGenericRecord(Schemas.COMPRESSED_IDENTIFIER, key);
        }
        // No key returned for this URI: fall through to the plain encoding.
    }

    final String id;
    if (identifier instanceof BNode) {
        id = "_:" + ((BNode) identifier).getID();
    } else {
        id = identifier.stringValue();
    }
    return SerializerAvro.newGenericRecord(Schemas.PLAIN_IDENTIFIER, id);
}