public DatasetDescriptor addToDescriptor(DatasetDescriptor descriptor) {
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder(descriptor)
      .property(CHARSET_PROPERTY, charset)
      .property(DELIMITER_PROPERTY, delimiter)
      .property(ESCAPE_CHAR_PROPERTY, escape)
      .property(QUOTE_CHAR_PROPERTY, quote)
      .property(HAS_HEADER_PROPERTY, Boolean.toString(useHeader))
      .property(LINES_TO_SKIP_PROPERTY, Integer.toString(linesToSkip));
  if (header != null) {
    builder.property(HEADER_PROPERTY, header);
  }
  return builder.build();
}
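Properties added through DatasetDescriptor.Builder travel with the built descriptor and can be read back later, which is how the tests further down verify them. A minimal standalone sketch, assuming the Kite SDK org.kitesdk.data package is on the classpath; the key "my.example.prop" is made up for illustration:

import org.apache.avro.Schema;
import org.kitesdk.data.DatasetDescriptor;

public class DescriptorPropertyExample {
  public static void main(String[] args) {
    // build a descriptor carrying one custom property (hypothetical key)
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(Schema.create(Schema.Type.STRING))
        .property("my.example.prop", "some-value")
        .build();

    // the property can be inspected on the built descriptor, as the
    // tests below do with hasProperty/getProperty/listProperties
    System.out.println(descriptor.hasProperty("my.example.prop"));  // true
    System.out.println(descriptor.getProperty("my.example.prop"));  // some-value
    System.out.println(descriptor.listProperties());                // contains the key
  }
}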
@Override
public FileSystemWriter<Record> newWriter(Path directory, Schema datasetSchema, Schema writerSchema) {
  return FileSystemWriter.newWriter(fs, directory, 100, 2 * 1024 * 1024,
      new DatasetDescriptor.Builder()
          .property("kite.writer.roll-interval-seconds", String.valueOf(10))
          .property("kite.writer.target-file-size", String.valueOf(32 * 1024 * 1024)) // 32 MB
          .property("kite.writer.fs-supports-rename", String.valueOf(false))
          .schema(datasetSchema)
          .format("avro")
          .build(),
      writerSchema);
}
@Override
public FileSystemWriter<Record> newWriter(Path directory, Schema datasetSchema, Schema writerSchema) {
  return FileSystemWriter.newWriter(fs, directory, 100, 2 * 1024 * 1024,
      new DatasetDescriptor.Builder()
          .property("kite.writer.roll-interval-seconds", String.valueOf(10))
          .property("kite.writer.target-file-size", String.valueOf(32 * 1024 * 1024)) // 32 MB
          .schema(datasetSchema)
          .format("avro")
          .build(),
      writerSchema);
}
@Before
public void createTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");
  Datasets.delete("dataset:file:/tmp/datasets/temporary");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      descriptor, TestRecord.class);

  descriptor = new DatasetDescriptor.Builder(descriptor)
      .property("kite.writer.cache-size", "20")
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      descriptor, TestRecord.class);

  // create a second dataset with the same partitioning for replacement parts
  temporary = Datasets.create("dataset:file:/tmp/datasets/temporary",
      descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
  writeTestRecords(temporary);
}
@Test
public void testCustomProperties() {
  final String propName = "my.custom.property";
  final String propValue = "string";
  DatasetDescriptor descriptorWithProp = new DatasetDescriptor.Builder(testDescriptor)
      .property(propName, propValue)
      .build();

  DatasetDescriptor created = provider.create(NAMESPACE, NAME, descriptorWithProp);
  Assert.assertTrue("Should have custom property",
      created.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, created.getProperty(propName));
  Assert.assertTrue("List should contain property name",
      created.listProperties().contains(propName));

  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);
  Assert.assertTrue("Should have custom property",
      loaded.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, loaded.getProperty(propName));
  Assert.assertTrue("List should contain property name",
      loaded.listProperties().contains(propName));
}
@Test
public void testUpdate() throws IOException {
  DatasetDescriptor updated = new DatasetDescriptor.Builder(descriptor)
      .property("parquet.block.size", "1024")
      .build();

  DatasetDescriptor saved = provider.update("default", "old_2", updated);
  Assert.assertNotNull("Should find saved metadata", saved);
  Assert.assertEquals("Should update old dataset successfully",
      updated.getProperty("parquet.block.size"),
      saved.getProperty("parquet.block.size"));

  DatasetDescriptor loaded = provider.load("default", "old_2");
  Assert.assertNotNull("Should find saved metadata", loaded);
  Assert.assertEquals("Should make changes on disk",
      updated.getProperty("parquet.block.size"),
      loaded.getProperty("parquet.block.size"));
  Assert.assertFalse("Should not move metadata to new location",
      local.exists(new Path(root, new Path("default", "old_2"))));
}
@Before
public void setUp() throws Exception {
  super.setUp();
  inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STRING_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
  outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STATS_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
}
private static DatasetDescriptor descriptor(FileSystem fs, Result.Table table)
    throws IOException {
  // inspect the path to determine the partition strategy
  PartitionStrategy strategy = strategy(fs, table.location);
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder()
      .format(table.format)
      .schema(table.schema)
      .partitionStrategy(strategy)
      .location(table.location);
  if (table.depth < 0) {
    builder.property("kite.filesystem.mixed-depth", "true");
  }
  return builder.build();
}
@Test
public void testConfigureDurableParquetAppender() throws IOException {
  FileSystem fs = LocalFileSystem.getInstance();
  FileSystemWriter<Object> writer = FileSystemWriter.newWriter(
      fs, new Path("/tmp"), -1, -1,
      new DatasetDescriptor.Builder()
          .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "false")
          .schema(SCHEMA)
          .format("parquet")
          .build(),
      SCHEMA);
  Assert.assertEquals("Disabling the non-durable parquet appender should get us a durable appender",
      DurableParquetAppender.class,
      writer.newAppender(testDirectory).getClass());
}
@Override
public FileSystemWriter<Record> newWriter(Path directory, Schema datasetSchema, Schema writerSchema) {
  return FileSystemWriter.newWriter(fs, directory, 100, 2 * 1024 * 1024,
      new DatasetDescriptor.Builder()
          .property("kite.writer.roll-interval-seconds", String.valueOf(10))
          .property("kite.writer.target-file-size", String.valueOf(32 * 1024 * 1024)) // 32 MB
          .schema(datasetSchema)
          .format("parquet")
          .build(),
      writerSchema);
}
@Test
public void testParquetConfiguration() throws IOException {
  FileSystem fs = LocalFileSystem.getInstance();
  FileSystemWriter<Object> writer = FileSystemWriter.newWriter(
      fs, new Path("/tmp"), -1, -1,
      new DatasetDescriptor.Builder()
          .property("parquet.block.size", "34343434")
          .schema(SCHEMA)
          .format("parquet")
          .build(),
      SCHEMA);
  Assert.assertEquals("Should copy properties to Configuration",
      34343434, writer.conf.getInt("parquet.block.size", -1));
}
private static DatasetDescriptor copy(DatasetDescriptor descriptor) {
  // don't reuse the previous dataset's location and don't use durable
  // parquet writers because fault-tolerance is handled by OutputCommitter
  return new DatasetDescriptor.Builder(descriptor)
      .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "true")
      .location((URI) null)
      .build();
}
@Override
public DatasetReader<Text> newReader() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .property(InputFormatUtil.INPUT_FORMAT_CLASS_PROP,
          "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
      .property(InputFormatUtil.INPUT_FORMAT_RECORD_PROP, "value")
      .schema(Schema.create(Schema.Type.STRING))
      .build();
  return new InputFormatReader<Text>(localfs, userFile, descriptor);
}
@Before
@Override
public void setUp() throws Exception {
  super.setUp();
  dataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STATS_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
}
@Test
public void testConfigureNonDurableParquetAppender() throws IOException {
  FileSystem fs = LocalFileSystem.getInstance();
  FileSystemWriter<Object> writer = FileSystemWriter.newWriter(
      fs, new Path("/tmp"), -1, -1,
      new DatasetDescriptor.Builder()
          .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "true")
          .schema(SCHEMA)
          .format("parquet")
          .build(),
      SCHEMA);
  Assert.assertEquals("Enabling the non-durable parquet appender should get us a non-durable appender",
      ParquetAppender.class,
      writer.newAppender(testDirectory).getClass());
}
@Before
@Override
public void setUp() throws Exception {
  super.setUp();
  outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STATS_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
}
@Override
public DatasetReader<GenericData.Record> newReader() throws IOException {
  final DatasetDescriptor desc = new DatasetDescriptor.Builder()
      .property("kite.csv.has-header", "true")
      .schema(VALIDATOR_SCHEMA)
      .build();
  return new CSVFileReader<GenericData.Record>(localfs, validatorFile, desc,
      DataModelUtil.accessor(GenericData.Record.class, desc.getSchema()));
}
@Override
public DatasetReader<LongWritable> newReader() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .property(InputFormatUtil.INPUT_FORMAT_CLASS_PROP,
          "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
      .property(InputFormatUtil.INPUT_FORMAT_RECORD_PROP, "key")
      .schema(Schema.create(Schema.Type.LONG))
      .build();
  return new InputFormatReader<LongWritable>(localfs, userFile, descriptor);
}