private SavePolicy(Context context) {
  String uri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  Preconditions.checkArgument(uri != null,
      "Must set " + CONFIG_KITE_ERROR_DATASET_URI + " when " +
      CONFIG_FAILURE_POLICY + "=save");

  if (Datasets.exists(uri)) {
    dataset = Datasets.load(uri, AvroFlumeEvent.class);
  } else {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(uri, descriptor, AvroFlumeEvent.class);
  }

  nEventsHandled = 0;
}
@Override
public int run() throws IOException {
  Preconditions.checkArgument(classNames != null && !classNames.isEmpty(),
      "Java class name is required");
  Preconditions.checkArgument(classNames.size() == 1,
      "Only one java class name can be given");

  ClassLoader loader = loaderFor(jars, libs);
  String className = classNames.get(0);
  Class<?> recordClass;
  try {
    recordClass = loader.loadClass(className);
  } catch (ClassNotFoundException e) {
    throw new IllegalArgumentException("Cannot find class: " + className, e);
  }

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(recordClass)
      .build();

  String schema = descriptor.getSchema().toString(!minimize);
  output(schema, console, outputPath);

  return 0;
}
@Before
public void setUp() throws IOException {
  this.testSchema = DatasetTestUtilities.USER_SCHEMA;

  Configuration tempConfig = new Configuration();
  tempConfig.setLong("fs.trash.interval", 1);
  this.conf = (distributed ? MiniDFSTest.getConfiguration() : tempConfig);
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = fileSystem.makeQualified(
      new Path(Files.createTempDir().getAbsolutePath()));
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schema(testSchema)
      .build();
  this.testProvider = newProvider(conf);
  this.repo = newRepo(testProvider);
}
@Test
public void testTSV() {
  final DatasetDescriptor desc = new DatasetDescriptor.Builder()
      .property("kite.csv.delimiter", "\t")
      .property("kite.csv.lines-to-skip", "1")
      .schema(STRINGS)
      .build();

  final CSVFileReader<GenericData.Record> reader =
      new CSVFileReader<GenericData.Record>(localfs, tsvFile, desc,
@Test
public void testReadySignalUpdatesModifiedTime() {
  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()),
          "ns", "name"))
      .build();

  Assert.assertFalse("Dataset should not be ready before being signaled",
      ds.isReady());

  // The modified time depends on the filesystem and may only be granular to
  // the second: signal and check until the modified time is after the current
  // time, or until enough time has passed that the signal should have been
  // distinguishable.
  long signaledTime = 0;
  long currentTime = System.currentTimeMillis();
  while (currentTime >= signaledTime &&
      (System.currentTimeMillis() - currentTime) <= 2000) {
    ds.signalReady();
    signaledTime = ds.getLastModified();
  }

  Assert.assertTrue("Dataset should have been signaled as ready", ds.isReady());
  Assert.assertTrue("Signal should update the modified time",
      signaledTime > currentTime);
  Assert.assertFalse("Only the dataset should have been signaled",
      ((Signalable) ds.with("username", "bob")).isReady());
}
@Test
public void testPartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);

  Path datasetPath = new Path(folder.toURI());
  Path partitionPath = new Path(datasetPath, "id_hash=1");

  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  // check the discovered descriptor, not the one built above
  Assert.assertFalse("Should not flag at mixed depth",
      actual.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      partitionPath.toUri(), actual.getLocation());
  Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned", actual.isPartitioned());
}
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of products with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(Product.class)
      .build();
  Dataset<Product> products = Datasets.create(
      "dataset:hdfs:/tmp/data/products", descriptor, Product.class);

  // Get a writer for the dataset and write some products to it
  DatasetWriter<Product> writer = null;
  try {
    writer = products.newWriter();
    int i = 0;
    for (String name : names) {
      Product product = new Product();
      product.setName(name);
      product.setId(i++);
      writer.write(product);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
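// A minimal read-back sketch (not part of the original example): load the
// products dataset written above by URI and iterate it with a DatasetReader.
// Datasets.load and newReader() are standard Kite calls; the URI mirrors the
// one used in run(), and this helper method itself is an assumption.
public void printProducts() throws Exception {
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    while (reader.hasNext()) {
      Product product = reader.next();
      System.out.println(product.getId() + ": " + product.getName());
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
}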
/**
 * The constructor registers the schemas with the meta store table in HBase
 * and creates the tables required to run.
 */
public UserProfileDatasetExample() throws Exception {
  Configuration conf = HBaseConfiguration.create();
  HBaseAdmin admin = new HBaseAdmin(conf);

  // Delete the table if it exists so we start fresh.
  if (admin.tableExists("kite_example_user_profiles")) {
    admin.disableTable("kite_example_user_profiles");
    admin.deleteTable("kite_example_user_profiles");
  }

  HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
      .configuration(conf).build();

  // TODO: change to use namespace (CDK-140)
  DatasetDescriptor userProfileDatasetDescriptor =
      new DatasetDescriptor.Builder().schema(UserProfileModel2.SCHEMA$).build();
  userProfileDataset = repo.create("default",
      "kite_example_user_profiles.UserProfileModel2",
      userProfileDatasetDescriptor);

  DatasetDescriptor userActionsDatasetDescriptor =
      new DatasetDescriptor.Builder().schema(UserActionsModel2.SCHEMA$).build();
  userActionsDataset = repo.create("default",
      "kite_example_user_profiles.UserActionsModel2",
      userActionsDatasetDescriptor);

  DatasetDescriptor userProfileActionsDatasetDescriptor =
      new DatasetDescriptor.Builder().schema(UserProfileActionsModel2.SCHEMA$).build();
  userProfileActionsDataset = repo.create("default",
      "kite_example_user_profiles.UserProfileActionsProtocol2",
      userProfileActionsDatasetDescriptor);
}
@Test
public void testAllowedPartitionSchemaCombinations() {
  Compatibility.checkDescriptor(
      new DatasetDescriptor.Builder()
          .schema(schema)
          .partitionStrategy(new PartitionStrategy.Builder()
              .year("timestamp")
              .month("timestamp")
              .day("timestamp")
              .hour("timestamp")
              .minute("timestamp")
              .identity("message", "message_copy")
              .identity("timestamp", "ts")
              .identity("number", "num")
              .hash("message", 48)
              .hash("timestamp", 48)
              .hash("number", 48)
              .hash("payload", 48)
              .hash("float", 48)
              .hash("double", 48)
              .hash("bool", 48)
              .range("number", 5, 10, 15, 20)
              .range("message", "m", "z", "M", "Z")
              .build())
          .build());
}
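// For contrast, a hedged sketch of a combination that should be rejected:
// partitioning on a field that is not in the schema. This test is not in the
// original; it assumes descriptor validation throws ValidationException for
// unknown source fields, the same exception type other negative tests in
// this section expect.
@Test
public void testRejectsUnknownPartitionSourceField() {
  TestHelpers.assertThrows("Should reject partitioning on a missing field",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          Compatibility.checkDescriptor(new DatasetDescriptor.Builder()
              .schema(schema)
              .partitionStrategy(new PartitionStrategy.Builder()
                  .hash("not_a_field", 16) // hypothetical missing field
                  .build())
              .build());
        }
      });
}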
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentPartitionStrategies()
    throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .location(testDirectory)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("username", 2).build())
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .location(testDirectory)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("username", 2).hash("email", 3).build())
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
    .endRecord();

DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schema(event)
    .partitionStrategy(new PartitionStrategy.Builder()
        .provided("v", "int")
        .day("created_at")
        .build())
    .build();

DatasetDescriptor update = new DatasetDescriptor.Builder(ds.getDescriptor())
    .partitionStrategy(new PartitionStrategy.Builder()
        .identity("version", "v")
        .day("created_at")
        .build())
    .build();
@Test
public void testReflectProjectionAsType() throws IOException {
  Dataset<StandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(),
      StandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.asType(ReflectStandardEvent.class).newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  final View<ReflectSmallEvent> smallEvents =
      original.asType(ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent),
      new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, smallEvents);

  TestHelpers.assertThrows("Should not be able to write small events",
      IncompatibleSchemaException.class, new Runnable() {
        @Override
        public void run() {
          smallEvents.newWriter();
        }
      });
}
@Test
public void testKeyMappingSourceMustBeIdentityPartitioned() {
  // the mapping is accepted when the key source field is identity partitioned
  Assert.assertNotNull(new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 16)
          .identity("id")
          .build())
      .columnMapping(new ColumnMapping.Builder()
          .key("id")
          .build())
      .build());

  TestHelpers.assertThrows("Should reject mapping source not id partitioned",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(new PartitionStrategy.Builder()
                  .hash("id", 16)
                  .build())
              .columnMapping(new ColumnMapping.Builder()
                  .key("id")
                  .build())
              .build();
        }
      });
}
@Test
@SuppressWarnings("deprecation")
public void testWriteToSubpartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2).hash("email", 3).build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .partitionStrategy(partitionStrategy)
          .build())
      .type(Record.class)
      .build();

  PartitionKey key = new PartitionKey(1);
  FileSystemDataset<Record> userPartition =
      (FileSystemDataset<Record>) ds.getPartition(key, true);
  Assert.assertEquals(key, userPartition.getPartitionKey());

  writeTestUsers(userPartition, 1);
  Assert.assertTrue("Partitioned directory exists",
      fileSystem.exists(new Path(testDirectory, "username_part=1/email_hash=2")));
  Assert.assertEquals(1, readTestUsersInPartition(ds, key, "email_hash"));
}
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.AVRO)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
@Before
public void createTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");
  Datasets.delete("dataset:file:/tmp/datasets/temporary");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      descriptor, TestRecord.class);

  descriptor = new DatasetDescriptor.Builder(descriptor)
      .property("kite.writer.cache-size", "20")
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      descriptor, TestRecord.class);

  // create a second dataset with the same partitioning for replacement parts
  temporary = Datasets.create("dataset:file:/tmp/datasets/temporary",
      descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
  writeTestRecords(temporary);
}
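// A possible shape for the writeTestRecords helper referenced above. The
// helper is not shown in the original, so this is an assumed implementation;
// TestRecord's setters and the record count are assumptions too. The
// writer-in-finally pattern mirrors the other snippets in this section.
private void writeTestRecords(View<TestRecord> view) {
  DatasetWriter<TestRecord> writer = null;
  try {
    writer = view.newWriter();
    for (int i = 0; i < 10; i += 1) {
      TestRecord record = new TestRecord();
      record.setId(i);              // assumed setter
      record.setData("data-" + i);  // assumed setter
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
}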
@Test
public void testBackwardCompatibleMappingToPartitionStrategy() {
  Schema schema = new Schema.Parser().parse("{" +
      " \"type\": \"record\"," +
      " \"name\": \"User\"," +
      " \"fields\": [" +
      "   {\"name\": \"id\", \"type\": \"long\", \"mapping\":" +
      "     {\"type\": \"key\", \"value\": \"1\"} }," +
      "   {\"name\": \"username\", \"type\": \"string\", \"mapping\":" +
      "     {\"type\": \"key\", \"value\": \"0\"} }," +
      "   {\"name\": \"real_name\", \"type\": \"string\", \"mapping\":" +
      "     {\"type\": \"column\", \"value\": \"m:name\"} }" +
      " ]" +
      "}");

  PartitionStrategy expected = new PartitionStrategy.Builder()
      .identity("username")
      .identity("id")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(schema)
      .build();
  Assert.assertEquals(expected, descriptor.getPartitionStrategy());
}
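// For comparison, a sketch of declaring the same layout explicitly rather
// than through the legacy in-schema "mapping" annotations. The builder calls
// mirror APIs used elsewhere in this section; the exact equivalence to the
// parsed mapping above is an assumption.
DatasetDescriptor explicit = new DatasetDescriptor.Builder()
    .schema(schema)
    .partitionStrategy(new PartitionStrategy.Builder()
        .identity("username")
        .identity("id")
        .build())
    .columnMapping(new ColumnMapping.Builder()
        .key("username")
        .key("id")
        .column("real_name", "m", "name")
        .build())
    .build();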
@Test
public void testUpdateFailsWithIncompatibleSchemaChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder()
          .schema(testSchema).build());

  Assert.assertEquals("Dataset name is propagated", NAME, dataset.getName());
  Assert.assertEquals("Dataset schema is propagated", testSchema,
      dataset.getDescriptor().getSchema());

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .requiredString("email")
      .requiredString("favoriteColor") // incompatible - no default
      .endRecord();

  try {
    repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
        dataset.getDescriptor()).schema(testSchemaV2).build());
    Assert.fail("Should fail due to incompatible update");
  } catch (ValidationException e) {
    // expected
  }

  dataset = repo.load(NAMESPACE, NAME);
  Assert.assertEquals("Dataset schema is unchanged", testSchema,
      dataset.getDescriptor().getSchema());
}
@Test
public void testReflectProjectionLoad() throws IOException {
  Dataset<ReflectStandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(ReflectStandardEvent.class)
          .build(),
      ReflectStandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  View<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent),
      new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}