private SavePolicy(Context context) {
  String uri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  Preconditions.checkArgument(uri != null,
      "Must set " + CONFIG_KITE_ERROR_DATASET_URI + " when "
      + CONFIG_FAILURE_POLICY + "=save");

  if (Datasets.exists(uri)) {
    dataset = Datasets.load(uri, AvroFlumeEvent.class);
  } else {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(uri, descriptor, AvroFlumeEvent.class);
  }

  nEventsHandled = 0;
}
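// A failure handler built around this policy would append each failed Flume
// event to the error dataset opened above. The following is a hypothetical
// sketch only: the method name `handle` and the header-copying details are
// assumptions, not part of the original class. Assumes java.nio.ByteBuffer,
// java.util.HashMap, and java.util.Map are imported.
public void handle(Event event) {
  Map<CharSequence, CharSequence> headers =
      new HashMap<CharSequence, CharSequence>();
  for (Map.Entry<String, String> entry : event.getHeaders().entrySet()) {
    headers.put(entry.getKey(), entry.getValue());
  }
  AvroFlumeEvent avroEvent = new AvroFlumeEvent(
      headers, ByteBuffer.wrap(event.getBody()));
  DatasetWriter<AvroFlumeEvent> writer = dataset.newWriter();
  try {
    writer.write(avroEvent);
    nEventsHandled++;
  } finally {
    writer.close();
  }
}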
@Before
public void setUp() throws IOException {
  this.testSchema = DatasetTestUtilities.USER_SCHEMA;
  Configuration tempConfig = new Configuration();
  tempConfig.setLong("fs.trash.interval", 1);
  this.conf = (distributed ? MiniDFSTest.getConfiguration() : tempConfig);
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = fileSystem.makeQualified(
      new Path(Files.createTempDir().getAbsolutePath()));
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schema(testSchema)
      .build();
  this.testProvider = newProvider(conf);
  this.repo = newRepo(testProvider);
}
@Override
public int run(List<String> args) throws Exception {
  Preconditions.checkState(!Datasets.exists(uri),
      "events dataset already exists");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(StandardEvent.class)
      .build();
  View<StandardEvent> events = Datasets.create(uri, descriptor,
      StandardEvent.class);
  DatasetWriter<StandardEvent> writer = events.newWriter();
  try {
    // generate events for 36 seconds (36000 ms)
    while (System.currentTimeMillis() - baseTimestamp < 36000) {
      writer.write(generateRandomEvent());
    }
  } finally {
    writer.close();
  }

  System.out.println("Generated " + counter + " events");
  return 0;
}
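// `generateRandomEvent()` and `counter` are defined elsewhere in this tool.
// The following is a hypothetical sketch of the generator: the six-argument
// constructor order (event_initiator, event_name, user_id, session_id, ip,
// timestamp) matches the StandardEvent usage elsewhere in this section, but
// the random values and the `counter` increment are illustrative assumptions.
private StandardEvent generateRandomEvent() {
  Random random = new Random();
  counter++;
  return new StandardEvent(
      "client_user",                        // event_initiator
      "generated.event",                    // event_name
      (long) random.nextInt(100),           // user_id
      UUID.randomUUID().toString(),         // session_id
      "192.168.0." + random.nextInt(255),   // ip
      System.currentTimeMillis());          // timestamp
}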
@Test
public void testMixedProjection() throws IOException {
  Dataset<StandardEvent> original = repo.create("ns", "mixedProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(), StandardEvent.class);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
@BeforeClass public static void setup() throws IOException { fs = LocalFileSystem.getInstance(); testDirectory = new Path(Files.createTempDir().getAbsolutePath()); FileSystemDatasetRepository repo = new FileSystemDatasetRepository(fs.getConf(), testDirectory); Dataset<GenericRecord> writerDataset = repo.create("ns", "test", new DatasetDescriptor.Builder() .schema(DatasetTestUtilities.OLD_VALUE_SCHEMA) .format(Formats.PARQUET) .build(), GenericRecord.class); DatasetWriter<GenericRecord> writer = writerDataset.newWriter(); GenericRecord record = new GenericData.Record(DatasetTestUtilities.OLD_VALUE_SCHEMA); for (long i = 0; i < totalRecords; i++) { record.put("value", Long.valueOf(i)); writer.write(record); } writer.close(); repo.update("ns", "test", new DatasetDescriptor.Builder(writerDataset.getDescriptor()) .schema(Value.class).build()); readerDataset = repo.load("ns", "test", GenericRecord.class); }
@Test
public void testLargeSchema() {
  // Only run this test in distributed mode, since non-HDFS schema URLs result
  // in the schema being loaded into the Hive metastore, and large schemas
  // can exceed its size limit.
  Assume.assumeTrue(distributed);
  Assert.assertFalse("Sanity check",
      provider.exists(NAMESPACE, "large_schema_test"));

  // Create a schema with many fields to ensure the underlying store can
  // handle it.
  SchemaBuilder.FieldAssembler<Schema> fields =
      SchemaBuilder.record("Event").fields();
  for (int i = 0; i < 1000; ++i) {
    fields.requiredString("field_" + i);
  }

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(fields.endRecord())
      .build();

  DatasetDescriptor created = provider.create(
      NAMESPACE, "large_schema_test", descriptor);
  Assert.assertEquals("Large schemas should match",
      descriptor.getSchema(), created.getSchema());
}
@Test
public void testUpdateSuccessfulWithCompatibleSchemaChangeFieldRemoved() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder()
          .schema(testSchema)
          .build());

  writeTestUsers(dataset, 5, 0, "email");
  checkTestUsers(dataset, 5, "email");

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .endRecord();

  Dataset<Record> datasetV2 = repo.update(NAMESPACE, NAME,
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .schema(testSchemaV2)
          .build());

  Assert.assertEquals("Dataset schema is updated", testSchemaV2,
      datasetV2.getDescriptor().getSchema());

  // test that the old records can be read back with the new schema
  checkTestUsers(datasetV2, 5, new String[0]);

  // write more users and test that the mixed set can be read back with the
  // new schema
  writeTestUsers(datasetV2, 5, 5, new String[0]);
  checkTestUsers(datasetV2, 10, new String[0]);
}
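// `writeTestUsers` and `checkTestUsers` come from the shared test utilities.
// This is a hypothetical sketch of the writer half, with the signature taken
// from the calls above; the "test-N" and "field-N" value patterns are
// assumptions for illustration.
public static void writeTestUsers(Dataset<Record> dataset, int count,
    int start, String... fields) {
  DatasetWriter<Record> writer = dataset.newWriter();
  try {
    for (int i = start; i < count + start; i++) {
      GenericRecordBuilder recordBuilder = new GenericRecordBuilder(
          dataset.getDescriptor().getSchema())
          .set("username", "test-" + i);
      for (String field : fields) {
        recordBuilder.set(field, field + "-" + i);
      }
      writer.write(recordBuilder.build());
    }
  } finally {
    writer.close();
  }
}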
@Override
public int run(List<String> args) throws Exception {
  String inputUri = uri;
  String outputUri = "dataset:hive?dataset=correlated_events";
  if (args.size() == 1) {
    outputUri = args.get(0);
  }

  Preconditions.checkState(Datasets.exists(inputUri),
      "input dataset doesn't exist");

  if (!Datasets.exists(outputUri)) {
    Datasets.create(outputUri, new DatasetDescriptor.Builder()
        .format("avro")
        .schema(CorrelatedEvents.class)
        .build());
  }

  CorrelateEventsTask task = new CorrelateEventsTask(inputUri, outputUri);
  task.run();
  return 0;
}
@Test
public void testMappingSourceMustBeSchemaField() {
  Assert.assertNotNull(new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .columnMapping(new ColumnMapping.Builder()
          .column("id", "meta", "id")
          .build())
      .build());
  TestHelpers.assertThrows("Should reject mapping source not in schema",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .columnMapping(new ColumnMapping.Builder()
                  .column("created_at", "meta", "created_at")
                  .build())
              .build();
        }
      });
}
@Test @SuppressWarnings("deprecation") public void testGetPartitionReturnsNullIfNoAutoCreate() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash( "username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .format(format) .location(testDirectory) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build(); Assert .assertNull(ds.getPartition(new PartitionKey(1), false)); }
@Before
public void createTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      descriptor, TestRecord.class);

  descriptor = new DatasetDescriptor.Builder(descriptor)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
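// `writeTestRecords` is defined elsewhere in the test class. This is a
// plausible sketch only: it assumes TestRecord exposes bean-style setters,
// and the `data` field and the count of 10 records are illustrative
// assumptions.
private void writeTestRecords(View<TestRecord> view) {
  DatasetWriter<TestRecord> writer = view.newWriter();
  try {
    for (int i = 0; i < 10; i++) {
      TestRecord record = new TestRecord();
      record.setId(i);              // "id" is the hashed partition field
      record.setData("test-" + i);  // hypothetical payload field
      writer.write(record);
    }
  } finally {
    writer.close();
  }
}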
@Before
public void setUp() throws IOException {
  fileSystem = FileSystem.get(new Configuration());
  testDirectory = fileSystem.makeQualified(
      new Path(Files.createTempDir().getAbsolutePath()));
  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2)
      .hash("email", 3)
      .build();
  dataset = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(new Configuration())
      .uri(URI.create("test"))
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .location(testDirectory)
          .partitionStrategy(partitionStrategy)
          .build())
      .type(Record.class)
      .build();
}
@Test
public void testUnpartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);

  DatasetDescriptor expected = dataset.getDescriptor();
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      expected, actual);
}
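// `writeUserToView` is a helper defined elsewhere in the test. A minimal
// sketch, assuming USER_SCHEMA has string fields "username" and "email";
// the literal values are placeholders.
private static void writeUserToView(View<GenericRecord> view) {
  DatasetWriter<GenericRecord> writer = view.newWriter();
  try {
    writer.write(new GenericRecordBuilder(USER_SCHEMA)
        .set("username", "user")
        .set("email", "user@example.com")
        .build());
  } finally {
    writer.close();
  }
}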
@Test
public void testDatasetNotPartitioned() {
  Datasets.delete("dataset:file:/tmp/datasets/ns/test");
  final Dataset<GenericRecord> ds = Datasets.create(
      "dataset:file:/tmp/datasets/ns/test",
      new DatasetDescriptor.Builder()
          .schema(schema)
          .build());

  Assert.assertEquals("Should work for empty relative directory",
      ds, FileSystemDatasets.viewForUri(ds, "file:/tmp/datasets/ns/test"));

  TestHelpers.assertThrows("Should reject paths in a non-partitioned dataset",
      IllegalArgumentException.class, new Runnable() {
        @Override
        public void run() {
          FileSystemDatasets.viewForUri(ds, "y=2014/m=03/d=14");
        }
      });
}
@BeforeClass public static void setup() throws IOException { fs = LocalFileSystem.getInstance(); testDirectory = new Path(Files.createTempDir().getAbsolutePath()); FileSystemDatasetRepository repo = new FileSystemDatasetRepository(fs.getConf(), testDirectory); Dataset<StandardEvent> writerDataset = repo.create("ns", "test", new DatasetDescriptor.Builder() .schema(StandardEvent.class) .format(Formats.PARQUET) .build(), StandardEvent.class); DatasetWriter<StandardEvent> writer = writerDataset.newWriter(); for (long i = 0; i < totalRecords; i++) { String text = String.valueOf(i); writer.write(new StandardEvent(text, text, i, text, text, i)); } writer.close(); readerDataset = repo.load("ns", "test", GenericRecord.class); }
@Before
public void setUp() throws IOException {
  this.conf = new Configuration();
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  this.repo = new FileSystemDatasetRepository(conf, testDirectory);
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();
  FileSystemDataset<Object> users = (FileSystemDataset<Object>) repo.create(
      "ns", "users",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());
  writer = PartitionedDatasetWriter.newWriter(
      new FileSystemView<Object>(users, null, null, Object.class));
}
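// A test built on this fixture would open the writer and write records that
// hash into one of the two username partitions. This is a minimal sketch
// under assumptions: that the writer follows an initialize-write-close
// lifecycle, and that USER_SCHEMA has "username"/"email" string fields; the
// record values are placeholders.
@Test
public void writesToHashedPartition() throws IOException {
  writer.initialize();
  try {
    writer.write(new GenericRecordBuilder(USER_SCHEMA)
        .set("username", "test-user")
        .set("email", "test-user@example.com")
        .build());
  } finally {
    writer.close();
  }
}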
@Test
public void testPathIterator_Directory() {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  List<Path> dirPaths = Lists.newArrayList(ds.dirIterator());

  Assert.assertEquals(
      "dirIterator for non-partitioned dataset should yield a single path.",
      1, dirPaths.size());
  Assert.assertEquals("dirIterator should yield absolute paths.",
      testDirectory, dirPaths.get(0));
}
@Before
public void setUp() throws IOException {
  this.conf = new Configuration();
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  this.repo = new FileSystemDatasetRepository(conf, testDirectory,
      new EnusrePartitionPathDoesNotExistMetadataProvider(conf, testDirectory));
  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();
  FileSystemDataset<Object> users = (FileSystemDataset<Object>) repo.create(
      "ns", "users",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());
  view = new FileSystemView<Object>(users, null, null, Object.class);
}
@Test
public void testBadNumericSchema() {
  final DatasetDescriptor desc = new DatasetDescriptor.Builder()
      .schema(TYPE_ERROR_SCHEMA)
      .build();
  final CSVFileReader<GenericData.Record> reader =
      new CSVFileReader<GenericData.Record>(localfs, csvFile, desc,
          DataModelUtil.accessor(GenericData.Record.class, desc.getSchema()));
  reader.initialize();

  Assert.assertTrue(reader.hasNext());

  TestHelpers.assertThrows("Should reject float value for integer schema",
      DatasetRecordException.class, new Runnable() {
        @Override
        public void run() {
          reader.next();
        }
      });
}
@BeforeClass public static void setup() throws IOException { fs = LocalFileSystem.getInstance(); testDirectory = new Path(Files.createTempDir().getAbsolutePath()); FileSystemDatasetRepository repo = new FileSystemDatasetRepository(fs.getConf(), testDirectory); Dataset<StandardEvent> writerDataset = repo.create("ns", "test", new DatasetDescriptor.Builder() .schema(StandardEvent.class) .build(), StandardEvent.class); DatasetWriter<StandardEvent> writer = writerDataset.newWriter(); for (long i = 0; i < totalRecords; i++) { String text = String.valueOf(i); writer.write(new StandardEvent(text, text, i, text, text, i)); } writer.close(); readerDataset = repo.load("ns", "test", GenericData.Record.class); }