private SavePolicy(Context context) {
  // The error dataset URI is mandatory when the failure policy is "save".
  String datasetUri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  Preconditions.checkArgument(datasetUri != null,
      "Must set " + CONFIG_KITE_ERROR_DATASET_URI + " when "
          + CONFIG_FAILURE_POLICY + "=save");
  if (Datasets.exists(datasetUri)) {
    // Reuse the existing error dataset.
    dataset = Datasets.load(datasetUri, AvroFlumeEvent.class);
  } else {
    // First run: create the dataset using the Avro Flume event schema.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(datasetUri, descriptor, AvroFlumeEvent.class);
  }
  nEventsHandled = 0;
}
@Before
public void setUp() throws IOException {
  this.testSchema = DatasetTestUtilities.USER_SCHEMA;

  // Local runs use a fresh config with a short trash interval; distributed
  // runs borrow the mini-DFS cluster configuration instead.
  Configuration localConfig = new Configuration();
  localConfig.setLong("fs.trash.interval", 1);
  this.conf = distributed ? MiniDFSTest.getConfiguration() : localConfig;

  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = fileSystem.makeQualified(
      new Path(Files.createTempDir().getAbsolutePath()));
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schema(testSchema)
      .build();
  this.testProvider = newProvider(conf);
  this.repo = newRepo(testProvider);
}
@Override
public int run(List<String> args) throws Exception {
  // Refuse to clobber an existing dataset.
  Preconditions.checkState(!Datasets.exists(uri),
      "events dataset already exists");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(StandardEvent.class)
      .build();
  View<StandardEvent> events = Datasets.create(uri, descriptor,
      StandardEvent.class);

  // try-with-resources guarantees the writer is closed even if a write
  // throws (the original used a manual try/finally).
  try (DatasetWriter<StandardEvent> writer = events.newWriter()) {
    // Generate events until 36000 ms of wall-clock time have elapsed since
    // baseTimestamp. NOTE(review): 36 seconds looks short for a window named
    // in ms — confirm the constant was not meant to be larger.
    while (System.currentTimeMillis() - baseTimestamp < 36000) {
      writer.write(generateRandomEvent());
    }
  }

  System.out.println("Generated " + counter + " events");
  return 0;
}
@Test public void testUpdateSuccessfulWithCompatibleSchemaChangeFieldRemoved() { Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder() .schema(testSchema).build()); writeTestUsers(dataset, 5, 0, "email"); checkTestUsers(dataset, 5, "email"); Schema testSchemaV2 = SchemaBuilder.record("user").fields() .requiredString("username") .endRecord(); Dataset<Record> datasetV2 = repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(dataset.getDescriptor()) .schema(testSchemaV2) .build()); Assert.assertEquals("Dataset schema is updated", testSchemaV2, datasetV2 .getDescriptor().getSchema()); // test that the old records can be read back with the new schema checkTestUsers(datasetV2, 5, new String[0]); // write more users and test that the mixed set can be read back with the new schema writeTestUsers(datasetV2, 5, 5, new String[0]); checkTestUsers(datasetV2, 10, new String[0]); }
@Override
public int run(List<String> args) throws Exception {
  String inputUri = uri;
  // Default output target; an optional single argument overrides it.
  String outputUri = "dataset:hive?dataset=correlated_events";
  if (args.size() == 1) {
    outputUri = args.get(0);
  }

  Preconditions.checkState(Datasets.exists(inputUri),
      "input dataset doesn't exist");

  // Create the output dataset on first use.
  if (!Datasets.exists(outputUri)) {
    Datasets.create(outputUri, new DatasetDescriptor.Builder()
        .format("avro")
        .schema(CorrelatedEvents.class)
        .build());
  }

  CorrelateEventsTask task = new CorrelateEventsTask(inputUri, outputUri);
  task.run();
  return 0;
}
@Test
public void testMappingSourceMustBeSchemaField() {
  // A mapping whose source column is a schema field is accepted.
  DatasetDescriptor valid = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .columnMapping(new ColumnMapping.Builder()
          .column("id", "meta", "id")
          .build())
      .build();
  Assert.assertNotNull(valid);

  // "created_at" is not a field of USER_SCHEMA, so validation must fail.
  TestHelpers.assertThrows("Should reject mapping source not in schema",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .columnMapping(new ColumnMapping.Builder()
                  .column("created_at", "meta", "created_at")
                  .build())
              .build();
        }
      });
}
@Test
@SuppressWarnings("deprecation")
public void testGetPartitionReturnsNullIfNoAutoCreate() throws IOException {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .partitionStrategy(strategy)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .type(Record.class)
      .build();

  // With autoCreate == false a missing partition yields null instead of
  // being created on demand.
  Assert.assertNull(ds.getPartition(new PartitionKey(1), false));
}
@Before
public void createTestDatasets() {
  // Start from a clean slate; delete is harmless if the dataset is absent.
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");

  DatasetDescriptor unpartitionedDescriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      unpartitionedDescriptor, TestRecord.class);

  // Same schema, but hash-partitioned on "id" into 4 buckets.
  DatasetDescriptor partitionedDescriptor =
      new DatasetDescriptor.Builder(unpartitionedDescriptor)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("id", 4)
              .build())
          .build();
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      partitionedDescriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
@Before
public void setUp() throws IOException {
  fileSystem = FileSystem.get(new Configuration());
  testDirectory = fileSystem.makeQualified(
      new Path(Files.createTempDir().getAbsolutePath()));

  // Two hash partitioners; the first gives its partition field an explicit
  // name ("username_part"), the second defaults.
  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2)
      .hash("email", 3)
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .location(testDirectory)
      .partitionStrategy(partitionStrategy)
      .build();

  dataset = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(new Configuration())
      .uri(URI.create("test"))
      .descriptor(descriptor)
      .type(Record.class)
      .build();
}
@Test public void testUnpartitionedDataset() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset); writeUserToView(dataset); DatasetDescriptor expected = dataset.getDescriptor(); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertEquals("Should succeed and find an equivalent descriptor", expected, actual); }
@Test
public void testDatasetNotPartitioned() {
  Datasets.delete("dataset:file:/tmp/datasets/ns/test");
  final Dataset<GenericRecord> ds = Datasets.create(
      "dataset:file:/tmp/datasets/ns/test",
      new DatasetDescriptor.Builder()
          .schema(schema)
          .build());

  // The dataset root URI maps back to the whole dataset.
  Assert.assertEquals("Should work for empty relative directory",
      ds, FileSystemDatasets.viewForUri(ds, "file:/tmp/datasets/ns/test"));

  // Partition-style paths are meaningless without a partition strategy.
  TestHelpers.assertThrows("Should reject paths in a non-partitioned dataset",
      IllegalArgumentException.class, new Runnable() {
        @Override
        public void run() {
          FileSystemDatasets.viewForUri(ds, "y=2014/m=03/d=14");
        }
      });
}
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo =
      new FileSystemDatasetRepository(fs.getConf(), testDirectory);

  Dataset<StandardEvent> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .format(Formats.PARQUET)
          .build(), StandardEvent.class);

  // try-with-resources: the original leaked the writer if a write threw
  // before the unconditional close() was reached.
  try (DatasetWriter<StandardEvent> writer = writerDataset.newWriter()) {
    for (long i = 0; i < totalRecords; i++) {
      String text = String.valueOf(i);
      writer.write(new StandardEvent(text, text, i, text, text, i));
    }
  }

  // Read back as generic records.
  readerDataset = repo.load("ns", "test", GenericRecord.class);
}
@Test
public void testPathIterator_Directory() {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  List<Path> directories = Lists.newArrayList(ds.dirIterator());

  // A non-partitioned dataset lives entirely in its root directory.
  Assert.assertEquals(
      "dirIterator for non-partitioned dataset should yield a single path.",
      1, directories.size());
  Assert.assertEquals("dirIterator should yield absolute paths.",
      testDirectory, directories.get(0));
}
@Before
public void setUp() throws IOException {
  this.conf = new Configuration();
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  this.repo = new FileSystemDatasetRepository(conf, testDirectory);

  // Hash-partition users on "username" into two buckets.
  FileSystemDataset<Object> users = (FileSystemDataset<Object>) repo.create(
      "ns", "users",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("username", 2)
              .build())
          .build());

  writer = PartitionedDatasetWriter.newWriter(
      new FileSystemView<Object>(users, null, null, Object.class));
}
@Before
public void setUp() throws IOException {
  this.conf = new Configuration();
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  // NOTE(review): the provider class name is misspelled ("Enusre"), but it
  // is declared elsewhere in the project, so it cannot be renamed here.
  this.repo = new FileSystemDatasetRepository(conf, testDirectory,
      new EnusrePartitionPathDoesNotExistMetadataProvider(conf, testDirectory));

  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  FileSystemDataset<Object> users = (FileSystemDataset<Object>) repo.create(
      "ns", "users",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());

  view = new FileSystemView<Object>(users, null, null, Object.class);
}
@Test
public void testBadNumericSchema() {
  final DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TYPE_ERROR_SCHEMA)
      .build();
  final CSVFileReader<GenericData.Record> reader =
      new CSVFileReader<GenericData.Record>(localfs, csvFile, descriptor,
          DataModelUtil.accessor(GenericData.Record.class,
              descriptor.getSchema()));
  reader.initialize();

  Assert.assertTrue(reader.hasNext());

  // The CSV value is a float but the schema field is an integer.
  TestHelpers.assertThrows("Should reject float value for integer schema",
      DatasetRecordException.class, new Runnable() {
        @Override
        public void run() {
          reader.next();
        }
      });
}
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo =
      new FileSystemDatasetRepository(fs.getConf(), testDirectory);

  Dataset<StandardEvent> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(), StandardEvent.class);

  // try-with-resources: the original leaked the writer if a write threw
  // before the unconditional close() was reached.
  try (DatasetWriter<StandardEvent> writer = writerDataset.newWriter()) {
    for (long i = 0; i < totalRecords; i++) {
      String text = String.valueOf(i);
      writer.write(new StandardEvent(text, text, i, text, text, i));
    }
  }

  readerDataset = repo.load("ns", "test", GenericData.Record.class);
}
@Test
public void signalReadyOnUnboundedDataset() {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .build();
  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .type(Record.class)
      .uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()),
          "ns", "name"))
      .build();

  Assert.assertFalse("Unbounded dataset has not been signaled", ds.isReady());

  ds.signalReady();

  Assert.assertTrue("Unbounded dataset has been signaled and should be ready",
      ds.isReady());
}
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo =
      new FileSystemDatasetRepository(fs.getConf(), testDirectory);

  Dataset<MyRecord> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(MyRecord.class)
          .build(), MyRecord.class);

  // try-with-resources: the original leaked the writer if a write threw
  // before the unconditional close() was reached.
  try (DatasetWriter<MyRecord> writer = writerDataset.newWriter()) {
    for (int i = 0; i < totalRecords; i++) {
      writer.write(new MyRecord(String.valueOf(i), i));
    }
  }

  readerDataset = repo.load("ns", "test", GenericRecord.class);
}
@Before
public void setUp() throws Exception {
  super.setUp();

  // Both datasets opt in to CSV support via the "kite.allow.csv" property.
  inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .property("kite.allow.csv", "true")
          .schema(STRING_SCHEMA)
          .format(format)
          .build(),
      GenericData.Record.class);

  outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .property("kite.allow.csv", "true")
          .schema(STATS_SCHEMA)
          .format(format)
          .build(),
      GenericData.Record.class);
}