/**
 * Configure the dataset's location (optional).
 *
 * @param uri A location String URI
 * @return An instance of the builder for method chaining.
 *
 * @since 0.8.0
 */
public Builder location(String uri) {
  // parse the string and delegate to the URI overload
  URI parsedUri = URI.create(uri);
  return location(parsedUri);
}
/**
 * Configure the dataset's location (optional).
 *
 * @param path A location Path
 * @return An instance of the builder for method chaining.
 *
 * @since 0.8.0
 */
public Builder location(Path path) {
  // render the Path as a string and delegate to the String overload
  String pathAsString = path.toString();
  return location(pathAsString);
}
@Override public DatasetDescriptor create(String namespace, String name, DatasetDescriptor descriptor) { Preconditions.checkNotNull(namespace, "Namespace cannot be null"); Preconditions.checkNotNull(name, "Name cannot be null"); Preconditions.checkNotNull(descriptor, "Descriptor cannot be null"); if (exists(namespace, name)) { throw new DatasetExistsException( "Dataset already exists for name:" + name); } DatasetDescriptor newDescriptor; if (descriptor.getLocation() == null) { newDescriptor = new DatasetDescriptor.Builder(descriptor) .location(fs.makeQualified(new Path(newLocation(name)))) .build(); } else { // don't need to modify it newDescriptor = descriptor; } // save and return if (!descriptors.containsKey(namespace)) { descriptors.put(namespace, Maps.<String, DatasetDescriptor>newHashMap()); } Map<String, DatasetDescriptor> datasets = descriptors.get(namespace); datasets.put(name, newDescriptor); return newDescriptor; }
@Test public void testUnpartitionedReplace() { // recreate temporary without a partition strategy Datasets.delete("dataset:file:/tmp/datasets/temporary"); DatasetDescriptor descriptor = new DatasetDescriptor .Builder(unpartitioned.getDescriptor()) .location((URI) null) // clear the location .build(); temporary = Datasets.create("dataset:file:/tmp/datasets/temporary", descriptor, TestRecord.class); Assert.assertTrue("Should allow replacing an unpartitioned dataset", unpartitioned.canReplace(unpartitioned)); // make sure there are multiple files writeTestRecords(unpartitioned); writeTestRecords(unpartitioned); writeTestRecords(temporary); writeTestRecords(temporary); Set<String> originalFiles = Sets.newHashSet( Iterators.transform(unpartitioned.pathIterator(), new GetFilename())); Set<String> replacementFiles = Sets.newHashSet( Iterators.transform(temporary.pathIterator(), new GetFilename())); Iterators.transform(temporary.pathIterator(), new GetFilename()); Assert.assertFalse("Sanity check", originalFiles.equals(replacementFiles)); unpartitioned.replace(unpartitioned, temporary); Set<String> replacedFiles = Sets.newHashSet( Iterators.transform(unpartitioned.pathIterator(), new GetFilename())); Assert.assertEquals("Should contain the replacement files", replacementFiles, replacedFiles); }
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  // build two otherwise-identical descriptors that differ only in format
  DatasetDescriptor avroDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.AVRO)
      .location(testDirectory)
      .build();
  DatasetDescriptor parquetDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .location(testDirectory)
      .build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(avroDescriptor)
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(parquetDescriptor)
      .type(Record.class)
      .build();

  // merging datasets with mismatched formats must fail validation
  ds.merge(dsUpdate);
}
@Test
@SuppressWarnings("deprecation")
public void testGetPartitionReturnsNullIfNoAutoCreate() throws IOException {
  // hash-partition on username into two buckets
  PartitionStrategy partitionStrategy =
      new PartitionStrategy.Builder().hash("username", 2).build();
  DatasetDescriptor partitionedDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .partitionStrategy(partitionStrategy)
      .build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(partitionedDescriptor)
      .type(Record.class)
      .build();

  // with autoCreate=false, a missing partition yields null
  Assert.assertNull(ds.getPartition(new PartitionKey(1), false));
}
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentSchemas() throws IOException {
  // build two otherwise-identical descriptors that differ only in schema
  DatasetDescriptor stringDescriptor = new DatasetDescriptor.Builder()
      .schema(STRING_SCHEMA)
      .location(testDirectory)
      .build();
  DatasetDescriptor userDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .location(testDirectory)
      .build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(stringDescriptor)
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(userDescriptor)
      .type(Record.class)
      .build();

  // merging datasets with mismatched schemas must fail validation
  ds.merge(dsUpdate);
}
@Test public void testCreateWithLocation() throws URISyntaxException { Assert.assertFalse("Sanity check", provider.exists(NAMESPACE, NAME)); String auth = getDFS().getUri().getAuthority(); URI requestedLocation = new URI("hdfs://" + auth + "/tmp/data/my_data_set"); DatasetDescriptor requested = new DatasetDescriptor.Builder(testDescriptor) .location(requestedLocation) .build(); final DatasetDescriptor created; try { created = provider.create(NAMESPACE, NAME, requested); } catch (UnsupportedOperationException ex) { // this is expected if the provider doesn't support requested locations return; } // if supported, the location should be unchanged. Assert.assertNotNull("Descriptor should be returned", created); Assert.assertTrue("Descriptor should exist", provider.exists(NAMESPACE, NAME)); Assert.assertEquals("Requested locations should match", requestedLocation, created.getLocation()); }
@Test public void testUpdateFailsWithLocationChange() { ensureCreated(); Dataset<Record> dataset = repo.load(NAMESPACE, NAME); URI location = dataset.getDescriptor().getLocation(); DatasetDescriptor changed = new DatasetDescriptor.Builder(dataset.getDescriptor()) .location(new Path(testDirectory, "newDataLocation").toUri()) .build(); try { repo.update(NAMESPACE, NAME, changed); Assert.fail("Should fail due to data location change"); } catch (ValidationException ex) { // expected } Assert.assertEquals( location, repo.load(NAMESPACE, NAME).getDescriptor().getLocation()); }
@Before
public void setUp() throws IOException {
  // share a single Configuration between the FileSystem and the dataset
  // builder instead of constructing two equivalent instances
  Configuration conf = new Configuration();
  fileSystem = FileSystem.get(conf);
  testDirectory = fileSystem.makeQualified(
      new Path(Files.createTempDir().getAbsolutePath()));
  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2).hash("email", 3).build();

  dataset = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(conf)
      .uri(URI.create("test"))
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .location(testDirectory)
          .partitionStrategy(partitionStrategy)
          .build())
      .type(Record.class)
      .build();
}
@Test
public void testWriteAndRead() throws IOException {
  // unpartitioned dataset using the schema referenced by URL
  DatasetDescriptor flatDescriptor = new DatasetDescriptor.Builder()
      .schemaUri(USER_SCHEMA_URL)
      .format(format)
      .compressionType(compressionType)
      .location(testDirectory)
      .build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("test")
      .configuration(getConfiguration())
      .descriptor(flatDescriptor)
      .type(Record.class)
      .build();

  Assert.assertFalse("Dataset is not partitioned",
      ds.getDescriptor().isPartitioned());

  // round-trip ten users through the dataset
  writeTestUsers(ds, 10);
  checkTestUsers(ds, 10);
}
@Test
public void testPathIterator_Directory() {
  DatasetDescriptor flatDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .compressionType(compressionType)
      .location(testDirectory)
      .build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(flatDescriptor)
      .type(Record.class)
      .build();

  // an unpartitioned dataset has exactly one directory: its root
  List<Path> dirPaths = Lists.newArrayList(ds.dirIterator());

  Assert.assertEquals(
      "dirIterator for non-partitioned dataset should yield a single path.",
      1, dirPaths.size());
  Assert.assertEquals("dirIterator should yield absolute paths.",
      testDirectory, dirPaths.get(0));
}
@Test
public void signalReadyOnUnboundedDataset() {
  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .build();
  URI repoUri = URI.create("repo:" + testDirectory.toUri());

  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(usersDescriptor)
      .type(Record.class)
      .uri(URIBuilder.build(repoUri, "ns", "name"))
      .build();

  // readiness flips only after the dataset is explicitly signaled
  Assert.assertFalse("Unbounded dataset has not been signaled",
      ds.isReady());
  ds.signalReady();
  Assert.assertTrue("Unbounded dataset has been signaled and should be ready",
      ds.isReady());
}
@Test
public void testMoveToTrashWithoutPartitions() {
  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .build();

  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(usersDescriptor)
      .type(Record.class)
      .build();

  // populate the dataset, trash everything, then expect zero records
  writeTestUsers(ds, 10);
  Assert.assertTrue(ds.moveToTrash());
  checkReaderBehavior(ds.newReader(), 0, (RecordValidator<Record>) null);
}
private static DatasetDescriptor descriptor(FileSystem fs, Result.Table table) throws IOException { // inspect the path to determine the partition strategy PartitionStrategy strategy = strategy(fs, table.location); DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder() .format(table.format) .schema(table.schema) .partitionStrategy(strategy) .location(table.location); if (table.depth < 0) { builder.property("kite.filesystem.mixed-depth", "true"); } return builder.build(); }
@Test
public void testDeleteAllWithoutPartitions() {
  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .build();

  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(usersDescriptor)
      .type(Record.class)
      .build();

  // populate the dataset, delete everything, then expect zero records
  writeTestUsers(ds, 10);
  Assert.assertTrue(ds.deleteAll());
  checkReaderBehavior(ds.newReader(), 0, (RecordValidator<Record>) null);
}
private static DatasetDescriptor copy(DatasetDescriptor descriptor) { // don't reuse the previous dataset's location and don't use durable // parquet writers because fault-tolerance is handled by OutputCommitter return new DatasetDescriptor.Builder(descriptor) .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "true") .location((URI) null) .build(); }
private static DatasetDescriptor getDatasetDescriptor(Schema schema,
    URI location) {
  // minimal descriptor: just a schema plus a storage location
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
  return builder.schema(schema).location(location).build();
}
private static DatasetDescriptor copy(DatasetDescriptor descriptor) { // don't reuse the previous dataset's location and don't use durable // parquet writers because fault-tolerance is handled by OutputCommitter return new DatasetDescriptor.Builder(descriptor) .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "true") .location((URI) null) .build(); }
private static DatasetDescriptor getDatasetDescriptor(Schema schema,
    URI location) {
  // minimal descriptor: just a schema plus a storage location
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
  return builder.schema(schema).location(location).build();
}