/**
 * Configure the {@link Dataset}'s schema from a String URI. A schema is
 * required, and can be set using one of the methods {@code schema},
 * {@code schemaLiteral}, {@code schemaUri}, or
 * {@code schemaFromAvroDataFile}.
 *
 * @param uri a String URI
 * @return An instance of the builder for method chaining.
 * @throws IOException if the schema cannot be read from the given URI
 *
 * @since 0.8.0
 */
public Builder schemaUri(String uri) throws IOException {
  return schemaUri(URI.create(uri));
}
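A minimal usage sketch (not taken from the original sources): it shows the String overload of schemaUri being passed a classpath-style URI; the "resource:schema/user.avsc" path is an assumption for illustration, borrowed from the test snippets below.

// Minimal sketch, assuming a user.avsc schema is available on the classpath.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaUri("resource:schema/user.avsc") // hypothetical resource path
    .build();
Schema schema = descriptor.getSchema();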
@Test
public void testSchemaFromHdfs() throws IOException {
  MiniDFSTest.setupFS();
  FileSystem fs = MiniDFSTest.getDFS();

  // copy a schema to HDFS
  Path schemaPath = fs.makeQualified(new Path("schema.avsc"));
  FSDataOutputStream out = fs.create(schemaPath);
  IOUtils.copyBytes(DatasetTestUtilities.USER_SCHEMA_URL.toURL().openStream(),
      out, fs.getConf());
  out.close();

  // build a schema using the HDFS path and check it's the same
  Schema schema = new DatasetDescriptor.Builder()
      .schemaUri(schemaPath.toUri())
      .build()
      .getSchema();
  Assert.assertEquals(DatasetTestUtilities.USER_SCHEMA, schema);

  MiniDFSTest.teardownFS();
}
@Test
public void testLoadViewStringUriWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();

  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");

  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);

  AbstractRefinableView<GenericRecord> userAndEmailView = mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);

  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();

  RefinableView<GenericRecord> view = Datasets.load(datasetUri.toString());

  verify(repo).load("ns", "test", GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);

  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testCreateViewStringUri() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();

  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");

  AbstractDataset<Object> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, Object.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);

  AbstractRefinableView<Object> userAndEmailView = mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);

  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();

  RefinableView<Object> view = Datasets.<Object, RefinableView<Object>>
      create(datasetUri.toString(), descriptor, Object.class);

  verify(repo).create("ns", "test", descriptor, Object.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);

  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testLoadViewWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();

  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");

  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);

  AbstractRefinableView<GenericRecord> userAndEmailView = mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);

  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();

  RefinableView<GenericRecord> view = Datasets.load(datasetUri);

  verify(repo).load("ns", "test", GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);

  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testCreateViewStringUriWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();

  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");

  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);

  AbstractRefinableView<GenericRecord> userAndEmailView = mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);

  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();

  View<GenericRecord> view = Datasets.create(datasetUri.toString(), descriptor);

  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);

  Assert.assertEquals(userAndEmailView, view);
}
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hive?dataset=users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)])
          .build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
@Test
public void testCreateViewWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();

  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");

  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);

  AbstractRefinableView<GenericRecord> userAndEmailView = mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);

  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();

  View<GenericRecord> view = Datasets.create(datasetUri, descriptor);

  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);

  Assert.assertEquals(userAndEmailView, view);
}
@Override
public DatasetDescriptor update(String namespace, String name, DatasetDescriptor descriptor) {
  Compatibility.checkDatasetName(namespace, name);
  Compatibility.checkDescriptor(descriptor);

  String resolved = resolveNamespace(namespace, name);
  if (resolved != null) {
    Table table = getMetaStoreUtil().getTable(resolved, name);

    Path managerPath = new Path(new Path(table.getSd().getLocation()),
        SCHEMA_DIRECTORY);
    SchemaManager manager = SchemaManager.create(conf, managerPath);

    DatasetDescriptor newDescriptor;
    try {
      URI schemaURI = manager.writeSchema(descriptor.getSchema());
      newDescriptor = new DatasetDescriptor.Builder(descriptor)
          .schemaUri(schemaURI)
          .build();
    } catch (IOException e) {
      throw new DatasetIOException("Unable to create schema", e);
    }

    HiveUtils.updateTableSchema(table, newDescriptor);
    getMetaStoreUtil().alterTable(table);
    return descriptor;
  }

  throw new DatasetNotFoundException(
      "Hive table not found: " + namespace + "." + name);
}
@Test
public void testLoadViewStringUri() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();

  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");

  AbstractDataset<Object> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", Object.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);

  AbstractRefinableView<Object> userAndEmailView = mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);

  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();

  RefinableView<Object> view = Datasets.<Object, RefinableView<Object>>
      load(datasetUri.toString(), Object.class);

  verify(repo).load("ns", "test", Object.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);

  Assert.assertEquals(userAndEmailView, view);
}
@Before
public void setup() throws Exception {
  this.conf = (distributed ?
      MiniDFSTest.getConfiguration() :
      new Configuration());
  this.fs = FileSystem.get(conf);
  this.trashPolicy = TrashPolicy.getInstance(conf, fs, fs.getHomeDirectory());
  this.repo = newRepo();

  this.strategy = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .build();
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  repo.delete("ns", "test");
  this.unbounded = repo.create("ns", "test", testDescriptor);

  this.valueDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:value.avsc")
      .build();
  repo.delete("ns", "value");
  this.valueView = repo.create("ns", "value", valueDescriptor);
  this.testValueView = repo.load("ns", "value", TestValue.class);
}
@Test
public void testRefineIdentity() throws Exception {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .identity("user_id")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  // Create a separate dataset to avoid conflicts with the above.
  Dataset<StandardEvent> identityDataset = repo.create(
      "ns", "test_identity", descriptor);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = identityDataset.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  assertContentEquals(Sets.newHashSet(sepEvent, novEvent),
      identityDataset.with("user_id", 0L));
}
@Test
public void testWriteAndRead() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("test")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schemaUri(USER_SCHEMA_URL)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  Assert.assertFalse("Dataset is not partitioned",
      ds.getDescriptor().isPartitioned());

  writeTestUsers(ds, 10);
  checkTestUsers(ds, 10);
}
@Before
public void setup() throws Exception {
  this.conf = new Configuration();
  this.fs = FileSystem.get(conf);
  this.repo = newRepo();
  this.strategy = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .hash("user_id", 2)
      .build();
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();
  this.testDataset = repo.create("ns", "test", testDescriptor);
}
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of events with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:event.avsc")
      .build();
  Datasets.create("dataset:hive:/tmp/data/default/events", descriptor);

  return 0;
}
@BeforeClass
public static void createRepositoryAndTestDatasets() throws Exception {
  hdfsAuth = getDFS().getUri().getAuthority();
  descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
}
@BeforeClass
public static void startHBase() throws Exception {
  new Loader().load();
  descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:TestGenericEntity.avsc")
      .build();
  HBaseTestUtils.getMiniCluster();
  String zkQuorum = HBaseTestUtils.getConf().get(HConstants.ZOOKEEPER_QUORUM);
  String zkClientPort = HBaseTestUtils.getConf().get(HConstants.ZOOKEEPER_CLIENT_PORT);
  zk = zkQuorum + ":" + zkClientPort; // OK since zkQuorum is a single host
  repositoryUri = new URI("repo:hbase:" + zk);
}
@BeforeClass
public static void createRepositoryAndTestDatasets() throws Exception {
  localFS = LocalFileSystem.getInstance();
  descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
}
@Test
public void testSchemaFromResourceURI() throws Exception {
  String uri = "resource:standard_event.avsc";
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri(uri)
      .build();

  Assert.assertNotNull(descriptor);
  Assert.assertNotNull(descriptor.getSchema());
}