private SavePolicy(Context context) {
  String uri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  Preconditions.checkArgument(uri != null,
      "Must set " + CONFIG_KITE_ERROR_DATASET_URI + " when " +
      CONFIG_FAILURE_POLICY + "=save");

  if (Datasets.exists(uri)) {
    dataset = Datasets.load(uri, AvroFlumeEvent.class);
  } else {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(uri, descriptor, AvroFlumeEvent.class);
  }

  nEventsHandled = 0;
}
@Override
public int run() throws IOException {
  Preconditions.checkArgument(classNames != null && !classNames.isEmpty(),
      "Java class name is required");
  Preconditions.checkArgument(classNames.size() == 1,
      "Only one java class name can be given");

  ClassLoader loader = loaderFor(jars, libs);
  String className = classNames.get(0);
  Class<?> recordClass;
  try {
    recordClass = loader.loadClass(className);
  } catch (ClassNotFoundException e) {
    throw new IllegalArgumentException("Cannot find class: " + className, e);
  }

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(recordClass)
      .build();

  String schema = descriptor.getSchema().toString(!minimize);
  output(schema, console, outputPath);

  return 0;
}
@Before
public void setUp() throws IOException {
  this.testSchema = DatasetTestUtilities.USER_SCHEMA;

  Configuration tempConfig = new Configuration();
  tempConfig.setLong("fs.trash.interval", 1);
  this.conf = (distributed ? MiniDFSTest.getConfiguration() : tempConfig);
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = fileSystem.makeQualified(
      new Path(Files.createTempDir().getAbsolutePath()));
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schema(testSchema)
      .build();
  this.testProvider = newProvider(conf);
  this.repo = newRepo(testProvider);
}
@Test
public void testTSV() {
  final DatasetDescriptor desc = new DatasetDescriptor.Builder()
      .property("kite.csv.delimiter", "\t")
      .property("kite.csv.lines-to-skip", "1")
      .schema(STRINGS)
      .build();

  final CSVFileReader<GenericData.Record> reader =
      new CSVFileReader<GenericData.Record>(localfs, tsvFile, desc,
@Test
public void testReadySignalUpdatesModifiedTime() {
  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()),
          "ns", "name"))
      .build();

  Assert.assertFalse("Dataset should not be ready before being signaled",
      ds.isReady());

  // The modified time depends on the filesystem and may only be granular to
  // the second: signal and check until the modified time is after the current
  // time, or until enough time has passed that the signal should have been
  // distinguishable.
  long signaledTime = 0;
  long currentTime = System.currentTimeMillis();
  while (currentTime >= signaledTime &&
      (System.currentTimeMillis() - currentTime) <= 2000) {
    ds.signalReady();
    signaledTime = ds.getLastModified();
  }

  Assert.assertTrue("Dataset should have been signaled as ready", ds.isReady());
  Assert.assertTrue("Signal should update the modified time",
      signaledTime > currentTime);
  Assert.assertFalse("Only the dataset should have been signaled",
      ((Signalable) ds.with("username", "bob")).isReady());
}
@Test
public void testPartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);

  Path datasetPath = new Path(folder.toURI());
  Path partitionPath = new Path(datasetPath, "id_hash=1");

  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  // check the discovered descriptor, not the one built above
  Assert.assertFalse("Should not flag at mixed depth",
      actual.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      partitionPath.toUri(), actual.getLocation());
  Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned", actual.isPartitioned());
}
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of products with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(Product.class)
      .build();
  Dataset<Product> products = Datasets.create(
      "dataset:hdfs:/tmp/data/products", descriptor, Product.class);

  // Get a writer for the dataset and write some products to it
  DatasetWriter<Product> writer = null;
  try {
    writer = products.newWriter();
    int i = 0;
    for (String name : names) {
      Product product = new Product();
      product.setName(name);
      product.setId(i++);
      writer.write(product);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
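// A minimal read-back sketch (not part of the original example): load the
// products dataset written above by URI and iterate it with a DatasetReader.
// Datasets.load and newReader() are standard Kite calls; the URI mirrors the
// one used in run(), and this helper method itself is an assumption.
public void printProducts() throws Exception {
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    while (reader.hasNext()) {
      Product product = reader.next();
      System.out.println(product.getId() + ": " + product.getName());
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
}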
/**
 * The constructor registers the schemas with the meta store table in HBase
 * and creates the tables required to run.
 */
public UserProfileDatasetExample() throws Exception {
  Configuration conf = HBaseConfiguration.create();
  HBaseAdmin admin = new HBaseAdmin(conf);

  // Delete the table if it exists so we start fresh.
  if (admin.tableExists("kite_example_user_profiles")) {
    admin.disableTable("kite_example_user_profiles");
    admin.deleteTable("kite_example_user_profiles");
  }

  HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
      .configuration(conf).build();

  // TODO: change to use namespace (CDK-140)
  DatasetDescriptor userProfileDatasetDescriptor =
      new DatasetDescriptor.Builder().schema(UserProfileModel2.SCHEMA$).build();
  userProfileDataset = repo.create("default",
      "kite_example_user_profiles.UserProfileModel2",
      userProfileDatasetDescriptor);

  DatasetDescriptor userActionsDatasetDescriptor =
      new DatasetDescriptor.Builder().schema(UserActionsModel2.SCHEMA$).build();
  userActionsDataset = repo.create("default",
      "kite_example_user_profiles.UserActionsModel2",
      userActionsDatasetDescriptor);

  DatasetDescriptor userProfileActionsDatasetDescriptor =
      new DatasetDescriptor.Builder().schema(UserProfileActionsModel2.SCHEMA$).build();
  userProfileActionsDataset = repo.create("default",
      "kite_example_user_profiles.UserProfileActionsProtocol2",
      userProfileActionsDatasetDescriptor);
}
@Test
public void testAllowedPartitionSchemaCombinations() {
  Compatibility.checkDescriptor(
      new DatasetDescriptor.Builder()
          .schema(schema)
          .partitionStrategy(new PartitionStrategy.Builder()
              .year("timestamp")
              .month("timestamp")
              .day("timestamp")
              .hour("timestamp")
              .minute("timestamp")
              .identity("message", "message_copy")
              .identity("timestamp", "ts")
              .identity("number", "num")
              .hash("message", 48)
              .hash("timestamp", 48)
              .hash("number", 48)
              .hash("payload", 48)
              .hash("float", 48)
              .hash("double", 48)
              .hash("bool", 48)
              .range("number", 5, 10, 15, 20)
              .range("message", "m", "z", "M", "Z")
              .build())
          .build());
}
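// For contrast, a hedged sketch of a combination that should be rejected:
// partitioning on a field that is not in the schema. This test is not in the
// original; it assumes descriptor validation throws ValidationException for
// unknown source fields, the same exception type other negative tests in
// this section expect.
@Test
public void testRejectsUnknownPartitionSourceField() {
  TestHelpers.assertThrows("Should reject partitioning on a missing field",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          Compatibility.checkDescriptor(new DatasetDescriptor.Builder()
              .schema(schema)
              .partitionStrategy(new PartitionStrategy.Builder()
                  .hash("not_a_field", 16) // hypothetical missing field
                  .build())
              .build());
        }
      });
}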
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentPartitionStrategies()
    throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .location(testDirectory)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("username", 2).build())
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .location(testDirectory)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("username", 2).hash("email", 3).build())
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
    .endRecord();

DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schema(event)
    .partitionStrategy(new PartitionStrategy.Builder()
        .provided("v", "int")
        .day("created_at")
        .build())
    .build();

DatasetDescriptor update = new DatasetDescriptor.Builder(ds.getDescriptor())
    .partitionStrategy(new PartitionStrategy.Builder()
        .identity("version", "v")
        .day("created_at")
        .build())
    .build();
@Test
public void testReflectProjectionAsType() throws IOException {
  Dataset<StandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(),
      StandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.asType(ReflectStandardEvent.class).newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  final View<ReflectSmallEvent> smallEvents =
      original.asType(ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent),
      new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, smallEvents);

  TestHelpers.assertThrows("Should not be able to write small events",
      IncompatibleSchemaException.class, new Runnable() {
        @Override
        public void run() {
          smallEvents.newWriter();
        }
      });
}
@Test
public void testKeyMappingSourceMustBeIdentityPartitioned() {
  // the mapping is accepted when the key source field is identity partitioned
  Assert.assertNotNull(new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 16)
          .identity("id")
          .build())
      .columnMapping(new ColumnMapping.Builder()
          .key("id")
          .build())
      .build());

  TestHelpers.assertThrows("Should reject mapping source not id partitioned",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(new PartitionStrategy.Builder()
                  .hash("id", 16)
                  .build())
              .columnMapping(new ColumnMapping.Builder()
                  .key("id")
                  .build())
              .build();
        }
      });
}
@Test
@SuppressWarnings("deprecation")
public void testWriteToSubpartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2).hash("email", 3).build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .partitionStrategy(partitionStrategy)
          .build())
      .type(Record.class)
      .build();

  PartitionKey key = new PartitionKey(1);
  FileSystemDataset<Record> userPartition =
      (FileSystemDataset<Record>) ds.getPartition(key, true);
  Assert.assertEquals(key, userPartition.getPartitionKey());

  writeTestUsers(userPartition, 1);
  Assert.assertTrue("Partitioned directory exists",
      fileSystem.exists(new Path(testDirectory, "username_part=1/email_hash=2")));
  Assert.assertEquals(1, readTestUsersInPartition(ds, key, "email_hash"));
}
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.AVRO)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
@Before
public void createTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");
  Datasets.delete("dataset:file:/tmp/datasets/temporary");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      descriptor, TestRecord.class);

  descriptor = new DatasetDescriptor.Builder(descriptor)
      .property("kite.writer.cache-size", "20")
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      descriptor, TestRecord.class);

  // create a second dataset with the same partitioning for replacement parts
  temporary = Datasets.create("dataset:file:/tmp/datasets/temporary",
      descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
  writeTestRecords(temporary);
}
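// A possible shape for the writeTestRecords helper referenced above. The
// helper is not shown in the original, so this is an assumed implementation;
// TestRecord's setters and the record count are assumptions too. The
// writer-in-finally pattern mirrors the other snippets in this section.
private void writeTestRecords(View<TestRecord> view) {
  DatasetWriter<TestRecord> writer = null;
  try {
    writer = view.newWriter();
    for (int i = 0; i < 10; i += 1) {
      TestRecord record = new TestRecord();
      record.setId(i);              // assumed setter
      record.setData("data-" + i);  // assumed setter
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
}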
@Test
public void testBackwardCompatibleMappingToPartitionStrategy() {
  Schema schema = new Schema.Parser().parse("{" +
      " \"type\": \"record\"," +
      " \"name\": \"User\"," +
      " \"fields\": [" +
      "   {\"name\": \"id\", \"type\": \"long\", \"mapping\":" +
      "     {\"type\": \"key\", \"value\": \"1\"} }," +
      "   {\"name\": \"username\", \"type\": \"string\", \"mapping\":" +
      "     {\"type\": \"key\", \"value\": \"0\"} }," +
      "   {\"name\": \"real_name\", \"type\": \"string\", \"mapping\":" +
      "     {\"type\": \"column\", \"value\": \"m:name\"} }" +
      " ]" +
      "}");

  PartitionStrategy expected = new PartitionStrategy.Builder()
      .identity("username")
      .identity("id")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(schema)
      .build();
  Assert.assertEquals(expected, descriptor.getPartitionStrategy());
}
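// For comparison, a sketch of declaring the same layout explicitly rather
// than through the legacy in-schema "mapping" annotations. The builder calls
// mirror APIs used elsewhere in this section; the exact equivalence to the
// parsed mapping above is an assumption.
DatasetDescriptor explicit = new DatasetDescriptor.Builder()
    .schema(schema)
    .partitionStrategy(new PartitionStrategy.Builder()
        .identity("username")
        .identity("id")
        .build())
    .columnMapping(new ColumnMapping.Builder()
        .key("username")
        .key("id")
        .column("real_name", "m", "name")
        .build())
    .build();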
@Test
public void testUpdateFailsWithIncompatibleSchemaChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder()
          .schema(testSchema).build());

  Assert.assertEquals("Dataset name is propagated", NAME, dataset.getName());
  Assert.assertEquals("Dataset schema is propagated", testSchema,
      dataset.getDescriptor().getSchema());

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .requiredString("email")
      .requiredString("favoriteColor") // incompatible - no default
      .endRecord();

  try {
    repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
        dataset.getDescriptor()).schema(testSchemaV2).build());
    Assert.fail("Should fail due to incompatible update");
  } catch (ValidationException e) {
    // expected
  }

  dataset = repo.load(NAMESPACE, NAME);
  Assert.assertEquals("Dataset schema is unchanged", testSchema,
      dataset.getDescriptor().getSchema());
}
@Test
public void testReflectProjectionLoad() throws IOException {
  Dataset<ReflectStandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(ReflectStandardEvent.class)
          .build(),
      ReflectStandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  View<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent),
      new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}