@Test
public void testInputOutputFormatClassAtRuntime() throws Exception {
  // create a dataset with text input and output formats
  DatasetId datasetId = OTHER_NAMESPACE.dataset("testRuntimeFormats");
  dsFrameworkUtil.createInstance("fileSet", datasetId, FileSetProperties.builder()
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class)
    .build());

  // without passing anything in the arguments, the input/output format classes come from the dataset properties
  FileSet fs = dsFrameworkUtil.getInstance(datasetId);
  Assert.assertEquals(TextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName());

  // the input format can be overridden in the dataset runtime arguments
  fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of(
    FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName()));
  Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName());

  // both the input and the output format can be overridden in the dataset runtime arguments
  fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of(
    FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName(),
    FileSetProperties.OUTPUT_FORMAT, NullOutputFormat.class.getName()));
  Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(NullOutputFormat.class.getName(), fs.getOutputFormatClassName());
}
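// A minimal sketch of how the same override could be passed to a program run instead of to
// getInstance(): CDAP scopes dataset runtime arguments as "dataset.<name>.<key>". The scoping
// prefix and the argument map below are illustrative assumptions, not taken from the test above.
Map<String, String> programArgs = ImmutableMap.of(
  "dataset.testRuntimeFormats." + FileSetProperties.INPUT_FORMAT,
  CombineTextInputFormat.class.getName());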
.setBasePath(existingPath)
.setPossessExisting(true)
.build());
Assert.assertTrue(someFile.exists());
private DatasetProperties createProperties(OutputFormatProvider outputFormatProvider) {
  FileSetProperties.Builder fileProperties = SnapshotFileSet.getBaseProperties(config);
  addFileProperties(fileProperties);
  // take the output format class from the provider and copy each of its
  // configuration entries into the file set's output properties
  fileProperties.setOutputFormat(outputFormatProvider.getOutputFormatClassName());
  for (Map.Entry<String, String> formatProperty :
       outputFormatProvider.getOutputFormatConfiguration().entrySet()) {
    fileProperties.setOutputProperty(formatProperty.getKey(), formatProperty.getValue());
  }
  return fileProperties.build();
}
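// A hypothetical OutputFormatProvider that createProperties() could consume, for illustration
// only: it names an output format class and carries a single configuration entry.
OutputFormatProvider provider = new OutputFormatProvider() {
  @Override
  public String getOutputFormatClassName() {
    return TextOutputFormat.class.getName();
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    return ImmutableMap.of(TextOutputFormat.SEPERATOR, ",");
  }
};
DatasetProperties properties = createProperties(provider);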
.setBasePath(baseDir.getPath())
.setDataExternal(true)
.build());
spec = framework.getDatasetSpec(MY_DS);
Assert.assertNotNull(spec);
.setBasePath("perm/test/path") .setFilePermissions(fsPermissions) .setFileGroup(group) .build()); FileSet fs = dsFrameworkUtil.getInstance(datasetId);
.setBasePath("rtInput1") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); dsFramework.addInstance("fileSet", rtOutput1, FileSetProperties.builder() .setBasePath("rtOutput1") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); .setBasePath("rtInput2") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); runtimeArguments = Maps.newHashMap(); runtimeArguments.put(AppWithMapReduceUsingRuntimeDatasets.INPUT_NAME, "rtInput2");
PartitionedFileSetProperties.builder()
  .setExploreFormat("csv")
  .setExploreSchema("key int, value string")
  .setEnableExploreOnCreate(true)
  .build());
DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
.setEnableExploreOnCreate(true)
.setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
.setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
.setExploreSchema("record STRING")
.build());
.builder().setBasePath("some/path").setInputFormat(TextInputFormat.class).build()); DataSetManager<FileSet> bManager = getDataset("b"); String bFormat = bManager.get().getInputFormatClassName();
.setPartitioning(PARTITIONING)
.setEnableExploreOnCreate(true)
.setOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class)
.setOutputProperty(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.SEPERATOR, ",")
.setExploreFormat("csv")
.setExploreSchema("key string, value string")
.build());
DatasetProperties filesetAvroTableProps = FileSetProperties.builder()
  .setTableProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_PROPERTY, "avro-table-schema")
  .build();
assertDatasetSchema("avro-table-schema", filesetAvroTableProps);

filesetAvroTableProps = FileSetProperties.builder()
  .setTableProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_PROPERTY, "avro-table-schema")
  .add(DatasetProperties.SCHEMA, "avro-schema")
  .build();
assertDatasetSchema("avro-schema", filesetAvroTableProps);

DatasetProperties filesetAvroOutputProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_OUTPUT_KEY, "avro-output-schema")
  .build();
assertDatasetSchema("avro-output-schema", filesetAvroOutputProps);

filesetAvroOutputProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_OUTPUT_KEY, "avro-output-schema")
  .add(DatasetProperties.SCHEMA, "avro-schema")
  .build();
assertDatasetSchema("avro-schema", filesetAvroOutputProps);

DatasetProperties filesetParquetProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_PARQUET_SCHEMA_OUTPUT_KEY, "parquet-output-schema")
  .build();
assertDatasetSchema("parquet-output-schema", filesetParquetProps);

filesetParquetProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_PARQUET_SCHEMA_OUTPUT_KEY, "parquet-output-schema")
  .add(DatasetProperties.SCHEMA, "parquet-schema")
  .build();
assertDatasetSchema("parquet-schema", filesetParquetProps);
.setPartitioning(PARTITIONING_1)
.setBasePath(absolutePath.getPath())
.setDataExternal(true)
.build());
final PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsExternalInstance);
DatasetId inputDatasetId = inputNSMeta.getNamespaceId().dataset("input");
addDatasetInstance(FileSet.class.getName(), inputDatasetId,
                   FileSetProperties.builder().setInputFormat(TextInputFormat.class).build());
.setBasePath(absolutePath)
.setDataExternal(true)
.build());
.setOutputFormat(TextOutputFormat.class)
.setOutputProperty(TextOutputFormat.SEPERATOR, ",")
.build());

.setOutputFormat(TextOutputFormat.class)
.setOutputProperty(TextOutputFormat.SEPERATOR, ",")
.setEnableExploreOnCreate(true)
.setExploreFormat("text")
.setExploreFormatProperty("delimiter", "\n")
.setExploreSchema("record STRING")
.build());
static void performAdmin(RuntimeContext context) {
  Admin admin = context.getAdmin();
  Map<String, String> args = context.getRuntimeArguments();
  try {
    // if invoked with dropAll=true, clean up all datasets (a, b, c, d)
    if ("true".equals(args.get("dropAll"))) {
      for (String name : new String[]{"a", "b", "c", "d"}) {
        if (admin.datasetExists(name)) {
          admin.dropDataset(name);
        }
      }
    } else {
      // create a, update b with a new base path taken from the arguments, truncate c, drop d
      admin.createDataset("a", Table.class.getName(), DatasetProperties.EMPTY);
      String type = admin.getDatasetType("b");
      Assert.assertEquals(FileSet.class.getName(), type);
      DatasetProperties bProps = admin.getDatasetProperties("b");
      String base = bProps.getProperties().get("base.path");
      Assert.assertNotNull(base);
      String newBase = args.get("new.base.path");
      DatasetProperties newBProps = ((FileSetProperties.Builder) FileSetProperties.builder()
        .addAll(bProps.getProperties()))
        .setDataExternal(true)
        .setBasePath(newBase)
        .build();
      admin.updateDataset("b", newBProps);
      admin.truncateDataset("c");
      admin.dropDataset("d");
    }
  } catch (DatasetManagementException e) {
    Throwables.propagate(e);
  }
}
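// A hedged sketch of how performAdmin() might be driven: a worker that delegates to it with
// its own context (WorkerContext extends RuntimeContext). The worker class itself is an
// assumption for illustration, not part of the code above.
public static class AdminWorker extends AbstractWorker {
  @Override
  public void run() {
    performAdmin(getContext());
  }
}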
/**
 * Configure a file set to use the ORC file format with a given schema. The schema is parsed,
 * validated, and converted into a Hive schema compatible with the ORC format. The file set is
 * configured to use ORC input and output formats, and is also configured for Explore to use
 * Hive. The schema is added to the file set properties in all the required ways:
 * <ul>
 *   <li>As a top-level dataset property;</li>
 *   <li>As the schema for the input and output format;</li>
 *   <li>As the schema to be used by the ORC serde (which is used by Hive).</li>
 * </ul>
 *
 * @param configuredSchema the original schema configured for the table
 * @param properties a builder for the file set properties
 */
public static void configureORCFileSet(String configuredSchema, FileSetProperties.Builder properties) {
  // TODO: test whether complex cases work with a lower-cased schema only
  String lowerCaseSchema = configuredSchema.toLowerCase();
  String hiveSchema = parseHiveSchema(lowerCaseSchema, configuredSchema);
  // strip the enclosing delimiters (first and last character) from the parsed Hive schema
  hiveSchema = hiveSchema.substring(1, hiveSchema.length() - 1);
  properties.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
    .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
    .setExploreSchema(hiveSchema)
    .setEnableExploreOnCreate(true)
    .add(DatasetProperties.SCHEMA, configuredSchema)
    .build();
}
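// A minimal usage sketch, assuming this is called from an application's configure() method;
// the dataset name, base path, and schema string are illustrative assumptions.
FileSetProperties.Builder orcProps = FileSetProperties.builder()
  .setBasePath("orc/records");
configureORCFileSet("record STRING", orcProps);
createDataset("orcFiles", FileSet.class, orcProps.build());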
@Override
public void configure() {
  try {
    createDataset("fs", FileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(MyTextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    createDataset("pfs", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addStringField("x").build())
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    createDataset("tpfs", TimePartitionedFileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    createDataset("myfs", MyFileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    addSpark(new FileCountSparkProgram());
    addSpark(new ScalaFileCountSparkProgram());
  } catch (Throwable t) {
    throw Throwables.propagate(t);
  }
}
@Test
public void testFixProperties() throws DatasetManagementException, UnsupportedTypeException {
  testFix("fileSet",
          FileSetProperties.builder().setBasePath("/tmp/nn").setDataExternal(true).build());
  testFix(FileSet.class.getName(),
          FileSetProperties.builder().setEnableExploreOnCreate(true).setExploreFormat("csv").build());
  testFix("timePartitionedFileSet",
          FileSetProperties.builder().setBasePath("relative").build());
  testFix(TimePartitionedFileSet.class.getName(),
          FileSetProperties.builder().setBasePath("relative").add("custom", "value").build());
  testFix("objectMappedTable",
          ObjectMappedTableProperties.builder().setType(TestObject.class)
            .setRowKeyExploreName("x").setRowKeyExploreType(Schema.Type.STRING)
            .setConflictDetection(ConflictDetection.NONE).build());
  testFix(ObjectMappedTable.class.getName(),
          ObjectMappedTableProperties.builder().setType(TestObject.class)
            .setRowKeyExploreName("x").setRowKeyExploreType(Schema.Type.STRING)
            .setConflictDetection(ConflictDetection.NONE).build());
  testFix("lineageDataset", DatasetProperties.EMPTY);
  testFix(LineageDataset.class.getName(), TableProperties.builder().setTTL(1000).build());
  testFix(UsageDataset.class.getSimpleName(), DatasetProperties.EMPTY);
  testFix("table", TableProperties.builder().setColumnFamily("fam").build());
  testFix("indexedTable",
          DatasetProperties.builder().add(IndexedTable.INDEX_COLUMNS_CONF_KEY, "a,c").build());
}
/**
 * Configure a file set to use the Avro file format with a given schema. The schema is parsed
 * as an Avro schema, validated, and converted into a Hive schema. The file set is configured
 * to use the Avro key input and output format, and is also configured for Explore to use Avro.
 * The schema is added to the file set properties in all the required ways:
 * <ul>
 *   <li>As a top-level dataset property;</li>
 *   <li>As the schema for the input and output format;</li>
 *   <li>As the schema of the Hive table;</li>
 *   <li>As the schema to be used by the Avro serde (which is used by Hive).</li>
 * </ul>
 *
 * @param configuredSchema the original schema configured for the table
 * @param properties a builder for the file set properties
 */
public static void configureAvroFileSet(String configuredSchema, FileSetProperties.Builder properties) {
  properties
    .setEnableExploreOnCreate(true)
    .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
    .setTableProperty("avro.schema.literal", configuredSchema)
    .add(DatasetProperties.SCHEMA, configuredSchema);
}
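// A minimal usage sketch, assuming this is called from an application's configure() method;
// the dataset name, base path, and Avro record schema are illustrative assumptions.
String schema = "{\"type\":\"record\",\"name\":\"rec\",\"fields\":"
  + "[{\"name\":\"body\",\"type\":\"string\"}]}";
FileSetProperties.Builder avroProps = FileSetProperties.builder()
  .setBasePath("avro/records");
configureAvroFileSet(schema, avroProps);
createDataset("avroFiles", FileSet.class, avroProps.build());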