co.cask.cdap.api.dataset.lib.PartitionedFileSet java code examples

/**
 * Removes every partition whose process state is {@code ProcessState.DISCARDED} from the given working set and
 * returns the details of the removed partitions.
 *
 * @param workingSet the working set whose partition collection is scanned and modified in place
 * @return the partition details looked up from the partitioned file set, one per removed entry, in iteration order
 */
protected List<PartitionDetail> removeDiscardedPartitions(ConsumerWorkingSet workingSet) {
 List<PartitionDetail> discarded = new ArrayList<>();
 for (Iterator<ConsumablePartition> cursor = workingSet.getPartitions().iterator(); cursor.hasNext(); ) {
  ConsumablePartition candidate = cursor.next();
  if (candidate.getProcessState() != ProcessState.DISCARDED) {
   continue;
  }
  // the lookup result is added as-is, without a null check
  discarded.add(getPartitionedFileSet().getPartition(candidate.getPartitionKey()));
  // remove through the iterator so the underlying collection can be modified while it is traversed
  cursor.remove();
 }
 return discarded;
}

/**
 * Reads the numeric value stored in the state file ({@code STATE_FILE_NAME}) under the base location of the
 * embedded file set. Presumably this value identifies the latest snapshot - confirm against the writer of the file.
 *
 * @return the parsed value, or {@code null} if the state file does not exist
 * @throws IOException if the state file cannot be checked or read
 * @throws NumberFormatException if the file content, after trimming whitespace, is not a parsable long
 */
private Long getLatestSnapshot() throws IOException {
 Location stateFile = files.getEmbeddedFileSet().getBaseLocation().append(STATE_FILE_NAME);
 if (!stateFile.exists()) {
  return null;
 }
 try (InputStreamReader reader = new InputStreamReader(stateFile.getInputStream(), Charsets.UTF_8)) {
  // Long.valueOf does not accept surrounding whitespace, so a state file that ends with a newline
  // (e.g. after being edited by hand) would otherwise fail to parse
  String val = CharStreams.toString(reader).trim();
  return Long.valueOf(val);
 }
}

 /**
  * Creates a partition, checks that both the partition and its location exist, then drops the partition and
  * checks that both are gone.
  */
 @Override
 public void apply() throws Exception {
  Location outputLocation = createPartition(pfs, PARTITION_KEY, "file");
  // publish the location to the enclosing scope through the reference
  outputLocationRef.set(outputLocation);
  Assert.assertTrue(outputLocation.exists());
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(pfs.getPartition(PARTITION_KEY).getLocation().exists());
  pfs.dropPartition(PARTITION_KEY);
  // after the drop, neither the location nor the partition should be present
  Assert.assertFalse(outputLocation.exists());
  Assert.assertNull(pfs.getPartition(PARTITION_KEY));
  // second drop of the same key: presumably checks that dropping an absent partition does not throw
  pfs.dropPartition(PARTITION_KEY);
 }
});

/**
 * Drops every partition matched by a range condition on {@code SNAPSHOT_FIELD} that has no lower bound and uses
 * {@code upperLimit} as its upper bound. Nothing happens unless {@code 0 < upperLimit < Long.MAX_VALUE}.
 *
 * @param upperLimit the upper bound handed to the range condition
 * @throws IOException declared by the signature
 */
public void deleteMatchingPartitionsByTime(long upperLimit) throws IOException {
 if (upperLimit <= 0 || upperLimit >= Long.MAX_VALUE) {
  // limit is outside the accepted range: leave all partitions untouched
  return;
 }
 PartitionFilter belowLimit =
  PartitionFilter.builder().addRangeCondition(SNAPSHOT_FIELD, null, upperLimit).build();
 for (PartitionDetail matched : files.getPartitions(belowLimit)) {
  files.dropPartition(matched.getPartitionKey());
 }
}

 /**
  * Replaces the content of the dataset: drops every partition it currently holds, then adds one partition for each
  * key in {@code partitionKeys2}.
  */
 @Override
 public void apply() throws Exception {
  // drop all existing partitions (2 of which are not consumed)
  for (PartitionDetail partitionDetail : dataset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
   dataset.dropPartition(partitionDetail.getPartitionKey());
  }
  // add 5 new ones
  for (PartitionKey partitionKey : partitionKeys2) {
   dataset.getPartitionOutput(partitionKey).addPartition();
  }
 }
});

 /**
  * Exercises an external partitioned file set: requesting a partition output must be rejected, an existing file can
  * be registered as a partition, and dropping that partition must leave the file in place.
  */
 @Override
 public void apply() throws Exception {
  Assert.assertTrue(pfsBaseLocation.exists());
  // attempt to write a new partition - should fail
  try {
   pfs.getPartitionOutput(PARTITION_KEY);
   Assert.fail("External partitioned file set should not allow writing files");
  } catch (UnsupportedOperationException e) {
   // expected
  }
  // create an external file and add it as a partition
  File someFile = new File(absolutePath, "some.file");
  // opening and immediately closing the stream creates an empty file
  OutputStream out = new FileOutputStream(someFile);
  out.close();
  Assert.assertTrue(someFile.exists());
  pfs.addPartition(PARTITION_KEY, "some.file");
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(pfs.getPartition(PARTITION_KEY).getLocation().exists());
  // now drop the partition and validate the file is still there
  pfs.dropPartition(PARTITION_KEY);
  Assert.assertNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(someFile.exists());
 }
});

 /**
  * Adds one partition to the dataset for each key in {@code partitionKeys1}.
  */
 @Override
 public void apply() throws Exception {
  for (PartitionKey partitionKey : partitionKeys1) {
   dataset.getPartitionOutput(partitionKey).addPartition();
  }
 }
});

 /**
  * Adds a partition and checks its metadata timestamps: the creation time must equal the last modification time
  * and must lie within the wall-clock interval measured around the {@code addPartition()} call.
  */
 @Override
 public void apply() throws Exception {
  PartitionOutput partitionOutput = dataset.getPartitionOutput(PARTITION_KEY);
  // bracket the add with two clock readings so the recorded creation time can be bounds-checked below
  long beforeTime = System.currentTimeMillis();
  partitionOutput.addPartition();
  long afterTime = System.currentTimeMillis();
  PartitionDetail partitionDetail = dataset.getPartition(PARTITION_KEY);
  Assert.assertNotNull(partitionDetail);
  long creationTime = partitionDetail.getMetadata().getCreationTime();
  long lastModificationTime = partitionDetail.getMetadata().lastModificationTime();
  // lastModificationTime should be equal to creationTime for a partition that has not been appended to
  Assert.assertEquals(creationTime, lastModificationTime);
  Assert.assertTrue(creationTime >= beforeTime && creationTime <= afterTime);
 }
});

@Override
public void apply() throws Exception {
 PartitionOutput partitionOutput = dataset.getPartitionOutput(PARTITION_KEY);
 ImmutableMap<String, String> originalEntries = ImmutableMap.of("key1", "value1", "key2", "value2");
 partitionOutput.setMetadata(originalEntries);
 dataset.addMetadata(PARTITION_KEY, updatedMetadata);
 PartitionDetail partitionDetail = dataset.getPartition(PARTITION_KEY);
 Assert.assertNotNull(partitionDetail);
 dataset.setMetadata(PARTITION_KEY, Collections.singletonMap("key3", "value4"));
 partitionDetail = dataset.getPartition(PARTITION_KEY);
 Assert.assertNotNull(partitionDetail);
 Assert.assertEquals(ImmutableMap.of("key1", "value1", "key2", "value2", "key3", "value4"),
  dataset.addMetadata(PARTITION_KEY, "key2", "value3");
  Assert.fail("Expected not to be able to update an existing metadata entry");
 } catch (DataSetException expected) {
 dataset.removeMetadata(PARTITION_KEY, ImmutableSet.of("key2", "key3", "key4"));
 partitionDetail = dataset.getPartition(PARTITION_KEY);
 Assert.assertNotNull(partitionDetail);
 Assert.assertEquals(ImmutableMap.of("key1", "value1"), partitionDetail.getMetadata().asMap());
   .addStringField("s", "nonexistent")
   .build();
  dataset.addMetadata(nonexistentPartitionKey, "key2", "value3");

DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
PartitionedFileSet pfs = pfsManager.get();
PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
Location location = partitionOutput.getLocation();
prepareFileInput(location);
PartitionDetail partition = pfs.getPartition(outputKey);
Assert.assertNotNull(partition);
validateFileOutput(partition.getLocation());
pfs.dropPartition(partitionOutput.getPartitionKey());
pfs.dropPartition(partition.getPartitionKey());

/**
 * Verifies that after {@code onFailure()} is invoked and the surrounding transaction is aborted, neither the
 * partition nor the output directory created during that transaction exists.
 */
@Test
public void testRollbackOnJobFailure() throws Exception {
 // tests the logic of #onFailure method
 Map<String, String> args = new HashMap<>();
 // runtime arguments: a custom output path and the key of the partition to be written
 FileSetArguments.setOutputPath(args, "custom/output/path");
 PartitionedFileSetArguments.setOutputPartitionKey(args, PARTITION_KEY);
 PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsInstance, args);
 TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
 txContext.start();
 Location outputLocation = pfs.getEmbeddedFileSet().getOutputLocation();
 Assert.assertFalse(outputLocation.exists());
 // create the output directory by hand, standing in for output a job would have produced
 outputLocation.mkdirs();
 Assert.assertTrue(outputLocation.exists());
 ((PartitionedFileSetDataset) pfs).onFailure();
 txContext.abort();
 // because the previous transaction aborted, the partition as well as the directory for it will not exist
 txContext.start();
 Assert.assertNull(pfs.getPartition(PARTITION_KEY));
 Assert.assertFalse(outputLocation.exists());
 txContext.finish();
}

final PartitionOutput output1 = pfs.getPartitionOutput(KEY_1);
location1 = output1.getLocation();
try (Writer writer = new OutputStreamWriter(location1.append("file").getOutputStream())) {
location2 = pfs.getEmbeddedFileSet().getLocation(path2);
try (Writer writer = new OutputStreamWriter(location2.append("file").getOutputStream())) {
 writer.write("2,2\n");
pfs.addPartition(KEY_2, path2);
final PartitionOutput output3 = pfs.getPartitionOutput(KEY_3);
location3 = output3.getLocation();
String basePath = pfs.getEmbeddedFileSet().getBaseLocation().toURI().getPath();
String absPath3 = location3.toURI().getPath();
Assert.assertTrue(absPath3.startsWith(basePath));

@Override
public void apply() throws Exception {
 try {
  pfs.getPartitionOutput(
   PartitionKey.builder().addField("i", 1).addField("l", 2L).build());
  Assert.fail("should have thrown exception due to missing field");
  pfs.addPartition(
   PartitionKey.builder().addField("i", 1).addField("l", "2").addField("s", "a").build(),
   "some/location");
  pfs.addPartition(
   PartitionKey.builder().addField("i", 1).addField("l", 2L).addField("s", "a").addField("x", "x").build(),
   "some/location", ImmutableMap.of("a", "b"));
 pfs.addPartition(
  PartitionKey.builder().addField("i", 1).addField("l", 2L).addField("s", "a").build(),
  "some/location", ImmutableMap.of("a", "b"));
 try {
  pfs.addMetadata(
   PartitionKey.builder().addField("i", 1).addField("l", 2L).addField("s", "a").addField("x", "x").build(),
   ImmutableMap.of("abc", "xyz"));
  pfs.dropPartition(PartitionKey.builder().addField("i", 1).addField("l", 2L).addField("s", 0).build());
  Assert.fail("should have thrown exception due to incompatible field");
 } catch (IllegalArgumentException e) {

Partitioning partitioning = null;
if (dataset instanceof PartitionedFileSet) {
 partitioning = ((PartitionedFileSet) dataset).getPartitioning();
 baseLocation = ((PartitionedFileSet) dataset).getEmbeddedFileSet().getBaseLocation();
} else {
 baseLocation = ((FileSet) dataset).getBaseLocation();

Location loc = pfs.get().getEmbeddedFileSet().getLocation("some/path");
OutputStream os = loc.append("part1").getOutputStream();
try (Writer writer = new OutputStreamWriter(os)) {
 writer.write("1,x\n");
pfs.get().addPartition(PartitionKey.builder().addStringField("x", "1").build(), "some/path");
pfs.flush();

 /**
  * Queries partitions with a value condition on the field "me-not-there" and expects an empty result rather than
  * an exception.
  */
 @Override
 public void apply() throws Exception {
  // this should succeed without error (but log a warning)
  Assert.assertEquals(Collections.EMPTY_SET,
            pfs.getPartitions(PartitionFilter.builder().addValueCondition("me-not-there", 42).build()));
 }
});

 /**
  * Drops the partition identified by {@code partitionKey1} from the dataset.
  */
 @Override
 public void apply() throws Exception {
  dataset.dropPartition(partitionKey1);
 }
});

PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
Location partitionLocation = partitionOutput.getLocation();
int numInputFiles = 100;
cleanRecords.concatenatePartition(outputPartition).get();

/**
 * Builds the wrapper from the task's Hadoop configuration: instantiates the configured {@link DynamicPartitioner},
 * reads the partition write option, resolves the task context and output dataset, and determines the output file
 * name and the name of the delegate file output format class.
 *
 * @param job the task attempt context whose configuration supplies all settings read here
 */
DynamicPartitionerWriterWrapper(TaskAttemptContext job) {
 this.job = job;
 Configuration configuration = job.getConfiguration();
 // NOTE(review): the default passed to getClass is null, so an unset partitioner class name presumably yields
 // null here and fails in TypeToken.of below - confirm the setting is always present
 Class<? extends DynamicPartitioner> partitionerClass = configuration
  .getClass(PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME, null, DynamicPartitioner.class);
 this.dynamicPartitioner = new InstantiatorFactory(false).get(TypeToken.of(partitionerClass)).create();
 // NOTE(review): valueOf fails if the write option is absent from the configuration - confirm it is always set
 this.partitionWriteOption =
  DynamicPartitioner.PartitionWriteOption.valueOf(
   configuration.get(PartitionedFileSetArguments.DYNAMIC_PARTITIONER_WRITE_OPTION));
 MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
 this.taskContext = classLoader.getTaskContextProvider().get(job);
 // name the output file 'part-<RunId>-m-00000' instead of 'part-m-00000'
 String outputName = DynamicPartitioningOutputFormat.getOutputName(job);
 // the run id suffix is only added for CREATE_OR_APPEND; presumably it keeps different runs that write into the
 // same partition from using the same file name - confirm
 if (partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_APPEND) {
  outputName = outputName + "-" + taskContext.getProgramRunId().getRun();
 }
 this.outputName = outputName;
 String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
 this.outputDataset = taskContext.getDataset(outputDatasetName);
 this.partitioning = outputDataset.getPartitioning();
 // the partitioner is initialized only after the task context has been resolved above
 this.dynamicPartitioner.initialize(taskContext);
 this.fileOutputFormatName = job.getConfiguration()
  .getClass(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME, null, FileOutputFormat.class)
  .getName();
}

 /**
  * Registers the path {@code path3} as the partition for {@code KEY_4}.
  */
 @Override
 public void run() {
  pfs.addPartition(KEY_4, path3);
 }
});

Javadoc

Represents a dataset that is split into partitions that can be uniquely addressed by partition keys along multiple dimensions. Each partition is a path in a file set, with the partition key attached as metadata. Note that the partitioning of the dataset is fixed, that is, all operations that accept a partition key as a parameter require that the key has exactly the same schema as the partitioning. This dataset can be made available for querying with SQL (explore). This is enabled through dataset properties when the dataset is created. See FileSetProperties for details. If it is enabled for explore, a Hive external table will be created when the dataset is created. The Hive table is partitioned by the same keys as this dataset.

Most used methods

getPartition
getEmbeddedFileSet
dropPartition
getPartitionOutput
Return a partition output for a specific partition key, in preparation for creating a new partition.
getPartitions
addPartition
Add a partition for a given partition key, stored at a given path (relative to the file set's base p
consumePartitions
getPartitioning
addMetadata
Adds a set of new metadata entries for a particular partition. Note that existing entries cannot be
concatenatePartition
Asynchronous operation to concatenate the partition in Hive. Note that Hive only supports certain fo
removeMetadata
Removes a set of metadata entries for a particular partition. If any metadata key does not exist, no
setMetadata
Sets metadata entries for a particular partition. If the metadata entry key does not already exist,

Popular in Java

Updating database using SQL prepared statement
setScale (BigDecimal)
getSharedPreferences (Context)
setRequestProperty (URLConnection)
ObjectMapper (com.fasterxml.jackson.databind)
ObjectMapper provides functionality for reading and writing JSON, either to and from basic POJOs (Pl
Path (java.nio.file)
GregorianCalendar (java.util)
GregorianCalendar is a concrete subclass of Calendar and provides the standard calendar used by most
HashSet (java.util)
HashSet is an implementation of a Set. All optional operations (adding and removing) are supported.
List (java.util)
An ordered collection (also known as a sequence). The user of this interface has precise control ove
ReentrantLock (java.util.concurrent.locks)
A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
Github Copilot alternatives

How to use PartitionedFileSet in co.cask.cdap.api.dataset.lib

Best Java code snippets using co.cask.cdap.api.dataset.lib.PartitionedFileSet (Showing top 20 results out of 315)

How to use
PartitionedFileSet
in
co.cask.cdap.api.dataset.lib