/**
 * Get the values associated with the specified keys.
 *
 * @param keys keys whose values should be looked up
 * @param tableManager manager for the table
 * @return map from each key to its value
 * @throws Exception if the dataset cannot be read
 */
public static Map<String, String> getValues(Set<String> keys,
                                             DataSetManager<KeyValueTable> tableManager) throws Exception {
  tableManager.flush();
  KeyValueTable table = tableManager.get();
  Map<String, String> values = new HashMap<>();
  for (String key : keys) {
    values.put(key, Bytes.toString(table.read(key)));
  }
  return values;
}
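// A hedged usage sketch, not from the original source: it assumes a test class that extends CDAP's
// TestBase, a KeyValueTable dataset named "results" populated by the program under test, and the
// expected values shown. Only getValues above and the getDataset helper are taken as given.
@Test
public void exampleGetValuesUsage() throws Exception {
  DataSetManager<KeyValueTable> resultsManager = getDataset("results");
  Map<String, String> values = getValues(ImmutableSet.of("alice", "bob"), resultsManager);
  Assert.assertEquals("2", values.get("alice"));
  Assert.assertEquals("3", values.get("bob"));
}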
/**
 * Read the value for the specified rowKey and columnKey.
 */
public static String readOutput(DataSetManager<Table> tableManager, String rowKey, String columnKey)
  throws Exception {
  Table table = tableManager.get();
  return Bytes.toString(table.get(Bytes.toBytes(rowKey), Bytes.toBytes(columnKey)));
}
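// A hedged usage sketch, not from the original source: it assumes a test class extending CDAP's
// TestBase, a Table dataset named "output" written by the program under test, and the row key,
// column key, and expected value shown. Only readOutput above and getDataset are taken as given.
@Test
public void exampleReadOutputUsage() throws Exception {
  DataSetManager<Table> outputManager = getDataset("output");
  Assert.assertEquals("42", readOutput(outputManager, "row1", "count"));
}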
@Override
public Long call() throws Exception {
  tsTableManager.flush();
  return getCounts("Message", tsTable);
}
}, 1, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
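// The snippet above is only the Callable body of a polling call; the trailing
// "1, TimeUnit.MINUTES, 1, TimeUnit.SECONDS" reads as a one-minute timeout polled every second.
// A hedged reconstruction of the enclosing call, assuming CDAP's Tasks.waitFor helper and an
// expected count of 1 (both assumptions, not confirmed by the source):
Tasks.waitFor(1L, new Callable<Long>() {
  @Override
  public Long call() throws Exception {
    tsTableManager.flush();
    return getCounts("Message", tsTable);
  }
}, 1, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);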
/**
 * Read the value for the specified rowKey and columnKey.
 */
public static String readOutput(DataSetManager<Table> tableManager, String rowKey, String columnKey) {
  Table table = tableManager.get();
  return Bytes.toString(table.get(Bytes.toBytes(rowKey), Bytes.toBytes(columnKey)));
}
/**
 * Clear any records written to this sink.
 *
 * @param tableManager dataset manager used to get the sink dataset
 */
public static void clear(DataSetManager<Table> tableManager) {
  tableManager.flush();
  Table table = tableManager.get();
  try (Scanner scanner = table.scan(null, null)) {
    Row row;
    while ((row = scanner.next()) != null) {
      table.delete(row.getRow());
    }
  }
  tableManager.flush();
}
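// A hedged usage sketch, not from the original source: it resets the sink between pipeline runs and
// then verifies the sink is empty via readOutput below. The dataset name "sinkDataset" and the
// surrounding TestBase context are assumptions.
@Test
public void exampleClearUsage() throws Exception {
  DataSetManager<Table> sinkManager = getDataset("sinkDataset");
  clear(sinkManager);
  Assert.assertTrue(readOutput(sinkManager).isEmpty());
}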
/**
 * Used to read the records written by this sink.
 *
 * @param tableManager dataset manager used to get the sink dataset to read from
 */
public static List<StructuredRecord> readOutput(DataSetManager<Table> tableManager) throws Exception {
  Table table = tableManager.get();
  try (Scanner scanner = table.scan(null, null)) {
    List<StructuredRecord> records = new ArrayList<>();
    Row row;
    while ((row = scanner.next()) != null) {
      Schema schema = Schema.parseJson(row.getString(SCHEMA_COL));
      String recordStr = row.getString(RECORD_COL);
      records.add(StructuredRecordStringConverter.fromJsonString(recordStr, schema));
    }
    return records;
  }
}
@Override
public Long call() throws Exception {
  tsTableManager.flush();
  return getCounts(Integer.toString(finalI), tsTable);
}
}, 1, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
private static void writeInput(DataSetManager<Table> tableManager, @Nullable String rowKey,
                               Iterable<StructuredRecord> records) throws Exception {
  tableManager.flush();
  Table table = tableManager.get();
  // Write each record as a separate row, with the serialized record as one column and the schema as another.
  // If no row key is given, each row key is a random UUID; otherwise every record is written to the same row.
  for (StructuredRecord record : records) {
    byte[] row = rowKey == null ? Bytes.toBytes(UUID.randomUUID()) : Bytes.toBytes(rowKey);
    table.put(row, SCHEMA_COL, Bytes.toBytes(record.getSchema().toString()));
    table.put(row, RECORD_COL, Bytes.toBytes(StructuredRecordStringConverter.toJsonString(record)));
  }
  tableManager.flush();
}
@Test
public void testClassicSpark() throws Exception {
  ApplicationManager appManager = deploy(TestSparkApp.class);

  for (Class<?> sparkClass : Arrays.asList(TestSparkApp.ClassicSpark.class,
                                           TestSparkApp.ScalaClassicSpark.class)) {
    final SparkManager sparkManager = appManager.getSparkManager(sparkClass.getSimpleName());
    sparkManager.startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  }

  KeyValueTable resultTable = this.<KeyValueTable>getDataset("ResultTable").get();
  Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ClassicSparkProgram.class.getName())));
  Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ScalaClassicSparkProgram.class.getName())));
}
/**
 * Used to read the records written by this sink.
 *
 * @param tableManager dataset manager used to get the sink dataset to read from
 */
public static List<StructuredRecord> readOutput(DataSetManager<Table> tableManager) throws Exception {
  tableManager.flush();
  Table table = tableManager.get();
  try (Scanner scanner = table.scan(null, null)) {
    List<StructuredRecord> records = new ArrayList<>();
    Row row;
    while ((row = scanner.next()) != null) {
      Schema schema = Schema.parseJson(row.getString(SCHEMA_COL));
      String recordStr = row.getString(RECORD_COL);
      records.add(StructuredRecordStringConverter.fromJsonString(recordStr, schema));
    }
    return records;
  }
}
private void assertWorkerDatasetWrites(byte[] startRow, byte[] endRow,
                                       int expectedCount, int expectedTotalCount) throws Exception {
  DataSetManager<KeyValueTable> datasetManager =
    getDataset(testSpace.dataset(AppUsingGetServiceURL.WORKER_INSTANCES_DATASET));
  KeyValueTable instancesTable = datasetManager.get();
  try (CloseableIterator<KeyValue<byte[], byte[]>> instancesIterator = instancesTable.scan(startRow, endRow)) {
    List<KeyValue<byte[], byte[]>> workerInstances = Lists.newArrayList(instancesIterator);
    // Assert that the worker starts with expectedCount instances
    Assert.assertEquals(expectedCount, workerInstances.size());
    // Assert that each instance of the worker knows the total number of instances
    for (KeyValue<byte[], byte[]> keyValue : workerInstances) {
      Assert.assertEquals(expectedTotalCount, Bytes.toInt(keyValue.getValue()));
    }
  }
}
/**
 * Used to write the input records for the pipeline run. Should be called after the pipeline has been created.
 *
 * @param tableManager dataset manager used to write to the source dataset
 * @param records records that should be the input for the pipeline
 */
public static void writeInput(DataSetManager<Table> tableManager,
                              Iterable<StructuredRecord> records) throws Exception {
  tableManager.flush();
  Table table = tableManager.get();
  // Write each record as a separate row, with the serialized record as one column and the schema as another.
  // Each row key is a random UUID.
  for (StructuredRecord record : records) {
    byte[] row = Bytes.toBytes(UUID.randomUUID());
    table.put(row, SCHEMA_COL, Bytes.toBytes(record.getSchema().toString()));
    table.put(row, RECORD_COL, Bytes.toBytes(StructuredRecordStringConverter.toJsonString(record)));
  }
  tableManager.flush();
}
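// A hedged end-to-end sketch, not from the original source: it writes one record with writeInput,
// then reads the sink with readOutput after the pipeline run. The schema, field names, dataset
// names ("sourceDataset", "sinkDataset"), and the elided pipeline start are all illustrative assumptions.
@Test
public void exampleWriteAndReadPipelineData() throws Exception {
  Schema schema = Schema.recordOf("purchase",
                                  Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
                                  Schema.Field.of("count", Schema.of(Schema.Type.INT)));
  StructuredRecord record = StructuredRecord.builder(schema).set("user", "alice").set("count", 3).build();

  DataSetManager<Table> sourceManager = getDataset("sourceDataset");
  writeInput(sourceManager, ImmutableList.of(record));

  // ... start the pipeline workflow here and wait for it to complete ...

  DataSetManager<Table> sinkManager = getDataset("sinkDataset");
  List<StructuredRecord> output = readOutput(sinkManager);
  Assert.assertEquals(1, output.size());
  Assert.assertEquals("alice", output.get(0).get("user"));
}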
@Test
public void testSparkWithService() throws Exception {
  ApplicationManager applicationManager = deployApplication(TestSparkServiceIntegrationApp.class);
  startService(applicationManager);

  SparkManager sparkManager = applicationManager.getSparkManager(
    TestSparkServiceIntegrationApp.SparkServiceProgram.class.getSimpleName()).start();
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 120, TimeUnit.SECONDS);

  DataSetManager<KeyValueTable> datasetManager = getDataset("result");
  KeyValueTable results = datasetManager.get();
  for (int i = 1; i <= 5; i++) {
    byte[] key = String.valueOf(i).getBytes(Charsets.UTF_8);
    Assert.assertEquals((i * i), Integer.parseInt(Bytes.toString(results.read(key))));
  }
}
@Test
public void testDynamicSpark() throws Exception {
  ApplicationManager appManager = deploy(TestSparkApp.class);

  // Write some data to a local file
  File inputFile = TEMP_FOLDER.newFile();
  try (BufferedWriter writer = Files.newBufferedWriter(inputFile.toPath(), StandardCharsets.UTF_8)) {
    for (int i = 0; i < 10; i++) {
      writer.write("Line " + (i + 1));
      writer.newLine();
    }
  }

  SparkManager sparkManager = appManager.getSparkManager(ScalaDynamicSpark.class.getSimpleName());
  sparkManager.startAndWaitForRun(ImmutableMap.of("input", inputFile.getAbsolutePath(),
                                                  "output", "ResultTable",
                                                  "tmpdir", TMP_FOLDER.newFolder().getAbsolutePath()),
                                  ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // Validate the result written to the dataset
  KeyValueTable resultTable = this.<KeyValueTable>getDataset("ResultTable").get();
  // The word "Line" should appear ten times
  Assert.assertEquals(10, Bytes.toInt(resultTable.read("Line")));
  // Each line number should appear exactly once
  for (int i = 0; i < 10; i++) {
    Assert.assertEquals(1, Bytes.toInt(resultTable.read(Integer.toString(i + 1))));
  }
}
private void createPartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time, int i) throws Exception {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  TimePartitionOutput output = tpfs.getPartitionOutput(time);
  try (PrintStream out = new PrintStream(output.getLocation().append("file").getOutputStream())) {
    out.println(String.format("%d,x%d", i, i));
  }
  output.addPartition();
  tpfsManager.flush();
}
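// A hedged usage sketch, not from the original source: it drives createPartition from within the same
// test class to create three consecutive hourly partitions. The dataset name "tpfs", the base time, and
// the hourly spacing are illustrative assumptions.
@Test
public void exampleCreatePartitionUsage() throws Exception {
  DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
  long baseTime = System.currentTimeMillis();
  for (int i = 0; i < 3; i++) {
    createPartition(tpfsManager, baseTime + TimeUnit.HOURS.toMillis(i), i);
  }
}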
private void verifyWorkflowRun(String runId, boolean shouldKeepWordCountDataset,
                               boolean shouldKeepCSVFilesetDataset, String expectedRunStatus) throws Exception {
  // Once the Workflow run is complete, its local datasets should only be available if they were marked to be kept
  DataSetManager<KeyValueTable> localKeyValueDataset =
    getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET + "." + runId));
  if (shouldKeepWordCountDataset) {
    Assert.assertNotNull(localKeyValueDataset.get());
  } else {
    Assert.assertNull(localKeyValueDataset.get());
  }

  DataSetManager<FileSet> localFileSetDataset =
    getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET + "." + runId));
  if (shouldKeepCSVFilesetDataset) {
    Assert.assertNotNull(localFileSetDataset.get());
  } else {
    Assert.assertNull(localFileSetDataset.get());
  }

  // Dataset which is not local should still be available
  DataSetManager<KeyValueTable> nonLocalKeyValueDataset =
    getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.RESULT_DATASET));
  Assert.assertEquals("6", Bytes.toString(nonLocalKeyValueDataset.get().read("UniqueWordCount")));

  // There should not be any local copy of the non-local dataset
  nonLocalKeyValueDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.RESULT_DATASET + "." + runId));
  Assert.assertNull(nonLocalKeyValueDataset.get());

  DataSetManager<KeyValueTable> workflowRuns =
    getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORKFLOW_RUNS_DATASET));
  Assert.assertEquals(expectedRunStatus, Bytes.toString(workflowRuns.get().read(runId)));
}
private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime)
  throws IOException, TransactionFailureException, InterruptedException {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionOutput partitionOutput = tpfs.getPartitionOutput(inputTime);
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  tpfsManager.flush();
}