// Completed from a fragment: the enclosing helper signature and the first
// branch's condition are assumed; the two branch bodies are original.
private static Schema schemaFor(URI uri) throws IOException {
  if ("dataset".equals(uri.getScheme()) || "view".equals(uri.getScheme())) {
    return Datasets.load(uri).getDataset().getDescriptor().getSchema();
  } else if ("resource".equals(uri.getScheme())) {
    try (InputStream in = Resources.getResource(uri.getSchemeSpecificPart()).openStream()) {
      return new Schema.Parser().parse(in);
    }
  }
  throw new IllegalArgumentException("Unsupported schema URI: " + uri);
}
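// Illustrative call of the helper above (the helper name is assumed; the
// classpath resource is borrowed from the identity-partitioning test below):
Schema standardEvent = schemaFor(URI.create("resource:standard_event.avsc"));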
DatasetDescriptor descriptor = view.getDataset().getDescriptor();
Format format = descriptor.getFormat();
Preconditions.checkArgument(allowedFormats().contains(format.getName()),
    "Unsupported format: " + format.getName());
// Reconstructed: the original line fused the precondition's message with the
// sync flag. Sync-per-batch only applies to Avro; the config lookup via
// "context" is an assumption based on the surrounding sink code.
this.syncOnBatch = context.getBoolean(CONFIG_SYNCABLE_SYNC_ON_BATCH,
    DEFAULT_SYNCABLE_SYNC_ON_BATCH) && Formats.AVRO.equals(format);
this.datasetName = view.getDataset().getName();
boolean deleteAllUnsafe(boolean useTrash) {
  boolean deleted = false;
  if (dataset.getDescriptor().isPartitioned()) {
    for (StorageKey key : partitionIterator()) {
      deleted = (useTrash ?
          FileSystemUtil.cleanlyMoveToTrash(fs, root, key.getPath()) :
          FileSystemUtil.cleanlyDelete(fs, root, key.getPath())) || deleted;
      if (listener != null) {
        // the relative path is the partition name, so we can simply delete
        // it in Hive
        listener.partitionDeleted(dataset.getNamespace(), dataset.getName(),
            key.getPath().toString());
      }
    }
  } else {
    for (Path path : pathIterator()) {
      deleted = (useTrash ?
          FileSystemUtil.cleanlyMoveToTrash(fs, root, path) :
          FileSystemUtil.cleanlyDelete(fs, root, path)) || deleted;
    }
  }
  return deleted;
}
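// Hedged usage sketch: deleteAllUnsafe skips constraint checks, so the caller
// decides up front whether files go to trash. The logger is hypothetical.
boolean anyDeleted = deleteAllUnsafe(true /* move to trash */);
if (!anyDeleted) {
  logger.info("No partitions or files to delete under {}", root);
}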
@Override
protected DatasetWriter<GenericRecord> createWriter() {
  if (Formats.PARQUET.getName().equals(getDatasetDefinition().getFormat().getName())) {
    Dataset<GenericRecord> dataset = DatasetUtils.getOrCreateDataset(
        getDatasetRepositoryFactory(), getDatasetDefinition(),
        getEntityClass(), GenericRecord.class);
    schema = dataset.getDescriptor().getSchema();
    return dataset.newWriter();
  } else {
    throw new StoreException("Invalid format " + getDatasetDefinition().getFormat()
        + " specified, you must use 'parquet' with "
        + this.getClass().getSimpleName() + ".");
  }
}
Schema schema = target.getDataset().getDescriptor().getSchema();
// The call these arguments belonged to was truncated in the source; they
// plausibly name a temporary dataset in the target's namespace, e.g.:
// repo.create(target.getDataset().getNamespace(), UUID.randomUUID().toString(), descriptor);
private static void printInfo(Logger console, Dataset<?> dataset) {
  DatasetDescriptor desc = dataset.getDescriptor();
  String schema = ColumnMappingParser.removeEmbeddedMapping(
      PartitionStrategyParser.removeEmbeddedStrategy(desc.getSchema()))
      .toString(true);
  Collection<String> properties = desc.listProperties();
  console.info("\nDataset \"{}\":", dataset.getName());
  console.info("\tURI: \"{}\"", dataset.getUri());
  console.info("\tSchema: {}", indent(schema));
  if (desc.isPartitioned()) {
    console.info("\tPartition strategy: {}",
        indent(desc.getPartitionStrategy().toString(true)));
  } else {
    console.info("\tNot partitioned");
  }
  if (desc.isColumnMapped()) {
    console.info("\tColumn mapping: {}",
        indent(desc.getColumnMapping().toString(true)));
  }
  if (!properties.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    for (String prop : properties) {
      sb.append("\n\t\t").append(prop).append("=")
          .append(desc.getProperty(prop));
    }
    console.info("\tProperties:{}", sb.toString());
  }
}
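// Hypothetical caller of printInfo above: load a dataset by URI and print
// its metadata. The dataset URI and logger name are illustrative.
Dataset<GenericRecord> events =
    Datasets.load("dataset:hive:examples/events", GenericRecord.class);
printInfo(LoggerFactory.getLogger("cli"), events);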
// Tail of an anonymous Runnable; the enclosing call (likely an
// assert-throws test helper) was not captured in the snippet.
@Override
public void run() {
  dataset.newReader();
}
});
@Override
protected DatasetReader<GenericRecord> createReader() {
  Dataset<GenericRecord> dataset = DatasetUtils.getOrCreateDataset(
      getDatasetRepositoryFactory(), getDatasetDefinition(),
      getEntityClass(), GenericRecord.class);
  schema = dataset.getDescriptor().getSchema();
  return dataset.newReader();
}
protected AbstractRefinableView(AbstractRefinableView<?> view, Schema schema, Class<E> type) {
  if (view.dataset instanceof AbstractDataset) {
    this.dataset = ((AbstractDataset<?>) view.dataset).asType(type);
  } else {
    this.dataset = Datasets.load(view.dataset.getUri(), type);
  }
  this.comparator = view.comparator;
  this.constraints = view.constraints;
  // thread-safe, so okay to reuse when views share a partition strategy
  this.keys = view.keys;
  // Resolve our type according to the given schema
  this.accessor = DataModelUtil.accessor(type, schema);
  this.entityTest = constraints.toEntityPredicate(accessor);
  Schema datasetSchema = dataset.getDescriptor().getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);
  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s", getSchema(), datasetSchema);
}
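// Hedged illustration of how the compatibility check above surfaces to
// callers: loading with a type whose schema can neither read nor write the
// dataset schema fails fast. "Unrelated" is a hypothetical class; the URI
// is borrowed from the view-URI tests below.
try {
  Datasets.load("dataset:file:/tmp/test_name", Unrelated.class);
} catch (IncompatibleSchemaException e) {
  // neither read nor write schema resolution succeeded
}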
@Override
protected DatasetWriter<T> createWriter() {
  if (Formats.AVRO.getName().equals(getDatasetDefinition().getFormat().getName())) {
    Dataset<T> dataset = DatasetUtils.getOrCreateDataset(
        getDatasetRepositoryFactory(), getDatasetDefinition(),
        getEntityClass(), getEntityClass());
    return dataset.newWriter();
  } else {
    throw new StoreException("Invalid format " + getDatasetDefinition().getFormat()
        + " specified, you must use 'avro' with "
        + this.getClass().getSimpleName() + ".");
  }
}
@Test
public void testSimpleViews() {
  assertViewUriEquivalent("Dataset",
      "dataset:file:/tmp/test_name", test);
  assertViewUriEquivalent("View with to constraint",
      "view:file:/tmp/test_name?timestamp=(,0]", test.to("timestamp", 0L));
  assertViewUriEquivalent("View with toBefore constraint",
      "view:file:/tmp/test_name?timestamp=(,0)", test.toBefore("timestamp", 0L));
  assertViewUriEquivalent("View with from constraint",
      "view:file:/tmp/test_name?timestamp=[0,)", test.from("timestamp", 0L));
  assertViewUriEquivalent("View with fromAfter constraint",
      "view:file:/tmp/test_name?timestamp=(0,)", test.fromAfter("timestamp", 0L));
  assertViewUriEquivalent("View with in(\"\") constraint",
      "view:file:/tmp/test_name?color=in()", test.with("color", ""));
  assertViewUriEquivalent("View with in constraint",
      "view:file:/tmp/test_name?color=orange,red", test.with("color", "orange", "red"));
  assertViewUriEquivalent("View with exists constraint",
      "view:file:/tmp/test_name?id=", test.with("id"));
}
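// Hedged round-trip sketch: a view URI like those above can be loaded back
// into an equivalent constrained view (URI and record type are illustrative).
RefinableView<GenericRecord> recent = Datasets.load(
    "view:file:/tmp/test_name?timestamp=[0,)", GenericRecord.class);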
ObjectMapper mapper = new ObjectMapper();
mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
JsonNode node = mapper.readTree(new URL(url)).get("duObjects");
List<Dataset> list = mapper.readValue(node.traverse(),
    new TypeReference<List<Dataset>>() {});
for (Dataset dataset : list) {
  System.out.println(dataset.getName());
}
@Test
public void testMixedProjection() throws IOException {
  Dataset<StandardEvent> original = repo.create("ns", "mixedProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(), StandardEvent.class);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
private URI getLegacyRepoUri(Dataset<GenericRecord> dataset) {
  return FlumeConfigCommand.this.getLegacyRepoUri(
      dataset.getUri(), dataset.getNamespace());
}
@Test
public void testRefineIdentity() throws Exception {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .identity("user_id")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  // Create a separate dataset to avoid conflicts with the above.
  Dataset<StandardEvent> identityDataset = repo.create(
      "ns", "test_identity", descriptor);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = identityDataset.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  assertContentEquals(Sets.newHashSet(sepEvent, novEvent),
      identityDataset.with("user_id", 0L));
}
private static DatasetRepository getDatasetRepository(JobContext jobContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository repo = DatasetRepositories.repositoryFor(conf.get(KITE_OUTPUT_URI));
  if (repo instanceof TemporaryDatasetRepositoryAccessor) {
    Dataset<Object> dataset = load(jobContext).getDataset();
    String namespace = dataset.getNamespace();
    repo = ((TemporaryDatasetRepositoryAccessor) repo)
        .getTemporaryRepository(namespace, getJobDatasetName(jobContext));
  }
  return repo;
}
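// Sketch of the producing side that makes the KITE_OUTPUT_URI lookup above
// resolve, using Kite's MR output format (the dataset URI is illustrative):
Job job = Job.getInstance(new Configuration());
DatasetKeyOutputFormat.configure(job).writeTo("dataset:hdfs:/data/ns/events");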
@Override
public URI getUri() {
  URIBuilder builder = new URIBuilder(dataset.getUri());
  for (Map.Entry<String, String> entry : constraints.toQueryMap().entrySet()) {
    builder.with(entry.getKey(), entry.getValue());
  }
  return builder.build();
}
eventsToProcess = eventsDataset.toBefore("timestamp", currentMinute);
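// Hedged follow-on sketch: drain the time-bounded view above with a reader.
// Everything except toBefore/newReader is an assumption for illustration.
try (DatasetReader<GenericRecord> reader = eventsToProcess.newReader()) {
  while (reader.hasNext()) {
    process(reader.next()); // hypothetical per-event handler
  }
}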
@Test
public void testRelative() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:file:target/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Record> ds = Datasets.<Record, Dataset<Record>>load(
      "dataset:file:target/data/ns/test", Record.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Path cwd = localFS.makeQualified(new Path("."));
  Assert.assertEquals("Locations should match",
      new Path(cwd, "target/data/ns/test").toUri(),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace", "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name", "test", ds.getName());

  repo.delete("ns", "test");
}