// Completed from a fragment: the enclosing helper signature and the first
// branch's condition are assumed; the two branch bodies are original.
private static Schema schemaFor(URI uri) throws IOException {
  if ("dataset".equals(uri.getScheme()) || "view".equals(uri.getScheme())) {
    return Datasets.load(uri).getDataset().getDescriptor().getSchema();
  } else if ("resource".equals(uri.getScheme())) {
    try (InputStream in = Resources.getResource(uri.getSchemeSpecificPart()).openStream()) {
      return new Schema.Parser().parse(in);
    }
  }
  throw new IllegalArgumentException("Unsupported schema URI: " + uri);
}
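// Illustrative call of the helper above (the helper name is assumed; the
// classpath resource is borrowed from the identity-partitioning test below):
Schema standardEvent = schemaFor(URI.create("resource:standard_event.avsc"));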
DatasetDescriptor descriptor = view.getDataset().getDescriptor();
Format format = descriptor.getFormat();
Preconditions.checkArgument(allowedFormats().contains(format.getName()),
    "Unsupported format: " + format.getName());
// Reconstructed: the original line fused the precondition's message with the
// sync flag. Sync-per-batch only applies to Avro; the config lookup via
// "context" is an assumption based on the surrounding sink code.
this.syncOnBatch = context.getBoolean(CONFIG_SYNCABLE_SYNC_ON_BATCH,
    DEFAULT_SYNCABLE_SYNC_ON_BATCH) && Formats.AVRO.equals(format);
this.datasetName = view.getDataset().getName();
boolean deleteAllUnsafe(boolean useTrash) {
  boolean deleted = false;
  if (dataset.getDescriptor().isPartitioned()) {
    for (StorageKey key : partitionIterator()) {
      deleted = (useTrash ?
          FileSystemUtil.cleanlyMoveToTrash(fs, root, key.getPath()) :
          FileSystemUtil.cleanlyDelete(fs, root, key.getPath())) || deleted;
      if (listener != null) {
        // the relative path is the partition name, so we can simply delete
        // it in Hive
        listener.partitionDeleted(dataset.getNamespace(), dataset.getName(),
            key.getPath().toString());
      }
    }
  } else {
    for (Path path : pathIterator()) {
      deleted = (useTrash ?
          FileSystemUtil.cleanlyMoveToTrash(fs, root, path) :
          FileSystemUtil.cleanlyDelete(fs, root, path)) || deleted;
    }
  }
  return deleted;
}
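// Hedged usage sketch: deleteAllUnsafe skips constraint checks, so the caller
// decides up front whether files go to trash. The logger is hypothetical.
boolean anyDeleted = deleteAllUnsafe(true /* move to trash */);
if (!anyDeleted) {
  logger.info("No partitions or files to delete under {}", root);
}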
@Override
protected DatasetWriter<GenericRecord> createWriter() {
  if (Formats.PARQUET.getName().equals(getDatasetDefinition().getFormat().getName())) {
    Dataset<GenericRecord> dataset = DatasetUtils.getOrCreateDataset(
        getDatasetRepositoryFactory(), getDatasetDefinition(),
        getEntityClass(), GenericRecord.class);
    schema = dataset.getDescriptor().getSchema();
    return dataset.newWriter();
  } else {
    throw new StoreException("Invalid format " + getDatasetDefinition().getFormat()
        + " specified, you must use 'parquet' with "
        + this.getClass().getSimpleName() + ".");
  }
}
Schema schema = target.getDataset().getDescriptor().getSchema();
// The call these arguments belonged to was truncated in the source; they
// plausibly name a temporary dataset in the target's namespace, e.g.:
// repo.create(target.getDataset().getNamespace(), UUID.randomUUID().toString(), descriptor);
private static void printInfo(Logger console, Dataset<?> dataset) {
  DatasetDescriptor desc = dataset.getDescriptor();
  String schema = ColumnMappingParser.removeEmbeddedMapping(
      PartitionStrategyParser.removeEmbeddedStrategy(desc.getSchema()))
      .toString(true);
  Collection<String> properties = desc.listProperties();
  console.info("\nDataset \"{}\":", dataset.getName());
  console.info("\tURI: \"{}\"", dataset.getUri());
  console.info("\tSchema: {}", indent(schema));
  if (desc.isPartitioned()) {
    console.info("\tPartition strategy: {}",
        indent(desc.getPartitionStrategy().toString(true)));
  } else {
    console.info("\tNot partitioned");
  }
  if (desc.isColumnMapped()) {
    console.info("\tColumn mapping: {}",
        indent(desc.getColumnMapping().toString(true)));
  }
  if (!properties.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    for (String prop : properties) {
      sb.append("\n\t\t").append(prop).append("=")
          .append(desc.getProperty(prop));
    }
    console.info("\tProperties:{}", sb.toString());
  }
}
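// Hypothetical caller of printInfo above: load a dataset by URI and print
// its metadata. The dataset URI and logger name are illustrative.
Dataset<GenericRecord> events =
    Datasets.load("dataset:hive:examples/events", GenericRecord.class);
printInfo(LoggerFactory.getLogger("cli"), events);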
// Tail of an anonymous Runnable; the enclosing call (likely an
// assert-throws test helper) was not captured in the snippet.
@Override
public void run() {
  dataset.newReader();
}
});
@Override
protected DatasetReader<GenericRecord> createReader() {
  Dataset<GenericRecord> dataset = DatasetUtils.getOrCreateDataset(
      getDatasetRepositoryFactory(), getDatasetDefinition(),
      getEntityClass(), GenericRecord.class);
  schema = dataset.getDescriptor().getSchema();
  return dataset.newReader();
}
protected AbstractRefinableView(AbstractRefinableView<?> view, Schema schema, Class<E> type) {
  if (view.dataset instanceof AbstractDataset) {
    this.dataset = ((AbstractDataset<?>) view.dataset).asType(type);
  } else {
    this.dataset = Datasets.load(view.dataset.getUri(), type);
  }
  this.comparator = view.comparator;
  this.constraints = view.constraints;
  // thread-safe, so okay to reuse when views share a partition strategy
  this.keys = view.keys;
  // Resolve our type according to the given schema
  this.accessor = DataModelUtil.accessor(type, schema);
  this.entityTest = constraints.toEntityPredicate(accessor);
  Schema datasetSchema = dataset.getDescriptor().getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);
  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s", getSchema(), datasetSchema);
}
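// Hedged illustration of how the compatibility check above surfaces to
// callers: loading with a type whose schema can neither read nor write the
// dataset schema fails fast. "Unrelated" is a hypothetical class; the URI
// is borrowed from the view-URI tests below.
try {
  Datasets.load("dataset:file:/tmp/test_name", Unrelated.class);
} catch (IncompatibleSchemaException e) {
  // neither read nor write schema resolution succeeded
}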
@Override
protected DatasetWriter<T> createWriter() {
  if (Formats.AVRO.getName().equals(getDatasetDefinition().getFormat().getName())) {
    Dataset<T> dataset = DatasetUtils.getOrCreateDataset(
        getDatasetRepositoryFactory(), getDatasetDefinition(),
        getEntityClass(), getEntityClass());
    return dataset.newWriter();
  } else {
    throw new StoreException("Invalid format " + getDatasetDefinition().getFormat()
        + " specified, you must use 'avro' with "
        + this.getClass().getSimpleName() + ".");
  }
}
@Test
public void testSimpleViews() {
  assertViewUriEquivalent("Dataset",
      "dataset:file:/tmp/test_name", test);
  assertViewUriEquivalent("View with to constraint",
      "view:file:/tmp/test_name?timestamp=(,0]", test.to("timestamp", 0L));
  assertViewUriEquivalent("View with toBefore constraint",
      "view:file:/tmp/test_name?timestamp=(,0)", test.toBefore("timestamp", 0L));
  assertViewUriEquivalent("View with from constraint",
      "view:file:/tmp/test_name?timestamp=[0,)", test.from("timestamp", 0L));
  assertViewUriEquivalent("View with fromAfter constraint",
      "view:file:/tmp/test_name?timestamp=(0,)", test.fromAfter("timestamp", 0L));
  assertViewUriEquivalent("View with in(\"\") constraint",
      "view:file:/tmp/test_name?color=in()", test.with("color", ""));
  assertViewUriEquivalent("View with in constraint",
      "view:file:/tmp/test_name?color=orange,red", test.with("color", "orange", "red"));
  assertViewUriEquivalent("View with exists constraint",
      "view:file:/tmp/test_name?id=", test.with("id"));
}
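// Hedged round-trip sketch: a view URI like those above can be loaded back
// into an equivalent constrained view (URI and record type are illustrative).
RefinableView<GenericRecord> recent = Datasets.load(
    "view:file:/tmp/test_name?timestamp=[0,)", GenericRecord.class);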
ObjectMapper mapper = new ObjectMapper();
mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
JsonNode node = mapper.readTree(new URL(url)).get("duObjects");
List<Dataset> list = mapper.readValue(node.traverse(),
    new TypeReference<List<Dataset>>() {});
for (Dataset dataset : list) {
  System.out.println(dataset.getName());
}
@Test
public void testMixedProjection() throws IOException {
  Dataset<StandardEvent> original = repo.create("ns", "mixedProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(), StandardEvent.class);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
private URI getLegacyRepoUri(Dataset<GenericRecord> dataset) {
  return FlumeConfigCommand.this.getLegacyRepoUri(
      dataset.getUri(), dataset.getNamespace());
}
@Test
public void testRefineIdentity() throws Exception {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .identity("user_id")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  // Create a separate dataset to avoid conflicts with the above.
  Dataset<StandardEvent> identityDataset = repo.create(
      "ns", "test_identity", descriptor);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = identityDataset.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  assertContentEquals(Sets.newHashSet(sepEvent, novEvent),
      identityDataset.with("user_id", 0L));
}
private static DatasetRepository getDatasetRepository(JobContext jobContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository repo = DatasetRepositories.repositoryFor(conf.get(KITE_OUTPUT_URI));
  if (repo instanceof TemporaryDatasetRepositoryAccessor) {
    Dataset<Object> dataset = load(jobContext).getDataset();
    String namespace = dataset.getNamespace();
    repo = ((TemporaryDatasetRepositoryAccessor) repo)
        .getTemporaryRepository(namespace, getJobDatasetName(jobContext));
  }
  return repo;
}
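// Sketch of the producing side that makes the KITE_OUTPUT_URI lookup above
// resolve, using Kite's MR output format (the dataset URI is illustrative):
Job job = Job.getInstance(new Configuration());
DatasetKeyOutputFormat.configure(job).writeTo("dataset:hdfs:/data/ns/events");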
@Override
public URI getUri() {
  URIBuilder builder = new URIBuilder(dataset.getUri());
  for (Map.Entry<String, String> entry : constraints.toQueryMap().entrySet()) {
    builder.with(entry.getKey(), entry.getValue());
  }
  return builder.build();
}
eventsToProcess = eventsDataset.toBefore("timestamp", currentMinute);
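// Hedged follow-on sketch: drain the time-bounded view above with a reader.
// Everything except toBefore/newReader is an assumption for illustration.
try (DatasetReader<GenericRecord> reader = eventsToProcess.newReader()) {
  while (reader.hasNext()) {
    process(reader.next()); // hypothetical per-event handler
  }
}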
@Test
public void testRelative() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:file:target/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Record> ds = Datasets.<Record, Dataset<Record>>load(
      "dataset:file:target/data/ns/test", Record.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Path cwd = localFS.makeQualified(new Path("."));
  Assert.assertEquals("Locations should match",
      new Path(cwd, "target/data/ns/test").toUri(),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace", "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name", "test", ds.getName());

  repo.delete("ns", "test");
}