/**
 * Builds the save policy from Flume context configuration.
 * Loads the configured error dataset if it already exists; otherwise
 * creates it with a schema reflected from {@link AvroFlumeEvent}.
 */
private SavePolicy(Context context) {
  String errorDatasetUri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  // The error dataset URI is mandatory when the failure policy is "save".
  Preconditions.checkArgument(errorDatasetUri != null,
      "Must set " + CONFIG_KITE_ERROR_DATASET_URI + " when "
          + CONFIG_FAILURE_POLICY + "=save");

  if (!Datasets.exists(errorDatasetUri)) {
    // First use: create the dataset with a schema derived from AvroFlumeEvent.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(errorDatasetUri, descriptor, AvroFlumeEvent.class);
  } else {
    dataset = Datasets.load(errorDatasetUri, AvroFlumeEvent.class);
  }

  nEventsHandled = 0;
}
/**
 * Creates a new {@link Dataset} (or a {@link View} of one) for the given URI.
 * <p>
 * The returned dataset is empty; populate it with a {@code DatasetWriter}.
 * The URI must start with {@code dataset:} or {@code view:}; the rest of the
 * URI is scheme-specific. For a view URI, the backing dataset is created and
 * the corresponding view is returned.
 *
 * @param uri a {@code Dataset} or {@code View} URI
 * @param <V> the expected {@code Dataset} or {@code View} type
 * @return a newly created {@code Dataset} responsible for the given URI
 * @throws NullPointerException
 *          if {@code uri} or {@code descriptor} is {@code null}
 * @throws IllegalArgumentException
 *          if {@code uri} is not a dataset or view URI
 * @throws DatasetExistsException
 *          if a {@code Dataset} for the given URI already exists
 * @throws IncompatibleSchemaException
 *          if the schema is not compatible with existing datasets that share
 *          storage (for example, the same HBase table)
 */
@SuppressWarnings("unchecked")
public static <V extends View<GenericRecord>> V create(URI uri,
                                                       DatasetDescriptor descriptor) {
  // Delegate to the typed variant, defaulting the entity type to GenericRecord.
  return Datasets.<GenericRecord, V>create(uri, descriptor, GenericRecord.class);
}
@Override public int run(String[] args) throws Exception { // Create a dataset of events with the Avro schema DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaUri("resource:event.avsc") .build(); Datasets.create("dataset:hive:/tmp/data/default/events", descriptor); return 0; }
@Override
public int run(List<String> args) throws Exception {
  // Refuse to overwrite an existing dataset.
  Preconditions.checkState(!Datasets.exists(uri),
      "events dataset already exists");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(StandardEvent.class)
      .build();
  View<StandardEvent> events = Datasets.create(uri, descriptor, StandardEvent.class);

  // Generate random events until the deadline passes.
  // NOTE(review): 36000 ms is only 36 seconds measured from baseTimestamp —
  // confirm this duration is intentional.
  DatasetWriter<StandardEvent> writer = events.newWriter();
  try {
    while (System.currentTimeMillis() - baseTimestamp < 36000) {
      writer.write(generateRandomEvent());
    }
  } finally {
    writer.close();
  }

  System.out.println("Generated " + counter + " events");
  return 0;
}
private SavePolicy(Context context) { String uri = context.getString(CONFIG_KITE_ERROR_DATASET_URI); Preconditions.checkArgument(uri != null, "Must set " + CONFIG_KITE_ERROR_DATASET_URI + " when " + CONFIG_FAILURE_POLICY + "=save"); if (Datasets.exists(uri)) { dataset = Datasets.load(uri, AvroFlumeEvent.class); } else { DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(AvroFlumeEvent.class) .build(); dataset = Datasets.create(uri, descriptor, AvroFlumeEvent.class); } nEventsHandled = 0; }
/**
 * Correlates events from the input dataset into the output dataset,
 * creating the output dataset if necessary.
 *
 * @param args optional single argument overriding the output dataset URI
 * @return 0 on success
 * @throws IllegalStateException if the input dataset does not exist
 */
@Override
public int run(List<String> args) throws Exception {
  String inputUri = uri;
  String outputUri = "dataset:hive?dataset=correlated_events";

  // A single argument, if present, overrides the default output URI.
  if (args.size() == 1) {
    outputUri = args.get(0);
  }

  // Fixed message grammar: "doesn't exists" -> "doesn't exist".
  Preconditions.checkState(Datasets.exists(inputUri),
      "input dataset doesn't exist");

  if (!Datasets.exists(outputUri)) {
    Datasets.create(outputUri, new DatasetDescriptor.Builder()
        .format("avro")
        .schema(CorrelatedEvents.class)
        .build());
  }

  CorrelateEventsTask task = new CorrelateEventsTask(inputUri, outputUri);
  task.run();

  return 0;
}
@Test
public void testCreateStringUriWithoutType() {
  // Omitting the type argument should default the entity type to GenericRecord.
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  URI datasetUri = new URIBuilder(repoUri, "ns", "test").build();

  Dataset<GenericRecord> expected = mock(Dataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class))
      .thenReturn(expected);

  Dataset<GenericRecord> actual =
      Datasets.create(datasetUri.toString(), descriptor);

  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verifyNoMoreInteractions(expected);

  Assert.assertEquals(expected, actual);
}
@Test
public void testCreateStringUri() {
  // Creating via a String URI should delegate to the repo with the given type.
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  URI datasetUri = new URIBuilder(repoUri, "ns", "test").build();

  Dataset<Object> expected = mock(Dataset.class);
  when(repo.create("ns", "test", descriptor, Object.class))
      .thenReturn(expected);

  Dataset<Object> actual = Datasets.<Object, Dataset<Object>>create(
      datasetUri.toString(), descriptor, Object.class);

  verify(repo).create("ns", "test", descriptor, Object.class);
  verifyNoMoreInteractions(repo);
  verifyNoMoreInteractions(expected);

  Assert.assertEquals(expected, actual);
}
@Test
public void testCreateWithoutType() {
  // A URI create with no explicit type should default to GenericRecord.
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  URI datasetUri = new URIBuilder(repoUri, "ns", "test").build();

  Dataset<GenericRecord> expected = mock(Dataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class))
      .thenReturn(expected);

  Dataset<GenericRecord> actual = Datasets.create(datasetUri, descriptor);

  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verifyNoMoreInteractions(expected);

  Assert.assertEquals(expected, actual);
}
@Test
public void testCreate() {
  // A typed URI create should pass the descriptor and type through to the repo.
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  URI datasetUri = new URIBuilder(repoUri, "ns", "test").build();

  Dataset<Object> expected = mock(Dataset.class);
  when(repo.create("ns", "test", descriptor, Object.class))
      .thenReturn(expected);

  Dataset<Object> actual = Datasets.<Object, Dataset<Object>>create(
      datasetUri, descriptor, Object.class);

  verify(repo).create("ns", "test", descriptor, Object.class);
  verifyNoMoreInteractions(repo);
  verifyNoMoreInteractions(expected);

  Assert.assertEquals(expected, actual);
}
@Test
public void testEmptyDataset() throws Exception {
  // A freshly created, never-written dataset should not be discovered
  // as a potential dataset by directory scanning.
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());

  Datasets.create(datasetUri, new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build());

  Collection<DatasetDescriptor> expected = Lists.newArrayList();
  Assert.assertEquals("Should succeed and find no datasets",
      expected, FileSystemUtil.findPotentialDatasets(fs, root));
}
@Test
public void testCreateView() throws Exception {
  // Creating via a view URI should create the dataset, then filter it
  // by the constraints encoded in the URI's query options.
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");

  AbstractDataset<Object> dataset = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, Object.class)).thenReturn(dataset);
  when(dataset.getDescriptor()).thenReturn(descriptor);

  AbstractRefinableView<Object> userAndEmailView = mock(AbstractRefinableView.class);
  when(dataset.filter(constraints)).thenReturn(userAndEmailView);

  // "ignoredOption" is not a schema field, so it must not affect the view.
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();

  RefinableView<Object> view = Datasets.<Object, RefinableView<Object>>create(
      datasetUri, descriptor, Object.class);

  verify(repo).create("ns", "test", descriptor, Object.class);
  verifyNoMoreInteractions(repo);
  verify(dataset).getDescriptor();
  verify(dataset).filter(constraints);
  verifyNoMoreInteractions(dataset);
  verifyNoMoreInteractions(userAndEmailView);

  Assert.assertEquals(userAndEmailView, view);
}
@Test public void testUnpartitionedDataset() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset); writeUserToView(dataset); DatasetDescriptor expected = dataset.getDescriptor(); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertEquals("Should succeed and find an equivalent descriptor", expected, actual); }
@Test(expected = NullPointerException.class)
public void testCreateNullType() {
  // A null entity type must be rejected with a NullPointerException.
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  URI datasetUri = new URIBuilder(repoUri, "ns", "test").build();

  Datasets.<Object, Dataset<Object>>create(datasetUri, descriptor, null);
}
@BeforeClass
public static void createTestDataset() {
  // Remove any leftover dataset from a previous run, then create a fresh
  // partitioned dataset for the test class.
  Datasets.delete("dataset:file:/tmp/test_name");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(SCHEMA)
      .partitionStrategy(STRATEGY)
      .build();
  test = Datasets.create("dataset:file:/tmp/test_name", descriptor);
}
@Before
public void createFileSystemDataset() {
  // Start each test from a clean, year/month/day-partitioned dataset.
  final String uri = "dataset:file:/tmp/datasets/ns/test";
  Datasets.delete(uri);

  this.dataset = Datasets.create(uri, new DatasetDescriptor.Builder()
      .schema(schema)
      .partitionStrategy(ymd)
      .build());
}
@Test public void testUnpartitionedReplace() { // recreate temporary without a partition strategy Datasets.delete("dataset:file:/tmp/datasets/temporary"); DatasetDescriptor descriptor = new DatasetDescriptor .Builder(unpartitioned.getDescriptor()) .location((URI) null) // clear the location .build(); temporary = Datasets.create("dataset:file:/tmp/datasets/temporary", descriptor, TestRecord.class); Assert.assertTrue("Should allow replacing an unpartitioned dataset", unpartitioned.canReplace(unpartitioned)); // make sure there are multiple files writeTestRecords(unpartitioned); writeTestRecords(unpartitioned); writeTestRecords(temporary); writeTestRecords(temporary); Set<String> originalFiles = Sets.newHashSet( Iterators.transform(unpartitioned.pathIterator(), new GetFilename())); Set<String> replacementFiles = Sets.newHashSet( Iterators.transform(temporary.pathIterator(), new GetFilename())); Iterators.transform(temporary.pathIterator(), new GetFilename()); Assert.assertFalse("Sanity check", originalFiles.equals(replacementFiles)); unpartitioned.replace(unpartitioned, temporary); Set<String> replacedFiles = Sets.newHashSet( Iterators.transform(unpartitioned.pathIterator(), new GetFilename())); Assert.assertEquals("Should contain the replacement files", replacementFiles, replacedFiles); }
@Before public void createTestDatasets() { Datasets.delete("dataset:file:/tmp/datasets/unpartitioned"); Datasets.delete("dataset:file:/tmp/datasets/partitioned"); Datasets.delete("dataset:file:/tmp/datasets/temporary"); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(TestRecord.class) .build(); unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned", descriptor, TestRecord.class); descriptor = new DatasetDescriptor.Builder(descriptor) .property("kite.writer.cache-size", "20") .partitionStrategy(new PartitionStrategy.Builder() .hash("id", 4) .build()) .build(); partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned", descriptor, TestRecord.class); // create a second dataset with the same partitioning for replacement parts temporary = Datasets.create("dataset:file:/tmp/datasets/temporary", descriptor, TestRecord.class); writeTestRecords(unpartitioned); writeTestRecords(partitioned); writeTestRecords(temporary); }
@Before
public void createTestDatasets() {
  // Start from a clean slate for both datasets.
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");

  DatasetDescriptor base = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      base, TestRecord.class);

  // Same schema, but hash-partitioned on "id" into 4 buckets.
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      new DatasetDescriptor.Builder(base)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("id", 4)
              .build())
          .build(),
      TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
@Test
public void testDatasetNotPartitioned() {
  Datasets.delete("dataset:file:/tmp/datasets/ns/test");
  final Dataset<GenericRecord> ds = Datasets.create(
      "dataset:file:/tmp/datasets/ns/test",
      new DatasetDescriptor.Builder()
          .schema(schema)
          .build());

  // The dataset root itself resolves to the dataset's full view.
  Assert.assertEquals("Should work for empty relative directory",
      ds, FileSystemDatasets.viewForUri(ds, "file:/tmp/datasets/ns/test"));

  // Partition-style paths make no sense without a partition strategy.
  TestHelpers.assertThrows("Should reject paths in a non-partitioned dataset",
      IllegalArgumentException.class, new Runnable() {
        @Override
        public void run() {
          FileSystemDatasets.viewForUri(ds, "y=2014/m=03/d=14");
        }
      });
}