@Override
public int run(List<String> args) throws Exception {
  String inputUri = uri;
  String outputUri = "dataset:hive?dataset=correlated_events";
  if (args.size() == 1) {
    outputUri = args.get(0);
  }
  Preconditions.checkState(Datasets.exists(inputUri),
      "input dataset doesn't exist");
  if (!Datasets.exists(outputUri)) {
    Datasets.create(outputUri, new DatasetDescriptor.Builder()
        .format("avro")
        .schema(CorrelatedEvents.class)
        .build());
  }
  CorrelateEventsTask task = new CorrelateEventsTask(inputUri, outputUri);
  task.run();
  return 0;
}
@Override
public FileSystemWriter<Record> newWriter(Path directory, Schema datasetSchema,
    Schema writerSchema) {
  return FileSystemWriter.newWriter(fs, directory, 100, 2 * 1024 * 1024,
      new DatasetDescriptor.Builder()
          .property("kite.writer.roll-interval-seconds", String.valueOf(10))
          .property("kite.writer.target-file-size",
              String.valueOf(32 * 1024 * 1024)) // 32 MB
          .property("kite.writer.fs-supports-rename", String.valueOf(false))
          .schema(datasetSchema)
          .format("avro")
          .build(),
      writerSchema);
}
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo =
      new FileSystemDatasetRepository(fs.getConf(), testDirectory);
  Dataset<StandardEvent> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .format(Formats.PARQUET)
          .build(),
      StandardEvent.class);
  DatasetWriter<StandardEvent> writer = writerDataset.newWriter();
  for (long i = 0; i < totalRecords; i++) {
    String text = String.valueOf(i);
    writer.write(new StandardEvent(text, text, i, text, text, i));
  }
  writer.close();
  readerDataset = repo.load("ns", "test", GenericRecord.class);
}
@Test
@SuppressWarnings("deprecation")
public void testGetPartitionReturnsNullIfNoAutoCreate() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .location(testDirectory)
          .partitionStrategy(partitionStrategy)
          .build())
      .type(Record.class)
      .build();
  Assert.assertNull(ds.getPartition(new PartitionKey(1), false));
}
@Test
public void signalReadyOnUnboundedDataset() {
  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()),
          "ns", "name"))
      .build();

  Assert.assertFalse("Unbounded dataset has not been signaled", ds.isReady());

  ds.signalReady();

  Assert.assertTrue("Unbounded dataset has been signaled and should be ready",
      ds.isReady());
}
@Test
public void testWriteAndRead() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("test")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schemaUri(USER_SCHEMA_URL)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  Assert.assertFalse("Dataset is not partitioned",
      ds.getDescriptor().isPartitioned());

  writeTestUsers(ds, 10);
  checkTestUsers(ds, 10);
}
@Test
public void testPathIterator_Directory() {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  List<Path> dirPaths = Lists.newArrayList(ds.dirIterator());

  Assert.assertEquals(
      "dirIterator for non-partitioned dataset should yield a single path.",
      1, dirPaths.size());
  Assert.assertEquals("dirIterator should yield absolute paths.",
      testDirectory, dirPaths.get(0));
}
@Test
public void testMoveToTrashWithoutPartitions() {
  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  writeTestUsers(ds, 10);
  Assert.assertTrue(ds.moveToTrash());

  checkReaderBehavior(ds.newReader(), 0, (RecordValidator<Record>) null);
}
private static DatasetDescriptor descriptor(FileSystem fs, Result.Table table)
    throws IOException {
  // inspect the path to determine the partition strategy
  PartitionStrategy strategy = strategy(fs, table.location);
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder()
      .format(table.format)
      .schema(table.schema)
      .partitionStrategy(strategy)
      .location(table.location);
  if (table.depth < 0) {
    builder.property("kite.filesystem.mixed-depth", "true");
  }
  return builder.build();
}
@Before
public void setUp() throws Exception {
  super.setUp();
  inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STRING_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
  outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STATS_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
}
/**
 * Configure the dataset's format from a format name String (optional). If
 * not specified, {@link Formats#AVRO} is used by default.
 *
 * @param formatName a String format name
 * @return An instance of the builder for method chaining.
 * @throws UnknownFormatException if the format name is not recognized.
 *
 * @since 0.8.0
 */
public Builder format(String formatName) {
  return this.format(Formats.fromString(formatName));
}
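For reference, a minimal usage sketch of this overload, assuming only the public org.kitesdk.data API; the inline User schema and the "parquet" choice are illustrative, not taken from the original source:

// Hedged sketch: schemaLiteral(String) and format(String) are standard
// DatasetDescriptor.Builder methods; the schema below is hypothetical.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaLiteral("{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + "{\"name\": \"username\", \"type\": \"string\"}]}")
    .format("parquet") // resolved internally via Formats.fromString("parquet")
    .build();

// An unrecognized name fails as soon as format(String) is called:
// new DatasetDescriptor.Builder().format("explode!"); // UnknownFormatException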
@Test
public void testParquetConfiguration() throws IOException {
  FileSystem fs = LocalFileSystem.getInstance();
  FileSystemWriter<Object> writer = FileSystemWriter.newWriter(
      fs, new Path("/tmp"), -1, -1,
      new DatasetDescriptor.Builder()
          .property("parquet.block.size", "34343434")
          .schema(SCHEMA)
          .format("parquet")
          .build(),
      SCHEMA);

  Assert.assertEquals("Should copy properties to Configuration",
      34343434, writer.conf.getInt("parquet.block.size", -1));
}
@Test(expected = UnknownFormatException.class)
public void testUnknownFormat() throws IOException {
  final DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(STRING_SCHEMA)
      .format(Accessor.getDefault().newFormat("explode!"))
      .build();
  MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
      fileSystem, Lists.newArrayList(TEST_FILE), descriptor, CONSTRAINTS,
      ACCESSOR);
  try {
    reader.initialize();
  } finally {
    reader.close();
  }
}
@Test
public void testConfigureDurableParquetAppender() throws IOException {
  FileSystem fs = LocalFileSystem.getInstance();
  FileSystemWriter<Object> writer = FileSystemWriter.newWriter(
      fs, new Path("/tmp"), -1, -1,
      new DatasetDescriptor.Builder()
          .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "false")
          .schema(SCHEMA)
          .format("parquet")
          .build(),
      SCHEMA);

  Assert.assertEquals(
      "Disabling the non-durable parquet appender should get us a durable appender",
      DurableParquetAppender.class,
      writer.newAppender(testDirectory).getClass());
}
@Override
public FileSystemWriter<Record> newWriter(Path directory, Schema datasetSchema,
    Schema writerSchema) {
  return FileSystemWriter.newWriter(fs, directory, 100, 2 * 1024 * 1024,
      new DatasetDescriptor.Builder()
          .property("kite.writer.roll-interval-seconds", String.valueOf(10))
          .property("kite.writer.target-file-size",
              String.valueOf(32 * 1024 * 1024)) // 32 MB
          .schema(datasetSchema)
          .format("parquet")
          .build(),
      writerSchema);
}
@Test
public void testDeleteAllWithoutPartitions() {
  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  writeTestUsers(ds, 10);
  Assert.assertTrue(ds.deleteAll());

  checkReaderBehavior(ds.newReader(), 0, (RecordValidator<Record>) null);
}
@Before
@Override
public void setUp() throws Exception {
  super.setUp();
  dataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STATS_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
}
@Override
public FileSystemWriter<Record> newWriter(Path directory, Schema datasetSchema,
    Schema writerSchema) {
  return FileSystemWriter.newWriter(fs, directory, 100, 2 * 1024 * 1024,
      new DatasetDescriptor.Builder()
          .property("kite.writer.roll-interval-seconds", String.valueOf(10))
          .property("kite.writer.target-file-size",
              String.valueOf(32 * 1024 * 1024)) // 32 MB
          .schema(datasetSchema)
          .format("avro")
          .build(),
      writerSchema);
}
@Test
public void testConfigureNonDurableParquetAppender() throws IOException {
  FileSystem fs = LocalFileSystem.getInstance();
  FileSystemWriter<Object> writer = FileSystemWriter.newWriter(
      fs, new Path("/tmp"), -1, -1,
      new DatasetDescriptor.Builder()
          .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "true")
          .schema(SCHEMA)
          .format("parquet")
          .build(),
      SCHEMA);

  Assert.assertEquals(
      "Enabling the non-durable parquet appender should get us a non-durable appender",
      ParquetAppender.class,
      writer.newAppender(testDirectory).getClass());
}
@Before
@Override
public void setUp() throws Exception {
  super.setUp();
  outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true")
      .schema(STATS_SCHEMA)
      .format(format)
      .build(), GenericData.Record.class);
}