@Override
public void sync() throws EventDeliveryException {
  if (nEventsHandled > 0) {
    if (Formats.PARQUET.equals(
        dataset.getDataset().getDescriptor().getFormat())) {
      // We need to close the writer on sync if we're writing to a Parquet
      // dataset
      close();
    } else {
      if (writer instanceof Syncable) {
        ((Syncable) writer).sync();
      }
    }
  }
}
Format format = descriptor.getFormat();
Preconditions.checkArgument(allowedFormats().contains(format.getName()),
    "Unsupported format: " + format.getName());
static boolean isSupportedFormat(DatasetDescriptor descriptor) {
  Format format = descriptor.getFormat();
  return (SUPPORTED_FORMATS.contains(format) ||
      (Formats.CSV.equals(format) &&
       DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)));
}
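For context, a caller opts a dataset into the CSV write path by setting the property behind FileSystemProperties.ALLOW_CSV_PROP on its descriptor. A minimal sketch, assuming an illustrative inline schema; note that FileSystemProperties lives in Kite's SPI package, so treat the constant's visibility as an assumption:

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Formats;
import org.kitesdk.data.spi.filesystem.FileSystemProperties;

public class CsvDescriptorExample {
  public static void main(String[] args) {
    // Build a CSV descriptor that isSupportedFormat(...) would accept.
    // The schema literal is an illustrative assumption.
    DatasetDescriptor csvDescriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"username\",\"type\":\"string\"}]}")
        .format(Formats.CSV)
        // opt in to CSV support; without this, only the built-in formats pass
        .property(FileSystemProperties.ALLOW_CSV_PROP, "true")
        .build();
    System.out.println(csvDescriptor.getFormat().getName());
  }
}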
public FileSystemViewKeyInputFormat(FileSystemDataset<E> dataset,
    Configuration conf) {
  this.dataset = dataset;
  this.view = null;
  LOG.debug("Dataset: {}", dataset);
  Format format = dataset.getDescriptor().getFormat();
  setConfigProperties(conf, format, dataset.getSchema(), dataset.getType());
}
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
      "A reader may not be opened more than once - current state:%s", state);

  final Format format = descriptor.getFormat();
  if (!SUPPORTED_FORMATS.contains(format)) {
    throw new UnknownFormatException("Cannot open format:" + format.getName());
  }

  this.state = ReaderWriterState.OPEN;
}
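The NEW-to-OPEN state check above reflects the reader lifecycle Kite expects: a reader is initialized once (the dataset does this when newReader() is called), iterated, then closed. A minimal usage sketch, assuming a hypothetical local "users" dataset URI and generic records:

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Datasets;

public class ReadUsersExample {
  public static void main(String[] args) {
    // Hypothetical dataset URI; replace with a real one.
    Dataset<GenericRecord> users =
        Datasets.load("dataset:file:/tmp/data/users", GenericRecord.class);

    DatasetReader<GenericRecord> reader = null;
    try {
      // newReader() hands back an already-opened reader; initializing it a
      // second time would trip the checkState above.
      reader = users.newReader();
      while (reader.hasNext()) {
        GenericRecord user = reader.next();
        System.out.println(user);
      }
    } finally {
      if (reader != null) {
        reader.close();
      }
    }
  }
}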
public FileSystemViewKeyInputFormat(FileSystemView<E> view,
    Configuration conf) {
  this.dataset = (FileSystemDataset<E>) view.getDataset();
  this.view = view;
  LOG.debug("View: {}", view);
  Format format = dataset.getDescriptor().getFormat();
  setConfigProperties(conf, format, view.getSchema(), view.getType());
}
@Override
public <T> Collection<T> read(Class<T> targetClass, ViewCallback viewCallback) {
  DatasetDescriptor descriptor = getDatasetDescriptor(targetClass);
  if (descriptor == null) {
    throw new StoreException("Unable to locate dataset for target class "
        + targetClass.getName());
  }

  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    return readGenericRecords(targetClass, viewCallback);
  } else {
    return readPojo(targetClass, viewCallback);
  }
}
@Override
public <T> Collection<T> read(Class<T> targetClass) {
  DatasetDescriptor descriptor = getDatasetDescriptor(targetClass);
  if (descriptor == null) {
    throw new StoreException("Unable to locate dataset for target class "
        + targetClass.getName());
  }

  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    return readGenericRecords(targetClass, null);
  } else {
    return readPojo(targetClass, null);
  }
}
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106 private void openNextReader() { if (Formats.PARQUET.equals(descriptor.getFormat())) { this.reader = new ParquetFileSystemDatasetReader(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } else if (Formats.JSON.equals(descriptor.getFormat())) { this.reader = new JSONFileReader<E>( fileSystem, filesIter.next(), accessor); } else if (Formats.CSV.equals(descriptor.getFormat())) { this.reader = new CSVFileReader<E>(fileSystem, filesIter.next(), descriptor, accessor); } else if (Formats.INPUTFORMAT.equals(descriptor.getFormat())) { this.reader = new InputFormatReader(fileSystem, filesIter.next(), descriptor); } else { this.reader = new FileSystemDatasetReader<E>(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } reader.initialize(); this.readerIterator = Iterators.filter(reader, constraints.toEntityPredicate( (pathIter != null ? pathIter.getStorageKey() : null), accessor)); }
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
      "Unable to open a writer from state:%s", state);

  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  ValidationException.check(
      FileSystemWriter.isSupportedFormat(descriptor),
      "Not a supported format: %s", descriptor.getFormat());

  LOG.debug("Opening partitioned dataset writer w/strategy:{}",
      partitionStrategy);

  cachedWriters = CacheBuilder.newBuilder().maximumSize(maxWriters)
      .removalListener(new DatasetWriterCloser<E>())
      .build(createCacheLoader());

  state = ReaderWriterState.OPEN;
}
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
    long rollIntervalMillis, long targetFileSize,
    DatasetDescriptor descriptor, Schema writerSchema) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    } else {
      return new FileSystemWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  } else {
    return new FileSystemWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  }
}
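The Parquet branch above means a descriptor can request durable (incremental) Parquet writes by explicitly disabling the non-durable default. A sketch, referencing the property through its constant rather than hard-coding the key; the schema literal is an assumption:

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Formats;
import org.kitesdk.data.spi.filesystem.FileSystemProperties;

public class DurableParquetExample {
  public static void main(String[] args) {
    // Setting the non-durable property to "false" makes
    // DescriptorUtil.isDisabled(...) return true, so newWriter(...) takes
    // the IncrementalWriter branch instead of the plain FileSystemWriter.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\":\"record\",\"name\":\"Event\",\"fields\":"
            + "[{\"name\":\"id\",\"type\":\"long\"}]}")
        .format(Formats.PARQUET)
        .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "false")
        .build();
    System.out.println(descriptor.getProperty(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP));
  }
}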
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
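Callers never construct these writer classes directly; they come back from a dataset's (or view's) newWriter() once initialize() has set up the per-partition writer cache shown earlier. A hedged usage sketch, assuming a hypothetical partitioned "events" dataset:

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;

public class WriteEventsExample {
  public static void main(String[] args) {
    // Hypothetical partitioned dataset URI.
    Dataset<GenericRecord> events =
        Datasets.load("dataset:file:/tmp/data/events", GenericRecord.class);

    DatasetWriter<GenericRecord> writer = null;
    try {
      // For a partitioned dataset this is a PartitionedDatasetWriter that
      // routes each record to a cached per-partition delegate writer.
      writer = events.newWriter();
      // writer.write(record); // write prebuilt GenericRecords here
    } finally {
      if (writer != null) {
        writer.close();
      }
    }
  }
}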
@Test
public void testLoad() {
  ensureCreated();

  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);

  Assert.assertNotNull("DatasetDescriptor should be returned", loaded);
  Assert.assertEquals("Schema should match",
      testDescriptor.getSchema(), loaded.getSchema());
  Assert.assertEquals("PartitionStrategy should match",
      testDescriptor.getPartitionStrategy(), loaded.getPartitionStrategy());
  Assert.assertEquals("Format should match",
      testDescriptor.getFormat(), loaded.getFormat());
}
@Test
public void testUpdate() {
  ensureCreated();

  /*
   * To be clear: we are testing that even crazy, incompatible changes are
   * happily saved by the MetadataProvider. Rule enforcement is done upstream
   * by libraries that are in a better position to make decisions about what
   * changes are incompatible.
   */

  final DatasetDescriptor saved =
      provider.update(NAMESPACE, NAME, anotherDescriptor);

  Assert.assertNotNull("Updated Descriptor should be returned", saved);
  Assert.assertEquals("Schema should match update",
      anotherDescriptor.getSchema(), saved.getSchema());
  Assert.assertEquals("PartitionStrategy should match update",
      anotherDescriptor.getPartitionStrategy(), saved.getPartitionStrategy());
  Assert.assertEquals("Format should match update",
      anotherDescriptor.getFormat(), saved.getFormat());
}
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files in parent
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files in parent
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
@Test
public void testCreate() {
  Assert.assertFalse("Sanity check", provider.exists(NAMESPACE, NAME));

  DatasetDescriptor created = provider.create(NAMESPACE, NAME, testDescriptor);

  Assert.assertNotNull("Descriptor should be returned", created);
  Assert.assertTrue("Descriptor should exist",
      provider.exists(NAMESPACE, NAME));
  Assert.assertEquals("Schema should match",
      testDescriptor.getSchema(), created.getSchema());
  Assert.assertEquals("PartitionStrategy should match",
      testDescriptor.getPartitionStrategy(), created.getPartitionStrategy());
  Assert.assertEquals("Format should match",
      testDescriptor.getFormat(), created.getFormat());
  // the MetadataProvider optionally sets the location, nothing to test
}
@SuppressWarnings("unchecked") private RecordReader<E, Void> createUnfilteredRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { Format format = dataset.getDescriptor().getFormat(); if (Formats.AVRO.equals(format)) { return new AvroKeyReaderWrapper(new AvroCombineInputFormat<E>()); } else if (Formats.PARQUET.equals(format)) { return new ValueReaderWrapper(new AvroParquetCombineInputFormat()); } else if (Formats.JSON.equals(format)) { JSONInputFormat<E> delegate = new JSONInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.CSV.equals(format)) { CSVInputFormat<E> delegate = new CSVInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.INPUTFORMAT.equals(format)) { return InputFormatUtil.newRecordReader(dataset.getDescriptor()); } else { throw new UnsupportedOperationException( "Not a supported format: " + format); } }
@Test
public void testUpdateFailsWithFormatChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder(testDescriptor)
          .format(Formats.AVRO)
          .build());

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .format(Formats.PARQUET)
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to format change");
  } catch (ValidationException e) {
    // expected
  }

  Assert.assertEquals(
      Formats.AVRO,
      repo.load(NAMESPACE, NAME).getDescriptor().getFormat());
}