@Override
public void sync() throws EventDeliveryException {
  if (nEventsHandled > 0) {
    if (Formats.PARQUET.equals(
        dataset.getDataset().getDescriptor().getFormat())) {
      // We need to close the writer on sync if we're writing to a Parquet
      // dataset
      close();
    } else {
      if (writer instanceof Syncable) {
        ((Syncable) writer).sync();
      }
    }
  }
}
@Override
Format directory(FileSystem fs, Path path, List<Format> formats)
    throws IOException {
  Format format = null;
  for (Format otherFormat : formats) {
    if (format == null) {
      format = otherFormat;
    } else if (!format.equals(otherFormat)) {
      throw new ValidationException(String.format(
          "Path contains multiple formats (%s, %s): %s",
          format, otherFormat, path));
    }
  }
  return format;
}
private static void setConfigProperties(Configuration conf, Format format,
    Schema schema, Class<?> type) {
  GenericData model = DataModelUtil.getDataModelForType(type);
  if (Formats.AVRO.equals(format)) {
    setModel.invoke(conf, model.getClass());
    conf.set(AVRO_SCHEMA_INPUT_KEY, schema.toString());
  } else if (Formats.PARQUET.equals(format)) {
    // TODO: update to a version of Parquet with setAvroDataSupplier
    //AvroReadSupport.setAvroDataSupplier(conf,
    //    DataModelUtil.supplierClassFor(model));
    AvroReadSupport.setAvroReadSchema(conf, schema);
  }
}
static boolean isSupportedFormat(DatasetDescriptor descriptor) {
  Format format = descriptor.getFormat();
  return (SUPPORTED_FORMATS.contains(format) ||
      (Formats.CSV.equals(format) &&
          DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)));
}
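For context, a hedged sketch of a descriptor that the check above would accept for CSV. The builder calls come from the public DatasetDescriptor API; the literal property key "kite.allow.csv" is an assumption standing in for FileSystemProperties.ALLOW_CSV_PROP and should be verified against your Kite version.

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Formats;

// Assumed key for FileSystemProperties.ALLOW_CSV_PROP; verify before relying on it.
DatasetDescriptor csvDescriptor = new DatasetDescriptor.Builder()
    .schemaLiteral("{\"type\":\"record\",\"name\":\"Rec\",\"fields\":[" +
        "{\"name\":\"id\",\"type\":\"long\"}]}")
    .format(Formats.CSV)
    .property("kite.allow.csv", "true")
    .build();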
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
    long rollIntervalMillis, long targetFileSize, DatasetDescriptor descriptor,
    Schema writerSchema) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
    } else {
      return new FileSystemWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  } else {
    return new FileSystemWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  }
}
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
@Override
public <T> Collection<T> read(Class<T> targetClass) {
  DatasetDescriptor descriptor = getDatasetDescriptor(targetClass);
  if (descriptor == null) {
    throw new StoreException("Unable to locate dataset for target class "
        + targetClass.getName());
  }

  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    return readGenericRecords(targetClass, null);
  } else {
    return readPojo(targetClass, null);
  }
}
@Override
public <T> Collection<T> read(Class<T> targetClass, ViewCallback viewCallback) {
  DatasetDescriptor descriptor = getDatasetDescriptor(targetClass);
  if (descriptor == null) {
    throw new StoreException("Unable to locate dataset for target class "
        + targetClass.getName());
  }

  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    return readGenericRecords(targetClass, viewCallback);
  } else {
    return readPojo(targetClass, viewCallback);
  }
}
@SuppressWarnings("unchecked") private RecordReader<E, Void> createUnfilteredRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { Format format = dataset.getDescriptor().getFormat(); if (Formats.AVRO.equals(format)) { return new AvroKeyReaderWrapper(new AvroCombineInputFormat<E>()); } else if (Formats.PARQUET.equals(format)) { return new ValueReaderWrapper(new AvroParquetCombineInputFormat()); } else if (Formats.JSON.equals(format)) { JSONInputFormat<E> delegate = new JSONInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.CSV.equals(format)) { CSVInputFormat<E> delegate = new CSVInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.INPUTFORMAT.equals(format)) { return InputFormatUtil.newRecordReader(dataset.getDescriptor()); } else { throw new UnsupportedOperationException( "Not a supported format: " + format); } }
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106 private void openNextReader() { if (Formats.PARQUET.equals(descriptor.getFormat())) { this.reader = new ParquetFileSystemDatasetReader(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } else if (Formats.JSON.equals(descriptor.getFormat())) { this.reader = new JSONFileReader<E>( fileSystem, filesIter.next(), accessor); } else if (Formats.CSV.equals(descriptor.getFormat())) { this.reader = new CSVFileReader<E>(fileSystem, filesIter.next(), descriptor, accessor); } else if (Formats.INPUTFORMAT.equals(descriptor.getFormat())) { this.reader = new InputFormatReader(fileSystem, filesIter.next(), descriptor); } else { this.reader = new FileSystemDatasetReader<E>(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } reader.initialize(); this.readerIterator = Iterators.filter(reader, constraints.toEntityPredicate( (pathIter != null ? pathIter.getStorageKey() : null), accessor)); }
@VisibleForTesting
@SuppressWarnings("unchecked")
<E> FileAppender<E> newAppender(Path temp) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return (FileAppender<E>) new DurableParquetAppender(
          fs, temp, schema, conf, descriptor.getCompressionType());
    } else {
      return (FileAppender<E>) new ParquetAppender(
          fs, temp, schema, conf, descriptor.getCompressionType());
    }
  } else if (Formats.AVRO.equals(format)) {
    return new AvroAppender<E>(fs, temp, schema, descriptor.getCompressionType());
  } else if (Formats.CSV.equals(format) &&
      DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
    return new CSVAppender<E>(fs, temp, descriptor);
  } else {
    this.state = ReaderWriterState.ERROR;
    throw new UnknownFormatException("Unknown format " + descriptor);
  }
}
@Override
@SuppressWarnings({"unchecked", "deprecation"})
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  Job job = new Job(conf);
  Format format = dataset.getDescriptor().getFormat();

  if (setInputPaths(jobContext, job)) {
    if (Formats.AVRO.equals(format)) {
      AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema());
      AvroCombineInputFormat<E> delegate = new AvroCombineInputFormat<E>();
      return delegate.getSplits(jobContext);
    } else if (Formats.PARQUET.equals(format)) {
      AvroParquetCombineInputFormat delegate = new AvroParquetCombineInputFormat();
      return delegate.getSplits(jobContext);
    } else if (Formats.JSON.equals(format)) {
      return new JSONInputFormat().getSplits(jobContext);
    } else if (Formats.CSV.equals(format)) {
      // this generates an unchecked cast exception?
      return new CSVInputFormat().getSplits(jobContext);
    } else if (Formats.INPUTFORMAT.equals(format)) {
      return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor())
          .getSplits(jobContext);
    } else {
      throw new UnsupportedOperationException(
          "Not a supported format: " + format);
    }
  } else {
    return ImmutableList.of();
  }
}
FileSystemDataset(FileSystem fileSystem, Path directory,
    String namespace, String name,
    DatasetDescriptor descriptor, URI uri,
    @Nullable PartitionListener partitionListener,
    Class<E> type) {
  super(type, descriptor.getSchema());
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type" +
        " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  this.partitionStrategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
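To make the Parquet precondition above concrete, here is a minimal standalone sketch of the same type check. Only IndexedRecord and GenericData come from Avro; the class and method names are hypothetical.

import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.IndexedRecord;

// Hypothetical demo mirroring the constructor's Parquet type check.
class ParquetTypeCheckDemo {
  // A type passes if it is an Avro IndexedRecord (generic or specific
  // records) or the unconstrained Object.class used for generic entities.
  static boolean parquetCompatible(Class<?> type) {
    return IndexedRecord.class.isAssignableFrom(type) || type == Object.class;
  }

  public static void main(String[] args) {
    System.out.println(parquetCompatible(GenericData.Record.class)); // true: generic record
    System.out.println(parquetCompatible(String.class));             // false: not an IndexedRecord
    System.out.println(parquetCompatible(Object.class));             // true: generic entities allowed
  }
}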
// Fragment of a format-consistency check; the enclosing
// Preconditions.checkArgument call is inferred from the %s-style message arguments.
Preconditions.checkArgument(
    Formats.fromString(format).equals(existingFormat),
    "Found %s data, but --format is %s",
    existingFormat.getName(), format);
private PartitionedDatasetWriter(FileSystemView<E> view) {
  final DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Preconditions.checkArgument(descriptor.isPartitioned(),
      "Dataset " + view.getDataset() + " is not partitioned");

  this.view = view;
  this.partitionStrategy = descriptor.getPartitionStrategy();

  int defaultMaxWriters = partitionStrategy.getCardinality();
  if (defaultMaxWriters < 0 || defaultMaxWriters > DEFAULT_WRITER_CACHE_SIZE) {
    defaultMaxWriters = DEFAULT_WRITER_CACHE_SIZE;
  }
  this.maxWriters = DescriptorUtil.getInt(WRITER_CACHE_SIZE_PROP,
      descriptor, defaultMaxWriters);

  this.state = ReaderWriterState.NEW;
  this.reusedKey = new StorageKey(partitionStrategy);
  this.accessor = view.getAccessor();
  this.provided = view.getProvidedValues();

  // get file rolling properties
  if (!Formats.PARQUET.equals(descriptor.getFormat())) {
    this.targetFileSize = DescriptorUtil.getLong(
        TARGET_FILE_SIZE_PROP, descriptor, -1);
  } else {
    targetFileSize = -1;
  }
  this.rollIntervalMillis = 1000 * DescriptorUtil.getLong(
      ROLL_INTERVAL_S_PROP, descriptor, -1);
}
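The writer-cache sizing in the constructor above boils down to: use the partition strategy's cardinality when it is known and small, otherwise fall back to the default cache size, and let a descriptor property override either. A minimal sketch with a hypothetical helper name and parameters:

// Hypothetical helper mirroring the maxWriters selection above.
static int effectiveMaxWriters(int cardinality, int defaultCacheSize, Integer propertyOverride) {
  // Unknown (-1) or very large cardinalities fall back to the default cache size.
  int defaultMaxWriters = cardinality;
  if (defaultMaxWriters < 0 || defaultMaxWriters > defaultCacheSize) {
    defaultMaxWriters = defaultCacheSize;
  }
  // A writer-cache-size property on the descriptor, when present, overrides the computed default.
  return (propertyOverride != null) ? propertyOverride : defaultMaxWriters;
}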