@Override
public void sync() throws EventDeliveryException {
  if (nEventsHandled > 0) {
    if (Formats.PARQUET.equals(
        dataset.getDataset().getDescriptor().getFormat())) {
      // We need to close the writer on sync if we're writing to a Parquet
      // dataset
      close();
    } else {
      if (writer instanceof Syncable) {
        ((Syncable) writer).sync();
      }
    }
  }
}
Format format = descriptor.getFormat();
Preconditions.checkArgument(allowedFormats().contains(format.getName()),
    "Unsupported format: " + format.getName());
static boolean isSupportedFormat(DatasetDescriptor descriptor) {
  Format format = descriptor.getFormat();
  return (SUPPORTED_FORMATS.contains(format) ||
      (Formats.CSV.equals(format) &&
       DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)));
}
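For context, a caller opts a dataset into the CSV write path by setting the property behind FileSystemProperties.ALLOW_CSV_PROP on its descriptor. A minimal sketch, assuming an illustrative inline schema; note that FileSystemProperties lives in Kite's SPI package, so treat the constant's visibility as an assumption:

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Formats;
import org.kitesdk.data.spi.filesystem.FileSystemProperties;

public class CsvDescriptorExample {
  public static void main(String[] args) {
    // Build a CSV descriptor that isSupportedFormat(...) would accept.
    // The schema literal is an illustrative assumption.
    DatasetDescriptor csvDescriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"username\",\"type\":\"string\"}]}")
        .format(Formats.CSV)
        // opt in to CSV support; without this, only the built-in formats pass
        .property(FileSystemProperties.ALLOW_CSV_PROP, "true")
        .build();
    System.out.println(csvDescriptor.getFormat().getName());
  }
}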
public FileSystemViewKeyInputFormat(FileSystemDataset<E> dataset,
    Configuration conf) {
  this.dataset = dataset;
  this.view = null;
  LOG.debug("Dataset: {}", dataset);
  Format format = dataset.getDescriptor().getFormat();
  setConfigProperties(conf, format, dataset.getSchema(), dataset.getType());
}
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
      "A reader may not be opened more than once - current state:%s", state);

  final Format format = descriptor.getFormat();
  if (!SUPPORTED_FORMATS.contains(format)) {
    throw new UnknownFormatException("Cannot open format:" + format.getName());
  }

  this.state = ReaderWriterState.OPEN;
}
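The NEW-to-OPEN state check above reflects the reader lifecycle Kite expects: a reader is initialized once (the dataset does this when newReader() is called), iterated, then closed. A minimal usage sketch, assuming a hypothetical local "users" dataset URI and generic records:

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Datasets;

public class ReadUsersExample {
  public static void main(String[] args) {
    // Hypothetical dataset URI; replace with a real one.
    Dataset<GenericRecord> users =
        Datasets.load("dataset:file:/tmp/data/users", GenericRecord.class);

    DatasetReader<GenericRecord> reader = null;
    try {
      // newReader() hands back an already-opened reader; initializing it a
      // second time would trip the checkState above.
      reader = users.newReader();
      while (reader.hasNext()) {
        GenericRecord user = reader.next();
        System.out.println(user);
      }
    } finally {
      if (reader != null) {
        reader.close();
      }
    }
  }
}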
public FileSystemViewKeyInputFormat(FileSystemView<E> view,
    Configuration conf) {
  this.dataset = (FileSystemDataset<E>) view.getDataset();
  this.view = view;
  LOG.debug("View: {}", view);
  Format format = dataset.getDescriptor().getFormat();
  setConfigProperties(conf, format, view.getSchema(), view.getType());
}
@Override
public <T> Collection<T> read(Class<T> targetClass, ViewCallback viewCallback) {
  DatasetDescriptor descriptor = getDatasetDescriptor(targetClass);
  if (descriptor == null) {
    throw new StoreException("Unable to locate dataset for target class "
        + targetClass.getName());
  }

  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    return readGenericRecords(targetClass, viewCallback);
  } else {
    return readPojo(targetClass, viewCallback);
  }
}
@Override
public <T> Collection<T> read(Class<T> targetClass) {
  DatasetDescriptor descriptor = getDatasetDescriptor(targetClass);
  if (descriptor == null) {
    throw new StoreException("Unable to locate dataset for target class "
        + targetClass.getName());
  }

  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    return readGenericRecords(targetClass, null);
  } else {
    return readPojo(targetClass, null);
  }
}
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106 private void openNextReader() { if (Formats.PARQUET.equals(descriptor.getFormat())) { this.reader = new ParquetFileSystemDatasetReader(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } else if (Formats.JSON.equals(descriptor.getFormat())) { this.reader = new JSONFileReader<E>( fileSystem, filesIter.next(), accessor); } else if (Formats.CSV.equals(descriptor.getFormat())) { this.reader = new CSVFileReader<E>(fileSystem, filesIter.next(), descriptor, accessor); } else if (Formats.INPUTFORMAT.equals(descriptor.getFormat())) { this.reader = new InputFormatReader(fileSystem, filesIter.next(), descriptor); } else { this.reader = new FileSystemDatasetReader<E>(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } reader.initialize(); this.readerIterator = Iterators.filter(reader, constraints.toEntityPredicate( (pathIter != null ? pathIter.getStorageKey() : null), accessor)); }
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
      "Unable to open a writer from state:%s", state);

  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  ValidationException.check(
      FileSystemWriter.isSupportedFormat(descriptor),
      "Not a supported format: %s", descriptor.getFormat());

  LOG.debug("Opening partitioned dataset writer w/strategy:{}",
      partitionStrategy);

  cachedWriters = CacheBuilder.newBuilder().maximumSize(maxWriters)
      .removalListener(new DatasetWriterCloser<E>())
      .build(createCacheLoader());

  state = ReaderWriterState.OPEN;
}
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
    long rollIntervalMillis, long targetFileSize,
    DatasetDescriptor descriptor, Schema writerSchema) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    } else {
      return new FileSystemWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  } else {
    return new FileSystemWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  }
}
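The Parquet branch above means a descriptor can request durable (incremental) Parquet writes by explicitly disabling the non-durable default. A sketch, referencing the property through its constant rather than hard-coding the key; the schema literal is an assumption:

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Formats;
import org.kitesdk.data.spi.filesystem.FileSystemProperties;

public class DurableParquetExample {
  public static void main(String[] args) {
    // Setting the non-durable property to "false" makes
    // DescriptorUtil.isDisabled(...) return true, so newWriter(...) takes
    // the IncrementalWriter branch instead of the plain FileSystemWriter.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\":\"record\",\"name\":\"Event\",\"fields\":"
            + "[{\"name\":\"id\",\"type\":\"long\"}]}")
        .format(Formats.PARQUET)
        .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "false")
        .build();
    System.out.println(descriptor.getProperty(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP));
  }
}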
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
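Callers never construct these writer classes directly; they come back from a dataset's (or view's) newWriter() once initialize() has set up the per-partition writer cache shown earlier. A hedged usage sketch, assuming a hypothetical partitioned "events" dataset:

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;

public class WriteEventsExample {
  public static void main(String[] args) {
    // Hypothetical partitioned dataset URI.
    Dataset<GenericRecord> events =
        Datasets.load("dataset:file:/tmp/data/events", GenericRecord.class);

    DatasetWriter<GenericRecord> writer = null;
    try {
      // For a partitioned dataset this is a PartitionedDatasetWriter that
      // routes each record to a cached per-partition delegate writer.
      writer = events.newWriter();
      // writer.write(record); // write prebuilt GenericRecords here
    } finally {
      if (writer != null) {
        writer.close();
      }
    }
  }
}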
@Test
public void testLoad() {
  ensureCreated();

  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);

  Assert.assertNotNull("DatasetDescriptor should be returned", loaded);
  Assert.assertEquals("Schema should match",
      testDescriptor.getSchema(), loaded.getSchema());
  Assert.assertEquals("PartitionStrategy should match",
      testDescriptor.getPartitionStrategy(), loaded.getPartitionStrategy());
  Assert.assertEquals("Format should match",
      testDescriptor.getFormat(), loaded.getFormat());
}
@Test
public void testUpdate() {
  ensureCreated();

  /*
   * To be clear: we are testing that even crazy, incompatible changes are
   * happily saved by the MetadataProvider. Rule enforcement is done upstream
   * by libraries that are in a better position to make decisions about what
   * changes are incompatible.
   */

  final DatasetDescriptor saved =
      provider.update(NAMESPACE, NAME, anotherDescriptor);

  Assert.assertNotNull("Updated Descriptor should be returned", saved);
  Assert.assertEquals("Schema should match update",
      anotherDescriptor.getSchema(), saved.getSchema());
  Assert.assertEquals("PartitionStrategy should match update",
      anotherDescriptor.getPartitionStrategy(), saved.getPartitionStrategy());
  Assert.assertEquals("Format should match update",
      anotherDescriptor.getFormat(), saved.getFormat());
}
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files in parent
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files in parent
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
@Test
public void testCreate() {
  Assert.assertFalse("Sanity check", provider.exists(NAMESPACE, NAME));

  DatasetDescriptor created = provider.create(NAMESPACE, NAME, testDescriptor);

  Assert.assertNotNull("Descriptor should be returned", created);
  Assert.assertTrue("Descriptor should exist",
      provider.exists(NAMESPACE, NAME));
  Assert.assertEquals("Schema should match",
      testDescriptor.getSchema(), created.getSchema());
  Assert.assertEquals("PartitionStrategy should match",
      testDescriptor.getPartitionStrategy(), created.getPartitionStrategy());
  Assert.assertEquals("Format should match",
      testDescriptor.getFormat(), created.getFormat());
  // the MetadataProvider optionally sets the location, nothing to test
}
@SuppressWarnings("unchecked") private RecordReader<E, Void> createUnfilteredRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { Format format = dataset.getDescriptor().getFormat(); if (Formats.AVRO.equals(format)) { return new AvroKeyReaderWrapper(new AvroCombineInputFormat<E>()); } else if (Formats.PARQUET.equals(format)) { return new ValueReaderWrapper(new AvroParquetCombineInputFormat()); } else if (Formats.JSON.equals(format)) { JSONInputFormat<E> delegate = new JSONInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.CSV.equals(format)) { CSVInputFormat<E> delegate = new CSVInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.INPUTFORMAT.equals(format)) { return InputFormatUtil.newRecordReader(dataset.getDescriptor()); } else { throw new UnsupportedOperationException( "Not a supported format: " + format); } }
@Test
public void testUpdateFailsWithFormatChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder(testDescriptor)
          .format(Formats.AVRO)
          .build());

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .format(Formats.PARQUET)
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to format change");
  } catch (ValidationException e) {
    // expected
  }

  Assert.assertEquals(
      Formats.AVRO,
      repo.load(NAMESPACE, NAME).getDescriptor().getFormat());
}