/**
 * The job dataset may already exist if the ApplicationMaster was restarted.
 */
@SuppressWarnings("unchecked")
private static <E> Dataset<E> loadOrCreateJobDataset(JobContext jobContext) {
  Dataset<Object> dataset = load(jobContext).getDataset();
  String jobDatasetName = getJobDatasetName(jobContext);
  DatasetRepository repo = getDatasetRepository(jobContext);
  if (repo.exists(TEMP_NAMESPACE, jobDatasetName)) {
    Dataset<E> tempDataset = repo.load(TEMP_NAMESPACE, jobDatasetName,
        DatasetKeyOutputFormat.<E>getType(jobContext));
    try {
      Compatibility.checkCompatible(dataset.getDescriptor(),
          tempDataset.getDescriptor());
      return tempDataset;
    } catch (RuntimeException ex) {
      // swallow: the leftover job dataset is incompatible with the target,
      // so fall through and create a fresh one
    }
  }
  return repo.create(TEMP_NAMESPACE, jobDatasetName,
      copy(dataset.getDescriptor()),
      DatasetKeyOutputFormat.<E>getType(jobContext));
}
private static <E> Dataset<E> loadOrCreateTaskAttemptDataset(TaskAttemptContext taskContext) {
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  DatasetRepository repo = getDatasetRepository(taskContext);
  Dataset<E> jobDataset = loadJobDataset(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    return repo.load(TEMP_NAMESPACE, taskAttemptDatasetName);
  } else {
    return repo.create(TEMP_NAMESPACE, taskAttemptDatasetName,
        copy(jobDataset.getDescriptor()));
  }
}
private static void deleteJobDataset(JobContext jobContext) {
  DatasetRepository repo = getDatasetRepository(jobContext);
  repo.delete(TEMP_NAMESPACE, getJobDatasetName(jobContext));
}
private static void deleteTaskAttemptDataset(TaskAttemptContext taskContext) {
  DatasetRepository repo = getDatasetRepository(taskContext);
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    repo.delete(TEMP_NAMESPACE, taskAttemptDatasetName);
  }
}
public DatasetTarget(URI uri) {
  this.uri = uri;
  Configuration temp = emptyConf();
  // use appendTo since handleExisting checks for existing data
  DatasetKeyOutputFormat.configure(temp).appendTo(uri);
  this.formatBundle = outputBundle(temp);
}
@Override
@SuppressWarnings("unchecked")
public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  View<E> target = load(taskAttemptContext);
  View<E> working;

  if (usePerTaskAttemptDatasets(target, conf)) {
    working = loadOrCreateTaskAttemptView(taskAttemptContext);
  } else {
    working = target;
  }

  boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (working.getDataset().getDescriptor().isPartitioned() && partitionDir != null) {
    if (!(target instanceof FileSystemDataset)) {
      throw new UnsupportedOperationException("Partitions only supported for " +
          "FileSystemDataset. Dataset: " + target);
    }
    FileSystemDataset fsDataset = (FileSystemDataset) target;
    PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
    if (key != null && !key.getValues().isEmpty()) {
      working = fsDataset.getPartition(key, true);
    }
    return new DatasetRecordWriter<E>(working, copyRecords);
  } else {
    return new DatasetRecordWriter<E>(working, copyRecords);
  }
}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> view = load(taskAttemptContext);
  // pass conf to match the two-argument usePerTaskAttemptDatasets
  // used by getRecordWriter above
  return usePerTaskAttemptDatasets(view, conf) ?
      new MergeOutputCommitter<E>() : new NullOutputCommitter();
}
private static DatasetRepository getDatasetRepository(JobContext jobContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository repo = DatasetRepositories.repositoryFor(conf.get(KITE_OUTPUT_URI));
  if (repo instanceof TemporaryDatasetRepositoryAccessor) {
    Dataset<Object> dataset = load(jobContext).getDataset();
    String namespace = dataset.getNamespace();
    repo = ((TemporaryDatasetRepositoryAccessor) repo)
        .getTemporaryRepository(namespace, getJobDatasetName(jobContext));
  }
  return repo;
}
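/*
 * Illustrative only: the lookup above is the same call a user can make
 * directly against a repository URI. The URI below is an assumption for
 * the sketch, not a value taken from this codebase.
 */
DatasetRepository repo =
    DatasetRepositories.repositoryFor("repo:hdfs:/tmp/data");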
@Override
public void checkOutputSpecs(JobContext jobContext) {
  // The committer setup will fail if the output dataset does not exist
  View<E> target = load(jobContext);
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) {
    case APPEND:
      break;
    case OVERWRITE:
      // if the merge won't use replace, then delete the existing data
      if (!canReplace(target)) {
        target.deleteAll();
      }
      break;
    default:
    case DEFAULT:
      boolean isReady = false;
      if (target instanceof Signalable) {
        isReady = ((Signalable) target).isReady();
      }
      if (isReady || !target.isEmpty()) {
        throw new DatasetException(
            "View is not empty or has been signaled as ready: " + target);
      }
      break;
  }
}
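/*
 * A minimal sketch of how callers select the write modes handled above,
 * using the DatasetKeyOutputFormat.ConfigBuilder methods; the dataset URI
 * and job name are hypothetical.
 */
Job job = Job.getInstance(new Configuration(), "write-events");
// DEFAULT: checkOutputSpecs fails if the target is non-empty or signaled ready
DatasetKeyOutputFormat.configure(job).writeTo("dataset:hive:default/events");
// APPEND: existing data is kept
DatasetKeyOutputFormat.configure(job).appendTo("dataset:hive:default/events");
// OVERWRITE: existing data is replaced (or deleted when replace isn't supported)
DatasetKeyOutputFormat.configure(job).overwrite("dataset:hive:default/events");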
@SuppressWarnings("deprecation") private static <E> View<E> load(JobContext jobContext) { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Class<E> type = getType(jobContext); String outputUri = conf.get(KITE_OUTPUT_URI); return Datasets.<E, View<E>>load(outputUri, type); }
public DatasetTarget(View<E> view) {
  this.view = view;
  Configuration temp = emptyConf();
  // use appendTo since handleExisting checks for existing data
  DatasetKeyOutputFormat.configure(temp).appendTo(view);
  this.formatBundle = outputBundle(temp);
  this.uri = view.getDataset().getUri();
}
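/*
 * A hedged usage sketch: wiring this target into a Crunch pipeline. In
 * practice the target is usually obtained via CrunchDatasets.asTarget
 * rather than constructed directly; Driver, inputView, and outputView are
 * assumed names for this example.
 */
Pipeline pipeline = new MRPipeline(Driver.class, conf);
PCollection<GenericData.Record> records =
    pipeline.read(CrunchDatasets.asSource(inputView));
pipeline.write(records, CrunchDatasets.asTarget(outputView),
    Target.WriteMode.APPEND);
pipeline.done();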
private static <E> Dataset<E> loadJobDataset(JobContext jobContext) {
  DatasetRepository repo = getDatasetRepository(jobContext);
  return repo.load(TEMP_NAMESPACE, getJobDatasetName(jobContext));
}
@SuppressWarnings("deprecation") private static <E> View<E> load(JobContext jobContext) { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Class<E> type = getType(jobContext); String outputUri = conf.get(KITE_OUTPUT_URI); return Datasets.<E, View<E>>load(outputUri, type); }
@SuppressWarnings("deprecation") private Job createJob() throws Exception { Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).writeTo(outputDataset).withType(GenericData.Record.class); return job; }