@Override @SuppressWarnings("unchecked") public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) { Configuration conf = Hadoop.TaskAttemptContext .getConfiguration.invoke(taskAttemptContext); View<E> target = load(taskAttemptContext); View<E> working; if (usePerTaskAttemptDatasets(target)) { working = loadOrCreateTaskAttemptView(taskAttemptContext); } else { working = target; } boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false); String partitionDir = conf.get(KITE_PARTITION_DIR); if (working.getDataset().getDescriptor().isPartitioned() && partitionDir != null) { if (!(target instanceof FileSystemDataset)) { throw new UnsupportedOperationException("Partitions only supported for " + "FileSystemDataset. Dataset: " + target); } FileSystemDataset fsDataset = (FileSystemDataset) target; PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir)); if (key != null && !key.getValues().isEmpty()) { working = fsDataset.getPartition(key, true); } return new DatasetRecordWriter<E>(working, copyRecords); } else { return new DatasetRecordWriter<E>(working, copyRecords); } }
/**
 * Checks for a non-public (hidden) method implementation, declared on the
 * given class and made accessible via a privileged action.
 *
 * @param targetClass the class to check for the method
 * @param methodName name of a method (different from constructor)
 * @param argClasses argument classes for the method
 * @return this Builder for method chaining
 * @see java.lang.Class#getDeclaredMethod(String, Class[])
 */
public Builder hiddenImpl(Class<?> targetClass, String methodName, Class<?>... argClasses) {
  // don't do any work if an implementation has been found
  if (method != null) {
    return this;
  }

  try {
    Method hidden = targetClass.getDeclaredMethod(methodName, argClasses);
    AccessController.doPrivileged(new MakeAccessible(hidden));
    this.method = new UnboundMethod(hidden, name);
  } catch (SecurityException e) {
    // unusable
  } catch (NoSuchMethodException e) {
    // not the right implementation
  }

  return this;
}
public <R> R invoke(Object target, Object... args) {
  try {
    return this.<R>invokeChecked(target, args);
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
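// Usage sketch for the unchecked invoke(...) above, paired with the
// hiddenImpl/impl builder methods shown around it. Names here are
// assumptions from context: the enclosing classes (DynMethods.Builder,
// DynMethods.UnboundMethod) and the build() method are not declared in
// these snippets.
private static final DynMethods.UnboundMethod GET_CONFIGURATION =
    new DynMethods.Builder("getConfiguration")
        .impl(JobContext.class, "getConfiguration")  // no-arg public method
        .build();

static Configuration configurationOf(JobContext context) {
  // <R> is inferred from the return type; reflection failures surface as
  // unchecked exceptions via Throwables.propagate
  return GET_CONFIGURATION.invoke(context);
}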
@Override
public void checkOutputSpecs(JobContext jobContext) {
  // The committer setup will fail if the output dataset does not exist
  View<E> target = load(jobContext);
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) {
    case APPEND:
      break;
    case OVERWRITE:
      // if the merge won't use replace, then delete the existing data
      if (!canReplace(target)) {
        target.deleteAll();
      }
      break;
    default:
    case DEFAULT:
      boolean isReady = false;
      if (target instanceof Signalable) {
        isReady = ((Signalable) target).isReady();
      }
      if (isReady || !target.isEmpty()) {
        throw new DatasetException(
            "View is not empty or has been signaled as ready: " + target);
      }
      break;
  }
}
@Override @SuppressWarnings("unchecked") public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) { Configuration conf = Hadoop.TaskAttemptContext .getConfiguration.invoke(taskAttemptContext); View<E> target = load(taskAttemptContext); View<E> working; if (usePerTaskAttemptDatasets(target, conf)) { working = loadOrCreateTaskAttemptView(taskAttemptContext); } else { working = target; } boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false); String partitionDir = conf.get(KITE_PARTITION_DIR); if (working.getDataset().getDescriptor().isPartitioned() && partitionDir != null) { if (!(target instanceof FileSystemDataset)) { throw new UnsupportedOperationException("Partitions only supported for " + "FileSystemDataset. Dataset: " + target); } FileSystemDataset fsDataset = (FileSystemDataset) target; PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir)); if (key != null && !key.getValues().isEmpty()) { working = fsDataset.getPartition(key, true); } return new DatasetRecordWriter<E>(working, copyRecords); } else { return new DatasetRecordWriter<E>(working, copyRecords); } }
@Override @SuppressWarnings("unchecked") public void commitJob(JobContext jobContext) throws IOException { Configuration conf = Hadoop.JobContext .getConfiguration.invoke(jobContext); DatasetRepository repo = getDatasetRepository(jobContext); boolean isTemp = repo instanceof TemporaryDatasetRepository; String jobDatasetName = getJobDatasetName(jobContext); View<E> targetView = load(jobContext); Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, jobDatasetName); WriteMode mode = conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT); if (mode == WriteMode.OVERWRITE && canReplace(targetView)) { ((Replaceable<View<E>>) targetView.getDataset()).replace(targetView, jobDataset); } else { ((Mergeable<Dataset<E>>) targetView.getDataset()).merge(jobDataset); } if (targetView instanceof Signalable) { ((Signalable)targetView).signalReady(); } if (isTemp) { ((TemporaryDatasetRepository) repo).delete(); } else { repo.delete(TEMP_NAMESPACE, jobDatasetName); } }
/**
 * Given a list of puts, create a new put with the values in each put merged
 * together. It is expected that no puts have a value for the same fully
 * qualified column. Return the new put.
 *
 * @param keyBytes
 *          The key of the new put.
 * @param putList
 *          The list of puts to merge
 * @return the new Put instance
 */
public static Put mergePuts(byte[] keyBytes, List<Put> putList) {
  Put put = new Put(keyBytes);
  for (Put putToMerge : putList) {
    Map<byte[], List<KeyValue>> familyMap = (Map<byte[], List<KeyValue>>)
        GET_FAMILY_MAP_METHOD.invoke(putToMerge);
    for (List<KeyValue> keyValueList : familyMap.values()) {
      for (KeyValue keyValue : keyValueList) {
        // don't use put.add(KeyValue) since it doesn't work with HBase 0.96 onwards
        put.add(keyValue.getFamily(), keyValue.getQualifier(),
            keyValue.getTimestamp(), keyValue.getValue());
      }
    }
  }
  return put;
}
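// A hedged example of calling mergePuts above: two Puts for the same row
// key, each writing a different column family, merged into a single Put.
// Family and qualifier names are made up for illustration; Bytes and
// ImmutableList are the usual HBase and Guava utilities.
static Put mergeExample() {
  byte[] row = Bytes.toBytes("row-1");

  Put meta = new Put(row);
  meta.add(Bytes.toBytes("meta"), Bytes.toBytes("version"), Bytes.toBytes(1L));

  Put payload = new Put(row);
  payload.add(Bytes.toBytes("data"), Bytes.toBytes("body"), Bytes.toBytes("hello"));

  // no fully qualified column appears in more than one put, as required
  return mergePuts(row, ImmutableList.of(meta, payload));
}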
@Override @SuppressWarnings({"unchecked", "deprecation"}) public List<InputSplit> getSplits(JobContext jobContext) throws IOException { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Job job = new Job(conf); Format format = dataset.getDescriptor().getFormat(); if (setInputPaths(jobContext, job)) { if (Formats.AVRO.equals(format)) { AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema()); AvroCombineInputFormat<E> delegate = new AvroCombineInputFormat<E>(); return delegate.getSplits(jobContext); } else if (Formats.PARQUET.equals(format)) { AvroParquetCombineInputFormat delegate = new AvroParquetCombineInputFormat(); return delegate.getSplits(jobContext); } else if (Formats.JSON.equals(format)) { return new JSONInputFormat().getSplits(jobContext); } else if (Formats.CSV.equals(format)) { // this generates an unchecked cast exception? return new CSVInputFormat().getSplits(jobContext); } else if (Formats.INPUTFORMAT.equals(format)) { return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor()) .getSplits(jobContext); } else { throw new UnsupportedOperationException( "Not a supported format: " + format); } } else { return ImmutableList.of(); } }
/**
 * Checks for a public method implementation on the given class.
 *
 * @param targetClass the class to check for the method
 * @param methodName name of a method (different from constructor)
 * @param argClasses argument classes for the method
 * @return this Builder for method chaining
 * @see java.lang.Class#getMethod(String, Class[])
 */
public Builder impl(Class<?> targetClass, String methodName, Class<?>... argClasses) {
  // don't do any work if an implementation has been found
  if (method != null) {
    return this;
  }

  try {
    this.method = new UnboundMethod(
        targetClass.getMethod(methodName, argClasses), name);
  } catch (NoSuchMethodException e) {
    // not the right implementation
  }

  return this;
}
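// Sketch of how impl(...) and hiddenImpl(...) chain: each call returns the
// Builder and becomes a no-op once a method has been found, so candidates
// are listed from most to least preferred. This is plausibly how a method
// object like the IS_SYMLINK helper used further below is built, but that
// wiring is an assumption, not shown in these snippets.
DynMethods.UnboundMethod isSymlink = new DynMethods.Builder("isSymlink")
    .impl(FileStatus.class, "isSymlink")        // Hadoop 2+: public method
    .hiddenImpl(FileStatus.class, "isSymlink")  // fallback if access is restricted
    .build();                                   // build() assumed from context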
@Override @SuppressWarnings("unchecked") public void commitJob(JobContext jobContext) throws IOException { Configuration conf = Hadoop.JobContext .getConfiguration.invoke(jobContext); DatasetRepository repo = getDatasetRepository(jobContext); boolean isTemp = repo instanceof TemporaryDatasetRepository; String jobDatasetName = getJobDatasetName(jobContext); View<E> targetView = load(jobContext); Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, jobDatasetName); WriteMode mode = conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT); if (mode == WriteMode.OVERWRITE && canReplace(targetView)) { ((Replaceable<View<E>>) targetView.getDataset()).replace(targetView, jobDataset); } else { ((Mergeable<Dataset<E>>) targetView.getDataset()).merge(jobDataset); } if (targetView instanceof Signalable) { ((Signalable)targetView).signalReady(); } if (isTemp) { ((TemporaryDatasetRepository) repo).delete(); } else { repo.delete(TEMP_NAMESPACE, jobDatasetName); } }
@Override
public RecordReader<E, Void> createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException, InterruptedException {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(context);
  Path path;
  if (split instanceof FileSplit) {
    path = ((FileSplit) split).getPath();
  } else {
    throw new DatasetOperationException(
        "Split is not a FileSplit: %s:%s",
        split.getClass().getCanonicalName(), split);
  }
  CSVFileReader<E> reader = new CSVFileReader<E>(
      path.getFileSystem(conf), path, descriptor, accessor);
  reader.initialize();
  return reader.asRecordReader();
}
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart()))
      .second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);

  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(
        schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}
private static <T> T visit(PathVisitor<T> visitor, FileSystem fs, Path path,
    List<Path> followedLinks) throws IOException {
  if (fs.isFile(path)) {
    return visitor.file(fs, path);
  } else if (IS_SYMLINK != null &&
      IS_SYMLINK.<Boolean>invoke(fs.getFileStatus(path))) {
    Preconditions.checkArgument(!followedLinks.contains(path),
        "Encountered recursive path structure at link: " + path);
    followedLinks.add(path); // no need to remove
    return visit(visitor, fs, fs.getLinkTarget(path), followedLinks);
  }

  List<T> children = Lists.newArrayList();

  FileStatus[] statuses = fs.listStatus(path, PathFilters.notHidden());
  for (FileStatus stat : statuses) {
    // pass followedLinks through the recursive call so symlink cycles are
    // still detected below this directory
    children.add(visit(visitor, fs, stat.getPath(), followedLinks));
  }

  return visitor.directory(fs, path, children);
}
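// A hedged sketch of a PathVisitor that collects every file path under a
// directory tree. The PathVisitor interface is inferred from the calls in
// visit(...) above (visitor.file(fs, path) and visitor.directory(fs, path,
// children)); its real declaration is not shown in these snippets.
private static final PathVisitor<List<Path>> COLLECT_FILES =
    new PathVisitor<List<Path>>() {
      @Override
      public List<Path> file(FileSystem fs, Path path) {
        return ImmutableList.of(path);
      }

      @Override
      public List<Path> directory(FileSystem fs, Path path, List<List<Path>> children) {
        List<Path> files = Lists.newArrayList();
        for (List<Path> child : children) {
          files.addAll(child);
        }
        return files;
      }
    };

// visit(COLLECT_FILES, fs, root, Lists.<Path>newArrayList()) would then
// return every non-hidden file reachable from root, following symlinks
// while guarding against link cycles.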
@Override
public RecordReader<E, Void> createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException, InterruptedException {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(context);
  Path path;
  if (split instanceof FileSplit) {
    path = ((FileSplit) split).getPath();
  } else {
    throw new DatasetOperationException(
        "Split is not a FileSplit: %s:%s",
        split.getClass().getCanonicalName(), split);
  }
  JSONFileReader<E> reader = new JSONFileReader<E>(
      path.getFileSystem(conf), path, accessor);
  reader.initialize();
  return reader.asRecordReader();
}
@SuppressWarnings("unchecked") private boolean setInputPaths(JobContext jobContext, Job job) throws IOException { List<Path> paths = Lists.newArrayList((Iterator) (view == null ? dataset.pathIterator() : view.pathIterator())); LOG.debug("Input paths: {}", paths); if (paths.isEmpty()) { return false; } FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()])); // the following line is needed for Hadoop 1, otherwise the paths are not set Configuration contextConf = Hadoop.JobContext .getConfiguration.invoke(jobContext); Configuration jobConf = Hadoop.JobContext .getConfiguration.invoke(job); contextConf.set("mapred.input.dir", jobConf.get("mapred.input.dir")); return true; }