/**
 * Materializes the given collection by delegating to the collection itself.
 *
 * @param pcollection the collection whose contents are requested
 * @return the {@code Iterable} produced by {@code pcollection.materialize()}
 */
@Override
public <T> Iterable<T> materialize(PCollection<T> pcollection) {
  final Iterable<T> contents = pcollection.materialize();
  return contents;
}
/**
 * Hands the materialization request straight to the supplied collection.
 *
 * @param pcollection the collection to materialize
 * @return the result of calling {@code materialize()} on {@code pcollection}
 */
@Override
public <T> Iterable<T> materialize(PCollection<T> pcollection) {
  final Iterable<T> materialized = pcollection.materialize();
  return materialized;
}
/**
 * Returns the contents of the supplied collection; the collection performs
 * the materialization itself.
 *
 * @param pcollection the collection to read
 * @return the {@code Iterable} returned by {@code pcollection.materialize()}
 */
@Override
public <T> Iterable<T> materialize(PCollection<T> pcollection) {
  final Iterable<T> values = pcollection.materialize();
  return values;
}
/**
 * Creates a {@code PObject} backed by the given {@code PCollection}.
 *
 * <p>The collection's string form becomes this object's name, the result of
 * {@code collect.materialize()} is stored for later use, and the value cache
 * starts out empty.
 *
 * @param collect The backing {@code PCollection} for this {@code PObject}.
 */
public PObjectImpl(PCollection<S> collect) {
  this.name = collect.toString();
  this.iterable = collect.materialize();

  // Nothing has been computed yet.
  this.isCached = false;
  this.cachedValue = null;
}
/**
 * {@inheritDoc}
 *
 * <p>The value is computed on the first call by passing the materialized
 * backing collection to {@code process}; later calls return the stored result.
 */
@Override
public final T getValue() {
  if (isCached) {
    return cachedValue;
  }
  // First access: compute once and remember the result.
  cachedValue = process(collection.materialize());
  isCached = true;
  return cachedValue;
}
/**
 * Obtain the contents of this LCollection as a {@link Stream} that can be processed locally. Note, this may trigger
 * your job to execute in a distributed environment if the pipeline has not yet been run.
 *
 * @return a sequential (non-parallel) stream over the materialized contents of the underlying collection
 */
default Stream<S> materialize() {
  final Iterable<S> contents = underlying().materialize();
  final boolean parallel = false;
  return StreamSupport.stream(contents.spliterator(), parallel);
}
/**
 * Concatenates the materialized contents of every supplied collection, in
 * list order, into a single in-memory collection.
 *
 * @param collections the collections to combine; the first element supplies
 *        the {@code PType} of the result, so the list must not be empty
 * @return a new {@code MemCollection} holding all elements
 */
@Override
public <S> PCollection<S> union(List<PCollection<S>> collections) {
  final List<S> merged = Lists.newArrayList();
  for (final PCollection<S> member : collections) {
    for (final S element : member.materialize()) {
      merged.add(element);
    }
  }
  final PCollection<S> first = collections.get(0);
  return new MemCollection<S>(merged, first.getPType());
}
/**
 * Builds a new in-memory collection containing the elements of every supplied
 * collection (in argument order) followed by this collection's own elements.
 *
 * @param collections the collections to merge with this one; the first one
 *        supplies the {@code PType} of the result, so at least one is required
 * @return a new {@code MemCollection} holding the combined elements
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
  final Collection<S> combined = Lists.newArrayList();
  for (int i = 0; i < collections.length; i++) {
    for (final S element : collections[i].materialize()) {
      combined.add(element);
    }
  }
  // This collection's own contents go last.
  combined.addAll(collect);
  return new MemCollection<S>(combined, collections[0].getPType());
}
/**
 * Merges the given collections with this one into a fresh in-memory
 * collection. Elements of the arguments are added first, in argument order,
 * and this collection's own elements are appended afterwards.
 *
 * @param collections the other collections; the {@code PType} of the first
 *        one is used for the result, so at least one must be passed
 * @return a new {@code MemCollection} with all elements
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
  final Collection<S> all = Lists.newArrayList();
  for (final PCollection<S> other : collections) {
    final Iterable<S> contents = other.materialize();
    for (final S item : contents) {
      all.add(item);
    }
  }
  all.addAll(collect);
  return new MemCollection<S>(all, collections[0].getPType());
}
/**
 * Collects every {@code Target} the given callable depends on.
 *
 * <p>The result starts with the callable's own targets. For each of its
 * collections, the collection's target dependencies are added, and the source
 * behind its materialized iterable is added too when that source is itself a
 * {@code Target}.
 *
 * @param callable the callable to inspect
 * @return a mutable set of the targets found
 */
private Set<Target> getDependencies(PipelineCallable<?> callable) {
  Set<Target> dependencies = Sets.newHashSet(callable.getAllTargets().values());
  for (PCollection pcollection : callable.getAllPCollections().values()) {
    dependencies.addAll(((PCollectionImpl) pcollection).getTargetDependencies());

    Source source = ((MaterializableIterable) pcollection.materialize()).getSource();
    if (source instanceof Target) {
      dependencies.add((Target) source);
    }
  }
  return dependencies;
}
private static <C> List<KeyValue> getSplitPoints(HTable table, PTable<C, Void> affectedRows) throws IOException { List<byte[]> startKeys; try { startKeys = Lists.newArrayList(table.getStartKeys()); if (startKeys.isEmpty()) { throw new AssertionError(table + " has no regions!"); } } catch (IOException e) { throw new CrunchRuntimeException(e); } Collections.sort(startKeys, Bytes.BYTES_COMPARATOR); Iterable<ByteBuffer> bufferedStartKeys = affectedRows .parallelDo(new DetermineAffectedRegionsFn(startKeys), Writables.bytes()).materialize(); // set to get rid of the potential duplicate start keys emitted ImmutableSet.Builder<KeyValue> startKeyBldr = ImmutableSet.builder(); for (final ByteBuffer bufferedStartKey : bufferedStartKeys) { startKeyBldr.add(KeyValueUtil.createFirstOnRow(bufferedStartKey.array())); } return ImmutableList.copyOf(startKeyBldr.build()); }
/**
 * Writes the materialized contents of a collection as text to a file named
 * {@code out} under the target's path.
 *
 * <p>For a {@code PTable}, each entry is written as
 * {@code first<TAB>second<CR><LF>}; for any other collection each element's
 * {@code toString()} is written followed by {@code <CR><LF>}. Text is encoded
 * as UTF-8 rather than with {@code writeBytes}, which keeps only the low
 * eight bits of every character.
 *
 * <p>Failures are logged rather than thrown: an {@code IOException} while
 * writing, or a target that is not a {@code PathTarget}, results in an error
 * log entry only.
 *
 * @param collection the collection to write
 * @param target where to write it; must be a {@code PathTarget}
 */
@Override
public void write(PCollection<?> collection, Target target) {
  if (!(target instanceof PathTarget)) {
    LOG.error("Target " + target + " is not a PathTarget instance");
    return;
  }

  Path path = ((PathTarget) target).getPath();
  try {
    FileSystem fs = FileSystem.get(conf);
    FSDataOutputStream os = fs.create(new Path(path, "out"));
    try {
      final java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
      final byte[] tab = "\t".getBytes(utf8);
      final byte[] newLine = "\r\n".getBytes(utf8);
      if (collection instanceof PTable) {
        for (Object o : collection.materialize()) {
          Pair p = (Pair) o;
          os.write(p.first().toString().getBytes(utf8));
          os.write(tab);
          os.write(p.second().toString().getBytes(utf8));
          os.write(newLine);
        }
      } else {
        for (Object o : collection.materialize()) {
          os.write(o.toString().getBytes(utf8));
          os.write(newLine);
        }
      }
    } finally {
      // Release the stream even when materializing or writing fails part-way.
      os.close();
    }
  } catch (IOException e) {
    LOG.error("Exception writing target: " + target, e);
  }
}
private static <C> List<KeyValue> getSplitPoints(RegionLocator regionLocator, PTable<C, Void> affectedRows) throws IOException { List<byte[]> startKeys; try { startKeys = Lists.newArrayList(regionLocator.getStartKeys()); if (startKeys.isEmpty()) { throw new AssertionError(regionLocator.getName().getNameAsString() + " has no regions!"); } } catch (IOException e) { throw new CrunchRuntimeException(e); } Collections.sort(startKeys, Bytes.BYTES_COMPARATOR); Iterable<ByteBuffer> bufferedStartKeys = affectedRows .parallelDo(new DetermineAffectedRegionsFn(startKeys), Writables.bytes()).materialize(); // set to get rid of the potential duplicate start keys emitted ImmutableSet.Builder<KeyValue> startKeyBldr = ImmutableSet.builder(); for (final ByteBuffer bufferedStartKey : bufferedStartKeys) { startKeyBldr.add(KeyValueUtil.createFirstOnRow(bufferedStartKey.array())); } return ImmutableList.copyOf(startKeyBldr.build()); }
if (collection instanceof PTable) { byte[] tab = "\t".getBytes(Charsets.UTF_8); for (Object o : collection.materialize()) { Pair p = (Pair) o; os.write(p.first().toString().getBytes(Charsets.UTF_8)); for (Object o : collection.materialize()) { os.write(o.toString().getBytes(Charsets.UTF_8)); os.write(newLine);
outputIndex++; if (collection instanceof PTable) { for (Object o : collection.materialize()) { Pair p = (Pair) o; os.writeBytes(p.first().toString()); for (Object o : collection.materialize()) { os.writeBytes(o.toString() + "\r\n");
Iterator<ByteBuffer> iter = result.materialize().iterator(); x.start(); while (iter.hasNext()) {
/**
 * Writes the materialized contents of a collection to a sequence file.
 *
 * <p>Every element is converted with the collection type's output map
 * function and appended under a {@code NullWritable} key. The value class of
 * the file comes from the type's converter.
 *
 * @param fs the file system to write to; its configuration is used for the writer
 * @param path the file to create
 * @param collection the collection whose contents are written
 * @throws IOException if creating, appending to, or closing the writer fails
 */
private void writeSequenceFileFromPCollection(final FileSystem fs, final Path path,
    final PCollection collection) throws IOException {
  final PType pType = collection.getPType();
  final Converter converter = pType.getConverter();
  final Class valueClass = converter.getValueClass();

  final SequenceFile.Writer writer = new SequenceFile.Writer(fs, fs.getConf(), path,
      NullWritable.class, valueClass);
  try {
    for (final Object o : collection.materialize()) {
      final Object value = pType.getOutputMapFn().map(o);
      writer.append(NullWritable.get(), value);
    }
  } finally {
    // Close the writer even if mapping or appending an element fails.
    writer.close();
  }
}
/**
 * Writes the materialized contents of a collection to the given stream as an
 * Avro data file, then closes the stream.
 *
 * <p>Each record is converted with the Avro type's output map function before
 * it is appended. Both the data file writer and the stream are closed even
 * when writing fails part-way.
 *
 * @param outputStream the stream to write to; closed by this method once writing has begun
 * @param recordCollection the collection to write; its {@code PType} is cast to {@code AvroType}
 * @throws IOException if creating the file, appending a record, or closing fails
 * @throws IllegalStateException if the collection has no {@code PType}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void writeAvroFile(FSDataOutputStream outputStream, PCollection recordCollection) throws IOException {
  AvroType avroType = (AvroType) recordCollection.getPType();
  if (avroType == null) {
    throw new IllegalStateException("Can't write a non-typed Avro collection");
  }

  try {
    DatumWriter datumWriter = Avros.newWriter(avroType);
    DataFileWriter dataFileWriter = new DataFileWriter(datumWriter);
    try {
      dataFileWriter.create(avroType.getSchema(), outputStream);
      for (Object record : recordCollection.materialize()) {
        dataFileWriter.append(avroType.getOutputMapFn().map(record));
      }
    } finally {
      dataFileWriter.close();
    }
  } finally {
    // Always release the stream, including when the writer could not be created or closed.
    outputStream.close();
  }
}
private static <K, V> void configureReducers(GroupingOptions.Builder builder, PTable<K, V> ptable, Configuration conf, int numReducers) { if (numReducers <= 0) { numReducers = PartitionUtils.getRecommendedPartitions(ptable, conf); if (numReducers < 5) { // Not worth the overhead, force it to 1 numReducers = 1; } } builder.numReducers(numReducers); if (numReducers > 1) { Iterable<K> iter = Sample.reservoirSample(ptable.keys(), numReducers - 1).materialize(); MaterializableIterable<K> mi = (MaterializableIterable<K>) iter; if (mi.isSourceTarget()) { builder.sourceTargets((SourceTarget) mi.getSource()); } builder.partitionerClass(TotalOrderPartitioner.class); builder.conf(TotalOrderPartitioner.PARTITIONER_PATH, mi.getPath().toString()); //TODO: distcache handling } }