org.apache.crunch.PCollection.materialize java code examples

/**
 * Materializes the given collection by delegating to the collection itself.
 *
 * @param pcollection the collection whose contents are requested
 * @return the result of {@code pcollection.materialize()}
 */
@Override
public <T> Iterable<T> materialize(PCollection<T> pcollection) {
 final Iterable<T> contents = pcollection.materialize();
 return contents;
}

/**
 * Hands the materialization request straight to the supplied collection.
 *
 * @param pcollection the collection to materialize
 * @return the iterable produced by {@code pcollection.materialize()}
 */
@Override
public <T> Iterable<T> materialize(PCollection<T> pcollection) {
 final Iterable<T> materialized = pcollection.materialize();
 return materialized;
}

/**
 * Returns the materialized form of the supplied collection; this method adds
 * no logic of its own on top of {@code PCollection.materialize()}.
 *
 * @param pcollection the collection to read
 * @return the iterable returned by {@code pcollection.materialize()}
 */
@Override
public <T> Iterable<T> materialize(PCollection<T> pcollection) {
 final Iterable<T> result = pcollection.materialize();
 return result;
}

/**
 * Constructs a new instance of this {@code PObject} implementation.
 *
 * @param collect The backing {@code PCollection} for this {@code PObject}.
 */
public PObjectImpl(PCollection<S> collect) {
 this.name = collect.toString();
 this.iterable = collect.materialize();
 this.cachedValue = null;
 this.isCached = false;
}

/** {@inheritDoc} */
@Override
public final T getValue() {
 if (isCached) {
  // Computed by an earlier call; reuse it.
  return cachedValue;
 }
 // First call: derive the value from the materialized collection and remember it.
 cachedValue = process(collection.materialize());
 isCached = true;
 return cachedValue;
}

/**
 * Obtain the contents of this LCollection as a {@link Stream} that can be processed locally. Note, this may trigger
 * your job to execute in a distributed environment if the pipeline has not yet been run.
 *
 * @return a sequential (non-parallel) stream over the materialized contents of the underlying collection
 */
default Stream<S> materialize() {
  Iterable<S> contents = underlying().materialize();
  // 'false' requests a sequential stream.
  return StreamSupport.stream(contents.spliterator(), false);
}

/**
 * Builds an in-memory union of the given collections by materializing each one
 * in list order and appending its elements to a single list.
 *
 * @param collections the collections to combine; the PType of the first one is
 *        used for the result
 * @return a {@code MemCollection} holding every element of every input
 * @throws IndexOutOfBoundsException if {@code collections} is empty, because the
 *         first element is read to obtain the PType
 */
@Override
public <S> PCollection<S> union(List<PCollection<S>> collections) {
 final List<S> merged = Lists.newArrayList();
 for (final PCollection<S> member : collections) {
  for (final S element : member.materialize()) {
   merged.add(element);
  }
 }
 return new MemCollection<S>(merged, collections.get(0).getPType());
}

/**
 * Builds an in-memory union of the given collections and this collection.
 *
 * <p>Elements of the arguments are appended first, in argument order; the
 * contents of {@code collect} are appended afterwards.
 *
 * @param collections the collections to combine with this one; the PType of
 *        the first one is used for the result
 * @return a {@code MemCollection} holding all gathered elements
 * @throws ArrayIndexOutOfBoundsException if no collections are passed, because
 *         the first argument is read to obtain the PType
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
 final Collection<S> merged = Lists.newArrayList();
 for (int i = 0; i < collections.length; i++) {
  for (final S element : collections[i].materialize()) {
   merged.add(element);
  }
 }
 merged.addAll(collect);
 return new MemCollection<S>(merged, collections[0].getPType());
}

/**
 * Combines the supplied collections with this one into a single in-memory
 * collection.
 *
 * <p>The arguments are materialized one after another and their elements
 * gathered in argument order; {@code collect} is added last.
 *
 * @param collections the other collections; the first one supplies the PType
 *        of the result
 * @return a {@code MemCollection} over the gathered elements
 * @throws ArrayIndexOutOfBoundsException if called with no arguments, because
 *         {@code collections[0]} is read for its PType
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
 final Collection<S> gathered = Lists.newArrayList();
 final int count = collections.length;
 for (int idx = 0; idx < count; idx++) {
  final Iterable<S> contents = collections[idx].materialize();
  for (final S item : contents) {
   gathered.add(item);
  }
 }
 gathered.addAll(collect);
 return new MemCollection<S>(gathered, collections[0].getPType());
}

/**
 * Collects every {@code Target} the given callable depends on.
 *
 * <p>The result starts with the callable's own targets. For each of its
 * PCollections, the collection's target dependencies are added, and the
 * collection is materialized; if the source of the materialized iterable is
 * itself a {@code Target}, it is added as well.
 *
 * @param callable the callable to inspect
 * @return a mutable set of the targets found
 */
private Set<Target> getDependencies(PipelineCallable<?> callable) {
 Set<Target> targets = Sets.newHashSet(callable.getAllTargets().values());
 for (PCollection input : callable.getAllPCollections().values()) {
  targets.addAll(((PCollectionImpl) input).getTargetDependencies());
  Source backing = ((MaterializableIterable) input.materialize()).getSource();
  if (backing instanceof Target) {
   targets.add((Target) backing);
  }
 }
 return targets;
}

/**
 * Computes the split points for the rows in {@code affectedRows}.
 *
 * <p>The table's region start keys are read and sorted, passed to
 * {@code DetermineAffectedRegionsFn} (presumably mapping each row to the start
 * key of its region -- confirm against that class), and the materialized
 * results are de-duplicated and converted to first-on-row {@code KeyValue}s.
 *
 * @param table the table whose region start keys are consulted
 * @param affectedRows the rows to locate
 * @return the distinct split points, in the order they were first emitted
 * @throws CrunchRuntimeException if reading the start keys fails
 * @throws AssertionError if the table reports no start keys
 */
private static <C> List<KeyValue> getSplitPoints(HTable table, PTable<C, Void> affectedRows) throws IOException {
 List<byte[]> startKeys;
 try {
  startKeys = Lists.newArrayList(table.getStartKeys());
  if (startKeys.isEmpty()) {
   throw new AssertionError(table + " has no regions!");
  }
 } catch (IOException e) {
  throw new CrunchRuntimeException(e);
 }
 Collections.sort(startKeys, Bytes.BYTES_COMPARATOR);
 Iterable<ByteBuffer> bufferedStartKeys = affectedRows
     .parallelDo(new DetermineAffectedRegionsFn(startKeys), Writables.bytes()).materialize();
 // set to get rid of the potential duplicate start keys emitted
 ImmutableSet.Builder<KeyValue> startKeyBldr = ImmutableSet.builder();
 for (final ByteBuffer bufferedStartKey : bufferedStartKeys) {
  // ByteBuffer.array() exposes the whole backing array and ignores position/limit,
  // so copy only the readable window. duplicate() leaves the original buffer's
  // position untouched.
  final ByteBuffer window = bufferedStartKey.duplicate();
  final byte[] rowKey = new byte[window.remaining()];
  window.get(rowKey);
  startKeyBldr.add(KeyValueUtil.createFirstOnRow(rowKey));
 }
 return ImmutableList.copyOf(startKeyBldr.build());
}

/**
 * Writes the materialized contents of {@code collection} as text to a file
 * named {@code "out"} under the target's path.
 *
 * <p>For a {@code PTable}, each element is written as
 * {@code first<TAB>second<CR><LF>}; otherwise each element's {@code toString()}
 * is written followed by {@code <CR><LF>}. Only {@code PathTarget} instances
 * are supported; anything else is logged as an error and ignored. I/O failures
 * are logged, not rethrown.
 *
 * @param collection the collection to write
 * @param target where to write it
 */
@Override
public void write(PCollection<?> collection, Target target) {
 if (target instanceof PathTarget) {
  Path path = ((PathTarget) target).getPath();
  try {
   FileSystem fs = FileSystem.get(conf);
   FSDataOutputStream os = fs.create(new Path(path, "out"));
   try {
    if (collection instanceof PTable) {
     for (Object o : collection.materialize()) {
      Pair p = (Pair) o;
      os.writeBytes(p.first().toString());
      os.writeBytes("\t");
      os.writeBytes(p.second().toString());
      os.writeBytes("\r\n");
     }
    } else {
     for (Object o : collection.materialize()) {
      os.writeBytes(o.toString() + "\r\n");
     }
    }
   } finally {
    // Close on every path so a failure mid-write does not leak the stream.
    os.close();
   }
  } catch (IOException e) {
   LOG.error("Exception writing target: " + target, e);
  }
 } else {
  LOG.error("Target " + target + " is not a PathTarget instance");
 }
}

/**
 * Computes the split points for the rows in {@code affectedRows}.
 *
 * <p>The region start keys are read from the locator and sorted, passed to
 * {@code DetermineAffectedRegionsFn} (presumably mapping each row to the start
 * key of its region -- confirm against that class), and the materialized
 * results are de-duplicated and converted to first-on-row {@code KeyValue}s.
 *
 * @param regionLocator source of the region start keys
 * @param affectedRows the rows to locate
 * @return the distinct split points, in the order they were first emitted
 * @throws CrunchRuntimeException if reading the start keys fails
 * @throws AssertionError if the locator reports no start keys
 */
private static <C> List<KeyValue> getSplitPoints(RegionLocator regionLocator, PTable<C, Void> affectedRows) throws IOException {
 List<byte[]> startKeys;
 try {
  startKeys = Lists.newArrayList(regionLocator.getStartKeys());
  if (startKeys.isEmpty()) {
   throw new AssertionError(regionLocator.getName().getNameAsString() + " has no regions!");
  }
 } catch (IOException e) {
  throw new CrunchRuntimeException(e);
 }
 Collections.sort(startKeys, Bytes.BYTES_COMPARATOR);
 Iterable<ByteBuffer> bufferedStartKeys = affectedRows
     .parallelDo(new DetermineAffectedRegionsFn(startKeys), Writables.bytes()).materialize();
 // set to get rid of the potential duplicate start keys emitted
 ImmutableSet.Builder<KeyValue> startKeyBldr = ImmutableSet.builder();
 for (final ByteBuffer bufferedStartKey : bufferedStartKeys) {
  // ByteBuffer.array() exposes the whole backing array and ignores position/limit,
  // so copy only the readable window. duplicate() leaves the original buffer's
  // position untouched.
  final ByteBuffer window = bufferedStartKey.duplicate();
  final byte[] rowKey = new byte[window.remaining()];
  window.get(rowKey);
  startKeyBldr.add(KeyValueUtil.createFirstOnRow(rowKey));
 }
 return ImmutableList.copyOf(startKeyBldr.build());
}

if (collection instanceof PTable) {
 byte[] tab = "\t".getBytes(Charsets.UTF_8);
 for (Object o : collection.materialize()) {
  Pair p = (Pair) o;
  os.write(p.first().toString().getBytes(Charsets.UTF_8));
 for (Object o : collection.materialize()) {
  os.write(o.toString().getBytes(Charsets.UTF_8));
  os.write(newLine);

outputIndex++;
if (collection instanceof PTable) {
 for (Object o : collection.materialize()) {
  Pair p = (Pair) o;
  os.writeBytes(p.first().toString());
 for (Object o : collection.materialize()) {
  os.writeBytes(o.toString() + "\r\n");

Iterator<ByteBuffer> iter = result.materialize().iterator();
x.start();
while (iter.hasNext()) {

/**
 * Writes the materialized contents of {@code collection} to a sequence file
 * at {@code path}.
 *
 * <p>Every record uses a {@code NullWritable} key; the value is the element
 * passed through the collection PType's output map function. The value class
 * of the file comes from the PType's converter.
 *
 * @param fs the file system to write to; its configuration is used for the writer
 * @param path the file to create
 * @param collection the collection to write
 * @throws IOException if the writer cannot be created, appended to, or closed
 */
private void writeSequenceFileFromPCollection(final FileSystem fs, final Path path,
  final PCollection collection) throws IOException {
 final PType pType = collection.getPType();
 final Converter converter = pType.getConverter();
 final Class valueClass = converter.getValueClass();
 final SequenceFile.Writer writer = new SequenceFile.Writer(fs, fs.getConf(), path,
   NullWritable.class, valueClass);
 try {
  for (final Object o : collection.materialize()) {
   final Object value = pType.getOutputMapFn().map(o);
   writer.append(NullWritable.get(), value);
  }
 } finally {
  // Release the writer even when mapping or appending fails.
  writer.close();
 }
}

/**
 * Writes the materialized contents of {@code recordCollection} to
 * {@code outputStream} as an Avro data file, then closes the stream.
 *
 * <p>Each element is passed through the Avro type's output map function
 * before being appended. Once the writer has been constructed, both the
 * writer and the stream are closed whether or not writing succeeds.
 *
 * @param outputStream the stream to write to; closed by this method
 * @param recordCollection the collection to write; its PType must be an {@code AvroType}
 * @throws IllegalStateException if the collection has no PType
 * @throws IOException if creating, appending to, or closing the file fails
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void writeAvroFile(FSDataOutputStream outputStream, PCollection recordCollection) throws IOException {
 AvroType avroType = (AvroType)recordCollection.getPType();
 if (avroType == null) {
  throw new IllegalStateException("Can't write a non-typed Avro collection");
 }
 DatumWriter datumWriter = Avros.newWriter(avroType);
 DataFileWriter dataFileWriter = new DataFileWriter(datumWriter);
 try {
  dataFileWriter.create(avroType.getSchema(), outputStream);
  for (Object record : recordCollection.materialize()) {
   dataFileWriter.append(avroType.getOutputMapFn().map(record));
  }
 } finally {
  // Close both resources on every path; the nested finally guarantees the
  // stream is closed even if closing the writer itself throws.
  try {
   dataFileWriter.close();
  } finally {
   outputStream.close();
  }
 }
}

/**
 * Sets the reducer count on {@code builder} and, when more than one reducer
 * is used, sets up total-order partitioning from a sample of the table's keys.
 *
 * <p>A non-positive {@code numReducers} is replaced by the recommended
 * partition count for the table, or by 1 when that recommendation is below 5.
 * With more than one reducer, {@code numReducers - 1} keys are reservoir-sampled
 * and materialized; the materialized path is registered as the partitioner
 * path, and its source is added as a source target when it is one.
 *
 * @param builder the grouping options builder to configure
 * @param ptable the table being grouped
 * @param conf configuration used to compute the recommended partition count
 * @param numReducers requested reducer count; values {@code <= 0} mean "choose for me"
 */
private static <K, V> void configureReducers(GroupingOptions.Builder builder,
  PTable<K, V> ptable, Configuration conf, int numReducers) {
 int reducers = numReducers;
 if (reducers <= 0) {
  final int recommended = PartitionUtils.getRecommendedPartitions(ptable, conf);
  // Not worth the overhead, force it to 1
  reducers = (recommended < 5) ? 1 : recommended;
 }
 builder.numReducers(reducers);
 if (reducers <= 1) {
  // A single reducer needs no partitioner setup.
  return;
 }
 Iterable<K> sampledKeys = Sample.reservoirSample(ptable.keys(), reducers - 1).materialize();
 MaterializableIterable<K> materialized = (MaterializableIterable<K>) sampledKeys;
 if (materialized.isSourceTarget()) {
  builder.sourceTargets((SourceTarget) materialized.getSource());
 }
 builder.partitionerClass(TotalOrderPartitioner.class);
 builder.conf(TotalOrderPartitioner.PARTITIONER_PATH, materialized.getPath().toString());
 //TODO: distcache handling
}

Javadoc

Returns a reference to the data set represented by this PCollection that may be used by the client to read the data locally.

Popular methods of PCollection

parallelDo
Applies the given doFn to the elements of this PCollection and returns a new PCollection that is the
getPType
Returns the PType of this PCollection.
by
Apply the given map function to each element of this instance in order to create a PTable.
write
Write the contents of this PCollection to the given Target, using the given Target.WriteMode to hand
getPipeline
Returns the Pipeline associated with this PCollection.
getTypeFamily
Returns the PTypeFamily of this PCollection.
count
Returns a PTable instance that contains the counts of each unique element of this PCollection.
aggregate
Returns a PCollection that contains the result of aggregating all values in this instance.
asReadable
cache
Marks this data as cached using the given CachingOptions. Cached PCollections will only be processed
filter
Apply the given filter function to this instance and return the resulting PCollection.
first

Popular in Java

Making http requests using okhttp
notifyDataSetChanged (ArrayAdapter)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
scheduleAtFixedRate (ScheduledExecutorService)
InputStreamReader (java.io)
A class for turning a byte stream into a character stream. Data read from the source input stream is
Thread (java.lang)
A thread is a thread of execution in a program. The Java Virtual Machine allows an application to ha
List (java.util)
An ordered collection (also known as a sequence). The user of this interface has precise control ove
JTextField (javax.swing)
Join (org.hibernate.mapping)
Runner (org.openjdk.jmh.runner)
Top Vim plugins

How to use the materialize method in org.apache.crunch.PCollection

Best Java code snippets using org.apache.crunch.PCollection.materialize (Showing top 19 results out of 315)

How to use
materialize
method
in
org.apache.crunch.PCollection