/**
 * Builds the file-writing transform for this Avro export.
 *
 * <p>Trailing slashes on {@code filenamePrefix} are stripped and a {@code /part} stem is
 * appended, so shard files land under the prefix directory.
 *
 * @param filenamePrefix base output path (directory-like); trailing slashes are removed
 * @param filenameSuffix suffix appended to each shard file name
 * @param schema Avro schema used for every destination
 * @param jdbcAvroArgs supplies the codec factory and sink configuration
 * @return a {@link WriteFiles} transform writing strings as Avro shards
 */
public static PTransform<PCollection<String>, WriteFilesResult<Void>> createWrite(
    String filenamePrefix, String filenameSuffix, Schema schema, JdbcAvroArgs jdbcAvroArgs) {
  // Normalize the prefix and append the shard-file stem.
  final String partPrefix = filenamePrefix.replaceAll("/+$", "") + "/part";
  final ValueProvider<ResourceId> prefixProvider =
      StaticValueProvider.of(FileBasedSink.convertToFileResourceIfPossible(partPrefix));
  final FileBasedSink.FilenamePolicy filenamePolicy =
      DefaultFilenamePolicy.fromStandardParameters(
          prefixProvider, DEFAULT_SHARD_TEMPLATE, filenameSuffix, false);
  // A single constant destination: same policy, schema, metadata, and codec for all records.
  final DynamicAvroDestinations<String, Void, String> destinations =
      AvroIO.constantDestinations(
          filenamePolicy,
          schema,
          ImmutableMap.of(),
          jdbcAvroArgs.getCodecFactory(),
          SerializableFunctions.identity());
  final FileBasedSink<String, Void, String> sink =
      new JdbcAvroSink<>(prefixProvider, destinations, jdbcAvroArgs);
  return WriteFiles.to(sink);
}
/**
 * Parameterized test inputs: {@link WriteFiles} in each combination of windowed writes and
 * fixed sharding.
 */
@Parameters(name = "{index}: {0}")
public static Iterable<WriteFiles<Object, Void, Object>> data() {
  WriteFiles<Object, Void, Object> plain = WriteFiles.to(new DummySink());
  WriteFiles<Object, Void, Object> windowed = WriteFiles.to(new DummySink()).withWindowedWrites();
  WriteFiles<Object, Void, Object> sharded = WriteFiles.to(new DummySink()).withNumShards(17);
  WriteFiles<Object, Void, Object> windowedAndSharded =
      WriteFiles.to(new DummySink()).withWindowedWrites().withNumShards(42);
  return ImmutableList.of(plain, windowed, sharded, windowedAndSharded);
}
/** Writing an empty PCollection must still complete successfully. */
@Test
@Category(NeedsRunner.class)
public void testWriteWithEmptyPCollection() throws IOException {
  List<String> noElements = new ArrayList<>();
  runWrite(noElements, IDENTITY_MAP, getBaseOutputFilename(), WriteFiles.to(makeSimpleSink()));
}
/** A small set of elements should round-trip through the WriteFiles transform. */
@Test
@Category(NeedsRunner.class)
public void testWrite() throws IOException {
  List<String> birds =
      Arrays.asList(
          "Critical canary",
          "Apprehensive eagle",
          "Intimidating pigeon",
          "Pedantic gull",
          "Frisky finch");
  runWrite(birds, IDENTITY_MAP, getBaseOutputFilename(), WriteFiles.to(makeSimpleSink()));
}
/**
 * A configured shard count must be honored even when there are more elements than shards.
 */
@Test
@Category(NeedsRunner.class)
public void testShardedWrite() throws IOException {
  List<String> words = Arrays.asList("one", "two", "three", "four", "five", "six");
  runShardedWrite(words, IDENTITY_MAP, getBaseOutputFilename(), WriteFiles.to(makeSimpleSink()));
}
/**
 * Builds a replacement {@link WriteFiles} whose sharding is computed by
 * {@code LogElementShardsWithDrift}, carrying over the original transform's sink, destination
 * side inputs, and windowed-write setting.
 */
@Override
public PTransformReplacement<PCollection<InputT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<InputT>,
                WriteFilesResult<DestinationT>,
                PTransform<PCollection<InputT>, WriteFilesResult<DestinationT>>>
            transform) {
  try {
    WriteFiles<InputT, DestinationT, ?> replacement =
        WriteFiles.to(WriteFilesTranslation.getSink(transform))
            .withSideInputs(WriteFilesTranslation.getDynamicDestinationSideInputs(transform))
            .withSharding(new LogElementShardsWithDrift<>());
    // Preserve the windowed-write configuration of the transform being replaced.
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }
    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform), replacement);
  } catch (IOException e) {
    // Translation deserializes the transform payload; surface failures as unchecked.
    throw new RuntimeException(e);
  }
}
/**
 * A configured shard count larger than the element count must still produce that many shards.
 */
@Test
@Category(NeedsRunner.class)
public void testExpandShardedWrite() throws IOException {
  List<String> words = Arrays.asList("one", "two", "three", "four", "five", "six");
  runShardedWrite(
      words,
      IDENTITY_MAP,
      getBaseOutputFilename(),
      WriteFiles.to(makeSimpleSink()).withNumShards(20));
}
/** An empty input must still yield exactly one (empty) shard file. */
@Test
@Category(NeedsRunner.class)
public void testEmptyWrite() throws IOException {
  runWrite(
      Collections.emptyList(),
      IDENTITY_MAP,
      getBaseOutputFilename(),
      WriteFiles.to(makeSimpleSink()));
  // Expect a single shard, no contents, and the temp directory cleaned up.
  checkFileContents(
      getBaseOutputFilename(),
      Collections.emptyList(),
      Optional.of(1),
      true /* expectRemovedTempDirectory */);
}
@Test public void testBuildWrite() { SimpleSink<Void> sink = makeSimpleSink(); WriteFiles<String, ?, String> write = WriteFiles.to(sink).withNumShards(3); assertThat((SimpleSink<Void>) write.getSink(), is(sink)); PTransform<PCollection<String>, PCollectionView<Integer>> originalSharding = write.getComputeNumShards(); assertThat(write.getComputeNumShards(), is(nullValue())); assertThat(write.getNumShardsProvider(), instanceOf(StaticValueProvider.class)); assertThat(write.getNumShardsProvider().get(), equalTo(3)); assertThat(write.getComputeNumShards(), equalTo(originalSharding)); WriteFiles<String, ?, ?> write2 = write.withSharding(SHARDING_TRANSFORM); assertThat((SimpleSink<Void>) write2.getSink(), is(sink)); assertThat(write2.getComputeNumShards(), equalTo(SHARDING_TRANSFORM)); // original unchanged WriteFiles<String, ?, ?> writeUnsharded = write2.withRunnerDeterminedSharding(); assertThat(writeUnsharded.getComputeNumShards(), nullValue()); assertThat(write.getComputeNumShards(), equalTo(originalSharding)); }
/**
 * Expands into a {@link WriteFiles} over a {@code TFRecordSink}; a fixed shard count is applied
 * only when one was explicitly configured.
 */
@Override
public PDone expand(PCollection<byte[]> input) {
  checkState(
      getOutputPrefix() != null,
      "need to set the output prefix of a TFRecordIO.Write transform");
  TFRecordSink sink =
      new TFRecordSink(
          getOutputPrefix(), getShardTemplate(), getFilenameSuffix(), getCompression());
  WriteFiles<byte[], Void, byte[]> write = WriteFiles.to(sink);
  // Zero or negative means runner-determined sharding; only override when explicitly set.
  if (getNumShards() > 0) {
    write = write.withNumShards(getNumShards());
  }
  input.apply("Write", write);
  return PDone.in(input.getPipeline());
}
/** Writing under session windowing should preserve all elements. */
@Test
@Category(NeedsRunner.class)
public void testWriteWithSessions() throws IOException {
  List<String> birds =
      Arrays.asList(
          "Critical canary",
          "Apprehensive eagle",
          "Intimidating pigeon",
          "Pedantic gull",
          "Frisky finch");
  runWrite(
      birds,
      new WindowAndReshuffle<>(Window.into(Sessions.withGapDuration(Duration.millis(1)))),
      getBaseOutputFilename(),
      WriteFiles.to(makeSimpleSink()));
}
/** Writing a fixed-windowed PCollection should preserve all elements. */
@Test
@Category(NeedsRunner.class)
public void testWriteWindowed() throws IOException {
  List<String> birds =
      Arrays.asList(
          "Critical canary",
          "Apprehensive eagle",
          "Intimidating pigeon",
          "Pedantic gull",
          "Frisky finch");
  runWrite(
      birds,
      new WindowAndReshuffle<>(Window.into(FixedWindows.of(Duration.millis(2)))),
      getBaseOutputFilename(),
      WriteFiles.to(makeSimpleSink()));
}
/** The override factory must hand back a transform distinct from the original. */
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
  ResourceId outputDirectory = LocalResources.fromString("/foo", true /* isDirectory */);
  // Sink that fails loudly if the pipeline ever tries to use it; only expansion is exercised.
  PTransform<PCollection<Object>, WriteFilesResult<Void>> original =
      WriteFiles.to(
          new FileBasedSink<Object, Void, Object>(
              StaticValueProvider.of(outputDirectory),
              DynamicFileDestinations.constant(new FakeFilenamePolicy())) {
            @Override
            public WriteOperation<Void, Object> createWriteOperation() {
              throw new IllegalArgumentException("Should not be used");
            }
          });
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<
          PCollection<Object>,
          WriteFilesResult<Void>,
          PTransform<PCollection<Object>, WriteFilesResult<Void>>>
      originalApplication =
          AppliedPTransform.of("write", objs.expand(), Collections.emptyMap(), original, p);
  assertThat(
      factory.getReplacementTransform(originalApplication).getTransform(),
      not(equalTo((Object) original)));
}
/**
 * With parallelism 5, the streaming sharded-write factory should pick 2x parallelism = 10
 * shards for an unsharded WriteFiles.
 */
@Test
public void testRunnerDeterminedSharding() {
  FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  options.setRunner(TestFlinkRunner.class);
  options.setFlinkMaster("[auto]");
  options.setParallelism(5);
  TestPipeline p = TestPipeline.fromOptions(options);
  StreamingShardedWriteFactory<Object, Void, Object> factory =
      new StreamingShardedWriteFactory<>(p.getOptions());
  WriteFiles<Object, Void, Object> original = WriteFiles.to(new TestSink(tmpFolder.toString()));
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, WriteFiles<Object, Void, Object>>
      originalApplication =
          AppliedPTransform.of("writefiles", objs.expand(), Collections.emptyMap(), original, p);
  WriteFiles<Object, Void, Object> replacement =
      (WriteFiles<Object, Void, Object>)
          factory.getReplacementTransform(originalApplication).getTransform();
  assertThat(replacement, not(equalTo((Object) original)));
  assertThat(replacement.getNumShardsProvider().get(), is(10));
}
/**
 * Forces the spilling code path: many elements, a cap of 2 writers per bundle, and a single
 * output shard under windowed writes.
 */
@Test
@Category(NeedsRunner.class)
public void testWriteSpilling() throws IOException {
  List<String> inputs = Lists.newArrayList();
  for (int i = 0; i < 100; ++i) {
    inputs.add("mambo_number_" + i);
  }
  runWrite(
      inputs,
      Window.into(FixedWindows.of(Duration.millis(2))),
      getBaseOutputFilename(),
      WriteFiles.to(makeSimpleSink())
          .withMaxNumWritersPerBundle(2)
          .withWindowedWrites()
          .withNumShards(1));
}
/** Applying WriteFiles to an unbounded PCollection without windowed writes must be rejected. */
@Test
@Category(NeedsRunner.class)
public void testUnboundedNeedsWindowed() {
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(
      "Must use windowed writes when applying WriteFiles to an unbounded PCollection");
  SimpleSink<Void> sink = makeSimpleSink();
  p.apply(Create.of("foo")).setIsBoundedInternal(IsBounded.UNBOUNDED).apply(WriteFiles.to(sink));
  p.run();
}
static Pipeline buildPipeline(ExportOptions opts) { // Use the base target directory to stage bundles ValueProvider<ResourceId> destinationPath = NestedValueProvider .of(opts.getDestinationPath(), new StringToDirResourceId()); // Concat the destination path & prefix for the final path FilePathPrefix filePathPrefix = new FilePathPrefix(destinationPath, opts.getFilenamePrefix()); SequenceFileSink<ImmutableBytesWritable, Result> sink = new SequenceFileSink<>( destinationPath, DefaultFilenamePolicy.fromStandardParameters( filePathPrefix, null, "", false ), ImmutableBytesWritable.class, WritableSerialization.class, Result.class, ResultSerialization.class ); Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); CloudBigtableScanConfiguration config = TemplateUtils.BuildExportConfig(opts); pipeline .apply("Read table", Read.from(CloudBigtableIO.read(config))) .apply("Format results", MapElements.via(new ResultToKV())) .apply("Write", WriteFiles.to(sink)); return pipeline; }
/**
 * Unbounded windowed writes without an explicit shard count must be rejected at pipeline
 * construction.
 */
@Test
@Category(NeedsRunner.class)
public void testUnboundedWritesNeedSharding() {
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(
      "When applying WriteFiles to an unbounded PCollection, "
          + "must specify number of output shards explicitly");
  SimpleSink<Void> sink = makeSimpleSink();
  p.apply(Create.of("foo"))
      .setIsBoundedInternal(IsBounded.UNBOUNDED)
      .apply(WriteFiles.to(sink).withWindowedWrites());
  p.run();
}
/** WriteFiles display data must include and delegate to the sink's display data. */
@Test
public void testDisplayData() {
  DynamicDestinations<String, Void, String> dynamicDestinations =
      DynamicFileDestinations.constant(
          DefaultFilenamePolicy.fromParams(
              new Params()
                  .withBaseFilename(
                      getBaseOutputDirectory()
                          .resolve("file", StandardResolveOptions.RESOLVE_FILE))
                  .withShardTemplate("-SS-of-NN")));
  // Sink that contributes one custom display item.
  SimpleSink<Void> sink =
      new SimpleSink<Void>(
          getBaseOutputDirectory(), dynamicDestinations, Compression.UNCOMPRESSED) {
        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
          builder.add(DisplayData.item("foo", "bar"));
        }
      };
  WriteFiles<String, ?, String> write = WriteFiles.to(sink);

  DisplayData displayData = DisplayData.from(write);
  assertThat(displayData, hasDisplayItem("sink", sink.getClass()));
  assertThat(displayData, includesDisplayDataFor("sink", sink));
}
/** Display data for a sharded WriteFiles must also report the configured shard count. */
@Test
public void testShardedDisplayData() {
  DynamicDestinations<String, Void, String> dynamicDestinations =
      DynamicFileDestinations.constant(
          DefaultFilenamePolicy.fromParams(
              new Params()
                  .withBaseFilename(
                      getBaseOutputDirectory()
                          .resolve("file", StandardResolveOptions.RESOLVE_FILE))
                  .withShardTemplate("-SS-of-NN")));
  // Sink that contributes one custom display item.
  SimpleSink<Void> sink =
      new SimpleSink<Void>(
          getBaseOutputDirectory(), dynamicDestinations, Compression.UNCOMPRESSED) {
        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
          builder.add(DisplayData.item("foo", "bar"));
        }
      };
  WriteFiles<String, ?, String> write = WriteFiles.to(sink).withNumShards(1);

  DisplayData displayData = DisplayData.from(write);
  assertThat(displayData, hasDisplayItem("sink", sink.getClass()));
  assertThat(displayData, includesDisplayDataFor("sink", sink));
  assertThat(displayData, hasDisplayItem("numShards", 1));
}