/**
 * Supplies the {@link WriteFiles} configurations listed under the {@code @Parameters}
 * annotation: a default transform, one with windowed writes, one with a fixed shard count of 17,
 * and one combining windowed writes with 42 shards. Each is built on its own {@code DummySink}.
 */
@Parameters(name = "{index}: {0}")
public static Iterable<WriteFiles<Object, Void, Object>> data() {
  WriteFiles<Object, Void, Object> defaults = WriteFiles.to(new DummySink());
  WriteFiles<Object, Void, Object> windowed = WriteFiles.to(new DummySink()).withWindowedWrites();
  WriteFiles<Object, Void, Object> sharded = WriteFiles.to(new DummySink()).withNumShards(17);
  WriteFiles<Object, Void, Object> windowedAndSharded =
      WriteFiles.to(new DummySink()).withWindowedWrites().withNumShards(42);
  return ImmutableList.of(defaults, windowed, sharded, windowedAndSharded);
}
@Test public void testBuildWrite() { SimpleSink<Void> sink = makeSimpleSink(); WriteFiles<String, ?, String> write = WriteFiles.to(sink).withNumShards(3); assertThat((SimpleSink<Void>) write.getSink(), is(sink)); PTransform<PCollection<String>, PCollectionView<Integer>> originalSharding = write.getComputeNumShards(); assertThat(write.getComputeNumShards(), is(nullValue())); assertThat(write.getNumShardsProvider(), instanceOf(StaticValueProvider.class)); assertThat(write.getNumShardsProvider().get(), equalTo(3)); assertThat(write.getComputeNumShards(), equalTo(originalSharding)); WriteFiles<String, ?, ?> write2 = write.withSharding(SHARDING_TRANSFORM); assertThat((SimpleSink<Void>) write2.getSink(), is(sink)); assertThat(write2.getComputeNumShards(), equalTo(SHARDING_TRANSFORM)); // original unchanged WriteFiles<String, ?, ?> writeUnsharded = write2.withRunnerDeterminedSharding(); assertThat(writeUnsharded.getComputeNumShards(), nullValue()); assertThat(write.getComputeNumShards(), equalTo(originalSharding)); }
/** Reports the windowed-writes setting read from the wrapped {@code transform}. */
@Override
public boolean isWindowedWrites() {
  final boolean windowedWrites = transform.getWindowedWrites();
  return windowedWrites;
}
/**
 * Reports whether sharding is left to the runner: true only when the wrapped transform has
 * neither a shard-count provider nor a shard-computing transform configured, i.e. both getters
 * return null.
 */
@Override public boolean isRunnerDeterminedSharding() { return transform.getNumShardsProvider() == null && transform.getComputeNumShards() == null; } },
/**
 * Returns a new {@link WriteFiles} that writes to the current {@link FileBasedSink} with a fixed
 * number of shards.
 *
 * <p>Use this option sparingly, since it can hurt performance; see {@link WriteFiles} for more
 * information.
 *
 * <p>A value of 0 or less selects the default behavior, runner-determined sharding.
 */
public WriteFiles<UserT, DestinationT, OutputT> withNumShards(int numShards) {
  // Non-positive counts carry no fixed sharding request; defer to the runner.
  if (numShards <= 0) {
    return withRunnerDeterminedSharding();
  }
  return withNumShards(StaticValueProvider.of(numShards));
}
/**
 * Registers display data for this transform: the sink's class (labelled "WriteFiles Sink"), the
 * sink's own display data, and then either the sharding transform's display data or, when no
 * sharding transform is set, the fixed shard-count provider if it is non-null.
 */
@Override
public void populateDisplayData(DisplayData.Builder builder) {
  super.populateDisplayData(builder);
  builder
      .add(DisplayData.item("sink", getSink().getClass()).withLabel("WriteFiles Sink"))
      .include("sink", getSink());

  if (getComputeNumShards() == null) {
    // No sharding transform: report the fixed shard count, if one was configured.
    builder.addIfNotNull(
        DisplayData.item("numShards", getNumShardsProvider())
            .withLabel("Fixed Number of Shards"));
    return;
  }
  builder.include("sharding", getComputeNumShards());
}
WriteFiles.to(sink).withSideInputs(sideInputs); if (WriteFilesTranslation.isWindowedWrites(transform)) { replacement = replacement.withWindowedWrites(); replacement.withNumShards(numShards)); } catch (Exception e) { throw new RuntimeException(e);
/**
 * Builds the {@link WriteFiles} transform that writes through a {@code JdbcAvroSink}.
 *
 * <p>Trailing slashes are stripped from {@code filenamePrefix} and {@code "/part"} is appended;
 * the result is used as the file prefix for a default filename policy built from
 * {@code DEFAULT_SHARD_TEMPLATE} and {@code filenameSuffix}.
 *
 * @param filenamePrefix output location; any trailing {@code '/'} characters are ignored
 * @param filenameSuffix suffix handed to the filename policy
 * @param schema Avro schema handed to the constant destinations
 * @param jdbcAvroArgs supplies the codec factory and is passed on to the sink
 * @return a {@link WriteFiles} transform over the configured sink
 */
public static PTransform<PCollection<String>, WriteFilesResult<Void>> createWrite(
    String filenamePrefix, String filenameSuffix, Schema schema, JdbcAvroArgs jdbcAvroArgs) {
  final String partPrefix = filenamePrefix.replaceAll("/+$", "") + "/part";
  final ResourceId prefixResource = FileBasedSink.convertToFileResourceIfPossible(partPrefix);
  final ValueProvider<ResourceId> prefixProvider = StaticValueProvider.of(prefixResource);

  final FileBasedSink.FilenamePolicy filenamePolicy =
      DefaultFilenamePolicy.fromStandardParameters(
          prefixProvider, DEFAULT_SHARD_TEMPLATE, filenameSuffix, false);

  final DynamicAvroDestinations<String, Void, String> destinations =
      AvroIO.constantDestinations(
          filenamePolicy,
          schema,
          ImmutableMap.of(),
          jdbcAvroArgs.getCodecFactory(),
          SerializableFunctions.identity());

  final FileBasedSink<String, Void, String> sink =
      new JdbcAvroSink<>(prefixProvider, destinations, jdbcAvroArgs);
  return WriteFiles.to(sink);
}
if (input.isBounded() == IsBounded.UNBOUNDED) { checkArgument( getWindowedWrites(), "Must use windowed writes when applying %s to an unbounded PCollection", WriteFiles.class.getSimpleName()); getComputeNumShards() != null || getNumShardsProvider() != null, "When applying %s to an unbounded PCollection, " + "must specify number of output shards explicitly", WriteFiles.class.getSimpleName()); this.writeOperation = getSink().createWriteOperation(); this.writeOperation.setWindowedWrites(getWindowedWrites()); if (!getWindowedWrites()) { try { destinationCoder = getDynamicDestinations() .getDestinationCoderWithDefault(input.getPipeline().getCoderRegistry()); destinationCoder.verifyDeterministic(); (getComputeNumShards() == null) ? null : input.apply(getComputeNumShards()); (getComputeNumShards() == null && getNumShardsProvider() == null) ? input.apply( "WriteUnshardedBundlesToTempFiles",
WriteFiles.to(new ViaFileBasedSink<>(resolved)) .withSideInputs(Lists.newArrayList(resolved.getAllSideInputs())); if (getNumShards() != null) { writeFiles = writeFiles.withNumShards(getNumShards()); } else if (getSharding() != null) { writeFiles = writeFiles.withSharding(getSharding()); } else { writeFiles = writeFiles.withRunnerDeterminedSharding(); writeFiles = writeFiles.withWindowedWrites();
/**
 * Builds the replacement for an applied {@link WriteFiles}: a new {@link WriteFiles} over the
 * same sink and dynamic-destination side inputs, sharded with {@code LogElementShardsWithDrift},
 * and with windowed writes enabled when the original transform had them enabled.
 *
 * @throws RuntimeException wrapping any {@link IOException} raised while reading the original
 *     transform's configuration
 */
@Override
public PTransformReplacement<PCollection<InputT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<InputT>,
                WriteFilesResult<DestinationT>,
                PTransform<PCollection<InputT>, WriteFilesResult<DestinationT>>>
            transform) {
  try {
    final WriteFiles<InputT, DestinationT, ?> resharded =
        WriteFiles.to(WriteFilesTranslation.getSink(transform))
            .withSideInputs(WriteFilesTranslation.getDynamicDestinationSideInputs(transform))
            .withSharding(new LogElementShardsWithDrift<>());

    // Carry the windowed-writes flag over from the transform being replaced.
    final WriteFiles<InputT, DestinationT, ?> replacement;
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = resharded.withWindowedWrites();
    } else {
      replacement = resharded;
    }

    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform), replacement);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Tests that WriteFiles with a configured number of shards produces the desired number of shards
 * even when there are too few elements: six inputs are written with twenty shards requested.
 */
@Test
@Category(NeedsRunner.class)
public void testExpandShardedWrite() throws IOException {
  List<String> fewElements = Arrays.asList("one", "two", "three", "four", "five", "six");
  runShardedWrite(
      fewElements,
      IDENTITY_MAP,
      getBaseOutputFilename(),
      WriteFiles.to(makeSimpleSink()).withNumShards(20));
}
/**
 * Writes 100 elements through 2 ms fixed windows with at most two writers per bundle, windowed
 * writes enabled and a single shard. (Presumably this pushes the writer count past the
 * per-bundle limit so the spilling path runs; confirm against WriteFiles.)
 */
@Test
@Category(NeedsRunner.class)
public void testWriteSpilling() throws IOException {
  List<String> elements = Lists.newArrayList();
  int next = 0;
  while (next < 100) {
    elements.add("mambo_number_" + next);
    next++;
  }

  runWrite(
      elements,
      Window.into(FixedWindows.of(Duration.millis(2))),
      getBaseOutputFilename(),
      WriteFiles.to(makeSimpleSink())
          .withMaxNumWritersPerBundle(2)
          .withWindowedWrites()
          .withNumShards(1));
}
/**
 * Applies {@code StreamingShardedWriteFactory} to a {@link WriteFiles} with no explicit sharding
 * on a pipeline configured with parallelism 5, and expects a different transform whose fixed
 * shard count is 10.
 */
@Test
public void testRunnerDeterminedSharding() {
  // Pipeline options for the Flink test runner with a known parallelism.
  FlinkPipelineOptions flinkOptions = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  flinkOptions.setRunner(TestFlinkRunner.class);
  flinkOptions.setFlinkMaster("[auto]");
  flinkOptions.setParallelism(5);
  TestPipeline pipeline = TestPipeline.fromOptions(flinkOptions);

  StreamingShardedWriteFactory<Object, Void, Object> factory =
      new StreamingShardedWriteFactory<>(pipeline.getOptions());

  // The write under test leaves sharding unspecified.
  WriteFiles<Object, Void, Object> original = WriteFiles.to(new TestSink(tmpFolder.toString()));
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) pipeline.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, WriteFiles<Object, Void, Object>>
      originalApplication =
          AppliedPTransform.of(
              "writefiles", objs.expand(), Collections.emptyMap(), original, pipeline);

  WriteFiles<Object, Void, Object> replacement =
      (WriteFiles<Object, Void, Object>)
          factory.getReplacementTransform(originalApplication).getTransform();

  assertThat(replacement, not(equalTo((Object) original)));
  assertThat(replacement.getNumShardsProvider().get(), is(10));
}
@Test @Category(NeedsRunner.class) public void testCustomShardedWrite() throws IOException { // Flag to validate that the pipeline options are passed to the Sink WriteOptions options = TestPipeline.testingPipelineOptions().as(WriteOptions.class); options.setTestFlag("test_value"); Pipeline p = TestPipeline.create(options); List<String> inputs = new ArrayList<>(); // Prepare timestamps for the elements. List<Long> timestamps = new ArrayList<>(); for (long i = 0; i < 1000; i++) { inputs.add(Integer.toString(3)); timestamps.add(i + 1); } SimpleSink<Void> sink = makeSimpleSink(); WriteFiles<String, ?, String> write = WriteFiles.to(sink).withSharding(new LargestInt()); p.apply(Create.timestamped(inputs, timestamps).withCoder(StringUtf8Coder.of())) .apply(IDENTITY_MAP) .apply(write) .getPerDestinationOutputFilenames() .apply(new VerifyFilesExist<>()); p.run(); checkFileContents( getBaseOutputFilename(), inputs, Optional.of(3), true /* expectRemovedTempDirectory */); }
false); WriteFiles<Integer, Void, Integer> write = WriteFiles.to( new FileBasedSink<Integer, Void, Integer>( StaticValueProvider.of(outputDirectory), DynamicFileDestinations.constant(policy)) { is(true)); WriteFiles<Integer, Void, Integer> withStaticSharding = write.withNumShards(3); assertThat( PTransformMatchers.writeWithRunnerDeterminedSharding() write.withSharding(Sum.integersGlobally().asSingletonView()); assertThat( PTransformMatchers.writeWithRunnerDeterminedSharding()
/**
 * Expects an {@link IllegalArgumentException} with the explicit-sharding message when a
 * {@link WriteFiles} with windowed writes but no shard configuration is applied to a collection
 * marked as unbounded.
 */
@Test
@Category(NeedsRunner.class)
public void testUnboundedWritesNeedSharding() {
  final String expectedMessage =
      "When applying WriteFiles to an unbounded PCollection, "
          + "must specify number of output shards explicitly";
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(expectedMessage);

  SimpleSink<Void> sink = makeSimpleSink();
  // Windowed writes are enabled, but no shard count or sharding transform is given.
  WriteFiles<String, ?, String> windowedUnsharded = WriteFiles.to(sink).withWindowedWrites();
  p.apply(Create.of("foo"))
      .setIsBoundedInternal(IsBounded.UNBOUNDED)
      .apply(windowedUnsharded);
  p.run();
}
(write.getNumShardsProvider() != null && !write.getWindowedWrites()) ? Optional.of(write.getNumShardsProvider().get()) : Optional.absent(); checkFileContents(baseName, inputs, numShards, !write.getWindowedWrites());
@Override public SdkFunctionSpec translateSink(SdkComponents newComponents) { // TODO: register the environment return toProto(transform.getSink()); }
FileBasedSink sink = WriteFilesTranslation.getSink(transform); WriteFiles<UserT, DestinationT, OutputT> replacement = WriteFiles.to(sink).withSideInputs(sideInputs); if (WriteFilesTranslation.isWindowedWrites(transform)) { replacement = replacement.withWindowedWrites(); replacement.withNumShards(numShards)); } catch (Exception e) { throw new RuntimeException(e);