// NOTE(review): fragment — the enclosing method signature and the closing
// braces of this try/if are outside the visible excerpt; do not treat this
// as a complete definition.
// Opens a reader over the split's source and, when the split is non-empty,
// returns the key of its first record.
try (BoundedReader<KV<ImmutableBytesWritable, Result>> reader =
    splitSource.createReader(opts)) {
  if (reader.start()) {
    return reader.getCurrent().getKey();
/**
 * Reads every record of the single BigQuery export file named by the input element and emits
 * them downstream.
 *
 * <p>The table schema and the export job id arrive via side inputs; exactly one
 * {@link BoundedSource} is expected per file.
 *
 * @throws Exception if source creation or reading fails
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  // Deserialize the table schema carried through the side input.
  TableSchema schema =
      BigQueryHelpers.fromJsonString(c.sideInput(schemaView), TableSchema.class);
  String jobUuid = c.sideInput(jobIdTokenView);
  BigQuerySourceBase<T> source = createSource(jobUuid, coder);
  List<BoundedSource<T>> sources =
      source.createSources(
          ImmutableList.of(
              FileSystems.matchNewResource(c.element(), false /* is directory */)),
          schema);
  checkArgument(sources.size() == 1, "Expected exactly one source.");
  BoundedSource<T> avroSource = sources.get(0);
  // Fix: the reader was previously never closed (resource leak). Use
  // try-with-resources so it is closed even if reading throws.
  try (BoundedSource.BoundedReader<T> reader =
      avroSource.createReader(c.getPipelineOptions())) {
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  }
}
})
// NOTE(review): discontinuous fragment — pieces of a try/finally, of a
// split-attempting lambda, and of a reader-thread join are glued together
// here without their enclosing definitions; the brace structure is incomplete
// in this excerpt.
return items;
} finally {
  reader.close();
// Lambda body: wait until the splitter is unblocked, then attempt a dynamic
// split; yields null when the source refuses to split at this fraction.
() -> {
  unblockSplitter.await();
  BoundedSource<T> residual = reader.splitAtFraction(fraction);
  if (residual == null) {
    return null;
  return KV.of(reader.getCurrentSource(), residual);
});
// Join the background reader thread and collect what it read.
List<T> currentItems = readerThread.get();
// NOTE(review): fragment — the method name and parameters preceding "throws
// Exception" and the rest of the verifySingleSplitAtFractionResult call are
// outside the visible excerpt; the assertFalse message expressions are also
// truncated (stray "+" concatenations with no left-hand operand).
throws Exception {
  try (BoundedSource.BoundedReader<T> reader = source.createReader(options)) {
    BoundedSource<T> originalSource = reader.getCurrentSource();
    List<T> currentItems = readNItemsFromUnstartedReader(reader, numItemsToReadBeforeSplit);
    BoundedSource<T> residual = reader.splitAtFraction(splitFraction);
    if (residual != null) {
      // splitAtFraction must create NEW source objects for primary and
      // residual rather than mutating the originals in place.
      assertFalse(
          + "Source objects MUST be immutable.",
              source, splitFraction, numItemsToReadBeforeSplit),
          reader.getCurrentSource() == originalSource);
      assertFalse(
          String.format(
              + "Source objects MUST be immutable.",
              source, splitFraction, numItemsToReadBeforeSplit),
          reader.getCurrentSource() == residual);
      BoundedSource<T> primary = reader.getCurrentSource();
      return verifySingleSplitAtFractionResult(
          source,
// NOTE(review): fragment — interior of a progress/split test; the reader
// declarations and the surrounding method are outside the visible excerpt.
// Before start(): nothing consumed, remaining parallelism unknown.
assertEquals(0.0, readerOrig.getFractionConsumed(), 1e-6);
assertEquals(0, readerOrig.getSplitPointsConsumed());
assertEquals(
    BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());
assertTrue(readerOrig.start());
assertEquals(0, readerOrig.getSplitPointsConsumed());
assertEquals(
    BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());
// Split at 10%; the asserts below show the primary retains exactly one
// split point (the current record).
remainder = readerOrig.splitAtFraction(0.1);
// NOTE(review): leftover debug print — should probably be removed.
System.err.println(readerOrig.getCurrentSource());
assertNotNull(remainder);
assertEquals(0, readerOrig.getSplitPointsConsumed());
assertEquals(1, readerOrig.getSplitPointsRemaining());
// Primary is exhausted after its single record.
assertFalse(readerOrig.advance());
assertEquals(1.0, readerOrig.getFractionConsumed(), 1e-6);
assertEquals(1, readerOrig.getSplitPointsConsumed());
assertEquals(0, readerOrig.getSplitPointsRemaining());
// Same pre-start checks on a second reader ("reader"); excerpt truncates below.
assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
assertEquals(0, reader.getSplitPointsConsumed());
assertEquals(
    BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
assertTrue(reader.start());
assertEquals(0, reader.getSplitPointsConsumed());
// NOTE(review): fragment — the reader construction above and the closing
// braces of the if/while below are outside the visible excerpt; the bare
// "reader.getFractionConsumed());" line is a garbled remnant of a truncated
// assertEquals call.
float recordsRead = 0;
// Nothing consumed before start().
assertEquals(Double.valueOf(0), reader.getFractionConsumed());
boolean start = reader.start();
assertEquals(true, start);
if (start) {
  elements.add(reader.getCurrent());
  boolean advance = reader.advance();
  reader.getFractionConsumed());
  assertEquals(true, advance);
  while (advance) {
    elements.add(reader.getCurrent());
    advance = reader.advance();
    // Fraction grows by 1/N per record within the split.
    assertEquals(
        Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
        reader.getFractionConsumed());
// Fully consumed once advance() returns false.
assertEquals(Double.valueOf(1), reader.getFractionConsumed());
reader.close();
// NOTE(review): fragment (duplicate of the previous excerpt) — surrounding
// method and closing braces are not visible; the bare
// "reader.getFractionConsumed());" line is a garbled remnant of a truncated
// assertEquals call.
float recordsRead = 0;
// Nothing consumed before start().
assertEquals(Double.valueOf(0), reader.getFractionConsumed());
boolean start = reader.start();
assertEquals(true, start);
if (start) {
  elements.add(reader.getCurrent());
  boolean advance = reader.advance();
  reader.getFractionConsumed());
  assertEquals(true, advance);
  while (advance) {
    elements.add(reader.getCurrent());
    advance = reader.advance();
    // Fraction grows by 1/N per record within the split.
    assertEquals(
        Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
        reader.getFractionConsumed());
// Fully consumed once advance() returns false.
assertEquals(Double.valueOf(1), reader.getFractionConsumed());
reader.close();
// NOTE(review): fragment — the surrounding test method and the closing braces
// of the nested ifs are outside the visible excerpt.
boundedSource.setInputFormatObj(mockInputFormat);
BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
// Nothing consumed before start().
assertEquals(Double.valueOf(0), reader.getFractionConsumed());
boolean start = reader.start();
assertEquals(true, start);
if (start) {
  boolean advance = reader.advance();
  // NOTE(review): null fraction expected — presumably this exercises an
  // InputFormat whose progress is unavailable; confirm against the mock setup.
  assertEquals(null, reader.getFractionConsumed());
  assertEquals(true, advance);
  if (advance) {
    advance = reader.advance();
    assertEquals(null, reader.getFractionConsumed());
assertEquals(null, reader.getFractionConsumed());
reader.close();
// NOTE(review): fragment (duplicate of the previous excerpt) — the
// surrounding test method and the closing braces of the nested ifs are
// outside the visible excerpt.
boundedSource.setInputFormatObj(mockInputFormat);
BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
// Nothing consumed before start().
assertEquals(Double.valueOf(0), reader.getFractionConsumed());
boolean start = reader.start();
assertEquals(true, start);
if (start) {
  boolean advance = reader.advance();
  // NOTE(review): null fraction expected — presumably progress is
  // unavailable for this InputFormat; confirm against the mock setup.
  assertEquals(null, reader.getFractionConsumed());
  assertEquals(true, advance);
  if (advance) {
    advance = reader.advance();
    assertEquals(null, reader.getFractionConsumed());
assertEquals(null, reader.getFractionConsumed());
reader.close();
// NOTE(review): fragment — the try-with-resources header and the source
// construction precede this excerpt, and the closing braces follow it.
.createReader(PipelineOptionsFactory.create())) {
  // Pre-start: nothing consumed, remaining split points unknown.
  assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
  assertEquals(0, reader.getSplitPointsConsumed());
  assertEquals(
      BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
  assertTrue(reader.start());
  assertEquals(0, reader.getSplitPointsConsumed());
  assertEquals(
      BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
  assertTrue(reader.advance());
  assertEquals(1, reader.getSplitPointsConsumed());
  assertEquals(
      BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
  assertTrue(reader.advance());
  // With only the final record pending, exactly one split point remains.
  assertEquals(2, reader.getSplitPointsConsumed());
  assertEquals(1, reader.getSplitPointsRemaining());
  assertFalse(reader.advance());
  // Exhausted: all three split points consumed.
  assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
  assertEquals(3, reader.getSplitPointsConsumed());
  assertEquals(0, reader.getSplitPointsRemaining());
/** * This test validates behavior of {@link * HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} method if InputFormat's * {@link InputFormat#getSplits() getSplits()} returns InputSplitList having zero records. */ @Test public void testReadersStartWhenZeroRecords() throws Exception { InputFormat mockInputFormat = Mockito.mock(EmployeeInputFormat.class); EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class); Mockito.when( mockInputFormat.createRecordReader( Mockito.any(InputSplit.class), Mockito.any(TaskAttemptContext.class))) .thenReturn(mockReader); Mockito.when(mockReader.nextKeyValue()).thenReturn(false); InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class); HadoopInputFormatBoundedSource<Text, Employee> boundedSource = new HadoopInputFormatBoundedSource<>( serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required. null, // No value translation required. new SerializableSplit(mockInputSplit)); boundedSource.setInputFormatObj(mockInputFormat); BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions()); assertEquals(false, reader.start()); assertEquals(Double.valueOf(1), reader.getFractionConsumed()); reader.close(); }
/**
 * This test validates behavior of {@link
 * HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} method if InputFormat's
 * {@link InputFormat#getSplits(JobContext) getSplits(JobContext)} returns InputSplitList having
 * zero records.
 */
@Test
public void testReadersStartWhenZeroRecords() throws Exception {
  // InputFormat mock whose record reader reports no key/value pairs at all.
  InputFormat mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class);
  Mockito.when(
          mockInputFormat.createRecordReader(
              Mockito.any(InputSplit.class), Mockito.any(TaskAttemptContext.class)))
      .thenReturn(mockReader);
  Mockito.when(mockReader.nextKeyValue()).thenReturn(false);
  InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
  HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit(mockInputSplit));
  boundedSource.setInputFormatObj(mockInputFormat);
  BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
  // start() must return false (no records) and the split is fully consumed.
  assertEquals(false, reader.start());
  assertEquals(Double.valueOf(1), reader.getFractionConsumed());
  reader.close();
}
@Test public void testReadingGranularityAndFractionConsumed() throws IOException { // Tests that the reader correctly snaps to multiples of the given granularity // (note: this is testing test code), and that getFractionConsumed works sensibly // in the face of that. PipelineOptions options = PipelineOptionsFactory.create(); CoarseRangeSource source = new CoarseRangeSource(13, 35, 1, 10); try (CoarseRangeReader reader = source.createReader(options)) { List<Integer> items = new ArrayList<>(); assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertTrue(reader.start()); items.add(reader.getCurrent()); while (reader.advance()) { Double fraction = reader.getFractionConsumed(); assertNotNull(fraction); assertTrue(fraction.toString(), fraction > 0.0); assertTrue(fraction.toString(), fraction <= 1.0); items.add(reader.getCurrent()); } assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(20, items.size()); assertEquals(20, items.get(0).intValue()); assertEquals(39, items.get(items.size() - 1).intValue()); source = new CoarseRangeSource(13, 17, 1, 10); } try (BoundedSource.BoundedReader<Integer> reader = source.createReader(options)) { assertFalse(reader.start()); } }
@Override
public void processElement(WindowedValue<BoundedSourceShard<OutputT>> element) throws Exception {
  // Reads every remaining element of the shard's source into a fresh bundle,
  // while a background thread may concurrently attempt a dynamic split of the
  // same reader.
  BoundedSource<OutputT> source = element.getValue().getSource();
  try (final BoundedReader<OutputT> reader = source.createReader(options)) {
    boolean contentsRemaining = reader.start();
    // The split attempt is launched only after the reader has been started.
    Future<BoundedSource<OutputT>> residualFuture = startDynamicSplitThread(source, reader);
    UncommittedBundle<OutputT> output = evaluationContext.createBundle(outputPCollection);
    while (contentsRemaining) {
      output.add(
          WindowedValue.timestampedValueInGlobalWindow(
              reader.getCurrent(), reader.getCurrentTimestamp()));
      contentsRemaining = reader.advance();
    }
    resultBuilder.addOutput(output);
    try {
      // If the split succeeded, hand the residual source back as unprocessed
      // work so the runner can schedule it separately.
      BoundedSource<OutputT> residual = residualFuture.get();
      if (residual != null) {
        resultBuilder.addUnprocessedElements(
            element.withValue(BoundedSourceShard.of(residual)));
      }
    } catch (ExecutionException exex) {
      // Un-and-rewrap the exception thrown by attempting to split
      throw UserCodeException.wrap(exex.getCause());
    }
  }
}
@Test public void testUnsplittable() throws IOException { String baseName = "test-input"; File compressedFile = tmpFolder.newFile(baseName + ".gz"); byte[] input = generateInput(10000); writeFile(compressedFile, input, CompressionMode.GZIP); CompressedSource<Byte> source = CompressedSource.from(new ByteSource(compressedFile.getPath(), 1)); List<Byte> expected = Lists.newArrayList(); for (byte i : input) { expected.add(i); } PipelineOptions options = PipelineOptionsFactory.create(); BoundedReader<Byte> reader = source.createReader(options); List<Byte> actual = Lists.newArrayList(); for (boolean hasNext = reader.start(); hasNext; hasNext = reader.advance()) { actual.add(reader.getCurrent()); // checkpoint every 9 elements if (actual.size() % 9 == 0) { Double fractionConsumed = reader.getFractionConsumed(); assertNotNull(fractionConsumed); assertNull(reader.splitAtFraction(fractionConsumed)); } } assertEquals(expected.size(), actual.size()); assertEquals(Sets.newHashSet(expected), Sets.newHashSet(actual)); }
@Test public void testProgress() throws IOException { final int numRecords = 5; @SuppressWarnings("deprecation") // testing CountingSource BoundedSource<Long> source = CountingSource.upTo(numRecords); try (BoundedReader<Long> reader = source.createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting. Note that CountingReader can always give an accurate // remaining parallelism. assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(numRecords, reader.getSplitPointsRemaining()); assertTrue(reader.start()); int i = 0; do { assertEquals(i, reader.getSplitPointsConsumed()); assertEquals(numRecords - i, reader.getSplitPointsRemaining()); ++i; } while (reader.advance()); assertEquals(numRecords, i); // exactly numRecords calls to advance() assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(numRecords, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } }
@Test public void testFractionConsumedWhenReadingFilepattern() throws IOException { List<String> data1 = createStringDataset(3, 1000); File file1 = createFileWithData("file1", data1); List<String> data2 = createStringDataset(3, 1000); createFileWithData("file2", data2); List<String> data3 = createStringDataset(3, 1000); createFileWithData("file3", data3); TestFileBasedSource source = new TestFileBasedSource(file1.getParent() + "/" + "file*", 1024, null); try (BoundedSource.BoundedReader<String> reader = source.createReader(null)) { double lastFractionConsumed = 0.0; assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertTrue(reader.start()); assertTrue(reader.advance()); assertTrue(reader.advance()); // We're inside the first file. Should be in [0, 1/3). assertTrue(reader.getFractionConsumed() > 0.0); assertTrue(reader.getFractionConsumed() < 1.0 / 3.0); while (reader.advance()) { double fractionConsumed = reader.getFractionConsumed(); assertTrue(fractionConsumed > lastFractionConsumed); lastFractionConsumed = fractionConsumed; } assertEquals(1.0, reader.getFractionConsumed(), 1e-6); } }
@Test public void testGetSplitPointsConsumed() throws Exception { final String table = "TEST-TABLE"; final int numRows = 100; int splitPointsConsumed = 0; makeTableData(table, numRows); BigtableSource source = new BigtableSource( config.withTableId(ValueProvider.StaticValueProvider.of(table)), null, Arrays.asList(ByteKeyRange.ALL_KEYS), null); BoundedReader<Row> reader = source.createReader(TestPipeline.testingPipelineOptions()); reader.start(); // Started, 0 split points consumed assertEquals( "splitPointsConsumed starting", splitPointsConsumed, reader.getSplitPointsConsumed()); // Split points consumed increases for each row read while (reader.advance()) { assertEquals( "splitPointsConsumed advancing", ++splitPointsConsumed, reader.getSplitPointsConsumed()); } // Reader marked as done, 100 split points consumed assertEquals("splitPointsConsumed done", numRows, reader.getSplitPointsConsumed()); reader.close(); }
/**
 * Reads one Avro file of Cloud Spanner mutations and emits every record.
 *
 * <p>The element is a (table name, file path) pair; the table's {@link Ddl} arrives via side
 * input and drives the Avro-to-Mutation conversion.
 */
@ProcessElement
public void processElement(ProcessContext c) {
  KV<String, String> kv = c.element();
  Ddl ddl = c.sideInput(ddlView);
  String tableName = kv.getKey();
  Table table = ddl.table(tableName);
  SerializableFunction<GenericRecord, Mutation> parseFn = new AvroRecordConverter(table);
  AvroSource<Mutation> source =
      AvroSource.from(kv.getValue())
          .withParseFn(parseFn, SerializableCoder.of(Mutation.class));
  // Fix: the reader was previously never closed (resource leak). Declare it as
  // a try-with-resources resource; IOException from open/read/close is still
  // rethrown as an unchecked exception, as before.
  try (BoundedSource.BoundedReader<Mutation> reader =
      source.createReader(c.getPipelineOptions())) {
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
})
/**
 * getFractionConsumed() must report 0.0 on a reader that has not been started — both for the
 * whole file and for each sub-range produced by splitting it.
 */
@Test
public void testGetProgressFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
    assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
  }

  // The same must hold for every split of the source.
  for (BoundedSource<FixedRecord> subSource : source.split(file.length() / 3, null)) {
    try (BoundedSource.BoundedReader<FixedRecord> subReader = subSource.createReader(null)) {
      assertEquals(Double.valueOf(0.0), subReader.getFractionConsumed());
    }
  }
}