// No-op converter used only to satisfy the Hoodie sink test wiring; passes an empty
// configuration and a default error extractor to the parent converter.
public DummyHoodieSinkDataConverter() {
    super(new Configuration(), new ErrorExtractor());
}
private void testWriteGeneral(@NonNull final JavaRDD<AvroPayload> testData,
                              @NonNull final Configuration conf) throws IOException {
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final FileSink awsSink = new AwsFileSink(fileConf, converter);
    awsSink.write(testData);
}
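// Hypothetical caller sketch (not part of the original suite): drives the helper above with
// the same CSV configuration style used by the converter tests in this section. Assumes the
// initConf(separator, fileType) helper that those tests call is visible here.
@Test
public void testWriteGeneralWithCsv() throws IOException {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    testWriteGeneral(testData, initConf(",", "csv"));
}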
@Test
public void testGetHeaderWithCsv() {
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to get data header.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final String header = converter.getHeader(payloadData);
    final String expectedHeader = "int_field,string_field,boolean_field";
    Assert.assertEquals(expectedHeader, header);
    log.info("Header: {}", header);
}
@Test
public void testConvertAllWithCsv() {
    log.info("Starting test: convert all with csv.");
    final String separator = " ";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> convertedData = converter.convertAll(payloadData);
    int i = 1;
    for (final String line : convertedData.collect()) {
        Assert.assertEquals(i + separator + i + separator + "true", line);
        i++;
    }
}
@Test
public void testConvertAllWithCsvSpecialChar() {
    log.info("Starting test: convert all with csv containing special characters.");
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestDataNew(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> convertedData = converter.convertAll(payloadData);
    int i = 1;
    for (final String line : convertedData.collect()) {
        // The string field of row i contains an embedded quote, comma and backslash, so the
        // converter quotes the field and backslash-escapes the inner quote and backslash.
        Assert.assertEquals(i + separator + "\"" + i + "\\\",try\\\\\"" + separator + "true", line);
        i++;
    }
}
@Test(expected = SparkException.class)
public void testConvertAllWithJsonNotSupported() {
    log.info("Starting test: convert all with json.");
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "json");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> convertedData = converter.convertAll(payloadData);
    int i = 1;
    for (final String line : convertedData.collect()) {
        Assert.assertEquals(i + separator + i + separator + "true", line);
        i++;
    }
}
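// Note on the test above: convertAll is a Spark transformation and therefore lazy, so the
// expected SparkException for the unsupported "json" format presumably only surfaces once
// collect() materializes the RDD; that is why the test still iterates the converted records.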
private void testWriteToCsvCommon(@NonNull final String pathPrefix,
                                  @NonNull final String path,
                                  @NonNull final String separator,
                                  @NonNull final JavaRDD<AvroPayload> testData,
                                  final int partitionNum,
                                  @NonNull final String timeStamp,
                                  @NonNull final String sourceSubPath,
                                  @NonNull final String dispersalType) throws Exception {
    final Configuration conf = initConfig(pathPrefix, path, separator, timeStamp, sourceSubPath, dispersalType);
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final HdfsFileSink hdfsSink = spy(new HdfsFileSink(fileConf, converter));
    hdfsSink.write(testData);
    verify(hdfsSink, times(1)).write(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).getRepartitionNum(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).getRddSizeInMegaByte(Matchers.any(JavaRDD.class));
    final FileStatus[] status = this.fileSystem.get().listStatus(new Path(fileConf.getPathHdfs()));
    int fileNum = 0;
    for (final FileStatus fileStatus : status) {
        if (fileStatus.isFile()) {
            fileNum++;
        }
    }
    assertEquals(partitionNum, fileNum);
}
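// Hypothetical caller sketch (not in the original file): exercises the common helper using
// class constants that appear elsewhere in this suite; the expected partition count of 1 is
// an assumption for a small test RDD.
@Test
public void testWriteToCsvSmallData() throws Exception {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), NUM_RECORD1, StringTypes.EMPTY);
    testWriteToCsvCommon(pathPrefix, PATH1, COMMA_SEPARATOR, testData, 1,
        TIMESTAMP1, SOURCE_SUB_PATH1, VERSION);
}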
@Test
public void testWriteToCsvWithHeader() throws IOException {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD2, StringTypes.EMPTY);
    final Configuration conf = initConfig(pathPrefix, PATH4, COMMA_SEPARATOR, TIMESTAMP1, SOURCE_SUB_PATH1, VERSION);
    conf.setProperty(FileSinkConfiguration.CSV_COLUMN_HEADER, "true");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final HdfsFileSink hdfsSink = spy(new HdfsFileSink(fileConf, converter));
    hdfsSink.write(testData);
    verify(hdfsSink, times(1)).write(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).addColumnHeader(Matchers.anyString(), Matchers.any(JavaRDD.class));
    final FileStatus[] status = this.fileSystem.get().listStatus(new Path(fileConf.getPathHdfs()));
    for (final FileStatus fileStatus : status) {
        if (fileStatus.isFile()) {
            // Every part file should start with the CSV column header.
            try (final FSDataInputStream in = this.fileSystem.get().open(fileStatus.getPath());
                 final BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
                Assert.assertEquals("int_field,string_field,boolean_field", reader.readLine());
            }
        }
    }
}
@Test
public void testExceptionHandling() {
    final int successRecords = 5;
    final int invalidDataRecords = 7;
    final int runtimeExceptionRecords = 1;
    final List<String> inputList = new ArrayList<>();
    // Add only success and invalid_data records: invalid records are routed to the
    // error extractor, so the conversion should still succeed.
    IntStream.range(0, successRecords).forEach(i -> inputList.add(SUCCESS));
    IntStream.range(0, invalidDataRecords).forEach(i -> inputList.add(INVALID_DATA));
    final MockAbstractDataConverter mockConverter =
        new MockAbstractDataConverter(new Configuration(), new ErrorExtractor());
    final RDDWrapper<String> result = mockConverter.map(this.jsc.get().parallelize(inputList));
    Assert.assertEquals(successRecords, result.getCount());
    // Add runtime-exception records; these should fail the Spark job.
    IntStream.range(0, runtimeExceptionRecords).forEach(i -> inputList.add(RUNTIME_EXCEPTION));
    try {
        mockConverter.map(this.jsc.get().parallelize(inputList));
        Assert.fail("expecting error here");
    } catch (Exception e) {
        Assert.assertEquals(SparkException.class, e.getClass());
        Assert.assertEquals(JobRuntimeException.class, e.getCause().getClass());
        Assert.assertEquals(JobRuntimeException.class, e.getCause().getCause().getClass());
        Assert.assertEquals(RUNTIME_EXCEPTION, e.getCause().getCause().getMessage());
    }
}
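// The assertions above walk the wrapped-exception chain: Spark reports the failed task as a
// SparkException, whose cause is the JobRuntimeException raised in the map stage, which in
// turn wraps the converter's own JobRuntimeException carrying the original
// RUNTIME_EXCEPTION message.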
@Before
public void setupTest() {
    super.setupTest();
    this.testData1 = AvroPayloadUtil.generateTestData(this.jsc.get(), NUM_RECORD1, StringTypes.EMPTY);
    this.testData2 = AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD2, StringTypes.EMPTY);
    this.conf = initConfig(pathPrefix, PATH1, COMMA_SEPARATOR, TIMESTAMP1, SOURCE_SUB_PATH1, VERSION);
    this.converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    this.fileSink = spy(new HdfsFileSink(fileConf, converter));
    this.convertedData1 = this.converter.convertAll(this.testData1);
    this.convertedData2 = this.converter.convertAll(this.testData2);
}
private void testWriteAllFieldsMockDataToCassandra(final boolean addLongTimestamp) {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 100, StringTypes.EMPTY);
    final List<String> schemaFields = AvroPayloadUtil.getSchemaFields();
    final List<String> partitionKeys = Collections.singletonList(schemaFields.get(0));
    final List<ClusterKey> clusteringKeys =
        Collections.singletonList(new ClusterKey(schemaFields.get(1), ClusterKey.Order.DESC));
    final List<String> requiredFields = Arrays.asList(schemaFields.get(0), schemaFields.get(1));
    final Optional<String> timestamp = addLongTimestamp ? Optional.of(TEST_TIMESTAMP) : Optional.absent();
    final TimestampInfo tsInfo = new TimestampInfo(timestamp, true);
    final CassandraSinkDataConverter dataConverter = new CassandraSinkDataConverter(
        AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY),
        new Configuration(),
        Optional.of(new HashSet<>(schemaFields)),
        requiredFields,
        tsInfo,
        new ErrorExtractor());
    final CassandraSchemaConverter schemaConverter =
        new CassandraSchemaConverter(KEY_SPACE, TABLE, tsInfo, Optional.absent());
    final CassandraSchema schema =
        schemaConverter.convertToExternalSchema(AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY));
    final Optional<Long> ttl = Optional.of(10000L);
    final CassandraSinkSchemaManager schemaManager =
        new CassandraSinkSchemaManager(schema, partitionKeys, clusteringKeys, ttl);
    final CassandraSinkConfiguration conf = initializeConfiguration(false, addLongTimestamp);
    final CassandraSSTableSink sink = new CassandraSSTableSink(dataConverter, schemaManager, conf);
    sink.write(testData);
    validateCassandraTable(100, false, addLongTimestamp);
}
private void testWriteAllFieldsMockDataToCassandra(final boolean addLongTimestamp) {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 100, StringTypes.EMPTY);
    final List<String> schemaFields = AvroPayloadUtil.getSchemaFields();
    final List<String> partitionKeys = Collections.singletonList(schemaFields.get(0));
    final List<ClusterKey> clusteringKeys =
        Collections.singletonList(new ClusterKey(schemaFields.get(1), ClusterKey.Order.DESC));
    final List<String> requiredFields = Arrays.asList(schemaFields.get(0), schemaFields.get(1));
    final Optional<String> timestamp = addLongTimestamp ? Optional.of(TEST_TIMESTAMP) : Optional.absent();
    final TimestampInfo tsInfo = new TimestampInfo(timestamp, true);
    final CassandraSinkCQLDataConverter converter = new CassandraSinkCQLDataConverter(
        AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY),
        new Configuration(),
        Optional.of(new HashSet<>(schemaFields)),
        requiredFields,
        tsInfo,
        new ErrorExtractor());
    final CassandraSchemaConverter schemaConverter =
        new CassandraSchemaConverter(KEY_SPACE, TABLE, tsInfo, Optional.absent());
    final CassandraSchema schema =
        schemaConverter.convertToExternalSchema(AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY));
    final Optional<Long> ttl = Optional.of(10000L);
    final CassandraSinkSchemaManager schemaManager =
        new CassandraSinkSchemaManager(schema, partitionKeys, clusteringKeys, ttl);
    final CassandraSinkConfiguration conf = initializeConfiguration(false, addLongTimestamp);
    final CassandraClientSink sink = new CassandraClientSink(converter, schemaManager, conf);
    sink.write(testData);
    validateCassandraTable(100, false, addLongTimestamp);
}
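// The two helpers above exercise identical schema and key setup through Cassandra's two
// write paths: CassandraSSTableSink writes via the SSTable bulk-load path, while
// CassandraClientSink issues CQL statements through a client connection (as the converter
// names suggest). Only the converter and sink classes differ between them.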
private String testWriteToMockS3General(@NonNull final Configuration conf) throws IOException {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD, StringTypes.EMPTY);
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final AwsConfiguration awsConf = new AwsConfiguration(fileConf);
    final MockAwsFileSink awsMockSink = spy(new MockAwsFileSink(fileConf, converter));
    awsMockSink.write(testData);
    final AmazonS3 mockClient = awsMockSink.getS3Client();
    verify(awsMockSink, times(EXPECTED_INVOCATIONS)).write(Matchers.any(JavaRDD.class));
    verify(mockClient, times(EXPECTED_PARTITION_NUM)).putObject(Matchers.any(PutObjectRequest.class));
    assertTrue(mockClient.doesBucketExistV2(fileConf.getBucketName().get()));
    for (int i = 0; i < EXPECTED_PARTITION_NUM; i++) {
        final boolean objectExists = mockClient.doesObjectExist(
            fileConf.getBucketName().get(), awsConf.getS3FilePrefix() + "_0000" + i);
        assertTrue(objectExists);
    }
    return awsConf.getS3FilePrefix();
}
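// Note: the "_0000" + i key suffix checked above only matches single-digit partition
// indexes, so this assertion assumes EXPECTED_PARTITION_NUM stays below 10.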
Optional.of(new HashSet<>(schemaFields)), requiredFields, TimestampInfo.generateEmptyTimestampInfo(), new ErrorExtractor());
HiveTestUtil.initializeConfig(JOB_NAME, dataPath, "testMetadataPath");
final SparkSourceDataConverter converter = new SparkSourceDataConverter(
    dfSchema,
    avroSchema,
    hiveConf.getConf(),
    Sets.newHashSet(LEFT_FIELD, RIGHT_FIELD),
    new ErrorExtractor());
final HiveSource source = new HiveSource(hiveConf, this.sqlContext.get(), converter);
final JavaRDD<AvroPayload> payloadRDD = converter.map(df.javaRDD()).getData();
new Configuration(), Collections.singleton(STRING_FIELD), new ErrorExtractor());