// Streams CSV records from the input and re-encodes them as Avro on the
// output. Rows that fail conversion are collected in `failures` instead of
// aborting; `written` counts records successfully appended.
@Override
public void process(InputStream in, OutputStream out) throws IOException {
  try (CSVFileReader<Record> csvReader = new CSVFileReader<>(
      in, props, schema, Record.class)) {
    csvReader.initialize();
    try (DataFileWriter<Record> avroWriter = writer.create(schema, out)) {
      while (csvReader.hasNext()) {
        try {
          avroWriter.append(csvReader.next());
          written.incrementAndGet();
        } catch (DatasetRecordException e) {
          // Best-effort copy: remember the bad record and keep going.
          failures.add(e);
        }
      }
    }
  }
} });
// Creates a CSV-backed Hadoop RecordReader for a single file split.
// Only FileSplit is supported because the reader needs a concrete path.
@Override
public RecordReader<E, Void> createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException, InterruptedException {
  // NOTE(review): resolved via a dynamic invoker, presumably to bridge
  // Hadoop 1/2 API differences — confirm against the Hadoop shim class.
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(context);
  // Guard clause: reject anything that is not a file-based split.
  if (!(split instanceof FileSplit)) {
    throw new DatasetOperationException(
        "Split is not a FileSplit: %s:%s",
        split.getClass().getCanonicalName(), split);
  }
  Path csvPath = ((FileSplit) split).getPath();
  CSVFileReader<E> csvReader = new CSVFileReader<E>(
      csvPath.getFileSystem(conf), csvPath, descriptor, accessor);
  csvReader.initialize();
  return csvReader.asRecordReader();
} }
// Counts the records in a CSV file described by the given descriptor,
// echoing each record to stderr as debug output.
//
// Fix: the original called reader.close() only after the loop, leaking the
// reader if initialize() or iteration threw. try-with-resources guarantees
// the reader is closed on every path.
public int count(FileSystem fs, Path path, DatasetDescriptor descriptor) {
  try (CSVFileReader<GenericRecord> reader = new CSVFileReader<GenericRecord>(
      fs, path, descriptor,
      DataModelUtil.accessor(GenericRecord.class, descriptor.getSchema()))) {
    reader.initialize();
    int count = 0;
    for (GenericRecord record : reader) {
      count += 1;
      System.err.println(record); // debug: echo each record as it is read
    }
    return count;
  }
}
// Verifies that a value which cannot be coerced to the schema's numeric
// type is rejected with DatasetRecordException rather than read silently.
@Test
public void testBadNumericSchema() {
  final DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TYPE_ERROR_SCHEMA)
      .build();
  final CSVFileReader<GenericData.Record> reader =
      new CSVFileReader<GenericData.Record>(localfs, csvFile, descriptor,
          DataModelUtil.accessor(GenericData.Record.class,
              descriptor.getSchema()));
  reader.initialize();
  // There is a row to read; materializing it is what must fail.
  Assert.assertTrue(reader.hasNext());
  TestHelpers.assertThrows("Should reject float value for integer schema",
      DatasetRecordException.class, new Runnable() {
        @Override
        public void run() {
          reader.next();
        }
      });
}
// Tail of an anonymous Runnable handed to TestHelpers.assertThrows: the
// next() call here is expected to throw. The assertThrows call that opens
// this anonymous class starts on another line of this mangled source.
@Override public void run() { reader.next(); } });
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106 private void openNextReader() { if (Formats.PARQUET.equals(descriptor.getFormat())) { this.reader = new ParquetFileSystemDatasetReader(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } else if (Formats.JSON.equals(descriptor.getFormat())) { this.reader = new JSONFileReader<E>( fileSystem, filesIter.next(), accessor); } else if (Formats.CSV.equals(descriptor.getFormat())) { this.reader = new CSVFileReader<E>(fileSystem, filesIter.next(), descriptor, accessor); } else if (Formats.INPUTFORMAT.equals(descriptor.getFormat())) { this.reader = new InputFormatReader(fileSystem, filesIter.next(), descriptor); } else { this.reader = new FileSystemDatasetReader<E>(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } reader.initialize(); this.readerIterator = Iterators.filter(reader, constraints.toEntityPredicate( (pathIter != null ? pathIter.getStorageKey() : null), accessor)); }
// Returns the next materialized record.
//
// Must only be called while the reader is OPEN and a record is available;
// throws IllegalStateException / NoSuchElementException otherwise. The
// underlying row (`next`) is converted before the cursor advances, so the
// advance happens in a finally block after makeRecord has run.
@Override
public E next() {
  Preconditions.checkState(state.equals(ReaderWriterState.OPEN),
      "Attempt to read from a file in state:%s", state);
  if (!hasNext) {
    throw new NoSuchElementException();
  }
  try {
    // Guard clause: without record reuse, build a fresh object per call.
    if (!reuseRecords) {
      return builder.makeRecord(next, null);
    }
    this.record = builder.makeRecord(next, record);
    return record;
  } finally {
    // Advance only after the current row has been materialized.
    this.hasNext = advance();
  }
}
// Tail of an anonymous Runnable handed to TestHelpers.assertThrows: the
// next() call here is expected to throw. The assertThrows call that opens
// this anonymous class starts on another line of this mangled source.
@Override public void run() { reader.next(); } });
// Builds a CSV reader over the validator file whose first line is treated
// as a header row (kite.csv.has-header=true).
@Override
public DatasetReader<GenericData.Record> newReader() throws IOException {
  final DatasetDescriptor withHeader = new DatasetDescriptor.Builder()
      .property("kite.csv.has-header", "true")
      .schema(VALIDATOR_SCHEMA)
      .build();
  return new CSVFileReader<GenericData.Record>(
      localfs, validatorFile, withHeader,
      DataModelUtil.accessor(GenericData.Record.class,
          withHeader.getSchema()));
}
// Copies CSV records from `in` to Avro-encoded output on `out`. Rows that
// fail conversion are collected in `failures` instead of aborting the copy;
// `written` counts records successfully appended.
@Override public void process(InputStream in, OutputStream out) throws IOException {
  try (CSVFileReader<Record> reader = new CSVFileReader<>(
      in, props, schema, Record.class)) {
    reader.initialize();
    try (DataFileWriter<Record> w = writer.create(schema, out)) {
      while (reader.hasNext()) {
        try {
          Record record = reader.next();
          w.append(record);
          written.incrementAndGet(); // one more record written successfully
        } catch (DatasetRecordException e) {
          // Best-effort: record the failure and continue with the next row.
          failures.add(e);
        }
      }
    }
  }
} });
// Tail of an anonymous Runnable handed to TestHelpers.assertThrows: the
// next() call here is expected to throw. The assertThrows call that opens
// this anonymous class starts on another line of this mangled source.
@Override public void run() { reader.next(); } });
// An array schema is not a record schema; constructing the reader with it
// must fail with IllegalArgumentException (checked via `expected`).
@Test(expected = IllegalArgumentException.class)
public void testRejectsNonRecordSchemas() {
  final DatasetDescriptor arrayDescriptor = new DatasetDescriptor.Builder()
      .schema(SchemaBuilder.array().items().stringType())
      .build();
  new CSVFileReader<GenericData.Record>(localfs, csvFile, arrayDescriptor,
      DataModelUtil.accessor(GenericData.Record.class,
          arrayDescriptor.getSchema()));
}
// Fragment: continuation of a test method — the Builder chain that this
// .build() completes begins outside this chunk.
.build();
final CSVFileReader<TestBean> reader = new CSVFileReader<TestBean>(localfs,
    csvFile, desc, DataModelUtil.accessor(TestBean.class, desc.getSchema()));
reader.initialize();
// Row 1: all three bean fields populated.
Assert.assertTrue(reader.hasNext());
TestBean bean = reader.next();
Assert.assertEquals("str", bean.myStr);
Assert.assertEquals((Integer) 34, bean.myInt);
Assert.assertEquals(false, bean.myBool);
// Row 2: string containing a comma; the int field reads as null.
Assert.assertTrue(reader.hasNext());
bean = reader.next();
Assert.assertEquals("str,2", bean.myStr);
Assert.assertEquals(null, bean.myInt);
// Row 3: reading it must fail — null is not a valid number.
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about null as a number",
    DatasetRecordException.class, new Runnable() {
// NOTE(review): the Runnable's run() body sits on another line of this
// mangled source; the assertions below resume after assertThrows returns.
Assert.assertTrue(reader.hasNext());
bean = reader.next();
Assert.assertEquals("str4", bean.myStr);
Assert.assertEquals(null, bean.myInt);
Assert.assertFalse(reader.hasNext());
// Tail of an anonymous Runnable handed to TestHelpers.assertThrows: the
// next() call here is expected to throw. The assertThrows call that opens
// this anonymous class starts on another line of this mangled source.
@Override public void run() { reader.next(); } });
// Fragment: continuation of a test method over `reorderedFile` — the
// Builder chain that this .build() completes begins outside this chunk.
.build();
final CSVFileReader<TestBean> reader = new CSVFileReader<TestBean>(localfs,
    reorderedFile, desc,
    DataModelUtil.accessor(TestBean.class, desc.getSchema()));
reader.initialize();
// Row 1: all three bean fields populated despite the reordered columns.
Assert.assertTrue(reader.hasNext());
TestBean bean = reader.next();
Assert.assertEquals("str", bean.myStr);
Assert.assertEquals((Integer) 34, bean.myInt);
Assert.assertEquals(false, bean.myBool);
// Row 2: string containing a comma; the int field reads as null.
Assert.assertTrue(reader.hasNext());
bean = reader.next();
Assert.assertEquals("str,2", bean.myStr);
Assert.assertEquals(null, bean.myInt);
// Row 3: reading it must fail — null is not a valid number.
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about null as a number",
    DatasetRecordException.class, new Runnable() {
// NOTE(review): the Runnable's run() body sits on another line of this
// mangled source; the assertions below resume after assertThrows returns.
Assert.assertTrue(reader.hasNext());
bean = reader.next();
Assert.assertEquals("str4", bean.myStr);
Assert.assertEquals(null, bean.myInt);
Assert.assertFalse(reader.hasNext());
// Tail of an anonymous Runnable handed to TestHelpers.assertThrows: the
// next() call here is expected to throw. The assertThrows call that opens
// this anonymous class starts on another line of this mangled source.
@Override public void run() { reader.next(); } });
// Fragment: continuation of a test method reading into GenericData.Record —
// the Builder chain that this .build() completes begins outside this chunk.
.build();
final CSVFileReader<GenericData.Record> reader =
    new CSVFileReader<GenericData.Record>(localfs, csvFile, desc,
        DataModelUtil.accessor(GenericData.Record.class, desc.getSchema()));
reader.initialize();
// Row 1: fields by position — string, int, and (index 3) boolean.
Assert.assertTrue(reader.hasNext());
GenericData.Record rec = reader.next();
Assert.assertEquals("str", rec.get(0));
Assert.assertEquals(34, rec.get(1));
Assert.assertEquals(false, rec.get(3));
// Row 2: string containing a comma; field 1 reads as 0 (presumably the
// schema default — confirm against the test's schema definition).
Assert.assertTrue(reader.hasNext());
rec = reader.next();
Assert.assertEquals("str,2", rec.get(0));
Assert.assertEquals(0, rec.get(1));
// Row 3: must fail — null is not a valid number.
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about null as a number",
    DatasetRecordException.class, new Runnable() {
// NOTE(review): run() bodies for both assertThrows calls live on other
// lines of this mangled source.
// Row 4: must fail — a required field has no value and no default.
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about missing default",
    DatasetRecordException.class, new Runnable() {
Assert.assertFalse(reader.hasNext());
// Tail of an anonymous Runnable handed to TestHelpers.assertThrows: the
// next() call here is expected to throw. The assertThrows call that opens
// this anonymous class starts on another line of this mangled source.
@Override public void run() { reader.next(); } });
// Fragment: continuation of a test method reading into TestGenericRecord —
// the Builder chain that this .build() completes begins outside this chunk.
.build();
final CSVFileReader<TestGenericRecord> reader =
    new CSVFileReader<TestGenericRecord>(localfs, csvFile, desc,
        DataModelUtil.accessor(TestGenericRecord.class, desc.getSchema()));
reader.initialize();
// Row 1: fields by position — string, int, and (index 3) boolean.
Assert.assertTrue(reader.hasNext());
TestGenericRecord record = reader.next();
Assert.assertEquals("str", record.get(0));
Assert.assertEquals((Integer) 34, record.get(1));
Assert.assertEquals(false, record.get(3));
// Row 2: string containing a comma; field 1 reads as 0 (presumably the
// schema default — confirm against the test's schema definition).
Assert.assertTrue(reader.hasNext());
record = reader.next();
Assert.assertEquals("str,2", record.get(0));
Assert.assertEquals((Integer) 0, record.get(1));
// Row 3: must fail — null is not a valid number.
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about null as a number",
    DatasetRecordException.class, new Runnable() {
// NOTE(review): run() bodies for both assertThrows calls live on other
// lines of this mangled source.
// Row 4: must fail — a required field has no value and no default.
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about missing default",
    DatasetRecordException.class, new Runnable() {
Assert.assertFalse(reader.hasNext());