/**
 * Delete a file tracked by a {@link DataFile} from the underlying table.
 *
 * @param file a DataFile to remove from the table
 * @return this for method chaining
 */
default DeleteFiles deleteFile(DataFile file) {
  deleteFile(file.path());
  return this;
}
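// Hedged usage sketch, not part of the excerpt above: `table` and `dataFile`
// are assumed to exist. It shows how the `return this` chaining lets several
// deletes be queued on one DeleteFiles and committed together as a single snapshot.
static void deleteExample(Table table, DataFile dataFile, DataFile otherFile) {
  table.newDelete()
      .deleteFile(dataFile)         // DataFile overload shown above
      .deleteFile(otherFile.path()) // CharSequence overload it delegates to
      .commit();                    // apply all queued deletes atomically
}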
@Override
public String toString() {
  return Objects.toStringHelper(this)
      .add("file", file.path())
      .add("partition_data", file.partition())
      .add("residual", residual())
      .toString();
}
@Override
public RewriteFiles rewriteFiles(Set<DataFile> filesToDelete, Set<DataFile> filesToAdd) {
  Preconditions.checkArgument(filesToDelete != null && !filesToDelete.isEmpty(),
      "Files to delete cannot be null or empty");
  Preconditions.checkArgument(filesToAdd != null && !filesToAdd.isEmpty(),
      "Files to add cannot be null or empty");

  for (DataFile toDelete : filesToDelete) {
    delete(toDelete.path());
  }

  for (DataFile toAdd : filesToAdd) {
    add(toAdd);
  }

  return this;
}
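// Hedged sketch, assuming a Table `table` and Guava's Sets: a compaction-style
// rewrite that replaces two small files with one merged file. Both sets are
// non-empty, satisfying the argument checks above.
static void compactExample(Table table, DataFile small1, DataFile small2, DataFile merged) {
  table.newRewrite()
      .rewriteFiles(Sets.newHashSet(small1, small2), // files to delete
          Sets.newHashSet(merged))                   // files to add
      .commit();
}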
@Override
public void abort() throws IOException {
  FileSystem fs = currentPath.getFileSystem(conf);

  // clean up files created by this writer
  Tasks.foreach(completedFiles)
      .throwFailureWhenFinished()
      .noRetry()
      .run(file -> fs.delete(new Path(file.path().toString())), IOException.class);

  if (currentAppender != null) {
    currentAppender.close();
    this.currentAppender = null;
    fs.delete(currentPath);
  }
}
List<String> paths(DataFile... dataFiles) {
  List<String> paths = Lists.newArrayListWithExpectedSize(dataFiles.length);
  for (DataFile file : dataFiles) {
    paths.add(file.path().toString());
  }
  return paths;
}
if (entry != null && locations.contains(locationWrapper.set(entry.file().path()))) {
  results.add(entry.copy());
}
static void validateManifest(String manifest, Iterator<Long> ids, Iterator<DataFile> expectedFiles) {
  for (ManifestEntry entry : ManifestReader.read(localInput(manifest)).entries()) {
    DataFile file = entry.file();
    DataFile expected = expectedFiles.next();
    Assert.assertEquals("Path should match expected",
        expected.path().toString(), file.path().toString());
    Assert.assertEquals("Snapshot ID should match expected ID",
        (long) ids.next(), entry.snapshotId());
  }

  Assert.assertFalse("Should find all files in the manifest", expectedFiles.hasNext());
}
@Override
public void abort(WriterCommitMessage[] messages) {
  FileSystem fs;
  try {
    fs = new Path(table.location()).getFileSystem(conf);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }

  Tasks.foreach(files(messages))
      .retry(propertyAsInt(COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT))
      .exponentialBackoff(
          propertyAsInt(COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT),
          propertyAsInt(COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT),
          propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT),
          2.0 /* exponential */)
      .throwFailureWhenFinished()
      .run(file -> {
        try {
          fs.delete(new Path(file.path().toString()), false /* not recursive */);
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
      });
}
public Builder copy(DataFile toCopy) {
  if (isPartitioned) {
    this.partitionData = copyPartitionData(spec, toCopy.partition(), partitionData);
  }
  this.filePath = toCopy.path().toString();
  this.format = toCopy.format();
  this.recordCount = toCopy.recordCount();
  this.fileSizeInBytes = toCopy.fileSizeInBytes();
  this.blockSizeInBytes = toCopy.blockSizeInBytes();
  this.columnSizes = toCopy.columnSizes();
  this.valueCounts = toCopy.valueCounts();
  this.nullValueCounts = toCopy.nullValueCounts();
  this.lowerBounds = toCopy.lowerBounds();
  this.upperBounds = toCopy.upperBounds();
  return this;
}
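// Hedged sketch: assumes DataFiles.builder(spec) is the entry point for this
// Builder and that withPath can override the copied location. The method name
// `relocated` is hypothetical. Useful when a file has been moved but its
// partition data, counts, and bounds should be preserved.
static DataFile relocated(PartitionSpec spec, DataFile original, String newLocation) {
  return DataFiles.builder(spec)
      .copy(original)          // copies partition data, metrics, and sizes (see above)
      .withPath(newLocation)   // override only the file path
      .build();
}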
static void validateManifestEntries(String manifest, Iterator<Long> ids,
                                    Iterator<DataFile> expectedFiles,
                                    Iterator<ManifestEntry.Status> expectedStatuses) {
  for (ManifestEntry entry : ManifestReader.read(localInput(manifest)).entries()) {
    DataFile file = entry.file();
    DataFile expected = expectedFiles.next();
    final ManifestEntry.Status expectedStatus = expectedStatuses.next();
    Assert.assertEquals("Path should match expected",
        expected.path().toString(), file.path().toString());
    Assert.assertEquals("Snapshot ID should match expected ID",
        (long) ids.next(), entry.snapshotId());
    Assert.assertEquals("Entry status should match expected status",
        expectedStatus, entry.status());
  }

  Assert.assertFalse("Should find all files in the manifest", expectedFiles.hasNext());
}
private Iterator<InternalRow> open(FileScanTask task, Schema readSchema, Configuration conf) {
  InputFile location = HadoopInputFile.fromLocation(task.file().path(), conf);
  CloseableIterable<InternalRow> iter;
  switch (task.file().format()) {
    case ORC:
      SparkOrcReader reader = new SparkOrcReader(location, task, readSchema);
      this.currentCloseable = reader;
      return reader;

    case PARQUET:
      iter = newParquetIterable(location, task, readSchema);
      break;

    case AVRO:
      iter = newAvroIterable(location, task, readSchema);
      break;

    default:
      throw new UnsupportedOperationException(
          "Cannot read unknown format: " + task.file().format());
  }

  this.currentCloseable = iter;
  return iter.iterator();
}
private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir)
    throws IOException {
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<Record> expected = RandomData.generateList(tableSchema, 100, 0L);
  Dataset<Row> df = createDataset(expected, tableSchema);
  DataFrameWriter<?> writer = df.write().format("iceberg").mode("append");

  writer.save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> actual = result.collectAsList();

  Assert.assertEquals("Result size should match expected", expected.size(), actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    assertEqualsSafe(tableSchema.asStruct(), expected.get(i), actual.get(i));
  }

  table.currentSnapshot().addedFiles().forEach(dataFile ->
      Assert.assertTrue(
          String.format(
              "File should have the parent directory %s, but has: %s.",
              expectedDataDir.getAbsolutePath(),
              dataFile.path()),
          URI.create(dataFile.path().toString()).getPath()
              .startsWith(expectedDataDir.getAbsolutePath())));
}
if (!currentIds.contains(entry.snapshotId())) {
  filesToDelete.add(entry.file().path().toString());
}
void validateSnapshot(Snapshot old, Snapshot snap, DataFile... newFiles) {
  List<ManifestFile> oldManifests = old != null ? old.manifests() : ImmutableList.of();

  // copy the manifests to a modifiable list and remove the existing manifests
  List<ManifestFile> newManifests = Lists.newArrayList(snap.manifests());
  for (ManifestFile oldManifest : oldManifests) {
    Assert.assertTrue("New snapshot should contain old manifests",
        newManifests.remove(oldManifest));
  }

  Assert.assertEquals("Should create 1 new manifest and reuse old manifests",
      1, newManifests.size());
  ManifestFile manifest = newManifests.get(0);

  long id = snap.snapshotId();
  Iterator<String> newPaths = paths(newFiles).iterator();

  for (ManifestEntry entry : ManifestReader.read(localInput(manifest.path())).entries()) {
    DataFile file = entry.file();
    Assert.assertEquals("Path should match expected", newPaths.next(), file.path().toString());
    Assert.assertEquals("File's snapshot ID should match", id, entry.snapshotId());
  }

  Assert.assertFalse("Should find all files in the manifest", newPaths.hasNext());
}
for (ManifestEntry entry : reader.entries()) {
  DataFile file = entry.file();
  boolean fileDelete = (deletePaths.contains(pathWrapper.set(file.path())) ||
      dropPartitions.contains(partitionWrapper.set(file.partition())));
  if (fileDelete || inclusive.eval(file.partition())) {
    ValidationException.check(
        fileDelete || strict.eval(file.partition()) || metricsEvaluator.eval(file),
        "Cannot delete file where some, but not all, rows match filter %s: %s",
        this.deleteExpression, file.path());
  }
}

for (ManifestEntry entry : reader.entries()) {
  DataFile file = entry.file();
  boolean fileDelete = (deletePaths.contains(pathWrapper.set(file.path())) ||
      dropPartitions.contains(partitionWrapper.set(file.partition())));
  if (entry.status() != Status.DELETED) {
    ValidationException.check(
        fileDelete || strict.eval(file.partition()) || metricsEvaluator.eval(file),
        "Cannot delete file where some, but not all, rows match filter %s: %s",
        this.deleteExpression, file.path());

    CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path());
    if (deletedPaths.contains(wrapper)) {
      LOG.warn("Deleting a duplicate path from manifest {}: {}",
private CloseableIterable<Record> open(FileScanTask task) {
  InputFile input = ops.io().newInputFile(task.file().path().toString());

  // TODO: join to partition data from the manifest file
  switch (task.file().format()) {
    case AVRO:
      Avro.ReadBuilder avro = Avro.read(input)
          .project(projection)
          .createReaderFunc(DataReader::create)
          .split(task.start(), task.length());

      if (reuseContainers) {
        avro.reuseContainers();
      }

      return avro.build();

    case PARQUET:
      Parquet.ReadBuilder parquet = Parquet.read(input)
          .project(projection)
          .createReaderFunc(fileSchema -> buildReader(projection, fileSchema))
          .split(task.start(), task.length());

      if (reuseContainers) {
        parquet.reuseContainers();
      }

      return parquet.build();

    default:
      throw new UnsupportedOperationException(String.format("Cannot read %s file: %s",
          task.file().format().name(), task.file().path()));
  }
}
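// Hedged sketch, assuming a Table `table`: shows where the FileScanTask
// consumed by open() typically comes from in the scan planning API.
static void scanExample(Table table) throws IOException {
  try (CloseableIterable<FileScanTask> tasks =
           table.newScan().filter(Expressions.equal("id", 5)).planFiles()) {
    for (FileScanTask task : tasks) {
      // each task carries the DataFile plus the split range handed to open()
      System.out.println(task.file().path() + " as " + task.file().format());
    }
  }
}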
@Override
public List<ManifestFile> apply(TableMetadata base) {
  if (validateAddedFiles) {
    PartitionSpec spec = writeSpec();
    Expression rowFilter = rowFilter();

    Expression inclusiveExpr = Projections.inclusive(spec).project(rowFilter);
    Evaluator inclusive = new Evaluator(spec.partitionType(), inclusiveExpr);

    Expression strictExpr = Projections.strict(spec).project(rowFilter);
    Evaluator strict = new Evaluator(spec.partitionType(), strictExpr);

    StrictMetricsEvaluator metrics = new StrictMetricsEvaluator(base.schema(), rowFilter);

    for (DataFile file : addedFiles()) {
      // the real test is that the strict or metrics test matches the file, indicating that all
      // records in the file match the filter. inclusive is used to avoid testing the metrics,
      // which is more complicated
      ValidationException.check(
          inclusive.eval(file.partition()) &&
              (strict.eval(file.partition()) || metrics.eval(file)),
          "Cannot append file with rows that do not match filter: %s: %s",
          rowFilter, file.path());
    }
  }

  return super.apply(base);
}
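// Hedged helper distilled from the validation above; the method name and the
// omission of the metrics fallback are choices made here, not in the source.
// Strict projection proves every row in a matching partition satisfies the
// filter, while inclusive projection only proves that some row might.
static boolean allRowsMatch(PartitionSpec spec, Expression rowFilter, DataFile file) {
  Evaluator inclusive = new Evaluator(spec.partitionType(),
      Projections.inclusive(spec).project(rowFilter));
  Evaluator strict = new Evaluator(spec.partitionType(),
      Projections.strict(spec).project(rowFilter));
  return inclusive.eval(file.partition()) && strict.eval(file.partition());
}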