diff --git a/api/src/main/java/org/apache/iceberg/FileContent.java b/api/src/main/java/org/apache/iceberg/FileContent.java index 2c9a2fa51bd2..f700305d68ee 100644 --- a/api/src/main/java/org/apache/iceberg/FileContent.java +++ b/api/src/main/java/org/apache/iceberg/FileContent.java @@ -18,11 +18,21 @@ */ package org.apache.iceberg; -/** Content type stored in a file, one of DATA, POSITION_DELETES, or EQUALITY_DELETES. */ +/** + * Content type stored in a file. + * + *

<p>For V1-V3 tables: DATA, POSITION_DELETES, or EQUALITY_DELETES. + * + *

For V4 tables: DATA, POSITION_DELETES, EQUALITY_DELETES, DATA_MANIFEST, or DELETE_MANIFEST. + */ public enum FileContent { DATA(0), POSITION_DELETES(1), - EQUALITY_DELETES(2); + EQUALITY_DELETES(2), + /** Data manifest entry (V4+ only) - references data files in a root manifest. */ + DATA_MANIFEST(3), + /** Delete manifest entry (V4+ only) - references delete files in a root manifest. */ + DELETE_MANIFEST(4); private final int id; diff --git a/core/src/main/java/org/apache/iceberg/ContentInfo.java b/core/src/main/java/org/apache/iceberg/ContentInfo.java new file mode 100644 index 000000000000..6a4831214f4d --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ContentInfo.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import org.apache.iceberg.types.Types; + +/** + * Metadata about externally stored content such as deletion vectors. + * + *

<p>The deletion vector content is stored at the specified offset and size within the file + * referenced by {@link TrackedFile#location()}. + * + *

<p>This struct must be defined when content_type is POSITION_DELETES, and must be null otherwise. + * + *

Note: For manifest-level deletion vectors (marking entries in a manifest as deleted), see + * {@link TrackedFile#manifestDV()} which stores the DV inline as a binary field. + */ +interface ContentInfo { + Types.NestedField OFFSET = + Types.NestedField.required( + 144, "offset", Types.LongType.get(), "Offset in the file where the content starts"); + Types.NestedField SIZE_IN_BYTES = + Types.NestedField.required( + 145, + "size_in_bytes", + Types.LongType.get(), + "Length of the referenced content stored in the file"); + + static Types.StructType schema() { + return Types.StructType.of(OFFSET, SIZE_IN_BYTES); + } + + /** + * Returns the offset in the file where the deletion vector content starts. + * + *

The file location is specified in the {@link TrackedFile#location()} field. + */ + long offset(); + + /** Returns the size in bytes of the deletion vector content. */ + long sizeInBytes(); +} diff --git a/core/src/main/java/org/apache/iceberg/ManifestEntryStatus.java b/core/src/main/java/org/apache/iceberg/ManifestEntryStatus.java new file mode 100644 index 000000000000..c54178096db5 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ManifestEntryStatus.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +/** + * Status of an entry in a manifest file. + * + *

This is a top-level enum to avoid duplication across manifest entry types (V3 ManifestEntry + * and V4 TrackingInfo). + */ +public enum ManifestEntryStatus { + EXISTING(0), + ADDED(1), + DELETED(2); + + private final int id; + + ManifestEntryStatus(int id) { + this.id = id; + } + + public int id() { + return id; + } +} diff --git a/core/src/main/java/org/apache/iceberg/ManifestStats.java b/core/src/main/java/org/apache/iceberg/ManifestStats.java new file mode 100644 index 000000000000..89e57f5bea52 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ManifestStats.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import org.apache.iceberg.types.Types; + +/** + * Statistics for manifest entries in a V4 tracked file. + * + *

This encapsulates added/removed/existing files/row counts and min_sequence_number for a + * manifest. This must be defined when the content_type is a manifest (3 or 4), and null otherwise. + */ +interface ManifestStats { + Types.NestedField ADDED_FILES_COUNT = + Types.NestedField.required( + 504, "added_files_count", Types.IntegerType.get(), "Number of files added"); + Types.NestedField EXISTING_FILES_COUNT = + Types.NestedField.required( + 505, "existing_files_count", Types.IntegerType.get(), "Number of existing files"); + Types.NestedField DELETED_FILES_COUNT = + Types.NestedField.required( + 506, "deleted_files_count", Types.IntegerType.get(), "Number of deleted files"); + Types.NestedField ADDED_ROWS_COUNT = + Types.NestedField.required( + 512, "added_rows_count", Types.LongType.get(), "Number of rows in added files"); + Types.NestedField EXISTING_ROWS_COUNT = + Types.NestedField.required( + 513, "existing_rows_count", Types.LongType.get(), "Number of rows in existing files"); + Types.NestedField DELETED_ROWS_COUNT = + Types.NestedField.required( + 514, "deleted_rows_count", Types.LongType.get(), "Number of rows in deleted files"); + Types.NestedField MIN_SEQUENCE_NUMBER = + Types.NestedField.required( + 516, + "min_sequence_number", + Types.LongType.get(), + "Minimum sequence number of files in this manifest"); + + static Types.StructType schema() { + return Types.StructType.of( + ADDED_FILES_COUNT, + EXISTING_FILES_COUNT, + DELETED_FILES_COUNT, + ADDED_ROWS_COUNT, + EXISTING_ROWS_COUNT, + DELETED_ROWS_COUNT, + MIN_SEQUENCE_NUMBER); + } + + /** Returns the number of files added by this manifest. */ + int addedFilesCount(); + + /** Returns the number of existing files referenced by this manifest. */ + int existingFilesCount(); + + /** Returns the number of deleted files in this manifest. */ + int deletedFilesCount(); + + /** Returns the number of rows in added files. */ + long addedRowsCount(); + + /** Returns the number of rows in existing files. 
*/ + long existingRowsCount(); + + /** Returns the number of rows in deleted files. */ + long deletedRowsCount(); + + /** Returns the minimum sequence number of files in this manifest. */ + long minSequenceNumber(); +} diff --git a/core/src/main/java/org/apache/iceberg/TrackedFile.java b/core/src/main/java/org/apache/iceberg/TrackedFile.java new file mode 100644 index 000000000000..e51acba98e41 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackedFile.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.stats.ContentStats; +import org.apache.iceberg.types.Types; + +/** + * Represents a V4 content entry in a manifest file. + * + *

TrackedFile is the V4 equivalent of ContentFile. It provides a unified representation for all + * entry types in a V4 manifest: data files, delete files, manifests, and deletion vectors. + */ +interface TrackedFile { + // Field IDs from V4 specification + Types.NestedField TRACKING_INFO = + Types.NestedField.required( + 147, "tracking_info", TrackingInfo.schema(), "Tracking information for this entry"); + Types.NestedField CONTENT_TYPE = + Types.NestedField.required( + 134, + "content_type", + Types.IntegerType.get(), + "Type of content: 0=DATA, 1=POSITION_DELETES, 2=EQUALITY_DELETES, 3=DATA_MANIFEST, 4=DELETE_MANIFEST"); + Types.NestedField LOCATION = + Types.NestedField.required(100, "location", Types.StringType.get(), "Location of the file"); + Types.NestedField FILE_FORMAT = + Types.NestedField.required( + 101, + "file_format", + Types.StringType.get(), + "String file format name: avro, orc, parquet, or puffin"); + Types.NestedField PARTITION_SPEC_ID = + Types.NestedField.required( + 149, + "partition_spec_id", + Types.IntegerType.get(), + "ID of partition spec used to write manifest or data/delete files"); + Types.NestedField SORT_ORDER_ID = + Types.NestedField.optional( + 140, + "sort_order_id", + Types.IntegerType.get(), + "ID representing sort order for this file. 
Can only be set if content_type is 0"); + Types.NestedField RECORD_COUNT = + Types.NestedField.required( + 103, + "record_count", + Types.LongType.get(), + "Number of records in this file, or the cardinality of a deletion vector"); + Types.NestedField FILE_SIZE_IN_BYTES = + Types.NestedField.optional( + 104, "file_size_in_bytes", Types.LongType.get(), "Total file size in bytes."); + Types.NestedField CONTENT_STATS = + Types.NestedField.optional( + 146, + "content_stats", + Types.StructType.of(), // schema is derived from table schema at read/write time + "Content statistics for this entry"); + Types.NestedField KEY_METADATA = + Types.NestedField.optional( + 131, + "key_metadata", + Types.BinaryType.get(), + "Implementation-specific key metadata for encryption"); + Types.NestedField SPLIT_OFFSETS = + Types.NestedField.optional( + 132, + "split_offsets", + Types.ListType.ofRequired(133, Types.LongType.get()), + "Split offsets for the data file. Must be sorted ascending"); + Types.NestedField CONTENT_INFO = + Types.NestedField.optional( + 148, + "content_info", + ContentInfo.schema(), + "Content info. Required when content_type is POSITION_DELETES, must be null otherwise"); + Types.NestedField EQUALITY_IDS = + Types.NestedField.optional( + 135, + "equality_ids", + Types.ListType.ofRequired(136, Types.IntegerType.get()), + "Field ids used to determine row equality in equality delete files. Required when content=2"); + Types.NestedField REFERENCED_FILE = + Types.NestedField.optional( + 143, + "referenced_file", + Types.StringType.get(), + "Location of referenced data file or affiliated manifest"); + Types.NestedField MANIFEST_STATS = + Types.NestedField.optional( + 150, + "manifest_stats", + ManifestStats.schema(), + "Manifest statistics. 
Required for DATA_MANIFEST and DELETE_MANIFEST"); + Types.NestedField MANIFEST_DV = + Types.NestedField.optional( + 151, + "manifest_dv", + Types.BinaryType.get(), + "Serialized deletion vector for manifest entries"); + + /** + * Returns the path of the manifest which this file is referenced in or null if it was not read + * from a manifest. + */ + String manifestLocation(); + + /** + * Returns the tracking information for this entry. + * + *

<p>Contains status, snapshot ID, sequence numbers, and first-row-id. The struct is required; + * members such as the snapshot ID are optional and may be null when they are inherited. + */ + TrackingInfo trackingInfo(); + + /** + * Returns the type of content stored by this entry. + * + *

<p>One of: DATA, POSITION_DELETES, EQUALITY_DELETES, DATA_MANIFEST, or DELETE_MANIFEST. + */ + FileContent contentType(); + + /** Returns the location of the file. */ + String location(); + + /** Returns the format of the file (avro, orc, parquet, or puffin). */ + FileFormat fileFormat(); + + /** + * Returns the content info. + * + *

<p>Must be defined if content_type is POSITION_DELETES, must be null otherwise. + */ + ContentInfo contentInfo(); + + /** Returns the ID of the partition spec used to write this file or manifest. */ + int partitionSpecId(); + + /** + * Returns the ID representing sort order for this file. + * + *

<p>Can only be set if content_type is DATA. + */ + Integer sortOrderId(); + + /** Returns the number of records in this file, or the cardinality of a deletion vector. */ + long recordCount(); + + /** + * Returns the total file size in bytes. + * + *

<p>Must be defined if location is defined. + */ + long fileSizeInBytes(); + + /** Returns the content stats for this entry. */ + ContentStats contentStats(); + + /** + * Returns the manifest stats for this entry. + * + *

<p>Must be set if content_type is DATA_MANIFEST or DELETE_MANIFEST, otherwise must be null. + */ + ManifestStats manifestStats(); + + /** + * Returns the manifest deletion vector for this entry. + * + *

<p>When present, this is a serialized deletion vector where each set bit position corresponds + * to an entry in the manifest that should be treated as deleted. This allows marking manifest + * entries as deleted without rewriting the manifest file. + * + *

<p>Optional for DATA_MANIFEST and DELETE_MANIFEST content types, must be null otherwise. + */ + ByteBuffer manifestDV(); + + /** Returns metadata about how this file is encrypted, or null if stored in plain text. */ + ByteBuffer keyMetadata(); + + /** + * Returns the list of recommended split locations if applicable, or null otherwise. + * + *

<p>Must be sorted in ascending order. + */ + List<Long> splitOffsets(); + + /** + * Returns the list of field IDs used for equality comparison, in equality delete files. + * + *

<p>Required when content_type is EQUALITY_DELETES, must be null otherwise. + */ + List<Integer> equalityIds(); + + /** + * Returns the location of the referenced file. + * + *

<p>For POSITION_DELETES: location of the data file that the deletion vector references. + * + *

<p>For DELETE_MANIFEST: location of the affiliated data manifest, or null if unaffiliated. + */ + String referencedFile(); + + /** + * Copies this tracked file. + * + *

<p>Manifest readers can reuse file instances; use this method to copy data when collecting + * files from tasks. + */ + TrackedFile copy(); + + /** + * Copies this tracked file without stats. + * + *

<p>Use this method to copy data without stats when collecting files. + */ + TrackedFile copyWithoutStats(); + + /** + * Copies this tracked file with stats only for specific columns. + * + *

<p>Manifest readers can reuse file instances; use this method to copy data with stats only for + * specific columns when collecting files. + * + * @param requestedColumnIds column IDs for which to keep stats + * @return a copy of this tracked file, with content stats for only the requested columns + */ + TrackedFile copyWithStats(Set<Integer> requestedColumnIds); + + /** + * Converts this tracked file to a DataFile. + * + *

<p>Only valid when content_type is DATA. The partition spec must have been set on this + * TrackedFile prior to calling this method (typically done by the manifest reader). + * + * @return a DataFile representation + * @throws IllegalStateException if content_type is not DATA or partition spec is not set + */ + DataFile asDataFile(); + + /** + * Converts this tracked file to a DeleteFile. + * + *

Only valid when content_type is POSITION_DELETES or EQUALITY_DELETES. The partition spec + * must have been set on this TrackedFile prior to calling this method (typically done by the + * manifest reader). + * + * @return a DeleteFile representation + * @throws IllegalStateException if content_type is not a delete type or partition spec is not set + */ + DeleteFile asDeleteFile(); +} diff --git a/core/src/main/java/org/apache/iceberg/TrackingInfo.java b/core/src/main/java/org/apache/iceberg/TrackingInfo.java new file mode 100644 index 000000000000..330d6e35ff71 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackingInfo.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import org.apache.iceberg.types.Types; + +/** + * Tracking information for a tracked file entry in a V4 manifest. + * + *

This groups the status, snapshot, and sequence number information for the entry. This enables + * accessing the fields for the entry and provides an isolated structure that can be modified. + */ +interface TrackingInfo { + Types.NestedField STATUS = + Types.NestedField.required( + 0, "status", Types.IntegerType.get(), "Entry status: 0=existing, 1=added, 2=deleted"); + Types.NestedField SNAPSHOT_ID = + Types.NestedField.optional( + 1, + "snapshot_id", + Types.LongType.get(), + "Snapshot ID where the file was added or deleted"); + Types.NestedField SEQUENCE_NUMBER = + Types.NestedField.optional( + 3, "sequence_number", Types.LongType.get(), "Data sequence number of the file"); + Types.NestedField FILE_SEQUENCE_NUMBER = + Types.NestedField.optional( + 4, + "file_sequence_number", + Types.LongType.get(), + "File sequence number indicating when the file was added"); + Types.NestedField FIRST_ROW_ID = + Types.NestedField.optional( + 142, "first_row_id", Types.LongType.get(), "ID of the first row in the data file"); + + static Types.StructType schema() { + return Types.StructType.of( + STATUS, SNAPSHOT_ID, SEQUENCE_NUMBER, FILE_SEQUENCE_NUMBER, FIRST_ROW_ID); + } + + /** + * Returns the status of the entry. + * + *

<p>Status values: + * + * <ul> + * <li>0: EXISTING + * <li>1: ADDED + * <li>2: DELETED + * </ul>

+ */ + ManifestEntryStatus status(); + + /** Returns the snapshot ID where the file was added or deleted. */ + Long snapshotId(); + + /** Returns the data sequence number of the file. */ + Long dataSequenceNumber(); + + /** Returns the file sequence number indicating when the file was added. */ + Long fileSequenceNumber(); + + /** Returns the ID of the first row in the data file. */ + Long firstRowId(); + + /** Returns the path of the manifest which this entry was read from. */ + String manifestLocation(); + + /** Returns the ordinal position of this entry within the manifest. */ + long manifestPos(); +}