diff --git a/api/src/main/java/org/apache/iceberg/FileContent.java b/api/src/main/java/org/apache/iceberg/FileContent.java
index 2c9a2fa51bd2..f700305d68ee 100644
--- a/api/src/main/java/org/apache/iceberg/FileContent.java
+++ b/api/src/main/java/org/apache/iceberg/FileContent.java
@@ -18,11 +18,21 @@
  */
 package org.apache.iceberg;
 
-/** Content type stored in a file, one of DATA, POSITION_DELETES, or EQUALITY_DELETES. */
+/**
+ * Content type stored in a file.
+ *
+ * <p>For V1-V3 tables: DATA, POSITION_DELETES, or EQUALITY_DELETES.
+ *
+ * <p>For V4 tables: DATA, POSITION_DELETES, EQUALITY_DELETES, DATA_MANIFEST, or DELETE_MANIFEST.
+ */
 public enum FileContent {
   DATA(0),
   POSITION_DELETES(1),
-  EQUALITY_DELETES(2);
+  EQUALITY_DELETES(2),
+  /** Data manifest entry (V4+ only) - references data files in a root manifest. */
+  DATA_MANIFEST(3),
+  /** Delete manifest entry (V4+ only) - references delete files in a root manifest. */
+  DELETE_MANIFEST(4);
 
   private final int id;
 
diff --git a/core/src/main/java/org/apache/iceberg/ContentInfo.java b/core/src/main/java/org/apache/iceberg/ContentInfo.java
new file mode 100644
index 000000000000..6a4831214f4d
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/ContentInfo.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import org.apache.iceberg.types.Types;
+
+/**
+ * Metadata about externally stored content such as deletion vectors.
+ *
+ * <p>The deletion vector content is stored at the specified offset and size within the file
+ * referenced by {@link TrackedFile#location()}.
+ *
+ * <p>This struct must be defined when content_type is POSITION_DELETES, and must be null otherwise.
+ *
+ * <p>Note: For manifest-level deletion vectors (marking entries in a manifest as deleted), see
+ * {@link TrackedFile#manifestDV()} which stores the DV inline as a binary field.
+ */
+interface ContentInfo {
+  Types.NestedField OFFSET =
+      Types.NestedField.required(
+          144, "offset", Types.LongType.get(), "Offset in the file where the content starts");
+  Types.NestedField SIZE_IN_BYTES =
+      Types.NestedField.required(
+          145,
+          "size_in_bytes",
+          Types.LongType.get(),
+          "Length of the referenced content stored in the file");
+
+  static Types.StructType schema() {
+    return Types.StructType.of(OFFSET, SIZE_IN_BYTES);
+  }
+
+  /**
+   * Returns the offset in the file where the deletion vector content starts.
+   *
+   * <p>The file location is specified in the {@link TrackedFile#location()} field.
+   */
+  long offset();
+
+  /** Returns the size in bytes of the deletion vector content. */
+  long sizeInBytes();
+}
diff --git a/core/src/main/java/org/apache/iceberg/ManifestEntryStatus.java b/core/src/main/java/org/apache/iceberg/ManifestEntryStatus.java
new file mode 100644
index 000000000000..c54178096db5
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/ManifestEntryStatus.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+/**
+ * Status of an entry in a manifest file.
+ *
+ * <p>This is a top-level enum to avoid duplication across manifest entry types (V3 ManifestEntry
+ * and V4 TrackingInfo).
+ */
+public enum ManifestEntryStatus {
+  EXISTING(0),
+  ADDED(1),
+  DELETED(2);
+
+  private final int id;
+
+  ManifestEntryStatus(int id) {
+    this.id = id;
+  }
+
+  public int id() {
+    return id;
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/ManifestStats.java b/core/src/main/java/org/apache/iceberg/ManifestStats.java
new file mode 100644
index 000000000000..89e57f5bea52
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/ManifestStats.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import org.apache.iceberg.types.Types;
+
+/**
+ * Statistics for manifest entries in a V4 tracked file.
+ *
+ * <p>This encapsulates added/removed/existing files/row counts and min_sequence_number for a
+ * manifest. This must be defined when the content_type is a manifest (3 or 4), and null otherwise.
+ */
+interface ManifestStats {
+  Types.NestedField ADDED_FILES_COUNT =
+      Types.NestedField.required(
+          504, "added_files_count", Types.IntegerType.get(), "Number of files added");
+  Types.NestedField EXISTING_FILES_COUNT =
+      Types.NestedField.required(
+          505, "existing_files_count", Types.IntegerType.get(), "Number of existing files");
+  Types.NestedField DELETED_FILES_COUNT =
+      Types.NestedField.required(
+          506, "deleted_files_count", Types.IntegerType.get(), "Number of deleted files");
+  Types.NestedField ADDED_ROWS_COUNT =
+      Types.NestedField.required(
+          512, "added_rows_count", Types.LongType.get(), "Number of rows in added files");
+  Types.NestedField EXISTING_ROWS_COUNT =
+      Types.NestedField.required(
+          513, "existing_rows_count", Types.LongType.get(), "Number of rows in existing files");
+  Types.NestedField DELETED_ROWS_COUNT =
+      Types.NestedField.required(
+          514, "deleted_rows_count", Types.LongType.get(), "Number of rows in deleted files");
+  Types.NestedField MIN_SEQUENCE_NUMBER =
+      Types.NestedField.required(
+          516,
+          "min_sequence_number",
+          Types.LongType.get(),
+          "Minimum sequence number of files in this manifest");
+
+  static Types.StructType schema() {
+    return Types.StructType.of(
+        ADDED_FILES_COUNT,
+        EXISTING_FILES_COUNT,
+        DELETED_FILES_COUNT,
+        ADDED_ROWS_COUNT,
+        EXISTING_ROWS_COUNT,
+        DELETED_ROWS_COUNT,
+        MIN_SEQUENCE_NUMBER);
+  }
+
+  /** Returns the number of files added by this manifest. */
+  int addedFilesCount();
+
+  /** Returns the number of existing files referenced by this manifest. */
+  int existingFilesCount();
+
+  /** Returns the number of deleted files in this manifest. */
+  int deletedFilesCount();
+
+  /** Returns the number of rows in added files. */
+  long addedRowsCount();
+
+  /** Returns the number of rows in existing files. */
+  long existingRowsCount();
+
+  /** Returns the number of rows in deleted files. */
+  long deletedRowsCount();
+
+  /** Returns the minimum sequence number of files in this manifest. */
+  long minSequenceNumber();
+}
diff --git a/core/src/main/java/org/apache/iceberg/TrackedFile.java b/core/src/main/java/org/apache/iceberg/TrackedFile.java
new file mode 100644
index 000000000000..e51acba98e41
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/TrackedFile.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Set;
+import org.apache.iceberg.stats.ContentStats;
+import org.apache.iceberg.types.Types;
+
+/**
+ * Represents a V4 content entry in a manifest file.
+ *
+ * <p>TrackedFile is the V4 equivalent of ContentFile. It provides a unified representation for all
+ * entry types in a V4 manifest: data files, delete files, manifests, and deletion vectors.
+ */
+interface TrackedFile {
+  // Field IDs from V4 specification
+  Types.NestedField TRACKING_INFO =
+      Types.NestedField.required(
+          147, "tracking_info", TrackingInfo.schema(), "Tracking information for this entry");
+  Types.NestedField CONTENT_TYPE =
+      Types.NestedField.required(
+          134,
+          "content_type",
+          Types.IntegerType.get(),
+          "Type of content: 0=DATA, 1=POSITION_DELETES, 2=EQUALITY_DELETES, 3=DATA_MANIFEST, 4=DELETE_MANIFEST");
+  Types.NestedField LOCATION =
+      Types.NestedField.required(100, "location", Types.StringType.get(), "Location of the file");
+  Types.NestedField FILE_FORMAT =
+      Types.NestedField.required(
+          101,
+          "file_format",
+          Types.StringType.get(),
+          "String file format name: avro, orc, parquet, or puffin");
+  Types.NestedField PARTITION_SPEC_ID =
+      Types.NestedField.required(
+          149,
+          "partition_spec_id",
+          Types.IntegerType.get(),
+          "ID of partition spec used to write manifest or data/delete files");
+  Types.NestedField SORT_ORDER_ID =
+      Types.NestedField.optional(
+          140,
+          "sort_order_id",
+          Types.IntegerType.get(),
+          "ID representing sort order for this file. Can only be set if content_type is 0");
+  Types.NestedField RECORD_COUNT =
+      Types.NestedField.required(
+          103,
+          "record_count",
+          Types.LongType.get(),
+          "Number of records in this file, or the cardinality of a deletion vector");
+  Types.NestedField FILE_SIZE_IN_BYTES =
+      Types.NestedField.optional(
+          104, "file_size_in_bytes", Types.LongType.get(), "Total file size in bytes.");
+  Types.NestedField CONTENT_STATS =
+      Types.NestedField.optional(
+          146,
+          "content_stats",
+          Types.StructType.of(), // schema is derived from table schema at read/write time
+          "Content statistics for this entry");
+  Types.NestedField KEY_METADATA =
+      Types.NestedField.optional(
+          131,
+          "key_metadata",
+          Types.BinaryType.get(),
+          "Implementation-specific key metadata for encryption");
+  Types.NestedField SPLIT_OFFSETS =
+      Types.NestedField.optional(
+          132,
+          "split_offsets",
+          Types.ListType.ofRequired(133, Types.LongType.get()),
+          "Split offsets for the data file. Must be sorted ascending");
+  Types.NestedField CONTENT_INFO =
+      Types.NestedField.optional(
+          148,
+          "content_info",
+          ContentInfo.schema(),
+          "Content info. Required when content_type is POSITION_DELETES, must be null otherwise");
+  Types.NestedField EQUALITY_IDS =
+      Types.NestedField.optional(
+          135,
+          "equality_ids",
+          Types.ListType.ofRequired(136, Types.IntegerType.get()),
+          "Field ids used to determine row equality in equality delete files. Required when content=2");
+  Types.NestedField REFERENCED_FILE =
+      Types.NestedField.optional(
+          143,
+          "referenced_file",
+          Types.StringType.get(),
+          "Location of referenced data file or affiliated manifest");
+  Types.NestedField MANIFEST_STATS =
+      Types.NestedField.optional(
+          150,
+          "manifest_stats",
+          ManifestStats.schema(),
+          "Manifest statistics. Required for DATA_MANIFEST and DELETE_MANIFEST");
+  Types.NestedField MANIFEST_DV =
+      Types.NestedField.optional(
+          151,
+          "manifest_dv",
+          Types.BinaryType.get(),
+          "Serialized deletion vector for manifest entries");
+
+  /**
+   * Returns the path of the manifest which this file is referenced in or null if it was not read
+   * from a manifest.
+   */
+  String manifestLocation();
+
+  /**
+   * Returns the tracking information for this entry.
+   *
+   * <p>Contains status, snapshot ID, sequence numbers, and first-row-id. Optional - may be null if
+   * tracking info is inherited. NOTE(review): TRACKING_INFO is declared required; confirm.
+   */
+  TrackingInfo trackingInfo();
+
+  /**
+   * Returns the type of content stored by this entry.
+   *
+   * <p>One of: DATA, POSITION_DELETES, EQUALITY_DELETES, DATA_MANIFEST, or DELETE_MANIFEST.
+   */
+  FileContent contentType();
+
+  /** Returns the location of the file. */
+  String location();
+
+  /** Returns the format of the file (avro, orc, parquet, or puffin). */
+  FileFormat fileFormat();
+
+  /**
+   * Returns the content info.
+   *
+   * <p>Must be defined if content_type is POSITION_DELETES, must be null otherwise.
+   */
+  ContentInfo contentInfo();
+
+  /** Returns the ID of the partition spec used to write this file or manifest. */
+  int partitionSpecId();
+
+  /**
+   * Returns the ID representing sort order for this file.
+   *
+   * <p>Can only be set if content_type is DATA.
+   */
+  Integer sortOrderId();
+
+  /** Returns the number of records in this file, or the cardinality of a deletion vector. */
+  long recordCount();
+
+  /**
+   * Returns the total file size in bytes.
+   *
+   * <p>Must be defined if location is defined. NOTE(review): schema field is optional; confirm.
+   */
+  long fileSizeInBytes();
+
+  /** Returns the content stats for this entry. */
+  ContentStats contentStats();
+
+  /**
+   * Returns the manifest stats for this entry.
+   *
+   * <p>Must be set if content_type is DATA_MANIFEST or DELETE_MANIFEST, otherwise must be null.
+   */
+  ManifestStats manifestStats();
+
+  /**
+   * Returns the manifest deletion vector for this entry.
+   *
+   * <p>When present, this is a serialized deletion vector where each set bit position corresponds
+   * to an entry in the manifest that should be treated as deleted. This allows marking manifest
+   * entries as deleted without rewriting the manifest file.
+   *
+   * <p>Optional for DATA_MANIFEST and DELETE_MANIFEST content types, must be null otherwise.
+   */
+  ByteBuffer manifestDV();
+
+  /** Returns metadata about how this file is encrypted, or null if stored in plain text. */
+  ByteBuffer keyMetadata();
+
+  /**
+   * Returns list of recommended split locations, if applicable, null otherwise.
+   *
+   * <p>Must be sorted in ascending order.
+   */
+  List<Long> splitOffsets();
+
+  /**
+   * Returns the set of field IDs used for equality comparison, in equality delete files.
+   *
+   * <p>Required when content_type is EQUALITY_DELETES, must be null otherwise.
+   */
+  List<Integer> equalityIds();
+
+  /**
+   * Returns the location of the referenced file.
+   *
+   * <p>For POSITION_DELETES: location of the data file that the deletion vector references.
+   *
+   * <p>For DELETE_MANIFEST: location of the affiliated data manifest, or null if unaffiliated.
+   */
+  String referencedFile();
+
+  /**
+   * Copies this tracked file.
+   *
+   * <p>Manifest readers can reuse file instances; use this method to copy data when collecting
+   * files from tasks.
+   */
+  TrackedFile copy();
+
+  /**
+   * Copies this tracked file without stats.
+   *
+   * <p>Use this method to copy data without stats when collecting files.
+   */
+  TrackedFile copyWithoutStats();
+
+  /**
+   * Copies this tracked file with stats only for specific columns.
+   *
+   * <p>Manifest readers can reuse file instances; use this method to copy data with stats only for
+   * specific columns when collecting files.
+   *
+   * @param requestedColumnIds column IDs for which to keep stats
+   * @return a copy of this tracked file, with content stats for only the requested columns
+   */
+  TrackedFile copyWithStats(Set<Integer> requestedColumnIds);
+
+  /**
+   * Converts this tracked file to a DataFile.
+   *
+   * <p>Only valid when content_type is DATA. The partition spec must have been set on this
+   * TrackedFile prior to calling this method (typically done by the manifest reader).
+   *
+   * @return a DataFile representation
+   * @throws IllegalStateException if content_type is not DATA or partition spec is not set
+   */
+  DataFile asDataFile();
+
+  /**
+   * Converts this tracked file to a DeleteFile.
+   *
+   * <p>Only valid when content_type is POSITION_DELETES or EQUALITY_DELETES. The partition spec
+   * must have been set on this TrackedFile prior to calling this method (typically done by the
+   * manifest reader).
+   *
+   * @return a DeleteFile representation
+   * @throws IllegalStateException if content_type is not a delete type or partition spec is not set
+   */
+  DeleteFile asDeleteFile();
+}
diff --git a/core/src/main/java/org/apache/iceberg/TrackingInfo.java b/core/src/main/java/org/apache/iceberg/TrackingInfo.java
new file mode 100644
index 000000000000..330d6e35ff71
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/TrackingInfo.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import org.apache.iceberg.types.Types;
+
+/**
+ * Tracking information for a tracked file entry in a V4 manifest.
+ *
+ * <p>This groups the status, snapshot, and sequence number information for the entry. This enables
+ * accessing the fields for the entry and provides an isolated structure that can be modified.
+ */
+interface TrackingInfo {
+ Types.NestedField STATUS =
+ Types.NestedField.required(
+ 0, "status", Types.IntegerType.get(), "Entry status: 0=existing, 1=added, 2=deleted");
+ Types.NestedField SNAPSHOT_ID =
+ Types.NestedField.optional(
+ 1,
+ "snapshot_id",
+ Types.LongType.get(),
+ "Snapshot ID where the file was added or deleted");
+ Types.NestedField SEQUENCE_NUMBER =
+ Types.NestedField.optional(
+ 3, "sequence_number", Types.LongType.get(), "Data sequence number of the file");
+ Types.NestedField FILE_SEQUENCE_NUMBER =
+ Types.NestedField.optional(
+ 4,
+ "file_sequence_number",
+ Types.LongType.get(),
+ "File sequence number indicating when the file was added");
+ Types.NestedField FIRST_ROW_ID =
+ Types.NestedField.optional(
+ 142, "first_row_id", Types.LongType.get(), "ID of the first row in the data file");
+
+ static Types.StructType schema() {
+ return Types.StructType.of(
+ STATUS, SNAPSHOT_ID, SEQUENCE_NUMBER, FILE_SEQUENCE_NUMBER, FIRST_ROW_ID);
+ }
+
+ /**
+ * Returns the status of the entry.
+ *
+ * <p>Status values:
+ *
+ * <ul>
+ * <li>0: EXISTING
+ * <li>1: ADDED
+ * <li>2: DELETED
+ * </ul>
+ */
+ ManifestEntryStatus status();
+
+ /** Returns the snapshot ID where the file was added or deleted. */
+ Long snapshotId();
+
+ /** Returns the data sequence number of the file. */
+ Long dataSequenceNumber();
+
+ /** Returns the file sequence number indicating when the file was added. */
+ Long fileSequenceNumber();
+
+ /** Returns the ID of the first row in the data file. */
+ Long firstRowId();
+
+ /** Returns the path of the manifest which this entry was read from. */
+ String manifestLocation();
+
+ /** Returns the ordinal position of this entry within the manifest. */
+ long manifestPos();
+}