Skip to content

Commit 729969d

Browse files
committed
Orc, Data: Implementation of ORCFormatModel
1 parent 235ab0f commit 729969d

4 files changed

Lines changed: 342 additions & 6 deletions

File tree

data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,12 @@
2121
import org.apache.iceberg.avro.AvroFormatModel;
2222
import org.apache.iceberg.data.avro.DataWriter;
2323
import org.apache.iceberg.data.avro.PlannedDataReader;
24+
import org.apache.iceberg.data.orc.GenericOrcReader;
25+
import org.apache.iceberg.data.orc.GenericOrcWriter;
2426
import org.apache.iceberg.data.parquet.GenericParquetReaders;
2527
import org.apache.iceberg.data.parquet.GenericParquetWriter;
2628
import org.apache.iceberg.formats.FormatModelRegistry;
29+
import org.apache.iceberg.orc.ORCFormatModel;
2730
import org.apache.iceberg.parquet.ParquetFormatModel;
2831

2932
public class GenericFormatModels {
@@ -48,6 +51,17 @@ public static void register() {
4851
GenericParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant)));
4952

5053
FormatModelRegistry.register(ParquetFormatModel.forPositionDeletes());
54+
55+
FormatModelRegistry.register(
56+
ORCFormatModel.create(
57+
Record.class,
58+
Void.class,
59+
(icebergSchema, fileSchema, engineSchema) ->
60+
GenericOrcWriter.buildWriter(icebergSchema, fileSchema),
61+
(icebergSchema, fileSchema, engineSchema, idToConstant) ->
62+
GenericOrcReader.buildReader(icebergSchema, fileSchema, idToConstant)));
63+
64+
FormatModelRegistry.register(ORCFormatModel.forPositionDeletes());
5165
}
5266

5367
private GenericFormatModels() {}

data/src/test/java/org/apache/iceberg/data/TestGenericFormatModels.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public class TestGenericFormatModels {
5555
RandomGenericData.generate(TestBase.SCHEMA, 10, 1L);
5656

5757
private static final FileFormat[] FILE_FORMATS =
58-
new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET};
58+
new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC};
5959

6060
@TempDir protected Path temp;
6161

orc/src/main/java/org/apache/iceberg/orc/ORC.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,15 @@
4545
import java.util.Locale;
4646
import java.util.Map;
4747
import java.util.Objects;
48+
import java.util.Set;
4849
import java.util.function.BiFunction;
4950
import java.util.function.Function;
5051
import java.util.stream.Collectors;
5152
import java.util.stream.IntStream;
5253
import org.apache.hadoop.conf.Configuration;
5354
import org.apache.hadoop.fs.Path;
5455
import org.apache.iceberg.FileFormat;
56+
import org.apache.iceberg.MetadataColumns;
5557
import org.apache.iceberg.MetricsConfig;
5658
import org.apache.iceberg.PartitionSpec;
5759
import org.apache.iceberg.Schema;
@@ -79,7 +81,10 @@
7981
import org.apache.iceberg.io.OutputFile;
8082
import org.apache.iceberg.mapping.NameMapping;
8183
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
84+
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
8285
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
86+
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
87+
import org.apache.iceberg.types.TypeUtil;
8388
import org.apache.iceberg.util.ArrayUtil;
8489
import org.apache.iceberg.util.PropertyUtil;
8590
import org.apache.orc.CompressionKind;
@@ -179,9 +184,8 @@ public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
179184
return this;
180185
}
181186

182-
// Supposed to always be a private method used strictly by data and delete write builders
183-
private WriteBuilder createContextFunc(
184-
Function<Map<String, String>, Context> newCreateContextFunc) {
187+
// supposed to always be a private method used strictly by data and delete write builders
188+
WriteBuilder createContextFunc(Function<Map<String, String>, Context> newCreateContextFunc) {
185189
this.createContextFunc = newCreateContextFunc;
186190
return this;
187191
}
@@ -219,7 +223,7 @@ public <D> FileAppender<D> build() {
219223
metricsConfig);
220224
}
221225

222-
private static class Context {
226+
static class Context {
223227
private final long stripeSize;
224228
private final long blockSize;
225229
private final int vectorizedRowBatchSize;
@@ -699,6 +703,7 @@ public static class ReadBuilder {
699703
private Function<TypeDescription, OrcRowReader<?>> readerFunc;
700704
private Function<TypeDescription, OrcBatchReader<?>> batchedReaderFunc;
701705
private int recordsPerBatch = VectorizedRowBatch.DEFAULT_SIZE;
706+
private Set<Integer> constantFieldIds = ImmutableSet.of();
702707

703708
private ReadBuilder(InputFile file) {
704709
Preconditions.checkNotNull(file, "Input file cannot be null");
@@ -775,12 +780,18 @@ public ReadBuilder withNameMapping(NameMapping newNameMapping) {
775780
return this;
776781
}
777782

783+
ReadBuilder constantFieldIds(Set<Integer> newConstantFieldIds) {
784+
this.constantFieldIds = newConstantFieldIds;
785+
return this;
786+
}
787+
778788
public <D> CloseableIterable<D> build() {
779789
Preconditions.checkNotNull(schema, "Schema is required");
780790
return new OrcIterable<>(
781791
file,
782792
conf,
783-
schema,
793+
TypeUtil.selectNot(
794+
schema, Sets.union(constantFieldIds, MetadataColumns.metadataFieldIds())),
784795
nameMapping,
785796
start,
786797
length,

0 commit comments

Comments (0)