|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +package org.apache.doris.datasource.property.fileformat; |
| 19 | + |
| 20 | +import org.apache.doris.analysis.Separator; |
| 21 | +import org.apache.doris.catalog.Column; |
| 22 | +import org.apache.doris.common.util.Util; |
| 23 | +import org.apache.doris.nereids.exceptions.AnalysisException; |
| 24 | +import org.apache.doris.qe.ConnectContext; |
| 25 | +import org.apache.doris.thrift.TFileAttributes; |
| 26 | +import org.apache.doris.thrift.TFileFormatType; |
| 27 | +import org.apache.doris.thrift.TFileTextScanRangeParams; |
| 28 | +import org.apache.doris.thrift.TResultFileSinkOptions; |
| 29 | +import org.apache.doris.thrift.TTextSerdeType; |
| 30 | + |
| 31 | +import com.google.common.base.Strings; |
| 32 | +import com.google.common.collect.Lists; |
| 33 | +import org.apache.logging.log4j.LogManager; |
| 34 | +import org.apache.logging.log4j.Logger; |
| 35 | + |
| 36 | +import java.util.List; |
| 37 | +import java.util.Map; |
| 38 | + |
| 39 | +public class CsvFileFormatProperties extends FileFormatProperties { |
| 40 | + public static final Logger LOG = LogManager.getLogger( |
| 41 | + org.apache.doris.datasource.property.fileformat.CsvFileFormatProperties.class); |
| 42 | + |
| 43 | + public static final String DEFAULT_COLUMN_SEPARATOR = "\t"; |
| 44 | + public static final String DEFAULT_HIVE_TEXT_COLUMN_SEPARATOR = "\001"; |
| 45 | + public static final String DEFAULT_LINE_DELIMITER = "\n"; |
| 46 | + |
| 47 | + public static final String PROP_COLUMN_SEPARATOR = "column_separator"; |
| 48 | + public static final String PROP_LINE_DELIMITER = "line_delimiter"; |
| 49 | + |
| 50 | + public static final String PROP_SKIP_LINES = "skip_lines"; |
| 51 | + public static final String PROP_CSV_SCHEMA = "csv_schema"; |
| 52 | + public static final String PROP_COMPRESS_TYPE = "compress_type"; |
| 53 | + public static final String PROP_TRIM_DOUBLE_QUOTES = "trim_double_quotes"; |
| 54 | + |
| 55 | + public static final String PROP_ENCLOSE = "enclose"; |
| 56 | + |
| 57 | + private String headerType = ""; |
| 58 | + private TTextSerdeType textSerdeType = TTextSerdeType.JSON_TEXT_SERDE; |
| 59 | + private String columnSeparator = DEFAULT_COLUMN_SEPARATOR; |
| 60 | + private String lineDelimiter = DEFAULT_LINE_DELIMITER; |
| 61 | + private boolean trimDoubleQuotes; |
| 62 | + private int skipLines; |
| 63 | + private byte enclose; |
| 64 | + |
| 65 | + // used by tvf |
| 66 | + // User specified csv columns, it will override columns got from file |
| 67 | + private final List<Column> csvSchema = Lists.newArrayList(); |
| 68 | + |
| 69 | + String defaultColumnSeparator = DEFAULT_COLUMN_SEPARATOR; |
| 70 | + |
| 71 | + public CsvFileFormatProperties() { |
| 72 | + super(TFileFormatType.FORMAT_CSV_PLAIN); |
| 73 | + } |
| 74 | + |
| 75 | + public CsvFileFormatProperties(String defaultColumnSeparator, TTextSerdeType textSerdeType) { |
| 76 | + super(TFileFormatType.FORMAT_CSV_PLAIN); |
| 77 | + this.defaultColumnSeparator = defaultColumnSeparator; |
| 78 | + this.textSerdeType = textSerdeType; |
| 79 | + } |
| 80 | + |
| 81 | + public CsvFileFormatProperties(String headerType) { |
| 82 | + super(TFileFormatType.FORMAT_CSV_PLAIN); |
| 83 | + this.headerType = headerType; |
| 84 | + } |
| 85 | + |
| 86 | + |
| 87 | + @Override |
| 88 | + public void analyzeFileFormatProperties(Map<String, String> formatProperties, boolean isRemoveOriginProperty) |
| 89 | + throws AnalysisException { |
| 90 | + try { |
| 91 | + // analyze properties specified by user |
| 92 | + columnSeparator = getOrDefault(formatProperties, PROP_COLUMN_SEPARATOR, |
| 93 | + defaultColumnSeparator, isRemoveOriginProperty); |
| 94 | + if (Strings.isNullOrEmpty(columnSeparator)) { |
| 95 | + throw new AnalysisException("column_separator can not be empty."); |
| 96 | + } |
| 97 | + columnSeparator = Separator.convertSeparator(columnSeparator); |
| 98 | + |
| 99 | + lineDelimiter = getOrDefault(formatProperties, PROP_LINE_DELIMITER, |
| 100 | + DEFAULT_LINE_DELIMITER, isRemoveOriginProperty); |
| 101 | + if (Strings.isNullOrEmpty(lineDelimiter)) { |
| 102 | + throw new AnalysisException("line_delimiter can not be empty."); |
| 103 | + } |
| 104 | + lineDelimiter = Separator.convertSeparator(lineDelimiter); |
| 105 | + |
| 106 | + String enclosedString = getOrDefault(formatProperties, PROP_ENCLOSE, |
| 107 | + "", isRemoveOriginProperty); |
| 108 | + if (!Strings.isNullOrEmpty(enclosedString)) { |
| 109 | + if (enclosedString.length() > 1) { |
| 110 | + throw new AnalysisException("enclose should not be longer than one byte."); |
| 111 | + } |
| 112 | + enclose = (byte) enclosedString.charAt(0); |
| 113 | + if (enclose == 0) { |
| 114 | + throw new AnalysisException("enclose should not be byte [0]."); |
| 115 | + } |
| 116 | + } |
| 117 | + |
| 118 | + trimDoubleQuotes = Boolean.valueOf(getOrDefault(formatProperties, |
| 119 | + PROP_TRIM_DOUBLE_QUOTES, "", isRemoveOriginProperty)) |
| 120 | + .booleanValue(); |
| 121 | + skipLines = Integer.valueOf(getOrDefault(formatProperties, |
| 122 | + PROP_SKIP_LINES, "0", isRemoveOriginProperty)).intValue(); |
| 123 | + if (skipLines < 0) { |
| 124 | + throw new AnalysisException("skipLines should not be less than 0."); |
| 125 | + } |
| 126 | + |
| 127 | + String compressTypeStr = getOrDefault(formatProperties, |
| 128 | + PROP_COMPRESS_TYPE, "UNKNOWN", isRemoveOriginProperty); |
| 129 | + compressionType = Util.getFileCompressType(compressTypeStr); |
| 130 | + |
| 131 | + } catch (org.apache.doris.common.AnalysisException e) { |
| 132 | + throw new AnalysisException(e.getMessage()); |
| 133 | + } |
| 134 | + } |
| 135 | + |
| 136 | + @Override |
| 137 | + public void fullTResultFileSinkOptions(TResultFileSinkOptions sinkOptions) { |
| 138 | + sinkOptions.setColumnSeparator(columnSeparator); |
| 139 | + sinkOptions.setLineDelimiter(lineDelimiter); |
| 140 | + } |
| 141 | + |
| 142 | + // The method `analyzeFileFormatProperties` must have been called once before this method |
| 143 | + @Override |
| 144 | + public TFileAttributes toTFileAttributes() { |
| 145 | + TFileAttributes fileAttributes = new TFileAttributes(); |
| 146 | + TFileTextScanRangeParams fileTextScanRangeParams = new TFileTextScanRangeParams(); |
| 147 | + fileTextScanRangeParams.setColumnSeparator(this.columnSeparator); |
| 148 | + fileTextScanRangeParams.setLineDelimiter(this.lineDelimiter); |
| 149 | + if (this.enclose != 0) { |
| 150 | + fileTextScanRangeParams.setEnclose(this.enclose); |
| 151 | + } |
| 152 | + fileAttributes.setTextParams(fileTextScanRangeParams); |
| 153 | + fileAttributes.setHeaderType(headerType); |
| 154 | + fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes); |
| 155 | + fileAttributes.setSkipLines(skipLines); |
| 156 | + fileAttributes.setEnableTextValidateUtf8( |
| 157 | + ConnectContext.get().getSessionVariable().enableTextValidateUtf8); |
| 158 | + return fileAttributes; |
| 159 | + } |
| 160 | + |
| 161 | + public String getHeaderType() { |
| 162 | + return headerType; |
| 163 | + } |
| 164 | + |
| 165 | + public TTextSerdeType getTextSerdeType() { |
| 166 | + return textSerdeType; |
| 167 | + } |
| 168 | + |
| 169 | + public String getColumnSeparator() { |
| 170 | + return columnSeparator; |
| 171 | + } |
| 172 | + |
| 173 | + public String getLineDelimiter() { |
| 174 | + return lineDelimiter; |
| 175 | + } |
| 176 | + |
| 177 | + public boolean isTrimDoubleQuotes() { |
| 178 | + return trimDoubleQuotes; |
| 179 | + } |
| 180 | + |
| 181 | + public int getSkipLines() { |
| 182 | + return skipLines; |
| 183 | + } |
| 184 | + |
| 185 | + public byte getEnclose() { |
| 186 | + return enclose; |
| 187 | + } |
| 188 | + |
| 189 | + public List<Column> getCsvSchema() { |
| 190 | + return csvSchema; |
| 191 | + } |
| 192 | +} |
0 commit comments