Skip to content

Commit 5a57928

Browse files
committed
[bug](parquet) fix parquet type not handle float16 type
1 parent 507b51b commit 5a57928

File tree

3 files changed

+88
-0
lines changed

3 files changed

+88
-0
lines changed

be/src/vec/exec/format/parquet/parquet_column_convert.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,11 @@ std::unique_ptr<PhysicalToLogicalConverter> PhysicalToLogicalConverter::get_conv
230230
// for FixedSizeBinary
231231
physical_converter =
232232
std::make_unique<FixedSizeBinaryConverter>(parquet_schema.type_length);
233+
} else if (src_logical_primitive == TYPE_FLOAT &&
234+
src_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY &&
235+
parquet_schema.logicalType.__isset.FLOAT16) {
236+
physical_converter =
237+
std::make_unique<Float16PhysicalConverter>(parquet_schema.type_length);
233238
} else {
234239
physical_converter = std::make_unique<ConsistentPhysicalConverter>();
235240
}

be/src/vec/exec/format/parquet/parquet_column_convert.h

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <gen_cpp/parquet_types.h>
2121

2222
#include "common/cast_set.h"
23+
#include "runtime/primitive_type.h"
2324
#include "vec/columns/column_varbinary.h"
2425
#include "vec/core/extended_types.h"
2526
#include "vec/core/field.h"
@@ -354,6 +355,86 @@ class FixedSizeBinaryConverter : public PhysicalToLogicalConverter {
354355
}
355356
};
356357

358+
class Float16PhysicalConverter : public PhysicalToLogicalConverter {
359+
private:
360+
int _type_length;
361+
362+
public:
363+
Float16PhysicalConverter(int type_length) : _type_length(type_length) {
364+
DCHECK_EQ(_type_length, 2);
365+
}
366+
367+
Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override {
368+
ColumnPtr from_col = remove_nullable(src_physical_col);
369+
MutableColumnPtr to_col = remove_nullable(src_logical_column)->assume_mutable();
370+
371+
const auto* src_data = assert_cast<const ColumnUInt8*>(from_col.get());
372+
size_t length = src_data->size();
373+
size_t num_values = length / _type_length;
374+
auto* to_float_column = assert_cast<ColumnFloat32*>(to_col.get());
375+
const uint8_t* ptr = src_data->get_data().data();
376+
for (int i = 0; i < num_values; ++i) {
377+
size_t offset = i * _type_length;
378+
const uint8_t* data_ptr = ptr + offset;
379+
uint16_t raw;
380+
memcpy(&raw, data_ptr, sizeof(uint16_t));
381+
float value = half_to_float(raw);
382+
to_float_column->insert_value(value);
383+
}
384+
385+
return Status::OK();
386+
}
387+
388+
float half_to_float(uint16_t h) {
389+
// uint16_t h: half precision floating point
390+
// bit 15: sign(1 bit)
391+
// bits 14..10 : exponent(5 bits)
392+
// bits 9..0 : mantissa(10 bits)
393+
394+
// sign bit placed to float32 bit31
395+
uint32_t sign = (h & 0x8000U) << 16; // 0x8000 << 16 = 0x8000_0000
396+
// exponent:(5 bits)
397+
uint32_t exp = (h & 0x7C00U) >> 10; // 0x7C00 = 0111 1100 0000 (half exponent mask)
398+
// mantissa(10 bits)
399+
uint32_t mant = (h & 0x03FFU); // 10-bit fraction
400+
401+
// cases:Zero/Subnormal, Normal, Inf/NaN
402+
if (exp == 0) {
403+
// exp==0: Zero or Subnormal ----------
404+
if (mant == 0) {
405+
// ±0.0
406+
// sign = either 0x00000000 or 0x80000000
407+
return std::bit_cast<float>(sign);
408+
} else {
409+
// ---------- Subnormal ----------
410+
// half subnormal:
411+
// value = (-1)^sign * (mant / 2^10) * 2^(1 - bias)
412+
// half bias = 15 → exponent = 1 - 15 = -14
413+
float f = (static_cast<float>(mant) / 1024.0F) * std::powf(2.0F, -14.0F);
414+
return sign ? -f : f;
415+
}
416+
} else if (exp == 0x1F) {
417+
// exp==31: Inf or NaN ----------
418+
// float32:
419+
// exponent = 255 (0xFF)
420+
// mantissa = mant << 13
421+
uint32_t f = sign | 0x7F800000U | (mant << 13);
422+
return std::bit_cast<float>(f);
423+
} else {
424+
// Normalized ----------
425+
// float32 exponent:
426+
// exp32 = exp16 - bias16 + bias32
427+
// bias16 = 15
428+
// bias32 = 127
429+
//
430+
// so: exp32 = exp + (127 - 15)
431+
uint32_t f = sign | ((exp + (127 - 15)) << 23) // place to float32 exponent
432+
| (mant << 13); // mantissa align to 23 bits
433+
return std::bit_cast<float>(f);
434+
}
435+
}
436+
};
437+
357438
class UUIDVarBinaryConverter : public PhysicalToLogicalConverter {
358439
public:
359440
UUIDVarBinaryConverter(int type_length) : _type_length(type_length) {}

be/src/vec/exec/format/parquet/schema_desc.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,8 @@ std::pair<DataTypePtr, bool> FieldDescriptor::convert_to_doris_type(
308308
} else if (logicalType.__isset.UUID) {
309309
ans.first =
310310
DataTypeFactory::instance().create_data_type(TYPE_VARBINARY, nullable, -1, -1, 16);
311+
} else if (logicalType.__isset.FLOAT16) {
312+
ans.first = DataTypeFactory::instance().create_data_type(TYPE_FLOAT, nullable);
311313
} else {
312314
throw Exception(Status::InternalError("Not supported parquet logicalType"));
313315
}

0 commit comments

Comments
 (0)