Skip to content

Commit d34ca90

Browse files
committed
wip
1 parent 3c56ead commit d34ca90

File tree

4 files changed

+130
-32
lines changed

4 files changed

+130
-32
lines changed

backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,34 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS
13861386
}
13871387
}
13881388

1389+
// Compares Gluten against vanilla Spark for from_unixtime(unix_timestamp(...)) when the
// date strings contain Arabic-Indic digits. The non-ASCII inputs are produced with
// cast(unbase64(...) as string) so the test source itself stays ASCII-only.
test("arabic_indic digit date") {
  withTable("tb_arabic_date") {
    sql("create table tb_arabic_date(d string) using parquet")
    // Mix of ASCII-digit rows and Arabic-Indic-digit rows in the same column.
    sql("""
          |insert into tb_arabic_date values
          |('2020-01-01'),
          |(cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string)),
          |(cast(unbase64('2aLZoNmi2aQt2aHZoi3Zo9mh') as string)),
          |('2022-10-11'),
          |(cast(unbase64('2aLZoNmi2aQt2aHZoi3Zo9mh') as string))
          |""".stripMargin)

    // 1) column input
    val columnQuery =
      "select from_unixtime(unix_timestamp(d, 'yyyy-MM-dd')) from tb_arabic_date"

    // 2) literal input written with Arabic-Indic digits
    val arabicLiteralQuery = """
                               |select from_unixtime(
                               | unix_timestamp(cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string),
                               | 'yyyy-MM-dd'))
                               |""".stripMargin

    // 3) literal input written with ASCII digits
    val asciiLiteralQuery = """
                              |select from_unixtime(unix_timestamp('2020-01-01', 'yyyy-MM-dd'))
                              |""".stripMargin

    Seq(columnQuery, arabicLiteralQuery, asciiLiteralQuery).foreach {
      query => compareResultsAgainstVanillaSpark(query, true, { _ => })
    }
  }
}
1416+
13891417
test("Test map with nullable key") {
13901418
val sql = "select map(string_field1, int_field1) from json_test where string_field1 is not null"
13911419
compareResultsAgainstVanillaSpark(sql, true, { _ => })

cpp-ch/local-engine/Functions/TranslateArabicNumber.cpp renamed to cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp

Lines changed: 95 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,36 @@
1616
*/
1717

1818

19+
#include <Columns/IColumn.h>
20+
#include <Columns/ColumnNullable.h>
21+
#include <Columns/ColumnConst.h>
22+
#include <Columns/ColumnString.h>
23+
#include <DataTypes/DataTypeNullable.h>
24+
#include <DataTypes/IDataType.h>
1925
#include <Functions/FunctionFactory.h>
2026
#include <Functions/FunctionHelpers.h>
2127
#include <Functions/IFunction.h>
2228
#include <Common/Exception.h>
23-
#include <DataTypes/IDataType.h>
24-
#include <DataTypes/DataTypeNullable.h>
25-
#include <Columns/ColumnNullable.h>
26-
#include <Columns/ColumnString.h>
29+
#include <Common/logger_useful.h>
2730

2831
namespace DB
2932
{
3033
namespace ErrorCodes
3134
{
32-
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
35+
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
3336
}
3437
}
3538

3639
namespace local_engine
3740
{
3841
// Since Spark 3.3, unix_timestamp supports Arabic-Indic digit input, e.g., "٢٠٢١-٠٧-٠١ ١٢:٠٠:٠٠".
39-
// We implement a function to translate arabic number to normal number here.
40-
class TranslateArabicNumberFunction : public DB::IFunction
42+
// We implement a function to translate arabic indic digits to ascii digits here.
43+
class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
4144
{
4245
public:
43-
static constexpr auto name = "translateArabicNumber";
46+
static constexpr auto name = "arabic_indic_to_ascii_digit_for_date";
4447

45-
static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared<TranslateArabicNumberFunction>(); }
48+
static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared<ArabicIndicToAsciiDigitForDateFunction>(); }
4649

4750
String getName() const override { return name; }
4851

@@ -53,7 +56,7 @@ class TranslateArabicNumberFunction : public DB::IFunction
5356
{
5457
auto nested_type = DB::removeNullable(arguments[0]);
5558
if (!DB::WhichDataType(nested_type).isString())
56-
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String", getName());
59+
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), arguments[0]->getName());
5760
return arguments[0];
5861
}
5962

@@ -63,6 +66,33 @@ class TranslateArabicNumberFunction : public DB::IFunction
6366
const DB::ColumnString * col_str = nullptr;
6467
const DB::ColumnNullable * col_nullable = nullptr;
6568
const DB::NullMap * null_map = nullptr;
69+
if (data_col->isConst())
70+
{
71+
if (data_col->isNullAt(0))
72+
{
73+
return data_col;
74+
}
75+
const DB::ColumnConst * col_const = DB::checkAndGetColumn<DB::ColumnConst>(data_col.get());
76+
data_col = col_const->getDataColumnPtr();
77+
if (data_col->isNullable())
78+
{
79+
col_nullable = DB::checkAndGetColumn<DB::ColumnNullable>(data_col.get());
80+
null_map = &(col_nullable->getNullMapData());
81+
col_str = DB::checkAndGetColumn<DB::ColumnString>(&(col_nullable->getNestedColumn()));
82+
}
83+
else
84+
{
85+
col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
86+
}
87+
if (!col_str)
88+
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName());
89+
auto date_str = col_str->getDataAt(0);
90+
auto new_str = convertArabicIndicDigit(date_str);
91+
auto new_data_col = data_col->cloneEmpty();
92+
new_data_col->insertData(new_str.c_str(), new_str.size());
93+
return DB::ColumnConst::create(std::move(new_data_col), input_rows_count);
94+
}
95+
6696
if (data_col->isNullable())
6797
{
6898
col_nullable = DB::checkAndGetColumn<DB::ColumnNullable>(data_col.get());
@@ -74,11 +104,10 @@ class TranslateArabicNumberFunction : public DB::IFunction
74104
col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
75105
}
76106
if (!col_str)
77-
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String", getName());
107+
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName());
78108

79109
auto nested_data_col = DB::removeNullable(arguments[0].column);
80-
bool has_arabic_number = false;
81-
bool has_normal_number = false;
110+
bool has_arabic_indic_digit = false;
82111
size_t row_index = 0;
83112
for (row_index = 0; row_index < input_rows_count; ++row_index)
84113
{
@@ -87,26 +116,21 @@ class TranslateArabicNumberFunction : public DB::IFunction
87116
continue;
88117
}
89118
auto str = col_str->getDataAt(row_index);
90-
if (hasArabicNumber(str))
119+
if (hasArabicIndicDigit(str))
91120
{
92-
has_arabic_number = true;
93-
}
94-
else
95-
{
96-
has_normal_number = true;
97-
}
98-
99-
if (has_arabic_number)
121+
has_arabic_indic_digit = true;
100122
break;
123+
}
101124
}
102-
if (!has_arabic_number)
125+
126+
if (!has_arabic_indic_digit)
103127
{
104-
// No Arabic number found, return the original column
128+
// No Arabic indic digits found, return the original column
105129
return arguments[0].column;
106130
}
107131

108132
auto res_col = data_col->cloneEmpty();
109-
if (has_normal_number)
133+
if (row_index)
110134
{
111135
res_col->insertManyFrom(*data_col, 0, row_index);
112136
}
@@ -117,13 +141,16 @@ class TranslateArabicNumberFunction : public DB::IFunction
117141
res_col->insertDefault();
118142
continue;
119143
}
144+
auto str = convertArabicIndicDigit(col_str->getDataAt(row_index));
145+
res_col->insertData(str.c_str(), str.size());
120146
}
121147
return res_col;
122148
}
123-
private:
124149

125-
bool hasArabicNumber(StringRef str) const
150+
private:
151+
bool hasArabicIndicDigit(StringRef str) const
126152
{
153+
// In most cases, the first byte is a digit.
127154
char c = reinterpret_cast<char>(str.data[0]);
128155
if ('0' <= c && c <= '9')
129156
{
@@ -132,12 +159,50 @@ class TranslateArabicNumberFunction : public DB::IFunction
132159
return true;
133160
}
134161

162+
163+
// True when code point `c` lies in the inclusive range U+0660..U+0669.
bool isArabicIndicDigit(char32_t c) const { return 0x0660 <= c && c <= 0x0669; }
164+
// Maps a code point to a char by offsetting it from U+0660 onto '0'; callers are expected
// to pass only code points accepted by isArabicIndicDigit.
char toAsciiDigit(char32_t c) const { return static_cast<char>('0' + (c - 0x0660)); }
165+
166+
// Returns a copy of `str` in which every Arabic-Indic digit (U+0660..U+0669, encoded in
// UTF-8 as the two bytes 0xD9 0xA0..0xA9) is replaced by the matching ASCII digit.
// Every other byte is copied through unchanged, so:
//   * non-digit multi-byte characters keep their full UTF-8 encoding (they are no longer
//     narrowed to a single truncated byte),
//   * malformed or truncated sequences are passed through byte by byte instead of reading
//     past the end of the buffer,
//   * the scan always advances, so stray continuation bytes cannot stall the loop.
String convertArabicIndicDigit(const StringRef & str) const
{
    std::string result;
    result.reserve(str.size);

    size_t pos = 0;
    while (pos < str.size)
    {
        const auto lead = static_cast<unsigned char>(str.data[pos]);

        // Arabic-Indic digits are two-byte sequences, so only a two-byte lead followed by an
        // in-bounds continuation byte needs decoding.
        if ((lead & 0xE0) == 0xC0 && pos + 1 < str.size)
        {
            const auto trail = static_cast<unsigned char>(str.data[pos + 1]);
            if ((trail & 0xC0) == 0x80)
            {
                const char32_t code_point = (static_cast<char32_t>(lead & 0x1F) << 6) | static_cast<char32_t>(trail & 0x3F);
                if (isArabicIndicDigit(code_point))
                {
                    result.push_back(toAsciiDigit(code_point));
                    pos += 2;
                    continue;
                }
            }
        }

        // Anything else (ASCII, other UTF-8 sequences, invalid bytes) is preserved verbatim.
        result.push_back(str.data[pos]);
        ++pos;
    }
    return result;
}
135201
};
136202

137203
using namespace DB;
138-
REGISTER_FUNCTION(TranslateArabicNumberFunction)
204+
REGISTER_FUNCTION(ArabicIndicToAsciiDigitForDate)
139205
{
140-
factory.registerFunction<TranslateArabicNumberFunction>();
206+
factory.registerFunction<ArabicIndicToAsciiDigitForDateFunction>();
141207
}
142208
}
143-

cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class FunctionParserGetTimestamp : public FunctionParser
6161
auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
6262
if (parsed_args.size() != 2)
6363
throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly two arguments", getName());
64-
const auto * expr_arg = parsed_args[0];
64+
const auto * expr_arg = convertArabicIndicDigit(actions_dag, parsed_args[0]);
6565
const auto * fmt_arg = parsed_args[1];
6666

6767
const auto & args = substrait_func.arguments();
@@ -129,5 +129,11 @@ class FunctionParserGetTimestamp : public FunctionParser
129129
return std::regex_match(fmt, fmtPattern);
130130
}
131131
}
132+
133+
// Builds, via toFunctionNode, a DAG node that applies the function named
// "arabic_indic_to_ascii_digit_for_date" to `node`, and returns that new node.
const DB::ActionsDAG::Node * convertArabicIndicDigit(DB::ActionsDAG & actions_dag, const DB::ActionsDAG::Node * node) const
{
    return toFunctionNode(actions_dag, "arabic_indic_to_ascii_digit_for_date", {node});
}
132138
};
133139
}

cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ class FunctionParserUnixTimestamp : public FunctionParserGetTimestamp
5959
throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly two arguments", getName());
6060

6161
const auto * expr_arg = parsed_args[0];
62-
const auto * fmt_arg = parsed_args[1];
6362
auto expr_type = removeNullable(expr_arg->result_type);
6463
if (isString(expr_type))
6564
return FunctionParserGetTimestamp::parse(substrait_func, actions_dag);

0 commit comments

Comments
 (0)