1616 */
1717
1818
19+ #include < Columns/IColumn.h>
20+ #include < Columns/ColumnNullable.h>
21+ #include < Columns/ColumnConst.h>
22+ #include < Columns/ColumnString.h>
23+ #include < DataTypes/DataTypeNullable.h>
24+ #include < DataTypes/IDataType.h>
1925#include < Functions/FunctionFactory.h>
2026#include < Functions/FunctionHelpers.h>
2127#include < Functions/IFunction.h>
2228#include < Common/Exception.h>
23- #include < DataTypes/IDataType.h>
24- #include < DataTypes/DataTypeNullable.h>
25- #include < Columns/ColumnNullable.h>
26- #include < Columns/ColumnString.h>
29+ #include < Common/logger_useful.h>
2730
2831namespace DB
2932{
3033namespace ErrorCodes
3134{
32- extern const int ILLEGAL_TYPE_OF_ARGUMENT;
35+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
3336}
3437}
3538
3639namespace local_engine
3740{
3841// Since Spark 3.3, unix_timestamp supports input written with Arabic-Indic digits, e.g., "٢٠٢١-٠٧-٠١ ١٢:٠٠:٠٠".
39- // We implement a function to translate arabic number to normal number here.
40- class TranslateArabicNumberFunction : public DB ::IFunction
42+ // We implement a function to translate arabic indic digits to ascii digits here.
43+ class ArabicIndicToAsciiDigitForDateFunction : public DB ::IFunction
4144{
4245public:
43- static constexpr auto name = " translateArabicNumber " ;
46+ static constexpr auto name = " arabic_indic_to_ascii_digit_for_date " ;
4447
45- static DB::FunctionPtr create (DB::ContextPtr) { return std::make_shared<TranslateArabicNumberFunction >(); }
48+ static DB::FunctionPtr create (DB::ContextPtr) { return std::make_shared<ArabicIndicToAsciiDigitForDateFunction >(); }
4649
4750 String getName () const override { return name; }
4851
@@ -53,7 +56,7 @@ class TranslateArabicNumberFunction : public DB::IFunction
5356 {
5457 auto nested_type = DB::removeNullable (arguments[0 ]);
5558 if (!DB::WhichDataType (nested_type).isString ())
56- throw DB::Exception (DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, " Argument for function {} must be String" , getName ());
59+ throw DB::Exception (DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, " Argument for function {} must be String, but got {} " , getName (), arguments[ 0 ]-> getName ());
5760 return arguments[0 ];
5861 }
5962
@@ -63,6 +66,33 @@ class TranslateArabicNumberFunction : public DB::IFunction
6366 const DB::ColumnString * col_str = nullptr ;
6467 const DB::ColumnNullable * col_nullable = nullptr ;
6568 const DB::NullMap * null_map = nullptr ;
69+ if (data_col->isConst ())
70+ {
71+ if (data_col->isNullAt (0 ))
72+ {
73+ return data_col;
74+ }
75+ const DB::ColumnConst * col_const = DB::checkAndGetColumn<DB::ColumnConst>(data_col.get ());
76+ data_col = col_const->getDataColumnPtr ();
77+ if (data_col->isNullable ())
78+ {
79+ col_nullable = DB::checkAndGetColumn<DB::ColumnNullable>(data_col.get ());
80+ null_map = &(col_nullable->getNullMapData ());
81+ col_str = DB::checkAndGetColumn<DB::ColumnString>(&(col_nullable->getNestedColumn ()));
82+ }
83+ else
84+ {
85+ col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get ());
86+ }
87+ if (!col_str)
88+ throw DB::Exception (DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, " Argument for function {} must be String, but got {}" , getName (), data_col->getName ());
89+ auto date_str = col_str->getDataAt (0 );
90+ auto new_str = convertArabicIndicDigit (date_str);
91+ auto new_data_col = data_col->cloneEmpty ();
92+ new_data_col->insertData (new_str.c_str (), new_str.size ());
93+ return DB::ColumnConst::create (std::move (new_data_col), input_rows_count);
94+ }
95+
6696 if (data_col->isNullable ())
6797 {
6898 col_nullable = DB::checkAndGetColumn<DB::ColumnNullable>(data_col.get ());
@@ -74,11 +104,10 @@ class TranslateArabicNumberFunction : public DB::IFunction
74104 col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get ());
75105 }
76106 if (!col_str)
77- throw DB::Exception (DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, " Argument for function {} must be String" , getName ());
107+ throw DB::Exception (DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, " Argument for function {} must be String, but got {} " , getName (), data_col-> getName ());
78108
79109 auto nested_data_col = DB::removeNullable (arguments[0 ].column );
80- bool has_arabic_number = false ;
81- bool has_normal_number = false ;
110+ bool has_arabic_indic_digit = false ;
82111 size_t row_index = 0 ;
83112 for (row_index = 0 ; row_index < input_rows_count; ++row_index)
84113 {
@@ -87,26 +116,21 @@ class TranslateArabicNumberFunction : public DB::IFunction
87116 continue ;
88117 }
89118 auto str = col_str->getDataAt (row_index);
90- if (hasArabicNumber (str))
119+ if (hasArabicIndicDigit (str))
91120 {
92- has_arabic_number = true ;
93- }
94- else
95- {
96- has_normal_number = true ;
97- }
98-
99- if (has_arabic_number)
121+ has_arabic_indic_digit = true ;
100122 break ;
123+ }
101124 }
102- if (!has_arabic_number)
125+
126+ if (!has_arabic_indic_digit)
103127 {
104- // No Arabic number found, return the original column
128+ // No Arabic indic digits found, return the original column
105129 return arguments[0 ].column ;
106130 }
107131
108132 auto res_col = data_col->cloneEmpty ();
109- if (has_normal_number )
133+ if (row_index )
110134 {
111135 res_col->insertManyFrom (*data_col, 0 , row_index);
112136 }
@@ -117,13 +141,16 @@ class TranslateArabicNumberFunction : public DB::IFunction
117141 res_col->insertDefault ();
118142 continue ;
119143 }
144+ auto str = convertArabicIndicDigit (col_str->getDataAt (row_index));
145+ res_col->insertData (str.c_str (), str.size ());
120146 }
121147 return res_col;
122148 }
123- private:
124149
125- bool hasArabicNumber (StringRef str) const
150+ private:
151+ bool hasArabicIndicDigit (StringRef str) const
126152 {
153+ // In most cases, the first byte is a digit.
127154 char c = reinterpret_cast <char >(str.data [0 ]);
128155 if (' 0' <= c && c <= ' 9' )
129156 {
@@ -132,12 +159,50 @@ class TranslateArabicNumberFunction : public DB::IFunction
132159 return true ;
133160 }
134161
162+
// Returns true when code point `c` lies in the range U+0660..U+0669
// (inclusive), i.e. is one of the ten Arabic-Indic digits.
// The helper reads no instance state, so it is static and constexpr.
static constexpr bool isArabicIndicDigit(char32_t c) noexcept
{
    return c >= 0x0660 && c <= 0x0669;
}
// Maps a code point in U+0660..U+0669 to the ASCII digit with the same
// numeric value ('0'..'9').
// Precondition: `c` is within U+0660..U+0669; the function does not check
// this, so callers must test the range first.
// The helper reads no instance state, so it is static and constexpr.
static constexpr char toAsciiDigit(char32_t c) noexcept
{
    return static_cast<char>(c - 0x0660 + '0');
}
165+
166+ String convertArabicIndicDigit (const StringRef & str) const
167+ {
168+ std::string result;
169+ result.reserve (str.size );
170+ for (size_t i = 0 ; i < str.size ;)
171+ {
172+ unsigned char c = str.data [i];
173+ char32_t cp = 0 ;
174+ if ((c & 0x80 ) == 0 ) // 1-byte
175+ {
176+ cp = c;
177+ i += 1 ;
178+ }
179+ else if ((c & 0xE0 ) == 0xC0 ) // 2-byte
180+ {
181+ cp = ((c & 0x1F ) << 6 ) | (str.data [i + 1 ] & 0x3F );
182+ i += 2 ;
183+ }
184+ else if ((c & 0xF0 ) == 0xE0 ) // 3-byte
185+ {
186+ cp = ((c & 0x0F ) << 12 ) | ((str.data [i + 1 ] & 0x3F ) << 6 ) | (str.data [i + 2 ] & 0x3F );
187+ i += 3 ;
188+ }
189+ else if ((c & 0xF8 ) == 0xF0 ) // 4-byte
190+ {
191+ cp = ((c & 0x07 ) << 18 ) | ((str.data [i + 1 ] & 0x3F ) << 12 ) | ((str.data [i + 2 ] & 0x3F ) << 6 ) | (str.data [i + 3 ] & 0x3F );
192+ i += 4 ;
193+ }
194+ if (isArabicIndicDigit (cp))
195+ result.push_back (toAsciiDigit (cp));
196+ else
197+ result.push_back (cp);
198+ }
199+ return result;
200+ }
135201};
136202
137203using namespace DB ;
138- REGISTER_FUNCTION (TranslateArabicNumberFunction )
204+ REGISTER_FUNCTION (ArabicIndicToAsciiDigitForDate )
139205{
140- factory.registerFunction <TranslateArabicNumberFunction >();
206+ factory.registerFunction <ArabicIndicToAsciiDigitForDateFunction >();
141207}
142208}
143-
0 commit comments