Skip to content

Commit ac385d8

Browse files
committed
bypass locale processing on number parsing in ATT - ~33x speedup
1 parent 6e03b9f commit ac385d8

File tree

1 file changed

+52
-5
lines changed

1 file changed

+52
-5
lines changed

lttoolbox/att_compiler.cc

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
#include <lttoolbox/alphabet.h>
2020
#include <lttoolbox/transducer.h>
2121
#include <lttoolbox/compression.h>
22-
#include <lttoolbox/string_utils.h>
2322
#include <lttoolbox/file_utils.h>
2423
#include <algorithm>
24+
#include <limits>
2525
#include <stack>
2626
#include <unicode/uchar.h>
2727
#include <unicode/ustring.h>
@@ -141,6 +141,53 @@ AttCompiler::add_transition(int from, int to,
141141
}
142142
}
143143

144+
/*
145+
* ICU number parsing does a lot of locale handling and this tends to
146+
* dominate the runtime of lt-comp. Since we always force the locale
147+
* to be C.UTF-8, we can make stronger assumptions leading to a
148+
* ~2000x speedup for stoi and ~500x for stod relative to StringUtils.
149+
* - DGS 2025-08-29
150+
*/
151+
152+
/**
 * Parse a non-negative decimal integer without any locale handling.
 *
 * Only a non-empty run of ASCII digits '0'-'9' is accepted: no sign,
 * no whitespace, no digit separators.
 *
 * @param s the text to parse
 * @return the parsed value
 * @throws std::invalid_argument if s is empty, contains a non-digit
 *         character, or denotes a value that does not fit in an int
 */
int fast_stoi(const UString& s) {
  if (s.empty()) throw std::invalid_argument("empty string");
  constexpr int max_value = std::numeric_limits<int>::max();
  int ret = 0;
  for (const auto ch : s) {
    if (ch < '0' || ch > '9') throw std::invalid_argument("bad int");
    const int digit = ch - '0';
    // Check before multiplying: signed overflow is undefined behaviour.
    // Reported as invalid_argument because that is what callers catch.
    if (ret > (max_value - digit) / 10) {
      throw std::invalid_argument("int out of range");
    }
    ret = ret * 10 + digit;
  }
  return ret;
}
161+
162+
/**
 * Parse a decimal floating-point number without any locale handling.
 *
 * Accepted forms: an optional leading '-', followed by either the literal
 * "inf" or ASCII digits with at most one '.' (e.g. "12", "1.5", ".5", "5.").
 * Exponent notation, a leading '+', and whitespace are rejected.
 *
 * @param s the text to parse
 * @return the parsed value; +/-infinity for "inf" / "-inf"
 * @throws std::invalid_argument if s is empty, contains no digits, or
 *         contains any character outside the forms described above
 */
double fast_stod(const UString& s) {
  if (s.empty()) throw std::invalid_argument("empty string");
  const size_t len = s.size();
  size_t i = 0;
  double sign = 1;
  if (s[i] == '-') {
    sign = -1;
    i++;
  }
  if (i == len) throw std::invalid_argument("no number");
  if (i + 3 == len && s[i] == 'i' && s[i+1] == 'n' && s[i+2] == 'f') {
    return sign * std::numeric_limits<double>::infinity();
  }
  // Every digit goes into one integer-valued mantissa, which is divided
  // once by a power of ten at the end. Summing digit * 0.1^k instead
  // accumulates rounding error (it turns "0.3" into 0.30000000000000004).
  double mantissa = 0;
  double scale = 1;
  bool seen_point = false;
  bool seen_digit = false;
  for (; i < len; i++) {
    if (s[i] == '.' && !seen_point) {
      seen_point = true;
      continue;
    }
    // A second '.' lands here and is rejected like any other non-digit.
    if (s[i] < '0' || s[i] > '9') throw std::invalid_argument("bad digit");
    seen_digit = true;
    if (!seen_point) {
      mantissa = mantissa * 10 + (s[i] - '0');
    } else if (mantissa < 1e17 && scale < 1e300) {
      // Fractional digits beyond double precision are validated but not
      // accumulated, which also keeps mantissa and scale finite.
      mantissa = mantissa * 10 + (s[i] - '0');
      scale *= 10;
    }
  }
  if (!seen_digit) throw std::invalid_argument("no digits");
  return sign * mantissa / scale;
}
190+
144191
void
145192
AttCompiler::parse(std::string const &file_name, bool read_rl)
146193
{
@@ -208,7 +255,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
208255
}
209256

210257
try {
211-
from = StringUtils::stoi(tokens[0]) + state_id_offset;
258+
from = fast_stoi(tokens[0]) + state_id_offset;
212259
} catch (const std::invalid_argument& e) {
213260
std::cerr << "Error: invalid source state in file '" << file_name << "' on line " << line_number << "." << std::endl;
214261
exit(EXIT_FAILURE);
@@ -233,7 +280,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
233280
if (tokens.size() > 1)
234281
{
235282
try {
236-
weight = StringUtils::stod(tokens[1]);
283+
weight = fast_stod(tokens[1]);
237284
} catch (const std::invalid_argument& e) {
238285
std::cerr << "Error: invalid weight in file '" << file_name << "' on line " << line_number << "." << std::endl;
239286
exit(EXIT_FAILURE);
@@ -248,7 +295,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
248295
else
249296
{
250297
try {
251-
to = StringUtils::stoi(tokens[1]) + state_id_offset;
298+
to = fast_stoi(tokens[1]) + state_id_offset;
252299
} catch (const std::invalid_argument& e) {
253300
std::cerr << "Error: invalid target state in file '" << file_name << "' on line " << line_number << "." << std::endl;
254301
exit(EXIT_FAILURE);
@@ -269,7 +316,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
269316
if(tokens.size() > 4)
270317
{
271318
try {
272-
weight = StringUtils::stod(tokens[4]);
319+
weight = fast_stod(tokens[4]);
273320
} catch (const std::invalid_argument& e) {
274321
std::cerr << "Error: invalid weight in file '" << file_name << "' on line " << line_number << "." << std::endl;
275322
exit(EXIT_FAILURE);

0 commit comments

Comments
 (0)