1919#include < lttoolbox/alphabet.h>
2020#include < lttoolbox/transducer.h>
2121#include < lttoolbox/compression.h>
22- #include < lttoolbox/string_utils.h>
2322#include < lttoolbox/file_utils.h>
2423#include < algorithm>
24+ #include < limits>
2525#include < stack>
2626#include < unicode/uchar.h>
2727#include < unicode/ustring.h>
@@ -141,6 +141,53 @@ AttCompiler::add_transition(int from, int to,
141141 }
142142}
143143
144+ /*
145+ * ICU number parsing does a lot of locale handling and this tends to
146+ * dominate the runtime of lt-comp. Since we always force the locale
147+ * to be C.UTF-8, we can make stronger assumptions leading to a
148+ * ~2000x speedup for stoi and ~500x for stod relative to StringUtils.
149+ * - DGS 2025-08-29
150+ */
151+
152+ int fast_stoi (UString s) {
153+ int ret = 0 ;
154+ for (size_t i = 0 ; i < s.size (); i++) {
155+ if (s[i] < ' 0' || s[i] > ' 9' ) throw std::invalid_argument (" bad int" );
156+ ret *= 10 ;
157+ ret += (s[i] - ' 0' );
158+ }
159+ return ret;
160+ }
161+
162+ double fast_stod (UString s) {
163+ if (s.size () == 0 ) throw std::invalid_argument (" empty string" );
164+ double sign = 1 ;
165+ size_t i = 0 ;
166+ if (s[i] == ' -' ) {
167+ i++;
168+ sign = -1 ;
169+ }
170+ if (i == s.size ()) throw std::invalid_argument (" no number" );
171+ if (i + 3 == s.size () && s[i] == ' i' && s[i+1 ] == ' n' && s[i+2 ] == ' f' ) {
172+ return sign * std::numeric_limits<double >::infinity ();
173+ }
174+ double ret = 0 ;
175+ for (; i < s.size (); i++) {
176+ if (s[i] == ' .' ) break ;
177+ if (s[i] < ' 0' || s[i] > ' 9' ) throw std::invalid_argument (" bad digit" );
178+ ret *= 10 ;
179+ ret += (s[i] - ' 0' );
180+ }
181+ i++;
182+ double mul = 0.1 ;
183+ for (; i < s.size (); i++) {
184+ if (s[i] < ' 0' || s[i] > ' 9' ) throw std::invalid_argument (" bad digit" );
185+ ret += (s[i] - ' 0' ) * mul;
186+ mul *= 0.1 ;
187+ }
188+ return sign * ret;
189+ }
190+
144191void
145192AttCompiler::parse (std::string const &file_name, bool read_rl)
146193{
@@ -208,7 +255,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
208255 }
209256
210257 try {
211- from = StringUtils::stoi (tokens[0 ]) + state_id_offset;
258+ from = fast_stoi (tokens[0 ]) + state_id_offset;
212259 } catch (const std::invalid_argument& e) {
213260 std::cerr << " Error: invalid source state in file '" << file_name << " ' on line " << line_number << " ." << std::endl;
214261 exit (EXIT_FAILURE);
@@ -233,7 +280,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
233280 if (tokens.size () > 1 )
234281 {
235282 try {
236- weight = StringUtils::stod (tokens[1 ]);
283+ weight = fast_stod (tokens[1 ]);
237284 } catch (const std::invalid_argument& e) {
238285 std::cerr << " Error: invalid weight in file '" << file_name << " ' on line " << line_number << " ." << std::endl;
239286 exit (EXIT_FAILURE);
@@ -248,7 +295,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
248295 else
249296 {
250297 try {
251- to = StringUtils::stoi (tokens[1 ]) + state_id_offset;
298+ to = fast_stoi (tokens[1 ]) + state_id_offset;
252299 } catch (const std::invalid_argument& e) {
253300 std::cerr << " Error: invalid target state in file '" << file_name << " ' on line " << line_number << " ." << std::endl;
254301 exit (EXIT_FAILURE);
@@ -269,7 +316,7 @@ AttCompiler::parse(std::string const &file_name, bool read_rl)
269316 if (tokens.size () > 4 )
270317 {
271318 try {
272- weight = StringUtils::stod (tokens[4 ]);
319+ weight = fast_stod (tokens[4 ]);
273320 } catch (const std::invalid_argument& e) {
274321 std::cerr << " Error: invalid weight in file '" << file_name << " ' on line " << line_number << " ." << std::endl;
275322 exit (EXIT_FAILURE);
0 commit comments