@@ -715,6 +715,42 @@ FSTProcessor::compoundAnalysis(UString input_word)
715715 return filterFinals (current_state, input_word);
716716}
717717
718+ UString
719+ FSTProcessor::compoundAnalysisOrLowering (UString input_cased) {
720+ if (do_decomposition) {
721+ // Try compound analysis without altering casing:
722+ UString compound = compoundAnalysis (input_cased);
723+ if (!compound.empty ()) {
724+ return compound;
725+ }
726+ }
727+ // If we failed due to state explosion, we may try again with the lowercased string:
728+ UString input_lowered = StringUtils::tolower (input_cased);
729+ State current_state = initial_state;
730+ for (unsigned int i=0 ; i<input_lowered.size (); i++) {
731+ current_state.step_case (input_lowered[i], beCaseSensitive (current_state));
732+ if (current_state.size ()==0 ) {
733+ break ;
734+ }
735+ }
736+ if (do_decomposition && compoundOnlyLSymbol != 0 ) {
737+ current_state.pruneStatesWithForbiddenSymbol (compoundOnlyLSymbol);
738+ }
739+ UString nonCompound = filterFinals (current_state, input_lowered);
740+ if (!nonCompound.empty ()) {
741+ return nonCompound;
742+ }
743+ if (do_decomposition) {
744+ // … or even on the compound analysis of the lowercased string:
745+ UString compound = compoundAnalysis (input_lowered);
746+ if (!compound.empty ()) {
747+ return compound;
748+ }
749+ }
750+ // None of the above:
751+ UString nullString;
752+ return nullString;
753+ }
718754
719755
720756void
@@ -961,17 +997,10 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
961997 {
962998 input_buffer.setPos (last_start + limit.i_codepoint );
963999 UString unknown_word = sf.substr (0 , limit.i_utf16 );
964- if (do_decomposition)
1000+ UString compoundOrLower = compoundAnalysisOrLowering (unknown_word);
1001+ if (!compoundOrLower.empty ())
9651002 {
966- UString compound = compoundAnalysis (unknown_word);
967- if (!compound.empty ())
968- {
969- printWord (unknown_word, compound, output);
970- }
971- else
972- {
973- printUnknownWord (unknown_word, output);
974- }
1003+ printWord (unknown_word, compoundOrLower, output);
9751004 }
9761005 else
9771006 {
@@ -991,17 +1020,10 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
9911020 {
9921021 input_buffer.setPos (last_start + limit.i_codepoint );
9931022 UString unknown_word = sf.substr (0 , limit.i_utf16 );
994- if (do_decomposition)
1023+ UString compoundOrLower = compoundAnalysisOrLowering (unknown_word);
1024+ if (!compoundOrLower.empty ())
9951025 {
996- UString compound = compoundAnalysis (unknown_word);
997- if (!compound.empty ())
998- {
999- printWord (unknown_word, compound, output);
1000- }
1001- else
1002- {
1003- printUnknownWord (unknown_word, output);
1004- }
1026+ printWord (unknown_word, compoundOrLower, output);
10051027 }
10061028 else
10071029 {
@@ -1781,6 +1803,34 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
17811803 result.clear ();
17821804 }
17831805
1806+ if (result.empty () && (mode == gm_bilgen || mode == gm_all)) {
1807+ // Retry looking up lower-cased version, this time not using alt-override (which leads to state explosions)
1808+ State current_state = initial_state;
1809+ if (reader.readings [index].mark == ' #' ) current_state.step (' #' );
1810+ bool seenTags = false ;
1811+ for (size_t i = 0 ; i < symbols.size (); i++) {
1812+ seenTags = seenTags || alphabet.isTag (symbols[i]);
1813+ if (alphabet.isTag (symbols[i]) || beCaseSensitive (current_state)) {
1814+ current_state.step_override (symbols[i], any_char, symbols[i]);
1815+ }
1816+ else {
1817+ int32_t symbol_low = u_tolower (symbols[i]);
1818+ current_state.step_override (symbol_low, any_char, symbol_low);
1819+ }
1820+ if (current_state.isFinal (all_finals)) {
1821+ queue_start = i;
1822+ current_state.filterFinalsArray (result,
1823+ all_finals, alphabet, escaped_chars,
1824+ displayWeightsMode, maxAnalyses,
1825+ maxWeightClasses);
1826+ }
1827+ }
1828+ // if there are no tags, we only return complete matches
1829+ if ((!seenTags || mode == gm_all || mode == gm_bilgen) && queue_start + 1 < symbols.size ()) {
1830+ result.clear ();
1831+ }
1832+ }
1833+
17841834 UString source;
17851835 size_t queue_pos = 0 ;
17861836 if (reader.readings [index].mark == ' #' ) {
0 commit comments