Skip to content

Commit 98889df

Browse files
committed
Fall back to analysing lower-cased version on state explosion
Applies both in analysis (lt-proc -a, with or without compounding -e) and in "bilingual generation" (lt-proc -g -b). Fixes #194.
1 parent b8774df commit 98889df

File tree

4 files changed

+125
-20
lines changed

4 files changed

+125
-20
lines changed

lttoolbox/fst_processor.cc

Lines changed: 70 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,42 @@ FSTProcessor::compoundAnalysis(UString input_word)
715715
return filterFinals(current_state, input_word);
716716
}
717717

718+
/**
 * Analyse a word using progressively more lenient strategies, returning the
 * first non-empty result:
 *   1. compoundAnalysis() of the word as given (only when do_decomposition);
 *   2. a plain lookup of the lowercased word, stepped from initial_state and
 *      filtered with filterFinals();
 *   3. compoundAnalysis() of the lowercased word (only when do_decomposition).
 *
 * @param input_cased the word with its original casing
 * @return the first non-empty analysis string found, or an empty string if
 *         every strategy fails
 */
UString
719+
FSTProcessor::compoundAnalysisOrLowering(UString input_cased) {
720+
if(do_decomposition) {
721+
// Try compound analysis without altering casing:
722+
UString compound = compoundAnalysis(input_cased);
723+
if(!compound.empty()) {
724+
return compound;
725+
}
726+
}
727+
// If we failed due to state explosion, we may try again with the lowercased string:
728+
UString input_lowered = StringUtils::tolower(input_cased);
729+
State current_state = initial_state;
730+
for(unsigned int i=0; i<input_lowered.size(); i++) {
731+
current_state.step_case(input_lowered[i], beCaseSensitive(current_state));
732+
if(current_state.size()==0) { // no states left after this character, so stop stepping
733+
break;
734+
}
735+
}
736+
if(do_decomposition && compoundOnlyLSymbol != 0) { // NOTE(review): 0 presumably means the symbol is not defined in the alphabet; confirm
737+
current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
738+
}
739+
UString nonCompound = filterFinals(current_state, input_lowered);
740+
if(!nonCompound.empty()) {
741+
return nonCompound;
742+
}
743+
if(do_decomposition) {
744+
// … or even on the compound analysis of the lowercased string:
745+
UString compound = compoundAnalysis(input_lowered);
746+
if(!compound.empty()) {
747+
return compound;
748+
}
749+
}
750+
// None of the above:
751+
UString nullString;
752+
return nullString;
753+
}
718754

719755

720756
void
@@ -961,17 +997,10 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
961997
{
962998
input_buffer.setPos(last_start + limit.i_codepoint);
963999
UString unknown_word = sf.substr(0, limit.i_utf16);
964-
if(do_decomposition)
1000+
UString compoundOrLower = compoundAnalysisOrLowering(unknown_word);
1001+
if(!compoundOrLower.empty())
9651002
{
966-
UString compound = compoundAnalysis(unknown_word);
967-
if(!compound.empty())
968-
{
969-
printWord(unknown_word, compound, output);
970-
}
971-
else
972-
{
973-
printUnknownWord(unknown_word, output);
974-
}
1003+
printWord(unknown_word, compoundOrLower, output);
9751004
}
9761005
else
9771006
{
@@ -991,17 +1020,10 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
9911020
{
9921021
input_buffer.setPos(last_start + limit.i_codepoint);
9931022
UString unknown_word = sf.substr(0, limit.i_utf16);
994-
if(do_decomposition)
1023+
UString compoundOrLower = compoundAnalysisOrLowering(unknown_word);
1024+
if(!compoundOrLower.empty())
9951025
{
996-
UString compound = compoundAnalysis(unknown_word);
997-
if(!compound.empty())
998-
{
999-
printWord(unknown_word, compound, output);
1000-
}
1001-
else
1002-
{
1003-
printUnknownWord(unknown_word, output);
1004-
}
1026+
printWord(unknown_word, compoundOrLower, output);
10051027
}
10061028
else
10071029
{
@@ -1781,6 +1803,34 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
17811803
result.clear();
17821804
}
17831805

1806+
if(result.empty() && (mode == gm_bilgen || mode == gm_all)) {
1807+
// Retry looking up lower-cased version, this time not using alt-override (which leads to state explosions)
1808+
State current_state = initial_state;
1809+
if (reader.readings[index].mark == '#') current_state.step('#');
1810+
bool seenTags = false;
1811+
for (size_t i = 0; i < symbols.size(); i++) {
1812+
seenTags = seenTags || alphabet.isTag(symbols[i]);
1813+
if(alphabet.isTag(symbols[i]) || beCaseSensitive(current_state)) {
1814+
current_state.step_override(symbols[i], any_char, symbols[i]);
1815+
}
1816+
else {
1817+
int32_t symbol_low = u_tolower(symbols[i]);
1818+
current_state.step_override(symbol_low, any_char, symbol_low);
1819+
}
1820+
if (current_state.isFinal(all_finals)) {
1821+
queue_start = i;
1822+
current_state.filterFinalsArray(result,
1823+
all_finals, alphabet, escaped_chars,
1824+
displayWeightsMode, maxAnalyses,
1825+
maxWeightClasses);
1826+
}
1827+
}
1828+
// if there are no tags, we only return complete matches
1829+
if ((!seenTags || mode == gm_all || mode == gm_bilgen) && queue_start + 1 < symbols.size()) {
1830+
result.clear();
1831+
}
1832+
}
1833+
17841834
UString source;
17851835
size_t queue_pos = 0;
17861836
if (reader.readings[index].mark == '#') {

lttoolbox/fst_processor.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,11 @@ class FSTProcessor
411411
static UStringView removeTags(UStringView str);
412412
UString compoundAnalysis(UString str);
413413

414+
/**
415+
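 * Like compoundAnalysis, but if that gives no result (or decomposition is off), fall back to analysing the lowercased version of str, first as a plain word and then, when decomposing, as a compound.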
416+
*/
417+
UString compoundAnalysisOrLowering(UString str);
418+
414419
struct Indices {
415420
size_t i_codepoint;
416421
size_t i_utf16; // always >= i_codepoint since some codepoints take up 2 UTF-16's

tests/data/big-mono.dix

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<dictionary>
3+
<alphabet>ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­-</alphabet>
4+
<sdefs>
5+
<sdef n="n"/>
6+
<sdef n="np"/>
7+
<sdef n="def"/>
8+
<sdef n="compound-only-L"/>
9+
<sdef n="compound-R"/>
10+
</sdefs>
11+
<pardefs>
12+
13+
</pardefs>
14+
15+
<section id="main" type="standard">
16+
<e><p><l>hjerterytmeovervåkningen</l><r>hjerterytmeovervåkning<s n="n"/><s n="def"/></r></p></e>
17+
<e><p><l>hjerteklaff</l><r>hjerteklaff<s n="n"/><s n="compound-only-L"/></r></p></e>
18+
<e><p><l>overvåkningen</l><r>overvåkning<s n="n"/><s n="def"/><s n="compound-R"/></r></p></e>
19+
<e> <re>[A-ZÆØÅ]+[a-zæøåA-ZÆØÅ]+!</re><p><l/><r><s n="np"/></r></p></e>
20+
21+
<e><p><l>vas</l><r>vass<s n="n"/><s n="compound-only-L"/></r></p></e>
22+
<e><p><l>senga</l><r>seng<s n="n"/><s n="def"/><s n="compound-R"/></r></p></e>
23+
</section>
24+
25+
26+
</dictionary>

tests/lt_proc/__init__.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,30 @@ class BiltransGenDebugSymbols(ProcTest):
531531
'^ab<n><def>#c/#ab<n><def>#c$',
532532
]
533533

534+
class BiltransLowerFallback(ProcTest):
535+
procdix = 'data/big-mono.dix'
536+
procdir = 'rl'
537+
procflags = ['-g', '-b', '-z']
538+
inputs = [
539+
'^HJERTERYTMEOVERVÅKNING<n><def>$',
540+
]
541+
expectedOutputs = [
542+
'^HJERTERYTMEOVERVÅKNING<n><def>/hjerterytmeovervåkningen$',
543+
]
544+
545+
class AnalysisLowerFallback(ProcTest):
546+
procdix = 'data/big-mono.dix'
547+
procdir = 'lr'
548+
procflags = ['-w', '-e', '-z']
549+
inputs = [
550+
'Vas vas',
551+
'hjerterytmeovervåkningen hjerteklaffovervåkningen HJERTERYTMEOVERVÅKNINGEN HJERTEKLAFFOVERVÅKNINGEN',
552+
]
553+
expectedOutputs = [
554+
'^Vas/*Vas$ ^vas/*vas$',
555+
'^hjerterytmeovervåkningen/hjerterytmeovervåkning<n><def>$ ^hjerteklaffovervåkningen/hjerteklaff<n>+overvåkning<n><def>$ ^HJERTERYTMEOVERVÅKNINGEN/hjerterytmeovervåkning<n><def>$ ^HJERTEKLAFFOVERVÅKNINGEN/hjerteklaff<n>+overvåkning<n><def>$'
556+
]
557+
534558

535559
# These fail on some systems:
536560
#from null_flush_invalid_stream_format import *

0 commit comments

Comments
 (0)