Commit f9bb989

Remove deprecated --handle_kludgy_ordinals flag
Remove the outdated handle_kludgy_ordinals option from the CLI and tokenization API. Kludgy ordinals (e.g. '1sti', '3ja') are now always passed through unchanged as word tokens, which was the default behavior.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 28ce86e commit f9bb989

8 files changed: 13 additions & 211 deletions


README.md

Lines changed: 4 additions & 31 deletions
@@ -118,7 +118,6 @@ Other options can be specified on the command line:
 | `-g`, `--keep_composite_glyphs` | Do not replace composite glyphs using Unicode COMBINING codes with their accented/umlaut counterparts. |
 | `-e`, `--replace_html_escapes` | HTML escape codes replaced by their meaning, such as `&aacute;` -> `á`. |
 | `-c`, `--convert_numbers` | English-style decimal points and thousands separators in numbers changed to Icelandic style. |
-| `-k N`, `--handle_kludgy_ordinals N` | Kludgy ordinal handling defined. 0: Returns the original mixed word form, 1. Kludgy ordinal returned as pure word forms, 2: Kludgy ordinals returned as pure numbers. |
 
 Type `tokenize -h` or `tokenize --help` to get a short help message.
 
@@ -453,31 +452,6 @@ functions:
 
   The default value for the `replace_html_escapes` option is `False`.
 
-* `handle_kludgy_ordinals=[value]`
-
-  This options controls the way Tokenizer handles 'kludgy' ordinals, such as
-  *1sti*, *4ðu*, or *2ja*. By default, such ordinals are returned unmodified
-  ('passed through') as word tokens (`TOK.WORD`).
-  However, this can be modified as follows:
-
-  * `tokenizer.KLUDGY_ORDINALS_MODIFY`: Kludgy ordinals are corrected
-    to become 'proper' word tokens, i.e. *1sti* becomes *fyrsti* and
-    *2ja* becomes *tveggja*.
-
-  * `tokenizer.KLUDGY_ORDINALS_TRANSLATE`: Kludgy ordinals that represent
-    proper ordinal numbers are translated to ordinal tokens (`TOK.ORDINAL`),
-    with their original text and their ordinal value. *1sti* thus
-    becomes a `TOK.ORDINAL` token with a value of 1, and *3ja* becomes
-    a `TOK.ORDINAL` with a value of 3.
-
-  * `tokenizer.KLUDGY_ORDINALS_PASS_THROUGH` is the default value of
-    the option. It causes kludgy ordinals to be returned unmodified as
-    word tokens.
-
-  Note that versions of Tokenizer prior to 1.4 behaved as if
-  `handle_kludgy_ordinals` were set to
-  `tokenizer.KLUDGY_ORDINALS_TRANSLATE`.
-
 ## Dash and Hyphen Handling
 
 Tokenizer distinguishes between three dash types and handles them contextually:
@@ -578,9 +552,8 @@ with the following exceptions:
   can be disabled; see the `replace_composite_glyphs` option described
   above.)
 
-* If the appropriate options are specified (see above), it converts
-  kludgy ordinals (*3ja*) to proper ones (*þriðja*), and English-style
-  thousand and decimal separators to Icelandic ones
+* If the `convert_numbers` option is specified (see above), English-style
+  thousand and decimal separators are converted to Icelandic ones
   (*10,345.67* becomes *10.345,67*).
 
 * If the `replace_html_escapes` option is set, Tokenizer replaces
@@ -812,8 +785,8 @@ can be found in the file `test/toktest_normal_gold_expected.txt`.
   `TOK.SERIALNUMBER` token kinds; abbreviations can now have multiple
   meanings.
 * Version 1.4.0: Added the `**options` parameter to the
-  `tokenize()` function, giving control over the handling of numbers,
-  telephone numbers, and 'kludgy' ordinals.
+  `tokenize()` function, giving control over the handling of numbers
+  and telephone numbers.
 * Version 1.3.0: Added `TOK.DOMAIN` and `TOK.HASHTAG` token types;
   improved handling of capitalized month name *Ágúst*, which is
   now recognized when following an ordinal number; improved recognition
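The README change above documents the new invariant: kludgy ordinals are always passed through unchanged as word tokens. A minimal, self-contained sketch of that pass-through behavior (illustrative only — `toy_tokenize`, `KLUDGY_ORDINALS`, and the tuple output are invented here; the real `tokenizer.tokenize()` yields `Tok` objects):

```python
# Illustrative sketch of the documented pass-through behavior.
# Names here are invented; this is NOT the library's implementation.
KLUDGY_ORDINALS = {"1sti", "1sta", "2ja", "3ji", "3ja", "4ðu"}

def toy_tokenize(text: str) -> list[tuple[str, str]]:
    """Split on whitespace and classify each token; kludgy ordinals
    like '1sti' stay unchanged and are labelled WORD."""
    tokens = []
    for tok in text.split():
        if tok in KLUDGY_ORDINALS:
            tokens.append(("WORD", tok))      # passed through unchanged
        elif tok.replace(".", "").isdigit():
            tokens.append(("NUMBER", tok))
        else:
            tokens.append(("WORD", tok))
    return tokens

print(toy_tokenize("Hann var 1sti"))
# [('WORD', 'Hann'), ('WORD', 'var'), ('WORD', '1sti')]
```

Under the removed `KLUDGY_ORDINALS_MODIFY` setting, '1sti' would instead have come back as 'fyrsti'; after this commit only the pass-through path exists.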

src/tokenizer/__init__.py

Lines changed: 0 additions & 6 deletions
@@ -36,9 +36,6 @@
     TP_WORD,
     EN_DASH,
     EM_DASH,
-    KLUDGY_ORDINALS_PASS_THROUGH,
-    KLUDGY_ORDINALS_MODIFY,
-    KLUDGY_ORDINALS_TRANSLATE,
     BIN_Tuple,
     BIN_TupleList,
 )
@@ -80,9 +77,6 @@
     "EM_DASH",
     "EN_DASH",
     "generate_raw_tokens",
-    "KLUDGY_ORDINALS_MODIFY",
-    "KLUDGY_ORDINALS_PASS_THROUGH",
-    "KLUDGY_ORDINALS_TRANSLATE",
     "mark_paragraphs",
     "normalized_text_from_tokens",
     "normalized_text",

src/tokenizer/definitions.py

Lines changed: 2 additions & 30 deletions
@@ -605,20 +605,8 @@ class PersonNameTuple(NamedTuple):
 )
 
 
-# If the handle_kludgy_ordinals option is set to
-# KLUDGY_ORDINALS_PASS_THROUGH, we do not convert
-# kludgy ordinals but pass them through as word tokens.
-KLUDGY_ORDINALS_PASS_THROUGH = 0
-# If the handle_kludgy_ordinals option is set to
-# KLUDGY_ORDINALS_MODIFY, we convert '1sti' to 'fyrsti', etc.,
-# and return the modified word as a token.
-KLUDGY_ORDINALS_MODIFY = 1
-# If the handle_kludgy_ordinals option is set to
-# KLUDGY_ORDINALS_TRANSLATE, we convert '1sti' to TOK.Ordinal('1sti', 1), etc.,
-# but otherwise pass the original word through as a word token ('2ja').
-KLUDGY_ORDINALS_TRANSLATE = 2
-
-# Incorrectly written ('kludgy') ordinals
+# Incorrectly written ('kludgy') ordinals: these are passed through unchanged
+# as word tokens, but they need to be recognized so they are not parsed as numbers
 ORDINAL_ERRORS: Mapping[str, str] = {
     "1sti": "fyrsti",
     "1sta": "fyrsta",
@@ -639,22 +627,6 @@ class PersonNameTuple(NamedTuple):
     "4ra": "fjögurra",
 }
 
-# Translations of kludgy ordinal words into numbers
-ORDINAL_NUMBERS: Mapping[str, int] = {
-    "1sti": 1,
-    "1sta": 1,
-    "1stu": 1,
-    "3ji": 3,
-    "3ja": 3,
-    "3ju": 3,
-    "4ði": 4,
-    "4ða": 4,
-    "4ðu": 4,
-    "5ti": 5,
-    "5ta": 5,
-    "5tu": 5,
-}
-
 # Handling of Roman numerals
 
 RE_ROMAN_NUMERAL = re.compile(
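The new comment on `ORDINAL_ERRORS` above explains why the mapping survives even though its values are no longer substituted: the keys must still be recognized so that a token such as *3ja* is not split into the number 3 plus a stray word. A sketch of that failure mode (illustrative code only — `classify` is invented here, not the library's scanner):

```python
# Illustrative only: shows why kludgy-ordinal keys must stay recognized.
# A naive digit-first scanner would split '3ja' into NUMBER '3' + WORD 'ja'.
ORDINAL_ERRORS = {"1sti": "fyrsti", "3ja": "þriggja", "4ra": "fjögurra"}

def classify(token: str) -> list[tuple[str, str]]:
    if token in ORDINAL_ERRORS:
        return [("WORD", token)]  # recognized: passed through unchanged
    # Otherwise, eat leading digits the way a number scanner would
    i = 0
    while i < len(token) and token[i].isdigit():
        i += 1
    if i:
        parts = [("NUMBER", token[:i])]
        if token[i:]:
            parts.append(("WORD", token[i:]))
        return parts
    return [("WORD", token)]

print(classify("3ja"))   # [('WORD', '3ja')]
print(classify("3xy"))   # [('NUMBER', '3'), ('WORD', 'xy')]
```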

src/tokenizer/main.py

Lines changed: 0 additions & 16 deletions
@@ -149,19 +149,6 @@
     ),
 )
 
-parser.add_argument(
-    "-k",
-    "--handle_kludgy_ordinals",
-    type=int,
-    default=0,
-    help=(
-        "Kludgy ordinal handling defined.\n"
-        "\t0: Returns the original word form.\n"
-        "\t1: Ordinals returned as pure words.\n"
-        "\t2: Ordinals returned as numbers."
-    ),
-)
-
 parser.add_argument(
     "-v",
     "--version",
@@ -263,9 +250,6 @@ def val(t: Tok, quote_word: bool = False) -> Any:
     if args.one_sent_per_line:
         options["one_sent_per_line"] = True
 
-    if args.handle_kludgy_ordinals:
-        options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals
-
     if args.original:
         options["original"] = args.original
 
src/tokenizer/tokenizer.py

Lines changed: 7 additions & 36 deletions
@@ -1730,42 +1730,22 @@ def _is_letter(self, char: str) -> bool:
 class NumberParser:
     """Parses a sequence of digits off the front of a raw token"""
 
-    def __init__(
-        self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool
-    ) -> None:
+    def __init__(self, rt: Tok, convert_numbers: bool) -> None:
         self.rt = rt
-        self.handle_kludgy_ordinals = handle_kludgy_ordinals
         self.convert_numbers = convert_numbers
 
     def parse(self) -> Iterable[Tok]:
         """Parse the raw token, yielding result tokens"""
         # Handle kludgy ordinals: '3ji', '5ti', etc.
+        # Yield them unchanged as word tokens (pass-through behavior)
         rt = self.rt
-        handle_kludgy_ordinals = self.handle_kludgy_ordinals
         convert_numbers = self.convert_numbers
-        for key, val in ORDINAL_ERRORS.items():
+        for key in ORDINAL_ERRORS:
             rtxt = rt.txt
             if rtxt.startswith(key):
-                # This is a kludgy ordinal
+                # This is a kludgy ordinal: yield it unchanged as a word token
                 key_tok, rt = rt.split(len(key))
-                if handle_kludgy_ordinals == KLUDGY_ORDINALS_MODIFY:
-                    # Convert ordinals to corresponding word tokens:
-                    # '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc.
-                    key_tok.substitute_longer((0, len(key)), val)
-                    yield TOK.Word(key_tok)
-                elif (
-                    handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE
-                    and key in ORDINAL_NUMBERS
-                ):
-                    # Convert word-form ordinals into ordinal tokens,
-                    # i.e. '1sti' -> TOK.Ordinal('1sti', 1),
-                    # but leave other kludgy constructs ('2ja')
-                    # as word tokens
-                    yield TOK.Ordinal(key_tok, ORDINAL_NUMBERS[key])
-                else:
-                    # No special handling of kludgy ordinals:
-                    # yield them unchanged as word tokens
-                    yield TOK.Word(key_tok)
+                yield TOK.Word(key_tok)
                 break  # This skips the for loop 'else'
         else:
             # Not a kludgy ordinal: eat tokens starting with a digit
@@ -1898,7 +1878,6 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
 
 def parse_mixed(
     rt: Tok,
-    handle_kludgy_ordinals: int,
     convert_numbers: bool,
     replace_composite_glyphs: bool = True,
 ) -> Iterable[Tok]:
@@ -1994,7 +1973,7 @@ def parse_mixed(
         rtxt[0] in DIGITS_PREFIX
         or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX)
     ):
-        np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers)
+        np = NumberParser(rt, convert_numbers)
         yield from np.parse()
         rt = np.rt
         ate = True
@@ -2072,12 +2051,6 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok
     replace_html_escapes: bool = options.get("replace_html_escapes", False)
     one_sent_per_line: bool = options.get("one_sent_per_line", False)
 
-    # The default behavior for kludgy ordinals is to pass them
-    # through as word tokens
-    handle_kludgy_ordinals: int = options.get(
-        "handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH
-    )
-
     # This code proceeds roughly as follows:
     # 1) The text is split into raw tokens on whitespace boundaries.
     # 2) (By far the most common case:) Raw tokens that are purely
@@ -2178,9 +2151,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok
             yield TOK.Punctuation(punct, normalized="‚")
 
         # More complex case of mixed punctuation, letters and numbers
-        yield from parse_mixed(
-            rt, handle_kludgy_ordinals, convert_numbers, replace_composite_glyphs
-        )
+        yield from parse_mixed(rt, convert_numbers, replace_composite_glyphs)
 
     # Yield a sentinel token at the end that will be cut off by the final generator
     yield TOK.End_Sentinel()
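With the option gone, the `NumberParser.parse()` loop above reduces to a prefix match followed by a split. A self-contained sketch of that pattern (stand-in code: `parse_leading_ordinal` is invented here, and the real implementation works on `Tok` objects via `rt.split()` rather than plain strings):

```python
# Stand-in for the simplified loop: match a known kludgy-ordinal prefix
# and split the raw text there, yielding the prefix as a word token.
ORDINAL_ERRORS = ("1sti", "2ja", "3ja", "4ðu")

def parse_leading_ordinal(txt):
    """Return (prefix, remainder) if txt starts with a kludgy ordinal,
    else None (the caller then falls through to digit parsing)."""
    for key in ORDINAL_ERRORS:
        if txt.startswith(key):
            return txt[:len(key)], txt[len(key):]
    return None

print(parse_leading_ordinal("3ja,"))  # ('3ja', ',')
print(parse_leading_ordinal("100"))   # None
```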

test/test_cli.py

Lines changed: 0 additions & 9 deletions
@@ -200,13 +200,4 @@ def test_cli(capsys: CaptureFixture[str], monkeypatch: MonkeyPatch) -> None:
         == "Hann fékk 7,5 í meðaleinkunn en bara 3,3 í íþróttum , og hlaut 2.000,5 USD fyrir ."
     )
 
-    # Handle kludgy ordinals
-    # --handle_kludgy_ordinals flag
-    t = "Hann var 1sti maðurinn til að heimsækja tunglið."
-    r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "1"], t)
-    assert r == "Hann var fyrsti maðurinn til að heimsækja tunglið ."
-    # TODO: Broken functionality, needs to be fixed
-    # r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "2"], t)
-    # assert r == "Hann var 1. maðurinn til að heimsækja tunglið ."
-
     # TODO: Add more tests for the CLI to achieve 100% coverage

test/test_index_calculation.py

Lines changed: 0 additions & 20 deletions
@@ -638,26 +638,6 @@ def test_composite_phrases() -> None:
     assert byte_indexes == [0, 25, 26]
 
 
-def test_lengthening_substitutions() -> None:
-    s = "Þetta er 3ji báturinn!"
-    # 0123456789012345678901
-    # ^ ^ ^ ^ ^
-    # x x
-    # ! lengthening happens here (3ji->þriðji)
-    toks = tokenizer.parse_tokens(
-        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
-    )
-    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 8, 12, 21]
-    assert byte_indexes == [0, 6, 9, 13, 23]
-    toks = tokenizer.parse_tokens(
-        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
-    )
-    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 8, 12, 21, 22]
-    assert byte_indexes == [0, 6, 9, 13, 23, 24]
-
-
 def test_converted_measurements() -> None:
     s = "Stillið ofninn á 12° C til að baka kökuna."
     # 012345678901234567890123456789012345678901

test/test_tokenizer.py

Lines changed: 0 additions & 63 deletions
@@ -498,21 +498,6 @@ def test_single_tokens() -> None:
         ("1-800-1234-545566", TOK.SERIALNUMBER),
     ]
 
-    TEST_CASES_KLUDGY_MODIFY = [
-        ("1sti", [Tok(TOK.WORD, "fyrsti", None)]),
-        ("4ðu", [Tok(TOK.WORD, "fjórðu", None)]),
-        ("2svar", [Tok(TOK.WORD, "tvisvar", None)]),
-        ("4ra", [Tok(TOK.WORD, "fjögurra", None)]),
-        ("2ja", [Tok(TOK.WORD, "tveggja", None)]),
-    ]
-
-    TEST_CASES_KLUDGY_TRANSLATE = [
-        ("1sti", [Tok(TOK.ORDINAL, "1sti", 1)]),
-        ("4ðu", [Tok(TOK.ORDINAL, "4ðu", 4)]),
-        ("2svar", [Tok(TOK.WORD, "2svar", None)]),
-        ("4ra", [Tok(TOK.WORD, "4ra", None)]),
-    ]
-
     TEST_CASES_CONVERT_TELNOS: List[TestCase] = [
         ("525-4764", TOK.TELNO),
         ("4204200", [Tok(TOK.TELNO, "4204200", ("420-4200", "354"))]),
@@ -602,10 +587,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:
 
     run_test(cast(Iterable[TestCase], TEST_CASES))
     run_test(cast(Iterable[TestCase], TEST_CASES_CONVERT_TELNOS))
-    run_test(TEST_CASES_KLUDGY_MODIFY, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
-    run_test(
-        TEST_CASES_KLUDGY_TRANSLATE, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE
-    )
     run_test(TEST_CASES_CONVERT_NUMBERS, convert_numbers=True)
     run_test(
         cast(Iterable[TestCase], TEST_CASES_COALESCE_PERCENT), coalesce_percent=True
@@ -1051,42 +1032,6 @@ def test_correction() -> None:
             """Hann „gaf“ mér €10.780,65.""",
         ),
     ]
-    SENT_KLUDGY_ORDINALS_MODIFY = [
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja herbergja íbúð.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í þriggja herbergja íbúð.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í fyrsta sinn.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu tvisvar í bað.""",
-        ),
-        (
-            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
-            """Ég keypti fjögurra herbergja íbúð á verði tveggja herbergja.""",
-        ),
-    ]
-    SENT_KLUDGY_ORDINALS_TRANSLATE = [
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
-        ),
-        (
-            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
-            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
-        ),
-    ]
     SENT_CONVERT_NUMBERS = [
         (
             """Hann "gaf" mér 10,780.65 dollara.""",
@@ -1102,14 +1047,6 @@ def test_correction() -> None:
         s = t.tokenize(sent)
         txt = t.detokenize(s, normalize=True)
         assert txt == correct
-    for sent, correct in SENT_KLUDGY_ORDINALS_MODIFY:
-        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
-        txt = t.detokenize(s, normalize=True)
-        assert txt == correct
-    for sent, correct in SENT_KLUDGY_ORDINALS_TRANSLATE:
-        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE)
-        txt = t.detokenize(s, normalize=True)
-        assert txt == correct
     for sent, correct in SENT_CONVERT_NUMBERS:
         s = t.tokenize(sent, convert_numbers=True)
         txt = t.detokenize(s, normalize=True)
