Skip to content

[FEAT] words_from_chars RTL and LTR adaptation #431

@peleg122

Description

@peleg122

The `words_from_chars` function in `recognition.utils` assumes that every line is LTR, but not all languages are written left-to-right; some, such as Hebrew and Arabic, are RTL.
I suggest checking the characters to determine the line's dominant direction (a sentence can mix languages), and then merging the first and last words to the line edges based on that direction.

from typing import List, Tuple, Any
import unicodedata
import statistics

# ---- bidi helpers ----
def _is_strong_rtl(s: str) -> bool:
    """Return True if any character of ``s`` has a right-to-left bidi class.

    The accepted ``unicodedata.bidirectional`` classes are ``R``, ``AL`` and
    ``AN``. An empty string yields False.
    """
    return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)

def _is_strong_ltr(s: str) -> bool:
    """Return True if any character of ``s`` has a left-to-right bidi class.

    The accepted ``unicodedata.bidirectional`` classes are ``L``, ``LRE`` and
    ``LRO``. European digits (class ``EN``) are deliberately *not* accepted:
    they are a weak bidi type that takes its direction from the surrounding
    text, so counting them as LTR would let the digits in an RTL line
    (prices, phone numbers, dates) out-vote the letters when the line
    direction is decided. An empty string yields False.
    """
    return any(unicodedata.bidirectional(ch) in ("L", "LRE", "LRO") for ch in s)

def _detect_line_direction(chars) -> str:
    """Return ``"rtl"`` or ``"ltr"`` by majority vote over ``chars``.

    Each item's ``text`` attribute (missing or falsy text is treated as
    empty) is stripped; non-empty texts add one RTL vote if
    ``_is_strong_rtl`` accepts them and one LTR vote if ``_is_strong_ltr``
    accepts them. ``"rtl"`` wins only with strictly more votes, so ties and
    empty input give ``"ltr"``.
    """
    rtl_votes = 0
    ltr_votes = 0
    for item in chars:
        text = (getattr(item, "text", "") or "").strip()
        if text:
            rtl_votes += int(_is_strong_rtl(text))
            ltr_votes += int(_is_strong_ltr(text))
    if rtl_votes > ltr_votes:
        return "rtl"
    return "ltr"

# ---- geometry helpers (robust to different field names) ----
def _extract_box(obj: Any) -> Tuple[float, float, float, float]:
    """Return ``(x0, y0, x1, y1)`` as floats from ``obj`` or ``obj.bbox``.

    If ``obj`` has a ``bbox`` attribute, that value is inspected; otherwise
    ``obj`` itself is. The box may be described either by attributes, using
    one of the naming schemes ``x0/y0/x1/y1``, ``xmin/ymin/xmax/ymax`` or
    ``left/top/right/bottom``, or by a plain list/tuple of four numbers.

    A naming scheme is used only when all four of its attributes exist and
    are not None; otherwise the next scheme is tried.

    NOTE(review): a 4-element list/tuple is assumed to be ordered
    ``(x0, y0, x1, y1)`` — confirm against the box type used by callers.

    Raises:
        AttributeError: if the box matches none of the supported layouts.
    """
    box = getattr(obj, "bbox", obj)
    schemas = (
        ("x0", "y0", "x1", "y1"),
        ("xmin", "ymin", "xmax", "ymax"),
        ("left", "top", "right", "bottom"),
    )
    for keys in schemas:
        values = [getattr(box, key, None) for key in keys]
        if all(value is not None for value in values):
            x0, y0, x1, y1 = (float(value) for value in values)
            return x0, y0, x1, y1

    # Sequence-style boxes carry no named fields, so fall back to position.
    if isinstance(box, (list, tuple)) and len(box) == 4:
        x0, y0, x1, y1 = (float(value) for value in box)
        return x0, y0, x1, y1

    raise AttributeError("Unsupported bbox schema on object: %r" % (obj,))

def _median_char_width(valid_chars: List[Any]) -> float:
    """Return the median positive box width of ``valid_chars``.

    Widths are ``x1 - x0`` as reported by ``_extract_box``. Items whose box
    cannot be read (any exception) and items with non-positive width are
    ignored. Returns ``0.0`` when no usable width remains.
    """
    positive_widths = []
    for item in valid_chars:
        try:
            left, _, right, _ = _extract_box(item)
            span = right - left
            if span > 0:
                positive_widths.append(span)
        except Exception:
            # Best effort: an unreadable box simply does not contribute.
            continue
    if not positive_widths:
        return 0.0
    return statistics.median(positive_widths)

def _near_edge(direction: str, is_start: bool, ch_box, line_box,
               tol_px: float) -> bool:
    """Tell whether a character box lies within ``tol_px`` of a line edge.

    ``ch_box`` is an ``(x0, y0, x1, y1)`` tuple; ``line_box`` is passed
    through ``_extract_box``. The edge tested depends on reading direction:
    for ``"ltr"`` the start is the left edge and the end is the right edge;
    for any other direction (treated as RTL) the roles are swapped. Only
    horizontal distances are considered.
    """
    char_left, _, char_right, _ = ch_box
    line_left, _, line_right, _ = _extract_box(line_box)

    # Left edge applies to (ltr, start) and (rtl, end); right edge otherwise.
    use_left_edge = (direction == "ltr") == bool(is_start)
    if use_left_edge:
        gap = char_left - line_left
    else:
        gap = line_right - char_right
    return gap <= tol_px

def _snap_word_to_edge(word: Any, direction: str, is_start: bool,
                       ch_box: Tuple[float, float, float, float],
                       line_box: Any, tol_px: float) -> None:
    """Merge ``word`` toward the matching line edge if ``ch_box`` is near it.

    Does nothing unless ``_near_edge`` reports the character box within
    ``tol_px`` of the edge. The start of an LTR line and the end of an RTL
    line use ``word.merge_left(line_box)``; the other two cases use
    ``word.merge_right(line_box)``.
    """
    if not _near_edge(direction, is_start, ch_box, line_box, tol_px):
        return
    if (direction == "ltr") == is_start:
        word.merge_left(line_box)
    else:
        word.merge_right(line_box)


def words_from_chars(chars: List["TextChar"], line_box: "PolygonBox",
                     snap_ratio_line: float = 0.03,
                     snap_ratio_char: float = 0.50):
    """Group ``chars`` into words, snapping the outer words to the line edges.

    Characters are consumed in order. A character whose ``bbox_valid`` is
    falsy (or missing) ends the current word and is dropped. A whitespace
    character ends the current word; if no word is open, the whitespace
    character is emitted as a word of its own. Any other character either
    opens a new ``TextWord`` (built from ``char.model_dump()``) or is merged
    into the open one, with its text appended.

    The line direction comes from ``_detect_line_direction``. The first valid
    character may be snapped to the line's start edge and the last valid
    character to its end edge, but only when the character already lies
    within a tolerance of that edge.

    Args:
        chars: characters of one line, in reading order.
        line_box: bounding box of the whole line.
        snap_ratio_line: maximum snap distance as a fraction of line width.
        snap_ratio_char: maximum snap distance as a multiple of the median
            valid character width.

    Returns:
        The list of words; empty if no character has a valid box.
    """
    words = []
    word = None

    direction = _detect_line_direction(chars)

    valid_indices = [i for i, ch in enumerate(chars) if getattr(ch, "bbox_valid", False)]
    if not valid_indices:
        return words
    first_valid, last_valid = valid_indices[0], valid_indices[-1]

    # Snap tolerance in pixels: the tighter of the line-relative and the
    # char-relative limit. The line width is floored at 1 so a degenerate
    # line box still gives a positive line-relative limit; the char-relative
    # limit is ignored (infinite) when no character width is available.
    ln_x0, _, ln_x1, _ = _extract_box(line_box)
    line_w = max(1.0, ln_x1 - ln_x0)
    med_char_w = _median_char_width([chars[i] for i in valid_indices])
    char_tol = snap_ratio_char * med_char_w if med_char_w > 0 else float("inf")
    tol_px = min(snap_ratio_line * line_w, char_tol)

    for i, char in enumerate(chars):
        if not getattr(char, "bbox_valid", False):
            if word:
                words.append(word)
                word = None
            continue

        ch_text = getattr(char, "text", "") or ""
        ch_box = _extract_box(char)
        is_blank = not ch_text.strip()

        if not word:
            word = TextWord(**char.model_dump())

            if i == first_valid:
                _snap_word_to_edge(word, direction, True, ch_box, line_box, tol_px)
            # A line with a single valid char is both first and last.
            if i == last_valid:
                _snap_word_to_edge(word, direction, False, ch_box, line_box, tol_px)

            if is_blank:
                words.append(word)
                word = None

        elif is_blank:
            words.append(word)
            word = None

        else:
            word.merge(char)
            word.text = word.text + ch_text

            if i == last_valid:
                _snap_word_to_edge(word, direction, False, ch_box, line_box, tol_px)

    if word:
        words.append(word)
    return words

Just a suggested implementation.

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancementNew feature or request

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions