yichael
/
AndroidRemoteController


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
							"""
Grapheme cluster segmentation following Unicode Standard Annex #29.

This module provides pure-Python implementation of the grapheme cluster boundary algorithm as
defined in UAX #29: Unicode Text Segmentation.

https://www.unicode.org/reports/tr29/
"""

from __future__ import annotations

# std imports
from enum import IntEnum
from functools import lru_cache

from typing import TYPE_CHECKING, NamedTuple

# local
from .bisearch import bisearch as _bisearch
from .table_grapheme import (GRAPHEME_L,
                             GRAPHEME_T,
                             GRAPHEME_V,
                             GRAPHEME_LV,
                             INCB_EXTEND,
                             INCB_LINKER,
                             GRAPHEME_LVT,
                             INCB_CONSONANT,
                             GRAPHEME_EXTEND,
                             GRAPHEME_CONTROL,
                             GRAPHEME_PREPEND,
                             GRAPHEME_SPACINGMARK,
                             EXTENDED_PICTOGRAPHIC,
                             GRAPHEME_REGIONAL_INDICATOR)

if TYPE_CHECKING:  # pragma: no cover
    # std imports
    from collections.abc import Iterator

# Maximum backward scan distance when finding grapheme cluster boundaries.
# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.
MAX_GRAPHEME_SCAN = 32


class GCB(IntEnum):
    """Grapheme Cluster Break property values."""

    OTHER = 0
    CR = 1
    LF = 2
    CONTROL = 3
    EXTEND = 4
    ZWJ = 5
    REGIONAL_INDICATOR = 6
    PREPEND = 7
    SPACING_MARK = 8
    L = 9
    V = 10
    T = 11
    LV = 12
    LVT = 13


# All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+
# languages) and considering typical process-long sessions: western scripts need ~64 unique
# codepoints, but CJK could reach ~2000 -- but likely not.
@lru_cache(maxsize=1024)
def _grapheme_cluster_break(ucs: int) -> GCB:
    # pylint: disable=too-many-branches,too-complex
    """Return the Grapheme_Cluster_Break property for a codepoint."""
    # Single codepoint matches
    if ucs == 0x000d:
        return GCB.CR
    if ucs == 0x000a:
        return GCB.LF
    if ucs == 0x200d:
        return GCB.ZWJ
    # Matching by codepoint ranges, requiring binary search
    if _bisearch(ucs, GRAPHEME_CONTROL):
        return GCB.CONTROL
    if _bisearch(ucs, GRAPHEME_EXTEND):
        return GCB.EXTEND
    if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):
        return GCB.REGIONAL_INDICATOR
    if _bisearch(ucs, GRAPHEME_PREPEND):
        return GCB.PREPEND
    if _bisearch(ucs, GRAPHEME_SPACINGMARK):
        return GCB.SPACING_MARK
    if _bisearch(ucs, GRAPHEME_L):
        return GCB.L
    if _bisearch(ucs, GRAPHEME_V):
        return GCB.V
    if _bisearch(ucs, GRAPHEME_T):
        return GCB.T
    if _bisearch(ucs, GRAPHEME_LV):
        return GCB.LV
    if _bisearch(ucs, GRAPHEME_LVT):
        return GCB.LVT
    return GCB.OTHER


@lru_cache(maxsize=1024)
def _is_extended_pictographic(ucs: int) -> bool:
    """Check if codepoint has Extended_Pictographic property."""
    return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))


@lru_cache(maxsize=1024)
def _is_incb_linker(ucs: int) -> bool:
    """Check if codepoint has InCB=Linker property."""
    return bool(_bisearch(ucs, INCB_LINKER))


@lru_cache(maxsize=1024)
def _is_incb_consonant(ucs: int) -> bool:
    """Check if codepoint has InCB=Consonant property."""
    return bool(_bisearch(ucs, INCB_CONSONANT))


@lru_cache(maxsize=1024)
def _is_incb_extend(ucs: int) -> bool:
    """Check if codepoint has InCB=Extend property."""
    return bool(_bisearch(ucs, INCB_EXTEND))


class BreakResult(NamedTuple):
    """Result of grapheme cluster break decision."""

    should_break: bool
    ri_count: int


@lru_cache(maxsize=1024)
def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None:
    """
    Check simple GCB-pair-based break rules (cacheable).

    Returns BreakResult for rules that can be determined from GCB properties alone, or None if
    complex lookback rules (GB9c, GB11) need to be checked.
    """
    # GB3: CR x LF
    if prev_gcb == GCB.CR and curr_gcb == GCB.LF:
        return BreakResult(should_break=False, ri_count=0)

    # GB4: (Control|CR|LF) ÷
    if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
        return BreakResult(should_break=True, ri_count=0)

    # GB5: ÷ (Control|CR|LF)
    if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
        return BreakResult(should_break=True, ri_count=0)

    # GB6: L x (L|V|LV|LVT)
    if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):
        return BreakResult(should_break=False, ri_count=0)

    # GB7: (LV|V) x (V|T)
    if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):
        return BreakResult(should_break=False, ri_count=0)

    # GB8: (LVT|T) x T
    if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:
        return BreakResult(should_break=False, ri_count=0)

    # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here
    if curr_gcb == GCB.EXTEND:
        return BreakResult(should_break=False, ri_count=0)

    # GB9a: x SpacingMark
    if curr_gcb == GCB.SPACING_MARK:
        return BreakResult(should_break=False, ri_count=0)

    # GB9b: Prepend x
    if prev_gcb == GCB.PREPEND:
        return BreakResult(should_break=False, ri_count=0)

    # GB9c and GB11 need lookback - return None to signal complex check needed
    # GB12/13 (RI pairs) need ri_count state - also handled in main function
    return None


def _should_break(
    prev_gcb: GCB,
    curr_gcb: GCB,
    text: str,
    curr_idx: int,
    ri_count: int,
) -> BreakResult:
    # pylint: disable=too-many-branches,too-complex
    """
    Determine if there should be a grapheme cluster break between prev and curr.

    Implements UAX #29 grapheme cluster boundary rules.
    """
    # Try cached simple rules first
    result = _simple_break_check(prev_gcb, curr_gcb)
    if result is not None:
        return result

    # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)
    if curr_gcb == GCB.ZWJ:
        return BreakResult(should_break=False, ri_count=0)

    # GB9c: Indic conjunct cluster
    # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}
    #     [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}
    curr_ucs = ord(text[curr_idx])
    if _is_incb_consonant(curr_ucs):
        has_linker = False
        i = curr_idx - 1
        while i >= 0:
            prev_ucs = ord(text[i])
            if _is_incb_linker(prev_ucs):
                has_linker = True
                i -= 1
            elif _is_incb_extend(prev_ucs):
                i -= 1
            elif _is_incb_consonant(prev_ucs):
                if has_linker:
                    return BreakResult(should_break=False, ri_count=0)
                break
            else:
                break

    # GB11: ExtPict Extend* ZWJ x ExtPict
    if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):
        i = curr_idx - 2  # Skip the ZWJ at curr_idx - 1
        while i >= 0:
            prev_ucs = ord(text[i])
            prev_prop = _grapheme_cluster_break(prev_ucs)
            if prev_prop == GCB.EXTEND:
                i -= 1
            elif _is_extended_pictographic(prev_ucs):
                return BreakResult(should_break=False, ri_count=0)
            else:
                break

    # GB12/GB13: RI x RI (pair matching)
    if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:
        if ri_count % 2 == 1:
            return BreakResult(should_break=False, ri_count=ri_count + 1)
        return BreakResult(should_break=True, ri_count=1)

    # GB999: Any ÷ Any
    ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0
    return BreakResult(should_break=True, ri_count=ri_count)


def iter_graphemes(
    unistr: str,
    start: int = 0,
    end: int | None = None,
) -> Iterator[str]:
    r"""
    Iterate over grapheme clusters in a Unicode string.

    Grapheme clusters are "user-perceived characters" - what a user would
    consider a single character, which may consist of multiple Unicode
    codepoints (e.g., a base character with combining marks, emoji sequences).

    :param unistr: The Unicode string to segment.
    :param start: Starting index (default 0).
    :param end: Ending index (default len(unistr)).
    :yields: Grapheme cluster substrings.

    Example::

        >>> list(iter_graphemes('cafe\u0301'))
        ['c', 'a', 'f', 'e\u0301']
        >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467'))
        ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']
        >>> list(iter_graphemes('\U0001F1FA\U0001F1F8'))
        ['o', 'k', '\U0001F1FA\U0001F1F8']

    .. versionadded:: 0.3.0
    """
    if not unistr:
        return

    length = len(unistr)

    if end is None:
        end = length

    if start >= end or start >= length:
        return

    end = min(end, length)

    # Track state for grapheme cluster boundaries
    cluster_start = start
    ri_count = 0

    # Get GCB for first character
    prev_gcb = _grapheme_cluster_break(ord(unistr[start]))

    # Handle Regional Indicator count initialization
    if prev_gcb == GCB.REGIONAL_INDICATOR:
        ri_count = 1

    for idx in range(start + 1, end):
        curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))

        result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)
        ri_count = result.ri_count

        if result.should_break:
            yield unistr[cluster_start:idx]
            cluster_start = idx

        prev_gcb = curr_gcb

    # Yield the final cluster
    yield unistr[cluster_start:end]


def _find_cluster_start(text: str, pos: int) -> int:
    """
    Find the start of the grapheme cluster containing the character before pos.

    Scans backwards from pos to find a safe starting point, then iterates forward using standard
    break rules to find the actual cluster boundary.

    :param text: The Unicode string.
    :param pos: Position to search before (exclusive).
    :returns: Start position of the grapheme cluster.
    """
    target_cp = ord(text[pos - 1])

    # GB3: CR x LF - LF after CR is part of same cluster
    if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':
        return pos - 2

    # Fast path: ASCII (except LF) starts its own cluster
    if target_cp < 0x80:
        # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)
        if pos >= 2 and target_cp >= 0x20:
            prev_cp = ord(text[pos - 2])
            if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:
                return _find_cluster_start(text, pos - 1)
        return pos - 1

    # Scan backward to find a safe starting point
    safe_start = pos - 1
    while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:
        cp = ord(text[safe_start])
        if 0x20 <= cp < 0x80:  # ASCII always starts a cluster
            break
        if _grapheme_cluster_break(cp) == GCB.CONTROL:  # GB4
            break
        safe_start -= 1

    # Verify forward to find the actual cluster boundary
    cluster_start = safe_start
    left_gcb = _grapheme_cluster_break(ord(text[safe_start]))
    ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0

    for i in range(safe_start + 1, pos):
        right_gcb = _grapheme_cluster_break(ord(text[i]))
        result = _should_break(left_gcb, right_gcb, text, i, ri_count)
        ri_count = result.ri_count
        if result.should_break:
            cluster_start = i
        left_gcb = right_gcb

    return cluster_start


def grapheme_boundary_before(unistr: str, pos: int) -> int:
    r"""
    Find the grapheme cluster boundary immediately before a position.

    :param unistr: The Unicode string to search.
    :param pos: Position in the string (0 < pos <= len(unistr)).
    :returns: Start index of the grapheme cluster containing the character at pos-1.

    Example::

        >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)
        6
        >>> grapheme_boundary_before('a\r\nb', 3)
        1

    .. versionadded:: 0.3.6
    """
    if pos <= 0:
        return 0
    return _find_cluster_start(unistr, min(pos, len(unistr)))


def iter_graphemes_reverse(
    unistr: str,
    start: int = 0,
    end: int | None = None,
) -> Iterator[str]:
    r"""
    Iterate over grapheme clusters in reverse order (last to first).

    :param unistr: The Unicode string to segment.
    :param start: Starting index (default 0).
    :param end: Ending index (default len(unistr)).
    :yields: Grapheme cluster substrings in reverse order.

    Example::

        >>> list(iter_graphemes_reverse('cafe\u0301'))
        ['e\u0301', 'f', 'a', 'c']

    .. versionadded:: 0.3.6
    """
    if not unistr:
        return

    length = len(unistr)

    end = length if end is None else min(end, length)
    start = max(start, 0)

    if start >= end or start >= length:
        return

    pos = end
    while pos > start:
        cluster_start = _find_cluster_start(unistr, pos)
        # Don't yield partial graphemes that extend before start
        if cluster_start < start:
            break
        yield unistr[cluster_start:pos]
        pos = cluster_start