| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428 |
- """
- Grapheme cluster segmentation following Unicode Standard Annex #29.
- This module provides pure-Python implementation of the grapheme cluster boundary algorithm as
- defined in UAX #29: Unicode Text Segmentation.
- https://www.unicode.org/reports/tr29/
- """
- from __future__ import annotations
- # std imports
- from enum import IntEnum
- from functools import lru_cache
- from typing import TYPE_CHECKING, NamedTuple
- # local
- from .bisearch import bisearch as _bisearch
- from .table_grapheme import (GRAPHEME_L,
- GRAPHEME_T,
- GRAPHEME_V,
- GRAPHEME_LV,
- INCB_EXTEND,
- INCB_LINKER,
- GRAPHEME_LVT,
- INCB_CONSONANT,
- GRAPHEME_EXTEND,
- GRAPHEME_CONTROL,
- GRAPHEME_PREPEND,
- GRAPHEME_SPACINGMARK,
- EXTENDED_PICTOGRAPHIC,
- GRAPHEME_REGIONAL_INDICATOR)
- if TYPE_CHECKING: # pragma: no cover
- # std imports
- from collections.abc import Iterator
- # Maximum backward scan distance when finding grapheme cluster boundaries.
- # Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.
- MAX_GRAPHEME_SCAN = 32
- class GCB(IntEnum):
- """Grapheme Cluster Break property values."""
- OTHER = 0
- CR = 1
- LF = 2
- CONTROL = 3
- EXTEND = 4
- ZWJ = 5
- REGIONAL_INDICATOR = 6
- PREPEND = 7
- SPACING_MARK = 8
- L = 9
- V = 10
- T = 11
- LV = 12
- LVT = 13
- # All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+
- # languages) and considering typical process-long sessions: western scripts need ~64 unique
- # codepoints, but CJK could reach ~2000 -- but likely not.
- @lru_cache(maxsize=1024)
- def _grapheme_cluster_break(ucs: int) -> GCB:
- # pylint: disable=too-many-branches,too-complex
- """Return the Grapheme_Cluster_Break property for a codepoint."""
- # Single codepoint matches
- if ucs == 0x000d:
- return GCB.CR
- if ucs == 0x000a:
- return GCB.LF
- if ucs == 0x200d:
- return GCB.ZWJ
- # Matching by codepoint ranges, requiring binary search
- if _bisearch(ucs, GRAPHEME_CONTROL):
- return GCB.CONTROL
- if _bisearch(ucs, GRAPHEME_EXTEND):
- return GCB.EXTEND
- if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):
- return GCB.REGIONAL_INDICATOR
- if _bisearch(ucs, GRAPHEME_PREPEND):
- return GCB.PREPEND
- if _bisearch(ucs, GRAPHEME_SPACINGMARK):
- return GCB.SPACING_MARK
- if _bisearch(ucs, GRAPHEME_L):
- return GCB.L
- if _bisearch(ucs, GRAPHEME_V):
- return GCB.V
- if _bisearch(ucs, GRAPHEME_T):
- return GCB.T
- if _bisearch(ucs, GRAPHEME_LV):
- return GCB.LV
- if _bisearch(ucs, GRAPHEME_LVT):
- return GCB.LVT
- return GCB.OTHER
- @lru_cache(maxsize=1024)
- def _is_extended_pictographic(ucs: int) -> bool:
- """Check if codepoint has Extended_Pictographic property."""
- return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))
- @lru_cache(maxsize=1024)
- def _is_incb_linker(ucs: int) -> bool:
- """Check if codepoint has InCB=Linker property."""
- return bool(_bisearch(ucs, INCB_LINKER))
- @lru_cache(maxsize=1024)
- def _is_incb_consonant(ucs: int) -> bool:
- """Check if codepoint has InCB=Consonant property."""
- return bool(_bisearch(ucs, INCB_CONSONANT))
- @lru_cache(maxsize=1024)
- def _is_incb_extend(ucs: int) -> bool:
- """Check if codepoint has InCB=Extend property."""
- return bool(_bisearch(ucs, INCB_EXTEND))
- class BreakResult(NamedTuple):
- """Result of grapheme cluster break decision."""
- should_break: bool
- ri_count: int
- @lru_cache(maxsize=1024)
- def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None:
- """
- Check simple GCB-pair-based break rules (cacheable).
- Returns BreakResult for rules that can be determined from GCB properties alone, or None if
- complex lookback rules (GB9c, GB11) need to be checked.
- """
- # GB3: CR x LF
- if prev_gcb == GCB.CR and curr_gcb == GCB.LF:
- return BreakResult(should_break=False, ri_count=0)
- # GB4: (Control|CR|LF) ÷
- if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
- return BreakResult(should_break=True, ri_count=0)
- # GB5: ÷ (Control|CR|LF)
- if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
- return BreakResult(should_break=True, ri_count=0)
- # GB6: L x (L|V|LV|LVT)
- if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):
- return BreakResult(should_break=False, ri_count=0)
- # GB7: (LV|V) x (V|T)
- if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):
- return BreakResult(should_break=False, ri_count=0)
- # GB8: (LVT|T) x T
- if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:
- return BreakResult(should_break=False, ri_count=0)
- # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here
- if curr_gcb == GCB.EXTEND:
- return BreakResult(should_break=False, ri_count=0)
- # GB9a: x SpacingMark
- if curr_gcb == GCB.SPACING_MARK:
- return BreakResult(should_break=False, ri_count=0)
- # GB9b: Prepend x
- if prev_gcb == GCB.PREPEND:
- return BreakResult(should_break=False, ri_count=0)
- # GB9c and GB11 need lookback - return None to signal complex check needed
- # GB12/13 (RI pairs) need ri_count state - also handled in main function
- return None
- def _should_break(
- prev_gcb: GCB,
- curr_gcb: GCB,
- text: str,
- curr_idx: int,
- ri_count: int,
- ) -> BreakResult:
- # pylint: disable=too-many-branches,too-complex
- """
- Determine if there should be a grapheme cluster break between prev and curr.
- Implements UAX #29 grapheme cluster boundary rules.
- """
- # Try cached simple rules first
- result = _simple_break_check(prev_gcb, curr_gcb)
- if result is not None:
- return result
- # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)
- if curr_gcb == GCB.ZWJ:
- return BreakResult(should_break=False, ri_count=0)
- # GB9c: Indic conjunct cluster
- # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}
- # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}
- curr_ucs = ord(text[curr_idx])
- if _is_incb_consonant(curr_ucs):
- has_linker = False
- i = curr_idx - 1
- while i >= 0:
- prev_ucs = ord(text[i])
- if _is_incb_linker(prev_ucs):
- has_linker = True
- i -= 1
- elif _is_incb_extend(prev_ucs):
- i -= 1
- elif _is_incb_consonant(prev_ucs):
- if has_linker:
- return BreakResult(should_break=False, ri_count=0)
- break
- else:
- break
- # GB11: ExtPict Extend* ZWJ x ExtPict
- if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):
- i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1
- while i >= 0:
- prev_ucs = ord(text[i])
- prev_prop = _grapheme_cluster_break(prev_ucs)
- if prev_prop == GCB.EXTEND:
- i -= 1
- elif _is_extended_pictographic(prev_ucs):
- return BreakResult(should_break=False, ri_count=0)
- else:
- break
- # GB12/GB13: RI x RI (pair matching)
- if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:
- if ri_count % 2 == 1:
- return BreakResult(should_break=False, ri_count=ri_count + 1)
- return BreakResult(should_break=True, ri_count=1)
- # GB999: Any ÷ Any
- ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0
- return BreakResult(should_break=True, ri_count=ri_count)
- def iter_graphemes(
- unistr: str,
- start: int = 0,
- end: int | None = None,
- ) -> Iterator[str]:
- r"""
- Iterate over grapheme clusters in a Unicode string.
- Grapheme clusters are "user-perceived characters" - what a user would
- consider a single character, which may consist of multiple Unicode
- codepoints (e.g., a base character with combining marks, emoji sequences).
- :param unistr: The Unicode string to segment.
- :param start: Starting index (default 0).
- :param end: Ending index (default len(unistr)).
- :yields: Grapheme cluster substrings.
- Example::
- >>> list(iter_graphemes('cafe\u0301'))
- ['c', 'a', 'f', 'e\u0301']
- >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467'))
- ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']
- >>> list(iter_graphemes('\U0001F1FA\U0001F1F8'))
- ['o', 'k', '\U0001F1FA\U0001F1F8']
- .. versionadded:: 0.3.0
- """
- if not unistr:
- return
- length = len(unistr)
- if end is None:
- end = length
- if start >= end or start >= length:
- return
- end = min(end, length)
- # Track state for grapheme cluster boundaries
- cluster_start = start
- ri_count = 0
- # Get GCB for first character
- prev_gcb = _grapheme_cluster_break(ord(unistr[start]))
- # Handle Regional Indicator count initialization
- if prev_gcb == GCB.REGIONAL_INDICATOR:
- ri_count = 1
- for idx in range(start + 1, end):
- curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))
- result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)
- ri_count = result.ri_count
- if result.should_break:
- yield unistr[cluster_start:idx]
- cluster_start = idx
- prev_gcb = curr_gcb
- # Yield the final cluster
- yield unistr[cluster_start:end]
- def _find_cluster_start(text: str, pos: int) -> int:
- """
- Find the start of the grapheme cluster containing the character before pos.
- Scans backwards from pos to find a safe starting point, then iterates forward using standard
- break rules to find the actual cluster boundary.
- :param text: The Unicode string.
- :param pos: Position to search before (exclusive).
- :returns: Start position of the grapheme cluster.
- """
- target_cp = ord(text[pos - 1])
- # GB3: CR x LF - LF after CR is part of same cluster
- if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':
- return pos - 2
- # Fast path: ASCII (except LF) starts its own cluster
- if target_cp < 0x80:
- # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)
- if pos >= 2 and target_cp >= 0x20:
- prev_cp = ord(text[pos - 2])
- if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:
- return _find_cluster_start(text, pos - 1)
- return pos - 1
- # Scan backward to find a safe starting point
- safe_start = pos - 1
- while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:
- cp = ord(text[safe_start])
- if 0x20 <= cp < 0x80: # ASCII always starts a cluster
- break
- if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4
- break
- safe_start -= 1
- # Verify forward to find the actual cluster boundary
- cluster_start = safe_start
- left_gcb = _grapheme_cluster_break(ord(text[safe_start]))
- ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0
- for i in range(safe_start + 1, pos):
- right_gcb = _grapheme_cluster_break(ord(text[i]))
- result = _should_break(left_gcb, right_gcb, text, i, ri_count)
- ri_count = result.ri_count
- if result.should_break:
- cluster_start = i
- left_gcb = right_gcb
- return cluster_start
- def grapheme_boundary_before(unistr: str, pos: int) -> int:
- r"""
- Find the grapheme cluster boundary immediately before a position.
- :param unistr: The Unicode string to search.
- :param pos: Position in the string (0 < pos <= len(unistr)).
- :returns: Start index of the grapheme cluster containing the character at pos-1.
- Example::
- >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)
- 6
- >>> grapheme_boundary_before('a\r\nb', 3)
- 1
- .. versionadded:: 0.3.6
- """
- if pos <= 0:
- return 0
- return _find_cluster_start(unistr, min(pos, len(unistr)))
- def iter_graphemes_reverse(
- unistr: str,
- start: int = 0,
- end: int | None = None,
- ) -> Iterator[str]:
- r"""
- Iterate over grapheme clusters in reverse order (last to first).
- :param unistr: The Unicode string to segment.
- :param start: Starting index (default 0).
- :param end: Ending index (default len(unistr)).
- :yields: Grapheme cluster substrings in reverse order.
- Example::
- >>> list(iter_graphemes_reverse('cafe\u0301'))
- ['e\u0301', 'f', 'a', 'c']
- .. versionadded:: 0.3.6
- """
- if not unistr:
- return
- length = len(unistr)
- end = length if end is None else min(end, length)
- start = max(start, 0)
- if start >= end or start >= length:
- return
- pos = end
- while pos > start:
- cluster_start = _find_cluster_start(unistr, pos)
- # Don't yield partial graphemes that extend before start
- if cluster_start < start:
- break
- yield unistr[cluster_start:pos]
- pos = cluster_start
|