grapheme.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. """
  2. Grapheme cluster segmentation following Unicode Standard Annex #29.
  3. This module provides pure-Python implementation of the grapheme cluster boundary algorithm as
  4. defined in UAX #29: Unicode Text Segmentation.
  5. https://www.unicode.org/reports/tr29/
  6. """
  7. from __future__ import annotations
  8. # std imports
  9. from enum import IntEnum
  10. from functools import lru_cache
  11. from typing import TYPE_CHECKING, NamedTuple
  12. # local
  13. from .bisearch import bisearch as _bisearch
  14. from .table_grapheme import (GRAPHEME_L,
  15. GRAPHEME_T,
  16. GRAPHEME_V,
  17. GRAPHEME_LV,
  18. INCB_EXTEND,
  19. INCB_LINKER,
  20. GRAPHEME_LVT,
  21. INCB_CONSONANT,
  22. GRAPHEME_EXTEND,
  23. GRAPHEME_CONTROL,
  24. GRAPHEME_PREPEND,
  25. GRAPHEME_SPACINGMARK,
  26. EXTENDED_PICTOGRAPHIC,
  27. GRAPHEME_REGIONAL_INDICATOR)
  28. if TYPE_CHECKING: # pragma: no cover
  29. # std imports
  30. from collections.abc import Iterator
  31. # Maximum backward scan distance when finding grapheme cluster boundaries.
  32. # Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.
  33. MAX_GRAPHEME_SCAN = 32
  34. class GCB(IntEnum):
  35. """Grapheme Cluster Break property values."""
  36. OTHER = 0
  37. CR = 1
  38. LF = 2
  39. CONTROL = 3
  40. EXTEND = 4
  41. ZWJ = 5
  42. REGIONAL_INDICATOR = 6
  43. PREPEND = 7
  44. SPACING_MARK = 8
  45. L = 9
  46. V = 10
  47. T = 11
  48. LV = 12
  49. LVT = 13
  50. # All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+
  51. # languages) and considering typical process-long sessions: western scripts need ~64 unique
  52. # codepoints, but CJK could reach ~2000 -- but likely not.
  53. @lru_cache(maxsize=1024)
  54. def _grapheme_cluster_break(ucs: int) -> GCB:
  55. # pylint: disable=too-many-branches,too-complex
  56. """Return the Grapheme_Cluster_Break property for a codepoint."""
  57. # Single codepoint matches
  58. if ucs == 0x000d:
  59. return GCB.CR
  60. if ucs == 0x000a:
  61. return GCB.LF
  62. if ucs == 0x200d:
  63. return GCB.ZWJ
  64. # Matching by codepoint ranges, requiring binary search
  65. if _bisearch(ucs, GRAPHEME_CONTROL):
  66. return GCB.CONTROL
  67. if _bisearch(ucs, GRAPHEME_EXTEND):
  68. return GCB.EXTEND
  69. if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):
  70. return GCB.REGIONAL_INDICATOR
  71. if _bisearch(ucs, GRAPHEME_PREPEND):
  72. return GCB.PREPEND
  73. if _bisearch(ucs, GRAPHEME_SPACINGMARK):
  74. return GCB.SPACING_MARK
  75. if _bisearch(ucs, GRAPHEME_L):
  76. return GCB.L
  77. if _bisearch(ucs, GRAPHEME_V):
  78. return GCB.V
  79. if _bisearch(ucs, GRAPHEME_T):
  80. return GCB.T
  81. if _bisearch(ucs, GRAPHEME_LV):
  82. return GCB.LV
  83. if _bisearch(ucs, GRAPHEME_LVT):
  84. return GCB.LVT
  85. return GCB.OTHER
  86. @lru_cache(maxsize=1024)
  87. def _is_extended_pictographic(ucs: int) -> bool:
  88. """Check if codepoint has Extended_Pictographic property."""
  89. return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))
  90. @lru_cache(maxsize=1024)
  91. def _is_incb_linker(ucs: int) -> bool:
  92. """Check if codepoint has InCB=Linker property."""
  93. return bool(_bisearch(ucs, INCB_LINKER))
  94. @lru_cache(maxsize=1024)
  95. def _is_incb_consonant(ucs: int) -> bool:
  96. """Check if codepoint has InCB=Consonant property."""
  97. return bool(_bisearch(ucs, INCB_CONSONANT))
  98. @lru_cache(maxsize=1024)
  99. def _is_incb_extend(ucs: int) -> bool:
  100. """Check if codepoint has InCB=Extend property."""
  101. return bool(_bisearch(ucs, INCB_EXTEND))
  102. class BreakResult(NamedTuple):
  103. """Result of grapheme cluster break decision."""
  104. should_break: bool
  105. ri_count: int
  106. @lru_cache(maxsize=1024)
  107. def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None:
  108. """
  109. Check simple GCB-pair-based break rules (cacheable).
  110. Returns BreakResult for rules that can be determined from GCB properties alone, or None if
  111. complex lookback rules (GB9c, GB11) need to be checked.
  112. """
  113. # GB3: CR x LF
  114. if prev_gcb == GCB.CR and curr_gcb == GCB.LF:
  115. return BreakResult(should_break=False, ri_count=0)
  116. # GB4: (Control|CR|LF) ÷
  117. if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
  118. return BreakResult(should_break=True, ri_count=0)
  119. # GB5: ÷ (Control|CR|LF)
  120. if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
  121. return BreakResult(should_break=True, ri_count=0)
  122. # GB6: L x (L|V|LV|LVT)
  123. if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):
  124. return BreakResult(should_break=False, ri_count=0)
  125. # GB7: (LV|V) x (V|T)
  126. if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):
  127. return BreakResult(should_break=False, ri_count=0)
  128. # GB8: (LVT|T) x T
  129. if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:
  130. return BreakResult(should_break=False, ri_count=0)
  131. # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here
  132. if curr_gcb == GCB.EXTEND:
  133. return BreakResult(should_break=False, ri_count=0)
  134. # GB9a: x SpacingMark
  135. if curr_gcb == GCB.SPACING_MARK:
  136. return BreakResult(should_break=False, ri_count=0)
  137. # GB9b: Prepend x
  138. if prev_gcb == GCB.PREPEND:
  139. return BreakResult(should_break=False, ri_count=0)
  140. # GB9c and GB11 need lookback - return None to signal complex check needed
  141. # GB12/13 (RI pairs) need ri_count state - also handled in main function
  142. return None
  143. def _should_break(
  144. prev_gcb: GCB,
  145. curr_gcb: GCB,
  146. text: str,
  147. curr_idx: int,
  148. ri_count: int,
  149. ) -> BreakResult:
  150. # pylint: disable=too-many-branches,too-complex
  151. """
  152. Determine if there should be a grapheme cluster break between prev and curr.
  153. Implements UAX #29 grapheme cluster boundary rules.
  154. """
  155. # Try cached simple rules first
  156. result = _simple_break_check(prev_gcb, curr_gcb)
  157. if result is not None:
  158. return result
  159. # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)
  160. if curr_gcb == GCB.ZWJ:
  161. return BreakResult(should_break=False, ri_count=0)
  162. # GB9c: Indic conjunct cluster
  163. # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}
  164. # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}
  165. curr_ucs = ord(text[curr_idx])
  166. if _is_incb_consonant(curr_ucs):
  167. has_linker = False
  168. i = curr_idx - 1
  169. while i >= 0:
  170. prev_ucs = ord(text[i])
  171. if _is_incb_linker(prev_ucs):
  172. has_linker = True
  173. i -= 1
  174. elif _is_incb_extend(prev_ucs):
  175. i -= 1
  176. elif _is_incb_consonant(prev_ucs):
  177. if has_linker:
  178. return BreakResult(should_break=False, ri_count=0)
  179. break
  180. else:
  181. break
  182. # GB11: ExtPict Extend* ZWJ x ExtPict
  183. if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):
  184. i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1
  185. while i >= 0:
  186. prev_ucs = ord(text[i])
  187. prev_prop = _grapheme_cluster_break(prev_ucs)
  188. if prev_prop == GCB.EXTEND:
  189. i -= 1
  190. elif _is_extended_pictographic(prev_ucs):
  191. return BreakResult(should_break=False, ri_count=0)
  192. else:
  193. break
  194. # GB12/GB13: RI x RI (pair matching)
  195. if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:
  196. if ri_count % 2 == 1:
  197. return BreakResult(should_break=False, ri_count=ri_count + 1)
  198. return BreakResult(should_break=True, ri_count=1)
  199. # GB999: Any ÷ Any
  200. ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0
  201. return BreakResult(should_break=True, ri_count=ri_count)
  202. def iter_graphemes(
  203. unistr: str,
  204. start: int = 0,
  205. end: int | None = None,
  206. ) -> Iterator[str]:
  207. r"""
  208. Iterate over grapheme clusters in a Unicode string.
  209. Grapheme clusters are "user-perceived characters" - what a user would
  210. consider a single character, which may consist of multiple Unicode
  211. codepoints (e.g., a base character with combining marks, emoji sequences).
  212. :param unistr: The Unicode string to segment.
  213. :param start: Starting index (default 0).
  214. :param end: Ending index (default len(unistr)).
  215. :yields: Grapheme cluster substrings.
  216. Example::
  217. >>> list(iter_graphemes('cafe\u0301'))
  218. ['c', 'a', 'f', 'e\u0301']
  219. >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467'))
  220. ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']
  221. >>> list(iter_graphemes('\U0001F1FA\U0001F1F8'))
  222. ['o', 'k', '\U0001F1FA\U0001F1F8']
  223. .. versionadded:: 0.3.0
  224. """
  225. if not unistr:
  226. return
  227. length = len(unistr)
  228. if end is None:
  229. end = length
  230. if start >= end or start >= length:
  231. return
  232. end = min(end, length)
  233. # Track state for grapheme cluster boundaries
  234. cluster_start = start
  235. ri_count = 0
  236. # Get GCB for first character
  237. prev_gcb = _grapheme_cluster_break(ord(unistr[start]))
  238. # Handle Regional Indicator count initialization
  239. if prev_gcb == GCB.REGIONAL_INDICATOR:
  240. ri_count = 1
  241. for idx in range(start + 1, end):
  242. curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))
  243. result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)
  244. ri_count = result.ri_count
  245. if result.should_break:
  246. yield unistr[cluster_start:idx]
  247. cluster_start = idx
  248. prev_gcb = curr_gcb
  249. # Yield the final cluster
  250. yield unistr[cluster_start:end]
  251. def _find_cluster_start(text: str, pos: int) -> int:
  252. """
  253. Find the start of the grapheme cluster containing the character before pos.
  254. Scans backwards from pos to find a safe starting point, then iterates forward using standard
  255. break rules to find the actual cluster boundary.
  256. :param text: The Unicode string.
  257. :param pos: Position to search before (exclusive).
  258. :returns: Start position of the grapheme cluster.
  259. """
  260. target_cp = ord(text[pos - 1])
  261. # GB3: CR x LF - LF after CR is part of same cluster
  262. if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':
  263. return pos - 2
  264. # Fast path: ASCII (except LF) starts its own cluster
  265. if target_cp < 0x80:
  266. # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)
  267. if pos >= 2 and target_cp >= 0x20:
  268. prev_cp = ord(text[pos - 2])
  269. if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:
  270. return _find_cluster_start(text, pos - 1)
  271. return pos - 1
  272. # Scan backward to find a safe starting point
  273. safe_start = pos - 1
  274. while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:
  275. cp = ord(text[safe_start])
  276. if 0x20 <= cp < 0x80: # ASCII always starts a cluster
  277. break
  278. if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4
  279. break
  280. safe_start -= 1
  281. # Verify forward to find the actual cluster boundary
  282. cluster_start = safe_start
  283. left_gcb = _grapheme_cluster_break(ord(text[safe_start]))
  284. ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0
  285. for i in range(safe_start + 1, pos):
  286. right_gcb = _grapheme_cluster_break(ord(text[i]))
  287. result = _should_break(left_gcb, right_gcb, text, i, ri_count)
  288. ri_count = result.ri_count
  289. if result.should_break:
  290. cluster_start = i
  291. left_gcb = right_gcb
  292. return cluster_start
  293. def grapheme_boundary_before(unistr: str, pos: int) -> int:
  294. r"""
  295. Find the grapheme cluster boundary immediately before a position.
  296. :param unistr: The Unicode string to search.
  297. :param pos: Position in the string (0 < pos <= len(unistr)).
  298. :returns: Start index of the grapheme cluster containing the character at pos-1.
  299. Example::
  300. >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)
  301. 6
  302. >>> grapheme_boundary_before('a\r\nb', 3)
  303. 1
  304. .. versionadded:: 0.3.6
  305. """
  306. if pos <= 0:
  307. return 0
  308. return _find_cluster_start(unistr, min(pos, len(unistr)))
  309. def iter_graphemes_reverse(
  310. unistr: str,
  311. start: int = 0,
  312. end: int | None = None,
  313. ) -> Iterator[str]:
  314. r"""
  315. Iterate over grapheme clusters in reverse order (last to first).
  316. :param unistr: The Unicode string to segment.
  317. :param start: Starting index (default 0).
  318. :param end: Ending index (default len(unistr)).
  319. :yields: Grapheme cluster substrings in reverse order.
  320. Example::
  321. >>> list(iter_graphemes_reverse('cafe\u0301'))
  322. ['e\u0301', 'f', 'a', 'c']
  323. .. versionadded:: 0.3.6
  324. """
  325. if not unistr:
  326. return
  327. length = len(unistr)
  328. end = length if end is None else min(end, length)
  329. start = max(start, 0)
  330. if start >= end or start >= length:
  331. return
  332. pos = end
  333. while pos > start:
  334. cluster_start = _find_cluster_start(unistr, pos)
  335. # Don't yield partial graphemes that extend before start
  336. if cluster_start < start:
  337. break
  338. yield unistr[cluster_start:pos]
  339. pos = cluster_start