finder.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. from enum import Enum
  2. from grapheme.grapheme_property_group import GraphemePropertyGroup as GraphGroup
  3. from grapheme.grapheme_property_group import get_group
  4. from grapheme.incb_property_group import InCBPropertyGroup as InCBGroup
  5. from grapheme.incb_property_group import get_group as get_group_incb
  6. class BreakPossibility(Enum):
  7. CERTAIN = "certain"
  8. POSSIBLE = "possible"
  9. NO_BREAK = "nobreak"
  10. def get_break_possibility(a, b):
  11. # Probably most common, included as short circuit before checking all else
  12. if a is GraphGroup.OTHER and b is GraphGroup.OTHER:
  13. return BreakPossibility.CERTAIN
  14. assert isinstance(a, GraphGroup)
  15. assert isinstance(b, GraphGroup)
  16. # Only break if preceeded by an uneven number of REGIONAL_INDICATORS
  17. # sot (RI RI)* RI × RI
  18. # [^RI] (RI RI) * RI × RI
  19. if a is GraphGroup.REGIONAL_INDICATOR and b is GraphGroup.REGIONAL_INDICATOR:
  20. return BreakPossibility.POSSIBLE
  21. # (Control | CR | LF) ÷
  22. # ÷ (Control | CR | LF)
  23. if a in [GraphGroup.CONTROL, GraphGroup.CR, GraphGroup.LF] or b in [
  24. GraphGroup.CONTROL,
  25. GraphGroup.CR,
  26. GraphGroup.LF,
  27. ]:
  28. # CR × LF
  29. if a is GraphGroup.CR and b is GraphGroup.LF:
  30. return BreakPossibility.NO_BREAK
  31. else:
  32. return BreakPossibility.CERTAIN
  33. # L × (L | V | LV | LVT)
  34. if a is GraphGroup.L and b in [GraphGroup.L, GraphGroup.V, GraphGroup.LV, GraphGroup.LVT]:
  35. return BreakPossibility.NO_BREAK
  36. # (LV | V) × (V | T)
  37. if a in [GraphGroup.LV, GraphGroup.V] and b in [GraphGroup.V, GraphGroup.T]:
  38. return BreakPossibility.NO_BREAK
  39. # (LVT | T) × T
  40. if a in [GraphGroup.LVT, GraphGroup.T] and b is GraphGroup.T:
  41. return BreakPossibility.NO_BREAK
  42. # × (Extend | ZWJ)
  43. # × SpacingMark
  44. # Prepend ×
  45. if b in [GraphGroup.EXTEND, GraphGroup.ZWJ, GraphGroup.SPACING_MARK] or a is GraphGroup.PREPEND:
  46. return BreakPossibility.NO_BREAK
  47. # \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
  48. if a is GraphGroup.ZWJ and b is GraphGroup.EXTENDED_PICTOGRAPHIC:
  49. return BreakPossibility.POSSIBLE
  50. # everything else, assumes all other rules are included above
  51. return BreakPossibility.CERTAIN
  52. def get_break_possibility_incb(a, b):
  53. # Probably most common, included as short circuit before checking all else
  54. if a is InCBGroup.OTHER and b is InCBGroup.OTHER:
  55. return BreakPossibility.CERTAIN
  56. if a in [InCBGroup.LINKER, InCBGroup.EXTEND] and b is InCBGroup.CONSONANT:
  57. return BreakPossibility.NO_BREAK
  58. if a in [InCBGroup.LINKER, InCBGroup.EXTEND, InCBGroup.CONSONANT] and b is InCBGroup.LINKER:
  59. return BreakPossibility.NO_BREAK
  60. assert isinstance(a, InCBGroup)
  61. assert isinstance(b, InCBGroup)
  62. # everything else, assumes all other rules are included above
  63. return BreakPossibility.POSSIBLE
  64. def get_last_certain_break_index(string, index):
  65. if index >= len(string):
  66. return len(string)
  67. prev = get_group(string[index])
  68. prev_incb = get_group_incb(string[index])
  69. while True:
  70. if index <= 0:
  71. return 0
  72. index -= 1
  73. cur = get_group(string[index])
  74. cur_incb = get_group_incb(string[index])
  75. if (
  76. get_break_possibility(cur, prev) == BreakPossibility.CERTAIN
  77. and get_break_possibility_incb(cur_incb, prev_incb) != BreakPossibility.NO_BREAK
  78. ):
  79. return index + 1
  80. prev = cur
  81. prev_incb = cur_incb
  82. class UState(Enum):
  83. DEFAULT = 0 # No special case
  84. GB9c_Consonant = 10
  85. GB9c_Extend = 11
  86. GB9c_Linker = 12
  87. GB11_Picto = 20
  88. GB11_ZWJ = 21
  89. GB12_First = 30
  90. GB12_Second = 31
  91. class GraphemeIterator:
  92. def __init__(self, string: str):
  93. self.str_iter = iter(string)
  94. self.buffer = ""
  95. self.lastg = None
  96. self.state = UState.DEFAULT
  97. try:
  98. self.buffer = next(self.str_iter)
  99. except StopIteration:
  100. self.buffer = None
  101. else:
  102. lastg = get_group(self.buffer)
  103. self.lastg = lastg
  104. if lastg is GraphGroup.EXTENDED_PICTOGRAPHIC:
  105. self.state = UState.GB11_Picto
  106. elif lastg is GraphGroup.REGIONAL_INDICATOR:
  107. self.state = UState.GB12_First
  108. else:
  109. lastincb = get_group_incb(self.buffer)
  110. if lastincb is InCBGroup.CONSONANT:
  111. self.state = UState.GB9c_Consonant
  112. def __iter__(self):
  113. return self
  114. def default_should_break(self, nextg, nextincb):
  115. should_break = None
  116. next_state = UState.DEFAULT
  117. # First the most common
  118. if (
  119. self.lastg is GraphGroup.OTHER
  120. and nextg is GraphGroup.OTHER
  121. and nextincb is InCBGroup.OTHER
  122. ):
  123. # GB999 Any ÷ Any
  124. # Otherwise, break everywhere
  125. return True, UState.DEFAULT
  126. elif self.lastg is GraphGroup.CR and nextg is GraphGroup.LF:
  127. # GB3 CR × LF
  128. # Do not break between a CR and LF
  129. should_break = False
  130. elif self.lastg in (GraphGroup.CONTROL, GraphGroup.CR, GraphGroup.LF):
  131. # GB4 (Control | CR | LF) ÷
  132. # break before and after controls
  133. should_break = True
  134. elif nextg in (GraphGroup.CONTROL, GraphGroup.CR, GraphGroup.LF):
  135. # GB5 ÷ (Control | CR | LF)
  136. # break before and after controls.
  137. should_break = True
  138. elif self.lastg is GraphGroup.L and nextg in (
  139. GraphGroup.L,
  140. GraphGroup.V,
  141. GraphGroup.LV,
  142. GraphGroup.LVT,
  143. ):
  144. # GB6 L × (L | V | LV | LVT)
  145. # Do not break Hangul syllable or other conjoining sequences.
  146. should_break = False
  147. elif self.lastg in (GraphGroup.LV, GraphGroup.V) and nextg in (GraphGroup.V, GraphGroup.T):
  148. # GB7 (LV | V) × (V | T)
  149. # Do not break Hangul syllable or other conjoining sequences.
  150. should_break = False
  151. elif self.lastg in (GraphGroup.LVT, GraphGroup.T) and nextg is GraphGroup.T:
  152. # GB8 (LVT | T) × T
  153. # Do not break Hangul syllable or other conjoining sequences.
  154. should_break = False
  155. elif nextg in (GraphGroup.EXTEND, GraphGroup.ZWJ, GraphGroup.SPACING_MARK):
  156. # GB9 × (Extend | ZWJ)
  157. # Do not break before extending characters or ZWJ.
  158. # GB9a × SpacingMark
  159. # Do not break before SpacingMarks
  160. should_break = False
  161. elif self.lastg is GraphGroup.PREPEND:
  162. # GB9b Prepend ×
  163. # Do not break after Prepend characters
  164. should_break = False
  165. # Next State
  166. if nextg is GraphGroup.OTHER and nextincb is InCBGroup.OTHER:
  167. pass
  168. elif nextg is GraphGroup.EXTENDED_PICTOGRAPHIC:
  169. next_state = UState.GB11_Picto
  170. elif nextg is GraphGroup.REGIONAL_INDICATOR:
  171. next_state = UState.GB12_First
  172. elif nextincb is InCBGroup.CONSONANT:
  173. next_state = UState.GB9c_Consonant
  174. return should_break, next_state
  175. def __next__(self):
  176. for codepoint in self.str_iter:
  177. nextg = get_group(codepoint)
  178. next_inbc = get_group_incb(codepoint)
  179. sb, next_state = self.default_should_break(nextg, next_inbc)
  180. if self.state is UState.DEFAULT:
  181. pass
  182. # GB11
  183. elif self.state is UState.GB11_Picto:
  184. if nextg is GraphGroup.EXTEND:
  185. next_state = UState.GB11_Picto
  186. sb = False
  187. elif nextg is GraphGroup.ZWJ:
  188. next_state = UState.GB11_ZWJ
  189. sb = False
  190. elif self.state is UState.GB11_ZWJ and nextg is GraphGroup.EXTENDED_PICTOGRAPHIC:
  191. next_state = UState.DEFAULT
  192. sb = False
  193. # GB12 sot (RI RI)* RI × RI
  194. # GB13 [^RI] (RI RI)* RI × RI
  195. # Do not break within emoji flag sequences.
  196. # That is, do not break between regional indicator (RI) symbols
  197. # if there is an odd number of RI characters before the break point.
  198. elif (
  199. self.state in (UState.GB12_First, UState.GB12_Second)
  200. and nextg is GraphGroup.REGIONAL_INDICATOR
  201. ):
  202. sb = self.state is not UState.GB12_First
  203. next_state = (
  204. UState.GB12_First if self.state is UState.GB12_Second else UState.GB12_Second
  205. )
  206. # GB9c Consonant [ Extend Linker ]* Linker [ Extend Linker ]* × Consonant
  207. elif self.state is UState.GB9c_Consonant:
  208. if next_inbc is InCBGroup.EXTEND:
  209. sb = False
  210. next_state = UState.GB9c_Consonant
  211. elif next_inbc is InCBGroup.LINKER:
  212. sb = False
  213. next_state = UState.GB9c_Linker
  214. elif self.state is UState.GB9c_Linker:
  215. if next_inbc is InCBGroup.LINKER:
  216. sb = False
  217. next_state = UState.GB9c_Linker
  218. elif next_inbc is InCBGroup.CONSONANT:
  219. sb = False
  220. next_state = UState.GB9c_Consonant
  221. elif next_inbc is InCBGroup.EXTEND:
  222. sb = False
  223. next_state = UState.GB9c_Linker
  224. # Handle results
  225. self.state = next_state
  226. self.lastg = nextg
  227. if sb is True or sb is None:
  228. return self._break(codepoint)
  229. self.buffer += codepoint # type: ignore
  230. if self.buffer:
  231. # GB2 Any ÷ eot
  232. # Break at the end of text, unless the text is empty.
  233. return self._break(None)
  234. raise StopIteration()
  235. def _break(self, new):
  236. """Return the current buffer, start with a new one"""
  237. old_buffer = self.buffer
  238. self.buffer = new
  239. return old_buffer