| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272 |
- from enum import Enum
- from grapheme.grapheme_property_group import GraphemePropertyGroup as GraphGroup
- from grapheme.grapheme_property_group import get_group
- from grapheme.incb_property_group import InCBPropertyGroup as InCBGroup
- from grapheme.incb_property_group import get_group as get_group_incb
class BreakPossibility(Enum):
    """Verdict on whether a grapheme break may occur between two code points.

    The verdict is derived from the property groups of the two neighbouring
    code points only, so some rules cannot be fully decided.
    """

    # The pair alone is enough to know that a break occurs here.
    CERTAIN = "certain"
    # The pair alone is not enough; the answer depends on earlier context
    # (for example how many regional indicators precede the pair).
    POSSIBLE = "possible"
    # The pair alone is enough to know that no break occurs here.
    NO_BREAK = "nobreak"
def get_break_possibility(a, b):
    """Classify the boundary between two adjacent grapheme property groups.

    Args:
        a: ``GraphGroup`` of the code point before the candidate boundary.
        b: ``GraphGroup`` of the code point after the candidate boundary.

    Returns:
        ``BreakPossibility.CERTAIN`` when the pair always breaks,
        ``BreakPossibility.NO_BREAK`` when it never breaks, and
        ``BreakPossibility.POSSIBLE`` when the pair alone cannot decide
        (regional-indicator pairs and ZWJ followed by a pictograph).

    Raises:
        AssertionError: if either argument is not a ``GraphGroup`` member
            (only checked when assertions are enabled).
    """
    # Fast path for what is expected to be the most frequent pair.
    if a is GraphGroup.OTHER and b is GraphGroup.OTHER:
        return BreakPossibility.CERTAIN

    assert isinstance(a, GraphGroup)
    assert isinstance(b, GraphGroup)

    # sot (RI RI)* RI × RI
    # [^RI] (RI RI)* RI × RI
    # Whether this pair breaks depends on how many regional indicators
    # precede it, which cannot be known from the pair itself.
    if a is GraphGroup.REGIONAL_INDICATOR and b is GraphGroup.REGIONAL_INDICATOR:
        return BreakPossibility.POSSIBLE

    # (Control | CR | LF) ÷
    # ÷ (Control | CR | LF)
    # ... with the single exception CR × LF.
    hard_breakers = (GraphGroup.CONTROL, GraphGroup.CR, GraphGroup.LF)
    if a in hard_breakers or b in hard_breakers:
        is_crlf = a is GraphGroup.CR and b is GraphGroup.LF
        return BreakPossibility.NO_BREAK if is_crlf else BreakPossibility.CERTAIN

    # Hangul syllable sequences: for each left-hand group, the right-hand
    # groups it must stay attached to.
    #   L × (L | V | LV | LVT)
    #   (LV | V) × (V | T)
    #   (LVT | T) × T
    hangul_followers = {
        GraphGroup.L: (GraphGroup.L, GraphGroup.V, GraphGroup.LV, GraphGroup.LVT),
        GraphGroup.LV: (GraphGroup.V, GraphGroup.T),
        GraphGroup.V: (GraphGroup.V, GraphGroup.T),
        GraphGroup.LVT: (GraphGroup.T,),
        GraphGroup.T: (GraphGroup.T,),
    }
    if b in hangul_followers.get(a, ()):
        return BreakPossibility.NO_BREAK

    # Prepend ×
    # × (Extend | ZWJ)
    # × SpacingMark
    if a is GraphGroup.PREPEND or b in (
        GraphGroup.EXTEND,
        GraphGroup.ZWJ,
        GraphGroup.SPACING_MARK,
    ):
        return BreakPossibility.NO_BREAK

    # \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
    # Only the ZWJ is visible here, not what came before it.
    if a is GraphGroup.ZWJ and b is GraphGroup.EXTENDED_PICTOGRAPHIC:
        return BreakPossibility.POSSIBLE

    # No rule above applied: break.
    return BreakPossibility.CERTAIN
def get_break_possibility_incb(a, b):
    """Classify the boundary between two adjacent InCB property groups.

    Args:
        a: ``InCBGroup`` of the code point before the candidate boundary.
        b: ``InCBGroup`` of the code point after the candidate boundary.

    Returns:
        ``BreakPossibility.CERTAIN`` when both groups are ``OTHER``,
        ``BreakPossibility.NO_BREAK`` for a linker/extend followed by a
        consonant or for a linker/extend/consonant followed by a linker, and
        ``BreakPossibility.POSSIBLE`` for every remaining pair.

    Raises:
        AssertionError: if no rule matched and either argument is not an
            ``InCBGroup`` member (only checked when assertions are enabled).
    """
    # Fast path for what is expected to be the most frequent pair.
    if a is InCBGroup.OTHER and b is InCBGroup.OTHER:
        return BreakPossibility.CERTAIN

    glue_then_consonant = b is InCBGroup.CONSONANT and a in (
        InCBGroup.LINKER,
        InCBGroup.EXTEND,
    )
    anything_then_linker = b is InCBGroup.LINKER and a in (
        InCBGroup.LINKER,
        InCBGroup.EXTEND,
        InCBGroup.CONSONANT,
    )
    if glue_then_consonant or anything_then_linker:
        return BreakPossibility.NO_BREAK

    assert isinstance(a, InCBGroup)
    assert isinstance(b, InCBGroup)

    # Everything else cannot be decided from this pair alone.
    return BreakPossibility.POSSIBLE
def get_last_certain_break_index(string, index):
    """Return the closest certain break position at or before ``index``.

    Walks backwards from ``index`` and returns the first position where the
    grapheme rules report a certain break and the InCB rules do not forbid
    one.

    Args:
        string: The text to inspect.
        index: Position to start searching backwards from.

    Returns:
        ``len(string)`` when ``index`` is at or past the end of the string,
        ``0`` when ``index`` is zero or negative or no certain break is found,
        otherwise the index of the code point that starts right after the
        certain break.
    """
    length = len(string)
    if index >= length:
        return length
    # Decide this before touching string[index]: a negative index would
    # otherwise wrap around to the end of the string (or raise IndexError on
    # an empty string) even though the answer is always 0.
    if index <= 0:
        return 0

    prev = get_group(string[index])
    prev_incb = get_group_incb(string[index])
    for pos in range(index - 1, -1, -1):
        cur = get_group(string[pos])
        cur_incb = get_group_incb(string[pos])
        if (
            get_break_possibility(cur, prev) == BreakPossibility.CERTAIN
            and get_break_possibility_incb(cur_incb, prev_incb) != BreakPossibility.NO_BREAK
        ):
            # The break sits between pos and pos + 1.
            return pos + 1
        prev, prev_incb = cur, cur_incb

    # Reached the start of the text without finding a certain break.
    return 0
class UState(Enum):
    """Left-context states tracked while scanning for grapheme boundaries.

    Member names carry the number of the segmentation rule they serve.
    """

    # No special case is in progress.
    DEFAULT = 0

    # GB9c: Consonant [ Extend Linker ]* Linker [ Extend Linker ]* × Consonant
    GB9c_Consonant = 10  # a consonant has been seen, no linker yet
    GB9c_Extend = 11
    GB9c_Linker = 12  # a consonant and at least one linker have been seen

    # GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
    GB11_Picto = 20  # a pictograph, optionally followed by Extend
    GB11_ZWJ = 21  # ... followed by a ZWJ

    # GB12 / GB13: regional indicator pairs
    GB12_First = 30  # an odd number of regional indicators so far
    GB12_Second = 31  # an even number of regional indicators so far
class GraphemeIterator:
    """Iterate over a string one grapheme cluster at a time.

    Code points are pulled lazily from the input and collected in ``buffer``
    until a boundary is found, at which point the buffer is returned.

    Rules that only look at the two neighbouring code points are evaluated by
    ``default_should_break``. Rules that need more left context (GB9c, GB11,
    GB12/GB13) are resolved in ``__next__`` using ``state``.

    Attributes:
        str_iter: Iterator over the code points of the input string.
        buffer: Code points of the cluster being assembled, or ``None`` when
            nothing is pending.
        lastg: Grapheme property group of the last code point consumed.
        state: ``UState`` describing the left context seen so far.
    """

    def __init__(self, string: str):
        """Prime the iterator with the first code point of ``string``."""
        self.str_iter = iter(string)
        self.buffer = ""
        self.lastg = None
        self.state = UState.DEFAULT
        try:
            self.buffer = next(self.str_iter)
        except StopIteration:
            # Empty input: nothing will ever be yielded.
            self.buffer = None
        else:
            lastg = get_group(self.buffer)
            self.lastg = lastg
            # The first code point may already open a context-dependent rule.
            if lastg is GraphGroup.EXTENDED_PICTOGRAPHIC:
                self.state = UState.GB11_Picto
            elif lastg is GraphGroup.REGIONAL_INDICATOR:
                self.state = UState.GB12_First
            else:
                lastincb = get_group_incb(self.buffer)
                if lastincb is InCBGroup.CONSONANT:
                    self.state = UState.GB9c_Consonant

    def __iter__(self) -> "GraphemeIterator":
        return self

    def default_should_break(self, nextg, nextincb):
        """Apply the rules that only need the previous and next code point.

        Args:
            nextg: Grapheme property group of the incoming code point.
            nextincb: InCB property group of the incoming code point.

        Returns:
            A ``(should_break, next_state)`` tuple. ``should_break`` is
            ``True`` or ``False`` when a rule matched and ``None`` when no
            rule matched (the caller treats ``None`` as a break).
            ``next_state`` is the state the incoming code point puts the
            iterator in when no context-dependent rule overrides it.
        """
        should_break = None
        next_state = UState.DEFAULT
        lastg = self.lastg

        # First the most common
        if (
            lastg is GraphGroup.OTHER
            and nextg is GraphGroup.OTHER
            and nextincb is InCBGroup.OTHER
        ):
            # GB999 Any ÷ Any
            # Otherwise, break everywhere
            return True, UState.DEFAULT
        elif lastg is GraphGroup.CR and nextg is GraphGroup.LF:
            # GB3 CR × LF
            # Do not break between a CR and LF
            should_break = False
        elif lastg in (GraphGroup.CONTROL, GraphGroup.CR, GraphGroup.LF):
            # GB4 (Control | CR | LF) ÷
            # Break before and after controls
            should_break = True
        elif nextg in (GraphGroup.CONTROL, GraphGroup.CR, GraphGroup.LF):
            # GB5 ÷ (Control | CR | LF)
            # Break before and after controls
            should_break = True
        elif lastg is GraphGroup.L and nextg in (
            GraphGroup.L,
            GraphGroup.V,
            GraphGroup.LV,
            GraphGroup.LVT,
        ):
            # GB6 L × (L | V | LV | LVT)
            # Do not break Hangul syllable or other conjoining sequences.
            should_break = False
        elif lastg in (GraphGroup.LV, GraphGroup.V) and nextg in (GraphGroup.V, GraphGroup.T):
            # GB7 (LV | V) × (V | T)
            # Do not break Hangul syllable or other conjoining sequences.
            should_break = False
        elif lastg in (GraphGroup.LVT, GraphGroup.T) and nextg is GraphGroup.T:
            # GB8 (LVT | T) × T
            # Do not break Hangul syllable or other conjoining sequences.
            should_break = False
        elif nextg in (GraphGroup.EXTEND, GraphGroup.ZWJ, GraphGroup.SPACING_MARK):
            # GB9 × (Extend | ZWJ)
            # Do not break before extending characters or ZWJ.
            # GB9a × SpacingMark
            # Do not break before SpacingMarks
            should_break = False
        elif lastg is GraphGroup.PREPEND:
            # GB9b Prepend ×
            # Do not break after Prepend characters
            should_break = False

        # Next state: does the incoming code point open a contextual rule?
        if nextg is GraphGroup.OTHER and nextincb is InCBGroup.OTHER:
            pass
        elif nextg is GraphGroup.EXTENDED_PICTOGRAPHIC:
            next_state = UState.GB11_Picto
        elif nextg is GraphGroup.REGIONAL_INDICATOR:
            next_state = UState.GB12_First
        elif nextincb is InCBGroup.CONSONANT:
            next_state = UState.GB9c_Consonant
        return should_break, next_state

    def __next__(self) -> str:
        """Return the next grapheme cluster.

        Raises:
            StopIteration: when the input is exhausted.
        """
        for codepoint in self.str_iter:
            nextg = get_group(codepoint)
            next_incb = get_group_incb(codepoint)
            sb, next_state = self.default_should_break(nextg, next_incb)
            state = self.state

            if state is UState.DEFAULT:
                pass
            # GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
            elif state is UState.GB11_Picto:
                if nextg is GraphGroup.EXTEND:
                    next_state = UState.GB11_Picto
                    sb = False
                elif nextg is GraphGroup.ZWJ:
                    next_state = UState.GB11_ZWJ
                    sb = False
            elif state is UState.GB11_ZWJ and nextg is GraphGroup.EXTENDED_PICTOGRAPHIC:
                # The pictograph just joined can itself be followed by
                # Extend* ZWJ and another pictograph, so it must leave the
                # iterator in the pictograph state. Resetting to DEFAULT here
                # would split chained ZWJ sequences after the second ZWJ.
                next_state = UState.GB11_Picto
                sb = False
            # GB12 sot (RI RI)* RI × RI
            # GB13 [^RI] (RI RI)* RI × RI
            # Do not break within emoji flag sequences.
            # That is, do not break between regional indicator (RI) symbols
            # if there is an odd number of RI characters before the break point.
            elif (
                state in (UState.GB12_First, UState.GB12_Second)
                and nextg is GraphGroup.REGIONAL_INDICATOR
            ):
                sb = state is not UState.GB12_First
                next_state = (
                    UState.GB12_First if state is UState.GB12_Second else UState.GB12_Second
                )
            # GB9c Consonant [ Extend Linker ]* Linker [ Extend Linker ]* × Consonant
            elif state is UState.GB9c_Consonant:
                if next_incb is InCBGroup.EXTEND:
                    sb = False
                    next_state = UState.GB9c_Consonant
                elif next_incb is InCBGroup.LINKER:
                    sb = False
                    next_state = UState.GB9c_Linker
            elif state is UState.GB9c_Linker:
                if next_incb is InCBGroup.LINKER:
                    sb = False
                    next_state = UState.GB9c_Linker
                elif next_incb is InCBGroup.CONSONANT:
                    sb = False
                    next_state = UState.GB9c_Consonant
                elif next_incb is InCBGroup.EXTEND:
                    sb = False
                    next_state = UState.GB9c_Linker

            # Handle results
            self.state = next_state
            self.lastg = nextg
            # None means no rule matched, which is GB999: break.
            if sb is True or sb is None:
                return self._break(codepoint)
            self.buffer += codepoint  # type: ignore

        if self.buffer:
            # GB2 Any ÷ eot
            # Break at the end of text, unless the text is empty.
            return self._break(None)
        raise StopIteration()

    def _break(self, new):
        """Return the current buffer and start a new one with ``new``."""
        old_buffer = self.buffer
        self.buffer = new
        return old_buffer
|