# grapheme_property_group.py
  1. import json
  2. import os
  3. import string
  4. from enum import Enum
  5. class GraphemePropertyGroup(Enum):
  6. PREPEND = "Prepend"
  7. CR = "CR"
  8. LF = "LF"
  9. CONTROL = "Control"
  10. EXTEND = "Extend"
  11. REGIONAL_INDICATOR = "Regional_Indicator"
  12. SPACING_MARK = "SpacingMark"
  13. L = "L"
  14. V = "V"
  15. T = "T"
  16. LV = "LV"
  17. LVT = "LVT"
  18. ZWJ = "ZWJ"
  19. EXTENDED_PICTOGRAPHIC = "Extended_Pictographic"
  20. OTHER = "Other"
  21. COMMON_OTHER_GROUP_CHARS = ""
  22. class ContainerNode:
  23. """
  24. Simple implementation of interval based BTree with no support for deletion.
  25. """
  26. def __init__(self, children) -> None:
  27. self.children = self._sorted(children)
  28. self._set_min_max()
  29. def _set_min_max(self):
  30. self.min = self.children[0].min
  31. self.max = self.children[-1].max
  32. # Adds an item to the node or it's subnodes. Returns a new node if this node is split, or None.
  33. def add(self, item):
  34. for child in self.children:
  35. if child.min <= item.min <= child.max:
  36. assert child.min <= item.max <= child.max
  37. new_child = child.add(item)
  38. if new_child:
  39. return self._add_child(new_child)
  40. else:
  41. self._set_min_max()
  42. return None
  43. return self._add_child(item)
  44. def get_value(self, key):
  45. for child in self.children:
  46. if child.min <= key <= child.max:
  47. return child.get_value(key)
  48. return None
  49. def _add_child(self, child):
  50. self.children.append(child)
  51. self.children = self._sorted(self.children)
  52. other = None
  53. if len(self.children) >= 4:
  54. other = ContainerNode(self.children[2:])
  55. self.children = self.children[0:2]
  56. self._set_min_max()
  57. return other
  58. def _sorted(self, children):
  59. return sorted(children, key=lambda c: c.min)
  60. class LeafNode:
  61. def __init__(self, range_min, range_max, group):
  62. self.min = range_min
  63. self.max = range_max
  64. self.group = group
  65. # Assumes range check has already been done
  66. def get_value(self, _):
  67. return self.group
  68. SINGLE_CHAR_MAPPINGS = {}
  69. RANGE_TREE = ContainerNode([LeafNode(0, 0, None)])
  70. def get_group(char: str):
  71. if char in COMMON_OTHER_GROUP_CHARS:
  72. return GraphemePropertyGroup.OTHER
  73. else:
  74. return get_group_ord(ord(char))
  75. def get_group_ord(char: int):
  76. group = SINGLE_CHAR_MAPPINGS.get(char, None)
  77. if group:
  78. return group
  79. return RANGE_TREE.get_value(char) or GraphemePropertyGroup.OTHER
  80. def load_file(filename, enumgroup):
  81. with open(os.path.join(os.path.dirname(__file__), filename)) as f:
  82. data = json.load(f)
  83. assert len(data) == len(enumgroup) - 1
  84. single_char_mappings = {}
  85. for key, value in data.items():
  86. group = enumgroup(key)
  87. for char in value["single_chars"]:
  88. single_char_mappings[char] = group
  89. range_tree = None
  90. for key, value in data.items():
  91. for range_ in value["ranges"]:
  92. min_ = range_[0]
  93. max_ = range_[1]
  94. group = enumgroup(key)
  95. if max_ - min_ < 20:
  96. for i in range(min_, max_ + 1):
  97. single_char_mappings[i] = group
  98. continue
  99. new_node = LeafNode(min_, max_, group)
  100. if range_tree:
  101. new_subtree = range_tree.add(new_node)
  102. if new_subtree:
  103. range_tree = ContainerNode([range_tree, new_subtree])
  104. else:
  105. range_tree = ContainerNode([new_node])
  106. del data
  107. common_ascii = string.ascii_letters + string.digits + string.punctuation
  108. common_other_group_chars = "".join(
  109. c for c in common_ascii if get_group_ord(ord(c)) == GraphemePropertyGroup.OTHER
  110. )
  111. return single_char_mappings, range_tree, common_other_group_chars
  112. SINGLE_CHAR_MAPPINGS, RANGE_TREE, COMMON_OTHER_GROUP_CHARS = load_file(
  113. "data/grapheme_break_property.json", GraphemePropertyGroup
  114. )