api.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. from grapheme.finder import GraphemeIterator, get_last_certain_break_index
# Unicode version string exposed by this module.
# NOTE(review): presumably the version of the Unicode standard whose grapheme
# break rules the finder implements -- confirm against grapheme.finder.
UNICODE_VERSION = "16.0.0"
  3. def graphemes(string):
  4. """
  5. Returns an iterator of all graphemes of given string.
  6. >>> rainbow_flag = "🏳️‍🌈"
  7. >>> [codepoint for codepoint in rainbow_flag]
  8. ['🏳', '️', '\u200d', '🌈']
  9. >>> list(grapheme.graphemes("multi codepoint: " + rainbow_flag))
  10. ['m', 'u', 'l', 't', 'i', ' ', 'c', 'o', 'd', 'e', 'p', 'o', 'i', 'n', 't', ':', ' ', '🏳️‍🌈']
  11. """
  12. return iter(GraphemeIterator(string))
  13. def length(string, until=None):
  14. """
  15. Returns the number of graphemes in the string.
  16. Note that this functions needs to traverse the full string to calculate the length,
  17. unlike `len(string)` and it's time consumption is linear to the length of the string
  18. (up to the `until` value).
  19. Only counts up to the `until` argument, if given. This is useful when testing
  20. the length of a string against some limit and the excess length is not interesting.
  21. >>> rainbow_flag = "🏳️‍🌈"
  22. >>> len(rainbow_flag)
  23. 4
  24. >>> graphemes.length(rainbow_flag)
  25. 1
  26. >>> graphemes.length("".join(str(i) for i in range(100)), 30)
  27. 30
  28. """
  29. if until is None:
  30. return sum(1 for _ in GraphemeIterator(string))
  31. iterator = graphemes(string)
  32. count = 0
  33. while True:
  34. try:
  35. if count >= until:
  36. break
  37. next(iterator)
  38. except StopIteration:
  39. break
  40. else:
  41. count += 1
  42. return count
  43. # TODO: should probably use an optimized iterator that only deals with code point counts
  44. def grapheme_lengths(string):
  45. """
  46. Returns an iterator of number of code points in each grapheme of the string.
  47. """
  48. return iter(len(g) for g in graphemes(string))
  49. def slice(string, start=None, end=None):
  50. """
  51. Returns a substring of the given string, counting graphemes instead of codepoints.
  52. Negative indices is currently not supported.
  53. >>> string = "tamil நி (ni)"
  54. >>> string[:7]
  55. 'tamil ந'
  56. >>> grapheme.slice(string, end=7)
  57. 'tamil நி'
  58. >>> string[7:]
  59. 'ி (ni)'
  60. >>> grapheme.slice(string, 7)
  61. ' (ni)'
  62. """
  63. if start is None:
  64. start = 0
  65. if end is not None and start >= end:
  66. return ""
  67. if start < 0:
  68. raise NotImplementedError("Negative indexing is currently not supported.")
  69. sum_ = 0
  70. start_index = None
  71. for grapheme_index, grapheme_length in enumerate(grapheme_lengths(string)):
  72. if grapheme_index == start:
  73. start_index = sum_
  74. elif grapheme_index == end:
  75. return string[start_index:sum_]
  76. sum_ += grapheme_length
  77. if start_index is not None:
  78. return string[start_index:]
  79. return ""
  80. def contains(string, substring):
  81. """
  82. Returns true if the sequence of graphemes in substring is also present in string.
  83. This differs from the normal python `in` operator, since the python operator will return
  84. true if the sequence of codepoints are withing the other string without considering
  85. grapheme boundaries.
  86. Performance notes: Very fast if `substring not in string`, since that also means that
  87. the same graphemes can not be in the two strings. Otherwise this function has linear time
  88. complexity in relation to the string length. It will traverse the sequence of graphemes until
  89. a match is found, so it will generally perform better for grapheme sequences that match early.
  90. >>> "🇸🇪" in "🇪🇸🇪🇪"
  91. True
  92. >>> grapheme.contains("🇪🇸🇪🇪", "🇸🇪")
  93. False
  94. """
  95. if substring not in string:
  96. return False
  97. substr_graphemes = list(graphemes(substring))
  98. if len(substr_graphemes) == 0:
  99. return True
  100. elif len(substr_graphemes) == 1:
  101. return substr_graphemes[0] in graphemes(string)
  102. else:
  103. str_iter = graphemes(string)
  104. str_sub_part = []
  105. for _ in range(len(substr_graphemes)):
  106. try:
  107. str_sub_part.append(next(str_iter))
  108. except StopIteration:
  109. return False
  110. for g in str_iter:
  111. if str_sub_part == substr_graphemes:
  112. return True
  113. str_sub_part.append(g)
  114. str_sub_part.pop(0)
  115. return str_sub_part == substr_graphemes
  116. def startswith(string, prefix):
  117. """
  118. Like str.startswith, but also checks that the string starts with the given prefixes sequence of
  119. graphemes.
  120. str.startswith may return true for a prefix that is not visually represented as a prefix if a
  121. grapheme cluster is continued after the prefix ends.
  122. >>> grapheme.startswith("✊🏾", "✊")
  123. False
  124. >>> "✊🏾".startswith("✊")
  125. True
  126. """
  127. return string.startswith(prefix) and safe_split_index(string, len(prefix)) == len(prefix)
  128. def endswith(string, suffix):
  129. """
  130. Like str.endswith, but also checks that the string endswith the given prefixes sequence of
  131. graphemes.
  132. str.endswith may return true for a suffix that is not visually represented as a suffix if a
  133. grapheme cluster is initiated before the suffix starts.
  134. >>> grapheme.endswith("🏳️‍🌈", "🌈")
  135. False
  136. >>> "🏳️‍🌈".endswith("🌈")
  137. True
  138. """
  139. expected_index = len(string) - len(suffix)
  140. return string.endswith(suffix) and safe_split_index(string, expected_index) == expected_index
  141. def safe_split_index(string, max_len):
  142. """
  143. Returns the highest index up to `max_len` at which the given string can be sliced,
  144. without breaking a grapheme.
  145. This is useful for when you want to split or take a substring from a string,
  146. and don't really care about
  147. the exact grapheme length, but don't want to risk breaking existing graphemes.
  148. This function does normally not traverse the full grapheme sequence up to the given length,
  149. so it can be used for arbitrarily long strings and high `max_len`.
  150. However, some grapheme boundaries depend on the previous state,
  151. so the worst case performance is O(n). In practice, it's only very long non-broken sequences
  152. of country flags (represented as Regional Indicators) that will perform badly.
  153. The return value will always be between `0` and `len(string)`.
  154. >>> string = "tamil நி (ni)"
  155. >>> i = grapheme.safe_split_index(string, 7)
  156. >>> i
  157. 6
  158. >>> string[:i]
  159. 'tamil '
  160. >>> string[i:]
  161. 'நி (ni)'
  162. """
  163. last_index = get_last_certain_break_index(string, max_len)
  164. for i in grapheme_lengths(string[last_index:]):
  165. if last_index + i > max_len:
  166. break
  167. last_index += i
  168. return last_index