unicode_breaks.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. import os.path
  2. import urllib.request
  3. from itertools import zip_longest
  4. from .utils import toolkit
  5. from ..utils.cells import split_graphemes
  6. from ..utils.colors import GREEN, ORANGE, RED
# Directory (relative to the current working directory) holding the downloaded
# `emoji-test_<version>.txt` files and the `latest` marker file that records
# which concrete version "latest" resolved to.
CACHE = '.unicode_cache'
  8. def validate_unicode_breaks(uver=None, show_all=False, cache=True):
  9. # validate unicode grapheme clusters detection.
  10. # this downloads the specs directly from unicode.org and caches it locally.
  11. # document: https://unicode.org/reports/tr51/
  12. latest = f'{CACHE}/latest'
  13. if not uver and cache and os.path.exists(latest):
  14. with open(latest) as f:
  15. uver = f.read()
  16. print('using version "latest" as:', uver)
  17. file = f'{CACHE}/emoji-test_{uver}.txt'
  18. if cache and os.path.exists(file):
  19. print('loading cached:', file)
  20. with open(file) as f:
  21. data = f.read()
  22. else:
  23. url = f'https://www.unicode.org/Public/emoji/{uver or "latest"}/emoji-test.txt'
  24. print('downloading:', url)
  25. try:
  26. req = urllib.request.urlopen(url)
  27. except OSError as e:
  28. print(RED('Download error:'), e)
  29. return
  30. os.makedirs(os.path.dirname(file), exist_ok=True)
  31. if not uver:
  32. new_url = req.geturl()
  33. uver = new_url.split('/')[5]
  34. print('saving version "latest" as:', uver)
  35. with open(latest, 'w') as f:
  36. f.write(uver)
  37. file = f'{CACHE}/emoji-test_{uver}.txt'
  38. print('saving:', file)
  39. data = req.read().decode('utf8')
  40. with open(file, 'w') as f:
  41. f.write(data)
  42. def where():
  43. nonlocal groups
  44. if any(groups):
  45. print('\n'.join(g for g in groups if g))
  46. groups = [None, None]
  47. def expect(*chars):
  48. nonlocal errors, total
  49. text = ''.join(chars)
  50. actual = split_graphemes(text)
  51. error = actual != chars
  52. total += 1
  53. errors += error
  54. if error or show_all:
  55. where()
  56. codes = '|'.join((GREEN if a == c else RED)(
  57. ' '.join(hex(ord(c)).replace('0x', '') for c in a) if a else '-'
  58. ) for a, c in zip_longest(actual, chars))
  59. small_name = name.replace(' skin tone', '').replace(' hair', '')
  60. small_status = ''.join(x[0] for x in status.split('-'))
  61. a_len, c_len = len(actual), len(chars)
  62. size = f'{GREEN(a_len)} ==' if a_len == c_len else f'{RED(a_len)} !='
  63. print(f' {char} {text.replace(char, "X"):>3}: {size} {c_len} -> '
  64. f'|{codes}| {ORANGE(small_status)} {small_name}')
  65. groups, total, errors = [None, None], 0, 0
  66. for line in filter(None, data.splitlines()):
  67. if line.startswith('#'):
  68. if line.startswith('# group:'):
  69. groups[0] = line.split()[-1]
  70. elif line.startswith('# subgroup:'):
  71. groups[1] = f' - {line.split()[-1]}'
  72. continue
  73. p1, p2 = (p.split() for p in line.split(';'))
  74. status, name = p2[0], ' '.join(p2[4:])
  75. char = ''.join(chr(int(x, 16)) for x in p1)
  76. expect(char)
  77. expect(char, char)
  78. expect('a', char, 'a')
  79. expect('a', 'a', char)
  80. expect(char, 'a', 'a')
  81. print(f'\nerrors : {errors / total:6.2%} [{errors}/{total}]')
  82. print(f'successes: {1 - errors / total:6.2%} [{total - errors}/{total}]')
  83. def find_groups(data, max_diff):
  84. """Group some numbers with a maximum difference between them.
  85. I've used to try to fix the current grapheme break error.
  86. Using version unicode 13.1:
  87. Component
  88. - skin-tone
  89. 🏻 XX: 1 != 2 -> |1f3fb 1f3fb|-| c light
  90. 🏻 aXa: 2 != 3 -> |61 1f3fb|61|-| c light
  91. 🏻 aaX: 2 != 3 -> |61|61 1f3fb|-| c light
  92. 🏼 XX: 1 != 2 -> |1f3fc 1f3fc|-| c medium-light
  93. 🏼 aXa: 2 != 3 -> |61 1f3fc|61|-| c medium-light
  94. 🏼 aaX: 2 != 3 -> |61|61 1f3fc|-| c medium-light
  95. 🏽 XX: 1 != 2 -> |1f3fd 1f3fd|-| c medium
  96. 🏽 aXa: 2 != 3 -> |61 1f3fd|61|-| c medium
  97. 🏽 aaX: 2 != 3 -> |61|61 1f3fd|-| c medium
  98. 🏾 XX: 1 != 2 -> |1f3fe 1f3fe|-| c medium-dark
  99. 🏾 aXa: 2 != 3 -> |61 1f3fe|61|-| c medium-dark
  100. 🏾 aaX: 2 != 3 -> |61|61 1f3fe|-| c medium-dark
  101. 🏿 XX: 1 != 2 -> |1f3ff 1f3ff|-| c dark
  102. 🏿 aXa: 2 != 3 -> |61 1f3ff|61|-| c dark
  103. 🏿 aaX: 2 != 3 -> |61|61 1f3ff|-| c dark
  104. The codepoints that do accept a skin tone are:
  105. 0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D, 0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4,
  106. 0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC, 0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449,
  107. 0x1F44A, 0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F, 0x1F450, 0x1F466, 0x1F467, 0x1F468,
  108. 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E, 0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474,
  109. 0x1F475, 0x1F476, 0x1F477, 0x1F478, 0x1F47C, 0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486,
  110. 0x1F487, 0x1F4AA, 0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596, 0x1F645, 0x1F646,
  111. 0x1F647, 0x1F64B, 0x1F64C, 0x1F64D, 0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6,
  112. 0x1F6C0, 0x1F6CC, 0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91E,
  113. 0x1F91F, 0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935, 0x1F936, 0x1F937,
  114. 0x1F938, 0x1F939, 0x1F93D, 0x1F93E, 0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB,
  115. 0x1F9CD, 0x1F9CE, 0x1F9CF, 0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
  116. 0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD
  117. """
  118. it = iter(sorted(data))
  119. last_item = next(it)
  120. current_group = [last_item]
  121. result = [current_group]
  122. for i in it:
  123. if i - last_item > max_diff:
  124. current_group = []
  125. result.append(current_group)
  126. current_group.append(i)
  127. last_item = i
  128. print('\n'.join(f'{len(g)}:|' + ' '.join(hex(x).replace('0x', '') for x in g) + '|'
  129. for g in result))
  130. if __name__ == '__main__':
  131. parser, run = toolkit('Tests the grapheme break implementation against some unicode version.')
  132. parser.add_argument('uver', type=float, nargs='?', help='the unicode version to be used')
  133. parser.add_argument('--all', dest='show_all', action='store_true',
  134. help='shows the correct cases, in addition to the wrong ones')
  135. parser.add_argument('--no-cache', dest='cache', action='store_false',
  136. help='ignores the cache and re-downloads the spec')
  137. run(validate_unicode_breaks)