| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- import os.path
- import urllib.request
- from itertools import zip_longest
- from .utils import toolkit
- from ..utils.cells import split_graphemes
- from ..utils.colors import GREEN, ORANGE, RED
- CACHE = '.unicode_cache'
def validate_unicode_breaks(uver=None, show_all=False, cache=True):
    """Validate grapheme cluster detection against Unicode's ``emoji-test.txt``.

    The spec is downloaded directly from unicode.org and cached locally under
    ``CACHE``. Document: https://unicode.org/reports/tr51/

    Every emoji sequence listed in the file is checked alone, doubled, and mixed
    with the letter ``a`` in three positions; the output of ``split_graphemes`` is
    compared with the clusters that were joined to build the text.

    Args:
        uver: the emoji version to test. When falsy, the cached "latest" marker is
            used if it exists (and `cache` is true); otherwise the "latest" URL is
            requested and the version is taken from the final URL of the response.
        show_all: also print the cases that passed, not only the failing ones.
        cache: when false, ignore any cached files and download the spec again.

    Returns:
        None. The report is printed; a download failure is printed and aborts.
    """
    latest = f'{CACHE}/latest'
    if not uver and cache and os.path.exists(latest):
        with open(latest, encoding='utf8') as f:
            # strip so a trailing newline (e.g. hand-edited marker) can't leak into the url.
            uver = f.read().strip()
        print('using version "latest" as:', uver)

    file = f'{CACHE}/emoji-test_{uver}.txt'
    if cache and os.path.exists(file):
        print('loading cached:', file)
        # the cache is written as utf8 below, so read it back the same way.
        with open(file, encoding='utf8') as f:
            data = f.read()
    else:
        url = f'https://www.unicode.org/Public/emoji/{uver or "latest"}/emoji-test.txt'
        print('downloading:', url)
        try:
            response = urllib.request.urlopen(url)
        except OSError as e:
            print(RED('Download error:'), e)
            return

        # the response is a context manager; this guarantees the connection is closed.
        with response:
            final_url = response.geturl()
            data = response.read().decode('utf8')

        os.makedirs(os.path.dirname(file), exist_ok=True)
        if not uver:
            # the sixth '/'-separated piece of the final url is the version directory.
            uver = final_url.split('/')[5]
            print('saving version "latest" as:', uver)
            with open(latest, 'w', encoding='utf8') as f:
                f.write(uver)
            file = f'{CACHE}/emoji-test_{uver}.txt'

        print('saving:', file)
        # explicit utf8: the platform default encoding may not be able to store emoji.
        with open(file, 'w', encoding='utf8') as f:
            f.write(data)

    # pending [group, subgroup] headers, plus the counters updated by `expect`.
    groups, total, errors = [None, None], 0, 0

    def where():
        # print the pending group/subgroup headers once, then clear them.
        nonlocal groups
        if any(groups):
            print('\n'.join(g for g in groups if g))
            groups = [None, None]

    def expect(*chars):
        # join the given clusters, split them back, and report any difference.
        # reads `char`, `name` and `status` from the enclosing loop below.
        nonlocal errors, total
        text = ''.join(chars)
        actual = split_graphemes(text)
        # NOTE(review): `chars` is a tuple, so this assumes split_graphemes also
        # returns a tuple -- confirm against its implementation.
        error = actual != chars
        total += 1
        errors += error
        if not (error or show_all):
            return

        where()
        # one cell per cluster, as hex codepoints: green when it matches, red otherwise.
        codes = '|'.join(
            (GREEN if got == want else RED)(
                ' '.join(hex(ord(cp)).replace('0x', '') for cp in got) if got else '-'
            )
            for got, want in zip_longest(actual, chars)
        )
        small_name = name.replace(' skin tone', '').replace(' hair', '')
        # abbreviate the status to the initials of its dash-separated words.
        small_status = ''.join(x[0] for x in status.split('-'))
        a_len, c_len = len(actual), len(chars)
        size = f'{GREEN(a_len)} ==' if a_len == c_len else f'{RED(a_len)} !='
        print(f' {char} {text.replace(char, "X"):>3}: {size} {c_len} -> '
              f'|{codes}| {ORANGE(small_status)} {small_name}')

    for line in filter(None, data.splitlines()):
        if line.startswith('#'):
            # comment lines only matter when they announce a new group or subgroup.
            if line.startswith('# group:'):
                groups[0] = line.split()[-1]
            elif line.startswith('# subgroup:'):
                groups[1] = f' - {line.split()[-1]}'
            continue

        # data lines look like: "<codepoints> ; <status> # <emoji> <version> <name>".
        p1, p2 = (p.split() for p in line.split(';'))
        status, name = p2[0], ' '.join(p2[4:])
        char = ''.join(chr(int(x, 16)) for x in p1)
        expect(char)
        expect(char, char)
        expect('a', char, 'a')
        expect('a', 'a', char)
        expect(char, 'a', 'a')

    if not total:
        # nothing was tested, so the percentages below would divide by zero.
        print('\nno test cases found.')
        return

    print(f'\nerrors : {errors / total:6.2%} [{errors}/{total}]')
    print(f'successes: {1 - errors / total:6.2%} [{total - errors}/{total}]')
def find_groups(data, max_diff):
    """Group some numbers with a maximum difference between them.

    The numbers are sorted, and a new group is started whenever the gap to the
    previous number is greater than `max_diff`. Each group is printed on its own
    line as ``<size>:|<hex values separated by spaces>|``. Empty input prints
    nothing.

    Args:
        data: an iterable of integers (e.g. codepoints).
        max_diff: the largest gap allowed between neighbours of the same group.

    Returns:
        None, the groups are only printed.

    I've used this to try to fix the current grapheme break error.
    Using unicode version 13.1:

    Component
     - skin-tone
     🏻 XX: 1 != 2 -> |1f3fb 1f3fb|-| c light
     🏻 aXa: 2 != 3 -> |61 1f3fb|61|-| c light
     🏻 aaX: 2 != 3 -> |61|61 1f3fb|-| c light
     🏼 XX: 1 != 2 -> |1f3fc 1f3fc|-| c medium-light
     🏼 aXa: 2 != 3 -> |61 1f3fc|61|-| c medium-light
     🏼 aaX: 2 != 3 -> |61|61 1f3fc|-| c medium-light
     🏽 XX: 1 != 2 -> |1f3fd 1f3fd|-| c medium
     🏽 aXa: 2 != 3 -> |61 1f3fd|61|-| c medium
     🏽 aaX: 2 != 3 -> |61|61 1f3fd|-| c medium
     🏾 XX: 1 != 2 -> |1f3fe 1f3fe|-| c medium-dark
     🏾 aXa: 2 != 3 -> |61 1f3fe|61|-| c medium-dark
     🏾 aaX: 2 != 3 -> |61|61 1f3fe|-| c medium-dark
     🏿 XX: 1 != 2 -> |1f3ff 1f3ff|-| c dark
     🏿 aXa: 2 != 3 -> |61 1f3ff|61|-| c dark
     🏿 aaX: 2 != 3 -> |61|61 1f3ff|-| c dark

    The codepoints that do accept a skin tone are:
    0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D, 0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4,
    0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC, 0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449,
    0x1F44A, 0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F, 0x1F450, 0x1F466, 0x1F467, 0x1F468,
    0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E, 0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474,
    0x1F475, 0x1F476, 0x1F477, 0x1F478, 0x1F47C, 0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486,
    0x1F487, 0x1F4AA, 0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596, 0x1F645, 0x1F646,
    0x1F647, 0x1F64B, 0x1F64C, 0x1F64D, 0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6,
    0x1F6C0, 0x1F6CC, 0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91E,
    0x1F91F, 0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935, 0x1F936, 0x1F937,
    0x1F938, 0x1F939, 0x1F93D, 0x1F93E, 0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB,
    0x1F9CD, 0x1F9CE, 0x1F9CF, 0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
    0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD
    """
    ordered = sorted(data)
    if not ordered:
        # nothing to group; bail out instead of failing on the missing first item.
        return

    result = [[ordered[0]]]
    # walk neighbouring pairs: a gap wider than max_diff opens a new group.
    for previous, item in zip(ordered, ordered[1:]):
        if item - previous > max_diff:
            result.append([])
        result[-1].append(item)

    print('\n'.join(f'{len(g)}:|' + ' '.join(hex(x).replace('0x', '') for x in g) + '|'
                    for g in result))
if __name__ == '__main__':
    # command-line entry point: declare the options, then hand the validator to `run`.
    # NOTE(review): `run` presumably parses the arguments and calls the given function
    # with them -- confirm in `toolkit`.
    parser, run = toolkit(
        'Tests the grapheme break implementation against some unicode version.'
    )
    parser.add_argument(
        'uver',
        nargs='?',
        type=float,
        help='the unicode version to be used',
    )
    parser.add_argument(
        '--all',
        action='store_true',
        dest='show_all',
        help='shows the correct cases, in addition to the wrong ones',
    )
    parser.add_argument(
        '--no-cache',
        action='store_false',
        dest='cache',
        help='ignores the cache and re-downloads the spec',
    )
    run(validate_unicode_breaks)
|