# yichael / AndroidRemoteController
							import os.path
import urllib.request
from itertools import zip_longest

from .utils import toolkit
from ..utils.cells import split_graphemes
from ..utils.colors import GREEN, ORANGE, RED

CACHE = '.unicode_cache'


def validate_unicode_breaks(uver=None, show_all=False, cache=True):
    """Validate the grapheme cluster detection against the unicode emoji test data.

    The `emoji-test.txt` spec is downloaded directly from unicode.org and cached
    locally under `CACHE`. Every emoji sequence listed in the spec is checked alone,
    doubled, and mixed with plain `a` letters; each case where `split_graphemes`
    disagrees with the expected clusters is printed (or every case, when `show_all`
    is set), followed by an errors/successes summary.

    Reference document: https://unicode.org/reports/tr51/

    Args:
        uver: the unicode emoji version to be used; when falsy, the cached "latest"
            marker is used if present, otherwise the "latest" spec is downloaded.
        show_all: also print the cases that passed.
        cache: when False, ignore the cached files and download the spec again
            (the fresh download is still written to the cache).

    Returns:
        None, the results are reported through `print`.
    """
    latest = f'{CACHE}/latest'
    if not uver and cache and os.path.exists(latest):
        with open(latest, encoding='utf-8') as f:
            # strip() tolerates a trailing newline in a hand-edited marker file.
            uver = f.read().strip()
        print('using version "latest" as:', uver)

    file = f'{CACHE}/emoji-test_{uver}.txt'
    if cache and os.path.exists(file):
        print('loading cached:', file)
        # The spec is UTF-8 and full of emoji, so never rely on the locale encoding.
        with open(file, encoding='utf-8') as f:
            data = f.read()
    else:
        url = f'https://www.unicode.org/Public/emoji/{uver or "latest"}/emoji-test.txt'
        print('downloading:', url)

        try:
            req = urllib.request.urlopen(url)
        except OSError as e:
            print(RED('Download error:'), e)
            return

        with req:  # guarantees the HTTP response is closed, even on errors below.
            os.makedirs(os.path.dirname(file), exist_ok=True)
            if not uver:
                # The final URL is expected to carry the concrete version in its
                # sixth '/'-separated component, e.g. ".../Public/emoji/<uver>/...".
                new_url = req.geturl()
                uver = new_url.split('/')[5]
                print('saving version "latest" as:', uver)
                with open(latest, 'w', encoding='utf-8') as f:
                    f.write(uver)
                file = f'{CACHE}/emoji-test_{uver}.txt'

            print('saving:', file)
            data = req.read().decode('utf8')

        with open(file, 'w', encoding='utf-8') as f:
            f.write(data)

    def where():
        # Print the pending group/subgroup headers once, then clear them, so each
        # header shows up only above the first reported case of its section.
        nonlocal groups
        if any(groups):
            print('\n'.join(g for g in groups if g))
        groups = [None, None]

    def expect(*chars):
        # Join the given pieces, split them back with `split_graphemes`, and report
        # whether the very same pieces were recovered.
        nonlocal errors, total
        text = ''.join(chars)
        actual = split_graphemes(text)
        # NOTE(review): `chars` is a tuple here, so this comparison assumes that
        # `split_graphemes` also returns a tuple -- confirm against its definition.
        error = actual != chars
        total += 1
        errors += error
        if error or show_all:
            where()
            # Hex codepoints of each detected cluster, green when it matches the
            # expected piece at the same position, red otherwise.
            codes = '|'.join((GREEN if a == c else RED)(
                ' '.join(hex(ord(c)).replace('0x', '') for c in a) if a else '-'
            ) for a, c in zip_longest(actual, chars))
            small_name = name.replace(' skin tone', '').replace(' hair', '')
            small_status = ''.join(x[0] for x in status.split('-'))
            a_len, c_len = len(actual), len(chars)
            size = f'{GREEN(a_len)} ==' if a_len == c_len else f'{RED(a_len)} !='
            print(f' {char}   {text.replace(char, "X"):>3}: {size} {c_len} -> '
                  f'|{codes}| {ORANGE(small_status)} {small_name}')

    groups, total, errors = [None, None], 0, 0
    for line in filter(None, data.splitlines()):
        if line.startswith('#'):
            # Only the last whitespace-separated token of a header line is kept.
            if line.startswith('# group:'):
                groups[0] = line.split()[-1]
            elif line.startswith('# subgroup:'):
                groups[1] = f'  - {line.split()[-1]}'
            continue

        # Data lines look like "<hex codepoints> ; <status> # <emoji> <version> <name>".
        p1, p2 = (p.split() for p in line.split(';'))
        status, name = p2[0], ' '.join(p2[4:])
        char = ''.join(chr(int(x, 16)) for x in p1)
        expect(char)
        expect(char, char)
        expect('a', char, 'a')
        expect('a', 'a', char)
        expect(char, 'a', 'a')

    if not total:
        # Without any test line the ratios below would divide by zero.
        print(RED('no test cases found in the spec data'))
        return

    print(f'\nerrors   : {errors / total:6.2%} [{errors}/{total}]')
    print(f'successes: {1 - errors / total:6.2%} [{total - errors}/{total}]')


def find_groups(data, max_diff):
    """Print the given numbers grouped by a maximum difference between neighbours.

    The numbers are sorted, and a new group is started whenever the distance from
    the previous number exceeds `max_diff`. Each group is printed on its own line
    as `<size>:|<hex values separated by spaces>|`. Empty input prints nothing.

    This was used while trying to fix the grapheme break errors shown below.

    Using version unicode 13.1:
        Component
          - skin-tone
         🏻    XX: 1 != 2 -> |1f3fb 1f3fb|-| c light
         🏻   aXa: 2 != 3 -> |61 1f3fb|61|-| c light
         🏻   aaX: 2 != 3 -> |61|61 1f3fb|-| c light
         🏼    XX: 1 != 2 -> |1f3fc 1f3fc|-| c medium-light
         🏼   aXa: 2 != 3 -> |61 1f3fc|61|-| c medium-light
         🏼   aaX: 2 != 3 -> |61|61 1f3fc|-| c medium-light
         🏽    XX: 1 != 2 -> |1f3fd 1f3fd|-| c medium
         🏽   aXa: 2 != 3 -> |61 1f3fd|61|-| c medium
         🏽   aaX: 2 != 3 -> |61|61 1f3fd|-| c medium
         🏾    XX: 1 != 2 -> |1f3fe 1f3fe|-| c medium-dark
         🏾   aXa: 2 != 3 -> |61 1f3fe|61|-| c medium-dark
         🏾   aaX: 2 != 3 -> |61|61 1f3fe|-| c medium-dark
         🏿    XX: 1 != 2 -> |1f3ff 1f3ff|-| c dark
         🏿   aXa: 2 != 3 -> |61 1f3ff|61|-| c dark
         🏿   aaX: 2 != 3 -> |61|61 1f3ff|-| c dark

    The codepoints that do accept a skin tone are:
    0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D, 0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4,
    0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC, 0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449,
    0x1F44A, 0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F, 0x1F450, 0x1F466, 0x1F467, 0x1F468,
    0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E, 0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474,
    0x1F475, 0x1F476, 0x1F477, 0x1F478, 0x1F47C, 0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486,
    0x1F487, 0x1F4AA, 0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596, 0x1F645, 0x1F646,
    0x1F647, 0x1F64B, 0x1F64C, 0x1F64D, 0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6,
    0x1F6C0, 0x1F6CC, 0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91E,
    0x1F91F, 0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935, 0x1F936, 0x1F937,
    0x1F938, 0x1F939, 0x1F93D, 0x1F93E, 0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB,
    0x1F9CD, 0x1F9CE, 0x1F9CF, 0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
    0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD

    Args:
        data: an iterable of integers, e.g. codepoints.
        max_diff: the largest difference allowed between two consecutive sorted
            numbers for them to stay in the same group.

    Returns:
        None, the groups are reported through `print`.
    """
    ordered = sorted(data)
    if not ordered:
        # Nothing to group; the original iterator-based code leaked StopIteration here.
        return

    result = [[ordered[0]]]
    for previous, current in zip(ordered, ordered[1:]):
        if current - previous > max_diff:
            # Gap too large: the current number opens a new group.
            result.append([])
        result[-1].append(current)

    print('\n'.join(f'{len(g)}:|' + ' '.join(f'{x:x}' for x in g) + '|'
                    for g in result))


if __name__ == '__main__':
    # Command-line entry point. `toolkit` hands back an argument parser plus a
    # runner; presumably the runner parses the command line and calls the given
    # function with the parsed values -- verify against `toolkit`'s definition.
    description = 'Tests the grapheme break implementation against some unicode version.'
    cli, execute = toolkit(description)

    # Optional positional argument: the unicode version, parsed as a float.
    cli.add_argument('uver', type=float, nargs='?', help='the unicode version to be used')
    # Flags stored under the keyword names of `validate_unicode_breaks`.
    cli.add_argument(
        '--all', dest='show_all', action='store_true',
        help='shows the correct cases, in addition to the wrong ones',
    )
    cli.add_argument(
        '--no-cache', dest='cache', action='store_false',
        help='ignores the cache and re-downloads the spec',
    )

    execute(validate_unicode_breaks)