| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673 |
- # This file is part of python-bidi
- #
- # python-bidi is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- # Copyright (C) 2008-2010 Yaacov Zamir <kzamir_a_walla.co.il>,
- # Copyright (C) 2010-2015 Meir kriheli <mkriheli@gmail.com>.
- "bidirectional algorithm implementation"
- import inspect
- import sys
- from collections import deque
- from typing import Optional, Union
- from unicodedata import bidirectional, mirrored
- from .mirror import MIRRORED
- StrOrBytes = Union[str, bytes]
- # Some definitions
- PARAGRAPH_LEVELS = {"L": 0, "AL": 1, "R": 1}
- EXPLICIT_LEVEL_LIMIT = 62
- def _LEAST_GREATER_ODD(x):
- return (x + 1) | 1
- def _LEAST_GREATER_EVEN(x):
- return (x + 2) & ~1
- X2_X5_MAPPINGS = {
- "RLE": (_LEAST_GREATER_ODD, "N"),
- "LRE": (_LEAST_GREATER_EVEN, "N"),
- "RLO": (_LEAST_GREATER_ODD, "R"),
- "LRO": (_LEAST_GREATER_EVEN, "L"),
- }
- # Added 'B' so X6 won't execute in that case and X8 will run it's course
- X6_IGNORED = list(X2_X5_MAPPINGS.keys()) + ["BN", "PDF", "B"]
- X9_REMOVED = list(X2_X5_MAPPINGS.keys()) + ["BN", "PDF"]
- def _embedding_direction(x):
- return ("L", "R")[x % 2]
- _IS_UCS2 = sys.maxunicode == 65535
- _SURROGATE_MIN, _SURROGATE_MAX = 55296, 56319 # D800, DBFF
- def debug_storage(storage, base_info=False, chars=True, runs=False):
- "Display debug information for the storage"
- stderr = sys.stderr
- caller = inspect.stack()[1][3]
- stderr.write(f"in {caller}\n")
- if base_info:
- stderr.write(" base level : %d\n" % storage["base_level"])
- stderr.write(" base dir : {}\n".format(storage["base_dir"]))
- if runs:
- stderr.write(" runs : {}\n".format(list(storage["runs"])))
- if chars:
- output = " Chars : "
- for _ch in storage["chars"]:
- if _ch != "\n":
- output += _ch["ch"]
- else:
- output += "C"
- stderr.write(output + "\n")
- output = " Res. levels : {}\n".format("".join(
- [str(_ch["level"]) for _ch in storage["chars"]]
- ))
- stderr.write(output)
- _types = [_ch["type"].ljust(3) for _ch in storage["chars"]]
- for i in range(3):
- output = " %s\n" if i else " Res. types : %s\n"
- stderr.write(output % "".join([_t[i] for _t in _types]))
- def get_base_level(text, upper_is_rtl=False) -> int:
- """Get the paragraph base embedding level. Returns 0 for LTR,
- 1 for RTL.
- `text` a unicode object.
- Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
- for debugging (default: False).
- """
- base_level = None
- prev_surrogate = False
- # P2
- for _ch in text:
- # surrogate in case of ucs2
- if _IS_UCS2 and (_SURROGATE_MIN <= ord(_ch) <= _SURROGATE_MAX):
- prev_surrogate = _ch
- continue
- elif prev_surrogate:
- _ch = prev_surrogate + _ch
- prev_surrogate = False
- # treat upper as RTL ?
- if upper_is_rtl and _ch.isupper():
- base_level = 1
- break
- bidi_type = bidirectional(_ch)
- if bidi_type in ("AL", "R"):
- base_level = 1
- break
- elif bidi_type == "L":
- base_level = 0
- break
- # P3
- if base_level is None:
- base_level = 0
- return base_level
- def get_embedding_levels(text, storage, upper_is_rtl=False, debug=False):
- """Get the paragraph base embedding level and direction,
- set the storage to the array of chars"""
- prev_surrogate = False
- base_level = storage["base_level"]
- # preset the storage's chars
- for _ch in text:
- if _IS_UCS2 and (_SURROGATE_MIN <= ord(_ch) <= _SURROGATE_MAX):
- prev_surrogate = _ch
- continue
- elif prev_surrogate:
- _ch = prev_surrogate + _ch
- prev_surrogate = False
- bidi_type = "R" if upper_is_rtl and _ch.isupper() else bidirectional(_ch)
- storage["chars"].append(
- {"ch": _ch, "level": base_level, "type": bidi_type, "orig": bidi_type}
- )
- if debug:
- debug_storage(storage, base_info=True)
- def explicit_embed_and_overrides(storage, debug=False):
- """Apply X1 to X9 rules of the unicode algorithm.
- See http://unicode.org/reports/tr9/#Explicit_Levels_and_Directions
- """
- overflow_counter = almost_overflow_counter = 0
- directional_override = "N"
- levels = deque()
- # X1
- embedding_level = storage["base_level"]
- for _ch in storage["chars"]:
- bidi_type = _ch["type"]
- level_func, override = X2_X5_MAPPINGS.get(bidi_type, (None, None))
- if level_func:
- # So this is X2 to X5
- # if we've past EXPLICIT_LEVEL_LIMIT, note it and do nothing
- if overflow_counter != 0:
- overflow_counter += 1
- continue
- new_level = level_func(embedding_level)
- if new_level < EXPLICIT_LEVEL_LIMIT:
- levels.append((embedding_level, directional_override))
- embedding_level, directional_override = new_level, override
- elif embedding_level == EXPLICIT_LEVEL_LIMIT - 2:
- # The new level is invalid, but a valid level can still be
- # achieved if this level is 60 and we encounter an RLE or
- # RLO further on. So record that we 'almost' overflowed.
- almost_overflow_counter += 1
- else:
- overflow_counter += 1
- else:
- # X6
- if bidi_type not in X6_IGNORED:
- _ch["level"] = embedding_level
- if directional_override != "N":
- _ch["type"] = directional_override
- # X7
- elif bidi_type == "PDF":
- if overflow_counter:
- overflow_counter -= 1
- elif (
- almost_overflow_counter
- and embedding_level != EXPLICIT_LEVEL_LIMIT - 1
- ):
- almost_overflow_counter -= 1
- elif levels:
- embedding_level, directional_override = levels.pop()
- # X8
- elif bidi_type == "B":
- levels.clear()
- overflow_counter = almost_overflow_counter = 0
- embedding_level = _ch["level"] = storage["base_level"]
- directional_override = "N"
- # Removes the explicit embeds and overrides of types
- # RLE, LRE, RLO, LRO, PDF, and BN. Adjusts extended chars
- # next and prev as well
- # Applies X9. See http://unicode.org/reports/tr9/#X9
- storage["chars"] = [
- _ch for _ch in storage["chars"] if _ch["type"] not in X9_REMOVED
- ]
- calc_level_runs(storage)
- if debug:
- debug_storage(storage, runs=True)
- def calc_level_runs(storage):
- """Split the storage to run of char types at the same level.
- Applies X10. See http://unicode.org/reports/tr9/#X10
- """
- # run level depends on the higher of the two levels on either side of
- # the boundary If the higher level is odd, the type is R; otherwise,
- # it is L
- storage["runs"].clear()
- chars = storage["chars"]
- # empty string ?
- if not chars:
- return
- def calc_level_run(b_l, b_r):
- return ["L", "R"][max(b_l, b_r) % 2]
- first_char = chars[0]
- sor = calc_level_run(storage["base_level"], first_char["level"])
- eor = None
- run_start = run_length = 0
- curr_level, curr_type = 0, ""
- prev_level, prev_type = first_char["level"], first_char["type"]
- for _ch in chars:
- curr_level, curr_type = _ch["level"], _ch["type"]
- if curr_level == prev_level:
- run_length += 1
- else:
- eor = calc_level_run(prev_level, curr_level)
- storage["runs"].append(
- {
- "sor": sor,
- "eor": eor,
- "start": run_start,
- "type": prev_type,
- "length": run_length,
- }
- )
- sor = eor
- run_start += run_length
- run_length = 1
- prev_level, prev_type = curr_level, curr_type
- # for the last char/runlevel
- eor = calc_level_run(curr_level, storage["base_level"])
- storage["runs"].append(
- {
- "sor": sor,
- "eor": eor,
- "start": run_start,
- "type": curr_type,
- "length": run_length,
- }
- )
- def resolve_weak_types(storage, debug=False):
- """Resolve weak type rules W1 - W3.
- See: http://unicode.org/reports/tr9/#Resolving_Weak_Types
- """
- for run in storage["runs"]:
- prev_strong = prev_type = run["sor"]
- start, length = run["start"], run["length"]
- chars = storage["chars"][start : start + length]
- for _ch in chars:
- # W1. Examine each nonspacing mark (NSM) in the level run, and
- # change the type of the NSM to the type of the previous character.
- # If the NSM is at the start of the level run, it will get the type
- # of sor.
- bidi_type = _ch["type"]
- if bidi_type == "NSM":
- _ch["type"] = bidi_type = prev_type
- # W2. Search backward from each instance of a European number until
- # the first strong type (R, L, AL, or sor) is found. If an AL is
- # found, change the type of the European number to Arabic number.
- if bidi_type == "EN" and prev_strong == "AL":
- _ch["type"] = "AN"
- # update prev_strong if needed
- if bidi_type in ("R", "L", "AL"):
- prev_strong = bidi_type
- prev_type = _ch["type"]
- # W3. Change all ALs to R
- for _ch in chars:
- if _ch["type"] == "AL":
- _ch["type"] = "R"
- # W4. A single European separator between two European numbers changes
- # to a European number. A single common separator between two numbers of
- # the same type changes to that type.
- for idx in range(1, len(chars) - 1):
- bidi_type = chars[idx]["type"]
- prev_type = chars[idx - 1]["type"]
- next_type = chars[idx + 1]["type"]
- if bidi_type == "ES" and (prev_type == next_type == "EN"):
- chars[idx]["type"] = "EN"
- if (
- bidi_type == "CS"
- and prev_type == next_type
- and prev_type in ("AN", "EN")
- ):
- chars[idx]["type"] = prev_type
- # W5. A sequence of European terminators adjacent to European numbers
- # changes to all European numbers.
- for idx in range(len(chars)):
- if chars[idx]["type"] == "EN":
- for et_idx in range(idx - 1, -1, -1):
- if chars[et_idx]["type"] == "ET":
- chars[et_idx]["type"] = "EN"
- else:
- break
- for et_idx in range(idx + 1, len(chars)):
- if chars[et_idx]["type"] == "ET":
- chars[et_idx]["type"] = "EN"
- else:
- break
- # W6. Otherwise, separators and terminators change to Other Neutral.
- for _ch in chars:
- if _ch["type"] in ("ET", "ES", "CS"):
- _ch["type"] = "ON"
- # W7. Search backward from each instance of a European number until the
- # first strong type (R, L, or sor) is found. If an L is found, then
- # change the type of the European number to L.
- prev_strong = run["sor"]
- for _ch in chars:
- if _ch["type"] == "EN" and prev_strong == "L":
- _ch["type"] = "L"
- if _ch["type"] in ("L", "R"):
- prev_strong = _ch["type"]
- if debug:
- debug_storage(storage, runs=True)
- def resolve_neutral_types(storage, debug):
- """Resolving neutral types. Implements N1 and N2
- See: http://unicode.org/reports/tr9/#Resolving_Neutral_Types
- """
- prev_bidi_type = ""
- for run in storage["runs"]:
- start, length = run["start"], run["length"]
- # use sor and eor
- chars = (
- [{"type": run["sor"]}]
- + storage["chars"][start : start + length]
- + [{"type": run["eor"]}]
- )
- total_chars = len(chars)
- seq_start = None
- for idx in range(total_chars):
- _ch = chars[idx]
- if _ch["type"] in ("B", "S", "WS", "ON"):
- # N1. A sequence of neutrals takes the direction of the
- # surrounding strong text if the text on both sides has the same
- # direction. European and Arabic numbers act as if they were R
- # in terms of their influence on neutrals. Start-of-level-run
- # (sor) and end-of-level-run (eor) are used at level run
- # boundaries.
- if seq_start is None:
- seq_start = idx
- prev_bidi_type = chars[idx - 1]["type"]
- else:
- if seq_start is not None:
- next_bidi_type = chars[idx]["type"]
- if prev_bidi_type in ("AN", "EN"):
- prev_bidi_type = "R"
- if next_bidi_type in ("AN", "EN"):
- next_bidi_type = "R"
- for seq_idx in range(seq_start, idx):
- if prev_bidi_type == next_bidi_type:
- chars[seq_idx]["type"] = prev_bidi_type
- else:
- # N2. Any remaining neutrals take the embedding
- # direction. The embedding direction for the given
- # neutral character is derived from its embedding
- # level: L if the character is set to an even level,
- # and R if the level is odd.
- chars[seq_idx]["type"] = _embedding_direction(
- chars[seq_idx]["level"]
- )
- seq_start = None
- if debug:
- debug_storage(storage)
- def resolve_implicit_levels(storage, debug):
- """Resolving implicit levels (I1, I2)
- See: http://unicode.org/reports/tr9/#Resolving_Implicit_Levels
- """
- for run in storage["runs"]:
- start, length = run["start"], run["length"]
- chars = storage["chars"][start : start + length]
- for _ch in chars:
- # only those types are allowed at this stage
- assert _ch["type"] in ("L", "R", "EN", "AN"), (
- "{} not allowed here".format(_ch["type"])
- )
- if _embedding_direction(_ch["level"]) == "L":
- # I1. For all characters with an even (left-to-right) embedding
- # direction, those of type R go up one level and those of type
- # AN or EN go up two levels.
- if _ch["type"] == "R":
- _ch["level"] += 1
- elif _ch["type"] != "L":
- _ch["level"] += 2
- else:
- # I2. For all characters with an odd (right-to-left) embedding
- # direction, those of type L, EN or AN go up one level.
- if _ch["type"] != "R":
- _ch["level"] += 1
- if debug:
- debug_storage(storage, runs=True)
- def reverse_contiguous_sequence(
- chars, line_start, line_end, highest_level, lowest_odd_level
- ):
- """L2. From the highest level found in the text to the lowest odd
- level on each line, including intermediate levels not actually
- present in the text, reverse any contiguous sequence of characters
- that are at that level or higher.
- """
- for level in range(highest_level, lowest_odd_level - 1, -1):
- _start = _end = None
- for run_idx in range(line_start, line_end + 1):
- run_ch = chars[run_idx]
- if run_ch["level"] >= level:
- if _start is None:
- _start = _end = run_idx
- else:
- _end = run_idx
- else:
- if _end is not None:
- chars[_start : +_end + 1] = reversed(chars[_start : +_end + 1])
- _start = _end = None
- # anything remaining ?
- if _start is not None and _end is not None:
- chars[_start : +_end + 1] = reversed(chars[_start : +_end + 1])
- def reorder_resolved_levels(storage, debug):
- """L1 and L2 rules"""
- # Applies L1.
- should_reset = True
- chars = storage["chars"]
- for _ch in chars[::-1]:
- # L1. On each line, reset the embedding level of the following
- # characters to the paragraph embedding level:
- if _ch["orig"] in ("B", "S"):
- # 1. Segment separators,
- # 2. Paragraph separators,
- _ch["level"] = storage["base_level"]
- should_reset = True
- elif should_reset and _ch["orig"] in ("BN", "WS"):
- # 3. Any sequence of whitespace characters preceding a segment
- # separator or paragraph separator
- # 4. Any sequence of white space characters at the end of the
- # line.
- _ch["level"] = storage["base_level"]
- else:
- should_reset = False
- max_len = len(chars)
- # L2 should be per line
- # Calculates highest level and lowest odd level on the fly.
- line_start = line_end = 0
- highest_level = 0
- lowest_odd_level = EXPLICIT_LEVEL_LIMIT
- for idx in range(max_len):
- _ch = chars[idx]
- # calc the levels
- char_level = _ch["level"]
- if char_level > highest_level:
- highest_level = char_level
- if char_level % 2 and char_level < lowest_odd_level:
- lowest_odd_level = char_level
- if _ch["orig"] == "B" or idx == max_len - 1:
- line_end = idx
- # omit line breaks
- if _ch["orig"] == "B":
- line_end -= 1
- reverse_contiguous_sequence(
- chars, line_start, line_end, highest_level, lowest_odd_level
- )
- # reset for next line run
- line_start = idx + 1
- highest_level = 0
- lowest_odd_level = EXPLICIT_LEVEL_LIMIT
- if debug:
- debug_storage(storage)
- def apply_mirroring(storage, debug):
- """Applies L4: mirroring
- See: http://unicode.org/reports/tr9/#L4
- """
- # L4. A character is depicted by a mirrored glyph if and only if (a) the
- # resolved directionality of that character is R, and (b) the
- # Bidi_Mirrored property value of that character is true.
- for _ch in storage["chars"]:
- unichar = _ch["ch"]
- if mirrored(unichar) and _embedding_direction(_ch["level"]) == "R":
- _ch["ch"] = MIRRORED.get(unichar, unichar)
- if debug:
- debug_storage(storage)
- def get_empty_storage():
- """Return an empty storage skeleton, usable for testing"""
- return {
- "base_level": None,
- "base_dir": None,
- "chars": [],
- "runs": deque(),
- }
- def get_display(
- str_or_bytes: StrOrBytes,
- encoding: str = "utf-8",
- upper_is_rtl: bool = False,
- base_dir: Optional[str] = None,
- debug: bool = False,
- ) -> StrOrBytes:
- """Accepts `str` or `bytes`. In case it's `bytes`, `encoding`
- is needed as the algorithm works on `str` (default:"utf-8").
- Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
- for debugging (default: False).
- Set `base_dir` to 'L' or 'R' to override the calculated base_level.
- Set `debug` to True to display (using sys.stderr) the steps taken with the
- algorithm.
- Returns the display layout, either as unicode or `encoding` encoded
- string.
- """
- storage = get_empty_storage()
- # utf-8 ? we need unicode
- if isinstance(str_or_bytes, bytes):
- text = str_or_bytes.decode(encoding)
- was_decoded = True
- else:
- text = str_or_bytes
- was_decoded = False
- if base_dir is None:
- base_level = get_base_level(text, upper_is_rtl)
- else:
- base_level = PARAGRAPH_LEVELS[base_dir]
- storage["base_level"] = base_level
- storage["base_dir"] = ("L", "R")[base_level]
- get_embedding_levels(text, storage, upper_is_rtl, debug)
- explicit_embed_and_overrides(storage, debug)
- resolve_weak_types(storage, debug)
- resolve_neutral_types(storage, debug)
- resolve_implicit_levels(storage, debug)
- reorder_resolved_levels(storage, debug)
- apply_mirroring(storage, debug)
- chars = storage["chars"]
- display = "".join([_ch["ch"] for _ch in chars])
- if was_decoded:
- display = display.encode(encoding)
- return display
|