wcwidth.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030
  1. """
  2. This is a python implementation of wcwidth() and wcswidth().
  3. https://github.com/jquast/wcwidth
  4. from Markus Kuhn's C code, retrieved from:
  5. http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  6. This is an implementation of wcwidth() and wcswidth() (defined in
  7. IEEE Std 1003.1-2001) for Unicode.
  8. http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
  9. http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
  10. In fixed-width output devices, Latin characters all occupy a single
  11. "cell" position of equal width, whereas ideographic CJK characters
  12. occupy two such cells. Interoperability between terminal-line
  13. applications and (teletype-style) character terminals using the
  14. UTF-8 encoding requires agreement on which character should advance
  15. the cursor by how many cell positions. No established formal
  16. standards exist at present on which Unicode character shall occupy
  17. how many cell positions on character terminals. These routines are
  18. a first attempt of defining such behavior based on simple rules
  19. applied to data provided by the Unicode Consortium.
  20. For some graphical characters, the Unicode standard explicitly
  21. defines a character-cell width via the definition of the East Asian
  22. FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
  23. In all these cases, there is no ambiguity about which width a
  24. terminal shall use. For characters in the East Asian Ambiguous (A)
  25. class, the width choice depends purely on a preference of backward
  26. compatibility with either historic CJK or Western practice.
  27. Choosing single-width for these characters is easy to justify as
  28. the appropriate long-term solution, as the CJK practice of
  29. displaying these characters as double-width comes from historic
  30. implementation simplicity (8-bit encoded characters were displayed
  31. single-width and 16-bit ones double-width, even for Greek,
  32. Cyrillic, etc.) and not any typographic considerations.
  33. Much less clear is the choice of width for the Not East Asian
  34. (Neutral) class. Existing practice does not dictate a width for any
  35. of these characters. It would nevertheless make sense
  36. typographically to allocate two character cells to characters such
  37. as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
  38. represented adequately with a single-width glyph. The following
  39. routines at present merely assign a single-cell width to all
  40. neutral characters, in the interest of simplicity. This is not
  41. entirely satisfactory and should be reconsidered before
  42. establishing a formal standard in this area. At the moment, the
  43. decision which Not East Asian (Neutral) characters should be
  44. represented by double-width glyphs cannot yet be answered by
  45. applying a simple rule from the Unicode database content. Setting
  46. up a proper standard for the behavior of UTF-8 character terminals
  47. will require a careful analysis not only of each Unicode character,
  48. but also of each presentation form, something the author of these
  49. routines has avoided to do so far.
  50. http://www.unicode.org/unicode/reports/tr11/
  51. Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  52. """
  53. from __future__ import annotations
  54. # std imports
  55. from functools import lru_cache
  56. from typing import TYPE_CHECKING
  57. # local
  58. from .bisearch import bisearch as _bisearch
  59. from .grapheme import iter_graphemes
  60. from .table_mc import CATEGORY_MC
  61. from .sgr_state import (_SGR_PATTERN,
  62. _SGR_STATE_DEFAULT,
  63. _sgr_state_update,
  64. _sgr_state_is_active,
  65. _sgr_state_to_sequence)
  66. from .table_vs16 import VS16_NARROW_TO_WIDE
  67. from .table_wide import WIDE_EASTASIAN
  68. from .table_zero import ZERO_WIDTH
  69. from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL
  70. from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR
  71. from .table_ambiguous import AMBIGUOUS_EASTASIAN
  72. from .escape_sequences import (ZERO_WIDTH_PATTERN,
  73. CURSOR_LEFT_SEQUENCE,
  74. CURSOR_RIGHT_SEQUENCE,
  75. INDETERMINATE_EFFECT_SEQUENCE)
  76. from .unicode_versions import list_versions
  77. if TYPE_CHECKING: # pragma: no cover
  78. # std imports
  79. from collections.abc import Iterator
  80. from typing import Literal
# Pre-compute table references for the latest (and only) Unicode version.
# list_versions() is indexed with [-1], i.e. the last entry is taken as "latest".
_LATEST_VERSION = list_versions()[-1]
_ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION]
_WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION]
# NOTE(review): unlike its siblings, this table is looked up by the *first* key
# of AMBIGUOUS_EASTASIAN rather than by _LATEST_VERSION -- presumably that table
# ships under a single version key; verify against table_ambiguous.
_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))]
_CATEGORY_MC_TABLE = CATEGORY_MC[_LATEST_VERSION]

# Set of codepoints expanded from the first (lo, hi) range of
# GRAPHEME_REGIONAL_INDICATOR, for O(1) membership tests.
_REGIONAL_INDICATOR_SET = frozenset(
    range(GRAPHEME_REGIONAL_INDICATOR[0][0], GRAPHEME_REGIONAL_INDICATOR[0][1] + 1)
)
# Every codepoint in the EXTENDED_PICTOGRAPHIC ranges, plus the Regional
# Indicators; used to decide whether a Fitzpatrick modifier follows an emoji base.
_EMOJI_ZWJ_SET = frozenset(
    cp for lo, hi in EXTENDED_PICTOGRAPHIC for cp in range(lo, hi + 1)
) | _REGIONAL_INDICATOR_SET
# Inclusive (lo, hi) codepoint bounds tested for Fitzpatrick (skin tone) modifiers.
_FITZPATRICK_RANGE = (0x1F3FB, 0x1F3FF)

# Indic_Syllabic_Category=Virama codepoints, from IndicSyllabicCategory.txt.
# These are structurally tied to their scripts and not expected to change.
# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
_ISC_VIRAMA_SET = frozenset((
    0x094D,  # DEVANAGARI SIGN VIRAMA
    0x09CD,  # BENGALI SIGN VIRAMA
    0x0A4D,  # GURMUKHI SIGN VIRAMA
    0x0ACD,  # GUJARATI SIGN VIRAMA
    0x0B4D,  # ORIYA SIGN VIRAMA
    0x0BCD,  # TAMIL SIGN VIRAMA
    0x0C4D,  # TELUGU SIGN VIRAMA
    0x0CCD,  # KANNADA SIGN VIRAMA
    0x0D4D,  # MALAYALAM SIGN VIRAMA
    0x0DCA,  # SINHALA SIGN AL-LAKUNA
    0x1B44,  # BALINESE ADEG ADEG
    0xA806,  # SYLOTI NAGRI SIGN HASANTA
    0xA8C4,  # SAURASHTRA SIGN VIRAMA
    0xA9C0,  # JAVANESE PANGKON
    0x11046,  # BRAHMI VIRAMA
    0x110B9,  # KAITHI SIGN VIRAMA
    0x111C0,  # SHARADA SIGN VIRAMA
    0x11235,  # KHOJKI SIGN VIRAMA
    0x1134D,  # GRANTHA SIGN VIRAMA
    0x11442,  # NEWA SIGN VIRAMA
    0x114C2,  # TIRHUTA SIGN VIRAMA
    0x115BF,  # SIDDHAM SIGN VIRAMA
    0x1163F,  # MODI SIGN VIRAMA
    0x116B6,  # TAKRI SIGN VIRAMA
    0x11839,  # DOGRA SIGN VIRAMA
    0x119E0,  # NANDINAGARI SIGN VIRAMA
    0x11C3F,  # BHAIKSUKI SIGN VIRAMA
))
# Alias of the imported consonant range table; searched with _bisearch() when
# deciding whether a character following a virama forms a conjunct.
_ISC_CONSONANT_TABLE = ISC_CONSONANT

# In 'parse' mode, strings longer than this are checked for cursor-movement
# controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to
# 'ignore' to skip character-by-character parsing. The detection scan cost is
# negligible for long strings but wasted on short ones like labels or headings.
_WIDTH_FAST_PATH_MIN_LEN = 20

# Translation table to strip C0/C1 control characters for fast 'ignore' mode.
# Built with the three-argument form of str.maketrans(), so every listed
# character is deleted by str.translate().
_CONTROL_CHAR_TABLE = str.maketrans('', '', (
    ''.join(chr(c) for c in range(0x00, 0x20)) +  # C0: NUL through US (including tab)
    '\x7f' +  # DEL
    ''.join(chr(c) for c in range(0x80, 0xa0))  # C1: U+0080-U+009F
))

# Unlike wcwidth.__all__, wcwidth.wcwidth.__all__ is NOT for the purpose of defining a public API,
# or what we prefer to be imported with statement, "from wcwidth.wcwidth import *". Explicitly
# re-export imports here for no other reason than to satisfy the type checkers (mypy). Yak shavings.
__all__ = (
    'ZERO_WIDTH',
    'WIDE_EASTASIAN',
    'AMBIGUOUS_EASTASIAN',
    'VS16_NARROW_TO_WIDE',
    'list_versions',
    'wcwidth',
    'wcswidth',
    'width',
    'iter_sequences',
    'ljust',
    'rjust',
    'center',
    'clip',
    'strip_sequences',
    '_wcmatch_version',
    '_wcversion_value',
)
  159. # maxsize=1024: western scripts need ~64 unique codepoints per session, but
  160. # CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates
  161. # heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss.
  162. @lru_cache(maxsize=1024)
  163. def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int: # pylint: disable=unused-argument
  164. r"""
  165. Given one Unicode codepoint, return its printable length on a terminal.
  166. :param wc: A single Unicode character.
  167. :param unicode_version: Ignored. Retained for backwards compatibility.
  168. .. deprecated:: 0.3.0
  169. Only the latest Unicode version is now shipped.
  170. :param ambiguous_width: Width to use for East Asian Ambiguous (A)
  171. characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts
  172. where ambiguous characters display as double-width. See
  173. :ref:`ambiguous_width` for details.
  174. :returns: The width, in cells, necessary to display the character of
  175. Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has
  176. no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is
  177. not printable, or has an indeterminate effect on the terminal, such as
  178. a control character. Otherwise, the number of column positions the
  179. character occupies on a graphic terminal (1 or 2) is returned.
  180. See :ref:`Specification` for details of cell measurement.
  181. """
  182. ucs = ord(wc) if wc else 0
  183. # small optimization: early return of 1 for printable ASCII, this provides
  184. # approximately 40% performance improvement for mostly-ascii documents, with
  185. # less than 1% impact to others.
  186. if 32 <= ucs < 0x7f:
  187. return 1
  188. # C0/C1 control characters are -1 for compatibility with POSIX-like calls
  189. if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0:
  190. return -1
  191. # Zero width
  192. if _bisearch(ucs, _ZERO_WIDTH_TABLE):
  193. return 0
  194. # Wide (F/W categories)
  195. if _bisearch(ucs, _WIDE_EASTASIAN_TABLE):
  196. return 2
  197. # Ambiguous width (A category) - only when ambiguous_width=2
  198. if ambiguous_width == 2 and _bisearch(ucs, _AMBIGUOUS_TABLE):
  199. return 2
  200. return 1
  201. def wcswidth(
  202. pwcs: str,
  203. n: int | None = None,
  204. unicode_version: str = 'auto',
  205. ambiguous_width: int = 1,
  206. ) -> int:
  207. """
  208. Given a unicode string, return its printable length on a terminal.
  209. :param pwcs: Measure width of given unicode string.
  210. :param n: When ``n`` is None (default), return the length of the entire
  211. string, otherwise only the first ``n`` characters are measured.
  212. Better to use string slicing capability, ``wcswidth(pwcs[:n])``, instead,
  213. for performance. This argument is a holdover from the POSIX function for
  214. matching signatures. Be careful that ``n`` is at grapheme boundaries.
  215. :param unicode_version: Ignored. Retained for backwards compatibility.
  216. .. deprecated:: 0.3.0
  217. Only the latest Unicode version is now shipped.
  218. :param ambiguous_width: Width to use for East Asian Ambiguous (A)
  219. characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
  220. :returns: The width, in cells, needed to display the first ``n`` characters
  221. of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control
  222. characters!
  223. See :ref:`Specification` for details of cell measurement.
  224. """
  225. # pylint: disable=unused-argument,too-many-locals,too-many-statements
  226. # pylint: disable=too-complex,too-many-branches
  227. # This function intentionally kept long without delegating functions to reduce function calls in
  228. # "hot path", the overhead per-character adds up.
  229. # Fast path: pure ASCII printable strings are always width == length
  230. if n is None and pwcs.isascii() and pwcs.isprintable():
  231. return len(pwcs)
  232. # Select wcwidth call pattern for best lru_cache performance:
  233. # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
  234. # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
  235. _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)
  236. end = len(pwcs) if n is None else n
  237. total_width = 0
  238. idx = 0
  239. last_measured_idx = -2 # Track index of last measured char for VS16
  240. last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check)
  241. last_was_virama = False # Virama conjunct formation state
  242. conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc)
  243. while idx < end:
  244. char = pwcs[idx]
  245. ucs = ord(char)
  246. if ucs == 0x200D:
  247. if last_was_virama:
  248. # ZWJ after virama requests explicit half-form rendering but
  249. # does not change cell count — consume ZWJ only, let the next
  250. # consonant be handled by the virama conjunct rule.
  251. idx += 1
  252. elif idx + 1 < end:
  253. # Emoji ZWJ: skip next character unconditionally.
  254. idx += 2
  255. last_was_virama = False
  256. else:
  257. idx += 1
  258. last_was_virama = False
  259. continue
  260. if ucs == 0xFE0F and last_measured_idx >= 0:
  261. # VS16 following a measured character: add 1 if that character is
  262. # known to be converted from narrow to wide by VS16.
  263. total_width += _bisearch(ord(pwcs[last_measured_idx]),
  264. VS16_NARROW_TO_WIDE["9.0.0"])
  265. last_measured_idx = -2 # Prevent double application
  266. # VS16 preserves emoji context: last_measured_ucs stays as the base
  267. idx += 1
  268. continue
  269. # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+)
  270. if ucs > 0xFFFF:
  271. if ucs in _REGIONAL_INDICATOR_SET:
  272. # Lazy RI pairing: count preceding consecutive RIs only when the last one is
  273. # received, because RI's are received so rarely its better than per-loop tracking of
  274. # 'last char was an RI'.
  275. ri_before = 0
  276. j = idx - 1
  277. while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET:
  278. ri_before += 1
  279. j -= 1
  280. if ri_before % 2 == 1:
  281. # Second RI in pair: contributes 0 (pair = one 2-cell flag) using an even-or-odd
  282. # check to determine, 'CAUS' would be two flags, but 'CAU' would be 1 flag
  283. # and wide 'U'.
  284. idx += 1
  285. last_measured_ucs = ucs
  286. continue
  287. # First or unpaired RI: measured normally (width 2 from table)
  288. # Fitzpatrick modifier: zero-width when following emoji base
  289. elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
  290. and last_measured_ucs in _EMOJI_ZWJ_SET):
  291. idx += 1
  292. continue
  293. # Virama conjunct formation: consonant following virama contributes 0 width.
  294. # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
  295. if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
  296. last_measured_idx = idx
  297. last_measured_ucs = ucs
  298. last_was_virama = False
  299. conjunct_pending = True
  300. idx += 1
  301. continue
  302. wcw = _wcwidth(char)
  303. if wcw < 0:
  304. # early return -1 on C0 and C1 control characters
  305. return wcw
  306. if wcw > 0:
  307. if conjunct_pending:
  308. total_width += 1
  309. conjunct_pending = False
  310. last_measured_idx = idx
  311. last_measured_ucs = ucs
  312. last_was_virama = False
  313. elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
  314. # Spacing Combining Mark (Mc) following a base character adds 1
  315. wcw = 1
  316. last_measured_idx = -2
  317. last_was_virama = False
  318. conjunct_pending = False
  319. else:
  320. last_was_virama = ucs in _ISC_VIRAMA_SET
  321. total_width += wcw
  322. idx += 1
  323. if conjunct_pending:
  324. total_width += 1
  325. return total_width
  326. # NOTE: _wcversion_value and _wcmatch_version are no longer used internally
  327. # by wcwidth since version 0.5.0 (only the latest Unicode version is shipped).
  328. #
  329. # They are retained for API compatibility with external tools like ucs-detect
  330. # that may use these private functions.
  331. @lru_cache(maxsize=128)
  332. def _wcversion_value(ver_string: str) -> tuple[int, ...]: # pragma: no cover
  333. """
  334. Integer-mapped value of given dotted version string.
  335. .. deprecated:: 0.3.0
  336. This function is no longer used internally by wcwidth but is retained
  337. for API compatibility with external tools.
  338. :param ver_string: Unicode version string, of form ``n.n.n``.
  339. :returns: tuple of digit tuples, ``tuple(int, [...])``.
  340. """
  341. retval = tuple(map(int, (ver_string.split('.'))))
  342. return retval
@lru_cache(maxsize=8)
def _wcmatch_version(given_version: str) -> str:  # pylint: disable=unused-argument
    """
    Return the supported Unicode version level.

    .. deprecated:: 0.3.0

        This function now always returns the latest version.

    This function is no longer used internally by wcwidth but is retained
    for API compatibility with external tools.

    :param given_version: Ignored. Any value is accepted for compatibility.
    :returns: The latest unicode version string.
    """
    # The argument is never inspected; the module-level constant is returned as-is.
    return _LATEST_VERSION
  355. def iter_sequences(text: str) -> Iterator[tuple[str, bool]]:
  356. r"""
  357. Iterate through text, yielding segments with sequence identification.
  358. This generator yields tuples of ``(segment, is_sequence)`` for each part
  359. of the input text, where ``is_sequence`` is ``True`` if the segment is
  360. a recognized terminal escape sequence.
  361. :param text: String to iterate through.
  362. :returns: Iterator of (segment, is_sequence) tuples.
  363. .. versionadded:: 0.3.0
  364. Example::
  365. >>> list(iter_sequences('hello'))
  366. [('hello', False)]
  367. >>> list(iter_sequences('\x1b[31mred'))
  368. [('\x1b[31m', True), ('red', False)]
  369. >>> list(iter_sequences('\x1b[1m\x1b[31m'))
  370. [('\x1b[1m', True), ('\x1b[31m', True)]
  371. """
  372. idx = 0
  373. text_len = len(text)
  374. segment_start = 0
  375. while idx < text_len:
  376. char = text[idx]
  377. if char == '\x1b':
  378. # Yield any accumulated non-sequence text
  379. if idx > segment_start:
  380. yield (text[segment_start:idx], False)
  381. # Try to match an escape sequence
  382. match = ZERO_WIDTH_PATTERN.match(text, idx)
  383. if match:
  384. yield (match.group(), True)
  385. idx = match.end()
  386. else:
  387. # Lone ESC or unrecognized - yield as sequence anyway
  388. yield (char, True)
  389. idx += 1
  390. segment_start = idx
  391. else:
  392. idx += 1
  393. # Yield any remaining text
  394. if segment_start < text_len:
  395. yield (text[segment_start:], False)
  396. def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int:
  397. """
  398. Fast path for width() with control_codes='ignore'.
  399. Strips escape sequences and control characters, then measures remaining text.
  400. """
  401. return wcswidth(
  402. strip_sequences(text).translate(_CONTROL_CHAR_TABLE),
  403. ambiguous_width=ambiguous_width
  404. )
def width(
    text: str,
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    tabsize: int = 8,
    ambiguous_width: int = 1,
) -> int:
    r"""
    Return printable width of text containing many kinds of control codes and sequences.

    Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal
    output sequences. Never returns -1.

    :param text: String to measure.
    :param control_codes: How to handle control characters and sequences:

        - ``'parse'`` (default): Track horizontal cursor movement from BS ``\b``, CR ``\r``, TAB
          ``\t``, and cursor left and right movement sequences. Vertical movement (LF, VT, FF) and
          indeterminate sequences are zero-width. Never raises.
        - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with
          indeterminate results of the screen or cursor, like clear or vertical movement. Generally,
          these should be handled with a virtual terminal emulator (like 'pyte').
        - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as
          width 0. This is the fastest measurement for text already filtered or known not to contain
          any kinds of control codes or sequences. TAB ``\t`` is zero-width; for tab expansion,
          pre-process: ``text.replace('\t', ' ' * 8)``.

    :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8.
        Must be positive; a non-positive value makes TAB zero-width rather than raising.
        Has no effect when ``control_codes='ignore'``.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences
        present in ``text`` according to given parameters. This represents the rightmost column the
        cursor reaches. Always a non-negative integer.
    :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate
        effects, such as vertical movement or clear sequences are encountered, or on unexpected
        C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values.

    .. versionadded:: 0.3.0

    Examples::

        >>> width('hello')
        5
        >>> width('コンニチハ')
        10
        >>> width('\x1b[31mred\x1b[0m')
        3
        >>> width('\x1b[31mred\x1b[0m', control_codes='ignore')  # same result (ignored)
        3
        >>> width('123\b4')  # backspace overwrites previous cell (outputs '124')
        3
        >>> width('abc\t')  # tab caused cursor to move to column 8
        8
        >>> width('1\x1b[10C')  # '1' + cursor right 10, cursor ends on column 11
        11
        >>> width('1\x1b[10C', control_codes='ignore')  # faster but wrong in this case
        1
    """
    # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals
    # This could be broken into sub-functions (#1, #3, and 6 especially), but for reduced overhead
    # considering this function is a likely "hot path", they are inlined, breaking many of our
    # complexity rules.

    # NOTE(review): the docstring promises ValueError for an unrecognized ``control_codes`` value,
    # but no validation is visible in this function: any value other than 'parse', 'strict' or
    # 'ignore' falls through to the parsing loop below with strict=False and without the
    # fast-path downgrade. Confirm whether validation is intended.

    # Fast path for ASCII printable (no tabs, escapes, or control chars)
    if text.isascii() and text.isprintable():
        return len(text)

    # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode.
    # Only check for longer strings - the detection overhead hurts short string performance.
    if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN:
        # Check for cursor-affecting control characters
        if '\b' not in text and '\t' not in text and '\r' not in text:
            # Check for escape sequences - if none, or only non-cursor-movement sequences
            if '\x1b' not in text or (
                not CURSOR_RIGHT_SEQUENCE.search(text) and
                not CURSOR_LEFT_SEQUENCE.search(text)
            ):
                control_codes = 'ignore'

    # Fast path for ignore mode -- this is useful if you know the text is already "clean"
    if control_codes == 'ignore':
        return _width_ignored_codes(text, ambiguous_width)

    strict = control_codes == 'strict'
    # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0.
    # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width.
    current_col = 0
    max_extent = 0
    idx = 0
    last_measured_idx = -2  # Track index of last measured char for VS16; -2 can never match idx-1
    last_measured_ucs = -1  # Codepoint of last measured char (for deferred emoji check)
    last_was_virama = False  # Virama conjunct formation state
    conjunct_pending = False  # Deferred +1 for bare conjuncts (no trailing Mc)
    text_len = len(text)

    # Select wcwidth call pattern for best lru_cache performance:
    # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
    # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    while idx < text_len:
        char = text[idx]

        # 1. Handle ESC sequences
        if char == '\x1b':
            match = ZERO_WIDTH_PATTERN.match(text, idx)
            if match:
                seq = match.group()
                if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq):
                    raise ValueError(f"Indeterminate cursor sequence at position {idx}")
                # Apply cursor movement
                right = CURSOR_RIGHT_SEQUENCE.match(seq)
                if right:
                    # An empty captured count is treated as 1.
                    current_col += int(right.group(1) or 1)
                else:
                    left = CURSOR_LEFT_SEQUENCE.match(seq)
                    if left:
                        # Clamp at column 0: the cursor cannot move left of the origin.
                        current_col = max(0, current_col - int(left.group(1) or 1))
                idx = match.end()
            else:
                # Lone or unrecognized ESC: consume just the ESC character, zero width.
                idx += 1
            max_extent = max(max_extent, current_col)
            continue

        # 2. Handle illegal and vertical control characters (zero width, error in strict)
        if char in ILLEGAL_CTRL:
            if strict:
                raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        if char in VERTICAL_CTRL:
            if strict:
                raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        # 3. Handle horizontal movement characters
        if char in HORIZONTAL_CTRL:
            if char == '\x09' and tabsize > 0:  # Tab
                # Advance to the next multiple of tabsize.
                current_col += tabsize - (current_col % tabsize)
            elif char == '\x08':  # Backspace
                if current_col > 0:
                    current_col -= 1
            elif char == '\x0d':  # Carriage return
                current_col = 0
            max_extent = max(max_extent, current_col)
            idx += 1
            continue

        # 4. Handle ZWJ
        if char == '\u200D':
            if last_was_virama:
                # ZWJ after virama requests explicit half-form rendering but
                # does not change cell count — consume ZWJ only, let the next
                # consonant be handled by the virama conjunct rule.
                idx += 1
            elif idx + 1 < text_len:
                # Emoji ZWJ: skip next character unconditionally.
                idx += 2
                last_was_virama = False
            else:
                # Trailing ZWJ with nothing after it: zero width.
                idx += 1
                last_was_virama = False
            continue

        # 5. Handle other zero-width characters (control chars)
        if char in ZERO_WIDTH_CTRL:
            idx += 1
            continue

        ucs = ord(char)

        # 6. Handle VS16: converts preceding narrow character to wide
        if ucs == 0xFE0F:
            # Only applies when the immediately preceding character was the one measured.
            if last_measured_idx == idx - 1:
                if _bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]):
                    current_col += 1
                    max_extent = max(max_extent, current_col)
            # VS16 preserves emoji context: last_measured_ucs stays as the base
            idx += 1
            continue

        # 6b. Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Lazy RI pairing: count preceding consecutive RIs
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # Second RI of a pair: the pair was already counted by the first RI.
                    last_measured_ucs = ucs
                    idx += 1
                    continue
            # 6c. Fitzpatrick modifier: zero-width when following emoji base
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                idx += 1
                continue

        # 7. Virama conjunct formation: consonant following virama contributes 0 width.
        # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
        if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue

        # 8. Normal characters: measure with wcwidth
        w = _wcwidth(char)
        if w > 0:
            if conjunct_pending:
                # A new base character closes the pending bare conjunct.
                current_col += 1
                conjunct_pending = False
            current_col += w
            max_extent = max(max_extent, current_col)
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            current_col += 1
            max_extent = max(max_extent, current_col)
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            # Zero or negative width: contributes nothing, but remember whether it is a
            # virama so a following consonant can be folded into a conjunct.
            last_was_virama = ucs in _ISC_VIRAMA_SET
        idx += 1

    # A conjunct still pending at end of input occupies one more cell.
    if conjunct_pending:
        current_col += 1
        max_extent = max(max_extent, current_col)
    return max_extent
  619. def ljust(
  620. text: str,
  621. dest_width: int,
  622. fillchar: str = ' ',
  623. *,
  624. control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
  625. ambiguous_width: int = 1,
  626. ) -> str:
  627. r"""
  628. Return text left-justified in a string of given display width.
  629. :param text: String to justify, may contain terminal sequences.
  630. :param dest_width: Total display width of result in terminal cells.
  631. :param fillchar: Single character for padding (default space). Must have
  632. display width of 1 (not wide, not zero-width, not combining). Unicode
  633. characters like ``'·'`` are acceptable. The width is not validated.
  634. :param control_codes: How to handle control sequences when measuring.
  635. Passed to :func:`width` for measurement.
  636. :param ambiguous_width: Width to use for East Asian Ambiguous (A)
  637. characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
  638. :returns: Text padded on the right to reach ``dest_width``.
  639. .. versionadded:: 0.3.0
  640. Example::
  641. >>> wcwidth.ljust('hi', 5)
  642. 'hi '
  643. >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5)
  644. '\x1b[31mhi\x1b[0m '
  645. >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
  646. '👨‍👩‍👧 '
  647. """
  648. if text.isascii() and text.isprintable():
  649. text_width = len(text)
  650. else:
  651. text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width)
  652. padding_cells = max(0, dest_width - text_width)
  653. return text + fillchar * padding_cells
  654. def rjust(
  655. text: str,
  656. dest_width: int,
  657. fillchar: str = ' ',
  658. *,
  659. control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
  660. ambiguous_width: int = 1,
  661. ) -> str:
  662. r"""
  663. Return text right-justified in a string of given display width.
  664. :param text: String to justify, may contain terminal sequences.
  665. :param dest_width: Total display width of result in terminal cells.
  666. :param fillchar: Single character for padding (default space). Must have
  667. display width of 1 (not wide, not zero-width, not combining). Unicode
  668. characters like ``'·'`` are acceptable. The width is not validated.
  669. :param control_codes: How to handle control sequences when measuring.
  670. Passed to :func:`width` for measurement.
  671. :param ambiguous_width: Width to use for East Asian Ambiguous (A)
  672. characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
  673. :returns: Text padded on the left to reach ``dest_width``.
  674. .. versionadded:: 0.3.0
  675. Example::
  676. >>> wcwidth.rjust('hi', 5)
  677. ' hi'
  678. >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5)
  679. ' \x1b[31mhi\x1b[0m'
  680. >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
  681. ' 👨‍👩‍👧'
  682. """
  683. if text.isascii() and text.isprintable():
  684. text_width = len(text)
  685. else:
  686. text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width)
  687. padding_cells = max(0, dest_width - text_width)
  688. return fillchar * padding_cells + text
  689. def center(
  690. text: str,
  691. dest_width: int,
  692. fillchar: str = ' ',
  693. *,
  694. control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
  695. ambiguous_width: int = 1,
  696. ) -> str:
  697. r"""
  698. Return text centered in a string of given display width.
  699. :param text: String to center, may contain terminal sequences.
  700. :param dest_width: Total display width of result in terminal cells.
  701. :param fillchar: Single character for padding (default space). Must have
  702. display width of 1 (not wide, not zero-width, not combining). Unicode
  703. characters like ``'·'`` are acceptable. The width is not validated.
  704. :param control_codes: How to handle control sequences when measuring.
  705. Passed to :func:`width` for measurement.
  706. :param ambiguous_width: Width to use for East Asian Ambiguous (A)
  707. characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
  708. :returns: Text padded on both sides to reach ``dest_width``.
  709. For odd-width padding, the extra cell goes on the right (matching
  710. Python's :meth:`str.center` behavior).
  711. .. versionadded:: 0.3.0
  712. Example::
  713. >>> wcwidth.center('hi', 6)
  714. ' hi '
  715. >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6)
  716. ' \x1b[31mhi\x1b[0m '
  717. >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
  718. ' 👨‍👩‍👧 '
  719. """
  720. if text.isascii() and text.isprintable():
  721. text_width = len(text)
  722. else:
  723. text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width)
  724. total_padding = max(0, dest_width - text_width)
  725. # matching https://jazcap53.github.io/pythons-eccentric-strcenter.html
  726. left_pad = total_padding // 2 + (total_padding & dest_width & 1)
  727. right_pad = total_padding - left_pad
  728. return fillchar * left_pad + text + fillchar * right_pad
  729. def strip_sequences(text: str) -> str:
  730. r"""
  731. Return text with all terminal escape sequences removed.
  732. Unknown or incomplete ESC sequences are preserved.
  733. :param text: String that may contain terminal escape sequences.
  734. :returns: The input text with all escape sequences stripped.
  735. .. versionadded:: 0.3.0
  736. Example::
  737. >>> strip_sequences('\x1b[31mred\x1b[0m')
  738. 'red'
  739. >>> strip_sequences('hello')
  740. 'hello'
  741. >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text')
  742. 'bold red text'
  743. """
  744. return ZERO_WIDTH_PATTERN.sub('', text)
  745. def clip(
  746. text: str,
  747. start: int,
  748. end: int,
  749. *,
  750. fillchar: str = ' ',
  751. tabsize: int = 8,
  752. ambiguous_width: int = 1,
  753. propagate_sgr: bool = True,
  754. ) -> str:
  755. r"""
  756. Clip text to display columns ``(start, end)`` while preserving all terminal sequences.
  757. This function extracts a substring based on visible column positions rather than
  758. character indices. Terminal escape sequences are preserved in the output since
  759. they have zero display width. If a wide character (width 2) would be split at
  760. either boundary, it is replaced with ``fillchar``.
  761. TAB characters (``\t``) are expanded to spaces up to the next tab stop,
  762. controlled by the ``tabsize`` parameter.
  763. Other cursor movement characters (backspace, carriage return) and cursor
  764. movement sequences are passed through unchanged as zero-width.
  765. :param text: String to clip, may contain terminal escape sequences.
  766. :param start: Absolute starting column (inclusive, 0-indexed).
  767. :param end: Absolute ending column (exclusive).
  768. :param fillchar: Character to use when a wide character must be split at
  769. a boundary (default space). Must have display width of 1.
  770. :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through
  771. as zero-width (preserved in output but don't advance column position).
  772. :param ambiguous_width: Width to use for East Asian Ambiguous (A)
  773. characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
  774. :param propagate_sgr: If True (default), SGR (terminal styling) sequences
  775. are propagated. The result begins with any active style at the start
  776. position and ends with a reset sequence if styles are active.
  777. :returns: Substring of ``text`` spanning display columns ``(start, end)``,
  778. with all terminal sequences preserved and wide characters at boundaries
  779. replaced with ``fillchar``.
  780. SGR (terminal styling) sequences are propagated by default. The result
  781. begins with any active style and ends with a reset::
  782. >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11)
  783. '\x1b[1;34mworld\x1b[0m'
  784. Set ``propagate_sgr=False`` to disable this behavior.
  785. .. versionadded:: 0.3.0
  786. .. versionchanged:: 0.5.0
  787. Added ``propagate_sgr`` parameter (default True).
  788. Example::
  789. >>> clip('hello world', 0, 5)
  790. 'hello'
  791. >>> clip('中文字', 0, 3) # Wide char split at column 3
  792. '中 '
  793. >>> clip('a\tb', 0, 10) # Tab expanded to spaces
  794. 'a b'
  795. """
  796. # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks
  797. # Again, for 'hot path', we avoid additional delegate functions and accept the cost
  798. # of complexity for improved python performance.
  799. start = max(start, 0)
  800. if end <= start:
  801. return ''
  802. # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars)
  803. if text.isascii() and text.isprintable():
  804. return text[start:end]
  805. # Fast path: no escape sequences means no SGR tracking needed
  806. if propagate_sgr and '\x1b' not in text:
  807. propagate_sgr = False
  808. # SGR tracking state (only when propagate_sgr=True)
  809. sgr_at_clip_start = None # state when first visible char emitted (None = not yet)
  810. if propagate_sgr:
  811. sgr = _SGR_STATE_DEFAULT # current SGR state, updated by all sequences
  812. output: list[str] = []
  813. col = 0
  814. idx = 0
  815. while idx < len(text):
  816. char = text[idx]
  817. # Early exit: past visible region, SGR captured, no escape ahead
  818. if col >= end and sgr_at_clip_start is not None and char != '\x1b':
  819. break
  820. # Handle escape sequences
  821. if char == '\x1b' and (match := ZERO_WIDTH_PATTERN.match(text, idx)):
  822. seq = match.group()
  823. if propagate_sgr and _SGR_PATTERN.match(seq):
  824. # Update SGR state; will be applied as prefix when visible content starts
  825. sgr = _sgr_state_update(sgr, seq)
  826. else:
  827. # Non-SGR sequences always preserved
  828. output.append(seq)
  829. idx = match.end()
  830. continue
  831. # Handle bare ESC (not a valid sequence)
  832. if char == '\x1b':
  833. output.append(char)
  834. idx += 1
  835. continue
  836. # TAB expansion
  837. if char == '\t':
  838. if tabsize > 0:
  839. next_tab = col + (tabsize - (col % tabsize))
  840. while col < next_tab:
  841. if start <= col < end:
  842. output.append(' ')
  843. if propagate_sgr and sgr_at_clip_start is None:
  844. sgr_at_clip_start = sgr
  845. col += 1
  846. else:
  847. output.append(char)
  848. idx += 1
  849. continue
  850. # Grapheme clustering for everything else
  851. grapheme = next(iter_graphemes(text, start=idx))
  852. w = width(grapheme, ambiguous_width=ambiguous_width)
  853. if w == 0:
  854. if start <= col < end:
  855. output.append(grapheme)
  856. elif col >= start and col + w <= end:
  857. # Fully visible
  858. output.append(grapheme)
  859. if propagate_sgr and sgr_at_clip_start is None:
  860. sgr_at_clip_start = sgr
  861. col += w
  862. elif col < end and col + w > start:
  863. # Partially visible (wide char at boundary)
  864. output.append(fillchar * (min(end, col + w) - max(start, col)))
  865. if propagate_sgr and sgr_at_clip_start is None:
  866. sgr_at_clip_start = sgr
  867. col += w
  868. else:
  869. col += w
  870. idx += len(grapheme)
  871. result = ''.join(output)
  872. # Apply SGR prefix/suffix
  873. if sgr_at_clip_start is not None:
  874. if prefix := _sgr_state_to_sequence(sgr_at_clip_start):
  875. result = prefix + result
  876. if _sgr_state_is_active(sgr_at_clip_start):
  877. result += '\x1b[0m'
  878. return result