utils.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. import re
  2. import sys
  3. from ast import literal_eval
  4. from functools import total_ordering
  5. from typing import NamedTuple, Union
  6. # The following is a list in Python that are line breaks in str.splitlines, but
  7. # not in Python. In Python only \r (Carriage Return, 0xD) and \n (Line Feed,
  8. # 0xA) are allowed to split lines.
  9. _NON_LINE_BREAKS = (
  10. '\v', # Vertical Tabulation 0xB
  11. '\f', # Form Feed 0xC
  12. '\x1C', # File Separator
  13. '\x1D', # Group Separator
  14. '\x1E', # Record Separator
  15. '\x85', # Next Line (NEL - Equivalent to CR+LF.
  16. # Used to mark end-of-line on some IBM mainframes.)
  17. '\u2028', # Line Separator
  18. '\u2029', # Paragraph Separator
  19. )
  20. class Version(NamedTuple):
  21. major: int
  22. minor: int
  23. micro: int
  24. def split_lines(string: str, keepends: bool = False) -> "list[str]":
  25. r"""
  26. Intended for Python code. In contrast to Python's :py:meth:`str.splitlines`,
  27. looks at form feeds and other special characters as normal text. Just
  28. splits ``\n`` and ``\r\n``.
  29. Also different: Returns ``[""]`` for an empty string input.
  30. In Python 2.7 form feeds are used as normal characters when using
  31. str.splitlines. However in Python 3 somewhere there was a decision to split
  32. also on form feeds.
  33. """
  34. if keepends:
  35. lst = string.splitlines(True)
  36. # We have to merge lines that were broken by form feed characters.
  37. merge = []
  38. for i, line in enumerate(lst):
  39. try:
  40. last_chr = line[-1]
  41. except IndexError:
  42. pass
  43. else:
  44. if last_chr in _NON_LINE_BREAKS:
  45. merge.append(i)
  46. for index in reversed(merge):
  47. try:
  48. lst[index] = lst[index] + lst[index + 1]
  49. del lst[index + 1]
  50. except IndexError:
  51. # index + 1 can be empty and therefore there's no need to
  52. # merge.
  53. pass
  54. # The stdlib's implementation of the end is inconsistent when calling
  55. # it with/without keepends. One time there's an empty string in the
  56. # end, one time there's none.
  57. if string.endswith('\n') or string.endswith('\r') or string == '':
  58. lst.append('')
  59. return lst
  60. else:
  61. return re.split(r'\n|\r\n|\r', string)
  62. def python_bytes_to_unicode(
  63. source: Union[str, bytes], encoding: str = 'utf-8', errors: str = 'strict'
  64. ) -> str:
  65. """
  66. Checks for unicode BOMs and PEP 263 encoding declarations. Then returns a
  67. unicode object like in :py:meth:`bytes.decode`.
  68. :param encoding: See :py:meth:`bytes.decode` documentation.
  69. :param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be
  70. ``'strict'``, ``'replace'`` or ``'ignore'``.
  71. """
  72. def detect_encoding():
  73. """
  74. For the implementation of encoding definitions in Python, look at:
  75. - http://www.python.org/dev/peps/pep-0263/
  76. - http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
  77. """
  78. byte_mark = literal_eval(r"b'\xef\xbb\xbf'")
  79. if source.startswith(byte_mark):
  80. # UTF-8 byte-order mark
  81. return 'utf-8'
  82. first_two_lines = re.match(br'(?:[^\r\n]*(?:\r\n|\r|\n)){0,2}', source).group(0)
  83. possible_encoding = re.search(br"coding[=:]\s*([-\w.]+)",
  84. first_two_lines)
  85. if possible_encoding:
  86. e = possible_encoding.group(1)
  87. if not isinstance(e, str):
  88. e = str(e, 'ascii', 'replace')
  89. return e
  90. else:
  91. # the default if nothing else has been set -> PEP 263
  92. return encoding
  93. if isinstance(source, str):
  94. # only cast str/bytes
  95. return source
  96. encoding = detect_encoding()
  97. try:
  98. # Cast to unicode
  99. return str(source, encoding, errors)
  100. except LookupError:
  101. if errors == 'replace':
  102. # This is a weird case that can happen if the given encoding is not
  103. # a valid encoding. This usually shouldn't happen with provided
  104. # encodings, but can happen if somebody uses encoding declarations
  105. # like `# coding: foo-8`.
  106. return str(source, 'utf-8', errors)
  107. raise
  108. def version_info() -> Version:
  109. """
  110. Returns a namedtuple of parso's version, similar to Python's
  111. ``sys.version_info``.
  112. """
  113. from parso import __version__
  114. tupl = re.findall(r'[a-z]+|\d+', __version__)
  115. return Version(*[x if i == 3 else int(x) for i, x in enumerate(tupl)])
  116. class _PythonVersionInfo(NamedTuple):
  117. major: int
  118. minor: int
  119. @total_ordering
  120. class PythonVersionInfo(_PythonVersionInfo):
  121. def __gt__(self, other):
  122. if isinstance(other, tuple):
  123. if len(other) != 2:
  124. raise ValueError("Can only compare to tuples of length 2.")
  125. return (self.major, self.minor) > other
  126. super().__gt__(other)
  127. return (self.major, self.minor)
  128. def __eq__(self, other):
  129. if isinstance(other, tuple):
  130. if len(other) != 2:
  131. raise ValueError("Can only compare to tuples of length 2.")
  132. return (self.major, self.minor) == other
  133. super().__eq__(other)
  134. def __ne__(self, other):
  135. return not self.__eq__(other)
  136. def _parse_version(version) -> PythonVersionInfo:
  137. match = re.match(r'(\d+)(?:\.(\d{1,2})(?:\.\d+)?)?((a|b|rc)\d)?$', version)
  138. if match is None:
  139. raise ValueError('The given version is not in the right format. '
  140. 'Use something like "3.8" or "3".')
  141. major = int(match.group(1))
  142. minor = match.group(2)
  143. if minor is None:
  144. # Use the latest Python in case it's not exactly defined, because the
  145. # grammars are typically backwards compatible?
  146. if major == 2:
  147. minor = "7"
  148. elif major == 3:
  149. minor = "6"
  150. else:
  151. raise NotImplementedError("Sorry, no support yet for those fancy new/old versions.")
  152. minor = int(minor)
  153. return PythonVersionInfo(major, minor)
  154. def parse_version_string(version: str = None) -> PythonVersionInfo:
  155. """
  156. Checks for a valid version number (e.g. `3.8` or `3.10.1` or `3`) and
  157. returns a corresponding version info that is always two characters long in
  158. decimal.
  159. """
  160. if version is None:
  161. version = '%s.%s' % sys.version_info[:2]
  162. if not isinstance(version, str):
  163. raise TypeError('version must be a string like "3.8"')
  164. return _parse_version(version)