asttokens.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. # Copyright 2016 Grist Labs, Inc.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # https://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import abc
  15. import ast
  16. import bisect
  17. import sys
  18. import token
  19. from ast import Module
  20. from typing import Iterable, Iterator, List, Optional, Tuple, Any, cast, TYPE_CHECKING
  21. from .line_numbers import LineNumbers
  22. from .util import (
  23. Token, match_token, is_non_coding_token, patched_generate_tokens, last_stmt,
  24. annotate_fstring_nodes, generate_tokens, is_module, is_stmt
  25. )
  26. if TYPE_CHECKING: # pragma: no cover
  27. from .util import AstNode, TokenInfo
  28. class ASTTextBase(metaclass=abc.ABCMeta):
  29. def __init__(self, source_text: str, filename: str) -> None:
  30. self._filename = filename
  31. # Decode source after parsing to let Python 2 handle coding declarations.
  32. # (If the encoding was not utf-8 compatible, then even if it parses correctly,
  33. # we'll fail with a unicode error here.)
  34. source_text = str(source_text)
  35. self._text = source_text
  36. self._line_numbers = LineNumbers(source_text)
  37. @abc.abstractmethod
  38. def get_text_positions(self, node, padded):
  39. # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
  40. """
  41. Returns two ``(lineno, col_offset)`` tuples for the start and end of the given node.
  42. If the positions can't be determined, or the nodes don't correspond to any particular text,
  43. returns ``(1, 0)`` for both.
  44. ``padded`` corresponds to the ``padded`` argument to ``ast.get_source_segment()``.
  45. This means that if ``padded`` is True, the start position will be adjusted to include
  46. leading whitespace if ``node`` is a multiline statement.
  47. """
  48. raise NotImplementedError # pragma: no cover
  49. def get_text_range(self, node, padded=True):
  50. # type: (AstNode, bool) -> Tuple[int, int]
  51. """
  52. Returns the (startpos, endpos) positions in source text corresponding to the given node.
  53. Returns (0, 0) for nodes (like `Load`) that don't correspond to any particular text.
  54. See ``get_text_positions()`` for details on the ``padded`` argument.
  55. """
  56. start, end = self.get_text_positions(node, padded)
  57. return (
  58. self._line_numbers.line_to_offset(*start),
  59. self._line_numbers.line_to_offset(*end),
  60. )
  61. def get_text(self, node, padded=True):
  62. # type: (AstNode, bool) -> str
  63. """
  64. Returns the text corresponding to the given node.
  65. Returns '' for nodes (like `Load`) that don't correspond to any particular text.
  66. See ``get_text_positions()`` for details on the ``padded`` argument.
  67. """
  68. start, end = self.get_text_range(node, padded)
  69. return self._text[start: end]
  70. class ASTTokens(ASTTextBase):
  71. """
  72. ASTTokens maintains the text of Python code in several forms: as a string, as line numbers, and
  73. as tokens, and is used to mark and access token and position information.
  74. ``source_text`` must be a unicode or UTF8-encoded string. If you pass in UTF8 bytes, remember
  75. that all offsets you'll get are to the unicode text, which is available as the ``.text``
  76. property.
  77. If ``parse`` is set, the ``source_text`` will be parsed with ``ast.parse()``, and the resulting
  78. tree marked with token info and made available as the ``.tree`` property.
  79. If ``tree`` is given, it will be marked and made available as the ``.tree`` property. In
  80. addition to the trees produced by the ``ast`` module, ASTTokens will also mark trees produced
  81. using ``astroid`` library <https://www.astroid.org>.
  82. If only ``source_text`` is given, you may use ``.mark_tokens(tree)`` to mark the nodes of an AST
  83. tree created separately.
  84. """
  85. def __init__(self, source_text, parse=False, tree=None, filename='<unknown>', tokens=None):
  86. # type: (Any, bool, Optional[Module], str, Optional[Iterable[TokenInfo]]) -> None
  87. super(ASTTokens, self).__init__(source_text, filename)
  88. self._tree = ast.parse(source_text, filename) if parse else tree
  89. # Tokenize the code.
  90. if tokens is None:
  91. tokens = generate_tokens(self._text)
  92. self._tokens = list(self._translate_tokens(tokens))
  93. # Extract the start positions of all tokens, so that we can quickly map positions to tokens.
  94. self._token_offsets = [tok.startpos for tok in self._tokens]
  95. if self._tree:
  96. self.mark_tokens(self._tree)
  97. def mark_tokens(self, root_node):
  98. # type: (Module) -> None
  99. """
  100. Given the root of the AST or Astroid tree produced from source_text, visits all nodes marking
  101. them with token and position information by adding ``.first_token`` and
  102. ``.last_token`` attributes. This is done automatically in the constructor when ``parse`` or
  103. ``tree`` arguments are set, but may be used manually with a separate AST or Astroid tree.
  104. """
  105. # The hard work of this class is done by MarkTokens
  106. from .mark_tokens import MarkTokens # to avoid import loops
  107. MarkTokens(self).visit_tree(root_node)
  108. def _translate_tokens(self, original_tokens):
  109. # type: (Iterable[TokenInfo]) -> Iterator[Token]
  110. """
  111. Translates the given standard library tokens into our own representation.
  112. """
  113. for index, tok in enumerate(patched_generate_tokens(original_tokens)):
  114. tok_type, tok_str, start, end, line = tok
  115. yield Token(tok_type, tok_str, start, end, line, index,
  116. self._line_numbers.line_to_offset(start[0], start[1]),
  117. self._line_numbers.line_to_offset(end[0], end[1]))
  118. @property
  119. def text(self):
  120. # type: () -> str
  121. """The source code passed into the constructor."""
  122. return self._text
  123. @property
  124. def tokens(self):
  125. # type: () -> List[Token]
  126. """The list of tokens corresponding to the source code from the constructor."""
  127. return self._tokens
  128. @property
  129. def tree(self):
  130. # type: () -> Optional[Module]
  131. """The root of the AST tree passed into the constructor or parsed from the source code."""
  132. return self._tree
  133. @property
  134. def filename(self):
  135. # type: () -> str
  136. """The filename that was parsed"""
  137. return self._filename
  138. def get_token_from_offset(self, offset):
  139. # type: (int) -> Token
  140. """
  141. Returns the token containing the given character offset (0-based position in source text),
  142. or the preceeding token if the position is between tokens.
  143. """
  144. return self._tokens[bisect.bisect(self._token_offsets, offset) - 1]
  145. def get_token(self, lineno, col_offset):
  146. # type: (int, int) -> Token
  147. """
  148. Returns the token containing the given (lineno, col_offset) position, or the preceeding token
  149. if the position is between tokens.
  150. """
  151. # TODO: add test for multibyte unicode. We need to translate offsets from ast module (which
  152. # are in utf8) to offsets into the unicode text. tokenize module seems to use unicode offsets
  153. # but isn't explicit.
  154. return self.get_token_from_offset(self._line_numbers.line_to_offset(lineno, col_offset))
  155. def get_token_from_utf8(self, lineno, col_offset):
  156. # type: (int, int) -> Token
  157. """
  158. Same as get_token(), but interprets col_offset as a UTF8 offset, which is what `ast` uses.
  159. """
  160. return self.get_token(lineno, self._line_numbers.from_utf8_col(lineno, col_offset))
  161. def next_token(self, tok, include_extra=False):
  162. # type: (Token, bool) -> Token
  163. """
  164. Returns the next token after the given one. If include_extra is True, includes non-coding
  165. tokens from the tokenize module, such as NL and COMMENT.
  166. """
  167. i = tok.index + 1
  168. if not include_extra:
  169. while is_non_coding_token(self._tokens[i].type):
  170. i += 1
  171. return self._tokens[i]
  172. def prev_token(self, tok, include_extra=False):
  173. # type: (Token, bool) -> Token
  174. """
  175. Returns the previous token before the given one. If include_extra is True, includes non-coding
  176. tokens from the tokenize module, such as NL and COMMENT.
  177. """
  178. i = tok.index - 1
  179. if not include_extra:
  180. while is_non_coding_token(self._tokens[i].type):
  181. i -= 1
  182. return self._tokens[i]
  183. def find_token(self, start_token, tok_type, tok_str=None, reverse=False):
  184. # type: (Token, int, Optional[str], bool) -> Token
  185. """
  186. Looks for the first token, starting at start_token, that matches tok_type and, if given, the
  187. token string. Searches backwards if reverse is True. Returns ENDMARKER token if not found (you
  188. can check it with `token.ISEOF(t.type)`).
  189. """
  190. t = start_token
  191. advance = self.prev_token if reverse else self.next_token
  192. while not match_token(t, tok_type, tok_str) and not token.ISEOF(t.type):
  193. t = advance(t, include_extra=True)
  194. return t
  195. def token_range(self,
  196. first_token, # type: Token
  197. last_token, # type: Token
  198. include_extra=False, # type: bool
  199. ):
  200. # type: (...) -> Iterator[Token]
  201. """
  202. Yields all tokens in order from first_token through and including last_token. If
  203. include_extra is True, includes non-coding tokens such as tokenize.NL and .COMMENT.
  204. """
  205. for i in range(first_token.index, last_token.index + 1):
  206. if include_extra or not is_non_coding_token(self._tokens[i].type):
  207. yield self._tokens[i]
  208. def get_tokens(self, node, include_extra=False):
  209. # type: (AstNode, bool) -> Iterator[Token]
  210. """
  211. Yields all tokens making up the given node. If include_extra is True, includes non-coding
  212. tokens such as tokenize.NL and .COMMENT.
  213. """
  214. return self.token_range(node.first_token, node.last_token, include_extra=include_extra)
  215. def get_text_positions(self, node, padded):
  216. # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
  217. """
  218. Returns two ``(lineno, col_offset)`` tuples for the start and end of the given node.
  219. If the positions can't be determined, or the nodes don't correspond to any particular text,
  220. returns ``(1, 0)`` for both.
  221. ``padded`` corresponds to the ``padded`` argument to ``ast.get_source_segment()``.
  222. This means that if ``padded`` is True, the start position will be adjusted to include
  223. leading whitespace if ``node`` is a multiline statement.
  224. """
  225. if not hasattr(node, 'first_token'):
  226. return (1, 0), (1, 0)
  227. start = node.first_token.start
  228. end = node.last_token.end
  229. if padded and any(match_token(t, token.NEWLINE) for t in self.get_tokens(node)):
  230. # Set col_offset to 0 to include leading indentation for multiline statements.
  231. start = (start[0], 0)
  232. return start, end
  233. class ASTText(ASTTextBase):
  234. """
  235. Supports the same ``get_text*`` methods as ``ASTTokens``,
  236. but uses the AST to determine the text positions instead of tokens.
  237. This is faster than ``ASTTokens`` as it requires less setup work.
  238. It also (sometimes) supports nodes inside f-strings, which ``ASTTokens`` doesn't.
  239. Some node types and/or Python versions are not supported.
  240. In these cases the ``get_text*`` methods will fall back to using ``ASTTokens``
  241. which incurs the usual setup cost the first time.
  242. If you want to avoid this, check ``supports_tokenless(node)`` before calling ``get_text*`` methods.
  243. """
  244. def __init__(self, source_text, tree=None, filename='<unknown>'):
  245. # type: (Any, Optional[Module], str) -> None
  246. super(ASTText, self).__init__(source_text, filename)
  247. self._tree = tree
  248. if self._tree is not None:
  249. annotate_fstring_nodes(self._tree)
  250. self._asttokens = None # type: Optional[ASTTokens]
  251. @property
  252. def tree(self):
  253. # type: () -> Module
  254. if self._tree is None:
  255. self._tree = ast.parse(self._text, self._filename)
  256. annotate_fstring_nodes(self._tree)
  257. return self._tree
  258. @property
  259. def asttokens(self):
  260. # type: () -> ASTTokens
  261. if self._asttokens is None:
  262. self._asttokens = ASTTokens(
  263. self._text,
  264. tree=self.tree,
  265. filename=self._filename,
  266. )
  267. return self._asttokens
  268. def _get_text_positions_tokenless(self, node, padded):
  269. # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
  270. """
  271. Version of ``get_text_positions()`` that doesn't use tokens.
  272. """
  273. if is_module(node):
  274. # Modules don't have position info, so just return the range of the whole text.
  275. # The token-using method does something different, but its behavior seems weird and inconsistent.
  276. # For example, in a file with only comments, it only returns the first line.
  277. # It's hard to imagine a case when this matters.
  278. return (1, 0), self._line_numbers.offset_to_line(len(self._text))
  279. if getattr(node, 'lineno', None) is None:
  280. return (1, 0), (1, 0)
  281. assert node # tell mypy that node is not None, which we allowed up to here for compatibility
  282. decorators = getattr(node, 'decorator_list', [])
  283. if not decorators:
  284. # Astroid uses node.decorators.nodes instead of node.decorator_list.
  285. decorators_node = getattr(node, 'decorators', None)
  286. decorators = getattr(decorators_node, 'nodes', [])
  287. if decorators:
  288. # Function/Class definition nodes are marked by AST as starting at def/class,
  289. # not the first decorator. This doesn't match the token-using behavior,
  290. # or inspect.getsource(), and just seems weird.
  291. start_node = decorators[0]
  292. else:
  293. start_node = node
  294. start_lineno = start_node.lineno
  295. end_node = last_stmt(node)
  296. # Include leading indentation for multiline statements.
  297. # This doesn't mean simple statements that happen to be on multiple lines,
  298. # but compound statements where inner indentation matters.
  299. # So we don't just compare node.lineno and node.end_lineno,
  300. # we check for a contained statement starting on a different line.
  301. if padded and (
  302. start_lineno != end_node.lineno
  303. or (
  304. # Astroid docstrings aren't treated as separate statements.
  305. # So to handle function/class definitions with a docstring but no other body,
  306. # we just check that the node is a statement with a docstring
  307. # and spanning multiple lines in the simple, literal sense.
  308. start_lineno != node.end_lineno
  309. and getattr(node, "doc_node", None)
  310. and is_stmt(node)
  311. )
  312. ):
  313. start_col_offset = 0
  314. else:
  315. start_col_offset = self._line_numbers.from_utf8_col(start_lineno, start_node.col_offset)
  316. start = (start_lineno, start_col_offset)
  317. # To match the token-using behaviour, we exclude trailing semicolons and comments.
  318. # This means that for blocks containing multiple statements, we have to use the last one
  319. # instead of the actual node for end_lineno and end_col_offset.
  320. end_lineno = cast(int, end_node.end_lineno)
  321. end_col_offset = cast(int, end_node.end_col_offset)
  322. end_col_offset = self._line_numbers.from_utf8_col(end_lineno, end_col_offset)
  323. end = (end_lineno, end_col_offset)
  324. return start, end
  325. def get_text_positions(self, node, padded):
  326. # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
  327. """
  328. Returns two ``(lineno, col_offset)`` tuples for the start and end of the given node.
  329. If the positions can't be determined, or the nodes don't correspond to any particular text,
  330. returns ``(1, 0)`` for both.
  331. ``padded`` corresponds to the ``padded`` argument to ``ast.get_source_segment()``.
  332. This means that if ``padded`` is True, the start position will be adjusted to include
  333. leading whitespace if ``node`` is a multiline statement.
  334. """
  335. if getattr(node, "_broken_positions", None):
  336. # This node was marked in util.annotate_fstring_nodes as having untrustworthy lineno/col_offset.
  337. return (1, 0), (1, 0)
  338. if supports_tokenless(node):
  339. return self._get_text_positions_tokenless(node, padded)
  340. return self.asttokens.get_text_positions(node, padded)
  341. # Node types that _get_text_positions_tokenless doesn't support.
  342. # These initial values are missing lineno.
  343. _unsupported_tokenless_types = ("arguments", "Arguments", "withitem") # type: Tuple[str, ...]
  344. if sys.version_info[:2] == (3, 8):
  345. # _get_text_positions_tokenless works incorrectly for these types due to bugs in Python 3.8.
  346. _unsupported_tokenless_types += ("arg", "Starred")
  347. # no lineno in 3.8
  348. _unsupported_tokenless_types += ("Slice", "ExtSlice", "Index", "keyword")
  349. def supports_tokenless(node=None):
  350. # type: (Any) -> bool
  351. """
  352. Returns True if the Python version and the node (if given) are supported by
  353. the ``get_text*`` methods of ``ASTText`` without falling back to ``ASTTokens``.
  354. See ``ASTText`` for why this matters.
  355. The following cases are not supported:
  356. - PyPy
  357. - ``ast.arguments`` / ``astroid.Arguments``
  358. - ``ast.withitem``
  359. - ``astroid.Comprehension``
  360. - ``astroid.AssignName`` inside ``astroid.Arguments`` or ``astroid.ExceptHandler``
  361. - The following nodes in Python 3.8 only:
  362. - ``ast.arg``
  363. - ``ast.Starred``
  364. - ``ast.Slice``
  365. - ``ast.ExtSlice``
  366. - ``ast.Index``
  367. - ``ast.keyword``
  368. """
  369. return (
  370. type(node).__name__ not in _unsupported_tokenless_types
  371. and not (
  372. # astroid nodes
  373. not isinstance(node, ast.AST) and node is not None and (
  374. (
  375. type(node).__name__ == "AssignName"
  376. and type(node.parent).__name__ in ("Arguments", "ExceptHandler")
  377. )
  378. )
  379. )
  380. and 'pypy' not in sys.version.lower()
  381. )