mark_tokens.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. # Copyright 2016 Grist Labs, Inc.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # https://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import ast
  15. import numbers
  16. import sys
  17. import token
  18. from ast import Module
  19. from typing import Callable, List, Union, cast, Optional, Tuple, TYPE_CHECKING
  20. from . import util
  21. from .asttokens import ASTTokens
  22. from .astroid_compat import astroid_node_classes as nc, BaseContainer as AstroidBaseContainer
  23. if TYPE_CHECKING:
  24. from .util import AstNode
  25. # Mapping of matching braces. To find a token here, look up token[:2].
  26. _matching_pairs_left = {
  27. (token.OP, '('): (token.OP, ')'),
  28. (token.OP, '['): (token.OP, ']'),
  29. (token.OP, '{'): (token.OP, '}'),
  30. }
  31. _matching_pairs_right = {
  32. (token.OP, ')'): (token.OP, '('),
  33. (token.OP, ']'): (token.OP, '['),
  34. (token.OP, '}'): (token.OP, '{'),
  35. }
  36. class MarkTokens:
  37. """
  38. Helper that visits all nodes in the AST tree and assigns .first_token and .last_token attributes
  39. to each of them. This is the heart of the token-marking logic.
  40. """
  41. def __init__(self, code):
  42. # type: (ASTTokens) -> None
  43. self._code = code
  44. self._methods = util.NodeMethods()
  45. self._iter_children = None # type: Optional[Callable]
  46. def visit_tree(self, node):
  47. # type: (Module) -> None
  48. self._iter_children = util.iter_children_func(node)
  49. util.visit_tree(node, self._visit_before_children, self._visit_after_children)
  50. def _visit_before_children(self, node, parent_token):
  51. # type: (AstNode, Optional[util.Token]) -> Tuple[Optional[util.Token], Optional[util.Token]]
  52. col = getattr(node, 'col_offset', None)
  53. token = self._code.get_token_from_utf8(node.lineno, col) if col is not None else None
  54. if not token and util.is_module(node):
  55. # We'll assume that a Module node starts at the start of the source code.
  56. token = self._code.get_token(1, 0)
  57. # Use our own token, or our parent's if we don't have one, to pass to child calls as
  58. # parent_token argument. The second value becomes the token argument of _visit_after_children.
  59. return (token or parent_token, token)
  60. def _visit_after_children(self, node, parent_token, token):
  61. # type: (AstNode, Optional[util.Token], Optional[util.Token]) -> None
  62. # This processes the node generically first, after all children have been processed.
  63. # Get the first and last tokens that belong to children. Note how this doesn't assume that we
  64. # iterate through children in order that corresponds to occurrence in source code. This
  65. # assumption can fail (e.g. with return annotations).
  66. first = token
  67. last = None
  68. for child in cast(Callable, self._iter_children)(node):
  69. # astroid slices have especially wrong positions, we don't want them to corrupt their parents.
  70. if util.is_empty_astroid_slice(child):
  71. continue
  72. if not first or child.first_token.index < first.index:
  73. first = child.first_token
  74. if not last or child.last_token.index > last.index:
  75. last = child.last_token
  76. # If we don't have a first token from _visit_before_children, and there were no children, then
  77. # use the parent's token as the first token.
  78. first = first or parent_token
  79. # If no children, set last token to the first one.
  80. last = last or first
  81. # Statements continue to before NEWLINE. This helps cover a few different cases at once.
  82. if util.is_stmt(node):
  83. last = self._find_last_in_stmt(cast(util.Token, last))
  84. # Capture any unmatched brackets.
  85. first, last = self._expand_to_matching_pairs(cast(util.Token, first), cast(util.Token, last), node)
  86. # Give a chance to node-specific methods to adjust.
  87. nfirst, nlast = self._methods.get(self, node.__class__)(node, first, last)
  88. if (nfirst, nlast) != (first, last):
  89. # If anything changed, expand again to capture any unmatched brackets.
  90. nfirst, nlast = self._expand_to_matching_pairs(nfirst, nlast, node)
  91. node.first_token = nfirst
  92. node.last_token = nlast
  93. def _find_last_in_stmt(self, start_token):
  94. # type: (util.Token) -> util.Token
  95. t = start_token
  96. while (not util.match_token(t, token.NEWLINE) and
  97. not util.match_token(t, token.OP, ';') and
  98. not token.ISEOF(t.type)):
  99. t = self._code.next_token(t, include_extra=True)
  100. return self._code.prev_token(t)
  101. def _expand_to_matching_pairs(self, first_token, last_token, node):
  102. # type: (util.Token, util.Token, AstNode) -> Tuple[util.Token, util.Token]
  103. """
  104. Scan tokens in [first_token, last_token] range that are between node's children, and for any
  105. unmatched brackets, adjust first/last tokens to include the closing pair.
  106. """
  107. # We look for opening parens/braces among non-child tokens (i.e. tokens between our actual
  108. # child nodes). If we find any closing ones, we match them to the opens.
  109. to_match_right = [] # type: List[Tuple[int, str]]
  110. to_match_left = []
  111. for tok in self._code.token_range(first_token, last_token):
  112. tok_info = tok[:2]
  113. if to_match_right and tok_info == to_match_right[-1]:
  114. to_match_right.pop()
  115. elif tok_info in _matching_pairs_left:
  116. to_match_right.append(_matching_pairs_left[tok_info])
  117. elif tok_info in _matching_pairs_right:
  118. to_match_left.append(_matching_pairs_right[tok_info])
  119. # Once done, extend `last_token` to match any unclosed parens/braces.
  120. for match in reversed(to_match_right):
  121. last = self._code.next_token(last_token)
  122. # Allow for trailing commas or colons (allowed in subscripts) before the closing delimiter
  123. while any(util.match_token(last, token.OP, x) for x in (',', ':')):
  124. last = self._code.next_token(last)
  125. # Now check for the actual closing delimiter.
  126. if util.match_token(last, *match):
  127. last_token = last
  128. # And extend `first_token` to match any unclosed opening parens/braces.
  129. for match in to_match_left:
  130. first = self._code.prev_token(first_token)
  131. if util.match_token(first, *match):
  132. first_token = first
  133. return (first_token, last_token)
  134. #----------------------------------------------------------------------
  135. # Node visitors. Each takes a preliminary first and last tokens, and returns the adjusted pair
  136. # that will actually be assigned.
  137. def visit_default(self, node, first_token, last_token):
  138. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  139. # pylint: disable=no-self-use
  140. # By default, we don't need to adjust the token we computed earlier.
  141. return (first_token, last_token)
  142. def handle_comp(self, open_brace, node, first_token, last_token):
  143. # type: (str, AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  144. # For list/set/dict comprehensions, we only get the token of the first child, so adjust it to
  145. # include the opening brace (the closing brace will be matched automatically).
  146. before = self._code.prev_token(first_token)
  147. util.expect_token(before, token.OP, open_brace)
  148. return (before, last_token)
  149. def visit_comprehension(self,
  150. node, # type: AstNode
  151. first_token, # type: util.Token
  152. last_token, # type: util.Token
  153. ):
  154. # type: (...) -> Tuple[util.Token, util.Token]
  155. # The 'comprehension' node starts with 'for' but we only get first child; we search backwards
  156. # to find the 'for' keyword.
  157. first = self._code.find_token(first_token, token.NAME, 'for', reverse=True)
  158. return (first, last_token)
  159. def visit_if(self, node, first_token, last_token):
  160. # type: (util.Token, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  161. while first_token.string not in ('if', 'elif'):
  162. first_token = self._code.prev_token(first_token)
  163. return first_token, last_token
  164. def handle_attr(self, node, first_token, last_token):
  165. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  166. # Attribute node has ".attr" (2 tokens) after the last child.
  167. dot = self._code.find_token(last_token, token.OP, '.')
  168. name = self._code.next_token(dot)
  169. util.expect_token(name, token.NAME)
  170. return (first_token, name)
  171. visit_attribute = handle_attr
  172. visit_assignattr = handle_attr
  173. visit_delattr = handle_attr
  174. def handle_def(self, node, first_token, last_token):
  175. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  176. # With astroid, nodes that start with a doc-string can have an empty body, in which case we
  177. # need to adjust the last token to include the doc string.
  178. if not node.body and (getattr(node, 'doc_node', None) or getattr(node, 'doc', None)): # type: ignore[union-attr]
  179. last_token = self._code.find_token(last_token, token.STRING)
  180. # Include @ from decorator
  181. if first_token.index > 0:
  182. prev = self._code.prev_token(first_token)
  183. if util.match_token(prev, token.OP, '@'):
  184. first_token = prev
  185. return (first_token, last_token)
  186. visit_classdef = handle_def
  187. visit_functiondef = handle_def
  188. def handle_following_brackets(self, node, last_token, opening_bracket):
  189. # type: (AstNode, util.Token, str) -> util.Token
  190. # This is for calls and subscripts, which have a pair of brackets
  191. # at the end which may contain no nodes, e.g. foo() or bar[:].
  192. # We look for the opening bracket and then let the matching pair be found automatically
  193. # Remember that last_token is at the end of all children,
  194. # so we are not worried about encountering a bracket that belongs to a child.
  195. first_child = next(cast(Callable, self._iter_children)(node))
  196. call_start = self._code.find_token(first_child.last_token, token.OP, opening_bracket)
  197. if call_start.index > last_token.index:
  198. last_token = call_start
  199. return last_token
  200. def visit_call(self, node, first_token, last_token):
  201. # type: (util.Token, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  202. last_token = self.handle_following_brackets(node, last_token, '(')
  203. # Handling a python bug with decorators with empty parens, e.g.
  204. # @deco()
  205. # def ...
  206. if util.match_token(first_token, token.OP, '@'):
  207. first_token = self._code.next_token(first_token)
  208. return (first_token, last_token)
  209. def visit_matchclass(self, node, first_token, last_token):
  210. # type: (util.Token, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  211. last_token = self.handle_following_brackets(node, last_token, '(')
  212. return (first_token, last_token)
  213. def visit_subscript(self,
  214. node, # type: AstNode
  215. first_token, # type: util.Token
  216. last_token, # type: util.Token
  217. ):
  218. # type: (...) -> Tuple[util.Token, util.Token]
  219. last_token = self.handle_following_brackets(node, last_token, '[')
  220. return (first_token, last_token)
  221. def visit_slice(self, node, first_token, last_token):
  222. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  223. # consume `:` tokens to the left and right. In Python 3.9, Slice nodes are
  224. # given a col_offset, (and end_col_offset), so this will always start inside
  225. # the slice, even if it is the empty slice. However, in 3.8 and below, this
  226. # will only expand to the full slice if the slice contains a node with a
  227. # col_offset. So x[:] will only get the correct tokens in 3.9, but x[1:] and
  228. # x[:1] will even on earlier versions of Python.
  229. while True:
  230. prev = self._code.prev_token(first_token)
  231. if prev.string != ':':
  232. break
  233. first_token = prev
  234. while True:
  235. next_ = self._code.next_token(last_token)
  236. if next_.string != ':':
  237. break
  238. last_token = next_
  239. return (first_token, last_token)
  240. def handle_bare_tuple(self, node, first_token, last_token):
  241. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  242. # A bare tuple doesn't include parens; if there is a trailing comma, make it part of the tuple.
  243. maybe_comma = self._code.next_token(last_token)
  244. if util.match_token(maybe_comma, token.OP, ','):
  245. last_token = maybe_comma
  246. return (first_token, last_token)
  247. # In Python3.8 parsed tuples include parentheses when present.
  248. def handle_tuple_nonempty(self, node, first_token, last_token):
  249. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  250. assert isinstance(node, ast.Tuple) or isinstance(node, AstroidBaseContainer)
  251. # It's a bare tuple if the first token belongs to the first child. The first child may
  252. # include extraneous parentheses (which don't create new nodes), so account for those too.
  253. child = node.elts[0]
  254. if TYPE_CHECKING:
  255. child = cast(AstNode, child)
  256. child_first, child_last = self._gobble_parens(child.first_token, child.last_token, True)
  257. if first_token == child_first:
  258. return self.handle_bare_tuple(node, first_token, last_token)
  259. return (first_token, last_token)
  260. def visit_tuple(self, node, first_token, last_token):
  261. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  262. assert isinstance(node, ast.Tuple) or isinstance(node, AstroidBaseContainer)
  263. if not node.elts:
  264. # An empty tuple is just "()", and we need no further info.
  265. return (first_token, last_token)
  266. return self.handle_tuple_nonempty(node, first_token, last_token)
  267. def _gobble_parens(self, first_token, last_token, include_all=False):
  268. # type: (util.Token, util.Token, bool) -> Tuple[util.Token, util.Token]
  269. # Expands a range of tokens to include one or all pairs of surrounding parentheses, and
  270. # returns (first, last) tokens that include these parens.
  271. while first_token.index > 0:
  272. prev = self._code.prev_token(first_token)
  273. next = self._code.next_token(last_token)
  274. if util.match_token(prev, token.OP, '(') and util.match_token(next, token.OP, ')'):
  275. first_token, last_token = prev, next
  276. if include_all:
  277. continue
  278. break
  279. return (first_token, last_token)
  280. def visit_str(self, node, first_token, last_token):
  281. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  282. return self.handle_str(first_token, last_token)
  283. def visit_joinedstr(self,
  284. node, # type: AstNode
  285. first_token, # type: util.Token
  286. last_token, # type: util.Token
  287. ):
  288. # type: (...) -> Tuple[util.Token, util.Token]
  289. if sys.version_info < (3, 12):
  290. # Older versions don't tokenize the contents of f-strings
  291. return self.handle_str(first_token, last_token)
  292. last = first_token
  293. while True:
  294. if util.match_token(last, getattr(token, "FSTRING_START")):
  295. # Python 3.12+ has tokens for the start (e.g. `f"`) and end (`"`)
  296. # of the f-string. We can't just look for the next FSTRING_END
  297. # because f-strings can be nested, e.g. f"{f'{x}'}", so we need
  298. # to treat this like matching balanced parentheses.
  299. count = 1
  300. while count > 0:
  301. last = self._code.next_token(last)
  302. # mypy complains about token.FSTRING_START and token.FSTRING_END.
  303. if util.match_token(last, getattr(token, "FSTRING_START")):
  304. count += 1
  305. elif util.match_token(last, getattr(token, "FSTRING_END")):
  306. count -= 1
  307. last_token = last
  308. last = self._code.next_token(last_token)
  309. elif util.match_token(last, token.STRING):
  310. # Similar to handle_str, we also need to handle adjacent strings.
  311. last_token = last
  312. last = self._code.next_token(last_token)
  313. else:
  314. break
  315. return (first_token, last_token)
  316. def visit_bytes(self, node, first_token, last_token):
  317. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  318. return self.handle_str(first_token, last_token)
  319. def handle_str(self, first_token, last_token):
  320. # type: (util.Token, util.Token) -> Tuple[util.Token, util.Token]
  321. # Multiple adjacent STRING tokens form a single string.
  322. last = self._code.next_token(last_token)
  323. while util.match_token(last, token.STRING):
  324. last_token = last
  325. last = self._code.next_token(last_token)
  326. return (first_token, last_token)
  327. def handle_num(self,
  328. node, # type: AstNode
  329. value, # type: Union[complex, int, numbers.Number]
  330. first_token, # type: util.Token
  331. last_token, # type: util.Token
  332. ):
  333. # type: (...) -> Tuple[util.Token, util.Token]
  334. # A constant like '-1' gets turned into two tokens; this will skip the '-'.
  335. while util.match_token(last_token, token.OP):
  336. last_token = self._code.next_token(last_token)
  337. if isinstance(value, complex):
  338. # A complex number like -2j cannot be compared directly to 0
  339. # A complex number like 1-2j is expressed as a binary operation
  340. # so we don't need to worry about it
  341. value = value.imag
  342. # This makes sure that the - is included
  343. if value < 0 and first_token.type == token.NUMBER: # type: ignore[operator]
  344. first_token = self._code.prev_token(first_token)
  345. return (first_token, last_token)
  346. def visit_num(self, node, first_token, last_token):
  347. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  348. n = node.n # type: ignore[union-attr] # ast.Num has been removed in python 3.14
  349. assert isinstance(n, (complex, int, numbers.Number))
  350. return self.handle_num(node, n, first_token, last_token)
  351. def visit_const(self, node, first_token, last_token):
  352. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  353. assert isinstance(node, ast.Constant) or isinstance(node, nc.Const)
  354. if isinstance(node.value, numbers.Number):
  355. return self.handle_num(node, node.value, first_token, last_token)
  356. elif isinstance(node.value, (str, bytes)):
  357. return self.visit_str(node, first_token, last_token)
  358. return (first_token, last_token)
  359. visit_constant = visit_const
  360. def visit_keyword(self, node, first_token, last_token):
  361. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  362. # Until python 3.9 (https://bugs.python.org/issue40141),
  363. # ast.keyword nodes didn't have line info. Astroid has lineno None.
  364. assert isinstance(node, ast.keyword) or isinstance(node, nc.Keyword)
  365. if node.arg is not None and getattr(node, 'lineno', None) is None:
  366. equals = self._code.find_token(first_token, token.OP, '=', reverse=True)
  367. name = self._code.prev_token(equals)
  368. util.expect_token(name, token.NAME, node.arg)
  369. first_token = name
  370. return (first_token, last_token)
  371. def visit_starred(self, node, first_token, last_token):
  372. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  373. # Astroid has 'Starred' nodes (for "foo(*bar)" type args), but they need to be adjusted.
  374. if not util.match_token(first_token, token.OP, '*'):
  375. star = self._code.prev_token(first_token)
  376. if util.match_token(star, token.OP, '*'):
  377. first_token = star
  378. return (first_token, last_token)
  379. def visit_assignname(self, node, first_token, last_token):
  380. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  381. # Astroid may turn 'except' clause into AssignName, but we need to adjust it.
  382. if util.match_token(first_token, token.NAME, 'except'):
  383. colon = self._code.find_token(last_token, token.OP, ':')
  384. first_token = last_token = self._code.prev_token(colon)
  385. return (first_token, last_token)
  386. # Async nodes should typically start with the word 'async'
  387. # but Python < 3.7 doesn't put the col_offset there
  388. # AsyncFunctionDef is slightly different because it might have
  389. # decorators before that, which visit_functiondef handles
  390. def handle_async(self, node, first_token, last_token):
  391. # type: (AstNode, util.Token, util.Token) -> Tuple[util.Token, util.Token]
  392. if not first_token.string == 'async':
  393. first_token = self._code.prev_token(first_token)
  394. return (first_token, last_token)
  395. visit_asyncfor = handle_async
  396. visit_asyncwith = handle_async
  397. def visit_asyncfunctiondef(self,
  398. node, # type: AstNode
  399. first_token, # type: util.Token
  400. last_token, # type: util.Token
  401. ):
  402. # type: (...) -> Tuple[util.Token, util.Token]
  403. if util.match_token(first_token, token.NAME, 'def'):
  404. # Include the 'async' token
  405. first_token = self._code.prev_token(first_token)
  406. return self.visit_functiondef(node, first_token, last_token)