# jslexer.py
  1. """
  2. babel.messages.jslexer
  3. ~~~~~~~~~~~~~~~~~~~~~~
  4. A simple JavaScript 1.5 lexer which is used for the JavaScript
  5. extractor.
  6. :copyright: (c) 2013-2026 by the Babel Team.
  7. :license: BSD, see LICENSE for more details.
  8. """
  9. from __future__ import annotations
  10. import re
  11. from collections.abc import Generator
  12. from typing import NamedTuple
  13. operators: list[str] = sorted([
  14. '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
  15. '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
  16. '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
  17. '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':',
  18. ], key=len, reverse=True) # fmt: skip
  19. escapes: dict[str, str] = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
  20. name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
  21. dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
  22. division_re = re.compile(r'/=?')
  23. regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
  24. line_re = re.compile(r'(\r\n|\n|\r)')
  25. line_join_re = re.compile(r'\\' + line_re.pattern)
  26. uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
  27. hex_escape_re = re.compile(r'[a-fA-F0-9]{1,2}')
  28. class Token(NamedTuple):
  29. type: str
  30. value: str
  31. lineno: int
  32. _rules: list[tuple[str | None, re.Pattern[str]]] = [
  33. (None, re.compile(r'\s+', re.UNICODE)),
  34. (None, re.compile(r'<!--.*')),
  35. ('linecomment', re.compile(r'//.*')),
  36. ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
  37. ('dotted_name', dotted_name_re),
  38. ('name', name_re),
  39. ('number', re.compile(r'''(
  40. (?:0|[1-9]\d*)
  41. (\.\d+)?
  42. ([eE][-+]?\d+)? |
  43. (0x[a-fA-F0-9]+)
  44. )''', re.VERBOSE)),
  45. ('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)), # May be mangled in `get_rules`
  46. ('operator', re.compile(r'(%s)' % '|'.join(re.escape(op) for op in operators))),
  47. ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
  48. ('string', re.compile(r'''(
  49. '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
  50. "(?:[^"\\]*(?:\\.[^"\\]*)*)"
  51. )''', re.VERBOSE | re.DOTALL)),
  52. ] # fmt: skip
  53. def get_rules(
  54. jsx: bool,
  55. dotted: bool,
  56. template_string: bool,
  57. ) -> list[tuple[str | None, re.Pattern[str]]]:
  58. """
  59. Get a tokenization rule list given the passed syntax options.
  60. Internal to this module.
  61. """
  62. rules = []
  63. for token_type, rule in _rules:
  64. if not jsx and token_type and 'jsx' in token_type:
  65. continue
  66. if not template_string and token_type == 'template_string':
  67. continue
  68. if token_type == 'dotted_name':
  69. if not dotted:
  70. continue
  71. token_type = 'name'
  72. rules.append((token_type, rule))
  73. return rules
  74. def indicates_division(token: Token) -> bool:
  75. """A helper function that helps the tokenizer to decide if the current
  76. token may be followed by a division operator.
  77. """
  78. if token.type == 'operator':
  79. return token.value in (')', ']', '}', '++', '--')
  80. return token.type in ('name', 'number', 'string', 'regexp')
  81. def unquote_string(string: str) -> str:
  82. """Unquote a string with JavaScript rules. The string has to start with
  83. string delimiters (``'``, ``"`` or the back-tick/grave accent (for template strings).)
  84. """
  85. assert string and string[0] == string[-1] and string[0] in '"\'`', (
  86. 'string provided is not properly delimited'
  87. )
  88. string = line_join_re.sub('\\1', string[1:-1])
  89. result: list[str] = []
  90. add = result.append
  91. pos = 0
  92. while True:
  93. # scan for the next escape
  94. escape_pos = string.find('\\', pos)
  95. if escape_pos < 0:
  96. break
  97. add(string[pos:escape_pos])
  98. # check which character is escaped
  99. next_char = string[escape_pos + 1]
  100. if next_char in escapes:
  101. add(escapes[next_char])
  102. # unicode escapes. trie to consume up to four characters of
  103. # hexadecimal characters and try to interpret them as unicode
  104. # character point. If there is no such character point, put
  105. # all the consumed characters into the string.
  106. elif next_char in 'uU':
  107. escaped = uni_escape_re.match(string, escape_pos + 2)
  108. if escaped is not None:
  109. escaped_value = escaped.group()
  110. if len(escaped_value) == 4:
  111. try:
  112. add(chr(int(escaped_value, 16)))
  113. except ValueError:
  114. pass
  115. else:
  116. pos = escape_pos + 6
  117. continue
  118. add(next_char + escaped_value)
  119. pos = escaped.end()
  120. continue
  121. else:
  122. add(next_char)
  123. # hex escapes. conversion from 2-digits hex to char is infallible
  124. elif next_char in 'xX':
  125. escaped = hex_escape_re.match(string, escape_pos + 2)
  126. if escaped is not None:
  127. escaped_value = escaped.group()
  128. add(chr(int(escaped_value, 16)))
  129. pos = escape_pos + 2 + len(escaped_value)
  130. continue
  131. else:
  132. add(next_char)
  133. # bogus escape. Just remove the backslash.
  134. else:
  135. add(next_char)
  136. pos = escape_pos + 2
  137. if pos < len(string):
  138. add(string[pos:])
  139. return ''.join(result)
  140. def tokenize(
  141. source: str,
  142. jsx: bool = True,
  143. dotted: bool = True,
  144. template_string: bool = True,
  145. lineno: int = 1,
  146. ) -> Generator[Token, None, None]:
  147. """
  148. Tokenize JavaScript/JSX source. Returns a generator of tokens.
  149. :param source: The JavaScript source to tokenize.
  150. :param jsx: Enable (limited) JSX parsing.
  151. :param dotted: Read dotted names as single name token.
  152. :param template_string: Support ES6 template strings
  153. :param lineno: starting line number (optional)
  154. """
  155. may_divide = False
  156. pos = 0
  157. end = len(source)
  158. rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)
  159. while pos < end:
  160. # handle regular rules first
  161. for token_type, rule in rules: # noqa: B007
  162. match = rule.match(source, pos)
  163. if match is not None:
  164. break
  165. # if we don't have a match we don't give up yet, but check for
  166. # division operators or regular expression literals, based on
  167. # the status of `may_divide` which is determined by the last
  168. # processed non-whitespace token using `indicates_division`.
  169. else:
  170. if may_divide:
  171. match = division_re.match(source, pos)
  172. token_type = 'operator'
  173. else:
  174. match = regex_re.match(source, pos)
  175. token_type = 'regexp'
  176. if match is None:
  177. # woops. invalid syntax. jump one char ahead and try again.
  178. pos += 1
  179. continue
  180. token_value = match.group()
  181. if token_type is not None:
  182. token = Token(token_type, token_value, lineno)
  183. may_divide = indicates_division(token)
  184. yield token
  185. lineno += len(line_re.findall(token_value))
  186. pos = match.end()