helpers.py 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220
  1. # helpers.py
  2. import html.entities
  3. import operator
  4. import re
  5. import sys
  6. import typing
  7. from . import __diag__
  8. from .core import *
  9. from .util import (
  10. _bslash,
  11. _flatten,
  12. _escape_regex_range_chars,
  13. make_compressed_re,
  14. replaced_by_pep8,
  15. )
  16. def _suppression(expr: Union[ParserElement, str]) -> ParserElement:
  17. # internal helper to avoid wrapping Suppress inside another Suppress
  18. if isinstance(expr, Suppress):
  19. return expr
  20. return Suppress(expr)
  21. #
  22. # global helpers
  23. #
  24. def counted_array(
  25. expr: ParserElement, int_expr: typing.Optional[ParserElement] = None, **kwargs
  26. ) -> ParserElement:
  27. """Helper to define a counted list of expressions.
  28. This helper defines a pattern of the form::
  29. integer expr expr expr...
  30. where the leading integer tells how many expr expressions follow.
  31. The matched tokens returns the array of expr tokens as a list - the
  32. leading count token is suppressed.
  33. If ``int_expr`` is specified, it should be a pyparsing expression
  34. that produces an integer value.
  35. Examples:
  36. .. doctest::
  37. >>> counted_array(Word(alphas)).parse_string('2 ab cd ef')
  38. ParseResults(['ab', 'cd'], {})
  39. - In this parser, the leading integer value is given in binary,
  40. '10' indicating that 2 values are in the array:
  41. .. doctest::
  42. >>> binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
  43. >>> counted_array(Word(alphas), int_expr=binary_constant
  44. ... ).parse_string('10 ab cd ef')
  45. ParseResults(['ab', 'cd'], {})
  46. - If other fields must be parsed after the count but before the
  47. list items, give the fields results names and they will
  48. be preserved in the returned ParseResults:
  49. .. doctest::
  50. >>> ppc = pyparsing.common
  51. >>> count_with_metadata = ppc.integer + Word(alphas)("type")
  52. >>> typed_array = counted_array(Word(alphanums),
  53. ... int_expr=count_with_metadata)("items")
  54. >>> result = typed_array.parse_string("3 bool True True False")
  55. >>> print(result.dump())
  56. ['True', 'True', 'False']
  57. - items: ['True', 'True', 'False']
  58. - type: 'bool'
  59. """
  60. intExpr: typing.Optional[ParserElement] = deprecate_argument(
  61. kwargs, "intExpr", None
  62. )
  63. intExpr = intExpr or int_expr
  64. array_expr = Forward()
  65. def count_field_parse_action(s, l, t):
  66. nonlocal array_expr
  67. n = t[0]
  68. array_expr <<= (expr * n) if n else Empty()
  69. # clear list contents, but keep any named results
  70. del t[:]
  71. if intExpr is None:
  72. intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
  73. else:
  74. intExpr = intExpr.copy()
  75. intExpr.set_name("arrayLen")
  76. intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
  77. return (intExpr + array_expr).set_name(f"(len) {expr}...")
  78. def match_previous_literal(expr: ParserElement) -> ParserElement:
  79. """Helper to define an expression that is indirectly defined from
  80. the tokens matched in a previous expression, that is, it looks for
  81. a 'repeat' of a previous expression. For example::
  82. .. testcode::
  83. first = Word(nums)
  84. second = match_previous_literal(first)
  85. match_expr = first + ":" + second
  86. will match ``"1:1"``, but not ``"1:2"``. Because this
  87. matches a previous literal, will also match the leading
  88. ``"1:1"`` in ``"1:10"``. If this is not desired, use
  89. :class:`match_previous_expr`. Do *not* use with packrat parsing
  90. enabled.
  91. """
  92. rep = Forward()
  93. def copy_token_to_repeater(s, l, t):
  94. if not t:
  95. rep << Empty()
  96. return
  97. if len(t) == 1:
  98. rep << t[0]
  99. return
  100. # flatten t tokens
  101. tflat = _flatten(t.as_list())
  102. rep << And(Literal(tt) for tt in tflat)
  103. expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
  104. rep.set_name(f"(prev) {expr}")
  105. return rep
  106. def match_previous_expr(expr: ParserElement) -> ParserElement:
  107. """Helper to define an expression that is indirectly defined from
  108. the tokens matched in a previous expression, that is, it looks for
  109. a 'repeat' of a previous expression. For example:
  110. .. testcode::
  111. first = Word(nums)
  112. second = match_previous_expr(first)
  113. match_expr = first + ":" + second
  114. will match ``"1:1"``, but not ``"1:2"``. Because this
  115. matches by expressions, will *not* match the leading ``"1:1"``
  116. in ``"1:10"``; the expressions are evaluated first, and then
  117. compared, so ``"1"`` is compared with ``"10"``. Do *not* use
  118. with packrat parsing enabled.
  119. """
  120. rep = Forward()
  121. e2 = expr.copy()
  122. rep <<= e2
  123. def copy_token_to_repeater(s, l, t):
  124. matchTokens = _flatten(t.as_list())
  125. def must_match_these_tokens(s, l, t):
  126. theseTokens = _flatten(t.as_list())
  127. if theseTokens != matchTokens:
  128. raise ParseException(
  129. s, l, f"Expected {matchTokens}, found{theseTokens}"
  130. )
  131. rep.set_parse_action(must_match_these_tokens, call_during_try=True)
  132. expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
  133. rep.set_name(f"(prev) {expr}")
  134. return rep
  135. def one_of(
  136. strs: Union[typing.Iterable[str], str],
  137. caseless: bool = False,
  138. use_regex: bool = True,
  139. as_keyword: bool = False,
  140. **kwargs,
  141. ) -> ParserElement:
  142. """Helper to quickly define a set of alternative :class:`Literal` s,
  143. and makes sure to do longest-first testing when there is a conflict,
  144. regardless of the input order, but returns
  145. a :class:`MatchFirst` for best performance.
  146. :param strs: a string of space-delimited literals, or a collection of
  147. string literals
  148. :param caseless: treat all literals as caseless
  149. :param use_regex: bool - as an optimization, will
  150. generate a :class:`Regex` object; otherwise, will generate
  151. a :class:`MatchFirst` object (if ``caseless=True`` or
  152. ``as_keyword=True``, or if creating a :class:`Regex` raises an exception)
  153. :param as_keyword: bool - enforce :class:`Keyword`-style matching on the
  154. generated expressions
  155. Parameters ``asKeyword`` and ``useRegex`` are retained for pre-PEP8
  156. compatibility, but will be removed in a future release.
  157. Example:
  158. .. testcode::
  159. comp_oper = one_of("< = > <= >= !=")
  160. var = Word(alphas)
  161. number = Word(nums)
  162. term = var | number
  163. comparison_expr = term + comp_oper + term
  164. print(comparison_expr.search_string("B = 12 AA=23 B<=AA AA>12"))
  165. prints:
  166. .. testoutput::
  167. [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
  168. """
  169. useRegex: bool = deprecate_argument(kwargs, "useRegex", True)
  170. asKeyword: bool = deprecate_argument(kwargs, "asKeyword", False)
  171. asKeyword = asKeyword or as_keyword
  172. useRegex = useRegex and use_regex
  173. if (
  174. isinstance(caseless, str_type)
  175. and __diag__.warn_on_multiple_string_args_to_oneof
  176. ):
  177. warnings.warn(
  178. "warn_on_multiple_string_args_to_oneof:"
  179. " More than one string argument passed to one_of, pass"
  180. " choices as a list or space-delimited string",
  181. PyparsingDiagnosticWarning,
  182. stacklevel=2,
  183. )
  184. if caseless:
  185. is_equal = lambda a, b: a.upper() == b.upper()
  186. masks = lambda a, b: b.upper().startswith(a.upper())
  187. else:
  188. is_equal = operator.eq
  189. masks = lambda a, b: b.startswith(a)
  190. symbols: list[str]
  191. if isinstance(strs, str_type):
  192. strs = typing.cast(str, strs)
  193. symbols = strs.split()
  194. elif isinstance(strs, Iterable):
  195. symbols = list(strs)
  196. else:
  197. raise TypeError("Invalid argument to one_of, expected string or iterable")
  198. if not symbols:
  199. return NoMatch()
  200. # reorder given symbols to take care to avoid masking longer choices with shorter ones
  201. # (but only if the given symbols are not just single characters)
  202. i = 0
  203. while i < len(symbols) - 1:
  204. cur = symbols[i]
  205. for j, other in enumerate(symbols[i + 1 :]):
  206. if is_equal(other, cur):
  207. del symbols[i + j + 1]
  208. break
  209. if len(other) > len(cur) and masks(cur, other):
  210. del symbols[i + j + 1]
  211. symbols.insert(i, other)
  212. break
  213. else:
  214. i += 1
  215. if useRegex:
  216. re_flags: int = re.IGNORECASE if caseless else 0
  217. try:
  218. if all(len(sym) == 1 for sym in symbols):
  219. # symbols are just single characters, create range regex pattern
  220. patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
  221. else:
  222. patt = "|".join(re.escape(sym) for sym in symbols)
  223. # wrap with \b word break markers if defining as keywords
  224. if asKeyword:
  225. patt = rf"\b(?:{patt})\b"
  226. ret = Regex(patt, flags=re_flags)
  227. ret.set_name(" | ".join(repr(s) for s in symbols))
  228. if caseless:
  229. # add parse action to return symbols as specified, not in random
  230. # casing as found in input string
  231. symbol_map = {sym.lower(): sym for sym in symbols}
  232. ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])
  233. return ret
  234. except re.error:
  235. warnings.warn(
  236. "Exception creating Regex for one_of, building MatchFirst",
  237. PyparsingDiagnosticWarning,
  238. stacklevel=2,
  239. )
  240. # last resort, just use MatchFirst of Token class corresponding to caseless
  241. # and asKeyword settings
  242. CASELESS = KEYWORD = True
  243. parse_element_class = {
  244. (CASELESS, KEYWORD): CaselessKeyword,
  245. (CASELESS, not KEYWORD): CaselessLiteral,
  246. (not CASELESS, KEYWORD): Keyword,
  247. (not CASELESS, not KEYWORD): Literal,
  248. }[(caseless, asKeyword)]
  249. return MatchFirst(parse_element_class(sym) for sym in symbols).set_name(
  250. " | ".join(symbols)
  251. )
  252. def dict_of(key: ParserElement, value: ParserElement) -> Dict:
  253. """Helper to easily and clearly define a dictionary by specifying
  254. the respective patterns for the key and value. Takes care of
  255. defining the :class:`Dict`, :class:`ZeroOrMore`, and
  256. :class:`Group` tokens in the proper order. The key pattern
  257. can include delimiting markers or punctuation, as long as they are
  258. suppressed, thereby leaving the significant key text. The value
  259. pattern can include named results, so that the :class:`Dict` results
  260. can include named token fields.
  261. Example:
  262. .. doctest::
  263. >>> text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
  264. >>> data_word = Word(alphas)
  265. >>> label = data_word + FollowedBy(':')
  266. >>> attr_expr = (
  267. ... label
  268. ... + Suppress(':')
  269. ... + OneOrMore(data_word, stop_on=label)
  270. ... .set_parse_action(' '.join))
  271. >>> print(attr_expr[1, ...].parse_string(text).dump())
  272. ['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap']
  273. >>> attr_label = label
  274. >>> attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label
  275. ... ).set_parse_action(' '.join)
  276. # similar to Dict, but simpler call format
  277. >>> result = dict_of(attr_label, attr_value).parse_string(text)
  278. >>> print(result.dump())
  279. [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
  280. - color: 'light blue'
  281. - posn: 'upper left'
  282. - shape: 'SQUARE'
  283. - texture: 'burlap'
  284. [0]:
  285. ['shape', 'SQUARE']
  286. [1]:
  287. ['posn', 'upper left']
  288. [2]:
  289. ['color', 'light blue']
  290. [3]:
  291. ['texture', 'burlap']
  292. >>> print(result['shape'])
  293. SQUARE
  294. >>> print(result.shape) # object attribute access works too
  295. SQUARE
  296. >>> print(result.as_dict())
  297. {'shape': 'SQUARE', 'posn': 'upper left', 'color': 'light blue', 'texture': 'burlap'}
  298. """
  299. return Dict(OneOrMore(Group(key + value)))
  300. def original_text_for(
  301. expr: ParserElement, as_string: bool = True, **kwargs
  302. ) -> ParserElement:
  303. """Helper to return the original, untokenized text for a given
  304. expression. Useful to restore the parsed fields of an HTML start
  305. tag into the raw tag text itself, or to revert separate tokens with
  306. intervening whitespace back to the original matching input text. By
  307. default, returns a string containing the original parsed text.
  308. If the optional ``as_string`` argument is passed as
  309. ``False``, then the return value is
  310. a :class:`ParseResults` containing any results names that
  311. were originally matched, and a single token containing the original
  312. matched text from the input string. So if the expression passed to
  313. :class:`original_text_for` contains expressions with defined
  314. results names, you must set ``as_string`` to ``False`` if you
  315. want to preserve those results name values.
  316. The ``asString`` pre-PEP8 argument is retained for compatibility,
  317. but will be removed in a future release.
  318. Example:
  319. .. testcode::
  320. src = "this is test <b> bold <i>text</i> </b> normal text "
  321. for tag in ("b", "i"):
  322. opener, closer = make_html_tags(tag)
  323. patt = original_text_for(opener + ... + closer)
  324. print(patt.search_string(src)[0])
  325. prints:
  326. .. testoutput::
  327. ['<b> bold <i>text</i> </b>']
  328. ['<i>text</i>']
  329. """
  330. asString: bool = deprecate_argument(kwargs, "asString", True)
  331. asString = asString and as_string
  332. locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
  333. endlocMarker = locMarker.copy()
  334. endlocMarker.callPreparse = False
  335. matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
  336. if asString:
  337. extractText = lambda s, l, t: s[t._original_start : t._original_end]
  338. else:
  339. def extractText(s, l, t):
  340. t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]
  341. matchExpr.set_parse_action(extractText)
  342. matchExpr.ignoreExprs = expr.ignoreExprs
  343. matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
  344. return matchExpr
  345. def ungroup(expr: ParserElement) -> ParserElement:
  346. """Helper to undo pyparsing's default grouping of And expressions,
  347. even if all but one are non-empty.
  348. """
  349. return TokenConverter(expr).add_parse_action(lambda t: t[0])
  350. def locatedExpr(expr: ParserElement) -> ParserElement:
  351. """
  352. .. deprecated:: 3.0.0
  353. Use the :class:`Located` class instead. Note that `Located`
  354. returns results with one less grouping level.
  355. Helper to decorate a returned token with its starting and ending
  356. locations in the input string.
  357. This helper adds the following results names:
  358. - ``locn_start`` - location where matched expression begins
  359. - ``locn_end`` - location where matched expression ends
  360. - ``value`` - the actual parsed results
  361. Be careful if the input text contains ``<TAB>`` characters, you
  362. may want to call :meth:`ParserElement.parse_with_tabs`
  363. """
  364. warnings.warn(
  365. f"{'locatedExpr'!r} deprecated - use {'Located'!r}",
  366. PyparsingDeprecationWarning,
  367. stacklevel=2,
  368. )
  369. locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
  370. return Group(
  371. locator("locn_start")
  372. + expr("value")
  373. + locator.copy().leave_whitespace()("locn_end")
  374. )
# define special default value to permit None as a significant value for
# ignore_expr
_NO_IGNORE_EXPR_GIVEN = NoMatch()


def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
    **kwargs,
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    :param opener: str - opening character for a nested list
        (default= ``"("``); can also be a pyparsing expression
    :param closer: str - closing character for a nested list
        (default= ``")"``); can also be a pyparsing expression
    :param content: expression for items within the nested lists
    :param ignore_expr: expression for ignoring opening and closing delimiters
        (default = :class:`quoted_string`)
    :raises ValueError: if ``opener`` and ``closer`` compare equal, or if
        no ``content`` is given and ``opener``/``closer`` are not both strings

    Parameter ``ignoreExpr`` is retained for compatibility
    but will be removed in a future release.

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example:

    .. testcode::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print(f"{func.name} ({func.type}) args: {func.args}")

    prints:

    .. testoutput::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    ignoreExpr: ParserElement = deprecate_argument(
        kwargs, "ignoreExpr", _NO_IGNORE_EXPR_GIVEN
    )
    # reconcile the two spellings: the PEP8 argument is used only when the
    # legacy keyword was left at the sentinel
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr is _NO_IGNORE_EXPR_GIVEN else ignoreExpr  # type: ignore [assignment]

    # neither spelling was supplied: fall back to ignoring quoted strings
    if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
        ignoreExpr = quoted_string()

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        # build a default content expression: runs of characters that are not
        # whitespace and do not start a delimiter (or an ignored expression)
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            opener = typing.cast(str, opener)
            closer = typing.cast(str, closer)
            if len(opener) == 1 and len(closer) == 1:
                # single-character delimiters can be excluded directly in CharsNotIn
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    )
                else:
                    content = Combine(
                        Empty()
                        + CharsNotIn(
                            opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                        )
                    )
            else:
                # multi-character delimiters need negative lookahead before
                # each consumed character
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

        # for these internally-created context expressions, simulate whitespace-skipping
        if ParserElement.DEFAULT_WHITE_CHARS:
            content.set_parse_action(
                lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
            )

    # recursive definition: a nested list holds ignored expressions, further
    # nested lists, or content items, between suppressed delimiters
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            _suppression(opener)
            + ZeroOrMore(ignoreExpr | ret | content)
            + _suppression(closer)
        )
    else:
        ret <<= Group(
            _suppression(opener) + ZeroOrMore(ret | content) + _suppression(closer)
        )

    ret.set_name(f"nested {opener}{closer} expression")

    # don't override error message from content expressions
    ret.errmsg = None
    return ret
  512. def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
  513. """Internal helper to construct opening and closing tag expressions,
  514. given a tag name"""
  515. if isinstance(tagStr, str_type):
  516. resname = tagStr
  517. tagStr = Keyword(tagStr, caseless=not xml)
  518. else:
  519. resname = tagStr.name
  520. tagAttrName = Word(alphas, alphanums + "_-:")
  521. if xml:
  522. tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
  523. openTag = (
  524. suppress_LT
  525. + tagStr("tag")
  526. + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
  527. + Opt("/", default=[False])("empty").set_parse_action(
  528. lambda s, l, t: t[0] == "/"
  529. )
  530. + suppress_GT
  531. )
  532. else:
  533. tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
  534. printables, exclude_chars=">"
  535. )
  536. openTag = (
  537. suppress_LT
  538. + tagStr("tag")
  539. + Dict(
  540. ZeroOrMore(
  541. Group(
  542. tagAttrName.set_parse_action(lambda t: t[0].lower())
  543. + Opt(Suppress("=") + tagAttrValue)
  544. )
  545. )
  546. )
  547. + Opt("/", default=[False])("empty").set_parse_action(
  548. lambda s, l, t: t[0] == "/"
  549. )
  550. + suppress_GT
  551. )
  552. closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)
  553. openTag.set_name(f"<{resname}>")
  554. # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
  555. openTag.add_parse_action(
  556. lambda t: t.__setitem__(
  557. "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
  558. )
  559. )
  560. closeTag = closeTag(
  561. "end" + "".join(resname.replace(":", " ").title().split())
  562. ).set_name(f"</{resname}>")
  563. openTag.tag = resname
  564. closeTag.tag = resname
  565. openTag.tag_body = SkipTo(closeTag())
  566. return openTag, closeTag
  567. def make_html_tags(
  568. tag_str: Union[str, ParserElement],
  569. ) -> tuple[ParserElement, ParserElement]:
  570. """Helper to construct opening and closing tag expressions for HTML,
  571. given a tag name. Matches tags in either upper or lower case,
  572. attributes with namespaces and with quoted or unquoted values.
  573. Example:
  574. .. testcode::
  575. text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
  576. # make_html_tags returns pyparsing expressions for the opening and
  577. # closing tags as a 2-tuple
  578. a, a_end = make_html_tags("A")
  579. link_expr = a + SkipTo(a_end)("link_text") + a_end
  580. for link in link_expr.search_string(text):
  581. # attributes in the <A> tag (like "href" shown here) are
  582. # also accessible as named results
  583. print(link.link_text, '->', link.href)
  584. prints:
  585. .. testoutput::
  586. pyparsing -> https://github.com/pyparsing/pyparsing/wiki
  587. """
  588. return _makeTags(tag_str, False)
  589. def make_xml_tags(
  590. tag_str: Union[str, ParserElement],
  591. ) -> tuple[ParserElement, ParserElement]:
  592. """Helper to construct opening and closing tag expressions for XML,
  593. given a tag name. Matches tags only in the given upper/lower case.
  594. Example: similar to :class:`make_html_tags`
  595. """
  596. return _makeTags(tag_str, True)
  597. any_open_tag: ParserElement
  598. any_close_tag: ParserElement
  599. any_open_tag, any_close_tag = make_html_tags(
  600. Word(alphas, alphanums + "_:").set_name("any tag")
  601. )
  602. _htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
  603. _most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
  604. " ", "|"
  605. )
  606. common_html_entity = Regex(
  607. lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
  608. ).set_name("common HTML entity")
  609. def replace_html_entity(s, l, t):
  610. """Helper parser action to replace common HTML entities with their special characters"""
  611. return _htmlEntityMap.get(t.entity)
  612. class OpAssoc(Enum):
  613. """Enumeration of operator associativity
  614. - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""
  615. LEFT = 1
  616. RIGHT = 2
# Type of the operator member of an infix_notation operator tuple: a single
# expression or string, or a 2-tuple of expressions/strings.
InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# Type of one operator tuple: (operator, int, associativity), optionally
# followed by a parse action (or None).
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
  633. def infix_notation(
  634. base_expr: ParserElement,
  635. op_list: list[InfixNotationOperatorSpec],
  636. lpar: Union[str, ParserElement] = Suppress("("),
  637. rpar: Union[str, ParserElement] = Suppress(")"),
  638. ) -> Forward:
  639. """Helper method for constructing grammars of expressions made up of
  640. operators working in a precedence hierarchy. Operators may be unary
  641. or binary, left- or right-associative. Parse actions can also be
  642. attached to operator expressions. The generated parser will also
  643. recognize the use of parentheses to override operator precedences
  644. (see example below).
  645. Note: if you define a deep operator list, you may see performance
  646. issues when using infix_notation. See
  647. :class:`ParserElement.enable_packrat` for a mechanism to potentially
  648. improve your parser performance.
  649. Parameters:
  650. :param base_expr: expression representing the most basic operand to
  651. be used in the expression
  652. :param op_list: list of tuples, one for each operator precedence level
  653. in the expression grammar; each tuple is of the form ``(op_expr,
  654. num_operands, right_left_assoc, (optional)parse_action)``, where:
  655. - ``op_expr`` is the pyparsing expression for the operator; may also
  656. be a string, which will be converted to a Literal; if ``num_operands``
  657. is 3, ``op_expr`` is a tuple of two expressions, for the two
  658. operators separating the 3 terms
  659. - ``num_operands`` is the number of terms for this operator (must be 1,
  660. 2, or 3)
  661. - ``right_left_assoc`` is the indicator whether the operator is right
  662. or left associative, using the pyparsing-defined constants
  663. ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
  664. - ``parse_action`` is the parse action to be associated with
  665. expressions matching this operator expression (the parse action
  666. tuple member may be omitted); if the parse action is passed
  667. a tuple or list of functions, this is equivalent to calling
  668. ``set_parse_action(*fn)``
  669. (:class:`ParserElement.set_parse_action`)
  670. :param lpar: expression for matching left-parentheses; if passed as a
  671. str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
  672. an expression (such as ``Literal('(')``), then it will be kept in
  673. the parsed results, and grouped with them. (default= ``Suppress('(')``)
  674. :param rpar: expression for matching right-parentheses; if passed as a
  675. str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
  676. an expression (such as ``Literal(')')``), then it will be kept in
  677. the parsed results, and grouped with them. (default= ``Suppress(')')``)
  678. Example:
  679. .. testcode::
  680. # simple example of four-function arithmetic with ints and
  681. # variable names
  682. integer = pyparsing_common.signed_integer
  683. varname = pyparsing_common.identifier
  684. arith_expr = infix_notation(integer | varname,
  685. [
  686. ('-', 1, OpAssoc.RIGHT),
  687. (one_of('* /'), 2, OpAssoc.LEFT),
  688. (one_of('+ -'), 2, OpAssoc.LEFT),
  689. ])
  690. arith_expr.run_tests('''
  691. 5+3*6
  692. (5+3)*6
  693. (5+x)*y
  694. -2--11
  695. ''', full_dump=False)
  696. prints:
  697. .. testoutput::
  698. :options: +NORMALIZE_WHITESPACE
  699. 5+3*6
  700. [[5, '+', [3, '*', 6]]]
  701. (5+3)*6
  702. [[[5, '+', 3], '*', 6]]
  703. (5+x)*y
  704. [[[5, '+', 'x'], '*', 'y']]
  705. -2--11
  706. [[['-', 2], '-', ['-', 11]]]
  707. """
  708. # captive version of FollowedBy that does not do parse actions or capture results names
  709. class _FB(FollowedBy):
  710. def parseImpl(self, instring, loc, doActions=True):
  711. self.expr.try_parse(instring, loc)
  712. return loc, []
  713. _FB.__name__ = "FollowedBy>"
  714. ret = Forward()
  715. ret.set_name(f"{base_expr.name}_expression")
  716. if isinstance(lpar, str):
  717. lpar = Suppress(lpar)
  718. if isinstance(rpar, str):
  719. rpar = Suppress(rpar)
  720. nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}_expression")
  721. # if lpar and rpar are not suppressed, wrap in group
  722. if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
  723. lastExpr = base_expr | Group(nested_expr)
  724. else:
  725. lastExpr = base_expr | nested_expr
  726. arity: int
  727. rightLeftAssoc: opAssoc
  728. pa: typing.Optional[ParseAction]
  729. opExpr1: ParserElement
  730. opExpr2: ParserElement
  731. matchExpr: ParserElement
  732. match_lookahead: ParserElement
  733. for operDef in op_list:
  734. opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4] # type: ignore[assignment]
  735. if isinstance(opExpr, str_type):
  736. opExpr = ParserElement._literalStringClass(opExpr)
  737. opExpr = typing.cast(ParserElement, opExpr)
  738. if arity == 3:
  739. if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
  740. raise ValueError(
  741. "if numterms=3, opExpr must be a tuple or list of two expressions"
  742. )
  743. opExpr1, opExpr2 = opExpr
  744. term_name = f"{opExpr1}{opExpr2} operations"
  745. else:
  746. term_name = f"{opExpr} operations"
  747. if not 1 <= arity <= 3:
  748. raise ValueError("operator must be unary (1), binary (2), or ternary (3)")
  749. if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
  750. raise ValueError("operator must indicate right or left associativity")
  751. thisExpr: ParserElement = Forward().set_name(term_name)
  752. thisExpr = typing.cast(Forward, thisExpr)
  753. match_lookahead = And([])
  754. if rightLeftAssoc is OpAssoc.LEFT:
  755. if arity == 1:
  756. match_lookahead = _FB(lastExpr + opExpr)
  757. matchExpr = Group(lastExpr + opExpr[1, ...])
  758. elif arity == 2:
  759. if opExpr is not None:
  760. match_lookahead = _FB(lastExpr + opExpr + lastExpr)
  761. matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
  762. else:
  763. match_lookahead = _FB(lastExpr + lastExpr)
  764. matchExpr = Group(lastExpr[2, ...])
  765. elif arity == 3:
  766. match_lookahead = _FB(
  767. lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
  768. )
  769. matchExpr = Group(
  770. lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
  771. )
  772. elif rightLeftAssoc is OpAssoc.RIGHT:
  773. if arity == 1:
  774. # try to avoid LR with this extra test
  775. if not isinstance(opExpr, Opt):
  776. opExpr = Opt(opExpr)
  777. match_lookahead = _FB(opExpr.expr + thisExpr)
  778. matchExpr = Group(opExpr + thisExpr)
  779. elif arity == 2:
  780. if opExpr is not None:
  781. match_lookahead = _FB(lastExpr + opExpr + thisExpr)
  782. matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
  783. else:
  784. match_lookahead = _FB(lastExpr + thisExpr)
  785. matchExpr = Group(lastExpr + thisExpr[1, ...])
  786. elif arity == 3:
  787. match_lookahead = _FB(
  788. lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
  789. )
  790. matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)
  791. # suppress lookahead expr from railroad diagrams
  792. match_lookahead.show_in_diagram = False
  793. # TODO - determine why this statement can't be included in the following
  794. # if pa block
  795. matchExpr = match_lookahead + matchExpr
  796. if pa:
  797. if isinstance(pa, (tuple, list)):
  798. matchExpr.set_parse_action(*pa)
  799. else:
  800. matchExpr.set_parse_action(pa)
  801. thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
  802. lastExpr = thisExpr
  803. ret <<= lastExpr
  804. return ret
  805. def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
  806. """
  807. .. deprecated:: 3.0.0
  808. Use the :class:`IndentedBlock` class instead. Note that `IndentedBlock`
  809. has a difference method signature.
  810. Helper method for defining space-delimited indentation blocks,
  811. such as those used to define block statements in Python source code.
  812. :param blockStatementExpr: expression defining syntax of statement that
  813. is repeated within the indented block
  814. :param indentStack: list created by caller to manage indentation stack
  815. (multiple ``statementWithIndentedBlock`` expressions within a single
  816. grammar should share a common ``indentStack``)
  817. :param indent: boolean indicating whether block must be indented beyond
  818. the current level; set to ``False`` for block of left-most statements
  819. A valid block must contain at least one ``blockStatement``.
  820. (Note that indentedBlock uses internal parse actions which make it
  821. incompatible with packrat parsing.)
  822. Example:
  823. .. testcode::
  824. data = '''
  825. def A(z):
  826. A1
  827. B = 100
  828. G = A2
  829. A2
  830. A3
  831. B
  832. def BB(a,b,c):
  833. BB1
  834. def BBA():
  835. bba1
  836. bba2
  837. bba3
  838. C
  839. D
  840. def spam(x,y):
  841. def eggs(z):
  842. pass
  843. '''
  844. indentStack = [1]
  845. stmt = Forward()
  846. identifier = Word(alphas, alphanums)
  847. funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
  848. func_body = indentedBlock(stmt, indentStack)
  849. funcDef = Group(funcDecl + func_body)
  850. rvalue = Forward()
  851. funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
  852. rvalue << (funcCall | identifier | Word(nums))
  853. assignment = Group(identifier + "=" + rvalue)
  854. stmt << (funcDef | assignment | identifier)
  855. module_body = stmt[1, ...]
  856. parseTree = module_body.parseString(data)
  857. parseTree.pprint()
  858. prints:
  859. .. testoutput::
  860. [['def',
  861. 'A',
  862. ['(', 'z', ')'],
  863. ':',
  864. [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
  865. 'B',
  866. ['def',
  867. 'BB',
  868. ['(', 'a', 'b', 'c', ')'],
  869. ':',
  870. [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
  871. 'C',
  872. 'D',
  873. ['def',
  874. 'spam',
  875. ['(', 'x', 'y', ')'],
  876. ':',
  877. [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
  878. """
  879. warnings.warn(
  880. f"{'indentedBlock'!r} deprecated - use {'IndentedBlock'!r}",
  881. PyparsingDeprecationWarning,
  882. stacklevel=2,
  883. )
  884. backup_stacks.append(indentStack[:])
  885. def reset_stack():
  886. indentStack[:] = backup_stacks[-1]
  887. def checkPeerIndent(s, l, t):
  888. if l >= len(s):
  889. return
  890. curCol = col(l, s)
  891. if curCol != indentStack[-1]:
  892. if curCol > indentStack[-1]:
  893. raise ParseException(s, l, "illegal nesting")
  894. raise ParseException(s, l, "not a peer entry")
  895. def checkSubIndent(s, l, t):
  896. curCol = col(l, s)
  897. if curCol > indentStack[-1]:
  898. indentStack.append(curCol)
  899. else:
  900. raise ParseException(s, l, "not a subentry")
  901. def checkUnindent(s, l, t):
  902. if l >= len(s):
  903. return
  904. curCol = col(l, s)
  905. if not (indentStack and curCol in indentStack):
  906. raise ParseException(s, l, "not an unindent")
  907. if curCol < indentStack[-1]:
  908. indentStack.pop()
  909. NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
  910. INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
  911. PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
  912. UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
  913. if indent:
  914. smExpr = Group(
  915. Opt(NL)
  916. + INDENT
  917. + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
  918. + UNDENT
  919. )
  920. else:
  921. smExpr = Group(
  922. Opt(NL)
  923. + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
  924. + Opt(UNDENT)
  925. )
  926. # add a parse action to remove backup_stack from list of backups
  927. smExpr.add_parse_action(
  928. lambda: backup_stacks.pop(-1) and None if backup_stacks else None
  929. )
  930. smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
  931. blockStatementExpr.ignore(_bslash + LineEnd())
  932. return smExpr.set_name("indented block")
# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available

# Body is any run of non-"*" characters, or "*" characters that are not
# immediately followed by "/", so the match stops at the first "*/".
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

# Non-greedy body, so the match ends at the first "-->"; [\s\S] lets the body
# include line breaks.
html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# Matches ".*" from the current position (may be empty).
# NOTE(review): presumably leave_whitespace() stops leading whitespace from
# being skipped before the match - confirm against ParserElement.
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
# A backslash immediately followed by a newline is consumed as part of the
# comment, so the comment continues onto the next line.
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# Single regex alternation of the two patterns above: "/* ... */" or "// ...".
cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

# Alias: bound to the very same expression object as cpp_style_comment.
java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"
# build list of built-in expressions, for future reference if a global default value
# gets updated
# This is a snapshot of the namespace returned by vars() at this exact point:
# only ParserElement instances already bound above this statement are
# collected; names assigned later are not included.
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]
# compatibility function, superseded by DelimitedList class
# NOTE(review): the "Compatibility synonyms" section further down assigns
# ``delimited_list = replaced_by_pep8("delimited_list", DelimitedList)``, which
# rebinds this name after the function is defined - confirm which binding is
# meant to be the public one.
def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """
    .. deprecated:: 3.1.0
       Use the :class:`DelimitedList` class instead.

    Thin compatibility wrapper: constructs and returns a
    :class:`DelimitedList`, forwarding every argument unchanged
    (``expr``, ``delim``, ``combine``, ``min`` and ``max`` positionally,
    ``allow_trailing_delim`` by keyword). See :class:`DelimitedList` for
    the meaning of each argument.
    """
    return DelimitedList(
        expr, delim, combine, min, max, allow_trailing_delim=allow_trailing_delim
    )
# Compatibility synonyms
# Legacy camelCase names bound alongside the snake_case names used in this
# module.
# fmt: off
# Plain aliases: each legacy name is bound directly to the same object as its
# snake_case / PascalCase counterpart.
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
# Callable helpers are bound through replaced_by_pep8(legacy_name, replacement).
# NOTE(review): replaced_by_pep8 is defined outside this section; presumably it
# returns a wrapper that flags the legacy name as deprecated - confirm.
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
# NOTE(review): this rebinds ``delimited_list``, replacing the function of the
# same name defined earlier in this module - confirm this is intended.
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on