| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220 |
- # helpers.py
- import html.entities
- import operator
- import re
- import sys
- import typing
- from . import __diag__
- from .core import *
- from .util import (
- _bslash,
- _flatten,
- _escape_regex_range_chars,
- make_compressed_re,
- replaced_by_pep8,
- )
def _suppression(expr: Union[ParserElement, str]) -> ParserElement:
    """Return ``expr`` as a :class:`Suppress` expression.

    An argument that is already a ``Suppress`` instance is handed back
    unchanged, so a ``Suppress`` is never wrapped inside another
    ``Suppress``; anything else is passed to the ``Suppress`` constructor.
    """
    return expr if isinstance(expr, Suppress) else Suppress(expr)
- #
- # global helpers
- #
def counted_array(
    expr: ParserElement, int_expr: typing.Optional[ParserElement] = None, **kwargs
) -> ParserElement:
    """Helper to define a counted list of expressions.

    The returned expression matches input of the form::

        integer expr expr expr...

    where the leading integer gives the number of ``expr`` items that
    follow. The count token itself is removed from the results, leaving
    only the tokens of the matched ``expr`` items.

    :param expr: expression for a single array item
    :param int_expr: optional pyparsing expression that produces the
        integer count; when omitted, ``Word(nums)`` converted to ``int``
        is used
    :param kwargs: accepts the pre-PEP8 ``intExpr`` argument, which takes
        precedence over ``int_expr`` when given

    Examples:

    .. doctest::

        >>> counted_array(Word(alphas)).parse_string('2 ab cd ef')
        ParseResults(['ab', 'cd'], {})

    - In this parser, the leading integer value is given in binary,
      '10' indicating that 2 values are in the array:

    .. doctest::

        >>> binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
        >>> counted_array(Word(alphas), int_expr=binary_constant
        ...     ).parse_string('10 ab cd ef')
        ParseResults(['ab', 'cd'], {})

    - If other fields must be parsed after the count but before the
      list items, give the fields results names and they will
      be preserved in the returned ParseResults:

    .. doctest::

        >>> ppc = pyparsing.common
        >>> count_with_metadata = ppc.integer + Word(alphas)("type")
        >>> typed_array = counted_array(Word(alphanums),
        ...     int_expr=count_with_metadata)("items")
        >>> result = typed_array.parse_string("3 bool True True False")
        >>> print(result.dump())
        ['True', 'True', 'False']
        - items: ['True', 'True', 'False']
        - type: 'bool'
    """
    # the legacy keyword wins over the PEP8-named argument when both are given
    length_expr: typing.Optional[ParserElement] = (
        deprecate_argument(kwargs, "intExpr", None) or int_expr
    )

    # placeholder for the item list; its definition is filled in once the
    # count has been parsed
    items_expr = Forward()

    def count_field_parse_action(s, l, t):
        nonlocal items_expr
        count = t[0]
        items_expr <<= (expr * count) if count else Empty()
        # drop the positional tokens (the count), but keep any named results
        del t[:]

    if length_expr is None:
        length_expr = Word(nums).set_parse_action(lambda t: int(t[0]))
    else:
        # work on a copy so the caller's expression gains no extra parse action
        length_expr = length_expr.copy()

    length_expr.set_name("arrayLen")
    length_expr.add_parse_action(count_field_parse_action, call_during_try=True)
    return (length_expr + items_expr).set_name(f"(len) {expr}...")
def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that repeats, as literal text, the
    tokens matched earlier by ``expr``.

    For example:

    .. testcode::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because the repeat is matched
    as a literal, this will also match the leading ``"1:1"`` in
    ``"1:10"``. If this is not desired, use :func:`match_previous_expr`.
    Do *not* use with packrat parsing enabled.

    :param expr: expression whose matched tokens must be repeated; a parse
        action is added to it (the expression is modified in place)
    :returns: a :class:`Forward` that is redefined each time ``expr``
        matches
    """
    repeater = Forward()

    def copy_token_to_repeater(s, l, t):
        # redefine the repeater from whatever expr just matched
        if not t:
            repeater << Empty()
        elif len(t) == 1:
            repeater << t[0]
        else:
            # several (possibly nested) tokens: require each one in sequence
            flat_tokens = _flatten(t.as_list())
            repeater << And(Literal(tok) for tok in flat_tokens)

    expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
    repeater.set_name(f"(prev) {expr}")
    return repeater
def match_previous_expr(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example:

    .. testcode::

        first = Word(nums)
        second = match_previous_expr(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches by expressions, will *not* match the leading ``"1:1"``
    in ``"1:10"``; the expressions are evaluated first, and then
    compared, so ``"1"`` is compared with ``"10"``. Do *not* use
    with packrat parsing enabled.

    :param expr: expression whose matched tokens must be repeated; a parse
        action is added to it (the expression is modified in place)
    :returns: a :class:`Forward` wrapping a copy of ``expr`` that raises
        :class:`ParseException` when its tokens differ from those last
        matched by ``expr``
    """
    rep = Forward()
    # the repeat is parsed with a copy of the same expression
    rep <<= expr.copy()

    def copy_token_to_repeater(s, l, t):
        # remember what expr matched this time ...
        expected_tokens = _flatten(t.as_list())

        def must_match_these_tokens(s, l, t):
            # ... and reject the repeat unless it yields the same tokens
            found_tokens = _flatten(t.as_list())
            if found_tokens != expected_tokens:
                raise ParseException(
                    s, l, f"Expected {expected_tokens}, found {found_tokens}"
                )

        rep.set_parse_action(must_match_these_tokens, call_during_try=True)

    expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
    rep.set_name(f"(prev) {expr}")
    return rep
def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    **kwargs,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s.

    Longest-first testing is done when one choice is a leading substring
    of another, regardless of the input order, and duplicate choices are
    removed.

    :param strs: a string of space-delimited literals, or a collection of
        string literals
    :param caseless: treat all literals as caseless
    :param use_regex: bool - as an optimization, generate a single
        :class:`Regex` object; if ``False``, or if creating the
        :class:`Regex` raises ``re.error``, a :class:`MatchFirst` of the
        individual literals is generated instead
    :param as_keyword: bool - enforce :class:`Keyword`-style matching on the
        generated expressions

    Parameters ``asKeyword`` and ``useRegex`` are retained for pre-PEP8
    compatibility, but will be removed in a future release.

    An empty set of choices yields :class:`NoMatch`; an argument that is
    neither a string nor an iterable raises ``TypeError``.

    Example:

    .. testcode::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12  AA=23 B<=AA AA>12"))

    prints:

    .. testoutput::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # fold the legacy keyword arguments into the PEP8-named ones
    useRegex: bool = deprecate_argument(kwargs, "useRegex", True)
    asKeyword: bool = deprecate_argument(kwargs, "asKeyword", False)
    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    # a string in the caseless position usually means the choices were
    # passed as separate positional strings
    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        warnings.warn(
            "warn_on_multiple_string_args_to_oneof:"
            " More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            PyparsingDiagnosticWarning,
            stacklevel=2,
        )

    if caseless:

        def is_equal(a, b):
            return a.upper() == b.upper()

        def masks(a, b):
            return b.upper().startswith(a.upper())

    else:
        is_equal = operator.eq

        def masks(a, b):
            return b.startswith(a)

    symbols: list[str]
    if isinstance(strs, str_type):
        symbols = typing.cast(str, strs).split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")

    if not symbols:
        return NoMatch()

    # drop duplicates, and move any longer choice ahead of a shorter choice
    # that would otherwise mask it
    idx = 0
    while idx < len(symbols) - 1:
        current = symbols[idx]
        for pos, candidate in enumerate(symbols[idx + 1 :], start=idx + 1):
            if is_equal(candidate, current):
                del symbols[pos]
                break
            if len(candidate) > len(current) and masks(current, candidate):
                del symbols[pos]
                symbols.insert(idx, candidate)
                break
        else:
            # nothing changed for this position, move on to the next one
            idx += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0

        try:
            only_single_chars = all(len(sym) == 1 for sym in symbols)
            if only_single_chars:
                # single characters can be expressed as a character class
                patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            if asKeyword:
                # keyword matching: surround with \b word break markers
                patt = rf"\b(?:{patt})\b"

            regex_expr = Regex(patt, flags=re_flags)
            regex_expr.set_name(" | ".join(repr(s) for s in symbols))

            if caseless:
                # return the symbol as it was specified, not in whatever
                # casing was found in the input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                regex_expr.add_parse_action(
                    lambda s, l, t: symbol_map[t[0].lower()]
                )

            return regex_expr

        except re.error:
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst",
                PyparsingDiagnosticWarning,
                stacklevel=2,
            )

    # last resort: MatchFirst of the token class selected by the
    # (caseless, as-keyword) combination
    token_class_by_mode = {
        (True, True): CaselessKeyword,
        (True, False): CaselessLiteral,
        (False, True): Keyword,
        (False, False): Literal,
    }
    parse_element_class = token_class_by_mode[(caseless, asKeyword)]
    alternatives = MatchFirst(parse_element_class(sym) for sym in symbols)
    return alternatives.set_name(" | ".join(symbols))
def dict_of(key: ParserElement, value: ParserElement) -> Dict:
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value.

    Takes care of defining the :class:`Dict`, :class:`OneOrMore`, and
    :class:`Group` expressions in the proper order. The key pattern
    can include delimiting markers or punctuation, as long as they are
    suppressed, thereby leaving the significant key text. The value
    pattern can include named results, so that the :class:`Dict` results
    can include named token fields.

    :param key: expression matching a dictionary key
    :param value: expression matching the value that follows a key
    :returns: ``Dict(OneOrMore(Group(key + value)))``

    Example:

    .. doctest::

        >>> text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        >>> data_word = Word(alphas)
        >>> label = data_word + FollowedBy(':')
        >>> attr_label = label
        >>> attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label
        ...     ).set_parse_action(' '.join)

        >>> result = dict_of(attr_label, attr_value).parse_string(text)
        >>> print(result['shape'])
        SQUARE
        >>> print(result.shape)  # object attribute access works too
        SQUARE
        >>> print(result.as_dict())
        {'shape': 'SQUARE', 'posn': 'upper left', 'color': 'light blue', 'texture': 'burlap'}
    """
    # each key/value pair is grouped so Dict can treat it as one entry
    entry = Group(key + value)
    return Dict(OneOrMore(entry))
def original_text_for(
    expr: ParserElement, as_string: bool = True, **kwargs
) -> ParserElement:
    """Helper to return the original, untokenized text for a given
    expression.

    Useful to restore the parsed fields of an HTML start tag into the raw
    tag text itself, or to revert separate tokens with intervening
    whitespace back to the original matching input text. By default,
    returns a string containing the original parsed text.

    If the optional ``as_string`` argument is passed as ``False``, then
    the return value is a :class:`ParseResults` containing any results
    names that were originally matched, and a single token containing the
    original matched text from the input string. So if the expression
    passed to :func:`original_text_for` contains expressions with defined
    results names, you must set ``as_string`` to ``False`` if you want to
    preserve those results name values.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example:

    .. testcode::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + ... + closer)
            print(patt.search_string(src)[0])

    prints:

    .. testoutput::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    # text is returned as a plain string only if neither the legacy nor
    # the PEP8-named argument turns that off
    legacy_as_string: bool = deprecate_argument(kwargs, "asString", True)
    return_text = legacy_as_string and as_string

    # zero-width markers that record the parse location before and after expr
    start_marker = Empty().set_parse_action(lambda s, loc, t: loc)
    end_marker = start_marker.copy()
    # the end marker must not run the pre-parse step, so that it reports
    # the location where expr stopped
    end_marker.callPreparse = False
    wrapped = start_marker("_original_start") + expr + end_marker("_original_end")

    if return_text:

        def extract_text(s, l, t):
            return s[t._original_start : t._original_end]

    else:

        def extract_text(s, l, t):
            # replace the token list with the raw text; popping the marker
            # names leaves the other results names in place
            t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]

    wrapped.set_parse_action(extract_text)
    wrapped.ignoreExprs = expr.ignoreExprs
    wrapped.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
    return wrapped
def ungroup(expr: ParserElement) -> ParserElement:
    """Helper to undo pyparsing's default grouping of And expressions,
    even if all but one are non-empty.

    ``expr`` is wrapped in a :class:`TokenConverter` whose parse action
    returns only the first token of the match.
    """

    def first_token(t):
        return t[0]

    return TokenConverter(expr).add_parse_action(first_token)
def locatedExpr(expr: ParserElement) -> ParserElement:
    """
    .. deprecated:: 3.0.0
       Use the :class:`Located` class instead. Note that `Located`
       returns results with one less grouping level.

    Helper to decorate a returned token with its starting and ending
    locations in the input string.

    The grouped result carries these results names:

    - ``locn_start`` - location where matched expression begins
    - ``locn_end`` - location where matched expression ends
    - ``value`` - the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters, you
    may want to call :meth:`ParserElement.parse_with_tabs`

    Calling this function emits a :class:`PyparsingDeprecationWarning`.
    """
    warnings.warn(
        f"{'locatedExpr'!r} deprecated - use {'Located'!r}",
        PyparsingDeprecationWarning,
        stacklevel=2,
    )

    # zero-width expression whose only token is the current parse location
    locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
    start_locn = locator("locn_start")
    # the end locator is a copy that does not skip whitespace
    end_locn = locator.copy().leave_whitespace()("locn_end")
    return Group(start_locn + expr("value") + end_locn)
# sentinel default for ignore_expr, so that an explicit None ("ignore
# nothing") can be told apart from "argument not given"
_NO_IGNORE_EXPR_GIVEN = NoMatch()


def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
    **kwargs,
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    :param opener: str - opening character for a nested list
       (default= ``"("``); can also be a pyparsing expression
    :param closer: str - closing character for a nested list
       (default= ``")"``); can also be a pyparsing expression
    :param content: expression for items within the nested lists
    :param ignore_expr: expression for ignoring opening and closing delimiters
       (default = :class:`quoted_string`)

    Parameter ``ignoreExpr`` is retained for compatibility
    but will be removed in a future release.

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values. In that case both
    ``opener`` and ``closer`` must be strings.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    :raises ValueError: if ``opener`` equals ``closer``, or if no
       ``content`` is given and the delimiters are not both strings

    Example:

    .. testcode::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print(f"{func.name} ({func.type}) args: {func.args}")

    prints:

    .. testoutput::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # reconcile the legacy ignoreExpr keyword with ignore_expr: the legacy
    # value is used unless it was left at the sentinel
    ignoreExpr: ParserElement = deprecate_argument(
        kwargs, "ignoreExpr", _NO_IGNORE_EXPR_GIVEN
    )
    if ignoreExpr != ignore_expr:
        if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
            ignoreExpr = ignore_expr  # type: ignore [assignment]
    if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
        ignoreExpr = quoted_string()

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        if not (isinstance(opener, str_type) and isinstance(closer, str_type)):
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

        opener = typing.cast(str, opener)
        closer = typing.cast(str, closer)
        white_chars = ParserElement.DEFAULT_WHITE_CHARS
        single_char_delims = len(opener) == 1 and len(closer) == 1

        if single_char_delims:
            # the delimiters can simply be excluded from the content characters
            excluded_chars = opener + closer + white_chars
            if ignoreExpr is not None:
                content = Combine(
                    OneOrMore(~ignoreExpr + CharsNotIn(excluded_chars, exact=1))
                )
            else:
                content = Combine(Empty() + CharsNotIn(excluded_chars))
        else:
            # multi-character delimiters need negative lookaheads before
            # each content character
            not_a_delimiter = ~Literal(opener) + ~Literal(closer)
            one_content_char = CharsNotIn(white_chars, exact=1)
            if ignoreExpr is not None:
                content = Combine(
                    OneOrMore(~ignoreExpr + not_a_delimiter + one_content_char)
                )
            else:
                content = Combine(OneOrMore(not_a_delimiter + one_content_char))

        # for these internally-created content expressions, simulate
        # whitespace-skipping
        if ParserElement.DEFAULT_WHITE_CHARS:
            content.set_parse_action(
                lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
            )

    ret = Forward()
    if ignoreExpr is not None:
        body = ZeroOrMore(ignoreExpr | ret | content)
    else:
        body = ZeroOrMore(ret | content)
    ret <<= Group(_suppression(opener) + body + _suppression(closer))

    ret.set_name(f"nested {opener}{closer} expression")

    # don't override error message from content expressions
    ret.errmsg = None
    return ret
def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions,
    given a tag name.

    :param tagStr: tag name as a string, or an expression matching the tag
        name (its ``name`` is then used to build the results names)
    :param xml: if true, build XML-style tags: the tag keyword is
        case-sensitive and every attribute needs a double-quoted value;
        otherwise build HTML-style tags: caseless tag keyword, attribute
        names lower-cased, attribute values optional and quoted or unquoted
    :param suppress_LT: expression matching the opening ``<``
    :param suppress_GT: expression matching the closing ``>``
    :returns: ``(open_tag_expr, close_tag_expr)``
    """
    if isinstance(tagStr, str_type):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    attr_name = Word(alphas, alphanums + "_-:")
    if xml:
        attr_value = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        attribute = Group(attr_name + Suppress("=") + attr_value)
    else:
        attr_value = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        attribute = Group(
            attr_name.set_parse_action(lambda t: t[0].lower())
            + Opt(Suppress("=") + attr_value)
        )

    # "empty" is True for a self-closing tag such as <br/>, else False
    empty_marker = Opt("/", default=[False])("empty").set_parse_action(
        lambda s, l, t: t[0] == "/"
    )
    openTag = (
        suppress_LT
        + tagStr("tag")
        + Dict(ZeroOrMore(attribute))
        + empty_marker
        + suppress_GT
    )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    # results-name suffix: title-cased tag name with ':' and spaces removed
    name_suffix = "".join(resname.replace(":", " ").title().split())

    openTag.set_name(f"<{resname}>")
    # add start<tagname> results name in parse action now that ungrouped
    # names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__("start" + name_suffix, t.copy())
    )
    closeTag = closeTag("end" + name_suffix).set_name(f"</{resname}>")

    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag
def make_html_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.

    :param tag_str: tag name, or an expression matching the tag name
    :returns: 2-tuple of (opening tag expression, closing tag expression)

    Example:

    .. testcode::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints:

    .. testoutput::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    open_tag, close_tag = _makeTags(tag_str, False)
    return open_tag, close_tag
def make_xml_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for XML,
    given a tag name. Matches tags only in the given upper/lower case.

    :param tag_str: tag name, or an expression matching the tag name
    :returns: 2-tuple of (opening tag expression, closing tag expression)

    Example: similar to :func:`make_html_tags`
    """
    open_tag, close_tag = _makeTags(tag_str, True)
    return open_tag, close_tag
# expressions matching any HTML opening tag / closing tag, whatever its name
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# entity name (without the trailing ';') -> replacement text, built from
# the standard library's HTML5 entity table
_htmlEntityMap = {
    name.rstrip(";"): text for name, text in html.entities.html5.items()
}
# frequently used entity names, listed first in the pattern as '|'-separated
# alternatives
_most_common_entities = "|".join(
    "nbsp lt gt amp quot apos cent pound euro copy".split(" ")
)
# matches "&name;" and exposes the name as the "entity" named group; the
# pattern is supplied as a callable rather than a prebuilt string
common_html_entity = Regex(
    lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
).set_name("common HTML entity")
def replace_html_entity(s, l, t):
    """Helper parser action to replace common HTML entities with their special characters

    Looks up the ``entity`` results name of the matched tokens in the
    module's entity map; returns ``None`` when the name is not in the map.
    """
    entity_name = t.entity
    return _htmlEntityMap.get(entity_name)
class OpAssoc(Enum):
    """Enumeration of operator associativity
    - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`
    """

    # operator groups from the left
    LEFT = 1
    # operator groups from the right
    RIGHT = 2
# An operator given to infix_notation: a single expression or string, or a
# pair of them (the two operators of a ternary operation).
InfixNotationOperatorArgType = Union[
    ParserElement,
    str,
    tuple[Union[ParserElement, str], Union[ParserElement, str]],
]

# One precedence level in infix_notation's op_list:
# (operator, number of operands, associativity[, parse action]).
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
def infix_notation(
    base_expr: ParserElement,
    op_list: list[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> Forward:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy. Operators may be unary
    or binary, left- or right-associative. Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    :param base_expr: expression representing the most basic operand to
       be used in the expression
    :param op_list: list of tuples, one for each operator precedence level
       in the expression grammar; each tuple is of the form ``(op_expr,
       num_operands, right_left_assoc, (optional)parse_action)``, where:

       - ``op_expr`` is the pyparsing expression for the operator; may also
         be a string, which will be converted to a Literal; if ``num_operands``
         is 3, ``op_expr`` is a tuple of two expressions, for the two
         operators separating the 3 terms
       - ``num_operands`` is the number of terms for this operator (must be 1,
         2, or 3)
       - ``right_left_assoc`` is the indicator whether the operator is right
         or left associative, using the pyparsing-defined constants
         ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
       - ``parse_action`` is the parse action to be associated with
         expressions matching this operator expression (the parse action
         tuple member may be omitted); if the parse action is passed
         a tuple or list of functions, this is equivalent to calling
         ``set_parse_action(*fn)``
         (:class:`ParserElement.set_parse_action`)

       An operator definition may be any sequence (tuple or list) of 3 or
       4 items.
    :param lpar: expression for matching left-parentheses; if passed as a
       str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
       an expression (such as ``Literal('(')``), then it will be kept in
       the parsed results, and grouped with them. (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses; if passed as a
       str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
       an expression (such as ``Literal(')')``), then it will be kept in
       the parsed results, and grouped with them. (default= ``Suppress(')')``)
    :raises ValueError: if an operator definition has an arity outside
       1..3, an associativity that is not an ``OpAssoc`` member, or an
       arity of 3 without a pair of operator expressions

    Example:

    .. testcode::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
            ('-', 1, OpAssoc.RIGHT),
            (one_of('* /'), 2, OpAssoc.LEFT),
            (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            (5+x)*y
            -2--11
            ''', full_dump=False)

    prints:

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        (5+x)*y
        [[[5, '+', 'x'], '*', 'y']]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """

    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    ret.set_name(f"{base_expr.name}_expression")
    if isinstance(lpar, str):
        lpar = Suppress(lpar)
    if isinstance(rpar, str):
        rpar = Suppress(rpar)

    parenthesized = (lpar + ret + rpar).set_name(
        f"nested_{base_expr.name}_expression"
    )

    # if lpar and rpar are not suppressed, wrap in group
    if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
        lastExpr = base_expr | Group(parenthesized)
    else:
        lastExpr = base_expr | parenthesized

    arity: int
    rightLeftAssoc: OpAssoc
    pa: typing.Optional[ParseAction]
    opExpr1: ParserElement
    opExpr2: ParserElement
    matchExpr: ParserElement
    match_lookahead: ParserElement

    # each precedence level wraps the expression built for the previous
    # level, so operators listed first in op_list bind most tightly
    for operDef in op_list:
        # pad with None so the optional parse action may be omitted; the
        # unpacking form accepts lists as well as tuples
        opExpr, arity, rightLeftAssoc, pa = (*operDef, None)[:4]  # type: ignore[assignment]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        opExpr = typing.cast(ParserElement, opExpr)

        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = f"{opExpr1}{opExpr2} operations"
        else:
            term_name = f"{opExpr} operations"

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        thisExpr: ParserElement = Forward().set_name(term_name)
        thisExpr = typing.cast(Forward, thisExpr)

        match_lookahead = And([])
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                match_lookahead = _FB(lastExpr + opExpr)
                matchExpr = Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + lastExpr)
                    matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
                else:
                    # no operator expression: operands are simply adjacent
                    match_lookahead = _FB(lastExpr + lastExpr)
                    matchExpr = Group(lastExpr[2, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                )
                matchExpr = Group(
                    lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
                )
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                match_lookahead = _FB(opExpr.expr + thisExpr)
                matchExpr = Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + thisExpr)
                    matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
                else:
                    # no operator expression: operands are simply adjacent
                    match_lookahead = _FB(lastExpr + thisExpr)
                    matchExpr = Group(lastExpr + thisExpr[1, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                )
                matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)

        # suppress lookahead expr from railroad diagrams
        match_lookahead.show_in_diagram = False

        # TODO - determine why this statement can't be included in the following
        #  if pa block
        matchExpr = match_lookahead + matchExpr

        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)

        # fall back to the previous level when no operator of this level is present
        thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
        lastExpr = thisExpr

    ret <<= lastExpr
    return ret
def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    .. deprecated:: 3.0.0
       Use the :class:`IndentedBlock` class instead. Note that `IndentedBlock`
       has a different method signature.

    Build an expression that matches a block of statements delimited by
    indentation, in the manner of block statements in Python source code.

    :param blockStatementExpr: expression describing one statement; it is
       matched repeatedly inside the indented block
    :param indentStack: list created by the caller that holds the stack of
       active indentation columns (several indented-block expressions within
       one grammar should share a common ``indentStack``)
    :param indent: boolean telling whether the block has to start at a column
       beyond the current top of ``indentStack``; pass ``False`` for a block
       of left-most statements
    :param backup_stacks: list of saved copies of ``indentStack``. A copy is
       appended on every call to this function, the most recent copy is
       written back into ``indentStack`` when the returned expression fails,
       and one copy is removed each time the returned expression matches.

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example:

    .. testcode::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''

        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints:

    .. testoutput::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    warnings.warn(
        f"{'indentedBlock'!r} deprecated - use {'IndentedBlock'!r}",
        PyparsingDeprecationWarning,
        stacklevel=2,
    )

    # Remember what the caller's stack looked like so that a failed match can
    # be rolled back.
    # NOTE(review): the default ``backup_stacks`` list is created once and is
    # therefore shared by every call that omits the argument - confirm that
    # this sharing is intended before changing it.
    backup_stacks.append(indentStack[:])

    def restore_indent_stack(instring, loc, expr, err):
        # fail action: overwrite the live stack in place with the latest copy
        indentStack[:] = backup_stacks[-1]

    def check_peer(instring, loc, toks):
        # nothing to verify once the end of the input has been reached
        if loc < len(instring):
            column = col(loc, instring)
            if column > indentStack[-1]:
                raise ParseException(instring, loc, "illegal nesting")
            if column != indentStack[-1]:
                raise ParseException(instring, loc, "not a peer entry")

    def check_sub_indent(instring, loc, toks):
        column = col(loc, instring)
        if column <= indentStack[-1]:
            raise ParseException(instring, loc, "not a subentry")
        # a deeper column opens a new indentation level
        indentStack.append(column)

    def check_unindent(instring, loc, toks):
        if loc >= len(instring):
            return
        column = col(loc, instring)
        if not indentStack or column not in indentStack:
            raise ParseException(instring, loc, "not an unindent")
        if column < indentStack[-1]:
            indentStack.pop()

    line_breaks = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    indent_marker = (Empty() + Empty().set_parse_action(check_sub_indent)).set_name(
        "INDENT"
    )
    peer_marker = Empty().set_parse_action(check_peer).set_name("")
    unindent_marker = Empty().set_parse_action(check_unindent).set_name("UNINDENT")

    leading_breaks = Opt(line_breaks)
    # at least one statement, each one checked against the current indent column
    statements = OneOrMore(
        peer_marker + Group(blockStatementExpr) + Opt(line_breaks)
    )
    if indent:
        contents = leading_breaks + indent_marker + statements + unindent_marker
    else:
        contents = leading_breaks + statements + Opt(unindent_marker)
    block = Group(contents)

    # add a parse action to remove backup_stack from list of backups
    block.add_parse_action(
        lambda: (backup_stacks.pop(-1) and None) if backup_stacks else None
    )
    block.set_fail_action(restore_indent_stack)
    # let statements skip a backslash that is immediately followed by a line end
    blockStatementExpr.ignore(_bslash + LineEnd())
    return block.set_name("indented block")
# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available

# "/*", then any run of characters that are either not "*" or a "*" that is
# not followed by "/", then the closing "*/"
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

# "<!--", then the shortest run of any characters (newlines included, via
# [\s\S]) that ends at the first "-->"
html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# ".*" - with default regex flags this is the remaining text of the current line
# NOTE(review): leave_whitespace() presumably stops leading whitespace from
# being skipped before the match - confirm against ParserElement
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")

# "//" followed by any mix of non-newline characters and backslash-newline
# pairs, so a trailing backslash continues the comment onto the next line
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# a single regex alternation of the c_style_comment and dbl_slash_comment
# patterns above (the pattern text is repeated here, not derived from them)
cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

# alias: the very same expression object as cpp_style_comment, not a copy
java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

# "#" followed by ".*" (with default regex flags: the rest of the line)
python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"
# build list of built-in expressions, for future reference if a global default value
# gets updated
#
# This is a snapshot of the current namespace (``vars()``) taken at this exact
# point: only names already bound to a ParserElement are collected, so anything
# assigned after this statement is not included. The values are not
# de-duplicated, so an object bound to two names (for example
# java_style_comment and cpp_style_comment) is listed once per name.
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]
# compatibility function, superseded by DelimitedList class
def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """
    .. deprecated:: 3.1.0
       Use the :class:`DelimitedList` class instead.

    Thin compatibility shim: all arguments are handed on unchanged to
    :class:`DelimitedList` (``expr``, ``delim``, ``combine``, ``min`` and
    ``max`` positionally, ``allow_trailing_delim`` by keyword) and the
    expression it builds is returned.
    """
    # positional arguments, in the order DelimitedList receives them
    forwarded = (expr, delim, combine, min, max)
    return DelimitedList(*forwarded, allow_trailing_delim=allow_trailing_delim)
# Compatibility synonyms
# Each camelCase name below is an alternate spelling of a snake_case (or
# class) name from this module.
# fmt: off

# direct aliases: both names refer to the very same object
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment

# aliases produced by replaced_by_pep8(<old name>, <replacement>)
# NOTE(review): replaced_by_pep8 is defined outside this section; presumably it
# returns a callable that forwards to the replacement - confirm there.
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
# this rebinds ``delimited_list``: from here on the name no longer refers to
# the ``def delimited_list(...)`` function defined earlier in this module
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on
|