common.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570
  1. # common.py
  2. from .core import *
  3. from .helpers import DelimitedList, any_open_tag, any_close_tag
  4. from datetime import datetime
  5. import sys
  # True on Python 3.10 and newer; used below to choose between passing the
  # staticmethod objects directly as parse actions and equivalent lambdas.
  PY_310_OR_LATER = sys.version_info[:2] >= (3, 10)
  7. # some other useful expressions - using lower-case class name since we are really using this as a namespace
  class pyparsing_common:
      """Here are some common low-level expressions that may be useful in
      jump-starting parser development:

      - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
        :class:`scientific notation<sci_real>`)
      - common :class:`programming identifiers<identifier>`
      - network addresses (:class:`MAC<mac_address>`,
        :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
      - ISO8601 :class:`dates<iso8601_date>` and
        :class:`datetime<iso8601_datetime>`
      - :class:`UUID<uuid>`
      - :class:`comma-separated list<comma_separated_list>`
      - :class:`url`

      Parse actions:

      - :class:`convert_to_integer`
      - :class:`convert_to_float`
      - :class:`convert_to_date`
      - :class:`convert_to_datetime`
      - :class:`as_datetime`
      - :class:`strip_html_tags`
      - :class:`upcase_tokens`
      - :class:`downcase_tokens`

      Examples:

      .. testcode::

          pyparsing_common.number.run_tests('''
              # any int or real number, returned as the appropriate type
              100
              -100
              +100
              3.14159
              6.02e23
              1e-12
              ''')

      .. testoutput::
          :options: +NORMALIZE_WHITESPACE

          # any int or real number, returned as the appropriate type
          100
          [100]

          -100
          [-100]

          +100
          [100]

          3.14159
          [3.14159]

          6.02e23
          [6.02e+23]

          1e-12
          [1e-12]

      .. testcode::

          pyparsing_common.fnumber.run_tests('''
              # any int or real number, returned as float
              100
              -100
              +100
              3.14159
              6.02e23
              1e-12
              ''')

      .. testoutput::
          :options: +NORMALIZE_WHITESPACE

          # any int or real number, returned as float
          100
          [100.0]

          -100
          [-100.0]

          +100
          [100.0]

          3.14159
          [3.14159]

          6.02e23
          [6.02e+23]

          1e-12
          [1e-12]

      .. testcode::

          pyparsing_common.hex_integer.run_tests('''
              # hex numbers
              100
              FF
              ''')

      .. testoutput::
          :options: +NORMALIZE_WHITESPACE

          # hex numbers
          100
          [256]

          FF
          [255]

      .. testcode::

          pyparsing_common.fraction.run_tests('''
              # fractions
              1/2
              -3/4
              ''')

      .. testoutput::
          :options: +NORMALIZE_WHITESPACE

          # fractions
          1/2
          [0.5]

          -3/4
          [-0.75]

      .. testcode::

          pyparsing_common.mixed_integer.run_tests('''
              # mixed fractions
              1
              1/2
              -3/4
              1-3/4
              ''')

      .. testoutput::
          :options: +NORMALIZE_WHITESPACE

          # mixed fractions
          1
          [1]

          1/2
          [0.5]

          -3/4
          [-0.75]

          1-3/4
          [1.75]

      .. testcode::

          import uuid
          pyparsing_common.uuid.set_parse_action(token_map(uuid.UUID))
          pyparsing_common.uuid.run_tests('''
              # uuid
              12345678-1234-5678-1234-567812345678
              ''')

      .. testoutput::
          :options: +NORMALIZE_WHITESPACE

          # uuid
          12345678-1234-5678-1234-567812345678
          [UUID('12345678-1234-5678-1234-567812345678')]
      """
  138. @staticmethod
  139. def convert_to_integer(_, __, t):
  140. """
  141. Parse action for converting parsed integers to Python int
  142. """
  143. return [int(tt) for tt in t]
  144. @staticmethod
  145. def convert_to_float(_, __, t):
  146. """
  147. Parse action for converting parsed numbers to Python float
  148. """
  149. return [float(tt) for tt in t]
  # Parse-action selection used throughout this class: on Python 3.10+ the
  # staticmethod object itself is attached; on older versions (where a
  # staticmethod object cannot be called as a plain function) an equivalent
  # one-argument lambda is attached instead.
  integer = (
      Word(nums)
      .set_name("integer")
      .set_parse_action(
          convert_to_integer
          if PY_310_OR_LATER
          else lambda t: [int(tt) for tt in t]  # type: ignore[misc]
      )
  )
  """expression that parses an unsigned integer, converts to an int"""

  # Digits are interpreted base 16 via token_map(int, 16); the word is built
  # from hexnums only, so input such as "100" yields 256 and "FF" yields 255.
  hex_integer = (
      Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
  )
  """expression that parses a hexadecimal integer, converts to an int"""

  # A single regex keeps the optional sign and the digits in one token, so
  # the conversion sees e.g. "-100" and not "-" followed by "100".
  signed_integer = (
      Regex(r"[+-]?\d+")
      .set_name("signed integer")
      .set_parse_action(
          convert_to_integer
          if PY_310_OR_LATER
          else lambda t: [int(tt) for tt in t]  # type: ignore[misc]
      )
  )
  """expression that parses an integer with optional leading sign, converts to an int"""
  # Numerator and denominator are each built by calling signed_integer and
  # then replacing the parse action on the result with a float conversion
  # (presumably the call yields a copy, leaving signed_integer itself
  # converting to int - confirm against ParserElement.__call__).
  fraction = (
      signed_integer().set_parse_action(
          convert_to_float
          if PY_310_OR_LATER
          else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
      )
      + "/"
      + signed_integer().set_parse_action(
          convert_to_float
          if PY_310_OR_LATER
          else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
      )
  ).set_name("fraction")
  """fractional expression of an integer divided by an integer, converts to a float"""
  # Tokens at this point are [numerator, "/", denominator]; divide the first
  # by the last to collapse them into a single float.
  fraction.add_parse_action(lambda tt: tt[0] / tt[-1])

  # "fraction" is tried first so that "1/2" is not consumed as the integer 1.
  # The "-" between the integer and the fraction is a separator, not a sign:
  # it is suppressed, and the remaining tokens are added together
  # (e.g. "1-3/4" gives 1.75).
  mixed_integer = (
      fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
  ).set_name("fraction or mixed integer-fraction")
  """mixed integer of the form 'integer - fraction', with optional leading integer, converts to a float"""
  mixed_integer.add_parse_action(sum)
  # Requires a decimal point: either digits followed by "." and optional
  # digits, or "." followed by at least one digit. A bare integer does not
  # match.
  real = (
      Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
      .set_name("real number")
      .set_parse_action(
          convert_to_float
          if PY_310_OR_LATER
          else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
      )
  )
  """expression that parses a floating point number, converts to a float"""

  # Two alternatives: digits with a mandatory exponent (e.g. "1e-12"), or a
  # number containing a decimal point with an optional exponent.
  sci_real = (
      Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
      .set_name("real number with scientific notation")
      .set_parse_action(
          convert_to_float
          if PY_310_OR_LATER
          else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
      )
  )
  """expression that parses a floating point number with optional
  scientific notation, converts to a float"""

  # Alternatives are tried in order, so a float form is attempted before
  # signed_integer; plain integers fall through to it and come back as int.
  # streamlining this expression makes the docs nicer-looking
  number = (sci_real | real | signed_integer).set_name("number").streamline()
  """any numeric expression, converts to the corresponding Python type"""
  # Unlike "number", the decimal point and fraction digits are optional and
  # the result is always a float. At least one leading digit is required, so
  # a form such as ".5" is not matched by this pattern.
  fnumber = (
      Regex(r"[+-]?\d+\.?\d*(?:[eE][+-]?\d+)?")
      .set_name("fnumber")
      .set_parse_action(
          convert_to_float
          if PY_310_OR_LATER
          else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
      )
  )
  """any int or real number, always converts to a float"""

  # Same numeric shape as fnumber plus the special literals "nan", "inf" and
  # "infinity"; the inline (?i:...) group makes the whole pattern
  # case-insensitive, and the optional sign applies to the special literals
  # too.
  ieee_float = (
      Regex(r"(?i:[+-]?(?:(?:\d+\.?\d*(?:e[+-]?\d+)?)|nan|inf(?:inity)?))")
      .set_name("ieee_float")
      .set_parse_action(
          convert_to_float
          if PY_310_OR_LATER
          else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
      )
  )
  """any floating-point literal (int, real number, infinity, or NaN), converts to a float"""
  identifier = Word(identchars, identbodychars).set_name("identifier")
  """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

  # Four dot-separated octets, each limited to 0-255 by the alternation
  # 25[0-5] | 2[0-4][0-9] | 1?[0-9]{1,2}.
  ipv4_address = Regex(
      r"(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
  ).set_name("IPv4 address")
  "IPv4 address (``0.0.0.0 - 255.255.255.255``)"

  # One colon-separated group of an IPv6 address: 1 to 4 hex digits.
  _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
  # Long form: exactly eight groups.
  _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
      "full IPv6 address"
  )
  # Short form: "::" with up to seven groups on either side.
  _short_ipv6_address = (
      Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
      + "::"
      + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
  ).set_name("short IPv6 address")
  # Reject matches where both sides together already supply 8 or more
  # groups. The lambda goes through pyparsing_common._ipv6_part because it
  # runs after the class body has finished, when class-level names are no
  # longer in the lambda's scope.
  _short_ipv6_address.add_condition(
      lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
  )
  # Mixed form: the literal prefix "::ffff:" followed by an IPv4 address.
  _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
  # Combine joins the matched pieces into a single token.
  ipv6_address = Combine(
      (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
          "IPv6 address"
      )
  ).set_name("IPv6 address")
  "IPv6 address (long, short, or mixed form)"
  # Six two-digit hex octets. The first delimiter is captured and then
  # required again via the \1 back-reference, so one delimiter style
  # (":", "." or "-") must be used throughout the address.
  mac_address = Regex(
      r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
  ).set_name("MAC address")
  "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"
  267. @staticmethod
  268. def convert_to_date(fmt: str = "%Y-%m-%d"):
  269. """
  270. Helper to create a parse action for converting parsed date string to Python datetime.date
  271. Params -
  272. - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)
  273. Example:
  274. .. testcode::
  275. date_expr = pyparsing_common.iso8601_date.copy()
  276. date_expr.set_parse_action(pyparsing_common.convert_to_date())
  277. print(date_expr.parse_string("1999-12-31"))
  278. prints:
  279. .. testoutput::
  280. [datetime.date(1999, 12, 31)]
  281. """
  282. def cvt_fn(ss, ll, tt):
  283. try:
  284. return datetime.strptime(tt[0], fmt).date()
  285. except ValueError as ve:
  286. raise ParseException(ss, ll, str(ve))
  287. return cvt_fn
  288. @staticmethod
  289. def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
  290. """Helper to create a parse action for converting parsed
  291. datetime string to Python :class:`datetime.datetime`
  292. Params -
  293. - fmt - format to be passed to :class:`datetime.strptime` (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)
  294. Example:
  295. .. testcode::
  296. dt_expr = pyparsing_common.iso8601_datetime.copy()
  297. dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
  298. print(dt_expr.parse_string("1999-12-31T23:59:59.999"))
  299. prints:
  300. .. testoutput::
  301. [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
  302. """
  303. def cvt_fn(s, l, t):
  304. try:
  305. return datetime.strptime(t[0], fmt)
  306. except ValueError as ve:
  307. raise ParseException(s, l, str(ve))
  308. return cvt_fn
  # Month and day are progressively optional ("1999", "1999-12" and
  # "1999-12-31" all match); the named groups year/month/day expose the
  # matched fields. The pattern checks digit counts only, not calendar
  # validity.
  iso8601_date = Regex(
      r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
  ).set_name("ISO8601 date")
  "ISO8601 date (``yyyy-mm-dd``)"

  # Full date, a "T" or single-space separator, then hour:minute with
  # optional seconds (optionally fractional) and an optional zone given as
  # "Z" or a +/-hh:mm / +/-hhmm offset. Named groups: year, month, day, hour,
  # minute, second, tz.
  iso8601_datetime = Regex(
      r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
  ).set_name("ISO8601 datetime")
  "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"
  317. @staticmethod
  318. def as_datetime(s, l, t):
  319. """Parse action to convert parsed dates or datetimes to a Python
  320. :class:`datetime.datetime`.
  321. This parse action will use the year, month, day, etc. results
  322. names defined in the ISO8601 date expressions, but it can be
  323. used with any expression that provides one or more of these fields.
  324. Omitted fields will default to fields from Jan 1, 00:00:00.
  325. Invalid dates will raise a :class:`ParseException` with the
  326. error message indicating the invalid date fields.
  327. """
  328. year = int(t.year.lstrip("0") or 0)
  329. month = int(t.month or 1)
  330. day = int(t.day or 1)
  331. hour = int(t.hour or 0)
  332. minute = int(t.minute or 0)
  333. second = float(t.second or 0)
  334. try:
  335. return datetime(
  336. year, month, day, hour, minute, int(second), int((second % 1) * 1000)
  337. )
  338. except ValueError as ve:
  339. raise ParseException(t, l, f"Invalid date/time: {ve}").with_traceback(
  340. ve.__traceback__
  341. ) from None
  # The validated variants attach the as_datetime staticmethod directly from
  # inside the class body, so they are only defined on Python 3.10+.
  if PY_310_OR_LATER:
      iso8601_date_validated = iso8601_date().add_parse_action(as_datetime)
      "Validated ISO8601 date strings, raising :class:`ParseException` for invalid date values."

      iso8601_datetime_validated = iso8601_datetime().add_parse_action(as_datetime)
      "Validated ISO8601 date and time strings, raising :class:`ParseException` for invalid date/time values."

  # 8-4-4-4-12 hex digits separated by hyphens; matched as a plain string
  # (no conversion parse action is attached here).
  uuid = Regex(r"[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name(
      "UUID"
  )
  "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"

  # Matches any opening or closing tag and suppresses it; used by
  # strip_html_tags to remove the tags from a string.
  _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()
  @staticmethod
  def strip_html_tags(s: str, l: int, tokens: ParseResults):
      """Parse action that removes HTML tags from the first parsed token.

      Only ``tokens[0]`` is processed; ``s`` and ``l`` are part of the
      parse action signature and are not used. The token is run through
      the class-level ``_html_stripper`` expression, which suppresses every
      opening and closing tag, and the transformed string is returned.

      Example:

      .. testcode::

          # strip HTML links from normal text
          text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
          td, td_end = make_html_tags("TD")
          table_text = td + SkipTo(td_end).set_parse_action(
              pyparsing_common.strip_html_tags)("body") + td_end
          print(table_text.parse_string(text).body)

      Prints:

      .. testoutput::

          More info at the pyparsing wiki page
      """
      strip_tags = pyparsing_common._html_stripper.transform_string
      html_source = tokens[0]
      return strip_tags(html_source)
  # One unquoted list item: one or more printable "words" containing no
  # comma, never starting at a comma or at end of line. Whitespace (spaces or
  # tabs) after a word is kept only when it is NOT followed by end of line or
  # a comma, so interior spacing is preserved while trailing spacing is left
  # out. Combine joins the pieces into a single token.
  _commasepitem = (
      Combine(
          OneOrMore(
              ~Literal(",")
              + ~LineEnd()
              + Word(printables, exclude_chars=",")
              + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
          )
      )
      .streamline()
      .set_name("commaItem")
  )
  # Each item is a quoted string or an unquoted item; the Opt(...,
  # default="") wrapper makes an empty position between delimiters come back
  # as "".
  comma_separated_list = DelimitedList(
      Opt(quoted_string.copy() | _commasepitem, default="")
  ).set_name("comma separated list")
  """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""
  384. @staticmethod
  385. def upcase_tokens(s, l, t):
  386. """Parse action to convert tokens to upper case."""
  387. return [tt.upper() for tt in t]
  388. @staticmethod
  389. def downcase_tokens(s, l, t):
  390. """Parse action to convert tokens to lower case."""
  391. return [tt.lower() for tt in t]
  # fmt: off
  # URL matcher assembled from adjacent raw-string fragments (implicit string
  # concatenation). Named groups exposed by the pattern: url (whole match),
  # scheme, auth, host, port, path, query, fragment.
  # NOTE(review): the character classes for host names list only lower-case
  # ASCII letters and no regex flags are passed here, so upper-case host
  # names presumably do not match - confirm whether that is intended.
  url = Regex(
      # https://mathiasbynens.be/demo/url-regex
      # https://gist.github.com/dperini/729294
      r"(?P<url>"
      # protocol identifier (optional)
      # short syntax // still required
      r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)"
      # user:pass BasicAuth (optional)
      r"(?:(?P<auth>\S+(?::\S*)?)@)?"
      r"(?P<host>"
      # IP address exclusion
      # private & local networks
      # (negative lookaheads for 10.x.x.x, 127.x.x.x, 169.254.x.x,
      # 192.168.x.x and 172.16.x.x - 172.31.x.x)
      r"(?!(?:10|127)(?:\.\d{1,3}){3})"
      r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
      r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
      # IP address dotted notation octets
      # first octet 1-223: excludes 0.x.x.x and reserved space >= 224.0.0.0
      # last octet 1-254: excludes network & broadcast addresses
      # (first & last IP address of each class)
      r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
      r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
      r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
      r"|"
      # host & domain names, may end with dot
      # can be replaced by a shortest alternative
      # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
      r"(?:"
      r"(?:"
      r"[a-z0-9\u00a1-\uffff]"
      r"[a-z0-9\u00a1-\uffff_-]{0,62}"
      r")?"
      r"[a-z0-9\u00a1-\uffff]\."
      r")+"
      # TLD identifier name, may end with dot
      r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
      r")"
      # port number (optional)
      r"(:(?P<port>\d{2,5}))?"
      # resource path (optional)
      r"(?P<path>\/[^?# ]*)?"
      # query string (optional)
      r"(\?(?P<query>[^#]*))?"
      # fragment (optional)
      r"(#(?P<fragment>\S*))?"
      r")"
  ).set_name("url")
  """
  URL (http/https/ftp scheme)

  .. versionchanged:: 3.1.0
     ``url`` named group added
  """
  # fmt: on
  # pre-PEP8 compatibility names
  # Each camelCase alias is produced by replaced_by_pep8(<old name>, <new
  # function>) and re-wrapped in staticmethod so that it behaves like the
  # snake_case original when accessed through the class.
  # NOTE(review): replaced_by_pep8 is not defined in this file (presumably
  # it comes in through the star import from .core) - check there for what
  # the wrapper does beyond forwarding the call.
  # fmt: off
  convertToInteger = staticmethod(replaced_by_pep8("convertToInteger", convert_to_integer))
  convertToFloat = staticmethod(replaced_by_pep8("convertToFloat", convert_to_float))
  convertToDate = staticmethod(replaced_by_pep8("convertToDate", convert_to_date))
  convertToDatetime = staticmethod(replaced_by_pep8("convertToDatetime", convert_to_datetime))
  stripHTMLTags = staticmethod(replaced_by_pep8("stripHTMLTags", strip_html_tags))
  upcaseTokens = staticmethod(replaced_by_pep8("upcaseTokens", upcase_tokens))
  downcaseTokens = staticmethod(replaced_by_pep8("downcaseTokens", downcase_tokens))
  # fmt: on
  # Every attribute of pyparsing_common whose value is a ParserElement, in
  # class-definition order; functions and other non-expression attributes are
  # filtered out.
  _builtin_exprs = list(
      filter(
          lambda candidate: isinstance(candidate, ParserElement),
          vars(pyparsing_common).values(),
      )
  )