block_parser.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. import re
  2. from typing import Optional, List, Tuple, Match, Pattern
  3. import string
  4. from .util import (
  5. unikey,
  6. escape_url,
  7. expand_tab,
  8. expand_leading_tab,
  9. )
  10. from .core import Parser, BlockState
  11. from .helpers import (
  12. LINK_LABEL,
  13. HTML_TAGNAME,
  14. HTML_ATTRIBUTES,
  15. BLOCK_TAGS,
  16. PRE_TAGS,
  17. unescape_char,
  18. parse_link_href,
  19. parse_link_title,
  20. )
  21. from .list_parser import parse_list, LIST_PATTERN
# Matches one to four leading spaces at the start of every line (multiline);
# substituted with "" to strip the indentation of an indented code block.
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", flags=re.M)
# Matches a trailing run of "#" (with optional trailing whitespace) that is
# either preceded by whitespace or makes up the whole text; used to drop the
# closing sequence of an ATX heading.
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")
# Matches at most one leading space at the start of every line (multiline).
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", flags=re.M)
# Matches the leading spaces plus the ">" marker at the start of every line.
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", flags=re.M)
# Matches text that ends with a newline followed by a blank (spaces/tabs only) line.
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")
# Matches optional spaces/tabs followed by a newline, i.e. "nothing else on this line".
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")
# Alternation group of every tag name in BLOCK_TAGS and PRE_TAGS.
_BLOCK_TAGS_PATTERN = "(" + "|".join(BLOCK_TAGS) + "|" + "|".join(PRE_TAGS) + ")"
# Remainder of an opening tag: attributes, ">", then only spaces/tabs up to the line end.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")
# Remainder of a closing tag: ">", then only spaces/tabs up to the line end.
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")
# One or more consecutive lines that each start with up to three spaces and ">".
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")
  32. class BlockParser(Parser[BlockState]):
  33. state_cls = BlockState
  34. BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)
  35. RAW_HTML = (
  36. r"^ {0,3}("
  37. r"</?" + HTML_TAGNAME + r"|"
  38. r"<!--|" # comment
  39. r"<\?|" # script
  40. r"<![A-Z]|"
  41. r"<!\[CDATA\[)"
  42. )
  43. BLOCK_HTML = (
  44. r"^ {0,3}(?:"
  45. r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
  46. r"|<!--" # comment
  47. r"|<\?" # script
  48. r"|<![A-Z]"
  49. r"|<!\[CDATA\[)"
  50. )
  51. SPECIFICATION = {
  52. "blank_line": r"(^[ \t\v\f]*\n)+",
  53. "atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
  54. "setex_heading": r"^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$",
  55. "fenced_code": (
  56. r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
  57. r"[ \t]*(?P<fenced_3>.*?)$"
  58. ),
  59. "indent_code": (
  60. r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
  61. r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
  62. ),
  63. "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
  64. "ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
  65. "block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
  66. "list": LIST_PATTERN,
  67. "block_html": BLOCK_HTML,
  68. "raw_html": RAW_HTML,
  69. }
  70. DEFAULT_RULES = (
  71. "fenced_code",
  72. "indent_code",
  73. "atx_heading",
  74. "setex_heading",
  75. "thematic_break",
  76. "block_quote",
  77. "list",
  78. "ref_link",
  79. "raw_html",
  80. "blank_line",
  81. )
  82. def __init__(
  83. self,
  84. block_quote_rules: Optional[List[str]] = None,
  85. list_rules: Optional[List[str]] = None,
  86. max_nested_level: int = 6,
  87. ):
  88. super(BlockParser, self).__init__()
  89. if block_quote_rules is None:
  90. block_quote_rules = list(self.DEFAULT_RULES)
  91. if list_rules is None:
  92. list_rules = list(self.DEFAULT_RULES)
  93. self.block_quote_rules = block_quote_rules
  94. self.list_rules = list_rules
  95. self.max_nested_level = max_nested_level
  96. # register default parse methods
  97. self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION}
  98. def parse_blank_line(self, m: Match[str], state: BlockState) -> int:
  99. """Parse token for blank lines."""
  100. state.append_token({"type": "blank_line"})
  101. return m.end()
  102. def parse_thematic_break(self, m: Match[str], state: BlockState) -> int:
  103. """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
  104. state.append_token({"type": "thematic_break"})
  105. # $ does not count '\n'
  106. return m.end() + 1
  107. def parse_indent_code(self, m: Match[str], state: BlockState) -> int:
  108. """Parse token for code block which is indented by 4 spaces."""
  109. # it is a part of the paragraph
  110. end_pos = state.append_paragraph()
  111. if end_pos:
  112. return end_pos
  113. code = m.group(0)
  114. code = expand_leading_tab(code)
  115. code = _INDENT_CODE_TRIM.sub("", code)
  116. code = code.strip("\n")
  117. state.append_token({"type": "block_code", "raw": code, "style": "indent"})
  118. return m.end()
  119. def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]:
  120. """Parse token for fenced code block. A fenced code block is started with
  121. 3 or more backtick(`) or tilde(~).
  122. An example of a fenced code block:
  123. .. code-block:: markdown
  124. ```python
  125. def markdown(text):
  126. return mistune.html(text)
  127. ```
  128. """
  129. spaces = m.group("fenced_1")
  130. marker = m.group("fenced_2")
  131. info = m.group("fenced_3")
  132. c = marker[0]
  133. if info and c == "`":
  134. # CommonMark Example 145
  135. # Info strings for backtick code blocks cannot contain backticks
  136. if info.find(c) != -1:
  137. return None
  138. _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M)
  139. cursor_start = m.end() + 1
  140. m2 = _end.search(state.src, cursor_start)
  141. if m2:
  142. code = state.src[cursor_start : m2.start()]
  143. end_pos = m2.end()
  144. else:
  145. code = state.src[cursor_start:]
  146. end_pos = state.cursor_max
  147. if spaces and code:
  148. _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M)
  149. code = _trim_pattern.sub("", code)
  150. token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker}
  151. if info:
  152. info = unescape_char(info)
  153. token["attrs"] = {"info": info.strip()}
  154. state.append_token(token)
  155. return end_pos
  156. def parse_atx_heading(self, m: Match[str], state: BlockState) -> int:
  157. """Parse token for ATX heading. An ATX heading is started with 1 to 6
  158. symbol of ``#``."""
  159. level = len(m.group("atx_1"))
  160. text = m.group("atx_2").strip(string.whitespace)
  161. # remove last #
  162. if text:
  163. text = _ATX_HEADING_TRIM.sub("", text)
  164. token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"}
  165. state.append_token(token)
  166. return m.end() + 1
  167. def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]:
  168. """Parse token for setex style heading. A setex heading syntax looks like:
  169. .. code-block:: markdown
  170. H1 title
  171. ========
  172. """
  173. last_token = state.last_token()
  174. if last_token and last_token["type"] == "paragraph":
  175. level = 1 if m.group("setext_1") == "=" else 2
  176. last_token["type"] = "heading"
  177. last_token["style"] = "setext"
  178. last_token["attrs"] = {"level": level}
  179. return m.end() + 1
  180. sc = self.compile_sc(["thematic_break", "list"])
  181. m2 = sc.match(state.src, state.cursor)
  182. if m2:
  183. return self.parse_method(m2, state)
  184. return None
  185. def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
  186. """Parse link references and save the link information into ``state.env``.
  187. Here is an example of a link reference:
  188. .. code-block:: markdown
  189. a [link][example]
  190. [example]: https://example.com "Optional title"
  191. This method will save the link reference into ``state.env`` as::
  192. state.env['ref_links']['example'] = {
  193. 'url': 'https://example.com',
  194. 'title': "Optional title",
  195. }
  196. """
  197. end_pos = state.append_paragraph()
  198. if end_pos:
  199. return end_pos
  200. label = m.group("reflink_1")
  201. key = unikey(label)
  202. if not key:
  203. return None
  204. href, href_pos = parse_link_href(state.src, m.end(), block=True)
  205. if href is None:
  206. return None
  207. assert href_pos is not None
  208. _blank = self.BLANK_LINE.search(state.src, href_pos)
  209. if _blank:
  210. max_pos = _blank.start()
  211. else:
  212. max_pos = state.cursor_max
  213. title, title_pos = parse_link_title(state.src, href_pos, max_pos)
  214. if title_pos:
  215. m2 = _BLANK_TO_LINE.match(state.src, title_pos)
  216. if m2:
  217. title_pos = m2.end()
  218. else:
  219. title_pos = None
  220. title = None
  221. if title_pos is None:
  222. m3 = _BLANK_TO_LINE.match(state.src, href_pos)
  223. if m3:
  224. href_pos = m3.end()
  225. else:
  226. href_pos = None
  227. href = None
  228. end_pos = title_pos or href_pos
  229. if not end_pos:
  230. return None
  231. if key not in state.env["ref_links"]:
  232. assert href is not None
  233. href = unescape_char(href)
  234. data = {"url": escape_url(href), "label": label}
  235. if title:
  236. data["title"] = title
  237. state.env["ref_links"][key] = data
  238. return end_pos
  239. def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
  240. """Extract text and cursor end position of a block quote."""
  241. # cleanup at first to detect if it is code block
  242. text = m.group("quote_1") + "\n"
  243. text = expand_leading_tab(text, 3)
  244. text = _BLOCK_QUOTE_TRIM.sub("", text)
  245. sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
  246. require_marker = bool(sc.match(text))
  247. state.cursor = m.end() + 1
  248. end_pos: Optional[int] = None
  249. if require_marker:
  250. m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
  251. if m2:
  252. quote = m2.group(0)
  253. quote = _BLOCK_QUOTE_LEADING.sub("", quote)
  254. quote = expand_leading_tab(quote, 3)
  255. quote = _BLOCK_QUOTE_TRIM.sub("", quote)
  256. text += quote
  257. state.cursor = m2.end()
  258. else:
  259. prev_blank_line = False
  260. break_sc = self.compile_sc(
  261. [
  262. "blank_line",
  263. "thematic_break",
  264. "fenced_code",
  265. "list",
  266. "block_html",
  267. ]
  268. )
  269. while state.cursor < state.cursor_max:
  270. m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
  271. if m3:
  272. quote = m3.group(0)
  273. quote = _BLOCK_QUOTE_LEADING.sub("", quote)
  274. quote = expand_leading_tab(quote, 3)
  275. quote = _BLOCK_QUOTE_TRIM.sub("", quote)
  276. text += quote
  277. state.cursor = m3.end()
  278. if not quote.strip():
  279. prev_blank_line = True
  280. else:
  281. prev_blank_line = bool(_LINE_BLANK_END.search(quote))
  282. continue
  283. if prev_blank_line:
  284. # CommonMark Example 249
  285. # because of laziness, a blank line is needed between
  286. # a block quote and a following paragraph
  287. break
  288. m4 = break_sc.match(state.src, state.cursor)
  289. if m4:
  290. end_pos = self.parse_method(m4, state)
  291. if end_pos:
  292. break
  293. # lazy continuation line
  294. pos = state.find_line_end()
  295. line = state.get_text(pos)
  296. line = expand_leading_tab(line, 3)
  297. text += line
  298. state.cursor = pos
  299. # according to CommonMark Example 6, the second tab should be
  300. # treated as 4 spaces
  301. return expand_tab(text), end_pos
  302. def parse_block_quote(self, m: Match[str], state: BlockState) -> int:
  303. """Parse token for block quote. Here is an example of the syntax:
  304. .. code-block:: markdown
  305. > a block quote starts
  306. > with right arrows
  307. """
  308. text, end_pos = self.extract_block_quote(m, state)
  309. # scan children state
  310. child = state.child_state(text)
  311. if state.depth() >= self.max_nested_level - 1:
  312. rules = list(self.block_quote_rules)
  313. rules.remove("block_quote")
  314. else:
  315. rules = self.block_quote_rules
  316. self.parse(child, rules)
  317. token = {"type": "block_quote", "children": child.tokens}
  318. if end_pos:
  319. state.prepend_token(token)
  320. return end_pos
  321. state.append_token(token)
  322. return state.cursor
  323. def parse_list(self, m: Match[str], state: BlockState) -> int:
  324. """Parse tokens for ordered and unordered list."""
  325. return parse_list(self, m, state)
  326. def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]:
  327. return self.parse_raw_html(m, state)
  328. def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]:
  329. marker = m.group(0).strip()
  330. # rule 2
  331. if marker == "<!--":
  332. return _parse_html_to_end(state, "-->", m.end())
  333. # rule 3
  334. if marker == "<?":
  335. return _parse_html_to_end(state, "?>", m.end())
  336. # rule 5
  337. if marker == "<![CDATA[":
  338. return _parse_html_to_end(state, "]]>", m.end())
  339. # rule 4
  340. if marker.startswith("<!"):
  341. return _parse_html_to_end(state, ">", m.end())
  342. close_tag = None
  343. open_tag = None
  344. if marker.startswith("</"):
  345. close_tag = marker[2:].lower()
  346. # rule 6
  347. if close_tag in BLOCK_TAGS:
  348. return _parse_html_to_newline(state, self.BLANK_LINE)
  349. else:
  350. open_tag = marker[1:].lower()
  351. # rule 1
  352. if open_tag in PRE_TAGS:
  353. end_tag = "</" + open_tag + ">"
  354. return _parse_html_to_end(state, end_tag, m.end())
  355. # rule 6
  356. if open_tag in BLOCK_TAGS:
  357. return _parse_html_to_newline(state, self.BLANK_LINE)
  358. # Blocks of type 7 may not interrupt a paragraph.
  359. end_pos = state.append_paragraph()
  360. if end_pos:
  361. return end_pos
  362. # rule 7
  363. start_pos = m.end()
  364. end_pos = state.find_line_end()
  365. if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or (
  366. close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)
  367. ):
  368. return _parse_html_to_newline(state, self.BLANK_LINE)
  369. return None
  370. def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
  371. sc = self.compile_sc(rules)
  372. while state.cursor < state.cursor_max:
  373. m = sc.search(state.src, state.cursor)
  374. if not m:
  375. break
  376. end_pos = m.start()
  377. if end_pos > state.cursor:
  378. text = state.get_text(end_pos)
  379. state.add_paragraph(text)
  380. state.cursor = end_pos
  381. end_pos2 = self.parse_method(m, state)
  382. if end_pos2:
  383. state.cursor = end_pos2
  384. else:
  385. end_pos3 = state.find_line_end()
  386. text = state.get_text(end_pos3)
  387. state.add_paragraph(text)
  388. state.cursor = end_pos3
  389. if state.cursor < state.cursor_max:
  390. text = state.src[state.cursor :]
  391. state.add_paragraph(text)
  392. state.cursor = state.cursor_max
  393. def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
  394. marker_pos = state.src.find(end_marker, start_pos)
  395. if marker_pos == -1:
  396. text = state.src[state.cursor :]
  397. end_pos = state.cursor_max
  398. else:
  399. text = state.get_text(marker_pos)
  400. state.cursor = marker_pos
  401. end_pos = state.find_line_end()
  402. text += state.get_text(end_pos)
  403. state.append_token({"type": "block_html", "raw": text})
  404. return end_pos
  405. def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
  406. m = newline.search(state.src, state.cursor)
  407. if m:
  408. end_pos = m.start()
  409. text = state.get_text(end_pos)
  410. else:
  411. text = state.src[state.cursor :]
  412. end_pos = state.cursor_max
  413. state.append_token({"type": "block_html", "raw": text})
  414. return end_pos