| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498 |
- import re
- from typing import Optional, List, Tuple, Match, Pattern
- import string
- from .util import (
- unikey,
- escape_url,
- expand_tab,
- expand_leading_tab,
- )
- from .core import Parser, BlockState
- from .helpers import (
- LINK_LABEL,
- HTML_TAGNAME,
- HTML_ATTRIBUTES,
- BLOCK_TAGS,
- PRE_TAGS,
- unescape_char,
- parse_link_href,
- parse_link_title,
- )
- from .list_parser import parse_list, LIST_PATTERN
# --- helpers for indented code -------------------------------------------
# Up to four leading spaces at the start of every line.
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", re.MULTILINE)

# --- helpers for ATX headings --------------------------------------------
# A closing run of ``#`` (preceded by whitespace or standing alone) together
# with any whitespace that trails it.
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")

# --- helpers for block quotes --------------------------------------------
# At most one space at the start of every line.
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", re.MULTILINE)
# Optional spaces followed by the ``>`` marker at the start of every line.
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", re.MULTILINE)
# Text whose final line consists only of spaces/tabs (i.e. ends on a blank line).
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")
# One or more consecutive lines that each start with 0-3 spaces and ``>``.
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")

# --- helpers for link reference definitions ------------------------------
# Optional spaces/tabs up to and including the next newline.
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")

# --- helpers for HTML blocks ---------------------------------------------
# Alternation of every tag name listed in BLOCK_TAGS and PRE_TAGS.
_BLOCK_TAGS_PATTERN = f"({'|'.join(BLOCK_TAGS)}|{'|'.join(PRE_TAGS)})"
# Remainder of an opening tag: attributes, ``>``, then nothing but spaces/tabs
# until the end of the line.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")
# Remainder of a closing tag: ``>``, then nothing but spaces/tabs until the
# end of the line.
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")
class BlockParser(Parser[BlockState]):
    """Block-level parser that turns Markdown source into block tokens.

    Every rule named in ``SPECIFICATION`` is paired with a ``parse_<name>``
    method; the pairing is registered in ``__init__``.  A parse method
    receives the regex match and the current state, appends tokens to the
    state and returns the position at which scanning should resume.  A falsy
    return value means the rule did not apply, in which case :meth:`parse`
    consumes the current line as paragraph text instead.
    """

    state_cls = BlockState

    # One or more consecutive lines containing only whitespace.
    BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)

    # Start of any HTML construct handled by ``parse_raw_html``.
    RAW_HTML = (
        r"^ {0,3}("
        r"</?" + HTML_TAGNAME + r"|"
        r"<!--|"  # comment
        r"<\?|"  # processing instruction
        r"<![A-Z]|"  # declaration
        r"<!\[CDATA\[)"  # CDATA section
    )

    # Like RAW_HTML, but tag names are restricted to BLOCK_TAGS / PRE_TAGS.
    BLOCK_HTML = (
        r"^ {0,3}(?:"
        r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
        r"|<!--"  # comment
        r"|<\?"  # processing instruction
        r"|<![A-Z]"  # declaration
        r"|<!\[CDATA\[)"  # CDATA section
    )

    # Rule name -> regex source.  Each key needs a matching ``parse_<key>``
    # method on this class (see ``__init__``).
    SPECIFICATION = {
        "blank_line": r"(^[ \t\v\f]*\n)+",
        "atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
        "setex_heading": r"^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$",
        "fenced_code": (
            r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
            r"[ \t]*(?P<fenced_3>.*?)$"
        ),
        "indent_code": (
            r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
            r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
        ),
        "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
        "ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
        "block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
        "list": LIST_PATTERN,
        "block_html": BLOCK_HTML,
        "raw_html": RAW_HTML,
    }

    # Rules used when the caller does not supply its own list.  Note that
    # "block_html" is not part of the defaults; within this class it is only
    # used as a break rule in ``extract_block_quote``.
    DEFAULT_RULES = (
        "fenced_code",
        "indent_code",
        "atx_heading",
        "setex_heading",
        "thematic_break",
        "block_quote",
        "list",
        "ref_link",
        "raw_html",
        "blank_line",
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]] = None,
        list_rules: Optional[List[str]] = None,
        max_nested_level: int = 6,
    ):
        """Create a block parser.

        :param block_quote_rules: rule names applied to the content of a
            block quote; defaults to a copy of ``DEFAULT_RULES``.
        :param list_rules: rule names stored as ``self.list_rules``;
            defaults to a copy of ``DEFAULT_RULES``.
        :param max_nested_level: nesting limit; once ``state.depth()``
            reaches ``max_nested_level - 1`` the ``block_quote`` rule is no
            longer applied inside block quotes.
        """
        super().__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
        # register default parse methods
        self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION}

    def parse_blank_line(self, m: Match[str], state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({"type": "blank_line"})
        return m.end()

    def parse_thematic_break(self, m: Match[str], state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({"type": "thematic_break"})
        # $ does not count '\n'
        return m.end() + 1

    def parse_indent_code(self, m: Match[str], state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces."""
        # it is a part of the paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = expand_leading_tab(m.group(0))
        code = _INDENT_CODE_TRIM.sub("", code)
        code = code.strip("\n")
        state.append_token({"type": "block_code", "raw": code, "style": "indent"})
        return m.end()

    def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        :return: the position after the closing fence, ``state.cursor_max``
            when the fence is never closed, or ``None`` when a backtick
            fence has a backtick in its info string.
        """
        spaces = m.group("fenced_1")
        marker = m.group("fenced_2")
        info = m.group("fenced_3")

        c = marker[0]
        # CommonMark Example 145
        # Info strings for backtick code blocks cannot contain backticks
        if info and c == "`" and c in info:
            return None

        # The closing fence uses the same character and is at least as long
        # as the opening one.
        _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M)
        # skip the newline that terminates the opening fence line
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start : m2.start()]
            end_pos = m2.end()
        else:
            # unclosed fence: the code runs to the end of the source
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # remove, from every code line, up to as many leading spaces as
            # the opening fence was indented by
            _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M)
            code = _trim_pattern.sub("", code)

        token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker}
        if info:
            info = unescape_char(info)
            token["attrs"] = {"info": info.strip()}

        state.append_token(token)
        return end_pos

    def parse_atx_heading(self, m: Match[str], state: BlockState) -> int:
        """Parse token for ATX heading. An ATX heading is started with 1 to 6
        symbol of ``#``."""
        level = len(m.group("atx_1"))
        text = m.group("atx_2").strip(string.whitespace)
        # remove last #
        if text:
            text = _ATX_HEADING_TRIM.sub("", text)

        token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"}
        state.append_token(token)
        # $ does not count '\n'
        return m.end() + 1

    def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for setex style heading. A setex heading syntax looks like:

        .. code-block:: markdown

            H1 title
            ========

        When the previous token is a paragraph it is converted in place into
        a heading (level 1 for ``=``, level 2 for ``-``).  Otherwise the line
        is retried as a thematic break or list; ``None`` is returned when
        neither matches.
        """
        last_token = state.last_token()
        if last_token and last_token["type"] == "paragraph":
            level = 1 if m.group("setext_1") == "=" else 2
            last_token["type"] = "heading"
            last_token["style"] = "setext"
            last_token["attrs"] = {"level": level}
            return m.end() + 1

        # No paragraph to underline: the same line may still be a thematic
        # break or the start of a list.
        sc = self.compile_sc(["thematic_break", "list"])
        m2 = sc.match(state.src, state.cursor)
        if m2:
            return self.parse_method(m2, state)
        return None

    def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'title': "Optional title",
            }

        The entry also carries the original ``label``.  An existing entry for
        the same key is never overwritten.  ``None`` is returned when the
        text is not a valid definition.
        """
        # it is a part of the paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group("reflink_1")
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        assert href_pos is not None

        # A title is only searched for up to the next blank line.
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        max_pos = _blank.start() if _blank else state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # Only spaces/tabs may follow the title on its line; otherwise
            # the title is discarded.
            m2 = _BLANK_TO_LINE.match(state.src, title_pos)
            if m2:
                title_pos = m2.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # Without a usable title the destination itself must be followed
            # by nothing but spaces/tabs up to the newline.
            m3 = _BLANK_TO_LINE.match(state.src, href_pos)
            if m3:
                href_pos = m3.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        if key not in state.env["ref_links"]:
            assert href is not None
            href = unescape_char(href)
            data = {"url": escape_url(href), "label": label}
            if title:
                data["title"] = title
            state.env["ref_links"][key] = data
        return end_pos

    @staticmethod
    def _strip_quote_markers(quote: str) -> str:
        """Remove the ``>`` marker and one optional following space from each line."""
        quote = _BLOCK_QUOTE_LEADING.sub("", quote)
        quote = expand_leading_tab(quote, 3)
        return _BLOCK_QUOTE_TRIM.sub("", quote)

    def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past every line consumed for the quote.

        :return: ``(text, end_pos)`` where ``text`` is the quote content with
            markers removed and ``end_pos`` is the value returned by the
            parse method of a block that interrupted the quote, or ``None``
            when no such block was parsed.
        """
        # cleanup at first to detect if it is code block
        text = m.group("quote_1") + "\n"
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub("", text)

        # When the first line is blank or opens a code block, following
        # lines only belong to the quote if they carry the ``>`` marker.
        sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
        require_marker = bool(sc.match(text))

        state.cursor = m.end() + 1

        end_pos: Optional[int] = None
        if require_marker:
            m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m2:
                text += self._strip_quote_markers(m2.group(0))
                state.cursor = m2.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc(
                [
                    "blank_line",
                    "thematic_break",
                    "fenced_code",
                    "list",
                    "block_html",
                ]
            )
            while state.cursor < state.cursor_max:
                m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m3:
                    quote = self._strip_quote_markers(m3.group(0))
                    text += quote
                    state.cursor = m3.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m4 = break_sc.match(state.src, state.cursor)
                if m4:
                    end_pos = self.parse_method(m4, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(text), end_pos

    def parse_block_quote(self, m: Match[str], state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        if state.depth() >= self.max_nested_level - 1:
            # Nesting limit reached: parse the content without the
            # ``block_quote`` rule.  Filtering (instead of ``list.remove``)
            # avoids a ValueError when the configured rules do not contain
            # that rule in the first place.
            rules = [name for name in self.block_quote_rules if name != "block_quote"]
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {"type": "block_quote", "children": child.tokens}
        if end_pos:
            # ``extract_block_quote`` already parsed the block that follows
            # the quote, so the quote token is prepended rather than appended
            # (presumably to place it before that block's token -- see
            # ``BlockState.prepend_token``).
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match[str], state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse a ``block_html`` match; handled exactly like ``raw_html``."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse an HTML block starting at the match.

        The "rule" numbers in the comments appear to follow the numbered HTML
        block start conditions of the CommonMark spec.

        :return: the position after the HTML block, the value of
            ``state.append_paragraph()`` when the line was folded into a
            paragraph, or ``None`` when the line is not an HTML block.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == "<!--":
            return _parse_html_to_end(state, "-->", m.end())

        # rule 3
        if marker == "<?":
            return _parse_html_to_end(state, "?>", m.end())

        # rule 5
        if marker == "<![CDATA[":
            return _parse_html_to_end(state, "]]>", m.end())

        # rule 4
        if marker.startswith("<!"):
            return _parse_html_to_end(state, ">", m.end())

        close_tag = None
        open_tag = None
        if marker.startswith("</"):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                # NOTE: the end tag is built from the lower-cased tag name
                # and searched for with a plain (case-sensitive) string find.
                end_tag = "</" + open_tag + ">"
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7
        # The rest of the line must complete the tag and contain nothing else.
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or (
            close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)
        ):
            return _parse_html_to_newline(state, self.BLANK_LINE)

        return None

    def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
        """Tokenize ``state.src`` from ``state.cursor`` to ``state.cursor_max``.

        Text between rule matches, lines whose parse method returns a falsy
        value, and any trailing remainder are added to the state as
        paragraph text.

        :param state: block state to read from and append tokens to.
        :param rules: rule names passed to ``compile_sc``; how ``None`` is
            interpreted is decided there (defined on the base class).
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            # text skipped over before the match is paragraph content
            end_pos = m.start()
            if end_pos > state.cursor:
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos2 = self.parse_method(m, state)
            if end_pos2:
                state.cursor = end_pos2
            else:
                # the rule declined: treat the whole line as paragraph text
                end_pos3 = state.find_line_end()
                text = state.get_text(end_pos3)
                state.add_paragraph(text)
                state.cursor = end_pos3

        if state.cursor < state.cursor_max:
            text = state.src[state.cursor :]
            state.add_paragraph(text)
            state.cursor = state.cursor_max
def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
    """Append a ``block_html`` token that runs up to the line containing ``end_marker``.

    ``end_marker`` is looked up in ``state.src`` starting at ``start_pos``.
    If it is found, the token covers the text from the current cursor through
    the end of the line on which the marker sits (as reported by
    ``state.find_line_end()``).  If it is not found, the token covers
    everything from the cursor to the end of the source.

    :return: the position at which parsing should resume.
    """
    found_at = state.src.find(end_marker, start_pos)

    if found_at < 0:
        # Unterminated construct: swallow the remainder of the source.
        raw = state.src[state.cursor :]
        resume_at = state.cursor_max
    else:
        # Text before the marker, then the rest of the marker's own line.
        before_marker = state.get_text(found_at)
        state.cursor = found_at
        resume_at = state.find_line_end()
        raw = before_marker + state.get_text(resume_at)

    state.append_token({"type": "block_html", "raw": raw})
    return resume_at
def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
    """Append a ``block_html`` token that runs up to the next ``newline`` match.

    ``newline`` is searched in ``state.src`` from the current cursor.  The
    token covers the text before the start of that match, or everything up to
    the end of the source when there is no match.

    :return: the start of the ``newline`` match, or ``state.cursor_max`` when
        nothing matched.
    """
    hit = newline.search(state.src, state.cursor)

    if hit is None:
        raw = state.src[state.cursor :]
        stop = state.cursor_max
    else:
        stop = hit.start()
        raw = state.get_text(stop)

    state.append_token({"type": "block_html", "raw": raw})
    return stop
|