| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269 |
- """because list is complex, split list parser in a new file"""
- from __future__ import annotations
- import re
- from typing import TYPE_CHECKING, Any, Iterable, Optional, Match
- from .util import expand_leading_tab, expand_tab, strip_end
- if TYPE_CHECKING:
- from .block_parser import BlockParser
- from .core import BlockState
# Regex source (a plain string, not compiled here) for the first line of a
# list item. Named groups:
#   list_1 - zero to three spaces in front of the marker
#   list_2 - the marker: ``*``, ``+``, ``-``, or 1-9 digits followed by ``.`` or ``)``
#   list_3 - the rest of the line: only spaces/tabs, or a space/tab followed by text
LIST_PATTERN = (
    r"^(?P<list_1> {0,3})"
    r"(?P<list_2>[\*\+-]|\d{1,9}[.)])"
    r"(?P<list_3>[ \t]*|[ \t].+)$"
)

# Matches text that contains a non-whitespace character; group 1 captures the
# whitespace in front of the first such character.
_LINE_HAS_TEXT = re.compile(r"(\s*)\S")
def parse_list(block: "BlockParser", m: Match[str], state: "BlockState") -> int:
    """Parse tokens for ordered and unordered list.

    :param block: the block parser; ``max_nested_level`` and ``list_rules``
        are read from it and it is handed on to the item parser.
    :param m: match whose ``list_1``/``list_2``/``list_3`` groups hold the
        indentation, marker and remaining text of the first item line.
    :param state: block state; its cursor is moved past the list and the
        resulting ``list`` token is added to ``state.tokens``.
    :return: the position at which parsing should continue. This is either a
        truthy value returned by ``state.append_paragraph()``, the end
        position recorded by the item parser, or ``state.cursor``.
    """
    text = m.group("list_3")
    if not text.strip():
        # Example 285
        # an empty list item cannot interrupt a paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

    marker = m.group("list_2")
    # bullet markers are one character; ordered markers are digits + delimiter
    ordered = len(marker) > 1
    depth = state.depth()
    attrs: dict[str, Any] = {"depth": depth, "ordered": ordered}
    token: dict[str, Any] = {
        "type": "list",
        "children": [],
        "tight": True,
        "bullet": marker[-1],
        "attrs": attrs,
    }
    if ordered:
        start = int(marker[:-1])
        if start != 1:
            # Example 304
            # we allow only lists starting with 1 to interrupt paragraphs
            end_pos = state.append_paragraph()
            if end_pos:
                return end_pos
            attrs["start"] = start

    # continue on the line after the matched marker line
    state.cursor = m.end() + 1

    if depth >= block.max_nested_level - 1:
        # Nesting limit reached: parse item bodies without the "list" rule.
        # Filtering (instead of ``list.remove``) works on a copy and does not
        # raise when a customised ``list_rules`` has no "list" entry.
        rules = [name for name in block.list_rules if name != "list"]
    else:
        rules = block.list_rules

    bullet = _get_list_bullet(marker[-1])
    groups: Optional[tuple[str, str, str]] = (m.group("list_1"), marker, text)
    # each call consumes one item and returns the groups of the next sibling
    while groups:
        groups = _parse_list_item(block, bullet, groups, token, state, rules)

    end_pos = token.pop("_end_pos", None)
    _transform_tight_list(token)
    if end_pos:
        # A block that ended the list has already been appended to
        # ``state.tokens``; put the list token in front of it.
        index = token.pop("_tok_index")
        state.tokens.insert(index, token)
        return end_pos  # type: ignore[no-any-return]

    state.append_token(token)
    return state.cursor
- def _transform_tight_list(token: dict[str, Any]) -> None:
- if token["tight"]:
- # reset tight list item
- for list_item in token["children"]:
- for tok in list_item["children"]:
- if tok["type"] == "paragraph":
- tok["type"] = "block_text"
- elif tok["type"] == "list":
- _transform_tight_list(tok)
def _parse_list_item(
    block: "BlockParser",
    bullet: str,
    groups: tuple[str, str, str],
    token: dict[str, Any],
    state: "BlockState",
    rules: list[str],
) -> tuple[str, str, str] | None:
    """Consume one list item from ``state`` and append it to ``token``.

    :param block: block parser supplying ``specification``, ``BLANK_LINE``,
        ``parse_method`` and ``parse``.
    :param bullet: regex fragment matching sibling markers of this list.
    :param groups: ``(indentation, marker, text)`` of the item's first line.
    :param token: the ``list`` token being built; a ``list_item`` child is
        appended, ``tight`` may be cleared, and ``_tok_index``/``_end_pos``
        are set when another block rule ended the list.
    :param state: block state whose cursor is advanced over the item.
    :param rules: rule names used to parse the item's content.
    :return: the groups of the following sibling item, or ``None`` when the
        list ends here.
    """
    indent, marker, text = groups
    leading_width = len(indent) + len(marker)
    text, continue_width = _compile_continue_width(text, leading_width)

    # block rules that are tried on a line which is not indented enough to
    # belong to the current item
    break_names = ["thematic_break"]
    if "fenced_directive" in block.specification:
        break_names.append("fenced_directive")
    break_names += ["fenced_code", "atx_heading", "block_quote", "block_html", "list"]

    narrow = leading_width < 3
    alternatives = []
    for name in break_names:
        pattern = block.specification[name]
        if narrow:
            # swap the first "3" in the rule's pattern for the item's own
            # leading width
            pattern = pattern.replace("3", str(leading_width), 1)
        alternatives.append((name, pattern))

    # the sibling-item pattern is tried right after the thematic break
    alternatives.insert(1, ("list_item", _compile_list_item_pattern(bullet, leading_width)))
    sc = re.compile(
        "|".join(rf"(?P<{name}>(?<=\n){pattern})" for name, pattern in alternatives),
        re.M,
    )

    collected = ""
    following = None
    after_blank = False
    indent_prefix = " " * continue_width

    line_end = state.cursor
    while line_end < state.cursor_max:
        line_end = state.find_line_end()
        line = state.get_text(line_end)

        if block.BLANK_LINE.match(line):
            collected += "\n"
            after_blank = True
            state.cursor = line_end
            continue

        line = expand_leading_tab(line)
        if line.startswith(indent_prefix):
            if after_blank and not text and not collected.strip():
                # Example 280
                # A list item can begin with at most one blank line
                break
            collected += line
            after_blank = False
            state.cursor = line_end
            continue

        found = sc.match(state.src, state.cursor)
        if found:
            kind = found.lastgroup
            if kind == "list_item":
                # a blank line between sibling items makes the list loose
                if after_blank:
                    token["tight"] = False
                following = (
                    found.group("listitem_1"),
                    found.group("listitem_2"),
                    found.group("listitem_3"),
                )
                state.cursor = found.end() + 1
                break

            if kind == "list":
                # a list marker that is not a sibling of this list
                break

            tok_index = len(state.tokens)
            end_pos = block.parse_method(found, state)
            if end_pos:
                token["_tok_index"] = tok_index
                token["_end_pos"] = end_pos
                break

        # Lines carrying the continuation indent were handled above, so this
        # line is not a continue line; after a blank line it ends the item.
        if after_blank:
            break

        collected += line
        state.cursor = line_end

    text += _clean_list_item_text(collected, continue_width)
    child = state.child_state(strip_end(text))
    block.parse(child, rules)

    if token["tight"] and _is_loose_list(child.tokens):
        token["tight"] = False

    token["children"].append({"type": "list_item", "children": child.tokens})
    return following or None
- def _get_list_bullet(c: str) -> str:
- if c == ".":
- bullet = r"\d{0,9}\."
- elif c == ")":
- bullet = r"\d{0,9}\)"
- elif c == "*":
- bullet = r"\*"
- elif c == "+":
- bullet = r"\+"
- else:
- bullet = "-"
- return bullet
- def _compile_list_item_pattern(bullet: str, leading_width: int) -> str:
- if leading_width > 3:
- leading_width = 3
- return (
- r"^(?P<listitem_1> {0," + str(leading_width) + "})"
- r"(?P<listitem_2>" + bullet + ")"
- r"(?P<listitem_3>[ \t]*|[ \t][^\n]+)$"
- )
def _compile_continue_width(text: str, leading_width: int) -> tuple[str, int]:
    """Work out the first content line and continuation indent of an item.

    :param text: what follows the list marker on the item's first line.
    :param leading_width: width of the indentation plus the marker.
    :return: ``(text, continue_width)``. ``text`` is the first line with the
        separating whitespace removed and a newline appended, or ``""`` when
        the line has no content. ``continue_width`` is the number of leading
        spaces a following line needs in order to belong to this item.
    """
    # tabs are converted first so that widths can be counted in characters
    text = expand_leading_tab(text, 3)
    text = expand_tab(text)

    found = _LINE_HAS_TEXT.match(text)
    if not found:
        # no content after the marker: content starts one column after it
        return "", leading_width + 1

    # indent code, startswith 5 spaces: only one of them separates the marker
    # from the content, the rest stays as the code block's indentation.
    # The prefix is spelled as a repetition so that its width is explicit.
    if text.startswith(" " * 5):
        space_width = 1
    else:
        space_width = len(found.group(1))

    return text[space_width:] + "\n", leading_width + space_width
- def _clean_list_item_text(src: str, continue_width: int) -> str:
- # according to Example 7, tab should be treated as 3 spaces
- rv = []
- trim_space = " " * continue_width
- lines = src.split("\n")
- for line in lines:
- if line.startswith(trim_space):
- line = line.replace(trim_space, "", 1)
- # according to CommonMark Example 5
- # tab should be treated as 4 spaces
- line = expand_tab(line)
- rv.append(line)
- else:
- rv.append(line)
- return "\n".join(rv)
- def _is_loose_list(tokens: Iterable[dict[str, Any]]) -> bool:
- paragraph_count = 0
- for tok in tokens:
- if tok["type"] == "blank_line":
- return True
- if tok["type"] == "paragraph":
- paragraph_count += 1
- if paragraph_count > 1:
- return True
- return False
|