list_parser.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. """because list is complex, split list parser in a new file"""
  2. from __future__ import annotations
  3. import re
  4. from typing import TYPE_CHECKING, Any, Iterable, Optional, Match
  5. from .util import expand_leading_tab, expand_tab, strip_end
  6. if TYPE_CHECKING:
  7. from .block_parser import BlockParser
  8. from .core import BlockState
# Pattern for the first line of a list item. Named groups:
#   list_1 - zero to three leading spaces
#   list_2 - the marker: one of `*`, `+`, `-`, or 1-9 digits followed by
#            `.` or `)`
#   list_3 - the rest of the line: either only spaces/tabs, or one space/tab
#            followed by at least one more character
LIST_PATTERN = (
    r"^(?P<list_1> {0,3})"
    r"(?P<list_2>[\*\+-]|\d{1,9}[.)])"
    r"(?P<list_3>[ \t]*|[ \t].+)$"
)

# Matches text containing a non-whitespace character; group 1 captures the
# whitespace that precedes the first such character.
_LINE_HAS_TEXT = re.compile(r"(\s*)\S")
  15. def parse_list(block: "BlockParser", m: Match[str], state: "BlockState") -> int:
  16. """Parse tokens for ordered and unordered list."""
  17. text = m.group("list_3")
  18. if not text.strip():
  19. # Example 285
  20. # an empty list item cannot interrupt a paragraph
  21. end_pos = state.append_paragraph()
  22. if end_pos:
  23. return end_pos
  24. marker = m.group("list_2")
  25. ordered = len(marker) > 1
  26. depth = state.depth()
  27. token: dict[str, Any] = {
  28. "type": "list",
  29. "children": [],
  30. "tight": True,
  31. "bullet": marker[-1],
  32. "attrs": {
  33. "depth": depth,
  34. "ordered": ordered,
  35. },
  36. }
  37. if ordered:
  38. start = int(marker[:-1])
  39. if start != 1:
  40. # Example 304
  41. # we allow only lists starting with 1 to interrupt paragraphs
  42. end_pos = state.append_paragraph()
  43. if end_pos:
  44. return end_pos
  45. token["attrs"]["start"] = start
  46. state.cursor = m.end() + 1
  47. groups: Optional[tuple[str, str, str]] = (m.group("list_1"), marker, text)
  48. if depth >= block.max_nested_level - 1:
  49. rules = list(block.list_rules)
  50. rules.remove("list")
  51. else:
  52. rules = block.list_rules
  53. bullet = _get_list_bullet(marker[-1])
  54. while groups:
  55. groups = _parse_list_item(block, bullet, groups, token, state, rules)
  56. end_pos = token.pop("_end_pos", None)
  57. _transform_tight_list(token)
  58. if end_pos:
  59. index = token.pop("_tok_index")
  60. state.tokens.insert(index, token)
  61. return end_pos # type: ignore[no-any-return]
  62. state.append_token(token)
  63. return state.cursor
  64. def _transform_tight_list(token: dict[str, Any]) -> None:
  65. if token["tight"]:
  66. # reset tight list item
  67. for list_item in token["children"]:
  68. for tok in list_item["children"]:
  69. if tok["type"] == "paragraph":
  70. tok["type"] = "block_text"
  71. elif tok["type"] == "list":
  72. _transform_tight_list(tok)
def _parse_list_item(
    block: "BlockParser",
    bullet: str,
    groups: tuple[str, str, str],
    token: dict[str, Any],
    state: "BlockState",
    rules: list[str],
) -> tuple[str, str, str] | None:
    """Parse one list item and append it to ``token["children"]``.

    :param block: block parser; its ``specification``, ``BLANK_LINE``,
        ``parse_method`` and ``parse`` members are used.
    :param bullet: regex fragment matching markers of sibling items.
    :param groups: ``(leading spaces, marker, first-line text)`` of the item.
    :param token: the enclosing ``list`` token; its ``tight`` flag may be
        cleared and the private ``_tok_index`` / ``_end_pos`` keys may be set.
    :param state: block state; ``state.cursor`` is advanced line by line.
    :param rules: rule names used to parse the item's content.
    :returns: the ``groups`` tuple of the next sibling item when one was
        matched, otherwise ``None``.
    """
    spaces, marker, text = groups
    leading_width = len(spaces) + len(marker)
    text, continue_width = _compile_continue_width(text, leading_width)
    item_pattern = _compile_list_item_pattern(bullet, leading_width)

    # names of the block rules that are checked against lines which are not
    # indented enough to continue this item
    list_item_breaks = [
        "thematic_break",
        "fenced_code",
        "atx_heading",
        "block_quote",
        "block_html",
        "list",
    ]
    if "fenced_directive" in block.specification:
        list_item_breaks.insert(1, "fenced_directive")

    pairs = [(name, block.specification[name]) for name in list_item_breaks]
    if leading_width < 3:
        # replace the first "3" of every pattern with the leading width
        # NOTE(review): presumably this narrows a ``{0,3}`` indent quantifier
        # in the specification patterns - confirm against block.specification
        _repl_w = str(leading_width)
        pairs = [(n, p.replace("3", _repl_w, 1)) for n, p in pairs]

    # the sibling-item rule is tried second, right after "thematic_break"
    pairs.insert(1, ("list_item", item_pattern))
    # one named group per rule; each alternative must directly follow a "\n"
    regex = "|".join(r"(?P<%s>(?<=\n)%s)" % pair for pair in pairs)
    sc = re.compile(regex, re.M)

    src = ""  # raw lines collected for this item after its first line
    next_group = None
    prev_blank_line = False
    pos = state.cursor
    continue_space = " " * continue_width

    while pos < state.cursor_max:
        pos = state.find_line_end()
        line = state.get_text(pos)
        if block.BLANK_LINE.match(line):
            # blank lines are kept as bare line breaks
            src += "\n"
            prev_blank_line = True
            state.cursor = pos
            continue

        line = expand_leading_tab(line)
        if line.startswith(continue_space):
            # the line is indented enough to belong to this item
            if prev_blank_line and not text and not src.strip():
                # Example 280
                # A list item can begin with at most one blank line
                break
            src += line
            prev_blank_line = False
            state.cursor = pos
            continue

        # not indented enough: test the break rules at the current cursor
        m = sc.match(state.src, state.cursor)
        if m:
            tok_type = m.lastgroup
            if tok_type == "list_item":
                # a sibling item; a blank line in between makes the list loose
                if prev_blank_line:
                    token["tight"] = False
                next_group = (m.group("listitem_1"), m.group("listitem_2"), m.group("listitem_3"))
                state.cursor = m.end() + 1
                break
            if tok_type == "list":
                # a list line that did not match the sibling-item rule ends
                # this item without being consumed here
                break
            tok_index = len(state.tokens)
            end_pos = block.parse_method(m, state)
            if end_pos:
                # remember where the enclosing list token has to be inserted
                # and which position ``parse_list`` should return
                token["_tok_index"] = tok_index
                token["_end_pos"] = end_pos
                break

        if prev_blank_line and not line.startswith(continue_space):
            # not a continue line, and previous line is blank
            break

        # otherwise the line is appended to this item's text as well
        src += line
        state.cursor = pos

    text += _clean_list_item_text(src, continue_width)
    child = state.child_state(strip_end(text))
    block.parse(child, rules)

    if token["tight"] and _is_loose_list(child.tokens):
        token["tight"] = False

    token["children"].append(
        {
            "type": "list_item",
            "children": child.tokens,
        }
    )
    if next_group:
        return next_group
    return None
  161. def _get_list_bullet(c: str) -> str:
  162. if c == ".":
  163. bullet = r"\d{0,9}\."
  164. elif c == ")":
  165. bullet = r"\d{0,9}\)"
  166. elif c == "*":
  167. bullet = r"\*"
  168. elif c == "+":
  169. bullet = r"\+"
  170. else:
  171. bullet = "-"
  172. return bullet
  173. def _compile_list_item_pattern(bullet: str, leading_width: int) -> str:
  174. if leading_width > 3:
  175. leading_width = 3
  176. return (
  177. r"^(?P<listitem_1> {0," + str(leading_width) + "})"
  178. r"(?P<listitem_2>" + bullet + ")"
  179. r"(?P<listitem_3>[ \t]*|[ \t][^\n]+)$"
  180. )
  181. def _compile_continue_width(text: str, leading_width: int) -> tuple[str, int]:
  182. text = expand_leading_tab(text, 3)
  183. text = expand_tab(text)
  184. m2 = _LINE_HAS_TEXT.match(text)
  185. if m2:
  186. # indent code, startswith 5 spaces
  187. if text.startswith(" "):
  188. space_width = 1
  189. else:
  190. space_width = len(m2.group(1))
  191. text = text[space_width:] + "\n"
  192. else:
  193. space_width = 1
  194. text = ""
  195. continue_width = leading_width + space_width
  196. return text, continue_width
  197. def _clean_list_item_text(src: str, continue_width: int) -> str:
  198. # according to Example 7, tab should be treated as 3 spaces
  199. rv = []
  200. trim_space = " " * continue_width
  201. lines = src.split("\n")
  202. for line in lines:
  203. if line.startswith(trim_space):
  204. line = line.replace(trim_space, "", 1)
  205. # according to CommonMark Example 5
  206. # tab should be treated as 4 spaces
  207. line = expand_tab(line)
  208. rv.append(line)
  209. else:
  210. rv.append(line)
  211. return "\n".join(rv)
  212. def _is_loose_list(tokens: Iterable[dict[str, Any]]) -> bool:
  213. paragraph_count = 0
  214. for tok in tokens:
  215. if tok["type"] == "blank_line":
  216. return True
  217. if tok["type"] == "paragraph":
  218. paragraph_count += 1
  219. if paragraph_count > 1:
  220. return True
  221. return False