inline_parser.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. import re
  2. from typing import (
  3. Any,
  4. Dict,
  5. List,
  6. Match,
  7. MutableMapping,
  8. Optional,
  9. )
  10. from .core import InlineState, Parser
  11. from .helpers import (
  12. HTML_ATTRIBUTES,
  13. HTML_TAGNAME,
  14. PREVENT_BACKSLASH,
  15. PUNCTUATION,
  16. parse_link,
  17. parse_link_label,
  18. parse_link_text,
  19. unescape_char,
  20. )
  21. from .util import escape_url, unikey
# Optional whitespace followed by a closing parenthesis.
PAREN_END_RE = re.compile(r"\s*\)")

# Pattern source for an e-mail autolink in angle brackets, e.g.
# ``<user@example.com>``: a local part, "@", then one or more dot-separated
# labels.  Each label starts and ends with an alphanumeric character and is
# at most 63 characters long (1 + up to 61 + 1).
AUTO_EMAIL = (
    r"""<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]"""
    r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
    r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>"
)
  28. INLINE_HTML = (
  29. r"<" + HTML_TAGNAME + HTML_ATTRIBUTES + r"\s*/?>|" # open tag
  30. r"</" + HTML_TAGNAME + r"\s*>|" # close tag
  31. r"<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|" # comment
  32. r"<\?[\s\S]+?\?>|" # script like <?php?>
  33. r"<![A-Z][\s\S]+?>|" # doctype
  34. r"<!\[CDATA[\s\S]+?\]\]>" # cdata
  35. )
# Closing-delimiter patterns for emphasis, keyed by the opening marker
# (one, two or three ``*`` or ``_``).
#
# Each pattern matches one character of content followed by the closing run:
# the content character is either an escaped marker (``\*`` / ``\_``, guarded
# by PREVENT_BACKSLASH -- NOTE(review): presumably a check that the backslash
# is not itself escaped; confirm in .helpers) or any character that is
# neither whitespace nor the marker character.  The closing run must not be
# followed by one more marker character; the underscore variants must also
# end on a word boundary.
EMPHASIS_END_RE = {
    "*": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*(?!\*)"),
    "_": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])_(?!_)\b"),
    "**": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*(?!\*)"),
    "__": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])__(?!_)\b"),
    "***": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*\*(?!\*)"),
    "___": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])___(?!_)\b"),
}
class InlineParser(Parser[InlineState]):
    """Inline-level Markdown parser.

    ``SPECIFICATION`` maps each rule name to a regex that detects where a
    token of that kind *starts*.  The ``parse_<rule>`` method of the same
    name then consumes the token and appends the result to the state.
    """

    # NOTE(review): presumably the regex flags the Parser base class uses
    # when compiling the combined rule scanner -- confirm in .core.
    sc_flag = 0
    # State class instantiated by ``__call__`` for every parsed string.
    state_cls = InlineState

    #: standard line break: a backslash or two or more trailing spaces
    #: immediately before the newline
    STD_LINEBREAK = r"(?:\\| {2,})\n\s*"

    #: every new line becomes <br>
    HARD_LINEBREAK = r" *\n\s*"

    # we only need to find the start pattern of an inline token
    SPECIFICATION = {
        # e.g. \`, \$
        "escape": r"(?:\\" + PUNCTUATION + ")+",
        # `code, ```code
        "codespan": r"`{1,}",
        # *w, **w, _w, __w
        "emphasis": r"\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])",
        # [link], ![img]
        "link": r"!?\[",
        # <https://example.com>. regex copied from commonmark.js
        "auto_link": r"<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>",
        "auto_email": AUTO_EMAIL,
        "inline_html": INLINE_HTML,
        "linebreak": STD_LINEBREAK,
        # Any newline; ``__init__`` only activates this rule when hard_wrap
        # is off, so it catches the newlines "linebreak" did not claim.
        "softbreak": HARD_LINEBREAK,
        # Prefix-only patterns used by ``precedence_scan``: a hit is then
        # confirmed against the full rule with the "prec_" prefix removed.
        "prec_auto_link": r"<[A-Za-z][A-Za-z\d.+-]{1,31}:",
        "prec_inline_html": r"</?" + HTML_TAGNAME + r"|<!|<\?",
    }
    # Rule names enabled by default.
    # NOTE(review): presumably copied into ``self.rules`` by the Parser base
    # class -- confirm in .core.
    DEFAULT_RULES = (
        "escape",
        "codespan",
        "emphasis",
        "link",
        "auto_link",
        "auto_email",
        "inline_html",
        "linebreak",
    )
  80. def __init__(self, hard_wrap: bool = False) -> None:
  81. super(InlineParser, self).__init__()
  82. self.hard_wrap = hard_wrap
  83. # lazy add linebreak
  84. if hard_wrap:
  85. self.specification["linebreak"] = self.HARD_LINEBREAK
  86. else:
  87. self.rules.append("softbreak")
  88. self._methods = {name: getattr(self, "parse_" + name) for name in self.rules}
  89. def parse_escape(self, m: Match[str], state: InlineState) -> int:
  90. text = m.group(0)
  91. text = unescape_char(text)
  92. state.append_token(
  93. {
  94. "type": "text",
  95. "raw": text,
  96. }
  97. )
  98. return m.end()
  99. def parse_link(self, m: Match[str], state: InlineState) -> Optional[int]:
  100. pos = m.end()
  101. marker = m.group(0)
  102. is_image = marker[0] == "!"
  103. if is_image and state.in_image:
  104. state.append_token({"type": "text", "raw": marker})
  105. return pos
  106. elif not is_image and state.in_link:
  107. state.append_token({"type": "text", "raw": marker})
  108. return pos
  109. text = None
  110. label, end_pos = parse_link_label(state.src, pos)
  111. if label is None:
  112. text, end_pos = parse_link_text(state.src, pos)
  113. if text is None:
  114. return None
  115. assert end_pos is not None
  116. if text is None:
  117. text = label
  118. assert text is not None
  119. if end_pos >= len(state.src) and label is None:
  120. return None
  121. rules = ["codespan", "prec_auto_link", "prec_inline_html"]
  122. prec_pos = self.precedence_scan(m, state, end_pos, rules)
  123. if prec_pos:
  124. return prec_pos
  125. if end_pos < len(state.src):
  126. c = state.src[end_pos]
  127. if c == "(":
  128. # standard link [text](<url> "title")
  129. attrs, pos2 = parse_link(state.src, end_pos + 1)
  130. if pos2:
  131. token = self.__parse_link_token(is_image, text, attrs, state)
  132. state.append_token(token)
  133. return pos2
  134. elif c == "[":
  135. # standard ref link [text][label]
  136. label2, pos2 = parse_link_label(state.src, end_pos + 1)
  137. if pos2:
  138. end_pos = pos2
  139. if label2:
  140. label = label2
  141. if label is None:
  142. return None
  143. ref_links = state.env.get("ref_links")
  144. if not ref_links:
  145. return None
  146. key = unikey(label)
  147. env = ref_links.get(key)
  148. if env:
  149. attrs = {"url": env["url"], "title": env.get("title")}
  150. token = self.__parse_link_token(is_image, text, attrs, state)
  151. token["ref"] = key
  152. token["label"] = label
  153. state.append_token(token)
  154. return end_pos
  155. return None
  156. def __parse_link_token(
  157. self,
  158. is_image: bool,
  159. text: str,
  160. attrs: Optional[Dict[str, Any]],
  161. state: InlineState,
  162. ) -> Dict[str, Any]:
  163. new_state = state.copy()
  164. new_state.src = text
  165. if is_image:
  166. new_state.in_image = True
  167. token = {
  168. "type": "image",
  169. "children": self.render(new_state),
  170. "attrs": attrs,
  171. }
  172. else:
  173. new_state.in_link = True
  174. token = {
  175. "type": "link",
  176. "children": self.render(new_state),
  177. "attrs": attrs,
  178. }
  179. return token
  180. def parse_auto_link(self, m: Match[str], state: InlineState) -> int:
  181. text = m.group(0)
  182. pos = m.end()
  183. if state.in_link:
  184. self.process_text(text, state)
  185. return pos
  186. text = text[1:-1]
  187. self._add_auto_link(text, text, state)
  188. return pos
  189. def parse_auto_email(self, m: Match[str], state: InlineState) -> int:
  190. text = m.group(0)
  191. pos = m.end()
  192. if state.in_link:
  193. self.process_text(text, state)
  194. return pos
  195. text = text[1:-1]
  196. url = "mailto:" + text
  197. self._add_auto_link(url, text, state)
  198. return pos
  199. def _add_auto_link(self, url: str, text: str, state: InlineState) -> None:
  200. state.append_token(
  201. {
  202. "type": "link",
  203. "children": [{"type": "text", "raw": text}],
  204. "attrs": {"url": escape_url(url)},
  205. }
  206. )
  207. def parse_emphasis(self, m: Match[str], state: InlineState) -> int:
  208. pos = m.end()
  209. marker = m.group(0)
  210. mlen = len(marker)
  211. if mlen == 1 and state.in_emphasis:
  212. state.append_token({"type": "text", "raw": marker})
  213. return pos
  214. elif mlen == 2 and state.in_strong:
  215. state.append_token({"type": "text", "raw": marker})
  216. return pos
  217. _end_re = EMPHASIS_END_RE[marker]
  218. m1 = _end_re.search(state.src, pos)
  219. if not m1:
  220. state.append_token({"type": "text", "raw": marker})
  221. return pos
  222. end_pos = m1.end()
  223. text = state.src[pos : end_pos - mlen]
  224. prec_pos = self.precedence_scan(m, state, end_pos)
  225. if prec_pos:
  226. return prec_pos
  227. new_state = state.copy()
  228. new_state.src = text
  229. if mlen == 1:
  230. new_state.in_emphasis = True
  231. children = self.render(new_state)
  232. state.append_token({"type": "emphasis", "children": children})
  233. elif mlen == 2:
  234. new_state.in_strong = True
  235. children = self.render(new_state)
  236. state.append_token({"type": "strong", "children": children})
  237. else:
  238. new_state.in_emphasis = True
  239. new_state.in_strong = True
  240. children = [{"type": "strong", "children": self.render(new_state)}]
  241. state.append_token(
  242. {
  243. "type": "emphasis",
  244. "children": children,
  245. }
  246. )
  247. return end_pos
  248. def parse_codespan(self, m: Match[str], state: InlineState) -> int:
  249. marker = m.group(0)
  250. # require same marker with same length at end
  251. pattern = re.compile(r"(.*?[^`])" + marker + r"(?!`)", re.S)
  252. pos = m.end()
  253. m2 = pattern.match(state.src, pos)
  254. if m2:
  255. end_pos = m2.end()
  256. code = m2.group(1)
  257. # Line endings are treated like spaces
  258. code = code.replace("\n", " ")
  259. if len(code.strip()):
  260. if code.startswith(" ") and code.endswith(" "):
  261. code = code[1:-1]
  262. state.append_token({"type": "codespan", "raw": code})
  263. return end_pos
  264. else:
  265. state.append_token({"type": "text", "raw": marker})
  266. return pos
  267. def parse_linebreak(self, m: Match[str], state: InlineState) -> int:
  268. state.append_token({"type": "linebreak"})
  269. return m.end()
  270. def parse_softbreak(self, m: Match[str], state: InlineState) -> int:
  271. state.append_token({"type": "softbreak"})
  272. return m.end()
  273. def parse_inline_html(self, m: Match[str], state: InlineState) -> int:
  274. end_pos = m.end()
  275. html = m.group(0)
  276. state.append_token({"type": "inline_html", "raw": html})
  277. if html.startswith(("<a ", "<a>", "<A ", "<A>")):
  278. state.in_link = True
  279. elif html.startswith(("</a ", "</a>", "</A ", "</A>")):
  280. state.in_link = False
  281. return end_pos
  282. def process_text(self, text: str, state: InlineState) -> None:
  283. state.append_token({"type": "text", "raw": text})
  284. def parse(self, state: InlineState) -> List[Dict[str, Any]]:
  285. pos = 0
  286. sc = self.compile_sc()
  287. while pos < len(state.src):
  288. m = sc.search(state.src, pos)
  289. if not m:
  290. break
  291. end_pos = m.start()
  292. if end_pos > pos:
  293. hole = state.src[pos:end_pos]
  294. self.process_text(hole, state)
  295. new_pos = self.parse_method(m, state)
  296. if not new_pos:
  297. # move cursor 1 character forward
  298. pos = end_pos + 1
  299. hole = state.src[end_pos:pos]
  300. self.process_text(hole, state)
  301. else:
  302. pos = new_pos
  303. if pos == 0:
  304. # special case, just pure text
  305. self.process_text(state.src, state)
  306. elif pos < len(state.src):
  307. self.process_text(state.src[pos:], state)
  308. return state.tokens
  309. def precedence_scan(
  310. self,
  311. m: Match[str],
  312. state: InlineState,
  313. end_pos: int,
  314. rules: Optional[List[str]] = None,
  315. ) -> Optional[int]:
  316. if rules is None:
  317. rules = ["codespan", "link", "prec_auto_link", "prec_inline_html"]
  318. mark_pos = m.end()
  319. sc = self.compile_sc(rules)
  320. m1 = sc.search(state.src, mark_pos, end_pos)
  321. if not m1:
  322. return None
  323. lastgroup = m1.lastgroup
  324. if not lastgroup:
  325. return None
  326. rule_name = lastgroup.replace("prec_", "")
  327. sc = self.compile_sc([rule_name])
  328. m2 = sc.match(state.src, m1.start())
  329. if not m2:
  330. return None
  331. func = self._methods[rule_name]
  332. new_state = state.copy()
  333. new_state.src = state.src
  334. m2_pos = func(m2, new_state)
  335. if not m2_pos or m2_pos < end_pos:
  336. return None
  337. raw_text = state.src[m.start() : m2.start()]
  338. state.append_token({"type": "text", "raw": raw_text})
  339. for token in new_state.tokens:
  340. state.append_token(token)
  341. return m2_pos
  342. def render(self, state: InlineState) -> List[Dict[str, Any]]:
  343. self.parse(state)
  344. return state.tokens
  345. def __call__(self, s: str, env: MutableMapping[str, Any]) -> List[Dict[str, Any]]:
  346. state = self.state_cls(env)
  347. state.src = s
  348. return self.render(state)