helpers.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. import re
  2. import string
  3. from typing import Any, Dict, Tuple, Union
  4. from .util import escape_url
# Matches an even-length (possibly empty) run of backslashes that is not
# itself preceded by a backslash. Placed in front of a delimiter so that the
# delimiter only matches when it is not backslash-escaped. Note that the
# backslash run is consumed and therefore becomes part of the overall match.
PREVENT_BACKSLASH = r"(?<!\\)(?:\\\\)*"

# Character class containing every character of ``string.punctuation``.
PUNCTUATION = r"[" + re.escape(string.punctuation) + r"]"

# Body of a link label: at most 500 items, each one either a backslash
# followed by any character, or a single character that is neither a
# backslash nor a square bracket.
LINK_LABEL = r"(?:[^\\\[\]]|\\.){0,500}"

# Optional spaces/tabs (with at most one newline among them) followed by "<".
LINK_BRACKET_START = re.compile(r"[ \t]*\n?[ \t]*<")

# "<...>" destination; the captured content may not contain "<", ">",
# a newline, a backslash or NUL.
LINK_BRACKET_RE = re.compile(r"<([^<>\n\\\x00]*)>")

# Bare destination used when ``parse_link_href`` is called with block=True:
# a run of non-whitespace characters followed by whitespace or end of input.
LINK_HREF_BLOCK_RE = re.compile(r"[ \t]*\n?[ \t]*([^\s]+)(?:\s|$)")

# Bare destination used when ``parse_link_href`` is called with block=False:
# the shortest run of non-whitespace characters that is followed by a space,
# tab or newline, or by a ")" that is not backslash-escaped.
LINK_HREF_INLINE_RE = re.compile(
    r"[ \t]*\n?[ \t]*([^ \t\n]*?)(?:[ \t\n]|"
    r"(?:" + PREVENT_BACKSLASH + r"\)))"
)

# Link title: mandatory leading whitespace, then a double- or single-quoted
# string. Backslash-escaped punctuation is allowed inside; NUL is not.
# Group 1 captures the title including its surrounding quotes.
LINK_TITLE_RE = re.compile(
    r"[ \t\n]+("
    r'"(?:\\' + PUNCTUATION + r'|[^"\x00])*"|'  # "title"
    r"'(?:\\" + PUNCTUATION + r"|[^'\x00])*'"  # 'title'
    r")"
)

# Optional whitespace followed by the ")" that closes an inline link.
PAREN_END_RE = re.compile(r"\s*\)")

# HTML tag name: an ASCII letter followed by ASCII letters, digits or "-".
HTML_TAGNAME = r"[A-Za-z][A-Za-z0-9-]*"

# Zero or more HTML attributes. Each is whitespace, an attribute name, and an
# optional "=value" where the value is unquoted, single-quoted or
# double-quoted.
HTML_ATTRIBUTES = (
    r"(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*"
    r'(?:\s*=\s*(?:[^ !"\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
)
# Lowercase HTML element names, listed alphabetically.
# NOTE(review): judging by the name these are the tags treated as block-level
# HTML; the consumer is not visible in this file -- confirm against the
# block parser before relying on that.
BLOCK_TAGS = (
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "meta",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
)

# NOTE(review): presumably the tags whose content is kept verbatim by the
# HTML block rules; not used within this file -- confirm against the caller.
PRE_TAGS = ("pre", "script", "style", "textarea")
# A link label body followed by its closing "]"; used by parse_link_label.
_INLINE_LINK_LABEL_RE = re.compile(LINK_LABEL + r"\]")

# An unescaped "[" or "]"; used by parse_link_text. Because PREVENT_BACKSLASH
# consumes characters, a match may start with an even run of backslashes and
# only its last character is guaranteed to be the bracket.
_INLINE_SQUARE_BRACKET_RE = re.compile(PREVENT_BACKSLASH + r"[\[\]]")

# A backslash followed by one punctuation character (captured as group 1);
# used by unescape_char.
_ESCAPE_CHAR_RE = re.compile(r"\\(" + PUNCTUATION + r")")
  96. def unescape_char(text: str) -> str:
  97. return _ESCAPE_CHAR_RE.sub(r"\1", text)
  98. def parse_link_text(src: str, pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
  99. level = 1
  100. found = False
  101. start_pos = pos
  102. while pos < len(src):
  103. m = _INLINE_SQUARE_BRACKET_RE.search(src, pos)
  104. if not m:
  105. break
  106. pos = m.end()
  107. marker = m.group(0)
  108. if marker == "]":
  109. level -= 1
  110. if level == 0:
  111. found = True
  112. break
  113. else:
  114. level += 1
  115. if found:
  116. text = src[start_pos : pos - 1]
  117. return text, pos
  118. return None, None
  119. def parse_link_label(src: str, start_pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
  120. m = _INLINE_LINK_LABEL_RE.match(src, start_pos)
  121. if m:
  122. label = m.group(0)[:-1]
  123. return label, m.end()
  124. return None, None
  125. def parse_link_href(src: str, start_pos: int, block: bool = False) -> Union[Tuple[str, int], Tuple[None, None]]:
  126. m = LINK_BRACKET_START.match(src, start_pos)
  127. if m:
  128. start_pos = m.end() - 1
  129. m = LINK_BRACKET_RE.match(src, start_pos)
  130. if m:
  131. return m.group(1), m.end()
  132. return None, None
  133. if block:
  134. m = LINK_HREF_BLOCK_RE.match(src, start_pos)
  135. else:
  136. m = LINK_HREF_INLINE_RE.match(src, start_pos)
  137. if not m:
  138. return None, None
  139. end_pos = m.end()
  140. href = m.group(1)
  141. if block and src[end_pos - 1] == href[-1]:
  142. return href, end_pos
  143. return href, end_pos - 1
  144. def parse_link_title(src: str, start_pos: int, max_pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
  145. m = LINK_TITLE_RE.match(src, start_pos, max_pos)
  146. if m:
  147. title = m.group(1)[1:-1]
  148. title = unescape_char(title)
  149. return title, m.end()
  150. return None, None
  151. def parse_link(src: str, pos: int) -> Union[Tuple[Dict[str, Any], int], Tuple[None, None]]:
  152. href, href_pos = parse_link_href(src, pos)
  153. if href is None:
  154. return None, None
  155. assert href_pos is not None
  156. title, title_pos = parse_link_title(src, href_pos, len(src))
  157. next_pos = title_pos or href_pos
  158. m = PAREN_END_RE.match(src, next_pos)
  159. if not m:
  160. return None, None
  161. href = unescape_char(href)
  162. attrs = {"url": escape_url(href)}
  163. if title:
  164. attrs["title"] = title
  165. return attrs, m.end()