linkifier.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633
  1. import re
  2. from urllib.parse import quote
  3. from bleach import callbacks as linkify_callbacks
  4. from bleach import html5lib_shim
#: List of default callbacks. It is the default ``callbacks`` argument of both
#: ``Linker`` and ``LinkifyFilter``; out of the box it only contains
#: ``bleach.callbacks.nofollow``.
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
  7. TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
  8. ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
  9. cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
  10. dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
  11. gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
  12. im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
  13. kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
  14. ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
  15. net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
  16. pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
  17. sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
  18. tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
  19. xn xxx ye yt yu za zm zw""".split()
  20. # Make sure that .com doesn't get matched by .co first
  21. TLDS.reverse()
  22. def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
  23. """Builds the url regex used by linkifier
  24. If you want a different set of tlds or allowed protocols, pass those in
  25. and stomp on the existing ``url_re``::
  26. from bleach import linkifier
  27. my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
  28. linker = LinkifyFilter(url_re=my_url_re)
  29. """
  30. return re.compile(
  31. r"""\(* # Match any opening parentheses.
  32. \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
  33. ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
  34. (?:[/?][^\s\{{\}}\|\\\^`<>"]*)?
  35. # /path/zz (excluding "unsafe" chars from RFC 3986,
  36. # except for # and ~, which happen in practice)
  37. """.format(
  38. "|".join(sorted(protocols)), "|".join(sorted(tlds))
  39. ),
  40. re.IGNORECASE | re.VERBOSE | re.UNICODE,
  41. )
# Default url pattern, built from the module-level TLDS and the default
# protocols of build_url_re.
URL_RE = build_url_re()

# Matches a leading "scheme:" followed by up to three slashes. handle_links
# uses it to decide whether a matched url already carries a protocol or needs
# "http://" prepended.
PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
  44. def build_email_re(tlds=TLDS):
  45. """Builds the email regex used by linkifier
  46. If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::
  47. from bleach import linkifier
  48. my_email_re = linkifier.build_email_re(my_tlds_list)
  49. linker = LinkifyFilter(email_re=my_url_re)
  50. """
  51. # open and closing braces doubled below for format string
  52. return re.compile(
  53. r"""(?<!//)
  54. (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
  55. (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)* # dot-atom
  56. |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
  57. |\\[\001-\011\013\014\016-\177])*" # quoted-string
  58. )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0})) # domain
  59. """.format(
  60. "|".join(tlds)
  61. ),
  62. re.IGNORECASE | re.MULTILINE | re.VERBOSE,
  63. )
# Default email pattern, built from the module-level TLDS.
EMAIL_RE = build_email_re()
  65. class Linker:
  66. """Convert URL-like strings in an HTML fragment to links
  67. This function converts strings that look like URLs, domain names and email
  68. addresses in text that may be an HTML fragment to links, while preserving:
  69. 1. links already in the string
  70. 2. urls found in attributes
  71. 3. email addresses
  72. linkify does a best-effort approach and tries to recover from bad
  73. situations due to crazy text.
  74. """
  75. def __init__(
  76. self,
  77. callbacks=DEFAULT_CALLBACKS,
  78. skip_tags=None,
  79. parse_email=False,
  80. url_re=URL_RE,
  81. email_re=EMAIL_RE,
  82. recognized_tags=html5lib_shim.HTML_TAGS,
  83. ):
  84. """Creates a Linker instance
  85. :arg list callbacks: list of callbacks to run when adjusting tag attributes;
  86. defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
  87. :arg set skip_tags: set of tags that you don't want to linkify the
  88. contents of; for example, you could set this to ``{'pre'}`` to skip
  89. linkifying contents of ``pre`` tags; ``None`` means you don't
  90. want linkify to skip any tags
  91. :arg bool parse_email: whether or not to linkify email addresses
  92. :arg url_re: url matching regex
  93. :arg email_re: email matching regex
  94. :arg set recognized_tags: the set of tags that linkify knows about;
  95. everything else gets escaped
  96. :returns: linkified text as unicode
  97. """
  98. self.callbacks = callbacks
  99. self.skip_tags = skip_tags
  100. self.parse_email = parse_email
  101. self.url_re = url_re
  102. self.email_re = email_re
  103. # Create a parser/tokenizer that allows all HTML tags and escapes
  104. # anything not in that list.
  105. self.parser = html5lib_shim.BleachHTMLParser(
  106. tags=frozenset(recognized_tags),
  107. strip=False,
  108. consume_entities=False,
  109. namespaceHTMLElements=False,
  110. )
  111. self.walker = html5lib_shim.getTreeWalker("etree")
  112. self.serializer = html5lib_shim.BleachHTMLSerializer(
  113. quote_attr_values="always",
  114. omit_optional_tags=False,
  115. # We want to leave entities as they are without escaping or
  116. # resolving or expanding
  117. resolve_entities=False,
  118. # linkify does not sanitize
  119. sanitize=False,
  120. # linkify preserves attr order
  121. alphabetical_attributes=False,
  122. )
  123. def linkify(self, text):
  124. """Linkify specified text
  125. :arg str text: the text to add links to
  126. :returns: linkified text as unicode
  127. :raises TypeError: if ``text`` is not a text type
  128. """
  129. if not isinstance(text, str):
  130. raise TypeError("argument must be of text type")
  131. if not text:
  132. return ""
  133. dom = self.parser.parseFragment(text)
  134. filtered = LinkifyFilter(
  135. source=self.walker(dom),
  136. callbacks=self.callbacks,
  137. skip_tags=self.skip_tags,
  138. parse_email=self.parse_email,
  139. url_re=self.url_re,
  140. email_re=self.email_re,
  141. )
  142. return self.serializer.render(filtered)
  143. class LinkifyFilter(html5lib_shim.Filter):
  144. """html5lib filter that linkifies text
  145. This will do the following:
  146. * convert email addresses into links
  147. * convert urls into links
  148. * edit existing links by running them through callbacks--the default is to
  149. add a ``rel="nofollow"``
  150. This filter can be used anywhere html5lib filters can be used.
  151. """
  152. def __init__(
  153. self,
  154. source,
  155. callbacks=DEFAULT_CALLBACKS,
  156. skip_tags=None,
  157. parse_email=False,
  158. url_re=URL_RE,
  159. email_re=EMAIL_RE,
  160. ):
  161. """Creates a LinkifyFilter instance
  162. :arg source: stream as an html5lib TreeWalker
  163. :arg list callbacks: list of callbacks to run when adjusting tag attributes;
  164. defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
  165. :arg set skip_tags: set of tags that you don't want to linkify the
  166. contents of; for example, you could set this to ``{'pre'}`` to skip
  167. linkifying contents of ``pre`` tags
  168. :arg bool parse_email: whether or not to linkify email addresses
  169. :arg url_re: url matching regex
  170. :arg email_re: email matching regex
  171. """
  172. super().__init__(source)
  173. self.callbacks = callbacks or []
  174. self.skip_tags = skip_tags or {}
  175. self.parse_email = parse_email
  176. self.url_re = url_re
  177. self.email_re = email_re
  178. def apply_callbacks(self, attrs, is_new):
  179. """Given an attrs dict and an is_new bool, runs through callbacks
  180. Callbacks can return an adjusted attrs dict or ``None``. In the case of
  181. ``None``, we stop going through callbacks and return that and the link
  182. gets dropped.
  183. :arg dict attrs: map of ``(namespace, name)`` -> ``value``
  184. :arg bool is_new: whether or not this link was added by linkify
  185. :returns: adjusted attrs dict or ``None``
  186. """
  187. for cb in self.callbacks:
  188. attrs = cb(attrs, is_new)
  189. if attrs is None:
  190. return None
  191. return attrs
  192. def extract_character_data(self, token_list):
  193. """Extracts and squashes character sequences in a token stream"""
  194. # FIXME(willkg): This is a terrible idea. What it does is drop all the
  195. # tags from the token list and merge the Characters and SpaceCharacters
  196. # tokens into a single text.
  197. #
  198. # So something like this::
  199. #
  200. # "<span>" "<b>" "some text" "</b>" "</span>"
  201. #
  202. # gets converted to "some text".
  203. #
  204. # This gets used to figure out the ``_text`` fauxttribute value for
  205. # linkify callables.
  206. #
  207. # I'm not really sure how else to support that ``_text`` fauxttribute and
  208. # maintain some modicum of backwards compatibility with previous versions
  209. # of Bleach.
  210. out = []
  211. for token in token_list:
  212. token_type = token["type"]
  213. if token_type in ["Characters", "SpaceCharacters"]:
  214. out.append(token["data"])
  215. return "".join(out)
  216. def handle_email_addresses(self, src_iter):
  217. """Handle email addresses in character tokens"""
  218. for token in src_iter:
  219. if token["type"] == "Characters":
  220. text = token["data"]
  221. new_tokens = []
  222. end = 0
  223. # For each email address we find in the text
  224. for match in self.email_re.finditer(text):
  225. if match.start() > end:
  226. new_tokens.append(
  227. {"type": "Characters", "data": text[end : match.start()]}
  228. )
  229. # URL-encode the "local-part" according to RFC6068
  230. parts = match.group(0).split("@")
  231. parts[0] = quote(parts[0])
  232. address = "@".join(parts)
  233. # Run attributes through the callbacks to see what we
  234. # should do with this match
  235. attrs = {
  236. (None, "href"): "mailto:%s" % address,
  237. "_text": match.group(0),
  238. }
  239. attrs = self.apply_callbacks(attrs, True)
  240. if attrs is None:
  241. # Just add the text--but not as a link
  242. new_tokens.append(
  243. {"type": "Characters", "data": match.group(0)}
  244. )
  245. else:
  246. # Add an "a" tag for the new link
  247. _text = attrs.pop("_text", "")
  248. new_tokens.extend(
  249. [
  250. {"type": "StartTag", "name": "a", "data": attrs},
  251. {"type": "Characters", "data": str(_text)},
  252. {"type": "EndTag", "name": "a"},
  253. ]
  254. )
  255. end = match.end()
  256. if new_tokens:
  257. # Yield the adjusted set of tokens and then continue
  258. # through the loop
  259. if end < len(text):
  260. new_tokens.append({"type": "Characters", "data": text[end:]})
  261. yield from new_tokens
  262. continue
  263. yield token
  264. def strip_non_url_bits(self, fragment):
  265. """Strips non-url bits from the url
  266. This accounts for over-eager matching by the regex.
  267. """
  268. prefix = suffix = ""
  269. while fragment:
  270. # Try removing ( from the beginning and, if it's balanced, from the
  271. # end, too
  272. if fragment.startswith("("):
  273. prefix = prefix + "("
  274. fragment = fragment[1:]
  275. if fragment.endswith(")"):
  276. suffix = ")" + suffix
  277. fragment = fragment[:-1]
  278. continue
  279. # Now try extraneous things from the end. For example, sometimes we
  280. # pick up ) at the end of a url, but the url is in a parenthesized
  281. # phrase like:
  282. #
  283. # "i looked at the site (at http://example.com)"
  284. if fragment.endswith(")") and "(" not in fragment:
  285. fragment = fragment[:-1]
  286. suffix = ")" + suffix
  287. continue
  288. # Handle commas
  289. if fragment.endswith(","):
  290. fragment = fragment[:-1]
  291. suffix = "," + suffix
  292. continue
  293. # Handle periods
  294. if fragment.endswith("."):
  295. fragment = fragment[:-1]
  296. suffix = "." + suffix
  297. continue
  298. # Nothing matched, so we're done
  299. break
  300. return fragment, prefix, suffix
  301. def handle_links(self, src_iter):
  302. """Handle links in character tokens"""
  303. in_a = False # happens, if parse_email=True and if a mail was found
  304. for token in src_iter:
  305. if in_a:
  306. if token["type"] == "EndTag" and token["name"] == "a":
  307. in_a = False
  308. yield token
  309. continue
  310. elif token["type"] == "StartTag" and token["name"] == "a":
  311. in_a = True
  312. yield token
  313. continue
  314. if token["type"] == "Characters":
  315. text = token["data"]
  316. new_tokens = []
  317. end = 0
  318. for match in self.url_re.finditer(text):
  319. if match.start() > end:
  320. new_tokens.append(
  321. {"type": "Characters", "data": text[end : match.start()]}
  322. )
  323. url = match.group(0)
  324. prefix = suffix = ""
  325. # Sometimes we pick up too much in the url match, so look for
  326. # bits we should drop and remove them from the match
  327. url, prefix, suffix = self.strip_non_url_bits(url)
  328. # If there's no protocol, add one
  329. if PROTO_RE.search(url):
  330. href = url
  331. else:
  332. href = "http://%s" % url
  333. attrs = {(None, "href"): href, "_text": url}
  334. attrs = self.apply_callbacks(attrs, True)
  335. if attrs is None:
  336. # Just add the text
  337. new_tokens.append(
  338. {"type": "Characters", "data": prefix + url + suffix}
  339. )
  340. else:
  341. # Add the "a" tag!
  342. if prefix:
  343. new_tokens.append({"type": "Characters", "data": prefix})
  344. _text = attrs.pop("_text", "")
  345. new_tokens.extend(
  346. [
  347. {"type": "StartTag", "name": "a", "data": attrs},
  348. {"type": "Characters", "data": str(_text)},
  349. {"type": "EndTag", "name": "a"},
  350. ]
  351. )
  352. if suffix:
  353. new_tokens.append({"type": "Characters", "data": suffix})
  354. end = match.end()
  355. if new_tokens:
  356. # Yield the adjusted set of tokens and then continue
  357. # through the loop
  358. if end < len(text):
  359. new_tokens.append({"type": "Characters", "data": text[end:]})
  360. yield from new_tokens
  361. continue
  362. yield token
  363. def handle_a_tag(self, token_buffer):
  364. """Handle the "a" tag
  365. This could adjust the link or drop it altogether depending on what the
  366. callbacks return.
  367. This yields the new set of tokens.
  368. """
  369. a_token = token_buffer[0]
  370. if a_token["data"]:
  371. attrs = a_token["data"]
  372. else:
  373. attrs = {}
  374. text = self.extract_character_data(token_buffer)
  375. attrs["_text"] = text
  376. attrs = self.apply_callbacks(attrs, False)
  377. if attrs is None:
  378. # We're dropping the "a" tag and everything else and replacing
  379. # it with character data. So emit that token.
  380. yield {"type": "Characters", "data": text}
  381. else:
  382. new_text = attrs.pop("_text", "")
  383. a_token["data"] = attrs
  384. if text == new_text:
  385. # The callbacks didn't change the text, so we yield the new "a"
  386. # token, then whatever else was there, then the end "a" token
  387. yield a_token
  388. yield from token_buffer[1:]
  389. else:
  390. # If the callbacks changed the text, then we're going to drop
  391. # all the tokens between the start and end "a" tags and replace
  392. # it with the new text
  393. yield a_token
  394. yield {"type": "Characters", "data": str(new_text)}
  395. yield token_buffer[-1]
  396. def extract_entities(self, token):
  397. """Handles Characters tokens with entities
  398. Our overridden tokenizer doesn't do anything with entities. However,
  399. that means that the serializer will convert all ``&`` in Characters
  400. tokens to ``&amp;``.
  401. Since we don't want that, we extract entities here and convert them to
  402. Entity tokens so the serializer will let them be.
  403. :arg token: the Characters token to work on
  404. :returns: generator of tokens
  405. """
  406. data = token.get("data", "")
  407. # If there isn't a & in the data, we can return now
  408. if "&" not in data:
  409. yield token
  410. return
  411. new_tokens = []
  412. # For each possible entity that starts with a "&", we try to extract an
  413. # actual entity and re-tokenize accordingly
  414. for part in html5lib_shim.next_possible_entity(data):
  415. if not part:
  416. continue
  417. if part.startswith("&"):
  418. entity = html5lib_shim.match_entity(part)
  419. if entity is not None:
  420. if entity == "amp":
  421. # LinkifyFilter can't match urls across token boundaries
  422. # which is problematic with &amp; since that shows up in
  423. # querystrings all the time. This special-cases &amp;
  424. # and converts it to a & and sticks it in as a
  425. # Characters token. It'll get merged with surrounding
  426. # tokens in the BleachSanitizerfilter.__iter__ and
  427. # escaped in the serializer.
  428. new_tokens.append({"type": "Characters", "data": "&"})
  429. else:
  430. new_tokens.append({"type": "Entity", "name": entity})
  431. # Length of the entity plus 2--one for & at the beginning
  432. # and one for ; at the end
  433. remainder = part[len(entity) + 2 :]
  434. if remainder:
  435. new_tokens.append({"type": "Characters", "data": remainder})
  436. continue
  437. new_tokens.append({"type": "Characters", "data": part})
  438. yield from new_tokens
  439. def __iter__(self):
  440. in_a = False
  441. in_skip_tag = None
  442. token_buffer = []
  443. for token in super().__iter__():
  444. if in_a:
  445. # Handle the case where we're in an "a" tag--we want to buffer tokens
  446. # until we hit an end "a" tag.
  447. if token["type"] == "EndTag" and token["name"] == "a":
  448. # Add the end tag to the token buffer and then handle them
  449. # and yield anything returned
  450. token_buffer.append(token)
  451. yield from self.handle_a_tag(token_buffer)
  452. # Clear "a" related state and continue since we've yielded all
  453. # the tokens we're going to yield
  454. in_a = False
  455. token_buffer = []
  456. else:
  457. token_buffer.extend(list(self.extract_entities(token)))
  458. continue
  459. if token["type"] in ["StartTag", "EmptyTag"]:
  460. if token["name"] in self.skip_tags:
  461. # Skip tags start a "special mode" where we don't linkify
  462. # anything until the end tag.
  463. in_skip_tag = token["name"]
  464. elif token["name"] == "a":
  465. # The "a" tag is special--we switch to a slurp mode and
  466. # slurp all the tokens until the end "a" tag and then
  467. # figure out what to do with them there.
  468. in_a = True
  469. token_buffer.append(token)
  470. # We buffer the start tag, so we don't want to yield it,
  471. # yet
  472. continue
  473. elif in_skip_tag and self.skip_tags:
  474. # NOTE(willkg): We put this clause here since in_a and
  475. # switching in and out of in_a takes precedence.
  476. if token["type"] == "EndTag" and token["name"] == in_skip_tag:
  477. in_skip_tag = None
  478. elif not in_a and not in_skip_tag and token["type"] == "Characters":
  479. new_stream = iter([token])
  480. if self.parse_email:
  481. new_stream = self.handle_email_addresses(new_stream)
  482. new_stream = self.handle_links(new_stream)
  483. for new_token in new_stream:
  484. yield from self.extract_entities(new_token)
  485. # We've already yielded this token, so continue
  486. continue
  487. yield token