dammit.py 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516
  1. # -*- coding: utf-8 -*-
  2. """Beautiful Soup bonus library: Unicode, Dammit
  3. This library converts a bytestream to Unicode through any means
  4. necessary. It is heavily based on code from Mark Pilgrim's `Universal
  5. Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
  6. by Kurt McKee. It does not rewrite the body of an XML or HTML document
  7. to reflect a new encoding; that's the job of `TreeBuilder`.
  8. """
  9. # Use of this source code is governed by the MIT license.
  10. __license__ = "MIT"
  11. from html.entities import codepoint2name
  12. from collections import defaultdict
  13. import codecs
  14. from html.entities import html5
  15. import re
  16. from logging import Logger, getLogger
  17. from types import ModuleType
  18. from typing import (
  19. Dict,
  20. Iterator,
  21. List,
  22. Optional,
  23. Pattern,
  24. Set,
  25. Tuple,
  26. Type,
  27. Union,
  28. cast,
  29. )
  30. from typing_extensions import Literal
  31. from bs4._typing import (
  32. _Encoding,
  33. _Encodings,
  34. )
  35. import warnings
  36. # Import a library to autodetect character encodings. We'll support
  37. # any of a number of libraries that all support the same API:
  38. #
  39. # * cchardet
  40. # * chardet
  41. # * charset-normalizer
  42. chardet_module: Optional[ModuleType] = None
  43. try:
  44. # PyPI package: cchardet
  45. import cchardet # type:ignore
  46. chardet_module = cchardet
  47. except ImportError:
  48. try:
  49. # Debian package: python-chardet
  50. # PyPI package: chardet
  51. import chardet
  52. chardet_module = chardet
  53. except ImportError:
  54. try:
  55. # PyPI package: charset-normalizer
  56. import charset_normalizer # type:ignore
  57. chardet_module = charset_normalizer
  58. except ImportError:
  59. # No chardet available.
  60. pass
  61. def _chardet_dammit(s: bytes) -> Optional[str]:
  62. """Try as hard as possible to detect the encoding of a bytestring."""
  63. if chardet_module is None or isinstance(s, str):
  64. return None
  65. module = chardet_module
  66. return module.detect(s)["encoding"]
  67. # Build bytestring and Unicode versions of regular expressions for finding
  68. # a declared encoding inside an XML or HTML document.
  69. xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>" #: :meta private:
  70. html_meta: str = (
  71. "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]" #: :meta private:
  72. )
  73. # TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
  74. encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
  75. encoding_res[bytes] = {
  76. "html": re.compile(html_meta.encode("ascii"), re.I),
  77. "xml": re.compile(xml_encoding.encode("ascii"), re.I),
  78. }
  79. encoding_res[str] = {
  80. "html": re.compile(html_meta, re.I),
  81. "xml": re.compile(xml_encoding, re.I),
  82. }
  83. class EntitySubstitution(object):
  84. """The ability to substitute XML or HTML entities for certain characters."""
  85. #: A map of named HTML entities to the corresponding Unicode string.
  86. #:
  87. #: :meta hide-value:
  88. HTML_ENTITY_TO_CHARACTER: Dict[str, str]
  89. #: A map of Unicode strings to the corresponding named HTML entities;
  90. #: the inverse of HTML_ENTITY_TO_CHARACTER.
  91. #:
  92. #: :meta hide-value:
  93. CHARACTER_TO_HTML_ENTITY: Dict[str, str]
  94. #: A regular expression that matches any character (or, in rare
  95. #: cases, pair of characters) that can be replaced with a named
  96. #: HTML entity.
  97. #:
  98. #: :meta hide-value:
  99. CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]
  100. #: A very similar regular expression to
  101. #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
  102. #: ampersands. This is used by the 'html' formatted to provide
  103. #: backwards-compatibility, even though the HTML5 spec allows most
  104. #: ampersands to go unescaped.
  105. #:
  106. #: :meta hide-value:
  107. CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]
  108. @classmethod
  109. def _populate_class_variables(cls) -> None:
  110. """Initialize variables used by this class to manage the plethora of
  111. HTML5 named entities.
  112. This function sets the following class variables:
  113. CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
  114. entity names like "angmsdaa". When a single Unicode string has
  115. multiple entity names, we try to choose the most commonly-used
  116. name.
  117. HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
  118. Unicode strings like "⦨".
  119. CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
  120. Unicode string that corresponds to an HTML5 named entity.
  121. CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
  122. regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
  123. also matches unescaped ampersands. This is used by the 'html'
  124. formatted to provide backwards-compatibility, even though the HTML5
  125. spec allows most ampersands to go unescaped.
  126. """
  127. unicode_to_name = {}
  128. name_to_unicode = {}
  129. short_entities = set()
  130. long_entities_by_first_character = defaultdict(set)
  131. for name_with_semicolon, character in sorted(html5.items()):
  132. # "It is intentional, for legacy compatibility, that many
  133. # code points have multiple character reference names. For
  134. # example, some appear both with and without the trailing
  135. # semicolon, or with different capitalizations."
  136. # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
  137. #
  138. # The parsers are in charge of handling (or not) character
  139. # references with no trailing semicolon, so we remove the
  140. # semicolon whenever it appears.
  141. if name_with_semicolon.endswith(";"):
  142. name = name_with_semicolon[:-1]
  143. else:
  144. name = name_with_semicolon
  145. # When parsing HTML, we want to recognize any known named
  146. # entity and convert it to a sequence of Unicode
  147. # characters.
  148. if name not in name_to_unicode:
  149. name_to_unicode[name] = character
  150. # When _generating_ HTML, we want to recognize special
  151. # character sequences that _could_ be converted to named
  152. # entities.
  153. unicode_to_name[character] = name
  154. # We also need to build a regular expression that lets us
  155. # _find_ those characters in output strings so we can
  156. # replace them.
  157. #
  158. # This is tricky, for two reasons.
  159. if len(character) == 1 and ord(character) < 128 and character not in "<>":
  160. # First, it would be annoying to turn single ASCII
  161. # characters like | into named entities like
  162. # &verbar;. The exceptions are <>, which we _must_
  163. # turn into named entities to produce valid HTML.
  164. continue
  165. if len(character) > 1 and all(ord(x) < 128 for x in character):
  166. # We also do not want to turn _combinations_ of ASCII
  167. # characters like 'fj' into named entities like '&fjlig;',
  168. # though that's more debateable.
  169. continue
  170. # Second, some named entities have a Unicode value that's
  171. # a subset of the Unicode value for some _other_ named
  172. # entity. As an example, \u2267' is &GreaterFullEqual;,
  173. # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
  174. # expression needs to match the first two characters of
  175. # "\u2267\u0338foo", but only the first character of
  176. # "\u2267foo".
  177. #
  178. # In this step, we build two sets of characters that
  179. # _eventually_ need to go into the regular expression. But
  180. # we won't know exactly what the regular expression needs
  181. # to look like until we've gone through the entire list of
  182. # named entities.
  183. if len(character) == 1 and character != "&":
  184. short_entities.add(character)
  185. else:
  186. long_entities_by_first_character[character[0]].add(character)
  187. # Now that we've been through the entire list of entities, we
  188. # can create a regular expression that matches any of them.
  189. particles = set()
  190. for short in short_entities:
  191. long_versions = long_entities_by_first_character[short]
  192. if not long_versions:
  193. particles.add(short)
  194. else:
  195. ignore = "".join([x[1] for x in long_versions])
  196. # This finds, e.g. \u2267 but only if it is _not_
  197. # followed by \u0338.
  198. particles.add("%s(?![%s])" % (short, ignore))
  199. for long_entities in list(long_entities_by_first_character.values()):
  200. for long_entity in long_entities:
  201. particles.add(long_entity)
  202. re_definition = "(%s)" % "|".join(particles)
  203. particles.add("&")
  204. re_definition_with_ampersand = "(%s)" % "|".join(particles)
  205. # If an entity shows up in both html5 and codepoint2name, it's
  206. # likely that HTML5 gives it several different names, such as
  207. # 'rsquo' and 'rsquor'. When converting Unicode characters to
  208. # named entities, the codepoint2name name should take
  209. # precedence where possible, since that's the more easily
  210. # recognizable one.
  211. for codepoint, name in list(codepoint2name.items()):
  212. character = chr(codepoint)
  213. unicode_to_name[character] = name
  214. cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
  215. cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
  216. cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
  217. cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
  218. re_definition_with_ampersand
  219. )
  220. #: A map of Unicode strings to the corresponding named XML entities.
  221. #:
  222. #: :meta hide-value:
  223. CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
  224. "'": "apos",
  225. '"': "quot",
  226. "&": "amp",
  227. "<": "lt",
  228. ">": "gt",
  229. }
  230. # Matches any named or numeric HTML entity.
  231. ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)
  232. #: A regular expression matching an angle bracket or an ampersand that
  233. #: is not part of an XML or HTML entity.
  234. #:
  235. #: :meta hide-value:
  236. BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
  237. "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
  238. )
  239. #: A regular expression matching an angle bracket or an ampersand.
  240. #:
  241. #: :meta hide-value:
  242. AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")
  243. @classmethod
  244. def _substitute_html_entity(cls, matchobj: re.Match) -> str:
  245. """Used with a regular expression to substitute the
  246. appropriate HTML entity for a special character string."""
  247. original_entity = matchobj.group(0)
  248. entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
  249. if entity is None:
  250. return "&amp;%s;" % original_entity
  251. return "&%s;" % entity
  252. @classmethod
  253. def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
  254. """Used with a regular expression to substitute the
  255. appropriate XML entity for a special character string."""
  256. entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
  257. return "&%s;" % entity
  258. @classmethod
  259. def _escape_entity_name(cls, matchobj: re.Match) -> str:
  260. return "&amp;%s;" % matchobj.group(1)
  261. @classmethod
  262. def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
  263. possible_entity = matchobj.group(1)
  264. if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
  265. return "&%s;" % possible_entity
  266. return "&amp;%s;" % possible_entity
  267. @classmethod
  268. def quoted_attribute_value(cls, value: str) -> str:
  269. """Make a value into a quoted XML attribute, possibly escaping it.
  270. Most strings will be quoted using double quotes.
  271. Bob's Bar -> "Bob's Bar"
  272. If a string contains double quotes, it will be quoted using
  273. single quotes.
  274. Welcome to "my bar" -> 'Welcome to "my bar"'
  275. If a string contains both single and double quotes, the
  276. double quotes will be escaped, and the string will be quoted
  277. using double quotes.
  278. Welcome to "Bob's Bar" -> Welcome to &quot;Bob's bar&quot;
  279. :param value: The XML attribute value to quote
  280. :return: The quoted value
  281. """
  282. quote_with = '"'
  283. if '"' in value:
  284. if "'" in value:
  285. # The string contains both single and double
  286. # quotes. Turn the double quotes into
  287. # entities. We quote the double quotes rather than
  288. # the single quotes because the entity name is
  289. # "&quot;" whether this is HTML or XML. If we
  290. # quoted the single quotes, we'd have to decide
  291. # between &apos; and &squot;.
  292. replace_with = "&quot;"
  293. value = value.replace('"', replace_with)
  294. else:
  295. # There are double quotes but no single quotes.
  296. # We can use single quotes to quote the attribute.
  297. quote_with = "'"
  298. return quote_with + value + quote_with
  299. @classmethod
  300. def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
  301. """Replace special XML characters with named XML entities.
  302. The less-than sign will become &lt;, the greater-than sign
  303. will become &gt;, and any ampersands will become &amp;. If you
  304. want ampersands that seem to be part of an entity definition
  305. to be left alone, use `substitute_xml_containing_entities`
  306. instead.
  307. :param value: A string to be substituted.
  308. :param make_quoted_attribute: If True, then the string will be
  309. quoted, as befits an attribute value.
  310. :return: A version of ``value`` with special characters replaced
  311. with named entities.
  312. """
  313. # Escape angle brackets and ampersands.
  314. value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
  315. if make_quoted_attribute:
  316. value = cls.quoted_attribute_value(value)
  317. return value
  318. @classmethod
  319. def substitute_xml_containing_entities(
  320. cls, value: str, make_quoted_attribute: bool = False
  321. ) -> str:
  322. """Substitute XML entities for special XML characters.
  323. :param value: A string to be substituted. The less-than sign will
  324. become &lt;, the greater-than sign will become &gt;, and any
  325. ampersands that are not part of an entity defition will
  326. become &amp;.
  327. :param make_quoted_attribute: If True, then the string will be
  328. quoted, as befits an attribute value.
  329. """
  330. # Escape angle brackets, and ampersands that aren't part of
  331. # entities.
  332. value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
  333. if make_quoted_attribute:
  334. value = cls.quoted_attribute_value(value)
  335. return value
  336. @classmethod
  337. def substitute_html(cls, s: str) -> str:
  338. """Replace certain Unicode characters with named HTML entities.
  339. This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
  340. in that the goal is to make the result more readable (to those
  341. with ASCII displays) rather than to recover from
  342. errors. There's absolutely nothing wrong with a UTF-8 string
  343. containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
  344. character with "&eacute;" will make it more readable to some
  345. people.
  346. :param s: The string to be modified.
  347. :return: The string with some Unicode characters replaced with
  348. HTML entities.
  349. """
  350. # Convert any appropriate characters to HTML entities.
  351. return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
  352. cls._substitute_html_entity, s
  353. )
  354. @classmethod
  355. def substitute_html5(cls, s: str) -> str:
  356. """Replace certain Unicode characters with named HTML entities
  357. using HTML5 rules.
  358. Specifically, this method is much less aggressive about
  359. escaping ampersands than substitute_html. Only ambiguous
  360. ampersands are escaped, per the HTML5 standard:
  361. "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
  362. that is followed by one or more ASCII alphanumerics, followed
  363. by a U+003B SEMICOLON character (;), where these characters do
  364. not match any of the names given in the named character
  365. references section."
  366. Unlike substitute_html5_raw, this method assumes HTML entities
  367. were converted to Unicode characters on the way in, as
  368. Beautiful Soup does. By the time Beautiful Soup does its work,
  369. the only ambiguous ampersands that need to be escaped are the
  370. ones that were escaped in the original markup when mentioning
  371. HTML entities.
  372. :param s: The string to be modified.
  373. :return: The string with some Unicode characters replaced with
  374. HTML entities.
  375. """
  376. # First, escape any HTML entities found in the markup.
  377. s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)
  378. # Next, convert any appropriate characters to unescaped HTML entities.
  379. s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
  380. return s
  381. @classmethod
  382. def substitute_html5_raw(cls, s: str) -> str:
  383. """Replace certain Unicode characters with named HTML entities
  384. using HTML5 rules.
  385. substitute_html5_raw is similar to substitute_html5 but it is
  386. designed for standalone use (whereas substitute_html5 is
  387. designed for use with Beautiful Soup).
  388. :param s: The string to be modified.
  389. :return: The string with some Unicode characters replaced with
  390. HTML entities.
  391. """
  392. # First, escape the ampersand for anything that looks like an
  393. # entity but isn't in the list of recognized entities. All other
  394. # ampersands can be left alone.
  395. s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)
  396. # Then, convert a range of Unicode characters to unescaped
  397. # HTML entities.
  398. s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
  399. return s
  400. EntitySubstitution._populate_class_variables()
  401. class EncodingDetector:
  402. """This class is capable of guessing a number of possible encodings
  403. for a bytestring.
  404. Order of precedence:
  405. 1. Encodings you specifically tell EncodingDetector to try first
  406. (the ``known_definite_encodings`` argument to the constructor).
  407. 2. An encoding determined by sniffing the document's byte-order mark.
  408. 3. Encodings you specifically tell EncodingDetector to try if
  409. byte-order mark sniffing fails (the ``user_encodings`` argument to the
  410. constructor).
  411. 4. An encoding declared within the bytestring itself, either in an
  412. XML declaration (if the bytestring is to be interpreted as an XML
  413. document), or in a <meta> tag (if the bytestring is to be
  414. interpreted as an HTML document.)
  415. 5. An encoding detected through textual analysis by chardet,
  416. cchardet, or a similar external library.
  417. 6. UTF-8.
  418. 7. Windows-1252.
  419. :param markup: Some markup in an unknown encoding.
  420. :param known_definite_encodings: When determining the encoding
  421. of ``markup``, these encodings will be tried first, in
  422. order. In HTML terms, this corresponds to the "known
  423. definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
  424. :param user_encodings: These encodings will be tried after the
  425. ``known_definite_encodings`` have been tried and failed, and
  426. after an attempt to sniff the encoding by looking at a
  427. byte order mark has failed. In HTML terms, this
  428. corresponds to the step "user has explicitly instructed
  429. the user agent to override the document's character
  430. encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
  431. :param override_encodings: A **deprecated** alias for
  432. ``known_definite_encodings``. Any encodings here will be tried
  433. immediately after the encodings in
  434. ``known_definite_encodings``.
  435. :param is_html: If True, this markup is considered to be
  436. HTML. Otherwise it's assumed to be XML.
  437. :param exclude_encodings: These encodings will not be tried,
  438. even if they otherwise would be.
  439. """
  440. def __init__(
  441. self,
  442. markup: bytes,
  443. known_definite_encodings: Optional[_Encodings] = None,
  444. is_html: Optional[bool] = False,
  445. exclude_encodings: Optional[_Encodings] = None,
  446. user_encodings: Optional[_Encodings] = None,
  447. override_encodings: Optional[_Encodings] = None,
  448. ):
  449. self.known_definite_encodings = list(known_definite_encodings or [])
  450. if override_encodings:
  451. warnings.warn(
  452. "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
  453. DeprecationWarning,
  454. stacklevel=3,
  455. )
  456. self.known_definite_encodings += override_encodings
  457. self.user_encodings = user_encodings or []
  458. exclude_encodings = exclude_encodings or []
  459. self.exclude_encodings = set([x.lower() for x in exclude_encodings])
  460. self.chardet_encoding = None
  461. self.is_html = False if is_html is None else is_html
  462. self.declared_encoding: Optional[str] = None
  463. # First order of business: strip a byte-order mark.
  464. self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
  465. known_definite_encodings: _Encodings
  466. user_encodings: _Encodings
  467. exclude_encodings: _Encodings
  468. chardet_encoding: Optional[_Encoding]
  469. is_html: bool
  470. declared_encoding: Optional[_Encoding]
  471. markup: bytes
  472. sniffed_encoding: Optional[_Encoding]
  473. def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
  474. """Should we even bother to try this encoding?
  475. :param encoding: Name of an encoding.
  476. :param tried: Encodings that have already been tried. This
  477. will be modified as a side effect.
  478. """
  479. if encoding is None:
  480. return False
  481. encoding = encoding.lower()
  482. if encoding in self.exclude_encodings:
  483. return False
  484. if encoding not in tried:
  485. tried.add(encoding)
  486. return True
  487. return False
  488. @property
  489. def encodings(self) -> Iterator[_Encoding]:
  490. """Yield a number of encodings that might work for this markup.
  491. :yield: A sequence of strings. Each is the name of an encoding
  492. that *might* work to convert a bytestring into Unicode.
  493. """
  494. tried: Set[_Encoding] = set()
  495. # First, try the known definite encodings
  496. for e in self.known_definite_encodings:
  497. if self._usable(e, tried):
  498. yield e
  499. # Did the document originally start with a byte-order mark
  500. # that indicated its encoding?
  501. if self.sniffed_encoding is not None and self._usable(
  502. self.sniffed_encoding, tried
  503. ):
  504. yield self.sniffed_encoding
  505. # Sniffing the byte-order mark did nothing; try the user
  506. # encodings.
  507. for e in self.user_encodings:
  508. if self._usable(e, tried):
  509. yield e
  510. # Look within the document for an XML or HTML encoding
  511. # declaration.
  512. if self.declared_encoding is None:
  513. self.declared_encoding = self.find_declared_encoding(
  514. self.markup, self.is_html
  515. )
  516. if self.declared_encoding is not None and self._usable(
  517. self.declared_encoding, tried
  518. ):
  519. yield self.declared_encoding
  520. # Use third-party character set detection to guess at the
  521. # encoding.
  522. if self.chardet_encoding is None:
  523. self.chardet_encoding = _chardet_dammit(self.markup)
  524. if self.chardet_encoding is not None and self._usable(
  525. self.chardet_encoding, tried
  526. ):
  527. yield self.chardet_encoding
  528. # As a last-ditch effort, try utf-8 and windows-1252.
  529. for e in ("utf-8", "windows-1252"):
  530. if self._usable(e, tried):
  531. yield e
  532. @classmethod
  533. def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
  534. """If a byte-order mark is present, strip it and return the encoding it implies.
  535. :param data: A bytestring that may or may not begin with a
  536. byte-order mark.
  537. :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
  538. """
  539. encoding = None
  540. if isinstance(data, str):
  541. # Unicode data cannot have a byte-order mark.
  542. return data, encoding
  543. if (
  544. (len(data) >= 4)
  545. and (data[:2] == b"\xfe\xff")
  546. and (data[2:4] != b"\x00\x00")
  547. ):
  548. encoding = "utf-16be"
  549. data = data[2:]
  550. elif (
  551. (len(data) >= 4)
  552. and (data[:2] == b"\xff\xfe")
  553. and (data[2:4] != b"\x00\x00")
  554. ):
  555. encoding = "utf-16le"
  556. data = data[2:]
  557. elif data[:3] == b"\xef\xbb\xbf":
  558. encoding = "utf-8"
  559. data = data[3:]
  560. elif data[:4] == b"\x00\x00\xfe\xff":
  561. encoding = "utf-32be"
  562. data = data[4:]
  563. elif data[:4] == b"\xff\xfe\x00\x00":
  564. encoding = "utf-32le"
  565. data = data[4:]
  566. return data, encoding
  @classmethod
  def find_declared_encoding(
      cls,
      markup: Union[bytes, str],
      is_html: bool = False,
      search_entire_document: bool = False,
  ) -> Optional[_Encoding]:
      """Look inside a document for a declaration of its own encoding.

      The XML declaration pattern is tried first. The HTML ``<meta>``
      pattern is tried only when ``is_html`` is true and the XML
      pattern did not match.

      :param markup: Some markup, as bytes or as a string.
      :param is_html: If True, this markup is considered to be HTML.
          Otherwise it's assumed to be XML.
      :param search_entire_document: Since an encoding is supposed to
          be declared near the beginning of the document, only a
          prefix is searched by default: the first 1024 characters
          for the XML pattern, and the first 2048 characters (or 5%
          of the document, whichever is larger) for the HTML pattern.
          Set this to True to search the entire document.

      :return: The declared encoding, lower-cased, or None if no
          declaration was found.
      """
      if search_entire_document:
          xml_limit = html_limit = len(markup)
      else:
          xml_limit = 1024
          html_limit = max(2048, int(len(markup) * 0.05))

      # The patterns are keyed by the type of the markup so that bytes
      # are searched with bytes patterns and strings with str patterns.
      patterns = encoding_res[bytes if isinstance(markup, bytes) else str]
      xml_pattern = patterns["xml"]
      html_pattern = patterns["html"]

      match = xml_pattern.search(markup, endpos=xml_limit)
      if match is None and is_html:
          match = html_pattern.search(markup, endpos=html_limit)
      if match is None:
          return None

      found = match.groups()[0]
      if not found:
          return None
      if isinstance(found, bytes):
          found = found.decode("ascii", "replace")
      return found.lower()
  611. class UnicodeDammit:
  612. """A class for detecting the encoding of a bytestring containing an
  613. HTML or XML document, and decoding it to Unicode. If the source
  614. encoding is windows-1252, `UnicodeDammit` can also replace
  615. Microsoft smart quotes with their HTML or XML equivalents.
  616. :param markup: HTML or XML markup in an unknown encoding.
  617. :param known_definite_encodings: When determining the encoding
  618. of ``markup``, these encodings will be tried first, in
  619. order. In HTML terms, this corresponds to the "known
  620. definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
  621. :param user_encodings: These encodings will be tried after the
  622. ``known_definite_encodings`` have been tried and failed, and
  623. after an attempt to sniff the encoding by looking at a
  624. byte order mark has failed. In HTML terms, this
  625. corresponds to the step "user has explicitly instructed
  626. the user agent to override the document's character
  627. encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
  628. :param override_encodings: A **deprecated** alias for
  629. ``known_definite_encodings``. Any encodings here will be tried
  630. immediately after the encodings in
  631. ``known_definite_encodings``.
  632. :param smart_quotes_to: By default, Microsoft smart quotes will,
  633. like all other characters, be converted to Unicode
  634. characters. Setting this to ``ascii`` will convert them to ASCII
  635. quotes instead. Setting it to ``xml`` will convert them to XML
  636. entity references, and setting it to ``html`` will convert them
  637. to HTML entity references.
  638. :param is_html: If True, ``markup`` is treated as an HTML
  639. document. Otherwise it's treated as an XML document.
  640. :param exclude_encodings: These encodings will not be considered,
  641. even if the sniffing code thinks they might make sense.
  642. """
  def __init__(
      self,
      markup: bytes,
      known_definite_encodings: Optional[_Encodings] = [],
      smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
      is_html: bool = False,
      exclude_encodings: Optional[_Encodings] = [],
      user_encodings: Optional[_Encodings] = None,
      override_encodings: Optional[_Encodings] = None,
  ):
      """Detect the encoding of ``markup`` and decode it to Unicode.

      The parameters are described in the class docstring. After
      construction, `unicode_markup` holds the decoded text (or None
      if nothing worked) and `original_encoding` holds the encoding
      that produced it.

      If ``markup`` is already a `str`, no detection is performed:
      `unicode_markup` is the string itself, `markup` is its UTF-8
      encoding and `original_encoding` is None.
      """
      # NOTE(review): the two list defaults are shared between calls.
      # This method only forwards them to EncodingDetector; confirm
      # that EncodingDetector never mutates them in place.
      self.smart_quotes_to = smart_quotes_to
      self.tried_encodings = []
      self.contains_replacement_characters = False
      self.is_html = is_html
      self.log = getLogger(__name__)
      self.detector = EncodingDetector(
          markup,
          known_definite_encodings,
          is_html,
          exclude_encodings,
          user_encodings,
          override_encodings,
      )

      # Short-circuit if the data is in Unicode to begin with.
      if isinstance(markup, str):
          self.markup = markup.encode("utf8")
          self.unicode_markup = markup
          self.original_encoding = None
          return

      # The encoding detector may have stripped a byte-order mark.
      # Use the stripped markup from this point on.
      self.markup = self.detector.markup

      # First pass: accept the first encoding that decodes cleanly.
      u = None
      for encoding in self.detector.encodings:
          u = self._convert_from(encoding)
          if u is not None:
              break

      # Compare against None rather than testing truthiness: an empty
      # document decodes successfully to "", and must not be treated
      # as a failure (which would log a bogus warning and set
      # contains_replacement_characters).
      if u is None:
          # None of the encodings worked. As an absolute last resort,
          # try them again with character replacement.
          for encoding in self.detector.encodings:
              if encoding == "ascii":
                  continue
              u = self._convert_from(encoding, "replace")
              if u is not None:
                  self.log.warning(
                      "Some characters could not be decoded, and were "
                      "replaced with REPLACEMENT CHARACTER."
                  )
                  self.contains_replacement_characters = True
                  break

      # If none of that worked, we could at this point force it to
      # ASCII, but that would destroy so much data that I think
      # giving up is better.
      #
      # Note that this is extremely unlikely, probably impossible,
      # because the "replace" strategy is so powerful. Even running
      # the Python binary through Unicode, Dammit gives you Unicode,
      # albeit Unicode riddled with REPLACEMENT CHARACTER.
      if u is None:
          self.original_encoding = None
          self.unicode_markup = None
      else:
          self.unicode_markup = u
  #: The original markup, before it was converted to Unicode.
  #: This is not necessarily the same as what was passed in to the
  #: constructor, since any byte-order mark will be stripped. If the
  #: constructor was given a `str`, this is its UTF-8 encoding.
  markup: bytes

  #: The Unicode version of the markup, following conversion. This
  #: is set to None if there was simply no way to convert the
  #: bytestring to Unicode (as with binary data).
  unicode_markup: Optional[str]

  #: This is True if `UnicodeDammit.unicode_markup` contains
  #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
  #: in `UnicodeDammit.markup`. These mark character sequences that
  #: could not be represented in Unicode.
  contains_replacement_characters: bool

  #: Unicode, Dammit's best guess as to the original character
  #: encoding of `UnicodeDammit.markup`. This is None if the markup
  #: was already a `str` or if no encoding could decode it.
  original_encoding: Optional[_Encoding]

  #: The strategy used to handle Microsoft smart quotes.
  smart_quotes_to: Optional[str]

  #: The (encoding, error handling strategy) 2-tuples that were used to
  #: try and convert the markup to Unicode, in the order they were tried.
  tried_encodings: List[Tuple[_Encoding, str]]

  log: Logger  #: :meta private:
  729. def _sub_ms_char(self, match: re.Match) -> bytes:
  730. """Changes a MS smart quote character to an XML or HTML
  731. entity, or an ASCII character.
  732. TODO: Since this is only used to convert smart quotes, it
  733. could be simplified, and MS_CHARS_TO_ASCII made much less
  734. parochial.
  735. """
  736. orig: bytes = match.group(1)
  737. sub: bytes
  738. if self.smart_quotes_to == "ascii":
  739. if orig in self.MS_CHARS_TO_ASCII:
  740. sub = self.MS_CHARS_TO_ASCII[orig].encode()
  741. else:
  742. # Shouldn't happen; substitute the character
  743. # with itself.
  744. sub = orig
  745. else:
  746. if orig in self.MS_CHARS:
  747. substitutions = self.MS_CHARS[orig]
  748. if type(substitutions) is tuple:
  749. if self.smart_quotes_to == "xml":
  750. sub = b"&#x" + substitutions[1].encode() + b";"
  751. else:
  752. sub = b"&" + substitutions[0].encode() + b";"
  753. else:
  754. substitutions = cast(str, substitutions)
  755. sub = substitutions.encode()
  756. else:
  757. # Shouldn't happen; substitute the character
  758. # for itself.
  759. sub = orig
  760. return sub
  #: This dictionary maps commonly seen values for "charset" in HTML
  #: meta tags to the corresponding Python codec names. It only covers
  #: values that aren't in Python's aliases and can't be determined
  #: by the heuristics in `find_codec`. `find_codec` looks keys up
  #: exactly as given, so the match is case-sensitive.
  #:
  #: :meta hide-value:
  CHARSET_ALIASES: Dict[str, _Encoding] = {
      "macintosh": "mac-roman",
      "x-sjis": "shift-jis",
  }

  #: A list of encodings that tend to contain Microsoft smart quotes.
  #: `_convert_from` compares the lower-cased codec name returned by
  #: `find_codec` against this list, so entries must be lower-case.
  #:
  #: :meta hide-value:
  ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
      "windows-1252",
      "iso-8859-1",
      "iso-8859-2",
  ]
  779. def _convert_from(
  780. self, proposed: _Encoding, errors: str = "strict"
  781. ) -> Optional[str]:
  782. """Attempt to convert the markup to the proposed encoding.
  783. :param proposed: The name of a character encoding.
  784. :param errors: An error handling strategy, used when calling `str`.
  785. :return: The converted markup, or `None` if the proposed
  786. encoding/error handling strategy didn't work.
  787. """
  788. lookup_result = self.find_codec(proposed)
  789. if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
  790. return None
  791. proposed = lookup_result
  792. self.tried_encodings.append((proposed, errors))
  793. markup = self.markup
  794. # Convert smart quotes to HTML if coming from an encoding
  795. # that might have them.
  796. if (
  797. self.smart_quotes_to is not None
  798. and proposed in self.ENCODINGS_WITH_SMART_QUOTES
  799. ):
  800. smart_quotes_re = b"([\x80-\x9f])"
  801. smart_quotes_compiled = re.compile(smart_quotes_re)
  802. markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
  803. try:
  804. # print("Trying to convert document to %s (errors=%s)" % (
  805. # proposed, errors))
  806. u = self._to_unicode(markup, proposed, errors)
  807. self.unicode_markup = u
  808. self.original_encoding = proposed
  809. except Exception:
  810. # print("That didn't work!")
  811. # print(e)
  812. return None
  813. # print("Correct encoding: %s" % proposed)
  814. return self.unicode_markup
  815. def _to_unicode(
  816. self, data: bytes, encoding: _Encoding, errors: str = "strict"
  817. ) -> str:
  818. """Given a bytestring and its encoding, decodes the string into Unicode.
  819. :param encoding: The name of an encoding.
  820. :param errors: An error handling strategy, used when calling `str`.
  821. """
  822. return str(data, encoding, errors)
  823. @property
  824. def declared_html_encoding(self) -> Optional[_Encoding]:
  825. """If the markup is an HTML document, returns the encoding, if any,
  826. declared *inside* the document.
  827. """
  828. if not self.is_html:
  829. return None
  830. return self.detector.declared_encoding
  831. def find_codec(self, charset: _Encoding) -> Optional[str]:
  832. """Look up the Python codec corresponding to a given character set.
  833. :param charset: The name of a character set.
  834. :return: The name of a Python codec.
  835. """
  836. value = (
  837. self._codec(self.CHARSET_ALIASES.get(charset, charset))
  838. or (charset and self._codec(charset.replace("-", "")))
  839. or (charset and self._codec(charset.replace("-", "_")))
  840. or (charset and charset.lower())
  841. or charset
  842. )
  843. if value:
  844. return value.lower()
  845. return None
  846. def _codec(self, charset: _Encoding) -> Optional[str]:
  847. if not charset:
  848. return charset
  849. codec = None
  850. try:
  851. codecs.lookup(charset)
  852. codec = charset
  853. except (LookupError, ValueError):
  854. pass
  855. return codec
  #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
  #:
  #: Each key is a single byte in the range 0x80-0x9F. A tuple value is
  #: (HTML entity name, hexadecimal code point): `_sub_ms_char` emits
  #: ``&name;`` for HTML and ``&#xHEX;`` for XML. A plain string value
  #: is a literal replacement used for both.
  #:
  #: :meta hide-value:
  MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
      b"\x80": ("euro", "20AC"),
      b"\x81": " ",
      b"\x82": ("sbquo", "201A"),
      b"\x83": ("fnof", "192"),
      b"\x84": ("bdquo", "201E"),
      b"\x85": ("hellip", "2026"),
      b"\x86": ("dagger", "2020"),
      b"\x87": ("Dagger", "2021"),
      b"\x88": ("circ", "2C6"),
      b"\x89": ("permil", "2030"),
      b"\x8a": ("Scaron", "160"),
      b"\x8b": ("lsaquo", "2039"),
      b"\x8c": ("OElig", "152"),
      b"\x8d": "?",
      b"\x8e": ("#x17D", "17D"),
      b"\x8f": "?",
      b"\x90": "?",
      b"\x91": ("lsquo", "2018"),
      b"\x92": ("rsquo", "2019"),
      b"\x93": ("ldquo", "201C"),
      b"\x94": ("rdquo", "201D"),
      b"\x95": ("bull", "2022"),
      b"\x96": ("ndash", "2013"),
      b"\x97": ("mdash", "2014"),
      b"\x98": ("tilde", "2DC"),
      b"\x99": ("trade", "2122"),
      b"\x9a": ("scaron", "161"),
      b"\x9b": ("rsaquo", "203A"),
      b"\x9c": ("oelig", "153"),
      b"\x9d": "?",
      b"\x9e": ("#x17E", "17E"),
      # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. The code point
      # must not be empty, or XML output becomes the invalid "&#x;".
      b"\x9f": ("Yuml", "178"),
  }
  #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
  #: horrors like stripping diacritical marks to turn á into a, but also
  #: contains non-horrors like turning “ into ".
  #:
  #: Seriously, don't use this for anything other than removing smart
  #: quotes.
  #:
  #: Keys are single bytes from 0x80 to 0xFF; values are the ASCII text
  #: substituted for them. The smart-quote pattern in `_convert_from`
  #: only matches bytes 0x80-0x9F, so that is the only part of this
  #: table that `_sub_ms_char` reaches through that path.
  #:
  #: :meta private:
  MS_CHARS_TO_ASCII: Dict[bytes, str] = {
      b"\x80": "EUR",
      b"\x81": " ",
      b"\x82": ",",
      b"\x83": "f",
      b"\x84": ",,",
      b"\x85": "...",
      b"\x86": "+",
      b"\x87": "++",
      b"\x88": "^",
      b"\x89": "%",
      b"\x8a": "S",
      b"\x8b": "<",
      b"\x8c": "OE",
      b"\x8d": "?",
      b"\x8e": "Z",
      b"\x8f": "?",
      b"\x90": "?",
      b"\x91": "'",
      b"\x92": "'",
      b"\x93": '"',
      b"\x94": '"',
      b"\x95": "*",
      b"\x96": "-",
      b"\x97": "--",
      b"\x98": "~",
      b"\x99": "(TM)",
      b"\x9a": "s",
      b"\x9b": ">",
      b"\x9c": "oe",
      b"\x9d": "?",
      b"\x9e": "z",
      b"\x9f": "Y",
      b"\xa0": " ",
      b"\xa1": "!",
      b"\xa2": "c",
      b"\xa3": "GBP",
      b"\xa4": "$",  # This approximation is especially parochial--this is the
      # generic currency symbol.
      b"\xa5": "YEN",
      b"\xa6": "|",
      b"\xa7": "S",
      b"\xa8": "..",
      # NOTE(review): the copyright sign maps to the empty string, unlike
      # "(R)" and "(TM)" elsewhere in this table -- confirm this is intended.
      b"\xa9": "",
      b"\xaa": "(th)",
      b"\xab": "<<",
      b"\xac": "!",
      b"\xad": " ",
      b"\xae": "(R)",
      b"\xaf": "-",
      b"\xb0": "o",
      b"\xb1": "+-",
      b"\xb2": "2",
      b"\xb3": "3",
      b"\xb4": "'",
      b"\xb5": "u",
      b"\xb6": "P",
      b"\xb7": "*",
      b"\xb8": ",",
      b"\xb9": "1",
      b"\xba": "(th)",
      b"\xbb": ">>",
      b"\xbc": "1/4",
      b"\xbd": "1/2",
      b"\xbe": "3/4",
      b"\xbf": "?",
      b"\xc0": "A",
      b"\xc1": "A",
      b"\xc2": "A",
      b"\xc3": "A",
      b"\xc4": "A",
      b"\xc5": "A",
      b"\xc6": "AE",
      b"\xc7": "C",
      b"\xc8": "E",
      b"\xc9": "E",
      b"\xca": "E",
      b"\xcb": "E",
      b"\xcc": "I",
      b"\xcd": "I",
      b"\xce": "I",
      b"\xcf": "I",
      b"\xd0": "D",
      b"\xd1": "N",
      b"\xd2": "O",
      b"\xd3": "O",
      b"\xd4": "O",
      b"\xd5": "O",
      b"\xd6": "O",
      b"\xd7": "*",
      b"\xd8": "O",
      b"\xd9": "U",
      b"\xda": "U",
      b"\xdb": "U",
      b"\xdc": "U",
      b"\xdd": "Y",
      b"\xde": "b",
      b"\xdf": "B",
      b"\xe0": "a",
      b"\xe1": "a",
      b"\xe2": "a",
      b"\xe3": "a",
      b"\xe4": "a",
      b"\xe5": "a",
      b"\xe6": "ae",
      b"\xe7": "c",
      b"\xe8": "e",
      b"\xe9": "e",
      b"\xea": "e",
      b"\xeb": "e",
      b"\xec": "i",
      b"\xed": "i",
      b"\xee": "i",
      b"\xef": "i",
      b"\xf0": "o",
      b"\xf1": "n",
      b"\xf2": "o",
      b"\xf3": "o",
      b"\xf4": "o",
      b"\xf5": "o",
      b"\xf6": "o",
      b"\xf7": "/",
      b"\xf8": "o",
      b"\xf9": "u",
      b"\xfa": "u",
      b"\xfb": "u",
      b"\xfc": "u",
      b"\xfd": "y",
      b"\xfe": "b",
      b"\xff": "y",
  }
  #: A map used when removing rogue Windows-1252/ISO-8859-1
  #: characters in otherwise UTF-8 documents. Also used when a
  #: numeric character entity has been incorrectly encoded using the
  #: character's Windows-1252 encoding.
  #:
  #: Keys are Windows-1252 byte values; each value is the UTF-8
  #: encoding of the character that byte represents.
  #:
  #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
  #: Windows-1252.
  #:
  #: :meta hide-value:
  WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
      0x80: b"\xe2\x82\xac",  # €
      0x82: b"\xe2\x80\x9a",  # ‚
      0x83: b"\xc6\x92",  # ƒ
      0x84: b"\xe2\x80\x9e",  # „
      0x85: b"\xe2\x80\xa6",  # …
      0x86: b"\xe2\x80\xa0",  # †
      0x87: b"\xe2\x80\xa1",  # ‡
      0x88: b"\xcb\x86",  # ˆ
      0x89: b"\xe2\x80\xb0",  # ‰
      0x8A: b"\xc5\xa0",  # Š
      0x8B: b"\xe2\x80\xb9",  # ‹
      0x8C: b"\xc5\x92",  # Œ
      0x8E: b"\xc5\xbd",  # Ž
      0x91: b"\xe2\x80\x98",  # ‘
      0x92: b"\xe2\x80\x99",  # ’
      0x93: b"\xe2\x80\x9c",  # “
      0x94: b"\xe2\x80\x9d",  # ”
      0x95: b"\xe2\x80\xa2",  # •
      0x96: b"\xe2\x80\x93",  # –
      0x97: b"\xe2\x80\x94",  # —
      0x98: b"\xcb\x9c",  # ˜
      0x99: b"\xe2\x84\xa2",  # ™
      0x9A: b"\xc5\xa1",  # š
      0x9B: b"\xe2\x80\xba",  # ›
      0x9C: b"\xc5\x93",  # œ
      0x9E: b"\xc5\xbe",  # ž
      0x9F: b"\xc5\xb8",  # Ÿ
      0xA0: b"\xc2\xa0",  # (no-break space)
      0xA1: b"\xc2\xa1",  # ¡
      0xA2: b"\xc2\xa2",  # ¢
      0xA3: b"\xc2\xa3",  # £
      0xA4: b"\xc2\xa4",  # ¤
      0xA5: b"\xc2\xa5",  # ¥
      0xA6: b"\xc2\xa6",  # ¦
      0xA7: b"\xc2\xa7",  # §
      0xA8: b"\xc2\xa8",  # ¨
      0xA9: b"\xc2\xa9",  # ©
      0xAA: b"\xc2\xaa",  # ª
      0xAB: b"\xc2\xab",  # «
      0xAC: b"\xc2\xac",  # ¬
      0xAD: b"\xc2\xad",  # (soft hyphen)
      0xAE: b"\xc2\xae",  # ®
      0xAF: b"\xc2\xaf",  # ¯
      0xB0: b"\xc2\xb0",  # °
      0xB1: b"\xc2\xb1",  # ±
      0xB2: b"\xc2\xb2",  # ²
      0xB3: b"\xc2\xb3",  # ³
      0xB4: b"\xc2\xb4",  # ´
      0xB5: b"\xc2\xb5",  # µ
      0xB6: b"\xc2\xb6",  # ¶
      0xB7: b"\xc2\xb7",  # ·
      0xB8: b"\xc2\xb8",  # ¸
      0xB9: b"\xc2\xb9",  # ¹
      0xBA: b"\xc2\xba",  # º
      0xBB: b"\xc2\xbb",  # »
      0xBC: b"\xc2\xbc",  # ¼
      0xBD: b"\xc2\xbd",  # ½
      0xBE: b"\xc2\xbe",  # ¾
      0xBF: b"\xc2\xbf",  # ¿
      0xC0: b"\xc3\x80",  # À
      0xC1: b"\xc3\x81",  # Á
      0xC2: b"\xc3\x82",  # Â
      0xC3: b"\xc3\x83",  # Ã
      0xC4: b"\xc3\x84",  # Ä
      0xC5: b"\xc3\x85",  # Å
      0xC6: b"\xc3\x86",  # Æ
      0xC7: b"\xc3\x87",  # Ç
      0xC8: b"\xc3\x88",  # È
      0xC9: b"\xc3\x89",  # É
      0xCA: b"\xc3\x8a",  # Ê
      0xCB: b"\xc3\x8b",  # Ë
      0xCC: b"\xc3\x8c",  # Ì
      0xCD: b"\xc3\x8d",  # Í
      0xCE: b"\xc3\x8e",  # Î
      0xCF: b"\xc3\x8f",  # Ï
      0xD0: b"\xc3\x90",  # Ð
      0xD1: b"\xc3\x91",  # Ñ
      0xD2: b"\xc3\x92",  # Ò
      0xD3: b"\xc3\x93",  # Ó
      0xD4: b"\xc3\x94",  # Ô
      0xD5: b"\xc3\x95",  # Õ
      0xD6: b"\xc3\x96",  # Ö
      0xD7: b"\xc3\x97",  # ×
      0xD8: b"\xc3\x98",  # Ø
      0xD9: b"\xc3\x99",  # Ù
      0xDA: b"\xc3\x9a",  # Ú
      0xDB: b"\xc3\x9b",  # Û
      0xDC: b"\xc3\x9c",  # Ü
      0xDD: b"\xc3\x9d",  # Ý
      0xDE: b"\xc3\x9e",  # Þ
      0xDF: b"\xc3\x9f",  # ß
      0xE0: b"\xc3\xa0",  # à
      # U+00E1 encodes as the two bytes C3 A1; a lone A1 is not valid UTF-8.
      0xE1: b"\xc3\xa1",  # á
      0xE2: b"\xc3\xa2",  # â
      0xE3: b"\xc3\xa3",  # ã
      0xE4: b"\xc3\xa4",  # ä
      0xE5: b"\xc3\xa5",  # å
      0xE6: b"\xc3\xa6",  # æ
      0xE7: b"\xc3\xa7",  # ç
      0xE8: b"\xc3\xa8",  # è
      0xE9: b"\xc3\xa9",  # é
      0xEA: b"\xc3\xaa",  # ê
      0xEB: b"\xc3\xab",  # ë
      0xEC: b"\xc3\xac",  # ì
      0xED: b"\xc3\xad",  # í
      0xEE: b"\xc3\xae",  # î
      0xEF: b"\xc3\xaf",  # ï
      0xF0: b"\xc3\xb0",  # ð
      0xF1: b"\xc3\xb1",  # ñ
      0xF2: b"\xc3\xb2",  # ò
      0xF3: b"\xc3\xb3",  # ó
      0xF4: b"\xc3\xb4",  # ô
      0xF5: b"\xc3\xb5",  # õ
      0xF6: b"\xc3\xb6",  # ö
      0xF7: b"\xc3\xb7",  # ÷
      0xF8: b"\xc3\xb8",  # ø
      0xF9: b"\xc3\xb9",  # ù
      0xFA: b"\xc3\xba",  # ú
      0xFB: b"\xc3\xbb",  # û
      0xFC: b"\xc3\xbc",  # ü
      0xFD: b"\xc3\xbd",  # ý
      0xFE: b"\xc3\xbe",  # þ
      0xFF: b"\xc3\xbf",  # ÿ
  }
  #: :meta private:
  # Note that this isn't all Unicode noncharacters, just the noncontiguous
  # ones that need to be listed: the last two code points (xFFFE and
  # xFFFF) of each of the 17 planes, U+FFFE through U+10FFFF.
  #
  # "A noncharacter is a code point that is in the range
  # U+FDD0 to U+FDEF, inclusive, or U+FFFE, U+FFFF, U+1FFFE,
  # U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE,
  # U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
  # U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE,
  # U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
  # U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE,
  # or U+10FFFF."
  ENUMERATED_NONCHARACTERS: Set[int] = {
      (plane << 16) | low_bits
      for plane in range(0x11)
      for low_bits in (0xFFFE, 0xFFFF)
  }
  #: (first lead byte, last lead byte, sequence length) for each
  #: length of UTF-8 multibyte sequence, in ascending byte order.
  #:
  #: :meta private:
  MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
      (0xC2, 0xDF, 2),  # C2-DF open a 2-byte character
      (0xE0, 0xEF, 3),  # E0-EF open a 3-byte character
      (0xF0, 0xF4, 4),  # F0-F4 open a 4-byte character
  ]

  #: The lowest byte that can open a multibyte character.
  #:
  #: :meta private:
  FIRST_MULTIBYTE_MARKER: int = min(
      low for low, _high, _size in MULTIBYTE_MARKERS_AND_SIZES
  )

  #: The highest byte that can open a multibyte character.
  #:
  #: :meta private:
  LAST_MULTIBYTE_MARKER: int = max(
      high for _low, high, _size in MULTIBYTE_MARKERS_AND_SIZES
  )
  1204. @classmethod
  1205. def numeric_character_reference(cls, numeric:int) -> Tuple[str, bool]:
  1206. """This (mostly) implements the algorithm described in "Numeric character
  1207. reference end state" from the HTML spec:
  1208. https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
  1209. The algorithm is designed to convert numeric character references like "&#9731;"
  1210. to Unicode characters like "☃".
  1211. :return: A 2-tuple (character, replaced). `character` is the Unicode
  1212. character corresponding to the numeric reference and `replaced` is
  1213. whether or not an unresolvable character was replaced with REPLACEMENT
  1214. CHARACTER.
  1215. """
  1216. replacement = "\ufffd"
  1217. if numeric == 0x00:
  1218. # "If the number is 0x00, then this is a
  1219. # null-character-reference parse error. Set the character
  1220. # reference code to 0xFFFD."
  1221. return replacement, True
  1222. if numeric > 0x10ffff:
  1223. # "If the number is greater than 0x10FFFF, then this is a
  1224. # character-reference-outside-unicode-range parse
  1225. # error. Set the character reference code to 0xFFFD."
  1226. return replacement, True
  1227. if numeric >= 0xd800 and numeric <= 0xdfff:
  1228. # "If the number is a surrogate, then this is a
  1229. # surrogate-character-reference parse error. Set the
  1230. # character reference code to 0xFFFD."
  1231. return replacement, True
  1232. if (numeric >= 0xfdd0 and numeric <= 0xfdef) or numeric in cls.ENUMERATED_NONCHARACTERS:
  1233. # "If the number is a noncharacter, then this is a
  1234. # noncharacter-character-reference parse error."
  1235. #
  1236. # "The parser resolves such character references as-is."
  1237. #
  1238. # I'm not sure what "as-is" means but I think it means that we act
  1239. # like there was no error condition.
  1240. return chr(numeric), False
  1241. # "If the number is 0x0D, or a control that's not ASCII whitespace,
  1242. # then this is a control-character-reference parse error."
  1243. #
  1244. # "A control is a C0 control or a code point in the range
  1245. # U+007F DELETE to U+009F APPLICATION PROGRAM COMMAND,
  1246. # inclusive."
  1247. #
  1248. # "A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive."
  1249. #
  1250. # "The parser resolves such character references as-is except C1 control references that are replaced."
  1251. # First, let's replace the control references that can be replaced.
  1252. if numeric >= 0x80 and numeric <= 0x9f and numeric in cls.WINDOWS_1252_TO_UTF8:
  1253. # "If the number is one of the numbers in the first column of the
  1254. # following table, then find the row with that number in the first
  1255. # column, and set the character reference code to the number in the
  1256. # second column of that row."
  1257. #
  1258. # This is an attempt to catch characters that were encoded to numeric
  1259. # entities using their Windows-1252 encodings rather than their UTF-8
  1260. # encodings.
  1261. return cls.WINDOWS_1252_TO_UTF8[numeric].decode("utf8"), False
  1262. # Now all that's left are references that should be resolved as-is. This
  1263. # is also the default path for non-weird character references.
  1264. try:
  1265. return chr(numeric), False
  1266. except (ValueError, OverflowError):
  1267. # This shouldn't happen, since these cases should have been handled
  1268. # above, but if it does, return REPLACEMENT CHARACTER
  1269. return replacement, True
  1270. @classmethod
  1271. def detwingle(
  1272. cls,
  1273. in_bytes: bytes,
  1274. main_encoding: _Encoding = "utf8",
  1275. embedded_encoding: _Encoding = "windows-1252",
  1276. ) -> bytes:
  1277. """Fix characters from one encoding embedded in some other encoding.
  1278. Currently the only situation supported is Windows-1252 (or its
  1279. subset ISO-8859-1), embedded in UTF-8.
  1280. :param in_bytes: A bytestring that you suspect contains
  1281. characters from multiple encodings. Note that this *must*
  1282. be a bytestring. If you've already converted the document
  1283. to Unicode, you're too late.
  1284. :param main_encoding: The primary encoding of ``in_bytes``.
  1285. :param embedded_encoding: The encoding that was used to embed characters
  1286. in the main document.
  1287. :return: A bytestring similar to ``in_bytes``, in which
  1288. ``embedded_encoding`` characters have been converted to
  1289. their ``main_encoding`` equivalents.
  1290. """
  1291. if embedded_encoding.replace("_", "-").lower() not in (
  1292. "windows-1252",
  1293. "windows_1252",
  1294. ):
  1295. raise NotImplementedError(
  1296. "Windows-1252 and ISO-8859-1 are the only currently supported "
  1297. "embedded encodings."
  1298. )
  1299. if main_encoding.lower() not in ("utf8", "utf-8"):
  1300. raise NotImplementedError(
  1301. "UTF-8 is the only currently supported main encoding."
  1302. )
  1303. byte_chunks = []
  1304. chunk_start = 0
  1305. pos = 0
  1306. while pos < len(in_bytes):
  1307. byte = in_bytes[pos]
  1308. if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
  1309. # This is the start of a UTF-8 multibyte character. Skip
  1310. # to the end.
  1311. for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
  1312. if byte >= start and byte <= end:
  1313. pos += size
  1314. break
  1315. elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
  1316. # We found a Windows-1252 character!
  1317. # Save the string up to this point as a chunk.
  1318. byte_chunks.append(in_bytes[chunk_start:pos])
  1319. # Now translate the Windows-1252 character into UTF-8
  1320. # and add it as another, one-byte chunk.
  1321. byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
  1322. pos += 1
  1323. chunk_start = pos
  1324. else:
  1325. # Go on to the next character.
  1326. pos += 1
  1327. if chunk_start == 0:
  1328. # The string is unchanged.
  1329. return in_bytes
  1330. else:
  1331. # Store the final chunk.
  1332. byte_chunks.append(in_bytes[chunk_start:])
  1333. return b"".join(byte_chunks)