# css_parser.py (46 KB)
  1. """CSS selector parser."""
  2. from __future__ import annotations
  3. import re
  4. from functools import lru_cache
  5. from . import util
  6. from . import css_match as cm
  7. from . import css_types as ct
  8. from .util import SelectorSyntaxError
  9. import warnings
  10. from typing import Match, Any, Iterator, cast
  11. UNICODE_REPLACEMENT_CHAR = 0xFFFD
  12. # Simple pseudo classes that take no parameters
  13. PSEUDO_SIMPLE = {
  14. ":any-link",
  15. ":empty",
  16. ":first-child",
  17. ":first-of-type",
  18. ":in-range",
  19. ":open",
  20. ":out-of-range",
  21. ":last-child",
  22. ":last-of-type",
  23. ":link",
  24. ":only-child",
  25. ":only-of-type",
  26. ":root",
  27. ':checked',
  28. ':default',
  29. ':disabled',
  30. ':enabled',
  31. ':indeterminate',
  32. ':optional',
  33. ':placeholder-shown',
  34. ':read-only',
  35. ':read-write',
  36. ':required',
  37. ':scope',
  38. ':defined',
  39. ':muted'
  40. }
  41. # Supported, simple pseudo classes that match nothing in the Soup Sieve environment
  42. PSEUDO_SIMPLE_NO_MATCH = {
  43. ':active',
  44. ':autofill',
  45. ':buffering',
  46. ':current',
  47. ':focus',
  48. ':focus-visible',
  49. ':focus-within',
  50. ':fullscreen',
  51. ':future',
  52. ':host',
  53. ':hover',
  54. ':local-link',
  55. ':past',
  56. ':paused',
  57. ':picture-in-picture',
  58. ':playing',
  59. ':popover-open',
  60. ':seeking',
  61. ':stalled',
  62. ':target',
  63. ':target-within',
  64. ':user-invalid',
  65. ':volume-locked',
  66. ':visited'
  67. }
  68. # Complex pseudo classes that take selector lists
  69. PSEUDO_COMPLEX = {
  70. ':contains',
  71. ':-soup-contains',
  72. ':-soup-contains-own',
  73. ':has',
  74. ':is',
  75. ':matches',
  76. ':not',
  77. ':where'
  78. }
  79. PSEUDO_COMPLEX_NO_MATCH = {
  80. ':current',
  81. ':host',
  82. ':host-context'
  83. }
  84. # Complex pseudo classes that take very specific parameters and are handled special
  85. PSEUDO_SPECIAL = {
  86. ':dir',
  87. ':lang',
  88. ':nth-child',
  89. ':nth-last-child',
  90. ':nth-last-of-type',
  91. ':nth-of-type'
  92. }
  93. PSEUDO_SUPPORTED = PSEUDO_SIMPLE | PSEUDO_SIMPLE_NO_MATCH | PSEUDO_COMPLEX | PSEUDO_COMPLEX_NO_MATCH | PSEUDO_SPECIAL
  94. # Sub-patterns parts
  95. # Whitespace
  96. NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
  97. WS = fr'(?:[ \t]|{NEWLINE})'
  98. # Comments
  99. COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
  100. # Whitespace with comments included
  101. WSC = fr'(?:{WS}|{COMMENTS})'
  102. # CSS escapes
  103. CSS_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$))'
  104. CSS_STRING_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$|{NEWLINE}))'
  105. # CSS Identifier
  106. IDENTIFIER = fr'''
  107. (?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})+|--)
  108. (?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})*)
  109. '''
  110. # `nth` content
  111. NTH = fr'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){WSC}*(?:[-+]){WSC}*(?:[0-9]+))?'
  112. # Value: quoted string or identifier
  113. VALUE = fr'''(?:"(?:\\(?:.|{NEWLINE})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{NEWLINE})|[^\\'\r\n\f]+)*?'|{IDENTIFIER})'''
  114. # Attribute value comparison. `!=` is handled special as it is non-standard.
  115. ATTR = fr'(?:{WSC}*(?P<cmp>[!~^|*$]?=){WSC}*(?P<value>{VALUE})(?:{WSC}*(?P<case>[is]))?)?{WSC}*'
  116. # Selector patterns
  117. # IDs (`#id`)
  118. PAT_ID = fr'\#{IDENTIFIER}'
  119. # Classes (`.class`)
  120. PAT_CLASS = fr'\.{IDENTIFIER}'
  121. # Prefix:Tag (`prefix|tag`)
  122. PAT_TAG = fr'(?P<tag_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<tag_name>{IDENTIFIER}|\*)'
  123. # Attributes (`[attr]`, `[attr=value]`, etc.)
  124. PAT_ATTR = fr'\[{WSC}*(?P<attr_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<attr_name>{IDENTIFIER}){ATTR}\]'
  125. # Pseudo class (`:pseudo-class`, `:pseudo-class(`)
  126. PAT_PSEUDO_CLASS = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)?'
  127. # Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
  128. PAT_PSEUDO_CLASS_SPECIAL = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)'
  129. # Custom pseudo class (`:--custom-pseudo`)
  130. PAT_PSEUDO_CLASS_CUSTOM = fr'(?P<name>:(?=--){IDENTIFIER})'
  131. # Nesting ampersand selector. Matches `&`
  132. PAT_AMP = r'&'
  133. # Closing pseudo group (`)`)
  134. PAT_PSEUDO_CLOSE = fr'{WSC}*\)'
  135. # Pseudo element (`::pseudo-element`)
  136. PAT_PSEUDO_ELEMENT = fr':{PAT_PSEUDO_CLASS}'
  137. # At rule (`@page`, etc.) (not supported)
  138. PAT_AT_RULE = fr'@P{IDENTIFIER}'
  139. # Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.)
  140. PAT_PSEUDO_NTH_CHILD = fr'''
  141. (?P<pseudo_nth_child>{PAT_PSEUDO_CLASS_SPECIAL}
  142. (?P<nth_child>{NTH}|even|odd))(?:{WSC}*\)|(?P<of>{COMMENTS}*{WS}{WSC}*of{COMMENTS}*{WS}{WSC}*))
  143. '''
  144. # Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.)
  145. PAT_PSEUDO_NTH_TYPE = fr'''
  146. (?P<pseudo_nth_type>{PAT_PSEUDO_CLASS_SPECIAL}
  147. (?P<nth_type>{NTH}|even|odd)){WSC}*\)
  148. '''
  149. # Pseudo class language (`:lang("*-de", en)`)
  150. PAT_PSEUDO_LANG = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'
  151. # Pseudo class direction (`:dir(ltr)`)
  152. PAT_PSEUDO_DIR = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<dir>ltr|rtl){WSC}*\)'
  153. # Combining characters (`>`, `~`, ` `, `+`, `,`)
  154. PAT_COMBINE = fr'{WSC}*?(?P<relation>[,+>~]|{WS}(?![,+>~])){WSC}*'
  155. # Extra: Contains (`:contains(text)`)
  156. PAT_PSEUDO_CONTAINS = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'
  157. # Regular expressions
  158. # CSS escape pattern
  159. RE_CSS_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WSC}?)|(\\[^\r\n\f])|(\\$))', re.I)
  160. RE_CSS_STR_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WS}?)|(\\[^\r\n\f])|(\\$)|(\\{NEWLINE}))', re.I)
  161. # Pattern to break up `nth` specifiers
  162. RE_NTH = re.compile(fr'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){WSC}*(?P<s2>[-+]){WSC}*(?P<b>[0-9]+))?', re.I)
  163. # Pattern to iterate multiple values.
  164. RE_VALUES = re.compile(fr'(?:(?P<value>{VALUE})|(?P<split>{WSC}*,{WSC}*))', re.X)
  165. # Whitespace checks
  166. RE_WS = re.compile(WS)
  167. RE_WS_BEGIN = re.compile(fr'^{WSC}*')
  168. RE_WS_END = re.compile(fr'{WSC}*$')
  169. RE_CUSTOM = re.compile(fr'^{PAT_PSEUDO_CLASS_CUSTOM}$', re.X)
  170. # Constants
  171. # List split token
  172. COMMA_COMBINATOR = ','
  173. # Relation token for descendant
  174. WS_COMBINATOR = " "
  175. # Parse flags
  176. FLG_PSEUDO = 0x01
  177. FLG_NOT = 0x02
  178. FLG_RELATIVE = 0x04
  179. FLG_DEFAULT = 0x08
  180. FLG_HTML = 0x10
  181. FLG_INDETERMINATE = 0x20
  182. FLG_OPEN = 0x40
  183. FLG_IN_RANGE = 0x80
  184. FLG_OUT_OF_RANGE = 0x100
  185. FLG_PLACEHOLDER_SHOWN = 0x200
  186. FLG_FORGIVE = 0x400
  187. # Maximum cached patterns to store
  188. _MAXCACHE = 500
  189. @lru_cache(maxsize=_MAXCACHE)
  190. def _cached_css_compile(
  191. pattern: str,
  192. namespaces: ct.Namespaces | None,
  193. custom: ct.CustomSelectors | None,
  194. flags: int
  195. ) -> cm.SoupSieve:
  196. """Cached CSS compile."""
  197. custom_selectors = process_custom(custom)
  198. return cm.SoupSieve(
  199. pattern,
  200. CSSParser(
  201. pattern,
  202. custom=custom_selectors,
  203. flags=flags
  204. ).process_selectors(),
  205. namespaces,
  206. custom,
  207. flags
  208. )
  209. def _purge_cache() -> None:
  210. """Purge the cache."""
  211. _cached_css_compile.cache_clear()
  212. def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
  213. """Process custom."""
  214. custom_selectors = {}
  215. if custom is not None:
  216. for key, value in custom.items():
  217. name = util.lower(key)
  218. if RE_CUSTOM.match(name) is None:
  219. raise SelectorSyntaxError(f"The name '{name}' is not a valid custom pseudo-class name")
  220. if name in custom_selectors:
  221. raise KeyError(f"The custom selector '{name}' has already been registered")
  222. custom_selectors[css_unescape(name)] = value
  223. return custom_selectors
  224. def css_unescape(content: str, string: bool = False) -> str:
  225. """
  226. Unescape CSS value.
  227. Strings allow for spanning the value on multiple strings by escaping a new line.
  228. """
  229. def replace(m: Match[str]) -> str:
  230. """Replace with the appropriate substitute."""
  231. if m.group(1):
  232. codepoint = int(m.group(1)[1:], 16)
  233. if codepoint == 0:
  234. codepoint = UNICODE_REPLACEMENT_CHAR
  235. value = chr(codepoint)
  236. elif m.group(2):
  237. value = m.group(2)[1:]
  238. elif m.group(3):
  239. value = '\ufffd'
  240. else:
  241. value = ''
  242. return value
  243. return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
  244. def escape(ident: str) -> str:
  245. """Escape identifier."""
  246. string = []
  247. length = len(ident)
  248. start_dash = length > 0 and ident[0] == '-'
  249. if length == 1 and start_dash:
  250. # Need to escape identifier that is a single `-` with no other characters
  251. string.append(f'\\{ident}')
  252. else:
  253. for index, c in enumerate(ident):
  254. codepoint = ord(c)
  255. if codepoint == 0x00:
  256. string.append('\ufffd')
  257. elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
  258. string.append(f'\\{codepoint:x} ')
  259. elif (index == 0 or (start_dash and index == 1)) and (0x30 <= codepoint <= 0x39):
  260. string.append(f'\\{codepoint:x} ')
  261. elif (
  262. codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or (0x30 <= codepoint <= 0x39) or
  263. (0x30 <= codepoint <= 0x39) or (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A)
  264. ):
  265. string.append(c)
  266. else:
  267. string.append(f'\\{c}')
  268. return ''.join(string)
  269. class SelectorPattern:
  270. """Selector pattern."""
  271. def __init__(self, name: str, pattern: str) -> None:
  272. """Initialize."""
  273. self.name = name
  274. self.re_pattern = re.compile(pattern, re.I | re.X | re.U)
  275. def get_name(self) -> str:
  276. """Get name."""
  277. return self.name
  278. def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
  279. """Match the selector."""
  280. return self.re_pattern.match(selector, index)
  281. class SpecialPseudoPattern(SelectorPattern):
  282. """Selector pattern."""
  283. def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
  284. """Initialize."""
  285. self.patterns = {}
  286. for p in patterns:
  287. name = p[0]
  288. pattern = p[3](name, p[2])
  289. for pseudo in p[1]:
  290. self.patterns[pseudo] = pattern
  291. self.matched_name = None # type: SelectorPattern | None
  292. self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
  293. def get_name(self) -> str:
  294. """Get name."""
  295. return '' if self.matched_name is None else self.matched_name.get_name()
  296. def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
  297. """Match the selector."""
  298. pseudo = None
  299. m = self.re_pseudo_name.match(selector, index)
  300. if m:
  301. name = util.lower(css_unescape(m.group('name')))
  302. pattern = self.patterns.get(name)
  303. if pattern:
  304. pseudo = pattern.match(selector, index, flags)
  305. if pseudo:
  306. self.matched_name = pattern
  307. return pseudo
  308. class _Selector:
  309. """
  310. Intermediate selector class.
  311. This stores selector data for a compound selector as we are acquiring them.
  312. Once we are done collecting the data for a compound selector, we freeze
  313. the data in an object that can be pickled and hashed.
  314. """
  315. def __init__(self, **kwargs: Any) -> None:
  316. """Initialize."""
  317. self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None
  318. self.ids = kwargs.get('ids', []) # type: list[str]
  319. self.classes = kwargs.get('classes', []) # type: list[str]
  320. self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
  321. self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
  322. self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
  323. self.relations = kwargs.get('relations', []) # type: list[_Selector]
  324. self.rel_type = kwargs.get('rel_type', None) # type: str | None
  325. self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
  326. self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
  327. self.flags = kwargs.get('flags', 0) # type: int
  328. self.no_match = kwargs.get('no_match', False) # type: bool
  329. def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
  330. """Freeze relation."""
  331. if relations:
  332. sel = relations[0]
  333. sel.relations.extend(relations[1:])
  334. return ct.SelectorList([sel.freeze()])
  335. else:
  336. return ct.SelectorList()
  337. def freeze(self) -> ct.Selector | ct.SelectorNull:
  338. """Freeze self."""
  339. if self.no_match:
  340. return ct.SelectorNull()
  341. else:
  342. return ct.Selector(
  343. self.tag,
  344. tuple(self.ids),
  345. tuple(self.classes),
  346. tuple(self.attributes),
  347. tuple(self.nth),
  348. tuple(self.selectors),
  349. self._freeze_relations(self.relations),
  350. self.rel_type,
  351. tuple(self.contains),
  352. tuple(self.lang),
  353. self.flags
  354. )
  355. def __str__(self) -> str: # pragma: no cover
  356. """String representation."""
  357. return (
  358. f'_Selector(tag={self.tag!r}, ids={self.ids!r}, classes={self.classes!r}, attributes={self.attributes!r}, '
  359. f'nth={self.nth!r}, selectors={self.selectors!r}, relations={self.relations!r}, '
  360. f'rel_type={self.rel_type!r}, contains={self.contains!r}, lang={self.lang!r}, flags={self.flags!r}, '
  361. f'no_match={self.no_match!r})'
  362. )
  363. __repr__ = __str__
  364. class CSSParser:
  365. """Parse CSS selectors."""
  366. css_tokens = (
  367. SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
  368. SpecialPseudoPattern(
  369. (
  370. (
  371. "pseudo_contains",
  372. (':contains', ':-soup-contains', ':-soup-contains-own'),
  373. PAT_PSEUDO_CONTAINS,
  374. SelectorPattern
  375. ),
  376. ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
  377. ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
  378. ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
  379. ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
  380. )
  381. ),
  382. SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
  383. SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
  384. SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
  385. SelectorPattern("amp", PAT_AMP),
  386. SelectorPattern("at_rule", PAT_AT_RULE),
  387. SelectorPattern("id", PAT_ID),
  388. SelectorPattern("class", PAT_CLASS),
  389. SelectorPattern("tag", PAT_TAG),
  390. SelectorPattern("attribute", PAT_ATTR),
  391. SelectorPattern("combine", PAT_COMBINE)
  392. )
  393. def __init__(
  394. self,
  395. selector: str,
  396. custom: dict[str, str | ct.SelectorList] | None = None,
  397. flags: int = 0
  398. ) -> None:
  399. """Initialize."""
  400. self.pattern = selector.replace('\x00', '\ufffd')
  401. self.flags = flags
  402. self.debug = self.flags & util.DEBUG
  403. self.custom = {} if custom is None else custom
  404. def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  405. """Create attribute selector from the returned regex match."""
  406. inverse = False
  407. op = m.group('cmp')
  408. case = util.lower(m.group('case')) if m.group('case') else None
  409. ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
  410. attr = css_unescape(m.group('attr_name'))
  411. is_type = False
  412. pattern2 = None
  413. value = ''
  414. if case:
  415. flags = (re.I if case == 'i' else 0) | re.DOTALL
  416. elif util.lower(attr) == 'type':
  417. flags = re.I | re.DOTALL
  418. is_type = True
  419. else:
  420. flags = re.DOTALL
  421. if op:
  422. if m.group('value').startswith(('"', "'")):
  423. value = css_unescape(m.group('value')[1:-1], True)
  424. else:
  425. value = css_unescape(m.group('value'))
  426. if not op:
  427. # Attribute name
  428. pattern = None
  429. elif op.startswith('^'):
  430. # Value start with
  431. pattern = re.compile(r'^%s.*' % re.escape(value), flags)
  432. elif op.startswith('$'):
  433. # Value ends with
  434. pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
  435. elif op.startswith('*'):
  436. # Value contains
  437. pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
  438. elif op.startswith('~'):
  439. # Value contains word within space separated list
  440. # `~=` should match nothing if it is empty or contains whitespace,
  441. # so if either of these cases is present, use `[^\s\S]` which cannot be matched.
  442. value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
  443. pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
  444. elif op.startswith('|'):
  445. # Value starts with word in dash separated list
  446. pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
  447. else:
  448. # Value matches
  449. pattern = re.compile(r'^%s$' % re.escape(value), flags)
  450. if op.startswith('!'):
  451. # Equivalent to `:not([attr=value])`
  452. inverse = True
  453. if is_type and pattern:
  454. pattern2 = re.compile(pattern.pattern)
  455. # Append the attribute selector
  456. sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
  457. if inverse:
  458. # If we are using `!=`, we need to nest the pattern under a `:not()`.
  459. sub_sel = _Selector()
  460. sub_sel.attributes.append(sel_attr)
  461. not_list = ct.SelectorList([sub_sel.freeze()], True, False)
  462. sel.selectors.append(not_list)
  463. else:
  464. sel.attributes.append(sel_attr)
  465. has_selector = True
  466. return has_selector
  467. def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  468. """Parse tag pattern from regex match."""
  469. prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
  470. tag = css_unescape(m.group('tag_name'))
  471. sel.tag = ct.SelectorTag(tag, prefix)
  472. has_selector = True
  473. return has_selector
  474. def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  475. """
  476. Parse custom pseudo class alias.
  477. Compile custom selectors as we need them. When compiling a custom selector,
  478. set it to `None` in the dictionary so we can avoid an infinite loop.
  479. """
  480. pseudo = util.lower(css_unescape(m.group('name')))
  481. selector = self.custom.get(pseudo)
  482. if selector is None:
  483. raise SelectorSyntaxError(
  484. f"Undefined custom selector '{pseudo}' found at position {m.end(0)}",
  485. self.pattern,
  486. m.end(0)
  487. )
  488. if not isinstance(selector, ct.SelectorList):
  489. del self.custom[pseudo]
  490. selector = CSSParser(
  491. selector, custom=self.custom, flags=self.flags
  492. ).process_selectors(flags=FLG_PSEUDO)
  493. self.custom[pseudo] = selector
  494. sel.selectors.append(selector)
  495. has_selector = True
  496. return has_selector
  497. def parse_pseudo_class(
  498. self,
  499. sel: _Selector,
  500. m: Match[str],
  501. has_selector: bool,
  502. iselector: Iterator[tuple[str, Match[str]]],
  503. is_html: bool
  504. ) -> tuple[bool, bool]:
  505. """Parse pseudo class."""
  506. complex_pseudo = False
  507. pseudo = util.lower(css_unescape(m.group('name')))
  508. if m.group('open'):
  509. complex_pseudo = True
  510. if complex_pseudo and pseudo in PSEUDO_COMPLEX:
  511. has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0))
  512. elif not complex_pseudo and pseudo in PSEUDO_SIMPLE:
  513. if pseudo == ':root':
  514. sel.flags |= ct.SEL_ROOT
  515. elif pseudo == ':defined':
  516. sel.flags |= ct.SEL_DEFINED
  517. is_html = True
  518. elif pseudo == ':scope':
  519. sel.flags |= ct.SEL_SCOPE
  520. elif pseudo == ':empty':
  521. sel.flags |= ct.SEL_EMPTY
  522. elif pseudo in (':link', ':any-link'):
  523. sel.selectors.append(CSS_LINK)
  524. elif pseudo == ':checked':
  525. sel.selectors.append(CSS_CHECKED)
  526. elif pseudo == ':default':
  527. sel.selectors.append(CSS_DEFAULT)
  528. elif pseudo == ':indeterminate':
  529. sel.selectors.append(CSS_INDETERMINATE)
  530. elif pseudo == ":disabled":
  531. sel.selectors.append(CSS_DISABLED)
  532. elif pseudo == ":enabled":
  533. sel.selectors.append(CSS_ENABLED)
  534. elif pseudo == ":required":
  535. sel.selectors.append(CSS_REQUIRED)
  536. elif pseudo == ":muted":
  537. sel.selectors.append(CSS_MUTED)
  538. elif pseudo == ":open":
  539. sel.selectors.append(CSS_OPEN)
  540. elif pseudo == ":optional":
  541. sel.selectors.append(CSS_OPTIONAL)
  542. elif pseudo == ":read-only":
  543. sel.selectors.append(CSS_READ_ONLY)
  544. elif pseudo == ":read-write":
  545. sel.selectors.append(CSS_READ_WRITE)
  546. elif pseudo == ":in-range":
  547. sel.selectors.append(CSS_IN_RANGE)
  548. elif pseudo == ":out-of-range":
  549. sel.selectors.append(CSS_OUT_OF_RANGE)
  550. elif pseudo == ":placeholder-shown":
  551. sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
  552. elif pseudo == ':first-child':
  553. sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()))
  554. elif pseudo == ':last-child':
  555. sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()))
  556. elif pseudo == ':first-of-type':
  557. sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()))
  558. elif pseudo == ':last-of-type':
  559. sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()))
  560. elif pseudo == ':only-child':
  561. sel.nth.extend(
  562. [
  563. ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()),
  564. ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())
  565. ]
  566. )
  567. elif pseudo == ':only-of-type':
  568. sel.nth.extend(
  569. [
  570. ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()),
  571. ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())
  572. ]
  573. )
  574. has_selector = True
  575. elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH:
  576. self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
  577. sel.no_match = True
  578. has_selector = True
  579. elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH:
  580. sel.no_match = True
  581. has_selector = True
  582. elif pseudo in PSEUDO_SUPPORTED:
  583. raise SelectorSyntaxError(
  584. f"Invalid syntax for pseudo class '{pseudo}'",
  585. self.pattern,
  586. m.start(0)
  587. )
  588. else:
  589. raise SelectorSyntaxError(
  590. f"'{pseudo}' was detected as a pseudo-class and is either unsupported or invalid. "
  591. "If the syntax was not intended to be recognized as a pseudo-class, please escape the colon.",
  592. self.pattern,
  593. m.start(0)
  594. )
  595. return has_selector, is_html
  596. def parse_pseudo_nth(
  597. self,
  598. sel: _Selector,
  599. m: Match[str],
  600. has_selector: bool,
  601. iselector: Iterator[tuple[str, Match[str]]]
  602. ) -> bool:
  603. """Parse `nth` pseudo."""
  604. mdict = m.groupdict()
  605. if mdict.get('pseudo_nth_child'):
  606. postfix = '_child'
  607. else:
  608. postfix = '_type'
  609. mdict['name'] = util.lower(css_unescape(mdict['name']))
  610. content = util.lower(mdict.get('nth' + postfix))
  611. if content == 'even':
  612. # 2n
  613. s1 = 2
  614. s2 = 0
  615. var = True
  616. elif content == 'odd':
  617. # 2n+1
  618. s1 = 2
  619. s2 = 1
  620. var = True
  621. else:
  622. nth_parts = cast(Match[str], RE_NTH.match(content))
  623. _s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else ''
  624. a = nth_parts.group('a')
  625. var = a.endswith('n')
  626. if a.startswith('n'):
  627. _s1 += '1'
  628. elif var:
  629. _s1 += a[:-1]
  630. else:
  631. _s1 += a
  632. _s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else ''
  633. if nth_parts.group('b'):
  634. _s2 += nth_parts.group('b')
  635. else:
  636. _s2 = '0'
  637. s1 = int(_s1, 10)
  638. s2 = int(_s2, 10)
  639. pseudo_sel = mdict['name']
  640. if postfix == '_child':
  641. if m.group('of'):
  642. # Parse the rest of `of S`.
  643. nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
  644. else:
  645. # Use default `*|*` for `of S`.
  646. nth_sel = CSS_NTH_OF_S_DEFAULT
  647. if pseudo_sel == ':nth-child':
  648. sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel))
  649. elif pseudo_sel == ':nth-last-child':
  650. sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel))
  651. else:
  652. if pseudo_sel == ':nth-of-type':
  653. sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList()))
  654. elif pseudo_sel == ':nth-last-of-type':
  655. sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList()))
  656. has_selector = True
  657. return has_selector
  658. def parse_pseudo_open(
  659. self,
  660. sel: _Selector,
  661. name: str,
  662. has_selector: bool,
  663. iselector: Iterator[tuple[str, Match[str]]],
  664. index: int
  665. ) -> bool:
  666. """Parse pseudo with opening bracket."""
  667. flags = FLG_PSEUDO | FLG_OPEN
  668. if name == ':not':
  669. flags |= FLG_NOT
  670. elif name == ':has':
  671. flags |= FLG_RELATIVE
  672. elif name in (':where', ':is'):
  673. flags |= FLG_FORGIVE
  674. sel.selectors.append(self.parse_selectors(iselector, index, flags))
  675. has_selector = True
  676. return has_selector
  677. def parse_has_combinator(
  678. self,
  679. sel: _Selector,
  680. m: Match[str],
  681. has_selector: bool,
  682. selectors: list[_Selector],
  683. rel_type: str,
  684. index: int
  685. ) -> tuple[bool, _Selector, str]:
  686. """Parse combinator tokens."""
  687. combinator = m.group('relation').strip()
  688. if not combinator:
  689. combinator = WS_COMBINATOR
  690. if combinator == COMMA_COMBINATOR:
  691. sel.rel_type = rel_type
  692. selectors[-1].relations.append(sel)
  693. rel_type = ":" + WS_COMBINATOR
  694. selectors.append(_Selector())
  695. else:
  696. if has_selector:
  697. # End the current selector and associate the leading combinator with this selector.
  698. sel.rel_type = rel_type
  699. selectors[-1].relations.append(sel)
  700. elif rel_type[1:] != WS_COMBINATOR:
  701. # It's impossible to have two whitespace combinators after each other as the patterns
  702. # will gobble up trailing whitespace. It is also impossible to have a whitespace
  703. # combinator after any other kind for the same reason. But we could have
  704. # multiple non-whitespace combinators. So if the current combinator is not a whitespace,
  705. # then we've hit the multiple combinator case, so we should fail.
  706. raise SelectorSyntaxError(
  707. f'The multiple combinators at position {index}',
  708. self.pattern,
  709. index
  710. )
  711. # Set the leading combinator for the next selector.
  712. rel_type = ':' + combinator
  713. sel = _Selector()
  714. has_selector = False
  715. return has_selector, sel, rel_type
  def parse_combinator(
      self,
      sel: _Selector,
      m: Match[str],
      has_selector: bool,
      selectors: list[_Selector],
      relations: list[_Selector],
      is_pseudo: bool,
      is_forgive: bool,
      index: int
  ) -> tuple[bool, _Selector]:
      """
      Handle a combinator token in a regular (non-relative) selector list.

      `sel` is the compound selector collected so far; `relations` holds the
      selector chain that precedes it and is mutated in place, as is
      `selectors`.

      - With nothing captured, only a comma inside a forgiving list is
        tolerated: `sel` is flagged `no_match` and stored as an empty slot.
      - A comma after a captured selector finishes `sel` (adding the implied
        `*` tag when it has no tag and we are not inside a pseudo-class) and
        appends it to `selectors`.
      - Any other combinator makes `sel` the sole pending relation for the
        next compound selector.

      Returns `(has_selector, sel)` for the next compound selector;
      `has_selector` is always `False` and `sel` is always a new `_Selector`.

      Raises `SelectorSyntaxError` when a combinator has no selector before it
      and the forgiving-comma exception does not apply.
      """

      # An all-whitespace match is the whitespace (descendant) combinator.
      relation = m.group('relation').strip() or WS_COMBINATOR
      is_comma = relation == COMMA_COMBINATOR

      if not has_selector:
          if not (is_forgive and is_comma):
              raise SelectorSyntaxError(
                  f"The combinator '{relation}' at position {index}, must have a selector before it",
                  self.pattern,
                  index
              )
          # If we are in a forgiving pseudo class, just make the selector a "no match"
          sel.no_match = True
          relations.clear()
          selectors.append(sel)
      elif is_comma:
          if not (sel.tag or is_pseudo):
              # Implied `*`
              sel.tag = ct.SelectorTag('*', None)
          sel.relations.extend(relations)
          selectors.append(sel)
          relations.clear()
      else:
          # Chain: the finished compound selector becomes the only pending relation.
          sel.relations.extend(relations)
          sel.rel_type = relation
          relations.clear()
          relations.append(sel)

      return False, _Selector()
  def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
      """
      Record a class or id token on `sel`.

      A token starting with `.` is stored in `sel.classes`; anything else is
      stored in `sel.ids`. The leading marker character is dropped and the rest
      is passed through `css_unescape`. Always returns `True`.
      """

      token = m.group(0)
      bucket = sel.classes if token.startswith('.') else sel.ids
      bucket.append(css_unescape(token[1:]))
      return True
  def parse_pseudo_contains(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
      """
      Parse a contains-style pseudo-class token and record it on `sel`.

      The pseudo-class name is unescaped and lowercased. The legacy name
      `:contains` triggers a `FutureWarning`; the name `:-soup-contains-own`
      sets the "own text only" flag passed to `ct.SelectorContains`.

      Each value in the argument list is unescaped exactly once: quoted values
      have their quotes stripped and are unescaped with the second argument of
      `css_unescape` set to `True`, unquoted values are unescaped as-is.

      Appends a `ct.SelectorContains` to `sel.contains` and always returns `True`.
      """

      pseudo = util.lower(css_unescape(m.group('name')))
      if pseudo == ":contains":
          warnings.warn(  # noqa: B028
              "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
              FutureWarning
          )
      contains_own = pseudo == ":-soup-contains-own"

      # Tokenize the raw (still escaped) text, exactly as `parse_pseudo_lang` does.
      # Unescaping the whole list before tokenizing would process escapes twice and
      # could turn an escaped quote, comma or backslash into live syntax.
      values = m.group('values')
      patterns = []
      for token in RE_VALUES.finditer(values):
          if token.group('split'):
              continue
          value = token.group('value')
          if value.startswith(("'", '"')):
              value = css_unescape(value[1:-1], True)
          else:
              value = css_unescape(value)
          patterns.append(value)
      sel.contains.append(ct.SelectorContains(patterns, contains_own))
      has_selector = True
      return has_selector
  def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
      """
      Collect the values listed in a language pseudo-class token.

      The `values` group of the match is tokenized with `RE_VALUES`. Tokens
      matched by the `split` group are skipped; every other token is unescaped
      (quoted values lose their surrounding quotes first) and gathered into a
      `ct.SelectorLang`, which is appended to `sel.lang`. Always returns `True`.
      """

      languages = []
      for part in RE_VALUES.finditer(m.group('values')):
          if part.group('split'):
              continue
          text = part.group('value')
          if text.startswith(('"', "'")):
              # Quoted value: drop the quote characters and unescape the inside.
              languages.append(css_unescape(text[1:-1], True))
          else:
              languages.append(css_unescape(text))

      sel.lang.append(ct.SelectorLang(languages))
      return True
  def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
      """
      Record the direction requested by a direction pseudo-class token.

      The `dir` group is lowercased; `ltr` sets `ct.SEL_DIR_LTR` on `sel.flags`
      and any other value sets `ct.SEL_DIR_RTL`. Always returns `True`.
      """

      if util.lower(m.group('dir')) == 'ltr':
          sel.flags |= ct.SEL_DIR_LTR
      else:
          sel.flags |= ct.SEL_DIR_RTL
      return True
  def parse_selectors(
      self,
      iselector: Iterator[tuple[str, Match[str]]],
      index: int = 0,
      flags: int = 0
  ) -> ct.SelectorList:
      """
      Parse selectors.

      Consumes `(token name, match)` pairs from `iselector` and builds a selector
      list from them. Consumption stops when the iterator is exhausted or, when
      `FLG_OPEN` is set, when a `pseudo_close` token is read, so the same iterator
      can be handed to nested calls made by the pseudo-class parsers.

      `index` is the position used in error messages; it is updated to the end of
      each token that is fully handled. `flags` is a bit mask of the `FLG_*`
      constants tested below.

      Returns a `ct.SelectorList` of the frozen selectors together with the
      `is_not` and `is_html` state.

      Raises `NotImplementedError` for `at_rule` and `pseudo_element` tokens and
      `SelectorSyntaxError` for a missing selector, a tag name that is not first
      in its compound selector, an unmatched close, or an unclosed pseudo-class.
      The combinator helpers called from here can raise `SelectorSyntaxError` too.
      """

      # Initialize important variables
      sel = _Selector()
      selectors = []
      has_selector = False
      closed = False
      relations = []  # type: list[_Selector]
      rel_type = ":" + WS_COMBINATOR

      # Setup various flags
      is_open = bool(flags & FLG_OPEN)
      is_pseudo = bool(flags & FLG_PSEUDO)
      is_relative = bool(flags & FLG_RELATIVE)
      is_not = bool(flags & FLG_NOT)
      is_html = bool(flags & FLG_HTML)
      is_default = bool(flags & FLG_DEFAULT)
      is_indeterminate = bool(flags & FLG_INDETERMINATE)
      is_in_range = bool(flags & FLG_IN_RANGE)
      is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
      is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
      is_forgive = bool(flags & FLG_FORGIVE)

      # Print out useful debug stuff
      if self.debug:  # pragma: no cover
          if is_pseudo:
              print(' is_pseudo: True')
          if is_open:
              print(' is_open: True')
          if is_relative:
              print(' is_relative: True')
          if is_not:
              print(' is_not: True')
          if is_html:
              print(' is_html: True')
          if is_default:
              print(' is_default: True')
          if is_indeterminate:
              print(' is_indeterminate: True')
          if is_in_range:
              print(' is_in_range: True')
          if is_out_of_range:
              print(' is_out_of_range: True')
          if is_placeholder_shown:
              print(' is_placeholder_shown: True')
          if is_forgive:
              print(' is_forgive: True')

      # The algorithm for relative selectors require an initial selector in the selector list
      if is_relative:
          selectors.append(_Selector())

      try:
          while True:
              # `next` raises `StopIteration` when the tokens run out; that ends the loop below.
              key, m = next(iselector)

              # Handle parts
              if key == "at_rule":
                  raise NotImplementedError(f"At-rules found at position {m.start(0)}")
              elif key == "amp":
                  sel.flags |= ct.SEL_SCOPE
                  has_selector = True
              elif key == 'pseudo_class_custom':
                  has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
              elif key == 'pseudo_class':
                  has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
              elif key == 'pseudo_element':
                  raise NotImplementedError(f"Pseudo-element found at position {m.start(0)}")
              elif key == 'pseudo_contains':
                  has_selector = self.parse_pseudo_contains(sel, m, has_selector)
              elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
                  has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
              elif key == 'pseudo_lang':
                  has_selector = self.parse_pseudo_lang(sel, m, has_selector)
              elif key == 'pseudo_dir':
                  has_selector = self.parse_pseudo_dir(sel, m, has_selector)
                  # Currently only supports HTML
                  is_html = True
              elif key == 'pseudo_close':
                  if not has_selector:
                      if not is_forgive:
                          raise SelectorSyntaxError(
                              f"Expected a selector at position {m.start(0)}",
                              self.pattern,
                              m.start(0)
                          )
                      # Forgiving list: an empty trailing slot becomes a selector that never matches.
                      sel.no_match = True
                  if is_open:
                      # This call was started for an opened pseudo-class; the close ends it.
                      # Note that `break` skips the `index` update at the bottom of the loop.
                      closed = True
                      break
                  else:
                      raise SelectorSyntaxError(
                          f"Unmatched pseudo-class close at position {m.start(0)}",
                          self.pattern,
                          m.start(0)
                      )
              elif key == 'combine':
                  # Relative selector lists track a leading combinator (`rel_type`);
                  # regular lists track the preceding selector chain (`relations`).
                  if is_relative:
                      has_selector, sel, rel_type = self.parse_has_combinator(
                          sel, m, has_selector, selectors, rel_type, index
                      )
                  else:
                      has_selector, sel = self.parse_combinator(
                          sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index
                      )
              elif key == 'attribute':
                  has_selector = self.parse_attribute_selector(sel, m, has_selector)
              elif key == 'tag':
                  # A tag name is only accepted as the first part of a compound selector.
                  if has_selector:
                      raise SelectorSyntaxError(
                          f"Tag name found at position {m.start(0)} instead of at the start",
                          self.pattern,
                          m.start(0)
                      )
                  has_selector = self.parse_tag_pattern(sel, m, has_selector)
              elif key in ('class', 'id'):
                  has_selector = self.parse_class_id(sel, m, has_selector)

              # Remember where the handled token ended for later error messages.
              index = m.end(0)
      except StopIteration:
          pass

      # Handle selectors that are not closed
      if is_open and not closed:
          raise SelectorSyntaxError(
              f"Unclosed pseudo-class at position {index}",
              self.pattern,
              index
          )

      # Cleanup completed selector piece
      if has_selector:
          if not sel.tag and not is_pseudo:
              # Implied `*`
              sel.tag = ct.SelectorTag('*', None)
          if is_relative:
              sel.rel_type = rel_type
              selectors[-1].relations.append(sel)
          else:
              sel.relations.extend(relations)
              del relations[:]
              selectors.append(sel)

      # Forgive empty slots in pseudo-classes that have lists (and are forgiving)
      elif is_forgive and (not selectors or not relations):
          # Handle normal pseudo-classes with empty slots like `:is()` etc.
          sel.no_match = True
          del relations[:]
          selectors.append(sel)
          has_selector = True

      if not has_selector:
          # We will always need to finish a selector when `:has()` is used as it leads with combining.
          # May apply to others as well.
          raise SelectorSyntaxError(
              f'Expected a selector at position {index}',
              self.pattern,
              index
          )

      # Some patterns require additional logic, such as default. We try to make these the
      # last pattern, and append the appropriate flag to that selector which communicates
      # to the matcher what additional logic is required.
      # NOTE(review): these are plain assignments, so any flags already present on the last
      # selector are replaced rather than combined - confirm that `|=` is not what is intended.
      if is_default:
          selectors[-1].flags = ct.SEL_DEFAULT
      if is_indeterminate:
          selectors[-1].flags = ct.SEL_INDETERMINATE
      if is_in_range:
          selectors[-1].flags = ct.SEL_IN_RANGE
      if is_out_of_range:
          selectors[-1].flags = ct.SEL_OUT_OF_RANGE
      if is_placeholder_shown:
          selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN

      # Return selector list
      return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
  def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
      """
      Iterate selector tokens.

      Skips whatever `RE_WS_BEGIN` matches at the start of `pattern` and stops
      before whatever `RE_WS_END` matches at the end. At each position the
      entries of `self.css_tokens` are tried in order and the first one that
      matches is yielded as `(token name, match)`; scanning then resumes at the
      end of that match.

      Raises `SelectorSyntaxError` when no token pattern matches at the current
      position. The message names the selector type suggested by the offending
      character, or reports the character itself.
      """

      # Ignore whitespace and comments at start and end of pattern
      m = RE_WS_BEGIN.search(pattern)
      index = m.end(0) if m else 0
      m = RE_WS_END.search(pattern)
      end = (m.start(0) - 1) if m else (len(pattern) - 1)

      if self.debug:  # pragma: no cover
          print(f'## PARSING: {pattern!r}')
      while index <= end:
          m = None
          for v in self.css_tokens:
              m = v.match(pattern, index, self.flags)
              if m:
                  name = v.get_name()
                  if self.debug:  # pragma: no cover
                      print(f"TOKEN: '{name}' --> {m.group(0)!r} at position {m.start(0)}")
                  index = m.end(0)
                  yield name, m
                  break
          if m is None:
              c = pattern[index]
              # If the character represents the start of one of the known selector types,
              # throw an exception mentioning that the known selector type is in error;
              # otherwise, report the invalid character.
              if c == '[':
                  msg = f"Malformed attribute selector at position {index}"
              elif c == '.':
                  msg = f"Malformed class selector at position {index}"
              elif c == '#':
                  msg = f"Malformed id selector at position {index}"
              elif c == ':':
                  msg = f"Malformed pseudo-class selector at position {index}"
              else:
                  # Worded like the messages above ("... at position N").
                  msg = f"Invalid character {c!r} at position {index}"
              raise SelectorSyntaxError(msg, self.pattern, index)
      if self.debug:  # pragma: no cover
          print('## END PARSING')
  def process_selectors(self, index: int = 0, flags: int = 0) -> ct.SelectorList:
      """
      Tokenize `self.pattern` and parse the resulting token stream.

      `index` and `flags` are passed through unchanged to `parse_selectors`,
      whose result is returned.
      """

      tokens = self.selector_iter(self.pattern)
      return self.parse_selectors(tokens, index, flags)
  # Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern).
  # A few patterns are order dependent as they use previously compiled patterns.

  # CSS pattern for `:link` and `:any-link`
  CSS_LINK = CSSParser(
      'html|*:is(a, area)[href]'
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:checked`
  CSS_CHECKED = CSSParser(
      '''
      html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:default` (must compile CSS_CHECKED first)
  # `FLG_DEFAULT` makes `parse_selectors` put `SEL_DEFAULT` on the last selector of the list,
  # which is why the pattern text insists that the form-submit selector stays last.
  CSS_DEFAULT = CSSParser(
      '''
      :checked,
      /*
      This pattern must be at the end.
      Special logic is applied to the last selector.
      */
      html|form html|*:is(button, input)[type="submit"]
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)

  # CSS pattern for `:indeterminate`
  # `FLG_INDETERMINATE` makes `parse_selectors` put `SEL_INDETERMINATE` on the last selector of the list.
  CSS_INDETERMINATE = CSSParser(
      '''
      html|input[type="checkbox"][indeterminate],
      html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
      html|progress:not([value]),
      /*
      This pattern must be at the end.
      Special logic is applied to the last selector.
      */
      html|input[type="radio"][name]:not([name='']):not([checked])
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
  # CSS pattern for `:disabled`
  # Each element name appears once in the leading `:is()` list; repeating a name
  # would only add a redundant alternative to the compiled selector list.
  CSS_DISABLED = CSSParser(
      '''
      html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option)[disabled],
      html|optgroup[disabled] > html|option,
      html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
      html|fieldset[disabled] >
          html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:enabled`
  # Same element list as the first `:disabled` alternative, excluding anything that matches `:disabled`.
  CSS_ENABLED = CSSParser(
      '''
      html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option):not(:disabled)
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
  # CSS pattern for `:required`
  CSS_REQUIRED = CSSParser(
      'html|*:is(input, textarea, select)[required]'
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:optional`
  CSS_OPTIONAL = CSSParser(
      'html|*:is(input, textarea, select):not([required])'
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:placeholder-shown`
  # `FLG_PLACEHOLDER_SHOWN` makes `parse_selectors` put `SEL_PLACEHOLDER_SHOWN` on the last selector of the list.
  CSS_PLACEHOLDER_SHOWN = CSSParser(
      '''
      html|input:is(
          :not([type]),
          [type=""],
          [type=text],
          [type=search],
          [type=url],
          [type=tel],
          [type=email],
          [type=password],
          [type=number]
      )[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
      html|textarea[placeholder]:not([placeholder=''])
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)

  # CSS pattern default for `:nth-child` "of S" feature
  # Compiled with `FLG_PSEUDO` only (no `FLG_HTML`).
  CSS_NTH_OF_S_DEFAULT = CSSParser(
      '*|*'
  ).process_selectors(flags=FLG_PSEUDO)

  # CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
  CSS_READ_WRITE = CSSParser(
      '''
      html|*:is(
          textarea,
          input:is(
              :not([type]),
              [type=""],
              [type=text],
              [type=search],
              [type=url],
              [type=tel],
              [type=email],
              [type=number],
              [type=password],
              [type=date],
              [type=datetime-local],
              [type=month],
              [type=time],
              [type=week]
          )
      ):not([readonly], :disabled),
      html|*:is([contenteditable=""], [contenteditable="true" i])
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:read-only`
  # Defined as the negation of `:read-write`, so it relies on the pattern above.
  CSS_READ_ONLY = CSSParser(
      '''
      html|*:not(:read-write)
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:in-range`
  # The selector text is identical to the `:out-of-range` pattern; only the flag differs.
  # `FLG_IN_RANGE` makes `parse_selectors` put `SEL_IN_RANGE` on the last selector of the list.
  CSS_IN_RANGE = CSSParser(
      '''
      html|input:is(
          [type="date"],
          [type="month"],
          [type="week"],
          [type="time"],
          [type="datetime-local"],
          [type="number"],
          [type="range"]
      ):is(
          [min],
          [max]
      )
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)

  # CSS pattern for `:out-of-range`
  # The selector text is identical to the `:in-range` pattern; only the flag differs.
  # `FLG_OUT_OF_RANGE` makes `parse_selectors` put `SEL_OUT_OF_RANGE` on the last selector of the list.
  CSS_OUT_OF_RANGE = CSSParser(
      '''
      html|input:is(
          [type="date"],
          [type="month"],
          [type="week"],
          [type="time"],
          [type="datetime-local"],
          [type="number"],
          [type="range"]
      ):is(
          [min],
          [max]
      )
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)

  # CSS pattern for `:open`
  CSS_OPEN = CSSParser(
      '''
      html|*:is(details, dialog)[open]
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)

  # CSS pattern for `:muted`
  CSS_MUTED = CSSParser(
      '''
      html|*:is(video, audio)[muted]
      '''
  ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)