css_match.py 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654
  1. """CSS matcher."""
  2. from __future__ import annotations
  3. from datetime import datetime
  4. from . import util
  5. import re
  6. from . import css_types as ct
  7. import unicodedata
  8. import bs4
  9. from typing import Iterator, Iterable, Any, Callable, Sequence, Any, cast # noqa: F401, F811
# Empty tag pattern (whitespace okay): matches a single character that is not
# a space, tab, carriage return, line feed, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
# Matches a run of one or more non-whitespace characters; `get_classes` uses it
# to split a class attribute string into individual class names.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs compared against element namespaces.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined selector flag masks.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercased `dir` attribute value to a direction flag; `auto` maps to 0.
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns used by `Inputs.parse_value` to parse input values by input type.
# Every pattern is anchored at both ends; years require at least four digits.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)

# Matches wildcard subtags (`-*-` runs and a trailing `-*`) in a language range;
# `extended_language_filter` replaces each match with a single `-`.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar constants used by `Inputs.validate_day`.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
  48. class _FakeParent:
  49. """
  50. Fake parent class.
  51. When we have a fragment with no `BeautifulSoup` document object,
  52. we can't evaluate `nth` selectors properly. Create a temporary
  53. fake parent so we can traverse the root element as a child.
  54. """
  55. def __init__(self, element: bs4.Tag) -> None:
  56. """Initialize."""
  57. self.contents = [element]
  58. def __len__(self) -> int:
  59. """Length."""
  60. return len(self.contents)
  61. class _DocumentNav:
  62. """Navigate a Beautiful Soup document."""
  63. @classmethod
  64. def assert_valid_input(cls, tag: Any) -> None:
  65. """Check if valid input tag or document."""
  66. # Fail on unexpected types.
  67. if not cls.is_tag(tag):
  68. raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")
  69. @staticmethod
  70. def is_doc(obj: bs4.element.PageElement | None) -> bool:
  71. """Is `BeautifulSoup` object."""
  72. return isinstance(obj, bs4.BeautifulSoup)
  73. @staticmethod
  74. def is_tag(obj: bs4.element.PageElement | None) -> bool:
  75. """Is tag."""
  76. return isinstance(obj, bs4.Tag)
  77. @staticmethod
  78. def is_declaration(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover
  79. """Is declaration."""
  80. return isinstance(obj, bs4.Declaration)
  81. @staticmethod
  82. def is_cdata(obj: bs4.element.PageElement | None) -> bool:
  83. """Is CDATA."""
  84. return isinstance(obj, bs4.CData)
  85. @staticmethod
  86. def is_processing_instruction(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover
  87. """Is processing instruction."""
  88. return isinstance(obj, bs4.ProcessingInstruction)
  89. @staticmethod
  90. def is_navigable_string(obj: bs4.element.PageElement | None) -> bool:
  91. """Is navigable string."""
  92. return isinstance(obj, bs4.element.NavigableString)
  93. @staticmethod
  94. def is_special_string(obj: bs4.element.PageElement | None) -> bool:
  95. """Is special string."""
  96. return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
  97. @classmethod
  98. def is_content_string(cls, obj: bs4.element.PageElement | None) -> bool:
  99. """Check if node is content string."""
  100. return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
  101. @staticmethod
  102. def create_fake_parent(el: bs4.Tag) -> _FakeParent:
  103. """Create fake parent for a given element."""
  104. return _FakeParent(el)
  105. @staticmethod
  106. def is_xml_tree(el: bs4.Tag | None) -> bool:
  107. """Check if element (or document) is from a XML tree."""
  108. return el is not None and bool(el._is_xml)
  109. def is_iframe(self, el: bs4.Tag | None) -> bool:
  110. """Check if element is an `iframe`."""
  111. if el is None: # pragma: no cover
  112. return False
  113. return bool(
  114. ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
  115. self.is_html_tag(el) # type: ignore[attr-defined]
  116. )
  117. def is_root(self, el: bs4.Tag) -> bool:
  118. """
  119. Return whether element is a root element.
  120. We check that the element is the root of the tree (which we have already pre-calculated),
  121. and we check if it is the root element under an `iframe`.
  122. """
  123. root = self.root and self.root is el # type: ignore[attr-defined]
  124. if not root:
  125. parent = self.get_parent(el)
  126. root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
  127. return root
  128. def get_contents(self, el: bs4.Tag | None, no_iframe: bool = False) -> Iterator[bs4.element.PageElement]:
  129. """Get contents or contents in reverse."""
  130. if el is not None:
  131. if not no_iframe or not self.is_iframe(el):
  132. yield from el.contents
  133. def get_tag_children(
  134. self,
  135. el: bs4.Tag | None,
  136. start: int | None = None,
  137. reverse: bool = False,
  138. no_iframe: bool = False
  139. ) -> Iterator[bs4.Tag]:
  140. """Get tag children."""
  141. return self.get_children(el, start, reverse, True, no_iframe) # type: ignore[return-value]
  142. def get_children(
  143. self,
  144. el: bs4.Tag | None,
  145. start: int | None = None,
  146. reverse: bool = False,
  147. tags: bool = False,
  148. no_iframe: bool = False
  149. ) -> Iterator[bs4.element.PageElement]:
  150. """Get children."""
  151. if el is not None and (not no_iframe or not self.is_iframe(el)):
  152. last = len(el.contents) - 1
  153. if start is None:
  154. index = last if reverse else 0
  155. else:
  156. index = start
  157. end = -1 if reverse else last + 1
  158. incr = -1 if reverse else 1
  159. if 0 <= index <= last:
  160. while index != end:
  161. node = el.contents[index]
  162. index += incr
  163. if not tags or self.is_tag(node):
  164. yield node
  165. def get_tag_descendants(
  166. self,
  167. el: bs4.Tag | None,
  168. no_iframe: bool = False
  169. ) -> Iterator[bs4.Tag]:
  170. """Specifically get tag descendants."""
  171. yield from self.get_descendants(el, tags=True, no_iframe=no_iframe) # type: ignore[misc]
  172. def get_descendants(
  173. self,
  174. el: bs4.Tag | None,
  175. tags: bool = False,
  176. no_iframe: bool = False
  177. ) -> Iterator[bs4.element.PageElement]:
  178. """Get descendants."""
  179. if el is not None and (not no_iframe or not self.is_iframe(el)):
  180. next_good = None
  181. for child in el.descendants:
  182. if next_good is not None:
  183. if child is not next_good:
  184. continue
  185. next_good = None
  186. if isinstance(child, bs4.Tag):
  187. if no_iframe and self.is_iframe(child):
  188. if child.next_sibling is not None:
  189. next_good = child.next_sibling
  190. else:
  191. last_child = child # type: bs4.element.PageElement
  192. while isinstance(last_child, bs4.Tag) and last_child.contents:
  193. last_child = last_child.contents[-1]
  194. next_good = last_child.next_element
  195. yield child
  196. if next_good is None:
  197. break
  198. # Coverage isn't seeing this even though it's executed
  199. continue # pragma: no cover
  200. yield child
  201. elif not tags:
  202. yield child
  203. def get_parent(self, el: bs4.Tag | None, no_iframe: bool = False) -> bs4.Tag | None:
  204. """Get parent."""
  205. parent = el.parent if el is not None else None
  206. if no_iframe and parent is not None and self.is_iframe(parent): # pragma: no cover
  207. parent = None
  208. return parent
  209. @staticmethod
  210. def get_tag_name(el: bs4.Tag | None) -> str | None:
  211. """Get tag."""
  212. return el.name if el is not None else None
  213. @staticmethod
  214. def get_prefix_name(el: bs4.Tag) -> str | None:
  215. """Get prefix."""
  216. return el.prefix
  217. @staticmethod
  218. def get_uri(el: bs4.Tag | None) -> str | None:
  219. """Get namespace `URI`."""
  220. return el.namespace if el is not None else None
  221. @classmethod
  222. def get_next_tag(cls, el: bs4.Tag) -> bs4.Tag | None:
  223. """Get next sibling tag."""
  224. return cls.get_next(el, tags=True) # type: ignore[return-value]
  225. @classmethod
  226. def get_next(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
  227. """Get next sibling tag."""
  228. sibling = el.next_sibling
  229. while tags and not isinstance(sibling, bs4.Tag) and sibling is not None:
  230. sibling = sibling.next_sibling
  231. if tags and not isinstance(sibling, bs4.Tag):
  232. sibling = None
  233. return sibling
  234. @classmethod
  235. def get_previous_tag(cls, el: bs4.Tag, tags: bool = True) -> bs4.Tag | None:
  236. """Get previous sibling tag."""
  237. return cls.get_previous(el, True) # type: ignore[return-value]
  238. @classmethod
  239. def get_previous(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
  240. """Get previous sibling tag."""
  241. sibling = el.previous_sibling
  242. while tags and not isinstance(sibling, bs4.Tag) and sibling is not None:
  243. sibling = sibling.previous_sibling
  244. if tags and not isinstance(sibling, bs4.Tag):
  245. sibling = None
  246. return sibling
  247. @staticmethod
  248. def has_html_ns(el: bs4.Tag | None) -> bool:
  249. """
  250. Check if element has an HTML namespace.
  251. This is a bit different than whether a element is treated as having an HTML namespace,
  252. like we do in the case of `is_html_tag`.
  253. """
  254. ns = getattr(el, 'namespace') if el is not None else None # noqa: B009
  255. return bool(ns and ns == NS_XHTML)
  256. @staticmethod
  257. def split_namespace(el: bs4.Tag | None, attr_name: str) -> tuple[str | None, str | None]:
  258. """Return namespace and attribute name without the prefix."""
  259. if el is None: # pragma: no cover
  260. return None, None
  261. return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
  262. @classmethod
  263. def normalize_value(cls, value: Any) -> str | Sequence[str]:
  264. """Normalize the value to be a string or list of strings."""
  265. # Treat `None` as empty string.
  266. if value is None:
  267. return ''
  268. # Pass through strings
  269. if (isinstance(value, str)):
  270. return value
  271. # If it's a byte string, convert it to Unicode, treating it as UTF-8.
  272. if isinstance(value, bytes):
  273. return value.decode("utf8")
  274. # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
  275. if isinstance(value, Sequence):
  276. new_value = []
  277. for v in value:
  278. if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
  279. # This is most certainly a user error and will crash and burn later.
  280. # To keep things working, we'll do what we do with all objects,
  281. # And convert them to strings.
  282. new_value.append(str(v))
  283. else:
  284. # Convert the child to a string
  285. new_value.append(cast(str, cls.normalize_value(v)))
  286. return new_value
  287. # Try and make anything else a string
  288. return str(value)
  289. @classmethod
  290. def get_attribute_by_name(
  291. cls,
  292. el: bs4.Tag,
  293. name: str,
  294. default: str | Sequence[str] | None = None
  295. ) -> str | Sequence[str] | None:
  296. """Get attribute by name."""
  297. value = default
  298. if el._is_xml:
  299. try:
  300. value = cls.normalize_value(el.attrs[name])
  301. except KeyError:
  302. pass
  303. else:
  304. for k, v in el.attrs.items():
  305. if util.lower(k) == name:
  306. value = cls.normalize_value(v)
  307. break
  308. return value
  309. @classmethod
  310. def iter_attributes(cls, el: bs4.Tag | None) -> Iterator[tuple[str, str | Sequence[str] | None]]:
  311. """Iterate attributes."""
  312. if el is not None:
  313. for k, v in el.attrs.items():
  314. yield k, cls.normalize_value(v)
  315. @classmethod
  316. def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
  317. """Get classes."""
  318. classes = cls.get_attribute_by_name(el, 'class', [])
  319. if isinstance(classes, str):
  320. classes = RE_NOT_WS.findall(classes)
  321. return cast(Sequence[str], classes)
  322. def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
  323. """Get text."""
  324. return ''.join(
  325. [
  326. node for node in self.get_descendants(el, no_iframe=no_iframe) # type: ignore[misc]
  327. if self.is_content_string(node)
  328. ]
  329. )
  330. def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
  331. """Get Own Text."""
  332. return [
  333. node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node) # type: ignore[misc]
  334. ]
  335. class Inputs:
  336. """Class for parsing and validating input items."""
  337. @staticmethod
  338. def validate_day(year: int, month: int, day: int) -> bool:
  339. """Validate day."""
  340. max_days = LONG_MONTH
  341. if month == FEB:
  342. max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
  343. elif month in MONTHS_30:
  344. max_days = SHORT_MONTH
  345. return 1 <= day <= max_days
  346. @staticmethod
  347. def validate_week(year: int, week: int) -> bool:
  348. """Validate week."""
  349. # Validate an ISO week number for `year`.
  350. #
  351. # Per ISO 8601 rules, the last ISO week of a year is the week
  352. # containing Dec 28. Using Dec 28 guarantees we obtain the
  353. # correct ISO week-number for the final week of `year`, even in
  354. # years where Dec 31 falls in ISO week 01 of the following year.
  355. #
  356. # Example: if Dec 31 is a Thursday the year's last ISO week will
  357. # be week 53; if Dec 31 is a Monday and that week is counted as
  358. # week 1 of the next year, Dec 28 still belongs to the final
  359. # week of the current ISO year and yields the correct max week.
  360. max_week = datetime(year, 12, 28).isocalendar()[1]
  361. return 1 <= week <= max_week
  362. @staticmethod
  363. def validate_month(month: int) -> bool:
  364. """Validate month."""
  365. return 1 <= month <= 12
  366. @staticmethod
  367. def validate_year(year: int) -> bool:
  368. """Validate year."""
  369. return 1 <= year
  370. @staticmethod
  371. def validate_hour(hour: int) -> bool:
  372. """Validate hour."""
  373. return 0 <= hour <= 23
  374. @staticmethod
  375. def validate_minutes(minutes: int) -> bool:
  376. """Validate minutes."""
  377. return 0 <= minutes <= 59
  378. @classmethod
  379. def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
  380. """Parse the input value."""
  381. parsed = None # type: tuple[float, ...] | None
  382. if value is None:
  383. return value
  384. if itype == "date":
  385. m = RE_DATE.match(value)
  386. if m:
  387. year = int(m.group('year'), 10)
  388. month = int(m.group('month'), 10)
  389. day = int(m.group('day'), 10)
  390. if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
  391. parsed = (year, month, day)
  392. elif itype == "month":
  393. m = RE_MONTH.match(value)
  394. if m:
  395. year = int(m.group('year'), 10)
  396. month = int(m.group('month'), 10)
  397. if cls.validate_year(year) and cls.validate_month(month):
  398. parsed = (year, month)
  399. elif itype == "week":
  400. m = RE_WEEK.match(value)
  401. if m:
  402. year = int(m.group('year'), 10)
  403. week = int(m.group('week'), 10)
  404. if cls.validate_year(year) and cls.validate_week(year, week):
  405. parsed = (year, week)
  406. elif itype == "time":
  407. m = RE_TIME.match(value)
  408. if m:
  409. hour = int(m.group('hour'), 10)
  410. minutes = int(m.group('minutes'), 10)
  411. if cls.validate_hour(hour) and cls.validate_minutes(minutes):
  412. parsed = (hour, minutes)
  413. elif itype == "datetime-local":
  414. m = RE_DATETIME.match(value)
  415. if m:
  416. year = int(m.group('year'), 10)
  417. month = int(m.group('month'), 10)
  418. day = int(m.group('day'), 10)
  419. hour = int(m.group('hour'), 10)
  420. minutes = int(m.group('minutes'), 10)
  421. if (
  422. cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
  423. cls.validate_hour(hour) and cls.validate_minutes(minutes)
  424. ):
  425. parsed = (year, month, day, hour, minutes)
  426. elif itype in ("number", "range"):
  427. m = RE_NUM.match(value)
  428. if m:
  429. parsed = (float(m.group('value')),)
  430. return parsed
  431. class CSSMatch(_DocumentNav):
  432. """Perform CSS matching."""
  433. def __init__(
  434. self,
  435. selectors: ct.SelectorList,
  436. scope: bs4.Tag | None,
  437. namespaces: ct.Namespaces | None,
  438. flags: int
  439. ) -> None:
  440. """Initialize."""
  441. self.assert_valid_input(scope)
  442. self.tag = scope
  443. self.cached_meta_lang = [] # type: list[tuple[str, str]]
  444. self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
  445. self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
  446. self.selectors = selectors
  447. self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
  448. self.flags = flags
  449. self.iframe_restrict = False
  450. # Find the root element for the whole tree
  451. doc = scope
  452. parent = self.get_parent(doc)
  453. while parent:
  454. doc = parent
  455. parent = self.get_parent(doc)
  456. root = None # type: bs4.Tag | None
  457. if not self.is_doc(doc):
  458. root = doc
  459. else:
  460. for child in self.get_tag_children(doc):
  461. root = child
  462. break
  463. self.root = root
  464. self.scope = scope if scope is not doc else root
  465. self.has_html_namespace = self.has_html_ns(root)
  466. # A document can be both XML and HTML (XHTML)
  467. self.is_xml = self.is_xml_tree(doc)
  468. self.is_html = not self.is_xml or self.has_html_namespace
  469. def supports_namespaces(self) -> bool:
  470. """Check if namespaces are supported in the HTML type."""
  471. return self.is_xml or self.has_html_namespace
  472. def get_tag_ns(self, el: bs4.Tag | None) -> str:
  473. """Get tag namespace."""
  474. namespace = ''
  475. if el is None: # pragma: no cover
  476. return namespace
  477. if self.supports_namespaces():
  478. ns = self.get_uri(el)
  479. if ns:
  480. namespace = ns
  481. else:
  482. namespace = NS_XHTML
  483. return namespace
  484. def is_html_tag(self, el: bs4.Tag | None) -> bool:
  485. """Check if tag is in HTML namespace."""
  486. return self.get_tag_ns(el) == NS_XHTML
  487. def get_tag(self, el: bs4.Tag | None) -> str | None:
  488. """Get tag."""
  489. name = self.get_tag_name(el)
  490. return util.lower(name) if name is not None and not self.is_xml else name
  491. def get_prefix(self, el: bs4.Tag) -> str | None:
  492. """Get prefix."""
  493. prefix = self.get_prefix_name(el)
  494. return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
  495. def find_bidi(self, el: bs4.Tag) -> int | None:
  496. """Get directionality from element text."""
  497. for node in self.get_children(el):
  498. # Analyze child text nodes
  499. if self.is_tag(node):
  500. # Avoid analyzing certain elements specified in the specification.
  501. direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) # type: ignore[arg-type]
  502. name = self.get_tag(node) # type: ignore[arg-type]
  503. if (
  504. (name and name in ('bdi', 'script', 'style', 'textarea', 'iframe')) or
  505. not self.is_html_tag(node) or # type: ignore[arg-type]
  506. direction is not None
  507. ):
  508. continue # pragma: no cover
  509. # Check directionality of this node's text
  510. value = self.find_bidi(node) # type: ignore[arg-type]
  511. if value is not None:
  512. return value
  513. # Direction could not be determined
  514. continue # pragma: no cover
  515. # Skip `doctype` comments, etc.
  516. if self.is_special_string(node):
  517. continue
  518. # Analyze text nodes for directionality.
  519. for c in node: # type: ignore[attr-defined]
  520. bidi = unicodedata.bidirectional(c)
  521. if bidi in ('AL', 'R', 'L'):
  522. return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
  523. return None
  524. def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
  525. """Filter the language tags."""
  526. match = True
  527. lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
  528. ranges = lang_range.split('-')
  529. subtags = lang_tag.lower().split('-')
  530. length = len(ranges)
  531. slength = len(subtags)
  532. rindex = 0
  533. sindex = 0
  534. r = ranges[rindex]
  535. s = subtags[sindex]
  536. # Empty specified language should match unspecified language attributes
  537. if length == 1 and slength == 1 and not r and r == s:
  538. return True
  539. # Primary tag needs to match
  540. if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
  541. match = False
  542. rindex += 1
  543. sindex += 1
  544. # Match until we run out of ranges
  545. while match and rindex < length:
  546. r = ranges[rindex]
  547. try:
  548. s = subtags[sindex]
  549. except IndexError:
  550. # Ran out of subtags,
  551. # but we still have ranges
  552. match = False
  553. continue
  554. # Empty range
  555. if not r:
  556. match = False
  557. continue
  558. # Matched range
  559. elif s == r:
  560. rindex += 1
  561. # Implicit wildcard cannot match
  562. # singletons
  563. elif len(s) == 1:
  564. match = False
  565. continue
  566. # Implicitly matched, so grab next subtag
  567. sindex += 1
  568. return match
  569. def match_attribute_name(
  570. self,
  571. el: bs4.Tag,
  572. attr: str,
  573. prefix: str | None
  574. ) -> str | Sequence[str] | None:
  575. """Match attribute name and return value if it exists."""
  576. value = None
  577. if self.supports_namespaces():
  578. value = None
  579. # If we have not defined namespaces, we can't very well find them, so don't bother trying.
  580. if prefix:
  581. ns = self.namespaces.get(prefix)
  582. if ns is None and prefix != '*':
  583. return None
  584. else:
  585. ns = None
  586. for k, v in self.iter_attributes(el):
  587. # Get attribute parts
  588. namespace, name = self.split_namespace(el, k)
  589. # Can't match a prefix attribute as we haven't specified one to match
  590. # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
  591. if ns is None:
  592. if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
  593. value = v
  594. break
  595. # Coverage is not finding this even though it is executed.
  596. # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
  597. # Ignore the false positive message.
  598. continue # pragma: no cover
  599. # We can't match our desired prefix attribute as the attribute doesn't have a prefix
  600. if namespace is None or (ns != namespace and prefix != '*'):
  601. continue
  602. # The attribute doesn't match.
  603. if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
  604. continue
  605. value = v
  606. break
  607. else:
  608. for k, v in self.iter_attributes(el):
  609. if util.lower(attr) != util.lower(k):
  610. continue
  611. value = v
  612. break
  613. return value
  614. def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
  615. """Match the namespace of the element."""
  616. match = True
  617. namespace = self.get_tag_ns(el)
  618. default_namespace = self.namespaces.get('')
  619. tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
  620. # We must match the default namespace if one is not provided
  621. if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
  622. match = False
  623. # If we specified `|tag`, we must not have a namespace.
  624. elif (tag.prefix is not None and tag.prefix == '' and namespace):
  625. match = False
  626. # Verify prefix matches
  627. elif (
  628. tag.prefix and
  629. tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
  630. ):
  631. match = False
  632. return match
  633. def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
  634. """Match attributes."""
  635. match = True
  636. if attributes:
  637. for a in attributes:
  638. temp = self.match_attribute_name(el, a.attribute, a.prefix)
  639. pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
  640. if temp is None:
  641. match = False
  642. break
  643. value = temp if isinstance(temp, str) else ' '.join(temp)
  644. if pattern is None:
  645. continue
  646. elif pattern.match(value) is None:
  647. match = False
  648. break
  649. return match
  650. def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
  651. """Match tag name."""
  652. name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
  653. return not (
  654. name is not None and
  655. name not in (self.get_tag(el), '*')
  656. )
  657. def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
  658. """Match the tag."""
  659. match = True
  660. if tag is not None:
  661. # Verify namespace
  662. if not self.match_namespace(el, tag):
  663. match = False
  664. if not self.match_tagname(el, tag):
  665. match = False
  666. return match
  667. def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
  668. """Match past relationship."""
  669. found = False
  670. # I don't think this can ever happen, but it makes `mypy` happy
  671. if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
  672. return found
  673. if relation[0].rel_type == REL_PARENT:
  674. parent = self.get_parent(el, no_iframe=self.iframe_restrict)
  675. while not found and parent:
  676. found = self.match_selectors(parent, relation)
  677. parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
  678. elif relation[0].rel_type == REL_CLOSE_PARENT:
  679. parent = self.get_parent(el, no_iframe=self.iframe_restrict)
  680. if parent:
  681. found = self.match_selectors(parent, relation)
  682. elif relation[0].rel_type == REL_SIBLING:
  683. sibling = self.get_previous_tag(el)
  684. while not found and sibling:
  685. found = self.match_selectors(sibling, relation)
  686. sibling = self.get_previous_tag(sibling)
  687. elif relation[0].rel_type == REL_CLOSE_SIBLING:
  688. sibling = self.get_previous_tag(el)
  689. if sibling and self.is_tag(sibling):
  690. found = self.match_selectors(sibling, relation)
  691. return found
  692. def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
  693. """Match future child."""
  694. match = False
  695. if recursive:
  696. children = self.get_tag_descendants # type: Callable[..., Iterator[bs4.Tag]]
  697. else:
  698. children = self.get_tag_children
  699. for child in children(parent, no_iframe=self.iframe_restrict):
  700. match = self.match_selectors(child, relation)
  701. if match:
  702. break
  703. return match
  704. def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
  705. """Match future relationship."""
  706. found = False
  707. # I don't think this can ever happen, but it makes `mypy` happy
  708. if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
  709. return found
  710. if relation[0].rel_type == REL_HAS_PARENT:
  711. found = self.match_future_child(el, relation, True)
  712. elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
  713. found = self.match_future_child(el, relation)
  714. elif relation[0].rel_type == REL_HAS_SIBLING:
  715. sibling = self.get_next_tag(el)
  716. while not found and sibling:
  717. found = self.match_selectors(sibling, relation)
  718. sibling = self.get_next_tag(sibling)
  719. elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
  720. sibling = self.get_next_tag(el)
  721. if sibling and self.is_tag(sibling):
  722. found = self.match_selectors(sibling, relation)
  723. return found
  724. def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
  725. """Match relationship to other elements."""
  726. found = False
  727. if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
  728. return found
  729. if relation[0].rel_type.startswith(':'):
  730. found = self.match_future_relations(el, relation)
  731. else:
  732. found = self.match_past_relations(el, relation)
  733. return found
  734. def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
  735. """Match element's ID."""
  736. found = True
  737. for i in ids:
  738. if i != self.get_attribute_by_name(el, 'id', ''):
  739. found = False
  740. break
  741. return found
  742. def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
  743. """Match element's classes."""
  744. current_classes = self.get_classes(el)
  745. found = True
  746. for c in classes:
  747. if c not in current_classes:
  748. found = False
  749. break
  750. return found
  751. def match_root(self, el: bs4.Tag) -> bool:
  752. """Match element as root."""
  753. is_root = self.is_root(el)
  754. if is_root:
  755. sibling = self.get_previous(el) # type: Any
  756. while is_root and sibling is not None:
  757. if (
  758. self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
  759. self.is_cdata(sibling)
  760. ):
  761. is_root = False
  762. else:
  763. sibling = self.get_previous(sibling)
  764. if is_root:
  765. sibling = self.get_next(el)
  766. while is_root and sibling is not None:
  767. if (
  768. self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
  769. self.is_cdata(sibling)
  770. ):
  771. is_root = False
  772. else:
  773. sibling = self.get_next(sibling)
  774. return is_root
  775. def match_scope(self, el: bs4.Tag) -> bool:
  776. """Match element as scope."""
  777. return self.scope is el
  778. def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
  779. """Match tag type for `nth` matches."""
  780. return (
  781. (self.get_tag(child) == self.get_tag(el)) and
  782. (self.get_tag_ns(child) == self.get_tag_ns(el))
  783. )
    def match_nth(self, el: bs4.Tag, nth: tuple[ct.SelectorNth, ...]) -> bool:
        """
        Match `nth` elements.

        Each entry of `nth` describes a position test: the target position is
        `a * count + b` when the entry is variable (`n.n` is set), otherwise the
        fixed value `a`. Positions are counted from the end when `n.last` is set,
        only among siblings of the same type when `n.of_type` is set, and only among
        siblings matching `n.selectors` when that list is present.

        Returns `True` only when `el` satisfies every entry (or `nth` is empty).
        """

        matched = True

        for n in nth:
            # Each entry has to prove itself; start pessimistic.
            matched = False
            # With `of S`, the element itself must match `S` to be counted at all.
            if n.selectors and not self.match_selectors(el, n.selectors):
                break
            parent = self.get_parent(el)  # type: bs4.Tag | None
            if parent is None:
                # Parentless element: use the stand-in parent provided by `create_fake_parent`.
                parent = cast('bs4.Tag', self.create_fake_parent(el))
            last = n.last
            # NOTE(review): `len(parent)` presumably counts the parent's direct children - confirm.
            last_index = len(parent) - 1
            # Child scanning starts at the end for "last" variants, otherwise at the beginning.
            index = last_index if last else 0
            # Number of qualifying siblings seen so far (compared against `idx`).
            relative_index = 0
            a = n.a
            b = n.b
            var = n.n
            count = 0
            count_incr = 1
            # Direction in which `index` advances while scanning the children.
            factor = -1 if last else 1
            idx = last_idx = a * count + b if var else a

            # We can only adjust bounds within a variable index
            if var:
                # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
                # Otherwise, increment to try to get in bounds.
                adjust = None
                while idx < 1 or idx > last_index:
                    if idx < 0:
                        # Below the valid range: remember how far and try the next count.
                        diff_low = 0 - idx
                        if adjust is not None and adjust == 1:
                            # We previously overshot on the high side; flipping sides means no count fits.
                            break
                        adjust = -1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = 0 - idx
                        if diff >= diff_low:
                            # Not getting any closer to the valid range.
                            break
                    else:
                        # Above the valid range (or exactly 0): remember how far and try the next count.
                        diff_high = idx - last_index
                        if adjust is not None and adjust == -1:
                            # We previously undershot on the low side; flipping sides means no count fits.
                            break
                        adjust = 1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = idx - last_index
                        if diff >= diff_high:
                            # Not getting any closer to the valid range.
                            break
                        diff_high = diff

                # If a < 0, our count is working backwards, so floor the index by increasing the count.
                # Find the count that yields the lowest, in bound value and use that.
                # Lastly reverse count increment so that we'll increase our index.
                lowest = count
                if a < 0:
                    while idx >= 1:
                        lowest = count
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                    count_incr = -1
                count = lowest
                idx = last_idx = a * count + b if var else a

            # Evaluate elements while our calculated nth index is still in range
            while 1 <= idx <= last_index + 1:
                child = None  # type: bs4.element.PageElement | None
                # Evaluate while our child index is still in range.
                for child in self.get_children(parent, start=index, reverse=factor < 0):
                    # Track the scan position so the next pass resumes after this child.
                    index += factor
                    if not isinstance(child, bs4.Tag):
                        continue
                    # Handle `of S` in `nth-child`
                    if n.selectors and not self.match_selectors(child, n.selectors):
                        continue
                    # Handle `of-type`
                    if n.of_type and not self.match_nth_tag_type(el, child):
                        continue
                    relative_index += 1
                    if relative_index == idx:
                        if child is el:
                            matched = True
                        else:
                            # A different element occupies this position; move on to the next target index.
                            break
                    if child is el:
                        # Reached the element itself; later siblings cannot change the outcome.
                        break
                if child is el:
                    break
                last_idx = idx
                count += count_incr
                if count < 0:
                    # Count is counting down and has now ventured into invalid territory.
                    break
                idx = a * count + b if var else a
                if last_idx == idx:
                    # The index did not change (fixed index or `a == 0`), so there is nothing more to try.
                    break
            if not matched:
                break
        return matched
  879. def match_empty(self, el: bs4.Tag) -> bool:
  880. """Check if element is empty (if requested)."""
  881. is_empty = True
  882. for child in self.get_children(el):
  883. if self.is_tag(child):
  884. is_empty = False
  885. break
  886. elif self.is_content_string(child) and RE_NOT_EMPTY.search(child): # type: ignore[call-overload]
  887. is_empty = False
  888. break
  889. return is_empty
  890. def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
  891. """Match selectors."""
  892. match = True
  893. for sel in selectors:
  894. if not self.match_selectors(el, sel):
  895. match = False
  896. return match
  897. def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
  898. """Match element if it contains text."""
  899. match = True
  900. content = None # type: str | Sequence[str] | None
  901. for contain_list in contains:
  902. if content is None:
  903. if contain_list.own:
  904. content = self.get_own_text(el, no_iframe=self.is_html)
  905. else:
  906. content = self.get_text(el, no_iframe=self.is_html)
  907. found = False
  908. for text in contain_list.text:
  909. if contain_list.own:
  910. for c in content:
  911. if text in c:
  912. found = True
  913. break
  914. if found:
  915. break
  916. else:
  917. if text in content:
  918. found = True
  919. break
  920. if not found:
  921. match = False
  922. return match
  923. def match_default(self, el: bs4.Tag) -> bool:
  924. """Match default."""
  925. match = False
  926. # Find this input's form
  927. form = None # type: bs4.Tag | None
  928. parent = self.get_parent(el, no_iframe=True)
  929. while parent and form is None:
  930. if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
  931. form = parent
  932. else:
  933. parent = self.get_parent(parent, no_iframe=True)
  934. if form is not None:
  935. # Look in form cache to see if we've already located its default button
  936. found_form = False
  937. for f, t in self.cached_default_forms:
  938. if f is form:
  939. found_form = True
  940. if t is el:
  941. match = True
  942. break
  943. # We didn't have the form cached, so look for its default button
  944. if not found_form:
  945. for child in self.get_tag_descendants(form, no_iframe=True):
  946. name = self.get_tag(child)
  947. # Can't do nested forms (haven't figured out why we never hit this)
  948. if name == 'form': # pragma: no cover
  949. break
  950. if name in ('input', 'button'):
  951. v = self.get_attribute_by_name(child, 'type', '')
  952. if v and util.lower(v) == 'submit':
  953. self.cached_default_forms.append((form, child))
  954. if el is child:
  955. match = True
  956. break
  957. return match
  958. def match_indeterminate(self, el: bs4.Tag) -> bool:
  959. """Match default."""
  960. match = False
  961. name = cast(str, self.get_attribute_by_name(el, 'name'))
  962. def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
  963. """Find this input's form."""
  964. form = None
  965. parent = self.get_parent(el, no_iframe=True)
  966. while form is None:
  967. if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
  968. form = parent
  969. break
  970. last_parent = parent
  971. parent = self.get_parent(parent, no_iframe=True)
  972. if parent is None:
  973. form = last_parent
  974. break
  975. return form
  976. form = get_parent_form(el)
  977. # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
  978. if form is not None:
  979. found_form = False
  980. for f, n, i in self.cached_indeterminate_forms:
  981. if f is form and n == name:
  982. found_form = True
  983. if i is True:
  984. match = True
  985. break
  986. # We didn't have the form cached, so validate that the radio button is indeterminate
  987. if not found_form:
  988. checked = False
  989. for child in self.get_tag_descendants(form, no_iframe=True):
  990. if child is el:
  991. continue
  992. tag_name = self.get_tag(child)
  993. if tag_name == 'input':
  994. is_radio = False
  995. check = False
  996. has_name = False
  997. for k, v in self.iter_attributes(child):
  998. if util.lower(k) == 'type' and util.lower(v) == 'radio':
  999. is_radio = True
  1000. elif util.lower(k) == 'name' and v == name:
  1001. has_name = True
  1002. elif util.lower(k) == 'checked':
  1003. check = True
  1004. if is_radio and check and has_name and get_parent_form(child) is form:
  1005. checked = True
  1006. break
  1007. if checked:
  1008. break
  1009. if not checked:
  1010. match = True
  1011. self.cached_indeterminate_forms.append((form, name, match))
  1012. return match
    def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
        """
        Match languages.

        The element's language is resolved in this order:

        1. The closest non-empty `lang` attribute on the element or one of its ancestors
           (`lang` for HTML elements, `lang` in the XML namespace otherwise).
        2. A value previously stored for the same root in `cached_meta_lang`.
        3. For HTML documents, a `meta` tag under `html > head` whose `http-equiv` is
           `content-language`; its `content` value is used and cached.

        Once a language is known, every entry of `langs` must have at least one pattern
        accepted by `extended_language_filter`.
        """

        match = False
        has_ns = self.supports_namespaces()
        root = self.root
        has_html_namespace = self.has_html_namespace

        # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
        parent = el  # type: bs4.Tag | None
        found_lang = None
        last = None
        # An empty attribute value is falsy, so the walk keeps going when `lang` is empty.
        while not found_lang:
            has_html_ns = self.has_html_ns(parent)
            for k, v in self.iter_attributes(parent):
                attr_ns, attr = self.split_namespace(parent, k)
                if (
                    ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                    (
                        has_ns and not has_html_ns and attr_ns == NS_XML and
                        (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                    )
                ):
                    found_lang = v
                    break
            last = parent
            parent = self.get_parent(parent, no_iframe=self.is_html)

            if parent is None:
                # Ran out of ancestors: the last node visited becomes the root used for the steps below.
                root = last
                has_html_namespace = self.has_html_ns(root)
                parent = last
                break

        # Use cached meta language.
        if found_lang is None and self.cached_meta_lang:
            for cache in self.cached_meta_lang:
                # Cache entries are `(root, language)` pairs; roots are compared by identity.
                if root is not None and cast(str, root) is cache[0]:
                    found_lang = cache[1]

        # If we couldn't find a language, and the document is HTML, look to meta to determine language.
        if found_lang is None and (not self.is_xml or (has_html_namespace and root and root.name == 'html')):
            # Find head
            found = False
            # Descend one level per name: first into `html`, then into `head`.
            for tag in ('html', 'head'):
                found = False
                for child in self.get_tag_children(parent, no_iframe=self.is_html):
                    if self.get_tag(child) == tag and self.is_html_tag(child):
                        found = True
                        parent = child
                        break
                if not found:  # pragma: no cover
                    break

            # Search meta tags
            if found and parent is not None:
                for child2 in parent:
                    # NOTE(review): the HTML check is made on `parent` (the `head`), not on `child2` - confirm intent.
                    if isinstance(child2, bs4.Tag) and self.get_tag(child2) == 'meta' and self.is_html_tag(parent):
                        c_lang = False
                        content = None
                        for k, v in self.iter_attributes(child2):
                            if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                                c_lang = True
                            if util.lower(k) == 'content':
                                content = v
                            # Checked after each attribute, so attribute order does not matter.
                            if c_lang and content:
                                found_lang = content
                                self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                                break
                    if found_lang is not None:
                        break
            if found_lang is None:
                # Remember the miss as an empty language so the document is not searched again.
                self.cached_meta_lang.append((cast(str, root), ''))

        # If we determined a language, compare.
        if found_lang is not None:
            for patterns in langs:
                match = False
                for pattern in patterns:
                    if self.extended_language_filter(pattern, cast(str, found_lang)):
                        match = True
                if not match:
                    break

        return match
    def match_dir(self, el: bs4.Tag | None, directionality: int) -> bool:
        """
        Check directionality.

        `directionality` carries the requested `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL` flag.
        The element's direction is taken from its `dir` attribute (via `DIR_MAP`), from
        its text where automatic detection applies, or from its parent by recursing.
        A missing element or a non-HTML element never matches.
        """

        # If we have to match both left and right, we can't match either.
        if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
            return False

        if el is None or not self.is_html_tag(el):
            return False

        # Element has defined direction of left to right or right to left
        # NOTE(review): a mapped value of `0` is handled below as automatic detection,
        # presumably `dir="auto"` - confirm against `DIR_MAP`.
        direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
        if direction not in (None, 0):
            return direction == directionality

        # Element is the document element (the root) and no direction assigned, assume left to right.
        is_root = self.is_root(el)
        if is_root and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # If `input[type=tel]` and no direction is assigned, assume left to right.
        name = self.get_tag(el)
        is_input = name == 'input'
        is_textarea = name == 'textarea'
        is_bdi = name == 'bdi'
        itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
        if is_input and itype == 'tel' and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # Auto handling for text inputs
        if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
            if is_textarea:
                # A `textarea` is judged by its content strings, an `input` by its `value` attribute.
                value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node))  # type: ignore[misc]
            else:
                value = cast(str, self.get_attribute_by_name(el, 'value', ''))
            if value:
                # The first character with a strong bidirectional class decides the direction.
                for c in value:
                    bidi = unicodedata.bidirectional(c)
                    if bidi in ('AL', 'R', 'L'):
                        direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                        return direction == directionality
                # Assume left to right
                return ct.SEL_DIR_LTR == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            # No text to inspect: defer to the parent's direction.
            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

        # Auto handling for `bdi` and other non text inputs.
        if (is_bdi and direction is None) or direction == 0:
            direction = self.find_bidi(el)
            if direction is not None:
                return direction == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

        # Match parents direction
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
  1140. def match_range(self, el: bs4.Tag, condition: int) -> bool:
  1141. """
  1142. Match range.
  1143. Behavior is modeled after what we see in browsers. Browsers seem to evaluate
  1144. if the value is out of range, and if not, it is in range. So a missing value
  1145. will not evaluate out of range; therefore, value is in range. Personally, I
  1146. feel like this should evaluate as neither in or out of range.
  1147. """
  1148. out_of_range = False
  1149. itype = util.lower(self.get_attribute_by_name(el, 'type'))
  1150. mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
  1151. mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
  1152. # There is no valid min or max, so we cannot evaluate a range
  1153. if mn is None and mx is None:
  1154. return False
  1155. value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
  1156. if value is not None:
  1157. if itype in ("date", "datetime-local", "month", "week", "number", "range"):
  1158. if mn is not None and value < mn:
  1159. out_of_range = True
  1160. if not out_of_range and mx is not None and value > mx:
  1161. out_of_range = True
  1162. elif itype == "time":
  1163. if mn is not None and mx is not None and mn > mx:
  1164. # Time is periodic, so this is a reversed/discontinuous range
  1165. if value < mn and value > mx:
  1166. out_of_range = True
  1167. else:
  1168. if mn is not None and value < mn:
  1169. out_of_range = True
  1170. if not out_of_range and mx is not None and value > mx:
  1171. out_of_range = True
  1172. return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
  1173. def match_defined(self, el: bs4.Tag) -> bool:
  1174. """
  1175. Match defined.
  1176. `:defined` is related to custom elements in a browser.
  1177. - If the document is XML (not XHTML), all tags will match.
  1178. - Tags that are not custom (don't have a hyphen) are marked defined.
  1179. - If the tag has a prefix (without or without a namespace), it will not match.
  1180. This is of course requires the parser to provide us with the proper prefix and namespace info,
  1181. if it doesn't, there is nothing we can do.
  1182. """
  1183. name = self.get_tag(el)
  1184. return (
  1185. name is not None and (
  1186. name.find('-') == -1 or
  1187. name.find(':') != -1 or
  1188. self.get_prefix(el) is not None
  1189. )
  1190. )
  1191. def match_placeholder_shown(self, el: bs4.Tag) -> bool:
  1192. """
  1193. Match placeholder shown according to HTML spec.
  1194. - text area should be checked if they have content. A single newline does not count as content.
  1195. """
  1196. match = False
  1197. content = self.get_text(el)
  1198. if content in ('', '\n'):
  1199. match = True
  1200. return match
  1201. def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
  1202. """Check if element matches one of the selectors."""
  1203. match = False
  1204. is_not = selectors.is_not
  1205. is_html = selectors.is_html
  1206. # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
  1207. if is_html:
  1208. namespaces = self.namespaces
  1209. iframe_restrict = self.iframe_restrict
  1210. self.namespaces = {'html': NS_XHTML}
  1211. self.iframe_restrict = True
  1212. if not is_html or self.is_html:
  1213. for selector in selectors:
  1214. match = is_not
  1215. # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
  1216. if isinstance(selector, ct.SelectorNull):
  1217. continue
  1218. # Verify tag matches
  1219. if not self.match_tag(el, selector.tag):
  1220. continue
  1221. # Verify tag is defined
  1222. if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
  1223. continue
  1224. # Verify element is root
  1225. if selector.flags & ct.SEL_ROOT and not self.match_root(el):
  1226. continue
  1227. # Verify element is scope
  1228. if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
  1229. continue
  1230. # Verify element has placeholder shown
  1231. if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
  1232. continue
  1233. # Verify `nth` matches
  1234. if not self.match_nth(el, selector.nth):
  1235. continue
  1236. if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
  1237. continue
  1238. # Verify id matches
  1239. if selector.ids and not self.match_id(el, selector.ids):
  1240. continue
  1241. # Verify classes match
  1242. if selector.classes and not self.match_classes(el, selector.classes):
  1243. continue
  1244. # Verify attribute(s) match
  1245. if not self.match_attributes(el, selector.attributes):
  1246. continue
  1247. # Verify ranges
  1248. if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
  1249. continue
  1250. # Verify language patterns
  1251. if selector.lang and not self.match_lang(el, selector.lang):
  1252. continue
  1253. # Verify pseudo selector patterns
  1254. if selector.selectors and not self.match_subselectors(el, selector.selectors):
  1255. continue
  1256. # Verify relationship selectors
  1257. if selector.relation and not self.match_relations(el, selector.relation):
  1258. continue
  1259. # Validate that the current default selector match corresponds to the first submit button in the form
  1260. if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
  1261. continue
  1262. # Validate that the unset radio button is among radio buttons with the same name in a form that are
  1263. # also not set.
  1264. if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
  1265. continue
  1266. # Validate element directionality
  1267. if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
  1268. continue
  1269. # Validate that the tag contains the specified text.
  1270. if selector.contains and not self.match_contains(el, selector.contains):
  1271. continue
  1272. match = not is_not
  1273. break
  1274. # Restore actual namespaces being used for external selector lists
  1275. if is_html:
  1276. self.namespaces = namespaces
  1277. self.iframe_restrict = iframe_restrict
  1278. return match
  1279. def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
  1280. """Match all tags under the targeted tag."""
  1281. lim = None if limit < 1 else limit
  1282. for child in self.get_tag_descendants(self.tag):
  1283. if self.match(child):
  1284. yield child
  1285. if lim is not None:
  1286. lim -= 1
  1287. if lim < 1:
  1288. break
  1289. def closest(self) -> bs4.Tag | None:
  1290. """Match closest ancestor."""
  1291. current = self.tag # type: bs4.Tag | None
  1292. closest = None
  1293. while closest is None and current is not None:
  1294. if self.match(current):
  1295. closest = current
  1296. else:
  1297. current = self.get_parent(current)
  1298. return closest
  1299. def filter(self) -> list[bs4.Tag]: # noqa A001
  1300. """Filter tag's children."""
  1301. return [
  1302. tag for tag in self.get_contents(self.tag)
  1303. if isinstance(tag, bs4.Tag) and self.match(tag)
  1304. ]
  1305. def match(self, el: bs4.Tag) -> bool:
  1306. """Match."""
  1307. return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
  1308. class SoupSieve(ct.Immutable):
  1309. """Compiled Soup Sieve selector matching object."""
  1310. pattern: str
  1311. selectors: ct.SelectorList
  1312. namespaces: ct.Namespaces | None
  1313. custom: dict[str, str]
  1314. flags: int
  1315. __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
  1316. def __init__(
  1317. self,
  1318. pattern: str,
  1319. selectors: ct.SelectorList,
  1320. namespaces: ct.Namespaces | None,
  1321. custom: ct.CustomSelectors | None,
  1322. flags: int
  1323. ):
  1324. """Initialize."""
  1325. super().__init__(
  1326. pattern=pattern,
  1327. selectors=selectors,
  1328. namespaces=namespaces,
  1329. custom=custom,
  1330. flags=flags
  1331. )
  1332. def match(self, tag: bs4.Tag) -> bool:
  1333. """Match."""
  1334. return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
  1335. def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
  1336. """Match closest ancestor."""
  1337. return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
  1338. def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
  1339. """
  1340. Filter.
  1341. `CSSMatch` can cache certain searches for tags of the same document,
  1342. so if we are given a tag, all tags are from the same document,
  1343. and we can take advantage of the optimization.
  1344. Any other kind of iterable could have tags from different documents or detached tags,
  1345. so for those, we use a new `CSSMatch` for each item in the iterable.
  1346. """
  1347. if isinstance(iterable, bs4.Tag):
  1348. return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
  1349. else:
  1350. return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
  1351. def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
  1352. """Select a single tag."""
  1353. tags = self.select(tag, limit=1)
  1354. return tags[0] if tags else None
  1355. def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
  1356. """Select the specified tags."""
  1357. return list(self.iselect(tag, limit))
  1358. def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
  1359. """Iterate the specified tags."""
  1360. yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)
  1361. def __repr__(self) -> str: # pragma: no cover
  1362. """Representation."""
  1363. return (
  1364. f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
  1365. f"custom={self.custom!r}, flags={self.flags!r})"
  1366. )
  1367. __str__ = __repr__
  1368. ct.pickle_register(SoupSieve)