| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211 |
- from __future__ import annotations
- # Use of this source code is governed by the MIT license.
- __license__ = "MIT"
- import re
- import warnings
- from bs4.css import CSS
- from bs4._deprecation import (
- _deprecated,
- _deprecated_alias,
- _deprecated_function_alias,
- )
- from bs4.formatter import (
- Formatter,
- HTMLFormatter,
- XMLFormatter,
- )
- from bs4._warnings import AttributeResemblesVariableWarning
- from typing import (
- Any,
- Callable,
- Dict,
- Generic,
- Iterable,
- Iterator,
- List,
- Mapping,
- MutableSequence,
- Optional,
- Pattern,
- Set,
- TYPE_CHECKING,
- Tuple,
- Type,
- TypeVar,
- Union,
- cast,
- overload,
- )
- from typing_extensions import (
- Self,
- TypeAlias,
- )
- if TYPE_CHECKING:
- from bs4 import BeautifulSoup
- from bs4.builder import TreeBuilder
- from bs4.filter import ElementFilter
- from bs4.formatter import (
- _EntitySubstitutionFunction,
- _FormatterOrName,
- )
- from bs4._typing import (
- _AtMostOneElement,
- _AtMostOneTag,
- _AtMostOneNavigableString,
- _AttributeValue,
- _AttributeValues,
- _Encoding,
- _InsertableElement,
- _OneElement,
- _QueryResults,
- _RawOrProcessedAttributeValues,
- _StrainableElement,
- _StrainableAttribute,
- _StrainableAttributes,
- _StrainableString,
- _SomeNavigableStrings,
- _SomeTags,
- )
- _OneOrMoreStringTypes: TypeAlias = Union[
- Type["NavigableString"], Iterable[Type["NavigableString"]]
- ]
- _FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]]
- # Deprecated module-level attributes.
- # See https://peps.python.org/pep-0562/
- _deprecated_names = dict(
- whitespace_re="The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy."
- )
- #: :meta private:
- _deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")
- def __getattr__(name: str) -> Any:
- if name in _deprecated_names:
- message = _deprecated_names[name]
- warnings.warn(message.format(name=name), DeprecationWarning, stacklevel=2)
- return globals()[f"_deprecated_{name}"]
- raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
- #: Documents output by Beautiful Soup will be encoded with
- #: this encoding unless you specify otherwise.
- DEFAULT_OUTPUT_ENCODING: str = "utf-8"
- #: A regular expression that can be used to split on whitespace.
- nonwhitespace_re: Pattern[str] = re.compile(r"\S+")
- #: These encodings are recognized by Python (so `Tag.encode`
- #: could theoretically support them) but XML and HTML don't recognize
- #: them (so they should not show up in an XML or HTML document as that
- #: document's encoding).
- #:
- #: If an XML document is encoded in one of these encodings, no encoding
- #: will be mentioned in the XML declaration. If an HTML document is
- #: encoded in one of these encodings, and the HTML document has a
- #: <meta> tag that mentions an encoding, the encoding will be given as
- #: the empty string.
- #:
- #: Source:
- #: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
- PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = set(
- [
- "idna",
- "mbcs",
- "oem",
- "palmos",
- "punycode",
- "raw_unicode_escape",
- "undefined",
- "unicode_escape",
- "raw-unicode-escape",
- "unicode-escape",
- "string-escape",
- "string_escape",
- ]
- )
- class NamespacedAttribute(str):
- """A namespaced attribute (e.g. the 'xml:lang' in 'xml:lang="en"')
- which remembers the namespace prefix ('xml') and the name ('lang')
- that were used to create it.
- """
- prefix: Optional[str]
- name: Optional[str]
- namespace: Optional[str]
- def __new__(
- cls,
- prefix: Optional[str],
- name: Optional[str] = None,
- namespace: Optional[str] = None,
- ) -> Self:
- if not name:
- # This is the default namespace. Its name "has no value"
- # per https://www.w3.org/TR/xml-names/#defaulting
- name = None
- if not name:
- obj = str.__new__(cls, prefix)
- elif not prefix:
- # Not really namespaced.
- obj = str.__new__(cls, name)
- else:
- obj = str.__new__(cls, prefix + ":" + name)
- obj.prefix = prefix
- obj.name = name
- obj.namespace = namespace
- return obj
- class AttributeValueWithCharsetSubstitution(str):
- """An abstract class standing in for a character encoding specified
- inside an HTML ``<meta>`` tag.
- Subclasses exist for each place such a character encoding might be
- found: either inside the ``charset`` attribute
- (`CharsetMetaAttributeValue`) or inside the ``content`` attribute
- (`ContentMetaAttributeValue`)
- This allows Beautiful Soup to replace that part of the HTML file
- with a different encoding when ouputting a tree as a string.
- """
- # The original, un-encoded value of the ``content`` attribute.
- #: :meta private:
- original_value: str
- def substitute_encoding(self, eventual_encoding: str) -> str:
- """Do whatever's necessary in this implementation-specific
- portion an HTML document to substitute in a specific encoding.
- """
- raise NotImplementedError()
- class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
- """A generic stand-in for the value of a ``<meta>`` tag's ``charset``
- attribute.
- When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
- value of the ``charset`` attribute will become one of these objects.
- If the document is later encoded to an encoding other than UTF-8, its
- ``<meta>`` tag will mention the new encoding instead of ``utf8``.
- """
- def __new__(cls, original_value: str) -> Self:
- # We don't need to use the original value for anything, but
- # it might be useful for the user to know.
- obj = str.__new__(cls, original_value)
- obj.original_value = original_value
- return obj
- def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
- """When an HTML document is being encoded to a given encoding, the
- value of a ``<meta>`` tag's ``charset`` becomes the name of
- the encoding.
- """
- if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
- return ""
- return eventual_encoding
- class AttributeValueList(List[str]):
- """Class for the list used to hold the values of attributes which
- have multiple values (such as HTML's 'class'). It's just a regular
- list, but you can subclass it and pass it in to the TreeBuilder
- constructor as attribute_value_list_class, to have your subclass
- instantiated instead.
- """
- class AttributeDict(Dict[Any,Any]):
- """Superclass for the dictionary used to hold a tag's
- attributes. You can use this, but it's just a regular dict with no
- special logic.
- """
- class XMLAttributeDict(AttributeDict):
- """A dictionary for holding a Tag's attributes, which processes
- incoming values for consistency with the HTML spec.
- """
- def __setitem__(self, key: str, value: Any) -> None:
- """Set an attribute value, possibly modifying it to comply with
- the XML spec.
- This just means converting common non-string values to
- strings: XML attributes may have "any literal string as a
- value."
- """
- if value is None:
- value = ""
- if isinstance(value, bool):
- # XML does not define any rules for boolean attributes.
- # Preserve the old Beautiful Soup behavior (a bool that
- # gets converted to a string on output) rather than
- # guessing what the value should be.
- pass
- elif isinstance(value, (int, float)):
- # It's dangerous to convert _every_ attribute value into a
- # plain string, since an attribute value may be a more
- # sophisticated string-like object
- # (e.g. CharsetMetaAttributeValue). But we can definitely
- # convert numeric values and booleans, which are the most common.
- value = str(value)
- super().__setitem__(key, value)
- class HTMLAttributeDict(AttributeDict):
- """A dictionary for holding a Tag's attributes, which processes
- incoming values for consistency with the HTML spec, which says
- 'Attribute values are a mixture of text and character
- references...'
- Basically, this means converting common non-string values into
- strings, like XMLAttributeDict, though HTML also has some rules
- around boolean attributes that XML doesn't have.
- """
- def __setitem__(self, key: str, value: Any) -> None:
- """Set an attribute value, possibly modifying it to comply
- with the HTML spec,
- """
- if value in (False, None):
- # 'The values "true" and "false" are not allowed on
- # boolean attributes. To represent a false value, the
- # attribute has to be omitted altogether.'
- if key in self:
- del self[key]
- return
- if isinstance(value, bool):
- # 'If the [boolean] attribute is present, its value must
- # either be the empty string or a value that is an ASCII
- # case-insensitive match for the attribute's canonical
- # name, with no leading or trailing whitespace.'
- #
- # [fixme] It's not clear to me whether "canonical name"
- # means fully-qualified name, unqualified name, or
- # (probably not) name with namespace prefix. For now I'm
- # going with unqualified name.
- if isinstance(key, NamespacedAttribute):
- value = key.name
- else:
- value = key
- elif isinstance(value, (int, float)):
- # See note in XMLAttributeDict for the reasoning why we
- # only do this to numbers.
- value = str(value)
- super().__setitem__(key, value)
- class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
- """A generic stand-in for the value of a ``<meta>`` tag's ``content``
- attribute.
- When Beautiful Soup parses the markup:
- ``<meta http-equiv="content-type" content="text/html; charset=utf8">``
- The value of the ``content`` attribute will become one of these objects.
- If the document is later encoded to an encoding other than UTF-8, its
- ``<meta>`` tag will mention the new encoding instead of ``utf8``.
- """
- #: Match the 'charset' argument inside the 'content' attribute
- #: of a <meta> tag.
- #: :meta private:
- CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
- def __new__(cls, original_value: str) -> Self:
- cls.CHARSET_RE.search(original_value)
- obj = str.__new__(cls, original_value)
- obj.original_value = original_value
- return obj
- def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
- """When an HTML document is being encoded to a given encoding, the
- value of the ``charset=`` in a ``<meta>`` tag's ``content`` becomes
- the name of the encoding.
- """
- if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
- return self.CHARSET_RE.sub("", self.original_value)
- def rewrite(match: re.Match[str]) -> str:
- return match.group(1) + eventual_encoding
- return self.CHARSET_RE.sub(rewrite, self.original_value)
- class PageElement(object):
- """An abstract class representing a single element in the parse tree.
- `NavigableString`, `Tag`, etc. are all subclasses of
- `PageElement`. For this reason you'll see a lot of methods that
- return `PageElement`, but you'll never see an actual `PageElement`
- object. For the most part you can think of `PageElement` as
- meaning "a `Tag` or a `NavigableString`."
- """
- #: In general, we can't tell just by looking at an element whether
- #: it's contained in an XML document or an HTML document. But for
- #: `Tag` objects (q.v.) we can store this information at parse time.
- #: :meta private:
- known_xml: Optional[bool] = None
- #: Whether or not this element has been decomposed from the tree
- #: it was created in.
- _decomposed: bool
- parent: Optional[Tag]
- next_element: _AtMostOneElement
- previous_element: _AtMostOneElement
- next_sibling: _AtMostOneElement
- previous_sibling: _AtMostOneElement
- #: Whether or not this element is hidden from generated output.
- #: Only the `BeautifulSoup` object itself is hidden.
- hidden: bool = False
- def setup(
- self,
- parent: Optional[Tag] = None,
- previous_element: _AtMostOneElement = None,
- next_element: _AtMostOneElement = None,
- previous_sibling: _AtMostOneElement = None,
- next_sibling: _AtMostOneElement = None,
- ) -> None:
- """Sets up the initial relations between this element and
- other elements.
- :param parent: The parent of this element.
- :param previous_element: The element parsed immediately before
- this one.
- :param next_element: The element parsed immediately after
- this one.
- :param previous_sibling: The most recently encountered element
- on the same level of the parse tree as this one.
- :param previous_sibling: The next element to be encountered
- on the same level of the parse tree as this one.
- """
- self.parent = parent
- self.previous_element = previous_element
- if self.previous_element is not None:
- self.previous_element.next_element = self
- self.next_element = next_element
- if self.next_element is not None:
- self.next_element.previous_element = self
- self.next_sibling = next_sibling
- if self.next_sibling is not None:
- self.next_sibling.previous_sibling = self
- if (
- previous_sibling is None
- and self.parent is not None
- and self.parent.contents
- ):
- previous_sibling = self.parent.contents[-1]
- self.previous_sibling = previous_sibling
- if self.previous_sibling is not None:
- self.previous_sibling.next_sibling = self
- def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
- """Format the given string using the given formatter.
- :param s: A string.
- :param formatter: A Formatter object, or a string naming one of the standard formatters.
- """
- if formatter is None:
- return s
- if not isinstance(formatter, Formatter):
- formatter = self.formatter_for_name(formatter)
- output = formatter.substitute(s)
- return output
- def formatter_for_name(
- self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
- ) -> Formatter:
- """Look up or create a Formatter for the given identifier,
- if necessary.
- :param formatter: Can be a `Formatter` object (used as-is), a
- function (used as the entity substitution hook for an
- `bs4.formatter.XMLFormatter` or
- `bs4.formatter.HTMLFormatter`), or a string (used to look
- up an `bs4.formatter.XMLFormatter` or
- `bs4.formatter.HTMLFormatter` in the appropriate registry.
- """
- if isinstance(formatter_name, Formatter):
- return formatter_name
- c: type[Formatter]
- registry: Mapping[Optional[str], Formatter]
- if self._is_xml:
- c = XMLFormatter
- registry = XMLFormatter.REGISTRY
- else:
- c = HTMLFormatter
- registry = HTMLFormatter.REGISTRY
- if callable(formatter_name):
- return c(entity_substitution=formatter_name)
- return registry[formatter_name]
- @property
- def _is_xml(self) -> bool:
- """Is this element part of an XML tree or an HTML tree?
- This is used in formatter_for_name, when deciding whether an
- XMLFormatter or HTMLFormatter is more appropriate. It can be
- inefficient, but it should be called very rarely.
- """
- if self.known_xml is not None:
- # Most of the time we will have determined this when the
- # document is parsed.
- return self.known_xml
- # Otherwise, it's likely that this element was created by
- # direct invocation of the constructor from within the user's
- # Python code.
- if self.parent is None:
- # This is the top-level object. It should have .known_xml set
- # from tree creation. If not, take a guess--BS is usually
- # used on HTML markup.
- return getattr(self, "is_xml", False)
- return self.parent._is_xml
- nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
- previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
- def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
- raise NotImplementedError()
- def __copy__(self) -> Self:
- """A copy of a PageElement can only be a deep copy, because
- only one PageElement can occupy a given place in a parse tree.
- """
- return self.__deepcopy__({})
- default: Iterable[type[NavigableString]] = tuple() #: :meta private:
- def _all_strings(
- self, strip: bool = False, types: Iterable[type[NavigableString]] = default
- ) -> Iterator[str]:
- """Yield all strings of certain classes, possibly stripping them.
- This is implemented differently in `Tag` and `NavigableString`.
- """
- raise NotImplementedError()
- @property
- def stripped_strings(self) -> Iterator[str]:
- """Yield all interesting strings in this PageElement, stripping them
- first.
- See `Tag` for information on which strings are considered
- interesting in a given context.
- """
- for string in self._all_strings(True):
- yield string
- def get_text(
- self,
- separator: str = "",
- strip: bool = False,
- types: Iterable[Type[NavigableString]] = default,
- ) -> str:
- """Get all child strings of this PageElement, concatenated using the
- given separator.
- :param separator: Strings will be concatenated using this separator.
- :param strip: If True, strings will be stripped before being
- concatenated.
- :param types: A tuple of NavigableString subclasses. Any
- strings of a subclass not found in this list will be
- ignored. Although there are exceptions, the default
- behavior in most cases is to consider only NavigableString
- and CData objects. That means no comments, processing
- instructions, etc.
- :return: A string.
- """
- return separator.join([s for s in self._all_strings(strip, types=types)])
- getText = get_text
- text = property(get_text)
- def replace_with(self, *args: _InsertableElement) -> Self:
- """Replace this `PageElement` with one or more other elements,
- objects, keeping the rest of the tree the same.
- :return: This `PageElement`, no longer part of the tree.
- """
- if self.parent is None:
- raise ValueError(
- "Cannot replace one element with another when the "
- "element to be replaced is not part of a tree."
- )
- if len(args) == 1 and args[0] is self:
- # Replacing an element with itself is a no-op.
- return self
- if any(x is self.parent for x in args):
- raise ValueError("Cannot replace a Tag with its parent.")
- old_parent = self.parent
- my_index = self.parent.index(self)
- self.extract(_self_index=my_index)
- for idx, replace_with in enumerate(args, start=my_index):
- old_parent.insert(idx, replace_with)
- return self
- replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
- def wrap(self, wrap_inside: Tag) -> Tag:
- """Wrap this `PageElement` inside a `Tag`.
- :return: ``wrap_inside``, occupying the position in the tree that used
- to be occupied by this object, and with this object now inside it.
- """
- me = self.replace_with(wrap_inside)
- wrap_inside.append(me)
- return wrap_inside
- def extract(self, _self_index: Optional[int] = None) -> Self:
- """Destructively rips this element out of the tree.
- :param _self_index: The location of this element in its parent's
- .contents, if known. Passing this in allows for a performance
- optimization.
- :return: this `PageElement`, no longer part of the tree.
- """
- if self.parent is not None:
- if _self_index is None:
- _self_index = self.parent.index(self)
- del self.parent.contents[_self_index]
- # Find the two elements that would be next to each other if
- # this element (and any children) hadn't been parsed. Connect
- # the two.
- last_child = self._last_descendant()
- # last_child can't be None because we passed accept_self=True
- # into _last_descendant. Worst case, last_child will be
- # self. Making this cast removes several mypy complaints later
- # on as we manipulate last_child.
- last_child = cast(PageElement, last_child)
- next_element = last_child.next_element
- if self.previous_element is not None:
- if self.previous_element is not next_element:
- self.previous_element.next_element = next_element
- if next_element is not None and next_element is not self.previous_element:
- next_element.previous_element = self.previous_element
- self.previous_element = None
- last_child.next_element = None
- self.parent = None
- if (
- self.previous_sibling is not None
- and self.previous_sibling is not self.next_sibling
- ):
- self.previous_sibling.next_sibling = self.next_sibling
- if (
- self.next_sibling is not None
- and self.next_sibling is not self.previous_sibling
- ):
- self.next_sibling.previous_sibling = self.previous_sibling
- self.previous_sibling = self.next_sibling = None
- return self
- def decompose(self) -> None:
- """Recursively destroys this `PageElement` and its children.
- The element will be removed from the tree and wiped out; so
- will everything beneath it.
- The behavior of a decomposed `PageElement` is undefined and you
- should never use one for anything, but if you need to *check*
- whether an element has been decomposed, you can use the
- `PageElement.decomposed` property.
- """
- self.extract()
- e: _AtMostOneElement = self
- next_up: _AtMostOneElement = None
- while e is not None:
- next_up = e.next_element
- e.__dict__.clear()
- if isinstance(e, Tag):
- e.name = ""
- e.contents = []
- e._decomposed = True
- e = next_up
- def _last_descendant(
- self, is_initialized: bool = True, accept_self: bool = True
- ) -> _AtMostOneElement:
- """Finds the last element beneath this object to be parsed.
- Special note to help you figure things out if your type
- checking is tripped up by the fact that this method returns
- _AtMostOneElement instead of PageElement: the only time
- this method returns None is if `accept_self` is False and the
- `PageElement` has no children--either it's a NavigableString
- or an empty Tag.
- :param is_initialized: Has `PageElement.setup` been called on
- this `PageElement` yet?
- :param accept_self: Is ``self`` an acceptable answer to the
- question?
- """
- if is_initialized and self.next_sibling is not None:
- last_child = self.next_sibling.previous_element
- else:
- last_child = self
- while isinstance(last_child, Tag) and last_child.contents:
- last_child = last_child.contents[-1]
- if not accept_self and last_child is self:
- last_child = None
- return last_child
- _lastRecursiveChild = _deprecated_alias(
- "_lastRecursiveChild", "_last_descendant", "4.0.0"
- )
- def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
- """Makes the given element(s) the immediate predecessor of this one.
- All the elements will have the same `PageElement.parent` as
- this one, and the given elements will occur immediately before
- this one.
- :param args: One or more PageElements.
- :return The list of PageElements that were inserted.
- """
- parent = self.parent
- if parent is None:
- raise ValueError("Element has no parent, so 'before' has no meaning.")
- if any(x is self for x in args):
- raise ValueError("Can't insert an element before itself.")
- results: List[PageElement] = []
- for predecessor in args:
- # Extract first so that the index won't be screwed up if they
- # are siblings.
- if isinstance(predecessor, PageElement):
- predecessor.extract()
- index = parent.index(self)
- results.extend(parent.insert(index, predecessor))
- return results
- def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
- """Makes the given element(s) the immediate successor of this one.
- The elements will have the same `PageElement.parent` as this
- one, and the given elements will occur immediately after this
- one.
- :param args: One or more PageElements.
- :return The list of PageElements that were inserted.
- """
- # Do all error checking before modifying the tree.
- parent = self.parent
- if parent is None:
- raise ValueError("Element has no parent, so 'after' has no meaning.")
- if any(x is self for x in args):
- raise ValueError("Can't insert an element after itself.")
- offset = 0
- results: List[PageElement] = []
- for successor in args:
- # Extract first so that the index won't be screwed up if they
- # are siblings.
- if isinstance(successor, PageElement):
- successor.extract()
- index = parent.index(self)
- results.extend(parent.insert(index + 1 + offset, successor))
- offset += 1
- return results
- # For the suppression of this pyright warning, see discussion here:
- # https://github.com/microsoft/pyright/issues/10929
- @overload
- def find_next( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None=None,
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneTag:
- ...
- @overload
- def find_next(
- self,
- name: None=None,
- attrs: None=None,
- string: _StrainableString="",
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneNavigableString:
- ...
- def find_next(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- **kwargs: _StrainableAttribute,
- ) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
- """Find the first PageElement that matches the given criteria and
- appears later in the document than this PageElement.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a NavigableString with specific text.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
- findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
- @overload
- def find_all_next( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeTags:
- ...
- @overload
- def find_all_next(
- self,
- name: None = None,
- attrs: None = None,
- string: _StrainableString = "",
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeNavigableStrings:
- ...
- def find_all_next(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
- """Find all `PageElement` objects that match the given criteria and
- appear later in the document than this `PageElement`.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a NavigableString with specific text.
- :param limit: Stop looking after finding this many results.
- :param _stacklevel: Used internally to improve warning messages.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_all(
- name,
- attrs,
- string,
- limit,
- self.next_elements,
- _stacklevel=_stacklevel + 1,
- **kwargs,
- )
- findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
- @overload
- def find_next_sibling( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None=None,
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneTag:
- ...
- @overload
- def find_next_sibling(
- self,
- name: None=None,
- attrs: None=None,
- string: _StrainableString="",
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneNavigableString:
- ...
- def find_next_sibling(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- **kwargs: _StrainableAttribute,
- ) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
- """Find the closest sibling to this PageElement that matches the
- given criteria and appears later in the document.
- All find_* methods take a common set of arguments. See the
- online documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a `NavigableString` with specific text.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs)
- findNextSibling = _deprecated_function_alias(
- "findNextSibling", "find_next_sibling", "4.0.0"
- )
- @overload
- def find_next_siblings( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeTags:
- ...
- @overload
- def find_next_siblings(
- self,
- name: None = None,
- attrs: None = None,
- string: _StrainableString = "",
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeNavigableStrings:
- ...
- def find_next_siblings(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
- """Find all siblings of this `PageElement` that match the given criteria
- and appear later in the document.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a `NavigableString` with specific text.
- :param limit: Stop looking after finding this many results.
- :param _stacklevel: Used internally to improve warning messages.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_all(
- name,
- attrs,
- string,
- limit,
- self.next_siblings,
- _stacklevel=_stacklevel + 1,
- **kwargs,
- )
- findNextSiblings = _deprecated_function_alias(
- "findNextSiblings", "find_next_siblings", "4.0.0"
- )
- fetchNextSiblings = _deprecated_function_alias(
- "fetchNextSiblings", "find_next_siblings", "3.0.0"
- )
- @overload
- def find_previous( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None=None,
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneTag:
- ...
- @overload
- def find_previous(
- self,
- name: None=None,
- attrs: None=None,
- string: _StrainableString="",
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneNavigableString:
- ...
- def find_previous(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- **kwargs: _StrainableAttribute,
- ) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
- """Look backwards in the document from this `PageElement` and find the
- first `PageElement` that matches the given criteria.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a `NavigableString` with specific text.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_one(self.find_all_previous, name, attrs, string, **kwargs)
- findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
- @overload
- def find_all_previous( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeTags:
- ...
- @overload
- def find_all_previous(
- self,
- name: None = None,
- attrs: None = None,
- string: _StrainableString = "",
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeNavigableStrings:
- ...
- def find_all_previous(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
- """Look backwards in the document from this `PageElement` and find all
- `PageElement` that match the given criteria.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a `NavigableString` with specific text.
- :param limit: Stop looking after finding this many results.
- :param _stacklevel: Used internally to improve warning messages.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_all(
- name,
- attrs,
- string,
- limit,
- self.previous_elements,
- _stacklevel=_stacklevel + 1,
- **kwargs,
- )
- findAllPrevious = _deprecated_function_alias(
- "findAllPrevious", "find_all_previous", "4.0.0"
- )
- fetchAllPrevious = _deprecated_function_alias(
- "fetchAllPrevious", "find_all_previous", "3.0.0"
- )
- @overload
- def find_previous_sibling( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None=None,
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneTag:
- ...
- @overload
- def find_previous_sibling(
- self,
- name: None=None,
- attrs: None=None,
- string: _StrainableString="",
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneNavigableString:
- ...
- def find_previous_sibling(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- **kwargs: _StrainableAttribute,
- ) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
- """Returns the closest sibling to this `PageElement` that matches the
- given criteria and appears earlier in the document.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a `NavigableString` with specific text.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_one(
- self.find_previous_siblings, name, attrs, string, **kwargs
- )
- findPreviousSibling = _deprecated_function_alias(
- "findPreviousSibling", "find_previous_sibling", "4.0.0"
- )
- @overload
- def find_previous_siblings( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: None = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeTags:
- ...
- @overload
- def find_previous_siblings(
- self,
- name: None = None,
- attrs: None = None,
- string: _StrainableString = "",
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeNavigableStrings:
- ...
- def find_previous_siblings(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- string: Optional[_StrainableString] = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
- """Returns all siblings to this PageElement that match the
- given criteria and appear earlier in the document.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param string: A filter for a NavigableString with specific text.
- :param limit: Stop looking after finding this many results.
- :param _stacklevel: Used internally to improve warning messages.
- :kwargs: Additional filters on attribute values.
- """
- return self._find_all(
- name,
- attrs,
- string,
- limit,
- self.previous_siblings,
- _stacklevel=_stacklevel + 1,
- **kwargs,
- )
- findPreviousSiblings = _deprecated_function_alias(
- "findPreviousSiblings", "find_previous_siblings", "4.0.0"
- )
- fetchPreviousSiblings = _deprecated_function_alias(
- "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
- )
- def find_parent(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneTag:
- """Find the closest parent of this PageElement that matches the given
- criteria.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param self: Whether the PageElement itself should be considered
- as one of its 'parents'.
- :kwargs: Additional filters on attribute values.
- """
- # NOTE: We can't use _find_one because findParents takes a different
- # set of arguments.
- r = None
- results = self.find_parents(
- name, attrs, 1, _stacklevel=3, **kwargs
- )
- if results:
- r = results[0]
- return r
- findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")
- def find_parents(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeTags:
- """Find all parents of this `PageElement` that match the given criteria.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param limit: Stop looking after finding this many results.
- :param _stacklevel: Used internally to improve warning messages.
- :kwargs: Additional filters on attribute values.
- """
- iterator = self.parents
- # Only Tags can have children, so this ResultSet will contain
- # nothing but Tags.
- return cast(ResultSet[Tag], self._find_all(
- name, attrs, None, limit, iterator, _stacklevel=_stacklevel + 1, **kwargs
- ))
- findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
- fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
- @property
- def next(self) -> _AtMostOneElement:
- """The `PageElement`, if any, that was parsed just after this one."""
- return self.next_element
- @property
- def previous(self) -> _AtMostOneElement:
- """The `PageElement`, if any, that was parsed just before this one."""
- return self.previous_element
- # These methods do the real heavy lifting.
- def _find_one(
- self,
- # TODO-TYPING: "There is no syntax to indicate optional or
- # keyword arguments; such function types are rarely used
- # as callback types." - So, not sure how to get more
- # specific here.
- method: Callable,
- name: _FindMethodName,
- attrs: Optional[_StrainableAttributes],
- string: Optional[_StrainableString],
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneElement:
- r: _AtMostOneElement = None
- results: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
- if results:
- r = results[0]
- return r
- def _find_all(
- self,
- name: _FindMethodName,
- attrs: Optional[_StrainableAttributes],
- string: Optional[_StrainableString],
- limit: Optional[int],
- generator: Iterator[PageElement],
- _stacklevel: int = 3,
- **kwargs: _StrainableAttribute,
- ) -> _QueryResults:
- """Iterates over a generator looking for things that match."""
- if string is None and "text" in kwargs:
- string = kwargs.pop("text")
- warnings.warn(
- "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
- DeprecationWarning,
- stacklevel=_stacklevel,
- )
- if "_class" in kwargs:
- warnings.warn(
- AttributeResemblesVariableWarning.MESSAGE
- % dict(
- original="_class",
- autocorrect="class_",
- ),
- AttributeResemblesVariableWarning,
- stacklevel=_stacklevel,
- )
- from bs4.filter import ElementFilter
- if isinstance(name, ElementFilter):
- matcher = name
- else:
- matcher = SoupStrainer(name, attrs, string, **kwargs)
- result: MutableSequence[_OneElement]
- if string is None and not limit and not attrs and not kwargs:
- if name is True or name is None:
- # Optimization to find all tags.
- result = [element for element in generator if isinstance(element, Tag)]
- return ResultSet(matcher, result)
- elif isinstance(name, str):
- # Optimization to find all tags with a given name.
- if name.count(":") == 1:
- # This is a name with a prefix. If this is a namespace-aware document,
- # we need to match the local name against tag.name. If not,
- # we need to match the fully-qualified name against tag.name.
- prefix, local_name = name.split(":", 1)
- else:
- prefix = None
- local_name = name
- result = []
- for element in generator:
- if not isinstance(element, Tag):
- continue
- if element.name == name or (
- element.name == local_name
- and (prefix is None or element.prefix == prefix)
- ):
- result.append(element)
- return ResultSet(matcher, result)
- return matcher.find_all(generator, limit)
- # These generators can be used to navigate starting from both
- # NavigableStrings and Tags.
- @property
- def next_elements(self) -> Iterator[PageElement]:
- """All PageElements that were parsed after this one."""
- i = self.next_element
- while i is not None:
- successor = i.next_element
- yield i
- i = successor
- @property
- def self_and_next_elements(self) -> Iterator[PageElement]:
- """This PageElement, then all PageElements that were parsed after it."""
- return self._self_and(self.next_elements)
- @property
- def next_siblings(self) -> Iterator[PageElement]:
- """All PageElements that are siblings of this one but were parsed
- later.
- """
- i = self.next_sibling
- while i is not None:
- successor = i.next_sibling
- yield i
- i = successor
- @property
- def self_and_next_siblings(self) -> Iterator[PageElement]:
- """This PageElement, then all of its siblings."""
- return self._self_and(self.next_siblings)
- @property
- def previous_elements(self) -> Iterator[PageElement]:
- """All PageElements that were parsed before this one.
- :yield: A sequence of PageElements.
- """
- i = self.previous_element
- while i is not None:
- successor = i.previous_element
- yield i
- i = successor
- @property
- def self_and_previous_elements(self) -> Iterator[PageElement]:
- """This PageElement, then all elements that were parsed
- earlier."""
- return self._self_and(self.previous_elements)
- @property
- def previous_siblings(self) -> Iterator[PageElement]:
- """All PageElements that are siblings of this one but were parsed
- earlier.
- :yield: A sequence of PageElements.
- """
- i = self.previous_sibling
- while i is not None:
- successor = i.previous_sibling
- yield i
- i = successor
- @property
- def self_and_previous_siblings(self) -> Iterator[PageElement]:
- """This PageElement, then all of its siblings that were parsed
- earlier."""
- return self._self_and(self.previous_siblings)
- @property
- def parents(self) -> Iterator[Tag]:
- """All elements that are parents of this PageElement.
- :yield: A sequence of Tags, ending with a BeautifulSoup object.
- """
- i = self.parent
- while i is not None:
- successor = i.parent
- yield i
- i = successor
- @property
- def self_and_parents(self) -> Iterator[PageElement]:
- """This element, then all of its parents.
- :yield: A sequence of PageElements, ending with a BeautifulSoup object.
- """
- return self._self_and(self.parents)
- def _self_and(self, other_generator:Iterator[PageElement]) -> Iterator[PageElement]:
- """Modify a generator by yielding this element, then everything
- yielded by the other generator.
- """
- if not self.hidden:
- yield self
- for i in other_generator:
- yield i
- @property
- def decomposed(self) -> bool:
- """Check whether a PageElement has been decomposed."""
- return getattr(self, "_decomposed", False) or False
- @_deprecated("next_elements", "4.0.0")
- def nextGenerator(self) -> Iterator[PageElement]:
- ":meta private:"
- return self.next_elements
- @_deprecated("next_siblings", "4.0.0")
- def nextSiblingGenerator(self) -> Iterator[PageElement]:
- ":meta private:"
- return self.next_siblings
- @_deprecated("previous_elements", "4.0.0")
- def previousGenerator(self) -> Iterator[PageElement]:
- ":meta private:"
- return self.previous_elements
- @_deprecated("previous_siblings", "4.0.0")
- def previousSiblingGenerator(self) -> Iterator[PageElement]:
- ":meta private:"
- return self.previous_siblings
- @_deprecated("parents", "4.0.0")
- def parentGenerator(self) -> Iterator[PageElement]:
- ":meta private:"
- return self.parents
- class NavigableString(str, PageElement):
- """A Python string that is part of a parse tree.
- When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
- create a `NavigableString` for the string "penguin".
- """
- #: A string prepended to the body of the 'real' string
- #: when formatting it as part of a document, such as the '<!--'
- #: in an HTML comment.
- PREFIX: str = ""
- #: A string appended to the body of the 'real' string
- #: when formatting it as part of a document, such as the '-->'
- #: in an HTML comment.
- SUFFIX: str = ""
- def __new__(cls, value: Union[str, bytes]) -> Self:
- """Create a new NavigableString.
- When unpickling a NavigableString, this method is called with
- the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
- passed in to the superclass's __new__ or the superclass won't know
- how to handle non-ASCII characters.
- """
- if isinstance(value, str):
- u = str.__new__(cls, value)
- else:
- u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
- u.hidden = False
- u.setup()
- return u
- def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
- """A copy of a NavigableString has the same contents and class
- as the original, but it is not connected to the parse tree.
- :param recursive: This parameter is ignored; it's only defined
- so that NavigableString.__deepcopy__ implements the same
- signature as Tag.__deepcopy__.
- """
- return type(self)(self)
- def __getnewargs__(self) -> Tuple[str]:
- return (str(self),)
- # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
- # is introduced in 3.8. This can be changed once 3.7 support is dropped.
- def __getitem__(self, key: Union[int|slice]) -> str: # type:ignore
- """Raise an exception """
- if isinstance(key, str):
- raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))
- return super(NavigableString, self).__getitem__(key)
- @property
- def string(self) -> str:
- """Convenience property defined to match `Tag.string`.
- :return: This property always returns the `NavigableString` it was
- called on.
- :meta private:
- """
- return self
- def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
- """Run the string through the provided formatter, making it
- ready for output as part of an HTML or XML document.
- :param formatter: A `Formatter` object, or a string naming one
- of the standard formatters.
- """
- output = self.format_string(self, formatter)
- return self.PREFIX + output + self.SUFFIX
- @property
- def name(self) -> None:
- """Since a NavigableString is not a Tag, it has no .name.
- This property is implemented so that code like this doesn't crash
- when run on a mixture of Tag and NavigableString objects:
- [x.name for x in tag.children]
- :meta private:
- """
- return None
- @name.setter
- def name(self, name: str) -> None:
- """Prevent NavigableString.name from ever being set.
- :meta private:
- """
- raise AttributeError("A NavigableString cannot be given a name.")
- def _all_strings(
- self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
- ) -> Iterator[str]:
- """Yield all strings of certain classes, possibly stripping them.
- This makes it easy for NavigableString to implement methods
- like get_text() as conveniences, creating a consistent
- text-extraction API across all PageElements.
- :param strip: If True, all strings will be stripped before being
- yielded.
- :param types: A tuple of NavigableString subclasses. If this
- NavigableString isn't one of those subclasses, the
- sequence will be empty. By default, the subclasses
- considered are NavigableString and CData objects. That
- means no comments, processing instructions, etc.
- :yield: A sequence that either contains this string, or is empty.
- """
- if types is self.default:
- # This is kept in Tag because it's full of subclasses of
- # this class, which aren't defined until later in the file.
- types = Tag.MAIN_CONTENT_STRING_TYPES
- # Do nothing if the caller is looking for specific types of
- # string, and we're of a different type.
- #
- # We check specific types instead of using isinstance(self,
- # types) because all of these classes subclass
- # NavigableString. Anyone who's using this feature probably
- # wants generic NavigableStrings but not other stuff.
- my_type = type(self)
- if types is not None:
- if isinstance(types, type):
- # Looking for a single type.
- if my_type is not types:
- return
- elif my_type not in types:
- # Looking for one of a list of types.
- return
- value = self
- if strip:
- final_value = value.strip()
- else:
- final_value = self
- if len(final_value) > 0:
- yield final_value
- @property
- def strings(self) -> Iterator[str]:
- """Yield this string, but only if it is interesting.
- This is defined the way it is for compatibility with
- `Tag.strings`. See `Tag` for information on which strings are
- interesting in a given context.
- :yield: A sequence that either contains this string, or is empty.
- """
- return self._all_strings()
- class PreformattedString(NavigableString):
- """A `NavigableString` not subject to the normal formatting rules.
- This is an abstract class used for special kinds of strings such
- as comments (`Comment`) and CDATA blocks (`CData`).
- """
- PREFIX: str = ""
- SUFFIX: str = ""
- def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
- """Make this string ready for output by adding any subclass-specific
- prefix or suffix.
- :param formatter: A `Formatter` object, or a string naming one
- of the standard formatters. The string will be passed into the
- `Formatter`, but only to trigger any side effects: the return
- value is ignored.
- :return: The string, with any subclass-specific prefix and
- suffix added on.
- """
- if formatter is not None:
- self.format_string(self, formatter)
- return self.PREFIX + self + self.SUFFIX
- class CData(PreformattedString):
- """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_."""
- PREFIX: str = "<![CDATA["
- SUFFIX: str = "]]>"
- class ProcessingInstruction(PreformattedString):
- """A SGML processing instruction."""
- PREFIX: str = "<?"
- SUFFIX: str = ">"
- class XMLProcessingInstruction(ProcessingInstruction):
- """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_."""
- PREFIX: str = "<?"
- SUFFIX: str = "?>"
- class Comment(PreformattedString):
- """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_."""
- PREFIX: str = "<!--"
- SUFFIX: str = "-->"
- class Declaration(PreformattedString):
- """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_."""
- PREFIX: str = "<?"
- SUFFIX: str = "?>"
- class Doctype(PreformattedString):
- """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""
- @classmethod
- def for_name_and_ids(
- cls, name: str, pub_id: Optional[str], system_id: Optional[str]
- ) -> Doctype:
- """Generate an appropriate document type declaration for a given
- public ID and system ID.
- :param name: The name of the document's root element, e.g. 'html'.
- :param pub_id: The Formal Public Identifier for this document type,
- e.g. '-//W3C//DTD XHTML 1.1//EN'
- :param system_id: The system identifier for this document type,
- e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
- """
- return Doctype(cls._string_for_name_and_ids(name, pub_id, system_id))
- @classmethod
- def _string_for_name_and_ids(
- cls, name: str, pub_id: Optional[str], system_id: Optional[str]
- ) -> str:
- """Generate a string to be used as the basis of a Doctype object.
- This is a separate method from for_name_and_ids() because the lxml
- TreeBuilder needs to call it.
- """
- value = name or ""
- if pub_id is not None:
- value += ' PUBLIC "%s"' % pub_id
- if system_id is not None:
- value += ' "%s"' % system_id
- elif system_id is not None:
- value += ' SYSTEM "%s"' % system_id
- return value
- PREFIX: str = "<!DOCTYPE "
- SUFFIX: str = ">\n"
- class Stylesheet(NavigableString):
- """A `NavigableString` representing the contents of a `<style> HTML
- tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
- (probably CSS).
- Used to distinguish embedded stylesheets from textual content.
- """
- class Script(NavigableString):
- """A `NavigableString` representing the contents of a `<script>
- HTML tag
- <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
- (probably Javascript).
- Used to distinguish executable code from textual content.
- """
- class TemplateString(NavigableString):
- """A `NavigableString` representing a string found inside an `HTML
- <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
- embedded in a larger document.
- Used to distinguish such strings from the main body of the document.
- """
- class RubyTextString(NavigableString):
- """A NavigableString representing the contents of an `<rt> HTML
- tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.
- Can be used to distinguish such strings from the strings they're
- annotating.
- """
- class RubyParenthesisString(NavigableString):
- """A NavigableString representing the contents of an `<rp> HTML
- tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
- """
- class Tag(PageElement):
- """An HTML or XML tag that is part of a parse tree, along with its
- attributes, contents, and relationships to other parts of the tree.
- When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
- create a `Tag` object representing the ``<b>`` tag. You can
- instantiate `Tag` objects directly, but it's not necessary unless
- you're adding entirely new markup to a parsed document. Most of
- the constructor arguments are intended for use by the `TreeBuilder`
- that's parsing a document.
- :param parser: A `BeautifulSoup` object representing the parse tree this
- `Tag` will be part of.
- :param builder: The `TreeBuilder` being used to build the tree.
- :param name: The name of the tag.
- :param namespace: The URI of this tag's XML namespace, if any.
- :param prefix: The prefix for this tag's XML namespace, if any.
- :param attrs: A dictionary of attribute values.
- :param parent: The `Tag` to use as the parent of this `Tag`. May be
- the `BeautifulSoup` object itself.
- :param previous: The `PageElement` that was parsed immediately before
- parsing this tag.
- :param is_xml: If True, this is an XML tag. Otherwise, this is an
- HTML tag.
- :param sourceline: The line number where this tag was found in its
- source document.
- :param sourcepos: The character position within ``sourceline`` where this
- tag was found.
- :param can_be_empty_element: If True, this tag should be
- represented as <tag/>. If False, this tag should be represented
- as <tag></tag>.
- :param cdata_list_attributes: A dictionary of attributes whose values should
- be parsed as lists of strings if they ever show up on this tag.
- :param preserve_whitespace_tags: Names of tags whose contents
- should have their whitespace preserved if they are encountered inside
- this tag.
- :param interesting_string_types: When iterating over this tag's
- string contents in methods like `Tag.strings` or
- `PageElement.get_text`, these are the types of strings that are
- interesting enough to be considered. By default,
- `NavigableString` (normal strings) and `CData` (CDATA
- sections) are the only interesting string subtypes.
- :param namespaces: A dictionary mapping currently active
- namespace prefixes to URIs, as of the point in the parsing process when
- this tag was encountered. This can be used later to
- construct CSS selectors.
- """
- def __init__(
- self,
- parser: Optional[BeautifulSoup] = None,
- builder: Optional[TreeBuilder] = None,
- name: Optional[str] = None,
- namespace: Optional[str] = None,
- prefix: Optional[str] = None,
- attrs: Optional[_RawOrProcessedAttributeValues] = None,
- parent: Optional[Union[BeautifulSoup, Tag]] = None,
- previous: _AtMostOneElement = None,
- is_xml: Optional[bool] = None,
- sourceline: Optional[int] = None,
- sourcepos: Optional[int] = None,
- can_be_empty_element: Optional[bool] = None,
- cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
- preserve_whitespace_tags: Optional[Set[str]] = None,
- interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
- namespaces: Optional[Dict[str, str]] = None,
- # NOTE: Any new arguments here need to be mirrored in
- # Tag.copy_self, and potentially BeautifulSoup.new_tag
- # as well.
- ):
- if parser is None:
- self.parser_class = None
- else:
- # We don't actually store the parser object: that lets extracted
- # chunks be garbage-collected.
- self.parser_class = parser.__class__
- if name is None:
- raise ValueError("No value provided for new tag's name.")
- self.name = name
- self.namespace = namespace
- self._namespaces = namespaces or {}
- self.prefix = prefix
- if (not builder or builder.store_line_numbers) and (
- sourceline is not None or sourcepos is not None
- ):
- self.sourceline = sourceline
- self.sourcepos = sourcepos
- else:
- self.sourceline = sourceline
- self.sourcepos = sourcepos
- attr_dict_class: type[AttributeDict]
- attribute_value_list_class: type[AttributeValueList]
- if builder is None:
- if is_xml:
- attr_dict_class = XMLAttributeDict
- else:
- attr_dict_class = HTMLAttributeDict
- attribute_value_list_class = AttributeValueList
- else:
- attr_dict_class = builder.attribute_dict_class
- attribute_value_list_class = builder.attribute_value_list_class
- self.attribute_value_list_class = attribute_value_list_class
- if attrs is None:
- self.attrs = attr_dict_class()
- else:
- if builder is not None and builder.cdata_list_attributes:
- self.attrs = builder._replace_cdata_list_attribute_values(
- self.name, attrs
- )
- else:
- self.attrs = attr_dict_class()
- # Make sure that the values of any multi-valued
- # attributes (e.g. when a Tag is copied) are stored in
- # new lists.
- for k, v in attrs.items():
- if isinstance(v, list):
- v = v.__class__(v)
- self.attrs[k] = v
- # If possible, determine ahead of time whether this tag is an
- # XML tag.
- if builder:
- self.known_xml = builder.is_xml
- else:
- self.known_xml = is_xml
- self.contents: List[PageElement] = []
- self.setup(parent, previous)
- self.hidden = False
- if builder is None:
- # In the absence of a TreeBuilder, use whatever values were
- # passed in here. They're probably None, unless this is a copy of some
- # other tag.
- self.can_be_empty_element = can_be_empty_element
- self.cdata_list_attributes = cdata_list_attributes
- self.preserve_whitespace_tags = preserve_whitespace_tags
- self.interesting_string_types = interesting_string_types
- else:
- # Set up any substitutions for this tag, such as the charset in a META tag.
- self.attribute_value_list_class = builder.attribute_value_list_class
- builder.set_up_substitutions(self)
- # Ask the TreeBuilder whether this tag might be an empty-element tag.
- self.can_be_empty_element = builder.can_be_empty_element(name)
- # Keep track of the list of attributes of this tag that
- # might need to be treated as a list.
- #
- # For performance reasons, we store the whole data structure
- # rather than asking the question of every tag. Asking would
- # require building a new data structure every time, and
- # (unlike can_be_empty_element), we almost never need
- # to check this.
- self.cdata_list_attributes = builder.cdata_list_attributes
- # Keep track of the names that might cause this tag to be treated as a
- # whitespace-preserved tag.
- self.preserve_whitespace_tags = builder.preserve_whitespace_tags
- if self.name in builder.string_containers:
- # This sort of tag uses a special string container
- # subclass for most of its strings. We need to be able
- # to look up the proper container subclass.
- self.interesting_string_types = {builder.string_containers[self.name]}
- else:
- self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
- parser_class: Optional[type[BeautifulSoup]]
- name: str
- namespace: Optional[str]
- prefix: Optional[str]
- attrs: _AttributeValues
- sourceline: Optional[int]
- sourcepos: Optional[int]
- known_xml: Optional[bool]
- contents: List[PageElement]
- hidden: bool
- interesting_string_types: Optional[Set[Type[NavigableString]]]
- can_be_empty_element: Optional[bool]
- cdata_list_attributes: Optional[Dict[str, Set[str]]]
- preserve_whitespace_tags: Optional[Set[str]]
- #: :meta private:
- parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
- def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
- """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
- Its contents are a copy of the old Tag's contents.
- """
- clone = self.copy_self()
- if recursive:
- # Clone this tag's descendants recursively, but without
- # making any recursive function calls.
- tag_stack: List[Tag] = [clone]
- for event, element in self._event_stream(self.descendants):
- if event is Tag.END_ELEMENT_EVENT:
- # Stop appending incoming Tags to the Tag that was
- # just closed.
- tag_stack.pop()
- else:
- descendant_clone = element.__deepcopy__(memo, recursive=False)
- # Add to its parent's .contents
- tag_stack[-1].append(descendant_clone)
- if event is Tag.START_ELEMENT_EVENT:
- # Add the Tag itself to the stack so that its
- # children will be .appended to it.
- tag_stack.append(cast(Tag, descendant_clone))
- return clone
- def copy_self(self) -> Self:
- """Create a new Tag just like this one, but with no
- contents and unattached to any parse tree.
- This is the first step in the deepcopy process, but you can
- call it on its own to create a copy of a Tag without copying its
- contents.
- """
- clone = type(self)(
- None,
- None,
- self.name,
- self.namespace,
- self.prefix,
- self.attrs,
- is_xml=self._is_xml,
- sourceline=self.sourceline,
- sourcepos=self.sourcepos,
- can_be_empty_element=self.can_be_empty_element,
- cdata_list_attributes=self.cdata_list_attributes,
- preserve_whitespace_tags=self.preserve_whitespace_tags,
- interesting_string_types=self.interesting_string_types,
- namespaces=self._namespaces,
- )
- for attr in ("can_be_empty_element", "hidden"):
- setattr(clone, attr, getattr(self, attr))
- return clone
- @property
- def is_empty_element(self) -> bool:
- """Is this tag an empty-element tag? (aka a self-closing tag)
- A tag that has contents is never an empty-element tag.
- A tag that has no contents may or may not be an empty-element
- tag. It depends on the `TreeBuilder` used to create the
- tag. If the builder has a designated list of empty-element
- tags, then only a tag whose name shows up in that list is
- considered an empty-element tag. This is usually the case
- for HTML documents.
- If the builder has no designated list of empty-element, then
- any tag with no contents is an empty-element tag. This is usually
- the case for XML documents.
- """
- return len(self.contents) == 0 and self.can_be_empty_element is True
- @_deprecated("is_empty_element", "4.0.0")
- def isSelfClosing(self) -> bool:
- ": :meta private:"
- return self.is_empty_element
- @property
- def string(self) -> Optional[str]:
- """Convenience property to get the single string within this
- `Tag`, assuming there is just one.
- :return: If this `Tag` has a single child that's a
- `NavigableString`, the return value is that string. If this
- element has one child `Tag`, the return value is that child's
- `Tag.string`, recursively. If this `Tag` has no children,
- or has more than one child, the return value is ``None``.
- If this property is unexpectedly returning ``None`` for you,
- it's probably because your `Tag` has more than one thing
- inside it.
- """
- if len(self.contents) != 1:
- return None
- child = self.contents[0]
- if isinstance(child, NavigableString):
- return child
- elif isinstance(child, Tag):
- return child.string
- return None
- @string.setter
- def string(self, string: str) -> None:
- """Replace the `Tag.contents` of this `Tag` with a single string."""
- self.clear()
- if isinstance(string, NavigableString):
- new_class = string.__class__
- else:
- new_class = NavigableString
- self.append(new_class(string))
- #: :meta private:
- MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}
- def _all_strings(
- self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
- ) -> Iterator[str]:
- """Yield all strings of certain classes, possibly stripping them.
- :param strip: If True, all strings will be stripped before being
- yielded.
- :param types: A tuple of NavigableString subclasses. Any strings of
- a subclass not found in this list will be ignored. By
- default, the subclasses considered are the ones found in
- self.interesting_string_types. If that's not specified,
- only NavigableString and CData objects will be
- considered. That means no comments, processing
- instructions, etc.
- """
- if types is self.default:
- if self.interesting_string_types is None:
- types = self.MAIN_CONTENT_STRING_TYPES
- else:
- types = self.interesting_string_types
- for descendant in self.descendants:
- if not isinstance(descendant, NavigableString):
- continue
- descendant_type = type(descendant)
- if isinstance(types, type):
- if descendant_type is not types:
- # We're not interested in strings of this type.
- continue
- elif types is not None and descendant_type not in types:
- # We're not interested in strings of this type.
- continue
- if strip:
- stripped = descendant.strip()
- if len(stripped) == 0:
- continue
- yield stripped
- else:
- yield descendant
- strings = property(_all_strings)
- def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
- """Insert one or more new PageElements as a child of this `Tag`.
- This works similarly to :py:meth:`list.insert`, except you can insert
- multiple elements at once.
- :param position: The numeric position that should be occupied
- in this Tag's `Tag.children` by the first new `PageElement`.
- :param new_children: The PageElements to insert.
- :return The newly inserted PageElements.
- """
- inserted: List[PageElement] = []
- for new_child in new_children:
- inserted.extend(self._insert(position, new_child))
- position += 1
- return inserted
- def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
- if new_child is None:
- raise ValueError("Cannot insert None into a tag.")
- if new_child is self:
- raise ValueError("Cannot insert a tag into itself.")
- if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
- new_child = NavigableString(new_child)
- from bs4 import BeautifulSoup
- if isinstance(new_child, BeautifulSoup):
- # We don't want to end up with a situation where one BeautifulSoup
- # object contains another. Insert the BeautifulSoup's children and
- # return them.
- return self.insert(position, *list(new_child.contents))
- position = min(position, len(self.contents))
- if hasattr(new_child, "parent") and new_child.parent is not None:
- # We're 'inserting' an element that's already one
- # of this object's children.
- if new_child.parent is self:
- current_index = self.index(new_child)
- if current_index < position:
- # We're moving this element further down the list
- # of this object's children. That means that when
- # we extract this element, our target index will
- # jump down one.
- position -= 1
- elif current_index == position:
- # We're 'inserting' an element into its current location.
- # This is a no-op.
- return [new_child]
- new_child.extract()
- new_child.parent = self
- previous_child = None
- if position == 0:
- new_child.previous_sibling = None
- new_child.previous_element = self
- else:
- previous_child = self.contents[position - 1]
- new_child.previous_sibling = previous_child
- new_child.previous_sibling.next_sibling = new_child
- new_child.previous_element = previous_child._last_descendant(False)
- if new_child.previous_element is not None:
- new_child.previous_element.next_element = new_child
- new_childs_last_element = new_child._last_descendant(
- is_initialized=False, accept_self=True
- )
- # new_childs_last_element can't be None because we passed
- # accept_self=True into _last_descendant. Worst case,
- # new_childs_last_element will be new_child itself. Making
- # this cast removes several mypy complaints later on as we
- # manipulate new_childs_last_element.
- new_childs_last_element = cast(PageElement, new_childs_last_element)
- if position >= len(self.contents):
- new_child.next_sibling = None
- parent: Optional[Tag] = self
- parents_next_sibling = None
- while parents_next_sibling is None and parent is not None:
- parents_next_sibling = parent.next_sibling
- parent = parent.parent
- if parents_next_sibling is not None:
- # We found the element that comes next in the document.
- break
- if parents_next_sibling is not None:
- new_childs_last_element.next_element = parents_next_sibling
- else:
- # The last element of this tag is the last element in
- # the document.
- new_childs_last_element.next_element = None
- else:
- next_child = self.contents[position]
- new_child.next_sibling = next_child
- if new_child.next_sibling is not None:
- new_child.next_sibling.previous_sibling = new_child
- new_childs_last_element.next_element = next_child
- if new_childs_last_element.next_element is not None:
- new_childs_last_element.next_element.previous_element = (
- new_childs_last_element
- )
- self.contents.insert(position, new_child)
- return [new_child]
- def unwrap(self) -> Self:
- """Replace this `PageElement` with its contents.
- :return: This object, no longer part of the tree.
- """
- my_parent = self.parent
- if my_parent is None:
- raise ValueError(
- "Cannot replace an element with its contents when that "
- "element is not part of a tree."
- )
- my_index = my_parent.index(self)
- self.extract(_self_index=my_index)
- for child in reversed(self.contents[:]):
- my_parent.insert(my_index, child)
- return self
- replace_with_children = unwrap
- @_deprecated("unwrap", "4.0.0")
- def replaceWithChildren(self) -> _OneElement:
- ": :meta private:"
- return self.unwrap()
- def append(self, tag: _InsertableElement) -> PageElement:
- """
- Appends the given `PageElement` to the contents of this `Tag`.
- :param tag: A PageElement.
- :return The newly appended PageElement.
- """
- return self.insert(len(self.contents), tag)[0]
- def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
- """Appends one or more objects to the contents of this
- `Tag`.
- :param tags: If a list of `PageElement` objects is provided,
- they will be appended to this tag's contents, one at a time.
- If a single `Tag` is provided, its `Tag.contents` will be
- used to extend this object's `Tag.contents`.
- :return The list of PageElements that were appended.
- """
- tag_list: Iterable[_InsertableElement]
- if isinstance(tags, Tag):
- tag_list = list(tags.contents)
- elif isinstance(tags, (PageElement, str)):
- # The caller should really be using append() instead,
- # but we can make it work.
- warnings.warn(
- "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
- UserWarning,
- stacklevel=2,
- )
- if isinstance(tags, str) and not isinstance(tags, PageElement):
- tags = NavigableString(tags)
- tag_list = [tags]
- elif isinstance(tags, Iterable):
- # Moving items around the tree may change their position in
- # the original list. Make a list that won't change.
- tag_list = list(tags)
- results: List[PageElement] = []
- for tag in tag_list:
- results.append(self.append(tag))
- return results
- def clear(self, decompose: bool = False) -> None:
- """Destroy all children of this `Tag` by calling
- `PageElement.extract` on them.
- :param decompose: If this is True, `PageElement.decompose` (a
- more destructive method) will be called instead of
- `PageElement.extract`.
- """
- for element in self.contents[:]:
- if decompose:
- element.decompose()
- else:
- element.extract()
- def smooth(self) -> None:
- """Smooth out the children of this `Tag` by consolidating consecutive
- strings.
- If you perform a lot of operations that modify the tree,
- calling this method afterwards can make pretty-printed output
- look more natural.
- """
- # Mark the first position of every pair of children that need
- # to be consolidated. Do this rather than making a copy of
- # self.contents, since in most cases very few strings will be
- # affected.
- marked = []
- for i, a in enumerate(self.contents):
- if isinstance(a, Tag):
- # Recursively smooth children.
- a.smooth()
- if i == len(self.contents) - 1:
- # This is the last item in .contents, and it's not a
- # tag. There's no chance it needs any work.
- continue
- b = self.contents[i + 1]
- if (
- isinstance(a, NavigableString)
- and isinstance(b, NavigableString)
- and not isinstance(a, PreformattedString)
- and not isinstance(b, PreformattedString)
- ):
- marked.append(i)
- # Go over the marked positions in reverse order, so that
- # removing items from .contents won't affect the remaining
- # positions.
- for i in reversed(marked):
- a = cast(NavigableString, self.contents[i])
- b = cast(NavigableString, self.contents[i + 1])
- b.extract()
- n = NavigableString(a + b)
- a.replace_with(n)
- def index(self, element: PageElement) -> int:
- """Find the index of a child of this `Tag` (by identity, not value).
- Doing this by identity avoids issues when a `Tag` contains two
- children that have string equality.
- :param element: Look for this `PageElement` in this object's contents.
- """
- for i, child in enumerate(self.contents):
- if child is element:
- return i
- raise ValueError("Tag.index: element not in tag")
- def get(
- self, key: str, default: Optional[_AttributeValue] = None
- ) -> Optional[_AttributeValue]:
- """Returns the value of the 'key' attribute for the tag, or
- the value given for 'default' if it doesn't have that
- attribute.
- :param key: The attribute to look for.
- :param default: Use this value if the attribute is not present
- on this `Tag`.
- """
- return self.attrs.get(key, default)
- def get_attribute_list(
- self, key: str, default: Optional[AttributeValueList] = None
- ) -> AttributeValueList:
- """The same as get(), but always returns a (possibly empty) list.
- :param key: The attribute to look for.
- :param default: Use this value if the attribute is not present
- on this `Tag`.
- :return: A list of strings, usually empty or containing only a single
- value.
- """
- list_value: AttributeValueList
- value = self.get(key, default)
- if value is None:
- list_value = self.attribute_value_list_class()
- elif isinstance(value, list):
- list_value = value
- else:
- if not isinstance(value, str):
- value = cast(str, value)
- list_value = self.attribute_value_list_class([value])
- return list_value
- def has_attr(self, key: str) -> bool:
- """Does this `Tag` have an attribute with the given name?"""
- return key in self.attrs
- def __hash__(self) -> int:
- return str(self).__hash__()
- def __getitem__(self, key: str) -> _AttributeValue:
- """tag[key] returns the value of the 'key' attribute for the Tag,
- and throws an exception if it's not there."""
- return self.attrs[key]
- def __iter__(self) -> Iterator[PageElement]:
- "Iterating over a Tag iterates over its contents."
- return iter(self.contents)
- def __len__(self) -> int:
- "The length of a Tag is the length of its list of contents."
- return len(self.contents)
- def __contains__(self, x: Any) -> bool:
- return x in self.contents
- def __bool__(self) -> bool:
- "A tag is non-None even if it has no contents."
- return True
- def __setitem__(self, key: str, value: _AttributeValue) -> None:
- """Setting tag[key] sets the value of the 'key' attribute for the
- tag."""
- self.attrs[key] = value
- def __delitem__(self, key: str) -> None:
- "Deleting tag[key] deletes all 'key' attributes for the tag."
- self.attrs.pop(key, None)
- @overload
- def __call__( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- recursive: bool = True,
- string: None = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeTags:
- ...
- @overload
- def __call__(
- self,
- name: None = None,
- attrs: None = None,
- recursive: bool = True,
- string: _StrainableString = "",
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeNavigableStrings:
- ...
- def __call__(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- recursive: bool = True,
- string: Optional[_StrainableString] = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
- """Calling a Tag like a function is the same as calling its
- find_all() method. Eg. tag('a') returns a list of all the A tags
- found within this tag."""
- if string is not None and (name is not None or attrs is not None or kwargs):
- # TODO: Using the @overload decorator to express the three ways you
- # could get into this path is way too much code for a rarely(?) used
- # feature.
- return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs)) #type: ignore
- if string is None:
- # If string is None, we're searching for tags.
- tags:ResultSet[Tag] = self.find_all(
- name, attrs, recursive, None, limit, _stacklevel, **kwargs
- )
- return tags
- # Otherwise, we're searching for strings.
- strings:ResultSet[NavigableString] = self.find_all(
- None, None, recursive, string, limit, _stacklevel, **kwargs
- )
- return strings
- def __getattr__(self, subtag: str) -> Optional[Tag]:
- """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
- # print("Getattr %s.%s" % (self.__class__, tag))
- result: _AtMostOneElement
- if len(subtag) > 3 and subtag.endswith("Tag"):
- # BS3: soup.aTag -> "soup.find("a")
- tag_name = subtag[:-3]
- warnings.warn(
- '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
- % dict(name=tag_name),
- DeprecationWarning,
- stacklevel=2,
- )
- result = self.find(tag_name)
- # We special case contents to avoid recursion.
- elif not subtag.startswith("__") and not subtag == "contents":
- result = self.find(subtag)
- else:
- raise AttributeError(
- "'%s' object has no attribute '%s'" % (self.__class__, subtag)
- )
- return result
- def __eq__(self, other: Any) -> bool:
- """Returns true iff this Tag has the same name, the same attributes,
- and the same contents (recursively) as `other`."""
- if self is other:
- return True
- if not isinstance(other, Tag):
- return False
- if (
- not hasattr(other, "name")
- or not hasattr(other, "attrs")
- or not hasattr(other, "contents")
- or self.name != other.name
- or self.attrs != other.attrs
- or len(self) != len(other)
- ):
- return False
- for i, my_child in enumerate(self.contents):
- if my_child != other.contents[i]:
- return False
- return True
- def __ne__(self, other: Any) -> bool:
- """Returns true iff this Tag is not identical to `other`,
- as defined in __eq__."""
- return not self == other
- def __repr__(self) -> str:
- """Renders this `Tag` as a string."""
- return self.decode()
- __str__ = __unicode__ = __repr__
- def encode(
- self,
- encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
- indent_level: Optional[int] = None,
- formatter: _FormatterOrName = "minimal",
- errors: str = "xmlcharrefreplace",
- ) -> bytes:
- """Render this `Tag` and its contents as a bytestring.
- :param encoding: The encoding to use when converting to
- a bytestring. This may also affect the text of the document,
- specifically any encoding declarations within the document.
- :param indent_level: Each line of the rendering will be
- indented this many levels. (The ``formatter`` decides what a
- 'level' means, in terms of spaces or other characters
- output.) This is used internally in recursive calls while
- pretty-printing.
- :param formatter: Either a `Formatter` object, or a string naming one of
- the standard formatters.
- :param errors: An error handling strategy such as
- 'xmlcharrefreplace'. This value is passed along into
- :py:meth:`str.encode` and its value should be one of the `error
- handling constants defined by Python's codecs module
- <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
- """
- # Turn the data structure into Unicode, then encode the
- # Unicode.
- u = self.decode(indent_level, encoding, formatter)
- return u.encode(encoding, errors)
- def decode(
- self,
- indent_level: Optional[int] = None,
- eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
- formatter: _FormatterOrName = "minimal",
- iterator: Optional[Iterator[PageElement]] = None,
- ) -> str:
- """Render this `Tag` and its contents as a Unicode string.
- :param indent_level: Each line of the rendering will be
- indented this many levels. (The ``formatter`` decides what a
- 'level' means, in terms of spaces or other characters
- output.) This is used internally in recursive calls while
- pretty-printing.
- :param encoding: The encoding you intend to use when
- converting the string to a bytestring. decode() is *not*
- responsible for performing that encoding. This information
- is needed so that a real encoding can be substituted in if
- the document contains an encoding declaration (e.g. in a
- <meta> tag).
- :param formatter: Either a `Formatter` object, or a string
- naming one of the standard formatters.
- :param iterator: The iterator to use when navigating over the
- parse tree. This is only used by `Tag.decode_contents` and
- you probably won't need to use it.
- """
- pieces = []
- # First off, turn a non-Formatter `formatter` into a Formatter
- # object. This will stop the lookup from happening over and
- # over again.
- if not isinstance(formatter, Formatter):
- formatter = self.formatter_for_name(formatter)
- if indent_level is True:
- indent_level = 0
- # The currently active tag that put us into string literal
- # mode. Until this element is closed, children will be treated
- # as string literals and not pretty-printed. String literal
- # mode is turned on immediately after this tag begins, and
- # turned off immediately before it's closed. This means there
- # will be whitespace before and after the tag itself.
- string_literal_tag = None
- for event, element in self._event_stream(iterator):
- if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
- element = cast(Tag, element)
- piece = element._format_tag(eventual_encoding, formatter, opening=True)
- elif event is Tag.END_ELEMENT_EVENT:
- element = cast(Tag, element)
- piece = element._format_tag(eventual_encoding, formatter, opening=False)
- if indent_level is not None:
- indent_level -= 1
- else:
- element = cast(NavigableString, element)
- piece = element.output_ready(formatter)
- # Now we need to apply the 'prettiness' -- extra
- # whitespace before and/or after this tag. This can get
- # complicated because certain tags, like <pre> and
- # <script>, can't be prettified, since adding whitespace would
- # change the meaning of the content.
- # The default behavior is to add whitespace before and
- # after an element when string literal mode is off, and to
- # leave things as they are when string literal mode is on.
- if string_literal_tag:
- indent_before = indent_after = False
- else:
- indent_before = indent_after = True
- # The only time the behavior is more complex than that is
- # when we encounter an opening or closing tag that might
- # put us into or out of string literal mode.
- if (
- event is Tag.START_ELEMENT_EVENT
- and not string_literal_tag
- and not cast(Tag, element)._should_pretty_print()
- ):
- # We are about to enter string literal mode. Add
- # whitespace before this tag, but not after. We
- # will stay in string literal mode until this tag
- # is closed.
- indent_before = True
- indent_after = False
- string_literal_tag = element
- elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
- # We are about to exit string literal mode by closing
- # the tag that sent us into that mode. Add whitespace
- # after this tag, but not before.
- indent_before = False
- indent_after = True
- string_literal_tag = None
- # Now we know whether to add whitespace before and/or
- # after this element.
- if indent_level is not None:
- if indent_before or indent_after:
- if isinstance(element, NavigableString):
- piece = piece.strip()
- if piece:
- piece = self._indent_string(
- piece, indent_level, formatter, indent_before, indent_after
- )
- if event == Tag.START_ELEMENT_EVENT:
- indent_level += 1
- pieces.append(piece)
- return "".join(pieces)
- class _TreeTraversalEvent(object):
- """An internal class representing an event in the process
- of traversing a parse tree.
- :meta private:
- """
- # Stand-ins for the different events yielded by _event_stream
- START_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
- END_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
- EMPTY_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
- STRING_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
- def _event_stream(
- self, iterator: Optional[Iterator[PageElement]] = None
- ) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
- """Yield a sequence of events that can be used to reconstruct the DOM
- for this element.
- This lets us recreate the nested structure of this element
- (e.g. when formatting it as a string) without using recursive
- method calls.
- This is similar in concept to the SAX API, but it's a simpler
- interface designed for internal use. The events are different
- from SAX and the arguments associated with the events are Tags
- and other Beautiful Soup objects.
- :param iterator: An alternate iterator to use when traversing
- the tree.
- """
- tag_stack: List[Tag] = []
- iterator = iterator or self.self_and_descendants
- for c in iterator:
- # If the parent of the element we're about to yield is not
- # the tag currently on the stack, it means that the tag on
- # the stack closed before this element appeared.
- while tag_stack and c.parent != tag_stack[-1]:
- now_closed_tag = tag_stack.pop()
- yield Tag.END_ELEMENT_EVENT, now_closed_tag
- if isinstance(c, Tag):
- if c.is_empty_element:
- yield Tag.EMPTY_ELEMENT_EVENT, c
- else:
- yield Tag.START_ELEMENT_EVENT, c
- tag_stack.append(c)
- continue
- else:
- yield Tag.STRING_ELEMENT_EVENT, c
- while tag_stack:
- now_closed_tag = tag_stack.pop()
- yield Tag.END_ELEMENT_EVENT, now_closed_tag
- def _indent_string(
- self,
- s: str,
- indent_level: int,
- formatter: Formatter,
- indent_before: bool,
- indent_after: bool,
- ) -> str:
- """Add indentation whitespace before and/or after a string.
- :param s: The string to amend with whitespace.
- :param indent_level: The indentation level; affects how much
- whitespace goes before the string.
- :param indent_before: Whether or not to add whitespace
- before the string.
- :param indent_after: Whether or not to add whitespace
- (a newline) after the string.
- """
- space_before = ""
- if indent_before and indent_level:
- space_before = formatter.indent * indent_level
- space_after = ""
- if indent_after:
- space_after = "\n"
- return space_before + s + space_after
- def _format_tag(
- self, eventual_encoding: str, formatter: Formatter, opening: bool
- ) -> str:
- if self.hidden:
- # A hidden tag is invisible, although its contents
- # are visible.
- return ""
- # A tag starts with the < character (see below).
- # Then the / character, if this is a closing tag.
- closing_slash = ""
- if not opening:
- closing_slash = "/"
- # Then an optional namespace prefix.
- prefix = ""
- if self.prefix:
- prefix = self.prefix + ":"
- # Then a list of attribute values, if this is an opening tag.
- attribute_string = ""
- if opening:
- attributes = formatter.attributes(self)
- attrs = []
- for key, val in attributes:
- if val is None:
- decoded = key
- else:
- if isinstance(val, list) or isinstance(val, tuple):
- val = " ".join(val)
- elif not isinstance(val, str):
- val = str(val)
- elif (
- isinstance(val, AttributeValueWithCharsetSubstitution)
- and eventual_encoding is not None
- ):
- val = val.substitute_encoding(eventual_encoding)
- text = formatter.attribute_value(val)
- decoded = str(key) + "=" + formatter.quoted_attribute_value(text)
- attrs.append(decoded)
- if attrs:
- attribute_string = " " + " ".join(attrs)
- # Then an optional closing slash (for a void element in an
- # XML document).
- void_element_closing_slash = ""
- if self.is_empty_element:
- void_element_closing_slash = formatter.void_element_close_prefix or ""
- # Put it all together.
- return (
- "<"
- + closing_slash
- + prefix
- + self.name
- + attribute_string
- + void_element_closing_slash
- + ">"
- )
- def _should_pretty_print(self, indent_level: int = 1) -> bool:
- """Should this tag be pretty-printed?
- Most of them should, but some (such as <pre> in HTML
- documents) should not.
- """
- return indent_level is not None and (
- not self.preserve_whitespace_tags
- or self.name not in self.preserve_whitespace_tags
- )
- @overload
- def prettify(
- self,
- encoding: None = None,
- formatter: _FormatterOrName = "minimal",
- ) -> str:
- ...
- @overload
- def prettify(
- self,
- encoding: _Encoding,
- formatter: _FormatterOrName = "minimal",
- ) -> bytes:
- ...
- def prettify(
- self,
- encoding: Optional[_Encoding] = None,
- formatter: _FormatterOrName = "minimal",
- ) -> Union[str, bytes]:
- """Pretty-print this `Tag` as a string or bytestring.
- :param encoding: The encoding of the bytestring, or None if you want Unicode.
- :param formatter: A Formatter object, or a string naming one of
- the standard formatters.
- :return: A string (if no ``encoding`` is provided) or a bytestring
- (otherwise).
- """
- if encoding is None:
- return self.decode(indent_level=0, formatter=formatter)
- else:
- return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
- def decode_contents(
- self,
- indent_level: Optional[int] = None,
- eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
- formatter: _FormatterOrName = "minimal",
- ) -> str:
- """Renders the contents of this tag as a Unicode string.
- :param indent_level: Each line of the rendering will be
- indented this many levels. (The formatter decides what a
- 'level' means in terms of spaces or other characters
- output.) Used internally in recursive calls while
- pretty-printing.
- :param eventual_encoding: The tag is destined to be
- encoded into this encoding. decode_contents() is *not*
- responsible for performing that encoding. This information
- is needed so that a real encoding can be substituted in if
- the document contains an encoding declaration (e.g. in a
- <meta> tag).
- :param formatter: A `Formatter` object, or a string naming one of
- the standard Formatters.
- """
- return self.decode(
- indent_level, eventual_encoding, formatter, iterator=self.descendants
- )
- def encode_contents(
- self,
- indent_level: Optional[int] = None,
- encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
- formatter: _FormatterOrName = "minimal",
- ) -> bytes:
- """Renders the contents of this PageElement as a bytestring.
- :param indent_level: Each line of the rendering will be
- indented this many levels. (The ``formatter`` decides what a
- 'level' means, in terms of spaces or other characters
- output.) This is used internally in recursive calls while
- pretty-printing.
- :param formatter: Either a `Formatter` object, or a string naming one of
- the standard formatters.
- :param encoding: The bytestring will be in this encoding.
- """
- contents = self.decode_contents(indent_level, encoding, formatter)
- return contents.encode(encoding)
- @_deprecated("encode_contents", "4.0.0")
- def renderContents(
- self,
- encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
- prettyPrint: bool = False,
- indentLevel: Optional[int] = 0,
- ) -> bytes:
- """Deprecated method for BS3 compatibility.
- :meta private:
- """
- if not prettyPrint:
- indentLevel = None
- return self.encode_contents(indent_level=indentLevel, encoding=encoding)
- # Soup methods
- @overload
- def find(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- recursive: bool = True,
- string: None=None,
- **kwargs: _StrainableAttribute,
- ) -> _AtMostOneTag:
- ...
- @overload
- def find(
- self,
- name: None=None,
- attrs: None=None,
- recursive: bool = True,
- string: _StrainableString="",
- ) -> _AtMostOneNavigableString:
- ...
- def find(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- recursive: bool = True,
- string: Optional[_StrainableString] = None,
- **kwargs: _StrainableAttribute,
- ) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
- """Look in the children of this PageElement and find the first
- PageElement that matches the given criteria.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param recursive: If this is True, find() will perform a
- recursive search of this Tag's children. Otherwise,
- only the direct children will be considered.
- :param string: A filter on the `Tag.string` attribute.
- :kwargs: Additional filters on attribute values.
- """
- if string is not None and (name is not None or attrs is not None or kwargs):
- # TODO: Using the @overload decorator to express the three ways you
- # could get into this path is way too much code for a rarely(?) used
- # feature.
- elements = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs) # type:ignore
- if elements:
- return cast(Tag, elements[0])
- elif string is None:
- tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
- if tags:
- return cast(Tag, tags[0])
- else:
- strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
- if strings:
- return cast(NavigableString, strings[0])
- return None
- findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
- @overload
- def find_all( # pyright: ignore [reportOverlappingOverload]
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- recursive: bool = True,
- string: None = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeTags:
- ...
- @overload
- def find_all(
- self,
- name: None = None,
- attrs: None = None,
- recursive: bool = True,
- string: _StrainableString = "",
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> _SomeNavigableStrings:
- ...
- def find_all(
- self,
- name: _FindMethodName = None,
- attrs: Optional[_StrainableAttributes] = None,
- recursive: bool = True,
- string: Optional[_StrainableString] = None,
- limit: Optional[int] = None,
- _stacklevel: int = 2,
- **kwargs: _StrainableAttribute,
- ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
- """Look in the children of this `PageElement` and find all
- `PageElement` objects that match the given criteria.
- All find_* methods take a common set of arguments. See the online
- documentation for detailed explanations.
- :param name: A filter on tag name.
- :param attrs: Additional filters on attribute values.
- :param recursive: If this is True, find_all() will perform a
- recursive search of this PageElement's children. Otherwise,
- only the direct children will be considered.
- :param limit: Stop looking after finding this many results.
- :param _stacklevel: Used internally to improve warning messages.
- :kwargs: Additional filters on attribute values.
- """
- generator = self.descendants
- if not recursive:
- generator = self.children
- _stacklevel += 1
- if string is not None and (name is not None or attrs is not None or kwargs):
- # TODO: Using the @overload decorator to express the three ways you
- # could get into this path is way too much code for a rarely(?) used
- # feature.
- return cast(ResultSet[Tag],
- self._find_all(name, attrs, string, limit, generator,
- _stacklevel=_stacklevel, **kwargs)
- )
- if string is None:
- # If string is None, we're searching for tags.
- return cast(ResultSet[Tag], self._find_all(
- name, attrs, None, limit, generator, _stacklevel=_stacklevel, **kwargs
- ))
- # Otherwise, we're searching for strings.
- return cast(ResultSet[NavigableString], self._find_all(
- None, None, string, limit, generator, _stacklevel=_stacklevel, **kwargs
- ))
- findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
- findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
- # Generator methods
- @property
- def children(self) -> Iterator[PageElement]:
- """Iterate over all direct children of this `PageElement`."""
- return (x for x in self.contents)
- @property
- def self_and_descendants(self) -> Iterator[PageElement]:
- """Iterate over this `Tag` and its children in a
- breadth-first sequence.
- """
- return self._self_and(self.descendants)
- @property
- def descendants(self) -> Iterator[PageElement]:
- """Iterate over all children of this `Tag` in a
- breadth-first sequence.
- """
- if not len(self.contents):
- return
- # _last_descendant() can't return None here because
- # accept_self is True. Worst case, last_descendant will end up
- # as self.
- last_descendant = cast(PageElement, self._last_descendant(accept_self=True))
- stopNode = last_descendant.next_element
- current: _AtMostOneElement = self.contents[0]
- while current is not stopNode and current is not None:
- successor = current.next_element
- yield current
- current = successor
- # CSS selector code
- def select_one(
- self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
- ) -> Optional[Tag]:
- """Perform a CSS selection operation on the current element.
- :param selector: A CSS selector.
- :param namespaces: A dictionary mapping namespace prefixes
- used in the CSS selector to namespace URIs. By default,
- Beautiful Soup will use the prefixes it encountered while
- parsing the document.
- :param kwargs: Keyword arguments to be passed into Soup Sieve's
- soupsieve.select() method.
- """
- return self.css.select_one(selector, namespaces, **kwargs)
- def select(
- self,
- selector: str,
- namespaces: Optional[Dict[str, str]] = None,
- limit: int = 0,
- **kwargs: Any,
- ) -> ResultSet[Tag]:
- """Perform a CSS selection operation on the current element.
- This uses the SoupSieve library.
- :param selector: A string containing a CSS selector.
- :param namespaces: A dictionary mapping namespace prefixes
- used in the CSS selector to namespace URIs. By default,
- Beautiful Soup will use the prefixes it encountered while
- parsing the document.
- :param limit: After finding this number of results, stop looking.
- :param kwargs: Keyword arguments to be passed into SoupSieve's
- soupsieve.select() method.
- """
- return self.css.select(selector, namespaces, limit, **kwargs)
- @property
- def css(self) -> CSS:
- """Return an interface to the CSS selector API."""
- return CSS(self)
- # Old names for backwards compatibility
- @_deprecated("children", "4.0.0")
- def childGenerator(self) -> Iterator[PageElement]:
- """Deprecated generator.
- :meta private:
- """
- return self.children
- @_deprecated("descendants", "4.0.0")
- def recursiveChildGenerator(self) -> Iterator[PageElement]:
- """Deprecated generator.
- :meta private:
- """
- return self.descendants
- @_deprecated("has_attr", "4.0.0")
- def has_key(self, key: str) -> bool:
- """Deprecated method. This was kind of misleading because has_key()
- (attributes) was different from __in__ (contents).
- has_key() is gone in Python 3, anyway.
- :meta private:
- """
- return self.has_attr(key)
- _PageElementT = TypeVar("_PageElementT", bound=PageElement)
- class ResultSet(List[_PageElementT], Generic[_PageElementT]):
- """A ResultSet is a list of `PageElement` objects, gathered as the result
- of matching an :py:class:`ElementFilter` against a parse tree. Basically, a list of
- search results.
- """
- source: Optional[ElementFilter]
- def __init__(
- self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
- ) -> None:
- super(ResultSet, self).__init__(result)
- self.source = source
- def __getattr__(self, key: str) -> None:
- """Raise a helpful exception to explain a common code fix."""
- raise AttributeError(
- f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
- )
- # Now that all the classes used by SoupStrainer have been defined,
- # import SoupStrainer itself into this module to preserve the
- # backwards compatibility of anyone who imports
- # bs4.element.SoupStrainer.
- from bs4.filter import SoupStrainer # noqa: E402
|