"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""

import warnings
|
||
|
# Soup Sieve is an optional dependency. When it is missing, keep the module
# importable: bind the name to None (CSS.__init__ checks for that) and tell
# the user why CSS selectors will not work.
try:
    import soupsieve
except ImportError:
    soupsieve = None
    warnings.warn(
        'The soupsieve package is not installed. CSS selectors cannot be used.'
    )
|
||
|
|
||
|
|
||
|
class CSS(object):
    """A proxy object against the soupsieve library, to simplify its
    CSS selector API.

    Acquire this object through the .css attribute on the
    BeautifulSoup object, or on the Tag you want to use as the
    starting point for a CSS selector.

    The main advantage of doing this is that the tag to be selected
    against doesn't need to be explicitly specified in the function
    calls, since it's already scoped to a tag.
    """

    def __init__(self, tag, api=soupsieve):
        """Constructor.

        You don't need to instantiate this class yourself; instead,
        access the .css attribute on the BeautifulSoup object, or on
        the Tag you want to use as the starting point for your CSS
        selector.

        :param tag: All CSS selectors will use this as their starting
         point.

        :param api: A plug-in replacement for the soupsieve module,
         designed mainly for use in tests.

        :raises NotImplementedError: If `api` is None, which is the
         default when the soupsieve package is not installed.
        """
        if api is None:
            raise NotImplementedError(
                "Cannot execute CSS selectors because the soupsieve package is not installed."
            )
        self.api = api
        self.tag = tag

    def escape(self, ident):
        """Escape a CSS identifier.

        This is a simple wrapper around soupsieve.escape(). See the
        documentation for that function for more information.

        :param ident: The identifier to escape.

        :raises NotImplementedError: If this object has no selector
         API to delegate to.
        """
        # Check the API this object was actually configured with, not
        # the module-level soupsieve import: a plug-in replacement must
        # keep working even when soupsieve itself is not installed.
        if self.api is None:
            raise NotImplementedError(
                "Cannot escape CSS identifiers because the soupsieve package is not installed."
            )
        return self.api.escape(ident)

    def _ns(self, ns, select):
        """Normalize a dictionary of namespaces.

        :param ns: The namespace dictionary supplied by the caller, or
         None.

        :param select: The selector the namespaces will be used with;
         either a selector string or a precompiled pattern.

        :return: `ns` if the caller provided one or if `select` is a
         precompiled pattern; otherwise the namespaces recorded on the
         starting tag.
        """
        # A precompiled pattern already has a namespace context
        # compiled in, which cannot be replaced, so the tag's
        # namespaces are only substituted for selectors that are not
        # precompiled and for which the caller gave no namespaces.
        if not isinstance(select, self.api.SoupSieve) and ns is None:
            ns = self.tag._namespaces
        return ns

    def _rs(self, results):
        """Normalize a list of results to a ResultSet.

        A ResultSet is more consistent with the rest of Beautiful
        Soup's API, and ResultSet.__getattr__ has a helpful error
        message if you try to treat a list of results as a single
        result (a common mistake).

        :param results: The results to wrap.

        :return: A ResultSet containing `results`.
        """
        # Import here to avoid circular import
        from bs4.element import ResultSet

        return ResultSet(None, results)

    def compile(self, select, namespaces=None, flags=0, **kwargs):
        """Pre-compile a selector and return the compiled object.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
           used in the CSS selector to namespace URIs. By default,
           Beautiful Soup will use the prefixes it encountered while
           parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.compile() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
           soupsieve.compile() method.

        :return: A precompiled selector object.
        :rtype: soupsieve.SoupSieve
        """
        return self.api.compile(
            select, self._ns(namespaces, select), flags, **kwargs
        )

    def select_one(self, select, namespaces=None, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag and return the
        first result.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.select_one()
        method.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
           used in the CSS selector to namespace URIs. By default,
           Beautiful Soup will use the prefixes it encountered while
           parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.select_one() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
           soupsieve.select_one() method.

        :return: A Tag, or None if the selector has no match.
        :rtype: bs4.element.Tag
        """
        return self.api.select_one(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.select()
        method.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param limit: After finding this number of results, stop looking.
            None is normalized to 0 before being passed to Soup Sieve.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.select() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.select() method.

        :return: A ResultSet of Tag objects.
        :rtype: bs4.element.ResultSet
        """
        if limit is None:
            limit = 0

        return self._rs(
            self.api.select(
                select, self.tag, self._ns(namespaces, select), limit, flags,
                **kwargs
            )
        )

    def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.iselect()
        method. It is the same as select(), but it returns a generator
        instead of a list.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param limit: After finding this number of results, stop looking.
            None is normalized to 0 before being passed to Soup Sieve.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.iselect() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.iselect() method.

        :return: A generator
        :rtype: types.GeneratorType
        """
        # Mirror select(): accept limit=None as a synonym for 0 so both
        # entry points take the same arguments.
        if limit is None:
            limit = 0

        return self.api.iselect(
            select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
        )

    def closest(self, select, namespaces=None, flags=0, **kwargs):
        """Find the Tag closest to this one that matches the given selector.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.closest()
        method.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.closest() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.closest() method.

        :return: A Tag, or None if there is no match.
        :rtype: bs4.element.Tag
        """
        return self.api.closest(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def match(self, select, namespaces=None, flags=0, **kwargs):
        """Check whether this Tag matches the given CSS selector.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.match()
        method.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.match() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.match() method.

        :return: True if this Tag matches the selector; False otherwise.
        :rtype: bool
        """
        return self.api.match(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def filter(self, select, namespaces=None, flags=0, **kwargs):
        """Filter this Tag's direct children based on the given CSS selector.

        This uses the Soup Sieve library. It works the same way as
        passing this Tag into that library's soupsieve.filter()
        method. For more information, see the documentation for
        soupsieve.filter().

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.filter() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.filter() method.

        :return: A ResultSet of Tag objects.
        :rtype: bs4.element.ResultSet
        """
        return self._rs(
            self.api.filter(
                select, self.tag, self._ns(namespaces, select), flags, **kwargs
            )
        )
|