# Use of this source code is governed by the MIT license.
__license__ = " MIT "
try :
from collections . abc import Callable # Python 3.6
except ImportError as e :
from collections import Callable
import re
import sys
import warnings
try :
import soupsieve
except ImportError as e :
soupsieve = None
warnings . warn (
' The soupsieve package is not installed. CSS selectors cannot be used. '
)
from bs4 . formatter import (
Formatter ,
HTMLFormatter ,
XMLFormatter ,
)
# Default encoding name. NavigableString.__new__ uses it to decode
# values that are not already str.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# True when the running interpreter's major version is 3 or later.
PY3K = sys.version_info >= (3,)

# One or more consecutive non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. It is kept for a little while on
# the off chance someone imported it for their own use.
whitespace_re = re.compile(r"\s+")
def _alias(attr):
    """Build a property that forwards reads and writes to another attribute.

    Used to keep an old attribute name working as an alias of a new one.

    :param attr: Name of the attribute the alias stands in for.
    :return: A property. Reading it reads ``attr`` on the instance;
        assigning to it assigns ``attr`` on the instance.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is called with the assigned value. The
        # previous one-argument setter made every assignment through the
        # alias fail with TypeError.
        setattr(self, attr, value)

    return alias
# These encodings are recognized by Python (so PageElement.encode
# could theoretically support them) but XML and HTML don't recognize
# them (so they should not show up in an XML or HTML document as that
# document's encoding).
#
# If an XML document is encoded in one of these encodings, no encoding
# will be mentioned in the XML declaration. If an HTML document is
# encoded in one of these encodings, and the HTML document has a
# <meta> tag that mentions an encoding, the encoding will be given as
# the empty string.
#
# Source:
# https://docs.python.org/3/library/codecs.html#python-specific-encodings
# Codec names that Python understands but that must not be advertised as
# a document's encoding. The encode() methods of the charset stand-in
# classes in this module return the empty string for any of these.
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "raw-unicode-escape",
    "string_escape",
    "string-escape",
    "undefined",
    "unicode_escape",
    "unicode-escape",
}
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records how it was built.

    The instance compares and behaves like the combined string, and
    exposes the pieces as ``.prefix``, ``.name`` and ``.namespace``.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Create the combined string.

        :param prefix: The namespace prefix (e.g. 'xml'); may be falsy.
        :param name: The local name (e.g. 'lang'). Any falsy value is
            stored as None; per
            https://www.w3.org/TR/xml-names/#defaulting the default
            namespace's name "has no value".
        :param namespace: Stored unchanged on the instance.
        """
        name = name or None
        if name is None:
            # Default namespace: the prefix alone is the text.
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # Not really namespaced.
            text = name
        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to ``str``. The subclasses defined in
    this module override ``encode(encoding)`` to return the attribute
    value that should be written for a given output encoding.
    """
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        """Wrap ``original_value`` and remember it as ``.original_value``."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset value to write for the given output encoding.

        :param encoding: Name of the encoding the document is being
            encoded to.
        :return: The empty string if ``encoding`` is one of
            PYTHON_SPECIFIC_ENCODINGS, otherwise ``encoding`` itself.
        """
        return "" if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    the value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the encoding name that follows it.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Wrap ``original_value`` only if it mentions a charset.

        :return: An instance of this class when CHARSET_RE matches;
            otherwise a plain ``str``, since no substitution is needed.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            return str.__new__(str, original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the content value with its charset replaced.

        :param encoding: Name of the encoding the document is being
            encoded to.
        :return: The empty string if ``encoding`` is one of
            PYTHON_SPECIFIC_ENCODINGS; otherwise the original value with
            the text after "charset=" replaced by ``encoding``.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ""
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + encoding, self.original_value)
class PageElement ( object ) :
""" Contains the navigational information for some part of the page:
that is , its current location in the parse tree .
NavigableString , Tag , etc . are all subclasses of PageElement .
"""
def setup ( self , parent = None , previous_element = None , next_element = None ,
previous_sibling = None , next_sibling = None ) :
""" Sets up the initial relations between this element and
other elements .
: param parent : The parent of this element .
: param previous_element : The element parsed immediately before
this one .
: param next_element : The element parsed immediately before
this one .
: param previous_sibling : The most recently encountered element
on the same level of the parse tree as this one .
: param previous_sibling : The next element to be encountered
on the same level of the parse tree as this one .
"""
self . parent = parent
self . previous_element = previous_element
if previous_element is not None :
self . previous_element . next_element = self
self . next_element = next_element
if self . next_element is not None :
self . next_element . previous_element = self
self . next_sibling = next_sibling
if self . next_sibling is not None :
self . next_sibling . previous_sibling = self
if ( previous_sibling is None
and self . parent is not None and self . parent . contents ) :
previous_sibling = self . parent . contents [ - 1 ]
self . previous_sibling = previous_sibling
if previous_sibling is not None :
self . previous_sibling . next_sibling = self
def format_string(self, s, formatter):
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters. None means "no formatting".
    :return: ``s`` unchanged when ``formatter`` is None, otherwise the
        result of the formatter's ``substitute(s)``.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        chosen = formatter
    else:
        chosen = self.formatter_for_name(formatter)
    return chosen.substitute(s)
def formatter_for_name(self, formatter):
    """Look up or create a Formatter for the given identifier.

    :param formatter: Can be a Formatter object (used as-is), a
        function (used as the entity substitution hook for a new
        XMLFormatter or HTMLFormatter), or a string (used to look
        up an XMLFormatter or HTMLFormatter in the appropriate
        registry).
    :return: A Formatter. The XML or HTML flavour is chosen from
        ``self._is_xml``.
    """
    if isinstance(formatter, Formatter):
        return formatter
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
@property
def _is_xml(self):
    """Is this element part of an XML tree or an HTML tree?

    This is used in formatter_for_name, when deciding whether an
    XMLFormatter or HTMLFormatter is more appropriate. It can be
    inefficient (it may walk up the parents), but it should be called
    very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Usually determined when the document was parsed.
        return known

    # Otherwise the element was probably built by calling the
    # constructor directly from user code.
    parent = self.parent
    if parent is None:
        # Top-level object without .known_xml: fall back to .is_xml,
        # and guess HTML if that is missing too.
        return getattr(self, 'is_xml', False)
    return parent._is_xml
nextSibling = _alias ( " next_sibling " ) # BS3
previousSibling = _alias ( " previous_sibling " ) # BS3
# Sentinel for "the caller did not pass `types`". It is a fresh object so
# it can be told apart from an explicit None.
default = object()

def _all_strings(self, strip=False, types=default):
    """Yield all strings of certain classes, possibly stripping them.

    This base implementation is a placeholder that always raises; it is
    implemented differently in Tag and NavigableString.

    :param strip: If True, strings are to be stripped before being
        yielded.
    :param types: Which string classes to consider.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()
@property
def stripped_strings(self):
    """Yield all strings in this PageElement, stripping them first.

    :yield: A sequence of stripped strings, as produced by
        ``self._all_strings(True)``.
    """
    yield from self._all_strings(True)
def get_text(self, separator="", strip=False, types=default):
    """Return this element's strings joined by ``separator``.

    :param separator: Strings will be concatenated using this separator.
    :param strip: If True, strings will be stripped before being
        concatenated.
    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.
    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)


getText = get_text
text = property(get_text)
def replace_with(self, *args):
    """Replace this PageElement with one or more PageElements, keeping
    the rest of the tree the same.

    :param args: One or more PageElements.
    :return: `self`. After a real replacement it is no longer part of
        the tree; replacing an element with itself is a no-op and
        leaves it in place.
    :raises ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    if self.parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if len(args) == 1 and args[0] is self:
        # Nothing to do, but still return `self` as documented; callers
        # such as wrap() use the return value.
        return self
    if any(x is self.parent for x in args):
        raise ValueError("Cannot replace a Tag with its parent.")
    old_parent = self.parent
    my_index = self.parent.index(self)
    # Passing the known index spares extract() a second lookup.
    self.extract(_self_index=my_index)
    for idx, replacement in enumerate(args, start=my_index):
        old_parent.insert(idx, replacement)
    return self


replaceWith = replace_with  # BS3
def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raises ValueError: If this element has no parent.
    """
    former_parent = self.parent
    if former_parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    position = former_parent.index(self)
    self.extract(_self_index=position)
    # Walk a snapshot back to front: inserting every child at the same
    # position in reverse order leaves them in their original order.
    for child in list(self.contents)[::-1]:
        former_parent.insert(position, child)
    return self


replace_with_children = unwrap
replaceWithChildren = unwrap  # BS3
def wrap(self, wrap_inside):
    """Wrap this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, occupying the position in the tree that
        used to be occupied by `self`, and with `self` inside it.
    """
    # replace_with() hands back the displaced element, which then
    # becomes the wrapper's newest child.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside
def extract(self, _self_index=None):
    """Destructively rip this element out of the tree.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.
    :return: `self`, no longer part of the tree.
    """
    parent = self.parent
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        del parent.contents[_self_index]

    # Find the two elements that would be next to each other if this
    # element (and any children) hadn't been parsed, and connect them.
    last_child = self._last_descendant()
    after = last_child.next_element
    before = self.previous_element
    if before is not after:
        if before is not None:
            before.next_element = after
        if after is not None:
            after.previous_element = before
    self.previous_element = None
    last_child.next_element = None

    self.parent = None

    # Same stitching for the sibling chain.
    left = self.previous_sibling
    right = self.next_sibling
    if left is not right:
        if left is not None:
            left.next_sibling = right
        if right is not None:
            right.previous_sibling = left
    self.previous_sibling = self.next_sibling = None
    return self
def _last_descendant(self, is_initialized=True, accept_self=True):
    """Find the last element beneath this object to be parsed.

    :param is_initialized: Has `setup` been called on this PageElement
        yet? If so, and there is a next sibling, the answer is read off
        that sibling's ``previous_element`` instead of walking down.
    :param accept_self: Is `self` an acceptable answer to the question?
    :return: The last descendant, or None when that would be `self`
        and `accept_self` is false.
    """
    if is_initialized and self.next_sibling is not None:
        candidate = self.next_sibling.previous_element
    else:
        candidate = self
        # Keep descending into the last child while there is one.
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]
    if candidate is self and not accept_self:
        return None
    return candidate


# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
        in `self.children` by the new PageElement.
    :param new_child: A PageElement. A plain string is first converted
        to a NavigableString; a BeautifulSoup object has its children
        inserted one by one instead of being nested.
    :raises ValueError: If `new_child` is None or is `self`.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    is_plain_string = (isinstance(new_child, str)
                       and not isinstance(new_child, NavigableString))
    if is_plain_string:
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one
        # BeautifulSoup object contains another. Insert the children
        # one at a time, from a snapshot of the list.
        for offset, subchild in enumerate(list(new_child.contents)):
            self.insert(position + offset, subchild)
        return

    position = min(position, len(self.contents))
    if getattr(new_child, 'parent', None) is not None:
        # The element is already attached somewhere. If it is one of
        # our own children and sits before the target, extracting it
        # shifts the target index down by one.
        if new_child.parent is self and self.index(new_child) < position:
            position -= 1
        new_child.extract()

    new_child.parent = self
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        previous_child.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    tail = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None
        # Appending at the end: whatever follows in document order is
        # the next sibling of the nearest ancestor that has one, or
        # None if this is the end of the document.
        ancestor = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        tail.next_element = following
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if next_child is not None:
            next_child.previous_sibling = new_child
        tail.next_element = next_child

    if tail.next_element is not None:
        tail.next_element.previous_element = tail
    self.contents.insert(position, new_child)
def append(self, tag):
    """Append the given PageElement to the contents of this one.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)
def extend(self, tags):
    """Append the given PageElements to this one's contents.

    :param tags: A list (or other iterable) of PageElements. A Tag may
        also be passed, in which case its contents are appended.
    """
    if isinstance(tags, Tag):
        tags = tags.contents
    if isinstance(tags, list):
        # Appending an element that already lives in a tree moves it,
        # which can shrink the very list being iterated (e.g. when the
        # caller passes some tag's .contents). Iterate a snapshot so no
        # element is skipped.
        tags = list(tags)
    for tag in tags:
        self.append(tag)
def insert_before(self, *args):
    """Make the given element(s) the immediate predecessor of this one.

    All the elements will have the same parent, and the given elements
    will be immediately before this one.

    :param args: One or more PageElements.
    :raises ValueError: If this element has no parent, or if it is
        itself among `args`.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")
    for predecessor in args:
        # Extract first so that the index won't be thrown off if the
        # two elements are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        anchor = parent.index(self)
        parent.insert(anchor, predecessor)
def insert_after(self, *args):
    """Make the given element(s) the immediate successor of this one.

    The elements will have the same parent, and the given elements
    will be immediately after this one, in the order given.

    :param args: One or more PageElements.
    :raises ValueError: If this element has no parent, or if it is
        itself among `args`.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    for offset, successor in enumerate(args):
        # Extract first so that the index won't be thrown off if the
        # two elements are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        anchor = parent.index(self)
        # `offset` skips over the successors already placed.
        parent.insert(anchor + 1 + offset, successor)
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first match that appears later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    searcher = self.find_all_next
    return self._find_one(searcher, name, attrs, text, **kwargs)


findNext = find_next  # BS3
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Return every match that appears later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    candidates = self.next_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)


findAllNext = find_all_next  # BS3
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest later sibling that matches the criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    searcher = self.find_next_siblings
    return self._find_one(searcher, name, attrs, text, **kwargs)


findNextSibling = find_next_sibling  # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Return every later sibling that matches the criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.next_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)


findNextSiblings = find_next_siblings  # BS3
fetchNextSiblings = find_next_siblings  # BS2
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Look backwards in the document and return the first match.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    searcher = self.find_all_previous
    return self._find_one(searcher, name, attrs, text, **kwargs)


findPrevious = find_previous  # BS3
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Look backwards in the document and return every match.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)


findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous  # BS2
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest earlier sibling that matches the criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    searcher = self.find_previous_siblings
    return self._find_one(searcher, name, attrs, text, **kwargs)


findPreviousSibling = find_previous_sibling  # BS3
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return every earlier sibling that matches the criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)


findPreviousSiblings = find_previous_siblings  # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the closest parent that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: We can't use _find_one because find_parents takes a
    # different set of arguments (there is no `text` filter).
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None


findParent = find_parent  # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return every parent of this PageElement that matches the criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: The result of `_find_all` over `self.parents`, i.e. a
        ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    ancestors = self.parents
    # Parents are never strings, so no text filter is passed.
    return self._find_all(name, attrs, None, limit, ancestors, **kwargs)


findParents = find_parents  # BS3
fetchParents = find_parents  # BS2
@property
def next(self):
    """The PageElement, if any, that was parsed just after this one.

    Read-only shorthand for ``next_element``.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    following = self.next_element
    return following
@property
def previous(self):
    """The PageElement, if any, that was parsed just before this one.

    Read-only shorthand for ``previous_element``.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    preceding = self.previous_element
    return preceding
#These methods do the real heavy lifting.
def _find_one(self, method, name, attrs, text, **kwargs):
    """Run a find-all style method with a limit of 1 and unwrap the result.

    :param method: A callable taking (name, attrs, text, limit, **kwargs)
        and returning a sequence of matches.
    :return: The first match, or None if the sequence is empty.
    """
    matches = method(name, attrs, text, 1, **kwargs)
    return matches[0] if matches else None
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterate over a generator looking for things that match.

    :param name: A filter on tag name, or a ready-made SoupStrainer
        (in which case it is used as the matcher directly).
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
        When None, a `string` keyword argument is used in its place.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to search, in order.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of matches.
    """
    if text is None and 'string' in kwargs:
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a
                # namespace-aware document, we need to match the local
                # name against tag.name. If not, we need to match the
                # fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The Tag check must guard both name tests; `A and B or C`
            # would let the second test run on non-Tag elements.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (element.name == name
                           or (element.name == local_name
                               and (prefix is None
                                    or element.prefix == prefix))))
            return ResultSet(strainer, result)

    results = ResultSet(strainer)
    for element in generator:
        if not element:
            # Falsy elements are skipped without being offered to the
            # strainer.
            continue
        found = strainer.search(element)
        if found:
            results.append(found)
            if limit and len(results) >= limit:
                break
    return results
#These generators can be used to navigate starting from both
#NavigableStrings and Tags.
@property
def next_elements(self):
    """All PageElements that were parsed after this one.

    :yield: A sequence of PageElements, following ``next_element``
        links until one is None.
    """
    node = self.next_element
    while True:
        if node is None:
            return
        yield node
        node = node.next_element
@property
def next_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: A sequence of PageElements, following ``next_sibling``
        links until one is None.
    """
    node = self.next_sibling
    while True:
        if node is None:
            return
        yield node
        node = node.next_sibling
@property
def previous_elements(self):
    """All PageElements that were parsed before this one.

    :yield: A sequence of PageElements, nearest first, following
        ``previous_element`` links until one is None.
    """
    node = self.previous_element
    while True:
        if node is None:
            return
        yield node
        node = node.previous_element
@property
def previous_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    earlier.

    :yield: A sequence of PageElements, nearest first, following
        ``previous_sibling`` links until one is None.
    """
    node = self.previous_sibling
    while True:
        if node is None:
            return
        yield node
        node = node.previous_sibling
@property
def parents(self):
    """All PageElements that are parents of this PageElement.

    :yield: A sequence of PageElements, nearest first, following
        ``parent`` links until one is None.
    """
    node = self.parent
    while True:
        if node is None:
            return
        yield node
        node = node.parent
@property
def decomposed(self):
    """Check whether a PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    reported as False.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    if flag:
        return flag
    return False
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
    """BS3-style method form of the ``next_elements`` property."""
    return self.next_elements


def nextSiblingGenerator(self):
    """BS3-style method form of the ``next_siblings`` property."""
    return self.next_siblings


def previousGenerator(self):
    """BS3-style method form of the ``previous_elements`` property."""
    return self.previous_elements


def previousSiblingGenerator(self):
    """BS3-style method form of the ``previous_siblings`` property."""
    return self.previous_siblings


def parentGenerator(self):
    """BS3-style method form of the ``parents`` property."""
    return self.parents
class NavigableString ( str , PageElement ) :
""" A Python Unicode string that is part of a parse tree.
When Beautiful Soup parses the markup < b > penguin < / b > , it will
create a NavigableString for the string " penguin " .
"""
PREFIX = ' '
SUFFIX = ' '
# We can't tell just by looking at a string whether it's contained
# in an XML document or an HTML document.
known_xml = None
def __new__ ( cls , value ) :
""" Create a new NavigableString.
When unpickling a NavigableString , this method is called with
the string in DEFAULT_OUTPUT_ENCODING . That encoding needs to be
passed in to the superclass ' s __new__ or the superclass won ' t know
how to handle non - ASCII characters .
"""
if isinstance ( value , str ) :
u = str . __new__ ( cls , value )
else :
u = str . __new__ ( cls , value , DEFAULT_OUTPUT_ENCODING )
u . setup ( )
return u
def __copy__ ( self ) :
""" A copy of a NavigableString has the same contents and class
as the original , but it is not connected to the parse tree .
"""
return type ( self ) ( self )
def __getnewargs__ ( self ) :
return ( str ( self ) , )
def __getattr__ ( self , attr ) :
""" text.string gives you text. This is for backwards
compatibility for Navigable * String , but for CData * it lets you
get the string without the CData wrapper . """
if attr == ' string ' :
return self
else :
raise AttributeError (
" ' %s ' object has no attribute ' %s ' " % (
self . __class__ . __name__ , attr ) )
def output_ready ( self , formatter = " minimal " ) :
""" Run the string through the provided formatter.
: param formatter : A Formatter object , or a string naming one of the standard formatters .
"""
output = self . format_string ( self , formatter )
return self . PREFIX + output + self . SUFFIX
@property
def name ( self ) :
""" Since a NavigableString is not a Tag, it has no .name.
This property is implemented so that code like this doesn ' t crash
when run on a mixture of Tag and NavigableString objects :
[ x . name for x in tag . children ]
"""
return None
@name.setter
def name ( self , name ) :
""" Prevent NavigableString.name from ever being set. """
raise AttributeError ( " A NavigableString cannot be given a name. " )
def _all_strings(self, strip=False, types=PageElement.default):
    """Yield this string if it is of an interesting type, else nothing.

    This gives NavigableString the same text-extraction hook that
    other PageElements offer, so conveniences like get_text() behave
    consistently.

    :param strip: If True, the string is stripped before being
        yielded, and is not yielded at all if nothing remains.
    :param types: A NavigableString subclass, or a collection of
        them. If this string's exact class is not among them, the
        sequence is empty. None disables the type check. When left at
        its default, Tag.DEFAULT_INTERESTING_STRING_TYPES is used.
    :yield: A sequence that either contains this string, or is empty.
    """
    if types is self.default:
        # The default tuple lives on Tag because it names subclasses
        # of this class, which aren't defined until later in the file.
        types = Tag.DEFAULT_INTERESTING_STRING_TYPES

    # Bail out if the caller wants specific string types and this
    # string's exact class is not one of them.
    own_type = type(self)
    if types is not None:
        if isinstance(types, type):
            # A single class was requested.
            wanted = own_type is types
        else:
            # A collection of classes was requested.
            wanted = own_type in types
        if not wanted:
            return

    text = self
    if strip:
        text = text.strip()
    if len(text) > 0:
        yield text
strings = property(_all_strings)
class PreformattedString(NavigableString):
    """A NavigableString that is output without being reformatted.

    This serves as the base for special kinds of strings such as
    comments (the Comment class) and CDATA blocks (the CData class).
    """

    PREFIX = ' '
    SUFFIX = ' '

    def output_ready(self, formatter=None):
        """Return this string wrapped in the class PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. When given, the string is
            passed through it only to trigger any side effects; the
            formatted result is discarded.
        :return: The unmodified string with the subclass-specific
            prefix and suffix added.
        """
        if formatter is not None:
            # Called for its side effects only; the result is unused.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A CDATA block; PREFIX and SUFFIX hold its wrapper markup."""

    PREFIX = ' <![CDATA[ '
    SUFFIX = ' ]]> '
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction; PREFIX/SUFFIX hold its delimiters."""

    PREFIX = ' <? '
    SUFFIX = ' > '
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction; it overrides the parent's delimiters."""

    PREFIX = ' <? '
    SUFFIX = ' ?> '
class Comment(PreformattedString):
    """An HTML or XML comment; PREFIX and SUFFIX hold its delimiters."""

    PREFIX = ' <!-- '
    SUFFIX = ' --> '
class Declaration(PreformattedString):
    """An XML declaration; PREFIX and SUFFIX hold its delimiters."""

    PREFIX = ' <? '
    SUFFIX = ' ?> '
class Doctype(PreformattedString):
    """A document type declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a document type declaration from its three components.

        :param name: The name of the document's root element, e.g.
            'html'. A false value is replaced by a placeholder string.
        :param pub_id: The Formal Public Identifier for this document
            type, e.g. '-//W3C//DTD XHTML 1.1//EN', or None.
        :param system_id: The system identifier for this document
            type, e.g.
            'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
            or None.
        :return: An instance of the class this method was called on
            (Doctype, or a subclass of it).
        """
        value = name or ' '
        if pub_id is not None:
            value += ' PUBLIC " %s " ' % pub_id
            # With a public ID, the system ID follows it without a
            # keyword of its own.
            if system_id is not None:
                value += ' " %s " ' % system_id
        elif system_id is not None:
            value += ' SYSTEM " %s " ' % system_id
        # Build through cls rather than naming Doctype, so that a
        # subclass calling this constructor gets its own type back.
        return cls(value)

    PREFIX = ' <!DOCTYPE '
    SUFFIX = ' > \n '
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    """
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    Used to distinguish executable code from textual content.
    """
class TemplateString(NavigableString):
    """A NavigableString found inside an HTML template that is
    embedded in a larger document.

    Used to distinguish such strings from the main body of the
    document.
    """
class Tag ( PageElement ) :
""" Represents an HTML or XML tag that is part of a parse tree, along
with its attributes and contents .
When Beautiful Soup parses the markup < b > penguin < / b > , it will
create a Tag object representing the < b > tag .
"""
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None,
             interesting_string_types=None,
             ):
    """Set up a new Tag.

    :param parser: A BeautifulSoup object. Only its class is kept.
    :param builder: A TreeBuilder. When given, it decides the
        empty-element, CDATA-list, whitespace and string-type
        settings, and the four matching arguments below are ignored.
    :param name: The name of the tag. Required.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately
        before this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in
        its source document.
    :param sourcepos: The character position within `sourceline`
        where this tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be
        represented as <tag></tag>.
    :param cdata_list_attributes: A list of attributes whose values
        should be treated as CDATA if they ever show up on this tag.
    :param preserve_whitespace_tags: A list of tag names whose
        contents should have their whitespace preserved.
    :param interesting_string_types: A NavigableString subclass or a
        tuple of them: the types of strings considered when iterating
        over this Tag's strings in methods like Tag.strings or
        Tag.get_text.
    :raise ValueError: If no name is given.
    """
    # Only the parser's class is stored, never the parser object
    # itself: that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__
    if name is None:
        raise ValueError(" No value provided for new tag ' s name. ")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Record the source position only if one was supplied and the
    # builder (when there is one) keeps line numbers.
    has_position = sourceline is not None or sourcepos is not None
    if (not builder or builder.store_line_numbers) and has_position:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        # Let the builder rewrite the values of CDATA-list attributes.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Take a private copy of whatever mapping was passed in.
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml

    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep the builder's whole collection of attributes that
        # might need to be treated as a list. It is stored rather
        # than queried per tag for performance reasons: asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element) it is almost never needed.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be
        # treated as a whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # The builder registers a special string container class
            # for this tag name; use it as the interesting type.
            self.interesting_string_types = (
                builder.string_containers[self.name])
        else:
            self.interesting_string_types = (
                self.DEFAULT_INTERESTING_STRING_TYPES)
    else:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a
        # copy of some other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
parserClass = _alias(" parser_class ")  # BS3
def __copy__ ( self ) :
""" A copy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag ' s contents.
"""
clone = type ( self ) (
None , self . builder , self . name , self . namespace ,
self . prefix , self . attrs , is_xml = self . _is_xml ,
sourceline = self . sourceline , sourcepos = self . sourcepos ,
can_be_empty_element = self . can_be_empty_element ,
cdata_list_attributes = self . cdata_list_attributes ,
preserve_whitespace_tags = self . preserve_whitespace_tags
)
for attr in ( ' can_be_empty_element ' , ' hidden ' ) :
setattr ( clone , attr , getattr ( self , attr ) )
for child in self . contents :
clone . append ( child . __copy__ ( ) )
return clone
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag. For a tag
    without contents, the answer is whatever was stored in
    `can_be_empty_element` when the tag was created, which depends on
    the builder used to create it.
    """
    has_no_contents = len(self.contents) == 0
    return has_no_contents and self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    :return: If this element has a single string child, that string.
        If it has a single child that is not a string, that child's
        own `string` value, found recursively. If it has no children
        or more than one child, None.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if not isinstance(only_child, NavigableString):
        # Delegate to the lone child.
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`."""
    self.clear()
    # Append a fresh object of the same class as `string`.
    self.append(string.__class__(string))
# String classes considered interesting when no others are specified.
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)

def _all_strings(self, strip=False, types=PageElement.default):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that end up empty are skipped.
    :param types: A NavigableString subclass, or a collection of
        them. Descendants whose exact class is not among them are
        ignored. If None, every NavigableString descendant is
        yielded. When left at its default,
        self.interesting_string_types is used.
    :yield: A sequence of strings.
    """
    if types is self.default:
        types = self.interesting_string_types

    single_type = isinstance(types, type)
    for node in self.descendants:
        if types is None:
            # No type filter: accept any kind of string, skip the rest.
            if not isinstance(node, NavigableString):
                continue
        elif single_type:
            if type(node) is not types:
                # We're not interested in strings of this type.
                continue
        elif type(node) not in types:
            # We're not interested in strings of this type.
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
strings = property(_all_strings)
def decompose(self):
    """Recursively destroy this PageElement and its children.

    The element is first extracted from the tree. Then it, and every
    element reachable from it through `next_element`, has its
    instance dictionary wiped, is given an empty `contents` list and
    is flagged with `_decomposed = True`.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Grab the successor before wiping the link to it.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = following
def clear(self, decompose=False):
    """Wipe out all children of this PageElement.

    :param decompose: If this is True, children that are Tags are
        removed with decompose() (a more destructive method) instead
        of extract(). Other children are always extracted.
    """
    # Iterate over a snapshot: removing children changes .contents.
    for element in self.contents[:]:
        if decompose and isinstance(element, Tag):
            element.decompose()
        else:
            element.extract()
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree. Child Tags are smoothed
    recursively. PreformattedString objects are never merged.
    """
    def mergeable(node):
        # Only ordinary strings take part in consolidation.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every pair of children that needs
    # to be consolidated. This avoids copying self.contents, since in
    # most cases very few strings will be affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no successor to be merged with.
            continue
        following = self.contents[position + 1]
        if mergeable(current) and mergeable(following):
            merge_points.append(position)

    # Go over the marked positions in reverse order, so that removing
    # items from .contents won't affect the remaining positions.
    for position in reversed(merge_points):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the index
    of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` in `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    positions = (
        position for position, child in enumerate(self.contents)
        if child is element
    )
    found = next(positions, None)
    if found is None:
        raise ValueError(" Tag.index: element not in tag ")
    return found
def get(self, key, default=None):
    """Look up the attribute `key` on this tag.

    :param key: The attribute to look for.
    :param default: Returned when the tag doesn't have that attribute.
    :return: The attribute's value, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself if it is already a list; otherwise a
        one-element list wrapping the value.
    """
    value = self.get(key, default)
    return value if isinstance(value, list) else [value]
def has_attr(self, key):
    """Report whether this PageElement has an attribute named `key`."""
    attributes = self.attrs
    return key in attributes
def __hash__ ( self ) :
return str ( self ) . __hash__ ( )
def __getitem__ ( self , key ) :
""" tag[key] returns the value of the ' key ' attribute for the Tag,
and throws an exception if it ' s not there. " " "
return self . attrs [ key ]
def __iter__ ( self ) :
" Iterating over a Tag iterates over its contents. "
return iter ( self . contents )
def __len__ ( self ) :
" The length of a Tag is the length of its list of contents. "
return len ( self . contents )
def __contains__ ( self , x ) :
return x in self . contents
def __bool__ ( self ) :
" A tag is non-None even if it has no contents. "
return True
def __setitem__ ( self , key , value ) :
""" Setting tag[key] sets the value of the ' key ' attribute for the
tag . """
self . attrs [ key ] = value
def __delitem__ ( self , key ) :
" Deleting tag[key] deletes all ' key ' attributes for the tag. "
self . attrs . pop ( key , None )
def __call__ ( self , * args , * * kwargs ) :
""" Calling a Tag like a function is the same as calling its
find_all ( ) method . Eg . tag ( ' a ' ) returns a list of all the A tags
found within this tag . """
return self . find_all ( * args , * * kwargs )
def __getattr__ ( self , tag ) :
""" Calling tag.subtag is the same as calling tag.find(name= " subtag " ) """
#print("Getattr %s.%s" % (self.__class__, tag))
if len ( tag ) > 3 and tag . endswith ( ' Tag ' ) :
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag [ : - 3 ]
warnings . warn (
' . %(name)s Tag is deprecated, use .find( " %(name)s " ) instead. If you really were looking for a tag called %(name)s Tag, use .find( " %(name)s Tag " ) ' % dict (
name = tag_name
)
)
return self . find ( tag_name )
# We special case contents to avoid recursion.
elif not tag . startswith ( " __ " ) and not tag == " contents " :
return self . find ( tag )
raise AttributeError (
" ' %s ' object has no attribute ' %s ' " % ( self . __class__ , tag ) )
def __eq__ ( self , other ) :
""" Returns true iff this Tag has the same name, the same attributes,
and the same contents ( recursively ) as ` other ` . """
if self is other :
return True
if ( not hasattr ( other , ' name ' ) or
not hasattr ( other , ' attrs ' ) or
not hasattr ( other , ' contents ' ) or
self . name != other . name or
self . attrs != other . attrs or
len ( self ) != len ( other ) ) :
return False
for i , my_child in enumerate ( self . contents ) :
if my_child != other . contents [ i ] :
return False
return True
def __ne__ ( self , other ) :
""" Returns true iff this Tag is not identical to `other`,
as defined in __eq__ . """
return not self == other
def __repr__(self, encoding=" unicode-escape "):
    """Render this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    if not PY3K:
        # Python 2: __repr__ must return a bytestring, and by
        # convention it should be an ASCII string.
        return self.encode(encoding)
    # Python 3: __repr__ must return a Unicode string.
    return self.decode()

def __unicode__(self):
    """Render this PageElement as a Unicode string."""
    return self.decode()

def __str__(self):
    """Render this PageElement as a generic string.

    :return: Under Python 2, the result of encode() with its default
        arguments; under Python 3, a Unicode string.
    """
    return self.decode() if PY3K else self.encode()

# Under Python 3 all three renderings are the same Unicode one.
if PY3K:
    __str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter=" minimal ",
           errors=" xmlcharrefreplace "):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding. It is also handed to
        decode() as the eventual encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        the string's encode() and should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Render to Unicode first, then encode the Unicode.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter=" minimal "):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The rendered markup. For a hidden tag this is just the
        rendering of its contents.
    """
    # Resolve a formatter name into a Formatter object once, so the
    # lookup isn't repeated in every recursive call.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render each attribute, either as a bare key or as key and
    # quoted value.
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
        ):
            val = val.encode(eventual_encoding)
        text = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + ' = ' + formatter.quoted_attribute_value(text))

    prefix = (self.prefix + " : ") if self.prefix else ' '

    close = ' '
    closing_tag = ' '
    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ' '
    else:
        closing_tag = ' </ %s %s > ' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is None:
        indent_space = ' '
    else:
        indent_space = ' ' * (indent_level - 1)
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        space = ' '
        indent_contents = None

    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        return contents

    pieces = []
    if rendered_attrs:
        attribute_string = ' ' + ' '.join(rendered_attrs)
    else:
        attribute_string = ' '
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    pieces.append(' < %s %s %s %s > ' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append(" \n ")
    pieces.append(contents)
    if pretty_print and contents and contents[-1] != " \n ":
        pieces.append(" \n ")
    if pretty_print and closing_tag:
        pieces.append(space)
    pieces.append(closing_tag)
    if indent_level is not None and closing_tag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append(" \n ")
    return ' '.join(pieces)
def _should_pretty_print ( self , indent_level ) :
""" Should this tag be pretty-printed?
Most of them should , but some ( such as < pre > in HTML
documents ) should not .
"""
return (
indent_level is not None
and (
not self . preserve_whitespace_tags
or self . name not in self . preserve_whitespace_tags
)
)
def prettify(self, encoding=None, formatter=" minimal "):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is
        None, a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding is None) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter=" minimal "):
    """Render the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.
    :return: The rendered children, joined into one string.
    """
    # Resolve a formatter name into a Formatter object once, so the
    # lookup isn't repeated for every child.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    pieces = []
    for child in self:
        text = None
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        elif isinstance(child, Tag):
            pieces.append(child.decode(indent_level, eventual_encoding,
                                       formatter))
        preserve_whitespace = (
            self.preserve_whitespace_tags
            and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if not text:
            continue
        # Indent and terminate the line only when pretty-printing
        # outside of a whitespace-preserving tag.
        reformat = pretty_print and not preserve_whitespace
        if reformat:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if reformat:
            pieces.append(" \n ")
    return ' '.join(pieces)
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter=" minimal "):
    """Render the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.
    :return: A bytestring.
    """
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Delegates to encode_contents(). The indent level is only passed
    on when `prettyPrint` is true; otherwise None is used.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
#Soup methods
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations. This method runs
    find_all() with a limit of 1.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: Passed through to find_all() unchanged.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first match, or None if nothing matched.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find  # BS2
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: Passed through to _find_all() unchanged.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Search every descendant by default, or only the direct
    # children when a non-recursive search was requested.
    candidates = self.descendants
    if not recursive:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all  # BS3
findChildren = find_all  # BS2
#Generator methods
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :yield: A sequence of PageElements.
    """
    # An explicit iterator makes the purpose of the property clear.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Starting at the first child, elements are produced by following
    `next_element` links, stopping once the element after this
    PageElement's last descendant is reached.

    :yield: A sequence of PageElements.
    """
    if not len(self.contents):
        return
    # The first element that no longer belongs to this subtree.
    stop_node = self._last_descendant().next_element
    node = self.contents[0]
    while node is not stop_node:
        yield node
        node = node.next_element
# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    return only the first result.

    :param selector: A CSS selector.
    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.
    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    :return: The first matching Tag, or None if nothing matched.
    :rtype: bs4.element.Tag
    """
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.
    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.
    :param limit: After finding this number of results, stop looking.
        None is passed on to SoupSieve as 0.
    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
        installed.
    """
    if namespaces is None:
        namespaces = self._namespaces
    limit = 0 if limit is None else limit
    if soupsieve is None:
        raise NotImplementedError(
            " Cannot execute CSS selectors because the soupsieve package is not installed. "
        )
    found = soupsieve.select(selector, self, namespaces, limit, **kwargs)
    # Wrap the results in a ResultSet because it's more consistent
    # and because ResultSet.__getattr__ has a helpful error message.
    return ResultSet(None, found)
# Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator: returns the `children` iterator."""
    direct_children = self.children
    return direct_children
def recursiveChildGenerator(self):
    """Deprecated generator: returns the `descendants` iterator."""
    everything_below = self.descendants
    return everything_below
def has_key(self, key):
    """Deprecated method: warn, then delegate to has_attr().

    This was kind of misleading because has_key() (attributes) was
    different from __in__ (contents). has_key() is gone in Python 3,
    anyway.

    :param key: The attribute name to look for.
    :return: Whatever has_attr() returns for `key`.
    """
    message = ' has_key is deprecated. Use has_attr( " %s " ) instead. ' % key
    warnings.warn(message)
    return self.has_attr(key)
# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    # NOTE: the mutable ``attrs={}`` default is kept for interface
    # compatibility. It is safe because __init__ never mutates it: the
    # dictionary is copied before any update.
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
            ``class_`` is accepted as a spelling of 'class' and takes
            precedence over a non-dict `attrs`.
        """
        self.name = self._normalize_search_value(name)

        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the
            # 'class' attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Work on a copy so the caller's dictionary (or the
                # shared default) is never modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Coerce a search value into a form _matches() can work with.

        :param value: A filter value as supplied by the caller.
        :return: `value` itself if it is a string, a callable, a
            regular expression (anything with a ``match`` attribute),
            a boolean or None; decoded text for a bytestring; a list
            of normalized items for any other iterable; ``str(value)``
            for everything else.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, (str, Callable, bool))
                or hasattr(value, 'match')
                or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if hasattr(v, '__iter__') and not isinstance(v, (bytes, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer."""
        if self.text:
            # The text filter may be a regular expression, a list, a
            # callable or True; __str__ must always return a str, so
            # convert explicitly instead of returning the filter itself.
            return str(self.text)
        return " %s | %s " % (self.name, self.attrs)

    # NOTE: the ``markup_attrs={}`` default is only read, never mutated.
    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            existing Tag (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: The attributes as found in some markup:
            either an object with a ``get`` method or an iterable of
            (key, value) pairs.

        :return: The matched object (the Tag, or `markup_name` when no
            Tag was given) if the prospective tag matches; None if it
            does not; False when the tag-name fast path rejects it.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter receives the raw (name, attrs) pair
        # only when we were not handed an actual Tag.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        # Build the attribute lookup lazily, the first
                        # time an attribute filter needs it.
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {k: v for k, v in markup_attrs}
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name

        # A text filter additionally constrains the match's .string.
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raise Exception: If `markup` is of a kind this method does not
            know how to match against.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, str)):
            if (not self.name and not self.attrs
                    and self._matches(markup, self.text)):
                found = markup
        else:
            raise Exception(
                " I don ' t know how to match against a %s " % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Decide whether one piece of markup satisfies one filter value.

        :param markup: A Tag, a string, None, or a list/tuple of values
            (as produced by a multi-valued attribute like 'class').
        :param match_against: The filter: True, a callable, a string, a
            regular expression, None, or an iterable of any of these.
        :param already_tried: Filter items already examined while
            walking a nested iterable; used to avoid infinite
            recursion.
        :return: A truthy value if `markup` matches, a falsy one
            otherwise. For a regular expression this is whatever its
            ``search`` method returns.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                key = item if item.__hash__ else id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.

        # Exact string match
        match = isinstance(match_against, str) and markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name,
                match_against
            )

        return match
class ResultSet(list):
    """A plain list of search results that also remembers the
    SoupStrainer which produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        self.source = source
        super().__init__(result)

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix."""
        # Only reached for attributes that normal lookup could not
        # find, i.e. the caller used the list as if it were an element.
        message = " ResultSet object has no attribute ' %s ' . You ' re probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()? " % key
        raise AttributeError(message)