Hello.

Here is some bolded text", string_containers = { 'b': BString, 'p': PString, } ) # The string before the

tag is a regular NavigableString. assert isinstance(soup.div.contents[0], NavigableString) # The string inside the

tag, but not inside the tag, # is a PString. assert isinstance(soup.p.contents[0], PString) # Every string inside the tag is a BString, even the one that # was also inside an tag. for s in soup.b.strings: assert isinstance(s, BString) # Now that parsing was complete, the string_container_stack # (where this information was kept) has been cleared out. assert [] == soup.string_container_stack class TestOutput(SoupTest): @pytest.mark.parametrize( "eventual_encoding,actual_encoding", [ ("utf-8", "utf-8"), ("utf-16", "utf-16"), ] ) def test_decode_xml_declaration(self, eventual_encoding, actual_encoding): # Most of the time, calling decode() on an XML document will # give you a document declaration that mentions the encoding # you intend to use when encoding the document as a # bytestring. soup = self.soup("") soup.is_xml = True assert (f'\n' == soup.decode(eventual_encoding=eventual_encoding)) @pytest.mark.parametrize( "eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None] ) def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(self, eventual_encoding): # But if you pass a Python internal encoding into decode(), or # omit the eventual_encoding altogether, the document # declaration won't mention any particular encoding. soup = BeautifulSoup("", "html.parser") soup.is_xml = True assert (f'\n' == soup.decode(eventual_encoding=eventual_encoding)) def test(self): # BeautifulSoup subclasses Tag and extends the decode() method. # Make sure the other Tag methods which call decode() call # it correctly. soup = self.soup("") assert b"" == soup.encode(encoding="utf-8") assert b"" == soup.encode_contents(encoding="utf-8") assert "" == soup.decode_contents() assert "\n\n" == soup.prettify() class TestWarnings(SoupTest): # Note that some of the tests in this class create BeautifulSoup # objects directly rather than using self.soup(). That's # because SoupTest.soup is defined in a different file, # which will throw off the assertion in _assert_warning # that the code that triggered the warning is in the same # file as the test. def _assert_warning(self, warnings, cls): for w in warnings: if isinstance(w.message, cls): assert w.filename == __file__ return w raise Exception("%s warning not found in %r" % (cls, warnings)) def _assert_no_parser_specified(self, w): warning = self._assert_warning(w, GuessedAtParserWarning) message = str(warning.message) assert message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60]) def test_warning_if_no_parser_specified(self): with warnings.catch_warnings(record=True) as w: soup = BeautifulSoup("") self._assert_no_parser_specified(w) def test_warning_if_parser_specified_too_vague(self): with warnings.catch_warnings(record=True) as w: soup = BeautifulSoup("", "html") self._assert_no_parser_specified(w) def test_no_warning_if_explicit_parser_specified(self): with warnings.catch_warnings(record=True) as w: soup = self.soup("") assert [] == w def test_parseOnlyThese_renamed_to_parse_only(self): with warnings.catch_warnings(record=True) as w: soup = BeautifulSoup( "", "html.parser", parseOnlyThese=SoupStrainer("b"), ) warning = self._assert_warning(w, DeprecationWarning) msg = str(warning.message) assert "parseOnlyThese" in msg assert "parse_only" in msg assert b"" == soup.encode() def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: utf8 = b"\xc3\xa9" soup = BeautifulSoup( utf8, "html.parser", fromEncoding="utf8" ) warning = self._assert_warning(w, DeprecationWarning) msg = str(warning.message) assert "fromEncoding" in msg assert "from_encoding" in msg assert "utf8" == soup.original_encoding def test_unrecognized_keyword_argument(self): with pytest.raises(TypeError): self.soup("", no_such_argument=True) @pytest.mark.parametrize( "extension", ['markup.html', 'markup.htm', 'markup.HTML', 'markup.txt', 'markup.xhtml', 'markup.xml', "/home/user/file", "c:\\user\file"] ) def test_resembles_filename_warning(self, extension): # A warning is issued if the "markup" looks like the name of # an HTML or text file, or a full path to a file on disk. with warnings.catch_warnings(record=True) as w: soup = BeautifulSoup("markup" + extension, "html.parser") warning = self._assert_warning(w, MarkupResemblesLocatorWarning) assert "looks more like a filename" in str(warning.message) @pytest.mark.parametrize( "extension", ['markuphtml', 'markup.com', '', 'markup.js'] ) def test_resembles_filename_no_warning(self, extension): # The 'looks more like a filename' warning is not issued if # the markup looks like a bare string, a domain name, or a # file that's not an HTML file. with warnings.catch_warnings(record=True) as w: soup = self.soup("markup" + extension) assert [] == w def test_url_warning_with_bytes_url(self): url = b"http://www.crummybytes.com/" with warnings.catch_warnings(record=True) as warning_list: soup = BeautifulSoup(url, "html.parser") warning = self._assert_warning( warning_list, MarkupResemblesLocatorWarning ) assert "looks more like a URL" in str(warning.message) assert url not in str(warning.message).encode("utf8") def test_url_warning_with_unicode_url(self): url = "http://www.crummyunicode.com/" with warnings.catch_warnings(record=True) as warning_list: # note - this url must differ from the bytes one otherwise # python's warnings system swallows the second warning soup = BeautifulSoup(url, "html.parser") warning = self._assert_warning( warning_list, MarkupResemblesLocatorWarning ) assert "looks more like a URL" in str(warning.message) assert url not in str(warning.message) def test_url_warning_with_bytes_and_space(self): # Here the markup contains something besides a URL, so no warning # is issued. with warnings.catch_warnings(record=True) as warning_list: soup = self.soup(b"http://www.crummybytes.com/ is great") assert not any("looks more like a URL" in str(w.message) for w in warning_list) def test_url_warning_with_unicode_and_space(self): with warnings.catch_warnings(record=True) as warning_list: soup = self.soup("http://www.crummyunicode.com/ is great") assert not any("looks more like a URL" in str(w.message) for w in warning_list) class TestSelectiveParsing(SoupTest): def test_parse_with_soupstrainer(self): markup = "NoYes NoYes Yes" strainer = SoupStrainer("b") soup = self.soup(markup, parse_only=strainer) assert soup.encode() == b"YesYes Yes" class TestNewTag(SoupTest): """Test the BeautifulSoup.new_tag() method.""" def test_new_tag(self): soup = self.soup("") new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"}) assert isinstance(new_tag, Tag) assert "foo" == new_tag.name assert dict(bar="baz", name="a name") == new_tag.attrs assert None == new_tag.parent @pytest.mark.skipif( not LXML_PRESENT, reason="lxml not installed, cannot parse XML document" ) def test_xml_tag_inherits_self_closing_rules_from_builder(self): xml_soup = BeautifulSoup("", "xml") xml_br = xml_soup.new_tag("br") xml_p = xml_soup.new_tag("p") # Both the
and

tag are empty-element, just because # they have no contents. assert b"
" == xml_br.encode() assert b"

" == xml_p.encode() def test_tag_inherits_self_closing_rules_from_builder(self): html_soup = BeautifulSoup("", "html.parser") html_br = html_soup.new_tag("br") html_p = html_soup.new_tag("p") # The HTML builder users HTML's rules about which tags are # empty-element tags, and the new tags reflect these rules. assert b"
" == html_br.encode() assert b"

" == html_p.encode() class TestNewString(SoupTest): """Test the BeautifulSoup.new_string() method.""" def test_new_string_creates_navigablestring(self): soup = self.soup("") s = soup.new_string("foo") assert "foo" == s assert isinstance(s, NavigableString) def test_new_string_can_create_navigablestring_subclass(self): soup = self.soup("") s = soup.new_string("foo", Comment) assert "foo" == s assert isinstance(s, Comment) class TestPickle(SoupTest): # Test our ability to pickle the BeautifulSoup object itself. def test_normal_pickle(self): soup = self.soup("some markup") pickled = pickle.dumps(soup) unpickled = pickle.loads(pickled) assert "some markup" == unpickled.a.string def test_pickle_with_no_builder(self): # We had a bug that prevented pickling from working if # the builder wasn't set. soup = self.soup("some markup") soup.builder = None pickled = pickle.dumps(soup) unpickled = pickle.loads(pickled) assert "some markup" == unpickled.string class TestEncodingConversion(SoupTest): # Test Beautiful Soup's ability to decode and encode from various # encodings. def setup_method(self): self.unicode_data = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. assert self.utf8_data == b'Sacr\xc3\xa9 bleu!' def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. dammit.chardet_dammit = noop ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() assert isinstance(unicode_output, str) assert unicode_output == self.document_for(ascii.decode()) assert soup_from_ascii.original_encoding.lower() == "utf-8" finally: logging.disable(logging.NOTSET) dammit.chardet_dammit = chardet def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) assert soup_from_unicode.decode() == self.unicode_data assert soup_from_unicode.foo.string == 'Sacr\xe9 bleu!' assert soup_from_unicode.original_encoding == None def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) assert soup_from_utf8.decode() == self.unicode_data assert soup_from_utf8.foo.string == 'Sacr\xe9 bleu!' def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. soup_from_unicode = self.soup(self.unicode_data) assert soup_from_unicode.encode('utf-8') == self.utf8_data

éé

foo\0bar