# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ from pdb import set_trace import pytest import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( CData, Comment, Declaration, Doctype, Formatter, NavigableString, Script, SoupStrainer, Stylesheet, Tag, TemplateString, ) from . import ( SoupTest, ) class TestFind(SoupTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") assert soup.find("b").string == "2" def test_unicode_text_find(self): soup = self.soup('
Don't leave me here.
Don\'t leave!
""" soup = self.soup(doc) second_para = soup.find(id='2') bold = soup.b # Move the tag to the end of the second paragraph. soup.find(id='2').append(soup.b) # The tag is now a child of the second paragraph. assert bold.parent == second_para assert soup.decode() == self.document_for( 'Don\'t leave me .
\n' 'Don\'t leave!here
' ) def test_replace_with_returns_thing_that_was_replaced(self): text = "And now, a word:
And we're back.
") text = "p2
p3
" to_insert = self.soup(text) soup.insert(1, to_insert) for i in soup.descendants: assert not isinstance(i, BeautifulSoup) p1, p2, p3, p4 = list(soup.children) assert "And now, a word:" == p1.string assert "p2" == p2.string assert "p3" == p3.string assert "And we're back." == p4.string def test_replace_with_maintains_next_element_throughout(self): soup = self.soup('onethree
') a = soup.a b = a.contents[0] # Make it so the tag has two text children. a.insert(1, "two") # Now replace each one with the empty string. left, right = a.contents left.replaceWith('') right.replaceWith('') # The tag is still connected to the tree. assert "three" == soup.b.string def test_replace_final_node(self): soup = self.soup("Argh!") soup.find(string="Argh!").replace_with("Hooray!") new_text = soup.find(string="Hooray!") b = soup.b assert new_text.previous_element == b assert new_text.parent == b assert new_text.previous_element.next_element == new_text assert new_text.next_element == None def test_consecutive_text_nodes(self): # A builder should never create two consecutive text nodes, # but if you insert one next to another, Beautiful Soup will # handle it correctly. soup = self.soup("Argh!There's no business like show business
") no, show = soup.find_all('b') show.replace_with(no) assert soup.decode() == self.document_for( "There's business like no business
" ) assert show.parent == None assert no.parent == soup.p assert no.next_element == "no" assert no.next_sibling == " business" def test_replace_with_errors(self): # Can't replace a tag that's not part of a tree. a_tag = Tag(name="a") with pytest.raises(ValueError): a_tag.replace_with("won't work") # Can't replace a tag with its parent. a_tag = self.soup("").a with pytest.raises(ValueError): a_tag.b.replace_with(a_tag) # Or with a list that includes its parent. with pytest.raises(ValueError): a_tag.b.replace_with("string1", a_tag, "string2") def test_replace_with_multiple(self): data = "Unneeded formatting is unneeded
""") tree.em.unwrap() assert tree.em == None assert tree.p.text == "Unneeded formatting is unneeded" def test_wrap(self): soup = self.soup("I wish I was bold.") value = soup.string.wrap(soup.new_tag("b")) assert value.decode() == "I wish I was bold." assert soup.decode() == self.document_for("I wish I was bold.") def test_wrap_extracts_tag_from_elsewhere(self): soup = self.soup("I wish I was bold.") soup.b.next_sibling.wrap(soup.b) assert soup.decode() == self.document_for("I wish I was bold.") def test_wrap_puts_new_contents_at_the_end(self): soup = self.soup("I like being bold.I wish I was bold.") soup.b.next_sibling.wrap(soup.b) assert 2 == len(soup.b.contents) assert soup.decode() == self.document_for( "I like being bold.I wish I was bold." ) def test_extract(self): soup = self.soup( 'Some content. More content.') assert len(soup.body.contents) == 3 extracted = soup.find(id="nav").extract() assert soup.decode() == "Some content. More content." assert extracted.decode() == ' ' # The extracted tag is now an orphan. assert len(soup.body.contents) == 2 assert extracted.parent == None assert extracted.previous_element == None assert extracted.next_element.next_element == None # The gap where the extracted tag used to be has been mended. content_1 = soup.find(string="Some content. ") content_2 = soup.find(string=" More content.") assert content_1.next_element == content_2 assert content_1.next_sibling == content_2 assert content_2.previous_element == content_1 assert content_2.previous_sibling == content_1 def test_extract_distinguishes_between_identical_strings(self): soup = self.soup("foobar") foo_1 = soup.a.string bar_1 = soup.b.string foo_2 = soup.new_string("foo") bar_2 = soup.new_string("bar") soup.a.append(foo_2) soup.b.append(bar_2) # Now there are two identical strings in the tag, and two # in the tag. Let's remove the first "foo" and the second # "bar". foo_1.extract() bar_2.extract() assert foo_2 == soup.a.string assert bar_2 == soup.b.string def test_extract_multiples_of_same_tag(self): soup = self.soup(""" """) [soup.script.extract() for i in soup.find_all("script")] assert "\n\n\n" == str(soup.body) def test_extract_works_when_element_is_surrounded_by_identical_strings(self): soup = self.soup( '\n' 'hi\n' '') soup.find('body').extract() assert None == soup.find('body') def test_clear(self): """Tag.clear()""" soup = self.soup("String Italicized and another
") # clear using extract() a = soup.a soup.p.clear() assert len(soup.p.contents) == 0 assert hasattr(a, "contents") # clear using decompose() em = a.em a.clear(decompose=True) assert 0 == len(em.contents) def test_decompose(self): # Test PageElement.decompose() and PageElement.decomposed soup = self.soup("Another para
") p1, p2 = soup.find_all('p') a = p1.a text = p1.em.string for i in [p1, p2, a, text]: assert False == i.decomposed # This sets p1 and everything beneath it to decomposed. p1.decompose() for i in [p1, a, text]: assert True == i.decomposed # p2 is unaffected. assert False == p2.decomposed def test_string_set(self): """Tag.string = 'string'""" soup = self.soup("