# bazarr/libs/bs4/tests/test_lxml.py

"""Tests to ensure that the lxml tree builder generates good trees."""

import pickle
import pytest
import re
import warnings
from . import LXML_PRESENT, LXML_VERSION

if LXML_PRESENT:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    )
from bs4.element import Comment, Doctype, SoupStrainer
from . import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SOUP_SIEVE_PRESENT,
    SoupTest,
)

@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its tree builder."
)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    """Tests for ``LXMLTreeBuilder``; see ``HTMLTreeBuilderSmokeTest``.

    The whole class is skipped when lxml is not installed. The tests
    defined here cover behavior specific to the lxml builder.
    """

    @property
    def default_builder(self):
        """Return the tree builder class exercised by this test class."""
        return LXMLTreeBuilder

    def test_out_of_range_entity(self):
        """Out-of-range numeric character references are dropped."""
        # Decimal and hexadecimal references, all far beyond the
        # largest Unicode code point (0x10FFFF).
        for markup in (
            "<p>foo&#10000000000000;bar</p>",
            "<p>foo&#x10000000000000;bar</p>",
            "<p>foo&#1000000000;bar</p>",
        ):
            self.assert_soup(markup, "<p>foobar</p>")

    def test_entities_in_foreign_document_encoding(self):
        """Intentionally empty: this case cannot be checked with lxml.

        NOTE(review): presumably this overrides a same-named test
        inherited from the smoke-test base class -- confirm.
        """
        # We can't implement this case correctly because by the time we
        # hear about markup like "&#147;", it's been (incorrectly) converted into
        # a string like u'\x93'
        pass

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

    @pytest.mark.skipif(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        reason="Skipping doctype test for old version of lxml to avoid segfault."
    )
    def test_empty_doctype(self):
        """An empty ``<!DOCTYPE>`` parses to a first node that strips to ""."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        assert "" == doctype.strip()

    def test_beautifulstonesoup_is_xml_parser(self):
        """The deprecated ``BeautifulStoneSoup`` parses as XML and warns once."""
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            # Record every warning regardless of the filters that are
            # active outside this block (default ignore rules, -W
            # options, or an earlier identical warning); otherwise the
            # list may stay empty and the unpacking below fails for
            # reasons unrelated to the code under test.
            warnings.simplefilter("always")
            soup = BeautifulStoneSoup("<b />")
        assert "<b/>" == str(soup.b)
        # Exactly one warning is expected, and it must be attributed
        # to this file (the caller), not to library internals.
        [warning] = w
        assert warning.filename == __file__
        assert "BeautifulStoneSoup class is deprecated" in str(warning.message)

    def test_tracking_line_numbers(self):
        """``sourceline``/``sourcepos`` attribute access finds same-named tags."""
        # The lxml TreeBuilder cannot keep track of line numbers from
        # the original markup. Even if you ask for line numbers, we
        # don't have 'em.
        #
        # This means that if you have a tag like <sourceline> or
        # <sourcepos>, attribute access will find it rather than
        # giving you a numeric answer.
        soup = self.soup(
            "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
            store_line_numbers=True
        )
        assert "sourceline" == soup.p.sourceline.name
        assert "sourcepos" == soup.p.sourcepos.name
        
@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its XML tree builder."
)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
    """Tests for ``LXMLTreeBuilderForXML``; see ``XMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """Return the tree builder class exercised by this test class."""
        return LXMLTreeBuilderForXML

    def test_namespace_indexing(self):
        """Check the ``_namespaces`` mapping on the soup and on several tags."""
        xml_uri = 'http://www.w3.org/XML/1998/namespace'
        prefixed_uri = 'http://prefixed-namespace.com'
        another_uri = 'http://another-namespace.com'

        markup = (
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
            '<subtag xmlns="http://another-unprefixed-namespace.com">'
            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
            '</prefix2:tag3>'
            '</root>'
        )
        document = self.soup(markup)

        # Document level: every prefixed namespace seen anywhere in the
        # markup is registered (this is the default set handed to
        # soupsieve). Un-prefixed namespaces are left out, and when a
        # prefix is declared more than once only its first declaration
        # is kept.
        document_namespaces = {
            'xml': xml_uri,
            'prefix': prefixed_uri,
            'prefix2': another_uri,
        }
        assert document._namespaces == document_namespaces

        # Tag level: only the prefixes in scope when the tag was parsed.
        #
        # Un-prefixed namespaces are not tracked, since only one (the
        # first) could be held and soupsieve would treat it as the
        # default namespace even when working from a tag that has a
        # different un-prefixed namespace.
        only_xml = {'xml': xml_uri}
        assert document.tag._namespaces == only_xml

        with_prefix = {'prefix': prefixed_uri, 'xml': xml_uri}
        assert document.tag2._namespaces == with_prefix

        with_prefix2 = {'prefix2': another_uri, 'xml': xml_uri}
        assert document.subtag._namespaces == with_prefix2
        assert document.subsubtag._namespaces == with_prefix2

    @pytest.mark.skipif(
        not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
    )
    def test_namespace_interaction_with_select_and_find(self):
        """Show how namespaces affect the select* and find* methods."""
        markup = (
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
            '<prefix:tag3>'
            '</subtag>'
            '</root>'
        )
        document = self.soup(markup)

        # Selectors work in terms of namespace URIs.
        unprefixed_match = document.select_one('tag')
        assert unprefixed_match.name == 'tag'
        prefixed_match = document.select_one('prefix|tag2')
        assert prefixed_match.name == 'tag2'

        # A prefix declared several times is registered on the
        # BeautifulSoup object only for its first declaration, so the
        # second binding is not reachable from the document by default.
        assert document.select_one('prefix|tag3') is None

        # Passing a namespace dictionary explicitly gets around that.
        subtag_namespaces = document.subtag._namespaces
        explicit_match = document.select_one(
            'prefix|tag3', namespaces=subtag_namespaces
        )
        assert explicit_match.name == 'tag3'

        # Selecting from a Tag rather than the BeautifulSoup object
        # uses the default namespaces scoped to that Tag.
        scoped_match = document.subtag.select_one('prefix|tag3')
        assert scoped_match.name == 'tag3'

        # find() is not fully namespace-aware: it only compares prefixes.
        assert document.find('tag').name == 'tag'
        assert document.find('prefix:tag2').name == 'tag2'
        assert document.find('prefix:tag3').name == 'tag3'
        assert document.subtag.find('prefix:tag3').name == 'tag3'

    def test_pickle_restores_builder(self):
        """Unpickling a document gives it a new builder of the same class."""
        # The lxml TreeBuilder cannot be pickled, so a round trip
        # through pickle creates a fresh builder of the appropriate
        # class for the restored document.
        original = self.soup("<a>some markup</a>")
        assert isinstance(original.builder, self.default_builder)

        restored = pickle.loads(pickle.dumps(original))

        assert "some markup" == restored.a.string
        assert restored.builder != original.builder
        assert isinstance(restored.builder, self.default_builder)
WIP 5 years ago			`"""Tests to ensure that the lxml tree builder generates good trees."""`

Updated vendored dependencies. 2 years ago			`import pickle`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`import pytest`
WIP 5 years ago			`import re`
			`import warnings`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`from . import LXML_PRESENT, LXML_VERSION`
WIP 5 years ago
			`if LXML_PRESENT:`
			`from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML`

			`from bs4 import (`
			`BeautifulSoup,`
			`BeautifulStoneSoup,`
			`)`
			`from bs4.element import Comment, Doctype, SoupStrainer`
Updated vendored dependencies. 2 years ago			`from . import (`
WIP 5 years ago			`HTMLTreeBuilderSmokeTest,`
			`XMLTreeBuilderSmokeTest,`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`SOUP_SIEVE_PRESENT,`
WIP 5 years ago			`SoupTest,`
			`)`

Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`@pytest.mark.skipif(`
WIP 5 years ago			`not LXML_PRESENT,`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`reason="lxml seems not to be present, not testing its tree builder."`
			`)`
Updated vendored dependencies. 2 years ago			`class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):`
WIP 5 years ago			"""See ``HTMLTreeBuilderSmokeTest``."""

			`@property`
			`def default_builder(self):`
			`return LXMLTreeBuilder`

			`def test_out_of_range_entity(self):`
Updated vendored dependencies. 2 years ago			`self.assert_soup(`
WIP 5 years ago			`"<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")`
Updated vendored dependencies. 2 years ago			`self.assert_soup(`
WIP 5 years ago			`"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")`
Updated vendored dependencies. 2 years ago			`self.assert_soup(`
WIP 5 years ago			`"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago
WIP 5 years ago			`def test_entities_in_foreign_document_encoding(self):`
			`# We can't implement this case correctly because by the time we`
			`# hear about markup like "&#147;", it's been (incorrectly) converted into`
			`# a string like u'\x93'`
			`pass`

			`# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this`
			`# test if an old version of lxml is installed.`

Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`@pytest.mark.skipif(`
WIP 5 years ago			`not LXML_PRESENT or LXML_VERSION < (2,3,5,0),`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`reason="Skipping doctype test for old version of lxml to avoid segfault."`
			`)`
WIP 5 years ago			`def test_empty_doctype(self):`
			`soup = self.soup("<!DOCTYPE>")`
			`doctype = soup.contents[0]`
Updated vendored dependencies. 2 years ago			`assert "" == doctype.strip()`
WIP 5 years ago
			`def test_beautifulstonesoup_is_xml_parser(self):`
			`# Make sure that the deprecated BSS class uses an xml builder`
			`# if one is installed.`
			`with warnings.catch_warnings(record=True) as w:`
			`soup = BeautifulStoneSoup("<b />")`
Updated vendored dependencies. 2 years ago			`assert "<b/>" == str(soup.b)`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`[warning] = w`
			`assert warning.filename == __file__`
			`assert "BeautifulStoneSoup class is deprecated" in str(warning.message)`
WIP 5 years ago
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`def test_tracking_line_numbers(self):`
			`# The lxml TreeBuilder cannot keep track of line numbers from`
			`# the original markup. Even if you ask for line numbers, we`
			`# don't have 'em.`
			`#`
			`# This means that if you have a tag like <sourceline> or`
			`# <sourcepos>, attribute access will find it rather than`
			`# giving you a numeric answer.`
			`soup = self.soup(`
			`"\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",`
			`store_line_numbers=True`
			`)`
Updated vendored dependencies. 2 years ago			`assert "sourceline" == soup.p.sourceline.name`
			`assert "sourcepos" == soup.p.sourcepos.name`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`@pytest.mark.skipif(`
WIP 5 years ago			`not LXML_PRESENT,`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`reason="lxml seems not to be present, not testing its XML tree builder."`
			`)`
Updated vendored dependencies. 2 years ago			`class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):`
WIP 5 years ago			"""See ``HTMLTreeBuilderSmokeTest``."""

			`@property`
			`def default_builder(self):`
			`return LXMLTreeBuilderForXML`

			`def test_namespace_indexing(self):`
			`soup = self.soup(`
			`'<?xml version="1.1"?>\n'`
			`'<root>'`
			`'<tag xmlns="http://unprefixed-namespace.com">content</tag>'`
Updated vendored dependencies. 2 years ago			`'<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'`
			`'<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'`
			`'<subtag xmlns="http://another-unprefixed-namespace.com">'`
			`'<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'`
			`'</prefix2:tag3>'`
WIP 5 years ago			`'</root>'`
			`)`
Updated vendored dependencies. 2 years ago
			`# The BeautifulSoup object includes every namespace prefix`
			`# defined in the entire document. This is the default set of`
			`# namespaces used by soupsieve.`
			`#`
			`# Un-prefixed namespaces are not included, and if a given`
			`# prefix is defined twice, only the first prefix encountered`
			`# in the document shows up here.`
			`assert soup._namespaces == {`
			`'xml': 'http://www.w3.org/XML/1998/namespace',`
			`'prefix': 'http://prefixed-namespace.com',`
			`'prefix2': 'http://another-namespace.com'`
			`}`

			`# A Tag object includes only the namespace prefixes`
			`# that were in scope when it was parsed.`

			`# We do not track un-prefixed namespaces as we can only hold`
			`# one (the first one), and it will be recognized as the`
			`# default namespace by soupsieve, even when operating from a`
			`# tag with a different un-prefixed namespace.`
			`assert soup.tag._namespaces == {`
			`'xml': 'http://www.w3.org/XML/1998/namespace',`
			`}`

			`assert soup.tag2._namespaces == {`
			`'prefix': 'http://prefixed-namespace.com',`
			`'xml': 'http://www.w3.org/XML/1998/namespace',`
			`}`

			`assert soup.subtag._namespaces == {`
			`'prefix2': 'http://another-namespace.com',`
			`'xml': 'http://www.w3.org/XML/1998/namespace',`
			`}`

			`assert soup.subsubtag._namespaces == {`
			`'prefix2': 'http://another-namespace.com',`
			`'xml': 'http://www.w3.org/XML/1998/namespace',`
			`}`


Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`@pytest.mark.skipif(`
			`not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"`
			`)`
Updated vendored dependencies. 2 years ago			`def test_namespace_interaction_with_select_and_find(self):`
			`# Demonstrate how namespaces interact with select* and`
			`# find* methods.`

			`soup = self.soup(`
			`'<?xml version="1.1"?>\n'`
			`'<root>'`
			`'<tag xmlns="http://unprefixed-namespace.com">content</tag>'`
			`'<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'`
			`'<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'`
			`'<prefix:tag3>'`
			`'</subtag>'`
			`'</root>'`
WIP 5 years ago			`)`
Updated vendored dependencies. 2 years ago
			`# soupselect uses namespace URIs.`
			`assert soup.select_one('tag').name == 'tag'`
			`assert soup.select_one('prefix\|tag2').name == 'tag2'`

			`# If a prefix is declared more than once, only the first usage`
			`# is registered with the BeautifulSoup object.`
			`assert soup.select_one('prefix\|tag3') is None`

			`# But you can always explicitly specify a namespace dictionary.`
			`assert soup.select_one(`
			`'prefix\|tag3', namespaces=soup.subtag._namespaces`
			`).name == 'tag3'`

			`# And a Tag (as opposed to the BeautifulSoup object) will`
			`# have a set of default namespaces scoped to that Tag.`
			`assert soup.subtag.select_one('prefix\|tag3').name=='tag3'`

			`# the find() methods aren't fully namespace-aware; they just`
			`# look at prefixes.`
			`assert soup.find('tag').name == 'tag'`
			`assert soup.find('prefix:tag2').name == 'tag2'`
			`assert soup.find('prefix:tag3').name == 'tag3'`
			`assert soup.subtag.find('prefix:tag3').name == 'tag3'`

Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`def test_pickle_restores_builder(self):`
			`# The lxml TreeBuilder is not picklable, so when unpickling`
			`# a document created with it, a new TreeBuilder of the`
			`# appropriate class is created.`
Updated vendored dependencies. 2 years ago			`soup = self.soup("<a>some markup</a>")`
			`assert isinstance(soup.builder, self.default_builder)`
			`pickled = pickle.dumps(soup)`
			`unpickled = pickle.loads(pickled)`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago
Updated vendored dependencies. 2 years ago			`assert "some markup" == unpickled.a.string`
Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`assert unpickled.builder != soup.builder`
			`assert isinstance(unpickled.builder, self.default_builder)`