diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py index ea79bcb58..7d5a53b73 100644 --- a/bazarr/get_subtitle.py +++ b/bazarr/get_subtitle.py @@ -15,7 +15,7 @@ import subliminal import subliminal_patch from datetime import datetime, timedelta from subzero.language import Language -from subzero.video import parse_video +from subzero.video import parse_video, refine_video from subliminal import region, score as subliminal_scores, \ list_subtitles from subliminal_patch.core import SZAsyncProviderPool, download_best_subtitles, save_subtitles, download_subtitles @@ -63,6 +63,7 @@ def get_video(path, title, sceneName, use_scenename, providers=None, media_type= video.used_scene_name = dont_use_actual_file video.original_name = original_name video.original_path = original_path + refine_video(video) return video except: diff --git a/libs/lxml/ElementInclude.py b/libs/lxml/ElementInclude.py new file mode 100644 index 000000000..8badf8b44 --- /dev/null +++ b/libs/lxml/ElementInclude.py @@ -0,0 +1,219 @@ +# +# ElementTree +# $Id: ElementInclude.py 1862 2004-06-18 07:31:02Z Fredrik $ +# +# limited xinclude support for element trees +# +# history: +# 2003-08-15 fl created +# 2003-11-14 fl fixed default loader +# +# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved. 
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

"""
Limited XInclude support for the ElementTree package.

While lxml.etree has full support for XInclude (see
`etree.ElementTree.xinclude()`), this module provides a simpler, pure
Python, ElementTree compatible implementation that supports a simple
form of custom URL resolvers.
"""

from lxml import etree
try:
    from urlparse import urljoin
    from urllib2 import urlopen
except ImportError:
    # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen

XINCLUDE = "{http://www.w3.org/2001/XInclude}"

XINCLUDE_INCLUDE = XINCLUDE + "include"
XINCLUDE_FALLBACK = XINCLUDE + "fallback"
XINCLUDE_ITER_TAG = XINCLUDE + "*"

##
# Fatal include error.

class FatalIncludeError(etree.LxmlSyntaxError):
    """Raised when an XInclude directive cannot be processed."""
    pass

##
# ET compatible default loader.
# This loader reads an included resource from disk.
#
# @param href Resource reference.
# @param parse Parse mode. Either "xml" or "text".
# @param encoding Optional text encoding.
# @return The expanded resource. If the parse mode is "xml", this
# is an ElementTree instance. If the parse mode is "text", this
# is a Unicode string. If the loader fails, it can return None
# or raise an IOError exception.
# @throws IOError If the loader fails to load the resource.

def default_loader(href, parse, encoding=None):
    """Load the resource *href* from the local file system.

    :param href: path of the file to open (opened in binary mode).
    :param parse: ``"xml"`` to parse the file and return its root
        element; any other value reads the file as text.
    :param encoding: codec used to decode text content; a false value
        means ``'utf-8'``.
    :return: the root element (``parse == "xml"``) or the decoded text.
    :raises IOError: if the file cannot be opened.
    """
    # The ``with`` block guarantees the handle is released even when
    # parsing or reading raises.
    with open(href, 'rb') as source:
        if parse == "xml":
            return etree.parse(source).getroot()
        data = source.read()
    return data.decode(encoding or 'utf-8')

##
# Default loader used by lxml.etree - handles custom resolvers properly
#

def _lxml_default_loader(href, parse, encoding=None, parser=None):
    """Load *href* for ``_include`` when no custom loader was supplied.

    For ``parse == "xml"`` the reference is handed to ``etree.parse``
    together with *parser* and the root element is returned.  Otherwise
    the raw bytes are fetched (through ``urlopen`` when *href* contains
    ``"://"``, from disk if not) and decoded with *encoding*, falling
    back to ``'utf-8'``.
    """
    if parse == "xml":
        return etree.parse(href, parser).getroot()

    if "://" in href:
        f = urlopen(href)
    else:
        f = open(href, 'rb')
    # try/finally rather than ``with``: the Python 2 ``urlopen`` result
    # is not a context manager.
    try:
        data = f.read()
    finally:
        f.close()
    return data.decode(encoding or 'utf-8')

##
# Wrapper for ET compatibility - drops the parser

def _wrap_et_loader(loader):
    """Adapt a three-argument ElementTree-style *loader*.

    The returned callable accepts the extra ``parser`` keyword that
    ``_include`` passes and silently discards it.
    """
    def load(href, parse, encoding=None, parser=None):
        return loader(href, parse, encoding)
    return load


##
# Expand XInclude directives.
#
# @param elem Root element.
# @param loader Optional resource loader.
#     If omitted, it defaults
#     to {@link default_loader}. If given, it should be a callable
#     that implements the same interface as default_loader.
# @param base_url The base URL of the original file, to resolve
#     relative include file references.
# @throws FatalIncludeError If the function fails to include a given
#     resource, or if the tree contains malformed XInclude elements.
# @throws IOError If the function fails to load a given resource.
# @returns the node or its replacement if it was an XInclude node

def include(elem, loader=None, base_url=None):
    """Expand the XInclude directives found below *elem*, in place.

    :param elem: an element, or a tree object exposing ``getroot()``.
    :param loader: optional ``loader(href, parse, encoding)`` callable;
        when omitted the module's lxml-based loader is used.
    :param base_url: URL against which ``href`` values are resolved.
        When ``None`` it is taken from the tree's ``docinfo.URL`` if the
        tree has a ``docinfo`` attribute.
    :raises FatalIncludeError: on recursive, unloadable or malformed
        include directives.

    NOTE(review): the legacy comment above advertises a return value,
    but this function does not return the result of ``_include``.
    """
    if base_url is None:
        if hasattr(elem, 'getroot'):
            tree = elem
            elem = elem.getroot()
        else:
            tree = elem.getroottree()
        if hasattr(tree, 'docinfo'):
            base_url = tree.docinfo.URL
    elif hasattr(elem, 'getroot'):
        elem = elem.getroot()
    _include(elem, loader, base_url=base_url)


def _include(elem, loader=None, _parent_hrefs=None, base_url=None):
    """Recursive worker behind ``include``.

    :param elem: element whose subtree is scanned for XInclude tags.
    :param loader: optional ElementTree-style loader (wrapped so that
        the ``parser`` keyword is dropped).
    :param _parent_hrefs: hrefs of the documents currently being
        expanded on the path from the top-level document to *elem*;
        used to detect include cycles.
    :param base_url: URL used to resolve each ``href`` attribute.
    :return: *elem*, or the loaded replacement (node or text) when the
        include element that was processed has no parent.
    :raises FatalIncludeError: on recursion, on a loader returning
        ``None``, on an unknown ``parse`` value, on a misplaced
        ``xi:fallback`` or on any other tag in the XInclude namespace.
    """
    if loader is not None:
        load_include = _wrap_et_loader(loader)
    else:
        load_include = _lxml_default_loader

    if _parent_hrefs is None:
        _parent_hrefs = set()

    parser = elem.getroottree().parser

    # Materialise the matches first: the loop below mutates the tree.
    include_elements = list(
        elem.iter(XINCLUDE_ITER_TAG))

    for e in include_elements:
        if e.tag == XINCLUDE_INCLUDE:
            # process xinclude directive
            href = urljoin(base_url, e.get("href"))
            parse = e.get("parse", "xml")
            parent = e.getparent()
            if parse == "xml":
                if href in _parent_hrefs:
                    raise FatalIncludeError(
                        "recursive include of %r detected" % href
                        )
                node = load_include(href, parse, parser=parser)
                if node is None:
                    raise FatalIncludeError(
                        "cannot load %r as %r" % (href, parse)
                        )
                # Give the nested call its own ancestor set so that two
                # sibling includes of the same document are not reported
                # as recursion, and resolve the included document's own
                # relative hrefs against the URL it was loaded from.
                node = _include(node, loader, _parent_hrefs | {href},
                                base_url=href)
                if e.tail:
                    node.tail = (node.tail or "") + e.tail
                if parent is None:
                    return node  # replaced the root node!
                parent.replace(e, node)
            elif parse == "text":
                text = load_include(href, parse, encoding=e.get("encoding"))
                if text is None:
                    raise FatalIncludeError(
                        "cannot load %r as %r" % (href, parse)
                        )
                predecessor = e.getprevious()
                if predecessor is not None:
                    predecessor.tail = (predecessor.tail or "") + text
                elif parent is None:
                    return text  # replaced the root node!
                else:
                    parent.text = (parent.text or "") + text + (e.tail or "")
                parent.remove(e)
            else:
                raise FatalIncludeError(
                    "unknown parse type in xi:include tag (%r)" % parse
                )
        elif e.tag == XINCLUDE_FALLBACK:
            parent = e.getparent()
            if parent is not None and parent.tag != XINCLUDE_INCLUDE:
                raise FatalIncludeError(
                    "xi:fallback tag must be child of xi:include (%r)" % e.tag
                    )
        else:
            raise FatalIncludeError(
                "Invalid element found in XInclude namespace (%r)" % e.tag
                )
    return elem

# diff --git a/libs/lxml/__init__.py b/libs/lxml/__init__.py
# new file mode 100644
# index 000000000..07cbe3a26
# --- /dev/null
# +++ b/libs/lxml/__init__.py
# @@ -0,0 +1,20 @@
# this is a package

def get_include():
    """
    Returns a list of header include paths (for lxml itself, libxml2
    and libxslt) needed to compile C code against lxml if it was built
    with statically linked libraries.

    The list starts with the package's ``includes`` directory and the
    package directory itself, followed by every sub-directory of
    ``includes``.
    """
    import os
    lxml_path = __path__[0]
    include_path = os.path.join(lxml_path, 'includes')
    includes = [include_path, lxml_path]

    for name in os.listdir(include_path):
        path = os.path.join(include_path, name)
        if os.path.isdir(path):
            includes.append(path)

    return includes

# diff --git a/libs/lxml/_elementpath.py b/libs/lxml/_elementpath.py
# new file mode 100644
# index 000000000..5462df6cb
# --- /dev/null
# +++ b/libs/lxml/_elementpath.py
# @@ -0,0 +1,337 @@
# cython: language_level=2

#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the ElementTree does this for
# you, if needed.
##

from __future__ import absolute_import

import re

# Group 1 captures operators and quoted strings, group 2 captures names
# (optionally in "{uri}local" form); runs of whitespace match with both
# groups empty.
xpath_tokenizer_re = re.compile(
    "("
    "'[^']*'|\"[^\"]*\"|"
    "::|"
    "//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )


def xpath_tokenizer(pattern, namespaces=None):
    """Yield ``(operator, name)`` token pairs for the path *pattern*.

    Names written as ``prefix:local`` are expanded to ``{uri}local``
    using *namespaces*.  Unprefixed names receive the default namespace
    (the mapping's ``None`` key) unless the name directly follows an
    ``@`` token: attribute names without a prefix are in no namespace,
    so qualifying them would make ``[@attr]`` predicates never match.

    :raises SyntaxError: if a prefix is missing from *namespaces*.
    """
    default_namespace = namespaces.get(None) if namespaces else None
    parsing_attribute = False
    for token in xpath_tokenizer_re.findall(pattern):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, uri = tag.split(":", 1)
                try:
                    if not namespaces:
                        raise KeyError
                    yield ttype, "{%s}%s" % (namespaces[prefix], uri)
                except KeyError:
                    raise SyntaxError("prefix %r not found in prefix map" % prefix)
            elif default_namespace and not parsing_attribute:
                yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            # Remember that the next name token is an attribute name.
            parsing_attribute = ttype == '@'


def prepare_child(next, token):
    """Build a selector yielding the children whose tag is ``token[1]``."""
    tag = token[1]
    def select(result):
        for elem in result:
            for e in elem.iterchildren(tag):
                yield e
    return select


def prepare_star(next, token):
    """Build a selector yielding every child matched by ``'*'``."""
    def select(result):
        for elem in result:
            for e in elem.iterchildren('*'):
                yield e
    return select


def prepare_self(next, token):
    """Build a selector for ``.`` that passes its input through."""
    def select(result):
        return result
    return select


def prepare_descendant(next, token):
    """Build a selector for ``//name`` or ``//*``.

    Consumes the following token to learn which tag to search for.

    :raises SyntaxError: if ``//`` is followed by anything but a name
        or ``*``.
    """
    token = next()
    if token[0] == "*":
        tag = "*"
    elif not token[0]:
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")
    def select(result):
        for elem in result:
            for e in elem.iterdescendants(tag):
                yield e
    return select


def prepare_parent(next, token):
    """Build a selector for ``..`` yielding each element's parent, if any."""
    def select(result):
        for elem in result:
            parent = elem.getparent()
            if parent is not None:
                yield parent
    return select


def prepare_predicate(next, token):
    """Build a selector for a ``[...]`` predicate.

    Tokens are consumed up to the closing ``]`` and condensed into a
    *signature* string (operator characters, ``-`` for a name, ``'``
    for a quoted string) that identifies the predicate form.

    :raises SyntaxError: for unsupported or malformed predicates.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    signature = ''
    predicate = []
    while True:
        token = next()
        if token[0] == "]":
            break
        if token == ('', ''):
            # ignore whitespace
            continue
        if token[0] and token[0][:1] in "'\"":
            # strip the quotes and mark the token as a string literal
            token = "'", token[0][1:-1]
        signature += token[0] or "-"
        predicate.append(token[1])

    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        def select(result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        def select(result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not re.match(r"-?\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        def select(result):
            for elem in result:
                for _ in elem.iterchildren(tag):
                    yield elem
                    break
        return select
    if signature == ".='" or (signature == "-='" and not re.match(r"-?\d+$", predicate[0])):
        # [.='value'] or [tag='value']
        tag = predicate[0]
        value = predicate[-1]
        if tag:
            def select(result):
                for elem in result:
                    for e in elem.iterchildren(tag):
                        if "".join(e.itertext()) == value:
                            yield elem
                            break
        else:
            def select(result):
                for elem in result:
                    if "".join(elem.itertext()) == value:
                        yield elem
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]
            index = int(predicate[0]) - 1
            if index < 0:
                if index == -1:
                    raise SyntaxError(
                        "indices in path predicates are 1-based, not 0-based")
                else:
                    raise SyntaxError("path index >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
            else:
                index = -1
        def select(result):
            for elem in result:
                parent = elem.getparent()
                if parent is None:
                    continue
                try:
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.iterchildren(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except IndexError:
                    pass
        return select
    raise SyntaxError("invalid predicate")


# Dispatch table: token operator -> selector factory.
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
}


# --------------------------------------------------------------------

_cache = {}


def _build_path_iterator(path, namespaces):
    """compile selector pattern

    Returns the list of selector callables for *path*, memoised in
    ``_cache`` under a key built from the path and the namespace map.

    :raises ValueError: if *namespaces* uses ``''`` as a prefix.
    :raises SyntaxError: for absolute, empty or truncated paths.
    """
    if path[-1:] == "/":
        path += "*"  # implicit all (FIXME: keep this?)

    cache_key = (path,)
    if namespaces:
        if '' in namespaces:
            raise ValueError("empty namespace prefix must be passed as None, not the empty string")
        if None in namespaces:
            # None does not sort against strings on Python 3, so the
            # default namespace is placed first and the rest sorted.
            cache_key += (namespaces[None],) + tuple(sorted(
                item for item in namespaces.items() if item[0] is not None))
        else:
            cache_key += tuple(sorted(namespaces.items()))

    try:
        return _cache[cache_key]
    except KeyError:
        pass
    if len(_cache) > 100:
        _cache.clear()

    if path[:1] == "/":
        raise SyntaxError("cannot use absolute path on element")
    stream = iter(xpath_tokenizer(path, namespaces))
    try:
        _next = stream.next
    except AttributeError:
        # Python 3
        _next = stream.__next__
    try:
        token = _next()
    except StopIteration:
        raise SyntaxError("empty path expression")
    selector = []
    while True:
        try:
            selector.append(ops[token[0]](_next, token))
        except StopIteration:
            # a factory ran out of tokens while reading its operand
            raise SyntaxError("invalid path")
        try:
            token = _next()
            if token[0] == "/":
                token = _next()
        except StopIteration:
            break
    _cache[cache_key] = selector
    return selector


##
# Iterate over the matching nodes

def iterfind(elem, path, namespaces=None):
    """Return an iterator over everything *path* selects from *elem*."""
    selector = _build_path_iterator(path, namespaces)
    result = iter((elem,))
    for select in selector:
        result = select(result)
    return result


##
# Find first matching object.

def find(elem, path, namespaces=None):
    """Return the first match of *path* below *elem*, or ``None``."""
    return next(iterfind(elem, path, namespaces), None)


##
# Find all matching objects.

def findall(elem, path, namespaces=None):
    """Return a list of all matches of *path* below *elem*."""
    return list(iterfind(elem, path, namespaces))


##
# Find text for first matching object.
def findtext(elem, path, default=None, namespaces=None):
    """Return the text of the first match of *path* below *elem*.

    *default* is returned when nothing matches; a match whose ``text``
    is empty or ``None`` yields ``''``.
    """
    match = find(elem, path, namespaces)
    return default if match is None else (match.text or '')

# diff --git a/libs/lxml/_elementpath.pyd b/libs/lxml/_elementpath.pyd
# new file mode 100644
# index 000000000..fdd9e6b25
# Binary files /dev/null and b/libs/lxml/_elementpath.pyd differ
# diff --git a/libs/lxml/builder.py b/libs/lxml/builder.py
# new file mode 100644
# index 000000000..a28884567
# --- /dev/null
# +++ b/libs/lxml/builder.py
# @@ -0,0 +1,239 @@
# cython: language_level=2

#
# Element generator factory by Fredrik Lundh.
#
# Source:
# http://online.effbot.org/2006_11_01_archive.htm#et-builder
# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. +# -------------------------------------------------------------------- + +""" +The ``E`` Element factory for generating XML documents. +""" + +from __future__ import absolute_import + +import lxml.etree as ET + +from functools import partial + +try: + basestring +except NameError: + basestring = str + +try: + unicode +except NameError: + unicode = str + + +class ElementMaker(object): + """Element generator factory. + + Unlike the ordinary Element factory, the E factory allows you to pass in + more than just a tag and some optional attributes; you can also pass in + text and other elements. The text is added as either text or tail + attributes, and elements are inserted at the right spot. 
Some small + examples:: + + >>> from lxml import etree as ET + >>> from lxml.builder import E + + >>> ET.tostring(E("tag")) + '' + >>> ET.tostring(E("tag", "text")) + 'text' + >>> ET.tostring(E("tag", "text", key="value")) + 'text' + >>> ET.tostring(E("tag", E("subtag", "text"), "tail")) + 'texttail' + + For simple tags, the factory also allows you to write ``E.tag(...)`` instead + of ``E('tag', ...)``:: + + >>> ET.tostring(E.tag()) + '' + >>> ET.tostring(E.tag("text")) + 'text' + >>> ET.tostring(E.tag(E.subtag("text"), "tail")) + 'texttail' + + Here's a somewhat larger example; this shows how to generate HTML + documents, using a mix of prepared factory functions for inline elements, + nested ``E.tag`` calls, and embedded XHTML fragments:: + + # some common inline elements + A = E.a + I = E.i + B = E.b + + def CLASS(v): + # helper function, 'class' is a reserved word + return {'class': v} + + page = ( + E.html( + E.head( + E.title("This is a sample document") + ), + E.body( + E.h1("Hello!", CLASS("title")), + E.p("This is a paragraph with ", B("bold"), " text in it!"), + E.p("This is another paragraph, with a ", + A("link", href="http://www.python.org"), "."), + E.p("Here are some reserved characters: ."), + ET.XML("

And finally, here is an embedded XHTML fragment.

"), + ) + ) + ) + + print ET.tostring(page) + + Here's a prettyprinted version of the output from the above script:: + + + + This is a sample document + + +

Hello!

+

This is a paragraph with bold text in it!

+

This is another paragraph, with link.

+

Here are some reserved characters: <spam&egg>.

+

And finally, here is an embedded XHTML fragment.

+ + + + For namespace support, you can pass a namespace map (``nsmap``) + and/or a specific target ``namespace`` to the ElementMaker class:: + + >>> E = ElementMaker(namespace="http://my.ns/") + >>> print(ET.tostring( E.test )) + + + >>> E = ElementMaker(namespace="http://my.ns/", nsmap={'p':'http://my.ns/'}) + >>> print(ET.tostring( E.test )) + + """ + + def __init__(self, typemap=None, + namespace=None, nsmap=None, makeelement=None): + if namespace is not None: + self._namespace = '{' + namespace + '}' + else: + self._namespace = None + + if nsmap: + self._nsmap = dict(nsmap) + else: + self._nsmap = None + + if makeelement is not None: + assert callable(makeelement) + self._makeelement = makeelement + else: + self._makeelement = ET.Element + + # initialize type map for this element factory + + if typemap: + typemap = dict(typemap) + else: + typemap = {} + + def add_text(elem, item): + try: + elem[-1].tail = (elem[-1].tail or "") + item + except IndexError: + elem.text = (elem.text or "") + item + + def add_cdata(elem, cdata): + if elem.text: + raise ValueError("Can't add a CDATA section. 
Element already has some text: %r" % elem.text) + elem.text = cdata + + if str not in typemap: + typemap[str] = add_text + if unicode not in typemap: + typemap[unicode] = add_text + if ET.CDATA not in typemap: + typemap[ET.CDATA] = add_cdata + + def add_dict(elem, item): + attrib = elem.attrib + for k, v in item.items(): + if isinstance(v, basestring): + attrib[k] = v + else: + attrib[k] = typemap[type(v)](None, v) + if dict not in typemap: + typemap[dict] = add_dict + + self._typemap = typemap + + def __call__(self, tag, *children, **attrib): + typemap = self._typemap + + if self._namespace is not None and tag[0] != '{': + tag = self._namespace + tag + elem = self._makeelement(tag, nsmap=self._nsmap) + if attrib: + typemap[dict](elem, attrib) + + for item in children: + if callable(item): + item = item() + t = typemap.get(type(item)) + if t is None: + if ET.iselement(item): + elem.append(item) + continue + for basetype in type(item).__mro__: + # See if the typemap knows of any of this type's bases. + t = typemap.get(basetype) + if t is not None: + break + else: + raise TypeError("bad argument type: %s(%r)" % + (type(item).__name__, item)) + v = t(elem, item) + if v: + typemap.get(type(v))(elem, v) + + return elem + + def __getattr__(self, tag): + return partial(self, tag) + + +# create factory object +E = ElementMaker() diff --git a/libs/lxml/builder.pyd b/libs/lxml/builder.pyd new file mode 100644 index 000000000..b20e66481 Binary files /dev/null and b/libs/lxml/builder.pyd differ diff --git a/libs/lxml/cssselect.py b/libs/lxml/cssselect.py new file mode 100644 index 000000000..586a1427c --- /dev/null +++ b/libs/lxml/cssselect.py @@ -0,0 +1,102 @@ +"""CSS Selectors based on XPath. + +This module supports selecting XML/HTML tags based on CSS selectors. +See the `CSSSelector` class for details. + +This is a thin wrapper around cssselect 0.7 or later. +""" + +from __future__ import absolute_import + +from . 
import etree +try: + import cssselect as external_cssselect +except ImportError: + raise ImportError( + 'cssselect does not seem to be installed. ' + 'See http://packages.python.org/cssselect/') + + +SelectorSyntaxError = external_cssselect.SelectorSyntaxError +ExpressionError = external_cssselect.ExpressionError +SelectorError = external_cssselect.SelectorError + + +__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError', + 'CSSSelector'] + + +class LxmlTranslator(external_cssselect.GenericTranslator): + """ + A custom CSS selector to XPath translator with lxml-specific extensions. + """ + def xpath_contains_function(self, xpath, function): + # Defined there, removed in later drafts: + # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors + if function.argument_types() not in (['STRING'], ['IDENT']): + raise ExpressionError( + "Expected a single string or ident for :contains(), got %r" + % function.arguments) + value = function.arguments[0].value + return xpath.add_condition( + 'contains(__lxml_internal_css:lower-case(string(.)), %s)' + % self.xpath_literal(value.lower())) + + +class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator): + """ + lxml extensions + HTML support. + """ + + +def _make_lower_case(context, s): + return s.lower() + +ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') +ns.prefix = '__lxml_internal_css' +ns['lower-case'] = _make_lower_case + + +class CSSSelector(etree.XPath): + """A CSS selector. + + Usage:: + + >>> from lxml import etree, cssselect + >>> select = cssselect.CSSSelector("a tag > child") + + >>> root = etree.XML("TEXT") + >>> [ el.tag for el in select(root) ] + ['child'] + + To use CSS namespaces, you need to pass a prefix-to-namespace + mapping as ``namespaces`` keyword argument:: + + >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + >>> select_ns = cssselect.CSSSelector('root > rdf|Description', + ... 
namespaces={'rdf': rdfns}) + + >>> rdf = etree.XML(( + ... '' + ... 'blah' + ... '') % rdfns) + >>> [(el.tag, el.text) for el in select_ns(rdf)] + [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] + + """ + def __init__(self, css, namespaces=None, translator='xml'): + if translator == 'xml': + translator = LxmlTranslator() + elif translator == 'html': + translator = LxmlHTMLTranslator() + elif translator == 'xhtml': + translator = LxmlHTMLTranslator(xhtml=True) + path = translator.css_to_xpath(css) + etree.XPath.__init__(self, path, namespaces=namespaces) + self.css = css + + def __repr__(self): + return '<%s %s for %r>' % ( + self.__class__.__name__, + hex(abs(id(self)))[2:], + self.css) diff --git a/libs/lxml/doctestcompare.py b/libs/lxml/doctestcompare.py new file mode 100644 index 000000000..1b0daa49a --- /dev/null +++ b/libs/lxml/doctestcompare.py @@ -0,0 +1,507 @@ +""" +lxml-based doctest output comparison. + +Note: normally, you should just import the `lxml.usedoctest` and +`lxml.html.usedoctest` modules from within a doctest, instead of this +one:: + + >>> import lxml.usedoctest # for XML output + + >>> import lxml.html.usedoctest # for HTML output + +To use this module directly, you must call ``lxmldoctest.install()``, +which will cause doctest to use this in all subsequent calls. + +This changes the way output is checked and comparisons are made for +XML or HTML-like content. + +XML or HTML content is noticed because the example starts with ``<`` +(it's HTML if it starts with ```` or include an ``any`` +attribute in the tag. An ``any`` tag matches any tag, while the +attribute matches any and all attributes. + +When a match fails, the reformatted example and gotten text is +displayed (indented), and a rough diff-like output is given. Anything +marked with ``+`` is in the output but wasn't supposed to be, and +similarly ``-`` means its in the example but wasn't in the output. 
+ +You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` +""" + +from lxml import etree +import sys +import re +import doctest +try: + from html import escape as html_escape +except ImportError: + from cgi import escape as html_escape + +__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker', + 'LHTMLOutputChecker', 'install', 'temp_install'] + +try: + _basestring = basestring +except NameError: + _basestring = (str, bytes) + +_IS_PYTHON_3 = sys.version_info[0] >= 3 + +PARSE_HTML = doctest.register_optionflag('PARSE_HTML') +PARSE_XML = doctest.register_optionflag('PARSE_XML') +NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP') + +OutputChecker = doctest.OutputChecker + +def strip(v): + if v is None: + return None + else: + return v.strip() + +def norm_whitespace(v): + return _norm_whitespace_re.sub(' ', v) + +_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True) + +def html_fromstring(html): + return etree.fromstring(html, _html_parser) + +# We use this to distinguish repr()s from elements: +_repr_re = re.compile(r'^<[^>]+ (at|object) ') +_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+') + +class LXMLOutputChecker(OutputChecker): + + empty_tags = ( + 'param', 'img', 'area', 'br', 'basefont', 'input', + 'base', 'meta', 'link', 'col') + + def get_default_parser(self): + return etree.XML + + def check_output(self, want, got, optionflags): + alt_self = getattr(self, '_temp_override_self', None) + if alt_self is not None: + super_method = self._temp_call_super_check_output + self = alt_self + else: + super_method = OutputChecker.check_output + parser = self.get_parser(want, got, optionflags) + if not parser: + return super_method( + self, want, got, optionflags) + try: + want_doc = parser(want) + except etree.XMLSyntaxError: + return False + try: + got_doc = parser(got) + except etree.XMLSyntaxError: + return False + return self.compare_docs(want_doc, got_doc) + + def get_parser(self, want, got, 
optionflags): + parser = None + if NOPARSE_MARKUP & optionflags: + return None + if PARSE_HTML & optionflags: + parser = html_fromstring + elif PARSE_XML & optionflags: + parser = etree.XML + elif (want.strip().lower().startswith('' % el.tag + return '<%s %s>' % (el.tag, ' '.join(attrs)) + + def format_end_tag(self, el): + if isinstance(el, etree.CommentBase): + # FIXME: probably PIs should be handled specially too? + return '-->' + return '' % el.tag + + def collect_diff(self, want, got, html, indent): + parts = [] + if not len(want) and not len(got): + parts.append(' '*indent) + parts.append(self.collect_diff_tag(want, got)) + if not self.html_empty_tag(got, html): + parts.append(self.collect_diff_text(want.text, got.text)) + parts.append(self.collect_diff_end_tag(want, got)) + parts.append(self.collect_diff_text(want.tail, got.tail)) + parts.append('\n') + return ''.join(parts) + parts.append(' '*indent) + parts.append(self.collect_diff_tag(want, got)) + parts.append('\n') + if strip(want.text) or strip(got.text): + parts.append(' '*indent) + parts.append(self.collect_diff_text(want.text, got.text)) + parts.append('\n') + want_children = list(want) + got_children = list(got) + while want_children or got_children: + if not want_children: + parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+')) + continue + if not got_children: + parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-')) + continue + parts.append(self.collect_diff( + want_children.pop(0), got_children.pop(0), html, indent+2)) + parts.append(' '*indent) + parts.append(self.collect_diff_end_tag(want, got)) + parts.append('\n') + if strip(want.tail) or strip(got.tail): + parts.append(' '*indent) + parts.append(self.collect_diff_text(want.tail, got.tail)) + parts.append('\n') + return ''.join(parts) + + def collect_diff_tag(self, want, got): + if not self.tag_compare(want.tag, got.tag): + tag = '%s (got: %s)' % (want.tag, got.tag) + else: + tag = got.tag + attrs = [] + 
any = want.tag == 'any' or 'any' in want.attrib + for name, value in sorted(got.attrib.items()): + if name not in want.attrib and not any: + attrs.append('+%s="%s"' % (name, self.format_text(value, False))) + else: + if name in want.attrib: + text = self.collect_diff_text(want.attrib[name], value, False) + else: + text = self.format_text(value, False) + attrs.append('%s="%s"' % (name, text)) + if not any: + for name, value in sorted(want.attrib.items()): + if name in got.attrib: + continue + attrs.append('-%s="%s"' % (name, self.format_text(value, False))) + if attrs: + tag = '<%s %s>' % (tag, ' '.join(attrs)) + else: + tag = '<%s>' % tag + return tag + + def collect_diff_end_tag(self, want, got): + if want.tag != got.tag: + tag = '%s (got: %s)' % (want.tag, got.tag) + else: + tag = got.tag + return '' % tag + + def collect_diff_text(self, want, got, strip=True): + if self.text_compare(want, got, strip): + if not got: + return '' + return self.format_text(got, strip) + text = '%s (got: %s)' % (want, got) + return self.format_text(text, strip) + +class LHTMLOutputChecker(LXMLOutputChecker): + def get_default_parser(self): + return html_fromstring + +def install(html=False): + """ + Install doctestcompare for all future doctests. + + If html is true, then by default the HTML parser will be used; + otherwise the XML parser is used. + """ + if html: + doctest.OutputChecker = LHTMLOutputChecker + else: + doctest.OutputChecker = LXMLOutputChecker + +def temp_install(html=False, del_module=None): + """ + Use this *inside* a doctest to enable this checker for this + doctest only. + + If html is true, then by default the HTML parser will be used; + otherwise the XML parser is used. 
+ """ + if html: + Checker = LHTMLOutputChecker + else: + Checker = LXMLOutputChecker + frame = _find_doctest_frame() + dt_self = frame.f_locals['self'] + checker = Checker() + old_checker = dt_self._checker + dt_self._checker = checker + # The unfortunate thing is that there is a local variable 'check' + # in the function that runs the doctests, that is a bound method + # into the output checker. We have to update that. We can't + # modify the frame, so we have to modify the object in place. The + # only way to do this is to actually change the func_code + # attribute of the method. We change it, and then wait for + # __record_outcome to be run, which signals the end of the __run + # method, at which point we restore the previous check_output + # implementation. + if _IS_PYTHON_3: + check_func = frame.f_locals['check'].__func__ + checker_check_func = checker.check_output.__func__ + else: + check_func = frame.f_locals['check'].im_func + checker_check_func = checker.check_output.im_func + # Because we can't patch up func_globals, this is the only global + # in check_output that we care about: + doctest.etree = etree + _RestoreChecker(dt_self, old_checker, checker, + check_func, checker_check_func, + del_module) + +class _RestoreChecker(object): + def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func, + del_module): + self.dt_self = dt_self + self.checker = old_checker + self.checker._temp_call_super_check_output = self.call_super + self.checker._temp_override_self = new_checker + self.check_func = check_func + self.clone_func = clone_func + self.del_module = del_module + self.install_clone() + self.install_dt_self() + def install_clone(self): + if _IS_PYTHON_3: + self.func_code = self.check_func.__code__ + self.func_globals = self.check_func.__globals__ + self.check_func.__code__ = self.clone_func.__code__ + else: + self.func_code = self.check_func.func_code + self.func_globals = self.check_func.func_globals + self.check_func.func_code = 
self.clone_func.func_code + def uninstall_clone(self): + if _IS_PYTHON_3: + self.check_func.__code__ = self.func_code + else: + self.check_func.func_code = self.func_code + def install_dt_self(self): + self.prev_func = self.dt_self._DocTestRunner__record_outcome + self.dt_self._DocTestRunner__record_outcome = self + def uninstall_dt_self(self): + self.dt_self._DocTestRunner__record_outcome = self.prev_func + def uninstall_module(self): + if self.del_module: + import sys + del sys.modules[self.del_module] + if '.' in self.del_module: + package, module = self.del_module.rsplit('.', 1) + package_mod = sys.modules[package] + delattr(package_mod, module) + def __call__(self, *args, **kw): + self.uninstall_clone() + self.uninstall_dt_self() + del self.checker._temp_override_self + del self.checker._temp_call_super_check_output + result = self.prev_func(*args, **kw) + self.uninstall_module() + return result + def call_super(self, *args, **kw): + self.uninstall_clone() + try: + return self.check_func(*args, **kw) + finally: + self.install_clone() + +def _find_doctest_frame(): + import sys + frame = sys._getframe(1) + while frame: + l = frame.f_locals + if 'BOOM' in l: + # Sign of doctest + return frame + frame = frame.f_back + raise LookupError( + "Could not find doctest (only use this function *inside* a doctest)") + +__test__ = { + 'basic': ''' + >>> temp_install() + >>> print """stuff""" + ... 
+ >>> print """""" + + + + >>> print """blahblahblah""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS + ...foo /> + '''} + +if __name__ == '__main__': + import doctest + doctest.testmod() + + diff --git a/libs/lxml/etree.h b/libs/lxml/etree.h new file mode 100644 index 000000000..dcf739840 --- /dev/null +++ b/libs/lxml/etree.h @@ -0,0 +1,223 @@ +/* Generated by Cython 0.29.2 */ + +#ifndef __PYX_HAVE__lxml__etree +#define __PYX_HAVE__lxml__etree + +struct LxmlDocument; +struct LxmlElement; +struct LxmlElementTree; +struct LxmlElementTagMatcher; +struct LxmlElementIterator; +struct LxmlElementBase; +struct LxmlElementClassLookup; +struct LxmlFallbackElementClassLookup; + +/* "lxml/etree.pyx":318 + * + * # type of a function that steps from node to node + * ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*) # <<<<<<<<<<<<<< + * + * + */ +typedef xmlNode *(*_node_to_node_function)(xmlNode *); + +/* "lxml/etree.pyx":334 + * @cython.final + * @cython.freelist(8) + * cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]: # <<<<<<<<<<<<<< + * u"""Internal base class to reference a libxml document. + * + */ +struct LxmlDocument { + PyObject_HEAD + struct __pyx_vtabstruct_4lxml_5etree__Document *__pyx_vtab; + int _ns_counter; + PyObject *_prefix_tail; + xmlDoc *_c_doc; + struct __pyx_obj_4lxml_5etree__BaseParser *_parser; +}; + +/* "lxml/etree.pyx":683 + * + * @cython.no_gc_clear + * cdef public class _Element [ type LxmlElementType, object LxmlElement ]: # <<<<<<<<<<<<<< + * u"""Element class. 
+ * + */ +struct LxmlElement { + PyObject_HEAD + struct LxmlDocument *_doc; + xmlNode *_c_node; + PyObject *_tag; +}; + +/* "lxml/etree.pyx":1847 + * + * + * cdef public class _ElementTree [ type LxmlElementTreeType, # <<<<<<<<<<<<<< + * object LxmlElementTree ]: + * cdef _Document _doc + */ +struct LxmlElementTree { + PyObject_HEAD + struct __pyx_vtabstruct_4lxml_5etree__ElementTree *__pyx_vtab; + struct LxmlDocument *_doc; + struct LxmlElement *_context_node; +}; + +/* "lxml/etree.pyx":2574 + * + * + * cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher, # <<<<<<<<<<<<<< + * type LxmlElementTagMatcherType ]: + * """ + */ +struct LxmlElementTagMatcher { + PyObject_HEAD + struct __pyx_vtabstruct_4lxml_5etree__ElementTagMatcher *__pyx_vtab; + PyObject *_pystrings; + int _node_type; + char *_href; + char *_name; +}; + +/* "lxml/etree.pyx":2605 + * self._name = NULL + * + * cdef public class _ElementIterator(_ElementTagMatcher) [ # <<<<<<<<<<<<<< + * object LxmlElementIterator, type LxmlElementIteratorType ]: + * """ + */ +struct LxmlElementIterator { + struct LxmlElementTagMatcher __pyx_base; + struct LxmlElement *_node; + _node_to_node_function _next_element; +}; + +/* "src/lxml/classlookup.pxi":6 + * # Custom Element classes + * + * cdef public class ElementBase(_Element) [ type LxmlElementBaseType, # <<<<<<<<<<<<<< + * object LxmlElementBase ]: + * u"""ElementBase(*children, attrib=None, nsmap=None, **_extra) + */ +struct LxmlElementBase { + struct LxmlElement __pyx_base; +}; + +/* "src/lxml/classlookup.pxi":210 + * # Element class lookup + * + * ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*) # <<<<<<<<<<<<<< + * + * # class to store element class lookup functions + */ +typedef PyObject *(*_element_class_lookup_function)(PyObject *, struct LxmlDocument *, xmlNode *); + +/* "src/lxml/classlookup.pxi":213 + * + * # class to store element class lookup functions + * cdef public class ElementClassLookup [ type 
LxmlElementClassLookupType, # <<<<<<<<<<<<<< + * object LxmlElementClassLookup ]: + * u"""ElementClassLookup(self) + */ +struct LxmlElementClassLookup { + PyObject_HEAD + _element_class_lookup_function _lookup_function; +}; + +/* "src/lxml/classlookup.pxi":221 + * + * + * cdef public class FallbackElementClassLookup(ElementClassLookup) \ # <<<<<<<<<<<<<< + * [ type LxmlFallbackElementClassLookupType, + * object LxmlFallbackElementClassLookup ]: + */ +struct LxmlFallbackElementClassLookup { + struct LxmlElementClassLookup __pyx_base; + struct __pyx_vtabstruct_4lxml_5etree_FallbackElementClassLookup *__pyx_vtab; + struct LxmlElementClassLookup *fallback; + _element_class_lookup_function _fallback_function; +}; + +#ifndef __PYX_HAVE_API__lxml__etree + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#ifndef DL_IMPORT + #define DL_IMPORT(_T) _T +#endif + +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlDocumentType; +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementType; +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementTreeType; +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementTagMatcherType; +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementIteratorType; +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementBaseType; +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementClassLookupType; +__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlFallbackElementClassLookupType; + +__PYX_EXTERN_C struct LxmlElement *deepcopyNodeToDocument(struct LxmlDocument *, xmlNode *); +__PYX_EXTERN_C struct LxmlElementTree *elementTreeFactory(struct LxmlElement *); +__PYX_EXTERN_C struct LxmlElementTree *newElementTree(struct LxmlElement *, PyObject *); +__PYX_EXTERN_C struct LxmlElementTree *adoptExternalDocument(xmlDoc *, PyObject *, int); +__PYX_EXTERN_C struct LxmlElement *elementFactory(struct LxmlDocument *, xmlNode *); +__PYX_EXTERN_C struct LxmlElement *makeElement(PyObject *, struct LxmlDocument *, 
PyObject *, PyObject *, PyObject *, PyObject *, PyObject *); +__PYX_EXTERN_C struct LxmlElement *makeSubElement(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *); +__PYX_EXTERN_C void setElementClassLookupFunction(_element_class_lookup_function, PyObject *); +__PYX_EXTERN_C PyObject *lookupDefaultElementClass(PyObject *, PyObject *, xmlNode *); +__PYX_EXTERN_C PyObject *lookupNamespaceElementClass(PyObject *, PyObject *, xmlNode *); +__PYX_EXTERN_C PyObject *callLookupFallback(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *); +__PYX_EXTERN_C int tagMatches(xmlNode *, const xmlChar *, const xmlChar *); +__PYX_EXTERN_C struct LxmlDocument *documentOrRaise(PyObject *); +__PYX_EXTERN_C struct LxmlElement *rootNodeOrRaise(PyObject *); +__PYX_EXTERN_C int hasText(xmlNode *); +__PYX_EXTERN_C int hasTail(xmlNode *); +__PYX_EXTERN_C PyObject *textOf(xmlNode *); +__PYX_EXTERN_C PyObject *tailOf(xmlNode *); +__PYX_EXTERN_C int setNodeText(xmlNode *, PyObject *); +__PYX_EXTERN_C int setTailText(xmlNode *, PyObject *); +__PYX_EXTERN_C PyObject *attributeValue(xmlNode *, xmlAttr *); +__PYX_EXTERN_C PyObject *attributeValueFromNsName(xmlNode *, const xmlChar *, const xmlChar *); +__PYX_EXTERN_C PyObject *getAttributeValue(struct LxmlElement *, PyObject *, PyObject *); +__PYX_EXTERN_C PyObject *iterattributes(struct LxmlElement *, int); +__PYX_EXTERN_C PyObject *collectAttributes(xmlNode *, int); +__PYX_EXTERN_C int setAttributeValue(struct LxmlElement *, PyObject *, PyObject *); +__PYX_EXTERN_C int delAttribute(struct LxmlElement *, PyObject *); +__PYX_EXTERN_C int delAttributeFromNsName(xmlNode *, const xmlChar *, const xmlChar *); +__PYX_EXTERN_C int hasChild(xmlNode *); +__PYX_EXTERN_C xmlNode *findChild(xmlNode *, Py_ssize_t); +__PYX_EXTERN_C xmlNode *findChildForwards(xmlNode *, Py_ssize_t); +__PYX_EXTERN_C xmlNode *findChildBackwards(xmlNode *, Py_ssize_t); +__PYX_EXTERN_C xmlNode *nextElement(xmlNode *); 
+__PYX_EXTERN_C xmlNode *previousElement(xmlNode *); +__PYX_EXTERN_C void appendChild(struct LxmlElement *, struct LxmlElement *); +__PYX_EXTERN_C int appendChildToElement(struct LxmlElement *, struct LxmlElement *); +__PYX_EXTERN_C PyObject *pyunicode(const xmlChar *); +__PYX_EXTERN_C PyObject *utf8(PyObject *); +__PYX_EXTERN_C PyObject *getNsTag(PyObject *); +__PYX_EXTERN_C PyObject *getNsTagWithEmptyNs(PyObject *); +__PYX_EXTERN_C PyObject *namespacedName(xmlNode *); +__PYX_EXTERN_C PyObject *namespacedNameFromNsName(const xmlChar *, const xmlChar *); +__PYX_EXTERN_C void iteratorStoreNext(struct LxmlElementIterator *, struct LxmlElement *); +__PYX_EXTERN_C void initTagMatch(struct LxmlElementTagMatcher *, PyObject *); +__PYX_EXTERN_C xmlNs *findOrBuildNodeNsPrefix(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *); + +#endif /* !__PYX_HAVE_API__lxml__etree */ + +/* WARNING: the interface of the module init function changed in CPython 3.5. */ +/* It now returns a PyModuleDef instance instead of a PyModule instance. 
*/ + +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC initetree(void); +#else +PyMODINIT_FUNC PyInit_etree(void); +#endif + +#endif /* !__PYX_HAVE__lxml__etree */ diff --git a/libs/lxml/etree.pyd b/libs/lxml/etree.pyd new file mode 100644 index 000000000..a03f974e6 Binary files /dev/null and b/libs/lxml/etree.pyd differ diff --git a/libs/lxml/etree_api.h b/libs/lxml/etree_api.h new file mode 100644 index 000000000..912f48c36 --- /dev/null +++ b/libs/lxml/etree_api.h @@ -0,0 +1,219 @@ +/* Generated by Cython 0.29.2 */ + +#ifndef __PYX_HAVE_API__lxml__etree +#define __PYX_HAVE_API__lxml__etree +#ifdef __MINGW64__ +#define MS_WIN64 +#endif +#include "Python.h" +#include "etree.h" + +static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument)(struct LxmlDocument *, xmlNode *) = 0; +#define deepcopyNodeToDocument __pyx_api_f_4lxml_5etree_deepcopyNodeToDocument +static struct LxmlElementTree *(*__pyx_api_f_4lxml_5etree_elementTreeFactory)(struct LxmlElement *) = 0; +#define elementTreeFactory __pyx_api_f_4lxml_5etree_elementTreeFactory +static struct LxmlElementTree *(*__pyx_api_f_4lxml_5etree_newElementTree)(struct LxmlElement *, PyObject *) = 0; +#define newElementTree __pyx_api_f_4lxml_5etree_newElementTree +static struct LxmlElementTree *(*__pyx_api_f_4lxml_5etree_adoptExternalDocument)(xmlDoc *, PyObject *, int) = 0; +#define adoptExternalDocument __pyx_api_f_4lxml_5etree_adoptExternalDocument +static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_elementFactory)(struct LxmlDocument *, xmlNode *) = 0; +#define elementFactory __pyx_api_f_4lxml_5etree_elementFactory +static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_makeElement)(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *) = 0; +#define makeElement __pyx_api_f_4lxml_5etree_makeElement +static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_makeSubElement)(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *) = 0; +#define 
makeSubElement __pyx_api_f_4lxml_5etree_makeSubElement +static void (*__pyx_api_f_4lxml_5etree_setElementClassLookupFunction)(_element_class_lookup_function, PyObject *) = 0; +#define setElementClassLookupFunction __pyx_api_f_4lxml_5etree_setElementClassLookupFunction +static PyObject *(*__pyx_api_f_4lxml_5etree_lookupDefaultElementClass)(PyObject *, PyObject *, xmlNode *) = 0; +#define lookupDefaultElementClass __pyx_api_f_4lxml_5etree_lookupDefaultElementClass +static PyObject *(*__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass)(PyObject *, PyObject *, xmlNode *) = 0; +#define lookupNamespaceElementClass __pyx_api_f_4lxml_5etree_lookupNamespaceElementClass +static PyObject *(*__pyx_api_f_4lxml_5etree_callLookupFallback)(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *) = 0; +#define callLookupFallback __pyx_api_f_4lxml_5etree_callLookupFallback +static int (*__pyx_api_f_4lxml_5etree_tagMatches)(xmlNode *, const xmlChar *, const xmlChar *) = 0; +#define tagMatches __pyx_api_f_4lxml_5etree_tagMatches +static struct LxmlDocument *(*__pyx_api_f_4lxml_5etree_documentOrRaise)(PyObject *) = 0; +#define documentOrRaise __pyx_api_f_4lxml_5etree_documentOrRaise +static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_rootNodeOrRaise)(PyObject *) = 0; +#define rootNodeOrRaise __pyx_api_f_4lxml_5etree_rootNodeOrRaise +static int (*__pyx_api_f_4lxml_5etree_hasText)(xmlNode *) = 0; +#define hasText __pyx_api_f_4lxml_5etree_hasText +static int (*__pyx_api_f_4lxml_5etree_hasTail)(xmlNode *) = 0; +#define hasTail __pyx_api_f_4lxml_5etree_hasTail +static PyObject *(*__pyx_api_f_4lxml_5etree_textOf)(xmlNode *) = 0; +#define textOf __pyx_api_f_4lxml_5etree_textOf +static PyObject *(*__pyx_api_f_4lxml_5etree_tailOf)(xmlNode *) = 0; +#define tailOf __pyx_api_f_4lxml_5etree_tailOf +static int (*__pyx_api_f_4lxml_5etree_setNodeText)(xmlNode *, PyObject *) = 0; +#define setNodeText __pyx_api_f_4lxml_5etree_setNodeText +static int 
(*__pyx_api_f_4lxml_5etree_setTailText)(xmlNode *, PyObject *) = 0; +#define setTailText __pyx_api_f_4lxml_5etree_setTailText +static PyObject *(*__pyx_api_f_4lxml_5etree_attributeValue)(xmlNode *, xmlAttr *) = 0; +#define attributeValue __pyx_api_f_4lxml_5etree_attributeValue +static PyObject *(*__pyx_api_f_4lxml_5etree_attributeValueFromNsName)(xmlNode *, const xmlChar *, const xmlChar *) = 0; +#define attributeValueFromNsName __pyx_api_f_4lxml_5etree_attributeValueFromNsName +static PyObject *(*__pyx_api_f_4lxml_5etree_getAttributeValue)(struct LxmlElement *, PyObject *, PyObject *) = 0; +#define getAttributeValue __pyx_api_f_4lxml_5etree_getAttributeValue +static PyObject *(*__pyx_api_f_4lxml_5etree_iterattributes)(struct LxmlElement *, int) = 0; +#define iterattributes __pyx_api_f_4lxml_5etree_iterattributes +static PyObject *(*__pyx_api_f_4lxml_5etree_collectAttributes)(xmlNode *, int) = 0; +#define collectAttributes __pyx_api_f_4lxml_5etree_collectAttributes +static int (*__pyx_api_f_4lxml_5etree_setAttributeValue)(struct LxmlElement *, PyObject *, PyObject *) = 0; +#define setAttributeValue __pyx_api_f_4lxml_5etree_setAttributeValue +static int (*__pyx_api_f_4lxml_5etree_delAttribute)(struct LxmlElement *, PyObject *) = 0; +#define delAttribute __pyx_api_f_4lxml_5etree_delAttribute +static int (*__pyx_api_f_4lxml_5etree_delAttributeFromNsName)(xmlNode *, const xmlChar *, const xmlChar *) = 0; +#define delAttributeFromNsName __pyx_api_f_4lxml_5etree_delAttributeFromNsName +static int (*__pyx_api_f_4lxml_5etree_hasChild)(xmlNode *) = 0; +#define hasChild __pyx_api_f_4lxml_5etree_hasChild +static xmlNode *(*__pyx_api_f_4lxml_5etree_findChild)(xmlNode *, Py_ssize_t) = 0; +#define findChild __pyx_api_f_4lxml_5etree_findChild +static xmlNode *(*__pyx_api_f_4lxml_5etree_findChildForwards)(xmlNode *, Py_ssize_t) = 0; +#define findChildForwards __pyx_api_f_4lxml_5etree_findChildForwards +static xmlNode *(*__pyx_api_f_4lxml_5etree_findChildBackwards)(xmlNode *, 
Py_ssize_t) = 0; +#define findChildBackwards __pyx_api_f_4lxml_5etree_findChildBackwards +static xmlNode *(*__pyx_api_f_4lxml_5etree_nextElement)(xmlNode *) = 0; +#define nextElement __pyx_api_f_4lxml_5etree_nextElement +static xmlNode *(*__pyx_api_f_4lxml_5etree_previousElement)(xmlNode *) = 0; +#define previousElement __pyx_api_f_4lxml_5etree_previousElement +static void (*__pyx_api_f_4lxml_5etree_appendChild)(struct LxmlElement *, struct LxmlElement *) = 0; +#define appendChild __pyx_api_f_4lxml_5etree_appendChild +static int (*__pyx_api_f_4lxml_5etree_appendChildToElement)(struct LxmlElement *, struct LxmlElement *) = 0; +#define appendChildToElement __pyx_api_f_4lxml_5etree_appendChildToElement +static PyObject *(*__pyx_api_f_4lxml_5etree_pyunicode)(const xmlChar *) = 0; +#define pyunicode __pyx_api_f_4lxml_5etree_pyunicode +static PyObject *(*__pyx_api_f_4lxml_5etree_utf8)(PyObject *) = 0; +#define utf8 __pyx_api_f_4lxml_5etree_utf8 +static PyObject *(*__pyx_api_f_4lxml_5etree_getNsTag)(PyObject *) = 0; +#define getNsTag __pyx_api_f_4lxml_5etree_getNsTag +static PyObject *(*__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs)(PyObject *) = 0; +#define getNsTagWithEmptyNs __pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs +static PyObject *(*__pyx_api_f_4lxml_5etree_namespacedName)(xmlNode *) = 0; +#define namespacedName __pyx_api_f_4lxml_5etree_namespacedName +static PyObject *(*__pyx_api_f_4lxml_5etree_namespacedNameFromNsName)(const xmlChar *, const xmlChar *) = 0; +#define namespacedNameFromNsName __pyx_api_f_4lxml_5etree_namespacedNameFromNsName +static void (*__pyx_api_f_4lxml_5etree_iteratorStoreNext)(struct LxmlElementIterator *, struct LxmlElement *) = 0; +#define iteratorStoreNext __pyx_api_f_4lxml_5etree_iteratorStoreNext +static void (*__pyx_api_f_4lxml_5etree_initTagMatch)(struct LxmlElementTagMatcher *, PyObject *) = 0; +#define initTagMatch __pyx_api_f_4lxml_5etree_initTagMatch +static xmlNs *(*__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix)(struct 
#define findOrBuildNodeNsPrefix __pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix
#if !defined(__Pyx_PyIdentifier_FromString)
#if PY_MAJOR_VERSION < 3
  #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s)
#else
  #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s)
#endif
#endif

#ifndef __PYX_HAVE_RT_ImportFunction
#define __PYX_HAVE_RT_ImportFunction
/*
 * Resolve one C function exported by `module` through its `__pyx_capi__`
 * mapping and store it in `*f`.
 *
 *   module   - imported module object whose `__pyx_capi__` attribute is read
 *   funcname - key looked up in that mapping
 *   f        - out-parameter receiving the function pointer
 *   sig      - expected C signature string; it must match the name stored on
 *              the capsule (or the CObject description before Python 2.7)
 *
 * Returns 0 on success and -1 on failure.  The missing-entry and
 * signature-mismatch paths set ImportError / TypeError explicitly.
 */
static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
    PyObject *d = 0;
    PyObject *cobj = 0;
    /* The union converts the capsule's `void *` payload to a function
       pointer without a direct object-pointer/function-pointer cast. */
    union {
        void (*fp)(void);
        void *p;
    } tmp;
    d = PyObject_GetAttrString(module, (char *)"__pyx_capi__");
    if (!d)
        goto bad;
    /* PyDict_GetItemString returns a borrowed reference, which is why
       `cobj` is never DECREF'ed below. */
    cobj = PyDict_GetItemString(d, funcname);
    if (!cobj) {
        PyErr_Format(PyExc_ImportError,
            "%.200s does not export expected C function %.200s",
                PyModule_GetName(module), funcname);
        goto bad;
    }
#if PY_VERSION_HEX >= 0x02070000
    /* Capsule path: the capsule name doubles as the signature string. */
    if (!PyCapsule_IsValid(cobj, sig)) {
        PyErr_Format(PyExc_TypeError,
            "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
             PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj));
        goto bad;
    }
    tmp.p = PyCapsule_GetPointer(cobj, sig);
#else
    /* Pre-2.7 path: compare the CObject description with `sig` by hand. */
    {const char *desc, *s1, *s2;
    desc = (const char *)PyCObject_GetDesc(cobj);
    if (!desc)
        goto bad;
    s1 = desc; s2 = sig;
    while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; }
    if (*s1 != *s2) {
        PyErr_Format(PyExc_TypeError,
            "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
             PyModule_GetName(module), funcname, sig, desc);
        goto bad;
    }
    tmp.p = PyCObject_AsVoidPtr(cobj);}
#endif
    *f = tmp.fp;
    if (!(*f))
        goto bad;
    Py_DECREF(d);
    return 0;
bad:
    /* `d` may still be NULL here, hence XDECREF. */
    Py_XDECREF(d);
    return -1;
}
#endif


/*
 * Import "lxml.etree" and fill every `__pyx_api_f_4lxml_5etree_*` function
 * pointer from the module's exported C API.
 *
 * Each entry is resolved with __Pyx_ImportFunction using the exact signature
 * string given below; the first failure aborts the whole import.
 * Returns 0 on success and -1 on failure.  The module reference taken by
 * PyImport_ImportModule is released on both paths.
 */
static int import_lxml__etree(void) {
  PyObject *module = 0;
  module = PyImport_ImportModule("lxml.etree");
  if (!module) goto bad;
  if (__Pyx_ImportFunction(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "adoptExternalDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_adoptExternalDocument, "struct LxmlElementTree *(xmlDoc *, PyObject *, int)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
  Py_DECREF(module); module = 0;
  return 0;
  bad:
  /* `module` is NULL when the import itself failed, hence XDECREF. */
  Py_XDECREF(module);
  return -1;
}
__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
"""

__all__ = ["parse", "convert_tree"]

# Explicit relative import: the bare ``from soupparser import ...`` form only
# resolves through Python 2's implicit relative imports and fails on Python 3.
# NOTE(review): assumes ``soupparser`` is a sibling module in this package --
# confirm it is vendored next to this file.
from .soupparser import convert_tree, parse as _parse


def parse(file, beautifulsoup=None, makeelement=None):
    """Parse *file* with the soupparser backend and return its root.

    All three arguments are forwarded unchanged to ``soupparser.parse``
    (``beautifulsoup`` and ``makeelement`` as keyword arguments).  The
    value returned is the result of calling ``getroot()`` on whatever
    ``soupparser.parse`` produced, rather than that object itself.
    """
    root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
    return root.getroot()
IN NO EVENT SHALL IAN BICKING OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""The ``lxml.html`` tool set for HTML handling. +""" + +from __future__ import absolute_import + +__all__ = [ + 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', + 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] + + +import copy +import sys +import re +from functools import partial + +try: + from collections.abc import MutableMapping, MutableSet +except ImportError: + from collections import MutableMapping, MutableSet + +from .. import etree +from . 
def __fix_docstring(s):
    """Strip the version-specific string prefix from doctest-style lines.

    On Python 3 a ``u'`` that opens a line (after optional whitespace) is
    rewritten to ``'``; on Python 2 the same is done for a leading ``b'``.
    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    if sys.version_info[0] >= 3:
        sub = re.compile(r"^(\s*)u'", re.M).sub
    else:
        sub = re.compile(r"^(\s*)b'", re.M).sub
    return sub(r"\1'", s)


XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Where two alternatives are given, the
# first matches the un-namespaced tag and the second the XHTML-namespaced one
# (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Whole-word class match: both the attribute value and the wanted name are
# padded with spaces before the substring test.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches ``url(...)`` with a double-quoted, single-quoted or bare argument.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
_archive_re = re.compile(r'[^ ]+')
# The capture group was written as ``(?P.*)``, which is not valid regex
# syntax and makes ``re.compile`` raise while the module is being imported.
# It is restored here as a proper named group.
# NOTE(review): the group name ``url`` is presumed from the ``url=`` prefix
# in the pattern -- confirm against the code that reads the match.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search


def _unquote_match(s, pos):
    """Strip one pair of matching quotes from *s*, adjusting *pos*.

    If *s* both starts and ends with ``"`` (or both with ``'``), return the
    text between them together with ``pos + 1``; otherwise return *s* and
    *pos* unchanged.
    """
    if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
        return s[1:-1], pos+1
    else:
        return s,pos


def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``bytes`` subclasses get *result* serialised via ``tostring`` as UTF-8,
    ``unicode`` subclasses get it serialised as unicode text, and any other
    *typ* gets *result* back untouched.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    elif issubclass(typ, unicode):
        return tostring(result, encoding='unicode')
    else:
        return result
def _nons(tag):
    """Return *tag* without a leading XHTML namespace, if it has one.

    Non-string tags, and strings that do not start with
    ``{<XHTML_NAMESPACE>``, are returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag


class Classes(MutableSet):
    """Set-like view of the space-separated ``class`` attribute.

    The wrapper keeps no state of its own: every operation re-reads the
    ``'class'`` entry of the attribute mapping it was given and writes the
    result straight back.  Usage::

        >>> el = fromstring('<p class="hidden large">text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        self._attributes = attributes
        # Zero-argument reader for the current attribute text ('' if unset).
        self._get_class_value = partial(attributes.get, 'class', '')

    def add(self, value):
        """
        Add a class name; a name that is already present is left alone.

        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class name if present; do nothing if it is absent.

        The attribute itself is deleted once no names are left.
        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        kept = ' '.join(
            name for name in self._get_class_value().split() if name != value)
        if kept:
            self._attributes['class'] = kept
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class name that must currently be present.

        Raises KeyError if the name is absent and ValueError if it is
        empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        # Cheap substring test first; only split when it could be a match.
        if name not in raw:
            return False
        return name in raw.split()

    def __iter__(self):
        names = self._get_class_value().split()
        return iter(names)

    def __len__(self):
        names = self._get_class_value().split()
        return len(names)

    # non-standard methods

    def update(self, values):
        """
        Add every name from 'values' that is not present yet.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for candidate in values:
            if candidate not in names:
                names.append(candidate)
        # Only write back when something was actually appended.
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Flip a class name: add it when missing, remove it when present.

        Returns true if the name was added (is now enabled) and false if
        it was removed (is now disabled).  Raises ValueError for an empty
        name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        if value in names:
            names.remove(value)
            enabled = False
        else:
            names.append(value)
            enabled = True
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
" + for ``form.set('novalidate')``. + """ + super(HtmlElement, self).set(key, value) + + @property + def classes(self): + """ + A set-like wrapper around the 'class' attribute. + """ + return Classes(self.attrib) + + @classes.setter + def classes(self, classes): + assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc. + value = classes._get_class_value() + if value: + self.set('class', value) + elif self.get('class') is not None: + del self.attrib['class'] + + @property + def base_url(self): + """ + Returns the base URL, given when the page was parsed. + + Use with ``urlparse.urljoin(el.base_url, href)`` to get + absolute URLs. + """ + return self.getroottree().docinfo.URL + + @property + def forms(self): + """ + Return a list of all the forms + """ + return _forms_xpath(self) + + @property + def body(self): + """ + Return the element. Can be called from a child element + to get the document's head. + """ + return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] + + @property + def head(self): + """ + Returns the element. Can be called from a child + element to get the document's head. + """ + return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] + + @property + def label(self): + """ + Get or set any