# bazarr/libs/waitress/parser.py

##############################################################################
#
# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""HTTP Request Parser

This server uses asyncore to accept connections and do initial
processing but threads to do work.
"""
import re
from io import BytesIO

from waitress.compat import (
    tostr,
    urlparse,
    unquote_bytes_to_wsgi,
)

from waitress.buffers import OverflowableBuffer

from waitress.receiver import (
    FixedStreamReceiver,
    ChunkedReceiver,
)

from waitress.utilities import (
    find_double_newline,
    RequestEntityTooLarge,
    RequestHeaderFieldsTooLarge,
    BadRequest,
)

class ParsingError(Exception):
    """Raised by the parsing helpers in this module for malformed input.

    The first positional argument is the human-readable reason.
    """

class HTTPRequestParser(object):
    """A structure that collects the HTTP request.

    Bytes read from the connection are fed to ``received()``, which first
    accumulates and parses the request line and headers and then hands any
    body bytes to a receiver object.  Once the stream is completed, the
    instance is passed to a server task constructor.
    """
    completed = False        # Set once request is completed.
    empty = False            # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False # True when headers have been read
    header_plus = b''        # header bytes buffered until the blank line
    chunked = False          # request used "Transfer-Encoding: chunked"
    content_length = 0       # parsed Content-Length, 0 when absent
    header_bytes_received = 0  # header bytes consumed so far
    body_bytes_received = 0    # body bytes consumed so far
    body_rcv = None          # body receiver; None while still in the header
    version = '1.0'
    error = None             # error to report instead of serving the request
    connection_close = False

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object.
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """
        Receives the HTTP stream for one request.  Returns the number of
        bytes consumed.  Sets the completed flag once both the header and the
        body have been received.

        If the request cannot be accepted, ``error`` is set and the request
        is marked completed instead of raising.
        """
        if self.completed:
            return 0 # Can't consume any more.
        if self.body_rcv is None:
            # In header.
            return self._received_header(data)
        # In body.
        return self._received_body(data)

    def _received_header(self, data):
        """Consume the part of ``data`` that belongs to the header block.

        Returns the number of bytes of ``data`` that were consumed.
        """
        datalen = len(data)
        max_header = self.adj.max_request_header_size
        s = self.header_plus + data
        index = find_double_newline(s)

        if index < 0:
            # Header not finished yet: keep buffering.
            self.header_bytes_received += datalen
            if self.header_bytes_received >= max_header:
                self._reject_oversized_header(max_header)
            self.header_plus = s
            return datalen

        # Header finished.  Only the bytes up to ``index`` belong to it; the
        # rest of ``data`` (if any) is left for the caller.
        consumed = datalen - (len(s) - index)
        self.header_bytes_received += consumed
        if self.header_bytes_received >= max_header:
            # Enforce the limit here as well, otherwise a header block whose
            # terminating blank line arrives in the same read as the rest of
            # it would never be checked against max_request_header_size.
            self._reject_oversized_header(max_header)
            return consumed

        # Remove preceding blank lines.
        header_plus = s[:index].lstrip()
        if not header_plus:
            self.empty = True
            self.completed = True
        else:
            try:
                self.parse_header(header_plus)
            except ParsingError as e:
                self.error = BadRequest(e.args[0])
                self.completed = True
            else:
                if self.body_rcv is None:
                    # no content-length header and not a t-e: chunked
                    # request
                    self.completed = True
                if self.content_length > 0:
                    max_body = self.adj.max_request_body_size
                    # we won't accept this request if the content-length
                    # is too large
                    if self.content_length >= max_body:
                        self.error = RequestEntityTooLarge(
                            'exceeds max_body of %s' % max_body)
                        self.completed = True
        self.headers_finished = True
        return consumed

    def _reject_oversized_header(self, max_header):
        """Complete the request with a RequestHeaderFieldsTooLarge error."""
        # malformed header, we need to construct some request
        # on our own. we disregard the incoming(?) requests HTTP
        # version and just use 1.0. IOW someone just sent garbage
        # over the wire
        self.parse_header(b'GET / HTTP/1.0\n')
        self.error = RequestHeaderFieldsTooLarge(
            'exceeds max_header of %s' % max_header)
        self.completed = True

    def _received_body(self, data):
        """Feed ``data`` to the body receiver and update completion state.

        Returns the number of bytes the receiver consumed.
        """
        br = self.body_rcv
        consumed = br.received(data)
        self.body_bytes_received += consumed
        max_body = self.adj.max_request_body_size
        if self.body_bytes_received >= max_body:
            # this will only be raised during t-e: chunked requests
            self.error = RequestEntityTooLarge(
                'exceeds max_body of %s' % max_body)
            self.completed = True
        elif br.error:
            # garbage in chunked encoding input probably
            self.error = br.error
            self.completed = True
        elif br.completed:
            # The request (with the body) is ready to use.
            self.completed = True
            if self.chunked:
                # We've converted the chunked transfer encoding request
                # body into a normal request body, so we know its content
                # length; set the header here.  We already popped the
                # TRANSFER_ENCODING header in parse_header, so this will
                # appear to the client to be an entirely non-chunked HTTP
                # request with a valid content-length.
                self.headers['CONTENT_LENGTH'] = str(br.__len__())
        return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).

        Fills in ``first_line``, ``headers``, ``command``, ``version``, the
        URI components, ``url_scheme``, ``connection_close`` and, depending
        on the headers, ``expect_continue``, ``chunked``, ``content_length``
        and ``body_rcv``.

        Raises ParsingError when the Content-Length header is not a
        non-negative integer.  ParsingError raised by get_header_lines,
        crack_first_line or split_uri propagates unchanged.
        """
        index = header_plus.find(b'\n')
        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 1:]
        else:
            first_line = header_plus.rstrip()
            header = b''

        self.first_line = first_line # for testing

        lines = get_header_lines(header)

        headers = self.headers
        for line in lines:
            index = line.find(b':')
            if index > 0:
                key = line[:index]
                if b'_' in key:
                    # Dashes become underscores below, so a name spelled
                    # with an underscore would be indistinguishable from
                    # its dash-spelled counterpart; drop it.
                    continue
                value = line[index + 1:].strip()
                key1 = tostr(key.upper().replace(b'-', b'_'))
                # If a header already exists, we append subsequent values
                # separated by a comma. Applications already need to handle
                # the comma separated values, as HTTP front ends might do
                # the concatenation for you (behavior specified in RFC2616).
                try:
                    headers[key1] += tostr(b', ' + value)
                except KeyError:
                    headers[key1] = tostr(value)
            # else there's garbage in the headers?

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)
        version = tostr(version)
        command = tostr(command)
        self.command = command
        self.version = version
        (self.proxy_scheme,
         self.proxy_netloc,
         self.path,
         self.query, self.fragment) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get('CONNECTION', '')

        if version == '1.0':
            # HTTP/1.0 connections are closed unless keep-alive is requested.
            if connection.lower() != 'keep-alive':
                self.connection_close = True

        if version == '1.1':
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop('TRANSFER_ENCODING', '')
            if te.lower() == 'chunked':
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            expect = headers.get('EXPECT', '').lower()
            self.expect_continue = expect == '100-continue'
            if connection.lower() == 'close':
                self.connection_close = True

        if not self.chunked:
            try:
                cl = int(headers.get('CONTENT_LENGTH', 0))
            except ValueError:
                # Treating an unparsable value as "no body" would leave the
                # body bytes on the wire to be read as a separate request
                # (e.g. a repeated Content-Length header is joined above
                # into "5, 5").  Reject the request instead.
                raise ParsingError('Content-Length is invalid')
            if cl < 0:
                raise ParsingError('Content-Length is invalid')
            self.content_length = cl
            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        """Return a file-like object for the request body.

        An empty BytesIO is returned when the request has no body receiver.
        """
        body_rcv = self.body_rcv
        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        """Close the buffer behind the body receiver, if there is one."""
        body_rcv = self.body_rcv
        if body_rcv is not None:
            body_rcv.getbuf().close()

def split_uri(uri):
    """Break a request URI into its five components.

    Returns a ``(scheme, netloc, path, query, fragment)`` tuple.  The path is
    passed through unquote_bytes_to_wsgi; every other component is passed
    through tostr.  Raises ParsingError('Bad URI') if urlsplit fails with a
    UnicodeError.
    """
    # urlsplit handles byte input by returning bytes on py3, so
    # every component in ``parts`` is bytes
    try:
        parts = urlparse.urlsplit(uri)
    except UnicodeError:
        raise ParsingError('Bad URI')

    scheme, netloc, path, query, fragment = parts
    wsgi_path = unquote_bytes_to_wsgi(path)
    return tostr(scheme), tostr(netloc), wsgi_path, tostr(query), tostr(fragment)

def get_header_lines(header):
    """
    Break the header block into logical lines.

    The block is split on ``\\n``.  A physical line that begins with a space
    or a tab is a continuation and is appended, unchanged, to the logical
    line before it.  Raises ParsingError if the very first line is such a
    continuation, since there is nothing to attach it to.
    """
    merged = []
    for raw_line in header.split(b'\n'):
        if not raw_line.startswith((b' ', b'\t')):
            merged.append(raw_line)
            continue
        if not merged:
            # http://corte.si/posts/code/pathod/pythonservers/index.html
            raise ParsingError('Malformed header line "%s"' % tostr(raw_line))
        merged[-1] = merged[-1] + raw_line
    return merged

# Request line: METHOD, a space, the request target (optionally an absolute
# URI with scheme and authority), then an optional " HTTP/<version>".
first_line_re = re.compile(
    b'([^ ]+) '
    b'((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)'
    b'(( HTTP/([0-9.]+))$|$)'
)

def crack_first_line(line):
    """Split an HTTP request line into ``(method, uri, version)``.

    All three items are bytes.  ``version`` is ``b''`` when the line carries
    no ``HTTP/x.y`` suffix, and all three are ``b''`` when the line does not
    match the request-line pattern at all.

    Raises ParsingError if the method contains lowercase characters.
    """
    m = first_line_re.match(line)
    if m is None or m.end() != len(line):
        return b'', b'', b''

    method = m.group(1)

    # the request methods that are currently defined are all uppercase:
    # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
    # the request method is case sensitive according to
    # https://tools.ietf.org/html/rfc7231#section-4.1

    # By disallowing anything but uppercase methods we save poor
    # unsuspecting souls from sending lowercase HTTP methods to waitress
    # and having the request complete, while servers like nginx drop the
    # request onto the floor.
    if method != method.upper():
        raise ParsingError('Malformed HTTP method "%s"' % tostr(method))

    # Keep the version bytes even when it is missing, so every return path
    # yields the same types and callers can convert it unconditionally.
    if m.group(3):
        version = m.group(5)
    else:
        version = b''
    uri = m.group(2)
    return method, uri, version
Include dependencies and remove requirements.txt 6 years ago			`##############################################################################`
			`#`
			`# Copyright (c) 2001, 2002 Zope Foundation and Contributors.`
			`# All Rights Reserved.`
			`#`
			`# This software is subject to the provisions of the Zope Public License,`
			`# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.`
			`# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED`
			`# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED`
			`# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS`
			`# FOR A PARTICULAR PURPOSE.`
			`#`
			`##############################################################################`
			`"""HTTP Request Parser`

			`This server uses asyncore to accept connections and do initial`
			`processing but threads to do work.`
			`"""`
			`import re`
			`from io import BytesIO`

			`from waitress.compat import (`
			`tostr,`
			`urlparse,`
			`unquote_bytes_to_wsgi,`
			`)`

			`from waitress.buffers import OverflowableBuffer`

			`from waitress.receiver import (`
			`FixedStreamReceiver,`
			`ChunkedReceiver,`
			`)`

			`from waitress.utilities import (`
			`find_double_newline,`
			`RequestEntityTooLarge,`
			`RequestHeaderFieldsTooLarge,`
			`BadRequest,`
			`)`

			`class ParsingError(Exception):`
			`pass`

			`class HTTPRequestParser(object):`
			`"""A structure that collects the HTTP request.`

			`Once the stream is completed, the instance is passed to`
			`a server task constructor.`
			`"""`
			`completed = False # Set once request is completed.`
			`empty = False # Set if no request was made.`
			`expect_continue = False # client sent "Expect: 100-continue" header`
			`headers_finished = False # True when headers have been read`
			`header_plus = b''`
			`chunked = False`
			`content_length = 0`
			`header_bytes_received = 0`
			`body_bytes_received = 0`
			`body_rcv = None`
			`version = '1.0'`
			`error = None`
			`connection_close = False`

			`# Other attributes: first_line, header, headers, command, uri, version,`
			`# path, query, fragment`

			`def __init__(self, adj):`
			`"""`
			`adj is an Adjustments object.`
			`"""`
			`# headers is a mapping containing keys translated to uppercase`
			`# with dashes turned into underscores.`
			`self.headers = {}`
			`self.adj = adj`

			`def received(self, data):`
			`"""`
			`Receives the HTTP stream for one request. Returns the number of`
			`bytes consumed. Sets the completed flag once both the header and the`
			`body have been received.`
			`"""`
			`if self.completed:`
			`return 0 # Can't consume any more.`
			`datalen = len(data)`
			`br = self.body_rcv`
			`if br is None:`
			`# In header.`
			`s = self.header_plus + data`
			`index = find_double_newline(s)`
			`if index >= 0:`
			`# Header finished.`
			`header_plus = s[:index]`
			`consumed = len(data) - (len(s) - index)`
			`# Remove preceeding blank lines.`
			`header_plus = header_plus.lstrip()`
			`if not header_plus:`
			`self.empty = True`
			`self.completed = True`
			`else:`
			`try:`
			`self.parse_header(header_plus)`
			`except ParsingError as e:`
			`self.error = BadRequest(e.args[0])`
			`self.completed = True`
			`else:`
			`if self.body_rcv is None:`
			`# no content-length header and not a t-e: chunked`
			`# request`
			`self.completed = True`
			`if self.content_length > 0:`
			`max_body = self.adj.max_request_body_size`
			`# we won't accept this request if the content-length`
			`# is too large`
			`if self.content_length >= max_body:`
			`self.error = RequestEntityTooLarge(`
			`'exceeds max_body of %s' % max_body)`
			`self.completed = True`
			`self.headers_finished = True`
			`return consumed`
			`else:`
			`# Header not finished yet.`
			`self.header_bytes_received += datalen`
			`max_header = self.adj.max_request_header_size`
			`if self.header_bytes_received >= max_header:`
			`# malformed header, we need to construct some request`
			`# on our own. we disregard the incoming(?) requests HTTP`
			`# version and just use 1.0. IOW someone just sent garbage`
			`# over the wire`
			`self.parse_header(b'GET / HTTP/1.0\n')`
			`self.error = RequestHeaderFieldsTooLarge(`
			`'exceeds max_header of %s' % max_header)`
			`self.completed = True`
			`self.header_plus = s`
			`return datalen`
			`else:`
			`# In body.`
			`consumed = br.received(data)`
			`self.body_bytes_received += consumed`
			`max_body = self.adj.max_request_body_size`
			`if self.body_bytes_received >= max_body:`
			`# this will only be raised during t-e: chunked requests`
			`self.error = RequestEntityTooLarge(`
			`'exceeds max_body of %s' % max_body)`
			`self.completed = True`
			`elif br.error:`
			`# garbage in chunked encoding input probably`
			`self.error = br.error`
			`self.completed = True`
			`elif br.completed:`
			`# The request (with the body) is ready to use.`
			`self.completed = True`
			`if self.chunked:`
			`# We've converted the chunked transfer encoding request`
			`# body into a normal request body, so we know its content`
			`# length; set the header here. We already popped the`
			`# TRANSFER_ENCODING header in parse_header, so this will`
			`# appear to the client to be an entirely non-chunked HTTP`
			`# request with a valid content-length.`
			`self.headers['CONTENT_LENGTH'] = str(br.__len__())`
			`return consumed`

			`def parse_header(self, header_plus):`
			`"""`
			`Parses the header_plus block of text (the headers plus the`
			`first line of the request).`
			`"""`
			`index = header_plus.find(b'\n')`
			`if index >= 0:`
			`first_line = header_plus[:index].rstrip()`
			`header = header_plus[index + 1:]`
			`else:`
			`first_line = header_plus.rstrip()`
			`header = b''`

			`self.first_line = first_line # for testing`

			`lines = get_header_lines(header)`

			`headers = self.headers`
			`for line in lines:`
			`index = line.find(b':')`
			`if index > 0:`
			`key = line[:index]`
			`if b'_' in key:`
			`continue`
			`value = line[index + 1:].strip()`
			`key1 = tostr(key.upper().replace(b'-', b'_'))`
			`# If a header already exists, we append subsequent values`
			`# seperated by a comma. Applications already need to handle`
			`# the comma seperated values, as HTTP front ends might do`
			`# the concatenation for you (behavior specified in RFC2616).`
			`try:`
			`headers[key1] += tostr(b', ' + value)`
			`except KeyError:`
			`headers[key1] = tostr(value)`
			`# else there's garbage in the headers?`

			`# command, uri, version will be bytes`
			`command, uri, version = crack_first_line(first_line)`
			`version = tostr(version)`
			`command = tostr(command)`
			`self.command = command`
			`self.version = version`
			`(self.proxy_scheme,`
			`self.proxy_netloc,`
			`self.path,`
			`self.query, self.fragment) = split_uri(uri)`
			`self.url_scheme = self.adj.url_scheme`
			`connection = headers.get('CONNECTION', '')`

			`if version == '1.0':`
			`if connection.lower() != 'keep-alive':`
			`self.connection_close = True`

			`if version == '1.1':`
			`# since the server buffers data from chunked transfers and clients`
			`# never need to deal with chunked requests, downstream clients`
			`# should not see the HTTP_TRANSFER_ENCODING header; we pop it`
			`# here`
			`te = headers.pop('TRANSFER_ENCODING', '')`
			`if te.lower() == 'chunked':`
			`self.chunked = True`
			`buf = OverflowableBuffer(self.adj.inbuf_overflow)`
			`self.body_rcv = ChunkedReceiver(buf)`
			`expect = headers.get('EXPECT', '').lower()`
			`self.expect_continue = expect == '100-continue'`
			`if connection.lower() == 'close':`
			`self.connection_close = True`

			`if not self.chunked:`
			`try:`
			`cl = int(headers.get('CONTENT_LENGTH', 0))`
			`except ValueError:`
			`cl = 0`
			`self.content_length = cl`
			`if cl > 0:`
			`buf = OverflowableBuffer(self.adj.inbuf_overflow)`
			`self.body_rcv = FixedStreamReceiver(cl, buf)`

			`def get_body_stream(self):`
			`body_rcv = self.body_rcv`
			`if body_rcv is not None:`
			`return body_rcv.getfile()`
			`else:`
			`return BytesIO()`

			`def close(self):`
			`body_rcv = self.body_rcv`
			`if body_rcv is not None:`
			`body_rcv.getbuf().close()`

			`def split_uri(uri):`
			`# urlsplit handles byte input by returning bytes on py3, so`
			`# scheme, netloc, path, query, and fragment are bytes`
			`try:`
			`scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)`
			`except UnicodeError:`
			`raise ParsingError('Bad URI')`
			`return (`
			`tostr(scheme),`
			`tostr(netloc),`
			`unquote_bytes_to_wsgi(path),`
			`tostr(query),`
			`tostr(fragment),`
			`)`

			`def get_header_lines(header):`
			`"""`
			`Splits the header into lines, putting multi-line headers together.`
			`"""`
			`r = []`
			`lines = header.split(b'\n')`
			`for line in lines:`
			`if line.startswith((b' ', b'\t')):`
			`if not r:`
			`# http://corte.si/posts/code/pathod/pythonservers/index.html`
			`raise ParsingError('Malformed header line "%s"' % tostr(line))`
			`r[-1] += line`
			`else:`
			`r.append(line)`
			`return r`

			`first_line_re = re.compile(`
			`b'([^ ]+) '`
			`b'((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)'`
			`b'(( HTTP/([0-9.]+))$|$)'`
			`)`

			`def crack_first_line(line):`
			`m = first_line_re.match(line)`
			`if m is not None and m.end() == len(line):`
			`if m.group(3):`
			`version = m.group(5)`
			`else:`
			`version = None`
			`method = m.group(1)`

			`# the request methods that are currently defined are all uppercase:`
			`# https://www.iana.org/assignments/http-methods/http-methods.xhtml and`
			`# the request method is case sensitive according to`
			`# https://tools.ietf.org/html/rfc7231#section-4.1`

			`# By disallowing anything but uppercase methods we save poor`
			`# unsuspecting souls from sending lowercase HTTP methods to waitress`
			`# and having the request complete, while servers like nginx drop the`
			`# request onto the floor.`
			`if method != method.upper():`
			`raise ParsingError('Malformed HTTP method "%s"' % tostr(method))`
			`uri = m.group(2)`
			`return method, uri, version`
			`else:`
			`return b'', b'', b''`