Diffstat (limited to 'python/werkzeug/formparser.py')
-rw-r--r--   python/werkzeug/formparser.py   586
1 file changed, 586 insertions, 0 deletions
diff --git a/python/werkzeug/formparser.py b/python/werkzeug/formparser.py
new file mode 100644
index 0000000..0ddc5c8
--- /dev/null
+++ b/python/werkzeug/formparser.py
@@ -0,0 +1,586 @@
+# -*- coding: utf-8 -*-
+"""
+    werkzeug.formparser
+    ~~~~~~~~~~~~~~~~~~~
+
+    This module implements form parsing.  It supports url-encoded forms
+    as well as non-nested multipart uploads.
+
+    :copyright: 2007 Pallets
+    :license: BSD-3-Clause
+"""
+import codecs
+import re
+from functools import update_wrapper
+from itertools import chain
+from itertools import repeat
+from itertools import tee
+
+from ._compat import BytesIO
+from ._compat import text_type
+from ._compat import to_native
+from .datastructures import FileStorage
+from .datastructures import Headers
+from .datastructures import MultiDict
+from .http import parse_options_header
+from .urls import url_decode_stream
+from .wsgi import get_content_length
+from .wsgi import get_input_stream
+from .wsgi import make_line_iter
+
+# there are some platforms where SpooledTemporaryFile is not available.
+# In that case we need to provide a fallback.
+try:
+    from tempfile import SpooledTemporaryFile
+except ImportError:
+    from tempfile import TemporaryFile
+
+    SpooledTemporaryFile = None
+
+
+#: an iterator that yields empty strings
+_empty_string_iter = repeat("")
+
+#: a regular expression for multipart boundaries
+_multipart_boundary_re = re.compile("^[ -~]{0,200}[!-~]$")
+
+#: the http transfer encodings we support for multipart messages because
+#: they are also available as codecs in Python.
+_supported_multipart_encodings = frozenset(["base64", "quoted-printable"])
+
+
+def default_stream_factory(
+    total_content_length, filename, content_type, content_length=None
+):
+    """The stream factory that is used by default."""
+    max_size = 1024 * 500
+    if SpooledTemporaryFile is not None:
+        return SpooledTemporaryFile(max_size=max_size, mode="wb+")
+    if total_content_length is None or total_content_length > max_size:
+        return TemporaryFile("wb+")
+    return BytesIO()
+
+
+def parse_form_data(
+    environ,
+    stream_factory=None,
+    charset="utf-8",
+    errors="replace",
+    max_form_memory_size=None,
+    max_content_length=None,
+    cls=None,
+    silent=True,
+):
+    """Parse the form data in the environ and return it as a tuple in the
+    form ``(stream, form, files)``.  You should only call this method if
+    the transport method is `POST`, `PUT`, or `PATCH`.
+
+    If the mimetype of the data transmitted is `multipart/form-data` the
+    files multidict will be filled with `FileStorage` objects.  If the
+    mimetype is unknown the input stream is wrapped and returned as the
+    first argument, else the stream is empty.
+
+    This is a shortcut for the common usage of :class:`FormDataParser`.
+
+    Have a look at :ref:`dealing-with-request-data` for more details.
+
+    .. versionadded:: 0.5
+       The `max_form_memory_size`, `max_content_length` and
+       `cls` parameters were added.
+
+    .. versionadded:: 0.5.1
+       The optional `silent` flag was added.
+
+    :param environ: the WSGI environment to be used for parsing.
+    :param stream_factory: An optional callable that returns a new read and
+                           writeable file descriptor.  This callable works
+                           the same as :meth:`~BaseResponse._get_file_stream`.
+    :param charset: The character set for URL and url encoded form data.
+    :param errors: The encoding error behavior.
+    :param max_form_memory_size: the maximum number of bytes to be accepted
+                                 for in-memory stored form data.  If the data
+                                 exceeds the value specified an
+                                 :exc:`~exceptions.RequestEntityTooLarge`
+                                 exception is raised.
+    :param max_content_length: If this is provided and the transmitted data
+                               is longer than this value an
+                               :exc:`~exceptions.RequestEntityTooLarge`
+                               exception is raised.
+    :param cls: an optional dict class to use.  If this is not specified
+                or `None` the default :class:`MultiDict` is used.
+    :param silent: If set to False parsing errors will not be caught.
+    :return: A tuple in the form ``(stream, form, files)``.
+    """
+    return FormDataParser(
+        stream_factory,
+        charset,
+        errors,
+        max_form_memory_size,
+        max_content_length,
+        cls,
+        silent,
+    ).parse_from_environ(environ)
+
+
+def exhaust_stream(f):
+    """Helper decorator for methods that exhaust the stream on return."""
+
+    def wrapper(self, stream, *args, **kwargs):
+        try:
+            return f(self, stream, *args, **kwargs)
+        finally:
+            exhaust = getattr(stream, "exhaust", None)
+            if exhaust is not None:
+                exhaust()
+            else:
+                while 1:
+                    chunk = stream.read(1024 * 64)
+                    if not chunk:
+                        break
+
+    return update_wrapper(wrapper, f)
+
+
+class FormDataParser(object):
+    """This class implements parsing of form data for Werkzeug.  By itself
+    it can parse multipart and url encoded form data.  It can be subclassed
+    and extended but for most mimetypes it is a better idea to use the
+    untouched stream and expose it as separate attributes on a request
+    object.
+
+    .. versionadded:: 0.8
+
+    :param stream_factory: An optional callable that returns a new read and
+                           writeable file descriptor.  This callable works
+                           the same as :meth:`~BaseResponse._get_file_stream`.
+    :param charset: The character set for URL and url encoded form data.
+    :param errors: The encoding error behavior.
+    :param max_form_memory_size: the maximum number of bytes to be accepted
+                                 for in-memory stored form data.  If the data
+                                 exceeds the value specified an
+                                 :exc:`~exceptions.RequestEntityTooLarge`
+                                 exception is raised.
+    :param max_content_length: If this is provided and the transmitted data
+                               is longer than this value an
+                               :exc:`~exceptions.RequestEntityTooLarge`
+                               exception is raised.
+    :param cls: an optional dict class to use.  If this is not specified
+                or `None` the default :class:`MultiDict` is used.
+    :param silent: If set to False parsing errors will not be caught.
+    """
+
+    def __init__(
+        self,
+        stream_factory=None,
+        charset="utf-8",
+        errors="replace",
+        max_form_memory_size=None,
+        max_content_length=None,
+        cls=None,
+        silent=True,
+    ):
+        if stream_factory is None:
+            stream_factory = default_stream_factory
+        self.stream_factory = stream_factory
+        self.charset = charset
+        self.errors = errors
+        self.max_form_memory_size = max_form_memory_size
+        self.max_content_length = max_content_length
+        if cls is None:
+            cls = MultiDict
+        self.cls = cls
+        self.silent = silent
+
+    def get_parse_func(self, mimetype, options):
+        return self.parse_functions.get(mimetype)
+
+    def parse_from_environ(self, environ):
+        """Parses the information from the environment as form data.
+
+        :param environ: the WSGI environment to be used for parsing.
+        :return: A tuple in the form ``(stream, form, files)``.
+        """
+        content_type = environ.get("CONTENT_TYPE", "")
+        content_length = get_content_length(environ)
+        mimetype, options = parse_options_header(content_type)
+        return self.parse(get_input_stream(environ), mimetype, content_length, options)
+
+    def parse(self, stream, mimetype, content_length, options=None):
+        """Parses the information from the given stream, mimetype,
+        content length and mimetype parameters.
+
+        :param stream: an input stream
+        :param mimetype: the mimetype of the data
+        :param content_length: the content length of the incoming data
+        :param options: optional mimetype parameters (used for
+                        the multipart boundary for instance)
+        :return: A tuple in the form ``(stream, form, files)``.
+        """
+        if (
+            self.max_content_length is not None
+            and content_length is not None
+            and content_length > self.max_content_length
+        ):
+            raise exceptions.RequestEntityTooLarge()
+        if options is None:
+            options = {}
+
+        parse_func = self.get_parse_func(mimetype, options)
+        if parse_func is not None:
+            try:
+                return parse_func(self, stream, mimetype, content_length, options)
+            except ValueError:
+                if not self.silent:
+                    raise
+
+        return stream, self.cls(), self.cls()
+
+    @exhaust_stream
+    def _parse_multipart(self, stream, mimetype, content_length, options):
+        parser = MultiPartParser(
+            self.stream_factory,
+            self.charset,
+            self.errors,
+            max_form_memory_size=self.max_form_memory_size,
+            cls=self.cls,
+        )
+        boundary = options.get("boundary")
+        if boundary is None:
+            raise ValueError("Missing boundary")
+        if isinstance(boundary, text_type):
+            boundary = boundary.encode("ascii")
+        form, files = parser.parse(stream, boundary, content_length)
+        return stream, form, files
+
+    @exhaust_stream
+    def _parse_urlencoded(self, stream, mimetype, content_length, options):
+        if (
+            self.max_form_memory_size is not None
+            and content_length is not None
+            and content_length > self.max_form_memory_size
+        ):
+            raise exceptions.RequestEntityTooLarge()
+        form = url_decode_stream(stream, self.charset, errors=self.errors, cls=self.cls)
+        return stream, form, self.cls()
+
+    #: mapping of mimetypes to parsing functions
+    parse_functions = {
+        "multipart/form-data": _parse_multipart,
+        "application/x-www-form-urlencoded": _parse_urlencoded,
+        "application/x-url-encoded": _parse_urlencoded,
+    }
+
+
+def is_valid_multipart_boundary(boundary):
+    """Checks if the given string is a valid multipart boundary."""
+    return _multipart_boundary_re.match(boundary) is not None
+
+
+def _line_parse(line):
+    """Removes line ending characters and returns a tuple (`stripped_line`,
+    `is_terminated`).
+    """
+    if line[-2:] in ["\r\n", b"\r\n"]:
+        return line[:-2], True
+    elif line[-1:] in ["\r", "\n", b"\r", b"\n"]:
+        return line[:-1], True
+    return line, False
+
+
+def parse_multipart_headers(iterable):
+    """Parses multipart headers from an iterable that yields lines (including
+    the trailing newline symbol).  The iterable has to be newline terminated.
+
+    The iterable will stop at the line where the headers ended so it can be
+    further consumed.
+
+    :param iterable: iterable of strings that are newline terminated
+    """
+    result = []
+    for line in iterable:
+        line = to_native(line)
+        line, line_terminated = _line_parse(line)
+        if not line_terminated:
+            raise ValueError("unexpected end of line in multipart header")
+        if not line:
+            break
+        elif line[0] in " \t" and result:
+            key, value = result[-1]
+            result[-1] = (key, value + "\n " + line[1:])
+        else:
+            parts = line.split(":", 1)
+            if len(parts) == 2:
+                result.append((parts[0].strip(), parts[1].strip()))
+
+    # we link the list to the headers, no need to create a copy, the
+    # list was not shared anyway.
+    return Headers(result)
+
+
+_begin_form = "begin_form"
+_begin_file = "begin_file"
+_cont = "cont"
+_end = "end"
+
+
+class MultiPartParser(object):
+    def __init__(
+        self,
+        stream_factory=None,
+        charset="utf-8",
+        errors="replace",
+        max_form_memory_size=None,
+        cls=None,
+        buffer_size=64 * 1024,
+    ):
+        self.charset = charset
+        self.errors = errors
+        self.max_form_memory_size = max_form_memory_size
+        self.stream_factory = (
+            default_stream_factory if stream_factory is None else stream_factory
+        )
+        self.cls = MultiDict if cls is None else cls
+
+        # make sure the buffer size is divisible by four so that we can base64
+        # decode chunk by chunk
+        assert buffer_size % 4 == 0, "buffer size has to be divisible by 4"
+        # also the buffer size has to be at least 1024 bytes long or long headers
+        # will freak out the system
+        assert buffer_size >= 1024, "buffer size has to be at least 1KB"
+
+        self.buffer_size = buffer_size
+
+    def _fix_ie_filename(self, filename):
+        """Internet Explorer 6 transmits the full file name if a file is
+        uploaded.  This function strips the full path if it thinks the
+        filename is Windows-like absolute.
+        """
+        if filename[1:3] == ":\\" or filename[:2] == "\\\\":
+            return filename.split("\\")[-1]
+        return filename
+
+    def _find_terminator(self, iterator):
+        """The terminator might have some additional newlines before it.
+        There is at least one application that sends additional newlines
+        before headers (the python setuptools package).
+        """
+        for line in iterator:
+            if not line:
+                break
+            line = line.strip()
+            if line:
+                return line
+        return b""
+
+    def fail(self, message):
+        raise ValueError(message)
+
+    def get_part_encoding(self, headers):
+        transfer_encoding = headers.get("content-transfer-encoding")
+        if (
+            transfer_encoding is not None
+            and transfer_encoding in _supported_multipart_encodings
+        ):
+            return transfer_encoding
+
+    def get_part_charset(self, headers):
+        # Figure out input charset for current part
+        content_type = headers.get("content-type")
+        if content_type:
+            mimetype, ct_params = parse_options_header(content_type)
+            return ct_params.get("charset", self.charset)
+        return self.charset
+
+    def start_file_streaming(self, filename, headers, total_content_length):
+        if isinstance(filename, bytes):
+            filename = filename.decode(self.charset, self.errors)
+        filename = self._fix_ie_filename(filename)
+        content_type = headers.get("content-type")
+        try:
+            content_length = int(headers["content-length"])
+        except (KeyError, ValueError):
+            content_length = 0
+        container = self.stream_factory(
+            total_content_length=total_content_length,
+            filename=filename,
+            content_type=content_type,
+            content_length=content_length,
+        )
+        return filename, container
+
+    def in_memory_threshold_reached(self, bytes):
+        raise exceptions.RequestEntityTooLarge()
+
+    def validate_boundary(self, boundary):
+        if not boundary:
+            self.fail("Missing boundary")
+        if not is_valid_multipart_boundary(boundary):
+            self.fail("Invalid boundary: %s" % boundary)
+        if len(boundary) > self.buffer_size:  # pragma: no cover
+            # this should never happen because we check for a minimum size
+            # of 1024 and boundaries may not be longer than 200 characters.
+            # The only situation when this happens is for non-debug builds
+            # where the assert is skipped.
+ self.fail("Boundary longer than buffer size") + + def parse_lines(self, file, boundary, content_length, cap_at_buffer=True): + """Generate parts of + ``('begin_form', (headers, name))`` + ``('begin_file', (headers, name, filename))`` + ``('cont', bytestring)`` + ``('end', None)`` + + Always obeys the grammar + parts = ( begin_form cont* end | + begin_file cont* end )* + """ + next_part = b"--" + boundary + last_part = next_part + b"--" + + iterator = chain( + make_line_iter( + file, + limit=content_length, + buffer_size=self.buffer_size, + cap_at_buffer=cap_at_buffer, + ), + _empty_string_iter, + ) + + terminator = self._find_terminator(iterator) + + if terminator == last_part: + return + elif terminator != next_part: + self.fail("Expected boundary at start of multipart data") + + while terminator != last_part: + headers = parse_multipart_headers(iterator) + + disposition = headers.get("content-disposition") + if disposition is None: + self.fail("Missing Content-Disposition header") + disposition, extra = parse_options_header(disposition) + transfer_encoding = self.get_part_encoding(headers) + name = extra.get("name") + filename = extra.get("filename") + + # if no content type is given we stream into memory. A list is + # used as a temporary container. + if filename is None: + yield _begin_form, (headers, name) + + # otherwise we parse the rest of the headers and ask the stream + # factory for something we can write in. + else: + yield _begin_file, (headers, name, filename) + + buf = b"" + for line in iterator: + if not line: + self.fail("unexpected end of stream") + + if line[:2] == b"--": + terminator = line.rstrip() + if terminator in (next_part, last_part): + break + + if transfer_encoding is not None: + if transfer_encoding == "base64": + transfer_encoding = "base64_codec" + try: + line = codecs.decode(line, transfer_encoding) + except Exception: + self.fail("could not decode transfer encoded chunk") + + # we have something in the buffer from the last iteration. + # this is usually a newline delimiter. + if buf: + yield _cont, buf + buf = b"" + + # If the line ends with windows CRLF we write everything except + # the last two bytes. In all other cases however we write + # everything except the last byte. If it was a newline, that's + # fine, otherwise it does not matter because we will write it + # the next iteration. this ensures we do not write the + # final newline into the stream. That way we do not have to + # truncate the stream. However we do have to make sure that + # if something else than a newline is in there we write it + # out. + if line[-2:] == b"\r\n": + buf = b"\r\n" + cutoff = -2 + else: + buf = line[-1:] + cutoff = -1 + yield _cont, line[:cutoff] + + else: # pragma: no cover + raise ValueError("unexpected end of part") + + # if we have a leftover in the buffer that is not a newline + # character we have to flush it, otherwise we will chop of + # certain values. + if buf not in (b"", b"\r", b"\n", b"\r\n"): + yield _cont, buf + + yield _end, None + + def parse_parts(self, file, boundary, content_length): + """Generate ``('file', (name, val))`` and + ``('form', (name, val))`` parts. 
+ """ + in_memory = 0 + + for ellt, ell in self.parse_lines(file, boundary, content_length): + if ellt == _begin_file: + headers, name, filename = ell + is_file = True + guard_memory = False + filename, container = self.start_file_streaming( + filename, headers, content_length + ) + _write = container.write + + elif ellt == _begin_form: + headers, name = ell + is_file = False + container = [] + _write = container.append + guard_memory = self.max_form_memory_size is not None + + elif ellt == _cont: + _write(ell) + # if we write into memory and there is a memory size limit we + # count the number of bytes in memory and raise an exception if + # there is too much data in memory. + if guard_memory: + in_memory += len(ell) + if in_memory > self.max_form_memory_size: + self.in_memory_threshold_reached(in_memory) + + elif ellt == _end: + if is_file: + container.seek(0) + yield ( + "file", + (name, FileStorage(container, filename, name, headers=headers)), + ) + else: + part_charset = self.get_part_charset(headers) + yield ( + "form", + (name, b"".join(container).decode(part_charset, self.errors)), + ) + + def parse(self, file, boundary, content_length): + formstream, filestream = tee( + self.parse_parts(file, boundary, content_length), 2 + ) + form = (p[1] for p in formstream if p[0] == "form") + files = (p[1] for p in filestream if p[0] == "file") + return self.cls(form), self.cls(files) + + +from . import exceptions |