aboutsummaryrefslogtreecommitdiffstats
path: root/python/werkzeug/formparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/werkzeug/formparser.py')
-rw-r--r-- python/werkzeug/formparser.py 586
1 files changed, 586 insertions, 0 deletions
diff --git a/python/werkzeug/formparser.py b/python/werkzeug/formparser.py
new file mode 100644
index 0000000..0ddc5c8
--- /dev/null
+++ b/python/werkzeug/formparser.py
@@ -0,0 +1,586 @@
+# -*- coding: utf-8 -*-
+"""
+ werkzeug.formparser
+ ~~~~~~~~~~~~~~~~~~~
+
+ This module implements the form parsing. It supports url-encoded forms
+ as well as non-nested multipart uploads.
+
+ :copyright: 2007 Pallets
+ :license: BSD-3-Clause
+"""
+import codecs
+import re
+from functools import update_wrapper
+from itertools import chain
+from itertools import repeat
+from itertools import tee
+
+from ._compat import BytesIO
+from ._compat import text_type
+from ._compat import to_native
+from .datastructures import FileStorage
+from .datastructures import Headers
+from .datastructures import MultiDict
+from .http import parse_options_header
+from .urls import url_decode_stream
+from .wsgi import get_content_length
+from .wsgi import get_input_stream
+from .wsgi import make_line_iter
+
+# there are some platforms where SpooledTemporaryFile is not available.
+# In that case we need to provide a fallback.
+try:
+ from tempfile import SpooledTemporaryFile
+except ImportError:
+ from tempfile import TemporaryFile
+
+ SpooledTemporaryFile = None
+
+
#: An endless iterator of empty strings.  It is chained behind the line
#: iterator of a request body so that consumers see a falsy "line" once
#: the real input is exhausted.
_empty_string_iter = repeat("")

#: A regular expression for multipart boundaries: up to 200 printable
#: ASCII characters (space included) followed by one printable character
#: that is not a space.  ``\Z`` anchors at the very end of the input;
#: ``$`` would also match just before a trailing newline and therefore
#: accept a boundary such as ``"abc\n"``.
_multipart_boundary_re = re.compile(r"^[ -~]{0,200}[!-~]\Z")

#: Content transfer encodings that are accepted for individual parts of
#: a multipart message (both can be decoded with the standard library).
_supported_multipart_encodings = frozenset(["base64", "quoted-printable"])
+
+
def default_stream_factory(
    total_content_length, filename, content_type, content_length=None
):
    """Return the writable binary stream used for uploads by default.

    When ``SpooledTemporaryFile`` is available it is always used, with a
    500 KiB in-memory threshold.  Otherwise small requests of known size
    get a ``BytesIO`` and everything else a ``TemporaryFile``.

    :param total_content_length: length of the whole request body, or
                                 `None` if unknown.
    :param filename: name of the uploaded file (unused here).
    :param content_type: content type of the part (unused here).
    :param content_length: length of the part (unused here).
    """
    spool_limit = 1024 * 500
    if SpooledTemporaryFile is None:
        # fallback for platforms without SpooledTemporaryFile
        fits_in_memory = (
            total_content_length is not None and total_content_length <= spool_limit
        )
        return BytesIO() if fits_in_memory else TemporaryFile("wb+")
    return SpooledTemporaryFile(max_size=spool_limit, mode="wb+")
+
+
def parse_form_data(
    environ,
    stream_factory=None,
    charset="utf-8",
    errors="replace",
    max_form_memory_size=None,
    max_content_length=None,
    cls=None,
    silent=True,
):
    """Parse the form data found in a WSGI environ.

    The result is a tuple ``(stream, form, files)``.  Only call this
    function when the transport method is `POST`, `PUT`, or `PATCH`.

    For `multipart/form-data` bodies the files multidict is filled with
    `FileStorage` objects.  If the mimetype is unknown the input stream
    is wrapped and returned as first item, otherwise the returned stream
    is empty.

    This is a shortcut for the common usage of :class:`FormDataParser`;
    all arguments except `environ` are handed to its constructor.

    Have a look at :ref:`dealing-with-request-data` for more details.

    .. versionadded:: 0.5
       The `max_form_memory_size`, `max_content_length` and
       `cls` parameters were added.

    .. versionadded:: 0.5.1
       The optional `silent` flag was added.

    :param environ: the WSGI environment to be used for parsing.
    :param stream_factory: An optional callable that returns a new read and
                           writeable file descriptor.  This callable works
                           the same as :meth:`~BaseResponse._get_file_stream`.
    :param charset: The character set for URL and url encoded form data.
    :param errors: The encoding error behavior.
    :param max_form_memory_size: the maximum number of bytes to be accepted
                                 for in-memory stored form data.  If the
                                 data exceeds the value specified an
                                 :exc:`~exceptions.RequestEntityTooLarge`
                                 exception is raised.
    :param max_content_length: If this is provided and the transmitted data
                               is longer than this value an
                               :exc:`~exceptions.RequestEntityTooLarge`
                               exception is raised.
    :param cls: an optional dict class to use.  If this is not specified
                or `None` the default :class:`MultiDict` is used.
    :param silent: If set to False parsing errors will not be caught.
    :return: A tuple in the form ``(stream, form, files)``.
    """
    parser = FormDataParser(
        stream_factory=stream_factory,
        charset=charset,
        errors=errors,
        max_form_memory_size=max_form_memory_size,
        max_content_length=max_content_length,
        cls=cls,
        silent=silent,
    )
    return parser.parse_from_environ(environ)
+
+
def exhaust_stream(f):
    """Decorator for parser methods that drains the stream afterwards.

    The wrapped method is called as ``f(self, stream, *args, **kwargs)``.
    Whether it returns or raises, the stream is consumed to its end: via
    its ``exhaust()`` method when it has one, otherwise by reading and
    discarding 64 KiB chunks until a read comes back empty.
    """

    def wrapper(self, stream, *args, **kwargs):
        try:
            return f(self, stream, *args, **kwargs)
        finally:
            drain = getattr(stream, "exhaust", None)
            if drain is None:
                # no dedicated helper on this stream: read it dry manually
                while stream.read(1024 * 64):
                    pass
            else:
                drain()

    return update_wrapper(wrapper, f)
+
+
class FormDataParser(object):
    """Form data parser used by Werkzeug.

    Out of the box it handles multipart and url encoded form data.  It
    can be subclassed and extended, but for most other mimetypes it is a
    better idea to use the untouched stream and expose it as separate
    attributes on a request object.

    .. versionadded:: 0.8

    :param stream_factory: An optional callable that returns a new read and
                           writeable file descriptor.  This callable works
                           the same as :meth:`~BaseResponse._get_file_stream`.
    :param charset: The character set for URL and url encoded form data.
    :param errors: The encoding error behavior.
    :param max_form_memory_size: the maximum number of bytes to be accepted
                                 for in-memory stored form data.  If the
                                 data exceeds the value specified an
                                 :exc:`~exceptions.RequestEntityTooLarge`
                                 exception is raised.
    :param max_content_length: If this is provided and the transmitted data
                               is longer than this value an
                               :exc:`~exceptions.RequestEntityTooLarge`
                               exception is raised.
    :param cls: an optional dict class to use.  If this is not specified
                or `None` the default :class:`MultiDict` is used.
    :param silent: If set to False parsing errors will not be caught.
    """

    def __init__(
        self,
        stream_factory=None,
        charset="utf-8",
        errors="replace",
        max_form_memory_size=None,
        max_content_length=None,
        cls=None,
        silent=True,
    ):
        self.stream_factory = (
            default_stream_factory if stream_factory is None else stream_factory
        )
        self.charset = charset
        self.errors = errors
        self.max_form_memory_size = max_form_memory_size
        self.max_content_length = max_content_length
        self.cls = MultiDict if cls is None else cls
        self.silent = silent

    def get_parse_func(self, mimetype, options):
        """Look up the parsing function registered for `mimetype`.

        Returns `None` for mimetypes without an entry in
        :attr:`parse_functions`; `options` is not consulted here.
        """
        return self.parse_functions.get(mimetype)

    def parse_from_environ(self, environ):
        """Parses the information from the environment as form data.

        :param environ: the WSGI environment to be used for parsing.
        :return: A tuple in the form ``(stream, form, files)``.
        """
        raw_content_type = environ.get("CONTENT_TYPE", "")
        length = get_content_length(environ)
        mimetype, options = parse_options_header(raw_content_type)
        body = get_input_stream(environ)
        return self.parse(body, mimetype, length, options)

    def parse(self, stream, mimetype, content_length, options=None):
        """Parses the information from the given stream, mimetype,
        content length and mimetype parameters.

        :param stream: an input stream
        :param mimetype: the mimetype of the data
        :param content_length: the content length of the incoming data
        :param options: optional mimetype parameters (used for
                        the multipart boundary for instance)
        :return: A tuple in the form ``(stream, form, files)``.
        """
        limit = self.max_content_length
        if limit is not None and content_length is not None:
            if content_length > limit:
                raise exceptions.RequestEntityTooLarge()
        if options is None:
            options = {}

        handler = self.get_parse_func(mimetype, options)
        if handler is not None:
            try:
                return handler(self, stream, mimetype, content_length, options)
            except ValueError:
                if not self.silent:
                    raise

        # unknown mimetype, or a parsing error that was silenced
        empty_form = self.cls()
        empty_files = self.cls()
        return stream, empty_form, empty_files

    @exhaust_stream
    def _parse_multipart(self, stream, mimetype, content_length, options):
        """Parse a ``multipart/form-data`` body with :class:`MultiPartParser`.

        Raises :exc:`ValueError` when the options carry no boundary.
        """
        multipart = MultiPartParser(
            self.stream_factory,
            self.charset,
            self.errors,
            max_form_memory_size=self.max_form_memory_size,
            cls=self.cls,
        )
        raw_boundary = options.get("boundary")
        if raw_boundary is None:
            raise ValueError("Missing boundary")
        if isinstance(raw_boundary, text_type):
            # the multipart parser works on bytes
            boundary = raw_boundary.encode("ascii")
        else:
            boundary = raw_boundary
        form, files = multipart.parse(stream, boundary, content_length)
        return stream, form, files

    @exhaust_stream
    def _parse_urlencoded(self, stream, mimetype, content_length, options):
        """Parse a url encoded body; the files multidict stays empty."""
        limit = self.max_form_memory_size
        if limit is not None and content_length is not None:
            if content_length > limit:
                raise exceptions.RequestEntityTooLarge()
        fields = url_decode_stream(
            stream, self.charset, errors=self.errors, cls=self.cls
        )
        return stream, fields, self.cls()

    #: mapping of mimetypes to parsing functions
    parse_functions = {
        "multipart/form-data": _parse_multipart,
        "application/x-www-form-urlencoded": _parse_urlencoded,
        "application/x-url-encoded": _parse_urlencoded,
    }
+
+
def is_valid_multipart_boundary(boundary):
    """Checks if the string given is a valid multipart boundary.

    Both text and byte strings are accepted.  The module level pattern
    is a text pattern, which cannot be applied to a bytes object on
    Python 3, so byte strings are decoded first.

    :param boundary: the boundary as text or bytes.
    :return: `True` if the boundary matches the multipart boundary
             pattern, `False` otherwise.
    """
    if isinstance(boundary, bytes):
        # latin-1 maps every byte to the code point of the same value, so
        # decoding never fails and non-ASCII bytes are still rejected by
        # the pattern's printable-ASCII character classes.
        boundary = boundary.decode("latin-1")
    return _multipart_boundary_re.match(boundary) is not None
+
+
def _line_parse(line):
    """Strip one trailing line ending from `line`.

    Works for text and byte strings alike and returns a tuple
    (`stripped_line`, `is_terminated`) where `is_terminated` tells
    whether a ``\\r\\n``, ``\\r`` or ``\\n`` ending was removed.
    """
    last_two = line[-2:]
    if last_two == "\r\n" or last_two == b"\r\n":
        return line[:-2], True

    last_one = line[-1:]
    if last_one in ("\r", "\n", b"\r", b"\n"):
        return line[:-1], True

    return line, False
+
+
def parse_multipart_headers(iterable):
    """Parse the header block of one multipart part.

    Lines are taken from `iterable` (each including its trailing newline)
    until the blank line that ends the headers, so the iterable can be
    consumed further for the body afterwards.  A line starting with a
    space or tab continues the previous header; lines without a colon
    are skipped.

    :param iterable: iterable of strings that are newline terminated
    :return: a :class:`Headers` object with the parsed header pairs.
    :raises ValueError: if a line is not newline terminated.
    """
    pairs = []
    for raw_line in iterable:
        text, terminated = _line_parse(to_native(raw_line))
        if not terminated:
            raise ValueError("unexpected end of line in multipart header")
        if not text:
            # blank line: the header block is over
            break
        if pairs and text[0] in " \t":
            # folded line, append it to the value of the previous header
            name, value = pairs[-1]
            pairs[-1] = (name, value + "\n " + text[1:])
            continue
        name, colon, value = text.partition(":")
        if colon:
            pairs.append((name.strip(), value.strip()))

    # the list is handed over as-is; nothing else holds a reference to
    # it, so no copy is needed.
    return Headers(pairs)
+
+
# Event markers yielded by ``MultiPartParser.parse_lines``: a part starts
# with a begin marker, continues with any number of "cont" events and is
# closed by "end".
_begin_form, _begin_file = "begin_form", "begin_file"
_cont, _end = "cont", "end"
+
+
class MultiPartParser(object):
    """Streaming parser for ``multipart/form-data`` bodies.

    :param stream_factory: callable that returns the writable stream an
                           uploaded file is stored in.  Defaults to
                           :func:`default_stream_factory`.
    :param charset: charset used for file names and for form values whose
                    part does not name a usable charset itself.
    :param errors: the decoding error behavior.
    :param max_form_memory_size: optional limit for the number of bytes of
                                 form values held in memory.
    :param cls: dict class for the results, :class:`MultiDict` by default.
    :param buffer_size: buffer size for the line iterator.  Has to be
                        divisible by 4 and at least 1024.
    """

    def __init__(
        self,
        stream_factory=None,
        charset="utf-8",
        errors="replace",
        max_form_memory_size=None,
        cls=None,
        buffer_size=64 * 1024,
    ):
        self.charset = charset
        self.errors = errors
        self.max_form_memory_size = max_form_memory_size
        self.stream_factory = (
            default_stream_factory if stream_factory is None else stream_factory
        )
        self.cls = MultiDict if cls is None else cls

        # make sure the buffer size is divisible by four so that we can base64
        # decode chunk by chunk
        assert buffer_size % 4 == 0, "buffer size has to be divisible by 4"
        # also the buffer size has to be at least 1024 bytes long or long headers
        # will freak out the system
        assert buffer_size >= 1024, "buffer size has to be at least 1KB"

        self.buffer_size = buffer_size

    def _fix_ie_filename(self, filename):
        """Internet Explorer 6 transmits the full file name if a file is
        uploaded.  This function strips the full path if it thinks the
        filename is Windows-like absolute.
        """
        if filename[1:3] == ":\\" or filename[:2] == "\\\\":
            return filename.split("\\")[-1]
        return filename

    def _find_terminator(self, iterator):
        """The terminator might have some additional newlines before it.
        There is at least one application that sends additional newlines
        before headers (the python setuptools package).

        Returns the first line that is non-empty after stripping, or
        ``b""`` if the input ends first.
        """
        for line in iterator:
            if not line:
                break
            line = line.strip()
            if line:
                return line
        return b""

    def fail(self, message):
        """Abort parsing by raising :exc:`ValueError` with `message`."""
        raise ValueError(message)

    def get_part_encoding(self, headers):
        """Return the supported transfer encoding of a part, or `None`.

        The ``Content-Transfer-Encoding`` value is compared without
        regard to case or surrounding whitespace, because RFC 2045
        declares these values case-insensitive.  The lower-cased name is
        returned so that callers can compare against ``"base64"``.
        """
        transfer_encoding = headers.get("content-transfer-encoding")
        if transfer_encoding is None:
            return None
        transfer_encoding = transfer_encoding.strip().lower()
        if transfer_encoding in _supported_multipart_encodings:
            return transfer_encoding
        return None

    def get_part_charset(self, headers):
        """Return the charset named by the part's ``Content-Type`` header,
        falling back to the parser's default charset.
        """
        # Figure out input charset for current part
        content_type = headers.get("content-type")
        if content_type:
            mimetype, ct_params = parse_options_header(content_type)
            return ct_params.get("charset", self.charset)
        return self.charset

    def _decode_form_value(self, data, charset):
        """Decode the bytes of a form value with `charset`.

        The charset comes from a client supplied header, so it may name
        a codec Python does not know or one that is not a text encoding.
        Decoding then raises :exc:`LookupError`, which is not a
        :exc:`ValueError` and would escape silent error handling; the
        parser's default charset is used instead in that case.
        """
        try:
            return data.decode(charset, self.errors)
        except LookupError:
            return data.decode(self.charset, self.errors)

    def start_file_streaming(self, filename, headers, total_content_length):
        """Prepare the container for an uploaded file.

        :return: a tuple ``(filename, container)`` with the decoded and
                 path-stripped file name and the stream returned by the
                 stream factory.
        """
        if isinstance(filename, bytes):
            filename = filename.decode(self.charset, self.errors)
        filename = self._fix_ie_filename(filename)
        content_type = headers.get("content-type")
        try:
            content_length = int(headers["content-length"])
        except (KeyError, ValueError):
            # header missing or not a number
            content_length = 0
        container = self.stream_factory(
            total_content_length=total_content_length,
            filename=filename,
            content_type=content_type,
            content_length=content_length,
        )
        return filename, container

    def in_memory_threshold_reached(self, bytes):
        """Called with the number of bytes held in memory once the form
        memory limit is exceeded; raises
        :exc:`~exceptions.RequestEntityTooLarge`.
        """
        raise exceptions.RequestEntityTooLarge()

    def validate_boundary(self, boundary):
        """Fail with :exc:`ValueError` if `boundary` is missing, invalid
        or longer than the buffer size.
        """
        if not boundary:
            self.fail("Missing boundary")
        if not is_valid_multipart_boundary(boundary):
            self.fail("Invalid boundary: %s" % boundary)
        if len(boundary) > self.buffer_size:  # pragma: no cover
            # this should never happen because we check for a minimum size
            # of 1024 and boundaries may not be longer than 200.  The only
            # situation when this happens is for non debug builds where
            # the assert is skipped.
            self.fail("Boundary longer than buffer size")

    def parse_lines(self, file, boundary, content_length, cap_at_buffer=True):
        """Generate parts of
        ``('begin_form', (headers, name))``
        ``('begin_file', (headers, name, filename))``
        ``('cont', bytestring)``
        ``('end', None)``

        Always obeys the grammar
        parts = ( begin_form cont* end |
                  begin_file cont* end )*
        """
        next_part = b"--" + boundary
        last_part = next_part + b"--"

        # the endless tail of empty strings lets the loops below detect the
        # end of the input as a falsy line
        iterator = chain(
            make_line_iter(
                file,
                limit=content_length,
                buffer_size=self.buffer_size,
                cap_at_buffer=cap_at_buffer,
            ),
            _empty_string_iter,
        )

        terminator = self._find_terminator(iterator)

        if terminator == last_part:
            return
        elif terminator != next_part:
            self.fail("Expected boundary at start of multipart data")

        while terminator != last_part:
            headers = parse_multipart_headers(iterator)

            disposition = headers.get("content-disposition")
            if disposition is None:
                self.fail("Missing Content-Disposition header")
            disposition, extra = parse_options_header(disposition)
            transfer_encoding = self.get_part_encoding(headers)
            name = extra.get("name")
            filename = extra.get("filename")

            # if no filename is given this is a plain form value that the
            # consumer collects in memory.
            if filename is None:
                yield _begin_form, (headers, name)

            # otherwise it is a file upload and the consumer asks the
            # stream factory for something to write into.
            else:
                yield _begin_file, (headers, name, filename)

            buf = b""
            for line in iterator:
                if not line:
                    self.fail("unexpected end of stream")

                if line[:2] == b"--":
                    terminator = line.rstrip()
                    if terminator in (next_part, last_part):
                        break

                if transfer_encoding is not None:
                    if transfer_encoding == "base64":
                        # codec name understood by codecs.decode
                        transfer_encoding = "base64_codec"
                    try:
                        line = codecs.decode(line, transfer_encoding)
                    except Exception:
                        self.fail("could not decode transfer encoded chunk")

                # we have something in the buffer from the last iteration.
                # this is usually a newline delimiter.
                if buf:
                    yield _cont, buf
                    buf = b""

                # If the line ends with windows CRLF we write everything except
                # the last two bytes.  In all other cases however we write
                # everything except the last byte.  If it was a newline, that's
                # fine, otherwise it does not matter because we will write it
                # the next iteration.  this ensures we do not write the
                # final newline into the stream.  That way we do not have to
                # truncate the stream.  However we do have to make sure that
                # if something else than a newline is in there we write it
                # out.
                if line[-2:] == b"\r\n":
                    buf = b"\r\n"
                    cutoff = -2
                else:
                    buf = line[-1:]
                    cutoff = -1
                yield _cont, line[:cutoff]

            else:  # pragma: no cover
                raise ValueError("unexpected end of part")

            # if we have a leftover in the buffer that is not a newline
            # character we have to flush it, otherwise we will chop of
            # certain values.
            if buf not in (b"", b"\r", b"\n", b"\r\n"):
                yield _cont, buf

            yield _end, None

    def parse_parts(self, file, boundary, content_length):
        """Generate ``('file', (name, val))`` and
        ``('form', (name, val))`` parts.
        """
        in_memory = 0

        for ellt, ell in self.parse_lines(file, boundary, content_length):
            if ellt == _begin_file:
                headers, name, filename = ell
                is_file = True
                guard_memory = False
                filename, container = self.start_file_streaming(
                    filename, headers, content_length
                )
                _write = container.write

            elif ellt == _begin_form:
                headers, name = ell
                is_file = False
                container = []
                _write = container.append
                guard_memory = self.max_form_memory_size is not None

            elif ellt == _cont:
                _write(ell)
                # if we write into memory and there is a memory size limit we
                # count the number of bytes in memory and raise an exception if
                # there is too much data in memory.
                if guard_memory:
                    in_memory += len(ell)
                    if in_memory > self.max_form_memory_size:
                        self.in_memory_threshold_reached(in_memory)

            elif ellt == _end:
                if is_file:
                    # rewind so the consumer reads the upload from the start
                    container.seek(0)
                    yield (
                        "file",
                        (name, FileStorage(container, filename, name, headers=headers)),
                    )
                else:
                    part_charset = self.get_part_charset(headers)
                    yield (
                        "form",
                        (name, self._decode_form_value(b"".join(container), part_charset)),
                    )

    def parse(self, file, boundary, content_length):
        """Parse the whole body and return a tuple ``(form, files)`` of
        two instances of the configured dict class.
        """
        # both result containers are fed from the same single pass over
        # the parts
        formstream, filestream = tee(
            self.parse_parts(file, boundary, content_length), 2
        )
        form = (p[1] for p in formstream if p[0] == "form")
        files = (p[1] for p in filestream if p[0] == "file")
        return self.cls(form), self.cls(files)
+
+
+from . import exceptions