Skip to content

Commit

Permalink
change: Translate WSGI strings to utf8 immediately, not only on demand.
Browse files Browse the repository at this point in the history
  • Loading branch information
defnull committed Nov 18, 2024
1 parent 7912616 commit 821865d
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 113 deletions.
102 changes: 34 additions & 68 deletions bottle.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ def _cli_patch(cli_args): # pragma: no coverage
import _thread as thread
from urllib.parse import urljoin, SplitResult as UrlSplitResult
from urllib.parse import urlencode, quote as urlquote, unquote as urlunquote
urlunquote = functools.partial(urlunquote, encoding='latin1')
from http.cookies import SimpleCookie, Morsel, CookieError
from collections.abc import MutableMapping as DictMixin
from types import ModuleType as new_module
Expand All @@ -112,6 +111,10 @@ def getargspec(func):
callable = lambda x: hasattr(x, '__call__')
imap = map

def _wsgi_recode(src, target='utf8'):
return src.encode('latin1').decode(target)


def _raise(*a):
raise a[0](a[1]).with_traceback(a[2])

Expand Down Expand Up @@ -679,11 +682,8 @@ def mountpoint_wrapper():
def start_response(status, headerlist, exc_info=None):
if exc_info:
_raise(*exc_info)
# Errors here mean that the mounted WSGI app did not
# follow PEP-3333 (which requires latin1) or used a
# pre-encoding other than utf8 :/
status = status.encode('latin1').decode('utf8')
headerlist = [(k, v.encode('latin1').decode('utf8'))
status = _wsgi_recode(status)
headerlist = [(k, _wsgi_recode(v))
for (k, v) in headerlist]
rs.status = status
for name, value in headerlist:
Expand Down Expand Up @@ -934,7 +934,7 @@ def default_error_handler(self, res):

def _handle(self, environ):
path = environ['bottle.raw_path'] = environ['PATH_INFO']
environ['PATH_INFO'] = path.encode('latin1').decode('utf8', 'ignore')
environ['PATH_INFO'] = _wsgi_recode(path)

environ['bottle.app'] = self
request.bind(environ)
Expand Down Expand Up @@ -1158,7 +1158,8 @@ def get_header(self, name, default=None):
def cookies(self):
""" Cookies parsed into a :class:`FormsDict`. Signed cookies are NOT
decoded. Use :meth:`get_cookie` if you expect signed cookies. """
cookies = SimpleCookie(self.environ.get('HTTP_COOKIE', '')).values()
cookie_header = _wsgi_recode(self.environ.get('HTTP_COOKIE', ''))
cookies = SimpleCookie(cookie_header).values()
return FormsDict((c.key, c.value) for c in cookies)

def get_cookie(self, key, default=None, secret=None, digestmod=hashlib.sha256):
Expand Down Expand Up @@ -1186,7 +1187,7 @@ def query(self):
not to be confused with "URL wildcards" as they are provided by the
:class:`Router`. """
get = self.environ['bottle.get'] = FormsDict()
pairs = _parse_qsl(self.environ.get('QUERY_STRING', ''))
pairs = _parse_qsl(self.environ.get('QUERY_STRING', ''), 'utf8')
for key, value in pairs:
get[key] = value
return get
Expand All @@ -1198,7 +1199,6 @@ def forms(self):
:class:`FormsDict`. All keys and values are strings. File uploads
are stored separately in :attr:`files`. """
forms = FormsDict()
forms.recode_unicode = self.POST.recode_unicode
for name, item in self.POST.allitems():
if not isinstance(item, FileUpload):
forms[name] = item
Expand All @@ -1222,7 +1222,6 @@ def files(self):
"""
files = FormsDict()
files.recode_unicode = self.POST.recode_unicode
for name, item in self.POST.allitems():
if isinstance(item, FileUpload):
files[name] = item
Expand Down Expand Up @@ -1345,12 +1344,11 @@ def POST(self):
# We default to application/x-www-form-urlencoded for everything that
# is not multipart and take the fast path (also: 3.1 workaround)
if not content_type.startswith('multipart/'):
body = self._get_body_string(self.MEMFILE_MAX).decode('latin1')
for key, value in _parse_qsl(body):
body = self._get_body_string(self.MEMFILE_MAX).decode('utf8')
for key, value in _parse_qsl(body, 'utf8'):
post[key] = value
return post

post.recode_unicode = False
charset = options.get("charset", "utf8")
boundary = options.get("boundary")
if not boundary:
Expand Down Expand Up @@ -2134,49 +2132,32 @@ def getall(self, key):

class FormsDict(MultiDict):
""" This :class:`MultiDict` subclass is used to store request form data.
Additionally to the normal dict-like item access methods (which return
unmodified data as native strings), this container also supports
attribute-like access to its values. Attributes are automatically de-
or recoded to match :attr:`input_encoding` (default: 'utf8'). Missing
attributes default to an empty string. """

#: Encoding used for attribute values.
input_encoding = 'utf8'
#: If true (default), unicode strings are first encoded with `latin1`
#: and then decoded to match :attr:`input_encoding`.
recode_unicode = True

def _fix(self, s, encoding=None):
if isinstance(s, unicode) and self.recode_unicode: # Python 3 WSGI
return s.encode('latin1').decode(encoding or self.input_encoding)
elif isinstance(s, bytes): # Python 2 WSGI
return s.decode(encoding or self.input_encoding)
else:
return s
Additionally to the normal dict-like item access methods, this container
also supports attribute-like access to its values. Missing attributes
default to an empty string.
.. versionchanged:: 0.14
All keys and values are now decoded as utf8 by default, item and
attribute access will return the same string.
"""

def decode(self, encoding=None):
""" Returns a copy with all keys and values de- or recoded to match
:attr:`input_encoding`. Some libraries (e.g. WTForms) want a
unicode dictionary. """
""" (deprecated) Starting with 0.13 all keys and values are already
correctly decoded. """
copy = FormsDict()
enc = copy.input_encoding = encoding or self.input_encoding
copy.recode_unicode = False
for key, value in self.allitems():
copy.append(self._fix(key, enc), self._fix(value, enc))
copy[key] = value
return copy

def getunicode(self, name, default=None, encoding=None):
""" Return the value as a unicode string, or the default. """
try:
return self._fix(self[name], encoding)
except (UnicodeError, KeyError):
return default
""" (deprecated) Return the value as a unicode string, or the default. """
return self.get(name, default)

def __getattr__(self, name, default=unicode()):
# Without this guard, pickle generates a cryptic TypeError:
if name.startswith('__') and name.endswith('__'):
return super(FormsDict, self).__getattr__(name)
return self.getunicode(name, default=default)
return self.get(name, default=default)

class HeaderDict(MultiDict):
""" A case-insensitive version of :class:`MultiDict` that defaults to
Expand Down Expand Up @@ -2218,14 +2199,7 @@ def filter(self, names):

class WSGIHeaderDict(DictMixin):
""" This dict-like class wraps a WSGI environ dict and provides convenient
access to HTTP_* fields. Keys and values are native strings
(2.x bytes or 3.x unicode) and keys are case-insensitive. If the WSGI
environment contains non-native string values, these are de- or encoded
using a lossless 'latin1' character set.
The API will remain stable even on changes to the relevant PEPs.
Currently PEP 333, 444 and 3333 are supported. (PEP 444 is the only one
that uses non-native strings.)
access to HTTP_* fields. Header names are case-insensitive and titled by default.
"""
#: List of keys that do not have a ``HTTP_`` prefix.
cgikeys = ('CONTENT_TYPE', 'CONTENT_LENGTH')
Expand All @@ -2241,16 +2215,11 @@ def _ekey(self, key):
return 'HTTP_' + key

def raw(self, key, default=None):
""" Return the header value as is (may be bytes or unicode). """
""" Return the header value as is (not utf8-translated). """
return self.environ.get(self._ekey(key), default)

def __getitem__(self, key):
val = self.environ[self._ekey(key)]
if isinstance(val, unicode):
val = val.encode('latin1').decode('utf8')
else:
val = val.decode('utf8')
return val
return _wsgi_recode(self.environ[self._ekey(key)])

def __setitem__(self, key, value):
raise TypeError("%s is read-only." % self.__class__)
Expand Down Expand Up @@ -2684,8 +2653,6 @@ def filename(self):
or dashes are removed. The filename is limited to 255 characters.
"""
fname = self.raw_filename
if not isinstance(fname, unicode):
fname = fname.decode('utf8', 'ignore')
fname = normalize('NFKD', fname)
fname = fname.encode('ASCII', 'ignore').decode('ASCII')
fname = os.path.basename(fname.replace('\\', os.path.sep))
Expand Down Expand Up @@ -2966,14 +2933,14 @@ def _parse_http_header(h):
return values


def _parse_qsl(qs):
def _parse_qsl(qs, encoding="utf8"):
r = []
for pair in qs.split('&'):
if not pair: continue
nv = pair.split('=', 1)
if len(nv) != 2: nv.append('')
key = urlunquote(nv[0].replace('+', ' '))
value = urlunquote(nv[1].replace('+', ' '))
key = urlunquote(nv[0].replace('+', ' '), encoding)
value = urlunquote(nv[1].replace('+', ' '), encoding)
r.append((key, value))
return r

Expand Down Expand Up @@ -3283,7 +3250,7 @@ def feed(self, line, nl=""):
return self.write_header(line, nl)

def write_header(self, line, nl):
line = line.decode(self.charset)
line = str(line, self.charset)

if not nl:
raise MultipartError("Unexpected end of line in header.")
Expand Down Expand Up @@ -3355,8 +3322,7 @@ def is_buffered(self):
@property
def value(self):
""" Data decoded with the specified charset """

return self.raw.decode(self.charset)
return str(self.raw, self.charset)

@property
def raw(self):
Expand Down
12 changes: 8 additions & 4 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,19 @@ Release Notes
Release 0.14 (in development)
=============================

.. rubric:: Removed APIs (deprecated since 0.13)
.. rubric:: Removed APIs

* Dropped support for Python 2 and removed helpers and workarounds that only make sense in a Python 2/3 dual codebase (e.g. ``tonat()`` or the ``py3k`` flag).
* Dropped support for Python 2 and removed workarounds or helpers that only make sense in a Python 2/3 dual codebase.
* Removed the ``RouteReset`` exception and associated logic.
* Removed the `bottle.py` console script entrypoint in favour of the new `bottle` script. You can still execute `bottle.py` directly or via `python -m bottle`. The only change is that the command installed by pip or similar tools into the bin/Scripts folder of the (virtual) environment is now called `bottle` to avoid circular import errors.

.. rubric:: Changes
.. rubric:: Changed APIs

* ``bottle.FormsDict`` no longer translates between PEP-3333 `latin1` and the correct `utf8` encoding on demand. The `getunicode()` and `decode()` methods are deprecated and do nothing, as all values are already decoded correctly.

.. rubric:: New features

* ``bottle.HTTPError`` raised on Invalid JSON now include the underlying exception in their ``exception`` field.
* ``bottle.HTTPError`` exceptions raised on invalid JSON now include the underlying exception in the ``exception`` field.


Release 0.13
Expand Down
16 changes: 3 additions & 13 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -552,28 +552,18 @@ Property Data source

Bottle uses a special type of dictionary to store those parameters. :class:`FormsDict` behaves like a normal dictionary, but has some additional features to make your life easier.

First of all, :class:`FormsDict` is a subclass of :class:`MultiDict` and can store more than one value per key. The standard dictionary access methods will only return the first of many values, but the :meth:`MultiDict.getall` method returns a (possibly empty) list of all values for a specific key::
First of all, :class:`FormsDict` is a subclass of :class:`MultiDict` and can store more than one value per key. Only the first value is returned by default, but :meth:`MultiDict.getall` can be used to get a (possibly empty) list of all values for a specific key::

for choice in request.forms.getall('multiple_choice'):
do_something(choice)

To simplify dealing with lots of unreliable user input, :class:`FormsDict` exposes all its values as attributes, but with a twist: These virtual attributes always return properly encoded unicode strings, even if the value is missing or character decoding fails. They never return ``None`` or throw an exception, but return an empty string instead::
Attribute-like access is also supported, returning empty strings for missing values. This simplifies code a lot when dealing with lots of optional attributes::

name = request.query.name # may be an empty string

.. rubric:: A word on unicode and character encodings

HTTP is a byte-based wire protocol. The server has to decode byte strings somehow before they are passed to the application. To be on the safe side, WSGI suggests ISO-8859-1 (aka latin1), a reversible single-byte codec that can be re-encoded with a different encoding later. Bottle does that for :meth:`FormsDict.getunicode` and attribute access, but not for :meth:`FormsDict.get` or item-access. These return the unchanged values as provided by the server implementation, which is probably not what you want.

::

>>> request.query['city']
'Göttingen' # An utf8 string provisionally decoded as ISO-8859-1 by the server
>>> request.query.city
'Göttingen' # The same string correctly re-encoded as utf8 by bottle

If you need the whole dictionary with correctly decoded values (e.g. for WTForms), you can call :meth:`FormsDict.decode` to get a fully re-encoded copy.

Unicode characters in the request path, query parameters or cookies are a bit tricky. HTTP is a very old byte-based protocol that predates unicode and lacks explicit encoding information. This is why WSGI servers have to fall back on `ISO-8859-1` (aka `latin1`, a reversible input encoding) for those strings. Modern browsers default to `utf8`, though. It's a bit much to ask application developers to translate every single user input string to the correct encoding manually. Bottle makes this easy and just assumes `utf8` for everything. All strings returned by Bottle APIs support the full range of unicode characters, as long as the webpage or HTTP client follows best practices and does not break with established standards.

Query Parameters
--------------------------------------------------------------------------------
Expand Down
12 changes: 4 additions & 8 deletions test/test_environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ def test_get(self):
self.assertEqual(['b'], request.query.getall('b'))
self.assertEqual('1', request.query['a'])
self.assertEqual('b', request.query['b'])
self.assertEqual(touni(tob('瓶'), 'latin1'), request.query['cn'])
self.assertEqual(touni('瓶'), request.query.cn)
self.assertEqual('瓶', request.query['cn'])
self.assertEqual('瓶', request.query.cn)

def test_post(self):
""" Environ: POST data """
Expand All @@ -189,8 +189,8 @@ def test_post(self):
self.assertEqual('b', request.POST['b'])
self.assertEqual('', request.POST['c'])
self.assertEqual('', request.POST['d'])
self.assertEqual(touni(tob('瓶'), 'latin1'), request.POST['cn'])
self.assertEqual(touni('瓶'), request.POST.cn)
self.assertEqual('瓶', request.POST['cn'])
self.assertEqual('瓶', request.POST.cn)

def test_bodypost(self):
sq = tob('foobar')
Expand Down Expand Up @@ -890,10 +890,6 @@ def test_native(self):
self.env['HTTP_TEST_HEADER'] = 'foobar'
self.assertEqual(self.headers['Test-header'], 'foobar')

def test_bytes(self):
self.env['HTTP_TEST_HEADER'] = tob('foobar')
self.assertEqual(self.headers['Test-Header'], 'foobar')

def test_unicode(self):
self.env['HTTP_TEST_HEADER'] = touni('foobar')
self.assertEqual(self.headers['Test-Header'], 'foobar')
Expand Down
3 changes: 1 addition & 2 deletions test/test_fileupload.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def test_filename(self):
self.assertFilename('.name.cfg', 'name.cfg')
self.assertFilename(' . na me . ', 'na-me')
self.assertFilename('path/', 'empty')
self.assertFilename(bottle.tob('ümläüts$'), 'umlauts')
self.assertFilename(bottle.touni('ümläüts$'), 'umlauts')
self.assertFilename('ümläüts$', 'umlauts')
self.assertFilename('', 'empty')
self.assertFilename('a'+'b'*1337+'c', 'a'+'b'*254)

Expand Down
22 changes: 4 additions & 18 deletions test/test_formsdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,11 @@
class TestFormsDict(unittest.TestCase):
def test_attr_access(self):
""" FomsDict.attribute returs string values as unicode. """
d = FormsDict(py2=tob('瓶'), py3=tob('瓶').decode('latin1'))
self.assertEqual(touni('瓶'), d.py2)
self.assertEqual(touni('瓶'), d.py3)
d = FormsDict(py3='瓶')
self.assertEqual('瓶', d.py3)
self.assertEqual('瓶', d["py3"])

def test_attr_missing(self):
""" FomsDict.attribute returs u'' on missing keys. """
d = FormsDict()
self.assertEqual(touni(''), d.missing)

def test_attr_unicode_error(self):
""" FomsDict.attribute returs u'' on UnicodeError. """
d = FormsDict(latin=touni('öäüß').encode('latin1'))
self.assertEqual(touni(''), d.latin)
d.input_encoding = 'latin1'
self.assertEqual(touni('öäüß'), d.latin)

def test_decode_method(self):
d = FormsDict(py2=tob('瓶'), py3=tob('瓶').decode('latin1'))
d = d.decode()
self.assertFalse(d.recode_unicode)
self.assertTrue(hasattr(list(d.keys())[0], 'encode'))
self.assertTrue(hasattr(list(d.values())[0], 'encode'))
self.assertEqual('', d.missing)

0 comments on commit 821865d

Please sign in to comment.