change: Translate WSGI strings to utf8 immediately, not only on demand.

bottlepy · Nov 18, 2024 · 821865d · 821865d
1 parent 7912616
commit 821865d
Show file tree

Hide file tree

Showing 6 changed files with 54 additions and 113 deletions.
diff --git a/bottle.py b/bottle.py
@@ -88,7 +88,6 @@ def _cli_patch(cli_args):  # pragma: no coverage
 import _thread as thread
 from urllib.parse import urljoin, SplitResult as UrlSplitResult
 from urllib.parse import urlencode, quote as urlquote, unquote as urlunquote
-urlunquote = functools.partial(urlunquote, encoding='latin1')
 from http.cookies import SimpleCookie, Morsel, CookieError
 from collections.abc import MutableMapping as DictMixin
 from types import ModuleType as new_module
@@ -112,6 +111,10 @@ def getargspec(func):
 callable = lambda x: hasattr(x, '__call__')
 imap = map
 
+def _wsgi_recode(src, target='utf8'):
+    return src.encode('latin1').decode(target)
+
+
 def _raise(*a):
     raise a[0](a[1]).with_traceback(a[2])
 
@@ -679,11 +682,8 @@ def mountpoint_wrapper():
                 def start_response(status, headerlist, exc_info=None):
                     if exc_info:
                         _raise(*exc_info)
-                    # Errors here mean that the mounted WSGI app did not
-                    # follow PEP-3333 (which requires latin1) or used a
-                    # pre-encoding other than utf8 :/
-                    status = status.encode('latin1').decode('utf8')
-                    headerlist = [(k, v.encode('latin1').decode('utf8'))
+                    status = _wsgi_recode(status)
+                    headerlist = [(k, _wsgi_recode(v))
                                     for (k, v) in headerlist]
                     rs.status = status
                     for name, value in headerlist:
@@ -934,7 +934,7 @@ def default_error_handler(self, res):
 
     def _handle(self, environ):
         path = environ['bottle.raw_path'] = environ['PATH_INFO']
-        environ['PATH_INFO'] = path.encode('latin1').decode('utf8', 'ignore')
+        environ['PATH_INFO'] = _wsgi_recode(path)
 
         environ['bottle.app'] = self
         request.bind(environ)
@@ -1158,7 +1158,8 @@ def get_header(self, name, default=None):
     def cookies(self):
         """ Cookies parsed into a :class:`FormsDict`. Signed cookies are NOT
             decoded. Use :meth:`get_cookie` if you expect signed cookies. """
-        cookies = SimpleCookie(self.environ.get('HTTP_COOKIE', '')).values()
+        cookie_header = _wsgi_recode(self.environ.get('HTTP_COOKIE', ''))
+        cookies = SimpleCookie(cookie_header).values()
         return FormsDict((c.key, c.value) for c in cookies)
 
     def get_cookie(self, key, default=None, secret=None, digestmod=hashlib.sha256):
@@ -1186,7 +1187,7 @@ def query(self):
             not to be confused with "URL wildcards" as they are provided by the
             :class:`Router`. """
         get = self.environ['bottle.get'] = FormsDict()
-        pairs = _parse_qsl(self.environ.get('QUERY_STRING', ''))
+        pairs = _parse_qsl(self.environ.get('QUERY_STRING', ''), 'utf8')
         for key, value in pairs:
             get[key] = value
         return get
@@ -1198,7 +1199,6 @@ def forms(self):
             :class:`FormsDict`. All keys and values are strings. File uploads
             are stored separately in :attr:`files`. """
         forms = FormsDict()
-        forms.recode_unicode = self.POST.recode_unicode
         for name, item in self.POST.allitems():
             if not isinstance(item, FileUpload):
                 forms[name] = item
@@ -1222,7 +1222,6 @@ def files(self):
 
         """
         files = FormsDict()
-        files.recode_unicode = self.POST.recode_unicode
         for name, item in self.POST.allitems():
             if isinstance(item, FileUpload):
                 files[name] = item
@@ -1345,12 +1344,11 @@ def POST(self):
         # We default to application/x-www-form-urlencoded for everything that
         # is not multipart and take the fast path (also: 3.1 workaround)
         if not content_type.startswith('multipart/'):
-            body = self._get_body_string(self.MEMFILE_MAX).decode('latin1')
-            for key, value in _parse_qsl(body):
+            body = self._get_body_string(self.MEMFILE_MAX).decode('utf8')
+            for key, value in _parse_qsl(body, 'utf8'):
                 post[key] = value
             return post
 
-        post.recode_unicode = False
         charset = options.get("charset", "utf8")
         boundary = options.get("boundary")
         if not boundary:
@@ -2134,49 +2132,32 @@ def getall(self, key):
 
 class FormsDict(MultiDict):
     """ This :class:`MultiDict` subclass is used to store request form data.
-        Additionally to the normal dict-like item access methods (which return
-        unmodified data as native strings), this container also supports
-        attribute-like access to its values. Attributes are automatically de-
-        or recoded to match :attr:`input_encoding` (default: 'utf8'). Missing
-        attributes default to an empty string. """
-
-    #: Encoding used for attribute values.
-    input_encoding = 'utf8'
-    #: If true (default), unicode strings are first encoded with `latin1`
-    #: and then decoded to match :attr:`input_encoding`.
-    recode_unicode = True
-
-    def _fix(self, s, encoding=None):
-        if isinstance(s, unicode) and self.recode_unicode:  # Python 3 WSGI
-            return s.encode('latin1').decode(encoding or self.input_encoding)
-        elif isinstance(s, bytes):  # Python 2 WSGI
-            return s.decode(encoding or self.input_encoding)
-        else:
-            return s
+        Additionally to the normal dict-like item access methods, this container
+        also supports attribute-like access to its values. Missing attributes
+        default to an empty string.
+
+        .. versionchanged:: 0.14
+            All keys and values are now decoded as utf8 by default, item and
+            attribute access will return the same string.
+    """
 
     def decode(self, encoding=None):
-        """ Returns a copy with all keys and values de- or recoded to match
-            :attr:`input_encoding`. Some libraries (e.g. WTForms) want a
-            unicode dictionary. """
+        """ (deprecated) Starting with 0.14 all keys and values are already
+            correctly decoded. """
         copy = FormsDict()
-        enc = copy.input_encoding = encoding or self.input_encoding
-        copy.recode_unicode = False
         for key, value in self.allitems():
-            copy.append(self._fix(key, enc), self._fix(value, enc))
+            copy[key] = value
         return copy
 
     def getunicode(self, name, default=None, encoding=None):
-        """ Return the value as a unicode string, or the default. """
-        try:
-            return self._fix(self[name], encoding)
-        except (UnicodeError, KeyError):
-            return default
+        """ (deprecated) Return the value as a unicode string, or the default. """
+        return self.get(name, default)
 
     def __getattr__(self, name, default=unicode()):
         # Without this guard, pickle generates a cryptic TypeError:
         if name.startswith('__') and name.endswith('__'):
             return super(FormsDict, self).__getattr__(name)
-        return self.getunicode(name, default=default)
+        return self.get(name, default=default)
 
 class HeaderDict(MultiDict):
     """ A case-insensitive version of :class:`MultiDict` that defaults to
@@ -2218,14 +2199,7 @@ def filter(self, names):
 
 class WSGIHeaderDict(DictMixin):
     """ This dict-like class wraps a WSGI environ dict and provides convenient
-        access to HTTP_* fields. Keys and values are native strings
-        (2.x bytes or 3.x unicode) and keys are case-insensitive. If the WSGI
-        environment contains non-native string values, these are de- or encoded
-        using a lossless 'latin1' character set.
-
-        The API will remain stable even on changes to the relevant PEPs.
-        Currently PEP 333, 444 and 3333 are supported. (PEP 444 is the only one
-        that uses non-native strings.)
+        access to HTTP_* fields. Header names are case-insensitive and titled by default.
     """
     #: List of keys that do not have a ``HTTP_`` prefix.
     cgikeys = ('CONTENT_TYPE', 'CONTENT_LENGTH')
@@ -2241,16 +2215,11 @@ def _ekey(self, key):
         return 'HTTP_' + key
 
     def raw(self, key, default=None):
-        """ Return the header value as is (may be bytes or unicode). """
+        """ Return the header value as is (not utf8-translated). """
         return self.environ.get(self._ekey(key), default)
 
     def __getitem__(self, key):
-        val = self.environ[self._ekey(key)]
-        if isinstance(val, unicode):
-            val = val.encode('latin1').decode('utf8')
-        else:
-            val = val.decode('utf8')
-        return val
+        return _wsgi_recode(self.environ[self._ekey(key)])
 
     def __setitem__(self, key, value):
         raise TypeError("%s is read-only." % self.__class__)
@@ -2684,8 +2653,6 @@ def filename(self):
             or dashes are removed. The filename is limited to 255 characters.
         """
         fname = self.raw_filename
-        if not isinstance(fname, unicode):
-            fname = fname.decode('utf8', 'ignore')
         fname = normalize('NFKD', fname)
         fname = fname.encode('ASCII', 'ignore').decode('ASCII')
         fname = os.path.basename(fname.replace('\\', os.path.sep))
@@ -2966,14 +2933,14 @@ def _parse_http_header(h):
     return values
 
 
-def _parse_qsl(qs):
+def _parse_qsl(qs, encoding="utf8"):
     r = []
     for pair in qs.split('&'):
         if not pair: continue
         nv = pair.split('=', 1)
         if len(nv) != 2: nv.append('')
-        key = urlunquote(nv[0].replace('+', ' '))
-        value = urlunquote(nv[1].replace('+', ' '))
+        key = urlunquote(nv[0].replace('+', ' '), encoding)
+        value = urlunquote(nv[1].replace('+', ' '), encoding)
         r.append((key, value))
     return r
 
@@ -3283,7 +3250,7 @@ def feed(self, line, nl=""):
         return self.write_header(line, nl)
 
     def write_header(self, line, nl):
-        line = line.decode(self.charset)
+        line = str(line, self.charset)
 
         if not nl:
             raise MultipartError("Unexpected end of line in header.")
@@ -3355,8 +3322,7 @@ def is_buffered(self):
     @property
     def value(self):
         """ Data decoded with the specified charset """
-
-        return self.raw.decode(self.charset)
+        return str(self.raw, self.charset)
 
     @property
     def raw(self):

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -8,15 +8,19 @@ Release Notes
 Release 0.14 (in development)
 =============================
 
-.. rubric:: Removed APIs (deprecated since 0.13)
+.. rubric:: Removed APIs
 
-* Dropped support for Python 2 and removed helpers and workarounds that only make sense in a Python 2/3 dual codebase (e.g. ``tonat()`` or the ``py3k`` flag). 
+* Dropped support for Python 2 and removed workarounds or helpers that only make sense in a Python 2/3 dual codebase.
 * Removed the ``RouteReset`` exception and associated logic.
 * Removed the `bottle.py` console script entrypoint in favour of the new `bottle` script. You can still execute `bottle.py` directly or via `python -m bottle`. The only change is that the command installed by pip or similar tools into the bin/Scripts folder of the (virtual) environment is now called `bottle` to avoid circular import errors.
 
-.. rubric:: Changes
+.. rubric:: Changed APIs
+
+* ``bottle.FormsDict`` no longer translates between PEP-3333 `latin1` and the correct `utf8` encoding on demand. The `getunicode()` and `decode()` methods are deprecated and do nothing, as all values are already decoded correctly.
+
+.. rubric:: New features
 
-* ``bottle.HTTPError`` raised on Invalid JSON now include the underlying exception in their ``exception`` field.
+* ``bottle.HTTPError`` exceptions raised on invalid JSON now include the underlying exception in the ``exception`` field.
 
 
 Release 0.13

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -552,28 +552,18 @@ Property                                       Data source
 
 Bottle uses a special type of dictionary to store those parameters. :class:`FormsDict` behaves like a normal dictionary, but has some additional features to make your life easier.
 
-First of all, :class:`FormsDict` is a subclass of :class:`MultiDict` and can store more than one value per key. The standard dictionary access methods will only return the first of many values, but the :meth:`MultiDict.getall` method returns a (possibly empty) list of all values for a specific key::
+First of all, :class:`FormsDict` is a subclass of :class:`MultiDict` and can store more than one value per key. Only the first value is returned by default, but :meth:`MultiDict.getall` can be used to get a (possibly empty) list of all values for a specific key::
 
   for choice in request.forms.getall('multiple_choice'):
       do_something(choice)
 
-To simplify dealing with lots of unreliable user input, :class:`FormsDict` exposes all its values as attributes, but with a twist: These virtual attributes always return properly encoded unicode strings, even if the value is missing or character decoding fails. They never return ``None`` or throw an exception, but return an empty string instead::
+Attribute-like access is also supported, returning empty strings for missing values. This simplifies code a lot when dealing with lots of optional attributes::
 
   name = request.query.name    # may be an empty string
 
 .. rubric:: A word on unicode and character encodings
 
-HTTP is a byte-based wire protocol. The server has to decode byte strings somehow before they are passed to the application. To be on the safe side, WSGI suggests ISO-8859-1 (aka latin1), a reversible single-byte codec that can be re-encoded with a different encoding later. Bottle does that for :meth:`FormsDict.getunicode` and attribute access, but not for :meth:`FormsDict.get` or item-access. These return the unchanged values as provided by the server implementation, which is probably not what you want.
-
-::
-
-    >>> request.query['city']
-    'GÃ¶ttingen' # An utf8 string provisionally decoded as ISO-8859-1 by the server
-    >>> request.query.city
-    'Göttingen'  # The same string correctly re-encoded as utf8 by bottle
-
-If you need the whole dictionary with correctly decoded values (e.g. for WTForms), you can call :meth:`FormsDict.decode` to get a fully re-encoded copy.
-
+Unicode characters in the request path, query parameters or cookies are a bit tricky. HTTP is a very old byte-based protocol that predates unicode and lacks explicit encoding information. This is why WSGI servers have to fall back on `ISO-8859-1` (aka `latin1`, a reversible input encoding) for those strings. Modern browsers default to `utf8`, though. It's a bit much to ask application developers to translate every single user input string to the correct encoding manually. Bottle makes this easy and just assumes `utf8` for everything. All strings returned by Bottle APIs support the full range of unicode characters, as long as the webpage or HTTP client follows best practices and does not break with established standards.
 
 Query Parameters
 --------------------------------------------------------------------------------

diff --git a/test/test_environ.py b/test/test_environ.py
@@ -168,8 +168,8 @@ def test_get(self):
         self.assertEqual(['b'], request.query.getall('b'))
         self.assertEqual('1', request.query['a'])
         self.assertEqual('b', request.query['b'])
-        self.assertEqual(touni(tob('瓶'), 'latin1'), request.query['cn'])
-        self.assertEqual(touni('瓶'), request.query.cn)
+        self.assertEqual('瓶', request.query['cn'])
+        self.assertEqual('瓶', request.query.cn)
 
     def test_post(self):
         """ Environ: POST data """
@@ -189,8 +189,8 @@ def test_post(self):
         self.assertEqual('b', request.POST['b'])
         self.assertEqual('', request.POST['c'])
         self.assertEqual('', request.POST['d'])
-        self.assertEqual(touni(tob('瓶'), 'latin1'), request.POST['cn'])
-        self.assertEqual(touni('瓶'), request.POST.cn)
+        self.assertEqual('瓶', request.POST['cn'])
+        self.assertEqual('瓶', request.POST.cn)
 
     def test_bodypost(self):
         sq = tob('foobar')
@@ -890,10 +890,6 @@ def test_native(self):
         self.env['HTTP_TEST_HEADER'] = 'foobar'
         self.assertEqual(self.headers['Test-header'], 'foobar')
 
-    def test_bytes(self):
-        self.env['HTTP_TEST_HEADER'] = tob('foobar')
-        self.assertEqual(self.headers['Test-Header'], 'foobar')
-
     def test_unicode(self):
         self.env['HTTP_TEST_HEADER'] = touni('foobar')
         self.assertEqual(self.headers['Test-Header'], 'foobar')

diff --git a/test/test_fileupload.py b/test/test_fileupload.py
@@ -33,8 +33,7 @@ def test_filename(self):
         self.assertFilename('.name.cfg', 'name.cfg')
         self.assertFilename(' . na me . ', 'na-me')
         self.assertFilename('path/', 'empty')
-        self.assertFilename(bottle.tob('ümläüts$'), 'umlauts')
-        self.assertFilename(bottle.touni('ümläüts$'), 'umlauts')
+        self.assertFilename('ümläüts$', 'umlauts')
         self.assertFilename('', 'empty')
         self.assertFilename('a'+'b'*1337+'c', 'a'+'b'*254)
 

diff --git a/test/test_formsdict.py b/test/test_formsdict.py
@@ -7,25 +7,11 @@
 class TestFormsDict(unittest.TestCase):
     def test_attr_access(self):
         """ FomsDict.attribute returs string values as unicode. """
-        d = FormsDict(py2=tob('瓶'), py3=tob('瓶').decode('latin1'))
-        self.assertEqual(touni('瓶'), d.py2)
-        self.assertEqual(touni('瓶'), d.py3)
+        d = FormsDict(py3='瓶')
+        self.assertEqual('瓶', d.py3)
+        self.assertEqual('瓶', d["py3"])
 
     def test_attr_missing(self):
         """ FomsDict.attribute returs u'' on missing keys. """
         d = FormsDict()
-        self.assertEqual(touni(''), d.missing)
-
-    def test_attr_unicode_error(self):
-        """ FomsDict.attribute returs u'' on UnicodeError. """
-        d = FormsDict(latin=touni('öäüß').encode('latin1'))
-        self.assertEqual(touni(''), d.latin)
-        d.input_encoding = 'latin1'
-        self.assertEqual(touni('öäüß'), d.latin)
-
-    def test_decode_method(self):
-        d = FormsDict(py2=tob('瓶'), py3=tob('瓶').decode('latin1'))
-        d = d.decode()
-        self.assertFalse(d.recode_unicode)
-        self.assertTrue(hasattr(list(d.keys())[0], 'encode'))
-        self.assertTrue(hasattr(list(d.values())[0], 'encode'))
+        self.assertEqual('', d.missing)