From 2b2d607095335024e5e2bb358e3ef37650536839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20F=C3=B6rberg?= Date: Wed, 30 Oct 2024 23:08:30 +0100 Subject: [PATCH 1/2] gh-121267: Improve performance of tarfile (#121267) (#121269) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tarfile in the default write mode spends much of its time resolving UIDs into usernames and GIDs into group names. By caching these mappings, a significant speedup can be achieved. In my simple benchmark[1], this extra caching speeds up tarfile by 8x. [1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2 --------- Co-authored-by: Tian Gao Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> Co-authored-by: Shantanu <12621235+hauntsaninja@users.noreply.github.com> --- Lib/tarfile.py | 25 +++++++++++++------ ...-07-02-15-56-42.gh-issue-121267.yFBWkh.rst | 2 ++ 2 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 1475b3da2d3293..a0fab46b24e249 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -1760,6 +1760,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None, # current position in the archive file self.inodes = {} # dictionary caching the inodes of # archive members already added + self._unames = {} # Cached mappings of uid -> uname + self._gnames = {} # Cached mappings of gid -> gname try: if self.mode == "r": @@ -2138,16 +2140,23 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None): tarinfo.mtime = statres.st_mtime tarinfo.type = type tarinfo.linkname = linkname + + # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To + # speed things up, cache the resolved usernames and group names. if pwd: - try: - tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0] - except KeyError: - pass + if tarinfo.uid not in self._unames: + try: + self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0] + except KeyError: + self._unames[tarinfo.uid] = '' + tarinfo.uname = self._unames[tarinfo.uid] if grp: - try: - tarinfo.gname = grp.getgrgid(tarinfo.gid)[0] - except KeyError: - pass + if tarinfo.gid not in self._gnames: + try: + self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0] + except KeyError: + self._gnames[tarinfo.gid] = '' + tarinfo.gname = self._gnames[tarinfo.gid] if type in (CHRTYPE, BLKTYPE): if hasattr(os, "major") and hasattr(os, "minor"): diff --git a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst new file mode 100644 index 00000000000000..9e52405c15a82d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst @@ -0,0 +1,2 @@ +Improve the performance of :mod:`tarfile` when writing files, by caching user names +and group names. From 951cb2c369e8b8fb9f93662658391a53fd727787 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Wed, 30 Oct 2024 22:56:58 +0000 Subject: [PATCH 2/2] GH-126205: Fix conversion of UNC paths to file URIs (#126208) File URIs for Windows UNC paths should begin with two slashes, not four. --- Lib/nturl2path.py | 7 +------ Lib/test/test_urllib.py | 14 +++++++------- .../2024-10-30-20-45-17.gh-issue-126205.CHEmtx.rst | 2 ++ 3 files changed, 10 insertions(+), 13 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-10-30-20-45-17.gh-issue-126205.CHEmtx.rst diff --git a/Lib/nturl2path.py b/Lib/nturl2path.py index 61852aff58912d..6453f202c26d14 100644 --- a/Lib/nturl2path.py +++ b/Lib/nturl2path.py @@ -55,16 +55,11 @@ def pathname2url(p): if p[:4] == '\\\\?\\': p = p[4:] if p[:4].upper() == 'UNC\\': - p = '\\' + p[4:] + p = '\\\\' + p[4:] elif p[1:2] != ':': raise OSError('Bad path: ' + p) if not ':' in p: # No drive specifier, just convert slashes and quote the name - if p[:2] == '\\\\': - # path is something like \\host\path\on\remote\host - # convert this to ////host/path/on/remote/host - # (notice doubling of slashes at the start of the path) - p = '\\\\' + p components = p.split('\\') return urllib.parse.quote('/'.join(components)) comp = p.split(':', maxsplit=2) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index dc852c8f02758c..3ee17f96b817e1 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1524,7 +1524,7 @@ def test_pathname2url_win(self): # Test special prefixes are correctly handled in pathname2url() fn = urllib.request.pathname2url self.assertEqual(fn('\\\\?\\C:\\dir'), '///C:/dir') - self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '/server/share/dir') + self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '//server/share/dir') self.assertEqual(fn("C:"), '///C:') self.assertEqual(fn("C:\\"), '///C:') self.assertEqual(fn('C:\\a\\b.c'), '///C:/a/b.c') @@ -1535,14 +1535,14 @@ def test_pathname2url_win(self): self.assertRaises(IOError, fn, "XX:\\") # No drive letter self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') - self.assertEqual(fn("\\\\folder\\test\\"), '////folder/test/') - self.assertEqual(fn("\\\\\\folder\\test\\"), '/////folder/test/') - self.assertEqual(fn('\\\\some\\share\\'), '////some/share/') - self.assertEqual(fn('\\\\some\\share\\a\\b.c'), '////some/share/a/b.c') - self.assertEqual(fn('\\\\some\\share\\a\\b%#c\xe9'), '////some/share/a/b%25%23c%C3%A9') + self.assertEqual(fn("\\\\folder\\test\\"), '//folder/test/') + self.assertEqual(fn("\\\\\\folder\\test\\"), '///folder/test/') + self.assertEqual(fn('\\\\some\\share\\'), '//some/share/') + self.assertEqual(fn('\\\\some\\share\\a\\b.c'), '//some/share/a/b.c') + self.assertEqual(fn('\\\\some\\share\\a\\b%#c\xe9'), '//some/share/a/b%25%23c%C3%A9') # Round-tripping urls = ['///C:', - '/////folder/test/', + '///folder/test/', '///C:/foo/bar/spam.foo'] for url in urls: self.assertEqual(fn(urllib.request.url2pathname(url)), url) diff --git a/Misc/NEWS.d/next/Library/2024-10-30-20-45-17.gh-issue-126205.CHEmtx.rst b/Misc/NEWS.d/next/Library/2024-10-30-20-45-17.gh-issue-126205.CHEmtx.rst new file mode 100644 index 00000000000000..c92ffb75056606 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-30-20-45-17.gh-issue-126205.CHEmtx.rst @@ -0,0 +1,2 @@ +Fix issue where :func:`urllib.request.pathname2url` generated URLs beginning +with four slashes (rather than two) when given a Windows UNC path.