From d96ef528a16ec5c896d41d301b013e00caaeabce Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sun, 28 Aug 2022 19:47:50 +0300 Subject: [PATCH] 35% faster than `pathspec` \o/. #24 cpburnz/python-path-specification#38 --- gitignorefile/__init__.py | 41 ++++++++++++++++++++++++++------------- tests/test_match.py | 18 +++++++++++++++++ 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/gitignorefile/__init__.py b/gitignorefile/__init__.py index 2fc08a5..24722dc 100644 --- a/gitignorefile/__init__.py +++ b/gitignorefile/__init__.py @@ -50,6 +50,7 @@ def __call__(self, path, is_dir=None): else: for plain_path in plain_paths: + # assert plain_path.parts not in self.__gitignores self.__gitignores[plain_path.parts] = [] if add_to_children: @@ -59,6 +60,7 @@ def __call__(self, path, is_dir=None): return False for parent, (_, parent_plain_paths) in reversed(list(add_to_children.items())): + # assert parent.parts not in self.__gitignores self.__gitignores[parent.parts] = self.__gitignores[parent.parts[:-1]].copy() for parent_to_add, (gitignore_to_add, _) in reversed(list(add_to_children.items())): self.__gitignores[parent.parts].append(gitignore_to_add) @@ -68,10 +70,12 @@ def __call__(self, path, is_dir=None): self.__gitignores[parent.parts].reverse() for plain_path in parent_plain_paths: + # assert plain_path.parts not in self.__gitignores self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts] # This parent comes either from first or second loop. for plain_path in plain_paths: + # assert plain_path.parts not in self.__gitignores self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts] return any((m(path, is_dir=is_dir) for m in self.__gitignores[parent.parts])) @@ -98,7 +102,7 @@ def join(self, name): return _Path(self.__parts + (name,)) def relpath(self, base_path): - assert self.__parts[: len(base_path.__parts)] == base_path.__parts + # assert self.__parts[: len(base_path.__parts)] == base_path.__parts return "/".join(self.__parts[len(base_path.__parts) :]) def parents(self): @@ -198,11 +202,7 @@ def _rule_from_pattern(pattern): pattern = pattern[:i] i -= 1 - regexp = _fnmatch_pathname_to_regexp(pattern, directory_only) - - if anchored: - regexp = f"^{regexp}" - + regexp = _fnmatch_pathname_to_regexp(pattern, anchored, directory_only) return _IgnoreRule(regexp, negation, directory_only) @@ -244,19 +244,24 @@ def __init__(self, regexp, negation, directory_only): self.__regexp = re.compile(regexp) self.__negation = negation self.__directory_only = directory_only + self.__match = self.__regexp.match + + @property + def regexp(self): + return self.__regexp @property def negation(self): return self.__negation def match(self, rel_path, is_dir): - match = self.__regexp.search(rel_path) + m = self.__match(rel_path) # If we need a directory, check there is something after slash and if there is not, target must be a directory. # If there is something after slash then it's a directory irrelevant to type of target. # `self.directory_only` implies we have group number 1. # N.B. Question mark inside a group without a name can shift indices. :( - return match and (not self.__directory_only or match.group(1) is not None or is_dir) + return m and (not self.__directory_only or m.group(1) is not None or is_dir) if os.altsep is not None: @@ -269,14 +274,22 @@ def match(self, rel_path, is_dir): # Frustratingly, python's fnmatch doesn't provide the FNM_PATHNAME # option that `.gitignore`'s behavior depends on. -def _fnmatch_pathname_to_regexp(pattern, directory_only): +def _fnmatch_pathname_to_regexp(pattern, anchored, directory_only): """ Implements `fnmatch` style-behavior, as though with `FNM_PATHNAME` flagged; the path separator will not match shell-style `*` and `.` wildcards. """ + + if not pattern: + if directory_only: + return "[^/]+(/.+)?$" # Empty name means no path fragment. + + else: + return ".*" + i, n = 0, len(pattern) - res = ["(?:^|/)"] if pattern else [] # Empty name means no path fragment. + res = ["(?:^|.+/)" if not anchored else ""] while i < n: c = pattern[i] i += 1 @@ -290,10 +303,10 @@ def _fnmatch_pathname_to_regexp(pattern, directory_only): res.append("/?") else: - res.append(f"[^/]*") + res.append("[^/]*") except IndexError: - res.append(f"[^/]*") + res.append("[^/]*") elif c == "?": res.append("[^/]") @@ -322,9 +335,9 @@ def _fnmatch_pathname_to_regexp(pattern, directory_only): res.append(re.escape(c)) if directory_only: # In this case we are interested if there is something after slash. - res.append(f"(/.+)?$") + res.append("(/.+)?$") else: - res.append(f"(?:/|$)") + res.append("(?:/.+)?$") return "".join(res) diff --git a/tests/test_match.py b/tests/test_match.py index 78e8769..0c6626f 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -108,6 +108,14 @@ def test_second_level_directories_unchained(self): self.assertTrue(matches("/home/michael/a/doc/frotz", is_dir=True)) self.assertFalse(matches("/home/michael/a/b/doc/frotz", is_dir=False)) self.assertTrue(matches("/home/michael/a/b/doc/frotz", is_dir=True)) + for is_dir in (False, True): + with self.subTest(i=is_dir): + self.assertTrue(matches("/home/michael/doc/frotz/file", is_dir=False)) + self.assertTrue(matches("/home/michael/doc/frotz/file", is_dir=True)) + self.assertTrue(matches("/home/michael/a/doc/frotz/file", is_dir=False)) + self.assertTrue(matches("/home/michael/a/doc/frotz/file", is_dir=True)) + self.assertTrue(matches("/home/michael/a/b/doc/frotz/file", is_dir=False)) + self.assertTrue(matches("/home/michael/a/b/doc/frotz/file", is_dir=True)) def test_second_level_files(self): matches = self.__parse_gitignore_string(["doc/frotz"], fake_base_dir="/home/michael") @@ -124,6 +132,16 @@ def test_ignore_file(self): self.assertTrue(matches("/home/michael/.venv/folder", is_dir=is_dir)) self.assertTrue(matches("/home/michael/.venv/file.txt", is_dir=is_dir)) + def test_ignore_core_file(self): + matches = self.__parse_gitignore_string(["core", "!core/"], fake_base_dir="/home/michael") + for is_dir in (False, True): + with self.subTest(i=is_dir): + self.assertFalse(matches("/home/michael/core/a", is_dir=is_dir)) + self.assertTrue(matches("/home/michael/core", is_dir=False)) + self.assertFalse(matches("/home/michael/core", is_dir=True)) + self.assertTrue(matches("/home/michael/a/core", is_dir=False)) + self.assertFalse(matches("/home/michael/a/core", is_dir=True)) + def test_ignore_directory(self): matches = self.__parse_gitignore_string([".venv/"], fake_base_dir="/home/michael") for is_dir in (False, True):