Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct and fast Cache #25

Merged
merged 2 commits into from
Aug 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 112 additions & 68 deletions gitignorefile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,61 +29,103 @@ def ignored(path, is_dir=None):

class Cache:
def __init__(self):
self.__gitignores = {}

def __get_parents(self, path, is_dir):
if not is_dir:
path = os.path.dirname(path)
yield path

while True:
new_path = os.path.dirname(path)
if not os.path.samefile(path, new_path):
yield new_path
path = new_path
else:
break
self.__gitignores = {tuple(): []}

def __call__(self, path, is_dir=None):
if is_dir is None:
is_dir = os.path.isdir(path)

path = _Path(path)
add_to_children = {}
plain_paths = []
for parent in self.__get_parents(os.path.abspath(path), is_dir=is_dir):
if parent in self.__gitignores:
for parent in path.parents():
if parent.parts in self.__gitignores:
break

parent_gitignore = os.path.join(parent, ".gitignore")
if os.path.isfile(parent_gitignore):
p = parse(parent_gitignore, base_path=parent)
add_to_children[parent] = (p, plain_paths)
parent_gitignore = parent.join(".gitignore")
if parent_gitignore.isfile():
matches = parse(str(parent_gitignore), base_path=parent)
add_to_children[parent] = (matches, plain_paths)
plain_paths = []

else:
plain_paths.append(parent)

else:
for plain_path in plain_paths:
self.__gitignores[plain_path] = []
# assert plain_path.parts not in self.__gitignores
self.__gitignores[plain_path.parts] = []

if add_to_children:
plain_paths.clear()

if not add_to_children:
else:
return False

for parent, (_, parent_plain_paths) in reversed(list(add_to_children.items())):
self.__gitignores[parent] = []
# assert parent.parts not in self.__gitignores
self.__gitignores[parent.parts] = self.__gitignores[parent.parts[:-1]].copy()
for parent_to_add, (gitignore_to_add, _) in reversed(list(add_to_children.items())):
self.__gitignores[parent].append(gitignore_to_add)
self.__gitignores[parent.parts].append(gitignore_to_add)
if parent_to_add == parent:
break

self.__gitignores[parent].reverse()
self.__gitignores[parent.parts].reverse()

for plain_path in parent_plain_paths:
self.__gitignores[plain_path] = self.__gitignores[parent]
# assert plain_path.parts not in self.__gitignores
self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts]

# This parent comes either from first or second loop.
for plain_path in plain_paths:
# assert plain_path.parts not in self.__gitignores
self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts]

return any((m(path, is_dir=is_dir) for m in self.__gitignores[parent.parts]))


class _Path:
def __init__(self, path):
if isinstance(path, str):
abs_path = os.path.abspath(path)
self.__parts = tuple(_path_split(abs_path))
self.__joined = abs_path
self.__is_dir = None

else:
self.__parts = path
self.__joined = None
self.__is_dir = None

@property
def parts(self):
return self.__parts

def join(self, name):
return _Path(self.__parts + (name,))

def relpath(self, base_path):
# assert self.__parts[: len(base_path.__parts)] == base_path.__parts
return "/".join(self.__parts[len(base_path.__parts) :])

return any(
(m(path, is_dir=is_dir) for m in self.__gitignores[parent])
) # This parent comes either from first or second loop.
def parents(self):
for i in range(len(self.__parts) - 1, 0, -1):
yield _Path(self.__parts[:i])

def isfile(self):
if self.__joined is None:
self.__joined = "/".join(self.__parts)
return os.path.isfile(self.__joined)

def isdir(self):
if self.__is_dir is not None:
return self.__is_dir
if self.__joined is None:
self.__joined = "/".join(self.__parts)
self.__is_dir = os.path.isdir(self.__joined)
return self.__is_dir

def __str__(self):
if self.__joined is None:
self.__joined = "/".join(self.__parts) if self.__parts != ("",) else "/"
return self.__joined


def _rule_from_pattern(pattern):
Expand Down Expand Up @@ -160,32 +202,29 @@ def _rule_from_pattern(pattern):
pattern = pattern[:i]
i -= 1

regexp = _fnmatch_pathname_to_regexp(pattern, directory_only)

if anchored:
regexp = f"^{regexp}"

regexp = _fnmatch_pathname_to_regexp(pattern, anchored, directory_only)
return _IgnoreRule(regexp, negation, directory_only)


class _IgnoreRules:
def __init__(self, rules, base_path):
self.__rules = rules
self.__can_return_immediately = not any((r.negation for r in rules))
self.__base_path = base_path
self.__base_path = _Path(base_path) if isinstance(base_path, str) else base_path

def match(self, path, is_dir=None):
"""
Because Git allows for nested `.gitignore` files, a `base_path` value
is required for correct behavior.
"""
if is_dir is None:
is_dir = os.path.isdir(path)

rel_path = os.path.relpath(path, self.__base_path)
if isinstance(path, str):
path = _Path(path)

if is_dir is None:
is_dir = path.isdir() # TODO Pass callable here.

if rel_path.startswith(f".{os.sep}"):
rel_path = rel_path[2:]
rel_path = path.relpath(self.__base_path)

if self.__can_return_immediately:
return any((r.match(rel_path, is_dir) for r in self.__rules))
Expand All @@ -205,44 +244,52 @@ def __init__(self, regexp, negation, directory_only):
self.__regexp = re.compile(regexp)
self.__negation = negation
self.__directory_only = directory_only
self.__match = self.__regexp.match

@property
def regexp(self):
return self.__regexp

@property
def negation(self):
return self.__negation

def match(self, rel_path, is_dir):
match = self.__regexp.search(rel_path)
m = self.__match(rel_path)

# For a directory-only rule, check whether anything follows the slash; if nothing does, the target itself must be a directory.
# If something does follow the slash, the matched name is a directory regardless of the target's type.
# `self.__directory_only` implies that group number 1 exists.
# N.B. Question mark inside a group without a name can shift indices. :(
return match and (not self.__directory_only or match.group(1) is not None or is_dir)

return m and (not self.__directory_only or m.group(1) is not None or is_dir)

def _seps_non_sep_expr():
if os.altsep is None:
seps = re.escape(os.sep)
non_sep = f"[^{re.escape(os.sep)}]"

else:
seps = f"[{re.escape(os.sep)}{re.escape(os.altsep)}]"
non_sep = f"[^{re.escape(os.sep)}{re.escape(os.altsep)}]"
if os.altsep is not None:
_all_seps_expr = f"[{re.escape(os.sep)}{re.escape(os.altsep)}]"
_path_split = lambda path: re.split(_all_seps_expr, path)

return seps, non_sep
else:
_path_split = lambda path: path.split(os.sep)


# Frustratingly, python's fnmatch doesn't provide the FNM_PATHNAME
# option that `.gitignore`'s behavior depends on.
def _fnmatch_pathname_to_regexp(pattern, directory_only):
def _fnmatch_pathname_to_regexp(pattern, anchored, directory_only):
"""
Implements fnmatch-style behavior, as though with FNM_PATHNAME flagged;
the path separator will not match shell-style '*' and '?' wildcards.
Implements `fnmatch`-style behavior, as though with `FNM_PATHNAME` flagged;
the path separator will not match shell-style `*` and `?` wildcards.
"""

if not pattern:
if directory_only:
return "[^/]+(/.+)?$" # Empty name means no path fragment.

else:
return ".*"

i, n = 0, len(pattern)

seps_group, non_sep = _seps_non_sep_expr()
res = [f"(?:^|{seps_group})"] if pattern else [] # Empty name means no path fragment.
res = ["(?:^|.+/)" if not anchored else ""]
while i < n:
c = pattern[i]
i += 1
Expand All @@ -253,19 +300,16 @@ def _fnmatch_pathname_to_regexp(pattern, directory_only):
res.append(".*")
if pattern[i] == "/":
i += 1
res.append(f"{seps_group}?")
res.append("/?")

else:
res.append(f"{non_sep}*")
res.append("[^/]*")

except IndexError:
res.append(f"{non_sep}*")
res.append("[^/]*")

elif c == "?":
res.append(non_sep)

elif c == "/":
res.append(seps_group)
res.append("[^/]")

elif c == "[":
j = i
Expand All @@ -291,9 +335,9 @@ def _fnmatch_pathname_to_regexp(pattern, directory_only):
res.append(re.escape(c))

if directory_only: # In this case we are interested if there is something after slash.
res.append(f"({seps_group}.+)?$")
res.append("(/.+)?$")

else:
res.append(f"(?:{seps_group}|$)")
res.append("(?:/.+)?$")

return "".join(res)
69 changes: 51 additions & 18 deletions tests/test_cache.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import os
import stat
import tempfile
Expand All @@ -18,6 +19,12 @@ def __init__(self, is_file=False):
self.st_dev = 0
self.st_mode = stat.S_IFREG if is_file else stat.S_IFDIR

def isdir(self):
return self.st_mode == stat.S_IFDIR

def isfile(self):
return self.st_mode == stat.S_IFREG

class Stat:
def __init__(self, directories, files):
self.__filesystem = {}
Expand All @@ -43,16 +50,20 @@ def __call__(self, path):
"/",
],
[
"/home/vladimir/project/directory/subdirectory/subdirectory/file.txt",
"/home/vladimir/project/directory/subdirectory/subdirectory/file2.txt",
"/home/vladimir/project/directory/subdirectory/subdirectory/file3.txt",
"/home/vladimir/project/directory/subdirectory/file.txt",
"/home/vladimir/project/directory/subdirectory/file2.txt",
"/home/vladimir/project/directory/.gitignore",
"/home/vladimir/project/directory/file.txt",
"/home/vladimir/project/directory/file2.txt",
"/home/vladimir/project/file.txt",
"/home/vladimir/project/.gitignore",
"/home/vladimir/file.txt",
],
)

statistics = {"open": 0, "stat": 0}

def mock_open(path):
data = {
normalize_path("/home/vladimir/project/directory/.gitignore"): ["file.txt"],
Expand All @@ -66,23 +77,45 @@ def mock_open(path):
except KeyError:
raise FileNotFoundError()

def mock_stat(path):
statistics["stat"] += 1
return my_stat(path)

with unittest.mock.patch("builtins.open", mock_open):
with unittest.mock.patch("os.stat", mock_stat):
matches = gitignorefile.Cache()
self.assertTrue(matches("/home/vladimir/project/directory/subdirectory/file.txt"))
self.assertTrue(matches("/home/vladimir/project/directory/subdirectory/file2.txt"))
self.assertTrue(matches("/home/vladimir/project/directory/file.txt"))
self.assertTrue(matches("/home/vladimir/project/directory/file2.txt"))
self.assertFalse(matches("/home/vladimir/project/file.txt"))

self.assertEqual(statistics["open"], 2)
def mock_isdir(path):
statistics["isdir"] += 1
try:
return my_stat(path).isdir()
except FileNotFoundError:
return False

# On Windows and Python 3.7 `os.path.isdir()` does not use `os.stat`. See `Modules/getpath.c`.
self.assertIn(statistics["stat"], (6 * (2 + 1) + 5, 6 * (2 + 1)))
def mock_isfile(path):
statistics["isfile"] += 1
try:
return my_stat(path).isfile()
except FileNotFoundError:
return False

data = {
"/home/vladimir/project/directory/subdirectory/file.txt": True,
"/home/vladimir/project/directory/subdirectory/file2.txt": True,
"/home/vladimir/project/directory/subdirectory/subdirectory/file.txt": True,
"/home/vladimir/project/directory/subdirectory/subdirectory/file2.txt": True,
"/home/vladimir/project/directory/subdirectory/subdirectory/file3.txt": False,
"/home/vladimir/project/directory/file.txt": True,
"/home/vladimir/project/directory/file2.txt": True,
"/home/vladimir/project/file.txt": False,
"/home/vladimir/file.txt": False, # No rules and no `isdir` calls for this file.
}

for permutation in itertools.islice(itertools.permutations(data.items()), 0, None, 100):
statistics = {"open": 0, "isdir": 0, "isfile": 0}

with unittest.mock.patch("builtins.open", mock_open):
with unittest.mock.patch("os.path.isdir", mock_isdir):
with unittest.mock.patch("os.path.isfile", mock_isfile):
matches = gitignorefile.Cache()
for path, expected in permutation:
self.assertEqual(matches(path), expected)

self.assertEqual(statistics["open"], 2)
self.assertEqual(statistics["isdir"], len(data) - 1)
self.assertEqual(statistics["isfile"], 7) # Unique path fragments.

def test_wrong_symlink(self):
with tempfile.TemporaryDirectory() as d:
Expand Down
Loading