From 92c1139a56bdd0d9f2516bb099107177752514c1 Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Thu, 21 Nov 2024 15:24:45 +0100 Subject: [PATCH 1/8] Guard diskcache creation by file lock --- .pre-commit-config.yaml | 1 + constraints.txt | 8 ++++---- min-extra-requirements-test.txt | 1 + min-requirements-test.txt | 1 + pyproject.toml | 1 + requirements-dev.txt | 8 ++++---- src/gt4py/next/program_processors/runners/gtfn.py | 13 +++++++++++++ 7 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c3b6e693f..7e1870c67f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -102,6 +102,7 @@ repos: - devtools==0.12.2 - diskcache==5.6.3 - factory-boy==3.3.1 + - filelock==3.16.1 - frozendict==2.4.6 - gridtools-cpp==2.3.8 - importlib-resources==6.4.5 diff --git a/constraints.txt b/constraints.txt index b4b8bc00d4..f039fa2125 100644 --- a/constraints.txt +++ b/constraints.txt @@ -49,7 +49,7 @@ executing==2.1.0 # via devtools, stack-data factory-boy==3.3.1 # via gt4py (pyproject.toml), pytest-factoryboy faker==33.0.0 # via factory-boy fastjsonschema==2.20.0 # via nbformat -filelock==3.16.1 # via tox, virtualenv +filelock==3.16.1 # via gt4py (pyproject.toml), tox, virtualenv fonttools==4.55.0 # via matplotlib fparser==0.1.4 # via dace frozendict==2.4.6 # via gt4py (pyproject.toml) @@ -113,8 +113,8 @@ psutil==6.1.0 # via -r requirements-dev.in, ipykernel, pytest-xdist ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybind11==2.13.6 # via gt4py (pyproject.toml) -pydantic==2.9.2 # via bump-my-version, pydantic-settings -pydantic-core==2.23.4 # via pydantic +pydantic==2.10.0 # via bump-my-version, pydantic-settings +pydantic-core==2.27.0 # via pydantic pydantic-settings==2.6.1 # via bump-my-version pydot==3.0.2 # via tach pygments==2.18.0 # via -r requirements-dev.in, devtools, ipython, nbmake, rich, sphinx @@ -159,7 +159,7 @@ stack-data==0.6.3 # via ipython stdlib-list==0.10.0 # via tach sympy==1.13.3 # via dace tabulate==0.9.0 # via gt4py (pyproject.toml) -tach==0.14.3 # via -r requirements-dev.in +tach==0.14.4 # via -r requirements-dev.in tomli==2.1.0 ; python_version < "3.11" # via -r requirements-dev.in, black, build, coverage, jupytext, mypy, pip-tools, pyproject-api, pytest, setuptools-scm, tach, tox tomli-w==1.0.0 # via tach tomlkit==0.13.2 # via bump-my-version diff --git a/min-extra-requirements-test.txt b/min-extra-requirements-test.txt index 57c0d3969d..d7679a1f0f 100644 --- a/min-extra-requirements-test.txt +++ b/min-extra-requirements-test.txt @@ -67,6 +67,7 @@ deepdiff==5.6.0 devtools==0.6 diskcache==5.6.3 factory-boy==3.3.0 +filelock==3.0.0 frozendict==2.3 gridtools-cpp==2.3.8 hypothesis==6.0.0 diff --git a/min-requirements-test.txt b/min-requirements-test.txt index 81a1c2dea3..cf505e88d6 100644 --- a/min-requirements-test.txt +++ b/min-requirements-test.txt @@ -63,6 +63,7 @@ deepdiff==5.6.0 devtools==0.6 diskcache==5.6.3 factory-boy==3.3.0 +filelock==3.0.0 frozendict==2.3 gridtools-cpp==2.3.8 hypothesis==6.0.0 diff --git a/pyproject.toml b/pyproject.toml index 02d301957c..1e24094fa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ 'devtools>=0.6', 'diskcache>=5.6.3', 'factory-boy>=3.3.0', + 'filelock>=3.0.0', 'frozendict>=2.3', 'gridtools-cpp>=2.3.8,==2.*', "importlib-resources>=5.0;python_version<'3.9'", diff --git a/requirements-dev.txt b/requirements-dev.txt index 9f95779fd5..6542be36f1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -49,7 +49,7 @@ executing==2.1.0 # via -c constraints.txt, devtools, stack-data factory-boy==3.3.1 # via -c constraints.txt, gt4py (pyproject.toml), pytest-factoryboy faker==33.0.0 # via -c constraints.txt, factory-boy fastjsonschema==2.20.0 # via -c constraints.txt, nbformat -filelock==3.16.1 # via -c constraints.txt, tox, virtualenv +filelock==3.16.1 # via -c constraints.txt, gt4py (pyproject.toml), tox, virtualenv fonttools==4.55.0 # via -c constraints.txt, matplotlib fparser==0.1.4 # via -c constraints.txt, dace frozendict==2.4.6 # via -c constraints.txt, gt4py (pyproject.toml) @@ -113,8 +113,8 @@ psutil==6.1.0 # via -c constraints.txt, -r requirements-dev.in, ipyk ptyprocess==0.7.0 # via -c constraints.txt, pexpect pure-eval==0.2.3 # via -c constraints.txt, stack-data pybind11==2.13.6 # via -c constraints.txt, gt4py (pyproject.toml) -pydantic==2.9.2 # via -c constraints.txt, bump-my-version, pydantic-settings -pydantic-core==2.23.4 # via -c constraints.txt, pydantic +pydantic==2.10.0 # via -c constraints.txt, bump-my-version, pydantic-settings +pydantic-core==2.27.0 # via -c constraints.txt, pydantic pydantic-settings==2.6.1 # via -c constraints.txt, bump-my-version pydot==3.0.2 # via -c constraints.txt, tach pygments==2.18.0 # via -c constraints.txt, -r requirements-dev.in, devtools, ipython, nbmake, rich, sphinx @@ -158,7 +158,7 @@ stack-data==0.6.3 # via -c constraints.txt, ipython stdlib-list==0.10.0 # via -c constraints.txt, tach sympy==1.13.3 # via -c constraints.txt, dace tabulate==0.9.0 # via -c constraints.txt, gt4py (pyproject.toml) -tach==0.14.3 # via -c constraints.txt, -r requirements-dev.in +tach==0.14.4 # via -c constraints.txt, -r requirements-dev.in tomli==2.1.0 ; python_version < "3.11" # via -c constraints.txt, -r requirements-dev.in, black, build, coverage, jupytext, mypy, pip-tools, pyproject-api, pytest, setuptools-scm, tach, tox tomli-w==1.0.0 # via -c constraints.txt, tach tomlkit==0.13.2 # via -c constraints.txt, bump-my-version diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 965c6417b2..9f86240de9 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -7,11 +7,14 @@ # SPDX-License-Identifier: BSD-3-Clause import functools +import pathlib +import tempfile import warnings from typing import Any, Optional import diskcache import factory +import filelock import numpy.typing as npt import gt4py._core.definitions as core_defs @@ -146,6 +149,16 @@ class FileCache(diskcache.Cache): released when the instance is garbage collected. """ + def __init__(self, directory: Optional[str | pathlib.Path] = None, **settings: Any) -> None: + if directory: + lock_dir = pathlib.Path(directory).parent + else: + lock_dir = pathlib.Path(tempfile.gettempdir()) + + lock = filelock.FileLock(lock_dir / "file_cache.lock") + with lock: + super().__init__(directory=directory, **settings) + def __del__(self) -> None: self.close() From e6fab8a738e2015dd119a7a5fe6bc8915b2d604f Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Thu, 21 Nov 2024 15:39:22 +0100 Subject: [PATCH 2/8] Fix --- src/gt4py/next/program_processors/runners/gtfn.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 9f86240de9..475051302a 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -144,9 +144,12 @@ def fingerprint_compilable_program(inp: stages.CompilableProgram) -> str: class FileCache(diskcache.Cache): """ - This class extends `diskcache.Cache` to ensure the cache is closed upon deletion, - i.e. it ensures that any resources associated with the cache are properly - released when the instance is garbage collected. + This class extends `diskcache.Cache` to ensure the cache is properly + - opended on a distributed file system by using a file lock. This guards the creating of the + cache object, which has been reported to cause `sqlite3.OperationalError: database is locked` + errors and slow startup times when multiple processes access the cache. + - closed upon deletion, i.e. it ensures that any resources associated with the cache are + properly released when the instance is garbage collected. """ def __init__(self, directory: Optional[str | pathlib.Path] = None, **settings: Any) -> None: From f33abe3b82d3db65dbf59b5b19333426636f0bd1 Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Thu, 21 Nov 2024 15:42:31 +0100 Subject: [PATCH 3/8] Fix --- src/gt4py/next/program_processors/runners/gtfn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 475051302a..ce8db00891 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -145,7 +145,7 @@ def fingerprint_compilable_program(inp: stages.CompilableProgram) -> str: class FileCache(diskcache.Cache): """ This class extends `diskcache.Cache` to ensure the cache is properly - - opended on a distributed file system by using a file lock. This guards the creating of the + - opened on a distributed file system by using a file lock. This guards the creating of the cache object, which has been reported to cause `sqlite3.OperationalError: database is locked` errors and slow startup times when multiple processes access the cache. - closed upon deletion, i.e. it ensures that any resources associated with the cache are From 9d2e87b6fe99fb5405d4632c59a04a819898a57e Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Thu, 21 Nov 2024 15:42:52 +0100 Subject: [PATCH 4/8] Fix --- src/gt4py/next/program_processors/runners/gtfn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index ce8db00891..1611444aa2 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -147,7 +147,7 @@ class FileCache(diskcache.Cache): This class extends `diskcache.Cache` to ensure the cache is properly - opened on a distributed file system by using a file lock. This guards the creating of the cache object, which has been reported to cause `sqlite3.OperationalError: database is locked` - errors and slow startup times when multiple processes access the cache. + errors and slow startup times when multiple processes access the cache concurrently. - closed upon deletion, i.e. it ensures that any resources associated with the cache are properly released when the instance is garbage collected. """ From ad1e5eb0148ec86867dc112d79b82f6392cd2cf3 Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Wed, 27 Nov 2024 16:44:40 +0100 Subject: [PATCH 5/8] Small fix --- src/gt4py/next/program_processors/runners/gtfn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 1611444aa2..f688a5db91 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -162,8 +162,11 @@ def __init__(self, directory: Optional[str | pathlib.Path] = None, **settings: A with lock: super().__init__(directory=directory, **settings) + self._init_complete = True + def __del__(self) -> None: - self.close() + if getattr(self, "_init_complete", False): # skip if `__init__` didn't finished + self.close() class GTFNCompileWorkflowFactory(factory.Factory): From 9cde165a2fd22aa55d8eda60f7b47f366d3f8370 Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Mon, 2 Dec 2024 13:16:09 +0100 Subject: [PATCH 6/8] Improve docs --- src/gt4py/next/program_processors/runners/gtfn.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 0efb75bd48..2ed427d728 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -143,9 +143,12 @@ def fingerprint_compilable_program(inp: stages.CompilableProgram) -> str: class FileCache(diskcache.Cache): """ This class extends `diskcache.Cache` to ensure the cache is properly - - opened on a distributed file system by using a file lock. This guards the creating of the + - opened when accessed by multiple processes using a file lock. This guards the creating of the cache object, which has been reported to cause `sqlite3.OperationalError: database is locked` - errors and slow startup times when multiple processes access the cache concurrently. + errors and slow startup times when multiple processes access the cache concurrently. While + this issue occurred frequently and was observed to be fixed on distributed file systems, the + lock does not guarantee correct behavior in particular for the accesses to the cache (beyond + opening. See #1745 for more details. - closed upon deletion, i.e. it ensures that any resources associated with the cache are properly released when the instance is garbage collected. """ From f567a12c208ee1daad3d6236158c788573a5d9bc Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Mon, 2 Dec 2024 13:20:34 +0100 Subject: [PATCH 7/8] Improve docs --- src/gt4py/next/program_processors/runners/gtfn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 2ed427d728..41eebbeb01 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -145,10 +145,12 @@ class FileCache(diskcache.Cache): This class extends `diskcache.Cache` to ensure the cache is properly - opened when accessed by multiple processes using a file lock. This guards the creating of the cache object, which has been reported to cause `sqlite3.OperationalError: database is locked` - errors and slow startup times when multiple processes access the cache concurrently. While - this issue occurred frequently and was observed to be fixed on distributed file systems, the - lock does not guarantee correct behavior in particular for the accesses to the cache (beyond - opening. See #1745 for more details. + errors and slow startup times when multiple processes access the cache concurrently. While this + issue occurred frequently and was observed to be fixed on distributed file systems, the lock + does not guarantee correct behavior in particular for accesses to the cache (beyond opening) + since the underlying SQLite database is unreliable when stored on an NFS based file system. + It does however ensure correctness off concurrent cache accesses on a local file system. See + #1745 for more details. - closed upon deletion, i.e. it ensures that any resources associated with the cache are properly released when the instance is garbage collected. """ From 8e39bd5ebc0308fd9b4894c85090219657f9ad39 Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Mon, 2 Dec 2024 13:21:20 +0100 Subject: [PATCH 8/8] Improve docs --- src/gt4py/next/program_processors/runners/gtfn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 41eebbeb01..55f479c665 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -149,7 +149,7 @@ class FileCache(diskcache.Cache): issue occurred frequently and was observed to be fixed on distributed file systems, the lock does not guarantee correct behavior in particular for accesses to the cache (beyond opening) since the underlying SQLite database is unreliable when stored on an NFS based file system. - It does however ensure correctness off concurrent cache accesses on a local file system. See + It does however ensure correctness of concurrent cache accesses on a local file system. See #1745 for more details. - closed upon deletion, i.e. it ensures that any resources associated with the cache are properly released when the instance is garbage collected.