Skip to content

Commit

Permalink
fix: Make URI parsing more lenient
Browse files Browse the repository at this point in the history
The URI validation logic was too strict in some cases, causing valid
lakeFS ref expressions with relative (`~N`, `^N`) or HEAD (`@`) suffixes
to be rejected.

This commit refactors the relevant regexp to accept these URIs and also
makes the relative ref expression parsing more explicit in the regex.

Issue: #314
  • Loading branch information
AdrianoKF committed Jan 16, 2025
1 parent 6fb9701 commit 25ee73f
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 9 deletions.
17 changes: 9 additions & 8 deletions src/lakefs_spec/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ def md5_checksum(lpath: str | os.PathLike[str], blocksize: int = 2**22) -> str:
return file_hash.hexdigest()


_uri_parts = {
"protocol": r"^(?:lakefs://)?", # leading lakefs:// protocol (optional)
"repository": r"(?P<repository>[a-z0-9][a-z0-9\-]{2,62})/",
"ref expression": r"(?P<ref>\w[\w\-.]*(([~\^]\d*)*|@)?)/", # ref name with optional @, ~N, ^N suffixes
"resource": r"(?P<resource>.*)",
}


def parse(path: str) -> tuple[str, str, str]:
"""
Parses a lakeFS URI in the form ``lakefs://<repo>/<ref>/<resource>``.
Expand All @@ -118,16 +126,9 @@ def parse(path: str) -> tuple[str, str, str]:
If the path does not conform to the lakeFS URI format.
"""

uri_parts = {
"protocol": r"^(?:lakefs://)?", # leading lakefs:// protocol (optional)
"repository": r"(?P<repository>[a-z0-9][a-z0-9\-]{2,62})/",
"ref expression": r"(?P<ref>\w[\w\-.^~]*)/",
"resource": r"(?P<resource>.*)",
}

groups: dict[str, str] = {}
start = 0
for group, regex in uri_parts.items():
for group, regex in _uri_parts.items():
# we parse iteratively to improve the error message for the user if an invalid URI is given.
# by going front to back and parsing each part successively, we obtain the current path segment,
# and print it out to the user if it does not conform to our assumption of the lakeFS URI spec.
Expand Down
31 changes: 31 additions & 0 deletions tests/regression/test_gh_314.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from lakefs import Branch, Repository

from lakefs_spec.spec import LakeFSFileSystem


def test_gh_314(
fs: LakeFSFileSystem,
repository: Repository,
temp_branch: Branch,
) -> None:
"""
Regression test for GitHub issue 314: Enable `@` and `~N` syntax
https://github.com/aai-institute/lakefs-spec/issues/314
"""

prefix = f"lakefs://{repository.id}/{temp_branch.id}"
datapath = f"{prefix}/data.txt"

# add new file, and immediately commit.
fs.pipe(datapath, b"data1")
temp_branch.commit(message="Add data.txt")

fs.pipe(datapath, b"data2")
# Reading the committed version of the file should yield the correct data.
committed_head_path = f"{prefix}@/data.txt"
assert fs.read_text(committed_head_path) == "data1"

# Reading a relative commit should yield the correct data.
temp_branch.commit(message="Update data.txt")
relative_commit_path = f"{prefix}~1/data.txt"
assert fs.read_text(relative_commit_path) == "data1"
45 changes: 44 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re

import pytest

from lakefs_spec.util import _batched
from lakefs_spec.util import _batched, _uri_parts


def test_batched_empty_iterable():
Expand All @@ -26,3 +28,44 @@ def test_batched_batch_size_greater_than_iterable():
def test_batched_invalid_batch_size():
with pytest.raises(ValueError, match="n must be at least one"):
list(_batched([1, 2, 3], 0))


class TestLakeFSUriPartRegexes:
@pytest.mark.parametrize(
"repo_name, valid",
[
("my-repo", True),
("@@repo", False),
("", False),
("a", False),
("a" * 63, True),
("a" * 64, False),
],
)
def test_repository(self, repo_name: str, valid: bool) -> None:
result = re.match(_uri_parts["repository"], repo_name + "/")
if valid:
assert result is not None
else:
assert result is None

@pytest.mark.parametrize(
"refexp, valid",
[
("", False),
("main", True),
("main@", True),
("main~", True),
("main^", True),
("main^2", True),
("main^^^", True),
("main^1^1", True),
("main^1~1", True),
],
)
def test_ref_expression(self, refexp: str, valid: bool) -> None:
result = re.match(_uri_parts["ref expression"], refexp + "/")
if valid:
assert result is not None
else:
assert result is None

0 comments on commit 25ee73f

Please sign in to comment.