From dadbe27f69b8cc9e81f7b1e5b1bef2246a7bf8ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C5=A0oltis?= Date: Sat, 7 Dec 2024 09:41:00 +0100 Subject: [PATCH] yarn-v1: Use real names for packages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make the SBOM even more accurate, we should use the name that comes from the package respective package.json file either from the cached tarball or a directory. User could potentially, call these non-registry dependencies whatever they want and the name in the package.json could be completely differnet. Similar behavior is already implemented for yarn berry [1]. --- [1]: https://github.com/containerbuildsystem/cachi2/blob/main/cachi2/core/package_managers/yarn/resolver.py#L356 Signed-off-by: Michal Ĺ oltis --- .../package_managers/yarn_classic/main.py | 2 +- .../package_managers/yarn_classic/resolver.py | 82 ++++++++++++++++--- .../yarn_classic/test_main.py | 3 +- .../yarn_classic/test_resolver.py | 60 ++++++++++++-- 4 files changed, 127 insertions(+), 20 deletions(-) diff --git a/cachi2/core/package_managers/yarn_classic/main.py b/cachi2/core/package_managers/yarn_classic/main.py index 9cf39f334..e19b8f826 100644 --- a/cachi2/core/package_managers/yarn_classic/main.py +++ b/cachi2/core/package_managers/yarn_classic/main.py @@ -76,7 +76,7 @@ def _resolve_yarn_project(project: Project, output_dir: RootedPath) -> None: prefetch_env = _get_prefetch_environment_variables(output_dir) _verify_corepack_yarn_version(project.source_dir, prefetch_env) _fetch_dependencies(project.source_dir, prefetch_env) - packages = resolve_packages(project) + packages = resolve_packages(project, output_dir.join_within_root(MIRROR_DIR)) _verify_no_offline_mirror_collisions(packages) diff --git a/cachi2/core/package_managers/yarn_classic/resolver.py b/cachi2/core/package_managers/yarn_classic/resolver.py index 4d88d5871..c1af5acc3 100644 --- a/cachi2/core/package_managers/yarn_classic/resolver.py +++ b/cachi2/core/package_managers/yarn_classic/resolver.py @@ -1,8 +1,10 @@ +import json import re +import tarfile from abc import ABC, abstractmethod from dataclasses import dataclass from pathlib import Path -from typing import Optional, Union +from typing import Any, Iterable, Optional, Union from urllib.parse import urlparse from packageurl import PackageURL @@ -12,7 +14,11 @@ from cachi2.core.errors import PackageRejected, UnexpectedFormat from cachi2.core.package_managers.npm import NPM_REGISTRY_CNAMES from cachi2.core.package_managers.yarn_classic.project import PackageJson, Project, YarnLock -from cachi2.core.package_managers.yarn_classic.utils import find_runtime_deps +from cachi2.core.package_managers.yarn_classic.utils import ( + find_runtime_deps, + get_git_tarball_mirror_name, + get_tarball_mirror_name, +) from cachi2.core.package_managers.yarn_classic.workspaces import ( Workspace, extract_workspace_metadata, @@ -181,8 +187,11 @@ def purl(self) -> str: class _YarnClassicPackageFactory: - def __init__(self, source_dir: RootedPath, runtime_deps: set[str]) -> None: + def __init__( + self, source_dir: RootedPath, mirror_dir: RootedPath, runtime_deps: set[str] + ) -> None: self._source_dir = source_dir + self._mirror_dir = mirror_dir self._runtime_deps = runtime_deps def create_package_from_pyarn_package(self, package: PYarnPackage) -> YarnClassicPackage: @@ -200,6 +209,7 @@ def assert_package_has_relative_path(package: PYarnPackage) -> None: dev = package_id not in self._runtime_deps if _is_from_npm_registry(package.url): + # registry packages should already have the correct name return RegistryPackage( name=package.name, version=package.version, @@ -214,29 +224,47 @@ def assert_package_has_relative_path(package: PYarnPackage) -> None: path = self._source_dir.join_within_root(package.path) # File packages have a url, whereas link packages do not if package.url: + real_name = _read_name_from_tarball(path) + return FilePackage( - name=package.name, + name=real_name, version=package.version, path=path, integrity=package.checksum, dev=dev, ) + + package_json_path = path.join_within_root("package.json") + # package.json is not required for link packages + if package_json_path.path.exists(): + real_name = _read_name_from_package_json(package_json_path) + else: + real_name = package.name + return LinkPackage( - name=package.name, + name=real_name, version=package.version, path=path, dev=dev, ) elif _is_git_url(package.url): + tarball_name = get_git_tarball_mirror_name(package.url) + tarball_path = self._mirror_dir.join_within_root(tarball_name) + real_name = _read_name_from_tarball(tarball_path) + return GitPackage( - name=package.name, + name=real_name, version=package.version, url=package.url, dev=dev, ) elif _is_tarball_url(package.url): + tarball_name = get_tarball_mirror_name(package.url) + tarball_path = self._mirror_dir.join_within_root(tarball_name) + real_name = _read_name_from_tarball(tarball_path) + return UrlPackage( - name=package.name, + name=real_name, version=package.version, url=package.url, integrity=package.checksum, @@ -298,11 +326,11 @@ def _is_from_npm_registry(url: str) -> bool: def _get_packages_from_lockfile( - source_dir: RootedPath, yarn_lock: YarnLock, runtime_deps: set[str] + source_dir: RootedPath, mirror_dir: RootedPath, yarn_lock: YarnLock, runtime_deps: set[str] ) -> list[YarnClassicPackage]: """Return a list of Packages for all dependencies in yarn.lock.""" pyarn_packages: list[PYarnPackage] = yarn_lock.yarn_lockfile.packages() - package_factory = _YarnClassicPackageFactory(source_dir, runtime_deps) + package_factory = _YarnClassicPackageFactory(source_dir, mirror_dir, runtime_deps) return [ package_factory.create_package_from_pyarn_package(package) for package in pyarn_packages @@ -337,7 +365,7 @@ def _get_workspace_packages( ] -def resolve_packages(project: Project) -> list[YarnClassicPackage]: +def resolve_packages(project: Project, mirror_dir: RootedPath) -> Iterable[YarnClassicPackage]: """Return a list of Packages corresponding to all project dependencies.""" workspaces = extract_workspace_metadata(project.source_dir) yarn_lock = YarnLock.from_file(project.source_dir.join_within_root("yarn.lock")) @@ -347,5 +375,37 @@ def resolve_packages(project: Project) -> list[YarnClassicPackage]: result.append(_get_main_package(project.source_dir, project.package_json)) result.extend(_get_workspace_packages(project.source_dir, workspaces)) - result.extend(_get_packages_from_lockfile(project.source_dir, yarn_lock, runtime_deps)) + result.extend( + _get_packages_from_lockfile(project.source_dir, mirror_dir, yarn_lock, runtime_deps) + ) return result + + +def _read_name_from_tarball(tarball_path: RootedPath) -> str: + """Read the package name from the package.json file in the cached tarball.""" + with tarfile.open(tarball_path) as tar: + names = tar.getnames() + package_json_subpath = next((n for n in names if n.endswith("package.json")), None) + if package_json_subpath is None: + raise ValueError(f"No package.json found in the tarball {tarball_path}") + + package_json = tar.extractfile(package_json_subpath) + if package_json is None: + raise ValueError(f"Failed to extract package.json from {tarball_path}") + + package_json_content: dict[str, Any] = json.loads(package_json.read()) + + if (name := package_json_content.get("name")) is None: + raise ValueError(f"No 'name' field found in package.json in {tarball_path}") + + return name + + +def _read_name_from_package_json(path: RootedPath) -> str: + """Read the package name from a package.json file.""" + package_json = PackageJson.from_file(path) + + if (name := package_json.data.get("name")) is None: + raise ValueError(f"No 'name' field found in package.json in {path}") + + return name diff --git a/tests/unit/package_managers/yarn_classic/test_main.py b/tests/unit/package_managers/yarn_classic/test_main.py index edf54fdbb..030b99217 100644 --- a/tests/unit/package_managers/yarn_classic/test_main.py +++ b/tests/unit/package_managers/yarn_classic/test_main.py @@ -10,6 +10,7 @@ from cachi2.core.models.output import BuildConfig, EnvironmentVariable, RequestOutput from cachi2.core.models.sbom import Component from cachi2.core.package_managers.yarn_classic.main import ( + MIRROR_DIR, _fetch_dependencies, _generate_build_environment_variables, _get_prefetch_environment_variables, @@ -122,7 +123,7 @@ def test_resolve_yarn_project( mock_fetch_dependencies.assert_called_once_with( project.source_dir, mock_prefetch_env_vars.return_value ) - mock_resolve_packages.assert_called_once_with(project) + mock_resolve_packages.assert_called_once_with(project, output_dir.join_within_root(MIRROR_DIR)) @mock.patch("cachi2.core.package_managers.yarn_classic.main.run_yarn_cmd") diff --git a/tests/unit/package_managers/yarn_classic/test_resolver.py b/tests/unit/package_managers/yarn_classic/test_resolver.py index 39f6febd8..44be48ad4 100644 --- a/tests/unit/package_managers/yarn_classic/test_resolver.py +++ b/tests/unit/package_managers/yarn_classic/test_resolver.py @@ -1,4 +1,7 @@ +import io +import json import re +import tarfile from unittest import mock from urllib.parse import quote @@ -8,6 +11,7 @@ from cachi2.core.checksum import ChecksumInfo from cachi2.core.errors import PackageRejected, UnexpectedFormat +from cachi2.core.package_managers.yarn_classic.main import MIRROR_DIR from cachi2.core.package_managers.yarn_classic.project import PackageJson from cachi2.core.package_managers.yarn_classic.resolver import ( FilePackage, @@ -23,6 +27,7 @@ _is_from_npm_registry, _is_git_url, _is_tarball_url, + _read_name_from_tarball, _YarnClassicPackageFactory, resolve_packages, ) @@ -106,7 +111,11 @@ def test__is_from_npm_registry_can_parse_incorrect_registry_urls() -> None: assert not _is_from_npm_registry("https://example.org/fecha.tar.gz") -def test_create_package_from_pyarn_package(rooted_tmp_path: RootedPath) -> None: +@mock.patch("cachi2.core.package_managers.yarn_classic.resolver._read_name_from_tarball") +def test_create_package_from_pyarn_package( + mock_read_name_from_tarball: mock.Mock, + rooted_tmp_path: RootedPath, +) -> None: test_cases: list[tuple[PYarnPackage, YarnClassicPackage]] = [ ( PYarnPackage( @@ -179,13 +188,14 @@ def test_create_package_from_pyarn_package(rooted_tmp_path: RootedPath) -> None: ] for pyarn_package, expected_package in test_cases: + mock_read_name_from_tarball.return_value = expected_package.name runtime_deps = ( set() if expected_package.dev else set({f"{pyarn_package.name}@{pyarn_package.version}"}) ) - package_factory = _YarnClassicPackageFactory(rooted_tmp_path, runtime_deps) + package_factory = _YarnClassicPackageFactory(rooted_tmp_path, rooted_tmp_path, runtime_deps) assert package_factory.create_package_from_pyarn_package(pyarn_package) == expected_package @@ -200,7 +210,7 @@ def test_create_package_from_pyarn_package_fail_absolute_path(rooted_tmp_path: R f"({pyarn_package.path}), which is not permitted." ) - package_factory = _YarnClassicPackageFactory(rooted_tmp_path, set()) + package_factory = _YarnClassicPackageFactory(rooted_tmp_path, rooted_tmp_path, set()) with pytest.raises(PackageRejected, match=re.escape(error_msg)): package_factory.create_package_from_pyarn_package(pyarn_package) @@ -214,7 +224,7 @@ def test_create_package_from_pyarn_package_fail_path_outside_root( path="../path/outside/root", ) - package_factory = _YarnClassicPackageFactory(rooted_tmp_path, set()) + package_factory = _YarnClassicPackageFactory(rooted_tmp_path, rooted_tmp_path, set()) with pytest.raises(PathOutsideRoot): package_factory.create_package_from_pyarn_package(pyarn_package) @@ -228,7 +238,7 @@ def test_create_package_from_pyarn_package_fail_unexpected_format( url="ftp://some-tarball.tgz", ) - package_factory = _YarnClassicPackageFactory(rooted_tmp_path, set()) + package_factory = _YarnClassicPackageFactory(rooted_tmp_path, rooted_tmp_path, set()) with pytest.raises(UnexpectedFormat): package_factory.create_package_from_pyarn_package(pyarn_package) @@ -255,7 +265,7 @@ def test__get_packages_from_lockfile( mock.call(mock_pyarn_package_2), ] - output = _get_packages_from_lockfile(rooted_tmp_path, mock_yarn_lock, set()) + output = _get_packages_from_lockfile(rooted_tmp_path, rooted_tmp_path, mock_yarn_lock, set()) mock_pyarn_lockfile.packages.assert_called_once() mock_create_package.assert_has_calls(create_package_expected_calls) @@ -290,7 +300,7 @@ def test_resolve_packages( mock_get_lockfile_packages.return_value = lockfile_packages mock_get_workspace_packages.return_value = workspace_packages - output = resolve_packages(project) + output = resolve_packages(project, rooted_tmp_path.join_within_root(MIRROR_DIR)) mock_extract_workspaces.assert_called_once_with(rooted_tmp_path) mock_get_yarn_lock.assert_called_once_with(yarn_lock_path) mock_get_main_package.assert_called_once_with(project.source_dir, project.package_json) @@ -299,6 +309,7 @@ def test_resolve_packages( ) mock_get_lockfile_packages.assert_called_once_with( rooted_tmp_path, + rooted_tmp_path.join_within_root(MIRROR_DIR), mock_get_yarn_lock.return_value, find_runtime_deps.return_value, ) @@ -432,3 +443,38 @@ def test_package_purl(rooted_tmp_path_repo: RootedPath) -> None: for package, expected_purl in yarn_classic_packages: assert package.purl == expected_purl + + +def mock_tarball(path: RootedPath, package_json_content: dict[str, str]) -> RootedPath: + tarball_path = path.join_within_root("package.tar.gz") + + if not package_json_content: + with tarfile.open(tarball_path, mode="w:gz") as tar: + tar.addfile(tarfile.TarInfo(name="foo"), io.BytesIO()) + + return tarball_path + + with tarfile.open(tarball_path, mode="w:gz") as tar: + package_json_bytes = json.dumps(package_json_content).encode("utf-8") + info = tarfile.TarInfo(name="package.json") + info.size = len(package_json_bytes) + tar.addfile(info, io.BytesIO(package_json_bytes)) + + return tarball_path + + +def test_successful_name_extraction(rooted_tmp_path: RootedPath) -> None: + tarball_path = mock_tarball(path=rooted_tmp_path, package_json_content={"name": "foo"}) + assert _read_name_from_tarball(tarball_path) == "foo" + + +def test_no_package_json(rooted_tmp_path: RootedPath) -> None: + tarball_path = mock_tarball(path=rooted_tmp_path, package_json_content={}) + with pytest.raises(ValueError, match="No package.json found"): + _read_name_from_tarball(tarball_path) + + +def test_missing_name_field(rooted_tmp_path: RootedPath) -> None: + tarball_path = mock_tarball(path=rooted_tmp_path, package_json_content={"key": "foo"}) + with pytest.raises(ValueError, match="No 'name' field found"): + _read_name_from_tarball(tarball_path)