Skip to content

Commit

Permalink
Resolve duplicate file names (#1370)
Browse files Browse the repository at this point in the history
Co-authored-by: Håkon V. Treider <[email protected]>
  • Loading branch information
johanlrdahl and haakonvt authored Oct 4, 2023
1 parent 99ab6cf commit a938e33
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 38 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [6.29.0] - 2023-10-04
### Added
- Added parameter `resolve_duplicate_file_names` to `client.files.download`.
This will keep all the files when downloading to the local machine, even if they share the same name.

## [6.28.5] - 2023-10-03
### Fixed
- Bugfix for serialization of Workflows' `DynamicTasksParameters` during `workflows.versions.upsert` and `workflows.execution.retrieve_detailed`
Expand Down
88 changes: 62 additions & 26 deletions cognite/client/_api/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import copy
import os
import warnings
from collections import defaultdict
from io import BufferedReader
from pathlib import Path
from typing import (
Expand Down Expand Up @@ -611,19 +612,50 @@ def retrieve_download_urls(
results = tasks_summary.joined_results(unwrap_fn=lambda res: res.json()["items"])
return {result.get("id") or result["externalId"]: result["downloadUrl"] for result in results}

@staticmethod
def _create_unique_file_names(file_names_in: list[str] | list[Path]) -> list[str]:
"""
Create unique file names by appending a number to the base file name.
"""
file_names: list[str] = [str(file_name) for file_name in file_names_in]
unique_original = set(file_names)
unique_created = []
original_count: defaultdict = defaultdict(int)

for file_name in file_names:
if file_name not in unique_created:
unique_created.append(file_name)
continue

file_suffixes = Path(file_name).suffixes
file_postfix = "".join(file_suffixes)
file_base = file_name[: file_name.index(file_postfix) or None] # Awaiting .removesuffix in 3.9:

new_name = file_name
while (new_name in unique_created) or (new_name in unique_original):
original_count[file_name] += 1
new_name = f"{file_base}({original_count[file_name]}){file_postfix}"

unique_created.append(new_name)

return unique_created

def download(
self,
directory: str | Path,
id: int | Sequence[int] | None = None,
external_id: str | Sequence[str] | None = None,
keep_directory_structure: bool = False,
resolve_duplicate_file_names: bool = False,
) -> None:
"""`Download files by id or external id. <https://developer.cognite.com/api#tag/Files/operation/downloadLinks>`_
This method will stream all files to disk, never keeping more than 2MB in memory per worker.
The files will be stored in the provided directory using the file name retrieved from the file metadata in CDF.
You can also choose to keep the directory structure from CDF so that the files will be stored in subdirectories
matching the directory attribute on the files. When missing, the (root) directory is used.
By default, duplicate file names within the same local folder are resolved by keeping only one of the files.
You can choose to resolve this by appending a number to the file name using the resolve_duplicate_file_names argument.
Warning:
If you are downloading several files at once, be aware that file name collisions lead to all-but-one of
Expand All @@ -635,6 +667,7 @@ def download(
external_id (str | Sequence[str] | None): External ID or list of external ids.
keep_directory_structure (bool): Whether or not to keep the directory hierarchy in CDF,
creating subdirectories as needed below the given directory.
resolve_duplicate_file_names (bool): Whether or not to resolve duplicate file names by appending a number to the file name
Examples:
Expand All @@ -655,20 +688,30 @@ def download(
all_identifiers = IdentifierSequence.load(id, external_id).as_dicts()
id_to_metadata = self._get_id_to_metadata_map(all_identifiers)

all_ids, filepaths, directories = self._get_ids_filepaths_directories(
directory, id_to_metadata, keep_directory_structure
)

if keep_directory_structure:
all_ids, filepaths, file_directories = self._prepare_file_hierarchy(directory, id_to_metadata)
self._download_files_to_directory(file_directories, all_ids, id_to_metadata, filepaths)
else:
filepaths = [
directory / cast(str, metadata.name)
for identifier, metadata in id_to_metadata.items()
if isinstance(identifier, int)
]
self._download_files_to_directory(directory, all_identifiers, id_to_metadata, filepaths)
for file_folder in set(directories):
file_folder.mkdir(parents=True, exist_ok=True)

if resolve_duplicate_file_names:
filepaths_str = self._create_unique_file_names(filepaths)
filepaths = [Path(file_path) for file_path in filepaths_str]

self._download_files_to_directory(
directory=directory,
all_ids=all_ids,
id_to_metadata=id_to_metadata,
filepaths=filepaths,
)

@staticmethod
def _prepare_file_hierarchy(
directory: Path, id_to_metadata: dict[str | int, FileMetadata]
def _get_ids_filepaths_directories(
directory: Path,
id_to_metadata: dict[str | int, FileMetadata],
keep_directory_structure: bool = False,
) -> tuple[list[dict[str, str | int]], list[Path], list[Path]]:
# Note on type hint: Too much of the SDK is wrongly typed with 'dict[str, str | int]',
# instead of 'dict[str, str] | dict[str, int]', so we pretend dict-value type can also be str:
Expand All @@ -678,17 +721,14 @@ def _prepare_file_hierarchy(
if not isinstance(identifier, int):
continue
file_directory = directory
if metadata.directory:
if metadata.directory and keep_directory_structure:
# CDF enforces absolute, unix-style paths (i.e. always stating with '/'). We strip to make it relative:
file_directory /= metadata.directory[1:]

ids.append({"id": identifier})
file_directories.append(file_directory)
filepaths.append(file_directory / cast(str, metadata.name))

for file_folder in set(file_directories):
file_folder.mkdir(parents=True, exist_ok=True)

return ids, filepaths, file_directories

@staticmethod
Expand Down Expand Up @@ -719,16 +759,13 @@ def _get_id_to_metadata_map(self, all_ids: Sequence[dict]) -> dict[str | int, Fi

def _download_files_to_directory(
self,
directory: Path | Sequence[Path],
directory: Path,
all_ids: Sequence[dict[str, int | str]],
id_to_metadata: dict[str | int, FileMetadata],
filepaths: list[Path],
) -> None:
self._warn_on_duplicate_filenames(filepaths)
if isinstance(directory, Path):
tasks = [(directory, id, id_to_metadata) for id in all_ids]
else:
tasks = [(dir, id, id_to_metadata) for dir, id in zip(directory, all_ids)]
tasks = [(directory, id, id_to_metadata, filepath) for id, filepath in zip(all_ids, filepaths)]
tasks_summary = utils._concurrency.execute_tasks(
self._process_file_download, tasks, max_workers=self._config.max_workers
)
Expand All @@ -747,15 +784,14 @@ def _process_file_download(
directory: Path,
identifier: dict[str, int | str],
id_to_metadata: dict[str | int, FileMetadata],
file_path: Path,
) -> None:
id = IdentifierSequence.unwrap_identifier(identifier)
file_metadata = id_to_metadata[id]
file_path = (directory / cast(str, file_metadata.name)).resolve()
file_is_in_download_directory = directory.resolve() in file_path.parents
file_path_absolute = file_path.resolve()
file_is_in_download_directory = directory.resolve() in file_path_absolute.parents
if not file_is_in_download_directory:
raise RuntimeError(f"Resolved file path '{file_path}' is not inside download directory")
raise RuntimeError(f"Resolved file path '{file_path_absolute}' is not inside download directory")
download_link = self._get_download_link(identifier)
self._download_file_to_path(download_link, file_path)
self._download_file_to_path(download_link, file_path_absolute)

def _download_file_to_path(self, download_link: str, path: Path, chunk_size: int = 2**21) -> None:
with self._http_client_with_retry.request(
Expand Down
2 changes: 1 addition & 1 deletion cognite/client/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "6.28.5"
__version__ = "6.29.0"
__api_subversion__ = "V20220125"
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[tool.poetry]
name = "cognite-sdk"

version = "6.28.5"
version = "6.29.0"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
Expand Down
66 changes: 57 additions & 9 deletions tests/tests_unit/test_api/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
from io import BufferedReader
from pathlib import Path
from tempfile import TemporaryDirectory

import pytest
Expand Down Expand Up @@ -108,16 +109,16 @@ def mock_file_download_response(rsps, cognite_client):
rsps.POST,
cognite_client.files._get_base_url_with_base_path() + "/files/byids",
status=200,
json={"items": [{"id": 1, "name": "file1"}, {"externalId": "2", "name": "file2"}]},
json={"items": [{"id": 1, "name": "file1"}, {"id": 10, "externalId": "2", "name": "file2"}]},
)

def download_link_callback(request):
identifier = jsgz_load(request.body)["items"][0]
response = {}
if "id" in identifier:
if identifier.get("id") == 1:
response = {"items": [{"id": 1, "downloadUrl": "https://download.file1.here"}]}
elif "externalId" in identifier:
response = {"items": [{"externalId": "2", "downloadUrl": "https://download.file2.here"}]}
if identifier.get("id") == 10:
response = {"items": [{"id": 10, "externalId": "2", "downloadUrl": "https://download.file2.here"}]}
return 200, {}, json.dumps(response)

rsps.add_callback(
Expand All @@ -132,7 +133,7 @@ def download_link_callback(request):


@pytest.fixture
def mock_file_download_response_with_folder_structure(rsps, cognite_client):
def mock_file_download_response_with_folder_structure_same_name(rsps, cognite_client):
rsps.add(
rsps.POST,
cognite_client.files._get_base_url_with_base_path() + "/files/byids",
Expand Down Expand Up @@ -181,9 +182,9 @@ def mock_file_download_response_one_fails(rsps, cognite_client):

def download_link_callback(request):
identifier = jsgz_load(request.body)["items"][0]
if "id" in identifier:
if identifier.get("id") == 1:
return 200, {}, json.dumps({"items": [{"id": 1, "downloadUrl": "https://download.file1.here"}]})
elif "externalId" in identifier:
elif identifier.get("id") == 2:
return (400, {}, json.dumps({"error": {"message": "User error", "code": 400}}))

rsps.add_callback(
Expand Down Expand Up @@ -525,12 +526,59 @@ def test_download(self, cognite_client, mock_file_download_response):
with open(fp2, "rb") as fh:
assert b"content2" == fh.read()

@pytest.mark.parametrize(
"input_list,expected_output_list",
[
(["a.txt", "a.txt"], ["a.txt", "a(1).txt"]),
(["a.txt", "a.txt", "a(1).txt"], ["a.txt", "a(2).txt", "a(1).txt"]),
(["a.txt", "file", "a(1).txt", "a.txt", "file"], ["a.txt", "file", "a(1).txt", "a(2).txt", "file(1)"]),
(
[
str(Path("posixfolder/a.txt")),
str(Path("posixfolder/a.txt")),
str(Path(r"winfolder\a.txt")),
str(Path(r"winfolder\a.txt")),
],
[
str(Path("posixfolder/a.txt")),
str(Path("posixfolder/a(1).txt")),
str(Path(r"winfolder\a.txt")),
str(Path(r"winfolder\a(1).txt")),
],
),
(
[str(Path("folder/sub.folder/arch.tar.gz")), str(Path("folder/sub.folder/arch.tar.gz"))],
[str(Path("folder/sub.folder/arch.tar.gz")), str(Path("folder/sub.folder/arch(1).tar.gz"))],
),
],
)
def test_create_unique_file_names(self, cognite_client, input_list, expected_output_list):
assert cognite_client.files._create_unique_file_names(input_list) == expected_output_list

def test_download_with_duplicate_names(
self, tmp_path, cognite_client, mock_file_download_response_with_folder_structure_same_name
):
cognite_client.files.download(
directory=tmp_path,
id=[1],
external_id=["2"],
keep_directory_structure=False,
resolve_duplicate_file_names=True,
)
assert {"ignoreUnknownIds": False, "items": [{"id": 1}, {"externalId": "2"}]} == jsgz_load(
mock_file_download_response_with_folder_structure_same_name.calls[0].request.body
)
fp1 = tmp_path / "file_a"
fp2 = tmp_path / "file_a(1)"
assert fp1.is_file()
assert fp2.is_file()

def test_download_with_folder_structure(
self, tmp_path, cognite_client, mock_file_download_response_with_folder_structure
self, tmp_path, cognite_client, mock_file_download_response_with_folder_structure_same_name
):
cognite_client.files.download(directory=tmp_path, id=[1], external_id=["2"], keep_directory_structure=True)
assert {"ignoreUnknownIds": False, "items": [{"id": 1}, {"externalId": "2"}]} == jsgz_load(
mock_file_download_response_with_folder_structure.calls[0].request.body
mock_file_download_response_with_folder_structure_same_name.calls[0].request.body
)
fp1 = tmp_path / "rootdir/subdir/file_a"
fp2 = tmp_path / "file_a"
Expand Down

0 comments on commit a938e33

Please sign in to comment.