Skip to content

Commit

Permalink
Resolve duplicate file names (#1370)
Browse files Browse the repository at this point in the history
Co-authored-by: Håkon V. Treider <[email protected]>
  • Loading branch information
johanlrdahl and haakonvt authored Oct 4, 2023
1 parent 99ab6cf commit a938e33
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 38 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [6.29.0] - 2023-10-04
### Added
- Added parameter `resolve_duplicate_file_names` to `client.files.download`.
This will keep all the files when downloading to the local machine, even if they share the same name.

## [6.28.5] - 2023-10-03
### Fixed
- Bugfix for serialization of Workflows' `DynamicTasksParameters` during `workflows.versions.upsert` and `workflows.execution.retrieve_detailed`
Expand Down
88 changes: 62 additions & 26 deletions cognite/client/_api/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import copy
import os
import warnings
from collections import defaultdict
from io import BufferedReader
from pathlib import Path
from typing import (
Expand Down Expand Up @@ -611,19 +612,50 @@ def retrieve_download_urls(
results = tasks_summary.joined_results(unwrap_fn=lambda res: res.json()["items"])
return {result.get("id") or result["externalId"]: result["downloadUrl"] for result in results}

@staticmethod
def _create_unique_file_names(file_names_in: list[str] | list[Path]) -> list[str]:
"""
Create unique file names by appending a number to the base file name.
"""
file_names: list[str] = [str(file_name) for file_name in file_names_in]
unique_original = set(file_names)
unique_created = []
original_count: defaultdict = defaultdict(int)

for file_name in file_names:
if file_name not in unique_created:
unique_created.append(file_name)
continue

file_suffixes = Path(file_name).suffixes
file_postfix = "".join(file_suffixes)
file_base = file_name[: file_name.index(file_postfix) or None] # Awaiting .removesuffix in 3.9:

new_name = file_name
while (new_name in unique_created) or (new_name in unique_original):
original_count[file_name] += 1
new_name = f"{file_base}({original_count[file_name]}){file_postfix}"

unique_created.append(new_name)

return unique_created

def download(
self,
directory: str | Path,
id: int | Sequence[int] | None = None,
external_id: str | Sequence[str] | None = None,
keep_directory_structure: bool = False,
resolve_duplicate_file_names: bool = False,
) -> None:
"""`Download files by id or external id. <https://developer.cognite.com/api#tag/Files/operation/downloadLinks>`_
This method will stream all files to disk, never keeping more than 2MB in memory per worker.
The files will be stored in the provided directory using the file name retrieved from the file metadata in CDF.
You can also choose to keep the directory structure from CDF so that the files will be stored in subdirectories
matching the directory attribute on the files. When missing, the (root) directory is used.
By default, duplicate file names within the same local folder are resolved by keeping only one of the files.
You can choose to resolve this by appending a number to the file name using the resolve_duplicate_file_names argument.
Warning:
If you are downloading several files at once, be aware that file name collisions lead to all-but-one of
Expand All @@ -635,6 +667,7 @@ def download(
external_id (str | Sequence[str] | None): External ID or list of external ids.
keep_directory_structure (bool): Whether or not to keep the directory hierarchy in CDF,
creating subdirectories as needed below the given directory.
resolve_duplicate_file_names (bool): Whether or not to resolve duplicate file names by appending a number to the file name
Examples:
Expand All @@ -655,20 +688,30 @@ def download(
all_identifiers = IdentifierSequence.load(id, external_id).as_dicts()
id_to_metadata = self._get_id_to_metadata_map(all_identifiers)

all_ids, filepaths, directories = self._get_ids_filepaths_directories(
directory, id_to_metadata, keep_directory_structure
)

if keep_directory_structure:
all_ids, filepaths, file_directories = self._prepare_file_hierarchy(directory, id_to_metadata)
self._download_files_to_directory(file_directories, all_ids, id_to_metadata, filepaths)
else:
filepaths = [
directory / cast(str, metadata.name)
for identifier, metadata in id_to_metadata.items()
if isinstance(identifier, int)
]
self._download_files_to_directory(directory, all_identifiers, id_to_metadata, filepaths)
for file_folder in set(directories):
file_folder.mkdir(parents=True, exist_ok=True)

if resolve_duplicate_file_names:
filepaths_str = self._create_unique_file_names(filepaths)
filepaths = [Path(file_path) for file_path in filepaths_str]

self._download_files_to_directory(
directory=directory,
all_ids=all_ids,
id_to_metadata=id_to_metadata,
filepaths=filepaths,
)

@staticmethod
def _prepare_file_hierarchy(
directory: Path, id_to_metadata: dict[str | int, FileMetadata]
def _get_ids_filepaths_directories(
directory: Path,
id_to_metadata: dict[str | int, FileMetadata],
keep_directory_structure: bool = False,
) -> tuple[list[dict[str, str | int]], list[Path], list[Path]]:
# Note on type hint: Too much of the SDK is wrongly typed with 'dict[str, str | int]',
# instead of 'dict[str, str] | dict[str, int]', so we pretend dict-value type can also be str:
Expand All @@ -678,17 +721,14 @@ def _prepare_file_hierarchy(
if not isinstance(identifier, int):
continue
file_directory = directory
if metadata.directory:
if metadata.directory and keep_directory_structure:
# CDF enforces absolute, unix-style paths (i.e. always stating with '/'). We strip to make it relative:
file_directory /= metadata.directory[1:]

ids.append({"id": identifier})
file_directories.append(file_directory)
filepaths.append(file_directory / cast(str, metadata.name))

for file_folder in set(file_directories):
file_folder.mkdir(parents=True, exist_ok=True)

return ids, filepaths, file_directories

@staticmethod
Expand Down Expand Up @@ -719,16 +759,13 @@ def _get_id_to_metadata_map(self, all_ids: Sequence[dict]) -> dict[str | int, Fi

def _download_files_to_directory(
self,
directory: Path | Sequence[Path],
directory: Path,
all_ids: Sequence[dict[str, int | str]],
id_to_metadata: dict[str | int, FileMetadata],
filepaths: list[Path],
) -> None:
self._warn_on_duplicate_filenames(filepaths)
if isinstance(directory, Path):
tasks = [(directory, id, id_to_metadata) for id in all_ids]
else:
tasks = [(dir, id, id_to_metadata) for dir, id in zip(directory, all_ids)]
tasks = [(directory, id, id_to_metadata, filepath) for id, filepath in zip(all_ids, filepaths)]
tasks_summary = utils._concurrency.execute_tasks(
self._process_file_download, tasks, max_workers=self._config.max_workers
)
Expand All @@ -747,15 +784,14 @@ def _process_file_download(
directory: Path,
identifier: dict[str, int | str],
id_to_metadata: dict[str | int, FileMetadata],
file_path: Path,
) -> None:
id = IdentifierSequence.unwrap_identifier(identifier)
file_metadata = id_to_metadata[id]
file_path = (directory / cast(str, file_metadata.name)).resolve()
file_is_in_download_directory = directory.resolve() in file_path.parents
file_path_absolute = file_path.resolve()
file_is_in_download_directory = directory.resolve() in file_path_absolute.parents
if not file_is_in_download_directory:
raise RuntimeError(f"Resolved file path '{file_path}' is not inside download directory")
raise RuntimeError(f"Resolved file path '{file_path_absolute}' is not inside download directory")
download_link = self._get_download_link(identifier)
self._download_file_to_path(download_link, file_path)
self._download_file_to_path(download_link, file_path_absolute)

def _download_file_to_path(self, download_link: str, path: Path, chunk_size: int = 2**21) -> None:
with self._http_client_with_retry.request(
Expand Down
2 changes: 1 addition & 1 deletion cognite/client/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "6.28.5"
__version__ = "6.29.0"
__api_subversion__ = "V20220125"
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[tool.poetry]
name = "cognite-sdk"

version = "6.28.5"
version = "6.29.0"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
Expand Down
66 changes: 57 additions & 9 deletions tests/tests_unit/test_api/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
from io import BufferedReader
from pathlib import Path
from tempfile import TemporaryDirectory

import pytest
Expand Down Expand Up @@ -108,16 +109,16 @@ def mock_file_download_response(rsps, cognite_client):
rsps.POST,
cognite_client.files._get_base_url_with_base_path() + "/files/byids",
status=200,
json={"items": [{"id": 1, "name": "file1"}, {"externalId": "2", "name": "file2"}]},
json={"items": [{"id": 1, "name": "file1"}, {"id": 10, "externalId": "2", "name": "file2"}]},
)

def download_link_callback(request):
identifier = jsgz_load(request.body)["items"][0]
response = {}
if "id" in identifier:
if identifier.get("id") == 1:
response = {"items": [{"id": 1, "downloadUrl": "https://download.file1.here"}]}
elif "externalId" in identifier:
response = {"items": [{"externalId": "2", "downloadUrl": "https://download.file2.here"}]}
if identifier.get("id") == 10:
response = {"items": [{"id": 10, "externalId": "2", "downloadUrl": "https://download.file2.here"}]}
return 200, {}, json.dumps(response)

rsps.add_callback(
Expand All @@ -132,7 +133,7 @@ def download_link_callback(request):


@pytest.fixture
def mock_file_download_response_with_folder_structure(rsps, cognite_client):
def mock_file_download_response_with_folder_structure_same_name(rsps, cognite_client):
rsps.add(
rsps.POST,
cognite_client.files._get_base_url_with_base_path() + "/files/byids",
Expand Down Expand Up @@ -181,9 +182,9 @@ def mock_file_download_response_one_fails(rsps, cognite_client):

def download_link_callback(request):
identifier = jsgz_load(request.body)["items"][0]
if "id" in identifier:
if identifier.get("id") == 1:
return 200, {}, json.dumps({"items": [{"id": 1, "downloadUrl": "https://download.file1.here"}]})
elif "externalId" in identifier:
elif identifier.get("id") == 2:
return (400, {}, json.dumps({"error": {"message": "User error", "code": 400}}))

rsps.add_callback(
Expand Down Expand Up @@ -525,12 +526,59 @@ def test_download(self, cognite_client, mock_file_download_response):
with open(fp2, "rb") as fh:
assert b"content2" == fh.read()

@pytest.mark.parametrize(
"input_list,expected_output_list",
[
(["a.txt", "a.txt"], ["a.txt", "a(1).txt"]),
(["a.txt", "a.txt", "a(1).txt"], ["a.txt", "a(2).txt", "a(1).txt"]),
(["a.txt", "file", "a(1).txt", "a.txt", "file"], ["a.txt", "file", "a(1).txt", "a(2).txt", "file(1)"]),
(
[
str(Path("posixfolder/a.txt")),
str(Path("posixfolder/a.txt")),
str(Path(r"winfolder\a.txt")),
str(Path(r"winfolder\a.txt")),
],
[
str(Path("posixfolder/a.txt")),
str(Path("posixfolder/a(1).txt")),
str(Path(r"winfolder\a.txt")),
str(Path(r"winfolder\a(1).txt")),
],
),
(
[str(Path("folder/sub.folder/arch.tar.gz")), str(Path("folder/sub.folder/arch.tar.gz"))],
[str(Path("folder/sub.folder/arch.tar.gz")), str(Path("folder/sub.folder/arch(1).tar.gz"))],
),
],
)
def test_create_unique_file_names(self, cognite_client, input_list, expected_output_list):
assert cognite_client.files._create_unique_file_names(input_list) == expected_output_list

def test_download_with_duplicate_names(
self, tmp_path, cognite_client, mock_file_download_response_with_folder_structure_same_name
):
cognite_client.files.download(
directory=tmp_path,
id=[1],
external_id=["2"],
keep_directory_structure=False,
resolve_duplicate_file_names=True,
)
assert {"ignoreUnknownIds": False, "items": [{"id": 1}, {"externalId": "2"}]} == jsgz_load(
mock_file_download_response_with_folder_structure_same_name.calls[0].request.body
)
fp1 = tmp_path / "file_a"
fp2 = tmp_path / "file_a(1)"
assert fp1.is_file()
assert fp2.is_file()

def test_download_with_folder_structure(
self, tmp_path, cognite_client, mock_file_download_response_with_folder_structure
self, tmp_path, cognite_client, mock_file_download_response_with_folder_structure_same_name
):
cognite_client.files.download(directory=tmp_path, id=[1], external_id=["2"], keep_directory_structure=True)
assert {"ignoreUnknownIds": False, "items": [{"id": 1}, {"externalId": "2"}]} == jsgz_load(
mock_file_download_response_with_folder_structure.calls[0].request.body
mock_file_download_response_with_folder_structure_same_name.calls[0].request.body
)
fp1 = tmp_path / "rootdir/subdir/file_a"
fp2 = tmp_path / "file_a"
Expand Down

0 comments on commit a938e33

Please sign in to comment.