Added num_workers to export files in parallel #923

Open · wants to merge 9 commits into main

Changes from all commits
34 changes: 30 additions & 4 deletions src/datachain/lib/dc.py
@@ -23,6 +23,7 @@
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
+from tqdm import tqdm

 from datachain.dataset import DatasetRecord
 from datachain.func import literal
@@ -32,7 +33,14 @@
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import ArrowRow, File, FileType, get_file_type
+from datachain.lib.file import (
+    EXPORT_FILES_MAX_THREADS,
+    ArrowRow,
+    File,
+    FileExporter,
+    FileType,
+    get_file_type,
+)
 from datachain.lib.file import ExportPlacement as FileExportPlacement
 from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
@@ -2500,8 +2508,10 @@ def to_storage(
         placement: FileExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
+        num_workers: Optional[int] = EXPORT_FILES_MAX_THREADS,
     ) -> None:
-        """Export files from a specified signal to a directory.
+        """Export files from a specified signal to a directory. Files can be
+        exported to a local or cloud directory.

         Args:
             output: Path to the target directory for exporting files.
@@ -2511,6 +2521,8 @@ def to_storage(
             use_cache: If `True`, cache the files before exporting.
             link_type: Method to use for exporting files.
                 Falls back to `'copy'` if symlinking fails.
+            num_workers: Number of workers to use for exporting files.
+                Defaults to 5 workers.

         Example:
             Cross cloud transfer
@@ -2525,8 +2537,22 @@
         ):
             raise ValueError("Files with the same name found")

-        for file in self.collect(signal):
-            file.export(output, placement, use_cache, link_type=link_type)  # type: ignore[union-attr]
+        progress_bar = tqdm(
+            desc=f"Exporting files to {output}: ",
+            unit=" files",
+            unit_scale=True,
+            unit_divisor=10,
+            total=self.count(),
+            leave=False,
+        )
+        file_exporter = FileExporter(
+            output,
+            placement,
+            use_cache,
+            link_type,
+            max_threads=num_workers or 1,
+        )
+        file_exporter.run(self.collect(signal), progress_bar)

     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
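For context, here is a minimal usage sketch of the new `num_workers` parameter; the chain construction and bucket URIs are illustrative assumptions, not part of this diff:

```python
from datachain import DataChain

# Hypothetical source; any fsspec-supported URI (local path, s3://, gs://, ...)
# should work for both the source and the export destination.
chain = DataChain.from_storage("s3://source-bucket/images/")

# Export every file referenced by the chain, copying with 10 worker threads.
chain.to_storage(
    "gs://destination-bucket/images-copy/",
    placement="fullpath",
    use_cache=True,
    link_type="copy",
    num_workers=10,
)
```

Passing `num_workers=0` or `None` falls back to a single worker because of the `num_workers or 1` guard above.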
30 changes: 30 additions & 0 deletions src/datachain/lib/file.py
@@ -24,6 +24,7 @@
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
+from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
@@ -43,6 +44,35 @@
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]

 FileType = Literal["binary", "text", "image", "video"]
+EXPORT_FILES_MAX_THREADS = 5
+
+
+class FileExporter(NodesThreadPool):
+    """Class that exports files concurrently using a thread pool."""
+
+    def __init__(
+        self,
+        output: str,
+        placement: ExportPlacement,
+        use_cache: bool,
+        link_type: Literal["copy", "symlink"],
+        max_threads: int = EXPORT_FILES_MAX_THREADS,
+    ):
+        super().__init__(max_threads)
+        self.output = output
+        self.placement = placement
+        self.use_cache = use_cache
+        self.link_type = link_type
+
+    def done_task(self, done):
+        for task in done:
+            task.result()
+
+    def do_task(self, file):
+        file.export(
+            self.output, self.placement, self.use_cache, link_type=self.link_type
+        )
+        self.increase_counter(1)
+
+
 class VFileError(DataChainError):
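And a hedged sketch of driving `FileExporter` directly, mirroring what `to_storage()` now does internally; the source URI and the `"file"` signal name are assumptions for illustration:

```python
from tqdm import tqdm

from datachain import DataChain
from datachain.lib.file import EXPORT_FILES_MAX_THREADS, FileExporter

# Assumed source and signal name, standing in for the self.collect(signal)
# generator that to_storage() feeds into the exporter.
files = DataChain.from_storage("s3://source-bucket/images/").collect("file")

exporter = FileExporter(
    "/tmp/export",  # output directory
    "filename",     # placement strategy
    True,           # use_cache
    "copy",         # link_type
    max_threads=EXPORT_FILES_MAX_THREADS,
)

# run() submits one do_task() per file and drains futures via done_task(),
# so any export error is re-raised through task.result().
exporter.run(files, tqdm(desc="Exporting", unit=" files"))
```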
43 changes: 32 additions & 11 deletions src/datachain/nodes_thread_pool.py
@@ -57,44 +57,65 @@
         self._max_threads = max_threads
         self._thread_counter = 0
         self._thread_lock = threading.Lock()
Review comment (Member), on the `self._thread_lock` line: separate: this could probably be done without a lock if it is only about incrementing by one; locks impose some performance loss. There should be something like an atomic int for this.
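Python's standard library has no atomic integer, but for a pure +1 counter a lock-free option is `itertools.count`, whose `next()` call is a single atomic operation in CPython. A sketch of the reviewer's idea, not code from this PR:

```python
import itertools

# next() on a C-implemented iterator cannot be interleaved by other threads
# in CPython, so this increments without an explicit lock.
_counter = itertools.count(1)

def increase_counter() -> int:
    """Increment and return the new count."""
    return next(_counter)
```

The trade-off is that such a counter only moves forward by a fixed step and cannot be reset cheaply, which is sufficient for progress counting.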

+        self.tasks = set()
+        self.canceled = False
+        self.th_pool = None

     def run(
         self,
         chunk_gen,
         progress_bar=None,
     ):
         results = []
-        with concurrent.futures.ThreadPoolExecutor(self._max_threads) as th_pool:
-            tasks = set()
+        self.th_pool = concurrent.futures.ThreadPoolExecutor(self._max_threads)
+        try:
             self._thread_counter = 0
             for chunk in chunk_gen:
-                while len(tasks) >= self._max_threads:
+                if self.canceled:
+                    break
+
+                while len(self.tasks) >= self._max_threads:
                     done, _ = concurrent.futures.wait(
-                        tasks, timeout=1, return_when="FIRST_COMPLETED"
+                        self.tasks, timeout=1, return_when="FIRST_COMPLETED"
                     )
                     self.done_task(done)

-                    tasks = tasks - done
+                    self.tasks = self.tasks - done
                     self.update_progress_bar(progress_bar)

-                tasks.add(th_pool.submit(self.do_task, chunk))
+                self.tasks.add(self.th_pool.submit(self.do_task, chunk))
                 self.update_progress_bar(progress_bar)

-            while tasks:
+            while self.tasks:
+                if self.canceled:
+                    break
+
                 done, _ = concurrent.futures.wait(
-                    tasks, timeout=1, return_when="FIRST_COMPLETED"
+                    self.tasks, timeout=1, return_when="FIRST_COMPLETED"
                 )
                 task_results = self.done_task(done)
                 if task_results:
                     results.extend(task_results)

-                tasks = tasks - done
+                self.tasks = self.tasks - done
                 self.update_progress_bar(progress_bar)

-            th_pool.shutdown()
+        except:
+            self.cancel_all()
+            raise
+        else:
+            self.th_pool.shutdown()

         return results

+    def cancel_all(self):
+        self.canceled = True
+        # Canceling tasks just in case any of them is scheduled to run.
+        # Note that running tasks cannot be canceled; instead we wait for
+        # them to finish when shutting down the thread pool executor via
+        # its shutdown() method.
+        for task in self.tasks:
+            task.cancel()
+        if self.th_pool:
+            self.th_pool.shutdown()  # this will wait for running tasks to finish
Review comment (Member): so, what does this mean for cancellation? (potentially taking a long time?) @skshetry, how does cancellation for prefetch work? Is it more or less immediate, or with some finite timeout?

Reply (@skshetry, Feb 20, 2025): We schedule a (worker) task to run in a separate thread (not the main thread). When we receive KeyboardInterrupt or any exception on the main thread, we cancel the task, which raises asyncio.CancelledError inside the worker task. This gets raised "at the next opportunity", i.e. at some await point.

The cancellation should be nearly immediate for the purposes of this discussion. Technically, however, it depends on what is currently running in the asyncio event loop; a synchronous function, for example, will unfortunately block until completion.
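A minimal, self-contained sketch of the cancellation behavior described above (illustrative only, not DataChain's prefetch code):

```python
import asyncio

async def worker():
    try:
        while True:
            # task.cancel() raises asyncio.CancelledError here,
            # at the next await point the worker reaches.
            await asyncio.sleep(1)
    except asyncio.CancelledError:
        print("worker cancelled almost immediately")
        raise  # re-raise so the task is marked as cancelled

async def main():
    task = asyncio.create_task(worker())
    await asyncio.sleep(0.1)  # let the worker reach its await point
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass  # expected

asyncio.run(main())
```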


     def update_progress_bar(self, progress_bar):
         if progress_bar is not None:
             with self._thread_lock:
19 changes: 16 additions & 3 deletions tests/func/test_datachain.py
@@ -301,21 +301,34 @@ def test_read_file(cloud_test_catalog, use_cache):
 @pytest.mark.parametrize("use_map", [True, False])
 @pytest.mark.parametrize("use_cache", [True, False])
 @pytest.mark.parametrize("file_type", ["", "binary", "text"])
+@pytest.mark.parametrize("num_workers", [0, 2])
 @pytest.mark.parametrize("cloud_type", ["file"], indirect=True)
 def test_to_storage(
-    tmp_dir, cloud_test_catalog, test_session, placement, use_map, use_cache, file_type
+    tmp_dir,
+    cloud_test_catalog,
+    test_session,
+    placement,
+    use_map,
+    use_cache,
+    file_type,
+    num_workers,
 ):
     ctc = cloud_test_catalog
     df = DataChain.from_storage(ctc.src_uri, type=file_type, session=test_session)
     if use_map:
-        df.to_storage(tmp_dir / "output", placement=placement, use_cache=use_cache)
+        df.to_storage(
+            tmp_dir / "output",
+            placement=placement,
+            use_cache=use_cache,
+            num_workers=num_workers,
+        )
         df.map(
             res=lambda file: file.export(
                 tmp_dir / "output", placement=placement, use_cache=use_cache
             )
         ).exec()
     else:
-        df.to_storage(tmp_dir / "output", placement=placement)
+        df.to_storage(tmp_dir / "output", placement=placement, num_workers=num_workers)

     expected = {
         "description": "Cats and Dogs",