diff --git a/pipeline.yaml b/pipeline.yaml index d8cef59..9a73e56 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1568,8 +1568,8 @@ deploymentSpec: \ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\ \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ \ print(\"Failed to set precomputed skills data ratio: Permission\ - \ denied\")\n print(\"Attempting to override DataMixer class\ - \ to set the ratio\")\n import os\n import shutil\n\ + \ denied\")\n print(\"Attempting to move default data recipes\ + \ to temporary directory\")\n import os\n import shutil\n\ \ import tempfile\n\n import xdg_base_dirs\n\n \ \ # Create a temporary directory\n with tempfile.TemporaryDirectory()\ \ as temp_dir:\n # Create a default_data_recipes directory\n\ @@ -1586,19 +1586,22 @@ deploymentSpec: pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ - \ shutil.copytree(pipeline_path, temp_pipeline_dir)\n\ - \ break\n\n # Build new skills.yaml\ - \ path\n new_skills_recipe = path.join(temp_dir, \"skills.yaml\"\ - )\n print(f\"New skills recipe path: {new_skills_recipe}\"\ - )\n\n # Override XDG_DATA_DIRS with the temporary directory\n\ + \ shutil.copytree(\n pipeline_path,\n\ + \ temp_pipeline_dir,\n \ + \ dirs_exist_ok=True,\n )\n \ + \ break\n\n # Build new skills.yaml path\n \ + \ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \ + \ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\ + \ # Override XDG_DATA_DIRS with the temporary directory\n\ \ # This allows SDG to read the new skills.yaml since it's\ \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ ] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\ \ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\ \ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\ - \ )\n print(\"Successfully set precomputed\ - \ skills data ratio\")\n\n # generate_data has a magic\ + \ )\n print(\n \ + \ f\"Successfully set precomputed skills data ratio to {sdg_sampling_size}\"\ + \n )\n\n # generate_data has a magic\ \ word for its taxonomy_base argument - 'empty'\n # it\ \ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ instructlab.sdg.generate_data(\n \ diff --git a/sdg/components.py b/sdg/components.py index 941815b..40f0b4b 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -103,7 +103,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): ) except PermissionError: print("Failed to set precomputed skills data ratio: Permission denied") - print("Attempting to override DataMixer class to set the ratio") + print("Attempting to move default data recipes to temporary directory") import os import shutil import tempfile @@ -131,7 +131,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): - shutil.copytree(pipeline_path, temp_pipeline_dir) + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) break # Build new skills.yaml path @@ -148,7 +152,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): set_precomputed_skills_data_ratio( sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe ) - print("Successfully set precomputed skills data ratio") + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: diff --git a/standalone/standalone.py b/standalone/standalone.py index 7c1f41e..f85c476 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1198,7 +1198,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): ) except PermissionError: print("Failed to set precomputed skills data ratio: Permission denied") - print("Attempting to override DataMixer class to set the ratio") + print("Attempting to move default data recipes to temporary directory") import os import shutil import tempfile @@ -1226,7 +1226,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): - shutil.copytree(pipeline_path, temp_pipeline_dir) + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) break # Build new skills.yaml path @@ -1243,7 +1247,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): set_precomputed_skills_data_ratio( sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe ) - print("Successfully set precomputed skills data ratio") + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: