Skip to content

Commit

Permalink
set copytree dirs_exist_ok to True in sdg op
Browse files (browse the repository at this point in the history)
Signed-off-by: Michael Clifford <[email protected]>
  • Loading branch information
MichaelClifford committed Dec 5, 2024
1 parent b235539 commit 262fb94
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 15 deletions.
21 changes: 12 additions & 9 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1568,8 +1568,8 @@ deploymentSpec:
\ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\
\ skills_recipe=skills_recipe\n )\n except PermissionError:\n\
\ print(\"Failed to set precomputed skills data ratio: Permission\
\ denied\")\n print(\"Attempting to override DataMixer class\
\ to set the ratio\")\n import os\n import shutil\n\
\ denied\")\n print(\"Attempting to move default data recipes\
\ to temporary directory\")\n import os\n import shutil\n\
\ import tempfile\n\n import xdg_base_dirs\n\n \
\ # Create a temporary directory\n with tempfile.TemporaryDirectory()\
\ as temp_dir:\n # Create a default_data_recipes directory\n\
Expand All @@ -1586,19 +1586,22 @@ deploymentSpec:
pipeline\")\n os.mkdir(temp_pipeline_dir)\n \
\ for d in data_dirs:\n pipeline_path = os.path.join(d,\
\ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\
\ shutil.copytree(pipeline_path, temp_pipeline_dir)\n\
\ break\n\n # Build new skills.yaml\
\ path\n new_skills_recipe = path.join(temp_dir, \"skills.yaml\"\
)\n print(f\"New skills recipe path: {new_skills_recipe}\"\
)\n\n # Override XDG_DATA_DIRS with the temporary directory\n\
\ shutil.copytree(\n pipeline_path,\n\
\ temp_pipeline_dir,\n \
\ dirs_exist_ok=True,\n )\n \
\ break\n\n # Build new skills.yaml path\n \
\ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \
\ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\
\ # Override XDG_DATA_DIRS with the temporary directory\n\
\ # This allows SDG to read the new skills.yaml since it's\
\ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\
\ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\
] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\
\ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\
\ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\
\ )\n print(\"Successfully set precomputed\
\ skills data ratio\")\n\n # generate_data has a magic\
\ )\n print(\n \
\ f\"Successfully set precomputed skills data ratio to {sdg_sampling_size}\"\
\n )\n\n # generate_data has a magic\
\ word for its taxonomy_base argument - 'empty'\n # it\
\ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ instructlab.sdg.generate_data(\n \
Expand Down
12 changes: 9 additions & 3 deletions sdg/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
)
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to override DataMixer class to set the ratio")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile
Expand Down Expand Up @@ -131,7 +131,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
shutil.copytree(pipeline_path, temp_pipeline_dir)
shutil.copytree(
pipeline_path,
temp_pipeline_dir,
dirs_exist_ok=True,
)
break

# Build new skills.yaml path
Expand All @@ -148,7 +152,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe
)
print("Successfully set precomputed skills data ratio")
print(
f"Successfully set precomputed skills data ratio to {sdg_sampling_size}"
)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
Expand Down
12 changes: 9 additions & 3 deletions standalone/standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -1198,7 +1198,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
)
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to override DataMixer class to set the ratio")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile
Expand Down Expand Up @@ -1226,7 +1226,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
shutil.copytree(pipeline_path, temp_pipeline_dir)
shutil.copytree(
pipeline_path,
temp_pipeline_dir,
dirs_exist_ok=True,
)
break
# Build new skills.yaml path
Expand All @@ -1243,7 +1247,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe
)
print("Successfully set precomputed skills data ratio")
print(
f"Successfully set precomputed skills data ratio to {sdg_sampling_size}"
)
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
Expand Down

0 comments on commit 262fb94

Please sign in to comment.