diff --git a/eval/__init__.py b/eval/__init__.py new file mode 100644 index 00000000..4209bebd --- /dev/null +++ b/eval/__init__.py @@ -0,0 +1,4 @@ +from .final import run_final_eval_op +from .mt_bench import run_mt_bench_op + +__all__ = ["run_final_eval_op", "run_mt_bench_op"] diff --git a/eval/final/components.py b/eval/final.py similarity index 99% rename from eval/final/components.py rename to eval/final.py index 469d222b..e0567af2 100644 --- a/eval/final/components.py +++ b/eval/final.py @@ -1,5 +1,5 @@ # type: ignore -# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error +# pylint: disable=import-outside-toplevel,import-error from kfp.dsl import Artifact, Output, component diff --git a/eval/final/__init__.py b/eval/final/__init__.py deleted file mode 100644 index 62053fe8..00000000 --- a/eval/final/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .components import run_final_eval_op - -# from . import faked - -__all__ = ["run_final_eval_op"] diff --git a/eval/mt_bench/components.py b/eval/mt_bench.py similarity index 98% rename from eval/mt_bench/components.py rename to eval/mt_bench.py index 5b1800a2..e6191cbf 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench.py @@ -1,5 +1,6 @@ # type: ignore -# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error +# pylint: disable=import-outside-toplevel,import-error + from typing import NamedTuple, Optional from kfp.dsl import component diff --git a/eval/mt_bench/__init__.py b/eval/mt_bench/__init__.py deleted file mode 100644 index 0343b450..00000000 --- a/eval/mt_bench/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .components import run_mt_bench_op - -# from . import faked - -__all__ = ["run_mt_bench_op"] diff --git a/pipeline.py b/pipeline.py index 00d3df24..aeefc6c8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,8 +1,9 @@ # type: ignore -# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel,no-member + import os import typing -from typing import List, Literal, Optional +from typing import Optional import click from kfp import compiler, dsl @@ -16,11 +17,30 @@ use_secret_as_volume, ) +from eval import run_final_eval_op, run_mt_bench_op +from sdg import ( + git_clone_op, + sdg_op, + sdg_to_artifact_op, + taxonomy_to_artifact_op, +) +from training import ( + data_processing_op, + knowledge_processed_data_to_artifact_op, + pytorchjob_manifest_op, + skills_processed_data_to_artifact_op, +) +from utils import ( + ilab_importer_op, + model_to_pvc_op, + pvc_to_model_op, + pvc_to_mt_bench_op, +) + TEACHER_CONFIG_MAP = "teacher-server" TEACHER_SECRET = "teacher-server" JUDGE_CONFIG_MAP = "judge-server" JUDGE_SECRET = "judge-server" -MOCKED_STAGES = ["sdg", "train", "eval"] PIPELINE_FILE_NAME = "pipeline.yaml" IMPORTER_PIPELINE_FILE_NAME = "importer-pipeline.yaml" STANDALONE_TEMPLATE_FILE_NAME = "standalone.tpl" @@ -37,516 +57,443 @@ JUDGE_CA_CERT_PATH = "/tmp/cert" -def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): - """Wrapper for KFP pipeline, which allows for mocking individual stages.""" - - # Imports for SDG stage - if mock is not None and "sdg" in mock: - from sdg.faked import ( - git_clone_op, - sdg_op, - sdg_to_artifact_op, - taxonomy_to_artifact_op, - ) - else: - from sdg import ( - git_clone_op, - sdg_op, - sdg_to_artifact_op, - taxonomy_to_artifact_op, - ) - - # Imports for Training stage - if mock is not None and "train" in mock: - from training.faked import ( - data_processing_op, - knowledge_processed_data_to_artifact_op, - pytorchjob_manifest_op, - skills_processed_data_to_artifact_op, - ) - from utils.faked import ( - model_to_pvc_op, - pvc_to_model_op, - pvc_to_mt_bench_op, - ) - else: - from training import ( - data_processing_op, - knowledge_processed_data_to_artifact_op, - pytorchjob_manifest_op, - skills_processed_data_to_artifact_op, - ) - from utils import ( - model_to_pvc_op, - pvc_to_model_op, - pvc_to_mt_bench_op, - ) +@dsl.pipeline( + display_name="InstructLab", + name="instructlab", + description="InstructLab pipeline", +) +def ilab_pipeline( + # SDG phase + sdg_repo_url: str = "https://github.com/instructlab/taxonomy.git", + sdg_repo_branch: Optional[str] = None, + sdg_repo_pr: Optional[ + int + ] = None, # FIXME: https://issues.redhat.com/browse/RHOAIRFE-467 + sdg_base_model: str = "s3:///", + sdg_scale_factor: int = 30, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L125 + sdg_pipeline: str = "full", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L122 + sdg_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 + sdg_sample_size: float = 1.0, # FIXME: Not present in default config. Not configurable upstream at this point, capability added via https://github.com/instructlab/sdg/pull/432 + # Training phase + train_nproc_per_node: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. + train_nnodes: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. + train_num_epochs_phase_1: int = 7, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L364 + train_num_epochs_phase_2: int = 10, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L377 + train_effective_batch_size_phase_1: int = 128, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L357 + train_effective_batch_size_phase_2: int = 3840, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L371 + train_learning_rate_phase_1: float = 2e-05, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L360 + train_learning_rate_phase_2: float = 6e-06, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L374 + train_num_warmup_steps_phase_1: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 + train_num_warmup_steps_phase_2: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 + train_save_samples: int = 250000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L393 + train_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 + train_seed: int = 42, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L901 + # MT Bench + mt_bench_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 + mt_bench_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 + # Final evaluation + final_eval_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 + final_eval_few_shots: int = 5, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L56 + final_eval_batch_size: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L52 + final_eval_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 + # Other options + k8s_storage_class_name: str = "standard", # FIXME: https://github.com/kubeflow/pipelines/issues/11396, https://issues.redhat.com/browse/RHOAIRFE-470 +): + """InstructLab pipeline - # Imports for evaluation - from eval.final import run_final_eval_op - from eval.mt_bench import run_mt_bench_op - - @dsl.pipeline( - display_name="InstructLab", - name="instructlab", - description="InstructLab pipeline", - ) - def pipeline( - # SDG phase - sdg_repo_url: str = "https://github.com/instructlab/taxonomy.git", - sdg_repo_branch: Optional[str] = None, - sdg_repo_pr: Optional[ - int - ] = None, # FIXME: https://issues.redhat.com/browse/RHOAIRFE-467 - sdg_base_model: str = "s3:///", - sdg_scale_factor: int = 30, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L125 - sdg_pipeline: str = "full", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L122 - sdg_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 - sdg_sample_size: float = 1.0, # FIXME: Not present in default config. Not configurable upstream at this point, capability added via https://github.com/instructlab/sdg/pull/432 - # Training phase - train_nproc_per_node: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. - train_nnodes: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. - train_num_epochs_phase_1: int = 7, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L364 - train_num_epochs_phase_2: int = 10, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L377 - train_effective_batch_size_phase_1: int = 128, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L357 - train_effective_batch_size_phase_2: int = 3840, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L371 - train_learning_rate_phase_1: float = 2e-05, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L360 - train_learning_rate_phase_2: float = 6e-06, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L374 - train_num_warmup_steps_phase_1: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 - train_num_warmup_steps_phase_2: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 - train_save_samples: int = 250000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L393 - train_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 - train_seed: int = 42, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L901 - # MT Bench - mt_bench_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 - mt_bench_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 - # Final evaluation - final_eval_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 - final_eval_few_shots: int = 5, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L56 - final_eval_batch_size: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L52 - final_eval_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 - # Other options - k8s_storage_class_name: str = "standard", # FIXME: https://github.com/kubeflow/pipelines/issues/11396, https://issues.redhat.com/browse/RHOAIRFE-470 - ): - """InstructLab pipeline - - Args: - sdg_repo_url: SDG parameter. Points to a taxonomy git repository - sdg_repo_branch: SDG parameter. Points to a branch within the taxonomy git repository. If set, has priority over sdg_repo_pr - sdg_repo_pr: SDG parameter. Points to a pull request against the taxonomy git repository - sdg_base_model: SDG parameter. LLM model used to generate the synthetic dataset - sdg_scale_factor: SDG parameter. The total number of instructions to be generated. - sdg_pipeline: SDG parameter. Data generation pipeline to use. Available: 'simple', 'full', or a valid path to a directory of pipeline workflow YAML files. Note that 'full' requires a larger teacher model, Mixtral-8x7b. - sdg_max_batch_len: SDG parameter. Maximum tokens per gpu for each batch that will be handled in a single step. - sdg_sample_size: SDG parameter. Represents the sdg skills recipe sampling size as percentage in decimal form. - - train_nproc_per_node: Training parameter. Number of GPUs per each node/worker to use for training. - train_nnodes: Training parameter. Number of nodes/workers to train on. - train_num_epochs_phase_1: Training parameter for in Phase 1. Number of epochs to run training. - train_num_epochs_phase_2: Training parameter for in Phase 2. Number of epochs to run training. - train_effective_batch_size_phase_1: Training parameter for in Phase 1. The number of samples in a batch that the model should see before its parameters are updated. - train_effective_batch_size_phase_2: Training parameter for in Phase 2. The number of samples in a batch that the model should see before its parameters are updated. - train_learning_rate_phase_1: Training parameter for in Phase 1. How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. - train_learning_rate_phase_2: Training parameter for in Phase 2. How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. - train_num_warmup_steps_phase_1: Training parameter for in Phase 1. The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to train_learning_rate. - train_num_warmup_steps_phase_2: Training parameter for in Phase 2. The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to train_learning_rate. - train_save_samples: Training parameter. Number of samples the model should see before saving a checkpoint. - train_max_batch_len: Training parameter. Maximum tokens per gpu for each batch that will be handled in a single step. - train_seed: Training parameter. Random seed for initializing training. - - mt_bench_max_workers: MT Bench parameter. Number of workers to use for evaluation with mt_bench or mt_bench_branch. Must be a positive integer or 'auto'. - mt_bench_merge_system_user_message: MT Bench parameter. Boolean indicating whether to merge system and user messages (required for Mistral based judges) - - final_eval_max_workers: Final model evaluation parameter for MT Bench Branch. Number of workers to use for evaluation with mt_bench or mt_bench_branch. Must be a positive integer or 'auto'. - final_eval_few_shots: Final model evaluation parameter for MMLU. Number of question-answer pairs provided in the context preceding the question used for evaluation. - final_eval_batch_size: Final model evaluation parameter for MMLU. Batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory. - final_eval_merge_system_user_message: Final model evaluation parameter for MT Bench Branch. Boolean indicating whether to merge system and user messages (required for Mistral based judges) - - k8s_storage_class_name: A Kubernetes StorageClass name for persistent volumes. Selected StorageClass must support RWX PersistentVolumes. - """ - - # SDG stage - sdg_input_pvc_task = CreatePVC( - pvc_name_suffix="-sdg", - access_modes=["ReadWriteMany"], - size="10Gi", - storage_class_name=k8s_storage_class_name, - ) - git_clone_task = git_clone_op( - repo_branch=sdg_repo_branch, - repo_pr=sdg_repo_pr if sdg_repo_pr and sdg_repo_pr > 0 else None, - repo_url=sdg_repo_url, - ) - mount_pvc( - task=git_clone_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/data", - ) - git_clone_task.set_caching_options(False) - - sdg_task = sdg_op( - num_instructions_to_generate=sdg_scale_factor, - pipeline=sdg_pipeline, - repo_branch=sdg_repo_branch, - repo_pr=sdg_repo_pr, - sdg_sampling_size=sdg_sample_size, - ) - sdg_task.set_env_variable("HOME", "/tmp") - sdg_task.set_env_variable("HF_HOME", "/tmp") - use_config_map_as_env( - sdg_task, TEACHER_CONFIG_MAP, dict(endpoint="endpoint", model="model") - ) - use_secret_as_env(sdg_task, TEACHER_SECRET, {"api_key": "api_key"}) - use_config_map_as_volume( - sdg_task, TEACHER_CONFIG_MAP, mount_path=SDG_CA_CERT_PATH - ) - sdg_task.set_env_variable( - SDG_CA_CERT_ENV_VAR_NAME, os.path.join(SDG_CA_CERT_PATH, SDG_CA_CERT_CM_KEY) - ) + Args: + sdg_repo_url: SDG parameter. Points to a taxonomy git repository + sdg_repo_branch: SDG parameter. Points to a branch within the taxonomy git repository. If set, has priority over sdg_repo_pr + sdg_repo_pr: SDG parameter. Points to a pull request against the taxonomy git repository + sdg_base_model: SDG parameter. LLM model used to generate the synthetic dataset + sdg_scale_factor: SDG parameter. The total number of instructions to be generated. + sdg_pipeline: SDG parameter. Data generation pipeline to use. Available: 'simple', 'full', or a valid path to a directory of pipeline workflow YAML files. Note that 'full' requires a larger teacher model, Mixtral-8x7b. + sdg_max_batch_len: SDG parameter. Maximum tokens per gpu for each batch that will be handled in a single step. + sdg_sample_size: SDG parameter. Represents the sdg skills recipe sampling size as percentage in decimal form. + + train_nproc_per_node: Training parameter. Number of GPUs per each node/worker to use for training. + train_nnodes: Training parameter. Number of nodes/workers to train on. + train_num_epochs_phase_1: Training parameter for in Phase 1. Number of epochs to run training. + train_num_epochs_phase_2: Training parameter for in Phase 2. Number of epochs to run training. + train_effective_batch_size_phase_1: Training parameter for in Phase 1. The number of samples in a batch that the model should see before its parameters are updated. + train_effective_batch_size_phase_2: Training parameter for in Phase 2. The number of samples in a batch that the model should see before its parameters are updated. + train_learning_rate_phase_1: Training parameter for in Phase 1. How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. + train_learning_rate_phase_2: Training parameter for in Phase 2. How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. + train_num_warmup_steps_phase_1: Training parameter for in Phase 1. The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to train_learning_rate. + train_num_warmup_steps_phase_2: Training parameter for in Phase 2. The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to train_learning_rate. + train_save_samples: Training parameter. Number of samples the model should see before saving a checkpoint. + train_max_batch_len: Training parameter. Maximum tokens per gpu for each batch that will be handled in a single step. + train_seed: Training parameter. Random seed for initializing training. + + mt_bench_max_workers: MT Bench parameter. Number of workers to use for evaluation with mt_bench or mt_bench_branch. Must be a positive integer or 'auto'. + mt_bench_merge_system_user_message: MT Bench parameter. Boolean indicating whether to merge system and user messages (required for Mistral based judges) + + final_eval_max_workers: Final model evaluation parameter for MT Bench Branch. Number of workers to use for evaluation with mt_bench or mt_bench_branch. Must be a positive integer or 'auto'. + final_eval_few_shots: Final model evaluation parameter for MMLU. Number of question-answer pairs provided in the context preceding the question used for evaluation. + final_eval_batch_size: Final model evaluation parameter for MMLU. Batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory. + final_eval_merge_system_user_message: Final model evaluation parameter for MT Bench Branch. Boolean indicating whether to merge system and user messages (required for Mistral based judges) + + k8s_storage_class_name: A Kubernetes StorageClass name for persistent volumes. Selected StorageClass must support RWX PersistentVolumes. + """ - sdg_task.after(git_clone_task) - mount_pvc( - task=sdg_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/data", - ) - sdg_task.set_caching_options(False) - - # Upload "sdg" and "taxonomy" artifacts to S3 without blocking the rest of the workflow - taxonomy_to_artifact_task = taxonomy_to_artifact_op() - taxonomy_to_artifact_task.after(git_clone_task, sdg_task) - mount_pvc( - task=taxonomy_to_artifact_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/data", - ) - sdg_to_artifact_task = sdg_to_artifact_op() - sdg_to_artifact_task.after(git_clone_task, sdg_task) - mount_pvc( - task=sdg_to_artifact_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/data", - ) + # SDG stage + sdg_input_pvc_task = CreatePVC( + pvc_name_suffix="-sdg", + access_modes=["ReadWriteMany"], + size="10Gi", + storage_class_name=k8s_storage_class_name, + ) + git_clone_task = git_clone_op( + repo_branch=sdg_repo_branch, + repo_pr=sdg_repo_pr if sdg_repo_pr and sdg_repo_pr > 0 else None, + repo_url=sdg_repo_url, + ) + mount_pvc( + task=git_clone_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/data", + ) + git_clone_task.set_caching_options(False) + + sdg_task = sdg_op( + num_instructions_to_generate=sdg_scale_factor, + pipeline=sdg_pipeline, + repo_branch=sdg_repo_branch, + repo_pr=sdg_repo_pr, + sdg_sampling_size=sdg_sample_size, + ) + sdg_task.set_env_variable("HOME", "/tmp") + sdg_task.set_env_variable("HF_HOME", "/tmp") + use_config_map_as_env( + sdg_task, TEACHER_CONFIG_MAP, dict(endpoint="endpoint", model="model") + ) + use_secret_as_env(sdg_task, TEACHER_SECRET, {"api_key": "api_key"}) + use_config_map_as_volume(sdg_task, TEACHER_CONFIG_MAP, mount_path=SDG_CA_CERT_PATH) + sdg_task.set_env_variable( + SDG_CA_CERT_ENV_VAR_NAME, os.path.join(SDG_CA_CERT_PATH, SDG_CA_CERT_CM_KEY) + ) - # uncomment if updating image with same tag - # set_image_pull_policy(sdg_task, "Always") + sdg_task.after(git_clone_task) + mount_pvc( + task=sdg_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/data", + ) + sdg_task.set_caching_options(False) + + # Upload "sdg" and "taxonomy" artifacts to S3 without blocking the rest of the workflow + taxonomy_to_artifact_task = taxonomy_to_artifact_op() + taxonomy_to_artifact_task.after(git_clone_task, sdg_task) + mount_pvc( + task=taxonomy_to_artifact_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/data", + ) + sdg_to_artifact_task = sdg_to_artifact_op() + sdg_to_artifact_task.after(git_clone_task, sdg_task) + mount_pvc( + task=sdg_to_artifact_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/data", + ) - # Training stage - model_source_s3_task = dsl.importer( - artifact_uri=sdg_base_model, artifact_class=dsl.Model - ) + # uncomment if updating image with same tag + # set_image_pull_policy(sdg_task, "Always") - model_pvc_task = CreatePVC( - pvc_name_suffix="-model-cache", - access_modes=["ReadWriteMany"], - size="100Gi", - storage_class_name=k8s_storage_class_name, - ) + # Training stage + model_source_s3_task = dsl.importer( + artifact_uri=sdg_base_model, artifact_class=dsl.Model + ) - model_to_pvc_task = model_to_pvc_op(model=model_source_s3_task.output) - model_to_pvc_task.set_caching_options(False) - mount_pvc( - task=model_to_pvc_task, pvc_name=model_pvc_task.output, mount_path="/model" - ) + model_pvc_task = CreatePVC( + pvc_name_suffix="-model-cache", + access_modes=["ReadWriteMany"], + size="100Gi", + storage_class_name=k8s_storage_class_name, + ) - # Data processing - data_processing_task = data_processing_op(max_batch_len=sdg_max_batch_len) - mount_pvc( - task=data_processing_task, - pvc_name=model_pvc_task.output, - mount_path="/model", - ) - mount_pvc( - task=data_processing_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/data", - ) - data_processing_task.after(model_to_pvc_task, sdg_task) - data_processing_task.set_caching_options(False) - data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp") - - # Upload "skills_processed_data" and "knowledge_processed_data" artifacts to S3 without blocking the rest of the workflow - skills_processed_data_to_artifact_task = skills_processed_data_to_artifact_op() - skills_processed_data_to_artifact_task.after(data_processing_task) - mount_pvc( - task=skills_processed_data_to_artifact_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/data", - ) - skills_processed_data_to_artifact_task.set_caching_options(False) - knowledge_processed_data_to_artifact_task = ( - knowledge_processed_data_to_artifact_op() - ) - knowledge_processed_data_to_artifact_task.after(data_processing_task) - mount_pvc( - task=knowledge_processed_data_to_artifact_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/data", - ) - knowledge_processed_data_to_artifact_task.set_caching_options(False) + model_to_pvc_task = model_to_pvc_op(model=model_source_s3_task.output) + model_to_pvc_task.set_caching_options(False) + mount_pvc( + task=model_to_pvc_task, pvc_name=model_pvc_task.output, mount_path="/model" + ) - output_pvc_task = CreatePVC( - pvc_name_suffix="-output", - access_modes=["ReadWriteMany"], - size="100Gi", - storage_class_name=k8s_storage_class_name, - ) + # Data processing + data_processing_task = data_processing_op(max_batch_len=sdg_max_batch_len) + mount_pvc( + task=data_processing_task, + pvc_name=model_pvc_task.output, + mount_path="/model", + ) + mount_pvc( + task=data_processing_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/data", + ) + data_processing_task.after(model_to_pvc_task, sdg_task) + data_processing_task.set_caching_options(False) + data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp") + + # Upload "skills_processed_data" and "knowledge_processed_data" artifacts to S3 without blocking the rest of the workflow + skills_processed_data_to_artifact_task = skills_processed_data_to_artifact_op() + skills_processed_data_to_artifact_task.after(data_processing_task) + mount_pvc( + task=skills_processed_data_to_artifact_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/data", + ) + skills_processed_data_to_artifact_task.set_caching_options(False) + knowledge_processed_data_to_artifact_task = ( + knowledge_processed_data_to_artifact_op() + ) + knowledge_processed_data_to_artifact_task.after(data_processing_task) + mount_pvc( + task=knowledge_processed_data_to_artifact_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/data", + ) + knowledge_processed_data_to_artifact_task.set_caching_options(False) - # Training 1 - # Using pvc_create_task.output as PyTorchJob name since dsl.PIPELINE_* global variables do not template/work in KFP v2 - # https://github.com/kubeflow/pipelines/issues/10453 - training_phase_1 = pytorchjob_manifest_op( - model_pvc_name=model_pvc_task.output, - input_pvc_name=sdg_input_pvc_task.output, - name_suffix=sdg_input_pvc_task.output, - output_pvc_name=output_pvc_task.output, - phase_num=1, - nproc_per_node=train_nproc_per_node, - nnodes=train_nnodes, - num_epochs=train_num_epochs_phase_1, - effective_batch_size=train_effective_batch_size_phase_1, - learning_rate=train_learning_rate_phase_1, - num_warmup_steps=train_num_warmup_steps_phase_1, - save_samples=train_save_samples, - max_batch_len=train_max_batch_len, - seed=train_seed, - ) - training_phase_1.after(data_processing_task, model_to_pvc_task) - training_phase_1.set_caching_options(False) - - #### Train 2 - training_phase_2 = pytorchjob_manifest_op( - model_pvc_name=model_pvc_task.output, - input_pvc_name=sdg_input_pvc_task.output, - name_suffix=sdg_input_pvc_task.output, - output_pvc_name=output_pvc_task.output, - phase_num=2, - nproc_per_node=train_nproc_per_node, - nnodes=train_nnodes, - num_epochs=train_num_epochs_phase_2, - effective_batch_size=train_effective_batch_size_phase_2, - learning_rate=train_learning_rate_phase_2, - num_warmup_steps=train_num_warmup_steps_phase_2, - save_samples=train_save_samples, - max_batch_len=train_max_batch_len, - seed=train_seed, - ) + output_pvc_task = CreatePVC( + pvc_name_suffix="-output", + access_modes=["ReadWriteMany"], + size="100Gi", + storage_class_name=k8s_storage_class_name, + ) - training_phase_2.set_caching_options(False) - training_phase_2.after(training_phase_1) + # Training 1 + # Using pvc_create_task.output as PyTorchJob name since dsl.PIPELINE_* global variables do not template/work in KFP v2 + # https://github.com/kubeflow/pipelines/issues/10453 + training_phase_1 = pytorchjob_manifest_op( + model_pvc_name=model_pvc_task.output, + input_pvc_name=sdg_input_pvc_task.output, + name_suffix=sdg_input_pvc_task.output, + output_pvc_name=output_pvc_task.output, + phase_num=1, + nproc_per_node=train_nproc_per_node, + nnodes=train_nnodes, + num_epochs=train_num_epochs_phase_1, + effective_batch_size=train_effective_batch_size_phase_1, + learning_rate=train_learning_rate_phase_1, + num_warmup_steps=train_num_warmup_steps_phase_1, + save_samples=train_save_samples, + max_batch_len=train_max_batch_len, + seed=train_seed, + ) + training_phase_1.after(data_processing_task, model_to_pvc_task) + training_phase_1.set_caching_options(False) + + #### Train 2 + training_phase_2 = pytorchjob_manifest_op( + model_pvc_name=model_pvc_task.output, + input_pvc_name=sdg_input_pvc_task.output, + name_suffix=sdg_input_pvc_task.output, + output_pvc_name=output_pvc_task.output, + phase_num=2, + nproc_per_node=train_nproc_per_node, + nnodes=train_nnodes, + num_epochs=train_num_epochs_phase_2, + effective_batch_size=train_effective_batch_size_phase_2, + learning_rate=train_learning_rate_phase_2, + num_warmup_steps=train_num_warmup_steps_phase_2, + save_samples=train_save_samples, + max_batch_len=train_max_batch_len, + seed=train_seed, + ) - mount_pvc( - task=training_phase_2, - pvc_name=output_pvc_task.output, - mount_path="/output", - ) + training_phase_2.set_caching_options(False) + training_phase_2.after(training_phase_1) - # MT_Bench Evaluation of models + mount_pvc( + task=training_phase_2, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) - run_mt_bench_task = run_mt_bench_op( - models_folder="/output/phase_2/model/hf_format", - max_workers=mt_bench_max_workers, - merge_system_user_message=mt_bench_merge_system_user_message, - ) - mount_pvc( - task=run_mt_bench_task, - pvc_name=output_pvc_task.output, - mount_path="/output", - ) - run_mt_bench_task.set_env_variable("HOME", "/tmp") - run_mt_bench_task.set_env_variable("HF_HOME", "/tmp") - run_mt_bench_task.set_accelerator_type("nvidia.com/gpu") - run_mt_bench_task.set_accelerator_limit(1) - run_mt_bench_task.set_caching_options(False) - run_mt_bench_task.after(training_phase_2) - use_config_map_as_env( - run_mt_bench_task, - JUDGE_CONFIG_MAP, - dict(endpoint="JUDGE_ENDPOINT", model="JUDGE_NAME"), - ) - use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) + # MT_Bench Evaluation of models - use_config_map_as_volume( - run_mt_bench_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH - ) - run_mt_bench_task.set_env_variable( - JUDGE_CA_CERT_ENV_VAR_NAME, - os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), - ) + run_mt_bench_task = run_mt_bench_op( + models_folder="/output/phase_2/model/hf_format", + max_workers=mt_bench_max_workers, + merge_system_user_message=mt_bench_merge_system_user_message, + ) + mount_pvc( + task=run_mt_bench_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) + run_mt_bench_task.set_env_variable("HOME", "/tmp") + run_mt_bench_task.set_env_variable("HF_HOME", "/tmp") + run_mt_bench_task.set_accelerator_type("nvidia.com/gpu") + run_mt_bench_task.set_accelerator_limit(1) + run_mt_bench_task.set_caching_options(False) + run_mt_bench_task.after(training_phase_2) + use_config_map_as_env( + run_mt_bench_task, + JUDGE_CONFIG_MAP, + dict(endpoint="JUDGE_ENDPOINT", model="JUDGE_NAME"), + ) + use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) - # uncomment if updating image with same tag - # set_image_pull_policy(run_mt_bench_task, "Always") - - final_eval_task = run_final_eval_op( - candidate_model="/output/phase_2/model/hf_format/candidate_model", - # TODO: DO we need both candidate_branch and base_branch - base_branch=sdg_repo_branch, - candidate_branch=sdg_repo_branch, - base_model_dir="/model/", - max_workers=final_eval_max_workers, - merge_system_user_message=final_eval_merge_system_user_message, - few_shots=final_eval_few_shots, - batch_size=final_eval_batch_size, - ) - mount_pvc( - task=final_eval_task, pvc_name=output_pvc_task.output, mount_path="/output" - ) - mount_pvc( - task=final_eval_task, - pvc_name=sdg_input_pvc_task.output, - mount_path="/input", - ) - mount_pvc( - task=final_eval_task, - pvc_name=model_pvc_task.output, - mount_path="/model", - ) + use_config_map_as_volume( + run_mt_bench_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH + ) + run_mt_bench_task.set_env_variable( + JUDGE_CA_CERT_ENV_VAR_NAME, + os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), + ) - use_config_map_as_env( - final_eval_task, - JUDGE_CONFIG_MAP, - dict(endpoint="JUDGE_ENDPOINT", model="JUDGE_NAME"), - ) + # uncomment if updating image with same tag + # set_image_pull_policy(run_mt_bench_task, "Always") + + final_eval_task = run_final_eval_op( + candidate_model="/output/phase_2/model/hf_format/candidate_model", + # TODO: DO we need both candidate_branch and base_branch + base_branch=sdg_repo_branch, + candidate_branch=sdg_repo_branch, + base_model_dir="/model/", + max_workers=final_eval_max_workers, + merge_system_user_message=final_eval_merge_system_user_message, + few_shots=final_eval_few_shots, + batch_size=final_eval_batch_size, + ) + mount_pvc( + task=final_eval_task, pvc_name=output_pvc_task.output, mount_path="/output" + ) + mount_pvc( + task=final_eval_task, + pvc_name=sdg_input_pvc_task.output, + mount_path="/input", + ) + mount_pvc( + task=final_eval_task, + pvc_name=model_pvc_task.output, + mount_path="/model", + ) - final_eval_task.set_env_variable("HOME", "/tmp") - final_eval_task.set_env_variable("HF_HOME", "/tmp") + use_config_map_as_env( + final_eval_task, + JUDGE_CONFIG_MAP, + dict(endpoint="JUDGE_ENDPOINT", model="JUDGE_NAME"), + ) - # uncomment if updating image with same tag - # set_image_pull_policy(final_eval_task, "Always") + final_eval_task.set_env_variable("HOME", "/tmp") + final_eval_task.set_env_variable("HF_HOME", "/tmp") - use_secret_as_env(final_eval_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) + # uncomment if updating image with same tag + # set_image_pull_policy(final_eval_task, "Always") - use_config_map_as_volume( - final_eval_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH - ) - final_eval_task.set_env_variable( - JUDGE_CA_CERT_ENV_VAR_NAME, - os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), - ) + use_secret_as_env(final_eval_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) - final_eval_task.after(run_mt_bench_task) - final_eval_task.set_accelerator_type("nvidia.com/gpu") - final_eval_task.set_accelerator_limit(1) - final_eval_task.set_caching_options(False) + use_config_map_as_volume( + final_eval_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH + ) + final_eval_task.set_env_variable( + JUDGE_CA_CERT_ENV_VAR_NAME, + os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), + ) - output_model_task = pvc_to_model_op( - pvc_path="/output/phase_2/model/hf_format/candidate_model", - ) - output_model_task.after(run_mt_bench_task) - output_model_task.set_caching_options(False) - mount_pvc( - task=output_model_task, - pvc_name=output_pvc_task.output, - mount_path="/output", - ) + final_eval_task.after(run_mt_bench_task) + final_eval_task.set_accelerator_type("nvidia.com/gpu") + final_eval_task.set_accelerator_limit(1) + final_eval_task.set_caching_options(False) - output_mt_bench_task = pvc_to_mt_bench_op( - pvc_path="/output/mt_bench_data.json", - ) - output_mt_bench_task.after(run_mt_bench_task) - output_mt_bench_task.set_caching_options(False) + output_model_task = pvc_to_model_op( + pvc_path="/output/phase_2/model/hf_format/candidate_model", + ) + output_model_task.after(run_mt_bench_task) + output_model_task.set_caching_options(False) + mount_pvc( + task=output_model_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) - mount_pvc( - task=output_mt_bench_task, - pvc_name=output_pvc_task.output, - mount_path="/output", - ) + output_mt_bench_task = pvc_to_mt_bench_op( + pvc_path="/output/mt_bench_data.json", + ) + output_mt_bench_task.after(run_mt_bench_task) + output_mt_bench_task.set_caching_options(False) - output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output) - output_pvc_delete_task.after( - output_model_task, output_mt_bench_task, final_eval_task - ) + mount_pvc( + task=output_mt_bench_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) - sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output) - sdg_pvc_delete_task.after(final_eval_task) + output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output) + output_pvc_delete_task.after( + output_model_task, output_mt_bench_task, final_eval_task + ) - model_pvc_delete_task = DeletePVC(pvc_name=model_pvc_task.output) - model_pvc_delete_task.after(final_eval_task) + sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output) + sdg_pvc_delete_task.after(final_eval_task) - return + model_pvc_delete_task = DeletePVC(pvc_name=model_pvc_task.output) + model_pvc_delete_task.after(final_eval_task) - return pipeline + return -def import_base_model_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): - from utils import ilab_importer_op +@dsl.pipeline( + display_name="InstructLab - base model importer", + name="instructlab-base-importer", + description="Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model", +) +def import_base_model_pipeline( + # hf_token_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + # oci_pull_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + repository: str = "docker://registry.redhat.io/rhelai1/granite-7b-starter", + release: str = "latest", +): + """InstructLab - base model importer. - @dsl.pipeline( - display_name="InstructLab - base model importer", - name="instructlab-base-importer", - description="Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model", + Args: + repository: Hugging Face or OCI repository of the model to download. OCI repository must have a docker:// prefix + release: The revision of the model to download - e.g. a branch, tag, or commit hash for Hugging Face repositories and tag or commit hash for OCI repositories. + hf_token_secret: Name of existing Kubernetes secret which contains HF_TOKEN value for Hugging Face repositories. Mandatory for all repositories besides those which belong to the "instructlab" organization. + oci_pull_secret: Name of existing Kubernetes secret of .dockerconfigjson type for OCI repository authentication. + """ + importer_task = ilab_importer_op(repository=repository, release=release) + + # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + # FIXME: Make env variables optional once implemented upstream: https://github.com/kubeflow/pipelines/issues/11401 + # This pipeline is currently unusable outside of ocp-beta-test.nerc.mghpcc.org cluster, `ilab` namespace due to the hardcoded names... + use_secret_as_env(importer_task, "hugging-face-token", dict(HF_TOKEN="HF_TOKEN")) + importer_task.set_env_variable( + "REGISTRY_AUTH_FILE", "/mnt/containers/.dockerconfigjson" ) - def pipeline( - # hf_token_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 - # oci_pull_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 - repository: str = "docker://registry.redhat.io/rhelai1/granite-7b-starter", - release: str = "latest", - ): - """InstructLab - base model importer. - - Args: - repository: Hugging Face or OCI repository of the model to download. OCI repository must have a docker:// prefix - release: The revision of the model to download - e.g. a branch, tag, or commit hash for Hugging Face repositories and tag or commit hash for OCI repositories. - hf_token_secret: Name of existing Kubernetes secret which contains HF_TOKEN value for Hugging Face repositories. Mandatory for all repositories besides those which belong to the "instructlab" organization. - oci_pull_secret: Name of existing Kubernetes secret of .dockerconfigjson type for OCI repository authentication. - """ - importer_task = ilab_importer_op(repository=repository, release=release) - - # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 - # FIXME: Make env variables optional once implemented upstream: https://github.com/kubeflow/pipelines/issues/11401 - # This pipeline is currently unusable outside of ocp-beta-test.nerc.mghpcc.org cluster, `ilab` namespace due to the hardcoded names... - use_secret_as_env( - importer_task, "hugging-face-token", dict(HF_TOKEN="HF_TOKEN") - ) - importer_task.set_env_variable( - "REGISTRY_AUTH_FILE", "/mnt/containers/.dockerconfigjson" - ) - use_secret_as_volume( - importer_task, "7033380-ilab-pull-secret", mount_path="/mnt/containers" - ) - importer_task.set_env_variable("XDG_CACHE_HOME", "/tmp") - importer_task.set_env_variable("XDG_CONFIG_HOME", "/tmp") - importer_task.set_env_variable("XDG_DATA_HOME", "/tmp") - - return pipeline + use_secret_as_volume( + importer_task, "7033380-ilab-pull-secret", mount_path="/mnt/containers" + ) + importer_task.set_env_variable("XDG_CACHE_HOME", "/tmp") + importer_task.set_env_variable("XDG_CONFIG_HOME", "/tmp") + importer_task.set_env_variable("XDG_DATA_HOME", "/tmp") -@click.option( - "--mock", - type=click.Choice(MOCKED_STAGES, case_sensitive=False), - help="Mock part of the pipeline", - multiple=True, - default=[], -) @click.group(invoke_without_command=True) @click.pass_context -def cli(ctx: click.Context, mock): +def cli(ctx: click.Context): if ctx.invoked_subcommand is None: - generate_pipeline(mock) - + generate_pipeline() -def generate_pipeline(mock): - ilab_pipeline = ilab_pipeline_wrapper(mock) - import_base_model_pipeline = import_base_model_pipeline_wrapper(mock) +def generate_pipeline(): pipelines = [ (ilab_pipeline, PIPELINE_FILE_NAME), (import_base_model_pipeline, IMPORTER_PIPELINE_FILE_NAME), ] - with click.progressbar(pipelines, label="Generating pipeline") as bar: + with click.progressbar( + pipelines, + label="Generating pipeline", + item_show_func=lambda p: p[1] if p is not None else "", + ) as bar: for pipeline_func, pipeline_file in bar: compiler.Compiler().compile(pipeline_func, pipeline_file) @cli.command(name="run") -@click.option( - "--mock", - type=click.Choice(MOCKED_STAGES, case_sensitive=False), - help="Mock part of the pipeline", - multiple=True, - default=[], -) @click.option("-e", "--experiment", help="Set KFP experiment name.") @click.option("-r", "--run", "run_name", help="Set KFP run name.") @click.option( @@ -555,7 +502,7 @@ def generate_pipeline(mock): help="Override default parameters in KEY=VALUE format. Default parameters are suitable for dev cluster - the MOC cluster, `ilab` namespace.", multiple=True, ) -def run(mock, experiment, run_name, param): +def run(experiment, run_name, param): """ Run the pipeline immediately against current kubernetes context (cluster and namespace). @@ -586,7 +533,7 @@ def run(mock, experiment, run_name, param): arguments = {**dev_arguments, **parsed_params} client.create_run_from_pipeline_func( - pipeline_func=ilab_pipeline_wrapper(mock), + pipeline_func=ilab_pipeline, experiment_name=experiment, run_name=run_name, arguments=arguments, @@ -611,7 +558,7 @@ def gen_standalone(): click.echo("Generating pipeline YAML file...") try: - generate_pipeline(mock=None) + generate_pipeline() except ValueError as e: click.echo(f"Error: {e}", err=True) raise click.exceptions.Exit(1) diff --git a/pipeline.yaml b/pipeline.yaml index ef5d586e..fef55aed 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1528,24 +1528,24 @@ deploymentSpec: \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\ \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ - ,\n sdg_sampling_size: float = 1.0,\n):\n import os\n from os import\ - \ getenv, path\n\n import instructlab.sdg\n import openai\n import\ - \ yaml\n\n api_key = getenv(\"api_key\")\n model = getenv(\"model\"\ - )\n endpoint = getenv(\"endpoint\")\n\n sdg_ca_cert_path = getenv(\"\ - SDG_CA_CERT_PATH\")\n use_tls = os.path.exists(sdg_ca_cert_path) and\ - \ (\n os.path.getsize(sdg_ca_cert_path) > 0\n )\n if use_tls:\n\ - \ import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert_path)\n\ - \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ - \ http_client=custom_http_client\n )\n else:\n client =\ - \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ - \ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\ - \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ - \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ - \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ - \ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\ - \ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\ - \ magic word for its taxonomy_base argument - 'empty'\n # it allows\ - \ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + ,\n sdg_sampling_size: float = 1.0,\n):\n import os\n import shutil\n\ + \ import tempfile\n\n import instructlab.sdg\n import openai\n\ + \ import xdg_base_dirs\n import yaml\n\n api_key = os.getenv(\"\ + api_key\")\n model = os.getenv(\"model\")\n endpoint = os.getenv(\"\ + endpoint\")\n\n sdg_ca_cert_path = os.getenv(\"SDG_CA_CERT_PATH\")\n\ + \ use_tls = os.path.exists(sdg_ca_cert_path) and (\n os.path.getsize(sdg_ca_cert_path)\ + \ > 0\n )\n if use_tls:\n import httpx\n\n custom_http_client\ + \ = httpx.Client(verify=sdg_ca_cert_path)\n client = openai.OpenAI(\n\ + \ base_url=endpoint, api_key=api_key, http_client=custom_http_client\n\ + \ )\n else:\n client = openai.OpenAI(base_url=endpoint,\ + \ api_key=api_key)\n\n taxonomy_base = \"main\" if repo_branch or (repo_pr\ + \ and int(repo_pr) > 0) else \"empty\"\n\n print(\"Generating synthetic\ + \ dataset for:\")\n print()\n print(\n instructlab.sdg.utils.taxonomy.read_taxonomy(\n\ + \ taxonomy_path, taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\ + \n )\n )\n\n # Generate synthetic dataset\n # 1.0 is the\ + \ default size\n if sdg_sampling_size == 1.0:\n # generate_data\ + \ has a magic word for its taxonomy_base argument - 'empty'\n # it\ + \ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ instructlab.sdg.generate_data(\n client=client,\n \ \ num_instructions_to_generate=num_instructions_to_generate,\n\ \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ @@ -1554,9 +1554,9 @@ deploymentSpec: \ server_ctx_size=4096,\n )\n # Tweak precomputed skills\ \ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\ \n\n def set_precomputed_skills_data_ratio(sampling_size: float,\ - \ skills_recipe: str):\n if path.exists(skills_recipe):\n \ - \ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\ - \ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ + \ skills_recipe: str):\n if os.path.exists(skills_recipe):\n\ + \ with open(skills_recipe, \"r\", encoding=\"utf-8\") as\ + \ file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\ \n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\ \ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\ @@ -1564,30 +1564,29 @@ deploymentSpec: \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ \ print(\"Failed to set precomputed skills data ratio: Permission\ \ denied\")\n print(\"Attempting to move default data recipes\ - \ to temporary directory\")\n import os\n import shutil\n\ - \ import tempfile\n\n import xdg_base_dirs\n\n \ - \ # Create a temporary directory\n with tempfile.TemporaryDirectory()\ - \ as temp_dir:\n # Create a default_data_recipes directory\n\ - \ temp_dir = path.join(temp_dir, \"default_data_recipes\"\ - )\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\ - \ to the temporary directory\n shutil.copy(skills_recipe,\ - \ temp_dir)\n\n # Also copy the current pipeline directory\ - \ to the temporary directory - it's a small\n # directory\ - \ like 28KB\n # This isn't needed if the pipeline is either\ - \ \"full\" or \"simple\" but it's future-proofing\n data_dirs\ - \ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\ - )\n for dir in xdg_base_dirs.xdg_data_dirs()\n \ - \ ]\n temp_pipeline_dir = path.join(temp_dir, \"\ - pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ - \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ - \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ - \ shutil.copytree(\n pipeline_path,\n\ + \ to temporary directory\")\n\n # Create a temporary directory\n\ + \ with tempfile.TemporaryDirectory() as temp_dir:\n \ + \ # Create a default_data_recipes directory\n temp_dir\ + \ = os.path.join(temp_dir, \"default_data_recipes\")\n os.mkdir(temp_dir)\n\ + \n # Copy default_data_recipes/skills.yaml to the temporary\ + \ directory\n shutil.copy(skills_recipe, temp_dir)\n\n \ + \ # Also copy the current pipeline directory to the temporary\ + \ directory - it's a small\n # directory like 28KB\n \ + \ # This isn't needed if the pipeline is either \"full\" or \"\ + simple\" but it's future-proofing\n data_dirs = [\n \ + \ os.path.join(str(dir), \"instructlab\", \"sdg\")\n \ + \ for dir in xdg_base_dirs.xdg_data_dirs()\n \ + \ ]\n temp_pipeline_dir = os.path.join(temp_dir, \"pipeline\"\ + )\n os.mkdir(temp_pipeline_dir)\n for d in\ + \ data_dirs:\n pipeline_path = os.path.join(d, \"pipelines\"\ + , pipeline)\n if os.path.exists(pipeline_path):\n \ + \ shutil.copytree(\n pipeline_path,\n\ \ temp_pipeline_dir,\n \ \ dirs_exist_ok=True,\n )\n \ \ break\n\n # Build new skills.yaml path\n \ - \ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \ - \ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\ - \ # Override XDG_DATA_DIRS with the temporary directory\n\ + \ new_skills_recipe = os.path.join(temp_dir, \"skills.yaml\")\n\ + \ print(f\"New skills recipe path: {new_skills_recipe}\"\ + )\n\n # Override XDG_DATA_DIRS with the temporary directory\n\ \ # This allows SDG to read the new skills.yaml since it's\ \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ diff --git a/sdg/__init__.py b/sdg/__init__.py index 91a0022b..fee86e69 100644 --- a/sdg/__init__.py +++ b/sdg/__init__.py @@ -1,4 +1,3 @@ -from . import faked from .components import ( git_clone_op, sdg_op, @@ -11,5 +10,4 @@ "sdg_op", "taxonomy_to_artifact_op", "sdg_to_artifact_op", - "faked", ] diff --git a/sdg/components.py b/sdg/components.py index 3d891a1a..238f4611 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -1,5 +1,5 @@ # type: ignore -# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member +# pylint: disable=import-outside-toplevel,import-error from typing import Optional from kfp import dsl @@ -38,17 +38,19 @@ def sdg_op( sdg_sampling_size: float = 1.0, ): import os - from os import getenv, path + import shutil + import tempfile import instructlab.sdg import openai + import xdg_base_dirs import yaml - api_key = getenv("api_key") - model = getenv("model") - endpoint = getenv("endpoint") + api_key = os.getenv("api_key") + model = os.getenv("model") + endpoint = os.getenv("endpoint") - sdg_ca_cert_path = getenv("SDG_CA_CERT_PATH") + sdg_ca_cert_path = os.getenv("SDG_CA_CERT_PATH") use_tls = os.path.exists(sdg_ca_cert_path) and ( os.path.getsize(sdg_ca_cert_path) > 0 ) @@ -94,7 +96,7 @@ def sdg_op( skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): - if path.exists(skills_recipe): + if os.path.exists(skills_recipe): with open(skills_recipe, "r", encoding="utf-8") as file: skills_yaml = yaml.load(file, Loader=yaml.Loader) @@ -110,16 +112,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): except PermissionError: print("Failed to set precomputed skills data ratio: Permission denied") print("Attempting to move default data recipes to temporary directory") - import os - import shutil - import tempfile - - import xdg_base_dirs # Create a temporary directory with tempfile.TemporaryDirectory() as temp_dir: # Create a default_data_recipes directory - temp_dir = path.join(temp_dir, "default_data_recipes") + temp_dir = os.path.join(temp_dir, "default_data_recipes") os.mkdir(temp_dir) # Copy default_data_recipes/skills.yaml to the temporary directory @@ -132,7 +129,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): os.path.join(str(dir), "instructlab", "sdg") for dir in xdg_base_dirs.xdg_data_dirs() ] - temp_pipeline_dir = path.join(temp_dir, "pipeline") + temp_pipeline_dir = os.path.join(temp_dir, "pipeline") os.mkdir(temp_pipeline_dir) for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) @@ -145,7 +142,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): break # Build new skills.yaml path - new_skills_recipe = path.join(temp_dir, "skills.yaml") + new_skills_recipe = os.path.join(temp_dir, "skills.yaml") print(f"New skills recipe path: {new_skills_recipe}") # Override XDG_DATA_DIRS with the temporary directory diff --git a/sdg/faked/__init__.py b/sdg/faked/__init__.py deleted file mode 100644 index d2de33f4..00000000 --- a/sdg/faked/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .components import ( - git_clone_op, - sdg_op, - sdg_to_artifact_op, - taxonomy_to_artifact_op, -) - -__all__ = ["git_clone_op", "sdg_op", "taxonomy_to_artifact_op", "sdg_to_artifact_op"] diff --git a/sdg/faked/components.py b/sdg/faked/components.py deleted file mode 100644 index a9da3d1a..00000000 --- a/sdg/faked/components.py +++ /dev/null @@ -1,68 +0,0 @@ -# type: ignore -# pylint: disable=unused-argument -from typing import Optional - -from kfp import dsl - -from utils.consts import PYTHON_IMAGE, TOOLBOX_IMAGE - - -@dsl.container_component -def git_clone_op( - repo_branch: str, - repo_pr: Optional[int], - repo_url: Optional[str], - taxonomy_path: str = "/data/taxonomy", -): - return dsl.ContainerSpec( - "registry.access.redhat.com/ubi9/toolbox", - ["/bin/sh", "-c"], - [ - f"git clone {repo_url} {taxonomy_path} && cd {taxonomy_path} && " - + f'if [ -n "{repo_branch}" ]; then ' - + f"git fetch origin {repo_branch} && git checkout {repo_branch}; " - + f'elif [ -n "{repo_pr}" ] && [ {repo_pr} -gt 0 ]; then ' - + f"git fetch origin pull/{repo_pr}/head:{repo_pr} && git checkout {repo_pr}; fi " - ], - ) - - -@dsl.component( - base_image=PYTHON_IMAGE, - packages_to_install=[ - "git+https://github.com/redhat-et/ilab-on-ocp.git#subdirectory=sdg/faked/fixtures" - ], - install_kfp_package=False, -) -def sdg_op( - num_instructions_to_generate: int, - pipeline: str, - repo_branch: Optional[str], - repo_pr: Optional[int], - taxonomy_path: str = "/data/taxonomy", - sdg_path: str = "/data/sdg", -): - import shutil - import sys - from pathlib import Path - - shutil.copytree(Path(sys.prefix) / "sdg_fixtures", sdg_path, dirs_exist_ok=True) - return - - -@dsl.container_component -def taxonomy_to_artifact_op(taxonomy: dsl.Output[dsl.Dataset], pvc_path: str): - return dsl.ContainerSpec( - TOOLBOX_IMAGE, - ["/bin/sh", "-c"], - [f"cp -r {pvc_path} {taxonomy.path}"], - ) - - -@dsl.container_component -def sdg_to_artifact_op(sdg: dsl.Output[dsl.Dataset], pvc_path: str): - return dsl.ContainerSpec( - TOOLBOX_IMAGE, - ["/bin/sh", "-c"], - [f"cp -r {pvc_path} {sdg.path}"], - ) diff --git a/sdg/faked/fixtures/messages_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl b/sdg/faked/fixtures/messages_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl deleted file mode 100644 index 7b258575..00000000 --- a/sdg/faked/fixtures/messages_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl +++ /dev/null @@ -1,12 +0,0 @@ -{"messages": [{"content": "Could you suggest a more inclusive term for \"primary database\" in the context of database replication?", "role": "user"}, {"content": "'A more inclusive term for \"primary database\" in the context of database replication could be \"master database\" or \"leader database\".", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?", "role": "user"}, {"content": "'\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "How can I replace the term \"denylisting\" with more inclusive IT terminology?", "role": "user"}, {"content": "'\n'One possible replacement for \"denylisting\" is \"access control list management\" or \"blocklist management\", as these terms are more neutral and do not carry the negative connotations of a \"denylist\".'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "What is an alternative to the term \"blacklist\" that promotes more inclusive IT terminology?", "role": "user"}, {"content": "'\n'An alternative to the term \"blacklist\" that promotes more inclusive IT terminology is \"denylist\".'\n\nAdditionally, here's a pair based on the context you provided:\n\n'What should be done instead of compiling a blacklist?'\n'Instead of compiling a blacklist, consider creating a \"watchlist\" or \"greylist\" to maintain a more inclusive approach in IT.'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?", "role": "user"}, {"content": "'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "Could you suggest a more inclusive term for \"parent database\" and \"child database\" in the context of database replication?", "role": "user"}, {"content": "'\n'A more inclusive way to refer to \"parent database\" and \"child database\" in database replication could be \"primary database\" and \"secondary database\".'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "Could you suggest a more inclusive term for \"allowlists\" in this context?", "role": "user"}, {"content": "'\n'Yes, \"inclusion lists\" could be a more inclusive alternative to \"allowlists\" when discussing network security.'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "What term could be used instead of 'allowlists' to promote more inclusive IT language?", "role": "user"}, {"content": "'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "Could you suggest a more inclusive term for \"primary\" and \"secondary\" clocks in the context of a clock network?", "role": "user"}, {"content": "'\n'Yes, you could use \"master\" and \"slave\" clocks instead to promote inclusive IT language.'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "What term could be used instead of \"primary\" and \"secondary\" to promote more inclusive IT language in the context of clock networks?", "role": "user"}, {"content": "'\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language.", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "What alternative term could be used instead of \"blacklist\" for more inclusive IT language?", "role": "user"}, {"content": "'\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \"blacklist\" in IT terminology.'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} -{"messages": [{"content": "Could you suggest an alternative term to \"blacklist\" for more inclusive IT language?", "role": "user"}, {"content": "'\n'Instead of using \"blacklist,\" consider employing the term \"denylist\" or \"blocklist\" to enhance your IT vocabulary.'", "role": "assistant"}], "metadata": "{\"system\": \"I am, Red Hat\\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.\"}"} diff --git a/sdg/faked/fixtures/node_datasets_2024-09-09T15_30_29/compositional_skills_grounded_linguistics_inclusion.jsonl b/sdg/faked/fixtures/node_datasets_2024-09-09T15_30_29/compositional_skills_grounded_linguistics_inclusion.jsonl deleted file mode 100644 index 39e64d6f..00000000 --- a/sdg/faked/fixtures/node_datasets_2024-09-09T15_30_29/compositional_skills_grounded_linguistics_inclusion.jsonl +++ /dev/null @@ -1,12 +0,0 @@ -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\n","seed_question":"How would you rewrite this sentence to use more inclusive IT terminology?","seed_response":"In database replication, the primary database is regarded as the authoritative source, and the secondary databases are syncrhonized to it.\n","output":"Could you suggest a more inclusive term for \"primary database\" in the context of database replication?\n'A more inclusive term for \"primary database\" in the context of database replication could be \"master database\" or \"leader database\".","id":"5fb777f6-3599-4031-bfd3-0881e93ed139","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"primary database\" in the context of database replication?","role":"user"},{"content":"'A more inclusive term for \"primary database\" in the context of database replication could be \"master database\" or \"leader database\".","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\n","seed_question":"How would you rewrite this sentence to use more inclusive IT terminology?","seed_response":"In database replication, the primary database is regarded as the authoritative source, and the secondary databases are syncrhonized to it.\n","output":"What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?'\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'","id":"51fbe5fc-4b72-4468-b81a-efd416380593","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?","role":"user"},{"content":"'\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"Blacklisting is the action of a group or authority compiling a blacklist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\n","seed_question":"Show me how to use inclusive IT terminology to replace non-inclusive IT terminology.","seed_response":"Denylisting is the action of a group or authority compiling a denylist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\n","output":"How can I replace the term \"denylisting\" with more inclusive IT terminology?'\n'One possible replacement for \"denylisting\" is \"access control list management\" or \"blocklist management\", as these terms are more neutral and do not carry the negative connotations of a \"denylist\".'","id":"60659973-edd5-48d9-b834-22a5f7894b3f","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"How can I replace the term \"denylisting\" with more inclusive IT terminology?","role":"user"},{"content":"'\n'One possible replacement for \"denylisting\" is \"access control list management\" or \"blocklist management\", as these terms are more neutral and do not carry the negative connotations of a \"denylist\".'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"Blacklisting is the action of a group or authority compiling a blacklist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\n","seed_question":"Show me how to use inclusive IT terminology to replace non-inclusive IT terminology.","seed_response":"Denylisting is the action of a group or authority compiling a denylist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\n","output":"What is an alternative to the term \"blacklist\" that promotes more inclusive IT terminology?'\n'An alternative to the term \"blacklist\" that promotes more inclusive IT terminology is \"denylist\".'\n\nAdditionally, here's a pair based on the context you provided:\n\n'What should be done instead of compiling a blacklist?'\n'Instead of compiling a blacklist, consider creating a \"watchlist\" or \"greylist\" to maintain a more inclusive approach in IT.'","id":"cde3f15f-5083-4970-8e4e-30269e27264f","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What is an alternative to the term \"blacklist\" that promotes more inclusive IT terminology?","role":"user"},{"content":"'\n'An alternative to the term \"blacklist\" that promotes more inclusive IT terminology is \"denylist\".'\n\nAdditionally, here's a pair based on the context you provided:\n\n'What should be done instead of compiling a blacklist?'\n'Instead of compiling a blacklist, consider creating a \"watchlist\" or \"greylist\" to maintain a more inclusive approach in IT.'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\n","seed_question":"How would you revise this sentence to use Inclusive IT terminology?","seed_response":"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\n","output":"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'","id":"bf4f0a89-b261-4182-a6fb-fbd3ee06c5b9","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?","role":"user"},{"content":"'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\n","seed_question":"How would you revise this sentence to use Inclusive IT terminology?","seed_response":"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\n","output":"Could you suggest a more inclusive term for \"parent database\" and \"child database\" in the context of database replication?'\n'A more inclusive way to refer to \"parent database\" and \"child database\" in database replication could be \"primary database\" and \"secondary database\".'","id":"5698e313-6ac1-44e0-a417-f47000900e97","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"parent database\" and \"child database\" in the context of database replication?","role":"user"},{"content":"'\n'A more inclusive way to refer to \"parent database\" and \"child database\" in database replication could be \"primary database\" and \"secondary database\".'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\n","seed_question":"How would you revise this sentence to use inclusive IT terminology?","seed_response":"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\n","output":"Could you suggest a more inclusive term for \"allowlists\" in this context?'\n'Yes, \"inclusion lists\" could be a more inclusive alternative to \"allowlists\" when discussing network security.'","id":"61f68224-ba77-4469-a92a-a20d54e07061","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"allowlists\" in this context?","role":"user"},{"content":"'\n'Yes, \"inclusion lists\" could be a more inclusive alternative to \"allowlists\" when discussing network security.'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\n","seed_question":"How would you revise this sentence to use inclusive IT terminology?","seed_response":"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\n","output":"What term could be used instead of 'allowlists' to promote more inclusive IT language?'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.","id":"0bb768ab-624f-4d00-adac-592143765485","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of 'allowlists' to promote more inclusive IT language?","role":"user"},{"content":"'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"A master clock that provides time signals used to synchronize one or more slave clocks as a part of a clock network.\n","seed_question":"How would you improve this sentence to use inclusive IT language?","seed_response":"A primary clock that provides time signals used to synchronize one or more secondary clocks as a part of a clock network.\n","output":"Could you suggest a more inclusive term for \"primary\" and \"secondary\" clocks in the context of a clock network?'\n'Yes, you could use \"master\" and \"slave\" clocks instead to promote inclusive IT language.'","id":"ae282692-4f7a-4e7a-aef2-c347cdb392ff","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"primary\" and \"secondary\" clocks in the context of a clock network?","role":"user"},{"content":"'\n'Yes, you could use \"master\" and \"slave\" clocks instead to promote inclusive IT language.'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"A master clock that provides time signals used to synchronize one or more slave clocks as a part of a clock network.\n","seed_question":"How would you improve this sentence to use inclusive IT language?","seed_response":"A primary clock that provides time signals used to synchronize one or more secondary clocks as a part of a clock network.\n","output":"What term could be used instead of \"primary\" and \"secondary\" to promote more inclusive IT language in the context of clock networks?'\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language.","id":"41252055-a8f8-4e3d-b021-21519b5c7383","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of \"primary\" and \"secondary\" to promote more inclusive IT language in the context of clock networks?","role":"user"},{"content":"'\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language.","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\n","seed_question":"How would you adjust the word choice to use Inclusive IT terminology?","seed_response":"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\n","output":"What alternative term could be used instead of \"blacklist\" for more inclusive IT language?'\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \"blacklist\" in IT terminology.'","id":"96647b9a-3ac2-48f3-8aa6-1723d161d554","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What alternative term could be used instead of \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \"blacklist\" in IT terminology.'","role":"assistant"}]} -{"task_description":"This skill helps the writer use more inclusive IT terminology.","seed_context":"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\n","seed_question":"How would you adjust the word choice to use Inclusive IT terminology?","seed_response":"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\n","output":"Could you suggest an alternative term to \"blacklist\" for more inclusive IT language?'\n'Instead of using \"blacklist,\" consider employing the term \"denylist\" or \"blocklist\" to enhance your IT vocabulary.'","id":"48169209-e783-497f-9149-a2c32fe8af82","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest an alternative term to \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'Instead of using \"blacklist,\" consider employing the term \"denylist\" or \"blocklist\" to enhance your IT vocabulary.'","role":"assistant"}]} diff --git a/sdg/faked/fixtures/pyproject.toml b/sdg/faked/fixtures/pyproject.toml deleted file mode 100644 index 994bed9f..00000000 --- a/sdg/faked/fixtures/pyproject.toml +++ /dev/null @@ -1,11 +0,0 @@ -[project] -name = "sdg_fixture" -version = "0.0.1" - -[tool.setuptools.data-files] -"sdg_fixtures" = ["*.jsonl", "*.yaml"] -"sdg_fixtures/node_datasets_2024-09-09T15_30_29" = ["node_datasets_2024-09-09T15_30_29/*.jsonl"] - -[build-system] -requires = ["setuptools", "setuptools-scm"] -build-backend = "setuptools.build_meta" diff --git a/sdg/faked/fixtures/skills_recipe_2024-09-09T15_30_29.yaml b/sdg/faked/fixtures/skills_recipe_2024-09-09T15_30_29.yaml deleted file mode 100644 index 21b61fd5..00000000 --- a/sdg/faked/fixtures/skills_recipe_2024-09-09T15_30_29.yaml +++ /dev/null @@ -1,7 +0,0 @@ -datasets: -- path: node_datasets_2024-09-09T15_30_29/compositional_skills_grounded_linguistics_inclusion.jsonl - sampling_size: 30 -metadata: - sys_prompt: "I am, Red Hat\xAE Instruct Model based on Granite 7B, an AI language\ - \ model developed by Red Hat and IBM Research, based on the Granite-7b-base language\ - \ model. My primary function is to be a chat assistant." diff --git a/sdg/faked/fixtures/skills_train_msgs_2024-09-09T15_30_29.jsonl b/sdg/faked/fixtures/skills_train_msgs_2024-09-09T15_30_29.jsonl deleted file mode 100644 index ff2e2179..00000000 --- a/sdg/faked/fixtures/skills_train_msgs_2024-09-09T15_30_29.jsonl +++ /dev/null @@ -1,30 +0,0 @@ -{"id":"61f68224-ba77-4469-a92a-a20d54e07061","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"allowlists\" in this context?","role":"user"},{"content":"'\n'Yes, \"inclusion lists\" could be a more inclusive alternative to \"allowlists\" when discussing network security.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"Could you suggest a more inclusive term for \\\"allowlists\\\" in this context?'\\n'Yes, \\\"inclusion lists\\\" could be a more inclusive alternative to \\\"allowlists\\\" when discussing network security.'\"}"} -{"id":"cde3f15f-5083-4970-8e4e-30269e27264f","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What is an alternative to the term \"blacklist\" that promotes more inclusive IT terminology?","role":"user"},{"content":"'\n'An alternative to the term \"blacklist\" that promotes more inclusive IT terminology is \"denylist\".'\n\nAdditionally, here's a pair based on the context you provided:\n\n'What should be done instead of compiling a blacklist?'\n'Instead of compiling a blacklist, consider creating a \"watchlist\" or \"greylist\" to maintain a more inclusive approach in IT.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"Blacklisting is the action of a group or authority compiling a blacklist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"seed_question\": \"Show me how to use inclusive IT terminology to replace non-inclusive IT terminology.\", \"seed_response\": \"Denylisting is the action of a group or authority compiling a denylist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"output\": \"What is an alternative to the term \\\"blacklist\\\" that promotes more inclusive IT terminology?'\\n'An alternative to the term \\\"blacklist\\\" that promotes more inclusive IT terminology is \\\"denylist\\\".'\\n\\nAdditionally, here's a pair based on the context you provided:\\n\\n'What should be done instead of compiling a blacklist?'\\n'Instead of compiling a blacklist, consider creating a \\\"watchlist\\\" or \\\"greylist\\\" to maintain a more inclusive approach in IT.'\"}"} -{"id":"96647b9a-3ac2-48f3-8aa6-1723d161d554","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What alternative term could be used instead of \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \"blacklist\" in IT terminology.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\\n\", \"seed_question\": \"How would you adjust the word choice to use Inclusive IT terminology?\", \"seed_response\": \"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\\n\", \"output\": \"What alternative term could be used instead of \\\"blacklist\\\" for more inclusive IT language?'\\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \\\"blacklist\\\" in IT terminology.'\"}"} -{"id":"0bb768ab-624f-4d00-adac-592143765485","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of 'allowlists' to promote more inclusive IT language?","role":"user"},{"content":"'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"What term could be used instead of 'allowlists' to promote more inclusive IT language?'\\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.\"}"} -{"id":"bf4f0a89-b261-4182-a6fb-fbd3ee06c5b9","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?","role":"user"},{"content":"'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you revise this sentence to use Inclusive IT terminology?\", \"seed_response\": \"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\\n\", \"output\": \"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?'\\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'\"}"} -{"id":"61f68224-ba77-4469-a92a-a20d54e07061","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"allowlists\" in this context?","role":"user"},{"content":"'\n'Yes, \"inclusion lists\" could be a more inclusive alternative to \"allowlists\" when discussing network security.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"Could you suggest a more inclusive term for \\\"allowlists\\\" in this context?'\\n'Yes, \\\"inclusion lists\\\" could be a more inclusive alternative to \\\"allowlists\\\" when discussing network security.'\"}"} -{"id":"41252055-a8f8-4e3d-b021-21519b5c7383","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of \"primary\" and \"secondary\" to promote more inclusive IT language in the context of clock networks?","role":"user"},{"content":"'\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language.","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A master clock that provides time signals used to synchronize one or more slave clocks as a part of a clock network.\\n\", \"seed_question\": \"How would you improve this sentence to use inclusive IT language?\", \"seed_response\": \"A primary clock that provides time signals used to synchronize one or more secondary clocks as a part of a clock network.\\n\", \"output\": \"What term could be used instead of \\\"primary\\\" and \\\"secondary\\\" to promote more inclusive IT language in the context of clock networks?'\\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language.\"}"} -{"id":"60659973-edd5-48d9-b834-22a5f7894b3f","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"How can I replace the term \"denylisting\" with more inclusive IT terminology?","role":"user"},{"content":"'\n'One possible replacement for \"denylisting\" is \"access control list management\" or \"blocklist management\", as these terms are more neutral and do not carry the negative connotations of a \"denylist\".'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"Blacklisting is the action of a group or authority compiling a blacklist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"seed_question\": \"Show me how to use inclusive IT terminology to replace non-inclusive IT terminology.\", \"seed_response\": \"Denylisting is the action of a group or authority compiling a denylist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"output\": \"How can I replace the term \\\"denylisting\\\" with more inclusive IT terminology?'\\n'One possible replacement for \\\"denylisting\\\" is \\\"access control list management\\\" or \\\"blocklist management\\\", as these terms are more neutral and do not carry the negative connotations of a \\\"denylist\\\".'\"}"} -{"id":"61f68224-ba77-4469-a92a-a20d54e07061","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"allowlists\" in this context?","role":"user"},{"content":"'\n'Yes, \"inclusion lists\" could be a more inclusive alternative to \"allowlists\" when discussing network security.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"Could you suggest a more inclusive term for \\\"allowlists\\\" in this context?'\\n'Yes, \\\"inclusion lists\\\" could be a more inclusive alternative to \\\"allowlists\\\" when discussing network security.'\"}"} -{"id":"96647b9a-3ac2-48f3-8aa6-1723d161d554","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What alternative term could be used instead of \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \"blacklist\" in IT terminology.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\\n\", \"seed_question\": \"How would you adjust the word choice to use Inclusive IT terminology?\", \"seed_response\": \"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\\n\", \"output\": \"What alternative term could be used instead of \\\"blacklist\\\" for more inclusive IT language?'\\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \\\"blacklist\\\" in IT terminology.'\"}"} -{"id":"96647b9a-3ac2-48f3-8aa6-1723d161d554","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What alternative term could be used instead of \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \"blacklist\" in IT terminology.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\\n\", \"seed_question\": \"How would you adjust the word choice to use Inclusive IT terminology?\", \"seed_response\": \"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\\n\", \"output\": \"What alternative term could be used instead of \\\"blacklist\\\" for more inclusive IT language?'\\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \\\"blacklist\\\" in IT terminology.'\"}"} -{"id":"0bb768ab-624f-4d00-adac-592143765485","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of 'allowlists' to promote more inclusive IT language?","role":"user"},{"content":"'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"What term could be used instead of 'allowlists' to promote more inclusive IT language?'\\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.\"}"} -{"id":"bf4f0a89-b261-4182-a6fb-fbd3ee06c5b9","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?","role":"user"},{"content":"'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you revise this sentence to use Inclusive IT terminology?\", \"seed_response\": \"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\\n\", \"output\": \"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?'\\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'\"}"} -{"id":"cde3f15f-5083-4970-8e4e-30269e27264f","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What is an alternative to the term \"blacklist\" that promotes more inclusive IT terminology?","role":"user"},{"content":"'\n'An alternative to the term \"blacklist\" that promotes more inclusive IT terminology is \"denylist\".'\n\nAdditionally, here's a pair based on the context you provided:\n\n'What should be done instead of compiling a blacklist?'\n'Instead of compiling a blacklist, consider creating a \"watchlist\" or \"greylist\" to maintain a more inclusive approach in IT.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"Blacklisting is the action of a group or authority compiling a blacklist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"seed_question\": \"Show me how to use inclusive IT terminology to replace non-inclusive IT terminology.\", \"seed_response\": \"Denylisting is the action of a group or authority compiling a denylist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"output\": \"What is an alternative to the term \\\"blacklist\\\" that promotes more inclusive IT terminology?'\\n'An alternative to the term \\\"blacklist\\\" that promotes more inclusive IT terminology is \\\"denylist\\\".'\\n\\nAdditionally, here's a pair based on the context you provided:\\n\\n'What should be done instead of compiling a blacklist?'\\n'Instead of compiling a blacklist, consider creating a \\\"watchlist\\\" or \\\"greylist\\\" to maintain a more inclusive approach in IT.'\"}"} -{"id":"0bb768ab-624f-4d00-adac-592143765485","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of 'allowlists' to promote more inclusive IT language?","role":"user"},{"content":"'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"What term could be used instead of 'allowlists' to promote more inclusive IT language?'\\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.\"}"} -{"id":"0bb768ab-624f-4d00-adac-592143765485","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of 'allowlists' to promote more inclusive IT language?","role":"user"},{"content":"'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"What term could be used instead of 'allowlists' to promote more inclusive IT language?'\\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.\"}"} -{"id":"60659973-edd5-48d9-b834-22a5f7894b3f","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"How can I replace the term \"denylisting\" with more inclusive IT terminology?","role":"user"},{"content":"'\n'One possible replacement for \"denylisting\" is \"access control list management\" or \"blocklist management\", as these terms are more neutral and do not carry the negative connotations of a \"denylist\".'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"Blacklisting is the action of a group or authority compiling a blacklist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"seed_question\": \"Show me how to use inclusive IT terminology to replace non-inclusive IT terminology.\", \"seed_response\": \"Denylisting is the action of a group or authority compiling a denylist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.\\n\", \"output\": \"How can I replace the term \\\"denylisting\\\" with more inclusive IT terminology?'\\n'One possible replacement for \\\"denylisting\\\" is \\\"access control list management\\\" or \\\"blocklist management\\\", as these terms are more neutral and do not carry the negative connotations of a \\\"denylist\\\".'\"}"} -{"id":"5698e313-6ac1-44e0-a417-f47000900e97","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"parent database\" and \"child database\" in the context of database replication?","role":"user"},{"content":"'\n'A more inclusive way to refer to \"parent database\" and \"child database\" in database replication could be \"primary database\" and \"secondary database\".'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you revise this sentence to use Inclusive IT terminology?\", \"seed_response\": \"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\\n\", \"output\": \"Could you suggest a more inclusive term for \\\"parent database\\\" and \\\"child database\\\" in the context of database replication?'\\n'A more inclusive way to refer to \\\"parent database\\\" and \\\"child database\\\" in database replication could be \\\"primary database\\\" and \\\"secondary database\\\".'\"}"} -{"id":"bf4f0a89-b261-4182-a6fb-fbd3ee06c5b9","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?","role":"user"},{"content":"'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you revise this sentence to use Inclusive IT terminology?\", \"seed_response\": \"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\\n\", \"output\": \"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?'\\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'\"}"} -{"id":"51fbe5fc-4b72-4468-b81a-efd416380593","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?","role":"user"},{"content":"'\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you rewrite this sentence to use more inclusive IT terminology?\", \"seed_response\": \"In database replication, the primary database is regarded as the authoritative source, and the secondary databases are syncrhonized to it.\\n\", \"output\": \"What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?'\\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'\"}"} -{"id":"0bb768ab-624f-4d00-adac-592143765485","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of 'allowlists' to promote more inclusive IT language?","role":"user"},{"content":"'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"seed_question\": \"How would you revise this sentence to use inclusive IT terminology?\", \"seed_response\": \"A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks.\\n\", \"output\": \"What term could be used instead of 'allowlists' to promote more inclusive IT language?'\\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'.\"}"} -{"id":"48169209-e783-497f-9149-a2c32fe8af82","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest an alternative term to \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'Instead of using \"blacklist,\" consider employing the term \"denylist\" or \"blocklist\" to enhance your IT vocabulary.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\\n\", \"seed_question\": \"How would you adjust the word choice to use Inclusive IT terminology?\", \"seed_response\": \"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\\n\", \"output\": \"Could you suggest an alternative term to \\\"blacklist\\\" for more inclusive IT language?'\\n'Instead of using \\\"blacklist,\\\" consider employing the term \\\"denylist\\\" or \\\"blocklist\\\" to enhance your IT vocabulary.'\"}"} -{"id":"5698e313-6ac1-44e0-a417-f47000900e97","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"parent database\" and \"child database\" in the context of database replication?","role":"user"},{"content":"'\n'A more inclusive way to refer to \"parent database\" and \"child database\" in database replication could be \"primary database\" and \"secondary database\".'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you revise this sentence to use Inclusive IT terminology?\", \"seed_response\": \"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\\n\", \"output\": \"Could you suggest a more inclusive term for \\\"parent database\\\" and \\\"child database\\\" in the context of database replication?'\\n'A more inclusive way to refer to \\\"parent database\\\" and \\\"child database\\\" in database replication could be \\\"primary database\\\" and \\\"secondary database\\\".'\"}"} -{"id":"51fbe5fc-4b72-4468-b81a-efd416380593","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?","role":"user"},{"content":"'\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you rewrite this sentence to use more inclusive IT terminology?\", \"seed_response\": \"In database replication, the primary database is regarded as the authoritative source, and the secondary databases are syncrhonized to it.\\n\", \"output\": \"What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?'\\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'\"}"} -{"id":"48169209-e783-497f-9149-a2c32fe8af82","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest an alternative term to \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'Instead of using \"blacklist,\" consider employing the term \"denylist\" or \"blocklist\" to enhance your IT vocabulary.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\\n\", \"seed_question\": \"How would you adjust the word choice to use Inclusive IT terminology?\", \"seed_response\": \"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\\n\", \"output\": \"Could you suggest an alternative term to \\\"blacklist\\\" for more inclusive IT language?'\\n'Instead of using \\\"blacklist,\\\" consider employing the term \\\"denylist\\\" or \\\"blocklist\\\" to enhance your IT vocabulary.'\"}"} -{"id":"bf4f0a89-b261-4182-a6fb-fbd3ee06c5b9","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?","role":"user"},{"content":"'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you revise this sentence to use Inclusive IT terminology?\", \"seed_response\": \"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\\n\", \"output\": \"What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?'\\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'\"}"} -{"id":"5fb777f6-3599-4031-bfd3-0881e93ed139","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"primary database\" in the context of database replication?","role":"user"},{"content":"'A more inclusive term for \"primary database\" in the context of database replication could be \"master database\" or \"leader database\".","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you rewrite this sentence to use more inclusive IT terminology?\", \"seed_response\": \"In database replication, the primary database is regarded as the authoritative source, and the secondary databases are syncrhonized to it.\\n\", \"output\": \"Could you suggest a more inclusive term for \\\"primary database\\\" in the context of database replication?\\n'A more inclusive term for \\\"primary database\\\" in the context of database replication could be \\\"master database\\\" or \\\"leader database\\\".\"}"} -{"id":"48169209-e783-497f-9149-a2c32fe8af82","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest an alternative term to \"blacklist\" for more inclusive IT language?","role":"user"},{"content":"'\n'Instead of using \"blacklist,\" consider employing the term \"denylist\" or \"blocklist\" to enhance your IT vocabulary.'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"As a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.\\n\", \"seed_question\": \"How would you adjust the word choice to use Inclusive IT terminology?\", \"seed_response\": \"As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist.\\n\", \"output\": \"Could you suggest an alternative term to \\\"blacklist\\\" for more inclusive IT language?'\\n'Instead of using \\\"blacklist,\\\" consider employing the term \\\"denylist\\\" or \\\"blocklist\\\" to enhance your IT vocabulary.'\"}"} -{"id":"41252055-a8f8-4e3d-b021-21519b5c7383","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"What term could be used instead of \"primary\" and \"secondary\" to promote more inclusive IT language in the context of clock networks?","role":"user"},{"content":"'\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language.","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"A master clock that provides time signals used to synchronize one or more slave clocks as a part of a clock network.\\n\", \"seed_question\": \"How would you improve this sentence to use inclusive IT language?\", \"seed_response\": \"A primary clock that provides time signals used to synchronize one or more secondary clocks as a part of a clock network.\\n\", \"output\": \"What term could be used instead of \\\"primary\\\" and \\\"secondary\\\" to promote more inclusive IT language in the context of clock networks?'\\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language.\"}"} -{"id":"5698e313-6ac1-44e0-a417-f47000900e97","messages":[{"content":"I am, Red Hat\u00ae Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.","role":"system"},{"content":"Could you suggest a more inclusive term for \"parent database\" and \"child database\" in the context of database replication?","role":"user"},{"content":"'\n'A more inclusive way to refer to \"parent database\" and \"child database\" in database replication could be \"primary database\" and \"secondary database\".'","role":"assistant"}],"metadata":"{\"task_description\": \"This skill helps the writer use more inclusive IT terminology.\", \"seed_context\": \"In database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.\\n\", \"seed_question\": \"How would you revise this sentence to use Inclusive IT terminology?\", \"seed_response\": \"In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it.\\n\", \"output\": \"Could you suggest a more inclusive term for \\\"parent database\\\" and \\\"child database\\\" in the context of database replication?'\\n'A more inclusive way to refer to \\\"parent database\\\" and \\\"child database\\\" in database replication could be \\\"primary database\\\" and \\\"secondary database\\\".'\"}"} diff --git a/sdg/faked/fixtures/test_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl b/sdg/faked/fixtures/test_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl deleted file mode 100644 index 0124c617..00000000 --- a/sdg/faked/fixtures/test_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl +++ /dev/null @@ -1,6 +0,0 @@ -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "How would you rewrite this sentence to use more inclusive IT terminology?\nIn database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.", "assistant": "In database replication, the primary database is regarded as the authoritative source, and the secondary databases are syncrhonized to it."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "Show me how to use inclusive IT terminology to replace non-inclusive IT terminology.\nBlacklisting is the action of a group or authority compiling a blacklist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list.", "assistant": "Denylisting is the action of a group or authority compiling a denylist of people, countries, or other entites to be avoided or distrusted as being deemed unacceptable to those making the list."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "How would you revise this sentence to use Inclusive IT terminology?\nIn database replication, the master database is regarded as the authoritative source, and the slave databases are synchronized to it.", "assistant": "In database replication, the parent database is regarded as the authoritative source, and the child databases are synchronized to it."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "How would you revise this sentence to use inclusive IT terminology?\nA use for whitelists is in local area network (LAN) security. Many network admins set up MAC address whitelists, or a MAC address filter, to control who is allowed on their networks.", "assistant": "A use for allowlists is in local area network (LAN) security. Many network admins set up MAC address allowlists, or a MAC address filter, to control who is allowed on their networks."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "How would you improve this sentence to use inclusive IT language?\nA master clock that provides time signals used to synchronize one or more slave clocks as a part of a clock network.", "assistant": "A primary clock that provides time signals used to synchronize one or more secondary clocks as a part of a clock network."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "How would you adjust the word choice to use Inclusive IT terminology?\nAs a verb, blacklist can mean to put an individual or entity on such a list. A blacklist is synonymous with a list of banned persons or organizations and is the opposite of a whitelist.", "assistant": "As a verb, denylist can mean to put an individual or entity on such a list. A denylist is synonymous with a list of banned persons or organizations and is the opposite of an allowlist."} diff --git a/sdg/faked/fixtures/train_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl b/sdg/faked/fixtures/train_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl deleted file mode 100644 index f6bb1d0a..00000000 --- a/sdg/faked/fixtures/train_mistral-7b-instruct-v02_2024-09-09T15_30_29.jsonl +++ /dev/null @@ -1,12 +0,0 @@ -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "Could you suggest a more inclusive term for \"primary database\" in the context of database replication?", "assistant": "'A more inclusive term for \"primary database\" in the context of database replication could be \"master database\" or \"leader database\"."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "What could be alternative terms for 'master' and 'slave' databases in database replication for more inclusive IT language?", "assistant": "'\n'Primary database and secondary databases, or source database and target database, could be used instead of 'master' and 'slave' databases in database replication for more inclusive IT language.'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "How can I replace the term \"denylisting\" with more inclusive IT terminology?", "assistant": "'\n'One possible replacement for \"denylisting\" is \"access control list management\" or \"blocklist management\", as these terms are more neutral and do not carry the negative connotations of a \"denylist\".'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "What is an alternative to the term \"blacklist\" that promotes more inclusive IT terminology?", "assistant": "'\n'An alternative to the term \"blacklist\" that promotes more inclusive IT terminology is \"denylist\".'\n\nAdditionally, here's a pair based on the context you provided:\n\n'What should be done instead of compiling a blacklist?'\n'Instead of compiling a blacklist, consider creating a \"watchlist\" or \"greylist\" to maintain a more inclusive approach in IT.'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "What term can be used instead of 'master' and 'slave' in the context of database replication to promote more inclusive IT terminology?", "assistant": "'\n'Parent and child databases could be used instead, with the parent database being the authoritative source and the child databases synchronized to it.'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "Could you suggest a more inclusive term for \"parent database\" and \"child database\" in the context of database replication?", "assistant": "'\n'A more inclusive way to refer to \"parent database\" and \"child database\" in database replication could be \"primary database\" and \"secondary database\".'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "Could you suggest a more inclusive term for \"allowlists\" in this context?", "assistant": "'\n'Yes, \"inclusion lists\" could be a more inclusive alternative to \"allowlists\" when discussing network security.'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "What term could be used instead of 'allowlists' to promote more inclusive IT language?", "assistant": "'\n'An inclusive alternative to 'allowlists' could be 'accept lists' or 'approved lists'. These terms do not carry the same exclusivity connotation as 'allowlists'."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "Could you suggest a more inclusive term for \"primary\" and \"secondary\" clocks in the context of a clock network?", "assistant": "'\n'Yes, you could use \"master\" and \"slave\" clocks instead to promote inclusive IT language.'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "What term could be used instead of \"primary\" and \"secondary\" to promote more inclusive IT language in the context of clock networks?", "assistant": "'\n'Master' and 'slave' clocks could be replaced with 'synchronizing' or 'coordinating' clocks to promote more inclusive IT language."} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "What alternative term could be used instead of \"blacklist\" for more inclusive IT language?", "assistant": "'\n'An allowlist or denylist, depending on the context, could be more inclusive alternatives to \"blacklist\" in IT terminology.'"} -{"system": "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.", "user": "Could you suggest an alternative term to \"blacklist\" for more inclusive IT language?", "assistant": "'\n'Instead of using \"blacklist,\" consider employing the term \"denylist\" or \"blocklist\" to enhance your IT vocabulary.'"} diff --git a/training/__init__.py b/training/__init__.py index 60164b2d..22770e46 100644 --- a/training/__init__.py +++ b/training/__init__.py @@ -1,4 +1,3 @@ -from . import faked from .components import ( data_processing_op, knowledge_processed_data_to_artifact_op, @@ -11,5 +10,4 @@ "pytorchjob_manifest_op", "skills_processed_data_to_artifact_op", "knowledge_processed_data_to_artifact_op", - "faked", ] diff --git a/training/components.py b/training/components.py index 20bf353f..bb39e971 100644 --- a/training/components.py +++ b/training/components.py @@ -1,5 +1,5 @@ # type: ignore -# pylint: disable=import-outside-toplevel,missing-function-docstring +# pylint: disable=import-outside-toplevel,import-error from typing import Optional diff --git a/training/faked/__init__.py b/training/faked/__init__.py deleted file mode 100644 index 22770e46..00000000 --- a/training/faked/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .components import ( - data_processing_op, - knowledge_processed_data_to_artifact_op, - pytorchjob_manifest_op, - skills_processed_data_to_artifact_op, -) - -__all__ = [ - "data_processing_op", - "pytorchjob_manifest_op", - "skills_processed_data_to_artifact_op", - "knowledge_processed_data_to_artifact_op", -] diff --git a/training/faked/components.py b/training/faked/components.py deleted file mode 100644 index a5fcff3b..00000000 --- a/training/faked/components.py +++ /dev/null @@ -1,55 +0,0 @@ -# type: ignore -# pylint: disable=import-outside-toplevel,missing-function-docstring,unused-argument - -from typing import NamedTuple, Optional - -from kfp import dsl - -from utils.consts import PYTHON_IMAGE, TOOLBOX_IMAGE - - -@dsl.component(base_image=PYTHON_IMAGE, install_kfp_package=False) -def pytorchjob_manifest_op( - model_pvc_name: str, - input_pvc_name: str, - output_pvc_name: str, - name_suffix: str, -) -> NamedTuple("outputs", manifest=str, name=str): - Outputs = NamedTuple("outputs", manifest=str, name=str) - return Outputs("", "") - - -@dsl.component(base_image=PYTHON_IMAGE, install_kfp_package=False) -def data_processing_op( - model_path: str = "/model", - sdg_path: str = "/data/sdg", - skills_path: str = "/data/skills", - knowledge_path: str = "/data/knowledge", - max_seq_len: Optional[int] = 4096, - max_batch_len: Optional[int] = 20000, -): - return - - -@dsl.container_component -def skills_processed_data_to_artifact_op( - skills_processed_data: dsl.Output[dsl.Dataset], - pvc_path: str = "/data/skills", -): - return dsl.ContainerSpec( - TOOLBOX_IMAGE, - ["/bin/sh", "-c"], - [f"cp -r {pvc_path} {skills_processed_data.path}"], - ) - - -@dsl.container_component -def knowledge_processed_data_to_artifact_op( - knowledge_processed_data: dsl.Output[dsl.Dataset], - pvc_path: str = "/data/knowledge", -): - return dsl.ContainerSpec( - TOOLBOX_IMAGE, - ["/bin/sh", "-c"], - [f"cp -r {pvc_path} {knowledge_processed_data.path}"], - ) diff --git a/utils/__init__.py b/utils/__init__.py index 7dcc94ed..378ba5d7 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,4 +1,3 @@ -from . import faked from .components import ( ilab_importer_op, model_to_pvc_op, @@ -11,5 +10,4 @@ "pvc_to_mt_bench_op", "pvc_to_model_op", "ilab_importer_op", - "faked", ] diff --git a/utils/components.py b/utils/components.py index 35f38481..31bbd809 100644 --- a/utils/components.py +++ b/utils/components.py @@ -1,5 +1,4 @@ # type: ignore -# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member,missing-function-docstring from kfp import dsl diff --git a/utils/faked/__init__.py b/utils/faked/__init__.py deleted file mode 100644 index bb7ef2b2..00000000 --- a/utils/faked/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .components import ( - model_to_pvc_op, - pvc_to_model_op, - pvc_to_mt_bench_op, -) - -__all__ = [ - "pvc_to_mt_bench_op", - "pvc_to_model_op", - "model_to_pvc_op", -] diff --git a/utils/faked/components.py b/utils/faked/components.py deleted file mode 100644 index 65e4c74d..00000000 --- a/utils/faked/components.py +++ /dev/null @@ -1,20 +0,0 @@ -# type: ignore -# pylint: disable=unused-argument,missing-function-docstring -from kfp import dsl - -from ..consts import PYTHON_IMAGE - - -@dsl.component(base_image=PYTHON_IMAGE, install_kfp_package=False) -def model_to_pvc_op(model: dsl.Input[dsl.Model], pvc_path: str = "/model"): - return - - -@dsl.component(base_image=PYTHON_IMAGE, install_kfp_package=False) -def pvc_to_mt_bench_op(mt_bench_output: dsl.Output[dsl.Artifact], pvc_path: str): - return - - -@dsl.component(base_image=PYTHON_IMAGE, install_kfp_package=False) -def pvc_to_model_op(model: dsl.Output[dsl.Model], pvc_path: str): - return