diff --git a/tests/taxonomy.py b/tests/taxonomy.py index f2b6cb47..227c2534 100644 --- a/tests/taxonomy.py +++ b/tests/taxonomy.py @@ -2,11 +2,12 @@ # Standard from pathlib import Path -from typing import List +from typing import Any, Dict, List import shutil # Third Party import git +import yaml class MockTaxonomy: @@ -25,12 +26,12 @@ def untracked_files(self) -> List[str]: """List untracked files in the repository""" return self._repo.untracked_files - def create_untracked(self, rel_path: str, contents: str) -> Path: + def create_untracked(self, rel_path: str, contents: Dict[str, Any]) -> Path: """Create a new untracked file in the repository. Args: rel_path (str): Relative path (from repository root) to the file. - contents (str): String to be written to the file. + contents (Dict[str, Any]): Object to be written to the file. Returns: file_path: The path to the created file. """ @@ -38,15 +39,16 @@ def create_untracked(self, rel_path: str, contents: str) -> Path: assert not taxonomy_path.is_absolute() file_path = self.root.joinpath(taxonomy_path) file_path.parent.mkdir(exist_ok=True, parents=True) - file_path.write_text(contents, encoding="utf-8") + with file_path.open(mode="w", encoding="utf-8") as fp: + yaml.dump(contents, fp) return file_path - def add_tracked(self, rel_path, contents: str) -> Path: - """Add a new tracked file to the repository (and commits it). + def add_tracked(self, rel_path, contents: Dict[str, Any]) -> Path: + """Add a new tracked file to the repository (and commit it). Args: rel_path (str): Relative path (from repository root) to the file. - contents (str): String to be written to the file. + contents (Dict[str, Any]): Object to be written to the file. Returns: file_path: The path to the added file. 
""" diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index 33d21e8f..98b87780 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -3,12 +3,462 @@ """ # Standard -from unittest import mock +from contextlib import contextmanager +from typing import Any, Dict, Union +from unittest.mock import MagicMock, patch +import glob +import json +import os +import shutil +import tempfile +import unittest + +# Third Party +from datasets import load_dataset +import pytest +import yaml # First Party -from instructlab.sdg.generate_data import _context_init +from instructlab.sdg.generate_data import _SYS_PROMPT, _context_init, generate_data +from instructlab.sdg.llmblock import LLMBlock from instructlab.sdg.pipeline import PipelineContext +TEST_TAXONOMY_BASE = "main" + +TEST_CUSTOM_YAML_RULES = b"""extends: relaxed +rules: + line-length: + max: 180 +""" + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") + +NUM_INSTRUCTIONS_TO_GENERATE = 10 + + +def validate_legacy_dataset(dataset_file_name, expected_samples): + """Test dataset in the "legacy message sample" format. + + See LegacyMessageSample in instructlab/instructlab. + + system: str + user: str + assistant: str + + This is what is currently used by the legacy training methods such as Linux training and MacOS training. + """ + ds = load_dataset("json", data_files=dataset_file_name, split="train") + features = ["system", "user", "assistant"] + assert len(ds.features) == len(features) + for feature in features: + assert feature in ds.features + assert ds.features[feature].dtype == "string" + + for idx, sample in enumerate(expected_samples): + assert ds[idx]["system"] == _SYS_PROMPT + assert ds[idx]["user"] == sample["user"] + assert ds[idx]["assistant"] == sample["assistant"] + + +def validate_messages_dataset(dataset_file_name, expected_samples): + """Test dataset in the Hugging Face messages format + + See MessageSample in instructlab/instructlab. 
+ + messages: + content: str + # one of: "user", "assistant", or "system" + role: str + """ + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert len(ds.features) == 2 + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert ds.features["messages"][0]["role"].dtype == "string" + assert ds.features["metadata"].dtype == "string" + + for idx, sample in enumerate(expected_samples): + assert len(ds[idx]["messages"]) == 2 + assert ds[idx]["messages"][0]["role"] == "user" + assert ds[idx]["messages"][0]["content"] == sample["user"] + assert ds[idx]["messages"][1]["role"] == "assistant" + assert ds[idx]["messages"][1]["content"] == sample["assistant"] + assert ds[idx]["metadata"] == json.dumps({"system": _SYS_PROMPT}) + + +def validate_skill_leaf_node_dataset(dataset_file_name): + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert len(ds.features) == 7 + features = [ + "task_description", + "seed_context", + "seed_question", + "seed_response", + "output", + "id", + ] + for feature in features: + assert feature in ds.features + assert ds.features[feature].dtype == "string" + assert "messages" in ds.features + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert ds.features["messages"][0]["role"].dtype == "string" + + +def validate_phase_leaf_node_dataset(dataset_file_name): + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert len(ds.features) == 3 + features = ["metadata", "id"] + for feature in features: + assert feature in ds.features + assert ds.features[feature].dtype == "string" + assert "messages" in ds.features + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert 
ds.features["messages"][0]["role"].dtype == "string" + + +def validate_recipe(recipe_file_name): + with open(recipe_file_name, encoding="utf-8") as fp: + yaml_contents = yaml.safe_load(fp) + assert len(yaml_contents["datasets"]) == 1 + assert yaml_contents["datasets"][0]["path"].endswith(".jsonl") + assert "sampling_size" in yaml_contents["datasets"][0] + assert yaml_contents["metadata"]["sys_prompt"] == _SYS_PROMPT + + +def validate_mixed_dataset(dataset_file_name): + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert "messages" in ds.features + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert ds.features["messages"][0]["role"].dtype == "string" + + +def validate_lm_eval_task(lm_eval_task_file_name): + with open(lm_eval_task_file_name, encoding="utf-8") as fp: + yaml_contents = yaml.safe_load(fp) + assert "task" in yaml_contents + assert "dataset_kwargs" in yaml_contents + assert "doc_to_text" in yaml_contents + assert "doc_to_choice" in yaml_contents + assert "doc_to_target" in yaml_contents + + +def validate_mmlubench_dataset(dataset_file_name): + with open(dataset_file_name, encoding="utf-8") as fp: + # FIXME: fix the mmlubench pipeline in this test + assert fp.readlines() == [] + + +def generate_test_samples(yaml_contents): + """Convert questions and answers from the taxonomy format into the + user/assistant format used by the legacy training methods such as + Linux training and MacOS training. + + This mirrors what _gen_test_data() does. 
+ """ + test_samples = [] + is_knowledge = "document" in yaml_contents + for seed_example in yaml_contents["seed_examples"]: + if is_knowledge: + for qna in seed_example["questions_and_answers"]: + test_samples.append( + { + "user": qna["question"] + + "\n" + + seed_example["context"].strip(), + "assistant": qna["answer"].strip(), + } + ) + + else: + # FIXME: handle freeform skills - no context + test_samples.append( + { + "user": seed_example["question"] + "\n" + seed_example["context"], + "assistant": seed_example["answer"], + } + ) + return test_samples + + +def generate_train_samples(yaml_contents): + """Generate expected training samples in the user/assistant format + used by the legacy training methods such as Linux training and MacOS + training. + + Mirroring _noop_llmblock_generate() below, we generate 10 samples + per input, and then follow _gen_train_data()'s output format. + """ + + def add_question_mark(q): + return (q + "?") if not "?" in q else q + + train_samples = [] + is_knowledge = "document" in yaml_contents + for seed_example in yaml_contents["seed_examples"]: + for i in range(NUM_INSTRUCTIONS_TO_GENERATE): + if is_knowledge: + train_samples.append( + { + "user": seed_example["context"] + + f" (q{i}) " + + add_question_mark( + seed_example["questions_and_answers"][0]["question"].strip() + ), + "assistant": f"(a{i}) " + + seed_example["questions_and_answers"][0]["answer"].strip(), + } + ) + else: + # FIXME: handle freeform skills - no context + train_samples.append( + { + "user": seed_example["context"] + + f" (q{i}) " + + add_question_mark(seed_example["question"]), + "assistant": f"(a{i}) " + seed_example["answer"], + } + ) + return train_samples + + +def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: + with open(skills_file_path, "r", encoding="utf-8") as skills_file: + return yaml.safe_load(skills_file) + + +def _noop_llmblock_generate(self, samples): + """Generate mock output based on input samples. 
+ + Simply return the seed question and response from the input sample, + joined using '?' and with an integer discriminator. + + _get_question_hack() and _get_response_hack() are the code that later + splits these using the '?' separator. + + Return 10 output samples per input sample, since the LLMBlock in the + simple pipeline is configured with 'n: scaled' and we pass + num_instructions_to_generate=10 to generate_data. + """ + + def strip_q(q): + return q.strip().rstrip("?") + + output = [] + for sample in samples: + for i in range(NUM_INSTRUCTIONS_TO_GENERATE): + if "domain" in sample: # knowledge + output.append( + sample["icl_document"] + + f" (q{i}) " + + strip_q(sample["icl_query_1"]) + + f" ? (a{i}) " + + sample["icl_response_1"] + ) + else: + output.append( + sample["seed_context"] + + f" (q{i}) " + + strip_q(sample["seed_question"]) + + f" ? (a{i}) " + + sample["seed_response"] + ) + return output + + +@patch.object(LLMBlock, "_generate", _noop_llmblock_generate) +class TestGenerateCompositionalData(unittest.TestCase): + @pytest.fixture(autouse=True) + def _init_taxonomy(self, taxonomy_dir): + self.test_taxonomy = taxonomy_dir + + def setUp(self): + self.tmp_path = tempfile.TemporaryDirectory().name + test_valid_compositional_skill_file = os.path.join( + TEST_DATA_DIR, "test_valid_compositional_skill.yaml" + ) + tracked_compositional_file = os.path.join( + "compositional_skills", "tracked", "qna.yaml" + ) + untracked_compositional_file = os.path.join( + "compositional_skills", "new", "qna.yaml" + ) + test_valid_compositional_skill = load_test_skills( + test_valid_compositional_skill_file + ) + self.test_taxonomy.add_tracked( + tracked_compositional_file, test_valid_compositional_skill + ) + self.test_taxonomy.create_untracked( + untracked_compositional_file, test_valid_compositional_skill + ) + self.expected_test_samples = generate_test_samples( + test_valid_compositional_skill + ) + self.expected_train_samples = generate_train_samples( + 
test_valid_compositional_skill + ) + + def test_generate(self): + with patch("logging.Logger.info") as mocked_logger: + generate_data( + mocked_logger, + model_family="merlinite", + model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + num_instructions_to_generate=10, + taxonomy=self.test_taxonomy.root, + taxonomy_base=TEST_TAXONOMY_BASE, + output_dir=self.tmp_path, + yaml_rules=TEST_CUSTOM_YAML_RULES, + client=MagicMock(), + pipeline="simple", + ) + + for name in ["test_*.jsonl", "train_*.jsonl", "messages_*.jsonl"]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.startswith("test_"): + validate_legacy_dataset(matches[0], self.expected_test_samples) + elif name.startswith("train_"): + validate_legacy_dataset(matches[0], self.expected_train_samples) + elif name.startswith("messages_"): + validate_messages_dataset(matches[0], self.expected_train_samples) + + node_file = os.path.join("node_datasets_*", "compositional_skills_new.jsonl") + for name in [ + "skills_recipe_*.yaml", + "skills_train_msgs_*.jsonl", + node_file, + ]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.endswith("compositional_skills_new.jsonl"): + validate_skill_leaf_node_dataset(matches[0]) + elif name.startswith("skills_recipe_"): + validate_recipe(matches[0]) + elif name.startswith("skills_train_msgs_"): + validate_mixed_dataset(matches[0]) + + def teardown(self) -> None: + """Recursively remove the temporary repository and all of its + subdirectories and files. 
+ """ + shutil.rmtree(self.tmp_path) + return + + def __enter__(self): + return self + + def __exit__(self, *args): + self.teardown() + + +@patch.object(LLMBlock, "_generate", _noop_llmblock_generate) +class TestGenerateKnowledgeData(unittest.TestCase): + @pytest.fixture(autouse=True) + def _init_taxonomy(self, taxonomy_dir): + self.test_taxonomy = taxonomy_dir + + def setUp(self): + self.tmp_path = tempfile.TemporaryDirectory().name + test_valid_knowledge_skill_file = os.path.join( + TEST_DATA_DIR, "test_valid_knowledge_skill.yaml" + ) + tracked_knowledge_file = os.path.join("knowledge ", "tracked", "qna.yaml") + untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml") + test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file) + self.test_taxonomy.add_tracked( + tracked_knowledge_file, test_valid_knowledge_skill + ) + self.test_taxonomy.create_untracked( + untracked_knowledge_file, test_valid_knowledge_skill + ) + self.expected_test_samples = generate_test_samples(test_valid_knowledge_skill) + self.expected_train_samples = generate_train_samples(test_valid_knowledge_skill) + + def test_generate(self): + with patch("logging.Logger.info") as mocked_logger: + generate_data( + mocked_logger, + model_family="merlinite", + model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + num_instructions_to_generate=10, + taxonomy=self.test_taxonomy.root, + taxonomy_base=TEST_TAXONOMY_BASE, + output_dir=self.tmp_path, + yaml_rules=TEST_CUSTOM_YAML_RULES, + chunk_word_count=1000, + server_ctx_size=4096, + client=MagicMock(), + pipeline="simple", + ) + + for name in ["test_*.jsonl", "train_*.jsonl", "messages_*.jsonl"]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.startswith("test_"): + validate_legacy_dataset(matches[0], self.expected_test_samples) + elif name.startswith("train_"): + validate_legacy_dataset(matches[0], self.expected_train_samples) + elif name.startswith("messages_"): + 
validate_messages_dataset(matches[0], self.expected_train_samples) + + node_p07_file = os.path.join("node_datasets_*", "knowledge_new_p07.jsonl") + node_p10_file = os.path.join("node_datasets_*", "knowledge_new_p10.jsonl") + for name in [ + "skills_recipe_*.yaml", + "skills_train_msgs_*.jsonl", + "knowledge_recipe_*.yaml", + "knowledge_train_msgs_*.jsonl", + node_p07_file, + node_p10_file, + ]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.endswith("knowledge_new_p07.jsonl") or name.endswith( + "knowledge_new_p10.jsonl" + ): + validate_phase_leaf_node_dataset(matches[0]) + elif name.startswith("skills_recipe_") or name.startswith( + "knowledge_recipe_" + ): + validate_recipe(matches[0]) + elif name.startswith("skills_train_msgs_") or name.startswith( + "knowledge_train_msgs_" + ): + validate_mixed_dataset(matches[0]) + + for name in [ + "knowledge_new_task.yaml", + "mmlubench_knowledge_new.jsonl", + ]: + matches = glob.glob(os.path.join(self.tmp_path, "node_datasets_*", name)) + assert len(matches) == 1 + if name == "knowledge_new_task.yaml": + validate_lm_eval_task(matches[0]) + elif name == "mmlubench_knowledge_new.jsonl": + validate_mmlubench_dataset(matches[0]) + + def teardown(self) -> None: + """Recursively remove the temporary repository and all of its + subdirectories and files. 
+ """ + shutil.rmtree(self.tmp_path) + return + + def __enter__(self): + return self + + def __exit__(self, *args): + self.teardown() + def test_context_init_batch_size_optional(): """Test that the _context_init function can handle a missing batch size by diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index e8aa4bda..8c148113 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -1,36 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +from typing import Any, Dict, Union import os import pathlib # Third Party import pytest +import yaml # First Party from instructlab.sdg.utils import taxonomy -TEST_VALID_COMPOSITIONAL_SKILL_YAML = """created_by: rafael-vasquez -version: 1 -seed_examples: -- answer: "Sure thing!" - context: "This is a valid YAML." - question: "Can you help me debug this failing unit test?" -- answer: "answer2" - context: "context2" - question: "question2" -- answer: "answer3" - context: "context3" - question: "question3" -- answer: "answer4" - context: "context4" - question: "question4" -- answer: "answer5" - context: "context5" - question: "question5" -task_description: 'This is a task' -""" - TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" 
TEST_CUSTOM_YAML_RULES = b"""extends: relaxed @@ -40,6 +21,13 @@ max: 180 """ +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") + + +def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: + with open(skills_file_path, "r", encoding="utf-8") as skills_file: + return yaml.safe_load(skills_file) + class TestTaxonomy: """Test taxonomy in instructlab.sdg.utils.taxonomy.""" @@ -80,17 +68,18 @@ def test_read_taxonomy_leaf_nodes( ): tracked_file = "compositional_skills/tracked/qna.yaml" untracked_file = "compositional_skills/new/qna.yaml" + test_compositional_skill_file = os.path.join( + TEST_DATA_DIR, "test_valid_compositional_skill.yaml" + ) + test_compositional_skill = load_test_skills(test_compositional_skill_file) if create_tracked_file: - self.taxonomy.add_tracked(tracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML) + self.taxonomy.add_tracked(tracked_file, test_compositional_skill) if create_untracked_file: - self.taxonomy.create_untracked( - untracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML - ) + self.taxonomy.create_untracked(untracked_file, test_compositional_skill) leaf_nodes = taxonomy.read_taxonomy_leaf_nodes( self.taxonomy.root, taxonomy_base, TEST_CUSTOM_YAML_RULES ) - assert len(leaf_nodes) == len(check_leaf_node_keys) for leaf_node_key in check_leaf_node_keys: diff --git a/tests/testdata/test_valid_compositional_skill.yaml b/tests/testdata/test_valid_compositional_skill.yaml new file mode 100644 index 00000000..df4ad2c8 --- /dev/null +++ b/tests/testdata/test_valid_compositional_skill.yaml @@ -0,0 +1,19 @@ +created_by: rafael-vasquez +version: 1 +seed_examples: +- answer: "Sure thing!" + context: "This is a valid YAML." + question: "Can you help me debug this failing unit test?" 
+- answer: "answer2" + context: "context2" + question: "question2" +- answer: "answer3" + context: "context3" + question: "question3" +- answer: "answer4" + context: "context4" + question: "question4" +- answer: "answer5" + context: "context5" + question: "question5" +task_description: 'This is a task' \ No newline at end of file diff --git a/tests/testdata/test_valid_knowledge_skill.yaml b/tests/testdata/test_valid_knowledge_skill.yaml new file mode 100644 index 00000000..705acb41 --- /dev/null +++ b/tests/testdata/test_valid_knowledge_skill.yaml @@ -0,0 +1,176 @@ +created_by: lukeinglis +domain: anatomy_tonsil +version: 3 +seed_examples: + - context: | + ## Structure + Humans are born with four types of tonsils: the pharyngeal tonsil, two + tubal tonsils, two palatine tonsils, and the lingual tonsils.[1] + +
Type |
+ + | + | + | Location |
+
---|---|---|---|---|
Pharyngeal tonsil (also + termed "adenoid") |
+ Ciliated + pseudostratified columnar (respiratory epithelium) |
+ Incompletely encapsulated |
+ Small folds—sometimes described as crypts1 |
+ Roof of pharynx |
+
+ | Ciliated pseudostratified columnar (respiratory epithelium) |
+ Not encapsulated |
+ No crypts |
+ Roof of pharynx |
+
+ | Stratified squamous epithelium |
+ Fully encapsulated |
+ Multiple deep crypts |
+ Each side of the throat at the back of the mouth |
+