From e6068112bfd19ae6f2c7f8fd13ba0a366584ea02 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sun, 30 Jun 2024 14:25:25 -0400 Subject: [PATCH] Create a sample per seed example for skills The full skills pipelines expect a single seed question and response in each sample in the dataset. Change the simple skills pipelines to match and update the code to generate the samples in the expected format. Closes #55 (the short term needs at least) Signed-off-by: Russell Bryant --- .../skills/simple_generate_qa_freeform.yaml | 12 ++----- .../skills/simple_generate_qa_grounded.yaml | 18 ++++------ src/instructlab/sdg/utils/taxonomy.py | 33 ++++--------------- 3 files changed, 16 insertions(+), 47 deletions(-) diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml index 9abc1950..d584ac33 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml @@ -15,16 +15,10 @@ Here are the requirements: examples: | The task is {task_description}. - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {icl_query_1} - {icl_response_1} - - {icl_query_2} - {icl_response_2} - - {icl_query_3} - {icl_response_3} + {seed_question} + {seed_response} generation: | Provide a single question and answer pair based on the examples. diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml index f40d3d11..2ac41a82 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml @@ -15,23 +15,17 @@ Here are the requirements: examples: | The task is {task_description}. - Here is some context for the example questions: + Here is some context for the example question: - {context} + {seed_context} - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {icl_query_1} - {icl_response_1} - - {icl_query_2} - {icl_response_2} - - {icl_query_3} - {icl_response_3} + {seed_question} + {seed_response} generation: | - Provide a single question and answer pair based on the examples. + Provide a single question and answer pair based on the example. start_tags: [""] end_tags: [""] diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index da9ffa11..d6f6441b 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -472,35 +472,16 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count def _skill_leaf_node_to_samples(leaf_node): - samples = [{}] + samples = [] # pylint: disable=consider-using-enumerate for i in range(len(leaf_node)): - samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) + samples.append({}) + samples[-1]["task_description"] = leaf_node[i]["task_description"] if leaf_node[i].get("input"): - samples[-1].setdefault("context", leaf_node[i]["input"]) - if "icl_query_3" in samples[-1]: - samples.append({}) - if "icl_query_1" not in samples[-1]: - samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_1"] = leaf_node[i]["output"] - elif "icl_query_2" not in samples[-1]: - samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[i]["output"] - else: - samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_3"] = leaf_node[i]["output"] - - # wrap back around to the beginning if the number of examples was not - # evenly divisble by 3 - if "icl_query_2" not in samples[-1]: - samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[0]["output"] - if "icl_query_3" not in samples[-1]: - samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ - "instruction" - ] - samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] + samples[-1]["seed_context"] = leaf_node[i]["input"] + samples[-1]["seed_question"] = leaf_node[i]["instruction"] + samples[-1]["seed_response"] = leaf_node[i]["output"] return samples @@ -508,7 +489,7 @@ def _skill_leaf_node_to_samples(leaf_node): def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count): if not leaf_node: return [] - if "document" in leaf_node[0]: + if leaf_node[0].get("document"): return _knowledge_leaf_node_to_samples( leaf_node, server_ctx_size, chunk_word_count )