diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml index 9abc1950..d584ac33 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml @@ -15,16 +15,10 @@ Here are the requirements: examples: | The task is {task_description}. - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {icl_query_1} - {icl_response_1} - - {icl_query_2} - {icl_response_2} - - {icl_query_3} - {icl_response_3} + {seed_question} + {seed_response} generation: | Provide a single question and answer pair based on the examples. diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml index f40d3d11..2ac41a82 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml @@ -15,23 +15,17 @@ Here are the requirements: examples: | The task is {task_description}. - Here is some context for the example questions: + Here is some context for the example question: - {context} + {seed_context} - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {icl_query_1} - {icl_response_1} - - {icl_query_2} - {icl_response_2} - - {icl_query_3} - {icl_response_3} + {seed_question} + {seed_response} generation: | - Provide a single question and answer pair based on the examples. + Provide a single question and answer pair based on the example. start_tags: [""] end_tags: [""] diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index da9ffa11..d6f6441b 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -472,35 +472,16 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count def _skill_leaf_node_to_samples(leaf_node): - samples = [{}] + samples = [] # pylint: disable=consider-using-enumerate for i in range(len(leaf_node)): - samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) + samples.append({}) + samples[-1]["task_description"] = leaf_node[i]["task_description"] if leaf_node[i].get("input"): - samples[-1].setdefault("context", leaf_node[i]["input"]) - if "icl_query_3" in samples[-1]: - samples.append({}) - if "icl_query_1" not in samples[-1]: - samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_1"] = leaf_node[i]["output"] - elif "icl_query_2" not in samples[-1]: - samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[i]["output"] - else: - samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_3"] = leaf_node[i]["output"] - - # wrap back around to the beginning if the number of examples was not - # evenly divisble by 3 - if "icl_query_2" not in samples[-1]: - samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[0]["output"] - if "icl_query_3" not in samples[-1]: - samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ - "instruction" - ] - samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] + samples[-1]["seed_context"] = leaf_node[i]["input"] + samples[-1]["seed_question"] = leaf_node[i]["instruction"] + samples[-1]["seed_response"] = leaf_node[i]["output"] return samples @@ -508,7 +489,7 @@ def _skill_leaf_node_to_samples(leaf_node): def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count): if not leaf_node: return [] - if "document" in leaf_node[0]: + if leaf_node[0].get("document"): return _knowledge_leaf_node_to_samples( leaf_node, server_ctx_size, chunk_word_count )