From 15ae2b947369716a27e26ee8cd138e79fce4f45a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sun, 30 Jun 2024 14:08:39 -0400 Subject: [PATCH] Change question/response to icl_query/icl_response PR #50 changed the format used in the full knowledge pipeline. Change the simple pipelines to match. Part of issue #55. Signed-off-by: Russell Bryant --- .../configs/knowledge/simple_generate_qa.yaml | 12 ++-- .../skills/simple_generate_qa_freeform.yaml | 12 ++-- .../skills/simple_generate_qa_grounded.yaml | 12 ++-- src/instructlab/sdg/utils/taxonomy.py | 60 +++++++++---------- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml index 9ad6fa77..c63b4209 100644 --- a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml +++ b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml @@ -15,14 +15,14 @@ Here are the requirements: examples: | Here are some examples to help you understand the type of questions that are asked for this document: - {question_1} - {response_1} + {icl_query_1} + {icl_response_1} - {question_2} - {response_2} + {icl_query_2} + {icl_response_2} - {question_3} - {response_3} + {icl_query_3} + {icl_response_3} Here is the document: {document} diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml index 2913d7df..9abc1950 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml @@ -17,14 +17,14 @@ examples: | Here are some examples to help you understand the type of questions that are asked for: - {question_1} - {response_1} + {icl_query_1} + {icl_response_1} - {question_2} - {response_2} + {icl_query_2} + {icl_response_2} - {question_3} - {response_3} + {icl_query_3} + {icl_response_3} generation: | Provide a single question and answer pair based on the examples. diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml index fe48c99c..f40d3d11 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml @@ -21,14 +21,14 @@ examples: | Here are some examples to help you understand the type of questions that are asked for: - {question_1} - {response_1} + {icl_query_1} + {icl_response_1} - {question_2} - {response_2} + {icl_query_2} + {icl_response_2} - {question_3} - {response_3} + {icl_query_3} + {icl_response_3} generation: | Provide a single question and answer pair based on the examples. diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index d11dc92e..da9ffa11 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -443,28 +443,28 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count raise utils.GenerateException( "Error: No domain provided for knowledge document in leaf node" ) - if "question_3" in samples[-1]: + if "icl_query_3" in samples[-1]: samples.append({}) - if "question_1" not in samples[-1]: - samples[-1]["question_1"] = leaf_node[i]["instruction"] - samples[-1]["response_1"] = leaf_node[i]["output"] - elif "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[i]["instruction"] - samples[-1]["response_2"] = leaf_node[i]["output"] + if "icl_query_1" not in samples[-1]: + samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_1"] = leaf_node[i]["output"] + elif "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[i]["output"] else: - samples[-1]["question_3"] = leaf_node[i]["instruction"] - samples[-1]["response_3"] = leaf_node[i]["output"] + samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_3"] = leaf_node[i]["output"] # wrap back around to the beginning if the number of examples was not # evenly divisble by 3 - if "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[0]["instruction"] - samples[-1]["response_2"] = leaf_node[0]["output"] - if "question_3" not in samples[-1]: - samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + if "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[0]["output"] + if "icl_query_3" not in samples[-1]: + samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ "instruction" ] - samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ "output" ] @@ -479,28 +479,28 @@ def _skill_leaf_node_to_samples(leaf_node): samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) if leaf_node[i].get("input"): samples[-1].setdefault("context", leaf_node[i]["input"]) - if "question_3" in samples[-1]: + if "icl_query_3" in samples[-1]: samples.append({}) - if "question_1" not in samples[-1]: - samples[-1]["question_1"] = leaf_node[i]["instruction"] - samples[-1]["response_1"] = leaf_node[i]["output"] - elif "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[i]["instruction"] - samples[-1]["response_2"] = leaf_node[i]["output"] + if "icl_query_1" not in samples[-1]: + samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_1"] = leaf_node[i]["output"] + elif "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[i]["output"] else: - samples[-1]["question_3"] = leaf_node[i]["instruction"] - samples[-1]["response_3"] = leaf_node[i]["output"] + samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_3"] = leaf_node[i]["output"] # wrap back around to the beginning if the number of examples was not # evenly divisble by 3 - if "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[0]["instruction"] - samples[-1]["response_2"] = leaf_node[0]["output"] - if "question_3" not in samples[-1]: - samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + if "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[0]["output"] + if "icl_query_3" not in samples[-1]: + samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ "instruction" ] - samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] + samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] return samples