From 01c24aa64df856be11664433628fa37ead21c6f9 Mon Sep 17 00:00:00 2001 From: Oindrilla Chatterjee Date: Fri, 28 Jun 2024 17:47:19 -0400 Subject: [PATCH 01/10] Combine question and context for training preparation and testing script This is a step towards converting it to the final format (messages) required for training. Signed-off-by: Oindrilla Chatterjee Co-authored-by: Aakanksha Duggal Co-authored-by: Shiv --- scripts/test_grounded_skills.py | 107 +++++++++++++++++++++++++++ src/instructlab/sdg/default_flows.py | 36 ++++++--- 2 files changed, 133 insertions(+), 10 deletions(-) create mode 100644 scripts/test_grounded_skills.py diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py new file mode 100644 index 00000000..98dc739d --- /dev/null +++ b/scripts/test_grounded_skills.py @@ -0,0 +1,107 @@ +# Third Party +from datasets import Dataset +from openai import OpenAI + +# First Party +from src.instructlab.sdg import SDG +from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow +from src.instructlab.sdg.pipeline import Pipeline + +# for vLLM endpoints, the api_key remains "EMPTY" +openai_api_key = "EMPTY" +openai_api_base = "Add model endpoint here" + + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +teacher_model = models.data[0].id + +samples = [ + { + 'seed_context': """*Ms. Thompson:* Good morning, everyone. Today, we''re here to discuss + our customer journey mapping and analysis. I believe this is crucial to understanding + our customers'' experiences and improving our services. + + + *Mr. Patel:* I agree, Lisa. We should start by identifying all touchpoints in + our customer journey, from initial contact to post-sale support. + + + *Ms. Rodriguez:* Yes, and let''s not forget about the emotional aspect of the + journey. How do our customers feel at each stage? What are their pain points? + + + *Mr. Kim:* We can use data from our CRM system to track the customer journey and + gather insights. This will help us create a detailed, data-driven map. + + + *Ms. Johnson:* Once we have the map, we can analyze it to identify areas for improvement. + Perhaps there are steps where customers drop off or express dissatisfaction. + + + *Mr. Davis:* We should also consider the customer''s perspective. Conducting interviews + or surveys can provide valuable insights into their thoughts and feelings. + + + *Ms. Brown:* Absolutely. And once we''ve identified areas for improvement, we + can develop strategies to address them. This might involve redesigning certain + touchpoints, enhancing our communication, or streamlining processes. + + + *Mr. Smith:* And we must remember to measure the impact of any changes we make. + By tracking key performance indicators, we can determine whether our efforts are + successful. + + + *Ms. Thompson:* Great suggestions, everyone. Let''s divide into teams to tackle + different aspects of the customer journey. Team 1 will focus on pre-sale touchpoints, + Team 2 on sales, and Team 3 on post-sale support. + + + *Mr. Patel:* Sounds good. We''ll reconvene in a week to share our findings and + discuss next steps.""", + 'seed_question': """Generate a company wide email based on the given meeting transcript""", + 'task_description': 'Writing emails based on minutes of meeting', + 'seed_response': """Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n\ + \nDear [Company Name] Team,\n\nI hope this email finds you well. Yesterday, we\ + \ gathered to discuss our customer journey mapping and analysis, aiming to understand\ + \ our customers' experiences and identify opportunities for improvement. The discussions\ + \ were fruitful, and I want to share a brief overview of the key points and outcomes.\n\ + \n1. **Identifying Touchpoints:**\n Mr. Patel suggested mapping all touchpoints\ + \ in our customer journey, from initial contact to post-sale support.\n\n2. **Emotional\ + \ Aspect and Pain Points:**\n Ms. Rodriguez emphasized the importance of considering\ + \ the emotional aspect of the journey and identifying customers' pain points at\ + \ each stage.\n\n3. **Data-Driven Mapping:**\n Mr. Kim proposed using data from\ + \ our CRM system to create a detailed, data-driven customer journey map.\n\n4.\ + \ **Customer Perspective:**\n Ms. Johnson recommended gathering insights from\ + \ the customer's perspective through interviews or surveys.\n\n5. **Analysis and\ + \ Improvement:**\n Ms. Brown suggested analyzing the customer journey map to\ + \ identify areas for improvement and developing strategies to address them.\n\n\ + 6. **Measuring Impact:**\n Mr. Smith stressed the need to measure the impact\ + \ of any changes made by tracking key performance indicators.\n\nTo facilitate\ + \ a comprehensive analysis, we have divided into teams to tackle different aspects\ + \ of the customer journey:\n\n* Team 1: Pre-sale touchpoints\n* Team 2: Sales\n\ + * Team 3: Post-sale support\n\nEach team will share their findings and discuss\ + \ next steps in a week.\n\nYour engagement and insights have been invaluable in\ + \ understanding our customers' experiences and identifying opportunities for improvement.\ + \ I look forward to our continued collaboration as we work towards enhancing our\ + \ services and delivering exceptional customer experiences.\n\nBest regards,\n\ + \n[Your Full Name]\n[Your Position]\n[Company Name]""", + } +] + + +ds = Dataset.from_list(samples) + +skills_flow = SynthGroundedSkillsFlow(client, teacher_model).get_flow() +skills_pipe = Pipeline(skills_flow) + +sdg = SDG([skills_pipe]) +gen_data = sdg.generate(ds) + +print(gen_data) +print(gen_data[0]) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 1a8ebb75..d4a4ec03 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -9,6 +9,7 @@ from .filterblock import FilterByValueBlock from .iterblock import IterBlock from .llmblock import LLMBlock +from .utilblocks import CombineColumnsBlock MODEL_FAMILY_MIXTRAL = "mixtral" MODEL_FAMILY_MERLINITE = "merlinite" @@ -225,8 +226,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_relevancy", "filter_column": "score", - "filter_value": "2", + "filter_value": "2.0", "operation": operator.eq, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -258,8 +260,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_verify_question", "filter_column": "rating", - "filter_value": "1", + "filter_value": "1.0", "operation": operator.eq, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -309,9 +312,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_questions", "filter_column": "score", - "filter_value": 1, + "filter_value": 1.0, "operation": operator.eq, - "convert_dtype": int, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -353,9 +356,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_qa_pair", "filter_column": "score", - "filter_value": 2, + "filter_value": 2.0, "operation": operator.ge, - "convert_dtype": int, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -420,6 +423,7 @@ def get_flow(self) -> list: "batch_kwargs": { "num_procs": 8, "batched": self.batched, + "num_samples": 10, }, }, }, @@ -428,9 +432,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_questions", "filter_column": "score", - "filter_value": 1, + "filter_value": 1.0, "operation": operator.eq, - "convert_dtype": int, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -472,12 +476,24 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_qa_pair", "filter_column": "score", - "filter_value": 2, + "filter_value": 2.0, "operation": operator.ge, - "convert_dtype": int, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, }, }, + { + 'block_type': CombineColumnsBlock, + 'block_config': { + 'block_name': 'combine_question_and_context', + 'columns': ['context', 'question'], + 'output_col': 'question', + 'batch_kwargs': { + 'num_procs': 8, + 'batched': True, + }, + }, + } ] From 757d4dfb7ed2972c5f6f5ad63899c8bc4fbb6bf4 Mon Sep 17 00:00:00 2001 From: Oindrilla Chatterjee Date: Fri, 28 Jun 2024 17:53:59 -0400 Subject: [PATCH 02/10] Refined prompt templates to improve model behavior Changed the prompt templates and alignment with expected outputs. Conducted stress testing across various leaf nodes to ensure accuracy and relevance. Signed-off-by: Oindrilla Chatterjee Co-authored-by: Aakanksha Duggal Co-authored-by: Shiv --- scripts/test_freeform_skills.py | 2 +- scripts/test_grounded_skills.py | 10 ++++----- scripts/test_knowledge.py | 4 ++-- .../skills/evaluate_grounded_pair.yaml | 2 +- .../skills/evaluate_grounded_questions.yaml | 2 +- .../configs/skills/freeform_responses.yaml | 4 ++-- .../configs/skills/grounded_responses.yaml | 5 ++++- src/instructlab/sdg/default_flows.py | 22 +++++++++---------- 8 files changed, 27 insertions(+), 24 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 01232e27..a8612c09 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, teacher_model).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 98dc739d..338edb6c 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -22,7 +22,7 @@ samples = [ { - 'seed_context': """*Ms. Thompson:* Good morning, everyone. Today, we''re here to discuss + "seed_context": """*Ms. Thompson:* Good morning, everyone. Today, we''re here to discuss our customer journey mapping and analysis. I believe this is crucial to understanding our customers'' experiences and improving our services. @@ -64,9 +64,9 @@ *Mr. Patel:* Sounds good. We''ll reconvene in a week to share our findings and discuss next steps.""", - 'seed_question': """Generate a company wide email based on the given meeting transcript""", - 'task_description': 'Writing emails based on minutes of meeting', - 'seed_response': """Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n\ + "seed_question": """Generate a company wide email based on the given meeting transcript""", + "task_description": "Writing emails based on minutes of meeting", + "seed_response": """Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n\ \nDear [Company Name] Team,\n\nI hope this email finds you well. Yesterday, we\ \ gathered to discuss our customer journey mapping and analysis, aiming to understand\ \ our customers' experiences and identify opportunities for improvement. The discussions\ @@ -97,7 +97,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthGroundedSkillsFlow(client, teacher_model).get_flow() +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index d777c8c3..aeedcf59 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, teacher_model).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, teacher_model).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml index 3f40a6fd..45580d3b 100644 --- a/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml +++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml @@ -31,6 +31,7 @@ examples: | [End of Score] generation: | + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Here's the context, question and the answer you need to evaluate: [Start of Context] @@ -45,7 +46,6 @@ generation: | {answer} [End of Answer] - Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. * Return the score between [Start of Score] and [End of Score] tags. diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml index 70f6feb9..6999987f 100644 --- a/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml +++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml @@ -34,7 +34,7 @@ examples: | [End of Score] generation: | - Here's the context and question you need to evaluate: + Here's the context and question you need to evaluate. Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. [Start of Context] {context} diff --git a/src/instructlab/sdg/configs/skills/freeform_responses.yaml b/src/instructlab/sdg/configs/skills/freeform_responses.yaml index 0b0eda38..cf7ff177 100644 --- a/src/instructlab/sdg/configs/skills/freeform_responses.yaml +++ b/src/instructlab/sdg/configs/skills/freeform_responses.yaml @@ -21,13 +21,13 @@ examples: | [End of Response] generation: | - Now generate a response to the following prompt. + Now generate a response to the following prompt. Remember to use the same style and format as the example above. [Start of Question] {question} [End of Question] - Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags. + Return the response between [Start of Response] and [End of Response] tags. start_tags: ["[Start of Response]"] end_tags: ["[End of Response]"] diff --git a/src/instructlab/sdg/configs/skills/grounded_responses.yaml b/src/instructlab/sdg/configs/skills/grounded_responses.yaml index 87429b9a..bacd5c10 100644 --- a/src/instructlab/sdg/configs/skills/grounded_responses.yaml +++ b/src/instructlab/sdg/configs/skills/grounded_responses.yaml @@ -26,7 +26,8 @@ examples: | [End of Response] generation: | - Now generate a response to the following prompt. Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags. + Now generate a response to the following prompt. Remember to use the same style and format as the example above. + Return the response between [Start of Response] and [End of Response] tags. [Start of Context] {context} @@ -35,6 +36,8 @@ generation: | {question} [End of Question] + Return the response between [Start of Response] and [End of Response] tags. + start_tags: ["[Start of Response]"] end_tags: ["[End of Response]"] \ No newline at end of file diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index d4a4ec03..31edd3d6 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -226,7 +226,7 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_relevancy", "filter_column": "score", - "filter_value": "2.0", + "filter_value": 2.0, "operation": operator.eq, "convert_dtype": float, "batch_kwargs": { @@ -260,7 +260,7 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_verify_question", "filter_column": "rating", - "filter_value": "1.0", + "filter_value": 1.0, "operation": operator.eq, "convert_dtype": float, "batch_kwargs": { @@ -485,15 +485,15 @@ def get_flow(self) -> list: }, }, { - 'block_type': CombineColumnsBlock, - 'block_config': { - 'block_name': 'combine_question_and_context', - 'columns': ['context', 'question'], - 'output_col': 'question', - 'batch_kwargs': { - 'num_procs': 8, - 'batched': True, + "block_type": CombineColumnsBlock, + "block_config": { + "block_name": "combine_question_and_context", + "columns": ["context", "question"], + "output_col": "question", + "batch_kwargs": { + "num_procs": 8, + "batched": True, }, }, - } + }, ] From b7977ea9635dd8c1fee09f3eece07eb2742ac9ae Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Fri, 28 Jun 2024 19:04:25 -0400 Subject: [PATCH 03/10] Remove the iterBlock and use openai's 'n' parameter instead --- src/instructlab/sdg/default_flows.py | 63 +++++++++++++--------------- src/instructlab/sdg/iterblock.py | 29 ------------- src/instructlab/sdg/pipeline.py | 6 --- 3 files changed, 29 insertions(+), 69 deletions(-) delete mode 100644 src/instructlab/sdg/iterblock.py diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 31edd3d6..6ea7f85e 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -7,7 +7,6 @@ # Local from .filterblock import FilterByValueBlock -from .iterblock import IterBlock from .llmblock import LLMBlock from .utilblocks import CombineColumnsBlock @@ -46,12 +45,10 @@ class _SimpleFlow(Flow): def get_flow(self) -> list: return [ { - "block_type": IterBlock, - "block_config": { - "block_name": "", # must be set by subclass - "num_iters": self.num_iters, - "block_type": LLMBlock, - "block_kwargs": { + "block_type": LLMBlock, + "block_name": "", # must be set by subclass + "num_iters": self.num_iters, + "block_kwargs": { "block_name": "", # must be set by subclass "config_path": "", # must be set by subclass "client": self.client, @@ -62,13 +59,13 @@ def get_flow(self) -> list: "num_procs": 8, "batched": self.batched, }, - }, - "gen_kwargs": { - "max_tokens": 2048, - "temperature": 0.7, - }, - "drop_duplicates": ["output"], }, + "gen_kwargs": { + "max_tokens": 2048, + "temperature": 0.7, + "n": 1 + }, + "drop_duplicates": ["output"], } ] @@ -372,27 +369,25 @@ class SynthGroundedSkillsFlow(Flow): def get_flow(self) -> list: return [ { - "block_type": IterBlock, - "block_config": { - "block_name": "context_iter", - "num_iters": 10, - "block_type": LLMBlock, - "block_kwargs": { - "block_name": "gen_contexts", - "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml", - "client": self.client, - "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), - "output_cols": ["context"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, - }, - "gen_kwargs": { - "temperature": 0.7, - "max_tokens": 2048, - }, + "block_type": LLMBlock, + "block_name": "context_iter", + "block_kwargs": { + "block_name": "gen_contexts", + "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": _get_model_prompt(self.model_family), + "output_cols": ["context"], + "batch_kwargs": { + "num_procs": 8, + "batched": self.batched, + } + }, + "gen_kwargs": { + "num_samples": 30, + "temperature": 0.7, + "max_tokens": 2048, + "n": 10 }, }, { diff --git a/src/instructlab/sdg/iterblock.py b/src/instructlab/sdg/iterblock.py deleted file mode 100644 index 21a20470..00000000 --- a/src/instructlab/sdg/iterblock.py +++ /dev/null @@ -1,29 +0,0 @@ -# Third Party -from datasets import Dataset - -# Local -from .block import Block -from .logger_config import setup_logger - -logger = setup_logger(__name__) - - -class IterBlock(Block): - def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs): - super().__init__(block_name) - self.num_iters = num_iters - self.block = block_type(**block_kwargs) - self.gen_kwargs = kwargs.get("gen_kwargs", {}) - self.gen_kwargs = kwargs.get("gen_kwargs", {}) - - def generate(self, samples, **gen_kwargs) -> Dataset: - generated_samples = [] - num_iters = self.num_iters - - for _ in range(num_iters): - batch_generated = self.block.generate( - samples, **{**self.gen_kwargs, **gen_kwargs} - ) - generated_samples.extend(batch_generated) - - return Dataset.from_list(generated_samples) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index fc93f78d..982a6ecb 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -39,12 +39,6 @@ def generate(self, dataset) -> Dataset: drop_duplicates_cols = block_prop.get("drop_duplicates", False) block = block_type(**block_config) - if block_type == IterBlock: - block_kwargs = block_config.pop("block_kwargs") - block = block_type(**block_config, block_kwargs=block_kwargs) - else: - block = block_type(**block_config) - logger.info("Running block: %s", block_config["block_name"]) logger.info(dataset) From 2ca0534e3e08d18f52c5a304c564bb59af4fd9c9 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Fri, 28 Jun 2024 20:01:10 -0400 Subject: [PATCH 04/10] some debug --- scripts/test_grounded_skills.py | 2 +- src/instructlab/sdg/default_flows.py | 30 ++++++++++++---------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 338edb6c..b4edcd59 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -97,7 +97,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow() +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 6ea7f85e..9fdb2818 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -29,11 +29,10 @@ def _get_model_prompt(model_family): class Flow(ABC): - def __init__(self, client, model_family, model_id, num_iters, batched=True) -> None: + def __init__(self, client, model_family, model_id, batched=True) -> None: self.client = client self.model_family = model_family self.model_id = model_id - self.num_iters = num_iters self.batched = batched @abstractmethod @@ -46,19 +45,17 @@ def get_flow(self) -> list: return [ { "block_type": LLMBlock, - "block_name": "", # must be set by subclass - "num_iters": self.num_iters, - "block_kwargs": { - "block_name": "", # must be set by subclass - "config_path": "", # must be set by subclass - "client": self.client, - "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), - "output_cols": ["output"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, + "block_config": { + "block_name": "", # must be set by subclass + "config_path": "", # must be set by subclass + "client": self.client, + "model_id": self.model_id, + "model_prompt": _get_model_prompt(self.model_family), + "output_cols": ["output"], + "batch_kwargs": { + "num_procs": 8, + "batched": self.batched, + }, }, "gen_kwargs": { "max_tokens": 2048, @@ -370,8 +367,7 @@ def get_flow(self) -> list: return [ { "block_type": LLMBlock, - "block_name": "context_iter", - "block_kwargs": { + "block_config": { "block_name": "gen_contexts", "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml", "client": self.client, From 6f1784c8129c9b539c209f45a76a4ce01b4191cb Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Sun, 30 Jun 2024 23:35:03 -0400 Subject: [PATCH 05/10] fix zipping of samples and outputs --- scripts/test_freeform_skills.py | 2 +- scripts/test_knowledge.py | 4 ++-- src/instructlab/sdg/llmblock.py | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index a8612c09..9b5ce810 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index aeedcf59..75bd7783 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 7952609a..ce333071 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -123,8 +123,11 @@ def generate(self, samples, **gen_kwargs) -> Dataset: outputs = [self._generate([sample], **gen_kwargs)[0] for sample in samples] logger.debug("Generated outputs: {}".format(outputs)) + num_parallel_samples = gen_kwargs.get("n", 1) + n_samples = [item for item in samples for i in range(num_parallel_samples)] + new_data = [] - for sample, output in zip(samples, outputs): + for sample, output in zip(n_samples, outputs): parsed_outputs = self._parse(output) # pylint: disable=consider-using-generator max_length = max([len(value) for value in parsed_outputs.values()]) From 7e8282dafb258a4a6a96f2bc2dffa7e3c443d02d Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Mon, 1 Jul 2024 14:13:51 -0400 Subject: [PATCH 06/10] some refactoring --- src/instructlab/sdg/llmblock.py | 6 ++++-- src/instructlab/sdg/pipeline.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index ce333071..338b8d2b 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -124,10 +124,12 @@ def generate(self, samples, **gen_kwargs) -> Dataset: logger.debug("Generated outputs: {}".format(outputs)) num_parallel_samples = gen_kwargs.get("n", 1) - n_samples = [item for item in samples for i in range(num_parallel_samples)] + extended_samples = [] + for item in samples: + extended_samples.extend([item] * num_parallel_samples) new_data = [] - for sample, output in zip(n_samples, outputs): + for sample, output in zip(extended_samples, outputs): parsed_outputs = self._parse(output) # pylint: disable=consider-using-generator max_length = max([len(value) for value in parsed_outputs.values()]) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 982a6ecb..bc570a83 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -3,7 +3,6 @@ from datasets import Dataset # Local -from .iterblock import IterBlock from .logger_config import setup_logger logger = setup_logger(__name__) From 09fdffbd6f78e87edaf783eff9bc36f3359f4dc3 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Tue, 2 Jul 2024 15:44:54 -0400 Subject: [PATCH 07/10] fix the num_samples location --- src/instructlab/sdg/default_flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 9fdb2818..b3074066 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -375,12 +375,12 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["context"], "batch_kwargs": { + "num_samples": 30, "num_procs": 8, "batched": self.batched, } }, "gen_kwargs": { - "num_samples": 30, "temperature": 0.7, "max_tokens": 2048, "n": 10 From 1ac689a9a5c067b34fa75a558545281fa601585d Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Tue, 2 Jul 2024 17:40:29 -0400 Subject: [PATCH 08/10] update generate_data API calls --- scripts/test_freeform_skills.py | 2 +- scripts/test_grounded_skills.py | 2 +- scripts/test_knowledge.py | 4 ++-- src/instructlab/sdg/default_flows.py | 13 +++++++------ src/instructlab/sdg/generate_data.py | 8 ++++---- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 9b5ce810..9b1f443a 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index b4edcd59..abbce46f 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -97,7 +97,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model).get_flow() +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 75bd7783..aa7bfbcd 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 30).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 30).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index b3074066..cd91099e 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -29,10 +29,11 @@ def _get_model_prompt(model_family): class Flow(ABC): - def __init__(self, client, model_family, model_id, batched=True) -> None: + def __init__(self, client, model_family, model_id, num_instructions_to_generate, batched=True) -> None: self.client = client self.model_family = model_family self.model_id = model_id + self.num_instructions_to_generate = num_instructions_to_generate self.batched = batched @abstractmethod @@ -60,7 +61,7 @@ def get_flow(self) -> list: "gen_kwargs": { "max_tokens": 2048, "temperature": 0.7, - "n": 1 + "n": self.num_instructions_to_generate }, "drop_duplicates": ["output"], } @@ -280,7 +281,7 @@ def get_flow(self) -> list: "output_cols": ["question"], "batch_kwargs": { "num_procs": 8, - "num_samples": 30, + "num_samples": self.num_instructions_to_generate, "batched": self.batched, }, }, @@ -375,7 +376,6 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["context"], "batch_kwargs": { - "num_samples": 30, "num_procs": 8, "batched": self.batched, } @@ -383,8 +383,9 @@ def get_flow(self) -> list: "gen_kwargs": { "temperature": 0.7, "max_tokens": 2048, - "n": 10 + "n": self.num_instructions_to_generate }, + "drop_duplicates": ["context"], }, { "block_type": LLMBlock, @@ -396,6 +397,7 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["question"], "batch_kwargs": { + "num_samples": 3, "num_procs": 8, "batched": self.batched, }, @@ -414,7 +416,6 @@ def get_flow(self) -> list: "batch_kwargs": { "num_procs": 8, "batched": self.batched, - "num_samples": 10, }, }, }, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 66a2987e..b82cda15 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -124,7 +124,7 @@ def _gen_test_data( outfile.write("\n") -def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): +def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate, batched): knowledge_flow_types = [] freeform_skill_flow_types = [] grounded_skill_flow_types = [] @@ -144,7 +144,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in knowledge_flow_types @@ -154,7 +154,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in freeform_skill_flow_types @@ -164,7 +164,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in grounded_skill_flow_types From 0c908047e1d1ce912f80a3130c606a8b04f6c689 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Tue, 2 Jul 2024 17:46:01 -0400 Subject: [PATCH 09/10] change back the defaults in test scripts --- scripts/test_freeform_skills.py | 2 +- scripts/test_grounded_skills.py | 2 +- scripts/test_knowledge.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 9b1f443a..a8612c09 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index abbce46f..338edb6c 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -97,7 +97,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index aa7bfbcd..aeedcf59 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 30).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 30).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) From 20b9f7ce6e40d10b39340e6be960c21c18a933ee Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Tue, 2 Jul 2024 18:24:33 -0400 Subject: [PATCH 10/10] fix SimpleFlows --- src/instructlab/sdg/default_flows.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index cd91099e..f3de0a8b 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -72,10 +72,9 @@ class SimpleKnowledgeFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() sdg_base = resources.files(__package__) - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( sdg_base, "configs/knowledge/simple_generate_qa.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_knowledge" flow[0]["block_config"]["block_name"] = "gen_knowledge" return flow @@ -84,10 +83,9 @@ class SimpleFreeformSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() sdg_base = resources.files(__package__) - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( sdg_base, "configs/skills/simple_generate_qa_freeform.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_freeform" flow[0]["block_config"]["block_name"] = "gen_skill_freeform" return flow @@ -96,10 +94,9 @@ class SimpleGroundedSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() sdg_base = resources.files(__package__) - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( sdg_base, "configs/skills/simple_generate_qa_grounded.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_grounded" flow[0]["block_config"]["block_name"] = "gen_skill_grounded" return flow