From 01c24aa64df856be11664433628fa37ead21c6f9 Mon Sep 17 00:00:00 2001
From: Oindrilla Chatterjee <oc@bu.edu>
Date: Fri, 28 Jun 2024 17:47:19 -0400
Subject: [PATCH 01/10] Combine question and context for training preparation
 and testing script

Combining the context and question columns is a step towards converting the generated data to the final format (messages) required for training.

Signed-off-by: Oindrilla Chatterjee <oc@bu.edu>
Co-authored-by: Aakanksha Duggal <aduggal@redhat.com>
Co-authored-by: Shiv <shivchander.s30@gmail.com>
---
 scripts/test_grounded_skills.py      | 107 +++++++++++++++++++++++++++
 src/instructlab/sdg/default_flows.py |  36 ++++++---
 2 files changed, 133 insertions(+), 10 deletions(-)
 create mode 100644 scripts/test_grounded_skills.py

diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
new file mode 100644
index 00000000..98dc739d
--- /dev/null
+++ b/scripts/test_grounded_skills.py
@@ -0,0 +1,107 @@
+# Third Party
+from datasets import Dataset
+from openai import OpenAI
+
+# First Party
+from src.instructlab.sdg import SDG
+from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow
+from src.instructlab.sdg.pipeline import Pipeline
+
+# for vLLM endpoints, the api_key remains "EMPTY"
+openai_api_key = "EMPTY"
+openai_api_base = "Add model endpoint here"
+
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+teacher_model = models.data[0].id
+
+samples = [
+    {
+        'seed_context': """*Ms. Thompson:* Good morning, everyone. Today, we''re here to discuss
+    our customer journey mapping and analysis. I believe this is crucial to understanding
+    our customers'' experiences and improving our services.
+
+
+    *Mr. Patel:* I agree, Lisa. We should start by identifying all touchpoints in
+    our customer journey, from initial contact to post-sale support.
+
+
+    *Ms. Rodriguez:* Yes, and let''s not forget about the emotional aspect of the
+    journey. How do our customers feel at each stage? What are their pain points?
+
+
+    *Mr. Kim:* We can use data from our CRM system to track the customer journey and
+    gather insights. This will help us create a detailed, data-driven map.
+
+
+    *Ms. Johnson:* Once we have the map, we can analyze it to identify areas for improvement.
+    Perhaps there are steps where customers drop off or express dissatisfaction.
+
+
+    *Mr. Davis:* We should also consider the customer''s perspective. Conducting interviews
+    or surveys can provide valuable insights into their thoughts and feelings.
+
+
+    *Ms. Brown:* Absolutely. And once we''ve identified areas for improvement, we
+    can develop strategies to address them. This might involve redesigning certain
+    touchpoints, enhancing our communication, or streamlining processes.
+
+
+    *Mr. Smith:* And we must remember to measure the impact of any changes we make.
+    By tracking key performance indicators, we can determine whether our efforts are
+    successful.
+
+
+    *Ms. Thompson:* Great suggestions, everyone. Let''s divide into teams to tackle
+    different aspects of the customer journey. Team 1 will focus on pre-sale touchpoints,
+    Team 2 on sales, and Team 3 on post-sale support.
+
+
+    *Mr. Patel:* Sounds good. We''ll reconvene in a week to share our findings and
+    discuss next steps.""",
+        'seed_question': """Generate a company wide email based on the given meeting transcript""", 
+        'task_description': 'Writing emails based on minutes of meeting', 
+        'seed_response': """Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n\
+    \nDear [Company Name] Team,\n\nI hope this email finds you well. Yesterday, we\
+    \ gathered to discuss our customer journey mapping and analysis, aiming to understand\
+    \ our customers' experiences and identify opportunities for improvement. The discussions\
+    \ were fruitful, and I want to share a brief overview of the key points and outcomes.\n\
+    \n1. **Identifying Touchpoints:**\n   Mr. Patel suggested mapping all touchpoints\
+    \ in our customer journey, from initial contact to post-sale support.\n\n2. **Emotional\
+    \ Aspect and Pain Points:**\n   Ms. Rodriguez emphasized the importance of considering\
+    \ the emotional aspect of the journey and identifying customers' pain points at\
+    \ each stage.\n\n3. **Data-Driven Mapping:**\n   Mr. Kim proposed using data from\
+    \ our CRM system to create a detailed, data-driven customer journey map.\n\n4.\
+    \ **Customer Perspective:**\n   Ms. Johnson recommended gathering insights from\
+    \ the customer's perspective through interviews or surveys.\n\n5. **Analysis and\
+    \ Improvement:**\n   Ms. Brown suggested analyzing the customer journey map to\
+    \ identify areas for improvement and developing strategies to address them.\n\n\
+    6. **Measuring Impact:**\n   Mr. Smith stressed the need to measure the impact\
+    \ of any changes made by tracking key performance indicators.\n\nTo facilitate\
+    \ a comprehensive analysis, we have divided into teams to tackle different aspects\
+    \ of the customer journey:\n\n* Team 1: Pre-sale touchpoints\n* Team 2: Sales\n\
+    * Team 3: Post-sale support\n\nEach team will share their findings and discuss\
+    \ next steps in a week.\n\nYour engagement and insights have been invaluable in\
+    \ understanding our customers' experiences and identifying opportunities for improvement.\
+    \ I look forward to our continued collaboration as we work towards enhancing our\
+    \ services and delivering exceptional customer experiences.\n\nBest regards,\n\
+    \n[Your Full Name]\n[Your Position]\n[Company Name]""",
+    }
+]
+
+
+ds = Dataset.from_list(samples)
+
+skills_flow = SynthGroundedSkillsFlow(client, teacher_model).get_flow()
+skills_pipe = Pipeline(skills_flow)
+
+sdg = SDG([skills_pipe])
+gen_data = sdg.generate(ds)
+
+print(gen_data)
+print(gen_data[0])
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index 1a8ebb75..d4a4ec03 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -9,6 +9,7 @@
 from .filterblock import FilterByValueBlock
 from .iterblock import IterBlock
 from .llmblock import LLMBlock
+from .utilblocks import CombineColumnsBlock
 
 MODEL_FAMILY_MIXTRAL = "mixtral"
 MODEL_FAMILY_MERLINITE = "merlinite"
@@ -225,8 +226,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_relevancy",
                     "filter_column": "score",
-                    "filter_value": "2",
+                    "filter_value": "2.0",
                     "operation": operator.eq,
+                    "convert_dtype": float,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -258,8 +260,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_verify_question",
                     "filter_column": "rating",
-                    "filter_value": "1",
+                    "filter_value": "1.0",
                     "operation": operator.eq,
+                    "convert_dtype": float,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -309,9 +312,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_questions",
                     "filter_column": "score",
-                    "filter_value": 1,
+                    "filter_value": 1.0,
                     "operation": operator.eq,
-                    "convert_dtype": int,
+                    "convert_dtype": float,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -353,9 +356,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_qa_pair",
                     "filter_column": "score",
-                    "filter_value": 2,
+                    "filter_value": 2.0,
                     "operation": operator.ge,
-                    "convert_dtype": int,
+                    "convert_dtype": float,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -420,6 +423,7 @@ def get_flow(self) -> list:
                     "batch_kwargs": {
                         "num_procs": 8,
                         "batched": self.batched,
+                        "num_samples": 10,
                     },
                 },
             },
@@ -428,9 +432,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_grounded_questions",
                     "filter_column": "score",
-                    "filter_value": 1,
+                    "filter_value": 1.0,
                     "operation": operator.eq,
-                    "convert_dtype": int,
+                    "convert_dtype": float,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -472,12 +476,24 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_grounded_qa_pair",
                     "filter_column": "score",
-                    "filter_value": 2,
+                    "filter_value": 2.0,
                     "operation": operator.ge,
-                    "convert_dtype": int,
+                    "convert_dtype": float,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
                 },
             },
+            {
+                'block_type': CombineColumnsBlock,
+                'block_config': {
+                    'block_name': 'combine_question_and_context',
+                    'columns': ['context', 'question'],
+                    'output_col': 'question',
+                    'batch_kwargs': {
+                        'num_procs': 8,
+                        'batched': True,
+                    },
+                },
+            }
         ]

From 757d4dfb7ed2972c5f6f5ad63899c8bc4fbb6bf4 Mon Sep 17 00:00:00 2001
From: Oindrilla Chatterjee <oc@bu.edu>
Date: Fri, 28 Jun 2024 17:53:59 -0400
Subject: [PATCH 02/10] Refined prompt templates to improve model behavior

Adjusted the prompt templates to improve alignment with expected outputs. Stress-tested the changes across various leaf nodes to ensure accuracy and relevance.

Signed-off-by: Oindrilla Chatterjee <oc@bu.edu>
Co-authored-by: Aakanksha Duggal <aduggal@redhat.com>
Co-authored-by: Shiv <shivchander.s30@gmail.com>
---
 scripts/test_freeform_skills.py               |  2 +-
 scripts/test_grounded_skills.py               | 10 ++++-----
 scripts/test_knowledge.py                     |  4 ++--
 .../skills/evaluate_grounded_pair.yaml        |  2 +-
 .../skills/evaluate_grounded_questions.yaml   |  2 +-
 .../configs/skills/freeform_responses.yaml    |  4 ++--
 .../configs/skills/grounded_responses.yaml    |  5 ++++-
 src/instructlab/sdg/default_flows.py          | 22 +++++++++----------
 8 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index 01232e27..a8612c09 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -49,7 +49,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthSkillsFlow(client, teacher_model).get_flow()
+skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index 98dc739d..338edb6c 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -22,7 +22,7 @@
 
 samples = [
     {
-        'seed_context': """*Ms. Thompson:* Good morning, everyone. Today, we''re here to discuss
+        "seed_context": """*Ms. Thompson:* Good morning, everyone. Today, we''re here to discuss
     our customer journey mapping and analysis. I believe this is crucial to understanding
     our customers'' experiences and improving our services.
 
@@ -64,9 +64,9 @@
 
     *Mr. Patel:* Sounds good. We''ll reconvene in a week to share our findings and
     discuss next steps.""",
-        'seed_question': """Generate a company wide email based on the given meeting transcript""", 
-        'task_description': 'Writing emails based on minutes of meeting', 
-        'seed_response': """Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n\
+        "seed_question": """Generate a company wide email based on the given meeting transcript""",
+        "task_description": "Writing emails based on minutes of meeting",
+        "seed_response": """Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n\
     \nDear [Company Name] Team,\n\nI hope this email finds you well. Yesterday, we\
     \ gathered to discuss our customer journey mapping and analysis, aiming to understand\
     \ our customers' experiences and identify opportunities for improvement. The discussions\
@@ -97,7 +97,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthGroundedSkillsFlow(client, teacher_model).get_flow()
+skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index d777c8c3..aeedcf59 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -38,8 +38,8 @@
 
 ds = Dataset.from_list(samples)
 
-mmlu_flow = MMLUBenchFlow(client, teacher_model).get_flow()
-knowledge_flow = SynthKnowledgeFlow(client, teacher_model).get_flow()
+mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow()
+knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow()
 knowledge_pipe = Pipeline(knowledge_flow)
 mmlu_pipe = Pipeline(mmlu_flow)
 
diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml
index 3f40a6fd..45580d3b 100644
--- a/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml
+++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml
@@ -31,6 +31,7 @@ examples: |
    [End of Score]
 
 generation: |
+  Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. 
   Here's the context, question and the answer you need to evaluate:
   
   [Start of Context]
@@ -45,7 +46,6 @@ generation: |
   {answer}
   [End of Answer]
 
-  Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. 
   * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags.
   * Return the score between [Start of Score] and [End of Score] tags.
 
diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml
index 70f6feb9..6999987f 100644
--- a/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml
+++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml
@@ -34,7 +34,7 @@ examples: |
    [End of Score]
 
 generation: |   
-   Here's the context and question you need to evaluate:
+   Here's the context and question you need to evaluate. Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags.
 
    [Start of Context]
    {context}
diff --git a/src/instructlab/sdg/configs/skills/freeform_responses.yaml b/src/instructlab/sdg/configs/skills/freeform_responses.yaml
index 0b0eda38..cf7ff177 100644
--- a/src/instructlab/sdg/configs/skills/freeform_responses.yaml
+++ b/src/instructlab/sdg/configs/skills/freeform_responses.yaml
@@ -21,13 +21,13 @@ examples: |
   [End of Response]
 
 generation: |
-  Now generate a response to the following prompt. 
+  Now generate a response to the following prompt. Remember to use the same style and format as the example above. 
 
   [Start of Question]
   {question}
   [End of Question]
 
-  Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags.
+  Return the response between [Start of Response] and [End of Response] tags.
 
 start_tags: ["[Start of Response]"]
 end_tags: ["[End of Response]"]
diff --git a/src/instructlab/sdg/configs/skills/grounded_responses.yaml b/src/instructlab/sdg/configs/skills/grounded_responses.yaml
index 87429b9a..bacd5c10 100644
--- a/src/instructlab/sdg/configs/skills/grounded_responses.yaml
+++ b/src/instructlab/sdg/configs/skills/grounded_responses.yaml
@@ -26,7 +26,8 @@ examples: |
   [End of Response]
 
 generation: |
-  Now generate a response to the following prompt. Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags.
+  Now generate a response to the following prompt. Remember to use the same style and format as the example above. 
+  Return the response between [Start of Response] and [End of Response] tags.
 
   [Start of Context]
   {context}
@@ -35,6 +36,8 @@ generation: |
   {question}
   [End of Question]
 
+  Return the response between [Start of Response] and [End of Response] tags.
+
 
 start_tags: ["[Start of Response]"]
 end_tags: ["[End of Response]"]
\ No newline at end of file
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index d4a4ec03..31edd3d6 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -226,7 +226,7 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_relevancy",
                     "filter_column": "score",
-                    "filter_value": "2.0",
+                    "filter_value": 2.0,
                     "operation": operator.eq,
                     "convert_dtype": float,
                     "batch_kwargs": {
@@ -260,7 +260,7 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_verify_question",
                     "filter_column": "rating",
-                    "filter_value": "1.0",
+                    "filter_value": 1.0,
                     "operation": operator.eq,
                     "convert_dtype": float,
                     "batch_kwargs": {
@@ -485,15 +485,15 @@ def get_flow(self) -> list:
                 },
             },
             {
-                'block_type': CombineColumnsBlock,
-                'block_config': {
-                    'block_name': 'combine_question_and_context',
-                    'columns': ['context', 'question'],
-                    'output_col': 'question',
-                    'batch_kwargs': {
-                        'num_procs': 8,
-                        'batched': True,
+                "block_type": CombineColumnsBlock,
+                "block_config": {
+                    "block_name": "combine_question_and_context",
+                    "columns": ["context", "question"],
+                    "output_col": "question",
+                    "batch_kwargs": {
+                        "num_procs": 8,
+                        "batched": True,
                     },
                 },
-            }
+            },
         ]

From b7977ea9635dd8c1fee09f3eece07eb2742ac9ae Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Fri, 28 Jun 2024 19:04:25 -0400
Subject: [PATCH 03/10] Remove the iterBlock and use openai's 'n' parameter
 instead

---
 src/instructlab/sdg/default_flows.py | 63 +++++++++++++---------------
 src/instructlab/sdg/iterblock.py     | 29 -------------
 src/instructlab/sdg/pipeline.py      |  6 ---
 3 files changed, 29 insertions(+), 69 deletions(-)
 delete mode 100644 src/instructlab/sdg/iterblock.py

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index 31edd3d6..6ea7f85e 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -7,7 +7,6 @@
 
 # Local
 from .filterblock import FilterByValueBlock
-from .iterblock import IterBlock
 from .llmblock import LLMBlock
 from .utilblocks import CombineColumnsBlock
 
@@ -46,12 +45,10 @@ class _SimpleFlow(Flow):
     def get_flow(self) -> list:
         return [
             {
-                "block_type": IterBlock,
-                "block_config": {
-                    "block_name": "",  # must be set by subclass
-                    "num_iters": self.num_iters,
-                    "block_type": LLMBlock,
-                    "block_kwargs": {
+                "block_type": LLMBlock,
+                "block_name": "",  # must be set by subclass
+                "num_iters": self.num_iters,
+                "block_kwargs": {
                         "block_name": "",  # must be set by subclass
                         "config_path": "",  # must be set by subclass
                         "client": self.client,
@@ -62,13 +59,13 @@ def get_flow(self) -> list:
                             "num_procs": 8,
                             "batched": self.batched,
                         },
-                    },
-                    "gen_kwargs": {
-                        "max_tokens": 2048,
-                        "temperature": 0.7,
-                    },
-                    "drop_duplicates": ["output"],
                 },
+                "gen_kwargs": {
+                    "max_tokens": 2048,
+                    "temperature": 0.7,
+                    "n": 1
+                },
+                "drop_duplicates": ["output"],
             }
         ]
 
@@ -372,27 +369,25 @@ class SynthGroundedSkillsFlow(Flow):
     def get_flow(self) -> list:
         return [
             {
-                "block_type": IterBlock,
-                "block_config": {
-                    "block_name": "context_iter",
-                    "num_iters": 10,
-                    "block_type": LLMBlock,
-                    "block_kwargs": {
-                        "block_name": "gen_contexts",
-                        "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml",
-                        "client": self.client,
-                        "model_id": self.model_id,
-                        "model_prompt": _get_model_prompt(self.model_family),
-                        "output_cols": ["context"],
-                        "batch_kwargs": {
-                            "num_procs": 8,
-                            "batched": self.batched,
-                        },
-                    },
-                    "gen_kwargs": {
-                        "temperature": 0.7,
-                        "max_tokens": 2048,
-                    },
+                "block_type": LLMBlock,
+                "block_name": "context_iter",
+                "block_kwargs": {
+                    "block_name": "gen_contexts",
+                    "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml",
+                    "client": self.client,
+                    "model_id": self.model_id,
+                    "model_prompt": _get_model_prompt(self.model_family),
+                    "output_cols": ["context"],
+                    "batch_kwargs": {
+                        "num_procs": 8,
+                        "batched": self.batched,
+                    }
+                },
+                "gen_kwargs": {
+                    "num_samples": 30,
+                    "temperature": 0.7,
+                    "max_tokens": 2048,
+                    "n": 10
                 },
             },
             {
diff --git a/src/instructlab/sdg/iterblock.py b/src/instructlab/sdg/iterblock.py
deleted file mode 100644
index 21a20470..00000000
--- a/src/instructlab/sdg/iterblock.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Third Party
-from datasets import Dataset
-
-# Local
-from .block import Block
-from .logger_config import setup_logger
-
-logger = setup_logger(__name__)
-
-
-class IterBlock(Block):
-    def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
-        super().__init__(block_name)
-        self.num_iters = num_iters
-        self.block = block_type(**block_kwargs)
-        self.gen_kwargs = kwargs.get("gen_kwargs", {})
-        self.gen_kwargs = kwargs.get("gen_kwargs", {})
-
-    def generate(self, samples, **gen_kwargs) -> Dataset:
-        generated_samples = []
-        num_iters = self.num_iters
-
-        for _ in range(num_iters):
-            batch_generated = self.block.generate(
-                samples, **{**self.gen_kwargs, **gen_kwargs}
-            )
-            generated_samples.extend(batch_generated)
-
-        return Dataset.from_list(generated_samples)
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index fc93f78d..982a6ecb 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -39,12 +39,6 @@ def generate(self, dataset) -> Dataset:
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
             block = block_type(**block_config)
 
-            if block_type == IterBlock:
-                block_kwargs = block_config.pop("block_kwargs")
-                block = block_type(**block_config, block_kwargs=block_kwargs)
-            else:
-                block = block_type(**block_config)
-
             logger.info("Running block: %s", block_config["block_name"])
             logger.info(dataset)
 

From 2ca0534e3e08d18f52c5a304c564bb59af4fd9c9 Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Fri, 28 Jun 2024 20:01:10 -0400
Subject: [PATCH 04/10] Drop num_iters from Flow and use block_config for LLMBlock

---
 scripts/test_grounded_skills.py      |  2 +-
 src/instructlab/sdg/default_flows.py | 30 ++++++++++++----------------
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index 338edb6c..b4edcd59 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -97,7 +97,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow()
+skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index 6ea7f85e..9fdb2818 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -29,11 +29,10 @@ def _get_model_prompt(model_family):
 
 
 class Flow(ABC):
-    def __init__(self, client, model_family, model_id, num_iters, batched=True) -> None:
+    def __init__(self, client, model_family, model_id, batched=True) -> None:
         self.client = client
         self.model_family = model_family
         self.model_id = model_id
-        self.num_iters = num_iters
         self.batched = batched
 
     @abstractmethod
@@ -46,19 +45,17 @@ def get_flow(self) -> list:
         return [
             {
                 "block_type": LLMBlock,
-                "block_name": "",  # must be set by subclass
-                "num_iters": self.num_iters,
-                "block_kwargs": {
-                        "block_name": "",  # must be set by subclass
-                        "config_path": "",  # must be set by subclass
-                        "client": self.client,
-                        "model_id": self.model_id,
-                        "model_prompt": _get_model_prompt(self.model_family),
-                        "output_cols": ["output"],
-                        "batch_kwargs": {
-                            "num_procs": 8,
-                            "batched": self.batched,
-                        },
+                "block_config": {
+                    "block_name": "",  # must be set by subclass
+                    "config_path": "",  # must be set by subclass
+                    "client": self.client,
+                    "model_id": self.model_id,
+                    "model_prompt": _get_model_prompt(self.model_family),
+                    "output_cols": ["output"],
+                    "batch_kwargs": {
+                        "num_procs": 8,
+                        "batched": self.batched,
+                    },
                 },
                 "gen_kwargs": {
                     "max_tokens": 2048,
@@ -370,8 +367,7 @@ def get_flow(self) -> list:
         return [
             {
                 "block_type": LLMBlock,
-                "block_name": "context_iter",
-                "block_kwargs": {
+                "block_config": {
                     "block_name": "gen_contexts",
                     "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml",
                     "client": self.client,

From 6f1784c8129c9b539c209f45a76a4ce01b4191cb Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Sun, 30 Jun 2024 23:35:03 -0400
Subject: [PATCH 05/10] fix zipping of samples and outputs

---
 scripts/test_freeform_skills.py | 2 +-
 scripts/test_knowledge.py       | 4 ++--
 src/instructlab/sdg/llmblock.py | 5 ++++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index a8612c09..9b5ce810 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -49,7 +49,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow()
+skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index aeedcf59..75bd7783 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -38,8 +38,8 @@
 
 ds = Dataset.from_list(samples)
 
-mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow()
-knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow()
+mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model).get_flow()
+knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model).get_flow()
 knowledge_pipe = Pipeline(knowledge_flow)
 mmlu_pipe = Pipeline(mmlu_flow)
 
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index 7952609a..ce333071 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -123,8 +123,11 @@ def generate(self, samples, **gen_kwargs) -> Dataset:
             outputs = [self._generate([sample], **gen_kwargs)[0] for sample in samples]
         logger.debug("Generated outputs: {}".format(outputs))
 
+        num_parallel_samples = gen_kwargs.get("n", 1)
+        n_samples = [item for item in samples for i in range(num_parallel_samples)]
+
         new_data = []
-        for sample, output in zip(samples, outputs):
+        for sample, output in zip(n_samples, outputs):
             parsed_outputs = self._parse(output)
             # pylint: disable=consider-using-generator
             max_length = max([len(value) for value in parsed_outputs.values()])

From 7e8282dafb258a4a6a96f2bc2dffa7e3c443d02d Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Mon, 1 Jul 2024 14:13:51 -0400
Subject: [PATCH 06/10] Simplify sample expansion and drop stale IterBlock import

---
 src/instructlab/sdg/llmblock.py | 6 ++++--
 src/instructlab/sdg/pipeline.py | 1 -
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index ce333071..338b8d2b 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -124,10 +124,12 @@ def generate(self, samples, **gen_kwargs) -> Dataset:
         logger.debug("Generated outputs: {}".format(outputs))
 
         num_parallel_samples = gen_kwargs.get("n", 1)
-        n_samples = [item for item in samples for i in range(num_parallel_samples)]
+        extended_samples = []
+        for item in samples:
+            extended_samples.extend([item] * num_parallel_samples)
 
         new_data = []
-        for sample, output in zip(n_samples, outputs):
+        for sample, output in zip(extended_samples, outputs):
             parsed_outputs = self._parse(output)
             # pylint: disable=consider-using-generator
             max_length = max([len(value) for value in parsed_outputs.values()])
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 982a6ecb..bc570a83 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -3,7 +3,6 @@
 from datasets import Dataset
 
 # Local
-from .iterblock import IterBlock
 from .logger_config import setup_logger
 
 logger = setup_logger(__name__)

From 09fdffbd6f78e87edaf783eff9bc36f3359f4dc3 Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Tue, 2 Jul 2024 15:44:54 -0400
Subject: [PATCH 07/10] Move num_samples from gen_kwargs to batch_kwargs

---
 src/instructlab/sdg/default_flows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index 9fdb2818..b3074066 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -375,12 +375,12 @@ def get_flow(self) -> list:
                     "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["context"],
                     "batch_kwargs": {
+                        "num_samples": 30,
                         "num_procs": 8,
                         "batched": self.batched,
                     }
                 },
                 "gen_kwargs": {
-                    "num_samples": 30,
                     "temperature": 0.7,
                     "max_tokens": 2048,
                     "n": 10

From 1ac689a9a5c067b34fa75a558545281fa601585d Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Tue, 2 Jul 2024 17:40:29 -0400
Subject: [PATCH 08/10] Pass num_instructions_to_generate through flows and _sdg_init

---
 scripts/test_freeform_skills.py      |  2 +-
 scripts/test_grounded_skills.py      |  2 +-
 scripts/test_knowledge.py            |  4 ++--
 src/instructlab/sdg/default_flows.py | 13 +++++++------
 src/instructlab/sdg/generate_data.py |  8 ++++----
 5 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index 9b5ce810..9b1f443a 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -49,7 +49,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model).get_flow()
+skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 30).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index b4edcd59..abbce46f 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -97,7 +97,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model).get_flow()
+skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 30).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index 75bd7783..aa7bfbcd 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -38,8 +38,8 @@
 
 ds = Dataset.from_list(samples)
 
-mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model).get_flow()
-knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model).get_flow()
+mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 30).get_flow()
+knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 30).get_flow()
 knowledge_pipe = Pipeline(knowledge_flow)
 mmlu_pipe = Pipeline(mmlu_flow)
 
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index b3074066..cd91099e 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -29,10 +29,11 @@ def _get_model_prompt(model_family):
 
 
 class Flow(ABC):
-    def __init__(self, client, model_family, model_id, batched=True) -> None:
+    def __init__(self, client, model_family, model_id, num_instructions_to_generate, batched=True) -> None:
         self.client = client
         self.model_family = model_family
         self.model_id = model_id
+        self.num_instructions_to_generate = num_instructions_to_generate
         self.batched = batched
 
     @abstractmethod
@@ -60,7 +61,7 @@ def get_flow(self) -> list:
                 "gen_kwargs": {
                     "max_tokens": 2048,
                     "temperature": 0.7,
-                    "n": 1
+                    "n": self.num_instructions_to_generate
                 },
                 "drop_duplicates": ["output"],
             }
@@ -280,7 +281,7 @@ def get_flow(self) -> list:
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_procs": 8,
-                        "num_samples": 30,
+                        "num_samples": self.num_instructions_to_generate,
                         "batched": self.batched,
                     },
                 },
@@ -375,7 +376,6 @@ def get_flow(self) -> list:
                     "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["context"],
                     "batch_kwargs": {
-                        "num_samples": 30,
                         "num_procs": 8,
                         "batched": self.batched,
                     }
@@ -383,8 +383,9 @@ def get_flow(self) -> list:
                 "gen_kwargs": {
                     "temperature": 0.7,
                     "max_tokens": 2048,
-                    "n": 10
+                    "n": self.num_instructions_to_generate
                 },
+                "drop_duplicates": ["context"],
             },
             {
                 "block_type": LLMBlock,
@@ -396,6 +397,7 @@ def get_flow(self) -> list:
                     "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["question"],
                     "batch_kwargs": {
+                        "num_samples": 3,
                         "num_procs": 8,
                         "batched": self.batched,
                     },
@@ -414,7 +416,6 @@ def get_flow(self) -> list:
                     "batch_kwargs": {
                         "num_procs": 8,
                         "batched": self.batched,
-                        "num_samples": 10,
                     },
                 },
             },
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 66a2987e..b82cda15 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -124,7 +124,7 @@ def _gen_test_data(
             outfile.write("\n")
 
 
-def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
+def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate, batched):
     knowledge_flow_types = []
     freeform_skill_flow_types = []
     grounded_skill_flow_types = []
@@ -144,7 +144,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
         [
             Pipeline(
                 flow_type(
-                    client, model_family, model_name, num_iters, batched
+                    client, model_family, model_name, num_instructions_to_generate, batched
                 ).get_flow()
             )
             for flow_type in knowledge_flow_types
@@ -154,7 +154,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
         [
             Pipeline(
                 flow_type(
-                    client, model_family, model_name, num_iters, batched
+                    client, model_family, model_name, num_instructions_to_generate, batched
                 ).get_flow()
             )
             for flow_type in freeform_skill_flow_types
@@ -164,7 +164,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
         [
             Pipeline(
                 flow_type(
-                    client, model_family, model_name, num_iters, batched
+                    client, model_family, model_name, num_instructions_to_generate, batched
                 ).get_flow()
             )
             for flow_type in grounded_skill_flow_types

From 0c908047e1d1ce912f80a3130c606a8b04f6c689 Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Tue, 2 Jul 2024 17:46:01 -0400
Subject: [PATCH 09/10] change back the defaults in test scripts

---
 scripts/test_freeform_skills.py | 2 +-
 scripts/test_grounded_skills.py | 2 +-
 scripts/test_knowledge.py       | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index 9b1f443a..a8612c09 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -49,7 +49,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 30).get_flow()
+skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index abbce46f..338edb6c 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -97,7 +97,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 30).get_flow()
+skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index aa7bfbcd..aeedcf59 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -38,8 +38,8 @@
 
 ds = Dataset.from_list(samples)
 
-mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 30).get_flow()
-knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 30).get_flow()
+mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow()
+knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow()
 knowledge_pipe = Pipeline(knowledge_flow)
 mmlu_pipe = Pipeline(mmlu_flow)
 

From 20b9f7ce6e40d10b39340e6be960c21c18a933ee Mon Sep 17 00:00:00 2001
From: Nikhil Palaskar <npalaska@redhat.com>
Date: Tue, 2 Jul 2024 18:24:33 -0400
Subject: [PATCH 10/10] Fix SimpleFlows: set config_path directly on block_config

---
 src/instructlab/sdg/default_flows.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index cd91099e..f3de0a8b 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -72,10 +72,9 @@ class SimpleKnowledgeFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         sdg_base = resources.files(__package__)
-        flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join(
+        flow[0]["block_config"]["config_path"] = os.path.join(
             sdg_base, "configs/knowledge/simple_generate_qa.yaml"
         )
-        flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_knowledge"
         flow[0]["block_config"]["block_name"] = "gen_knowledge"
         return flow
 
@@ -84,10 +83,9 @@ class SimpleFreeformSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         sdg_base = resources.files(__package__)
-        flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join(
+        flow[0]["block_config"]["config_path"] = os.path.join(
             sdg_base, "configs/skills/simple_generate_qa_freeform.yaml"
         )
-        flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_freeform"
         flow[0]["block_config"]["block_name"] = "gen_skill_freeform"
         return flow
 
@@ -96,10 +94,9 @@ class SimpleGroundedSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         sdg_base = resources.files(__package__)
-        flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join(
+        flow[0]["block_config"]["config_path"] = os.path.join(
             sdg_base, "configs/skills/simple_generate_qa_grounded.yaml"
         )
-        flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_grounded"
         flow[0]["block_config"]["block_name"] = "gen_skill_grounded"
         return flow