instructlab · markmc · Jul 5, 2024 · Jul 2, 2024 · Jul 10, 2024 · Jul 11, 2024
diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.default_flows import SynthSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
@@ -49,7 +49,9 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow()
+ctx = PipelineContext(client, "mixtral", teacher_model, 1)
+
+skills_flow = SynthSkillsFlow(ctx).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])

diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
@@ -97,7 +97,9 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow()
+ctx = PipelineContext(client, "mixtral", teacher_model, 10)
+
+skills_flow = SynthGroundedSkillsFlow(ctx).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])

diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
@@ -8,7 +8,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
 
 # Please don't add you vLLM endpoint key here
 openai_api_key = "EMPTY"
@@ -38,12 +38,13 @@
 
 ds = Dataset.from_list(samples)
 
-mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow()
-knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow()
-knowledge_pipe = Pipeline(knowledge_flow)
-mmlu_pipe = Pipeline(mmlu_flow)
+ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-sdg = SDG([mmlu_pipe, knowledge_pipe])
+mmlu_flow = MMLUBenchFlow(ctx).get_flow()
+knowledge_flow = SynthKnowledgeFlow(ctx).get_flow()
+knowledge_pipe = Pipeline(mmlu_flow + knowledge_flow)
+
+sdg = SDG([knowledge_pipe])
 mmlubench_data = sdg.generate(ds)
 
 print(mmlubench_data)

diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py
@@ -14,7 +14,8 @@
 
 
 class Block(ABC):
-    def __init__(self, block_name: str) -> None:
+    def __init__(self, ctx, block_name: str) -> None:
+        self.ctx = ctx
         self.block_name = block_name
 
     @staticmethod

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # Standard
 from abc import ABC, abstractmethod
-from importlib import resources
 import operator
 import os
 
@@ -10,33 +9,10 @@
 from .llmblock import LLMBlock
 from .utilblocks import CombineColumnsBlock
 
-MODEL_FAMILY_MIXTRAL = "mixtral"
-MODEL_FAMILY_MERLINITE = "merlinite"
-
-_MODEL_PROMPT_MIXTRAL = "<s> [INST] {prompt} [/INST]"
-_MODEL_PROMPT_MERLINITE = "'<|system|>\nYou are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.\n<|user|>\n{prompt}\n<|assistant|>\n'"
-
-_MODEL_PROMPTS = {
-    MODEL_FAMILY_MIXTRAL: _MODEL_PROMPT_MIXTRAL,
-    MODEL_FAMILY_MERLINITE: _MODEL_PROMPT_MERLINITE,
-}
-
-
-def _get_model_prompt(model_family):
-    if model_family not in _MODEL_PROMPTS:
-        raise ValueError(f"Unknown model family: {model_family}")
-    return _MODEL_PROMPTS[model_family]
-
 
 class Flow(ABC):
-    def __init__(
-        self, client, model_family, model_id, num_instructions_to_generate
-    ) -> None:
-        self.client = client
-        self.model_family = model_family
-        self.model_id = model_id
-        self.num_instructions_to_generate = num_instructions_to_generate
-        self.sdg_base = resources.files(__package__)
+    def __init__(self, ctx) -> None:
+        self.ctx = ctx
 
     @abstractmethod
     def get_flow(self) -> list:
@@ -51,15 +27,12 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "",  # must be set by subclass
                     "config_path": "",  # must be set by subclass
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["output"],
                 },
                 "gen_kwargs": {
                     "max_tokens": 2048,
                     "temperature": 0.7,
-                    "n": self.num_instructions_to_generate,
+                    "n": self.ctx.num_instructions_to_generate,
                 },
                 "drop_duplicates": ["output"],
             }
@@ -70,7 +43,7 @@ class SimpleKnowledgeFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         flow[0]["block_config"]["config_path"] = os.path.join(
-            self.sdg_base, "configs/knowledge/simple_generate_qa.yaml"
+            self.ctx.sdg_base, "configs/knowledge/simple_generate_qa.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_knowledge"
         return flow
@@ -80,37 +53,32 @@ class SimpleFreeformSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         flow[0]["block_config"]["config_path"] = os.path.join(
-            self.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml"
+            self.ctx.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_skill_freeform"
-        flow[0]["block_config"]["block_name"] = "gen_skill_freeform"
         return flow
 
 
 class SimpleGroundedSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         flow[0]["block_config"]["config_path"] = os.path.join(
-            self.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml"
+            self.ctx.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_skill_grounded"
         return flow
 
 
 class MMLUBenchFlow(Flow):
     def get_flow(self) -> list:
-        self.sdg_base = resources.files(__package__)
         return [
             {
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_mmlu_knowledge",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/mcq_generation.yaml"
+                        self.ctx.sdg_base, "configs/knowledge/mcq_generation.yaml"
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["mmlubench_question", "mmlubench_answer"],
                 },
                 "gen_kwargs": {
@@ -130,12 +98,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_knowledge",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/knowledge/generate_questions_responses.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["question", "response"],
                     "parser_kwargs": {
                         "parser_name": "custom",
@@ -153,11 +118,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_faithfulness_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/evaluate_faithfulness.yaml"
+                        self.ctx.sdg_base,
+                        "configs/knowledge/evaluate_faithfulness.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["explanation", "judgment"],
                 },
                 "gen_kwargs": {
@@ -182,11 +145,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_relevancy_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/evaluate_relevancy.yaml"
+                        self.ctx.sdg_base,
+                        "configs/knowledge/evaluate_relevancy.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["feedback", "score"],
                 },
                 "gen_kwargs": {
@@ -212,11 +173,8 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_verify_question",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/evaluate_question.yaml"
+                        self.ctx.sdg_base, "configs/knowledge/evaluate_question.yaml"
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["explanation", "rating"],
                 },
                 "gen_kwargs": {
@@ -248,15 +206,12 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/freeform_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["question"],
                     "batch_kwargs": {
-                        "num_samples": self.num_instructions_to_generate,
+                        "num_samples": self.ctx.num_instructions_to_generate,
                     },
                 },
                 "drop_duplicates": ["question"],
@@ -266,12 +221,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_freeform_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -294,12 +246,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_responses",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/freeform_responses.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["response"],
                 },
             },
@@ -308,12 +257,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "evaluate_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_freeform_pair.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -342,18 +288,15 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_contexts",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/contexts.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["context"],
                 },
                 "gen_kwargs": {
                     "temperature": 0.7,
                     "max_tokens": 2048,
-                    "n": self.num_instructions_to_generate,
+                    "n": self.ctx.num_instructions_to_generate,
                 },
                 "drop_duplicates": ["context"],
             },
@@ -362,12 +305,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_grounded_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/grounded_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_samples": 3,
@@ -380,12 +320,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_grounded_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_grounded_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -408,12 +345,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_grounded_responses",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/grounded_responses.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["response"],
                 },
             },
@@ -422,12 +356,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "evaluate_grounded_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_grounded_pair.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["evaluation", "score"],
                 },
             },