diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 01232e27..a8612c09 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, teacher_model).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py new file mode 100644 index 00000000..338edb6c --- /dev/null +++ b/scripts/test_grounded_skills.py @@ -0,0 +1,107 @@ +# Third Party +from datasets import Dataset +from openai import OpenAI + +# First Party +from src.instructlab.sdg import SDG +from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow +from src.instructlab.sdg.pipeline import Pipeline + +# for vLLM endpoints, the api_key remains "EMPTY" +openai_api_key = "EMPTY" +openai_api_base = "Add model endpoint here" + + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +teacher_model = models.data[0].id + +samples = [ + { + "seed_context": """*Ms. Thompson:* Good morning, everyone. Today, we''re here to discuss + our customer journey mapping and analysis. I believe this is crucial to understanding + our customers'' experiences and improving our services. + + + *Mr. Patel:* I agree, Lisa. We should start by identifying all touchpoints in + our customer journey, from initial contact to post-sale support. + + + *Ms. Rodriguez:* Yes, and let''s not forget about the emotional aspect of the + journey. How do our customers feel at each stage? What are their pain points? + + + *Mr. Kim:* We can use data from our CRM system to track the customer journey and + gather insights. This will help us create a detailed, data-driven map. + + + *Ms. Johnson:* Once we have the map, we can analyze it to identify areas for improvement. + Perhaps there are steps where customers drop off or express dissatisfaction. + + + *Mr. Davis:* We should also consider the customer''s perspective. Conducting interviews + or surveys can provide valuable insights into their thoughts and feelings. + + + *Ms. Brown:* Absolutely. And once we''ve identified areas for improvement, we + can develop strategies to address them. This might involve redesigning certain + touchpoints, enhancing our communication, or streamlining processes. + + + *Mr. Smith:* And we must remember to measure the impact of any changes we make. + By tracking key performance indicators, we can determine whether our efforts are + successful. + + + *Ms. Thompson:* Great suggestions, everyone. Let''s divide into teams to tackle + different aspects of the customer journey. Team 1 will focus on pre-sale touchpoints, + Team 2 on sales, and Team 3 on post-sale support. + + + *Mr. Patel:* Sounds good. We''ll reconvene in a week to share our findings and + discuss next steps.""", + "seed_question": """Generate a company wide email based on the given meeting transcript""", + "task_description": "Writing emails based on minutes of meeting", + "seed_response": """Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n\ + \nDear [Company Name] Team,\n\nI hope this email finds you well. Yesterday, we\ + \ gathered to discuss our customer journey mapping and analysis, aiming to understand\ + \ our customers' experiences and identify opportunities for improvement. The discussions\ + \ were fruitful, and I want to share a brief overview of the key points and outcomes.\n\ + \n1. **Identifying Touchpoints:**\n Mr. Patel suggested mapping all touchpoints\ + \ in our customer journey, from initial contact to post-sale support.\n\n2. **Emotional\ + \ Aspect and Pain Points:**\n Ms. Rodriguez emphasized the importance of considering\ + \ the emotional aspect of the journey and identifying customers' pain points at\ + \ each stage.\n\n3. **Data-Driven Mapping:**\n Mr. Kim proposed using data from\ + \ our CRM system to create a detailed, data-driven customer journey map.\n\n4.\ + \ **Customer Perspective:**\n Ms. Johnson recommended gathering insights from\ + \ the customer's perspective through interviews or surveys.\n\n5. **Analysis and\ + \ Improvement:**\n Ms. Brown suggested analyzing the customer journey map to\ + \ identify areas for improvement and developing strategies to address them.\n\n\ + 6. **Measuring Impact:**\n Mr. Smith stressed the need to measure the impact\ + \ of any changes made by tracking key performance indicators.\n\nTo facilitate\ + \ a comprehensive analysis, we have divided into teams to tackle different aspects\ + \ of the customer journey:\n\n* Team 1: Pre-sale touchpoints\n* Team 2: Sales\n\ + * Team 3: Post-sale support\n\nEach team will share their findings and discuss\ + \ next steps in a week.\n\nYour engagement and insights have been invaluable in\ + \ understanding our customers' experiences and identifying opportunities for improvement.\ + \ I look forward to our continued collaboration as we work towards enhancing our\ + \ services and delivering exceptional customer experiences.\n\nBest regards,\n\ + \n[Your Full Name]\n[Your Position]\n[Company Name]""", + } +] + + +ds = Dataset.from_list(samples) + +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow() +skills_pipe = Pipeline(skills_flow) + +sdg = SDG([skills_pipe]) +gen_data = sdg.generate(ds) + +print(gen_data) +print(gen_data[0]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index d777c8c3..aeedcf59 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, teacher_model).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, teacher_model).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml index 3f40a6fd..45580d3b 100644 --- a/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml +++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml @@ -31,6 +31,7 @@ examples: | [End of Score] generation: | + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Here's the context, question and the answer you need to evaluate: [Start of Context] @@ -45,7 +46,6 @@ generation: | {answer} [End of Answer] - Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. * Return the score between [Start of Score] and [End of Score] tags. diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml index 70f6feb9..6999987f 100644 --- a/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml +++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml @@ -34,7 +34,7 @@ examples: | [End of Score] generation: | - Here's the context and question you need to evaluate: + Here's the context and question you need to evaluate. Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. [Start of Context] {context} diff --git a/src/instructlab/sdg/configs/skills/freeform_responses.yaml b/src/instructlab/sdg/configs/skills/freeform_responses.yaml index 0b0eda38..cf7ff177 100644 --- a/src/instructlab/sdg/configs/skills/freeform_responses.yaml +++ b/src/instructlab/sdg/configs/skills/freeform_responses.yaml @@ -21,13 +21,13 @@ examples: | [End of Response] generation: | - Now generate a response to the following prompt. + Now generate a response to the following prompt. Remember to use the same style and format as the example above. [Start of Question] {question} [End of Question] - Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags. + Return the response between [Start of Response] and [End of Response] tags. start_tags: ["[Start of Response]"] end_tags: ["[End of Response]"] diff --git a/src/instructlab/sdg/configs/skills/grounded_responses.yaml b/src/instructlab/sdg/configs/skills/grounded_responses.yaml index 87429b9a..bacd5c10 100644 --- a/src/instructlab/sdg/configs/skills/grounded_responses.yaml +++ b/src/instructlab/sdg/configs/skills/grounded_responses.yaml @@ -26,7 +26,8 @@ examples: | [End of Response] generation: | - Now generate a response to the following prompt. Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags. + Now generate a response to the following prompt. Remember to use the same style and format as the example above. + Return the response between [Start of Response] and [End of Response] tags. [Start of Context] {context} @@ -35,6 +36,8 @@ generation: | {question} [End of Question] + Return the response between [Start of Response] and [End of Response] tags. + start_tags: ["[Start of Response]"] end_tags: ["[End of Response]"] \ No newline at end of file diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 1a8ebb75..f3de0a8b 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -7,8 +7,8 @@ # Local from .filterblock import FilterByValueBlock -from .iterblock import IterBlock from .llmblock import LLMBlock +from .utilblocks import CombineColumnsBlock MODEL_FAMILY_MIXTRAL = "mixtral" MODEL_FAMILY_MERLINITE = "merlinite" @@ -29,11 +29,11 @@ def _get_model_prompt(model_family): class Flow(ABC): - def __init__(self, client, model_family, model_id, num_iters, batched=True) -> None: + def __init__(self, client, model_family, model_id, num_instructions_to_generate, batched=True) -> None: self.client = client self.model_family = model_family self.model_id = model_id - self.num_iters = num_iters + self.num_instructions_to_generate = num_instructions_to_generate self.batched = batched @abstractmethod @@ -45,29 +45,25 @@ class _SimpleFlow(Flow): def get_flow(self) -> list: return [ { - "block_type": IterBlock, + "block_type": LLMBlock, "block_config": { "block_name": "", # must be set by subclass - "num_iters": self.num_iters, - "block_type": LLMBlock, - "block_kwargs": { - "block_name": "", # must be set by subclass - "config_path": "", # must be set by subclass - "client": self.client, - "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), - "output_cols": ["output"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, - }, - "gen_kwargs": { - "max_tokens": 2048, - "temperature": 0.7, + "config_path": "", # must be set by subclass + "client": self.client, + "model_id": self.model_id, + "model_prompt": _get_model_prompt(self.model_family), + "output_cols": ["output"], + "batch_kwargs": { + "num_procs": 8, + "batched": self.batched, }, - "drop_duplicates": ["output"], }, + "gen_kwargs": { + "max_tokens": 2048, + "temperature": 0.7, + "n": self.num_instructions_to_generate + }, + "drop_duplicates": ["output"], } ] @@ -76,10 +72,9 @@ class SimpleKnowledgeFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() sdg_base = resources.files(__package__) - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( sdg_base, "configs/knowledge/simple_generate_qa.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_knowledge" flow[0]["block_config"]["block_name"] = "gen_knowledge" return flow @@ -88,10 +83,9 @@ class SimpleFreeformSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() sdg_base = resources.files(__package__) - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( sdg_base, "configs/skills/simple_generate_qa_freeform.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_freeform" flow[0]["block_config"]["block_name"] = "gen_skill_freeform" return flow @@ -100,10 +94,9 @@ class SimpleGroundedSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() sdg_base = resources.files(__package__) - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( sdg_base, "configs/skills/simple_generate_qa_grounded.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_grounded" flow[0]["block_config"]["block_name"] = "gen_skill_grounded" return flow @@ -225,8 +218,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_relevancy", "filter_column": "score", - "filter_value": "2", + "filter_value": 2.0, "operation": operator.eq, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -258,8 +252,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_verify_question", "filter_column": "rating", - "filter_value": "1", + "filter_value": 1.0, "operation": operator.eq, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -283,7 +278,7 @@ def get_flow(self) -> list: "output_cols": ["question"], "batch_kwargs": { "num_procs": 8, - "num_samples": 30, + "num_samples": self.num_instructions_to_generate, "batched": self.batched, }, }, @@ -309,9 +304,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_questions", "filter_column": "score", - "filter_value": 1, + "filter_value": 1.0, "operation": operator.eq, - "convert_dtype": int, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -353,9 +348,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_qa_pair", "filter_column": "score", - "filter_value": 2, + "filter_value": 2.0, "operation": operator.ge, - "convert_dtype": int, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -369,28 +364,25 @@ class SynthGroundedSkillsFlow(Flow): def get_flow(self) -> list: return [ { - "block_type": IterBlock, + "block_type": LLMBlock, "block_config": { - "block_name": "context_iter", - "num_iters": 10, - "block_type": LLMBlock, - "block_kwargs": { - "block_name": "gen_contexts", - "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml", - "client": self.client, - "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), - "output_cols": ["context"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, - }, - "gen_kwargs": { - "temperature": 0.7, - "max_tokens": 2048, - }, + "block_name": "gen_contexts", + "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": _get_model_prompt(self.model_family), + "output_cols": ["context"], + "batch_kwargs": { + "num_procs": 8, + "batched": self.batched, + } + }, + "gen_kwargs": { + "temperature": 0.7, + "max_tokens": 2048, + "n": self.num_instructions_to_generate }, + "drop_duplicates": ["context"], }, { "block_type": LLMBlock, @@ -402,6 +394,7 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["question"], "batch_kwargs": { + "num_samples": 3, "num_procs": 8, "batched": self.batched, }, @@ -428,9 +421,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_questions", "filter_column": "score", - "filter_value": 1, + "filter_value": 1.0, "operation": operator.eq, - "convert_dtype": int, + "convert_dtype": float, "batch_kwargs": { "num_procs": 8, }, @@ -472,11 +465,23 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_qa_pair", "filter_column": "score", - "filter_value": 2, + "filter_value": 2.0, "operation": operator.ge, - "convert_dtype": int, + "convert_dtype": float, + "batch_kwargs": { + "num_procs": 8, + }, + }, + }, + { + "block_type": CombineColumnsBlock, + "block_config": { + "block_name": "combine_question_and_context", + "columns": ["context", "question"], + "output_col": "question", "batch_kwargs": { "num_procs": 8, + "batched": True, }, }, }, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 66a2987e..b82cda15 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -124,7 +124,7 @@ def _gen_test_data( outfile.write("\n") -def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): +def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate, batched): knowledge_flow_types = [] freeform_skill_flow_types = [] grounded_skill_flow_types = [] @@ -144,7 +144,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in knowledge_flow_types @@ -154,7 +154,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in freeform_skill_flow_types @@ -164,7 +164,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in grounded_skill_flow_types diff --git a/src/instructlab/sdg/iterblock.py b/src/instructlab/sdg/iterblock.py deleted file mode 100644 index 21a20470..00000000 --- a/src/instructlab/sdg/iterblock.py +++ /dev/null @@ -1,29 +0,0 @@ -# Third Party -from datasets import Dataset - -# Local -from .block import Block -from .logger_config import setup_logger - -logger = setup_logger(__name__) - - -class IterBlock(Block): - def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs): - super().__init__(block_name) - self.num_iters = num_iters - self.block = block_type(**block_kwargs) - self.gen_kwargs = kwargs.get("gen_kwargs", {}) - self.gen_kwargs = kwargs.get("gen_kwargs", {}) - - def generate(self, samples, **gen_kwargs) -> Dataset: - generated_samples = [] - num_iters = self.num_iters - - for _ in range(num_iters): - batch_generated = self.block.generate( - samples, **{**self.gen_kwargs, **gen_kwargs} - ) - generated_samples.extend(batch_generated) - - return Dataset.from_list(generated_samples) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 7952609a..338b8d2b 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -123,8 +123,13 @@ def generate(self, samples, **gen_kwargs) -> Dataset: outputs = [self._generate([sample], **gen_kwargs)[0] for sample in samples] logger.debug("Generated outputs: {}".format(outputs)) + num_parallel_samples = gen_kwargs.get("n", 1) + extended_samples = [] + for item in samples: + extended_samples.extend([item] * num_parallel_samples) + new_data = [] - for sample, output in zip(samples, outputs): + for sample, output in zip(extended_samples, outputs): parsed_outputs = self._parse(output) # pylint: disable=consider-using-generator max_length = max([len(value) for value in parsed_outputs.values()]) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index fc93f78d..bc570a83 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -3,7 +3,6 @@ from datasets import Dataset # Local -from .iterblock import IterBlock from .logger_config import setup_logger logger = setup_logger(__name__) @@ -39,12 +38,6 @@ def generate(self, dataset) -> Dataset: drop_duplicates_cols = block_prop.get("drop_duplicates", False) block = block_type(**block_config) - if block_type == IterBlock: - block_kwargs = block_config.pop("block_kwargs") - block = block_type(**block_config, block_kwargs=block_kwargs) - else: - block = block_type(**block_config) - logger.info("Running block: %s", block_config["block_name"]) logger.info(dataset)