From 0207156c87a7893c8466cd0b1832cd76b1d033a8 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 26 Jun 2024 11:17:09 -0400 Subject: [PATCH] Add simple knowledge pipeline for use with default merlinite The CLI's default model is quantized merlinite, and it does not seem good enough to follow the instructions in the full pipeline included in the new library. The simple pipeline does not do any validation on the output, so the output is not going to be great. Then again, the output has never been great doing SDG with merlinite and the old sdg implementation. This at least keeps the ability to run a basic workflow test and demo on a smaller system. Signed-off-by: Russell Bryant --- .../configs/knowledge/simple_generate_qa.yaml | 37 +++++++++++++++++++ src/instructlab/sdg/default_flows.py | 28 ++++++++++++++ src/instructlab/sdg/llmblock.py | 11 ++++-- 3 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml diff --git a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml new file mode 100644 index 00000000..c20add97 --- /dev/null +++ b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml @@ -0,0 +1,37 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: Develop a series of educational question and answer pairs from a chapter in a {domain} textbook. + +principles: | +Here are the requirements: + 1. Try not to repeat the verb for each instruction to maximize diversity. + 2. The language used for the instruction also should be diverse. For example, you should combine questions with imperative instructions. + 3. The type of instructions should be similar to provided examples. The generated instruction and the output should be grounded in the provided document. + 4. A GPT language model should be able to complete the instruction. 
For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action. + 5. The instructions should be in English. + 6. The instructions should be 1 to 2 sentences long. Either an imperative sentence or a question is permitted. + 7. The output should be an appropriate response to the input and the instruction. Long outputs are preferable. + +examples: | + Here are some examples to help you understand the type of questions that are asked for this document: + + {question_1} + {response_1} + + {question_2} + {response_2} + + {question_3} + {response_3} + + Here is the document: + {document} + +generation: | + Provide a single question and answer pair based on the document: + + Document: + {{document}} + +start_tags: [""] +end_tags: [""] diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 8c730228..d12ce4ff 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -40,6 +40,34 @@ def get_flow(self) -> list: pass +class SimpleKnowledgeFlow(Flow): + def get_flow(self) -> list: + sdg_base = resources.files(__package__) + return [ + { + "block_type": LLMBlock, + "block_config": { + "block_name": "gen_knowledge", + "config_path": os.path.join( + sdg_base, "configs/knowledge/simple_generate_qa.yaml" + ), + "client": self.client, + "model_id": self.model_id, + "model_prompt": _get_model_prompt(self.model_family), + "output_cols": ["output"], + "batch_kwargs": { + "num_procs": 8, + "batched": self.batched, + }, + }, + "gen_kwargs": { + "max_tokens": 2048, + }, + "drop_duplicates": ["output"], + }, + ] + + class MMLUBenchFlow(Flow): def get_flow(self) -> list: sdg_base = resources.files(__package__) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index c675722e..7952609a 100644 --- a/src/instructlab/sdg/llmblock.py +++ 
b/src/instructlab/sdg/llmblock.py @@ -67,20 +67,21 @@ def _parse(self, generated_string) -> dict: ) else: for start_tag, end_tag, output_col in zip( - self.block_config["start_tags"], - self.block_config["end_tags"], + self.block_config.get("start_tags", []), + self.block_config.get("end_tags", []), self.output_cols, ): if not start_tag and not end_tag: - matches[output_col] = ( + matches[output_col] = [ generated_string.strip() if generated_string else None - ) + ] else: pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag) all_matches = re.findall(pattern, generated_string, re.DOTALL) matches[output_col] = ( [match.strip() for match in all_matches] if all_matches else [] ) + return matches def _generate(self, samples, **gen_kwargs) -> list: @@ -104,6 +105,7 @@ def generate(self, samples, **gen_kwargs) -> Dataset: """ num_samples = self.batch_params.get("num_samples", None) batched = self.batch_params.get("batched", False) + logger.debug("Generating outputs for {} samples".format(len(samples))) if (num_samples is not None) and ("num_samples" not in samples.column_names): samples = samples.add_column("num_samples", [num_samples] * len(samples)) @@ -119,6 +121,7 @@ def generate(self, samples, **gen_kwargs) -> Dataset: outputs = self._generate(samples, **gen_kwargs) else: outputs = [self._generate([sample], **gen_kwargs)[0] for sample in samples] + logger.debug("Generated outputs: {}".format(outputs)) new_data = [] for sample, output in zip(samples, outputs):