From 63afe34ccea3fbb2329ca6e74d836a20ee18180e Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Thu, 27 Jun 2024 18:18:50 +0000 Subject: [PATCH 1/8] removing start/end tags/reverting to original format Signed-off-by: Abhi.B --- .../generate_questions_responses.yaml | 66 ++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml index b424f517..910c64a5 100644 --- a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml +++ b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml @@ -15,64 +15,54 @@ principles: | Strictly follow this format for each question answer pair your generate while responding - [Start of Question] - ... - [End of Question] - [Start of Response] - ... - [End of Response] + [QUESTION] + + [ANSWER] + + [END] + Each question and answer pair should stand alone as a mini-lesson, encapsulating a key concept or idea from the chapter in a way that is accessible and informative without requiring the reader to refer back to the textbook. examples: | Here are some examples of questions: - [Start of Question] + [QUESTION] Explain the process of photosynthesis in plants. Include in your answer the roles of chlorophyll, light, water, and carbon dioxide, and describe how oxygen and glucose are produced. - [End of Question] - [Start of Response] + [ANSWER] Photosynthesis is the process by which plants, algae, and some bacteria use sunlight to synthesize food from carbon dioxide and water. Photosynthesis in plants primarily occurs in the leaves, specifically in the chloroplasts. Chlorophyll, the green pigment in chloroplasts, absorbs light energy, which is then used to convert carbon dioxide (from the air) and water (from the soil) into glucose, a simple sugar. This process also releases oxygen as a byproduct. Light energy splits water molecules, releasing electrons and hydrogen ions and forming oxygen. The light-dependent reactions convert light energy into chemical energy (ATP and NADPH), which is used in the light-independent reactions (Calvin cycle) to convert carbon dioxide into glucose. The overall result is the conversion of solar energy into chemical energy in the form of glucose, which plants use for growth and development. - [End of Response] + [END] - [Start of Question] + [QUESTION] In a study on the effects of temperature on enzyme activity, an enzyme exhibited its highest activity at 37°C. At both higher and lower temperatures, its activity decreased. Based on this information, which of the following best explains the enzyme's behavior? Options: a) Enzymes are temperature-sensitive and can denature at high temperatures, losing their functional shape, while at low temperatures, their reaction rates decrease due to reduced molecular motion. b) Enzymes are more effective at higher temperatures as increased heat provides more energy for reactions, and lower temperatures cause enzymes to become more active due to enhanced molecular stability. c) The enzyme's behavior is unrelated to temperature; instead, it is likely due to changes in pH levels, which affect enzyme activity. d) All enzymes universally work best at exactly 37°C, as this is the standard temperature for all biochemical reactions in nature. - [End of Question] - [Start of Response] + [ANSWER] a) Enzymes are temperature-sensitive and can denature at high temperatures, losing their functional shape, while at low temperatures, their reaction rates decrease due to reduced molecular motion. - [End of Response] + [END] For this {domain} domain here are some sample questions: - [Start of Question] - {question_1} - [End of Question] - [Start of Response] - {response_1} - [End of Response] + [QUESTION] + {icl_query_1} + [ANSWER] + {icl_response_1} + [END] - [Start of Question] - {question_2} - [End of Question] - [Start of Response] - {response_2} - [End of Response] + [QUESTION] + {icl_query_2} + [ANSWER] + {icl_response_2} + [END] - [Start of Question] - {question_3} - [End of Question] - [Start of Response] - {response_3} - [End of Response] + [QUESTION] + {icl_query_3} + [ANSWER] + {icl_response_3} + [END] +generation: | Here is the document: {document} - -generation: | - Now generate the question and answer pairs, remember to follow the principles mentioned above and use the same format as the examples. Remember to use the same style and format as the example above. Return each question between [Start of Question] and [End of Question] tags and answer between [Start of Response] and [End of Response] tags. - -start_tags: ["[Start of Question]", "[Start of Response]"] -end_tags: ["[End of Question]", "[End of Response]"] From 57d4ec36867dad054558fdb9fd1295a85828c41e Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Thu, 27 Jun 2024 18:20:34 +0000 Subject: [PATCH 2/8] =?UTF-8?q?=E2=9C=A8=20Added=20custom=20parsing=20func?= =?UTF-8?q?tionality=20=F0=9F=94=A7=20Implemented=20conditional=20llmblock?= =?UTF-8?q?=20=F0=9F=90=9B=20Fixed=20minor=20bugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Abhi.B --- src/instructlab/sdg/llmblock.py | 115 +++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 15 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index ad429b75..ebb90b54 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +from typing import Any, Dict import re # Third Party @@ -21,6 +22,7 @@ def __init__( client, model_id, output_cols, + parser_kwargs={}, model_prompt="{prompt}", **batch_kwargs, ) -> None: @@ -35,6 +37,9 @@ def __init__( self.model_prompt = model_prompt self.output_cols = output_cols self.batch_params = batch_kwargs.get("batch_kwargs", {}) + self.parser_name = parser_kwargs.get("parser_name", None) + self.parsing_pattern = parser_kwargs.get("parsing_pattern", None) + self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None) self.defaults = { "model": self.model, "temperature": 0, @@ -43,22 +48,40 @@ def __init__( def _parse(self, generated_string) -> dict: matches = {} - for start_tag, end_tag, output_col in zip( - self.block_config["start_tags"], - self.block_config["end_tags"], - self.output_cols, - ): - if not start_tag and not end_tag: - matches[output_col] = ( - generated_string.strip() if generated_string else None - ) + + + if self.parser_name is not None and self.parser_name == 'custom': + pattern = re.compile(self.parsing_pattern, re.DOTALL) + all_matches = pattern.findall(generated_string) + matches = {column_name: [] for column_name in self.output_cols} + if all_matches and isinstance(all_matches[0], tuple): + for match in all_matches: + for column_name, value in zip(self.output_cols, match): + value = value.strip() + for clean_tag in self.parser_cleanup_tags: + value = value.replace(clean_tag, "") + matches[column_name].append(value) else: - pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag) - all_matches = re.findall(pattern, generated_string, re.DOTALL) - matches[output_col] = ( - [match.strip() for match in all_matches] if all_matches else None - ) + matches[self.output_cols[0]] = ( + [match.strip() for match in all_matches] if all_matches else [] + ) + else: + for start_tag, end_tag, output_col in zip( + self.block_config["start_tags"], + self.block_config["end_tags"], + self.output_cols, + ): + if not start_tag and not end_tag: + matches[output_col] = ( + generated_string.strip() if generated_string else None + ) + else: + pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag) + all_matches = re.findall(pattern, generated_string, re.DOTALL) + matches[output_col] = ( + [match.strip() for match in all_matches] if all_matches else [] + ) return matches def _generate(self, samples, **gen_kwargs) -> list: @@ -86,7 +109,7 @@ def generate(self, samples, **gen_kwargs) -> Dataset: if (num_samples is not None) and ("num_samples" not in samples.column_names): samples = samples.add_column("num_samples", [num_samples] * len(samples)) - # validate the each sample + # validate each sample for sample in samples: if not self._validate(self.prompt_template, sample): return None @@ -107,3 +130,65 @@ def generate(self, samples, **gen_kwargs) -> Dataset: new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))}) return Dataset.from_list(new_data) + + +class ConditionalLLMBlock(LLMBlock): + def __init__( + self, + block_name, + config_paths, + client, + model_id, + output_cols, + selector_column_name, + parser_kwargs={}, + model_prompt="{prompt}", + **batch_kwargs, + ) -> None: + super().__init__( + block_name, + config_paths[0][0], + client, + model_id, + output_cols, + parser_kwargs=parser_kwargs, + model_prompt=model_prompt, + **batch_kwargs, + ) + self.selector_column_name = selector_column_name + self.prompt_template = {} + if len(config_paths) == 1 and config_paths[0][1] == "All": + self.prompt_template = self.prompt_struct.format(**self.block_config) + else: + for config, config_key in config_paths: + self.prompt_template[config_key] = self.prompt_struct.format( + **self._load_config(config) + ) + + + def _generate(self, samples, **gen_kwargs) -> str: + if isinstance(self.prompt_template, dict): + prompts = [ + self.model_prompt.format( + prompt=self.prompt_template[sample[self.selector_column_name]] + .format(**sample) + .strip() + ) + for sample in samples + ] + else: + prompts = [ + self.model_prompt.format( + prompt=self.prompt_template.format(**sample).strip() + ) + for sample in samples + ] + response = self.client.completions.create( + prompt=prompts, **{**self.defaults, **gen_kwargs} + ) + return [choice.text.strip() for choice in response.choices] + + def _validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool: + if isinstance(prompt_template, dict): + prompt_template = prompt_template[input_dict[self.selector_column_name]] + return super()._validate(prompt_template, input_dict) From 3fbf4227bff988fce15e570303358c764f4db51f Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Thu, 27 Jun 2024 18:22:22 +0000 Subject: [PATCH 3/8] =?UTF-8?q?=E2=9C=A8=20adding=20optional=20postfix=20f?= =?UTF-8?q?or=20loading=20icl=20yamls?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Abhi.B --- src/instructlab/sdg/utilblocks.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 5f3c0407..db04b5a1 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -10,12 +10,18 @@ class SamplePopulatorBlock(Block): - def __init__(self, config_paths, column_name, **batch_kwargs) -> None: - super().__init__(block_name=self.__class__.__name__) + def __init__(self, config_paths, column_name, post_fix="", **batch_kwargs) -> None: + super().__init__( + block_name=self.__class__.__name__ + ) # Call the base class's __init__ self.configs = {} for config in config_paths: + if post_fix: + config_name = config.replace(".yaml", f"_{post_fix}.yaml") + else: + config_name = config config_key = config.split("/")[-1].split(".")[0] - self.configs[config_key] = self._load_config(config) + self.configs[config_key] = self._load_config(config_name) self.column_name = column_name self.num_procs = batch_kwargs.get("num_procs", 8) From e0ed18cb13bd28753774ba3e766f129e654c8a7e Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Thu, 27 Jun 2024 18:23:20 +0000 Subject: [PATCH 4/8] =?UTF-8?q?drop=20column=20=F0=9F=90=9B=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Abhi.B --- src/instructlab/sdg/pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 0de65d1b..fc93f78d 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -34,7 +34,7 @@ def generate(self, dataset) -> Dataset: for block_prop in self.chained_blocks: block_type = block_prop["block_type"] block_config = block_prop["block_config"] - drop_columns = block_prop.get("drop_columns", None) + drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) drop_duplicates_cols = block_prop.get("drop_duplicates", False) block = block_type(**block_config) @@ -50,8 +50,9 @@ def generate(self, dataset) -> Dataset: dataset = block.generate(dataset, **gen_kwargs) + drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names] if drop_columns: - dataset = dataset.remove_columns(drop_columns) + dataset = dataset.remove_columns(drop_columns_in_ds) if drop_duplicates_cols: dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) From 87746ec9f4bcd52c4735c8c3d22b4277e9e29e6c Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Thu, 27 Jun 2024 18:23:51 +0000 Subject: [PATCH 5/8] =?UTF-8?q?=E2=9C=A8=20adding=20custom=20parser=20para?= =?UTF-8?q?meters?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Abhi.B --- src/instructlab/sdg/default_flows.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 8fbe4f86..397e37fa 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -69,6 +69,11 @@ def get_flow(self) -> list: "num_procs": 8, "batched": True, }, + "parser_kwargs": { + "parser_name": "custom", + "parsing_pattern": r"\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)", + "parser_cleanup_tags": ["[END]"], + }, }, "gen_kwargs": { "max_tokens": 2048, From abf8598cf84984cbc5401e396a3528fc7e8d1f56 Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Thu, 27 Jun 2024 18:27:32 +0000 Subject: [PATCH 6/8] fixing linting issues Signed-off-by: Abhi.B --- src/instructlab/sdg/llmblock.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index ebb90b54..dbb94c39 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -49,8 +49,7 @@ def __init__( def _parse(self, generated_string) -> dict: matches = {} - - if self.parser_name is not None and self.parser_name == 'custom': + if self.parser_name is not None and self.parser_name == "custom": pattern = re.compile(self.parsing_pattern, re.DOTALL) all_matches = pattern.findall(generated_string) matches = {column_name: [] for column_name in self.output_cols} @@ -63,15 +62,14 @@ def _parse(self, generated_string) -> dict: matches[column_name].append(value) else: matches[self.output_cols[0]] = ( - [match.strip() for match in all_matches] if all_matches else [] - ) + [match.strip() for match in all_matches] if all_matches else [] + ) else: for start_tag, end_tag, output_col in zip( self.block_config["start_tags"], self.block_config["end_tags"], self.output_cols, ): - if not start_tag and not end_tag: matches[output_col] = ( generated_string.strip() if generated_string else None @@ -165,7 +163,6 @@ def __init__( **self._load_config(config) ) - def _generate(self, samples, **gen_kwargs) -> str: if isinstance(self.prompt_template, dict): prompts = [ From 2f08bd674925da969b126ca2e2331ed2ba83e43a Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Thu, 27 Jun 2024 17:03:26 -0400 Subject: [PATCH 7/8] :rotating_light: Fix linting issues Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/llmblock.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index dbb94c39..3e42c798 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -13,6 +13,7 @@ logger = setup_logger(__name__) +# pylint: disable=dangerous-default-value class LLMBlock(Block): # pylint: disable=too-many-instance-attributes def __init__( @@ -185,7 +186,7 @@ def _generate(self, samples, **gen_kwargs) -> str: ) return [choice.text.strip() for choice in response.choices] - def _validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool: + def validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool: if isinstance(prompt_template, dict): prompt_template = prompt_template[input_dict[self.selector_column_name]] return super()._validate(prompt_template, input_dict) From fe8594326810a9003fd2c6d4ce34913d656872f4 Mon Sep 17 00:00:00 2001 From: Oindrilla Chatterjee Date: Thu, 27 Jun 2024 17:50:58 -0400 Subject: [PATCH 8/8] =?UTF-8?q?=F0=9F=90=9B=20updated=20keys=20in=20test?= =?UTF-8?q?=20file=20to=20reflect=20template?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Oindrilla Chatterjee --- scripts/test_knowledge.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index e800d3b2..d777c8c3 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -24,13 +24,13 @@ samples = [ { - "question_1": "what is the location of the tubal tonsils?", - "response_1": "The location of the tubal tonsils is the roof of the pharynx.", - "question_2": "How long does the adenoid grow?", + "icl_query_1": "what is the location of the tubal tonsils?", + "icl_response_1": "The location of the tubal tonsils is the roof of the pharynx.", + "icl_query_2": "How long does the adenoid grow?", "task_description": "Teaching about human anatomy, specifically tonsils", - "response_2": "The adenoid grows until the age of 5, starts to shrink at the age of 7 and becomes small in adulthood.", - "question_3": "What is the immune systems first line of defense against ingested or inhaled foreign pathogens?", - "response_3": "The tonsils are the immune systems first line of defense.", + "icl_response_2": "The adenoid grows until the age of 5, starts to shrink at the age of 7 and becomes small in adulthood.", + "icl_query_3": "What is the immune systems first line of defense against ingested or inhaled foreign pathogens?", + "icl_response_3": "The tonsils are the immune systems first line of defense.", "document": "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils. These organs play an important role in the immune system. When used unqualified, the term most commonly refers specifically to the palatine tonsils, which are two lymphoid organs situated at either side of the back of the human throat. The palatine tonsils and the adenoid tonsil are organs consisting of lymphoepithelial tissue located near the oropharynx and nasopharynx parts of the throat", "domain": "textbook", }