diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index e7b9eed9..bdf4e739 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -76,6 +76,13 @@ jobs: # config contains DEFAULT_MODEL key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }} + - name: Switch instructlab to PR 1790 (TEMPORARY) + run: | + cd instructlab + git fetch origin pull/1790/head:pr1790 + git checkout pr1790 + cd .. + - name: Install instructlab and instructlab-sdg run: | export PATH="/home/runner/.local/bin:/usr/local/cuda/bin:$PATH" diff --git a/requirements.txt b/requirements.txt index 90988d8f..ac6d4762 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,9 @@ httpx>=0.25.0,<1.0.0 langchain-text-splitters openai>=1.13.3,<2.0.0 platformdirs>=4.2 -# Note: this dependency goes along with langchain-text-splitters and mayt be +# Note: this dependency goes along with langchain-text-splitters and may be # removed once that one is removed. # do not use 8.4.0 due to a bug in the library # https://github.com/instructlab/instructlab/issues/1389 tenacity>=8.3.0,!=8.4.0 +instructlab-schema>=0.3.1 diff --git a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml index 910c64a5..a2decd76 100644 --- a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml +++ b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml @@ -27,24 +27,9 @@ principles: | examples: | Here are some examples of questions: - [QUESTION] - Explain the process of photosynthesis in plants. Include in your answer the roles of chlorophyll, light, water, and carbon dioxide, and describe how oxygen and glucose are produced. - [ANSWER] - Photosynthesis is the process by which plants, algae, and some bacteria use sunlight to synthesize food from carbon dioxide and water. Photosynthesis in plants primarily occurs in the leaves, specifically in the chloroplasts. Chlorophyll, the green pigment in chloroplasts, absorbs light energy, which is then used to convert carbon dioxide (from the air) and water (from the soil) into glucose, a simple sugar. This process also releases oxygen as a byproduct. Light energy splits water molecules, releasing electrons and hydrogen ions and forming oxygen. The light-dependent reactions convert light energy into chemical energy (ATP and NADPH), which is used in the light-independent reactions (Calvin cycle) to convert carbon dioxide into glucose. The overall result is the conversion of solar energy into chemical energy in the form of glucose, which plants use for growth and development. - [END] - - [QUESTION] - In a study on the effects of temperature on enzyme activity, an enzyme exhibited its highest activity at 37°C. At both higher and lower temperatures, its activity decreased. Based on this information, which of the following best explains the enzyme's behavior? - Options: - a) Enzymes are temperature-sensitive and can denature at high temperatures, losing their functional shape, while at low temperatures, their reaction rates decrease due to reduced molecular motion. - b) Enzymes are more effective at higher temperatures as increased heat provides more energy for reactions, and lower temperatures cause enzymes to become more active due to enhanced molecular stability. - c) The enzyme's behavior is unrelated to temperature; instead, it is likely due to changes in pH levels, which affect enzyme activity. - d) All enzymes universally work best at exactly 37°C, as this is the standard temperature for all biochemical reactions in nature. - [ANSWER] - a) Enzymes are temperature-sensitive and can denature at high temperatures, losing their functional shape, while at low temperatures, their reaction rates decrease due to reduced molecular motion. - [END] + [Document] + {icl_document} - For this {domain} domain here are some sample questions: [QUESTION] {icl_query_1} [ANSWER] @@ -65,4 +50,7 @@ examples: | generation: | Here is the document: + + [DOCUMENT] + {document_outline} {document} diff --git a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml index f7403752..784902b4 100644 --- a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml +++ b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml @@ -13,6 +13,9 @@ principles: | 7. The output should be an appropriate response to the input and the instruction. Long outputs are preferable. examples: | + Here is a sample section of the document as an example: + {icl_document} + Here are some examples to help you understand the type of questions that are asked for this document: {icl_query_1} @@ -25,6 +28,8 @@ examples: | {icl_response_3} Here is the document: + + {document_outline} {document} generation: | diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index cfbb784e..819d2d9a 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -139,6 +139,20 @@ def _gen_train_data( outfile.write("\n") +def _knowledge_seed_example_to_test_data(seed_example): + res = [] + for qna in seed_example["questions_and_answers"]: + user = qna["question"] + "\n" + seed_example["context"] + res.append( + { + "system": _SYS_PROMPT, + "user": _unescape(user), + "assistant": _unescape(qna["answer"]), + } + ) + return res + + def _gen_test_data( leaf_nodes, output_file_test, @@ -146,6 +160,12 @@ def _gen_test_data( test_data = [] for _, leaf_node in leaf_nodes.items(): for seed_example in leaf_node: + if "questions_and_answers" in seed_example: + test_data.extend(_knowledge_seed_example_to_test_data(seed_example)) + continue + + # skill seed example + user = seed_example["instruction"] # question if len(seed_example["input"]) > 0: diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index c8389247..a77738bf 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -18,11 +18,12 @@ import yaml # First Party -from instructlab.sdg import utils from instructlab.sdg.utils import chunking logger = logging.getLogger(__name__) +MIN_KNOWLEDGE_VERSION = 3 + DEFAULT_YAML_RULES = """\ extends: relaxed @@ -202,6 +203,13 @@ def retrieve(uri: URI) -> Resource: f"Cannot determine schema name from path {taxonomy_path}. Using {schema_name} schema." ) + if schema_name == "knowledge" and version < MIN_KNOWLEDGE_VERSION: + logger.error( + f"Version {version} is not supported for knowledge taxonomy. Minimum supported version is {MIN_KNOWLEDGE_VERSION}." + ) + errors += 1 + return errors + try: schema_resource = retrieve(f"{schema_name}.json") schema = schema_resource.contents @@ -335,7 +343,7 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None): # get seed instruction data tax_path = "->".join(taxonomy_path.parent.parts) - task_description = contents.get("task_description") + task_description = contents.get("task_description", None) domain = contents.get("domain") documents = contents.get("document") if documents: @@ -343,20 +351,34 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None): logger.debug("Content from git repo fetched") for seed_example in contents.get("seed_examples"): - question = seed_example.get("question") - answer = seed_example.get("answer") context = seed_example.get("context", "") - seed_instruction_data.append( - { - "instruction": question, - "input": context, - "output": answer, - "taxonomy_path": tax_path, - "task_description": task_description, - "document": documents, - "domain": domain, - } - ) + if "questions_and_answers" in seed_example: + question_answer_list = seed_example.get("questions_and_answers") + seed_instruction_data.append( + { + "questions_and_answers": question_answer_list, + "context": context, + "taxonomy_path": tax_path, + "document": documents, + "domain": domain, + "document_outline": contents.get("document_outline"), + } + ) + else: + question = seed_example.get("question") + answer = seed_example.get("answer") + + seed_instruction_data.append( + { + "instruction": question, + "input": context, + "output": answer, + "taxonomy_path": tax_path, + "task_description": task_description, + "document": documents, + "domain": domain, + } + ) except Exception as e: errors += 1 raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e @@ -418,8 +440,7 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules): def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count): - samples = [{}] - + samples = [] # document is the same for the whole leaf node chunks = ( chunking.chunk_document( @@ -436,38 +457,24 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count for chunk in chunks: # pylint: disable=consider-using-enumerate - for i in range(len(leaf_node)): - samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) - samples[-1].setdefault("domain", domain) - samples[-1].setdefault("document", chunk) - if samples[-1].get("document") and not samples[-1].get("domain"): - raise utils.GenerateException( - "Error: No domain provided for knowledge document in leaf node" - ) - if "icl_query_3" in samples[-1]: - samples.append({}) - if "icl_query_1" not in samples[-1]: - samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_1"] = leaf_node[i]["output"] - elif "icl_query_2" not in samples[-1]: - samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[i]["output"] - else: - samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_3"] = leaf_node[i]["output"] - - # wrap back around to the beginning if the number of examples was not - # evenly divisble by 3 - if "icl_query_2" not in samples[-1]: - samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[0]["output"] - if "icl_query_3" not in samples[-1]: - samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ - "instruction" - ] - samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ - "output" - ] + for icl_ in leaf_node: + icl_query = { + f"icl_query_{idx+1}": val["question"] + for idx, val in enumerate(icl_["questions_and_answers"]) + } + icl_resp = { + f"icl_response_{idx+1}": val["answer"] + for idx, val in enumerate(icl_["questions_and_answers"]) + } + samples_row = { + "icl_document": icl_["context"], + "document": chunk, + "document_outline": icl_["document_outline"], + "domain": domain, + } + samples_row.update(icl_query) + samples_row.update(icl_resp) + samples.append(samples_row) return samples