Skip to content

Commit

Permalink
utils: Update taxonomy reading code to handle knowledge v3
Browse files Browse the repository at this point in the history
This is part of instructlab#160

The changes here originated from aakankshaduggal@5baf6df

There are two major changes here.

- When parsing a `qna.yaml` file from a taxonomy tree, adjust for the
  new schema for knowledge. There is no attempt to maintain
  compatibility with prior versions of the schema (v1, v2).

- Change how we translate the taxonomy data into the dataset sent into
  the pipeline as input. Instead of implementing a sliding window
  approach of 3 sample qna pairs at a time over all chunks of the
  document, we now create a row per seed_example (context and
  associated qna pairs) for each chunk of knowledge docs.

Co-authored-by: abhi1092 <[email protected]>
Co-authored-by: shiv <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
Signed-off-by: Russell Bryant <[email protected]>
  • Loading branch information
4 people committed Jul 22, 2024
1 parent 33abe1e commit 94a7a5e
Showing 1 changed file with 47 additions and 48 deletions.
95 changes: 47 additions & 48 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,28 +335,42 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None):

# get seed instruction data
tax_path = "->".join(taxonomy_path.parent.parts)
task_description = contents.get("task_description")
task_description = contents.get("task_description", None)
domain = contents.get("domain")
documents = contents.get("document")
if documents:
documents = _get_documents(source=documents)
logger.debug("Content from git repo fetched")

for seed_example in contents.get("seed_examples"):
question = seed_example.get("question")
answer = seed_example.get("answer")
context = seed_example.get("context", "")
seed_instruction_data.append(
{
"instruction": question,
"input": context,
"output": answer,
"taxonomy_path": tax_path,
"task_description": task_description,
"document": documents,
"domain": domain,
}
)
if "questions_and_answers" in seed_example:
question_answer_list = seed_example.get("questions_and_answers")
seed_instruction_data.append(
{
"questions_and_answers": question_answer_list,
"input": context,
"taxonomy_path": tax_path,
"document": documents,
"domain": domain,
"document_outline": contents.get("document_outline"),
}
)
else:
question = seed_example.get("question")
answer = seed_example.get("answer")

seed_instruction_data.append(
{
"instruction": question,
"input": context,
"output": answer,
"taxonomy_path": tax_path,
"task_description": task_description,
"document": documents,
"domain": domain,
}
)
except Exception as e:
errors += 1
raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e
Expand Down Expand Up @@ -418,8 +432,7 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):


def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
samples = [{}]

samples = []
# document is the same for the whole leaf node
chunks = (
chunking.chunk_document(
Expand All @@ -436,38 +449,24 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count

for chunk in chunks:
# pylint: disable=consider-using-enumerate
for i in range(len(leaf_node)):
samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
samples[-1].setdefault("domain", domain)
samples[-1].setdefault("document", chunk)
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
if "icl_query_3" in samples[-1]:
samples.append({})
if "icl_query_1" not in samples[-1]:
samples[-1]["icl_query_1"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_1"] = leaf_node[i]["output"]
elif "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[i]["output"]
else:
samples[-1]["icl_query_3"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_3"] = leaf_node[i]["output"]

# wrap back around to the beginning if the number of examples was not
# evenly divisble by 3
if "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[0]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[0]["output"]
if "icl_query_3" not in samples[-1]:
samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"instruction"
]
samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"output"
]
for icl_ in leaf_node:
icl_query = {
f"icl_query_{idx+1}": val["question"]
for idx, val in enumerate(icl_["questions_and_answers"])
}
icl_resp = {
f"icl_response_{idx+1}": val["answer"]
for idx, val in enumerate(icl_["questions_and_answers"])
}
samples_row = {
"icl_document": icl_["input"],
"document": chunk,
"document_outline": icl_["document_outline"],
"domain": domain,
}
samples_row.update(icl_query)
samples_row.update(icl_resp)
samples.append(samples_row)

return samples

Expand Down

0 comments on commit 94a7a5e

Please sign in to comment.