Skip to content

Commit

Permalink
Re-introduce document chunking for knowledge
Browse files Browse the repository at this point in the history
When generating samples for a knowledge pipeline, we have to chunk the
document down to a size that will fit within the model's context size.
There was a hack in place that only used a single chunk. The code now
iterates over all chunks of the document for creating samples to send
through the pipeline.

The commit also separates the code for knowledge and skills since the
differences between the formats are growing.

Closes instructlab#52

Signed-off-by: Russell Bryant <[email protected]>
  • Loading branch information
russellb committed Jun 30, 2024
1 parent 1f71fb6 commit 8112123
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 20 deletions.
14 changes: 2 additions & 12 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
SynthSkillsFlow,
)
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.utils import chunking, models
from instructlab.sdg.utils import models
from instructlab.sdg.utils.taxonomy import (
leaf_node_to_samples,
read_taxonomy_leaf_nodes,
Expand Down Expand Up @@ -270,7 +270,7 @@ def generate_data(

generated_data = None
for leaf_node in leaf_nodes.values():
samples = leaf_node_to_samples(leaf_node)
samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count)

if not samples:
raise utils.GenerateException("Error: No samples found in leaf node.")
Expand All @@ -290,16 +290,6 @@ def generate_data(
"Error: No SDG pipeline for this leaf node type: %s" % samples[0]
)

# TODO this is broken, just trying to get initial integration to run
# pylint: disable=consider-using-enumerate
if samples[0].get("document"):
for i in range(len(samples)):
samples[i]["document"] = chunking.chunk_document(
documents=samples[i]["document"],
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
)[0]

# TODO -- there is a parameter for how many samples to generate, but we ignore it so far

logger.debug("Samples: %s" % samples)
Expand Down
75 changes: 67 additions & 8 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

# First Party
from instructlab.sdg import utils
from instructlab.sdg.utils import chunking

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -415,19 +416,67 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
return leaf_nodes


def leaf_node_to_samples(leaf_node):
def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    """Build seed samples for a knowledge leaf node, one batch per document chunk.

    The document of the first entry is split with ``chunking.chunk_document``.
    For every resulting chunk, the leaf node's examples are packed three at a
    time into sample dicts, so each chunk yields ``ceil(len(leaf_node) / 3)``
    samples.

    Args:
        leaf_node: Non-empty list of seed-example dicts. Every entry is read
            for ``instruction`` and ``output``; ``task_description`` is taken
            from the first entry of each group of three; ``document`` and
            ``domain`` are read from the first entry only.
        server_ctx_size: Passed through to ``chunking.chunk_document``.
        chunk_word_count: Passed through to ``chunking.chunk_document``.

    Returns:
        A list of dicts, each holding ``task_description``, ``domain``,
        ``document`` (one chunk) and ``question_1..3`` / ``response_1..3``.
        The list is empty when the first entry has no document or chunking
        produces no chunks.

    Raises:
        utils.GenerateException: If a non-empty chunk exists but the first
            entry provides no ``domain``.
    """
    first_entry = leaf_node[0]

    # document is the same for the whole leaf node
    document = first_entry.get("document")
    chunks = (
        chunking.chunk_document(
            documents=document,
            server_ctx_size=server_ctx_size,
            chunk_word_count=chunk_word_count,
        )
        if document
        else []
    )

    # domain is the same for the whole leaf node
    domain = first_entry.get("domain")

    # Fillers used when the number of examples is not evenly divisible by 3:
    # slot 2 wraps to the first example, slot 3 to the second example when
    # there is one (otherwise the first again).
    second_slot_filler = leaf_node[0]
    third_slot_filler = leaf_node[1 if len(leaf_node) > 1 else 0]

    samples = []
    for chunk in chunks:
        if chunk and not domain:
            raise utils.GenerateException(
                "Error: No domain provided for knowledge document in leaf node"
            )

        for start in range(0, len(leaf_node), 3):
            group = list(leaf_node[start : start + 3])

            # Every sample is created with its shared fields up front, so a
            # sample that only receives wrapped-around questions still
            # carries its task description, domain and document chunk.
            sample = {
                "task_description": group[0]["task_description"],
                "domain": domain,
                "document": chunk,
            }

            if len(group) < 2:
                group.append(second_slot_filler)
            if len(group) < 3:
                group.append(third_slot_filler)

            for slot, example in enumerate(group, start=1):
                sample[f"question_{slot}"] = example["instruction"]
                sample[f"response_{slot}"] = example["output"]

            samples.append(sample)

    return samples


def _skill_leaf_node_to_samples(leaf_node):
samples = [{}]

# pylint: disable=consider-using-enumerate
for i in range(len(leaf_node)):
samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
for field in ["document", "domain"]:
if leaf_node[i].get(field):
samples[-1].setdefault(field, leaf_node[i][field])
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
if leaf_node[i].get("input"):
samples[-1].setdefault("context", leaf_node[i]["input"])
if "question_3" in samples[-1]:
Expand All @@ -454,3 +503,13 @@ def leaf_node_to_samples(leaf_node):
samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"]

return samples


def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    """Convert one taxonomy leaf node into a list of seed samples.

    Dispatches on the first entry of the leaf node: if it has a ``document``
    key the node is handled as knowledge, otherwise as a skill.

    Args:
        leaf_node: List of seed-example dicts for one leaf node. May be empty
            or ``None``.
        server_ctx_size: Forwarded to the knowledge converter only.
        chunk_word_count: Forwarded to the knowledge converter only.

    Returns:
        The list produced by ``_knowledge_leaf_node_to_samples`` or
        ``_skill_leaf_node_to_samples``, or ``[]`` when ``leaf_node`` is
        empty.
    """
    if not leaf_node:
        return []

    # Key presence (not truthiness) of "document" selects the knowledge path.
    if "document" in leaf_node[0]:
        return _knowledge_leaf_node_to_samples(
            leaf_node, server_ctx_size, chunk_word_count
        )

    # The skill converter takes no chunking parameters.
    return _skill_leaf_node_to_samples(leaf_node)

0 comments on commit 8112123

Please sign in to comment.