Skip to content

Commit

Permalink
✨ Add context-aware document chunking and table processing
Browse files — browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
Co-authored-by: abhi1092 <[email protected]>
Branch information:
3 people committed Oct 30, 2024
1 parent 3ac7b45 commit 2fb36f3
Show file tree
Hide file tree
Showing 6 changed files with 757 additions and 149 deletions.
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,8 @@ openai>=1.13.3,<2.0.0
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
transformers>=4.44.2
xdg-base-dirs>=6.0.1
25 changes: 15 additions & 10 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,12 +309,16 @@ def generate_data(
if not (taxonomy and os.path.exists(taxonomy)):
raise GenerateException(f"Error: taxonomy ({taxonomy}) does not exist.")

leaf_nodes = read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules)
date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
document_output_dir = Path(output_dir) / f"documents-{date_suffix}"

leaf_nodes = read_taxonomy_leaf_nodes(
taxonomy, taxonomy_base, yaml_rules, document_output_dir
)
if not leaf_nodes:
raise GenerateException("Error: No new leaf nodes found in the taxonomy.")

name = Path(model_name).stem # Just in case it is a file path
date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
output_file_messages = f"messages_{name}_{date_suffix}.jsonl"
output_file_test = f"test_{name}_{date_suffix}.jsonl"
output_file_train = f"train_{name}_{date_suffix}.jsonl"
Expand Down Expand Up @@ -362,25 +366,26 @@ def generate_data(
for leaf_node in leaf_nodes.values():
is_knowledge = False
leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count)
samples = leaf_node_to_samples(
leaf_node, server_ctx_size, chunk_word_count, document_output_dir, model_name
)

if not samples:
raise GenerateException("Error: No samples found in leaf node.")

if samples[0].get("document"):
⚠ Warning on line 375 in src/instructlab/sdg/generate_data.py — GitHub Actions / pylint (view workflow job for this annotation): C0303: Trailing whitespace (trailing-whitespace)
if "document" in samples.column_names:
pipe = knowledge_pipe
is_knowledge = True

elif samples[0].get("seed_context"):
elif "seed_context" in samples.column_names:
pipe = grounded_skills_pipe

else:
pipe = freeform_skills_pipe

logger.debug("Samples: %s", samples)
ds = Dataset.from_list(samples)
logger.debug("Dataset: %s", ds)
new_generated_data = pipe.generate(ds, leaf_node_path)

new_generated_data = pipe.generate(samples, leaf_node_path)
if len(new_generated_data) == 0:
empty_sdg_leaf_nodes.append(leaf_node_path)
logger.warning("Empty dataset for qna node: %s", leaf_node_path)
Expand All @@ -398,7 +403,7 @@ def generate_data(
generate_eval_task_data(
mmlu_bench_pipe,
leaf_node_path,
ds,
samples,
output_dir,
date_suffix,
)
Expand Down
1 change: 0 additions & 1 deletion src/instructlab/sdg/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@ def _generate_single(self, dataset) -> Dataset:
drop_duplicates_cols = block_prop.get("drop_duplicates", False)
block = block_type(self.ctx, self, block_name, **block_config)
logger.info("Running block: %s", block_name)
logger.info(dataset)

# Execute the block and wrap errors with the block name/type
dataset = block.generate(dataset)
Expand Down
Loading

0 comments on commit 2fb36f3

Please sign in to comment.