Skip to content

Commit

Permalink
convert chunked dataset to list
Browse files (browse the repository at this point in the history)
  • Loading branch information
khaledsulayman committed Sep 27, 2024
1 parent a70e2bc commit c1fd1d4
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/instructlab/sdg/utils/docprocessor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -587,10 +587,14 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name
for entry in pd:
json_file.write(json.dumps(entry) + "\n")

chunked_pdfs = DocProcessor(
dp = DocProcessor(
parsed_doc_dir=str(docling_jsons_path),
tokenizer_model_name=model_name,
qna_yaml_path=Path("~/.local/share/instructlab/taxonomy").expanduser() / leaf_node_path / "qna.yaml",
)

chunked_pdfs = list(dp.get_processed_dataset())
print(f"THIS IS KHALED: {chunk_pdfs=}")
print(f"THIS IS KHALED: {type(chunk_pdfs)=}")

return chunked_pdfs

0 comments on commit c1fd1d4

Please sign in to comment.