forked from e-p-armstrong/augmentoolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_text_to_jsonl.py
21 lines (17 loc) · 950 Bytes
/
convert_text_to_jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import json
def txt_to_single_jsonl(txt_file_path, jsonl_file_path):
    """Convert a .txt file into a .jsonl file holding a single record.

    The entire content of the text file is stored under the ``"text"`` key of
    one JSON object, which is written as the only line of the output file.
    Useful if you want to train on the raw text first to give a model broad
    knowledge, then train on the Augmentoolkit dataset to teach it to answer
    questions on the subject.

    Args:
        txt_file_path: Path of the UTF-8 encoded text file to read.
        jsonl_file_path: Path of the file to write; it is created if missing
            and overwritten if it already exists.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    with open(txt_file_path, "r", encoding="utf-8") as txt_file:
        # Read the whole file at once so the text stays one single record.
        file_content = txt_file.read()

    json_obj = {"text": file_content}

    with open(jsonl_file_path, "w", encoding="utf-8") as jsonl_file:
        # The file is UTF-8, so keep non-ASCII characters as-is instead of
        # expanding each one into a \uXXXX escape; the parsed data is the same.
        # json.dumps still escapes newlines, so the record stays on one line.
        jsonl_file.write(json.dumps(json_obj, ensure_ascii=False) + "\n")
# Example usage. The paths are plain module-level values; the conversion itself
# only runs when this file is executed as a script, so importing
# txt_to_single_jsonl from another module does not touch the filesystem.
txt_file_path = './raw_txt_input/on_war_clausewitz.txt'
jsonl_file_path = './on_war_clausewitz.json'

if __name__ == "__main__":
    txt_to_single_jsonl(txt_file_path, jsonl_file_path)
    print("Conversion completed.")