Skip to content

Commit

Permalink
Merge pull request #36 from makelinux/docs
Browse files Browse the repository at this point in the history
Add links to OpenAI and rouge_scorer tokenize docs or source
  • Loading branch information
russellb authored Jun 25, 2024
2 parents c4b55e5 + 010cd6b commit d395b77
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def post_process_gpt3_response(num_prompt_instructions, response, discarded_file
if response is None:
return [], 0
raw_instructions = (
# https://platform.openai.com/docs/api-reference/chat/object#chat/object-choices
f"* Task {num_prompt_instructions + 1}\n" + response.message.content
)
raw_instructions = re.split(r"\* Task \d+", raw_instructions)
Expand Down Expand Up @@ -521,6 +522,9 @@ def generate_data(
)

# similarities = {}
# Calculate ROUGE scores between two blobs of text.
# rougeL: Longest common subsequence based scoring.
# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py#L50
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

# now let's generate new instructions!
Expand All @@ -533,7 +537,9 @@ def generate_data(
d["instruction"] for d in machine_instruction_data
]
all_instruction_tokens = [
scorer._tokenizer.tokenize(inst) for inst in all_instructions
# https://github.com/google-research/google-research/blob/master/rouge/tokenize.py
scorer._tokenizer.tokenize(inst)
for inst in all_instructions
]

if console_output:
Expand Down Expand Up @@ -582,6 +588,7 @@ def generate_data(
assess_start = time.time()
for instruction_data_entry in instruction_data:
# computing similarity with the pre-tokenized instructions
# https://github.com/google-research/google-research/blob/master/rouge/tokenize.py
new_instruction_tokens = scorer._tokenizer.tokenize(
instruction_data_entry["instruction"]
)
Expand Down

0 comments on commit d395b77

Please sign in to comment.