Skip to content

Commit

Permalink
Merge pull request #36 from makelinux/docs
Browse files Browse the repository at this point in the history
Add links to OpenAI and rouge_scorer tokenize docs or source
  • Loading branch information
russellb authored Jun 25, 2024
2 parents c4b55e5 + 010cd6b commit d395b77
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def post_process_gpt3_response(num_prompt_instructions, response, discarded_file
if response is None:
return [], 0
raw_instructions = (
# https://platform.openai.com/docs/api-reference/chat/object#chat/object-choices
f"* Task {num_prompt_instructions + 1}\n" + response.message.content
)
raw_instructions = re.split(r"\* Task \d+", raw_instructions)
Expand Down Expand Up @@ -521,6 +522,9 @@ def generate_data(
)

# similarities = {}
# Calculate ROUGE scores between two blobs of text.
# rougeL: Longest common subsequence based scoring.
# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py#L50
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

# now let's generate new instructions!
Expand All @@ -533,7 +537,9 @@ def generate_data(
d["instruction"] for d in machine_instruction_data
]
all_instruction_tokens = [
scorer._tokenizer.tokenize(inst) for inst in all_instructions
# https://github.com/google-research/google-research/blob/master/rouge/tokenize.py
scorer._tokenizer.tokenize(inst)
for inst in all_instructions
]

if console_output:
Expand Down Expand Up @@ -582,6 +588,7 @@ def generate_data(
assess_start = time.time()
for instruction_data_entry in instruction_data:
# computing similarity with the pre-tokenized instructions
# https://github.com/google-research/google-research/blob/master/rouge/tokenize.py
new_instruction_tokens = scorer._tokenizer.tokenize(
instruction_data_entry["instruction"]
)
Expand Down

0 comments on commit d395b77

Please sign in to comment.