forked from microsoft/CodeT
Commit
684909d (1 parent: 35f54d6)
Showing 5 changed files with 89 additions and 9 deletions.
@@ -0,0 +1,77 @@
import json

import torch
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer


class Tools:
    """Small helpers for reading and writing JSON Lines files."""

    @staticmethod
    def load_jsonl(path):
        with open(path, 'r') as f:
            return [json.loads(line) for line in f]

    @staticmethod
    def dump_jsonl(obj, path):
        with open(path, 'w') as f:
            for line in obj:
                f.write(json.dumps(line) + '\n')


class CodeGen:
    def __init__(self, model_name, batch_size):
        self.model_name = model_name
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        # Left-pad so generation continues from the end of each prompt in a
        # batch; reuse EOS as the pad token since the tokenizer has none.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
        self.tokenizer.add_special_tokens({'pad_token': self.tokenizer.eos_token})
        self.model.cuda()
        self.batch_size = batch_size
        print('done loading model')

    def _get_batches(self, prompts, batch_size):
        batches = []
        for i in range(0, len(prompts), batch_size):
            batches.append(prompts[i:i + batch_size])
        return batches

    def _generate_batch(self, prompt_batch, max_new_tokens=100):
        prompts = self.tokenizer(prompt_batch, return_tensors='pt', padding=True, truncation=True)

        with torch.no_grad():
            gen_tokens = self.model.generate(
                input_ids=prompts['input_ids'].cuda(),
                attention_mask=prompts['attention_mask'].cuda(),
                do_sample=False,  # greedy decoding
                max_new_tokens=max_new_tokens,
            )
        gen_text = self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
        # Strip the prompt prefix so only the newly generated text remains.
        for i in range(len(gen_text)):
            gen_text[i] = gen_text[i][len(prompt_batch[i]):]
        return gen_text

    def batch_generate(self, file):
        print(f'generating from {file}')
        lines = Tools.load_jsonl(file)
        # append a trailing newline to each prompt
        prompts = [f"{line['prompt']}\n" for line in lines]
        batches = self._get_batches(prompts, self.batch_size)
        gen_text = []
        for batch in tqdm.tqdm(batches):
            gen_text.extend(self._generate_batch(batch))
        print(f'generated {len(gen_text)} samples')
        assert len(gen_text) == len(prompts)
        new_lines = []
        for line, gen in zip(lines, gen_text):
            new_lines.append({
                'prompt': line['prompt'],
                'metadata': line['metadata'],
                'choices': [{'text': gen}]
            })
        # Write results next to the input file, suffixed with the model name.
        Tools.dump_jsonl(new_lines, file.replace('.jsonl', f'_{self.model_name.split("/")[-1]}.jsonl'))


if __name__ == '__main__':
    file_path = 'datasets/line_level_completion_1k_context_codegen.test.jsonl'
    tiny_codegen = 'Salesforce/codegen-350M-mono'

    cg = CodeGen(tiny_codegen, batch_size=8)
    cg.batch_generate(file_path)
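For reference, a minimal usage sketch follows. The input and output schema is taken from batch_generate above; the file name, prompt, and metadata values are illustrative, not part of this commit.

# Hypothetical example: write a one-line input file in the expected schema
# ('prompt' and 'metadata' fields), then run the generator. Note that
# CodeGen.__init__ calls model.cuda(), so a CUDA device is required.
import json

sample = {
    'prompt': 'def add(a, b):\n    return',  # code prefix to complete
    'metadata': {'task_id': 'example/0'},    # carried through unchanged
}
with open('datasets/example.test.jsonl', 'w') as f:
    f.write(json.dumps(sample) + '\n')

cg = CodeGen('Salesforce/codegen-350M-mono', batch_size=1)
cg.batch_generate('datasets/example.test.jsonl')
# completions land in datasets/example.test_codegen-350M-mono.jsonl,
# one object per line with the generation under choices[0]['text']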