fix wrong dataset file path #20

Open · wants to merge 8 commits into main
1 change: 1 addition & 0 deletions RepoCoder/README.md
@@ -22,6 +22,7 @@ This project contains the basic components of RepoCoder. Here is an overview:
 |-- build_prompt.py # build the prompt with the unfinished code and the retrieved code snippets
 |-- run_pipeline.py # run the code completion pipeline
 |-- compute_score.py # evaluate the performance of the code completion
+|-- codegen_inference.py # an example script for using CodeGen to generate code completions
 |-- utils.py # utility functions
 |-- datasets/datasets.zip # the input data for the code completion task
 |-- function_level_completion_4k_context_codex.test.jsonl
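Since this PR is about a wrong dataset file path, note that the paths listed above only resolve once datasets/datasets.zip has been extracted in place. A minimal sketch, assuming the archive unpacks its .jsonl files directly into datasets/ (the file name comes from the README listing; the record fields are assumptions based on how build_prompt.py reads them):

import json
import zipfile

# Assumes the working directory is RepoCoder/ and that the archive's members
# sit at its root; adjust the extraction target if the layout differs.
with zipfile.ZipFile('datasets/datasets.zip') as zf:
    zf.extractall('datasets')

# Peek at the first task of one extracted test file.
path = 'datasets/function_level_completion_4k_context_codex.test.jsonl'
with open(path) as f:
    task = json.loads(f.readline())
print(sorted(task.keys()))  # expected to include 'prompt' and 'metadata'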
57 changes: 33 additions & 24 deletions RepoCoder/build_prompt.py
@@ -39,31 +39,37 @@ def _make_a_block(self, retrieved_context):
         token_len = len(tokenized_block)
         return block_str, token_len
 
-    def _make_an_extended_block(self, retrieved_context):
+    def _make_an_extended_block(self, task_metadata, retrieved_context):
         content, sim_score = retrieved_context
         metadata = content['metadata']
-        # put the file path in the comment
-        assert metadata[0]['fpath_tuple'][0] == metadata[0]['repo']
-        f_paths = ['/'.join(x['fpath_tuple'][1:]) for x in metadata]
-        f_paths_str = '\n'.join([f'# {f_path}' for f_path in f_paths])
-        f_path_comment = f'# the below code fragment can be found in:'
-        # put code lines in the comment
-        original_code = Tools.read_code(os.path.join(FilePathBuilder.repo_base_dir, *metadata[0]['fpath_tuple']))
-        code_lines = original_code.splitlines()
-        end_line_no = metadata[0]['end_line_no']
-        window_size = metadata[0]['window_size']
-        slice_size = metadata[0]['slice_size']
-        new_end_line_no = min(end_line_no + window_size // slice_size, len(code_lines))
-        new_start_line_no = max(0, new_end_line_no - window_size)
-        content_lines = code_lines[new_start_line_no:new_end_line_no]
-        content_lines_comment = [f'# {line}' for line in content_lines]
-        # aggregate the comment and the code lines
-        block_str = '\n'.join([f_path_comment, f_paths_str, self.seperator] + content_lines_comment + [self.seperator]) + '\n'
-        tokenized_block = self.tokenizer.tokenize(block_str)
-        token_len = len(tokenized_block)
-        return block_str, token_len
+        duplicate_num = len(metadata)  # for entries that share the exact same code fragment across different files
+        for i in range(duplicate_num):
+            # put the file path in the comment
+            assert metadata[i]['fpath_tuple'][0] == metadata[i]['repo']
+            f_paths = ['/'.join(x['fpath_tuple'][1:]) for x in metadata]
+            f_paths_str = '\n'.join([f'# {f_path}' for f_path in f_paths])
+            f_path_comment = f'# the below code fragment can be found in:'
+            # put code lines in the comment
+            original_code = Tools.read_code(os.path.join(FilePathBuilder.repo_base_dir, *metadata[i]['fpath_tuple']))
+            code_lines = original_code.splitlines()
+            end_line_no = metadata[i]['end_line_no']
+            window_size = metadata[i]['window_size']
+            slice_size = metadata[i]['slice_size']
+            new_end_line_no = min(end_line_no + window_size // slice_size, len(code_lines))
+            new_start_line_no = max(0, new_end_line_no - window_size)
+            if metadata[i]['fpath_tuple'] == tuple(task_metadata['fpath_tuple']) and new_end_line_no >= task_metadata['line_no']:
+                continue  # this fragment would leak the line being completed
+            content_lines = code_lines[new_start_line_no:new_end_line_no]
+            content_lines_comment = [f'# {line}' for line in content_lines]
+            # aggregate the comment and the code lines
+            block_str = '\n'.join([f_path_comment, f_paths_str, self.seperator] + content_lines_comment + [self.seperator]) + '\n'
+            tokenized_block = self.tokenizer.tokenize(block_str)
+            token_len = len(tokenized_block)
+            return block_str, token_len
+        else:
+            return '', 0
 
-    def _build_prompt(self, mode, prompt, top_k_context):
+    def _build_prompt(self, mode, prompt, task_metadata, top_k_context):
         prepend_context = "# Here are some relevant code fragments from other files of the repo:\n"
         prepend_context += self.seperator + '\n'
         current_token_length = 20  # the length of the head_prompt, same for codex and codegen tokenizer
@@ -73,7 +79,10 @@ def _build_prompt(self, mode, prompt, top_k_context):
         for retrieved_context in top_k_context[::-1]:
             if len(chosen_context) >= self.max_examples:
                 break
-            block_str, token_len = make_block_func(retrieved_context)
+            kwargs = {'retrieved_context': retrieved_context}
+            if mode == CONSTANTS.rg:
+                kwargs['task_metadata'] = task_metadata
+            block_str, token_len = make_block_func(**kwargs)
             if current_token_length + token_len < self.max_retrieval_length:
                 prepend_blocks.insert(0, block_str)
                 current_token_length += token_len
@@ -90,7 +99,7 @@ def build_2nd_stage_input_file(self, mode):
             task = self.tasks_by_task_id[task_id]
             old_prompt = task['prompt']
             top_k_context = query_line['top_k_context']
-            new_prompt, chosen_context = self._build_prompt(mode, old_prompt, top_k_context)
+            new_prompt, chosen_context = self._build_prompt(mode, old_prompt, task['metadata'], top_k_context)
             new_prompt_line = {
                 'prompt': new_prompt,
                 'metadata': task['metadata'],
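The core of this change is the new skip condition: a retrieved fragment that comes from the task's own file, with an extended window reaching past the line being completed, would paste the ground truth into the prompt, so the loop moves on to the next duplicate instead (and falls back to an empty block when every duplicate is skipped). A self-contained sketch of that check, with illustrative metadata in place of the real retrieval records:

# Minimal sketch of the leakage check added above; the values are
# illustrative, not taken from the real dataset.
def leaks_ground_truth(fragment_meta, task_meta, new_end_line_no):
    # True when the fragment comes from the task's own file and its
    # extended window reaches the line that is being completed.
    same_file = tuple(fragment_meta['fpath_tuple']) == tuple(task_meta['fpath_tuple'])
    return same_file and new_end_line_no >= task_meta['line_no']

task = {'fpath_tuple': ['repo', 'pkg', 'main.py'], 'line_no': 120}
fragment = {'fpath_tuple': ('repo', 'pkg', 'main.py')}
print(leaks_ground_truth(fragment, task, new_end_line_no=125))  # True -> skipped
print(leaks_ground_truth(fragment, task, new_end_line_no=90))   # False -> usable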
77 changes: 77 additions & 0 deletions RepoCoder/codegen_inference.py
@@ -0,0 +1,77 @@
+import torch
+import tqdm
+import json
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+class Tools:
+    @staticmethod
+    def load_jsonl(path):
+        with open(path, 'r') as f:
+            return [json.loads(line) for line in f.readlines()]
+
+    @staticmethod
+    def dump_jsonl(obj, path):
+        with open(path, 'w') as f:
+            for line in obj:
+                f.write(json.dumps(line) + '\n')
+
+
+class CodeGen:
+    def __init__(self, model_name, batch_size):
+        self.model_name = model_name
+        self.model = AutoModelForCausalLM.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
+        self.tokenizer.add_special_tokens({'pad_token': self.tokenizer.eos_token})
+        self.model.cuda()
+        self.batch_size = batch_size
+        print('done loading model')
+
+    def _get_batches(self, prompts, batch_size):
+        batches = []
+        for i in range(0, len(prompts), batch_size):
+            batches.append(prompts[i:i+batch_size])
+        return batches
+
+    def _generate_batch(self, prompt_batch, max_new_tokens=100):
+        prompts = self.tokenizer(prompt_batch, return_tensors='pt', padding=True, truncation=True)
+
+        with torch.no_grad():
+            gen_tokens = self.model.generate(
+                input_ids=prompts['input_ids'].cuda(),
+                attention_mask=prompts['attention_mask'].cuda(),
+                do_sample=False,
+                max_new_tokens=max_new_tokens,
+            )
+        gen_text = self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
+        for i in range(len(gen_text)):  # strip the echoed prompt from each sample
+            gen_text[i] = gen_text[i][len(prompt_batch[i]):]
+        return gen_text
+
+    def batch_generate(self, file):
+        print(f'generating from {file}')
+        lines = Tools.load_jsonl(file)
+        # make sure every prompt ends with a newline
+        prompts = [f"{line['prompt']}\n" for line in lines]
+        batches = self._get_batches(prompts, self.batch_size)
+        gen_text = []
+        for batch in tqdm.tqdm(batches):
+            gen_text.extend(self._generate_batch(batch))
+        print(f'generated {len(gen_text)} samples')
+        assert len(gen_text) == len(prompts)
+        new_lines = []
+        for line, gen in zip(lines, gen_text):
+            new_lines.append({
+                'prompt': line['prompt'],
+                'metadata': line['metadata'],
+                'choices': [{'text': gen}]
+            })
+        Tools.dump_jsonl(new_lines, file.replace('.jsonl', f'_{self.model_name.split("/")[-1]}.jsonl'))
+
+
+if __name__ == '__main__':
+    file_path = 'datasets/line_level_completion_1k_context_codegen.test.jsonl'
+    tiny_codegen = 'Salesforce/codegen-350M-mono'
+
+    cg = CodeGen(tiny_codegen, batch_size=8)
+    cg.batch_generate(file_path)
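For reference, batch_generate derives its output path from the input path and the model name, so the run above should write datasets/line_level_completion_1k_context_codegen.test_codegen-350M-mono.jsonl. A quick sketch for inspecting the results, assuming that file exists and using the record layout written by the script:

import json

# batch_generate inserts the model name before the .jsonl extension;
# for the __main__ run above that yields the path below.
out_path = 'datasets/line_level_completion_1k_context_codegen.test_codegen-350M-mono.jsonl'

with open(out_path) as f:
    for line in list(f)[:3]:  # eyeball the first three samples
        record = json.loads(line)
        print(repr(record['choices'][0]['text']))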