fix wrong dataset file path #20

Open · wants to merge 8 commits into main
1 change: 1 addition & 0 deletions RepoCoder/README.md
@@ -22,6 +22,7 @@ This project contains the basic components of RepoCoder. Here is an overview:
 |-- build_prompt.py # build the prompt with the unfinished code and the retrieved code snippets
 |-- run_pipeline.py # run the code completion pipeline
 |-- compute_score.py # evaluate the performance of the code completion
+|-- codegen_inference.py # an example script for using CodeGen to generate code completions
 |-- utils.py # utility functions
 |-- datasets/datasets.zip # the input data for the code completion task
 |-- function_level_completion_4k_context_codex.test.jsonl
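Since this PR is about a wrong dataset file path, note that the paths listed above only resolve once datasets/datasets.zip has been extracted in place. A minimal sketch, assuming the archive unpacks its .jsonl files directly into datasets/ (the file name comes from the README listing; the record fields are assumptions based on how build_prompt.py reads them):

import json
import zipfile

# Assumes the working directory is RepoCoder/ and that the archive's members
# sit at its root; adjust the extraction target if the layout differs.
with zipfile.ZipFile('datasets/datasets.zip') as zf:
    zf.extractall('datasets')

# Peek at the first task of one extracted test file.
path = 'datasets/function_level_completion_4k_context_codex.test.jsonl'
with open(path) as f:
    task = json.loads(f.readline())
print(sorted(task.keys()))  # expected to include 'prompt' and 'metadata'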
57 changes: 33 additions & 24 deletions RepoCoder/build_prompt.py
@@ -39,31 +39,37 @@ def _make_a_block(self, retrieved_context):
         token_len = len(tokenized_block)
         return block_str, token_len
 
-    def _make_an_extended_block(self, retrieved_context):
+    def _make_an_extended_block(self, task_metadata, retrieved_context):
         content, sim_score = retrieved_context
         metadata = content['metadata']
-        # put the file path in the comment
-        assert metadata[0]['fpath_tuple'][0] == metadata[0]['repo']
-        f_paths = ['/'.join(x['fpath_tuple'][1:]) for x in metadata]
-        f_paths_str = '\n'.join([f'# {f_path}' for f_path in f_paths])
-        f_path_comment = f'# the below code fragment can be found in:'
-        # put code lines in the comment
-        original_code = Tools.read_code(os.path.join(FilePathBuilder.repo_base_dir, *metadata[0]['fpath_tuple']))
-        code_lines = original_code.splitlines()
-        end_line_no = metadata[0]['end_line_no']
-        window_size = metadata[0]['window_size']
-        slice_size = metadata[0]['slice_size']
-        new_end_line_no = min(end_line_no + window_size // slice_size, len(code_lines))
-        new_start_line_no = max(0, new_end_line_no - window_size)
-        content_lines = code_lines[new_start_line_no:new_end_line_no]
-        content_lines_comment = [f'# {line}' for line in content_lines]
-        # aggregate the comment and the code lines
-        block_str = '\n'.join([f_path_comment, f_paths_str, self.seperator] + content_lines_comment + [self.seperator]) + '\n'
-        tokenized_block = self.tokenizer.tokenize(block_str)
-        token_len = len(tokenized_block)
-        return block_str, token_len
+        duplicate_num = len(metadata)  # for entries that share the exact same code fragment across different files
+        for i in range(duplicate_num):
+            # put the file path in the comment
+            assert metadata[i]['fpath_tuple'][0] == metadata[i]['repo']
+            f_paths = ['/'.join(x['fpath_tuple'][1:]) for x in metadata]
+            f_paths_str = '\n'.join([f'# {f_path}' for f_path in f_paths])
+            f_path_comment = f'# the below code fragment can be found in:'
+            # put code lines in the comment
+            original_code = Tools.read_code(os.path.join(FilePathBuilder.repo_base_dir, *metadata[i]['fpath_tuple']))
+            code_lines = original_code.splitlines()
+            end_line_no = metadata[i]['end_line_no']
+            window_size = metadata[i]['window_size']
+            slice_size = metadata[i]['slice_size']
+            new_end_line_no = min(end_line_no + window_size // slice_size, len(code_lines))
+            new_start_line_no = max(0, new_end_line_no - window_size)
+            if metadata[i]['fpath_tuple'] == tuple(task_metadata['fpath_tuple']) and new_end_line_no >= task_metadata['line_no']:
+                continue  # this fragment would leak the line being completed
+            content_lines = code_lines[new_start_line_no:new_end_line_no]
+            content_lines_comment = [f'# {line}' for line in content_lines]
+            # aggregate the comment and the code lines
+            block_str = '\n'.join([f_path_comment, f_paths_str, self.seperator] + content_lines_comment + [self.seperator]) + '\n'
+            tokenized_block = self.tokenizer.tokenize(block_str)
+            token_len = len(tokenized_block)
+            return block_str, token_len
+        else:
+            return '', 0
 
-    def _build_prompt(self, mode, prompt, top_k_context):
+    def _build_prompt(self, mode, prompt, task_metadata, top_k_context):
         prepend_context = "# Here are some relevant code fragments from other files of the repo:\n"
         prepend_context += self.seperator + '\n'
         current_token_length = 20  # the length of the head_prompt, same for codex and codegen tokenizer
@@ -73,7 +79,10 @@ def _build_prompt(self, mode, prompt, top_k_context):
         for retrieved_context in top_k_context[::-1]:
             if len(chosen_context) >= self.max_examples:
                 break
-            block_str, token_len = make_block_func(retrieved_context)
+            kwargs = {'retrieved_context': retrieved_context}
+            if mode == CONSTANTS.rg:
+                kwargs['task_metadata'] = task_metadata
+            block_str, token_len = make_block_func(**kwargs)
             if current_token_length + token_len < self.max_retrieval_length:
                 prepend_blocks.insert(0, block_str)
                 current_token_length += token_len
@@ -90,7 +99,7 @@ def build_2nd_stage_input_file(self, mode):
             task = self.tasks_by_task_id[task_id]
             old_prompt = task['prompt']
             top_k_context = query_line['top_k_context']
-            new_prompt, chosen_context = self._build_prompt(mode, old_prompt, top_k_context)
+            new_prompt, chosen_context = self._build_prompt(mode, old_prompt, task['metadata'], top_k_context)
             new_prompt_line = {
                 'prompt': new_prompt,
                 'metadata': task['metadata'],
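The core of this change is the new skip condition: a retrieved fragment that comes from the task's own file, with an extended window reaching past the line being completed, would paste the ground truth into the prompt, so the loop moves on to the next duplicate instead (and falls back to an empty block when every duplicate is skipped). A self-contained sketch of that check, with illustrative metadata in place of the real retrieval records:

# Minimal sketch of the leakage check added above; the values are
# illustrative, not taken from the real dataset.
def leaks_ground_truth(fragment_meta, task_meta, new_end_line_no):
    # True when the fragment comes from the task's own file and its
    # extended window reaches the line that is being completed.
    same_file = tuple(fragment_meta['fpath_tuple']) == tuple(task_meta['fpath_tuple'])
    return same_file and new_end_line_no >= task_meta['line_no']

task = {'fpath_tuple': ['repo', 'pkg', 'main.py'], 'line_no': 120}
fragment = {'fpath_tuple': ('repo', 'pkg', 'main.py')}
print(leaks_ground_truth(fragment, task, new_end_line_no=125))  # True -> skipped
print(leaks_ground_truth(fragment, task, new_end_line_no=90))   # False -> usable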
77 changes: 77 additions & 0 deletions RepoCoder/codegen_inference.py
@@ -0,0 +1,77 @@
+import torch
+import tqdm
+import json
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+class Tools:
+    @staticmethod
+    def load_jsonl(path):
+        with open(path, 'r') as f:
+            return [json.loads(line) for line in f.readlines()]
+
+    @staticmethod
+    def dump_jsonl(obj, path):
+        with open(path, 'w') as f:
+            for line in obj:
+                f.write(json.dumps(line) + '\n')
+
+
+class CodeGen:
+    def __init__(self, model_name, batch_size):
+        self.model_name = model_name
+        self.model = AutoModelForCausalLM.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
+        self.tokenizer.add_special_tokens({'pad_token': self.tokenizer.eos_token})
+        self.model.cuda()
+        self.batch_size = batch_size
+        print('done loading model')
+
+    def _get_batches(self, prompts, batch_size):
+        batches = []
+        for i in range(0, len(prompts), batch_size):
+            batches.append(prompts[i:i+batch_size])
+        return batches
+
+    def _generate_batch(self, prompt_batch, max_new_tokens=100):
+        prompts = self.tokenizer(prompt_batch, return_tensors='pt', padding=True, truncation=True)
+
+        with torch.no_grad():
+            gen_tokens = self.model.generate(
+                input_ids=prompts['input_ids'].cuda(),
+                attention_mask=prompts['attention_mask'].cuda(),
+                do_sample=False,
+                max_new_tokens=max_new_tokens,
+            )
+        gen_text = self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
+        for i in range(len(gen_text)):  # strip the echoed prompt from each sample
+            gen_text[i] = gen_text[i][len(prompt_batch[i]):]
+        return gen_text
+
+    def batch_generate(self, file):
+        print(f'generating from {file}')
+        lines = Tools.load_jsonl(file)
+        # make sure every prompt ends with a newline
+        prompts = [f"{line['prompt']}\n" for line in lines]
+        batches = self._get_batches(prompts, self.batch_size)
+        gen_text = []
+        for batch in tqdm.tqdm(batches):
+            gen_text.extend(self._generate_batch(batch))
+        print(f'generated {len(gen_text)} samples')
+        assert len(gen_text) == len(prompts)
+        new_lines = []
+        for line, gen in zip(lines, gen_text):
+            new_lines.append({
+                'prompt': line['prompt'],
+                'metadata': line['metadata'],
+                'choices': [{'text': gen}]
+            })
+        Tools.dump_jsonl(new_lines, file.replace('.jsonl', f'_{self.model_name.split("/")[-1]}.jsonl'))
+
+
+if __name__ == '__main__':
+    file_path = 'datasets/line_level_completion_1k_context_codegen.test.jsonl'
+    tiny_codegen = 'Salesforce/codegen-350M-mono'
+
+    cg = CodeGen(tiny_codegen, batch_size=8)
+    cg.batch_generate(file_path)
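For reference, batch_generate derives its output path from the input path and the model name, so the run above should write datasets/line_level_completion_1k_context_codegen.test_codegen-350M-mono.jsonl. A quick sketch for inspecting the results, assuming that file exists and using the record layout written by the script:

import json

# batch_generate inserts the model name before the .jsonl extension;
# for the __main__ run above that yields the path below.
out_path = 'datasets/line_level_completion_1k_context_codegen.test_codegen-350M-mono.jsonl'

with open(out_path) as f:
    for line in list(f)[:3]:  # eyeball the first three samples
        record = json.loads(line)
        print(repr(record['choices'][0]['text']))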