diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e612d423..4bcc07cb 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -60,9 +60,9 @@ jobs:
     # runs-on: ubuntu-22.04

    # do not expose sensitive environment variables in this yaml
-    env:
+    # env:
       # TODO: dynamically choose GPUs if tests end up using them
-      CUDA_VISIBLE_DEVICES: 1
+      # CUDA_VISIBLE_DEVICES: 1

    steps:
      - uses: actions/checkout@v2
@@ -129,19 +129,19 @@ jobs:
         run: |
           . .venv/bin/activate
           ls -la
-          python -m unittest tests/test_datasets.py
+          python -m unittest -v tests/test_datasets.py

      - name: Test executors
        run: |
          . .venv/bin/activate
-          python -m unittest tests/test_executors.py
+          python -m unittest -v tests/test_executors.py

      - name: Test inference pipeline
        run: |
          . .venv/bin/activate
-          python -m unittest tests/test_inference_pipeline.py
+          python -m unittest -v tests/test_inference_pipeline.py

      - name: Test models
        run: |
          . .venv/bin/activate
-          python -m unittest tests/test_models.py
\ No newline at end of file
+          python -m unittest -v tests/test_models.py
\ No newline at end of file
diff --git a/finetuning/lightning_modules/datasets/mathqa_reader.py b/finetuning/lightning_modules/datasets/mathqa_reader.py
index aca23f9a..92322026 100644
--- a/finetuning/lightning_modules/datasets/mathqa_reader.py
+++ b/finetuning/lightning_modules/datasets/mathqa_reader.py
@@ -13,7 +13,8 @@ def get_train_instance(self, example: Dict[str, Any]) -> List[Dict[str, Any]]:

    def get_test_instance(self, example: Dict[str, Any]) -> List[Dict[str, Any]]:
        # parse the answer and add the field
        example["original_answer"] = example["answer"]
-        example["answer"] = example["answer"].split("\n####")[-1].strip()
+        # TODO: in data/mathqa/val_dedup.jsonl, example["answer"] are floats
+        # example["answer"] = example["answer"].split("\n####")[-1].strip()

        return [self.get_example_dict(example, example["text"], "", train_mode=False)]
diff --git a/finetuning/lightning_modules/datasets/spider_reader.py b/finetuning/lightning_modules/datasets/spider_reader.py
index 658667cf..c085701e 100644
--- a/finetuning/lightning_modules/datasets/spider_reader.py
+++ b/finetuning/lightning_modules/datasets/spider_reader.py
@@ -9,6 +9,11 @@

 from finetuning.lightning_modules.datasets.base_reader import NL2CodeDataset, FewShotNL2CodeDataset

+# DB_INFO_FILE = os.path.join(os.path.dirname(__file__), '../../../data/squall/db_info_wtq.json')
+DB_INFO_FILE = os.path.join(os.path.dirname(__file__), f"{os.environ['NLP4CODE_TEST_DATA_PATH']}/squall/db_info_wtq.json")
+with open(DB_INFO_FILE, "r") as f:
+    full_db_info = json.load(f)
+

 class FewShotSpiderDataset(FewShotNL2CodeDataset):

diff --git a/finetuning/lightning_modules/models/seq2seq_model_util.py b/finetuning/lightning_modules/models/seq2seq_model_util.py
index 6bc49bfb..8d0b5e5e 100755
--- a/finetuning/lightning_modules/models/seq2seq_model_util.py
+++ b/finetuning/lightning_modules/models/seq2seq_model_util.py
@@ -83,14 +83,19 @@ def get_model(model_name: str,
                                              use_cache=not gradient_ckpt,
                                              **additional_init_args)

    elif model_name.startswith("Salesforce/codegen-"):
+        # TODO: using float32 here for tests
+        # RunTime error: "LayerNormKernelImpl" not implemented for 'Half' codegen
+        # https://github.com/huggingface/transformers/issues/21989
         tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  additional_special_tokens=additional_special_tokens)
+                                                  additional_special_tokens=additional_special_tokens,
+                                                  torch_dtype=torch.float32)
+        tokenizer.pad_token = tokenizer.eos_token
         if not tokenizer_only:
             model = AutoModelForCausalLM.from_pretrained(model_name,
                                                          pad_token_id=tokenizer.eos_token_id,
-                                                         torch_dtype=torch.float16,
+                                                         torch_dtype=torch.float32,
                                                          # device_map="auto",
                                                          use_cache=True)

    elif model_name.startswith("bigscience/bloom-"):
diff --git a/tests/consts.py b/tests/consts.py
new file mode 100644
index 00000000..93c20d4c
--- /dev/null
+++ b/tests/consts.py
@@ -0,0 +1,211 @@
+import os
+from typing import List, Dict, Tuple, Optional
+
+NLP4CODE_TEST_DATA_PATH = os.environ["NLP4CODE_TEST_DATA_PATH"]
+
+
+from finetuning.lightning_modules.datasets.base_reader import (
+    FewShotNL2CodeDataset,
+    NL2CodeDataset,
+)
+from finetuning.lightning_modules.datasets.mathqa_reader import (
+    FewShotMathQADataset,
+    MathQADataset,
+)
+from finetuning.lightning_modules.datasets.mbpp_reader import (
+    FewShotMBPPDataset,
+)
+from finetuning.lightning_modules.datasets.spider_reader import (
+    FewShotSpiderDataset,
+    SpiderDataset,
+)
+
+from execution.executors import (
+    BaseExecutor,
+    MathExecutor,
+    MBPPExecutor,
+    SpiderExecutor,
+    WTQExecutor,
+)
+
+
+# TODO: use special test string for test transformer model name? (don't load model)
+TEST_TRANSFORMER_MODEL_NAME = "EleutherAI/gpt-neo-125M"
+
+
+# ======== datasets ========
+
+# defines kwargs needed to initialize NL2CodeDataset
+class TestDatasetInitKwargs:
+    transformer_model_name: str
+    file_path: str
+    mode: str
+
+    def __init__(
+        self,
+        file_path: str,
+        mode: Optional[str] = "train",  # default to train
+        transformer_model_name: Optional[str] = TEST_TRANSFORMER_MODEL_NAME,
+    ):
+        self.file_path = file_path
+        self.mode = mode
+        self.transformer_model_name = transformer_model_name
+
+
+DATASETS: List[Tuple[NL2CodeDataset, TestDatasetInitKwargs]] = [
+    (
+        MathQADataset,
+        TestDatasetInitKwargs(
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/mathqa/train_dedup.jsonl",
+        ),
+    ),
+    # TODO: SpiderDataset prompt_function
+    # (
+    #     SpiderDataset,
+    #     TestDatasetInitKwargs(
+    #         file_path=f"{NLP4CODE_TEST_DATA_PATH}/spider/train_spider_processed_v2.jsonl",
+    #     ),
+    # ),
+]
+
+
+# defines kwargs needed to instantiate FewShotNL2CodeDataset
+class TestFewShotDatasetInitKwargs(TestDatasetInitKwargs):
+    transformer_model_name: str
+    file_path: str
+    exemplar_file_path: str
+    mode: str = "test"
+
+    def __init__(
+        self,
+        file_path: str,
+        exemplar_file_path: str,
+        transformer_model_name: Optional[str] = TEST_TRANSFORMER_MODEL_NAME,
+    ):
+        super().__init__(
+            file_path=file_path,
+            transformer_model_name=transformer_model_name,
+            mode="test",
+        )
+        self.exemplar_file_path = exemplar_file_path
+
+
+# TODO: better way to do this? (custom types for each kwargs?)
+# TODO: make sure to keep dataset files up to date here
+# list of (dataset, **init_kwargs) tuples
+FEW_SHOT_DATASETS: List[Tuple[FewShotNL2CodeDataset, TestFewShotDatasetInitKwargs]] = [
+    (
+        FewShotMathQADataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/mathqa-non_idiomatic_code-annotated-8_exemplars.jsonl",
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/mathqa/val_dedup_init_val.jsonl",
+        ),
+    ),
+    (
+        FewShotMBPPDataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/mbpp-official_first_3-10_exemplars.jsonl",
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/mbpp/mbpp_test.jsonl",
+        ),
+    ),
+    (
+        FewShotSpiderDataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/spider-8_exemplars.jsonl",
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/spider/dev_processed_db_path.jsonl",
+        ),
+    ),
+    (
+        FewShotSpiderDataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/wtq-8_exemplars.jsonl",
+            # TODO: why does wtq_restored_dev.jsonl error
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/squall/wtq_restored_test.jsonl",
+        ),
+    ),
+]
+
+
+# ======== models ========
+
+TEST_MODEL_TRANSFORMER_MODEL_NAMES: List[str] = [
+    "EleutherAI/gpt-neo-125M",
+    "Salesforce/codet5-small",
+    "Salesforce/codegen-350M-multi",
+]
+
+TEST_MODEL_EXECUTOR_CLS = "execution.executors.MathExecutor"
+
+
+# ======== executors ========
+
+TEST_PROGRAM = "answer = 5"
+
+# Tuple[ExecutorCls, program, example]
+TEST_EXECUTORS: List[Tuple[BaseExecutor, str, Dict]] = [
+    (
+        MathExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "answer": 5,
+        },
+    ),
+    (
+        MBPPExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "answer": 5,
+            "code": "return 5",
+            "task_id": "xyz",
+            "test_setup_code": 'print("setup")',
+            "test_list": ["assert 1+1 == 2", "assert 1+1 != 3"],
+        },
+    ),
+    (
+        SpiderExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "db_id": "my_db_id",
+            "query": "SELECT * FROM table",
+        },
+    ),
+    (
+        WTQExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "db_id": "my_db_id",
+            "db_path": "path/to/my/db",
+            "original_answer": 5,
+        },
+    ),
+]
+
+
+# ======== integration ========
+
+TEST_PIPELINE_YAML_CONFIG_FILE = "tests/test_configs/few_shot-pipeline.yaml"
+
+# TODO: more datasets (see SummerTime matrix)
+# each tuple contains model_name, Pytorch Lightning config YAML file, val_file_path
+TEST_PIPELINE_INFO: List[Tuple[str, str, str]] = [
+    (
+        "EleutherAI/gpt-neo-125M",
+        TEST_PIPELINE_YAML_CONFIG_FILE,
+        "$NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl",
+    ),
+    # TODO: tensor dimension mismatch error for codet5-small (probably config file problem)
+    # (
+    #     "Salesforce/codet5-small",
+    #     TEST_PIPELINE_YAML_CONFIG_FILE,
+    #     "$NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl",
+    # ),
+    (
+        "Salesforce/codegen-350M-multi",
+        TEST_PIPELINE_YAML_CONFIG_FILE,
+        "$NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl",
+    ),
+]
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 384b7c60..859ad623 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,14 +1,111 @@
 import unittest

 from os import path, sys
+from typing import List, Tuple, Dict

 ROOT_DIR = path.dirname(path.dirname(path.abspath(__file__)))
 sys.path.append(ROOT_DIR)

-from execution.executors import MathExecutor
+
+# from tests.consts import DATA_MODULES, DATASETS, FEW_SHOT_DATASETS
+from tests.consts import DATASETS, FEW_SHOT_DATASETS, TestDatasetInitKwargs
+
+from torch.utils.data import DataLoader
+
+from finetuning.lightning_modules.datasets.base_datamodule import (
+    NL2CodeDataModule,
+    FewShotNL2CodeDataModule,
+)
+
+
+# test cases to add:
+# - test base_reader classes are abstract
+# - test different modes (train, test, few_shot_test)


 class TestDatasets(unittest.TestCase):
-    def test_gsmath(self):
-        # TODO: this is dummy test
-        self.assertTrue(True)
+    # TODO: NotImplemented error testing
+    def test_few_shot_datasets(self):
+        for few_shot_dataset_cls, few_shot_dataset_init_kwargs in FEW_SHOT_DATASETS:
+            print(
+                f"\n======== testing few-shot dataset {few_shot_dataset_cls.__name__} ========"
+            )
+            few_shot_dataset = few_shot_dataset_cls(
+                **vars(few_shot_dataset_init_kwargs),
+            )
+
+    def test_finetune_datasets(self):
+        for finetune_dataset_cls, finetune_dataset_init_kwargs in DATASETS:
+            print(
+                f"\n======== testing finetune dataset {finetune_dataset_cls.__name__} ========"
+            )
+            finetune_dataset = finetune_dataset_cls(
+                **vars(finetune_dataset_init_kwargs)
+            )
+
+
+def create_data_module_init_kwargs(
+    dataset_init_kwargs: TestDatasetInitKwargs, is_few_shot: bool
+) -> Dict:
+    dataset_init_kwargs_dict = vars(dataset_init_kwargs)
+    data_module_init_kwargs_dict = dataset_init_kwargs_dict.copy()
+
+    data_module_init_kwargs_dict["batch_size"] = 1
+
+    data_module_init_kwargs_dict["val_file_path"] = data_module_init_kwargs_dict[
+        "file_path"
+    ]
+    data_module_init_kwargs_dict["val_batch_size"] = 1
+    if not is_few_shot:
+        data_module_init_kwargs_dict["train_file_path"] = data_module_init_kwargs_dict[
+            "file_path"
+        ]
+
+    del data_module_init_kwargs_dict["file_path"]
+    del data_module_init_kwargs_dict["mode"]
+    return data_module_init_kwargs_dict
+
+
+class TestDataModules(unittest.TestCase):
+    def test_few_shot_data_modules(self):
+        # instantiate each few shot dataset as part of a data module
+        for few_shot_dataset_cls, few_shot_dataset_init_kwargs in FEW_SHOT_DATASETS:
+            print(
+                f"\n======== testing few-shot DataModule with {few_shot_dataset_cls.__name__} ========"
+            )
+            few_shot_dataset_cls_str = few_shot_dataset_cls.__name__
+            few_shot_data_module_init_kwargs = create_data_module_init_kwargs(
+                few_shot_dataset_init_kwargs, True
+            )
+
+            few_shot_data_module = FewShotNL2CodeDataModule(
+                dataset_cls=few_shot_dataset_cls_str,
+                **few_shot_data_module_init_kwargs,
+            )
+
+            # no train_dataloader on few shot data module
+            with self.assertRaises(NotImplementedError):
+                train_dl = few_shot_data_module.train_dataloader()
+            val_dl = few_shot_data_module.val_dataloader()
+            self.assertTrue(isinstance(val_dl, DataLoader))
+
+    def test_finetune_data_modules(self):
+        # instantiate each few shot dataset as part of a data module
+        for finetune_dataset_cls, finetune_dataset_init_kwargs in DATASETS:
+            print(
+                f"\n======== testing finetune DataModule with {finetune_dataset_cls.__name__} ========"
+            )
+            finetune_dataset_cls_str = finetune_dataset_cls.__name__
+            finetune_data_module_init_kwargs = create_data_module_init_kwargs(
+                finetune_dataset_init_kwargs, False
+            )
+
+            finetune_data_module = NL2CodeDataModule(
+                dataset_cls=finetune_dataset_cls_str,
+                **finetune_data_module_init_kwargs,
+            )
+
+            train_dl = finetune_data_module.train_dataloader()
+            self.assertTrue(isinstance(train_dl, DataLoader))
+            val_dl = finetune_data_module.val_dataloader()
+            self.assertTrue(isinstance(val_dl, DataLoader))
diff --git a/tests/test_executors.py b/tests/test_executors.py
index cbe68366..a3f78bf8 100644
--- a/tests/test_executors.py
+++ b/tests/test_executors.py
@@ -6,9 +6,31 @@
 sys.path.append(ROOT_DIR)

 from execution.executors import MathExecutor
+from tests.consts import TEST_EXECUTORS


 class TestExecutors(unittest.TestCase):
+    def test_executors(self):
+        for executor_cls, test_program, test_example in TEST_EXECUTORS:
+            print(f"\n======== testing {executor_cls.__name__} ========")
+            executor = executor_cls()
+
+            print(test_program)
+            print(test_example)
+
+            try:
+                exec_match, exec_results = executor.exec_program(
+                    test_program, test_example
+                )
+                self.assertIsInstance(exec_match, int)
+                print(exec_results)
+            # TODO: use real DB connections
+            except:
+                self.assertIsInstance(exec_match, int)
+                print(exec_results)
+
+    # custom tests for specific executors
+
     def test_math_executor(self):
         executor = MathExecutor()

diff --git a/tests/test_inference_pipeline.py b/tests/test_inference_pipeline.py
index 247c4a8b..102f2e55 100644
--- a/tests/test_inference_pipeline.py
+++ b/tests/test_inference_pipeline.py
@@ -2,27 +2,31 @@
 import unittest

 # get the data directory from the environment variable
-DATA_DIR = os.environ.get('DATA_DIR')
+DATA_DIR = os.environ.get("DATA_DIR")
+
+from tests.consts import TEST_PIPELINE_INFO
+

 class TestDecOnlyModelInference(unittest.TestCase):
     def test_basic(self):
-        exit_code = os.system("export PYTHONPATH=`pwd`; echo $PYTHONPATH; echo $NLP4CODE_TEST_DATA_PATH; " + \
-            "python finetuning/trainer.py validate " + \
-            "--config finetuning/training_configs/few_shot/gsmath.yaml " + \
-            # still using CPU for now
-            "--trainer.gpus 0 " + \
-            "--trainer.accelerator cpu " + \
-            "--trainer.precision 32 " + \
-            "--model.init_args.print_eval_every_n_batches 1 " + \
-            "--model.init_args.transformer_model_name EleutherAI/gpt-neo-125M " + \
-            "--data.init_args.transformer_model_name EleutherAI/gpt-neo-125M " + \
-            "--data.init_args.val_max_instances 2 " + \
-            # "--data.init_args.val_batch_size 1 ")
-            "--data.init_args.val_batch_size 1 " + \
-            "--data.init_args.val_file_path $NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl "
-        )
-
-        self.assertEqual(exit_code, 0)
+        for model_name, yaml_config_path, val_file_path in TEST_PIPELINE_INFO:
+            exit_code = os.system(
+                "export PYTHONPATH=`pwd`; echo $PYTHONPATH; echo $NLP4CODE_TEST_DATA_PATH; "
+                + "python finetuning/trainer.py validate "
+                + f"--config {yaml_config_path} "
+                + "--trainer.gpus 0 "  # still using CPU for now
+                + "--trainer.accelerator cpu "
+                + "--trainer.precision 32 "
+                + "--model.init_args.print_eval_every_n_batches 1 "
+                + f"--model.init_args.transformer_model_name {model_name} "
+                + f"--data.init_args.transformer_model_name {model_name} "
+                + "--data.init_args.val_max_instances 2 "
+                + "--data.init_args.val_batch_size 1 "  # "--data.init_args.val_batch_size 1 ")
+                + f"--data.init_args.val_file_path {val_file_path} "
+            )
+
+            self.assertEqual(exit_code, 0)
+

-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_models.py b/tests/test_models.py
index ca4f567f..2dd99d4c 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -2,6 +2,8 @@

 from os import path, sys

+from tests.consts import TEST_MODEL_EXECUTOR_CLS, TEST_MODEL_TRANSFORMER_MODEL_NAMES
+
 ROOT_DIR = path.dirname(path.dirname(path.abspath(__file__)))
 sys.path.append(ROOT_DIR)

@@ -11,27 +13,31 @@


 class TestModels(unittest.TestCase):
-    def test_gpt_neo(self):
-        model = Seq2SeqModel(
-            transformer_model_name="EleutherAI/gpt-neo-125M",
-            executor_cls="execution.executors.MathExecutor",
-        )
-
-        test_input_str = [
-            "# write a python program that adds two integers",
-            "# write a python program that adds two integers",
-        ]
-        context_tokenizer_outputs = model.tokenizer(test_input_str, return_tensors="pt")
-        input_ids = context_tokenizer_outputs["input_ids"]
-        attention_mask = context_tokenizer_outputs["attention_mask"]
-
-        generation_result = model.forward(
-            input_ids,
-            attention_mask=attention_mask,
-            metadata=[{"nl": test_input_str[0]}, {"nl": test_input_str[1]}],
-        )
-
-        self.assertEqual(len(generation_result), 2)
-        self.assertEqual(
-            all(["generated_program" in result for result in generation_result]), True
-        )
+    def test_models(self):
+        for model_name in TEST_MODEL_TRANSFORMER_MODEL_NAMES:
+            model = Seq2SeqModel(
+                transformer_model_name=model_name,
+                executor_cls=TEST_MODEL_EXECUTOR_CLS,
+            )
+
+            test_input_str = [
+                "# write a python program that adds two integers",
+                "# write a python program that adds two integers",
+            ]
+            context_tokenizer_outputs = model.tokenizer(
+                test_input_str, return_tensors="pt"
+            )
+            input_ids = context_tokenizer_outputs["input_ids"]
+            attention_mask = context_tokenizer_outputs["attention_mask"]
+
+            generation_result = model.forward(
+                input_ids,
+                attention_mask=attention_mask,
+                metadata=[{"nl": test_input_str[0]}, {"nl": test_input_str[1]}],
+            )
+
+            self.assertEqual(len(generation_result), 2)
+            self.assertEqual(
+                all(["generated_program" in result for result in generation_result]),
+                True,
+            )
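
Note on the seq2seq_model_util.py hunk above: loading the CodeGen checkpoints in torch.float32 and reusing EOS as the pad token is what keeps them runnable on the CPU-only CI runner. Below is a minimal standalone sketch of that loading pattern, not part of the patch; it assumes transformers and torch are installed locally, and reuses the small checkpoint and test prompt already listed in tests/consts.py and tests/test_models.py.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Salesforce/codegen-350M-multi"

# CodeGen tokenizers ship without a pad token, so reuse EOS when padding batches.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# float32 avoids the CPU failure mode referenced in the TODO above
# ("LayerNormKernelImpl" not implemented for 'Half' when the model is loaded in float16).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype=torch.float32,
    use_cache=True,
)

batch = tokenizer(
    ["# write a python program that adds two integers"],
    return_tensors="pt",
    padding=True,
)
with torch.no_grad():
    generated = model.generate(**batch, max_new_tokens=16)
print(tokenizer.decode(generated[0], skip_special_tokens=True))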