diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e612d423..4bcc07cb 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -60,9 +60,9 @@ jobs:
     # runs-on: ubuntu-22.04

    # do not expose sensitive environment variables in this yaml
-    env:
+    # env:
       # TODO: dynamically choose GPUs if tests end up using them
-      CUDA_VISIBLE_DEVICES: 1
+      # CUDA_VISIBLE_DEVICES: 1

    steps:
      - uses: actions/checkout@v2
@@ -129,19 +129,19 @@ jobs:
         run: |
           . .venv/bin/activate
           ls -la
-          python -m unittest tests/test_datasets.py
+          python -m unittest -v tests/test_datasets.py

      - name: Test executors
        run: |
          . .venv/bin/activate
-          python -m unittest tests/test_executors.py
+          python -m unittest -v tests/test_executors.py

      - name: Test inference pipeline
        run: |
          . .venv/bin/activate
-          python -m unittest tests/test_inference_pipeline.py
+          python -m unittest -v tests/test_inference_pipeline.py

      - name: Test models
        run: |
          . .venv/bin/activate
-          python -m unittest tests/test_models.py
\ No newline at end of file
+          python -m unittest -v tests/test_models.py
\ No newline at end of file
diff --git a/finetuning/lightning_modules/datasets/mathqa_reader.py b/finetuning/lightning_modules/datasets/mathqa_reader.py
index aca23f9a..92322026 100644
--- a/finetuning/lightning_modules/datasets/mathqa_reader.py
+++ b/finetuning/lightning_modules/datasets/mathqa_reader.py
@@ -13,7 +13,8 @@ def get_train_instance(self, example: Dict[str, Any]) -> List[Dict[str, Any]]:

    def get_test_instance(self, example: Dict[str, Any]) -> List[Dict[str, Any]]:
        # parse the answer and add the field
        example["original_answer"] = example["answer"]
-        example["answer"] = example["answer"].split("\n####")[-1].strip()
+        # TODO: in data/mathqa/val_dedup.jsonl, example["answer"] are floats
+        # example["answer"] = example["answer"].split("\n####")[-1].strip()

        return [self.get_example_dict(example, example["text"], "", train_mode=False)]
diff --git a/finetuning/lightning_modules/datasets/spider_reader.py b/finetuning/lightning_modules/datasets/spider_reader.py
index 658667cf..c085701e 100644
--- a/finetuning/lightning_modules/datasets/spider_reader.py
+++ b/finetuning/lightning_modules/datasets/spider_reader.py
@@ -9,6 +9,11 @@

 from finetuning.lightning_modules.datasets.base_reader import NL2CodeDataset, FewShotNL2CodeDataset

+# DB_INFO_FILE = os.path.join(os.path.dirname(__file__), '../../../data/squall/db_info_wtq.json')
+DB_INFO_FILE = os.path.join(os.path.dirname(__file__), f"{os.environ['NLP4CODE_TEST_DATA_PATH']}/squall/db_info_wtq.json")
+with open(DB_INFO_FILE, "r") as f:
+    full_db_info = json.load(f)
+

 class FewShotSpiderDataset(FewShotNL2CodeDataset):

diff --git a/finetuning/lightning_modules/models/seq2seq_model_util.py b/finetuning/lightning_modules/models/seq2seq_model_util.py
index 6bc49bfb..8d0b5e5e 100755
--- a/finetuning/lightning_modules/models/seq2seq_model_util.py
+++ b/finetuning/lightning_modules/models/seq2seq_model_util.py
@@ -83,14 +83,19 @@ def get_model(model_name: str,
                                              use_cache=not gradient_ckpt,
                                              **additional_init_args)

    elif model_name.startswith("Salesforce/codegen-"):
+        # TODO: using float32 here for tests
+        # RunTime error: "LayerNormKernelImpl" not implemented for 'Half' codegen
+        # https://github.com/huggingface/transformers/issues/21989
         tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  additional_special_tokens=additional_special_tokens)
+                                                  additional_special_tokens=additional_special_tokens,
+                                                  torch_dtype=torch.float32)
+        tokenizer.pad_token = tokenizer.eos_token
         if not tokenizer_only:
             model = AutoModelForCausalLM.from_pretrained(model_name,
                                                          pad_token_id=tokenizer.eos_token_id,
-                                                         torch_dtype=torch.float16,
+                                                         torch_dtype=torch.float32,
                                                          # device_map="auto",
                                                          use_cache=True)

    elif model_name.startswith("bigscience/bloom-"):
diff --git a/tests/consts.py b/tests/consts.py
new file mode 100644
index 00000000..93c20d4c
--- /dev/null
+++ b/tests/consts.py
@@ -0,0 +1,211 @@
+import os
+from typing import List, Dict, Tuple, Optional
+
+NLP4CODE_TEST_DATA_PATH = os.environ["NLP4CODE_TEST_DATA_PATH"]
+
+
+from finetuning.lightning_modules.datasets.base_reader import (
+    FewShotNL2CodeDataset,
+    NL2CodeDataset,
+)
+from finetuning.lightning_modules.datasets.mathqa_reader import (
+    FewShotMathQADataset,
+    MathQADataset,
+)
+from finetuning.lightning_modules.datasets.mbpp_reader import (
+    FewShotMBPPDataset,
+)
+from finetuning.lightning_modules.datasets.spider_reader import (
+    FewShotSpiderDataset,
+    SpiderDataset,
+)
+
+from execution.executors import (
+    BaseExecutor,
+    MathExecutor,
+    MBPPExecutor,
+    SpiderExecutor,
+    WTQExecutor,
+)
+
+
+# TODO: use special test string for test transformer model name? (don't load model)
+TEST_TRANSFORMER_MODEL_NAME = "EleutherAI/gpt-neo-125M"
+
+
+# ======== datasets ========
+
+# defines kwargs needed to initialize NL2CodeDataset
+class TestDatasetInitKwargs:
+    transformer_model_name: str
+    file_path: str
+    mode: str
+
+    def __init__(
+        self,
+        file_path: str,
+        mode: Optional[str] = "train",  # default to train
+        transformer_model_name: Optional[str] = TEST_TRANSFORMER_MODEL_NAME,
+    ):
+        self.file_path = file_path
+        self.mode = mode
+        self.transformer_model_name = transformer_model_name
+
+
+DATASETS: List[Tuple[NL2CodeDataset, TestDatasetInitKwargs]] = [
+    (
+        MathQADataset,
+        TestDatasetInitKwargs(
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/mathqa/train_dedup.jsonl",
+        ),
+    ),
+    # TODO: SpiderDataset prompt_function
+    # (
+    #     SpiderDataset,
+    #     TestDatasetInitKwargs(
+    #         file_path=f"{NLP4CODE_TEST_DATA_PATH}/spider/train_spider_processed_v2.jsonl",
+    #     ),
+    # ),
+]
+
+
+# defines kwargs needed to instantiate FewShotNL2CodeDataset
+class TestFewShotDatasetInitKwargs(TestDatasetInitKwargs):
+    transformer_model_name: str
+    file_path: str
+    exemplar_file_path: str
+    mode: str = "test"
+
+    def __init__(
+        self,
+        file_path: str,
+        exemplar_file_path: str,
+        transformer_model_name: Optional[str] = TEST_TRANSFORMER_MODEL_NAME,
+    ):
+        super().__init__(
+            file_path=file_path,
+            transformer_model_name=transformer_model_name,
+            mode="test",
+        )
+        self.exemplar_file_path = exemplar_file_path
+
+
+# TODO: better way to do this? (custom types for each kwargs?)
+# TODO: make sure to keep dataset files up to date here
+# list of (dataset, **init_kwargs) tuples
+FEW_SHOT_DATASETS: List[Tuple[FewShotNL2CodeDataset, TestFewShotDatasetInitKwargs]] = [
+    (
+        FewShotMathQADataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/mathqa-non_idiomatic_code-annotated-8_exemplars.jsonl",
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/mathqa/val_dedup_init_val.jsonl",
+        ),
+    ),
+    (
+        FewShotMBPPDataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/mbpp-official_first_3-10_exemplars.jsonl",
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/mbpp/mbpp_test.jsonl",
+        ),
+    ),
+    (
+        FewShotSpiderDataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/spider-8_exemplars.jsonl",
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/spider/dev_processed_db_path.jsonl",
+        ),
+    ),
+    (
+        FewShotSpiderDataset,
+        TestFewShotDatasetInitKwargs(
+            exemplar_file_path="prompt_files/wtq-8_exemplars.jsonl",
+            # TODO: why does wtq_restored_dev.jsonl error
+            file_path=f"{NLP4CODE_TEST_DATA_PATH}/squall/wtq_restored_test.jsonl",
+        ),
+    ),
+]
+
+
+# ======== models ========
+
+TEST_MODEL_TRANSFORMER_MODEL_NAMES: List[str] = [
+    "EleutherAI/gpt-neo-125M",
+    "Salesforce/codet5-small",
+    "Salesforce/codegen-350M-multi",
+]
+
+TEST_MODEL_EXECUTOR_CLS = "execution.executors.MathExecutor"
+
+
+# ======== executors ========
+
+TEST_PROGRAM = "answer = 5"
+
+# Tuple[ExecutorCls, program, example]
+TEST_EXECUTORS: List[Tuple[BaseExecutor, str, Dict]] = [
+    (
+        MathExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "answer": 5,
+        },
+    ),
+    (
+        MBPPExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "answer": 5,
+            "code": "return 5",
+            "task_id": "xyz",
+            "test_setup_code": 'print("setup")',
+            "test_list": ["assert 1+1 == 2", "assert 1+1 != 3"],
+        },
+    ),
+    (
+        SpiderExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "db_id": "my_db_id",
+            "query": "SELECT * FROM table",
+        },
+    ),
+    (
+        WTQExecutor,
+        TEST_PROGRAM,
+        {
+            "question": "some question",
+            "db_id": "my_db_id",
+            "db_path": "path/to/my/db",
+            "original_answer": 5,
+        },
+    ),
+]
+
+
+# ======== integration ========
+
+TEST_PIPELINE_YAML_CONFIG_FILE = "tests/test_configs/few_shot-pipeline.yaml"
+
+# TODO: more datasets (see SummerTime matrix)
+# each tuple contains model_name, Pytorch Lightning config YAML file, val_file_path
+TEST_PIPELINE_INFO: List[Tuple[str, str, str]] = [
+    (
+        "EleutherAI/gpt-neo-125M",
+        TEST_PIPELINE_YAML_CONFIG_FILE,
+        "$NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl",
+    ),
+    # TODO: tensor dimension mismatch error for codet5-small (probably config file problem)
+    # (
+    #     "Salesforce/codet5-small",
+    #     TEST_PIPELINE_YAML_CONFIG_FILE,
+    #     "$NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl",
+    # ),
+    (
+        "Salesforce/codegen-350M-multi",
+        TEST_PIPELINE_YAML_CONFIG_FILE,
+        "$NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl",
+    ),
+]
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 384b7c60..859ad623 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,14 +1,111 @@
 import unittest

 from os import path, sys
+from typing import List, Tuple, Dict

 ROOT_DIR = path.dirname(path.dirname(path.abspath(__file__)))
 sys.path.append(ROOT_DIR)

-from execution.executors import MathExecutor
+
+# from tests.consts import DATA_MODULES, DATASETS, FEW_SHOT_DATASETS
+from tests.consts import DATASETS, FEW_SHOT_DATASETS, TestDatasetInitKwargs
+
+from torch.utils.data import DataLoader
+
+from finetuning.lightning_modules.datasets.base_datamodule import (
+    NL2CodeDataModule,
+    FewShotNL2CodeDataModule,
+)
+
+
+# test cases to add:
+# - test base_reader classes are abstract
+# - test different modes (train, test, few_shot_test)


 class TestDatasets(unittest.TestCase):
-    def test_gsmath(self):
-        # TODO: this is dummy test
-        self.assertTrue(True)
+    # TODO: NotImplemented error testing
+    def test_few_shot_datasets(self):
+        for few_shot_dataset_cls, few_shot_dataset_init_kwargs in FEW_SHOT_DATASETS:
+            print(
+                f"\n======== testing few-shot dataset {few_shot_dataset_cls.__name__} ========"
+            )
+            few_shot_dataset = few_shot_dataset_cls(
+                **vars(few_shot_dataset_init_kwargs),
+            )
+
+    def test_finetune_datasets(self):
+        for finetune_dataset_cls, finetune_dataset_init_kwargs in DATASETS:
+            print(
+                f"\n======== testing finetune dataset {finetune_dataset_cls.__name__} ========"
+            )
+            finetune_dataset = finetune_dataset_cls(
+                **vars(finetune_dataset_init_kwargs)
+            )
+
+
+def create_data_module_init_kwargs(
+    dataset_init_kwargs: TestDatasetInitKwargs, is_few_shot: bool
+) -> Dict:
+    dataset_init_kwargs_dict = vars(dataset_init_kwargs)
+    data_module_init_kwargs_dict = dataset_init_kwargs_dict.copy()
+
+    data_module_init_kwargs_dict["batch_size"] = 1
+
+    data_module_init_kwargs_dict["val_file_path"] = data_module_init_kwargs_dict[
+        "file_path"
+    ]
+    data_module_init_kwargs_dict["val_batch_size"] = 1
+    if not is_few_shot:
+        data_module_init_kwargs_dict["train_file_path"] = data_module_init_kwargs_dict[
+            "file_path"
+        ]
+
+    del data_module_init_kwargs_dict["file_path"]
+    del data_module_init_kwargs_dict["mode"]
+    return data_module_init_kwargs_dict
+
+
+class TestDataModules(unittest.TestCase):
+    def test_few_shot_data_modules(self):
+        # instantiate each few shot dataset as part of a data module
+        for few_shot_dataset_cls, few_shot_dataset_init_kwargs in FEW_SHOT_DATASETS:
+            print(
+                f"\n======== testing few-shot DataModule with {few_shot_dataset_cls.__name__} ========"
+            )
+            few_shot_dataset_cls_str = few_shot_dataset_cls.__name__
+            few_shot_data_module_init_kwargs = create_data_module_init_kwargs(
+                few_shot_dataset_init_kwargs, True
+            )
+
+            few_shot_data_module = FewShotNL2CodeDataModule(
+                dataset_cls=few_shot_dataset_cls_str,
+                **few_shot_data_module_init_kwargs,
+            )
+
+            # no train_dataloader on few shot data module
+            with self.assertRaises(NotImplementedError):
+                train_dl = few_shot_data_module.train_dataloader()
+            val_dl = few_shot_data_module.val_dataloader()
+            self.assertTrue(isinstance(val_dl, DataLoader))
+
+    def test_finetune_data_modules(self):
+        # instantiate each few shot dataset as part of a data module
+        for finetune_dataset_cls, finetune_dataset_init_kwargs in DATASETS:
+            print(
+                f"\n======== testing finetune DataModule with {finetune_dataset_cls.__name__} ========"
+            )
+            finetune_dataset_cls_str = finetune_dataset_cls.__name__
+            finetune_data_module_init_kwargs = create_data_module_init_kwargs(
+                finetune_dataset_init_kwargs, False
+            )
+
+            finetune_data_module = NL2CodeDataModule(
+                dataset_cls=finetune_dataset_cls_str,
+                **finetune_data_module_init_kwargs,
+            )
+
+            train_dl = finetune_data_module.train_dataloader()
+            self.assertTrue(isinstance(train_dl, DataLoader))
+            val_dl = finetune_data_module.val_dataloader()
+            self.assertTrue(isinstance(val_dl, DataLoader))
diff --git a/tests/test_executors.py b/tests/test_executors.py
index cbe68366..a3f78bf8 100644
--- a/tests/test_executors.py
+++ b/tests/test_executors.py
@@ -6,9 +6,31 @@
 sys.path.append(ROOT_DIR)

 from execution.executors import MathExecutor
+from tests.consts import TEST_EXECUTORS


 class TestExecutors(unittest.TestCase):
+    def test_executors(self):
+        for executor_cls, test_program, test_example in TEST_EXECUTORS:
+            print(f"\n======== testing {executor_cls.__name__} ========")
+            executor = executor_cls()
+
+            print(test_program)
+            print(test_example)
+
+            try:
+                exec_match, exec_results = executor.exec_program(
+                    test_program, test_example
+                )
+                self.assertIsInstance(exec_match, int)
+                print(exec_results)
+            # TODO: use real DB connections
+            except:
+                self.assertIsInstance(exec_match, int)
+                print(exec_results)
+
+    # custom tests for specific executors
+
     def test_math_executor(self):
         executor = MathExecutor()

diff --git a/tests/test_inference_pipeline.py b/tests/test_inference_pipeline.py
index 247c4a8b..102f2e55 100644
--- a/tests/test_inference_pipeline.py
+++ b/tests/test_inference_pipeline.py
@@ -2,27 +2,31 @@
 import unittest

 # get the data directory from the environment variable
-DATA_DIR = os.environ.get('DATA_DIR')
+DATA_DIR = os.environ.get("DATA_DIR")
+
+from tests.consts import TEST_PIPELINE_INFO
+

 class TestDecOnlyModelInference(unittest.TestCase):
     def test_basic(self):
-        exit_code = os.system("export PYTHONPATH=`pwd`; echo $PYTHONPATH; echo $NLP4CODE_TEST_DATA_PATH; " + \
-            "python finetuning/trainer.py validate " + \
-            "--config finetuning/training_configs/few_shot/gsmath.yaml " + \
-            # still using CPU for now
-            "--trainer.gpus 0 " + \
-            "--trainer.accelerator cpu " + \
-            "--trainer.precision 32 " + \
-            "--model.init_args.print_eval_every_n_batches 1 " + \
-            "--model.init_args.transformer_model_name EleutherAI/gpt-neo-125M " + \
-            "--data.init_args.transformer_model_name EleutherAI/gpt-neo-125M " + \
-            "--data.init_args.val_max_instances 2 " + \
-            # "--data.init_args.val_batch_size 1 ")
-            "--data.init_args.val_batch_size 1 " + \
-            "--data.init_args.val_file_path $NLP4CODE_TEST_DATA_PATH/gsmath/split_dev.jsonl "
-        )
-
-        self.assertEqual(exit_code, 0)
+        for model_name, yaml_config_path, val_file_path in TEST_PIPELINE_INFO:
+            exit_code = os.system(
+                "export PYTHONPATH=`pwd`; echo $PYTHONPATH; echo $NLP4CODE_TEST_DATA_PATH; "
+                + "python finetuning/trainer.py validate "
+                + f"--config {yaml_config_path} "
+                + "--trainer.gpus 0 "  # still using CPU for now
+                + "--trainer.accelerator cpu "
+                + "--trainer.precision 32 "
+                + "--model.init_args.print_eval_every_n_batches 1 "
+                + f"--model.init_args.transformer_model_name {model_name} "
+                + f"--data.init_args.transformer_model_name {model_name} "
+                + "--data.init_args.val_max_instances 2 "
+                + "--data.init_args.val_batch_size 1 "  # "--data.init_args.val_batch_size 1 ")
+                + f"--data.init_args.val_file_path {val_file_path} "
+            )
+
+            self.assertEqual(exit_code, 0)
+

-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_models.py b/tests/test_models.py
index ca4f567f..2dd99d4c 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -2,6 +2,8 @@

 from os import path, sys

+from tests.consts import TEST_MODEL_EXECUTOR_CLS, TEST_MODEL_TRANSFORMER_MODEL_NAMES
+
 ROOT_DIR = path.dirname(path.dirname(path.abspath(__file__)))
 sys.path.append(ROOT_DIR)

@@ -11,27 +13,31 @@


 class TestModels(unittest.TestCase):
-    def test_gpt_neo(self):
-        model = Seq2SeqModel(
-            transformer_model_name="EleutherAI/gpt-neo-125M",
-            executor_cls="execution.executors.MathExecutor",
-        )
-
-        test_input_str = [
-            "# write a python program that adds two integers",
-            "# write a python program that adds two integers",
-        ]
-        context_tokenizer_outputs = model.tokenizer(test_input_str, return_tensors="pt")
-        input_ids = context_tokenizer_outputs["input_ids"]
-        attention_mask = context_tokenizer_outputs["attention_mask"]
-
-        generation_result = model.forward(
-            input_ids,
-            attention_mask=attention_mask,
-            metadata=[{"nl": test_input_str[0]}, {"nl": test_input_str[1]}],
-        )
-
-        self.assertEqual(len(generation_result), 2)
-        self.assertEqual(
-            all(["generated_program" in result for result in generation_result]), True
-        )
+    def test_models(self):
+        for model_name in TEST_MODEL_TRANSFORMER_MODEL_NAMES:
+            model = Seq2SeqModel(
+                transformer_model_name=model_name,
+                executor_cls=TEST_MODEL_EXECUTOR_CLS,
+            )
+
+            test_input_str = [
+                "# write a python program that adds two integers",
+                "# write a python program that adds two integers",
+            ]
+            context_tokenizer_outputs = model.tokenizer(
+                test_input_str, return_tensors="pt"
+            )
+            input_ids = context_tokenizer_outputs["input_ids"]
+            attention_mask = context_tokenizer_outputs["attention_mask"]
+
+            generation_result = model.forward(
+                input_ids,
+                attention_mask=attention_mask,
+                metadata=[{"nl": test_input_str[0]}, {"nl": test_input_str[1]}],
+            )
+
+            self.assertEqual(len(generation_result), 2)
+            self.assertEqual(
+                all(["generated_program" in result for result in generation_result]),
+                True,
+            )
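
Note on the seq2seq_model_util.py hunk above: loading the CodeGen checkpoints in torch.float32 and reusing EOS as the pad token is what keeps them runnable on the CPU-only CI runner. Below is a minimal standalone sketch of that loading pattern, not part of the patch; it assumes transformers and torch are installed locally, and reuses the small checkpoint and test prompt already listed in tests/consts.py and tests/test_models.py.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Salesforce/codegen-350M-multi"

# CodeGen tokenizers ship without a pad token, so reuse EOS when padding batches.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# float32 avoids the CPU failure mode referenced in the TODO above
# ("LayerNormKernelImpl" not implemented for 'Half' when the model is loaded in float16).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype=torch.float32,
    use_cache=True,
)

batch = tokenizer(
    ["# write a python program that adds two integers"],
    return_tensors="pt",
    padding=True,
)
with torch.no_grad():
    generated = model.generate(**batch, max_new_tokens=16)
print(tokenizer.decode(generated[0], skip_special_tokens=True))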