From 065dc981d66fbfe8cebbe3b91317734c7d1dbf6e Mon Sep 17 00:00:00 2001 From: Joel Ye Date: Wed, 29 May 2024 12:42:18 -0400 Subject: [PATCH 1/3] add verbose mode --- decoder_demos/ndt2_sample.Dockerfile | 21 ++++++++++++++------- decoder_demos/ndt2_sample.py | 2 +- falcon_challenge/evaluator.py | 20 ++++++++++++++++---- setup.py | 2 +- test_docker_local.sh | 1 + 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/decoder_demos/ndt2_sample.Dockerfile b/decoder_demos/ndt2_sample.Dockerfile index 3a3387f..45a7411 100644 --- a/decoder_demos/ndt2_sample.Dockerfile +++ b/decoder_demos/ndt2_sample.Dockerfile @@ -26,21 +26,28 @@ ENV EVALUATION_LOC remote # Note that Docker cannot easily import across symlinks, make sure data is not symlinked # H1 -# ADD ./local_data/ndt2_h1_sample_nokey.pth data/decoder.pth +# ADD ./local_data/ndt2_h1_sample.pth data/decoder.pth # ADD ./local_data/ndt2_zscore_h1.pt data/zscore.pt +# ENV SPLIT "h1" +# ENV CONFIG_STEM falcon/h1/h1_100 # M1 +ADD ./local_data/ndt2_m1_sample_continual.pth data/decoder.pth +ADD ./local_data/ndt2_zscore_m1.pt data/zscore.pt +ENV SPLIT "m1" +ENV CONFIG_STEM falcon/m1/m1_100 -# M2 -ADD ./local_data/ndt2_m2_sample_continual.pth data/decoder.pth -ADD ./local_data/ndt2_zscore_m2.pt data/zscore.pt +# # M2 +# ADD ./local_data/ndt2_m2_sample_continual.pth data/decoder.pth +# ADD ./local_data/ndt2_zscore_m2.pt data/zscore.pt +# ENV SPLIT "m2" +# ENV CONFIG_STEM falcon/m2/m2_100 # Add runfile RUN pwd ADD ./decoder_demos/ndt2_sample.py decode.py -ADD ./decoder_demos/ndt2_decoder.py ndt2_decoder.py -ENV SPLIT "h1" +ENV BATCH_SIZE 16 ENV PHASE "test" # Make sure this matches the mounted data volume path. Generally leave as is. @@ -50,4 +57,4 @@ ENV EVAL_DATA_PATH "/dataset/evaluation_data" # CMD specifies a default command to run when the container is launched. # It can be overridden with any cmd e.g. sudo docker run -it my_image /bin/bash CMD ["/bin/bash", "-c", \ - "python decode.py --evaluation $EVALUATION_LOC --model-path data/decoder.pth --zscore-path data/zscore.pt --split $SPLIT --phase $PHASE"] \ No newline at end of file + "python decode.py --evaluation $EVALUATION_LOC --model-path data/decoder.pth --config-stem $CONFIG_STEM --zscore-path data/zscore.pt --split $SPLIT --batch-size $BATCH_SIZE --phase $PHASE"] \ No newline at end of file diff --git a/decoder_demos/ndt2_sample.py b/decoder_demos/ndt2_sample.py index 456833e..e85b59f 100644 --- a/decoder_demos/ndt2_sample.py +++ b/decoder_demos/ndt2_sample.py @@ -53,7 +53,7 @@ def main(): task = getattr(FalconTask, args.split) config = FalconConfig(task=task) max_bins = 50 if task in [FalconTask.m1, FalconTask.m2] else 200 # h1 - + decoder = NDT2Decoder( task_config=config, model_ckpt_path=args.model_path, diff --git a/falcon_challenge/evaluator.py b/falcon_challenge/evaluator.py index 9e45a02..d08a76f 100644 --- a/falcon_challenge/evaluator.py +++ b/falcon_challenge/evaluator.py @@ -235,6 +235,7 @@ def evaluate( mask_dict['held_out'].append(dataset_mask) else: raise ValueError(f"Dataset {dataset} submitted but not found in held-in or held-out list of split {datasplit}.") + for in_or_out in pred_dict: if len(pred_dict[in_or_out]) < len(DATASET_HELDINOUT_MAP[datasplit][in_or_out]): raise ValueError(f"Missing predictions for {datasplit} {in_or_out}. User submitted: {user_submission[datasplit].keys()}. 
Expecting more like: {HELDIN_OR_OUT_MAP[datasplit][in_or_out]}.") @@ -312,13 +313,17 @@ def simple_collater(batch, task): class FalconEvaluator: - def __init__(self, eval_remote=False, split='h1'): + def __init__(self, eval_remote=False, split='h1', verbose=False): + r""" + verbose: Print out dataset specific metrics for movement tasks. + """ self.eval_remote = eval_remote assert split in ['h1', 'h2', 'm1', 'm2'], "Split must be h1, h2, m1, or m2." if split in ['h1', 'm1', 'm2']: self.continual = True else: self.continual = False + self.verbose = verbose self.dataset: FalconTask = getattr(FalconTask, split) self.cfg = FalconConfig(self.dataset) @@ -554,9 +559,9 @@ def evaluate( else: for k, v in metrics.items(): logger.info("{}: {}".format(k, v)) - + @staticmethod - def compute_metrics_regression(preds, targets, eval_mask, dset_lens): + def compute_metrics_regression(preds, targets, eval_mask, dset_lens, verbose=False): # Verbose drop-in dset_lens = np.cumsum([sum(dset_lens[key]) for key in sorted(dset_lens.keys())]) masked_points = np.cumsum(~eval_mask) dset_lens = [0] + [dset_len - masked_points[dset_len - 1] for dset_len in dset_lens] @@ -566,11 +571,18 @@ def compute_metrics_regression(preds, targets, eval_mask, dset_lens): raise ValueError(f"Targets and predictions have different lengths: {targets.shape[0]} vs {preds.shape[0]}.") r2_scores = [r2_score(targets[dset_lens[i]:dset_lens[i+1]], preds[dset_lens[i]:dset_lens[i+1]], multioutput='variance_weighted') for i in range(len(dset_lens) - 1)] + if verbose: + dsets = sorted(dset_lens.keys()) + print([f'{k}: {r2}' for k, r2 in zip(dsets, r2_scores)]) + preds_dict = {k: preds[dset_lens[i]:dset_lens[i+1]] for i, k in enumerate(dsets)} + with open('preds.pkl', 'wb') as f: + pickle.dump(preds_dict, f) return { "R2 Mean": np.mean(r2_scores), "R2 Std.": np.std(r2_scores) } + @staticmethod def compute_metrics_edit_distance(preds, targets, eval_mask): if len(preds) != len(targets): @@ -609,7 +621,7 @@ def compute_metrics(self, all_preds, all_targets, all_eval_mask=None): all_eval_mask: array of shape (n_timesteps, k_dim). True if we should evaluate this timestep. 
""" if self.dataset in [FalconTask.h1, FalconTask.m1, FalconTask.m2]: - metrics = self.compute_metrics_regression(all_preds, all_targets, all_eval_mask) + metrics = self.compute_metrics_regression(all_preds, all_targets, all_eval_mask, verbose=self.verbose) elif self.dataset in [FalconTask.h2]: metrics = self.compute_metrics_edit_distance(all_preds, all_targets, all_eval_mask) else: diff --git a/setup.py b/setup.py index cb972d9..5e713cb 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='falcon_challenge', - version='0.3.9', + version='0.3.10', url='https://github.com/snel-repo/stability-benchmark', author='Joel Ye', diff --git a/test_docker_local.sh b/test_docker_local.sh index f43a6df..31130a2 100755 --- a/test_docker_local.sh +++ b/test_docker_local.sh @@ -22,4 +22,5 @@ done docker run \ -v $(pwd)/data:/dataset/evaluation_data \ -e "EVALUATION_LOC=local" \ + --gpus all \ ${DOCKER_NAME}\ \ No newline at end of file From 9c17316f05836689244a7329dc7e06f74d75b79d Mon Sep 17 00:00:00 2001 From: Joel Ye Date: Wed, 29 May 2024 14:02:05 -0400 Subject: [PATCH 2/3] update verbose path --- decoder_demos/ndt2_sample.py | 1 + falcon_challenge/evaluator.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/decoder_demos/ndt2_sample.py b/decoder_demos/ndt2_sample.py index e85b59f..81e204b 100644 --- a/decoder_demos/ndt2_sample.py +++ b/decoder_demos/ndt2_sample.py @@ -47,6 +47,7 @@ def main(): evaluator = FalconEvaluator( eval_remote=args.evaluation == "remote", split=args.split, + verbose=True # continual=args.continual ) diff --git a/falcon_challenge/evaluator.py b/falcon_challenge/evaluator.py index d08a76f..2b20577 100644 --- a/falcon_challenge/evaluator.py +++ b/falcon_challenge/evaluator.py @@ -147,6 +147,7 @@ def evaluate( test_annotation_file: str, # The annotation file for the phase user_submission_file: str, # * JY: This appears to always be /submission/submission.csv on EvalAI. No matter - load it as a pickle. phase_codename: str, # e.g. minival or test + verbose: bool = False, **kwargs ): r""" @@ -247,7 +248,7 @@ def evaluate( dset_lens = dset_len_dict[in_or_out] mask = np.concatenate(mask_dict[in_or_out]) try: - metrics = FalconEvaluator.compute_metrics_edit_distance(pred, tgt, mask) if 'h2' in datasplit else FalconEvaluator.compute_metrics_regression(pred, tgt, mask, dset_lens) + metrics = FalconEvaluator.compute_metrics_edit_distance(pred, tgt, mask) if 'h2' in datasplit else FalconEvaluator.compute_metrics_regression(pred, tgt, mask, dset_lens, verbose=verbose) except Exception as e: raise ValueError(f"Failed to compute metrics for {datasplit} {in_or_out}: {e}. 
Lengths submitted: {[len(piece) for piece in pred_dict[in_or_out]]}") for k in metrics: @@ -554,7 +555,8 @@ def evaluate( return evaluate( test_annotation_file=gt_path, user_submission_file=prediction_path, - phase_codename=phase + phase_codename=phase, + verbose=self.verbose ) else: for k, v in metrics.items(): @@ -571,16 +573,19 @@ def compute_metrics_regression(preds, targets, eval_mask, dset_lens, verbose=Fal raise ValueError(f"Targets and predictions have different lengths: {targets.shape[0]} vs {preds.shape[0]}.") r2_scores = [r2_score(targets[dset_lens[i]:dset_lens[i+1]], preds[dset_lens[i]:dset_lens[i+1]], multioutput='variance_weighted') for i in range(len(dset_lens) - 1)] + base_metrics = { + "R2 Mean": np.mean(r2_scores), + "R2 Std.": np.std(r2_scores) + } if verbose: dsets = sorted(dset_lens.keys()) - print([f'{k}: {r2}' for k, r2 in zip(dsets, r2_scores)]) + for k, r2 in zip(dsets, r2_scores): + print(f"{k}: {r2}") + base_metrics[f"{k} R2"] = r2 preds_dict = {k: preds[dset_lens[i]:dset_lens[i+1]] for i, k in enumerate(dsets)} with open('preds.pkl', 'wb') as f: pickle.dump(preds_dict, f) - return { - "R2 Mean": np.mean(r2_scores), - "R2 Std.": np.std(r2_scores) - } + return base_metrics @staticmethod From 66255b1ecfff03496fcc81ee42b7b9c9e711d45a Mon Sep 17 00:00:00 2001 From: Joel Ye Date: Wed, 29 May 2024 15:09:53 -0400 Subject: [PATCH 3/3] fixup verbose change --- falcon_challenge/evaluator.py | 13 +++++++------ setup.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/falcon_challenge/evaluator.py b/falcon_challenge/evaluator.py index 2b20577..5f08255 100644 --- a/falcon_challenge/evaluator.py +++ b/falcon_challenge/evaluator.py @@ -247,6 +247,7 @@ def evaluate( tgt = np.concatenate(tgt_dict[in_or_out]) dset_lens = dset_len_dict[in_or_out] mask = np.concatenate(mask_dict[in_or_out]) + try: metrics = FalconEvaluator.compute_metrics_edit_distance(pred, tgt, mask) if 'h2' in datasplit else FalconEvaluator.compute_metrics_regression(pred, tgt, mask, dset_lens, verbose=verbose) except Exception as e: @@ -564,25 +565,25 @@ def evaluate( @staticmethod def compute_metrics_regression(preds, targets, eval_mask, dset_lens, verbose=False): # Verbose drop-in - dset_lens = np.cumsum([sum(dset_lens[key]) for key in sorted(dset_lens.keys())]) + dsets = sorted(dset_lens.keys()) + dset_bounds = np.cumsum([sum(dset_lens[key]) for key in dsets]) masked_points = np.cumsum(~eval_mask) - dset_lens = [0] + [dset_len - masked_points[dset_len - 1] for dset_len in dset_lens] + dset_bounds = [0] + [dset_len - masked_points[dset_len - 1] for dset_len in dset_bounds] # assumes targets are already masked preds = preds[eval_mask] if not targets.shape[0] == preds.shape[0]: raise ValueError(f"Targets and predictions have different lengths: {targets.shape[0]} vs {preds.shape[0]}.") - r2_scores = [r2_score(targets[dset_lens[i]:dset_lens[i+1]], preds[dset_lens[i]:dset_lens[i+1]], - multioutput='variance_weighted') for i in range(len(dset_lens) - 1)] + r2_scores = [r2_score(targets[dset_bounds[i]:dset_bounds[i+1]], preds[dset_bounds[i]:dset_bounds[i+1]], + multioutput='variance_weighted') for i in range(len(dset_bounds) - 1)] base_metrics = { "R2 Mean": np.mean(r2_scores), "R2 Std.": np.std(r2_scores) } if verbose: - dsets = sorted(dset_lens.keys()) for k, r2 in zip(dsets, r2_scores): print(f"{k}: {r2}") base_metrics[f"{k} R2"] = r2 - preds_dict = {k: preds[dset_lens[i]:dset_lens[i+1]] for i, k in enumerate(dsets)} + preds_dict = {k: preds[dset_bounds[i]:dset_bounds[i+1]] for i, k in 
enumerate(dsets)} with open('preds.pkl', 'wb') as f: pickle.dump(preds_dict, f) return base_metrics diff --git a/setup.py b/setup.py index 5e713cb..15dfb0a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='falcon_challenge', - version='0.3.10', + version='0.3.11', url='https://github.com/snel-repo/stability-benchmark', author='Joel Ye',
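
A minimal usage sketch of the verbose mode these patches add, written against the final state of compute_metrics_regression (a @staticmethod on FalconEvaluator). The dataset keys, segment lengths, and random data below are made up for illustration; in a real run dset_lens comes from the submission pipeline and the split's own dataset keys, and the same flag is threaded from FalconEvaluator(eval_remote=..., split=..., verbose=True) through evaluate(). With verbose=True the function prints a per-dataset R2, adds one "<dataset> R2" entry per key to the returned metrics, and pickles the per-dataset predictions to preds.pkl in the working directory.

import numpy as np
from falcon_challenge.evaluator import FalconEvaluator

rng = np.random.default_rng(0)

# Hypothetical per-dataset segment lengths (keys and lengths are illustrative only).
dset_lens = {'dset_a': [40, 20], 'dset_b': [30]}
n_timesteps = sum(sum(v) for v in dset_lens.values())  # 90 bins total

# Synthetic 2-D behavioral targets and slightly noisy predictions, concatenated
# in sorted-key order, which is how the function slices them back out per dataset.
targets = rng.normal(size=(n_timesteps, 2))
preds = targets + 0.1 * rng.normal(size=(n_timesteps, 2))

# Per-timestep boolean mask kept all-True here for simplicity; targets are assumed pre-masked.
eval_mask = np.ones(n_timesteps, dtype=bool)

metrics = FalconEvaluator.compute_metrics_regression(
    preds, targets, eval_mask, dset_lens, verbose=True
)
# verbose=True prints "dset_a: <r2>" / "dset_b: <r2>" and writes preds.pkl to the current directory.
print(metrics)  # "R2 Mean", "R2 Std.", plus one "<dataset> R2" entry per key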