Merge branch 'main' of github.com:snel-repo/falcon-challenge into main
claytonwashington committed May 29, 2024
2 parents eb7feb5 + 66255b1 commit 8911fc9
Showing 5 changed files with 47 additions and 20 deletions.
decoder_demos/ndt2_sample.Dockerfile (21 changes: 14 additions & 7 deletions)
@@ -26,21 +26,28 @@ ENV EVALUATION_LOC remote
# Note that Docker cannot easily import across symlinks, make sure data is not symlinked

# H1
# ADD ./local_data/ndt2_h1_sample_nokey.pth data/decoder.pth
# ADD ./local_data/ndt2_h1_sample.pth data/decoder.pth
# ADD ./local_data/ndt2_zscore_h1.pt data/zscore.pt
# ENV SPLIT "h1"
# ENV CONFIG_STEM falcon/h1/h1_100

# M1
ADD ./local_data/ndt2_m1_sample_continual.pth data/decoder.pth
ADD ./local_data/ndt2_zscore_m1.pt data/zscore.pt
ENV SPLIT "m1"
ENV CONFIG_STEM falcon/m1/m1_100

# M2
ADD ./local_data/ndt2_m2_sample_continual.pth data/decoder.pth
ADD ./local_data/ndt2_zscore_m2.pt data/zscore.pt
# # M2
# ADD ./local_data/ndt2_m2_sample_continual.pth data/decoder.pth
# ADD ./local_data/ndt2_zscore_m2.pt data/zscore.pt
# ENV SPLIT "m2"
# ENV CONFIG_STEM falcon/m2/m2_100

# Add runfile
RUN pwd
ADD ./decoder_demos/ndt2_sample.py decode.py
ADD ./decoder_demos/ndt2_decoder.py ndt2_decoder.py

ENV SPLIT "h1"
ENV BATCH_SIZE 16
ENV PHASE "test"

# Make sure this matches the mounted data volume path. Generally leave as is.
@@ -50,4 +57,4 @@ ENV EVAL_DATA_PATH "/dataset/evaluation_data"
# CMD specifies a default command to run when the container is launched.
# It can be overridden with any cmd e.g. sudo docker run -it my_image /bin/bash
CMD ["/bin/bash", "-c", \
"python decode.py --evaluation $EVALUATION_LOC --model-path data/decoder.pth --zscore-path data/zscore.pt --split $SPLIT --phase $PHASE"]
"python decode.py --evaluation $EVALUATION_LOC --model-path data/decoder.pth --config-stem $CONFIG_STEM --zscore-path data/zscore.pt --split $SPLIT --batch-size $BATCH_SIZE --phase $PHASE"]
decoder_demos/ndt2_sample.py (3 changes: 2 additions & 1 deletion)
@@ -47,13 +47,14 @@ def main():
evaluator = FalconEvaluator(
eval_remote=args.evaluation == "remote",
split=args.split,
verbose=True
# continual=args.continual
)

task = getattr(FalconTask, args.split)
config = FalconConfig(task=task)
max_bins = 50 if task in [FalconTask.m1, FalconTask.m2] else 200 # h1

decoder = NDT2Decoder(
task_config=config,
model_ckpt_path=args.model_path,
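For context on the change in this file (a sketch, not part of the commit): the snippet below shows how the pieces wire together, assuming FalconTask and FalconConfig import from falcon_challenge.config as in the demo scripts. With verbose=True, the evaluator now reports per-dataset R2 in addition to the mean and standard deviation.

# Sketch only: the split value and import paths are illustrative assumptions.
from falcon_challenge.config import FalconConfig, FalconTask
from falcon_challenge.evaluator import FalconEvaluator

split = "m1"
task = getattr(FalconTask, split)
config = FalconConfig(task=task)
max_bins = 50 if task in [FalconTask.m1, FalconTask.m2] else 200  # h1 uses a longer history window

evaluator = FalconEvaluator(
    eval_remote=False,
    split=split,
    verbose=True,  # new flag: per-dataset R2 is printed and added to the returned metrics
)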
falcon_challenge/evaluator.py (40 changes: 29 additions & 11 deletions)
@@ -147,6 +147,7 @@ def evaluate(
test_annotation_file: str, # The annotation file for the phase
user_submission_file: str, # * JY: This appears to always be /submission/submission.csv on EvalAI. No matter - load it as a pickle.
phase_codename: str, # e.g. minival or test
verbose: bool = False,
**kwargs
):
r"""
@@ -235,6 +236,7 @@
mask_dict['held_out'].append(dataset_mask)
else:
raise ValueError(f"Dataset {dataset} submitted but not found in held-in or held-out list of split {datasplit}.")

for in_or_out in pred_dict:
if len(pred_dict[in_or_out]) < len(DATASET_HELDINOUT_MAP[datasplit][in_or_out]):
raise ValueError(f"Missing predictions for {datasplit} {in_or_out}. User submitted: {user_submission[datasplit].keys()}. Expecting more like: {HELDIN_OR_OUT_MAP[datasplit][in_or_out]}.")
@@ -245,8 +247,9 @@
tgt = np.concatenate(tgt_dict[in_or_out])
dset_lens = dset_len_dict[in_or_out]
mask = np.concatenate(mask_dict[in_or_out])

try:
metrics = FalconEvaluator.compute_metrics_edit_distance(pred, tgt, mask) if 'h2' in datasplit else FalconEvaluator.compute_metrics_regression(pred, tgt, mask, dset_lens)
metrics = FalconEvaluator.compute_metrics_edit_distance(pred, tgt, mask) if 'h2' in datasplit else FalconEvaluator.compute_metrics_regression(pred, tgt, mask, dset_lens, verbose=verbose)
except Exception as e:
raise ValueError(f"Failed to compute metrics for {datasplit} {in_or_out}: {e}. Lengths submitted: {[len(piece) for piece in pred_dict[in_or_out]]}")
for k in metrics:
@@ -312,13 +315,17 @@ def simple_collater(batch, task):

class FalconEvaluator:

def __init__(self, eval_remote=False, split='h1'):
def __init__(self, eval_remote=False, split='h1', verbose=False):
r"""
verbose: Print out dataset specific metrics for movement tasks.
"""
self.eval_remote = eval_remote
assert split in ['h1', 'h2', 'm1', 'm2'], "Split must be h1, h2, m1, or m2."
if split in ['h1', 'm1', 'm2']:
self.continual = True
else:
self.continual = False
self.verbose = verbose
self.dataset: FalconTask = getattr(FalconTask, split)
self.cfg = FalconConfig(self.dataset)

@@ -549,27 +556,38 @@ def evaluate(
return evaluate(
test_annotation_file=gt_path,
user_submission_file=prediction_path,
phase_codename=phase
phase_codename=phase,
verbose=self.verbose
)
else:
for k, v in metrics.items():
logger.info("{}: {}".format(k, v))

@staticmethod
def compute_metrics_regression(preds, targets, eval_mask, dset_lens):
dset_lens = np.cumsum([sum(dset_lens[key]) for key in sorted(dset_lens.keys())])
def compute_metrics_regression(preds, targets, eval_mask, dset_lens, verbose=False): # Verbose drop-in
dsets = sorted(dset_lens.keys())
dset_bounds = np.cumsum([sum(dset_lens[key]) for key in dsets])
masked_points = np.cumsum(~eval_mask)
dset_lens = [0] + [dset_len - masked_points[dset_len - 1] for dset_len in dset_lens]
dset_bounds = [0] + [dset_len - masked_points[dset_len - 1] for dset_len in dset_bounds]
# assumes targets are already masked
preds = preds[eval_mask]
if not targets.shape[0] == preds.shape[0]:
raise ValueError(f"Targets and predictions have different lengths: {targets.shape[0]} vs {preds.shape[0]}.")
r2_scores = [r2_score(targets[dset_lens[i]:dset_lens[i+1]], preds[dset_lens[i]:dset_lens[i+1]],
multioutput='variance_weighted') for i in range(len(dset_lens) - 1)]
return {
r2_scores = [r2_score(targets[dset_bounds[i]:dset_bounds[i+1]], preds[dset_bounds[i]:dset_bounds[i+1]],
multioutput='variance_weighted') for i in range(len(dset_bounds) - 1)]
base_metrics = {
"R2 Mean": np.mean(r2_scores),
"R2 Std.": np.std(r2_scores)
}
if verbose:
for k, r2 in zip(dsets, r2_scores):
print(f"{k}: {r2}")
base_metrics[f"{k} R2"] = r2
preds_dict = {k: preds[dset_bounds[i]:dset_bounds[i+1]] for i, k in enumerate(dsets)}
with open('preds.pkl', 'wb') as f:
pickle.dump(preds_dict, f)
return base_metrics


@staticmethod
def compute_metrics_edit_distance(preds, targets, eval_mask):
@@ -609,7 +627,7 @@ def compute_metrics(self, all_preds, all_targets, all_eval_mask=None):
all_eval_mask: array of shape (n_timesteps, k_dim). True if we should evaluate this timestep.
"""
if self.dataset in [FalconTask.h1, FalconTask.m1, FalconTask.m2]:
metrics = self.compute_metrics_regression(all_preds, all_targets, all_eval_mask)
metrics = self.compute_metrics_regression(all_preds, all_targets, all_eval_mask, verbose=self.verbose)
elif self.dataset in [FalconTask.h2]:
metrics = self.compute_metrics_edit_distance(all_preds, all_targets, all_eval_mask)
else:
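To illustrate the boundary bookkeeping in the updated compute_metrics_regression (a self-contained sketch; dataset names, lengths, and data are made up): cumulative per-dataset lengths are shifted by the number of masked-out timesteps preceding each boundary, so the concatenated, masked prediction and target arrays can be sliced back into per-dataset segments and scored with variance-weighted R2.

# Standalone sketch of the per-dataset R2 slicing; all data here is synthetic.
import numpy as np
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
dset_lens = {"dset_a": [40, 60], "dset_b": [80]}          # per-file lengths for two hypothetical datasets
total = sum(sum(v) for v in dset_lens.values())           # 180 timesteps after concatenation (sorted key order)
targets_full = rng.normal(size=(total, 2))                # (n_timesteps, k_dim) kinematics
preds_full = targets_full + 0.1 * rng.normal(size=targets_full.shape)
eval_mask = rng.random(total) > 0.1                       # True where a timestep counts toward the score

dsets = sorted(dset_lens.keys())
bounds = np.cumsum([sum(dset_lens[k]) for k in dsets])    # raw boundaries in the unmasked concatenation
masked_points = np.cumsum(~eval_mask)                     # masked-out timesteps seen so far
bounds = [0] + [b - masked_points[b - 1] for b in bounds] # boundaries within the masked arrays

targets = targets_full[eval_mask]                         # the evaluator assumes targets arrive pre-masked
preds = preds_full[eval_mask]

r2_scores = [
    r2_score(targets[bounds[i]:bounds[i + 1]], preds[bounds[i]:bounds[i + 1]],
             multioutput="variance_weighted")
    for i in range(len(bounds) - 1)
]
print({k: r2 for k, r2 in zip(dsets, r2_scores)})         # per-dataset R2, as verbose mode now reports
print("R2 Mean:", np.mean(r2_scores), "R2 Std.:", np.std(r2_scores))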
setup.py (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,7 @@

setup(
name='falcon_challenge',
version='0.3.9',
version='0.3.11',

url='https://github.com/snel-repo/stability-benchmark',
author='Joel Ye',
test_docker_local.sh (1 change: 1 addition & 0 deletions)
@@ -22,4 +22,5 @@ done
docker run \
-v $(pwd)/data:/dataset/evaluation_data \
-e "EVALUATION_LOC=local" \
--gpus all \
${DOCKER_NAME}\
