From aa9ad675c10b4f97b916c406d59be66db7d425c5 Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Tue, 28 Jul 2020 12:04:05 -0400
Subject: [PATCH 01/15] segyviewer fix in README; train.py single GPU
 multiprocess fix; new data BYOD script

---
 README.md                                     |   9 +-
 .../interpretation/dutchf3_patch/train.py     |   6 +-
 .../deepseismic_interpretation/data.py        |  41 ++++
 scripts/byod_competition.py                   | 190 ++++++++++++++++++
 4 files changed, 244 insertions(+), 2 deletions(-)
 create mode 100644 scripts/byod_competition.py

diff --git a/README.md b/README.md
index c28d22e8..1b82b9b1 100644
--- a/README.md
+++ b/README.md
@@ -228,10 +228,17 @@ To install [segyviewer](https://github.com/equinor/segyviewer) run:
 ```bash
 conda env create -n segyviewer python=2.7
 conda activate segyviewer
-conda install -c anaconda pyqt=4.11.4
+conda install -c conda-forge pyqt=4.11.4
 pip install segyviewer
 ```
 
+If you run into any PyQt4-related problems after the installation, try running:
+```bash
+sudo add-apt-repository ppa:rock-core/qt4
+sudo apt update
+sudo apt install libqt4-designer libqt4-opengl libqt4-svg libqtgui4 libqtwebkit4
+```
+
 To visualize cross-sections of a 3D volume, you can run [segyviewer](https://github.com/equinor/segyviewer) like so:
 
 ```bash
diff --git a/experiments/interpretation/dutchf3_patch/train.py b/experiments/interpretation/dutchf3_patch/train.py
index fc585817..e687290e 100644
--- a/experiments/interpretation/dutchf3_patch/train.py
+++ b/experiments/interpretation/dutchf3_patch/train.py
@@ -95,7 +95,11 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F
 
     if distributed:
         # FOR DISTRIBUTED: Set the device according to local_rank.
-        torch.cuda.set_device(local_rank)
+        # if we're running multi-GPU code on a single GPU (e.g., during development), always use device 0
+        if torch.cuda.device_count() == 1:
+            torch.cuda.set_device(0)
+        else:
+            torch.cuda.set_device(local_rank)
 
         # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will
         # provide environment variables, and requires that you use init_method=`env://`.
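The single-GPU guard above matters because `torch.distributed.launch` spawns one worker process per requested GPU and hands each one a distinct `--local_rank`, so multi-GPU code run on a one-GPU development box must clamp every rank to device 0. A minimal standalone sketch of the same device-selection logic (the `select_device` helper is illustrative and not part of the repo):

```python
import os

import torch


def select_device(local_rank: int) -> int:
    # torch.distributed.launch passes --local_rank=0..N-1, one value per process;
    # when only a single GPU is visible, every rank has to map to device 0
    if torch.cuda.device_count() == 1:
        return 0
    return local_rank


if __name__ == "__main__":
    # the launcher normally passes --local_rank as a CLI argument; with --use_env
    # it sets the LOCAL_RANK environment variable instead (assumed here)
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(select_device(local_rank))
    # MASTER_ADDR/MASTER_PORT come from the launcher, hence init_method="env://"
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
```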
diff --git a/interpretation/deepseismic_interpretation/data.py b/interpretation/deepseismic_interpretation/data.py
index 53ec7f9f..1ff9c100 100644
--- a/interpretation/deepseismic_interpretation/data.py
+++ b/interpretation/deepseismic_interpretation/data.py
@@ -5,6 +5,10 @@
 import math
 from collections import defaultdict
 
+import logging
+# toggle to WARNING when running in production, or use CLI
+logging.getLogger().setLevel(logging.DEBUG)
+
 import numpy as np
 import torch
 from PIL import Image
@@ -13,6 +17,8 @@
 from torch.utils.data import Dataset
 from torchvision.datasets.utils import iterable_to_str, verify_str_arg
 
+import segyio
+
 _open_to_array = compose(np.array, Image.open)
 
 
@@ -20,6 +26,41 @@ class DataNotSplitException(Exception):
     pass
 
 
+def read_segy(filename):
+    """
+    Read in a SEGY-format file given a filename
+
+    Args:
+        filename: input filename
+
+    Returns:
+        tuple of the data cube as a numpy array and a dict describing its geometry
+
+    """
+    logging.info(f"Loading data cube from {filename}")
+
+    # Read full data cube
+    data = segyio.tools.cube(filename)
+
+    # Read meta data; a context manager guarantees the file handle is closed
+    with segyio.open(filename, "r") as segyfile:
+        logging.info(f"  Crosslines: {segyfile.xlines[0]} : {segyfile.xlines[-1]}")
+        logging.info(f"  Inlines:    {segyfile.ilines[0]} : {segyfile.ilines[-1]}")
+        logging.info(f"  Timeslices: 1 : {data.shape[2]}")
+
+        # Make dict with cube-info
+        # TODO: read dt and the other params needed to create a new SEGY file
+        # from the SEGY headers instead of hard-coding them here
+        data_info = {
+            "crossline_start": segyfile.xlines[0],
+            "inline_start": segyfile.ilines[0],
+            "timeslice_start": 1,
+            "shape": data.shape,
+        }
+
+    return data, data_info
+
+
 def _get_classes_and_counts(mask_list):
     class_counts_dict = defaultdict(int)
     for mask in mask_list:
diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py
new file mode 100644
index 00000000..70e70492
--- /dev/null
+++ b/scripts/byod_competition.py
@@ -0,0 +1,190 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
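+#
+# This script converts a pair of SEGY volumes (a seismic image plus its label
+# cube) into the train/test_once numpy layout that prepare_dutchf3.py expects,
+# and logs the class weights, mean and standard deviation used at training time.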
+
+"""
+Run example:
+
+python byod_competition.py --train --label --outdir
+python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100
+
+# information to include in configuration file when running:
+
+INFO:root:CLASS WEIGHTS TO USE
+INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086]
+INFO:root:MEAN
+INFO:root:0.0043642526
+INFO:root:STANDARD DEVIATION
+INFO:root:0.07544233
+
+# kick off run as:
+
+NGPU=2
+python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \
+TRAIN.BATCH_SIZE_PER_GPU 2 VALIDATION.BATCH_SIZE_PER_GPU 2 \
+DATASET.ROOT "/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \
+TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \
+--distributed --cfg configs/seresnet_unet.yaml
+
+
+nohup time \
+python -m torch.distributed.launch --nproc_per_node=4 train.py \
+DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \
+TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \
+--distributed --cfg configs/seresnet_unet.yaml > se.log 2>&1 &
+
+nohup time \
+python -m torch.distributed.launch --nproc_per_node=4 train.py \
+MODEL.PRETRAINED "/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth" \
+DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \
+TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \
+--distributed --cfg configs/hrnet.yaml > hr.log 2>&1 &
+
+"""
+
+from interpretation.deepseismic_interpretation.data import read_segy
+
+""" libraries """
+import segyio
+
+import numpy as np
+from scipy import stats
+import os
+
+np.set_printoptions(linewidth=200)
+import logging
+
+# toggle to WARNING when running in production, or use CLI
+logging.getLogger().setLevel(logging.DEBUG)
+# logging.getLogger().setLevel(logging.WARNING)
+import argparse
+
+parser = argparse.ArgumentParser()
+
+""" useful information when running from a GIT folder."""
+myname = os.path.realpath(__file__)
+mypath = os.path.dirname(myname)
+myname = os.path.basename(myname)
+
+
+def main(args):
+    """
+    Transforms the SEG competition SEGY dataset (image + labels) into the DeepSeismic tensor format
+    """
+
+    logging.info("loading data")
+    data, _ = read_segy(args.train)
+    labels, _ = read_segy(args.label)
+
+    assert labels.min() == 1.0
+    n_classes = labels.max()
+    assert n_classes == N_CLASSES
+
+    logging.info("Running 3-sigma clipping")
+    clip_scaling = 3.0
+    mean, std = data.mean(), data.std()
+    data[data > mean + clip_scaling * std] = mean + clip_scaling * std
+    data[data < mean - clip_scaling * std] = mean - clip_scaling * std
+
+    # Make data cube fast to access
+    logging.info("Adjusting precision")
+    data = np.ascontiguousarray(data, "float32")
+    labels = np.ascontiguousarray(labels, "uint8")
+
+    # adjust labels to start from zero
+    labels -= 1
+
+    # rescale to lie within [range_min, range_max]
+    range_min, range_max = -1.0, 1.0
+    data_std = (data - data.min()) / (data.max() - data.min())
+    data = data_std * (range_max - range_min) + range_min
+
+    """
+    # cut off a buffer zone around the volume (to avoid mislabeled data):
+    buffer = 25
+    data = data[:, buffer:-buffer, buffer:-buffer]
+    labels = labels[:, buffer:-buffer, buffer:-buffer]
+    """
+
+    # data cube dimensions are inlines x crosslines x depth
+    n_inlines = 
data.shape[0]
+    n_crosslines = data.shape[1]
+
+    inline_cut = int(np.floor(n_inlines * INLINE_FRACTION))
+    crossline_cut = int(np.floor(n_crosslines * CROSSLINE_FRACTION))
+
+    data_train = data[0:inline_cut, 0:crossline_cut, :]
+    data_test1 = data[inline_cut:n_inlines, :, :]
+    data_test2 = data[:, crossline_cut:n_crosslines, :]
+
+    labels_train = labels[0:inline_cut, 0:crossline_cut, :]
+    labels_test1 = labels[inline_cut:n_inlines, :, :]
+    labels_test2 = labels[:, crossline_cut:n_crosslines, :]
+
+    def mkdir(dirname):
+
+        if os.path.isdir(dirname) and os.path.exists(dirname):
+            return
+
+        if not os.path.isdir(dirname) and os.path.exists(dirname):
+            raise RuntimeError(f"{dirname} exists but is not a directory - remove it and run this script again")
+
+        os.mkdir(dirname)
+
+    mkdir(args.outdir)
+    mkdir(os.path.join(args.outdir, "splits"))
+    mkdir(os.path.join(args.outdir, "train"))
+    mkdir(os.path.join(args.outdir, "test_once"))
+
+    np.save(os.path.join(args.outdir, "train", "train_seismic.npy"), data_train)
+    np.save(os.path.join(args.outdir, "train", "train_labels.npy"), labels_train)
+
+    np.save(os.path.join(args.outdir, "test_once", "test1_seismic.npy"), data_test1)
+    np.save(os.path.join(args.outdir, "test_once", "test1_labels.npy"), labels_test1)
+
+    np.save(os.path.join(args.outdir, "test_once", "test2_seismic.npy"), data_test2)
+    np.save(os.path.join(args.outdir, "test_once", "test2_labels.npy"), labels_test2)
+
+    # Compute class weights:
+    num_classes, class_count = np.unique(labels[:], return_counts=True)
+    # class_probabilities = np.histogram(labels[:], bins= , density=True)
+    class_weights = 1 - class_count / np.sum(class_count)
+    logging.info("CLASS WEIGHTS TO USE")
+    logging.info(class_weights)
+    logging.info("MEAN")
+    logging.info(mean)
+    logging.info("STANDARD DEVIATION")
+    logging.info(std)
+
+
+""" GLOBAL VARIABLES """
+INLINE_FRACTION = 0.7
+CROSSLINE_FRACTION = 1.0
+N_CLASSES = 6
+
+parser.add_argument("--train", help="Name of train data", type=str, required=True)
+parser.add_argument("--label", help="Name of train labels data", type=str, required=True)
+parser.add_argument("--outdir", help="Output data directory location", type=str, required=True)
+
+""" main wrapper with profiler """
+if __name__ == "__main__":
+    main(parser.parse_args())
+
+# pretty printing of the stack
+"""
+    try:
+        logging.info('before main')
+        main(parser.parse_args())
+        logging.info('after main')
+    except:
+        for frame in traceback.extract_tb(sys.exc_info()[2]):
+            fname,lineno,fn,text = frame
+            print ("Error in %s on line %d" % (fname, lineno))
+"""
+# optionally enable profiling information
+# import cProfile
+# name = 
+# cProfile.run('main.run()', name + '.prof')
+# import pstats
+# p = pstats.Stats(name + '.prof')
+# p.sort_stats('cumulative').print_stats(10)
+# p.sort_stats('time').print_stats()

From fc32ec7b5ad38e316ae3f8b9f41b8e7400e6cc53 Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Wed, 29 Jul 2020 15:52:42 -0400
Subject: [PATCH 02/15] fixes to ratios in byod scripts; documentation

---
 scripts/byod_competition.py | 25 +++++++++++++++++--------
 scripts/byod_penobscot.py   |  2 +-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py
index 70e70492..4b46d412 100644
--- a/scripts/byod_competition.py
+++ b/scripts/byod_competition.py
@@ -9,12 +9,11 @@
 
 # information to include in configuration file when running:
 
-INFO:root:CLASS WEIGHTS TO USE
 INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086]
 INFO:root:MEAN
-INFO:root:0.0043642526
+INFO:root:4.183678e-05 INFO:root:STANDARD DEVIATION -INFO:root:0.07544233 +INFO:root:0.31477982 # kick off run as: @@ -22,23 +21,32 @@ python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ TRAIN.BATCH_SIZE_PER_GPU 2 VALIDATION.BATCH_SIZE_PER_GPU 2 \ DATASET.ROOT "/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ -TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ --distributed --cfg configs/seresnet_unet.yaml - nohup time \ python -m torch.distributed.launch --nproc_per_node=4 train.py \ DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ -TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ --distributed --cfg configs/seresnet_unet.yaml > se.log 2>&1 & nohup time \ python -m torch.distributed.launch --nproc_per_node=4 train.py \ MODEL.PRETRAINED "/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth" \ DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ -TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ --distributed --cfg configs/hrnet.yaml > hr.log 2>&1 & +Scoring: + +nohup time \ +python test.py \ +DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 \ +DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ +TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \ +TEST.SPLIT 'test1' +--distributed --cfg configs/seresnet_unet.yaml > se.log 2>&1 & + """ from interpretation.deepseismic_interpretation.data import read_segy @@ -150,6 +158,7 @@ def mkdir(dirname): class_weights = 1 - class_count / np.sum(class_count) logging.info("CLASS WEIGHTS TO USE") logging.info(class_weights) + mean, std = data.mean(), data.std() logging.info("MEAN") logging.info(mean) logging.info("STANDARD DEVIATION") @@ -158,7 +167,7 @@ def mkdir(dirname): """ GLOBAL VARIABLES """ INLINE_FRACTION = 0.7 -CROSSLINE_FRACTION = 1.0 +CROSSLINE_FRACTION = 0.78 N_CLASSES = 6 parser.add_argument("--train", help="Name of train data", type=str, required=True) diff --git a/scripts/byod_penobscot.py b/scripts/byod_penobscot.py index 8ccc8b6e..32e65d5f 100644 --- a/scripts/byod_penobscot.py +++ b/scripts/byod_penobscot.py @@ -120,7 +120,7 @@ def mkdir(dirname): """ GLOBAL VARIABLES """ INLINE_FRACTION = 0.7 -CROSSLINE_FRACTION = 1.0 +CROSSLINE_FRACTION = 0.78 N_CLASSES = 8 parser.add_argument("--filename", help="Name of HDF5 data", type=str, required=True) From 325e0a5f7cdf46f1f1d0f550dd66689bd576cce2 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Wed, 29 Jul 2020 19:55:11 -0400 Subject: [PATCH 03/15] fixed unet model not loading properly in the unet config file --- cv_lib/cv_lib/segmentation/models/unet.py | 2 ++ experiments/interpretation/dutchf3_patch/configs/unet.yaml | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cv_lib/cv_lib/segmentation/models/unet.py b/cv_lib/cv_lib/segmentation/models/unet.py index ddb9197f..d73b75c1 100644 --- a/cv_lib/cv_lib/segmentation/models/unet.py +++ b/cv_lib/cv_lib/segmentation/models/unet.py @@ -4,6 +4,8 @@ """ Taken from https://github.com/milesial/Pytorch-UNet """ +import os + import torch import torch.nn as nn import torch.nn.functional as F diff --git 
a/experiments/interpretation/dutchf3_patch/configs/unet.yaml b/experiments/interpretation/dutchf3_patch/configs/unet.yaml index 5ae1ee45..0597b8fa 100644 --- a/experiments/interpretation/dutchf3_patch/configs/unet.yaml +++ b/experiments/interpretation/dutchf3_patch/configs/unet.yaml @@ -11,7 +11,7 @@ WORKERS: 4 PRINT_FREQ: 10 LOG_CONFIG: logging.conf SEED: 2019 - +OPENCV_BORDER_CONSTANT: 0 DATASET: NUM_CLASSES: 6 @@ -21,7 +21,7 @@ DATASET: MAX: 1 MODEL: - NAME: resnet_unet + NAME: unet IN_CHANNELS: 3 From 9dd1e989ca0e7ff1b7709d43e6b2f90f197b15f1 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Fri, 31 Jul 2020 15:00:11 +0000 Subject: [PATCH 04/15] minor tweaks to docs and imports --- scripts/byod_competition.py | 5 ++++- scripts/byod_penobscot.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py index 4b46d412..8f6bb805 100644 --- a/scripts/byod_competition.py +++ b/scripts/byod_competition.py @@ -17,6 +17,9 @@ # kick off run as: +python byod_competition.py --train /home/maxkaz/data/seismic/TrainingData_Image.segy --label /home/maxkaz/data/seismic/TrainingData_Labels.segy --outdir /home/maxkaz/data/seismic +python prepare_dutchf3.py split_train_val patch --data_dir=/home/maxkaz/data/seismic --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100 + NGPU=2 python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ TRAIN.BATCH_SIZE_PER_GPU 2 VALIDATION.BATCH_SIZE_PER_GPU 2 \ @@ -49,7 +52,7 @@ """ -from interpretation.deepseismic_interpretation.data import read_segy +from deepseismic_interpretation.data import read_segy """ libraries """ import segyio diff --git a/scripts/byod_penobscot.py b/scripts/byod_penobscot.py index 32e65d5f..bc0b682b 100644 --- a/scripts/byod_penobscot.py +++ b/scripts/byod_penobscot.py @@ -19,7 +19,7 @@ import logging # toggle to WARNING when running in production, or use CLI -logging.getLogger().setLevel(logging.DEBUG) +logging.getLogger().setLevel(logging.DEBUG)~/da # logging.getLogger().setLevel(logging.WARNING) import argparse From b8b219ea2a0e4d23d49d6ace844378cc1b2c27b0 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Fri, 31 Jul 2020 15:38:00 -0400 Subject: [PATCH 05/15] added 3D scoring - working on making it work - file is also saved as segy --- .../interpretation/dutchf3_patch/test.py | 98 ++++++++++++++++++- .../deepseismic_interpretation/data.py | 44 +++++++++ scripts/byod_competition.py | 9 +- 3 files changed, 144 insertions(+), 7 deletions(-) diff --git a/experiments/interpretation/dutchf3_patch/test.py b/experiments/interpretation/dutchf3_patch/test.py index 2efeee85..4acf7db5 100644 --- a/experiments/interpretation/dutchf3_patch/test.py +++ b/experiments/interpretation/dutchf3_patch/test.py @@ -27,11 +27,14 @@ from toolz import compose, curry, itertoolz, pipe, take from torch.utils import data +from interpretation.deepseismic_interpretation.data import write_segy + from cv_lib.segmentation import models from cv_lib.segmentation.dutchf3.utils import current_datetime, git_branch, git_hash from cv_lib.utils import load_log_configuration, mask_to_disk, generate_path, image_to_disk -from deepseismic_interpretation.dutchf3.data import add_patch_depth_channels, get_test_loader +from deepseismic_interpretation.dutchf3.data import add_patch_depth_channels, get_test_loader, _test1_labels_for, \ + _test2_labels_for from default import _C as config from default import update_config @@ -44,6 +47,9 @@ 
"zechstein", ] +# we can optionally supply a segy file whose geometry we will use to write out 3D test set predictions +# if it doesn't exist, we won't output a segy file +SEGY_INFILE = '/data/seismic_orig/TrainingData_Labels.segy' class runningScore(object): def __init__(self, n_classes): @@ -94,6 +100,48 @@ def reset(self): self.confusion_matrix = np.zeros((self.n_classes, self.n_classes)) +def _compute_3D_metrics(gt_labels, pred, n_classes, split): + """ + Compute 3D metrics on two 3D arrays. A good test case is to set gt==pred. + + Args: + gt: ground truth 3D numpy array + pred: predictions 3D array + n_classes: number of classes + split: which test set split we're computing + + Returns: + Nothing - stdout print + + """ + + logger = logging.getLogger(__name__) + + score = runningScore(n_classes) + score.update(gt_labels, pred) + + score, class_iou = score.get_scores() + + logger.info(f"--------------- 3D RESULTS {split} -----------------") + logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.4f}') + + logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.4f}') + for cdx, class_name in enumerate(_CLASS_NAMES): + logger.info(f' class {cdx} named {class_name} accuracy {score["Class Accuracy: "][cdx]:.4f}') + + logger.info(f'Mean IoU: {score["Mean IoU: "]:0.4f}') + + for cdx, class_name in enumerate(_CLASS_NAMES): + logger.info(f" class {cdx} named {class_name} IoU {class_iou[cdx]:.4f}") + logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.4f}') + + # Save confusion matrix: + logger.info("writing confusion matrix") + confusion = score["confusion_matrix"] + np.savetxt(f"confusion_split_{split}.csv", confusion, delimiter=" ") + + logger.info("----------------- 3D DONE ---------------------------") + def _transform_CHW_to_HWC(numpy_array): return np.moveaxis(numpy_array, 0, -1) @@ -307,6 +355,10 @@ def _evaluate_split( running_metrics_split = runningScore(n_classes) + n_inlines, n_crosslines, n_depth = test_set.labels.shape + accum_inline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines)) + accum_crossline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines)) + # evaluation mode: with torch.no_grad(): # operations inside don't track history model.eval() @@ -328,6 +380,21 @@ def _evaluate_split( config.DATASET.MAX, ) + # for debugging, if you set this to GT then you can test if + # the reconstructions matches test_set.labels + preds_numpy = outputs.detach().squeeze().numpy() + + # direction is channel x depth x crossline x inline + + # dealing with inline + if test_set.sections[i].startswith("i"): + accum_inline[:, :, :, i] = preds_numpy + # dealing with crossline + elif test_set.sections[i].startswith("x"): + accum_crossline[:, :, i, :] = preds_numpy + else: + raise Exception("we need either an inline or crossline split") + pred = outputs.detach().max(1)[1].numpy() gt = labels.numpy() if debug: @@ -364,6 +431,31 @@ def _evaluate_split( logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}') running_metrics_split.reset() + ###################################################################### + # 3D: now compute metrics on full 3D volume + ###################################################################### + + gt_labels = test_set.labels.swapaxes(0, 2) + assert gt_labels.shape == accum_inline.shape + assert gt_labels.shape == accum_crossline.shape + + # compute mIoU here + logging.info("Simple average") + np.save() + pred_sum = accum_inline + accum_crossline + pred = pred_sum.argmax(0) + _compute_3D_metrics(gt_labels, pred, n_classes, split) + 
np.save(f"test_simple_avg_split_{split}.npy", pred) + if os.path.isfile(SEGY_INFILE): + write_segy(f"test_simple_avg_split_{split}.segy", SEGY_INFILE, pred) + + logging.info("Geometric average") + pred_sum = np.sqrt(accum_inline*accum_crossline) + pred = pred_sum.argmax(0) + _compute_3D_metrics(gt_labels, pred, n_classes, split) + np.save(f"test_geometric_avg_split_{split}.npy", pred) + if os.path.isfile(SEGY_INFILE): + write_segy(f"test_geometric_avg_split_{split}.segy", SEGY_INFILE, pred) def _write_section_file(labels, section_file): # define indices of the array @@ -381,7 +473,9 @@ def _write_section_file(labels, section_file): else: x_list = [] - list_test = i_list + x_list + # TODO: revert + # list_test = i_list + x_list + list_test = i_list[0:1] + x_list[0:1] file_object = open(section_file, "w") file_object.write("\n".join(list_test)) diff --git a/interpretation/deepseismic_interpretation/data.py b/interpretation/deepseismic_interpretation/data.py index 1ff9c100..2dd6ed37 100644 --- a/interpretation/deepseismic_interpretation/data.py +++ b/interpretation/deepseismic_interpretation/data.py @@ -19,6 +19,9 @@ import segyio +from shutil import copyfile + + _open_to_array = compose(np.array, Image.open) @@ -61,6 +64,47 @@ def read_segy(filename): return data, data_info +def write_segy(out_filename, in_filename, out_cube): + """ + Writes out_cube to a segy-file (out_filename) with same header/size as in_filename + + Args: + out_filename: output filename + in_filename: input file, whose metadata will be copied + out_cube: array which we write to out_filename + + Returns: + Nothing + """ + + # Select last channel + if type(out_cube) is list: + out_cube = out_cube[-1] + + print("Writing interpretation to " + out_filename) + + # Copy segy file + copyfile(in_filename, out_filename) + + # Moving temporal axis back again + out_cube = np.moveaxis(out_cube, 0, -1) + + # Open out-file + with segyio.open(out_filename, "r+") as src: + iline_start = src.ilines[0] + dtype = src.iline[iline_start].dtype + # loop through inlines and insert output + for i in src.ilines: + iline = out_cube[i - iline_start, :, :] + src.iline[i] = np.ascontiguousarray(iline.astype(dtype)) + + # TODO: rewrite this whole function + # Moving temporal axis first again - just in case the user want to keep working on it + out_cube = np.moveaxis(out_cube, -1, 0) + + print("Writing interpretation - Finished") + return + def _get_classes_and_counts(mask_list): class_counts_dict = defaultdict(int) for mask in mask_list: diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py index 4b46d412..2e542b71 100644 --- a/scripts/byod_competition.py +++ b/scripts/byod_competition.py @@ -41,11 +41,10 @@ nohup time \ python test.py \ -DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 \ -DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ -TRAIN.MEAN 0.0043642526 TRAIN.STD 0.07544233 \ -TEST.SPLIT 'test1' ---distributed --cfg configs/seresnet_unet.yaml > se.log 2>&1 & +DATASET.ROOT "/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ +TEST.SPLIT 'both' +--cfg configs/unet.yaml > unet.log 2>&1 & """ From d4e05b865339958265514c94073c3796751fa0f8 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Sun, 2 Aug 2020 21:16:29 -0400 Subject: [PATCH 06/15] optimized memory usage when computing confusion matrix --- environment/anaconda/local/environment.yml | 1 + 
.../interpretation/dutchf3_patch/test.py | 86 ++++++++++++++----- .../deepseismic_interpretation/data.py | 20 ++++- 3 files changed, 81 insertions(+), 26 deletions(-) diff --git a/environment/anaconda/local/environment.yml b/environment/anaconda/local/environment.yml index cf7078c1..503dd2b1 100644 --- a/environment/anaconda/local/environment.yml +++ b/environment/anaconda/local/environment.yml @@ -19,6 +19,7 @@ dependencies: - papermill>=1.0.1 - jupyterlab - pip: + - memory_profiler - segyio==1.8.8 - pytorch-ignite==0.3.0 - fire==0.2.1 diff --git a/experiments/interpretation/dutchf3_patch/test.py b/experiments/interpretation/dutchf3_patch/test.py index 4acf7db5..ba945b8d 100644 --- a/experiments/interpretation/dutchf3_patch/test.py +++ b/experiments/interpretation/dutchf3_patch/test.py @@ -21,20 +21,25 @@ import fire import numpy as np +from sklearn import metrics import torch import torch.nn.functional as F from albumentations import Compose, Normalize, PadIfNeeded, Resize from toolz import compose, curry, itertoolz, pipe, take from torch.utils import data -from interpretation.deepseismic_interpretation.data import write_segy +from deepseismic_interpretation.data import write_segy from cv_lib.segmentation import models from cv_lib.segmentation.dutchf3.utils import current_datetime, git_branch, git_hash from cv_lib.utils import load_log_configuration, mask_to_disk, generate_path, image_to_disk -from deepseismic_interpretation.dutchf3.data import add_patch_depth_channels, get_test_loader, _test1_labels_for, \ - _test2_labels_for +from deepseismic_interpretation.dutchf3.data import ( + add_patch_depth_channels, + get_test_loader, + _test1_labels_for, + _test2_labels_for, +) from default import _C as config from default import update_config @@ -49,24 +54,47 @@ # we can optionally supply a segy file whose geometry we will use to write out 3D test set predictions # if it doesn't exist, we won't output a segy file -SEGY_INFILE = '/data/seismic_orig/TrainingData_Labels.segy' +SEGY_INFILE = "/data/seismic_orig/TrainingData_Labels.segy" + class runningScore(object): def __init__(self, n_classes): self.n_classes = n_classes self.confusion_matrix = np.zeros((n_classes, n_classes)) + # @profile def _fast_hist(self, label_true, label_pred, n_class): + """ + speed-optimized but not memory-optimized version of the confusion matrix calculation + """ + logger = logging.getLogger(__name__) mask = (label_true >= 0) & (label_true < n_class) - hist = np.bincount(n_class * label_true[mask].astype(int) + label_pred[mask], minlength=n_class ** 2,).reshape( - n_class, n_class - ) + bincount_arg = n_class * label_true[mask].astype(int) + label_pred[mask] + logger.info('bincount operation starting...') + hist = np.bincount(bincount_arg, minlength=n_class ** 2,) + hist = hist.reshape(n_class, n_class) + logger.info('finished') return hist - def update(self, label_trues, label_preds): + # @profile + def _confusion_matrix(self, label_true, label_pred, n_class): + """ + memory-optimized but not speed-optimized version of the confusion matrix calculation + """ + mask = (label_true >= 0) & (label_true < n_class) + #bincount_arg = n_class * label_true[mask].astype(int) + label_pred[mask] + matrix = metrics.confusion_matrix(label_true[mask], label_pred[mask], labels = list(range(n_class))) + return matrix + + # @profile + def update(self, label_trues, label_preds, fast_hist = True): for lt, lp in zip(label_trues, label_preds): - self.confusion_matrix += self._fast_hist(lt.flatten(), lp.flatten(), self.n_classes) + if 
fast_hist: + self.confusion_matrix += self._fast_hist(lt.flatten(), lp.flatten(), self.n_classes) + else: + self.confusion_matrix += self._confusion_matrix(lt.flatten(), lp.flatten(), self.n_classes) + # @profile def get_scores(self): """Returns accuracy score evaluation result. - overall accuracy @@ -117,8 +145,13 @@ def _compute_3D_metrics(gt_labels, pred, n_classes, split): logger = logging.getLogger(__name__) + # TODO: remove + #n = 300 + #gt_labels = gt_labels[:n, :n, :n] + #pred = pred[:n, :n, :n] + score = runningScore(n_classes) - score.update(gt_labels, pred) + score.update(gt_labels, pred, fast_hist=False) score, class_iou = score.get_scores() @@ -142,6 +175,7 @@ def _compute_3D_metrics(gt_labels, pred, n_classes, split): logger.info("----------------- 3D DONE ---------------------------") + def _transform_CHW_to_HWC(numpy_array): return np.moveaxis(numpy_array, 0, -1) @@ -287,6 +321,9 @@ def _patch_label_2d( output = output_processing(output) output_p[:, :, hdx + ps : hdx + ps + patch_size, wdx + ps : wdx + ps + patch_size,] += output + # TODO remove + break + # dump the data right before it's being put into the model and after scoring if debug: outdir = f"debug/test/batch_{split}" @@ -356,8 +393,8 @@ def _evaluate_split( running_metrics_split = runningScore(n_classes) n_inlines, n_crosslines, n_depth = test_set.labels.shape - accum_inline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines)) - accum_crossline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines)) + accum_inline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines), dtype=np.float32) + accum_crossline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines), dtype=np.float32) # evaluation mode: with torch.no_grad(): # operations inside don't track history @@ -382,7 +419,7 @@ def _evaluate_split( # for debugging, if you set this to GT then you can test if # the reconstructions matches test_set.labels - preds_numpy = outputs.detach().squeeze().numpy() + preds_numpy = outputs.detach().squeeze().numpy().astype(np.float32) # direction is channel x depth x crossline x inline @@ -435,27 +472,31 @@ def _evaluate_split( # 3D: now compute metrics on full 3D volume ###################################################################### - gt_labels = test_set.labels.swapaxes(0, 2) - assert gt_labels.shape == accum_inline.shape - assert gt_labels.shape == accum_crossline.shape + gt_labels = test_set.labels.swapaxes(0, 2).astype(np.uint8) + assert gt_labels.shape == accum_inline.shape[1:] + assert gt_labels.shape == accum_crossline.shape[1:] # compute mIoU here logging.info("Simple average") - np.save() pred_sum = accum_inline + accum_crossline - pred = pred_sum.argmax(0) + pred = pred_sum.argmax(0).astype(np.uint8) + del pred_sum _compute_3D_metrics(gt_labels, pred, n_classes, split) np.save(f"test_simple_avg_split_{split}.npy", pred) + # use existing SEGY file as a template to write our data into if os.path.isfile(SEGY_INFILE): - write_segy(f"test_simple_avg_split_{split}.segy", SEGY_INFILE, pred) + write_segy(f"test_simple_avg_split_{split}.segy", SEGY_INFILE, pred) logging.info("Geometric average") - pred_sum = np.sqrt(accum_inline*accum_crossline) - pred = pred_sum.argmax(0) + pred_sum = np.sqrt(accum_inline * accum_crossline) + pred = pred_sum.argmax(0).astype(np.uint8) + del pred_sum _compute_3D_metrics(gt_labels, pred, n_classes, split) np.save(f"test_geometric_avg_split_{split}.npy", pred) + # use existing SEGY file as a template to write our data into if os.path.isfile(SEGY_INFILE): - 
write_segy(f"test_geometric_avg_split_{split}.segy", SEGY_INFILE, pred) + write_segy(f"test_geometric_avg_split_{split}.segy", SEGY_INFILE, pred) + def _write_section_file(labels, section_file): # define indices of the array @@ -586,3 +627,4 @@ def test(*options, cfg=None, debug=False): if __name__ == "__main__": fire.Fire(test) + diff --git a/interpretation/deepseismic_interpretation/data.py b/interpretation/deepseismic_interpretation/data.py index 2dd6ed37..3f166158 100644 --- a/interpretation/deepseismic_interpretation/data.py +++ b/interpretation/deepseismic_interpretation/data.py @@ -77,10 +77,6 @@ def write_segy(out_filename, in_filename, out_cube): Nothing """ - # Select last channel - if type(out_cube) is list: - out_cube = out_cube[-1] - print("Writing interpretation to " + out_filename) # Copy segy file @@ -91,10 +87,26 @@ def write_segy(out_filename, in_filename, out_cube): # Open out-file with segyio.open(out_filename, "r+") as src: + iline_start = src.ilines[0] + # set type to inlines dtype = src.iline[iline_start].dtype + """ + src.ilines = list(range(src.ilines[0], src.ilines[0] + out_cube.shape[0])) + + # set crosslines to match the numpy array shape + xline_start = src.xlines[0] + src.xlines = list(range(src.xlines[0], src.xlines[0] + out_cube.shape[1])) + + # set depth to match the numpy array shape + depth_start = src.depth[0] + src.depth = list(range(src)) + """ + # loop through inlines and insert output for i in src.ilines: + if i>=out_cube.shape[0]: + break iline = out_cube[i - iline_start, :, :] src.iline[i] = np.ascontiguousarray(iline.astype(dtype)) From 30c3eac0bb64f23601b7c830d70edbd4efc6d470 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Mon, 3 Aug 2020 15:00:16 -0400 Subject: [PATCH 07/15] added custom segy writer which can create segy files from numpy arrays --- .../interpretation/dutchf3_patch/test.py | 54 ++++++++++--------- .../deepseismic_interpretation/data.py | 25 +-------- 2 files changed, 31 insertions(+), 48 deletions(-) diff --git a/experiments/interpretation/dutchf3_patch/test.py b/experiments/interpretation/dutchf3_patch/test.py index ba945b8d..df9f9584 100644 --- a/experiments/interpretation/dutchf3_patch/test.py +++ b/experiments/interpretation/dutchf3_patch/test.py @@ -21,6 +21,7 @@ import fire import numpy as np +import segyio from sklearn import metrics import torch import torch.nn.functional as F @@ -53,8 +54,8 @@ ] # we can optionally supply a segy file whose geometry we will use to write out 3D test set predictions -# if it doesn't exist, we won't output a segy file -SEGY_INFILE = "/data/seismic_orig/TrainingData_Labels.segy" +# if it doesn't exist, we will write a blank segy file with same dimensions as the predictions array +SEGY_INFILE = "/data/seismic/TrainingData_Labels.segy" class runningScore(object): @@ -67,13 +68,13 @@ def _fast_hist(self, label_true, label_pred, n_class): """ speed-optimized but not memory-optimized version of the confusion matrix calculation """ - logger = logging.getLogger(__name__) + # logger = logging.getLogger(__name__) mask = (label_true >= 0) & (label_true < n_class) bincount_arg = n_class * label_true[mask].astype(int) + label_pred[mask] - logger.info('bincount operation starting...') + # logger.info("bincount operation starting...") hist = np.bincount(bincount_arg, minlength=n_class ** 2,) hist = hist.reshape(n_class, n_class) - logger.info('finished') + # logger.info("finished") return hist # @profile @@ -82,12 +83,10 @@ def _confusion_matrix(self, label_true, label_pred, n_class): memory-optimized but 
not speed-optimized version of the confusion matrix calculation """ mask = (label_true >= 0) & (label_true < n_class) - #bincount_arg = n_class * label_true[mask].astype(int) + label_pred[mask] - matrix = metrics.confusion_matrix(label_true[mask], label_pred[mask], labels = list(range(n_class))) - return matrix + return metrics.confusion_matrix(label_true[mask], label_pred[mask], labels=list(range(n_class))) # @profile - def update(self, label_trues, label_preds, fast_hist = True): + def update(self, label_trues, label_preds, fast_hist=True): for lt, lp in zip(label_trues, label_preds): if fast_hist: self.confusion_matrix += self._fast_hist(lt.flatten(), lp.flatten(), self.n_classes) @@ -145,13 +144,8 @@ def _compute_3D_metrics(gt_labels, pred, n_classes, split): logger = logging.getLogger(__name__) - # TODO: remove - #n = 300 - #gt_labels = gt_labels[:n, :n, :n] - #pred = pred[:n, :n, :n] - score = runningScore(n_classes) - score.update(gt_labels, pred, fast_hist=False) + score.update(gt_labels, pred, fast_hist=True) score, class_iou = score.get_scores() @@ -321,9 +315,6 @@ def _patch_label_2d( output = output_processing(output) output_p[:, :, hdx + ps : hdx + ps + patch_size, wdx + ps : wdx + ps + patch_size,] += output - # TODO remove - break - # dump the data right before it's being put into the model and after scoring if debug: outdir = f"debug/test/batch_{split}" @@ -482,10 +473,18 @@ def _evaluate_split( pred = pred_sum.argmax(0).astype(np.uint8) del pred_sum _compute_3D_metrics(gt_labels, pred, n_classes, split) - np.save(f"test_simple_avg_split_{split}.npy", pred) + np.save(os.path.join(output_dir, f"test_simple_avg_split_{split}.npy", pred)) # use existing SEGY file as a template to write our data into if os.path.isfile(SEGY_INFILE): - write_segy(f"test_simple_avg_split_{split}.segy", SEGY_INFILE, pred) + # input segy file is the ground truth here + write_segy(os.path.join(output_dir, f"pred_simple_avg_split_{split}.segy"), SEGY_INFILE, pred.swapaxes(0, 2)) + else: + # write array into segy using array dimensions for # of inlines and crosslines + # make sure directions are inline, crossline, depth + logging.info("writing segy files") + segyio.tools.from_array3D(os.path.join(output_dir, f"pred_simple_avg_split_{split}.segy"), pred.swapaxes(0, 2), dt=1000) + segyio.tools.from_array3D(os.path.join(output_dir, f"groundtruth_simple_avg_split_{split}.segy"), gt_labels.swapaxes(0, 2), dt=1000) + logging.info("done") logging.info("Geometric average") pred_sum = np.sqrt(accum_inline * accum_crossline) @@ -495,7 +494,15 @@ def _evaluate_split( np.save(f"test_geometric_avg_split_{split}.npy", pred) # use existing SEGY file as a template to write our data into if os.path.isfile(SEGY_INFILE): - write_segy(f"test_geometric_avg_split_{split}.segy", SEGY_INFILE, pred) + # input segy file is the ground truth here + write_segy(os.path.join(output_dir, f"pred_geometric_avg_split_{split}.segy"), SEGY_INFILE, pred.swapaxes(0, 2)) + else: + # write array into segy using array dimensions for # of inlines and crosslines + # make sure directions are inline, crossline, depth + logging.info("writing segy files") + segyio.tools.from_array3D(os.path.join(output_dir,f"pred_geometric_avg_split_{split}.segy"), pred.swapaxes(0, 2), dt=1000) + segyio.tools.from_array3D(os.path.join(output_dir,f"groundtruth_geometric_avg_split_{split}.segy"), gt_labels.swapaxes(0, 2), dt=1000) + logging.info("done") def _write_section_file(labels, section_file): @@ -514,9 +521,7 @@ def _write_section_file(labels, section_file): 
else: x_list = [] - # TODO: revert - # list_test = i_list + x_list - list_test = i_list[0:1] + x_list[0:1] + list_test = i_list + x_list file_object = open(section_file, "w") file_object.write("\n".join(list_test)) @@ -627,4 +632,3 @@ def test(*options, cfg=None, debug=False): if __name__ == "__main__": fire.Fire(test) - diff --git a/interpretation/deepseismic_interpretation/data.py b/interpretation/deepseismic_interpretation/data.py index 3f166158..a3ad5008 100644 --- a/interpretation/deepseismic_interpretation/data.py +++ b/interpretation/deepseismic_interpretation/data.py @@ -77,45 +77,24 @@ def write_segy(out_filename, in_filename, out_cube): Nothing """ - print("Writing interpretation to " + out_filename) + logging.info("Writing interpretation to " + out_filename) # Copy segy file copyfile(in_filename, out_filename) - # Moving temporal axis back again - out_cube = np.moveaxis(out_cube, 0, -1) - # Open out-file with segyio.open(out_filename, "r+") as src: iline_start = src.ilines[0] # set type to inlines dtype = src.iline[iline_start].dtype - """ - src.ilines = list(range(src.ilines[0], src.ilines[0] + out_cube.shape[0])) - - # set crosslines to match the numpy array shape - xline_start = src.xlines[0] - src.xlines = list(range(src.xlines[0], src.xlines[0] + out_cube.shape[1])) - - # set depth to match the numpy array shape - depth_start = src.depth[0] - src.depth = list(range(src)) - """ # loop through inlines and insert output for i in src.ilines: - if i>=out_cube.shape[0]: - break iline = out_cube[i - iline_start, :, :] src.iline[i] = np.ascontiguousarray(iline.astype(dtype)) - # TODO: rewrite this whole function - # Moving temporal axis first again - just in case the user want to keep working on it - out_cube = np.moveaxis(out_cube, -1, 0) - - print("Writing interpretation - Finished") - return + logging.info("Writing interpretation - finished") def _get_classes_and_counts(mask_list): class_counts_dict = defaultdict(int) From d896a03ad54e40936345dd672e5c77dca1542ed3 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Mon, 3 Aug 2020 15:30:59 -0400 Subject: [PATCH 08/15] minor typo --- scripts/byod_penobscot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/byod_penobscot.py b/scripts/byod_penobscot.py index bc0b682b..32e65d5f 100644 --- a/scripts/byod_penobscot.py +++ b/scripts/byod_penobscot.py @@ -19,7 +19,7 @@ import logging # toggle to WARNING when running in production, or use CLI -logging.getLogger().setLevel(logging.DEBUG)~/da +logging.getLogger().setLevel(logging.DEBUG) # logging.getLogger().setLevel(logging.WARNING) import argparse From e590bb39165de817cd84a37f4dd78a4c5064a27d Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Wed, 5 Aug 2020 21:31:32 +0000 Subject: [PATCH 09/15] latest fixes --- .../interpretation/dutchf3_patch/test.py | 8 +++---- scripts/byod_competition.py | 21 +++++++++++++++++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/experiments/interpretation/dutchf3_patch/test.py b/experiments/interpretation/dutchf3_patch/test.py index df9f9584..bfa3c3f1 100644 --- a/experiments/interpretation/dutchf3_patch/test.py +++ b/experiments/interpretation/dutchf3_patch/test.py @@ -391,7 +391,7 @@ def _evaluate_split( with torch.no_grad(): # operations inside don't track history model.eval() for i, (images, labels) in enumerate(test_loader): - logger.info(f"split: {split}, section: {i}") + logger.info(f"split: {split}, section: {test_set.sections[i]}") outputs = _patch_label_2d( model, images, @@ -419,7 +419,7 @@ def 
_evaluate_split( accum_inline[:, :, :, i] = preds_numpy # dealing with crossline elif test_set.sections[i].startswith("x"): - accum_crossline[:, :, i, :] = preds_numpy + accum_crossline[:, :, i-n_inlines, :] = preds_numpy else: raise Exception("we need either an inline or crossline split") @@ -473,7 +473,7 @@ def _evaluate_split( pred = pred_sum.argmax(0).astype(np.uint8) del pred_sum _compute_3D_metrics(gt_labels, pred, n_classes, split) - np.save(os.path.join(output_dir, f"test_simple_avg_split_{split}.npy", pred)) + np.save(os.path.join(output_dir, f"test_simple_avg_split_{split}.npy"), pred) # use existing SEGY file as a template to write our data into if os.path.isfile(SEGY_INFILE): # input segy file is the ground truth here @@ -491,7 +491,7 @@ def _evaluate_split( pred = pred_sum.argmax(0).astype(np.uint8) del pred_sum _compute_3D_metrics(gt_labels, pred, n_classes, split) - np.save(f"test_geometric_avg_split_{split}.npy", pred) + np.save(os.path.join(output_dir, f"test_geometric_avg_split_{split}.npy"), pred) # use existing SEGY file as a template to write our data into if os.path.isfile(SEGY_INFILE): # input segy file is the ground truth here diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py index 34f9bd72..0bb4dbc1 100644 --- a/scripts/byod_competition.py +++ b/scripts/byod_competition.py @@ -9,12 +9,29 @@ # information to include in configuration file when running: +clip INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] INFO:root:MEAN INFO:root:4.183678e-05 INFO:root:STANDARD DEVIATION INFO:root:0.31477982 +noclip +INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] +INFO:root:MEAN +INFO:root:0.0043642526 +INFO:root:STANDARD DEVIATION +INFO:root:0.07544233 + +reduced test size +INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] +INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] +INFO:root:MEAN +INFO:root:4.183678e-05 +INFO:root:STANDARD DEVIATION +INFO:root:0.31477982 + + # kick off run as: python byod_competition.py --train /home/maxkaz/data/seismic/TrainingData_Image.segy --label /home/maxkaz/data/seismic/TrainingData_Labels.segy --outdir /home/maxkaz/data/seismic @@ -168,8 +185,8 @@ def mkdir(dirname): """ GLOBAL VARIABLES """ -INLINE_FRACTION = 0.7 -CROSSLINE_FRACTION = 0.78 +INLINE_FRACTION = 0.9 +CROSSLINE_FRACTION = 0.9 N_CLASSES = 6 parser.add_argument("--train", help="Name of train data", type=str, required=True) From 73e376437f44a5e9b6dc236425137df21106d251 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Thu, 6 Aug 2020 11:52:38 -0400 Subject: [PATCH 10/15] added Lovasz losses --- cv_lib/cv_lib/segmentation/lovasz_losses.py | 258 ++++++++++++++++++ .../interpretation/dutchf3_patch/train.py | 4 +- 2 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 cv_lib/cv_lib/segmentation/lovasz_losses.py diff --git a/cv_lib/cv_lib/segmentation/lovasz_losses.py b/cv_lib/cv_lib/segmentation/lovasz_losses.py new file mode 100644 index 00000000..ab989e7e --- /dev/null +++ b/cv_lib/cv_lib/segmentation/lovasz_losses.py @@ -0,0 +1,258 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
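+#
+# NOTE: wired into experiments/interpretation/dutchf3_patch/train.py later in
+# this patch as the training criterion; per the docstrings below, lovasz_softmax
+# expects class probabilities of shape [B, C, H, W] and integer labels [B, H, W].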
+ +""" +DO NOT REMOVE THIS COMMENT +Lovasz-Softmax and Jaccard hinge loss in PyTorch +Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License) +taken from https://github.com/bermanmaxim/LovaszSoftmax under MIT license +""" + +from __future__ import print_function, division + +import torch +from torch.autograd import Variable +import torch.nn.functional as F +import numpy as np + +try: + from itertools import ifilterfalse +except ImportError: # py3k + from itertools import filterfalse as ifilterfalse + + +def lovasz_grad(gt_sorted): + """ + Computes gradient of the Lovasz extension w.r.t sorted errors + See Alg. 1 in paper + """ + p = len(gt_sorted) + gts = gt_sorted.sum() + intersection = gts - gt_sorted.float().cumsum(0) + union = gts + (1 - gt_sorted).float().cumsum(0) + jaccard = 1. - intersection / union + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True): + """ + IoU for foreground class + binary: 1 foreground, 0 background + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + intersection = ((label == 1) & (pred == 1)).sum() + union = ((label == 1) | ((pred == 1) & (label != ignore))).sum() + if not union: + iou = EMPTY + else: + iou = float(intersection) / float(union) + ious.append(iou) + iou = mean(ious) # mean accross images if per_image + return 100 * iou + + +def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False): + """ + Array of IoU for each (non ignored) class + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + iou = [] + for i in range(C): + if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes) + intersection = ((label == i) & (pred == i)).sum() + union = ((label == i) | ((pred == i) & (label != ignore))).sum() + if not union: + iou.append(EMPTY) + else: + iou.append(float(intersection) / float(union)) + ious.append(iou) + ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image + return 100 * np.array(ious) + + +# --------------------------- BINARY LOSSES --------------------------- + + +def lovasz_hinge(logits, labels, per_image=True, ignore=None): + """ + Binary Lovasz hinge loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + per_image: compute the loss per image instead of per batch + ignore: void class id + """ + if per_image: + loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore)) + for log, lab in zip(logits, labels)) + else: + loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore)) + return loss + + +def lovasz_hinge_flat(logits, labels): + """ + Binary Lovasz hinge loss + logits: [P] Variable, logits at each prediction (between -\infty and +\infty) + labels: [P] Tensor, binary ground truth labels (0 or 1) + ignore: label to ignore + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels.float() - 1. + errors = (1. 
- logits * Variable(signs)) + errors_sorted, perm = torch.sort(errors, dim=0, descending=True) + perm = perm.data + gt_sorted = labels[perm] + grad = lovasz_grad(gt_sorted) + loss = torch.dot(F.relu(errors_sorted), Variable(grad)) + return loss + + +def flatten_binary_scores(scores, labels, ignore=None): + """ + Flattens predictions in the batch (binary case) + Remove labels equal to 'ignore' + """ + scores = scores.view(-1) + labels = labels.view(-1) + if ignore is None: + return scores, labels + valid = (labels != ignore) + vscores = scores[valid] + vlabels = labels[valid] + return vscores, vlabels + + +class StableBCELoss(torch.nn.modules.Module): + def __init__(self): + super(StableBCELoss, self).__init__() + + def forward(self, input, target): + neg_abs = - input.abs() + loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log() + return loss.mean() + + +def binary_xloss(logits, labels, ignore=None): + """ + Binary Cross entropy loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + ignore: void class id + """ + logits, labels = flatten_binary_scores(logits, labels, ignore) + loss = StableBCELoss()(logits, Variable(labels.float())) + return loss + + +# --------------------------- MULTICLASS LOSSES --------------------------- + + +def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=None): + """ + Multi-class Lovasz-Softmax loss + probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). + Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. + labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + per_image: compute the loss per image instead of per batch + ignore: void class labels + """ + if per_image: + loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes) + for prob, lab in zip(probas, labels)) + else: + loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes) + return loss + + +def lovasz_softmax_flat(probas, labels, classes='present'): + """ + Multi-class Lovasz-Softmax loss + probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) + labels: [P] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + if probas.numel() == 0: + # only void pixels, the gradients should be 0 + return probas * 0. 
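+    # per-class Lovasz extension: sort the absolute errors |fg - class_pred| in
+    # decreasing order, then take their dot product with the gradient of the
+    # Jaccard loss (lovasz_grad above) evaluated on the sorted ground truth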
+ C = probas.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes + for c in class_to_sum: + fg = (labels == c).float() # foreground for class c + if (classes is 'present' and fg.sum() == 0): + continue + if C == 1: + if len(classes) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probas[:, 0] + else: + class_pred = probas[:, c] + errors = (Variable(fg) - class_pred).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + perm = perm.data + fg_sorted = fg[perm] + losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) + return mean(losses) + + +def flatten_probas(probas, labels, ignore=None): + """ + Flattens predictions in the batch + """ + if probas.dim() == 3: + # assumes output of a sigmoid layer + B, H, W = probas.size() + probas = probas.view(B, 1, H, W) + B, C, H, W = probas.size() + probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C + labels = labels.view(-1) + if ignore is None: + return probas, labels + valid = (labels != ignore) + vprobas = probas[valid.nonzero().squeeze()] + vlabels = labels[valid] + return vprobas, vlabels + + +def xloss(logits, labels, ignore=None): + """ + Cross entropy loss + """ + return F.cross_entropy(logits, Variable(labels), ignore_index=255) + + +# --------------------------- HELPER FUNCTIONS --------------------------- +def isnan(x): + return x != x + + +def mean(l, ignore_nan=False, empty=0): + """ + nanmean compatible with generators. + """ + l = iter(l) + if ignore_nan: + l = ifilterfalse(isnan, l) + try: + n = 1 + acc = next(l) + except StopIteration: + if empty == 'raise': + raise ValueError('Empty mean') + return empty + for n, v in enumerate(l, 2): + acc += v + if n == 1: + return acc + return acc / n diff --git a/experiments/interpretation/dutchf3_patch/train.py b/experiments/interpretation/dutchf3_patch/train.py index e687290e..c8b9fa1b 100644 --- a/experiments/interpretation/dutchf3_patch/train.py +++ b/experiments/interpretation/dutchf3_patch/train.py @@ -37,6 +37,7 @@ from cv_lib.segmentation.dutchf3.utils import current_datetime, git_branch, git_hash from cv_lib.segmentation.metrics import class_accuracy, class_iou, mean_class_accuracy, mean_iou, pixelwise_accuracy from cv_lib.utils import generate_path, load_log_configuration +from cv_lib.segmentation import lovasz_losses as L from deepseismic_interpretation.dutchf3.data import get_patch_loader from default import _C as config from default import update_config @@ -236,7 +237,8 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) # Loss: - criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") + #criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") + criterion = lambda x, y: L.lovasz_softmax(x, y, classes = list(range(n_classes)), ignore=255) # Model: if distributed: From 8eb84193269f193708f79146cde872ba59187c65 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Sun, 9 Aug 2020 15:27:25 +0000 Subject: [PATCH 11/15] more minor fixes --- README.md | 3 ++- experiments/interpretation/dutchf3_patch/train.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1b82b9b1..6804da50 100644 --- a/README.md +++ b/README.md @@ -226,8 +226,9 @@ For seismic interpretation (segmentation), if you want to 
visualize cross-sectio To install [segyviewer](https://github.com/equinor/segyviewer) run: ```bash -conda env create -n segyviewer python=2.7 +conda create -n segyviewer python=2.7 conda activate segyviewer +sudo apt install libqtwebkit4 conda install -c conda-forge pyqt=4.11.4 pip install segyviewer ``` diff --git a/experiments/interpretation/dutchf3_patch/train.py b/experiments/interpretation/dutchf3_patch/train.py index c8b9fa1b..5076c541 100644 --- a/experiments/interpretation/dutchf3_patch/train.py +++ b/experiments/interpretation/dutchf3_patch/train.py @@ -238,7 +238,7 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F # Loss: #criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") - criterion = lambda x, y: L.lovasz_softmax(x, y, classes = list(range(n_classes)), ignore=255) + criterion = lambda x, y: L.lovasz_softmax(x, y, classes = list(range(n_classes)), ignore=255, per_image = True) # Model: if distributed: From 3047e47d307483a921b57be04057dddc5c9d35c8 Mon Sep 17 00:00:00 2001 From: Max Kaznady Date: Wed, 12 Aug 2020 19:31:54 +0000 Subject: [PATCH 12/15] fixed data drop in README; changed default HRNet batch size --- README.md | 2 +- experiments/interpretation/dutchf3_patch/configs/hrnet.yaml | 2 +- scripts/byod_competition.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6804da50..7cf50124 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ wget -o /dev/null -O dataset.h5 https://zenodo.org/record/3924682/files/dataset. # convert penobscot python byod_penobscot.py --filename dataset.h5 --outdir # preprocess for experiments -python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100 +python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both ``` ### Run Examples diff --git a/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml b/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml index 94921bf7..ba8d4236 100644 --- a/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml +++ b/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml @@ -67,7 +67,7 @@ MODEL: FUSE_METHOD: SUM TRAIN: - BATCH_SIZE_PER_GPU: 16 + BATCH_SIZE_PER_GPU: 32 BEGIN_EPOCH: 0 END_EPOCH: 300 MIN_LR: 0.001 diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py index 0bb4dbc1..978d24c5 100644 --- a/scripts/byod_competition.py +++ b/scripts/byod_competition.py @@ -5,7 +5,7 @@ Run example: python byod_competition.py --train --label --outdir -python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100 +python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both # information to include in configuration file when running: @@ -35,7 +35,7 @@ # kick off run as: python byod_competition.py --train /home/maxkaz/data/seismic/TrainingData_Image.segy --label /home/maxkaz/data/seismic/TrainingData_Labels.segy --outdir /home/maxkaz/data/seismic -python prepare_dutchf3.py split_train_val patch --data_dir=/home/maxkaz/data/seismic --label_file=train/train_labels.npy --output_dir=splits 
diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py
index 0bb4dbc1..978d24c5 100644
--- a/scripts/byod_competition.py
+++ b/scripts/byod_competition.py
@@ -5,7 +5,7 @@
 Run example:
 python byod_competition.py --train --label --outdir
-python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100
+python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both
 
 # information to include in configuration file when running:
 
@@ -35,7 +35,7 @@
 # kick off run as:
 python byod_competition.py --train /home/maxkaz/data/seismic/TrainingData_Image.segy --label /home/maxkaz/data/seismic/TrainingData_Labels.segy --outdir /home/maxkaz/data/seismic
-python prepare_dutchf3.py split_train_val patch --data_dir=/home/maxkaz/data/seismic --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100
+python prepare_dutchf3.py split_train_val patch --data_dir=/home/maxkaz/data/seismic --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both
 
 NGPU=2
 python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \

From fdfe04ee074c048273cc0456fa897ad131ac24de Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Mon, 14 Sep 2020 16:02:26 -0400
Subject: [PATCH 13/15] final tweaks to SEG 2020 submission

---
 .../interpretation/dutchf3_patch/test.py |  6 ++
 scripts/byod_competition.py              | 12 ++--
 scripts/seg20_test_process.py            | 59 +++++++++++++++++++
 3 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 scripts/seg20_test_process.py

diff --git a/experiments/interpretation/dutchf3_patch/test.py b/experiments/interpretation/dutchf3_patch/test.py
index bfa3c3f1..9e5275f8 100644
--- a/experiments/interpretation/dutchf3_patch/test.py
+++ b/experiments/interpretation/dutchf3_patch/test.py
@@ -474,9 +474,14 @@ def _evaluate_split(
         del pred_sum
         _compute_3D_metrics(gt_labels, pred, n_classes, split)
         np.save(os.path.join(output_dir, f"test_simple_avg_split_{split}.npy"), pred)
+        # use existing SEGY file as a template to write our data into
+        SEGY_INFILE=f"/data/seismic/TestData_Image{split[-1]}.segy"
+        if os.path.isfile(SEGY_INFILE):  # input segy file is the ground truth here
+            # adjust for competition to make classes start from 1 and not 0
+            pred += 1
             write_segy(os.path.join(output_dir, f"pred_simple_avg_split_{split}.segy"), SEGY_INFILE, pred.swapaxes(0, 2))
         else:
             # write array into segy using array dimensions for # of inlines and crosslines
@@ -495,6 +500,7 @@ def _evaluate_split(
         # use existing SEGY file as a template to write our data into
         if os.path.isfile(SEGY_INFILE):  # input segy file is the ground truth here
+            pred += 1
             write_segy(os.path.join(output_dir, f"pred_geometric_avg_split_{split}.segy"), SEGY_INFILE, pred.swapaxes(0, 2))
         else:
             # write array into segy using array dimensions for # of inlines and crosslines
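`write_segy` above reuses the competition's input SEGY file as a geometry template, and the `pred += 1` shift converts the model's 0-based classes to the competition's 1-based labels before writing. For readers without the repo helper, here is a hedged sketch of the same template-copy idea using segyio directly, following segyio's documented copy recipe; function and file names are illustrative and the actual helper may differ:

```python
import numpy as np
import segyio

def write_cube_like(template_path, out_path, cube):
    """Copy geometry and headers from template_path; replace trace samples with cube.

    cube is laid out (ilines, xlines, samples), as returned by segyio.tools.cube;
    traces are assumed iline-major, matching that layout.
    """
    with segyio.open(template_path) as src:
        spec = segyio.tools.metadata(src)
        with segyio.create(out_path, spec) as dst:
            dst.text[0] = src.text[0]  # textual header
            dst.bin = src.bin          # binary header
            dst.header = src.header    # per-trace headers
            for i, trace in enumerate(cube.reshape(-1, cube.shape[-1])):
                dst.trace[i] = np.ascontiguousarray(trace, dtype=np.float32)

# e.g. write_cube_like("/data/seismic/TestData_Image1.segy",
#                      "pred_simple_avg_split_test1.segy", pred + 1)
```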
diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py
index 978d24c5..3bc4d4a8 100644
--- a/scripts/byod_competition.py
+++ b/scripts/byod_competition.py
@@ -5,7 +5,7 @@
 Run example:
 python byod_competition.py --train --label --outdir
-python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both
+python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both
 
 # information to include in configuration file when running:
 
@@ -31,11 +31,10 @@
 INFO:root:STANDARD DEVIATION
 INFO:root:0.31477982
 
-
 # kick off run as:
 python byod_competition.py --train /home/maxkaz/data/seismic/TrainingData_Image.segy --label /home/maxkaz/data/seismic/TrainingData_Labels.segy --outdir /home/maxkaz/data/seismic
-python prepare_dutchf3.py split_train_val patch --data_dir=/home/maxkaz/data/seismic --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both
+python prepare_dutchf3.py split_train_val patch --data_dir=/home/maxkaz/data/seismic --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both
 
 NGPU=2
 python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \
@@ -66,6 +65,8 @@
 TEST.SPLIT 'both' --cfg configs/unet.yaml > unet.log 2>&1 &
 
+nohup time python test.py DATASET.ROOT "/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 TEST.SPLIT 'both' MODEL.PRETRAINED /home/maxkaz/Downloads/hrnetv2_w48_imagenet_pretrained.pth TEST.MODEL_PATH --cfg configs/hrnet.yaml
+
 """
 from deepseismic_interpretation.data import read_segy
@@ -109,6 +110,7 @@ def main(args):
     logging.info("Running 3-sigma clipping")
     clip_scaling = 3.0
     mean, std = data.mean(), data.std()
+    logging.info(f"mean {mean} std {std}")
     data[data > mean + clip_scaling * std] = mean + clip_scaling * std
     data[data < mean - clip_scaling * std] = mean - clip_scaling * std
@@ -122,7 +124,9 @@
 
     # rescale to be within a certain range
     range_min, range_max = -1.0, 1.0
-    data_std = (data - data.min()) / (data.max() - data.min())
+    data_min, data_max = data.min(), data.max()  # avoid shadowing the min/max builtins
+    logging.info(f"min {data_min} max {data_max}")
+    data_std = (data - data_min) / (data_max - data_min)
     data = data_std * (range_max - range_min) + range_min
 
     """

diff --git a/scripts/seg20_test_process.py b/scripts/seg20_test_process.py
new file mode 100644
index 00000000..16d6832a
--- /dev/null
+++ b/scripts/seg20_test_process.py
@@ -0,0 +1,59 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Custom one-off script to process the SEG20 competition test dataset.
+"""
+
+from deepseismic_interpretation.data import read_segy
+
+""" libraries """
+import segyio
+
+import numpy as np
+from scipy import stats
+import os
+
+np.set_printoptions(linewidth=200)
+import logging
+
+# toggle to WARNING when running in production, or use CLI
+logging.getLogger().setLevel(logging.DEBUG)
+
+# dataset locations
+N_CLASSES = 6
+TEST1 = "/data/seismic/TestData_Image1.segy"
+TEST2 = "/data/seismic/TestData_Image2.segy"
+# output location
+OUTDIR = "/data/seismic/test_once"
+# enter these from byod_competition logging output - computed on the training set
+MEAN = 0.676609992980957
+STD = 390.308837890625
+MIN = -1170.2498779296875
+MAX = 1171.6031494140625
+
+
+def process_test(infile, outdir, n_set):
+
+    logging.info("loading data")
+    data, _ = read_segy(infile)
+
+    logging.info("Running 3-sigma clipping")
+    clip_scaling = 3.0
+    data[data > MEAN + clip_scaling * STD] = MEAN + clip_scaling * STD
+    data[data < MEAN - clip_scaling * STD] = MEAN - clip_scaling * STD
+
+    # Make data cube fast to access
+    logging.info("Adjusting precision")
+    data = np.ascontiguousarray(data, "float32")
+
+    # rescale to be within a certain range
+    range_min, range_max = -1.0, 1.0
+    data_std = (data - MIN) / (MAX - MIN)
+    data = data_std * (range_max - range_min) + range_min
+
+    random_test_labels = np.random.randint(0, N_CLASSES, data.shape, dtype='uint8')  # placeholder labels; high bound is exclusive
+    np.save(os.path.join(outdir, f"test{n_set}_seismic.npy"), data)
+    np.save(os.path.join(outdir, f"test{n_set}_labels.npy"), random_test_labels)
+
+
+process_test(TEST1, OUTDIR, 1)
+process_test(TEST2, OUTDIR, 2)

From 9782338827d4e7f868b66ece9ef3496740da5284 Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Fri, 18 Sep 2020 18:17:40 -0400
Subject: [PATCH 14/15] added script to check class distributions

---
 scripts/seg20_check_distrib.py | 48 ++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 scripts/seg20_check_distrib.py
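seg20_test_process.py applies the same clip-then-rescale transform as byod_competition.py, but with statistics frozen from the training run; note that MIN and MAX equal MEAN -/+ 3 * STD, since they were logged after clipping, so the rescale band coincides with the clip band. A standalone sketch of the transform, with illustrative numbers rather than the real statistics:

```python
import numpy as np

def clip_and_rescale(data, mean, std, n_sigma=3.0):
    """Clip to mean +/- n_sigma*std, then min-max rescale that band to [-1, 1]."""
    lo, hi = mean - n_sigma * std, mean + n_sigma * std
    data = np.ascontiguousarray(data, "float32")
    data = np.clip(data, lo, hi)
    return (data - lo) / (hi - lo) * 2.0 - 1.0

# illustrative cube and statistics, not the values logged by byod_competition.py
cube = (np.random.randn(50, 50, 50) * 400.0).astype("float32")
out = clip_and_rescale(cube, mean=float(cube.mean()), std=float(cube.std()))
print(out.min(), out.max())  # both endpoints land inside [-1, 1]
```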
diff --git a/scripts/seg20_check_distrib.py b/scripts/seg20_check_distrib.py
new file mode 100644
index 00000000..7eddfe64
--- /dev/null
+++ b/scripts/seg20_check_distrib.py
@@ -0,0 +1,48 @@
+# checks distribution across classes in the new SEG20 competition
+
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Custom one-off script to check class distributions in the SEG20 competition labels and predictions.
+"""
+import collections
+
+from deepseismic_interpretation.data import read_segy
+
+""" libraries """
+import segyio
+
+import numpy as np
+from scipy import stats
+import os
+
+np.set_printoptions(linewidth=200)
+import logging
+
+# toggle to WARNING when running in production, or use CLI
+logging.getLogger().setLevel(logging.DEBUG)
+
+# dataset locations
+N_CLASSES = 6
+TRAIN = "/data/seismic_orig/TrainingData_Labels.segy"
+TEST1 = "/home/maxkaz/Desktop/pred_simple_avg_split_test1.segy"
+TEST2 = "/home/maxkaz/Desktop/pred_simple_avg_split_test2.segy"
+
+
+def check(infile):
+
+    data, _ = read_segy(infile)
+    n = data.size
+    counts = collections.Counter(data.astype(int).flatten().tolist())
+    ccounts = 0
+    for k in range(1, N_CLASSES + 1):
+        ccounts += counts[k]
+        if k in counts:
+            print(f"{k}: {float(counts[k])/n} = {counts[k]} / {n}")
+    print(f"coverage {ccounts/n}")
+
+
+check(TRAIN)
+check(TEST1)
+check(TEST2)
+
+logging.info("done")

From 647901761e1c042563420892de61bc3495dbeab3 Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Fri, 18 Sep 2020 18:18:20 -0400
Subject: [PATCH 15/15] remove unnecessary libraries

---
 scripts/seg20_check_distrib.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/scripts/seg20_check_distrib.py b/scripts/seg20_check_distrib.py
index 7eddfe64..792bcc5e 100644
--- a/scripts/seg20_check_distrib.py
+++ b/scripts/seg20_check_distrib.py
@@ -11,11 +11,7 @@
 from deepseismic_interpretation.data import read_segy
 
 """ libraries """
-import segyio
-
 import numpy as np
-from scipy import stats
-import os
 
 np.set_printoptions(linewidth=200)
 import logging
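One practical note on check() above: Counter over `.flatten().tolist()` materializes every voxel as a Python object, which gets slow and memory-hungry on full cubes. A vectorized equivalent using np.unique, assuming the same 1-based competition labels:

```python
import numpy as np

def class_fractions(labels, n_classes=6):
    """Per-class voxel fractions and total coverage of classes 1..n_classes."""
    values, counts = np.unique(labels.astype(int), return_counts=True)
    fractions = {int(v): c / labels.size for v, c in zip(values, counts)}
    coverage = sum(fractions.get(k, 0.0) for k in range(1, n_classes + 1))
    return fractions, coverage

# e.g. on a prediction cube loaded with read_segy:
# data, _ = read_segy(TEST1)
# print(class_fractions(data))
```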