diff --git a/README.md b/README.md index c28d22e8..7cf50124 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ wget -o /dev/null -O dataset.h5 https://zenodo.org/record/3924682/files/dataset. # convert penobscot python byod_penobscot.py --filename dataset.h5 --outdir # preprocess for experiments -python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100 +python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both ``` ### Run Examples @@ -226,12 +226,20 @@ For seismic interpretation (segmentation), if you want to visualize cross-sectio To install [segyviewer](https://github.com/equinor/segyviewer) run: ```bash -conda env create -n segyviewer python=2.7 +conda create -n segyviewer python=2.7 conda activate segyviewer -conda install -c anaconda pyqt=4.11.4 +sudo apt install libqtwebkit4 +conda install -c conda-forge pyqt=4.11.4 pip install segyviewer ``` +If you run into any QtPy4 related problems after the installation, try running: +```bash +sudo add-apt-repository ppa:rock-core/qt4 +sudo apt update +sudo apt install libqt4-designer libqt4-opengl libqt4-svg libqtgui4 libqtwebkit4 +``` + To visualize cross-sections of a 3D volume, you can run [segyviewer](https://github.com/equinor/segyviewer) like so: ```bash diff --git a/cv_lib/cv_lib/segmentation/lovasz_losses.py b/cv_lib/cv_lib/segmentation/lovasz_losses.py new file mode 100644 index 00000000..ab989e7e --- /dev/null +++ b/cv_lib/cv_lib/segmentation/lovasz_losses.py @@ -0,0 +1,258 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +DO NOT REMOVE THIS COMMENT +Lovasz-Softmax and Jaccard hinge loss in PyTorch +Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License) +taken from https://github.com/bermanmaxim/LovaszSoftmax under MIT license +""" + +from __future__ import print_function, division + +import torch +from torch.autograd import Variable +import torch.nn.functional as F +import numpy as np + +try: + from itertools import ifilterfalse +except ImportError: # py3k + from itertools import filterfalse as ifilterfalse + + +def lovasz_grad(gt_sorted): + """ + Computes gradient of the Lovasz extension w.r.t sorted errors + See Alg. 1 in paper + """ + p = len(gt_sorted) + gts = gt_sorted.sum() + intersection = gts - gt_sorted.float().cumsum(0) + union = gts + (1 - gt_sorted).float().cumsum(0) + jaccard = 1. 
- intersection / union + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True): + """ + IoU for foreground class + binary: 1 foreground, 0 background + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + intersection = ((label == 1) & (pred == 1)).sum() + union = ((label == 1) | ((pred == 1) & (label != ignore))).sum() + if not union: + iou = EMPTY + else: + iou = float(intersection) / float(union) + ious.append(iou) + iou = mean(ious) # mean accross images if per_image + return 100 * iou + + +def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False): + """ + Array of IoU for each (non ignored) class + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + iou = [] + for i in range(C): + if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes) + intersection = ((label == i) & (pred == i)).sum() + union = ((label == i) | ((pred == i) & (label != ignore))).sum() + if not union: + iou.append(EMPTY) + else: + iou.append(float(intersection) / float(union)) + ious.append(iou) + ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image + return 100 * np.array(ious) + + +# --------------------------- BINARY LOSSES --------------------------- + + +def lovasz_hinge(logits, labels, per_image=True, ignore=None): + """ + Binary Lovasz hinge loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + per_image: compute the loss per image instead of per batch + ignore: void class id + """ + if per_image: + loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore)) + for log, lab in zip(logits, labels)) + else: + loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore)) + return loss + + +def lovasz_hinge_flat(logits, labels): + """ + Binary Lovasz hinge loss + logits: [P] Variable, logits at each prediction (between -\infty and +\infty) + labels: [P] Tensor, binary ground truth labels (0 or 1) + ignore: label to ignore + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels.float() - 1. + errors = (1. 
- logits * Variable(signs)) + errors_sorted, perm = torch.sort(errors, dim=0, descending=True) + perm = perm.data + gt_sorted = labels[perm] + grad = lovasz_grad(gt_sorted) + loss = torch.dot(F.relu(errors_sorted), Variable(grad)) + return loss + + +def flatten_binary_scores(scores, labels, ignore=None): + """ + Flattens predictions in the batch (binary case) + Remove labels equal to 'ignore' + """ + scores = scores.view(-1) + labels = labels.view(-1) + if ignore is None: + return scores, labels + valid = (labels != ignore) + vscores = scores[valid] + vlabels = labels[valid] + return vscores, vlabels + + +class StableBCELoss(torch.nn.modules.Module): + def __init__(self): + super(StableBCELoss, self).__init__() + + def forward(self, input, target): + neg_abs = - input.abs() + loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log() + return loss.mean() + + +def binary_xloss(logits, labels, ignore=None): + """ + Binary Cross entropy loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + ignore: void class id + """ + logits, labels = flatten_binary_scores(logits, labels, ignore) + loss = StableBCELoss()(logits, Variable(labels.float())) + return loss + + +# --------------------------- MULTICLASS LOSSES --------------------------- + + +def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=None): + """ + Multi-class Lovasz-Softmax loss + probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). + Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. + labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + per_image: compute the loss per image instead of per batch + ignore: void class labels + """ + if per_image: + loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes) + for prob, lab in zip(probas, labels)) + else: + loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes) + return loss + + +def lovasz_softmax_flat(probas, labels, classes='present'): + """ + Multi-class Lovasz-Softmax loss + probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) + labels: [P] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + if probas.numel() == 0: + # only void pixels, the gradients should be 0 + return probas * 0. 
+ C = probas.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes + for c in class_to_sum: + fg = (labels == c).float() # foreground for class c + if (classes is 'present' and fg.sum() == 0): + continue + if C == 1: + if len(classes) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probas[:, 0] + else: + class_pred = probas[:, c] + errors = (Variable(fg) - class_pred).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + perm = perm.data + fg_sorted = fg[perm] + losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) + return mean(losses) + + +def flatten_probas(probas, labels, ignore=None): + """ + Flattens predictions in the batch + """ + if probas.dim() == 3: + # assumes output of a sigmoid layer + B, H, W = probas.size() + probas = probas.view(B, 1, H, W) + B, C, H, W = probas.size() + probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C + labels = labels.view(-1) + if ignore is None: + return probas, labels + valid = (labels != ignore) + vprobas = probas[valid.nonzero().squeeze()] + vlabels = labels[valid] + return vprobas, vlabels + + +def xloss(logits, labels, ignore=None): + """ + Cross entropy loss + """ + return F.cross_entropy(logits, Variable(labels), ignore_index=255) + + +# --------------------------- HELPER FUNCTIONS --------------------------- +def isnan(x): + return x != x + + +def mean(l, ignore_nan=False, empty=0): + """ + nanmean compatible with generators. + """ + l = iter(l) + if ignore_nan: + l = ifilterfalse(isnan, l) + try: + n = 1 + acc = next(l) + except StopIteration: + if empty == 'raise': + raise ValueError('Empty mean') + return empty + for n, v in enumerate(l, 2): + acc += v + if n == 1: + return acc + return acc / n diff --git a/cv_lib/cv_lib/segmentation/models/unet.py b/cv_lib/cv_lib/segmentation/models/unet.py index ddb9197f..d73b75c1 100644 --- a/cv_lib/cv_lib/segmentation/models/unet.py +++ b/cv_lib/cv_lib/segmentation/models/unet.py @@ -4,6 +4,8 @@ """ Taken from https://github.com/milesial/Pytorch-UNet """ +import os + import torch import torch.nn as nn import torch.nn.functional as F diff --git a/environment/anaconda/local/environment.yml b/environment/anaconda/local/environment.yml index cf7078c1..503dd2b1 100644 --- a/environment/anaconda/local/environment.yml +++ b/environment/anaconda/local/environment.yml @@ -19,6 +19,7 @@ dependencies: - papermill>=1.0.1 - jupyterlab - pip: + - memory_profiler - segyio==1.8.8 - pytorch-ignite==0.3.0 - fire==0.2.1 diff --git a/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml b/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml index 94921bf7..ba8d4236 100644 --- a/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml +++ b/experiments/interpretation/dutchf3_patch/configs/hrnet.yaml @@ -67,7 +67,7 @@ MODEL: FUSE_METHOD: SUM TRAIN: - BATCH_SIZE_PER_GPU: 16 + BATCH_SIZE_PER_GPU: 32 BEGIN_EPOCH: 0 END_EPOCH: 300 MIN_LR: 0.001 diff --git a/experiments/interpretation/dutchf3_patch/configs/unet.yaml b/experiments/interpretation/dutchf3_patch/configs/unet.yaml index 5ae1ee45..0597b8fa 100644 --- a/experiments/interpretation/dutchf3_patch/configs/unet.yaml +++ b/experiments/interpretation/dutchf3_patch/configs/unet.yaml @@ -11,7 +11,7 @@ WORKERS: 4 PRINT_FREQ: 10 LOG_CONFIG: logging.conf SEED: 2019 - +OPENCV_BORDER_CONSTANT: 0 DATASET: NUM_CLASSES: 6 @@ -21,7 +21,7 @@ DATASET: MAX: 1 MODEL: - NAME: resnet_unet + NAME: unet IN_CHANNELS: 3 diff --git 
a/experiments/interpretation/dutchf3_patch/test.py b/experiments/interpretation/dutchf3_patch/test.py index 2efeee85..9e5275f8 100644 --- a/experiments/interpretation/dutchf3_patch/test.py +++ b/experiments/interpretation/dutchf3_patch/test.py @@ -21,17 +21,26 @@ import fire import numpy as np +import segyio +from sklearn import metrics import torch import torch.nn.functional as F from albumentations import Compose, Normalize, PadIfNeeded, Resize from toolz import compose, curry, itertoolz, pipe, take from torch.utils import data +from deepseismic_interpretation.data import write_segy + from cv_lib.segmentation import models from cv_lib.segmentation.dutchf3.utils import current_datetime, git_branch, git_hash from cv_lib.utils import load_log_configuration, mask_to_disk, generate_path, image_to_disk -from deepseismic_interpretation.dutchf3.data import add_patch_depth_channels, get_test_loader +from deepseismic_interpretation.dutchf3.data import ( + add_patch_depth_channels, + get_test_loader, + _test1_labels_for, + _test2_labels_for, +) from default import _C as config from default import update_config @@ -44,23 +53,47 @@ "zechstein", ] +# we can optionally supply a segy file whose geometry we will use to write out 3D test set predictions +# if it doesn't exist, we will write a blank segy file with same dimensions as the predictions array +SEGY_INFILE = "/data/seismic/TrainingData_Labels.segy" + class runningScore(object): def __init__(self, n_classes): self.n_classes = n_classes self.confusion_matrix = np.zeros((n_classes, n_classes)) + # @profile def _fast_hist(self, label_true, label_pred, n_class): + """ + speed-optimized but not memory-optimized version of the confusion matrix calculation + """ + # logger = logging.getLogger(__name__) mask = (label_true >= 0) & (label_true < n_class) - hist = np.bincount(n_class * label_true[mask].astype(int) + label_pred[mask], minlength=n_class ** 2,).reshape( - n_class, n_class - ) + bincount_arg = n_class * label_true[mask].astype(int) + label_pred[mask] + # logger.info("bincount operation starting...") + hist = np.bincount(bincount_arg, minlength=n_class ** 2,) + hist = hist.reshape(n_class, n_class) + # logger.info("finished") return hist - def update(self, label_trues, label_preds): + # @profile + def _confusion_matrix(self, label_true, label_pred, n_class): + """ + memory-optimized but not speed-optimized version of the confusion matrix calculation + """ + mask = (label_true >= 0) & (label_true < n_class) + return metrics.confusion_matrix(label_true[mask], label_pred[mask], labels=list(range(n_class))) + + # @profile + def update(self, label_trues, label_preds, fast_hist=True): for lt, lp in zip(label_trues, label_preds): - self.confusion_matrix += self._fast_hist(lt.flatten(), lp.flatten(), self.n_classes) + if fast_hist: + self.confusion_matrix += self._fast_hist(lt.flatten(), lp.flatten(), self.n_classes) + else: + self.confusion_matrix += self._confusion_matrix(lt.flatten(), lp.flatten(), self.n_classes) + # @profile def get_scores(self): """Returns accuracy score evaluation result. - overall accuracy @@ -94,6 +127,49 @@ def reset(self): self.confusion_matrix = np.zeros((self.n_classes, self.n_classes)) +def _compute_3D_metrics(gt_labels, pred, n_classes, split): + """ + Compute 3D metrics on two 3D arrays. A good test case is to set gt==pred. 
+ + Args: + gt: ground truth 3D numpy array + pred: predictions 3D array + n_classes: number of classes + split: which test set split we're computing + + Returns: + Nothing - stdout print + + """ + + logger = logging.getLogger(__name__) + + score = runningScore(n_classes) + score.update(gt_labels, pred, fast_hist=True) + + score, class_iou = score.get_scores() + + logger.info(f"--------------- 3D RESULTS {split} -----------------") + logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.4f}') + + logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.4f}') + for cdx, class_name in enumerate(_CLASS_NAMES): + logger.info(f' class {cdx} named {class_name} accuracy {score["Class Accuracy: "][cdx]:.4f}') + + logger.info(f'Mean IoU: {score["Mean IoU: "]:0.4f}') + + for cdx, class_name in enumerate(_CLASS_NAMES): + logger.info(f" class {cdx} named {class_name} IoU {class_iou[cdx]:.4f}") + logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.4f}') + + # Save confusion matrix: + logger.info("writing confusion matrix") + confusion = score["confusion_matrix"] + np.savetxt(f"confusion_split_{split}.csv", confusion, delimiter=" ") + + logger.info("----------------- 3D DONE ---------------------------") + + def _transform_CHW_to_HWC(numpy_array): return np.moveaxis(numpy_array, 0, -1) @@ -307,11 +383,15 @@ def _evaluate_split( running_metrics_split = runningScore(n_classes) + n_inlines, n_crosslines, n_depth = test_set.labels.shape + accum_inline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines), dtype=np.float32) + accum_crossline = np.zeros((n_classes, n_depth, n_crosslines, n_inlines), dtype=np.float32) + # evaluation mode: with torch.no_grad(): # operations inside don't track history model.eval() for i, (images, labels) in enumerate(test_loader): - logger.info(f"split: {split}, section: {i}") + logger.info(f"split: {split}, section: {test_set.sections[i]}") outputs = _patch_label_2d( model, images, @@ -328,6 +408,21 @@ def _evaluate_split( config.DATASET.MAX, ) + # for debugging, if you set this to GT then you can test if + # the reconstructions matches test_set.labels + preds_numpy = outputs.detach().squeeze().numpy().astype(np.float32) + + # direction is channel x depth x crossline x inline + + # dealing with inline + if test_set.sections[i].startswith("i"): + accum_inline[:, :, :, i] = preds_numpy + # dealing with crossline + elif test_set.sections[i].startswith("x"): + accum_crossline[:, :, i-n_inlines, :] = preds_numpy + else: + raise Exception("we need either an inline or crossline split") + pred = outputs.detach().max(1)[1].numpy() gt = labels.numpy() if debug: @@ -364,6 +459,57 @@ def _evaluate_split( logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}') running_metrics_split.reset() + ###################################################################### + # 3D: now compute metrics on full 3D volume + ###################################################################### + + gt_labels = test_set.labels.swapaxes(0, 2).astype(np.uint8) + assert gt_labels.shape == accum_inline.shape[1:] + assert gt_labels.shape == accum_crossline.shape[1:] + + # compute mIoU here + logging.info("Simple average") + pred_sum = accum_inline + accum_crossline + pred = pred_sum.argmax(0).astype(np.uint8) + del pred_sum + _compute_3D_metrics(gt_labels, pred, n_classes, split) + np.save(os.path.join(output_dir, f"test_simple_avg_split_{split}.npy"), pred) + + # use existing SEGY file as a template to write our data into + SEGY_INFILE=f"/data/seismic/TestData_Image{split[-1]}.segy" + + if 
os.path.isfile(SEGY_INFILE): + # input segy file is the ground truth here + # adjust for competition to make classes start from 1 and not 0 + pred += 1 + write_segy(os.path.join(output_dir, f"pred_simple_avg_split_{split}.segy"), SEGY_INFILE, pred.swapaxes(0, 2)) + else: + # write array into segy using array dimensions for # of inlines and crosslines + # make sure directions are inline, crossline, depth + logging.info("writing segy files") + segyio.tools.from_array3D(os.path.join(output_dir, f"pred_simple_avg_split_{split}.segy"), pred.swapaxes(0, 2), dt=1000) + segyio.tools.from_array3D(os.path.join(output_dir, f"groundtruth_simple_avg_split_{split}.segy"), gt_labels.swapaxes(0, 2), dt=1000) + logging.info("done") + + logging.info("Geometric average") + pred_sum = np.sqrt(accum_inline * accum_crossline) + pred = pred_sum.argmax(0).astype(np.uint8) + del pred_sum + _compute_3D_metrics(gt_labels, pred, n_classes, split) + np.save(os.path.join(output_dir, f"test_geometric_avg_split_{split}.npy"), pred) + # use existing SEGY file as a template to write our data into + if os.path.isfile(SEGY_INFILE): + # input segy file is the ground truth here + pred += 1 + write_segy(os.path.join(output_dir, f"pred_geometric_avg_split_{split}.segy"), SEGY_INFILE, pred.swapaxes(0, 2)) + else: + # write array into segy using array dimensions for # of inlines and crosslines + # make sure directions are inline, crossline, depth + logging.info("writing segy files") + segyio.tools.from_array3D(os.path.join(output_dir,f"pred_geometric_avg_split_{split}.segy"), pred.swapaxes(0, 2), dt=1000) + segyio.tools.from_array3D(os.path.join(output_dir,f"groundtruth_geometric_avg_split_{split}.segy"), gt_labels.swapaxes(0, 2), dt=1000) + logging.info("done") + def _write_section_file(labels, section_file): # define indices of the array diff --git a/experiments/interpretation/dutchf3_patch/train.py b/experiments/interpretation/dutchf3_patch/train.py index fc585817..5076c541 100644 --- a/experiments/interpretation/dutchf3_patch/train.py +++ b/experiments/interpretation/dutchf3_patch/train.py @@ -37,6 +37,7 @@ from cv_lib.segmentation.dutchf3.utils import current_datetime, git_branch, git_hash from cv_lib.segmentation.metrics import class_accuracy, class_iou, mean_class_accuracy, mean_iou, pixelwise_accuracy from cv_lib.utils import generate_path, load_log_configuration +from cv_lib.segmentation import lovasz_losses as L from deepseismic_interpretation.dutchf3.data import get_patch_loader from default import _C as config from default import update_config @@ -95,7 +96,11 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F if distributed: # FOR DISTRIBUTED: Set the device according to local_rank. - torch.cuda.set_device(local_rank) + # if we're running on a single GPU (multi-GPU development), set to the same GPU + if torch.cuda.device_count()==1: + torch.cuda.set_device(0) + else: + torch.cuda.set_device(local_rank) # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will # provide environment variables, and requires that you use init_method=`env://`. 
@@ -232,7 +237,8 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) # Loss: - criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") + #criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") + criterion = lambda x, y: L.lovasz_softmax(x, y, classes = list(range(n_classes)), ignore=255, per_image = True) # Model: if distributed: diff --git a/interpretation/deepseismic_interpretation/data.py b/interpretation/deepseismic_interpretation/data.py index 53ec7f9f..a3ad5008 100644 --- a/interpretation/deepseismic_interpretation/data.py +++ b/interpretation/deepseismic_interpretation/data.py @@ -5,6 +5,10 @@ import math from collections import defaultdict +import logging +# toggle to WARNING when running in production, or use CLI +logging.getLogger().setLevel(logging.DEBUG) + import numpy as np import torch from PIL import Image @@ -13,6 +17,11 @@ from torch.utils.data import Dataset from torchvision.datasets.utils import iterable_to_str, verify_str_arg +import segyio + +from shutil import copyfile + + _open_to_array = compose(np.array, Image.open) @@ -20,6 +29,73 @@ class DataNotSplitException(Exception): pass +def read_segy(filename): + """ + Read in a SEGY-format file given a filename + + Args: + filename: input filename + + Returns: + numpy data array and its info as a dictionary (tuple) + + """ + logging.info(f"Loading data cube from {filename}") + + # Read full data cube + data = segyio.tools.cube(filename) + + # Read meta data + segyfile = segyio.open(filename, "r") + print(" Crosslines: ", segyfile.xlines[0], ":", segyfile.xlines[-1]) + print(" Inlines: ", segyfile.ilines[0], ":", segyfile.ilines[-1]) + print(" Timeslices: ", "1", ":", data.shape[2]) + + # Make dict with cube-info + # TODO: read this from segy + # Read dt and other params needed to do create a new + data_info = { + "crossline_start": segyfile.xlines[0], + "inline_start": segyfile.ilines[0], + "timeslice_start": 1, + "shape": data.shape, + } + + return data, data_info + + +def write_segy(out_filename, in_filename, out_cube): + """ + Writes out_cube to a segy-file (out_filename) with same header/size as in_filename + + Args: + out_filename: output filename + in_filename: input file, whose metadata will be copied + out_cube: array which we write to out_filename + + Returns: + Nothing + """ + + logging.info("Writing interpretation to " + out_filename) + + # Copy segy file + copyfile(in_filename, out_filename) + + # Open out-file + with segyio.open(out_filename, "r+") as src: + + iline_start = src.ilines[0] + # set type to inlines + dtype = src.iline[iline_start].dtype + + # loop through inlines and insert output + for i in src.ilines: + iline = out_cube[i - iline_start, :, :] + src.iline[i] = np.ascontiguousarray(iline.astype(dtype)) + + logging.info("Writing interpretation - finished") + def _get_classes_and_counts(mask_list): class_counts_dict = defaultdict(int) for mask in mask_list: diff --git a/scripts/byod_competition.py b/scripts/byod_competition.py new file mode 100644 index 00000000..3bc4d4a8 --- /dev/null +++ b/scripts/byod_competition.py @@ -0,0 +1,222 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +""" +Run example: + +python byod_competition.py --train --label --outdir +python prepare_dutchf3.py split_train_val patch --data_dir= --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both + +# information to include in configuration file when running: + +clip +INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] +INFO:root:MEAN +INFO:root:4.183678e-05 +INFO:root:STANDARD DEVIATION +INFO:root:0.31477982 + +noclip +INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] +INFO:root:MEAN +INFO:root:0.0043642526 +INFO:root:STANDARD DEVIATION +INFO:root:0.07544233 + +reduced test size +INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] +INFO:root:[0.84979262 0.57790153 0.95866329 0.71236326 0.99004844 0.91123086] +INFO:root:MEAN +INFO:root:4.183678e-05 +INFO:root:STANDARD DEVIATION +INFO:root:0.31477982 + +# kick off run as: + +python byod_competition.py --train /home/maxkaz/data/seismic/TrainingData_Image.segy --label /home/maxkaz/data/seismic/TrainingData_Labels.segy --outdir /home/maxkaz/data/seismic +python prepare_dutchf3.py split_train_val patch --data_dir=/home/maxkaz/data/seismic --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both + +NGPU=2 +python -m torch.distributed.launch --nproc_per_node=${NGPU} train.py \ +TRAIN.BATCH_SIZE_PER_GPU 2 VALIDATION.BATCH_SIZE_PER_GPU 2 \ +DATASET.ROOT "/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ +--distributed --cfg configs/seresnet_unet.yaml + +nohup time \ +python -m torch.distributed.launch --nproc_per_node=4 train.py \ +DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ +--distributed --cfg configs/seresnet_unet.yaml > se.log 2>&1 & + +nohup time \ +python -m torch.distributed.launch --nproc_per_node=4 train.py \ +MODEL.PRETRAINED "/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth" \ +DATASET.ROOT "/home/maxkaz/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ +--distributed --cfg configs/hrnet.yaml > hr.log 2>&1 & + +Scoring: + +nohup time \ +python test.py \ +DATASET.ROOT "/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" \ +TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 \ +TEST.SPLIT 'both' +--cfg configs/unet.yaml > unet.log 2>&1 & + +nohup time python test.py DATASET.ROOT "/data/seismic" DATASET.NUM_CLASSES 6 DATASET.CLASS_WEIGHTS "[0.84979262, 0.57790153, 0.95866329, 0.71236326, 0.99004844, 0.91123086]" TRAIN.MEAN 0.0 TRAIN.STD 0.31477982 TEST.SPLIT 'both' MODEL.PRETRAINED /home/maxkaz/Downloads/hrnetv2_w48_imagenet_pretrained.pth TEST.MODEL_PATH --cfg configs/hrnet.yaml + +""" + +from deepseismic_interpretation.data import read_segy + +""" libraries """ +import segyio + +import numpy as np +from scipy import stats +import os + +np.set_printoptions(linewidth=200) +import logging + +# toggle to WARNING when running in production, or use CLI +logging.getLogger().setLevel(logging.DEBUG) +# logging.getLogger().setLevel(logging.WARNING) +import argparse + +parser = argparse.ArgumentParser() 
+
+""" useful information when running from a GIT folder."""
+myname = os.path.realpath(__file__)
+mypath = os.path.dirname(myname)
+myname = os.path.basename(myname)
+
+
+def main(args):
+    """
+    Converts the SEG competition SEGY training image and label volumes into the train/test_once NumPy layout used by the DeepSeismic experiments
+    """
+
+    logging.info("loading data")
+    data, _ = read_segy(args.train)
+    labels, _ = read_segy(args.label)
+
+    assert labels.min() == 1.0
+    n_classes = labels.max()
+    assert n_classes == N_CLASSES
+
+    logging.info("Running 3-sigma clipping")
+    clip_scaling = 3.0
+    mean, std = data.mean(), data.std()
+    logging.info(f"mean {mean} std {std}")
+    data[data > mean + clip_scaling * std] = mean + clip_scaling * std
+    data[data < mean - clip_scaling * std] = mean - clip_scaling * std
+
+    # Make data cube fast to access
+    logging.info("Adjusting precision")
+    data = np.ascontiguousarray(data, "float32")
+    labels = np.ascontiguousarray(labels, "uint8")
+
+    # adjust labels to start from zero
+    labels -= 1
+
+    # rescale to be within a certain range
+    range_min, range_max = -1.0, 1.0
+    min, max = data.min(), data.max()
+    logging.info(f"min {min} max {max}")
+    data_std = (data - min) / (max - min)
+    data = data_std * (range_max - range_min) + range_min
+
+    """
+    # cut off a buffer zone around the volume (to avoid mislabeled data):
+    buffer = 25
+    data = data[:, buffer:-buffer, buffer:-buffer]
+    labels = labels[:, buffer:-buffer, buffer:-buffer]
+    """
+
+    # time by crosslines by inlines
+    n_inlines = data.shape[0]
+    n_crosslines = data.shape[1]
+
+    inline_cut = int(np.floor(n_inlines * INLINE_FRACTION))
+    crossline_cut = int(np.floor(n_crosslines * CROSSLINE_FRACTION))
+
+    data_train = data[0:inline_cut, 0:crossline_cut, :]
+    data_test1 = data[inline_cut:n_inlines, :, :]
+    data_test2 = data[:, crossline_cut:n_crosslines, :]
+
+    labels_train = labels[0:inline_cut, 0:crossline_cut, :]
+    labels_test1 = labels[inline_cut:n_inlines, :, :]
+    labels_test2 = labels[:, crossline_cut:n_crosslines, :]
+
+    def mkdir(dirname):
+
+        if os.path.isdir(dirname) and os.path.exists(dirname):
+            return
+
+        if not os.path.isdir(dirname) and os.path.exists(dirname):
+            logging.info(f"remove file {dirname} and run this script again")
+
+        os.mkdir(dirname)
+
+    mkdir(args.outdir)
+    mkdir(os.path.join(args.outdir, "splits"))
+    mkdir(os.path.join(args.outdir, "train"))
+    mkdir(os.path.join(args.outdir, "test_once"))
+
+    np.save(os.path.join(args.outdir, "train", "train_seismic.npy"), data_train)
+    np.save(os.path.join(args.outdir, "train", "train_labels.npy"), labels_train)
+
+    np.save(os.path.join(args.outdir, "test_once", "test1_seismic.npy"), data_test1)
+    np.save(os.path.join(args.outdir, "test_once", "test1_labels.npy"), labels_test1)
+
+    np.save(os.path.join(args.outdir, "test_once", "test2_seismic.npy"), data_test2)
+    np.save(os.path.join(args.outdir, "test_once", "test2_labels.npy"), labels_test2)
+
+    # Compute class weights:
+    num_classes, class_count = np.unique(labels[:], return_counts=True)
+    # class_probabilities = np.histogram(labels[:], bins= , density=True)
+    class_weights = 1 - class_count / np.sum(class_count)
+    logging.info("CLASS WEIGHTS TO USE")
+    logging.info(class_weights)
+    mean, std = data.mean(), data.std()
+    logging.info("MEAN")
+    logging.info(mean)
+    logging.info("STANDARD DEVIATION")
+    logging.info(std)
+
+
+""" GLOBAL VARIABLES """
+INLINE_FRACTION = 0.9
+CROSSLINE_FRACTION = 0.9
+N_CLASSES = 6
+
+parser.add_argument("--train", help="Name of train data", type=str, required=True)
+parser.add_argument("--label", help="Name of train labels data", 
type=str, required=True) +parser.add_argument("--outdir", help="Output data directory location", type=str, required=True) + +""" main wrapper with profiler """ +if __name__ == "__main__": + main(parser.parse_args()) + +# pretty printing of the stack +""" + try: + logging.info('before main') + main(parser.parse_args()) + logging.info('after main') + except: + for frame in traceback.extract_tb(sys.exc_info()[2]): + fname,lineno,fn,text = frame + print ("Error in %s on line %d" % (fname, lineno)) +""" +# optionally enable profiling information +# import cProfile +# name = +# cProfile.run('main.run()', name + '.prof') +# import pstats +# p = pstats.Stats(name + '.prof') +# p.sort_stats('cumulative').print_stats(10) +# p.sort_stats('time').print_stats() diff --git a/scripts/byod_penobscot.py b/scripts/byod_penobscot.py index 8ccc8b6e..32e65d5f 100644 --- a/scripts/byod_penobscot.py +++ b/scripts/byod_penobscot.py @@ -120,7 +120,7 @@ def mkdir(dirname): """ GLOBAL VARIABLES """ INLINE_FRACTION = 0.7 -CROSSLINE_FRACTION = 1.0 +CROSSLINE_FRACTION = 0.78 N_CLASSES = 8 parser.add_argument("--filename", help="Name of HDF5 data", type=str, required=True) diff --git a/scripts/seg20_check_distrib.py b/scripts/seg20_check_distrib.py new file mode 100644 index 00000000..792bcc5e --- /dev/null +++ b/scripts/seg20_check_distrib.py @@ -0,0 +1,44 @@ +# checks distribution across classes in the new SEG20 competition + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Custom one-off script to process the SEG20 competition test dataset. +""" +import collections + +from deepseismic_interpretation.data import read_segy + +""" libraries """ +import numpy as np + +np.set_printoptions(linewidth=200) +import logging + +# toggle to WARNING when running in production, or use CLI +logging.getLogger().setLevel(logging.DEBUG) + +# dataset locations +N_CLASSES = 6 +TRAIN = "/data/seismic_orig/TrainingData_Labels.segy" +TEST1 = "/home/maxkaz/Desktop/pred_simple_avg_split_test1.segy" +TEST2 = "/home/maxkaz/Desktop/pred_simple_avg_split_test2.segy" + +def check(infile): + + data, _ = read_segy(infile) + n = data.size + counts = collections.Counter(data.astype(int).flatten().tolist()) + ccounts = 0 + for k in range(1,N_CLASSES+1): + ccounts += counts[k] + if k in counts: + print(f"{k}: {float(counts[k])/n} = {counts[k]} / {n}") + print(f"coverage {ccounts/n}") + +check(TRAIN) +check(TEST1) +check(TEST2) + +logging.info("done") diff --git a/scripts/seg20_test_process.py b/scripts/seg20_test_process.py new file mode 100644 index 00000000..16d6832a --- /dev/null +++ b/scripts/seg20_test_process.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Custom one-off script to process the SEG20 competition test dataset. 
+"""
+
+from deepseismic_interpretation.data import read_segy
+
+""" libraries """
+import segyio
+
+import numpy as np
+from scipy import stats
+import os
+
+np.set_printoptions(linewidth=200)
+import logging
+
+# toggle to WARNING when running in production, or use CLI
+logging.getLogger().setLevel(logging.DEBUG)
+
+# dataset locations
+N_CLASSES = 6
+TEST1 = "/data/seismic/TestData_Image1.segy"
+TEST2 = "/data/seismic/TestData_Image2.segy"
+# output location
+OUTDIR = "/data/seismic/test_once"
+# enter these from byod_competition logging output - computed on the training set
+MEAN = 0.676609992980957
+STD = 390.308837890625
+MIN = -1170.2498779296875
+MAX = 1171.6031494140625
+
+def process_test(infile, outdir, n_set):
+
+    logging.info("loading data")
+    data, _ = read_segy(infile)
+
+    logging.info("Running 3-sigma clipping")
+    clip_scaling = 3.0
+    data[data > MEAN + clip_scaling * STD] = MEAN + clip_scaling * STD
+    data[data < MEAN - clip_scaling * STD] = MEAN - clip_scaling * STD
+
+    # Make data cube fast to access
+    logging.info("Adjusting precision")
+    data = np.ascontiguousarray(data, "float32")
+
+    # rescale to be within a certain range
+    range_min, range_max = -1.0, 1.0
+    data_std = (data - MIN) / (MAX - MIN)
+    data = data_std * (range_max - range_min) + range_min
+
+    random_test_labels = np.random.randint(0, N_CLASSES, data.shape, dtype='uint8')  # placeholder labels over all classes; real test labels are not provided
+    np.save(os.path.join(outdir, f"test{n_set}_seismic.npy"), data)
+    np.save(os.path.join(outdir, f"test{n_set}_labels.npy"), random_test_labels)
+
+process_test(TEST1, OUTDIR, 1)
+process_test(TEST2, OUTDIR, 2)
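
For reference, the SEG-Y helpers added in `interpretation/deepseismic_interpretation/data.py` are designed to be used as a pair: `read_segy` loads a full cube plus basic geometry info, and `write_segy` copies an existing file and overwrites its traces inline by inline so headers and geometry are preserved. Below is a minimal round-trip sketch; the file paths are placeholders and the zero-filled "prediction" cube stands in for real model output.

```python
import numpy as np

from deepseismic_interpretation.data import read_segy, write_segy

# load the cube and its geometry info (crossline/inline/timeslice starts and shape)
data, data_info = read_segy("/data/seismic/TrainingData_Image.segy")  # placeholder path
print(data_info["shape"])  # (inlines, crosslines, samples)

# stand-in for model predictions with the same inline x crossline x depth layout
pred = np.zeros_like(data)

# copy the input file and write the predictions into it, keeping its headers/geometry
write_segy("/data/seismic/predictions.segy", "/data/seismic/TrainingData_Image.segy", pred)
```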