Merge pull request #27 from IvanKuchin/slicing_ct
Feature: tiling training dataset
IvanKuchin authored Aug 18, 2024
2 parents d0c70f2 + 35bcd58 commit 07d449e
Showing 4 changed files with 131 additions and 31 deletions.
15 changes: 13 additions & 2 deletions config.py
@@ -20,13 +20,24 @@
# https://radiopaedia.org/articles/windowing-ct?lang=us
# Option 2) 3D Slicer preset for abdominal CT
# W/L: 350/40, which makes the pancreas range from -310 to 390
PANCREAS_MIN_HU = -1200 # -512
PANCREAS_MAX_HU = 4000 # 1024
#
# Our pancreas calculations show values from -1200 to 4000
#
# training attempts show that the best performance is in the range [-512, 1024]
PANCREAS_MIN_HU = -512 # -512
PANCREAS_MAX_HU = 1024 # 1024

IMAGE_DIMENSION_X = 160
IMAGE_DIMENSION_Y = IMAGE_DIMENSION_X
IMAGE_DIMENSION_Z = IMAGE_DIMENSION_X

AUGMENTATIO_SHIFT_MARGIN = 0.1

MIN_LABEL = 0
MAX_LABEL = 1
MIN_DATA = -1
MAX_DATA = 1

MONITOR_METRIC = "val_custom_f1"
MONITOR_MODE = "max"
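For context, a minimal sketch (not part of this commit) of how these HU bounds feed the pancreas-area range check in dataset/pomc_dataset.py, which this commit comments out; the sample min_HU/max_HU values are hypothetical:

import config

min_HU, max_HU = -140.0, 380.0  # hypothetical per-patient pancreas HU extremes
hu_in_range = (config.PANCREAS_MIN_HU <= min_HU <= config.PANCREAS_MAX_HU
               and config.PANCREAS_MIN_HU <= max_HU <= config.PANCREAS_MAX_HU)
print(hu_in_range)  # True for the committed bounds [-512, 1024]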

26 changes: 24 additions & 2 deletions dataset/craft_datasets.py
Expand Up @@ -33,7 +33,7 @@ def py_read_data_and_label(data_fname:str):
data_label = content["arr_0"]
data_array = data_label[0]
label_array = data_label[1]
return (tf.convert_to_tensor(data_array, dtype=tf.float32), tf.convert_to_tensor(label_array, dtype=tf.int32))
return data_array, label_array


def read_data_and_label(patient_id:str, src_folder:str):
@@ -69,6 +69,22 @@ class Array3d_read_and_resize:
def __init__(self, folder):
self.folder = folder

def random_crop(self, data, label, x, y, z):
data_shape = np.shape(data)
random_range = [data_shape[0] - x, data_shape[1] - y, data_shape[2] - z]
random_offset = np.random.randint(0, random_range, size = 3)
_data = data[
random_offset[0]:random_offset[0] + x,
random_offset[1]:random_offset[1] + y,
random_offset[2]:random_offset[2] + z,
...]
_label = label[
random_offset[0]:random_offset[0] + x,
random_offset[1]:random_offset[1] + y,
random_offset[2]:random_offset[2] + z,
...]
return _data, _label

def __call__(self):
self.file_list = FileIterator(self.folder)
for data_file in self.file_list:
@@ -79,14 +95,16 @@ def __call__(self):
data, label = read_data_and_label(patient_id, self.folder)
finish_reading = time.time()

data, label = self.random_crop(data, label, config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z)

start_resize = time.time()
# data, label = borders.cut_and_resize_including_pancreas(data, label, np.random.rand(), np.random.rand())
finish_resize = time.time()

if DEBUG_DATA_LOADING_PERFORMANCE:
print(f"\tDATA_LOADING_PERFORMANCE: reading time: {finish_reading - start_reading:.1f} resize time: {finish_resize - start_resize:.1f}")

yield data, label
yield tf.convert_to_tensor(data, dtype=tf.float32), tf.convert_to_tensor(label, dtype=tf.int8)


# def array3d_read_and_resize():
@@ -209,6 +227,10 @@ def __run_through_data_wo_any_action(ds_train, ds_valid):


if __name__ == "__main__":
# read_and_resize = Array3d_read_and_resize(os.path.join(config.TFRECORD_FOLDER, "train"))
# item1 = next(read_and_resize())
# print("item1:", item1[0].shape, item1[1].shape)

train_ds = craft_datasets(os.path.join(config.TFRECORD_FOLDER, "train"))
valid_ds = craft_datasets(os.path.join(config.TFRECORD_FOLDER, "valid"))
__run_through_data_wo_any_action(train_ds, valid_ds)
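
As a quick illustration of the new random_crop step (not part of the commit), the snippet below crops a synthetic volume and its label down to the configured tile size; the volume shape and the import path are assumptions:

import numpy as np
import config
from dataset.craft_datasets import Array3d_read_and_resize

reader = Array3d_read_and_resize("/path/to/tfrecords/train")  # placeholder folder
data = np.random.uniform(config.MIN_DATA, config.MAX_DATA, (220, 200, 180)).astype(np.float32)
label = np.random.randint(config.MIN_LABEL, config.MAX_LABEL + 1, (220, 200, 180)).astype(np.int8)
crop_data, crop_label = reader.random_crop(
    data, label,
    config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z)
print(crop_data.shape, crop_label.shape)  # (160, 160, 160) (160, 160, 160)

Because both arrays are sliced with the same random offset, the cropped data and label stay voxel-aligned.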
53 changes: 26 additions & 27 deletions dataset/pomc_dataset.py
@@ -6,9 +6,11 @@
import sys
import re
import numpy as np
import numpy.typing as npt
import pydicom
import nrrd
import borders
from saver import Saver

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
@@ -171,20 +173,12 @@ def consistency_check(self, data, label, data_metadata, label_metadata):
self.min_HU = min(self.min_HU, min_HU)
self.max_HU = max(self.max_HU, max_HU)

if min_HU < config.PANCREAS_MIN_HU or min_HU > config.PANCREAS_MAX_HU:
print("ERROR: min HU(", min_HU, ") in pancreas area is out of range [", config.PANCREAS_MIN_HU, config.PANCREAS_MAX_HU, "]")
return False
# if min_HU < config.PANCREAS_MIN_HU or min_HU > config.PANCREAS_MAX_HU:
# print("ERROR: min HU(", min_HU, ") in pancreas area is out of range [", config.PANCREAS_MIN_HU, config.PANCREAS_MAX_HU, "]")
# return False

if max_HU < config.PANCREAS_MIN_HU or max_HU > config.PANCREAS_MAX_HU:
print("ERROR: max HU(", max_HU, ") in pancreas area is out of range [", config.PANCREAS_MIN_HU, config.PANCREAS_MAX_HU, "]")
return False

################################
# This is not a relevant check #
################################
# print("\tDICOM min/max coordinates:", data_metadata["min"], data_metadata["max"])
# if self._body_up_side_down(data_metadata["min"]):
# print("ERROR: data is upside down")
# if max_HU < config.PANCREAS_MIN_HU or max_HU > config.PANCREAS_MAX_HU:
# print("ERROR: max HU(", max_HU, ") in pancreas area is out of range [", config.PANCREAS_MIN_HU, config.PANCREAS_MAX_HU, "]")
# return False

return True
@@ -234,7 +228,8 @@ def preprocess_data(self, data, label):
#
# scale final data to [-1; 1] range, that should help with ReLU activation
#
data_processed = (data - np.min(data)) / (np.max(data) - np.min(data)) * 2 - 1
spread = config.MAX_DATA - config.MIN_DATA
data_processed = (data - np.min(data)) / (np.max(data) - np.min(data)) * spread - spread / 2
# if data_processed.shape != AUGMENT_SCALED_DIMS:
# print_error("wrong Z-axis dimensionality {} must be {}".format(data_processed.shape, AUGMENT_SCALED_DIMS))
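
A small worked example (not from the commit) of the rescaling above: with config.MIN_DATA = -1 and config.MAX_DATA = 1, spread is 2, and min-max normalization followed by "* spread - spread / 2" maps the volume into [-1, 1] (exactly onto [MIN_DATA, MAX_DATA] because the configured range is symmetric about zero):

import numpy as np

data = np.array([0.0, 250.0, 500.0, 1000.0])
spread = 1 - (-1)  # config.MAX_DATA - config.MIN_DATA
scaled = (data - data.min()) / (data.max() - data.min()) * spread - spread / 2
print(scaled)  # [-1.  -0.5  0.   1. ]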

@@ -246,13 +241,13 @@ def sanity_check_after_preprocessing(self, data, label):
# print("\tsanity check data: {}/{}/{}".format(np.min(data), np.mean(data), np.max(data)))
# print("\tsanity check label: {}/{}/{}".format(np.min(label), np.mean(label), np.max(label)))

if np.min(data) != -1: # data scaled to range [-1, 1]
if np.min(data) != config.MIN_DATA: # data scaled to range [-1, 1]
result = False
print("ERROR: (min(data) == {}) != -1".format(np.min(data)))
if np.mean(data) == 0:
result = False
print("ERROR: (mean(data) == {}) == 0".format(np.mean(data)))
if np.max(data) != 1:
if np.max(data) != config.MAX_DATA:
result = False
print("ERROR: (max(data) == {}) != 1".format(np.max(data)))

@@ -265,28 +260,28 @@ def sanity_check_after_preprocessing(self, data, label):
result = False
print("ERROR: (min(label) == {}) != -1".format(np.min(label)))
else:
if np.min(label) != 0:
if np.min(label) != config.MIN_LABEL:
result = False
print("ERROR: (min(label) == {}) != -1".format(np.min(label)))

if np.mean(label) == 0:
result = False
print("ERROR: (mean(label) == {}) == 0".format(np.mean(label)))
if np.max(label) != 1:
if np.max(label) != config.MAX_LABEL:
result = False
print("ERROR: (max(label) == {}) != 1".format(np.max(label)))

return result

def save_npy(self, subfolder: str, patient_id:str, percentage: int, original_data, original_label, scaled_data, scaled_label):
def save_npz(self, subfolder: str, patient_id:str, percentage: int, original_data, original_label, scaled_data, scaled_label):
result = True
scaled_data = np.cast[np.float32](scaled_data)
scaled_label = np.cast[np.int8](scaled_label)
np.savez_compressed(os.path.join(self.TFRECORD_FOLDER, subfolder, patient_id + f"_{percentage}.npz", ), [scaled_data, scaled_label])

return result

def pickle_src_data(self, train_valid_percentage=0.2):
def pickle_src_data(self, train_valid_percentage=0.15):
if not os.path.exists(self.TFRECORD_FOLDER):
print("ERROR: can't find TFRecord folder:", self.TFRECORD_FOLDER)
return
@@ -327,10 +322,11 @@ def pickle_src_data(self, train_valid_percentage=0.2):
print("ERROR: data & labels are not consistent patient_id:", patient_id)
continue

for percentage in [0, 30, 60, 90]:
for percentage in [0]: # [0, 30, 60, 90]:
print(f"\n\tPreprocess data for {percentage}%")

scaled_data, scaled_label = borders.cut_and_resize_including_pancreas(src_data, label_data, percentage/100, percentage/100)
# scaled_data, scaled_label = borders.cut_and_resize_including_pancreas(src_data, label_data, percentage/100, percentage/100)
scaled_data, scaled_label = tf.constant(src_data), tf.constant(label_data)

start_ts = time.time()
scaled_src_data, scaled_label_data = self.preprocess_data(scaled_data.numpy(), scaled_label.numpy())
Expand All @@ -348,16 +344,19 @@ def pickle_src_data(self, train_valid_percentage=0.2):
print("ERROR: data or label failed sanity check")
continue

if self.save_npy(subfolder, patient_id, percentage, src_data, label_data, scaled_src_data, scaled_label_data) == False:
print("ERROR: can't save TFRecord patient id:", patient_id)
# if self.save_npz(subfolder, patient_id, percentage, src_data, label_data, scaled_src_data, scaled_label_data) == False:
# print("ERROR: can't save TFRecord patient id:", patient_id)
# continue

print(f"\tSave patientID: {patient_id} to {subfolder} with border cut out around pancreas at {percentage}%")
saver = Saver(self.TFRECORD_FOLDER, subfolder, patient_id, percentage, config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z)
if saver.save(scaled_src_data, scaled_label_data) == False:
print("ERROR: can't save sliced CT of patientID:", patient_id)
continue

def main():
pomc = POMCDataset(PATIENTS_SRC_FOLDER, LABELS_SRC_FOLDER, config.TFRECORD_FOLDER)
pomc.pickle_src_data()

print("min HU in pancreas area:", pomc.min_HU)
print("max HU in pancreas area:", pomc.max_HU)
return

if __name__ == "__main__":
68 changes: 68 additions & 0 deletions dataset/saver.py
@@ -0,0 +1,68 @@
import numpy as np
import math
import os
import sys
import inspect

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from tools import resize_3d
import config as config

class Slicer:
def __init__(self, data, label):
x = math.ceil(data.shape[0] / config.IMAGE_DIMENSION_X) * config.IMAGE_DIMENSION_X
y = math.ceil(data.shape[1] / config.IMAGE_DIMENSION_Y) * config.IMAGE_DIMENSION_Y
z = math.ceil(data.shape[2] / config.IMAGE_DIMENSION_Z) * config.IMAGE_DIMENSION_Z

self.data = np.zeros((x, y, z)) + config.MIN_DATA
self.label = np.zeros((x, y, z)) + config.MIN_LABEL

self.data[:data.shape[0], :data.shape[1], :data.shape[2]] = data
self.label[:label.shape[0], :label.shape[1], :label.shape[2]] = label

def __iter__(self):
augment_margin = [
int(config.IMAGE_DIMENSION_X * config.AUGMENTATIO_SHIFT_MARGIN),
int(config.IMAGE_DIMENSION_Y * config.AUGMENTATIO_SHIFT_MARGIN),
int(config.IMAGE_DIMENSION_Z * config.AUGMENTATIO_SHIFT_MARGIN)
]
for x in range(0, self.data.shape[0], config.IMAGE_DIMENSION_X):
for y in range(0, self.data.shape[1], config.IMAGE_DIMENSION_Y):
for z in range(0, self.data.shape[2], config.IMAGE_DIMENSION_Z):
x_start = np.max([x - augment_margin[0], 0])
y_start = np.max([y - augment_margin[1], 0])
z_start = np.max([z - augment_margin[2], 0])

x_finish = np.min([x + config.IMAGE_DIMENSION_X + augment_margin[0], self.data.shape[0]])
y_finish = np.min([y + config.IMAGE_DIMENSION_Y + augment_margin[1], self.data.shape[1]])
z_finish = np.min([z + config.IMAGE_DIMENSION_Z + augment_margin[2], self.data.shape[2]])

data = self.data [x_start:x_finish, y_start:y_finish, z_start:z_finish]
label = self.label[x_start:x_finish, y_start:y_finish, z_start:z_finish]

yield data, label, x, y, z


class Saver:
def __init__(self, folder: str, subfolder: str, patient_id:str, percentage: int, image_dimension_x: int, image_dimension_y: int, image_dimension_z: int):
self.folder = folder
self.subfolder = subfolder
self.patient_id = patient_id
self.percentage = percentage
self.image_dimension_x = image_dimension_x
self.image_dimension_y = image_dimension_y
self.image_dimension_z = image_dimension_z

def save(self, src_data, label_data):
result = True
src_data = np.cast[np.float32](src_data)
label_data = np.cast[np.int8](label_data)

for (data, label, x, y, z) in Slicer(src_data, label_data):
# print(f"Saving slice at {x}, {y}, {z}...")
np.savez_compressed(os.path.join(self.folder, self.subfolder, self.patient_id + f"_cut-{self.percentage}_slice-{x}-{y}-{z}.npz", ), [data, label])

return result
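
To illustrate the tiling that Slicer performs (this snippet is not part of the commit), Slicer pads the volume up to multiples of the configured tile size and then yields one tile per grid position, each grown by the augmentation shift margin where the padded volume has room; the volume shape and the import path below are assumptions:

import numpy as np
import config
from dataset.saver import Slicer

data = np.random.uniform(config.MIN_DATA, config.MAX_DATA, (300, 300, 120)).astype(np.float32)
label = np.random.randint(config.MIN_LABEL, config.MAX_LABEL + 1, (300, 300, 120)).astype(np.int8)

# the padded volume is (320, 320, 160) for 160^3 tiles, so this yields 2 * 2 * 1 = 4 tiles,
# each extended by a 16-voxel margin (0.1 * 160) on the sides that have room for it
for tile_data, tile_label, x, y, z in Slicer(data, label):
    print(x, y, z, tile_data.shape)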
