inputdata_setup.py

import os, sys
import numpy as np
import pandas as pd
import fnmatch
import pickle


def all_files_exist(flist):
    numfiles = len(flist)
    allexist = True
    co = 0
    while allexist and co < numfiles:
        allexist = os.path.isfile(flist[co])
        co += 1
    return allexist


def file_len(fname):
    if os.path.isfile(fname) and os.path.getsize(fname)>0:
        with open(fname) as f:
            for i, l in enumerate(f):
                pass
        return i + 1
    else: 
        print('Failed to read {}!'.format(fname))
        return -1


try:
    stubdir = sys.argv[1]
    print('Reading mesh motion data from directory {}....'.format(stubdir))
except:
    print('Please pass name of directory containing segmented data...')


mpointsfile = 'matchedpointsnew.txt'
pim2 = 'subjnames.txt'
outcomefile = os.path.join(stubdir,'surv_outcomes.csv')
stepsuccess = [True for _ in range(3)]


# Read mpointsfile
try: 
    mpoints = np.loadtxt(os.path.join(stubdir,mpointsfile), dtype=int)
except: 
    stepsuccess[0] = False
    print('{} read failed!'.format(mpointsfile))

# Read list of subjects
try: 
    with open(os.path.join(stubdir,pim2)) as f: IDlist = [lin.strip('\n') for lin in f.readlines() if len(lin)>1]
except: 
    stepsuccess[1] = False
    print('{} read failed!'.format(pim2))

# Find number of vertices
if stepsuccess[1]:
    try:
        meshtxtfile = os.path.join(stubdir,IDlist[0],'motion/RV_fr00.txt')
        num_vertx = file_len(meshtxtfile)
        if num_vertx <= 0:
            stepsuccess[2] = False
            print('There was a problem reading {} in order to determine number of vertices in 3D meshes!'.format(meshtxtfile))
    except:
        stepsuccess[2] = False
        print('Failed to read {} in order to determine number of vertices in 3D meshes!'.format(meshtxtfile))


validIDs = [False]
numframes = 20
if all(stepsuccess):
    print('\n\n------------------------------------------')
    print('Reading mesh motion data from directory {}...'.format(stubdir))
    print('Subject IDs will be read from file {}...'.format(pim2))
    print('Expected number of vertices per mesh = {0}, of which {1} will be extracted'.format(num_vertx, mpoints.shape[0]))
    print('Outcome data will be read from file {}...'.format(outcomefile))
    print('------------------------------------------\n\n\n')
    if os.path.exists(stubdir):
        if len(IDlist)>0:
            validIDs = [False for _ in range(len(IDlist))]
            X_all = np.zeros(shape=(len(IDlist),(numframes-1),mpoints.shape[0],3), dtype=float)
            for counter,ID in enumerate(IDlist):
                if os.path.exists(os.path.join(stubdir,ID)):
                    if os.path.exists(os.path.join(stubdir,ID,'motion')):
                        frames_file_list = [os.path.join(stubdir, ID, 'motion/RV_fr' + '{:0>2}'.format(b) + '.txt') for b in range(numframes)]
                        if all_files_exist(frames_file_list):
                            nframes = len(fnmatch.filter(os.listdir(os.path.join(stubdir , ID , 'motion')), 'RV_fr*.txt'))
                            if nframes == numframes:
                                if np.sum([file_len(frames_file_list[i]) == num_vertx for i in range(numframes)]) == numframes:
                                    vs = [True for _ in range(numframes)]
                                    try:
                                        coords_fr0 = np.loadtxt(frames_file_list[0])[mpoints[:,1]]
                                    except: 
                                        print('Error! could not read file {} !'.format(frames_file_list[0]))
                                        vs[0]=False
                                    if vs[0]:
                                        for j in range(1, numframes):
                                            try: 
                                                coords_frj = np.loadtxt(frames_file_list[j])[mpoints[:,1]]
                                            except: 
                                                print('Error! could not read file {} !'.format(frames_file_list[j]))
                                                vs[j]=False
                                            if vs[j]:
                                                X_all[counter, j-1, :, :] = coords_frj - coords_fr0
                                            else:
                                                break
                                        if np.all(vs):
                                            validIDs[counter] = True
                                            print('Successfully read motion data for ID {}'.format(ID))
                #               else: print(ID + ' RV files do not have ' + str(num_vertx) + ' vertices')
                                else:
                                    print('{0} : wrong # of vertices, expected {1} for all {2} frames but got {3}'.format(ID,num_vertx,numframes,str([file_len(frames_file_list[i]) for i in range(numframes)])))
                            else:
                                print('{0} : RV files exist but not {1} in number. Skipping to next ID....'.format(ID,numframes))
                        else:
                            print(ID + ' : folder exists but not all RV files exist. Skipping to next ID....')
                    else:
                        print('There is no motion folder under directory {} !'.format(os.path.join(stubdir,ID)))
                else:
                    print('{0} folder does not exist under directory {1}'.format(ID,stubdir))
        else:
            print('No IDs found in predinput_master2.txt !')
    else:
        print('directory meant to contain IDs is not valid!' )
else: pass


if any(validIDs): 
    numvalids = np.sum(validIDs)
    print('{} IDs with valid mesh motion data were found'.format(numvalids))
    X = X_all[validIDs]
else: print('No valid mesh motion data could be read!')


# Processing outcome data

# Read outcome master file - Column 1: ID, Column 2: censoring status, Column 3: time to event/censoring
# Tests of outcome file:
    # number of columns is 3
    # columns ordered correctly - ID, status, time
    # columns contain correct data (ID is string, status = 0 or 1, time > 0)
if any(validIDs):
    oreadable = True
    ofmtcorr1 = True
    if os.path.exists(outcomefile):
        try:
            outcome_df = pd.read_csv(outcomefile)
        except: 
            print('Error in reading outcome file {} !'.format(outcomefile))
            oreadable = False
        if oreadable:
            print('Outcome file {0} read: {1} rows and {2} columns...'.format(outcomefile, outcome_df.shape[0], outcome_df.shape[1]))
            if len(outcome_df.columns) != 3:
                print('Wrong number of columns in outcome file {} ! Expected 3 columns'.format(outcomefile))
            else:
                outcome_df.columns = ['ID','status','time']
                try: 
                    ocorrfmt = np.all([ i and j for (i,j) in zip([l in [0,1] for l in list(outcome_df.status)], [k>=0 for k in list(outcome_df.time)])])
                except: 
                    ofmtcorr1 = False
                    ocorrfmt = False
                if not (ofmtcorr1 and ocorrfmt):
                    print('status and/or time columns in {} are incorrectly formatted!'.format(outcomefile))
                    if ofmtcorr1==True and ocorrfmt == False:
                        aw = np.argwhere([ not(i and j) for (i,j) in zip([l in [0,1] for l in list(outcome_df.status)], [k>=0 for k in list(outcome_df.time)])])
                        if aw.shape[0] > 0:
                            print('{} {rw} {w} problematic: '.format(aw.shape[0],rw='rows' if aw.shape[0]>1 else 'row',w='were' if aw.shape[0]>1 else 'was'))
                            print(outcome_df.iloc[list(aw[:,0])])
                else: 
                    if any(validIDs):
                        print('matching mesh motion data IDs to outcome data IDs....')
                        IDlist_valids = list(np.array(IDlist)[validIDs])
                        #IDlist_woutc = [ii for ii in IDlist_valids if ii in list(outcome_df.ID)]
                        IDlist_woutc = list(set(list(outcome_df.ID)).intersection(set(IDlist_valids)))
                        if len(IDlist_woutc)==0:
                            print('None of the IDs from the mesh motion data were found in outcome file {}'.format(outcomefile))
                        else:
                            print('{1} of {2} valid IDs from mesh motion data were found in outcome file {0}'.format(outcomefile, len(IDlist_woutc), len(IDlist_valids)))
                            if len(IDlist_woutc) < len(IDlist_valids):
                                print('The following IDs from the mesh motion data were not found in outcome file {} :'.format(outcomefile))
                                print([ii for ii in IDlist_valids if ii not in list(outcome_df.ID)])
                            y = outcome_df[(outcome_df['ID'].isin(IDlist_woutc))]
                            matchmask = [(u in IDlist_woutc) for u in IDlist_valids]
                            Xout = X[matchmask]
                            xshp = Xout.shape
                            xymatch = (y.shape[0]==xshp[0])
                            assert xymatch, 'ERROR: mesh motion (x) data has {1} rows while outcome (y) data has {0} rows'.format(y.shape[0], xshp[0])
                            if xymatch:
                                Xfin = Xout.reshape(xshp[:2]+(np.prod(xshp[2:]),)).reshape((xshp[0],-1))
                                plist = [Xfin,np.array(y[['status','time']]),list(y.ID)]
                                pklname = 'inputdata_DL' + '.pkl'
                                pklpath = os.path.join(os.getcwd(),'data',pklname)
                                with open(pklpath, 'wb') as f: pickle.dump(obj=plist, file=f)
                                print('Mesh motion and corresponding survival data for {0} subjects has been saved in {1}'.format(xshp[0],pklpath))

    else:
        print('Outcome file {} does not exist! Outcome data cannot be read!'.format(outcomefile))