From 1890788dde360571f62f35225c4eacf03e6c13e0 Mon Sep 17 00:00:00 2001 From: angerhang <514140809@qq.com> Date: Mon, 11 Sep 2023 16:33:40 +0100 Subject: [PATCH] Fix style issues with ldopa processing script (#15) --- data_parsing/ldopa.py | 97 ++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/data_parsing/ldopa.py b/data_parsing/ldopa.py index f3a2b4e..c67ac7c 100644 --- a/data_parsing/ldopa.py +++ b/data_parsing/ldopa.py @@ -3,7 +3,6 @@ from tqdm.auto import tqdm from datetime import datetime from scipy import constants -from glob import glob from joblib import Parallel, delayed import numpy as np import warnings @@ -13,7 +12,6 @@ from pathlib import Path from shutil import rmtree, copyfile from glob import glob -import urllib.request as urllib import synapseclient import synapseutils from dotenv import load_dotenv, find_dotenv @@ -40,6 +38,7 @@ ["HomeTasks", "syn20681035"], ] + def load_environment_vars(env_strings=[]): load_dotenv(find_dotenv()) missing_envs = [] @@ -60,8 +59,10 @@ def load_environment_vars(env_strings=[]): return tuple(env_values) -USERNAME, APIKEY = load_environment_vars(["SYNAPSE_USERNAME", "SYNAPSE_APIKEY"]) +USERNAME, APIKEY = load_environment_vars( + ["SYNAPSE_USERNAME", "SYNAPSE_APIKEY"] +) def check_files_exist(dir, files): @@ -70,7 +71,9 @@ def check_files_exist(dir, files): def get_first_file(dataFolder, folderName): return os.path.join( - dataFolder, folderName, os.listdir(os.path.join(dataFolder, folderName))[0] + dataFolder, + folderName, + os.listdir(os.path.join(dataFolder, folderName))[0], ) @@ -107,7 +110,9 @@ def build_metadata(datadir=RAW_DIR, processeddir=PROCESSED_DIR): def build_acc_data(datadir=RAW_DIR, processeddir=PROCESSED_DIR, n_jobs=N_JOBS): - subjects = build_task_reference_file(datadir, processeddir)["subject_id"].unique() + subjects = build_task_reference_file(datadir, processeddir)[ + "subject_id" + ].unique() outdir = os.path.join(processeddir, "acc_data") os.makedirs(outdir, exist_ok=True) @@ -122,7 +127,9 @@ def build_acc_data(datadir=RAW_DIR, processeddir=PROCESSED_DIR, n_jobs=N_JOBS): print("Acceleration data already compiled...\n") -def build_task_reference_file(datadir=RAW_DIR, outdir=PROCESSED_DIR, overwrite=False): +def build_task_reference_file( + datadir=RAW_DIR, outdir=PROCESSED_DIR, overwrite=False +): outFile = os.path.join(outdir, "TaskReferenceFile.csv") if os.path.exists(outFile) and not overwrite: @@ -149,7 +156,13 @@ def build_task_reference_file(datadir=RAW_DIR, outdir=PROCESSED_DIR, overwrite=F date_parser=parse_datetime_from_timestamp, ) taskScores = pd.concat([taskScore1, taskScore2])[ - ["subject_id", "visit", "task_code", "timestamp_start", "timestamp_end"] + [ + "subject_id", + "visit", + "task_code", + "timestamp_start", + "timestamp_end", + ] ] visit_to_day = {1: 1, 2: 4} @@ -172,7 +185,9 @@ def build_task_reference_file(datadir=RAW_DIR, outdir=PROCESSED_DIR, overwrite=F ] taskRefFile = ( - pd.concat([taskScores, homeTasks]).drop_duplicates().reset_index(drop=True) + pd.concat([taskScores, homeTasks]) + .drop_duplicates() + .reset_index(drop=True) ) taskRefFile.to_csv(outFile) @@ -216,7 +231,10 @@ def build_participant_acc_data(subject, datadir, outdir): def build_patient_file_path(dataFolder, device, subject_id, index): return os.path.join( - dataFolder, device, get_patient_folder(subject_id), f"rawdata_day{index}.txt" + dataFolder, + device, + get_patient_folder(subject_id), + f"rawdata_day{index}.txt", ) @@ -230,7 +248,9 @@ def 
get_patient_folder(subject_id):
     raise AssertionError("Invalid subject id")
 
 
-def label_acc_data(label, datadir=RAW_DIR, processeddir=PROCESSED_DIR, n_jobs=N_JOBS):
+def label_acc_data(
+    label, datadir=RAW_DIR, processeddir=PROCESSED_DIR, n_jobs=N_JOBS
+):
     taskRefFile = build_task_reference_file(datadir, processeddir)
 
     subjects = taskRefFile["subject_id"].unique()
@@ -256,11 +276,15 @@ def build_task_dictionary(datadir=RAW_DIR, outdir=PROCESSED_DIR):
     processedDictionaryPath = os.path.join(outdir, "TaskDictionary.csv")
 
     if os.path.exists(processedDictionaryPath):
-        taskDictionary = pd.read_csv(processedDictionaryPath, index_col="task_code")
+        taskDictionary = pd.read_csv(
+            processedDictionaryPath, index_col="task_code"
+        )
 
     else:
         os.makedirs(os.path.dirname(outdir), exist_ok=True)
-        taskDictionary = pd.read_csv(os.path.join(datadir, "TaskCodeDictionary.csv"))
+        taskDictionary = pd.read_csv(
+            os.path.join(datadir, "TaskCodeDictionary.csv")
+        )
         taskDictionary["is-walking"] = taskDictionary["description"].apply(
             is_walking_given_description
         )
@@ -276,7 +300,10 @@ def is_walking_given_description(description):
     return (
         "walking"
-        if (("WALKING" in description.upper()) or ("STAIRS" in description.upper()))
+        if (
+            ("WALKING" in description.upper())
+            or ("STAIRS" in description.upper())
+        )
         else "not-walking"
     )
 
 
@@ -324,10 +351,14 @@ def label_participant_data(
         accFile.to_csv(accFilePath)
 
     else:
-        print(f'Using saved subject labelled accelerometery data at "{accFilePath}".')
+        print(
+            f'Using saved subject labelled accelerometry data at "{accFilePath}".'
+        )
 
 
-def download_ldopa(datadir, annot_label="is-walking", overwrite=False, n_jobs=10):
+def download_ldopa(
+    datadir, annot_label="is-walking", overwrite=False, n_jobs=10
+):
     ldopa_datadir = os.path.join(datadir, "LDOPA_DATA")
     if overwrite or (
         len(glob(os.path.join(ldopa_datadir, "*.csv"))) < len(LDOPA_DOWNLOADS)
@@ -368,7 +399,9 @@ def download_ldopa(datadir, annot_label="is-walking", overwrite=False, n_jobs=10
         label_acc_data(annot_label, ldopa_datadir, processeddir, n_jobs)
 
 
-def load_data(datafile, sample_rate=100, index_col="timestamp", annot_type="int"):
+def load_data(
+    datafile, sample_rate=100, index_col="timestamp", annot_type="int"
+):
     if ".parquet" in datafile:
         data = pd.read_parquet(datafile)
         data.dropna(inplace=True)
@@ -390,7 +423,9 @@ def resize(x, length, axis=1):
     length_orig = x.shape[axis]
     t_orig = np.linspace(0, 1, length_orig, endpoint=True)
     t_new = np.linspace(0, 1, length, endpoint=True)
-    x = interp1d(t_orig, x, kind="linear", axis=axis, assume_sorted=True)(t_new)
+    x = interp1d(t_orig, x, kind="linear", axis=axis, assume_sorted=True)(
+        t_new
+    )
     return x
 
 
@@ -402,11 +437,13 @@ def make_windows(
     data,
     winsec=10,
     sample_rate=100,
     resample_rate=30,
     label_type="threshold",
     dropna=True,
-    verbose=False
+    verbose=False,
 ):
     X, Y, T, D = [], [], [], []
 
-    for t, w in tqdm(data.resample(f"{winsec}s", origin="start"), disable=not verbose):
+    for t, w in tqdm(
+        data.resample(f"{winsec}s", origin="start"), disable=not verbose
+    ):
         if len(w) < 1:
             continue
 
@@ -424,7 +461,9 @@
         if label_type == "mode":
             with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", message="Unable to sort modes")
+                warnings.filterwarnings(
+                    "ignore", message="Unable to sort modes"
+                )
                 mode_label = annot.mode(dropna=False).iloc[0]
 
             if mode_label == -1 or mode_label == "-1":
                 continue
 
@@ -431,7 +470,11 @@
 
             y = mode_label
 
-        d = w["day"].mode(dropna=False).iloc[0] if "day" in w.columns else 1
+        d = (
+            w["day"].mode(dropna=False).iloc[0]
+            if "day" in w.columns
+            else 1
+        )
 
         if dropna and pd.isna(y):
             continue
 
@@ -466,9 +509,7 @@ def is_good_window(x, sample_rate, winsec):
     return True
 
 
-def load_all_and_make_windows(
-    datadir, outdir, n_jobs, overwrite=False
-):
+def load_all_and_make_windows(datadir, outdir, n_jobs, overwrite=False):
     """Make windows from all available data, extract features and store locally"""
     if not overwrite and check_files_exist(
         outdir, ["X.npy", "Y.npy", "T.npy", "pid.npy", "day.npy"]
@@ -481,9 +522,7 @@
     Xs, Ys, Ts, Ds, Ps = zip(
         *Parallel(n_jobs=n_jobs)(
             delayed(load_and_make_windows)(datafile)
-            for datafile in tqdm(
-                datafiles, desc=f"Load all and make windows"
-            )
+            for datafile in tqdm(datafiles, desc="Load all and make windows")
         )
     )
 
@@ -520,8 +559,8 @@ def load_and_make_windows(datafile):
 
 
 def filter_for_analysis(X, Y, T, D, P):
-    day_mask = (D==1) | (D==4)
-    label_mask = (Y != 'ram') & (Y != 'ftn')
+    day_mask = (D == 1) | (D == 4)
+    label_mask = (Y != "ram") & (Y != "ftn")
 
     X_out = X[day_mask & label_mask]
     Y_out = Y[day_mask & label_mask]