diff --git a/.dockerignore b/.dockerignore
index d18d9ac9..483b77db 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,5 +4,4 @@ coderdata/
 dataSummary/
 docs/
 candle_bmd/
-schema/
-build/local/
\ No newline at end of file
+build/local/
diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py
index c3f93143..5afdff8e 100755
--- a/build/beatAML/GetBeatAML.py
+++ b/build/beatAML/GetBeatAML.py
@@ -174,10 +174,8 @@ def retrieve_drug_info(compound_name):
         return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
 
     data = response.json()
-    #print(data)
     if "PropertyTable" in data:
         properties = data["PropertyTable"]["Properties"][0]
-        #print(properties)
         pubchem_id = properties.get('CID',np.nan)
         canSMILES = properties.get("CanonicalSMILES", np.nan)
         isoSMILES = properties.get("IsomericSMILES", np.nan)
@@ -259,9 +257,6 @@ def merge_drug_info(d_df,drug_map):
     pd.DataFrame
         The merged dataframe containing combined drug information.
     """
-    #print(drug_map)
-    #print(d_df.columns)
-    #print(d_df)
    print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
    d_df['isoSMILES'] = d_df['isoSMILES'].astype(str)
    drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str)
@@ -337,10 +332,9 @@ def add_improve_id(previous_df, new_df):
     """
     if not previous_df.empty and 'improve_drug_id' in previous_df.columns:
         id_list = [int(val.replace('SMI_', '')) for val in previous_df['improve_drug_id'].tolist() if pd.notnull(val) and val.startswith('SMI_')]
-        max_id = max(id_list) if id_list else 0  # Default to 0 if the list is empty
+        max_id = max(id_list) if id_list else 0
     else:
-        max_id = 0  # Default value if the DataFrame is empty or doesn't have the column
-    # max_id = max([int(val.replace('SMI_', '')) for val in previous_df['improve_drug_id'].tolist() if pd.notnull(val) and val.startswith('SMI_')])
+        max_id = 0
     # Identify isoSMILES in the new dataframe that don't exist in the old dataframe
     unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES'])
     # Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
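Editor's note on the `add_improve_id` cleanup above — a minimal, self-contained sketch of the `SMI_` counter logic, using made-up values (not data from the pipeline):

```python
import pandas as pd

# Hypothetical previous improve_drug_id column. Only well-formed 'SMI_<n>'
# values contribute to the running maximum, mirroring the hunk above;
# nulls and malformed ids are skipped rather than raising.
prev = pd.Series(['SMI_11', 'SMI_42', None, 'not_an_id'])
id_list = [int(v.replace('SMI_', '')) for v in prev.tolist()
           if pd.notnull(v) and v.startswith('SMI_')]
max_id = max(id_list) if id_list else 0
print(max_id)  # 42 -> the next new drug would presumably be assigned SMI_43
```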
""" - mapped_df = pd.read_csv(exp_path,sep='\t') # Map sample_id to improve_sample_id - #mapped_df = pd.merge(df, improve[['other_id', 'improve_sample_id']], left_on='sample_id', right_on='other_id', how='left') - #mapped_df.drop(columns=['sample_id', 'other_id'], inplace=True) - #mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id')) + mapped_df = pd.read_csv(exp_path,sep='\t') mapped_df['source'] = 'synapse' mapped_df['study'] = 'BeatAML' - #mapped_df= mapped_df.rename(columns={'Drug':'improve_sample_id', - # 'IC50':'ic50', - # 'EC50':'ec50', - # 'EC50se':'ec50se', - # 'Einf':'einf', - # 'HS':'hs', - # 'AAC1':'aac1', - # 'AUC1':'auc1', - # 'DSS1':'dss1', - # 'R2fit':'r2fit' - # } - # ) return mapped_df @@ -445,12 +424,21 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N mapped_df.rename(columns={"hgvsc": "mutation"}, inplace=True) mapped_df.rename(columns={"labId": "sample_id"}, inplace=True) mapped_df.rename(columns={"Entrez_Gene_Id": "entrez_id"}, inplace=True) - - elif data_type == "mutation": - df = df[['dbgap_sample_id','hgvsc', 'hgvsp', 'gene', 'variant_classification','t_vaf', 'refseq', 'symbol']] - mapped_df = df.merge(genes, left_on='symbol', right_on='gene_symbol', how='left').reindex( - columns=['hgvsc', 'entrez_id', "dbgap_sample_id","variant_classification"]) + variant_mapping = { + 'frameshift_variant': 'Frameshift_Variant', + 'missense_variant': 'Missense_Mutation', + 'stop_gained': 'Nonsense_Mutation', + 'inframe_deletion': 'In_Frame_Del', + 'protein_altering_variant': 'Protein_Altering_Variant', + 'splice_acceptor_variant': 'Splice_Site', + 'splice_donor_variant': 'Splice_Site', + 'start_lost': 'Start_Codon_Del', + 'inframe_insertion': 'In_Frame_Ins', + 'stop_lost': 'Nonstop_Mutation' + } + + mapped_df['variant_classification'] = mapped_df['variant_classification'].map(variant_mapping) elif data_type == "proteomics": mapped_ids['sampleID'] = mapped_ids['sampleID'].str.split('_').apply(lambda x: x[2]) @@ -473,7 +461,6 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N inplace=True ) - mapped_df = pd.merge(mapped_df, improve[['other_id', 'improve_sample_id']], left_on='sample_id', right_on='other_id', @@ -482,7 +469,7 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N mapped_df['source'] = 'synapse' mapped_df['study'] = 'BeatAML' - final_dataframe = mapped_df.dropna()#pd.dropna(mapped_df,0) + final_dataframe = mapped_df.dropna() return final_dataframe @@ -659,8 +646,6 @@ def generate_drug_list(drug_map_path,drug_path): t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t') - # t_df.index = t_df.stable_id#display_label -# t_df = t_df.iloc[:, 4:] t_df = t_df.reset_index().rename(columns={'stable_id': 'Gene'}) t_df = pd.melt(t_df, id_vars=['Gene'], var_name='sample_id', value_name='transcriptomics') print(improve_map_file) @@ -724,7 +709,5 @@ def generate_drug_list(drug_map_path,drug_path): exp_res = map_exp_to_improve(drug_path) exp_res.to_csv("/tmp/beataml_experiments.tsv", index=False, sep='\t') - #drug_map_path = retrieve_figshare_data("https://figshare.com/ndownloader/files/43112314?private_link=0ea222d9bd461c756fb0") - # print("Finished Pipeline") diff --git a/build/beatAML/build_drugs.sh b/build/beatAML/build_drugs.sh index bbeade15..be78c115 100644 --- a/build/beatAML/build_drugs.sh +++ b/build/beatAML/build_drugs.sh @@ -1,2 +1,10 @@ +#!/bin/bash +set -euo pipefail + +trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; 
diff --git a/build/beatAML/build_drugs.sh b/build/beatAML/build_drugs.sh
index bbeade15..be78c115 100644
--- a/build/beatAML/build_drugs.sh
+++ b/build/beatAML/build_drugs.sh
@@ -1,2 +1,10 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running GetBeatAML.py with token and drugFile $1"
 python GetBeatAML.py --token $SYNAPSE_AUTH_TOKEN --drugs --drugFile $1
+
+echo "Running build_drug_desc.py..."
 python build_drug_desc.py --drugtable /tmp/beataml_drugs.tsv --desctable /tmp/beataml_drug_descriptors.tsv.gz
diff --git a/build/beatAML/build_exp.sh b/build/beatAML/build_exp.sh
index 409e3c50..784c2dcc 100644
--- a/build/beatAML/build_exp.sh
+++ b/build/beatAML/build_exp.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running GetBeatAML.py with token and curSamples $1 and drugFile $2."
 python GetBeatAML.py --exp --token $SYNAPSE_AUTH_TOKEN --curSamples $1 --drugFile $2
diff --git a/build/beatAML/build_omics.sh b/build/beatAML/build_omics.sh
index 104c3853..79679311 100644
--- a/build/beatAML/build_omics.sh
+++ b/build/beatAML/build_omics.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running GetBeatAML.py with token, curSamples $2, and genes $1."
 python GetBeatAML.py --token $SYNAPSE_AUTH_TOKEN --omics --curSamples $2 --genes $1
diff --git a/build/beatAML/build_samples.sh b/build/beatAML/build_samples.sh
index 537875a9..b8b7cb3b 100644
--- a/build/beatAML/build_samples.sh
+++ b/build/beatAML/build_samples.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running GetBeatAML.py with token and prevSamples $1."
 python GetBeatAML.py --token $SYNAPSE_AUTH_TOKEN --samples --prevSamples $1
diff --git a/build/broad_sanger/03a-nci60Drugs.py b/build/broad_sanger/03a-nci60Drugs.py
index e1239d4f..28720c90 100644
--- a/build/broad_sanger/03a-nci60Drugs.py
+++ b/build/broad_sanger/03a-nci60Drugs.py
@@ -122,9 +122,21 @@ def main():
     merged = pl.concat([mdf,namedf],how='horizontal').select(['SMILES','pubchem_id','nscid','lower_name'])
     melted = merged.melt(id_vars=['SMILES','pubchem_id'],value_vars=['nscid','lower_name']).select(['SMILES','pubchem_id','value']).unique()
     melted.columns = ['canSMILES','pubchem_id','chem_name']
-    if newdf.shape[0]>0:
-        newdf = newdf.join(melted,on='canSMILES',how='inner').select(res.columns)
-        res = pl.concat([res,newdf],how='vertical')
+
+    if newdf.shape[0] > 0:
+        res = res.with_columns([
+            pl.col("InChIKey").cast(pl.Utf8),
+            pl.col("formula").cast(pl.Utf8),
+            pl.col("weight").cast(pl.Utf8)
+        ])
+        newdf = newdf.with_columns([
+            pl.col("InChIKey").cast(pl.Utf8),
+            pl.col("formula").cast(pl.Utf8),
+            pl.col("weight").cast(pl.Utf8)
+        ])
+
+        newdf = newdf.join(melted, on='canSMILES', how='inner').select(res.columns)
+        res = pl.concat([res, newdf], how='vertical')
     res.write_csv(opts.output,separator='\t')
 
 if __name__=='__main__':
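The dtype casts added to `03a-nci60Drugs.py` address a vertical-concat constraint in polars: frames being stacked must agree on column dtypes. A small sketch of the failure mode and the fix (toy frames; the real columns come from the drug table):

```python
import polars as pl

res = pl.DataFrame({"weight": [386.1, 402.4]})      # parsed as Float64
newdf = pl.DataFrame({"weight": ["386.1", "n/a"]})  # parsed as Utf8

# pl.concat([res, newdf], how="vertical") would fail on the dtype mismatch,
# so both sides are cast to a common dtype first, as the patch does:
res = res.with_columns(pl.col("weight").cast(pl.Utf8))
print(pl.concat([res, newdf], how="vertical"))
```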
["Sanger"], + "FIMM": ["Broad"], + "gCSI": ["Broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics. + "NCI60": ["Broad"] + } + + for dataset in datasets_to_process: + exp = pl.read_csv("broad_sanger_experiments.tsv", separator="\t") # Keeping memory down, so I will not be making copies. + exp = exp.filter(pl.col("study") == dataset) + + # Extract information to separate out datasets + exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list() + exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list() + + # Write Filtered Experiments File to TSV. Then delete it from memory. + exp_filename = f"/tmp/{dataset}_experiments.tsv".lower() + exp.write_csv(exp_filename, separator="\t") + del exp + gc.collect() + + + #Filter Samples files, write to file, delete from mem. + for samples in samples_datatypes: + samples_filename_in = f"broad_sanger_{samples}.csv" + samples_filename_out = f"/tmp/{dataset}_{samples}.csv".lower() + samples_df = pl.read_csv(samples_filename_in) + samples_df = samples_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids)) + samples_df.write_csv(samples_filename_out) #csv + del samples_df + gc.collect() + + #One by one, filter other Omics files, write to file, delete from mem. + for omics in omics_datatypes: + omics_filename_in = f"broad_sanger_{omics}.csv" + omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower() + omics_df = pl.read_csv(omics_filename_in) + omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids)) + omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset])) + omics_df.write_csv(omics_filename_out) #csv + del omics_df + gc.collect() + + + #One by one, filter other Drugs files, write to file, delete from mem. + for drugs in drugs_datatypes: + drugs_filename_in = f"broad_sanger_{drugs}.tsv" + drugs_filename_out = f"/tmp/{dataset}_{drugs}.tsv".lower() + if drugs == "drug_descriptors": + drugs_df = pl.read_csv(drugs_filename_in,separator="\t", + dtypes={"improve_drug_id": pl.Utf8, + "structural_descriptor": pl.Utf8, + "descriptor_value": pl.Utf8} + ) + + else: + drugs_df = pl.read_csv(drugs_filename_in,separator="\t") + + drugs_df = drugs_df.filter(pl.col("improve_drug_id").is_in(exp_improve_drug_ids)) + drugs_df.write_csv(drugs_filename_out,separator="\t") #tsv + del drugs_df + gc.collect() + +if __name__ == "__main__": + main() diff --git a/build/broad_sanger/build_drugs.sh b/build/broad_sanger/build_drugs.sh index 2ee3f78a..0e73cee7 100644 --- a/build/broad_sanger/build_drugs.sh +++ b/build/broad_sanger/build_drugs.sh @@ -1,3 +1,15 @@ -/opt/venv/bin/python 03a-nci60Drugs.py +#!/bin/bash +set -euo pipefail + +trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR + +echo "Running 03a-nci60Drugs.py..." +/opt/venv/bin/python 03a-nci60Drugs.py + +echo "Running 03-createDrugFile.R..." Rscript 03-createDrugFile.R CTRPv2,GDSC,gCSI,PRISM,CCLE,FIMM -/opt/venv/bin/python build_drug_desc.py --drugtable /tmp/broad_sanger_drugs.tsv --desctable /tmp/broad_sanger_drug_descriptors.tsv.gz + +echo "Running build_drug_desc.py..." 
diff --git a/build/broad_sanger/build_drugs.sh b/build/broad_sanger/build_drugs.sh
index 2ee3f78a..0e73cee7 100644
--- a/build/broad_sanger/build_drugs.sh
+++ b/build/broad_sanger/build_drugs.sh
@@ -1,3 +1,15 @@
-/opt/venv/bin/python 03a-nci60Drugs.py
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 03a-nci60Drugs.py..."
+/opt/venv/bin/python 03a-nci60Drugs.py
+
+echo "Running 03-createDrugFile.R..."
 Rscript 03-createDrugFile.R CTRPv2,GDSC,gCSI,PRISM,CCLE,FIMM
-/opt/venv/bin/python build_drug_desc.py --drugtable /tmp/broad_sanger_drugs.tsv --desctable /tmp/broad_sanger_drug_descriptors.tsv.gz
+
+echo "Running build_drug_desc.py..."
+/opt/venv/bin/python build_drug_desc.py \
+    --drugtable /tmp/broad_sanger_drugs.tsv \
+    --desctable /tmp/broad_sanger_drug_descriptors.tsv.gz
\ No newline at end of file
diff --git a/build/broad_sanger/build_exp.sh b/build/broad_sanger/build_exp.sh
index ba4760fd..48bc2a17 100644
--- a/build/broad_sanger/build_exp.sh
+++ b/build/broad_sanger/build_exp.sh
@@ -1 +1,7 @@
-/opt/venv/bin/python 04-drug_dosage_and_curves.py --drugfile $2 --curSampleFile $1
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 04-drug_dosage_and_curves.py with drugfile $2 and curSampleFile $1"
+/opt/venv/bin/python 04-drug_dosage_and_curves.py --drugfile $2 --curSampleFile $1
\ No newline at end of file
diff --git a/build/broad_sanger/build_misc.sh b/build/broad_sanger/build_misc.sh
new file mode 100644
index 00000000..2fa847f1
--- /dev/null
+++ b/build/broad_sanger/build_misc.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+cp /tmp/broad_sanger* .
+echo "Running 05_separate_datasets.py..."
+/opt/venv/bin/python 05_separate_datasets.py
+
+echo "Removing broad_sanger* files..."
+rm broad_sanger*
diff --git a/build/broad_sanger/build_omics.sh b/build/broad_sanger/build_omics.sh
index d898c289..024e0d17 100644
--- a/build/broad_sanger/build_omics.sh
+++ b/build/broad_sanger/build_omics.sh
@@ -1,3 +1,10 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 02a-broad_sanger_proteomics.py with gene file $1 and sample file $2."
 /opt/venv/bin/python 02a-broad_sanger_proteomics.py --gene $1 --sample $2
+
+echo "Running 02-broadSangerOmics.R with gene file $1 and sample file $2."
 Rscript 02-broadSangerOmics.R $1 $2
-#python 02a-broad/sanger_proteomics.py $1 $2
diff --git a/build/broad_sanger/build_samples.sh b/build/broad_sanger/build_samples.sh
index 93406ddb..4302ba44 100644
--- a/build/broad_sanger/build_samples.sh
+++ b/build/broad_sanger/build_samples.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 01-broadSangerSamples.R."
 Rscript 01-broadSangerSamples.R
diff --git a/build/broad_sanger/requirements.txt b/build/broad_sanger/requirements.txt
index d28b9866..76820c67 100755
--- a/build/broad_sanger/requirements.txt
+++ b/build/broad_sanger/requirements.txt
@@ -7,6 +7,8 @@ scikit-learn
 scipy
 requests
 openpyxl
-polars
+polars==0.19.17
 mordredcommunity
 rdkit
+coderdata==0.1.40
+psutil
\ No newline at end of file
diff --git a/build/build_all.py b/build/build_all.py
index dc0f1071..5fb1e368 100644
--- a/build/build_all.py
+++ b/build/build_all.py
@@ -11,6 +11,7 @@ import gzip
 from glob import glob
 from packaging import version
+import sys
 
 def main():
     parser=argparse.ArgumentParser(
@@ -70,7 +71,8 @@ def run_docker_cmd(cmd_arr,filename):
 
     cmd = docker_run+cmd_arr
     print(cmd)
-    res = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    # res = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    res = subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr)
     if res.returncode !=0:
         print(res.stderr)
         exit(filename+' file failed')
@@ -170,7 +172,7 @@ def process_drugs(executor, datasets):
             #if not os.path.exists(f'local/{da}_drugs.tsv'):
             if last_drug_future:
                 last_drug_future.result() # Ensure the last drug process is completed before starting the next
-            last_drug_future = executor.submit(run_docker_cmd, [di, 'sh', 'build_drugs.sh', ','.join(dflist)], f'{da} drugs')
+            last_drug_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_drugs.sh', ','.join(dflist)], f'{da} drugs')
             dflist.append(f'/tmp/{da}_drugs.tsv')
 
 def process_samples(executor, datasets):
@@ -190,7 +192,7 @@ def process_samples(executor, datasets):
         if not os.path.exists(f'local/{da}_samples.csv'):
             if last_sample_future:
                 last_sample_future.result()
-            last_sample_future = executor.submit(run_docker_cmd, [di, 'sh', 'build_samples.sh', sf], f'{da} samples')
+            last_sample_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_samples.sh', sf], f'{da} samples')
             sf = f'/tmp/{da}_samples.csv'
 
 def process_omics(executor, datasets,high_mem):
@@ -202,12 +204,12 @@ def process_omics(executor, datasets,high_mem):
         di = 'broad_sanger_omics' if da == 'broad_sanger' else da
         #Run all at once:
         if high_mem:
-            executor.submit(run_docker_cmd, [di, 'sh', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
+            executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
         #Run one at a time.
         else:
             if last_omics_future:
                 last_omics_future.result()
-            last_omics_future = executor.submit(run_docker_cmd, [di, 'sh', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
+            last_omics_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
 
 def process_experiments(executor, datasets, high_mem):
     '''
@@ -220,16 +222,40 @@ def process_experiments(executor, datasets, high_mem):
         if not os.path.exists(f'local/{da}_experiments.tsv'):
             #Run all at once
             if high_mem:
-                executor.submit(run_docker_cmd, [di, 'sh', 'build_exp.sh', f'/tmp/{da}_samples.csv', f'/tmp/{da}_drugs.tsv'], f'{da} experiments')
+                executor.submit(run_docker_cmd, [di, 'bash', 'build_exp.sh', f'/tmp/{da}_samples.csv', f'/tmp/{da}_drugs.tsv'], f'{da} experiments')
             #Run one at a time
             else:
                 if last_experiments_future:
                     last_experiments_future.result()
-                last_experiments_future = executor.submit(run_docker_cmd, [di, 'sh', 'build_exp.sh', f'/tmp/{da}_samples.csv', f'/tmp/{da}_drugs.tsv'], f'{da} experiments')
+                last_experiments_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_exp.sh', f'/tmp/{da}_samples.csv', f'/tmp/{da}_drugs.tsv'], f'{da} experiments')
+
+
+def process_misc(executor, datasets, high_mem):
+    '''
+    Run all misc scripts concurrently or one at a time.
+    '''
+    last_misc_future = None
+    #Currently this only applies to broad_sanger. Add others here if they need a final step.
+    if "broad_sanger" in datasets:
+        datasets = ["broad_sanger"]
+    else:
+        return
+    for da in datasets:
+        #Running the build_misc.sh in broad_sanger_omics
+        di = 'broad_sanger_omics' if da == 'broad_sanger' else da
+        #Run all at once:
+        if high_mem:
+            executor.submit(run_docker_cmd, [di, 'bash', 'build_misc.sh'], f'{da} misc')
+        #Run one at a time.
+        else:
+            if last_misc_future:
+                last_misc_future.result()
+            last_misc_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_misc.sh'], f'{da} misc')
+
 
 def process_genes(executor):
     if not os.path.exists('/tmp/genes.csv'):
-        executor.submit(run_docker_cmd,['genes','sh','build_genes.sh'],'gene file')
+        executor.submit(run_docker_cmd,['genes','bash','build_genes.sh'],'gene file')
 
 
 def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
@@ -252,9 +278,10 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
     # Full command to run including version update
     docker_run.extend(cmd_arr)
     print('Executing:', ' '.join(docker_run))
-    res = subprocess.run(docker_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    # res = subprocess.run(docker_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    res = subprocess.run(docker_run, stdout=sys.stdout, stderr=sys.stderr)
     if res.returncode != 0:
-        print(res.stderr.decode())
+        print(res.stderr)
         exit(f'{name} failed')
     else:
         print(f'{name} successful')
@@ -292,7 +319,8 @@ def compress_file(file_path):
     if args.pypi and not pypi_token:
         raise ValueError("PYPI_TOKEN environment variable is not set.")
     if ('beataml' in args.datasets or 'mpnst' in args.datasets) and not synapse_auth_token:
-        raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
+        if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
+            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
 
     ######
     ### Begin Pipeline
@@ -350,14 +378,31 @@ def compress_file(file_path):
             exp_thread.result()
     print("All experiments files completed")
 
+
+    ### Final step: some datasets may need an additional post-build step. Add it here.
+    # Currently only the cell line datasets need this. This separates broad_sanger into all of its component datasets.
+
+    with ThreadPoolExecutor() as executor:
+        if args.all:
+            misc_thread = executor.submit(process_misc, executor, datasets, args.high_mem)
+    if args.all:
+        misc_thread.result()
+        print("Final build step complete.")
+
+
     ######
     ### Begin Upload and/or validation
     #####
-
     if args.pypi or args.figshare or args.validate:
         # FigShare File Prefixes:
-        prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'broad_sanger', 'genes', 'drugs']
+        prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
+        broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
+        if "broad_sanger" in datasets:
+            prefixes.extend(broad_sanger_datasets)
+            datasets.extend(broad_sanger_datasets)
+            datasets.remove("broad_sanger")
+
         figshare_token = os.getenv('FIGSHARE_TOKEN')
         pypi_token = os.getenv('PYPI_TOKEN')
 
@@ -388,8 +433,7 @@ def compress_file(file_path):
             decompress_file(file)
 
     # Run schema checker - This will always run if uploading data.
-    datasets_list = args.datasets.split(',')
-    schema_check_command = ['python3', 'scripts/check_all_schemas.py', '--datasets'] + datasets_list
+    schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
 
     run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
 
     print("Validation complete. Proceeding with file compression/decompression adjustments")
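A note on the `subprocess.run` changes in `build_all.py`: streaming to `sys.stdout`/`sys.stderr` makes container output visible live, but it also means `CompletedProcess.stdout` and `.stderr` are `None`, so the retained `print(res.stderr)` calls now print `None` rather than the error text. A quick standalone illustration (not pipeline code):

```python
import subprocess
import sys

# Output goes straight to the console; nothing is captured on the result.
res = subprocess.run(['python3', '-c', 'print("hello")'],
                     stdout=sys.stdout, stderr=sys.stderr)
print(res.returncode)          # 0
print(res.stdout, res.stderr)  # None None -- the text was already echoed above
```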
diff --git a/build/build_dataset.py b/build/build_dataset.py
index 162b7ed1..9e63030b 100644
--- a/build/build_dataset.py
+++ b/build/build_dataset.py
@@ -76,7 +76,7 @@ def process_genes(executor):
     Build the genes file if it does not exist.
     '''
     if not os.path.exists('local/genes.csv'):
-        executor.submit(run_docker_cmd, ['genes', 'sh', 'build_genes.sh'], 'genes file')
+        executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')
 
 def process_samples(executor, dataset, use_prev_dataset, should_continue):
     '''
@@ -90,7 +90,7 @@ def process_samples(executor, dataset, use_prev_dataset, should_continue):
     prev_samples_file = f'/tmp/{use_prev_dataset}_samples.csv' if use_prev_dataset else ''
     di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
     filename = f'{dataset} samples'
-    executor.submit(run_docker_cmd, [di, 'sh', 'build_samples.sh', prev_samples_file], filename)
+    executor.submit(run_docker_cmd, [di, 'bash', 'build_samples.sh', prev_samples_file], filename)
 
 def process_drugs(executor, dataset, use_prev_dataset, should_continue):
     '''
@@ -108,7 +108,7 @@ def process_drugs(executor, dataset, use_prev_dataset, should_continue):
     dflist = [prev_drugs_file] if use_prev_dataset else []
     di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
     filename = f'{dataset} drugs'
-    executor.submit(run_docker_cmd, [di, 'sh', 'build_drugs.sh', ','.join(dflist)], filename)
+    executor.submit(run_docker_cmd, [di, 'bash', 'build_drugs.sh', ','.join(dflist)], filename)
 
 
 def process_omics(executor, dataset, should_continue):
@@ -155,7 +155,7 @@ def process_omics(executor, dataset, should_continue):
 
     di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
     filename = f'{dataset} omics'
-    executor.submit(run_docker_cmd, [di, 'sh', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename)
+    executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename)
 
 
 def process_experiments(executor, dataset, should_continue):
@@ -172,7 +172,28 @@ def process_experiments(executor, dataset, should_continue):
 
     di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
     filename = f'{dataset} experiments'
-    executor.submit(run_docker_cmd, [di, 'sh', 'build_exp.sh', f'/tmp/{dataset}_samples.csv', f'/tmp/{dataset}_drugs.tsv'], filename)
+    executor.submit(run_docker_cmd, [di, 'bash', 'build_exp.sh', f'/tmp/{dataset}_samples.csv', f'/tmp/{dataset}_drugs.tsv'], filename)
+
+
+
+def process_misc(executor, datasets):
+    '''
+    Run any post-build misc scripts, one at a time.
+    '''
+    last_misc_future = None
+    #Currently this only applies to broad_sanger. Add others here if they need a final step.
+    if "broad_sanger" in datasets:
+        datasets = ["broad_sanger"]
+    else:
+        return
+    for da in datasets:
+        di = 'broad_sanger_omics' if da == 'broad_sanger' else da
+        #Run one at a time:
+        if last_misc_future:
+            last_misc_future.result()
+        last_misc_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_misc.sh'], f'{da} misc')
+
+
 
 def decompress_file(file_path):
     """Decompress a gzip file and delete the original compressed file."""
@@ -212,7 +233,15 @@ def run_schema_checker(dataset):
     '''
     # Prepare the directory with the built files
     prefixes = ['genes', dataset]
+    datasets = [dataset]
+    broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
     all_files_dir = 'all_files_dir'
+    if "broad_sanger" == dataset:
+        prefixes.extend(broad_sanger_datasets)
+        datasets.extend(broad_sanger_datasets)
+        datasets.remove("broad_sanger")
+        prefixes.remove("broad_sanger")
+
     if not os.path.exists(f'local/{all_files_dir}'):
         os.makedirs(f'local/{all_files_dir}')
@@ -227,7 +256,7 @@ def run_schema_checker(dataset):
             decompress_file(os.path.join('local', all_files_dir, file))
 
     # Run schema checker
-    schema_check_command = ['python3', 'scripts/check_all_schemas.py', '--datasets', dataset]
+    schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
     run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')
 
 def main():
@@ -236,6 +265,7 @@ def main():
     )
     parser.add_argument('--dataset', required=True, help='Name of the dataset to build')
     parser.add_argument('--use_prev_dataset', help='Prefix of the previous dataset for sample and drug ID assignment')
+    parser.add_argument('--build', action='store_true', help='Run data build.')
    parser.add_argument('--validate', action='store_true', help='Run schema checker on the built files')
    parser.add_argument('--continue', dest='should_continue', action='store_true',
                        help='Continue from where the build left off by skipping existing files')
@@ -247,31 +277,40 @@ def main():
     # Build Docker Image
     process_docker(args.dataset,args.validate)
 
-    # Use ThreadPoolExecutor for parallel execution
-    with ThreadPoolExecutor() as executor:
-        # Always build genes file
-        process_genes(executor)
-
-        # Build samples and drugs
-        samples_future = executor.submit(process_samples, executor, args.dataset, args.use_prev_dataset, args.should_continue)
-        drugs_future = executor.submit(process_drugs, executor, args.dataset, args.use_prev_dataset, args.should_continue)
+    if args.build:
+        # Use ThreadPoolExecutor for parallel execution
+        with ThreadPoolExecutor() as executor:
+            # Always build genes file
+            process_genes(executor)
 
-        samples_future.result()
-        drugs_future.result()
-
-    print("Samples and Drugs Files Completed.")
+            # Build samples and drugs
+            samples_future = executor.submit(process_samples, executor, args.dataset, args.use_prev_dataset, args.should_continue)
+            drugs_future = executor.submit(process_drugs, executor, args.dataset, args.use_prev_dataset, args.should_continue)
 
-    with ThreadPoolExecutor() as executor:
-
-        # Build omics and experiments
-        omics_future = executor.submit(process_omics, executor, args.dataset, args.should_continue)
-        experiments_future = executor.submit(process_experiments, executor, args.dataset, args.should_continue)
+            samples_future.result()
+            drugs_future.result()
+
+            print("Samples and Drugs Files Completed.")
 
-        omics_future.result()
-        experiments_future.result()
+        with ThreadPoolExecutor() as executor:
+
+            # Build omics and experiments
+            omics_future = executor.submit(process_omics, executor, args.dataset, args.should_continue)
+            experiments_future = executor.submit(process_experiments, executor, args.dataset, args.should_continue)
 
-    print("Experiments and Omics Files completed.")
+            omics_future.result()
+            experiments_future.result()
 
+            print("Experiments and Omics Files completed.")
+
+    with ThreadPoolExecutor() as executor:
+
+        if args.build:
+            misc_thread = executor.submit(process_misc, executor, args.dataset)
+    if args.build:
+        misc_thread.result()
+        print("Final build step complete.")
+
     if args.validate:
         run_schema_checker(args.dataset)
         print("Validation completed.")
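With the new `--build` flag, building and validating are now explicit, separate opt-ins in `build_dataset.py`. A hedged usage sketch (flag names per the argparse definitions above; the invocation path is assumed):

```python
import subprocess

# Build the broad_sanger dataset, then validate the produced files.
# Equivalent to: python build/build_dataset.py --dataset broad_sanger --build --validate
subprocess.run(
    ['python', 'build/build_dataset.py',
     '--dataset', 'broad_sanger',
     '--build', '--validate'],
    check=True,
)
```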
diff --git a/build/cptac/build_omics.sh b/build/cptac/build_omics.sh
index 0ff95945..95acf485 100644
--- a/build/cptac/build_omics.sh
+++ b/build/cptac/build_omics.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running getCptacData.py with geneFile $1 and curSampleFile=$2."
 python getCptacData.py --geneFile $1 --curSampleFile=$2
diff --git a/build/cptac/build_samples.sh b/build/cptac/build_samples.sh
index 45f99921..9a8f9cb7 100644
--- a/build/cptac/build_samples.sh
+++ b/build/cptac/build_samples.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running getCptacData.py with prevSampleFile=$1."
 python3 getCptacData.py --prevSampleFile=$1
diff --git a/build/docker/Dockerfile.broad_sanger_omics b/build/docker/Dockerfile.broad_sanger_omics
index d1aa5155..2d4cbde8 100755
--- a/build/docker/Dockerfile.broad_sanger_omics
+++ b/build/docker/Dockerfile.broad_sanger_omics
@@ -33,6 +33,8 @@ ADD build/broad_sanger/02a-broad_sanger_proteomics.py ./
 ADD build/broad_sanger/build_samples.sh ./
 ADD build/broad_sanger/build_omics.sh ./
 ADD build/utils/* ./
+ADD build/broad_sanger/build_misc.sh ./
+ADD build/broad_sanger/05_separate_datasets.py ./
 
 ADD build/broad_sanger/requirements.txt .
 ADD build/broad_sanger/omics_requirements.r .
diff --git a/build/docker/Dockerfile.upload b/build/docker/Dockerfile.upload
index b84151eb..41c5295a 100644
--- a/build/docker/Dockerfile.upload
+++ b/build/docker/Dockerfile.upload
@@ -6,6 +6,6 @@ RUN python -m pip install --upgrade pip setuptools wheel twine packaging pyyaml
 
 RUN apt-get update && apt-get install -y git
 
-RUN git clone https://github.com/PNNL-CompBio/coderdata.git
-WORKDIR /usr/src/app/coderdata
\ No newline at end of file
+COPY ./schema /usr/src/app/schema
+ADD scripts/check_schema.py ./
diff --git a/build/hcmi/build_omics.sh b/build/hcmi/build_omics.sh
index 95161aae..d5781a12 100644
--- a/build/hcmi/build_omics.sh
+++ b/build/hcmi/build_omics.sh
@@ -1,4 +1,13 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 02-getHCMIData.py for transcriptomics."
 python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o /tmp/hcmi_transcriptomics.csv.gz -g $1 -s $2
+
+echo "Running 02-getHCMIData.py for copy_number."
 python 02-getHCMIData.py -m full_manifest.txt -t copy_number -o /tmp/hcmi_copy_number.csv.gz -g $1 -s $2
-python 02-getHCMIData.py -m full_manifest.txt -t mutations -o /tmp/hcmi_mutations.csv.gz -g $1 -s $2
+
+echo "Running 02-getHCMIData.py for mutations."
+python 02-getHCMIData.py -m full_manifest.txt -t mutations -o /tmp/hcmi_mutations.csv.gz -g $1 -s $2
\ No newline at end of file
diff --git a/build/hcmi/build_samples.sh b/build/hcmi/build_samples.sh
index e5246c01..48f7e4a0 100644
--- a/build/hcmi/build_samples.sh
+++ b/build/hcmi/build_samples.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 01-createHCMISamplesFile.py with prevSamples $1."
 python 01-createHCMISamplesFile.py --prevSamples $1
diff --git a/build/mpnst/02_get_drug_data.R b/build/mpnst/02_get_drug_data.R
index e1894515..7c07af9b 100644
--- a/build/mpnst/02_get_drug_data.R
+++ b/build/mpnst/02_get_drug_data.R
@@ -154,7 +154,7 @@ source_python("pubchem_retrieval.py")
 
 update_dataframe_and_write_tsv(unique_names=alldrugs,output_filename=newdrugfile,ignore_chems=ignore_file_path)
 
-tab<-read.table(newdrugfile,sep='\t',header=T,quote="")
+tab<-read.table(newdrugfile,sep='\t',header=T,quote="",fill=TRUE)
 
 newdrugs<-tab|>
   subset(chem_name%in%tolower(alldrugs))
diff --git a/build/mpnst/build_drugs.sh b/build/mpnst/build_drugs.sh
index 4561d1a2..3b969d2b 100644
--- a/build/mpnst/build_drugs.sh
+++ b/build/mpnst/build_drugs.sh
@@ -1,3 +1,10 @@
 #!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 02_get_drug_data.R with /tmp/mpnst_drugs.tsv and $1."
 Rscript 02_get_drug_data.R /tmp/mpnst_drugs.tsv $1
+
+echo "Running build_drug_desc.py."
 /opt/venv/bin/python3 build_drug_desc.py --drugtable /tmp/mpnst_drugs.tsv --desctable /tmp/mpnst_drug_descriptors.tsv.gz
diff --git a/build/mpnst/build_exp.sh b/build/mpnst/build_exp.sh
index 7b4b0396..a9a2b763 100644
--- a/build/mpnst/build_exp.sh
+++ b/build/mpnst/build_exp.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 03_get_drug_response_data.R with \$SYNAPSE_AUTH_TOKEN, $1, and $2."
 Rscript 03_get_drug_response_data.R $SYNAPSE_AUTH_TOKEN $1 $2
diff --git a/build/mpnst/build_omics.sh b/build/mpnst/build_omics.sh
index ac9a4420..b08ac63d 100644
--- a/build/mpnst/build_omics.sh
+++ b/build/mpnst/build_omics.sh
@@ -1 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 01_mpnst_get_omics.R with \$SYNAPSE_AUTH_TOKEN, $2, and $1."
 Rscript 01_mpnst_get_omics.R $SYNAPSE_AUTH_TOKEN $2 $1
diff --git a/build/mpnst/build_samples.sh b/build/mpnst/build_samples.sh
index 90f920c3..c9c079fa 100644
--- a/build/mpnst/build_samples.sh
+++ b/build/mpnst/build_samples.sh
@@ -1 +1,7 @@
-Rscript 00_sample_gen.R $1
\ No newline at end of file
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 00_sample_gen.R with $1."
+Rscript 00_sample_gen.R $1
diff --git a/build/mpnst/requirements.txt b/build/mpnst/requirements.txt
index 99f3c960..27c4dc2a 100755
--- a/build/mpnst/requirements.txt
+++ b/build/mpnst/requirements.txt
@@ -8,4 +8,4 @@ scikit-learn
 scipy
 requests
 mordredcommunity
-rdkit
+rdkit
\ No newline at end of file
diff --git a/build/utils/fit_curve.py b/build/utils/fit_curve.py
index 352e3418..41aca076 100755
--- a/build/utils/fit_curve.py
+++ b/build/utils/fit_curve.py
@@ -44,7 +44,7 @@ def hs_response_curve_original(x, einf, ec50, hs):
 
 HS_BOUNDS = ([0, 0, 0], [1, 12, 4])
 #HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
-HS_BOUNDS_NEG = ([0, -5,-1],[1,3,0]) ## made hill slope forced to be negative ##20241017 updated to shift EC50 range
+HS_BOUNDS_NEG = ([0, -5,-1],[1,3,1]) ## hill slope upper bound relaxed to 1, no longer strictly negative ##20241017 updated to shift EC50 range
 
 def response_curve(x, einf, ec50, hs):
     """ transformed the original function with ec50 in -log10(M) instead of M """
diff --git a/build/utils/tpmFromCounts.py b/build/utils/tpmFromCounts.py
index bafbbc5f..17786110 100644
--- a/build/utils/tpmFromCounts.py
+++ b/build/utils/tpmFromCounts.py
@@ -20,7 +20,8 @@ def main():
     pats = set(counts.columns)-set(['stable_id','display_label','description','biotype'])
 
     ##transcript info from grc37
-    gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#')
+    # gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#') # the "current" dir no longer exists...
+    gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#')
     gtf.index = [a.split(';')[0].split(' ')[1].strip('"') for a in gtf[gtf.columns[8]]]
     ##first select only exons
     gtf = gtf[gtf.gene=='exon']
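Background for `tpmFromCounts.py` (this patch only pins the Ensembl release; the computation itself is unchanged): TPM for gene i is derived from its read count c_i and exon length L_i, which the script obtains from the GTF. For reference, the standard definition:

```latex
\mathrm{TPM}_i \;=\; 10^{6} \cdot \frac{c_i / L_i}{\sum_j c_j / L_j}
```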
diff --git a/schema/check_beataml_linkml.sh b/schema/check_beataml_linkml.sh
deleted file mode 100644
index 01498d09..00000000
--- a/schema/check_beataml_linkml.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Define two parallel arrays: one for target classes, another for file names
-target_classes=("Sample" "Transcriptomics" "Proteomics" "Mutations" "Experiments" "Drug")
-files=("/tmp/beataml_samples.csv" "/tmp/beataml_transcriptomics.csv" "/tmp/beataml_proteomics.csv" "/tmp/beataml_mutations.csv" "/tmp/beataml_experiments.csv" "/tmp/beataml_drugs.tsv")
-
-# Initialize a flag to track validation status
-validation_failed=0
-
-# Get the length of the arrays
-array_length=${#target_classes[@]}
-
-# Loop through the arrays
-for (( i=0; i<${array_length}; i++ )); do
-    target_class=${target_classes[$i]}
-    file=${files[$i]}
-    echo "Validating $target_class in file $file..."
-
-    # Run the validation command
-    linkml-validate --schema schema/coderdata.yaml --target-class "$target_class" "$file"
-
-    # Capture the exit status
-    status=$?
-
-    # Check the exit status of the command
-    if [ $status -ne 0 ]; then
-        echo "Validation failed for $target_class in file $file."
-        validation_failed=1
-    else
-        echo "Validation succeeded for $target_class in file $file."
-    fi
-done
-
-# Check if any validations failed
-if [ $validation_failed -ne 0 ]; then
-    echo "One or more validations failed. Exiting with error."
-    exit 1
-else
-    echo "All validations succeeded."
-fi
-
-echo "Validation process completed."
-
diff --git a/schema/check_cptac_linkml.sh b/schema/check_cptac_linkml.sh
deleted file mode 100644
index d5b3682e..00000000
--- a/schema/check_cptac_linkml.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Define two parallel arrays: one for target classes, another for file names
-target_classes=("Sample" "Transcriptomics" "Proteomics" "Mutations" "Copy Number")
-files=("/tmp/cptac_samples.csv" "/tmp/cptac_transcriptomics.csv" "/tmp/cptac_proteomics.csv" "/tmp/cptac_mutations.csv" "/tmp/cptac_copy_number.csv")
-
-# Initialize a flag to track validation status
-validation_failed=0
-
-# Get the length of the arrays
-array_length=${#target_classes[@]}
-
-# Loop through the arrays
-for (( i=0; i<${array_length}; i++ )); do
-    target_class=${target_classes[$i]}
-    file=${files[$i]}
-    echo "Validating $target_class in file $file..."
-
-    # Run the validation command
-    linkml-validate --schema schema/coderdata.yaml --target-class "$target_class" "$file"
-
-    # Capture the exit status
-    status=$?
-
-    # Check the exit status of the command
-    if [ $status -ne 0 ]; then
-        echo "Validation failed for $target_class in file $file."
-        validation_failed=1
-    else
-        echo "Validation succeeded for $target_class in file $file."
-    fi
-done
-
-# Check if any validations failed
-if [ $validation_failed -ne 0 ]; then
-    echo "One or more validations failed. Exiting with error."
-    exit 1
-else
-    echo "All validations succeeded."
-fi
-
-echo "Validation process completed."
-
diff --git a/schema/check_depmap_linkml.sh b/schema/check_depmap_linkml.sh
deleted file mode 100644
index 027699db..00000000
--- a/schema/check_depmap_linkml.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Define two parallel arrays: one for target classes, another for file names
-target_classes=("Sample" "Transcriptomics" "Proteomics" "Mutations" "Copy Number" "Experiments" "Drug")
-files=("/tmp/depmap_samples.csv" "/tmp/depmap_transcriptomics.csv" "/tmp/depmap_proteomics.csv" "/tmp/depmap_mutations.csv" "/tmp/depmap_copy_number.csv" "/tmp/depmap_experiments.csv" "/tmp/depmap_drugs.tsv")
-
-# Initialize a flag to track validation status
-validation_failed=0
-
-# Get the length of the arrays
-array_length=${#target_classes[@]}
-
-# Loop through the arrays
-for (( i=0; i<${array_length}; i++ )); do
-    target_class=${target_classes[$i]}
-    file=${files[$i]}
-    echo "Validating $target_class in file $file..."
-
-    # Run the validation command
-    linkml-validate --schema schema/coderdata.yaml --target-class "$target_class" "$file"
-
-    # Capture the exit status
-    status=$?
-
-    # Check the exit status of the command
-    if [ $status -ne 0 ]; then
-        echo "Validation failed for $target_class in file $file."
-        validation_failed=1
-    else
-        echo "Validation succeeded for $target_class in file $file."
-    fi
-done
-
-# Check if any validations failed
-if [ $validation_failed -ne 0 ]; then
-    echo "One or more validations failed. Exiting with error."
-    exit 1
-else
-    echo "All validations succeeded."
-fi
-
-echo "Validation process completed."
-
diff --git a/schema/check_hcmi_linkml.sh b/schema/check_hcmi_linkml.sh
deleted file mode 100644
index 24f94ddc..00000000
--- a/schema/check_hcmi_linkml.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Define two parallel arrays: one for target classes, another for file names
-target_classes=("Sample" "Transcriptomics" "Copy Number" "Mutations")
-files=("/tmp/hcmi_samples.csv" "/tmp/hcmi_transcriptomics.csv" "/tmp/hcmi_copy_number.csv" "/tmp/hcmi_mutations.csv")
-
-# Initialize a flag to track validation status
-validation_failed=0
-
-# Get the length of the arrays
-array_length=${#target_classes[@]}
-
-# Loop through the arrays
-for (( i=0; i<${array_length}; i++ )); do
-    target_class=${target_classes[$i]}
-    file=${files[$i]}
-    echo "Validating $target_class in file $file..."
-
-    # Run the validation command
-    linkml-validate --schema schema/coderdata.yaml --target-class "$target_class" "$file"
-
-    # Capture the exit status
-    status=$?
-
-    # Check the exit status of the command
-    if [ $status -ne 0 ]; then
-        echo "Validation failed for $target_class in file $file."
-        validation_failed=1
-    else
-        echo "Validation succeeded for $target_class in file $file."
-    fi
-done
-
-# Check if any validations failed
-if [ $validation_failed -ne 0 ]; then
-    echo "One or more validations failed. Exiting with error."
-    exit 1
-else
-    echo "All validations succeeded."
-fi
-
-echo "Validation process completed."
-
diff --git a/schema/check_mpnst_linkml.sh b/schema/check_mpnst_linkml.sh
deleted file mode 100644
index 6daebfd0..00000000
--- a/schema/check_mpnst_linkml.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Define two parallel arrays: one for target classes, another for file names
-target_classes=("Sample" "Transcriptomics" "Proteomics" "Mutations" "Experiments" "Drug")
-files=("/tmp/mpnst_samples.csv" "/tmp/mpnst_transcriptomics.csv" "/tmp/mpnst_proteomics.csv" "/tmp/mpnst_mutations.csv" "/tmp/mpnst_experiments.csv" "/tmp/mpnst_drugs.tsv")
-
-# Initialize a flag to track validation status
-validation_failed=0
-
-# Get the length of the arrays
-array_length=${#target_classes[@]}
-
-# Loop through the arrays
-for (( i=0; i<${array_length}; i++ )); do
-    target_class=${target_classes[$i]}
-    file=${files[$i]}
-    echo "Validating $target_class in file $file..."
-
-    # Run the validation command
-    linkml-validate --schema schema/coderdata.yaml --target-class "$target_class" "$file"
-
-    # Capture the exit status
-    status=$?
-
-    # Check the exit status of the command
-    if [ $status -ne 0 ]; then
-        echo "Validation failed for $target_class in file $file."
-        validation_failed=1
-    else
-        echo "Validation succeeded for $target_class in file $file."
-    fi
-done
-
-# Check if any validations failed
-if [ $validation_failed -ne 0 ]; then
-    echo "One or more validations failed. Exiting with error."
-    exit 1
-else
-    echo "All validations succeeded."
-fi
-
-echo "Validation process completed."
-
diff --git a/schema/coderdata.yaml b/schema/coderdata.yaml
index 1b5f9916..3f9dafde 100755
--- a/schema/coderdata.yaml
+++ b/schema/coderdata.yaml
@@ -76,6 +76,7 @@ classes:
     attributes:
       chem_name:
         description: Name of drug
+        range: linkml:Any
       canSMILES:
         description: Canonical SMILE string
       isoSMILES:
@@ -86,7 +87,7 @@ classes:
         description: Chemical formula
       weight:
         description: Molecular weight
-        range: float
+        range: linkml:Any
       pubchem_id:
         description: PubChem Identifier for this drug, can be many.
         range: int
@@ -98,7 +99,7 @@ classes:
       structural_descriptor:
         description: string name describing structural descriptor
       descriptor_value:
-        range: any
+        range: linkml:Any
         description: value representing descriptor value
   Transcriptomics:
     description:
@@ -160,7 +161,7 @@ classes:
         description: Metric by which dose response value is measured
       dose_response_value:
         description: Value of metric
-        range: float
+        range: linkml:Any
   Perturbations:
     slots:
       - entrez_id
diff --git a/schema/expected_files.yaml b/schema/expected_files.yaml
new file mode 100644
index 00000000..6604b094
--- /dev/null
+++ b/schema/expected_files.yaml
@@ -0,0 +1,194 @@
+datasets:
+  beataml:
+    - target_class: Sample
+      file: /tmp/beataml_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/beataml_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/beataml_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/beataml_mutations.csv
+    - target_class: Experiments
+      file: /tmp/beataml_experiments.tsv
+    - target_class: Drug
+      file: /tmp/beataml_drugs.tsv
+
+  hcmi:
+    - target_class: Sample
+      file: /tmp/hcmi_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/hcmi_transcriptomics.csv
+    - target_class: Copy Number
+      file: /tmp/hcmi_copy_number.csv
+    - target_class: Mutations
+      file: /tmp/hcmi_mutations.csv
+
+  mpnst:
+    - target_class: Sample
+      file: /tmp/mpnst_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/mpnst_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/mpnst_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/mpnst_mutations.csv
+    - target_class: Experiments
+      file: /tmp/mpnst_experiments.tsv
+    - target_class: Drug
+      file: /tmp/mpnst_drugs.tsv
+
+  cptac:
+    - target_class: Sample
+      file: /tmp/cptac_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/cptac_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/cptac_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/cptac_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/cptac_copy_number.csv
+
+  ccle:
+    - target_class: Sample
+      file: /tmp/ccle_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/ccle_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/ccle_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/ccle_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/ccle_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/ccle_experiments.tsv
+    - target_class: Drug
+      file: /tmp/ccle_drugs.tsv
+
+  ctrpv2:
+    - target_class: Sample
+      file: /tmp/ctrpv2_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/ctrpv2_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/ctrpv2_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/ctrpv2_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/ctrpv2_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/ctrpv2_experiments.tsv
+    - target_class: Drug
+      file: /tmp/ctrpv2_drugs.tsv
+
+  depmap:
+    - target_class: Sample
+      file: /tmp/depmap_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/depmap_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/depmap_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/depmap_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/depmap_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/depmap_experiments.csv
+    - target_class: Drug
+      file: /tmp/depmap_drugs.tsv
+
+  fimm:
+    - target_class: Sample
+      file: /tmp/fimm_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/fimm_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/fimm_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/fimm_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/fimm_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/fimm_experiments.tsv
+    - target_class: Drug
+      file: /tmp/fimm_drugs.tsv
+
+  gcsi:
+    - target_class: Sample
+      file: /tmp/gcsi_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/gcsi_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/gcsi_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/gcsi_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/gcsi_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/gcsi_experiments.tsv
+    - target_class: Drug
+      file: /tmp/gcsi_drugs.tsv
+
+  gdscv1:
+    - target_class: Sample
+      file: /tmp/gdscv1_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/gdscv1_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/gdscv1_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/gdscv1_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/gdscv1_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/gdscv1_experiments.tsv
+    - target_class: Drug
+      file: /tmp/gdscv1_drugs.tsv
+
+  gdscv2:
+    - target_class: Sample
+      file: /tmp/gdscv2_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/gdscv2_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/gdscv2_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/gdscv2_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/gdscv2_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/gdscv2_experiments.tsv
+    - target_class: Drug
+      file: /tmp/gdscv2_drugs.tsv
+
+  nci60:
+    - target_class: Sample
+      file: /tmp/nci60_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/nci60_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/nci60_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/nci60_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/nci60_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/nci60_experiments.tsv
+    - target_class: Drug
+      file: /tmp/nci60_drugs.tsv
+
+  prism:
+    - target_class: Sample
+      file: /tmp/prism_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/prism_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/prism_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/prism_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/prism_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/prism_experiments.tsv
+    - target_class: Drug
+      file: /tmp/prism_drugs.tsv
diff --git a/scripts/check_all_schemas.py b/scripts/check_all_schemas.py
deleted file mode 100644
index 72817f25..00000000
--- a/scripts/check_all_schemas.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import subprocess
-import os
-import argparse
-import concurrent.futures
-
-def run_schema_checker(script_name):
-    """
-    Runs a schema checker shell script and returns the outcome.
-
-    Parameters:
-    script_name (str): The filename of the shell script to run.
-
-    Returns:
-    tuple: (script_name, bool) where bool is True if the validation succeeded, False if it failed.
-    """
-    try:
-        # Build the full path to the script
-        script_path = os.path.join('schema', script_name)
-        # Run the shell script
-        result = subprocess.run(['bash', script_path], capture_output=True, text=True)
-        # Print output and error from shell script
-        print(result.stdout)
-        if result.stderr:
-            print(f"Error in {script_name}:", result.stderr)
-        # Return the script name and True if the script executed successfully
-        return (script_name, result.returncode == 0)
-    except Exception as e:
-        print(f"An error occurred while running {script_name}: {e}")
-        return (script_name, False)
-
-def main():
-    parser = argparse.ArgumentParser(description="Run schema validations for specified datasets.")
-    parser.add_argument('-d', '--datasets', nargs='*', help='List of datasets to validate (e.g., beataml, cptac, depmap, hcmi)', default=None)
-    args = parser.parse_args()
-
-    # Mapping from dataset names to script names
-    schema_mapping = {
-        'beataml': 'check_beataml_linkml.sh',
-        'cptac': 'check_cptac_linkml.sh',
-        'depmap': 'check_depmap_linkml.sh',
-        'hcmi': 'check_hcmi_linkml.sh',
-        'mpnst': 'check_mpnst_linkml.sh'
-    }
-
-    scripts_to_run = schema_mapping.values() if not args.datasets else [schema_mapping[dataset] for dataset in args.datasets if dataset in schema_mapping]
-
-    all_passed = True
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = {executor.submit(run_schema_checker, script): script for script in scripts_to_run}
-
-        for future in concurrent.futures.as_completed(futures):
-            script_name, result = future.result()
-            if not result:
-                all_passed = False
-                print(f"Validation failed for {script_name}")
-
-    if all_passed:
-        print("All schema validations passed successfully.")
-    else:
-        print("Some schema validations failed.")
-
-if __name__ == '__main__':
-    main()
diff --git a/scripts/check_schema.py b/scripts/check_schema.py
new file mode 100644
index 00000000..f547e54d
--- /dev/null
+++ b/scripts/check_schema.py
@@ -0,0 +1,96 @@
+import subprocess
+import argparse
+import concurrent.futures
+import sys
+import yaml
+import os
+
+def run_validations_for_dataset(dataset_name, validations):
+    """
+    Runs validations for a dataset and returns whether all validations passed.
+
+    Parameters:
+    dataset_name (str): The name of the dataset.
+    validations (list): List of validations, each a dict with 'target_class' and 'file'.
+
+    Returns:
+    tuple: (dataset_name, bool) where bool is True if all validations passed, False otherwise.
+    """
+    validation_failed = False
+    for validation in validations:
+        target_class = validation['target_class']
+        file_path = validation['file']
+        print(f"Validating {target_class} in file {file_path} for dataset {dataset_name}...")
+
+        # Run the validation command
+        try:
+            result = subprocess.run(
+                ['linkml-validate', '--schema', 'schema/coderdata.yaml', '--target-class', target_class, file_path],
+                stdout=sys.stdout, stderr=sys.stderr, text=True
+            )
+
+            # Print output and error from command
+            if result.stdout:
+                print(result.stdout)
+            if result.stderr:
+                print(f"Error in validating {target_class} in {file_path}:", result.stderr)
+            # Check the exit status
+            if result.returncode != 0:
+                print(f"Validation failed for {target_class} in file {file_path}.")
+                validation_failed = True
+            else:
+                print(f"Validation succeeded for {target_class} in file {file_path}.")
+        except Exception as e:
+            print(f"An error occurred while validating {target_class} in {file_path}: {e}")
+            validation_failed = True
+
+    if validation_failed:
+        print(f"One or more validations failed for dataset {dataset_name}.")
+        return (dataset_name, False)
+    else:
+        print(f"All validations succeeded for dataset {dataset_name}.")
+        return (dataset_name, True)
+
+def main():
+    parser = argparse.ArgumentParser(description="Run schema validations for specified datasets.")
+    parser.add_argument('-d', '--datasets', nargs='*', help='List of datasets to validate (e.g., "beataml cptac ccle hcmi")', default=None)
+    args = parser.parse_args()
+
+    # Read the config file
+    config_path = os.path.join('schema', 'expected_files.yaml')
+    with open(config_path, 'r') as f:
+        config = yaml.safe_load(f)
+
+    available_datasets = config['datasets'].keys()
+    datasets_to_validate = args.datasets if args.datasets else available_datasets
+    datasets_to_validate = [dataset for dataset in datasets_to_validate if dataset in available_datasets]
+
+    print(f"Datasets to validate: {datasets_to_validate}")
+
+    all_passed = True
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(
+                run_validations_for_dataset,
+                dataset,
+                config['datasets'][dataset]
+            ): dataset for dataset in datasets_to_validate
+        }
+
+        for future in concurrent.futures.as_completed(futures):
+            dataset_name, result = future.result()
+            if not result:
+                all_passed = False
+                print(f"Validation failed for dataset {dataset_name}")
+            else:
+                print(f"Validation passed for dataset {dataset_name}")
+
+    if all_passed:
+        print("All schema validations passed successfully.")
+        sys.exit(0)
+    else:
+        print("Some schema validations failed.")
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()