Merge pull request #232 from PNNL-CompBio/nci-fix

Nci fix
PNNL-CompBio · Oct 21, 2024 · 4025953 · 4025953
2 parents 1fc3e75 + 4895f96
commit 4025953
Show file tree

Hide file tree

Showing 8 changed files with 97 additions and 42 deletions.
diff --git a/.gitignore b/.gitignore
@@ -17,3 +17,4 @@ __pycache__
 tests/__pycache__
 dist
 build/lib
+build/local
diff --git a/build/broad_sanger/03a-nci60Drugs.py b/build/broad_sanger/03a-nci60Drugs.py
@@ -13,10 +13,23 @@
 
 ##drug files
 smi_strings='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=1&modificationDate=1710381820000&api=v2&download=true'
+#oct 2024
+smi_strings = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=3&modificationDate=1727924130457&api=v2&download=true'
+
 pc_ids='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=2&modificationDate=1712766341112&api=v2&download=true'
+pc_ids = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=4&modificationDate=1727924129121&api=v2&download=true'
+
+#oct 2024
 chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemcal_name.csv?version=1&modificationDate=1710382716000&api=v2&download=true'
+
+chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemical_name.csv?version=1&modificationDate=1727924127004&api=v2'
+#oct 2024
 cas='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=1&modificationDate=1710381783000&api=v2&download=true'
+#oct 2024
+cas = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=3&modificationDate=1727924126194&api=v2&download=true'
 conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
+##OCT 2024
+conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=13&modificationDate=1727922354561&api=v2'
 
 
 def main():    
@@ -39,13 +52,27 @@ def main():
     if not os.path.exists('DOSERESP.csv'):
         resp = request.urlretrieve(conc_data,'doseresp.zip')
         os.system('unzip doseresp.zip')
-    dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
+    dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000,ignore_errors=True)
     pubchems = pubchems.filter(pl.col('NSC').is_in(dose_resp['NSC']))
+    smiles = smiles.filter(pl.col("NSC").is_in(dose_resp['NSC']))
     ##first retreive pubchem data
     if opts.test:
         arr = rand.sample(list(pubchems['CID']),100)
     else:
         arr = set(pubchems['CID'])
+
+    ##first filter to see if there are structures/drugs in teh data already. i dont think this does much.
+    if os.path.exists(opts.output):
+        curdrugs = pl.read_csv(opts.output,separator='\t')
+       # cs = set(curdrugs['isoSMILES'])
+        smiles = smiles.filter(pl.col('SMILES').is_not_null())
+        upper=[a.upper() for a in smiles['SMILES']]
+        smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
+        ##reduce to smiels only in current drugs
+        ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
+        ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
+        pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
+        arr = set(pubchems['CID'])
 
     print("Querying pubchem from CIDs")
     pr.update_dataframe_and_write_tsv(arr,opts.output,'/tmp/ignore_chems.txt',batch_size=400,isname=False,time_limit=10*60*60)

diff --git a/build/broad_sanger/04a-drugResponseData.R b/build/broad_sanger/04a-drugResponseData.R
@@ -123,7 +123,7 @@ getDoseRespData<-function(dset,studyName,improve_samples,drug.map){
     dplyr::full_join(respDat,by=c('doseNum','exp_id'))%>%
     left_join(full.map)%>%
     dplyr::select(Drug=improve_drug_id,improve_sample_id=improve_sample_id,DOSE=Dose,GROWTH=Response)%>%
-    #dplyr::mutate(DOSE=-log10(Dose/1000))###curve fitting code requires -log10(M), these are mM
+    #dplyr::mutate(DOSE=-log10(Dose/1000))###curve fitting code requires -log10(M), these are uM according to https://github.com/bhklab/PharmacoGx/blob/master/vignettes/CreatingPharmacoSet.Rmd
     #rename(GROWTH=RESPONSE)%>%
     mutate(source='pharmacoGX')%>%
       mutate(study=studyName)|>

diff --git a/build/broad_sanger/04b-nci60-updated.py b/build/broad_sanger/04b-nci60-updated.py
@@ -1,5 +1,5 @@
 '''
-gets nci60 data from 10/2023 release
+gets nci60 data from 10/2024 release
 
 '''
 
@@ -12,13 +12,17 @@
 from urllib import request
 
 conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
+##OCT 2024
+conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=13&modificationDate=1727922354561&api=v2'
+
 cancelled = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP_Cancelled.csv?version=1&modificationDate=1660871847000&api=v2&download=true'
 
 def main():
 
     parser = argparse.ArgumentParser()
     parser.add_argument('--sampleFile',dest='samplefile',default=None,help='DepMap sample file') 
     parser.add_argument('--drugFile',dest='dfile',default=None,help='Drug database')
+
 
     opts = parser.parse_args()
 
@@ -31,7 +35,7 @@ def main():
     samples = pl.read_csv(samplefile,quote_char='"')
     drugs = pl.read_csv(drugfile,separator='\t',quote_char='"')
 
-    dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
+    dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000,ignore_errors=True)
 
     ##update drug mapping
     drugmapping = pl.DataFrame(
@@ -47,7 +51,10 @@ def main():
     ###update sample mapping
     on = samples[['other_names','improve_sample_id']]
     on.columns=['common_name','improve_sample_id']
-
+
+    #there should be 71 cell lines, but there are 163.
+    # 82 map to the 'other_names'
+    # 81 map to neither
     sampmapping = pl.concat([on[['common_name','improve_sample_id']],samples[['common_name','improve_sample_id']]])
 
     sampmapping = sampmapping.unique()
@@ -64,44 +71,50 @@ def main():
 
 
     ##now we can merge all the data into the dose response data frame
-    merged = dose_resp[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
+    merged = dose_resp[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
     merged = merged.join(timemapping,on='EXPID',how='left')
 
     ##clean up mssing samples
     nonulls = merged.filter(pl.col('improve_sample_id').is_not_null())
 
     nulls = merged.filter(pl.col('improve_sample_id').is_null())
 
-    newnames = pl.DataFrame(
-        {
-            'new_name': [re.split(r' |\(|/', a)[0] for a in nulls['CELL_NAME']],
-            'CELL_NAME':nulls['CELL_NAME']
-        }
-    )
-    newnames = newnames.unique()
+  #  newnames = pl.DataFrame(
+  #      {
+  #          'new_name':[re.split(' |\(|\/',a)[0] for a in nulls['CELL_NAME']],
+  #          'CELL_NAME':nulls['CELL_NAME']
+  #      }
+  #  )
+  #  newnames = newnames.unique()
+
 
-    fixed = nulls[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
-    fixed.columns = ['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
-    fixed = fixed.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
-    fixed.columns = ['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
-    fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
+  #  fixed = nulls[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
+  #  merged.columns = ['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
+  #  fixed = merged.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
+  #  fixed.columns = ['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
+#    fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
 
-    merged = pl.concat([nonulls,fixed])
+    merged = nonulls#pl.concat([nonulls,fixed])
 
     ###we get a few more results added, but still missing a bunch    
     merged = merged.join(drugmapping,on='NSC',how='left')
     nulldrugs = merged.filter(pl.col('improve_drug_id').is_null())
     nonulls =  merged.filter(pl.col('improve_drug_id').is_not_null())
+
+    ###now update all the concentrations to be in Moles (some are in uM, all are log10)
+    ##some are provided as molecular weights ('v') or other ('s') and we can't compare
+    molar = merged.filter(pl.col('CONCENTRATION_UNIT')=='M')
+
     finaldf = pl.DataFrame(
         {
-            'source':['NCI60_24' for a in nonulls['improve_drug_id']], ##2024 build
-            'improve_sample_id':nonulls['improve_sample_id'],
-            'Drug':nonulls['improve_drug_id'],
-            'study':['NCI60' for a in nonulls['improve_drug_id']],
-            'time':nonulls['time'],
-            'time_unit':nonulls['time_unit'],
-            'DOSE': [10**a for a in nonulls['CONCENTRATION']],
-            'GROWTH':nonulls['AVERAGE_PTC']
+            'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
+            'improve_sample_id':molar['improve_sample_id'],
+            'Drug':molar['improve_drug_id'],
+            'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
+            'time':molar['time'],
+            'time_unit':molar['time_unit'],
+            'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
+            'GROWTH':molar['AVERAGE_PTC']
         }
     )
     ##write to file

diff --git a/build/docker/Dockerfile.mpnst b/build/docker/Dockerfile.mpnst
@@ -31,7 +31,7 @@ RUN /opt/venv/bin/pip install --upgrade pip
 
 # Set environment variables for reticulate
 ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
-ENV PYTHONPATH="${PYTHONPATH}:/app"
+ENV PYTHONPATH=/app#"${PYTHONPATH}:/app"
 WORKDIR /app
 
 # Set MPLCONFIGDIR to a writable directory
@@ -51,4 +51,4 @@ RUN /opt/venv/bin/pip3 install -r requirements.txt
 RUN Rscript requirements.r
 
 # Set up volume for temporary storage
-VOLUME ["/tmp"]
+VOLUME ["/tmp"]
diff --git a/build/mpnst/03_get_drug_response_data.R b/build/mpnst/03_get_drug_response_data.R
@@ -85,7 +85,7 @@ getDrugDataByParent<-function(parid,sampleId){
         data <- fread(synGet(x)$path)|>
             filter(response_type=='percent viability')|>
             mutate(improve_sample_id=sampleId,
-                   DOSE=exp(dosage)/1000000, ##dosage is log(M), need to move to micromolar
+                   DOSE=(10^dosage)*1000000, ##dosage is log(M), need to move to micromolar
                    GROWTH=response, #/100,
                    source = "NF Data Portal",
                    #CELL = improve_sample_id,

diff --git a/build/mpnst/README.md b/build/mpnst/README.md
@@ -12,7 +12,7 @@ directory. Currently using the test files as input.
    `mpnst_samples.csv` file. This pulls from the latest synapse
    project metadata table.
    ```
-   docker run -v $PWD:/tmp -e -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
    ```
 
 3. Pull the data and map it to the samples. This uses the metadata

diff --git a/build/utils/fit_curve.py b/build/utils/fit_curve.py
@@ -43,7 +43,8 @@ def hs_response_curve_original(x, einf, ec50, hs):
 
 
 HS_BOUNDS = ([0, 0, 0], [1, 12, 4]) 
-HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
+#HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
+HS_BOUNDS_NEG = ([0, -5,-1],[1,3,0]) ## made hill slope forced to be negative  ##20241017 updated to shift EC50 range
 def response_curve(x, einf, ec50, hs):
     """ transformed the original function with ec50 in -log10(M) instead of M
     """
@@ -116,14 +117,14 @@ def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
 
     return metrics.to_frame(name='metrics').T
 
-def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10):
+
+def compute_fit_metrics(xdata, ydata, popt, pcov, d1 = -5, d2=3):#d1=4, d2=10):
     '''
-    xdata: dose data in log10(M)
-    ydata: range from 0 to 1
+    xdata: dose data in log10(    ydata: range from 0 to 1
     popt: fit curve metrics
     pcov: ??
-    d1: minimum fixed dose in log10(M)
-    d2: maximum fixed dose log10(M)
+    d1: minimum fixed dose in log10(M) ##updated to uM and  made range larger
+    d2: maximum fixed dose log10(M) ##updated to uM and made ranger larger
     '''
     if popt is None:
         cols = ['fit_auc','fit_ic50','fit_ec50','fit_ec50se','fit_r2','fit_einf','fit_hs','aac','auc','dss']#'auc ic50 ec50 ec50se R2fit rinf hs aac1 auc1 dss1'.split(' ')
@@ -146,6 +147,7 @@ def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10):
     int10x = compute_area(xmin, ic10x, *popt)
     ##old code - assumes a positive hill slope, otherwise doesn't seem to work.
     dss1 = (0.9 * (ic10x - xmin) - int10x) / (0.9 * (xmax - xmin)) if xmin < ic10x else 0
+    #this auc has fixed doses, so can be (in theory) standardized across datasets
     auc = (response_integral(d2, *popt) - response_integral(d1, *popt)) / (d2 - d1)
     ##added by sara, i'm not sure where the above came from
     ## orig definition from paper is here: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep05193/MediaObjects/41598_2014_BFsrep05193_MOESM1_ESM.pdf
@@ -239,20 +241,32 @@ def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--input', help='input file')
+    parser.add_argument('--input', help='input file with the following columns:\
+    DOSE: dose of drug in uM,\
+    GROWTH: percentage of cells left,\
+    study: name of study to group measurements by,\
+    source: source of the data,\
+    improve_sample_id: improve_sample_id,\
+    Drug: improve_drug_id,\
+    time: time at which measurement was taken,\
+    time_unit: unit of time')
     parser.add_argument('--output', help='prefix of output file')
     parser.add_argument('--beataml', action='store_true', help='Include this if for BeatAML')
-
+    parser.add_argument('--debug',action='store_true',default=False)
+
     args = parser.parse_args()
     print(args.input)
     df_all = pd.read_table(args.input)
+    if args.debug:
+        df_all = df_all.iloc[0:1000000]
+
     #drop nas
     df_all = df_all.dropna()
-    ##pharmacoGX data is micromolar, we need log transformed molar
-    df_all.DOSE = np.log10(df_all.DOSE*1000000)
+    ##pharmacoGX data is micromolar, we need log transformed data
+    df_all.DOSE = np.log10(df_all.DOSE)
     ##need data to be between 0 and 1, not 0 and 100
     df_all.GROWTH=df_all.GROWTH/100.00
-
+    print(df_all.head)
     fname = args.output or 'combined_single_response_agg'
     process_df_part(df_all, fname, beataml=args.beataml)#, start=args.start, count=args.count)