Skip to content

Commit

Permalink
Merge pull request #232 from PNNL-CompBio/nci-fix
Browse files Browse the repository at this point in the history
Nci fix
  • Loading branch information
sgosline authored Oct 21, 2024
2 parents 1fc3e75 + 4895f96 commit 4025953
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 42 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ __pycache__
tests/__pycache__
dist
build/lib
build/local
29 changes: 28 additions & 1 deletion build/broad_sanger/03a-nci60Drugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,23 @@

##drug files
smi_strings='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=1&modificationDate=1710381820000&api=v2&download=true'
#oct 2024
smi_strings = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=3&modificationDate=1727924130457&api=v2&download=true'

pc_ids='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=2&modificationDate=1712766341112&api=v2&download=true'
pc_ids = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=4&modificationDate=1727924129121&api=v2&download=true'

#oct 2024
chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemcal_name.csv?version=1&modificationDate=1710382716000&api=v2&download=true'

chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemical_name.csv?version=1&modificationDate=1727924127004&api=v2'
#oct 2024
cas='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=1&modificationDate=1710381783000&api=v2&download=true'
#oct 2024
cas = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=3&modificationDate=1727924126194&api=v2&download=true'
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
##OCT 2024
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=13&modificationDate=1727922354561&api=v2'


def main():
Expand All @@ -39,13 +52,27 @@ def main():
if not os.path.exists('DOSERESP.csv'):
resp = request.urlretrieve(conc_data,'doseresp.zip')
os.system('unzip doseresp.zip')
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000,ignore_errors=True)
pubchems = pubchems.filter(pl.col('NSC').is_in(dose_resp['NSC']))
smiles = smiles.filter(pl.col("NSC").is_in(dose_resp['NSC']))
##first retreive pubchem data
if opts.test:
arr = rand.sample(list(pubchems['CID']),100)
else:
arr = set(pubchems['CID'])

##first filter to see if there are structures/drugs in teh data already. i dont think this does much.
if os.path.exists(opts.output):
curdrugs = pl.read_csv(opts.output,separator='\t')
# cs = set(curdrugs['isoSMILES'])
smiles = smiles.filter(pl.col('SMILES').is_not_null())
upper=[a.upper() for a in smiles['SMILES']]
smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
##reduce to smiels only in current drugs
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
arr = set(pubchems['CID'])

print("Querying pubchem from CIDs")
pr.update_dataframe_and_write_tsv(arr,opts.output,'/tmp/ignore_chems.txt',batch_size=400,isname=False,time_limit=10*60*60)
Expand Down
2 changes: 1 addition & 1 deletion build/broad_sanger/04a-drugResponseData.R
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ getDoseRespData<-function(dset,studyName,improve_samples,drug.map){
dplyr::full_join(respDat,by=c('doseNum','exp_id'))%>%
left_join(full.map)%>%
dplyr::select(Drug=improve_drug_id,improve_sample_id=improve_sample_id,DOSE=Dose,GROWTH=Response)%>%
#dplyr::mutate(DOSE=-log10(Dose/1000))###curve fitting code requires -log10(M), these are mM
#dplyr::mutate(DOSE=-log10(Dose/1000))###curve fitting code requires -log10(M), these are uM according to https://github.com/bhklab/PharmacoGx/blob/master/vignettes/CreatingPharmacoSet.Rmd
#rename(GROWTH=RESPONSE)%>%
mutate(source='pharmacoGX')%>%
mutate(study=studyName)|>
Expand Down
63 changes: 38 additions & 25 deletions build/broad_sanger/04b-nci60-updated.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
'''
gets nci60 data from 10/2023 release
gets nci60 data from 10/2024 release
'''

Expand All @@ -12,13 +12,17 @@
from urllib import request

conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
##OCT 2024
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=13&modificationDate=1727922354561&api=v2'

cancelled = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP_Cancelled.csv?version=1&modificationDate=1660871847000&api=v2&download=true'

def main():

parser = argparse.ArgumentParser()
parser.add_argument('--sampleFile',dest='samplefile',default=None,help='DepMap sample file')
parser.add_argument('--drugFile',dest='dfile',default=None,help='Drug database')


opts = parser.parse_args()

Expand All @@ -31,7 +35,7 @@ def main():
samples = pl.read_csv(samplefile,quote_char='"')
drugs = pl.read_csv(drugfile,separator='\t',quote_char='"')

dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000,ignore_errors=True)

##update drug mapping
drugmapping = pl.DataFrame(
Expand All @@ -47,7 +51,10 @@ def main():
###update sample mapping
on = samples[['other_names','improve_sample_id']]
on.columns=['common_name','improve_sample_id']


#there should be 71 cell lines, but there are 163.
# 82 map to the 'other_names'
# 81 map to neither
sampmapping = pl.concat([on[['common_name','improve_sample_id']],samples[['common_name','improve_sample_id']]])

sampmapping = sampmapping.unique()
Expand All @@ -64,44 +71,50 @@ def main():


##now we can merge all the data into the dose response data frame
merged = dose_resp[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
merged = dose_resp[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
merged = merged.join(timemapping,on='EXPID',how='left')

##clean up mssing samples
nonulls = merged.filter(pl.col('improve_sample_id').is_not_null())

nulls = merged.filter(pl.col('improve_sample_id').is_null())

newnames = pl.DataFrame(
{
'new_name': [re.split(r' |\(|/', a)[0] for a in nulls['CELL_NAME']],
'CELL_NAME':nulls['CELL_NAME']
}
)
newnames = newnames.unique()
# newnames = pl.DataFrame(
# {
# 'new_name':[re.split(' |\(|\/',a)[0] for a in nulls['CELL_NAME']],
# 'CELL_NAME':nulls['CELL_NAME']
# }
# )
# newnames = newnames.unique()


fixed = nulls[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
fixed = fixed.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
# fixed = nulls[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
# merged.columns = ['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
# fixed = merged.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
# fixed.columns = ['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
# fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())

merged = pl.concat([nonulls,fixed])
merged = nonulls#pl.concat([nonulls,fixed])

###we get a few more results added, but still missing a bunch
merged = merged.join(drugmapping,on='NSC',how='left')
nulldrugs = merged.filter(pl.col('improve_drug_id').is_null())
nonulls = merged.filter(pl.col('improve_drug_id').is_not_null())

###now update all the concentrations to be in Moles (some are in uM, all are log10)
##some are provided as molecular weights ('v') or other ('s') and we can't compare
molar = merged.filter(pl.col('CONCENTRATION_UNIT')=='M')

finaldf = pl.DataFrame(
{
'source':['NCI60_24' for a in nonulls['improve_drug_id']], ##2024 build
'improve_sample_id':nonulls['improve_sample_id'],
'Drug':nonulls['improve_drug_id'],
'study':['NCI60' for a in nonulls['improve_drug_id']],
'time':nonulls['time'],
'time_unit':nonulls['time_unit'],
'DOSE': [10**a for a in nonulls['CONCENTRATION']],
'GROWTH':nonulls['AVERAGE_PTC']
'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
'improve_sample_id':molar['improve_sample_id'],
'Drug':molar['improve_drug_id'],
'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
'time':molar['time'],
'time_unit':molar['time_unit'],
'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
'GROWTH':molar['AVERAGE_PTC']
}
)
##write to file
Expand Down
4 changes: 2 additions & 2 deletions build/docker/Dockerfile.mpnst
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ RUN /opt/venv/bin/pip install --upgrade pip

# Set environment variables for reticulate
ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
ENV PYTHONPATH="${PYTHONPATH}:/app"
ENV PYTHONPATH=/app#"${PYTHONPATH}:/app"
WORKDIR /app

# Set MPLCONFIGDIR to a writable directory
Expand All @@ -51,4 +51,4 @@ RUN /opt/venv/bin/pip3 install -r requirements.txt
RUN Rscript requirements.r

# Set up volume for temporary storage
VOLUME ["/tmp"]
VOLUME ["/tmp"]
2 changes: 1 addition & 1 deletion build/mpnst/03_get_drug_response_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ getDrugDataByParent<-function(parid,sampleId){
data <- fread(synGet(x)$path)|>
filter(response_type=='percent viability')|>
mutate(improve_sample_id=sampleId,
DOSE=exp(dosage)/1000000, ##dosage is log(M), need to move to micromolar
DOSE=(10^dosage)*1000000, ##dosage is log(M), need to move to micromolar
GROWTH=response, #/100,
source = "NF Data Portal",
#CELL = improve_sample_id,
Expand Down
2 changes: 1 addition & 1 deletion build/mpnst/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ directory. Currently using the test files as input.
`mpnst_samples.csv` file. This pulls from the latest synapse
project metadata table.
```
docker run -v $PWD:/tmp -e -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
```

3. Pull the data and map it to the samples. This uses the metadata
Expand Down
36 changes: 25 additions & 11 deletions build/utils/fit_curve.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def hs_response_curve_original(x, einf, ec50, hs):


HS_BOUNDS = ([0, 0, 0], [1, 12, 4])
HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
#HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
HS_BOUNDS_NEG = ([0, -5,-1],[1,3,0]) ## made hill slope forced to be negative ##20241017 updated to shift EC50 range
def response_curve(x, einf, ec50, hs):
""" transformed the original function with ec50 in -log10(M) instead of M
"""
Expand Down Expand Up @@ -116,14 +117,14 @@ def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):

return metrics.to_frame(name='metrics').T

def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10):

def compute_fit_metrics(xdata, ydata, popt, pcov, d1 = -5, d2=3):#d1=4, d2=10):
'''
xdata: dose data in log10(M)
ydata: range from 0 to 1
xdata: dose data in log10( ydata: range from 0 to 1
popt: fit curve metrics
pcov: ??
d1: minimum fixed dose in log10(M)
d2: maximum fixed dose log10(M)
d1: minimum fixed dose in log10(M) ##updated to uM and made range larger
d2: maximum fixed dose log10(M) ##updated to uM and made ranger larger
'''
if popt is None:
cols = ['fit_auc','fit_ic50','fit_ec50','fit_ec50se','fit_r2','fit_einf','fit_hs','aac','auc','dss']#'auc ic50 ec50 ec50se R2fit rinf hs aac1 auc1 dss1'.split(' ')
Expand All @@ -146,6 +147,7 @@ def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10):
int10x = compute_area(xmin, ic10x, *popt)
##old code - assumes a positive hill slope, otherwise doesn't seem to work.
dss1 = (0.9 * (ic10x - xmin) - int10x) / (0.9 * (xmax - xmin)) if xmin < ic10x else 0
#this auc has fixed doses, so can be (in theory) standardized across datasets
auc = (response_integral(d2, *popt) - response_integral(d1, *popt)) / (d2 - d1)
##added by sara, i'm not sure where the above came from
## orig definition from paper is here: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep05193/MediaObjects/41598_2014_BFsrep05193_MOESM1_ESM.pdf
Expand Down Expand Up @@ -239,20 +241,32 @@ def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--input', help='input file')
parser.add_argument('--input', help='input file with the following columns:\
DOSE: dose of drug in uM,\
GROWTH: percentage of cells left,\
study: name of study to group measurements by,\
source: source of the data,\
improve_sample_id: improve_sample_id,\
Drug: improve_drug_id,\
time: time at which measurement was taken,\
time_unit: unit of time')
parser.add_argument('--output', help='prefix of output file')
parser.add_argument('--beataml', action='store_true', help='Include this if for BeatAML')

parser.add_argument('--debug',action='store_true',default=False)

args = parser.parse_args()
print(args.input)
df_all = pd.read_table(args.input)
if args.debug:
df_all = df_all.iloc[0:1000000]

#drop nas
df_all = df_all.dropna()
##pharmacoGX data is micromolar, we need log transformed molar
df_all.DOSE = np.log10(df_all.DOSE*1000000)
##pharmacoGX data is micromolar, we need log transformed data
df_all.DOSE = np.log10(df_all.DOSE)
##need data to be between 0 and 1, not 0 and 100
df_all.GROWTH=df_all.GROWTH/100.00

print(df_all.head)
fname = args.output or 'combined_single_response_agg'
process_df_part(df_all, fname, beataml=args.beataml)#, start=args.start, count=args.count)

Expand Down

0 comments on commit 4025953

Please sign in to comment.