Skip to content

Commit

Permalink
added some logic to structure search
Browse files Browse the repository at this point in the history
  • Loading branch information
sgosline committed Oct 18, 2024
1 parent e294a92 commit 55e4b39
Showing 1 changed file with 14 additions and 0 deletions.
14 changes: 14 additions & 0 deletions build/broad_sanger/03a-nci60Drugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,25 @@ def main():
os.system('unzip doseresp.zip')
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000,ignore_errors=True)
pubchems = pubchems.filter(pl.col('NSC').is_in(dose_resp['NSC']))
smiles = smiles.filter(pl.col("NSC").is_in(dose_resp['NSC']))
##first retrieve pubchem data
if opts.test:
arr = rand.sample(list(pubchems['CID']),100)
else:
arr = set(pubchems['CID'])

##first filter to see if there are structures/drugs in the data already. I don't think this does much.
if os.path.exists(opts.output):
curdrugs = pl.read_csv(opts.output,separator='\t')
# cs = set(curdrugs['isoSMILES'])
smiles = smiles.filter(pl.col('SMILES').is_not_null())
upper=[a.upper() for a in smiles['SMILES']]
smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
##reduce to SMILES that are not already in the current drugs file
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
arr = set(pubchems['CID'])

print("Querying pubchem from CIDs")
pr.update_dataframe_and_write_tsv(arr,opts.output,'/tmp/ignore_chems.txt',batch_size=400,isname=False,time_limit=10*60*60)
Expand Down

0 comments on commit 55e4b39

Please sign in to comment.