From f6108df0260364840082ca53f3c03be9cbd79e08 Mon Sep 17 00:00:00 2001
From: Alejandro Velez
Date: Mon, 6 Jan 2025 18:28:16 -0500
Subject: [PATCH 1/4] scGPT in model server

---
 tdc/metadata.py                      |  2 +
 tdc/model_server/tdc_hf.py           | 16 ++++----
 tdc/model_server/tokenizers/scgpt.py | 61 ++++++++++++++++++++++++++++
 tdc/multi_pred/anndata_dataset.py    |  9 +++-
 tdc/test/test_model_server.py        | 18 ++++++++
 5 files changed, 96 insertions(+), 10 deletions(-)
 create mode 100644 tdc/model_server/tokenizers/scgpt.py

diff --git a/tdc/metadata.py b/tdc/metadata.py
index c26641d4..98368367 100644
--- a/tdc/metadata.py
+++ b/tdc/metadata.py
@@ -956,6 +956,7 @@ def get_task2category():
     "evebio_pharmone_v1_summary_result_table": "tab",
     "evebio_pharmone_v1_target_doc": "tab",
     "evebio_pharmone_v1_target_table": "tab",
+    "cellxgene_sample_small": "h5ad",
 }

 name2id = {
@@ -1162,6 +1163,7 @@ def get_task2category():
     "evebio_pharmone_v1_summary_result_table": 10741542,
     "evebio_pharmone_v1_target_doc": 10741536,
     "evebio_pharmone_v1_target_table": 10741537,
+    "cellxgene_sample_small": 10806522,
 }

 oracle2type = {
diff --git a/tdc/model_server/tdc_hf.py b/tdc/model_server/tdc_hf.py
index d7562c1a..57e25473 100644
--- a/tdc/model_server/tdc_hf.py
+++ b/tdc/model_server/tdc_hf.py
@@ -14,7 +14,7 @@
     'CYP3A4_Veith-AttentiveFP',
 ]

-model_hub = ["Geneformer"]
+model_hub = ["Geneformer", "scGPT"]


 class tdc_hf_interface:
@@ -56,14 +56,12 @@ def load(self):
         if self.model_name not in model_hub:
             raise Exception("this model is not in the TDC model hub GH repo.")
         elif self.model_name == "Geneformer":
-            # Load model directly
-            from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
-            # tokenizer = AutoTokenizer.from_pretrained("ctheodoris/Geneformer")
-            model = AutoModelForMaskedLM.from_pretrained(
-                "ctheodoris/Geneformer")
-            # pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer)
-            # pipe = pipeline("fill-mask", model="ctheodoris/Geneformer")
-            # return pipe
+            from transformers import AutoModelForMaskedLM
+            model = AutoModelForMaskedLM.from_pretrained("tdc/Geneformer")
+            return model
+        elif self.model_name == "scGPT":
+            from transformers import AutoModel
+            model = AutoModel.from_pretrained("tdc/scGPT")
             return model
         raise Exception("Not implemented yet!")
diff --git a/tdc/model_server/tokenizers/scgpt.py b/tdc/model_server/tokenizers/scgpt.py
new file mode 100644
index 00000000..df472e93
--- /dev/null
+++ b/tdc/model_server/tokenizers/scgpt.py
@@ -0,0 +1,61 @@
+import numpy as np
+from typing import List, Tuple
+
+
+def tokenize_batch(
+    data: np.ndarray,
+    gene_ids: np.ndarray,
+    return_pt: bool = True,
+    append_cls: bool = True,
+    include_zero_gene: bool = False,
+    cls_id: str = "<cls>",
+) -> List[Tuple]:
+    """
+    Tokenize a batch of data. Returns a list of (gene_ids, counts) tuples.
+
+    Args:
+        data (array-like): A batch of data, with shape (batch_size, n_features).
+            n_features equals the number of all genes.
+        gene_ids (array-like): A batch of gene ids, with shape (n_features,).
+        return_pt (bool): Whether to return the counts as a torch tensor,
+            defaults to True.
+        append_cls (bool): Whether to prepend a CLS entry to each cell,
+            defaults to True.
+        include_zero_gene (bool): Whether to keep genes with zero expression,
+            defaults to False.
+        cls_id (str): Gene id used for the prepended CLS entry.
+
+    Returns:
+        list: A list of (gene_ids, counts) tuples of non-zero gene expressions.
+    """
+    if data.shape[1] != len(gene_ids):
+        raise ValueError(
+            f"Number of features in data ({data.shape[1]}) does not match "
+            f"number of gene_ids ({len(gene_ids)}).")
+
+    tokenized_data = []
+    for i in range(len(data)):
+        row = data[i]
+        if include_zero_gene:
+            values = row
+            genes = gene_ids
+        else:
+            idx = np.nonzero(row)[0]
+            values = row[idx]
+            genes = gene_ids[idx]
+        if append_cls:
+            genes = np.insert(genes, 0, cls_id)
+            values = np.insert(values, 0, 0)
+        if return_pt:
+            import torch
+            values = torch.from_numpy(values).float().to(torch.int64)
+        tokenized_data.append((genes, values))
+    return tokenized_data
+
+
+class scGPTTokenizer:
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def tokenize_cell_vectors(cls, data, gene_names):
+        """
+        Tokenize single-cell gene expression vectors extracted from an AnnData object.
+        """
+        return tokenize_batch(data, gene_names)
diff --git a/tdc/multi_pred/anndata_dataset.py b/tdc/multi_pred/anndata_dataset.py
index 462318eb..65788ad2 100644
--- a/tdc/multi_pred/anndata_dataset.py
+++ b/tdc/multi_pred/anndata_dataset.py
@@ -5,12 +5,19 @@
 class DataLoader(DL):

-    def __init__(self, name, path, print_stats, dataset_names):
+    def __init__(self,
+                 name,
+                 path,
+                 print_stats=False,
+                 dataset_names=None,
+                 no_convert=False):
         super(DataLoader, self).__init__(name, path, print_stats,
                                          dataset_names)
         self.adata = self.df  # this is in AnnData format
         cmap = ConfigMap()
         self.cmap = cmap
         self.config = cmap.get(name)
+        if no_convert:
+            return
         if self.config is None:  # default to converting adata to dataframe as is
             self.df = AnnDataToDataFrame.anndata_to_df(self.adata)
diff --git a/tdc/test/test_model_server.py b/tdc/test/test_model_server.py
index 6a51ab5c..adc0b06f 100644
--- a/tdc/test/test_model_server.py
+++ b/tdc/test/test_model_server.py
@@ -93,6 +93,24 @@ def setUp(self):
         print(os.getcwd())
         self.resource = cellxgene_census.CensusResource()

+    def testscGPT(self):
+        from tdc.multi_pred.anndata_dataset import DataLoader
+        from tdc import tdc_hf_interface
+        from tdc.model_server.tokenizers.scgpt import scGPTTokenizer
+        adata = DataLoader("cellxgene_sample_small",
+                           "./data",
+                           dataset_names=["cellxgene_sample_small"],
+                           no_convert=True).adata
+        scgpt = tdc_hf_interface("scGPT")
+        model = scgpt.load()  # this line can cause a segmentation fault
+        tokenizer = scGPTTokenizer()
+        gene_ids = adata.var["feature_name"].to_numpy()  # convert to a numpy array
+        tokenized_data = tokenizer.tokenize_cell_vectors(
+            adata.X.toarray(), gene_ids)
+        first_embed = model(tokenized_data[0][1]).last_hidden_state
+        self.assertEqual(first_embed.shape[0], len(gene_ids))
+
     def testGeneformerTokenizer(self):

         adata = self.resource.get_anndata(
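Usage note for the tokenizer added above: a minimal sketch of how it is meant to be called on a dense expression matrix. The toy matrix and gene names below are made up for illustration; only scGPTTokenizer.tokenize_cell_vectors comes from this patch, and torch must be installed because return_pt defaults to True.

import numpy as np
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer

# Toy expression matrix: 2 cells x 4 genes (dense, non-negative counts).
expression = np.array([
    [0.0, 3.0, 0.0, 1.0],
    [2.0, 0.0, 0.0, 0.0],
])
gene_names = np.array(["GENE_A", "GENE_B", "GENE_C", "GENE_D"])

tokenizer = scGPTTokenizer()
tokenized = tokenizer.tokenize_cell_vectors(expression, gene_names)

# Each entry is a (gene_ids, counts) pair for one cell: zero-expression genes
# are dropped and a CLS entry is prepended, so the first cell yields 3 tokens.
genes, counts = tokenized[0]
print(genes)   # ['<cls>' 'GENE_B' 'GENE_D']
print(counts)  # tensor([0, 3, 1])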
From e6268fbcd8e360812836b9b18fbb43e6813015ff Mon Sep 17 00:00:00 2001
From: Alejandro Velez
Date: Mon, 6 Jan 2025 19:37:35 -0500
Subject: [PATCH 2/4] mend

---
 tdc/test/test_model_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tdc/test/test_model_server.py b/tdc/test/test_model_server.py
index adc0b06f..3dcf1330 100644
--- a/tdc/test/test_model_server.py
+++ b/tdc/test/test_model_server.py
@@ -109,7 +109,7 @@ def testscGPT(self):
         tokenized_data = tokenizer.tokenize_cell_vectors(
             adata.X.toarray(), gene_ids)
         first_embed = model(tokenized_data[0][1]).last_hidden_state
-        self.assertEqual(first_embed.shape[0], len(gene_ids))
+        self.assertEqual(first_embed.shape[0], len(tokenized_data[0][0]))

     def testGeneformerTokenizer(self):
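The assertion change above follows from how tokenize_batch builds each cell: genes with zero expression are dropped and a CLS entry is prepended, so the sequence length fed to the model is the number of non-zero genes plus one for that cell, not the total number of genes in adata.var. A small sanity check with illustrative numbers:

import numpy as np
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer

row = np.array([[0.0, 5.0, 2.0, 0.0, 1.0]])  # 5 genes, 3 with non-zero counts
gene_ids = np.array(["GENE_A", "GENE_B", "GENE_C", "GENE_D", "GENE_E"])

genes, counts = scGPTTokenizer.tokenize_cell_vectors(row, gene_ids)[0]
assert len(genes) == 3 + 1           # non-zero genes plus the CLS entry
assert len(genes) != len(gene_ids)   # hence len(tokenized_data[0][0]) in the test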
From a684dd384ee144a683b13ac2beca4c266d30909d Mon Sep 17 00:00:00 2001
From: Alex Velez-Arce
Date: Mon, 6 Jan 2025 20:35:58 -0500
Subject: [PATCH 3/4] Update tdc_hf.py

---
 tdc/model_server/tdc_hf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tdc/model_server/tdc_hf.py b/tdc/model_server/tdc_hf.py
index 57e25473..5cf76051 100644
--- a/tdc/model_server/tdc_hf.py
+++ b/tdc/model_server/tdc_hf.py
@@ -57,7 +57,7 @@ def load(self):
             raise Exception("this model is not in the TDC model hub GH repo.")
         elif self.model_name == "Geneformer":
             from transformers import AutoModelForMaskedLM
-            model = AutoModelForMaskedLM.from_pretrained("tdc/Geneformer")
+            model = AutoModelForMaskedLM.from_pretrained("ctheodoris/Geneformer")
             return model
         elif self.model_name == "scGPT":
             from transformers import AutoModel
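After this change, loading Geneformer through the interface pulls the upstream checkpoint directly. A quick sketch (network access and transformers are required; the isinstance check mirrors the existing test_hf_transformer test):

from tdc import tdc_hf_interface
from transformers import BertForMaskedLM

geneformer = tdc_hf_interface("Geneformer")
model = geneformer.load()  # downloads ctheodoris/Geneformer on first use
assert isinstance(model, BertForMaskedLM), type(model)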
From 22bbc07ddcd25af4e6a26c45783d7f7b147972ad Mon Sep 17 00:00:00 2001
From: Alejandro Velez
Date: Mon, 6 Jan 2025 21:01:44 -0500
Subject: [PATCH 4/4] mend

---
 tdc/test/test_hf.py | 153 +++++++++++++++++++++++++++++---------------
 1 file changed, 101 insertions(+), 52 deletions(-)

diff --git a/tdc/test/test_hf.py b/tdc/test/test_hf.py
index 74a5cbf6..223012d1 100644
--- a/tdc/test/test_hf.py
+++ b/tdc/test/test_hf.py
@@ -1,66 +1,115 @@
-# -*- coding: utf-8 -*-
+from huggingface_hub import create_repo
+from huggingface_hub import HfApi, snapshot_download, hf_hub_download
+import os

-from __future__ import division
-from __future__ import print_function
+deeppurpose_repo = [
+    'hERG_Karim-Morgan',
+    'hERG_Karim-CNN',
+    'hERG_Karim-AttentiveFP',
+    'BBB_Martins-AttentiveFP',
+    'BBB_Martins-Morgan',
+    'BBB_Martins-CNN',
+    'CYP3A4_Veith-Morgan',
+    'CYP3A4_Veith-CNN',
+    'CYP3A4_Veith-AttentiveFP',
+]

-import os
-import sys
+model_hub = ["Geneformer", "scGPT"]

-import unittest
-import shutil
-import pytest
-# temporary solution for relative imports in case TDC is not installed
-# if TDC is installed, no need to use the following line
-sys.path.append(
-    os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
-# TODO: add verification for the generation other than simple integration
+class tdc_hf_interface:
+    '''
+    Example use cases:
+    # initialize an interface object with HF repo name
+    tdc_hf_herg = tdc_hf_interface("hERG_Karim-Morgan")
+    # upload folder/files to this repo
+    tdc_hf_herg.upload('./Morgan_herg_karim_optimal')
+    # load deeppurpose model from this repo
+    dp_model = tdc_hf_herg.load_deeppurpose('./data')
+    dp_model.predict(XXX)
+    '''

-class TestHF(unittest.TestCase):
+    def __init__(self, repo_name):
+        self.repo_id = "tdc/" + repo_name
+        try:
+            self.model_name = repo_name.split('-')[1]
+        except:
+            self.model_name = repo_name

-    def setUp(self):
-        print(os.getcwd())
-        pass
+    def upload(self, folder_path):
+        create_repo(repo_id=self.repo_id)
+        api = HfApi()
+        api.upload_folder(folder_path=folder_path,
+                          path_in_repo="model",
+                          repo_id=self.repo_id,
+                          repo_type="model")

-    @pytest.mark.skip(
-        reason="This test is skipped due to deeppurpose installation dependency"
-    )
-    @unittest.skip(reason="DeepPurpose")
-    def test_hf_load_predict(self):
-        from tdc.single_pred import Tox
-        data = Tox(name='herg_karim')
+    def file_download(self, save_path, filename):
+        model_ckpt = hf_hub_download(repo_id=self.repo_id,
+                                     filename=filename,
+                                     cache_dir=save_path)

-        from tdc import tdc_hf_interface
-        tdc_hf = tdc_hf_interface("hERG_Karim-CNN")
-        # load deeppurpose model from this repo
-        dp_model = tdc_hf.load_deeppurpose('./data')
-        tdc_hf.predict_deeppurpose(dp_model, ['CC(=O)NC1=CC=C(O)C=C1'])
+    def repo_download(self, save_path):
+        snapshot_download(repo_id=self.repo_id, cache_dir=save_path)

-    def test_hf_transformer(self):
-        from tdc import tdc_hf_interface
-        # from transformers import Pipeline
-        from transformers import BertForMaskedLM as BertModel
-        geneformer = tdc_hf_interface("Geneformer")
-        model = geneformer.load()
-        # assert isinstance(pipeline, Pipeline)
-        assert isinstance(model, BertModel), type(model)
+    def load(self):
+        if self.model_name not in model_hub:
+            raise Exception("this model is not in the TDC model hub GH repo.")
+        elif self.model_name == "Geneformer":
+            from transformers import AutoModelForMaskedLM
+            model = AutoModelForMaskedLM.from_pretrained(
+                "ctheodoris/Geneformer")
+            return model
+        elif self.model_name == "scGPT":
+            from transformers import AutoModel
+            model = AutoModel.from_pretrained("tdc/scGPT")
+            return model
+        raise Exception("Not implemented yet!")

-    # def test_hf_load_new_pytorch_standard(self):
-    #     from tdc import tdc_hf_interface
-    #     # from tdc.resource.dataloader import DataLoader
-    #     # data = DataLoader(name="pinnacle_dti")
-    #     tdc_hf = tdc_hf_interface("mli-PINNACLE")
-    #     dp_model = tdc_hf.load()
-    #     assert dp_model is not None
+    def load_deeppurpose(self, save_path):
+        if self.repo_id[4:] in deeppurpose_repo:
+            save_path = save_path + '/' + self.repo_id[4:]
+            if not os.path.exists(save_path):
+                os.mkdir(save_path)
+            self.file_download(save_path, "model/model.pt")
+            self.file_download(save_path, "model/config.pkl")

-    def tearDown(self):
-        try:
-            print(os.getcwd())
-            shutil.rmtree(os.path.join(os.getcwd(), "data"))
-        except:
-            pass
+            save_path = save_path + '/models--tdc--' + self.repo_id[4:] + '/blobs/'
+            file_name1 = save_path + os.listdir(save_path)[0]
+            file_name2 = save_path + os.listdir(save_path)[1]

-if __name__ == "__main__":
-    unittest.main()
+            if os.path.getsize(file_name1) > os.path.getsize(file_name2):
+                model_file, config_file = file_name1, file_name2
+            else:
+                config_file, model_file = file_name1, file_name2
+
+            os.rename(model_file, save_path + 'model.pt')
+            os.rename(config_file, save_path + 'config.pkl')
+            try:
+                from DeepPurpose import CompoundPred
+            except:
+                raise ValueError(
+                    "Please install DeepPurpose package following https://github.com/kexinhuang12345/DeepPurpose#installation"
+                )
+            net = CompoundPred.model_pretrained(path_dir=save_path)
+            return net
+        else:
+            raise ValueError("This repo does not host a DeepPurpose model!")
+
+    def predict_deeppurpose(self, model, drugs):
+        try:
+            from DeepPurpose import utils
+        except:
+            raise ValueError(
+                "Please install DeepPurpose package following https://github.com/kexinhuang12345/DeepPurpose#installation"
+            )
+        if self.model_name == 'AttentiveFP':
+            self.model_name = 'DGL_' + self.model_name
+        X_pred = utils.data_process(X_drug=drugs,
+                                    y=[0] * len(drugs),
+                                    drug_encoding=self.model_name,
+                                    split_method='no_split')
+        y_pred = model.predict(X_pred)[0]
+        return y_pred
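The DeepPurpose path of the interface shown in this last patch is exercised the same way the old test_hf_load_predict test did. A minimal sketch (the DeepPurpose package must be installed, and the SMILES string below is just an example input):

from tdc import tdc_hf_interface

tdc_hf = tdc_hf_interface("hERG_Karim-CNN")
dp_model = tdc_hf.load_deeppurpose('./data')  # fetches model.pt and config.pkl from the HF repo
scores = tdc_hf.predict_deeppurpose(dp_model, ['CC(=O)NC1=CC=C(O)C=C1'])
print(scores)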