scgpt in model server #336

Merged · 4 commits · Jan 7, 2025
2 changes: 2 additions & 0 deletions tdc/metadata.py
@@ -956,6 +956,7 @@ def get_task2category():
"evebio_pharmone_v1_summary_result_table": "tab",
"evebio_pharmone_v1_target_doc": "tab",
"evebio_pharmone_v1_target_table": "tab",
"cellxgene_sample_small": "h5ad",
}

name2id = {
@@ -1162,6 +1163,7 @@ def get_task2category():
"evebio_pharmone_v1_summary_result_table": 10741542,
"evebio_pharmone_v1_target_doc": 10741536,
"evebio_pharmone_v1_target_table": 10741537,
"cellxgene_sample_small": 10806522,
}

oracle2type = {
16 changes: 7 additions & 9 deletions tdc/model_server/tdc_hf.py
@@ -14,7 +14,7 @@
'CYP3A4_Veith-AttentiveFP',
]

model_hub = ["Geneformer"]
model_hub = ["Geneformer", "scGPT"]


class tdc_hf_interface:
@@ -56,14 +56,12 @@ def load(self):
if self.model_name not in model_hub:
raise Exception("this model is not in the TDC model hub GH repo.")
elif self.model_name == "Geneformer":
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
# tokenizer = AutoTokenizer.from_pretrained("ctheodoris/Geneformer")
model = AutoModelForMaskedLM.from_pretrained(
"ctheodoris/Geneformer")
# pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer)
# pipe = pipeline("fill-mask", model="ctheodoris/Geneformer")
# return pipe
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("ctheodoris/Geneformer")
return model
elif self.model_name == "scGPT":
from transformers import AutoModel
model = AutoModel.from_pretrained("tdc/scGPT")
return model
raise Exception("Not implemented yet!")

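For reference, the new hub entry can be exercised roughly as follows (a minimal sketch, assuming the tdc/scGPT Hugging Face repo is reachable and transformers is installed; it mirrors the load() branch added above):

from tdc import tdc_hf_interface

# "scGPT" is now listed in model_hub, so load() resolves it through AutoModel
scgpt = tdc_hf_interface("scGPT")
model = scgpt.load()  # a transformers PreTrainedModel ready for inference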
61 changes: 61 additions & 0 deletions tdc/model_server/tokenizers/scgpt.py
@@ -0,0 +1,61 @@
import numpy as np
from typing import List, Tuple


def tokenize_batch(
data: np.ndarray,
gene_ids: np.ndarray,
return_pt: bool = True,
append_cls: bool = True,
include_zero_gene: bool = False,
cls_id: str = "<cls>",
) -> List[Tuple]:
"""
Tokenize a batch of data. Returns a list of tuple (gene_id, count).

Args:
data (array-like): A batch of data, with shape (batch_size, n_features).
n_features equals the number of all genes.
gene_ids (array-like): A batch of gene ids, with shape (n_features,).
return_pt (bool): Whether to return torch tensors of gene_ids and counts,
default to True.
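        append_cls (bool): Whether to prepend a <cls> token with count 0 to each
            tokenized cell, default to True.
        include_zero_gene (bool): Whether to keep genes with zero expression,
            default to False.
        cls_id (str): Token used for the prepended <cls> entry, default to "<cls>".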

Returns:
list: A list of tuple (gene_names, counts) of non zero gene expressions.
"""
if data.shape[1] != len(gene_ids):
raise ValueError(
f"Number of features in data ({data.shape[1]}) does not match "
f"number of gene_ids ({len(gene_ids)}).")

tokenized_data = []
for i in range(len(data)):
row = data[i]
if include_zero_gene:
values = row
genes = gene_ids
else:
idx = np.nonzero(row)[0]
values = row[idx]
genes = gene_ids[idx]
if append_cls:
genes = np.insert(genes, 0, cls_id)
values = np.insert(values, 0, 0)
if return_pt:
import torch
values = torch.from_numpy(values).float().to(torch.int64)
tokenized_data.append((genes, values))
return tokenized_data


class scGPTTokenizer:

def __init__(self):
pass

@classmethod
def tokenize_cell_vectors(cls, data, gene_names):
"""
Tokenizing single-cell gene expression vectors formatted as anndata types
"""
return tokenize_batch(data, gene_names)
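To illustrate the tokenizer's output, here is a small sketch (the toy expression matrix and gene names are made up; the defaults drop zero-expression genes, prepend <cls>, and return torch.int64 counts):

import numpy as np
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer

data = np.array([[0.0, 2.0, 1.0],
                 [3.0, 0.0, 0.0]])                    # (batch_size, n_features)
gene_ids = np.array(["GENE_A", "GENE_B", "GENE_C"])   # one id per feature column

batch = scGPTTokenizer.tokenize_cell_vectors(data, gene_ids)
genes, counts = batch[0]
# genes  -> array(['<cls>', 'GENE_B', 'GENE_C'])  zero-expression gene dropped, <cls> prepended
# counts -> tensor([0, 2, 1])                     int64 counts aligned with genes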
9 changes: 8 additions & 1 deletion tdc/multi_pred/anndata_dataset.py
@@ -5,12 +5,19 @@

class DataLoader(DL):

def __init__(self, name, path, print_stats, dataset_names):
def __init__(self,
name,
path,
print_stats=False,
dataset_names=None,
no_convert=False):
super(DataLoader, self).__init__(name, path, print_stats, dataset_names)
self.adata = self.df # this is in AnnData format
cmap = ConfigMap()
self.cmap = cmap
self.config = cmap.get(name)
if no_convert:
return
if self.config is None:
# default to converting adata to dataframe as is
self.df = AnnDataToDataFrame.anndata_to_df(self.adata)
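The new no_convert flag keeps the raw AnnData object instead of flattening it to a DataFrame, which is what the scGPT test below relies on. A sketch of the intended call (assuming the cellxgene_sample_small files land under ./data):

from tdc.multi_pred.anndata_dataset import DataLoader

# no_convert=True returns before the AnnData -> DataFrame conversion,
# so .adata stays a full AnnData object (.X counts, .var["feature_name"], ...)
loader = DataLoader("cellxgene_sample_small",
                    "./data",
                    dataset_names=["cellxgene_sample_small"],
                    no_convert=True)
adata = loader.adata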
153 changes: 101 additions & 52 deletions tdc/test/test_hf.py
@@ -1,66 +1,115 @@
# -*- coding: utf-8 -*-
from huggingface_hub import create_repo
from huggingface_hub import HfApi, snapshot_download, hf_hub_download
import os

from __future__ import division
from __future__ import print_function
deeppurpose_repo = [
'hERG_Karim-Morgan',
'hERG_Karim-CNN',
'hERG_Karim-AttentiveFP',
'BBB_Martins-AttentiveFP',
'BBB_Martins-Morgan',
'BBB_Martins-CNN',
'CYP3A4_Veith-Morgan',
'CYP3A4_Veith-CNN',
'CYP3A4_Veith-AttentiveFP',
]

import os
import sys
model_hub = ["Geneformer", "scGPT"]

import unittest
import shutil
import pytest

# temporary solution for relative imports in case TDC is not installed
# if TDC is installed, no need to use the following line
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
# TODO: add verification for the generation other than simple integration
class tdc_hf_interface:
'''
Example use cases:
# initialize an interface object with HF repo name
tdc_hf_herg = tdc_hf_interface("hERG_Karim-Morgan")
# upload folder/files to this repo
tdc_hf_herg.upload('./Morgan_herg_karim_optimal')
# load deeppurpose model from this repo
dp_model = tdc_hf_herg.load_deeppurpose('./data')
dp_model.predict(XXX)
'''

def __init__(self, repo_name):
self.repo_id = "tdc/" + repo_name
try:
self.model_name = repo_name.split('-')[1]
except:
self.model_name = repo_name

class TestHF(unittest.TestCase):
def upload(self, folder_path):
create_repo(repo_id=self.repo_id)
api = HfApi()
api.upload_folder(folder_path=folder_path,
path_in_repo="model",
repo_id=self.repo_id,
repo_type="model")

def setUp(self):
print(os.getcwd())
pass
def file_download(self, save_path, filename):
model_ckpt = hf_hub_download(repo_id=self.repo_id,
filename=filename,
cache_dir=save_path)

@pytest.mark.skip(
reason="This test is skipped due to deeppurpose installation dependency"
)
@unittest.skip(reason="DeepPurpose")
def test_hf_load_predict(self):
from tdc.single_pred import Tox
data = Tox(name='herg_karim')
def repo_download(self, save_path):
snapshot_download(repo_id=self.repo_id, cache_dir=save_path)

from tdc import tdc_hf_interface
tdc_hf = tdc_hf_interface("hERG_Karim-CNN")
# load deeppurpose model from this repo
dp_model = tdc_hf.load_deeppurpose('./data')
tdc_hf.predict_deeppurpose(dp_model, ['CC(=O)NC1=CC=C(O)C=C1'])
def load(self):
if self.model_name not in model_hub:
raise Exception("this model is not in the TDC model hub GH repo.")
elif self.model_name == "Geneformer":
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(
"ctheodoris/Geneformer")
return model
elif self.model_name == "scGPT":
from transformers import AutoModel
model = AutoModel.from_pretrained("tdc/scGPT")
return model
raise Exception("Not implemented yet!")

def test_hf_transformer(self):
from tdc import tdc_hf_interface
# from transformers import Pipeline
from transformers import BertForMaskedLM as BertModel
geneformer = tdc_hf_interface("Geneformer")
model = geneformer.load()
# assert isinstance(pipeline, Pipeline)
assert isinstance(model, BertModel), type(model)
def load_deeppurpose(self, save_path):
if self.repo_id[4:] in deeppurpose_repo:
save_path = save_path + '/' + self.repo_id[4:]
if not os.path.exists(save_path):
os.mkdir(save_path)
self.file_download(save_path, "model/model.pt")
self.file_download(save_path, "model/config.pkl")

# def test_hf_load_new_pytorch_standard(self):
# from tdc import tdc_hf_interface
# # from tdc.resource.dataloader import DataLoader
# # data = DataLoader(name="pinnacle_dti")
# tdc_hf = tdc_hf_interface("mli-PINNACLE")
# dp_model = tdc_hf.load()
# assert dp_model is not None
save_path = save_path + '/models--tdc--' + self.repo_id[
4:] + '/blobs/'
file_name1 = save_path + os.listdir(save_path)[0]
file_name2 = save_path + os.listdir(save_path)[1]

def tearDown(self):
try:
print(os.getcwd())
shutil.rmtree(os.path.join(os.getcwd(), "data"))
except:
pass
if os.path.getsize(file_name1) > os.path.getsize(file_name2):
model_file, config_file = file_name1, file_name2
else:
config_file, model_file = file_name1, file_name2

os.rename(model_file, save_path + 'model.pt')
os.rename(config_file, save_path + 'config.pkl')
try:
from DeepPurpose import CompoundPred
except:
raise ValueError(
"Please install DeepPurpose package following https://github.com/kexinhuang12345/DeepPurpose#installation"
)

net = CompoundPred.model_pretrained(path_dir=save_path)
return net
else:
raise ValueError("This repo does not host a DeepPurpose model!")

if __name__ == "__main__":
unittest.main()
def predict_deeppurpose(self, model, drugs):
try:
from DeepPurpose import utils
except:
raise ValueError(
"Please install DeepPurpose package following https://github.com/kexinhuang12345/DeepPurpose#installation"
)
if self.model_name == 'AttentiveFP':
self.model_name = 'DGL_' + self.model_name
X_pred = utils.data_process(X_drug=drugs,
y=[0] * len(drugs),
drug_encoding=self.model_name,
split_method='no_split')
y_pred = model.predict(X_pred)[0]
return y_pred
18 changes: 18 additions & 0 deletions tdc/test/test_model_server.py
@@ -93,6 +93,24 @@ def setUp(self):
print(os.getcwd())
self.resource = cellxgene_census.CensusResource()

def testscGPT(self):
from tdc.multi_pred.anndata_dataset import DataLoader
from tdc import tdc_hf_interface
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer
adata = DataLoader("cellxgene_sample_small",
"./data",
dataset_names=["cellxgene_sample_small"],
no_convert=True).adata
scgpt = tdc_hf_interface("scGPT")
model = scgpt.load() # this line can cause segmentation fault
tokenizer = scGPTTokenizer()
gene_ids = adata.var["feature_name"].to_numpy(
) # Convert to numpy array
tokenized_data = tokenizer.tokenize_cell_vectors(
adata.X.toarray(), gene_ids)
first_embed = model(tokenized_data[0][1]).last_hidden_state
self.assertEqual(first_embed.shape[0], len(tokenized_data[0][0]))

def testGeneformerTokenizer(self):

adata = self.resource.get_anndata(