Skip to content

Commit

Permalink
modify predict_kinase_df function
Browse files Browse the repository at this point in the history
Fix a bug in how 'where' is applied when computing np.log2 of the kinase values
  • Loading branch information
sky1ove committed Sep 26, 2024
1 parent b809878 commit cb8c385
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 47 deletions.
47 changes: 28 additions & 19 deletions katlas/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,17 +451,18 @@ def predict_kinase(input_string: str, # site sequence

# %% ../nbs/00_core.ipynb 41
# PSPA
param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply} # Johnson et al. Nature official
param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply}
param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply}
param_PSPA_st = {'ref':Data.get_pspa_st_norm().astype('float32'), 'func':multiply} # Johnson et al. Nature official
param_PSPA_y = {'ref':Data.get_pspa_tyr_norm().astype('float32'), 'func':multiply}
param_PSPA = {'ref':Data.get_pspa_all_norm().astype('float32'), 'func':multiply}


# Kinase-substrate dataset, CDDM
param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
param_CDDM = {'ref':Data.get_cddm().astype('float32'), 'func':sumup}
param_CDDM_upper = {'ref':Data.get_cddm_upper().astype('float32'), 'func':sumup, 'to_upper':True} # specific for all uppercase

# %% ../nbs/00_core.ipynb 45
# %% ../nbs/00_core.ipynb 46
def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):

print('input dataframe has a length', df.shape[0])
print('Preprocessing')

Expand Down Expand Up @@ -493,12 +494,20 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
df['keys'] = df['site_seq'].apply(get_dict)
input_keys_df = df[['keys']].explode('keys').reset_index()
input_keys_df.columns = ['input_index', 'key']


ref_T = ref.T

merged_df = input_keys_df.merge(ref_T, left_on='key', right_index=True, how='inner')
input_keys_df = input_keys_df.set_index('key')


print('Merging reference')
merged_df = input_keys_df.merge(ref_T, left_index=True, right_index=True, how='inner')

print('Finish merging')

if func == sumup:
grouped_df = merged_df.drop(columns=['key']).groupby('input_index').sum()
grouped_df = merged_df.groupby('input_index').sum()
out = grouped_df.reindex(df.index)

elif func==multiply:
Expand All @@ -514,7 +523,7 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
kinase_df = kinase_df.rename(columns={kinase: 'value'})

# Compute log_value
kinase_df['log_value'] = np.log2(kinase_df['value'],where=kinase_df['value']>0)
kinase_df['log_value'] = np.log2(kinase_df['value'].where(kinase_df['value'] > 0))

# Group by 'input_index' and compute sum and count
grouped = kinase_df.dropna().groupby('input_index')
Expand All @@ -541,7 +550,7 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
# Return results as a DataFrame
return out

# %% ../nbs/00_core.ipynb 54
# %% ../nbs/00_core.ipynb 56
def get_pct(site,ref,func,pct_ref):

"Replicate the percentile results from The Kinase Library."
Expand All @@ -566,7 +575,7 @@ def get_pct(site,ref,func,pct_ref):
final.columns=['log2(score)','percentile']
return final

# %% ../nbs/00_core.ipynb 60
# %% ../nbs/00_core.ipynb 62
def get_pct_df(score_df, # output from predict_kinase_df
pct_ref, # a reference df for percentile calculation
):
Expand All @@ -591,7 +600,7 @@ def get_pct_df(score_df, # output from predict_kinase_df

return percentiles_df

# %% ../nbs/00_core.ipynb 65
# %% ../nbs/00_core.ipynb 67
def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
seq_col: str='site_seq', # column name of site sequence
id_col: str='gene_site' # column name of site id
Expand All @@ -607,7 +616,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla

return unique

# %% ../nbs/00_core.ipynb 68
# %% ../nbs/00_core.ipynb 70
def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
seq_col: str, # column name of protein sequence
position_col: str # column name of position 0
Expand All @@ -633,7 +642,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc

return np.array(data)

# %% ../nbs/00_core.ipynb 73
# %% ../nbs/00_core.ipynb 75
def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
Expand Down Expand Up @@ -674,7 +683,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains

return paper,full

# %% ../nbs/00_core.ipynb 77
# %% ../nbs/00_core.ipynb 79
def query_gene(df,gene):

"Query gene in the phosphoproteomics dataset"
Expand All @@ -688,7 +697,7 @@ def query_gene(df,gene):

return df_gene

# %% ../nbs/00_core.ipynb 81
# %% ../nbs/00_core.ipynb 83
def get_ttest(df,
columns1, # list of column names for group1
columns2, # list of column names for group2
Expand Down Expand Up @@ -758,7 +767,7 @@ def get_signed_logP(r,p_col):

return results

# %% ../nbs/00_core.ipynb 82
# %% ../nbs/00_core.ipynb 84
def get_metaP(p_values):

"Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
Expand All @@ -770,7 +779,7 @@ def get_metaP(p_values):

return score

# %% ../nbs/00_core.ipynb 85
# %% ../nbs/00_core.ipynb 87
def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
PDHK: bool=False, # whether this kinase belongs to PDHK family
):
Expand All @@ -793,7 +802,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s

return df2

# %% ../nbs/00_core.ipynb 87
# %% ../nbs/00_core.ipynb 89
def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
kinase:str, # a specific kinase
normalize: bool=False, # normalize according to the paper; special for PDHK1/4
Expand Down
98 changes: 70 additions & 28 deletions nbs/00_core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1067,7 +1067,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -1091,7 +1091,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"outputs": [
{
Expand All @@ -1100,7 +1100,7 @@
"22.906890595608516"
]
},
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1118,7 +1118,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -1151,7 +1151,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -1217,25 +1217,25 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"# PSPA\n",
"param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply} # Johnson et al. Nature official\n",
"param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply}\n",
"param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply}\n",
"param_PSPA_st = {'ref':Data.get_pspa_st_norm().astype('float32'), 'func':multiply} # Johnson et al. Nature official\n",
"param_PSPA_y = {'ref':Data.get_pspa_tyr_norm().astype('float32'), 'func':multiply}\n",
"param_PSPA = {'ref':Data.get_pspa_all_norm().astype('float32'), 'func':multiply}\n",
"\n",
"\n",
"# Kinase-substrate dataset, CDDM\n",
"param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}\n",
"param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase"
"param_CDDM = {'ref':Data.get_cddm().astype('float32'), 'func':sumup}\n",
"param_CDDM_upper = {'ref':Data.get_cddm_upper().astype('float32'), 'func':sumup, 'to_upper':True} # specific for all uppercase"
]
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 36,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1263,7 +1263,7 @@
"Length: 93, dtype: float64"
]
},
"execution_count": 26,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1274,7 +1274,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 37,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1322,12 +1322,20 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):\n",
" \n",
" print('input dataframe has a length', df.shape[0])\n",
" print('Preprocessing')\n",
" \n",
Expand Down Expand Up @@ -1359,12 +1367,20 @@
" df['keys'] = df['site_seq'].apply(get_dict)\n",
" input_keys_df = df[['keys']].explode('keys').reset_index()\n",
" input_keys_df.columns = ['input_index', 'key']\n",
" \n",
" \n",
" ref_T = ref.T\n",
" \n",
" merged_df = input_keys_df.merge(ref_T, left_on='key', right_index=True, how='inner')\n",
" input_keys_df = input_keys_df.set_index('key')\n",
" \n",
" \n",
" print('Merging reference')\n",
" merged_df = input_keys_df.merge(ref_T, left_index=True, right_index=True, how='inner')\n",
"\n",
" print('Finish merging')\n",
" \n",
" if func == sumup:\n",
" grouped_df = merged_df.drop(columns=['key']).groupby('input_index').sum()\n",
" grouped_df = merged_df.groupby('input_index').sum()\n",
" out = grouped_df.reindex(df.index)\n",
" \n",
" elif func==multiply:\n",
Expand All @@ -1380,7 +1396,7 @@
" kinase_df = kinase_df.rename(columns={kinase: 'value'})\n",
"\n",
" # Compute log_value\n",
" kinase_df['log_value'] = np.log2(kinase_df['value'],where=kinase_df['value']>0)\n",
" kinase_df['log_value'] = np.log2(kinase_df['value'].where(kinase_df['value'] > 0))\n",
"\n",
" # Group by 'input_index' and compute sum and count\n",
" grouped = kinase_df.dropna().groupby('input_index')\n",
Expand Down Expand Up @@ -1410,48 +1426,74 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Get y site from psp dataset\n",
"df = Data.get_psp_human_site()\n",
"df_y = df[df['site_seq'].str[7].isin(['y'])].head(3_000)"
"df_sty = df[df['site_seq'].str[7].isin(list('sty'))]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input dataframe has a length 3000\n",
"input dataframe has a length 20000\n",
"Preprocessing\n",
"Finish preprocessing\n"
"Finish preprocessing\n",
"Merging reference\n",
"Finish merging\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 93/93 [00:00<00:00, 148.51it/s]\n"
"100%|██████████| 396/396 [00:17<00:00, 23.17it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 698 ms, sys: 27.3 ms, total: 726 ms\n",
"Wall time: 989 ms\n"
"CPU times: user 18.8 s, sys: 76.4 ms, total: 18.9 s\n",
"Wall time: 19.1 s\n"
]
}
],
"source": [
"%%time\n",
"out = predict_kinase_df(df_sty.head(20_000),'site_seq', **param_PSPA)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input dataframe has a length 20000\n",
"Preprocessing\n",
"Finish preprocessing\n",
"Merging reference\n",
"Finish merging\n",
"CPU times: user 2.26 s, sys: 16 ms, total: 2.27 s\n",
"Wall time: 2.27 s\n"
]
}
],
"source": [
"%%time\n",
"out_y = predict_kinase_df(df_y,'site_seq', **param_PSPA_y)"
"out_cddm = predict_kinase_df(df_sty.head(20_000),'site_seq', **param_CDDM)"
]
},
{
Expand Down Expand Up @@ -5762,7 +5804,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
Expand Down

0 comments on commit cb8c385

Please sign in to comment.