modify predict_kinase_df function

Fix the np.log2 'where' bug: mask non-positive values with Series.where(...) instead of passing where= to np.log2
sky1ove · Sep 26, 2024 · cb8c385 · cb8c385
1 parent b809878
commit cb8c385
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 47 deletions.
diff --git a/katlas/core.py b/katlas/core.py
@@ -451,17 +451,18 @@ def predict_kinase(input_string: str, # site sequence
 
 # %% ../nbs/00_core.ipynb 41
 # PSPA
-param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply} # Johnson et al. Nature official
-param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply}
-param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply}
+param_PSPA_st = {'ref':Data.get_pspa_st_norm().astype('float32'), 'func':multiply} # Johnson et al. Nature official
+param_PSPA_y = {'ref':Data.get_pspa_tyr_norm().astype('float32'), 'func':multiply}
+param_PSPA = {'ref':Data.get_pspa_all_norm().astype('float32'), 'func':multiply}
 
 
 # Kinase-substrate dataset, CDDM
-param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
-param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
+param_CDDM = {'ref':Data.get_cddm().astype('float32'), 'func':sumup}
+param_CDDM_upper = {'ref':Data.get_cddm_upper().astype('float32'), 'func':sumup, 'to_upper':True} # specific for all uppercase
 
-# %% ../nbs/00_core.ipynb 45
+# %% ../nbs/00_core.ipynb 46
 def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
+
     print('input dataframe has a length', df.shape[0])
     print('Preprocessing')
 
@@ -493,12 +494,20 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
     df['keys'] = df['site_seq'].apply(get_dict)
     input_keys_df  = df[['keys']].explode('keys').reset_index()
     input_keys_df.columns = ['input_index', 'key']
+
+
     ref_T = ref.T
 
-    merged_df = input_keys_df.merge(ref_T, left_on='key', right_index=True, how='inner')
+    input_keys_df = input_keys_df.set_index('key')
+
+
+    print('Merging reference')
+    merged_df = input_keys_df.merge(ref_T, left_index=True, right_index=True, how='inner')
+
+    print('Finish merging')
 
     if func == sumup:
-        grouped_df = merged_df.drop(columns=['key']).groupby('input_index').sum()
+        grouped_df = merged_df.groupby('input_index').sum()
         out = grouped_df.reindex(df.index)
 
     elif func==multiply:
@@ -514,7 +523,7 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
             kinase_df = kinase_df.rename(columns={kinase: 'value'})
 
             # Compute log_value
-            kinase_df['log_value'] = np.log2(kinase_df['value'],where=kinase_df['value']>0)
+            kinase_df['log_value'] = np.log2(kinase_df['value'].where(kinase_df['value'] > 0))
 
             # Group by 'input_index' and compute sum and count
             grouped = kinase_df.dropna().groupby('input_index')
@@ -541,7 +550,7 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
     # Return results as a DataFrame
     return out
 
-# %% ../nbs/00_core.ipynb 54
+# %% ../nbs/00_core.ipynb 56
 def get_pct(site,ref,func,pct_ref):
 
     "Replicate the precentile results from The Kinase Library."
@@ -566,7 +575,7 @@ def get_pct(site,ref,func,pct_ref):
     final.columns=['log2(score)','percentile']
     return final
 
-# %% ../nbs/00_core.ipynb 60
+# %% ../nbs/00_core.ipynb 62
 def get_pct_df(score_df, # output from predict_kinase_df 
                pct_ref, # a reference df for percentile calculation
               ):
@@ -591,7 +600,7 @@ def get_pct_df(score_df, # output from predict_kinase_df
 
     return percentiles_df
 
-# %% ../nbs/00_core.ipynb 65
+# %% ../nbs/00_core.ipynb 67
 def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
                     seq_col: str='site_seq', # column name of site sequence
                     id_col: str='gene_site' # column name of site id
@@ -607,7 +616,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla
 
     return unique
 
-# %% ../nbs/00_core.ipynb 68
+# %% ../nbs/00_core.ipynb 70
 def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
                      seq_col: str, # column name of protein sequence
                      position_col: str # column name of position 0
@@ -633,7 +642,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc
 
     return np.array(data)
 
-# %% ../nbs/00_core.ipynb 73
+# %% ../nbs/00_core.ipynb 75
 def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
              aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix 
              aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
@@ -674,7 +683,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains
 
     return paper,full
 
-# %% ../nbs/00_core.ipynb 77
+# %% ../nbs/00_core.ipynb 79
 def query_gene(df,gene):
 
     "Query gene in the phosphoproteomics dataset"
@@ -688,7 +697,7 @@ def query_gene(df,gene):
 
     return df_gene
 
-# %% ../nbs/00_core.ipynb 81
+# %% ../nbs/00_core.ipynb 83
 def get_ttest(df, 
               columns1, # list of column names for group1
               columns2, # list of column names for group2
@@ -758,7 +767,7 @@ def get_signed_logP(r,p_col):
 
     return results
 
-# %% ../nbs/00_core.ipynb 82
+# %% ../nbs/00_core.ipynb 84
 def get_metaP(p_values):
 
     "Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
@@ -770,7 +779,7 @@ def get_metaP(p_values):
 
     return score
 
-# %% ../nbs/00_core.ipynb 85
+# %% ../nbs/00_core.ipynb 87
 def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
              PDHK: bool=False, # whether this kinase belongs to PDHK family 
             ):
@@ -793,7 +802,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s
 
     return df2
 
-# %% ../nbs/00_core.ipynb 87
+# %% ../nbs/00_core.ipynb 89
 def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
                    kinase:str, # a specific kinase
                    normalize: bool=False, # normalize according to the paper; special for PDHK1/4

diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb
@@ -1067,7 +1067,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1091,7 +1091,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -1100,7 +1100,7 @@
        "22.906890595608516"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1118,7 +1118,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1151,7 +1151,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1217,25 +1217,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
     "#| export\n",
     "# PSPA\n",
-    "param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply} # Johnson et al. Nature official\n",
-    "param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply}\n",
-    "param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply}\n",
+    "param_PSPA_st = {'ref':Data.get_pspa_st_norm().astype('float32'), 'func':multiply} # Johnson et al. Nature official\n",
+    "param_PSPA_y = {'ref':Data.get_pspa_tyr_norm().astype('float32'), 'func':multiply}\n",
+    "param_PSPA = {'ref':Data.get_pspa_all_norm().astype('float32'), 'func':multiply}\n",
     "\n",
     "\n",
     "# Kinase-substrate dataset, CDDM\n",
-    "param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}\n",
-    "param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase"
+    "param_CDDM = {'ref':Data.get_cddm().astype('float32'), 'func':sumup}\n",
+    "param_CDDM_upper = {'ref':Data.get_cddm_upper().astype('float32'), 'func':sumup, 'to_upper':True} # specific for all uppercase"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [
     {
@@ -1263,7 +1263,7 @@
        "Length: 93, dtype: float64"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1274,7 +1274,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [
     {
@@ -1322,12 +1322,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
    "metadata": {},
    "outputs": [],
    "source": [
     "#| export\n",
     "def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):\n",
+    "    \n",
     "    print('input dataframe has a length', df.shape[0])\n",
     "    print('Preprocessing')\n",
     "    \n",
@@ -1359,12 +1367,20 @@
     "    df['keys'] = df['site_seq'].apply(get_dict)\n",
     "    input_keys_df  = df[['keys']].explode('keys').reset_index()\n",
     "    input_keys_df.columns = ['input_index', 'key']\n",
+    "    \n",
+    "    \n",
     "    ref_T = ref.T\n",
     "    \n",
-    "    merged_df = input_keys_df.merge(ref_T, left_on='key', right_index=True, how='inner')\n",
+    "    input_keys_df = input_keys_df.set_index('key')\n",
+    "    \n",
+    "    \n",
+    "    print('Merging reference')\n",
+    "    merged_df = input_keys_df.merge(ref_T, left_index=True, right_index=True, how='inner')\n",
+    "\n",
+    "    print('Finish merging')\n",
     "    \n",
     "    if func == sumup:\n",
-    "        grouped_df = merged_df.drop(columns=['key']).groupby('input_index').sum()\n",
+    "        grouped_df = merged_df.groupby('input_index').sum()\n",
     "        out = grouped_df.reindex(df.index)\n",
     "         \n",
     "    elif func==multiply:\n",
@@ -1380,7 +1396,7 @@
     "            kinase_df = kinase_df.rename(columns={kinase: 'value'})\n",
     "\n",
     "            # Compute log_value\n",
-    "            kinase_df['log_value'] = np.log2(kinase_df['value'],where=kinase_df['value']>0)\n",
+    "            kinase_df['log_value'] = np.log2(kinase_df['value'].where(kinase_df['value'] > 0))\n",
     "\n",
     "            # Group by 'input_index' and compute sum and count\n",
     "            grouped = kinase_df.dropna().groupby('input_index')\n",
@@ -1410,48 +1426,74 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Get y site from psp dataset\n",
     "df = Data.get_psp_human_site()\n",
-    "df_y = df[df['site_seq'].str[7].isin(['y'])].head(3_000)"
+    "df_sty = df[df['site_seq'].str[7].isin(list('sty'))]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "input dataframe has a length 3000\n",
+      "input dataframe has a length 20000\n",
       "Preprocessing\n",
-      "Finish preprocessing\n"
+      "Finish preprocessing\n",
+      "Merging reference\n",
+      "Finish merging\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 93/93 [00:00<00:00, 148.51it/s]\n"
+      "100%|██████████| 396/396 [00:17<00:00, 23.17it/s]\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 698 ms, sys: 27.3 ms, total: 726 ms\n",
-      "Wall time: 989 ms\n"
+      "CPU times: user 18.8 s, sys: 76.4 ms, total: 18.9 s\n",
+      "Wall time: 19.1 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "out = predict_kinase_df(df_sty.head(20_000),'site_seq', **param_PSPA)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "input dataframe has a length 20000\n",
+      "Preprocessing\n",
+      "Finish preprocessing\n",
+      "Merging reference\n",
+      "Finish merging\n",
+      "CPU times: user 2.26 s, sys: 16 ms, total: 2.27 s\n",
+      "Wall time: 2.27 s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
-    "out_y = predict_kinase_df(df_y,'site_seq', **param_PSPA_y)"
+    "out_cddm = predict_kinase_df(df_sty.head(20_000),'site_seq', **param_CDDM)"
    ]
   },
   {
@@ -5762,7 +5804,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [],
    "source": [