From 150997596ab38aea7222f84f55962d4f18b9cff1 Mon Sep 17 00:00:00 2001
From: Christina Schmidt <christina.schmidt1@outlook.de>
Date: Tue, 19 Nov 2024 15:48:37 +0100
Subject: [PATCH] Updated omnipathR translate_ids and started on user-specific
 summary structure

---
 R/RefactorPriorKnoweldge.R | 127 +++++++++++++++++++++++++++++++++++--
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/R/RefactorPriorKnoweldge.R b/R/RefactorPriorKnoweldge.R
index 37a0f74..ab050b4 100644
--- a/R/RefactorPriorKnoweldge.R
+++ b/R/RefactorPriorKnoweldge.R
@@ -88,19 +88,60 @@ TranslateID <- function(
   Folder <- MetaProViz:::SavePath(FolderName = "TranslateID", FolderPath = NULL)
 
   ## ------------------ Translate To-From for each pair ------------------- ##
-  list(
-    InputDF = InputData,
-    TranslatedDF = OmnipathR::translate_ids(
+  TranslatedDF <- OmnipathR::translate_ids(
       InputData,
       !!sym(SettingsInfo[['InputID']]) :=  !!sym(From),
       !!!syms(To),#list of symbols, hence three !!!
       ramp = TRUE,
       expand = FALSE,
-      inspect = TRUE,
-      inspect_grp = SettingsInfo[['GroupingVariable']]
+      quantify_ambiguity = TRUE,
+      qualify_ambiguity = TRUE,
+      ambiguity_groups =  SettingsInfo[['GroupingVariable']]
     )
-  )
 
+  ## ------------------ Add information to the results and Create Summary------------------- ##
+  ResList <- list()
+  for(item in  To){
+    #Extract and prepare table for each metabolite ID:
+    ExpandID <-  TranslatedDF %>%
+      dplyr::select(any_of(names(InputData)), dplyr::contains(item))  %>%
+      tidyr::unnest(cols = all_of(dplyr::contains(item)))
+
+    # Add information about instances across or within pathways!
+    if(SettingsInfo[["GroupingVariable"]] %in% colnames(ExpandID)){
+      ExpandID <- ExpandID %>% #many-to-many = within or across pathways? --> add column with this information
+        group_by(MetaboliteID, term) %>%
+        mutate(GroupingVariable = case_when(
+          n_distinct(hmdb) > 1 & MetaboliteID_hmdb_to_ambiguity > 1 & MetaboliteID_hmdb_ambiguity== "one-to-many" & n_distinct(term) >=2 & duplicated(term)==TRUE ~ "one-to-many_Within-and-AcrossGroups",  # Multiple KEGG IDs, multiple terms --> should not happen!
+          n_distinct(hmdb) > 1 & MetaboliteID_hmdb_to_ambiguity > 1 & MetaboliteID_hmdb_ambiguity== "one-to-many" & n_distinct(term) >=2 & duplicated(term)==FALSE ~ "one-to-many_AcrossGroups",  # Multiple KEGG IDs, multiple terms --> should not happen!
+          n_distinct(hmdb) > 1 & MetaboliteID_hmdb_to_ambiguity > 1 & MetaboliteID_hmdb_ambiguity == "one-to-many" & n_distinct(term) <= 1 ~ "one-to-many_WithinGroups",  # Multiple KEGG IDs, same term
+          TRUE ~ NA_character_  #
+        )) %>%
+        ungroup()%>%
+        group_by(hmdb) %>%
+        mutate(GroupingVariable = case_when(
+          n_distinct(MetaboliteID) > 1 & MetaboliteID_hmdb_to_ambiguity > 1 & MetaboliteID_hmdb_ambiguity== "many-to-many" & n_distinct(term) == 1 ~ "many-to-many_WithinGroups",  # Multiple KEGG IDs, same term
+          n_distinct(MetaboliteID) > 1 & MetaboliteID_hmdb_to_ambiguity > 1 & MetaboliteID_hmdb_ambiguity== "many-to-many" & n_distinct(term) > 1 ~ "many-to-many_AcrossGroups",  # Multiple KEGG IDs, multiple terms --> should not happen!
+          TRUE ~  paste(GroupingVariable) #
+        )) %>%
+        ungroup()
+      }
+
+    # TODO: Create a summary of the one-to-many etc. instances and include a descriptive column that verbalizes
+    # the resulting issues --> e.g. pathway inflation/deflation
+
+
+    #return results
+    ResList[[item]] <- ExpandID
+  }
+
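+  # TODO: many-to-many = within or across pathways? --> add column with this information
+  # NOTE(review): ResList (one expanded table per target ID type) is filled in the loop above but is not
+  # part of the list assigned below --> confirm whether it should be added to the returned results.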
+  ## ------------------ Save the results ------------------- ##
+  res <- list(
+    InputDF = InputData,
+    TranslatedDF = TranslatedDF)
 }
 
 
@@ -197,6 +238,9 @@ CheckMatchID <- function(InputData
   ## ------------ Prepare the Input -------- ##
 
 
+
+
+
 }
 
 
@@ -234,5 +278,76 @@ PossibleID <- function(InputData,
 
 #' Deal with pathway overlap in prior knowledge
 #'
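+#' @param InputData Data frame with at least one column holding the target IDs (e.g. metabolites) and one column holding the source they belong to (e.g. term).
+#' @param SettingsInfo Named character vector with the column names of InputData to use, e.g. \code{c(InputID="MetaboliteID", GroupingVariable="term")}.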
+#' @examples
+#' KEGG_Pathways <- MetaProViz::LoadKEGG()
+#' InputData = KEGG_Pathways
+#'
 #'
 #'
+
+
+
+ClusterPK <- function(InputData,
+                      SettingsInfo= c(InputID="MetaboliteID", GroupingVariable="term")
+
+){
+  # Cluster the terms (e.g. pathways) of a prior knowledge table by the overlap of their members
+  # and add the cluster membership as column "Group", e.g. for use before enrichment analysis.
+  # Two terms are linked if their overlap (shared IDs / all IDs of both terms) reaches the
+  # threshold; directly or indirectly linked terms form one cluster.
+  # Other similarity options (e.g. semantic similarity) are not implemented yet, see
+  # https://yulab-smu.top/biomedical-knowledge-mining-book/GOSemSim.html
+  # Returns InputData with the additional integer column "Group" (NA if the term is NA).
+
+  ## ------------------ Check Input ------------------- ##
+  GroupCol <- SettingsInfo[["GroupingVariable"]]
+  IDCol <- SettingsInfo[["InputID"]]
+  if(!all(c(GroupCol, IDCol) %in% colnames(InputData))){
+    stop("The columns named in SettingsInfo (InputID, GroupingVariable) must be present in InputData.", call. = FALSE)
+  }
+
+  ## ------------------ Create output folders and path ------------------- ##
+
+  ## ------------------ Cluster the data ------------------- ##
+  # 1. Create a list of unique IDs for each term (as.character avoids empty, unused factor levels)
+  TermNames <- as.character(InputData[[GroupCol]])
+  term_metabolites <- lapply(split(InputData[[IDCol]], TermNames), unique)
+  Terms <- names(term_metabolites)
+
+  # 2. + 3. Compute pairwise overlaps and merge the clusters of terms that reach the threshold
+  threshold <- 0.7 # Define similarity threshold
+  cluster <- seq_along(Terms) # each term starts in its own cluster
+  for(i in seq_len(max(length(Terms) - 1L, 0L))){ # nothing to compare for fewer than two terms
+    for(j in seq(i + 1L, length(Terms))){
+      term1_ids <- term_metabolites[[i]]
+      term2_ids <- term_metabolites[[j]]
+      overlap <- length(intersect(term1_ids, term2_ids)) / length(union(term1_ids, term2_ids))
+      if(overlap >= threshold && cluster[i] != cluster[j]){
+        # relabel the whole cluster with the larger label --> all linked terms share one label
+        KeepLabel <- min(cluster[i], cluster[j])
+        DropLabel <- max(cluster[i], cluster[j])
+        cluster[cluster == DropLabel] <- KeepLabel
+      }
+    }
+  }
+  cluster <- match(cluster, unique(cluster)) # consecutive cluster numbers 1..k
+
+  # 4. Merge cluster group information back to the original data and return it
+  InputData$Group <- cluster[match(TermNames, Terms)]
+
+  InputData
+}
+
+
+
+
+
+
+
+
+
+
+
+