diff --git a/R/GetPriorKnoweldge.R b/R/GetPriorKnoweldge.R index ad67522..802c25b 100644 --- a/R/GetPriorKnoweldge.R +++ b/R/GetPriorKnoweldge.R @@ -1,6 +1,6 @@ ## --------------------------- ## -## Script name: Make_GeneMetabSet +## Script name: GetPriorKnowledge ## ## Purpose of script: Create gene-metabolite sets for pathway enrichment analysis. ## @@ -19,320 +19,6 @@ ## --------------------------- -########################################################################################## -### ### ### Translate IDs to/from KEGG, PubChem, Chebi, HMDB ### ### ### -########################################################################################## -#' -#' @param Input_DataFrame Dataframe with two columns for source (=term) and Target (=gene), e.g. Hallmarks. -#' @param SettingsInfo \emph{Optional: } Column name of Target in Input_GeneSet. \strong{Default = list(IdColumn="MetaboliteID", FromFormat=c("kegg"), ToFormat=c("pubchem","chebi","hmdb"), Method="GetAll", GroupingVariable="term")} -#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt". \strong{Default = "csv"} -#' @param FolderPath {Optional:} String which is added to the resulting folder name \strong{Default = NULL} -#' -#' @title Translate IDs -#' @description Translate IDs to and from KEGG, PubChem, Chebi. -#' @return 3 data frames: 1) Original data and the new column of translated ids. 2) Mapping summary from Original ID to Translated. 3) Mapping summary from Translated to Original. -#' @export -#' -TranslateID <- function(Input_DataFrame, - SettingsInfo = list(IdColumn="MetaboliteID", FromFormat=c("kegg"), ToFormat=c("pubchem","chebi","hmdb"), Method="GetAll", GroupingVariable="term") - ){ - - Output_DataFrame <- Input_DataFrame # making a copy of Input Dataframe so we can merge the results later - idcolname <- SettingsInfo[['IdColumn']] - from <- SettingsInfo[['FromFormat']] - to <- SettingsInfo[['ToFormat']] - method <- SettingsInfo[['Method']] - groupvar <- SettingsInfo[['GroupingVariable']] - DF_List <- list() - - # Check that all Name types added by the user are present - userlist <- c(from, to) - allowedids <- c("kegg","pubchem","chebi","hmdb") - if (!all(userlist %in% allowedids)) { - stop(paste("We currently 'only' support ID translation between: kegg, pubchem, chebi, hmdb ;) Please also use lowercase. You entered:", - paste(userlist[!userlist %in% allowedids], collapse = ", "))) - } - - print(paste('Using method ', method)) - - for (to_singular in to) { - # Rename and use OmnipathR to translate the ids. Note that the returned object (df_translated) will most likely have multiple mappings. - if (!from %in% names(Input_DataFrame)) { - Input_DataFrame <- Input_DataFrame %>% - dplyr::mutate(!!from := .[[idcolname]]) # This is used to keep the original column of the idcolname, otherwise we could use dplyr::rename(!!from := idcolname) - message("Created '", from, "' as a colname.") - } else { - message("Column '", from, "' already exists in the dataframe.") - } - # perform the basic translation - note, the results will be in long format likely with double ups - df_translated <- Input_DataFrame %>% - OmnipathR::translate_ids(!!from, !!to_singular, ramp = TRUE) - # now collapse the desired translated column rows into a single row for each group (e.g. by path term and metaboliteID), so that it looks like: "16680, 57856, 181457", for example - # we will also make the prefix '_collapsed' to distinguish it from other columns that might not be collapsed - df_translated <- df_translated %>% group_by(!!sym(from), !!sym(groupvar)) %>% summarize(!!paste0(to_singular, '_collapsed') := paste(!!sym(to_singular), collapse = ', ')) - # previously we selected just the collapsed column, and ungroup it so that we don't keep the other grouping columns when we select, now we want to keep them so we can merge effectively - #collapsed_col <- df_translated %>% ungroup() %>% select(ends_with('_collapsed')) - collapsed_col <- df_translated %>% ungroup() - - if (method == 'GetAll') { - print(paste('Converting from ', from, " to ", to_singular)) - new_col <- collapsed_col - - } else if (method == 'GetFirst') { - print(paste('Converting from ', from, " to ", to_singular)) - print(glue::glue("WARNING: Only the first translated ID from <{to_singular}> will be returned for each unique ID from <{from}>.")) - - firstItem_cols <- collapsed_col %>% - mutate(!!paste0(to_singular, '_first') := ifelse( - !is.na(!!sym(paste0(to_singular, '_collapsed'))), - sapply(strsplit(!!sym(paste0(to_singular, '_collapsed')), ", "), `[`, 1), # Split and get the first element - NA)) - firstItem_cols <- firstItem_cols %>% - select(-!!paste0(to_singular, '_collapsed')) #note this removes the 'collapsed' columns. If we wanted to keep these we could comment this part out... - new_col <- firstItem_cols - - } else { - print('You may want to check the Method you are trying to use is implemented.') - } - - # Now update the results from each type of ID translation into one main table, based off the users input table - #note if we change the grouping variable this might go kaputt - Output_DataFrame <- merge(x=Output_DataFrame, y=new_col, - by.x=c(idcolname, groupvar), - by.y=c(from,groupvar), - all.x=TRUE) - DF_List$Translated_DataFrame <- Output_DataFrame - - # Now Inspect the IDs to identify potential problems with mapping - let's do this even if the user chose the GetFirst method, because it is very important to show the limitations... - mapping_inspection = InspectID(new_col, SettingsInfo = list(OriginalIDcolumn = from, TranslatedCollapsedIDcolumn = paste0(to_singular, '_collapsed'), Pathway = groupvar)) - # Now rename the columns so the user knows what exactly the Original and Translated IDs are for each DF... - mapping_inspection$Orig2Trans <- mapping_inspection$Orig2Trans %>% - rename(!!paste0("Original_ID_", from) := "Original_ID", !!paste0("Translated_IDs_", to_singular) := "Translated_IDs") - mapping_inspection$Trans2Orig <- mapping_inspection$Trans2Orig %>% - rename(!!paste0("Original_IDs_", from) := "Original_IDs", !!paste0("Translated_ID_", to_singular) := "Translated_ID") - - DF_List <- c(DF_List, setNames(list(mapping_inspection$Orig2Trans), paste0("Mapping_Orig2Trans_", from, '_to_', to_singular))) - DF_List <- c(DF_List, setNames(list(mapping_inspection$Trans2Orig), paste0("Mapping_Trans2Orig_", to_singular, '_to_', from))) - - - } - # Create a final summary table of the mapping issues - # Create a function to get the counts - get_counts <- function(table) { - table %>% - count(Relationship) %>% - rename(n = n) # Rename to n for consistency - } - # Extract counts for each table and add the table name - summary_table <- map_dfr( - seq_along(DF_List)[-1], - function(i) { - table <- DF_List[[i]] - table_name <- names(DF_List)[i] - counts <- get_counts(table) - counts <- counts %>% mutate(Table = table_name) # Add table name as a new column - return(counts) - } - ) - # Pivot to a wider format - final_table_summary <- summary_table %>% - pivot_wider(names_from = Relationship, values_from = n, values_fill = 0) - # Reorder safely - desired_order <- c("Table", "One-to-None","One-to-One","One-to-Many") - existing_columns <- names(final_table_summary) # Get the existing columns in final_table - safe_order <- intersect(desired_order, existing_columns) # Create a safe order by intersecting desired_order with existing_columns - # Check if "Table" is in the safe order and ensure it's at the beginning - if ("Table" %in% safe_order) { - safe_order <- c("Table", setdiff(safe_order, "Table")) - } - final_table_summary <- final_table_summary %>% - select(all_of(safe_order)) - - DF_List <- c(DF_List, setNames(list(final_table_summary), "TranslationSummary")) - - return(DF_List) -} - - -########################################################################################## -### ### ### Inspect ID mapping to/from KEGG, PubChem, Chebi, HMDB ### ### ### -########################################################################################## -#' -#' @title Inspect ID -#' @description Inspect how well IDs map from the translated format (e.g. PubChem) to the original data format (e.g. KEGG), in terms of direct mapping, or one-to-many relationships. -#' @return Two data frames, the first of which is a summary of the mapping from Original to Translated, and the second of which is the reverse, from Translated to Original, with counts per unique ID and pathway. -#' @export -#' -InspectID <- function(Input_DataFrame, - SettingsInfo = list(OriginalIDcolumn="MetaboliteID", TranslatedCollapsedIDcolumn="chebi_collapsed", Pathway="term") - ) { - # note that 'translated' and 'original' could be swapped if we want to inspect the reverse mapping - this difference is mostly semantic - # input data frame should be what we have after the translation has occurred, and include a column with collapsed values in the case of multi-mapping (e.g. C1, C2, C3), but could be all 1-to-1 in rare cases. - original <- SettingsInfo[['OriginalIDcolumn']] - translated <- SettingsInfo[['TranslatedCollapsedIDcolumn']] - pathway <- SettingsInfo[['Pathway']] - - suppressMessages(library(tidyverse)) - - # Function to process collapsed field (e.g. "C1, C2, C3, C4") - process_collapsed <- function(collapsed, origin_info) { - if (length(collapsed) == 0 || is.null(collapsed) || is.na(collapsed)) { - return(list(None = origin_info)) - } else { - # Split the string by commas and return a named list - ids <- strsplit(collapsed, ",\\s*")[[1]] - return(setNames(rep(list(origin_info), length(ids)), ids)) - } - } - - # Main function to map IDs - map_ids <- function(df, target_col = 'collapsed', term_colname = 'term', idcolname = 'id') { - mapping_list <- list() - - # Iterate over rows of the dataframe - for (i in 1:nrow(df)) { - collapsed <- df[[target_col]][i] - term <- df[[term_colname]][i] - row_id <- df[[idcolname]][i] - - # Process the collapsed field - processed_data <- process_collapsed(collapsed, list(term = term, id = row_id)) - - # Append data to the mapping list - for (item in names(processed_data)) { - if (is.null(mapping_list[[item]])) { - mapping_list[[item]] <- list(processed_data[[item]]) - } else { - mapping_list[[item]] <- append(mapping_list[[item]], list(processed_data[[item]])) - } - } - } - - return(mapping_list) - } - - # Convert the mapping list into a data frame - list_to_df <- function(mapping_list) { - rows <- list() # Initialize a list to store the rows - - # Iterate through the mapping list and extract data - for (key in names(mapping_list)) { - for (mapping in mapping_list[[key]]) { - # Ensure that mapping is not NULL and contains term/id - if (!is.null(mapping) && !is.null(mapping$term) && !is.null(mapping$id)) { - # Append each row as a list to the rows list - rows[[length(rows) + 1]] <- list(ID = key, term = mapping$term, id = mapping$id) - } - } - } - - # Convert the list of rows into a data frame - df <- bind_rows(rows) - - return(df) - } - - inspect_mapping <- function(input_translated_df, - target_col_inp = 'chebi_collapsed', - term_colname_inp = 'term', - idcolname_inp = 'MetaboliteID') { - mapping_results <- map_ids(input_translated_df, - target_col = target_col_inp, - term_colname = term_colname_inp, - idcolname = idcolname_inp) - mapping_df <- list_to_df(mapping_results) - #print(colnames(mapping_df)) #usually results in "ID" "term" "id" - - mapping_df <- mapping_df %>% - rename(ID_translated = ID, ID_original = id, pathway = term) # assumes that term is the name of the original pathway colname, might still break, todo: fix later - - ### Key result - # Count how many unique terms and ids are associated with each ID - mapping_df_summary <- mapping_df %>% - group_by(ID_translated) %>% - summarize(pathway_count = n_distinct(pathway), ID_original_count = n_distinct(ID_original)) %>% - filter(pathway_count > 0 | ID_original_count > 0) # was previously filter(pathway_count > 1 | ID_original_count > 1) - #print(mapping_df_summary) - - ### Nested results - mapping_df_nested <- mapping_df %>% - group_by(ID_translated) %>% - summarize(pathways = toString(unique(pathway)), ID_originals = toString(unique(ID_original))) - - ### Output - results <- list( - mapping = mapping_df, - mapping_summary = mapping_df_summary, - mapping_nested = mapping_df_nested - ) - return(results) - } - - # Inspect how the mapping performs going from the translated IDs to the original IDs - # Note that we do this before Orig2Trans, because the translated data is already in a nicely formatted manner that is already collapsed - # If we wanted to refactor all this to be simpler, perhaps we could just start with the Orig2Trans counts, then flip it. But end result should be the same. - mapping_translated_to_original <- inspect_mapping(Input_DataFrame, - target_col_inp = translated, - term_colname_inp = pathway, - idcolname_inp = original) - # Join the summary with the nested table for easier user inspection - mapping_translated_to_original$mapping_joined <- mapping_translated_to_original$mapping_nested %>% - left_join(mapping_translated_to_original$mapping_summary, by = "ID_translated") %>% - rename("Original_IDs" = "ID_originals", "Translated_ID" = "ID_translated", "Original_IDs_count" = "ID_original_count", - "Pathways" = "pathways", "Pathways_count" = "pathway_count") - - # Now inspect the mapping going back from original to translated - mapping_original_to_translated <- mapping_translated_to_original$mapping_nested %>% - separate_rows("pathways", sep = ", ") %>% - inspect_mapping(target_col_inp = 'ID_originals', - term_colname_inp = "pathways", - idcolname_inp = 'ID_translated') - # Join the summary with the nested table for easier user inspection - # Also, correct instances of NA's that have counted towards the ID_original count... - # Note that the pathway count for these instances will still likely be a bit dodgy - # We also want to rename the columns, because the function named them anticipating we were going the other way - mapping_original_to_translated$mapping_joined <- mapping_original_to_translated$mapping_nested %>% - left_join(mapping_original_to_translated$mapping_summary, by = "ID_translated") %>% - mutate( - ID_original_count = if_else(is.na(ID_originals) | ID_originals == "NA", - ID_original_count - 1, - ID_original_count) - ) %>% - mutate( - pathway_count = if_else(is.na(ID_originals) | ID_originals == "NA", - 0, - pathway_count) - ) %>% - mutate( - pathways = if_else(is.na(ID_originals) | ID_originals == "NA", - "NA", - pathways) - ) %>% - rename("Translated_IDs" = "ID_originals", "Original_ID" = "ID_translated", "Translated_IDs_count" = "ID_original_count", - "Pathways" = "pathways", "Pathways_count" = "pathway_count") - - # Now let's add a column describing the type of relationship present... This could definitely be refactored to save the double up - mapping_original_to_translated$mapping_joined <- mapping_original_to_translated$mapping_joined %>% - mutate(Relationship = case_when( - Translated_IDs_count == 0 ~ "One-to-None", - Translated_IDs_count == 1 ~ "One-to-One", - Translated_IDs_count > 1 ~ "One-to-Many" - )) - mapping_translated_to_original$mapping_joined <- mapping_translated_to_original$mapping_joined %>% - mutate(Relationship = case_when( - Original_IDs_count == 0 ~ "One-to-None", - Original_IDs_count == 1 ~ "One-to-One", - Original_IDs_count > 1 ~ "One-to-Many" - )) - - - return(list("Orig2Trans"=mapping_original_to_translated$mapping_joined, "Trans2Orig"=mapping_translated_to_original$mapping_joined)) -} - - - - - ########################################################################################## ### ### ### Get KEGG prior knowledge ### ### ### ########################################################################################## @@ -795,12 +481,12 @@ LoadMetalinks <- function(types = NULL, ## Rearrange columns: MetalinksDB <- MetalinksDB[,c(2,10:12, 1, 13:14, 3:9)]%>% - mutate(type = case_when( + dplyr::mutate(type = dplyr::case_when( type == "lr" ~ "Ligand-Receptor", type == "pd" ~ "Production-Degradation", TRUE ~ type # this keeps the original value if it doesn't match any condition ))%>% - mutate(mode_of_regulation = case_when( + dplyr::mutate(mode_of_regulation = dplyr::case_when( mor == -1 ~ "Inhibiting", mor == 1 ~ "Activating", mor == 0 ~ "Binding", @@ -816,9 +502,9 @@ LoadMetalinks <- function(types = NULL, MetalinksDB_Type <-MetalinksDB[, c(1:3, 7,13)]%>% subset(!is.na(protein_type))%>% - mutate(term = gsub('\"', '', protein_type))%>% + dplyr::mutate(term = gsub('\"', '', protein_type))%>% unite(term_specific, c("term", "type"), sep = "_", remove = FALSE)%>% - select(-"type", -"protein_type") + dplyr::select(-"type", -"protein_type") #------------------------------------------------------------------ #Save results in folder @@ -840,4 +526,9 @@ LoadMetalinks <- function(types = NULL, } +########################################################################################## +### ### ### Helper: Load compound lists of ions, xenobiotics, cofactors (not detectable by LC-MS), etc ### ### ### +########################################################################################## + +# This is needed to remove ions, xenobiotics, cofactors, etc. from the metabolite list for any of the functions above.