Skip to content

Commit

Permalink
refactoring of the functions
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristinaSchmidt1 committed Nov 5, 2024
1 parent 666e7ee commit 849586e
Showing 1 changed file with 10 additions and 319 deletions.
329 changes: 10 additions & 319 deletions R/GetPriorKnoweldge.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## ---------------------------
##
## Script name: Make_GeneMetabSet
## Script name: GetPriorKnowledge
##
## Purpose of script: Create gene-metabolite sets for pathway enrichment analysis.
##
Expand All @@ -19,320 +19,6 @@
## ---------------------------


##########################################################################################
### ### ### Translate IDs to/from KEGG, PubChem, Chebi, HMDB ### ### ###
##########################################################################################
#'
#' @param Input_DataFrame Dataframe with two columns for source (=term) and Target (=gene), e.g. Hallmarks.
#' @param SettingsInfo \emph{Optional: } Column name of Target in Input_GeneSet. \strong{Default = list(IdColumn="MetaboliteID", FromFormat=c("kegg"), ToFormat=c("pubchem","chebi","hmdb"), Method="GetAll", GroupingVariable="term")}
#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt". \strong{Default = "csv"}
#' @param FolderPath {Optional:} String which is added to the resulting folder name \strong{Default = NULL}
#'
#' @title Translate IDs
#' @description Translate IDs to and from KEGG, PubChem, Chebi.
#' @return 3 data frames: 1) Original data and the new column of translated ids. 2) Mapping summary from Original ID to Translated. 3) Mapping summary from Translated to Original.
#' @export
#'
TranslateID <- function(Input_DataFrame,
SettingsInfo = list(IdColumn="MetaboliteID", FromFormat=c("kegg"), ToFormat=c("pubchem","chebi","hmdb"), Method="GetAll", GroupingVariable="term")
){

Output_DataFrame <- Input_DataFrame # making a copy of Input Dataframe so we can merge the results later
idcolname <- SettingsInfo[['IdColumn']]
from <- SettingsInfo[['FromFormat']]
to <- SettingsInfo[['ToFormat']]
method <- SettingsInfo[['Method']]
groupvar <- SettingsInfo[['GroupingVariable']]
DF_List <- list()

# Check that all Name types added by the user are present
userlist <- c(from, to)
allowedids <- c("kegg","pubchem","chebi","hmdb")
if (!all(userlist %in% allowedids)) {
stop(paste("We currently 'only' support ID translation between: kegg, pubchem, chebi, hmdb ;) Please also use lowercase. You entered:",
paste(userlist[!userlist %in% allowedids], collapse = ", ")))
}

print(paste('Using method ', method))

for (to_singular in to) {
# Rename and use OmnipathR to translate the ids. Note that the returned object (df_translated) will most likely have multiple mappings.
if (!from %in% names(Input_DataFrame)) {
Input_DataFrame <- Input_DataFrame %>%
dplyr::mutate(!!from := .[[idcolname]]) # This is used to keep the original column of the idcolname, otherwise we could use dplyr::rename(!!from := idcolname)
message("Created '", from, "' as a colname.")
} else {
message("Column '", from, "' already exists in the dataframe.")
}
# perform the basic translation - note, the results will be in long format likely with double ups
df_translated <- Input_DataFrame %>%
OmnipathR::translate_ids(!!from, !!to_singular, ramp = TRUE)
# now collapse the desired translated column rows into a single row for each group (e.g. by path term and metaboliteID), so that it looks like: "16680, 57856, 181457", for example
# we will also make the prefix '_collapsed' to distinguish it from other columns that might not be collapsed
df_translated <- df_translated %>% group_by(!!sym(from), !!sym(groupvar)) %>% summarize(!!paste0(to_singular, '_collapsed') := paste(!!sym(to_singular), collapse = ', '))
# previously we selected just the collapsed column, and ungroup it so that we don't keep the other grouping columns when we select, now we want to keep them so we can merge effectively
#collapsed_col <- df_translated %>% ungroup() %>% select(ends_with('_collapsed'))
collapsed_col <- df_translated %>% ungroup()

if (method == 'GetAll') {
print(paste('Converting from ', from, " to ", to_singular))
new_col <- collapsed_col

} else if (method == 'GetFirst') {
print(paste('Converting from ', from, " to ", to_singular))
print(glue::glue("WARNING: Only the first translated ID from <{to_singular}> will be returned for each unique ID from <{from}>."))

firstItem_cols <- collapsed_col %>%
mutate(!!paste0(to_singular, '_first') := ifelse(
!is.na(!!sym(paste0(to_singular, '_collapsed'))),
sapply(strsplit(!!sym(paste0(to_singular, '_collapsed')), ", "), `[`, 1), # Split and get the first element
NA))
firstItem_cols <- firstItem_cols %>%
select(-!!paste0(to_singular, '_collapsed')) #note this removes the 'collapsed' columns. If we wanted to keep these we could comment this part out...
new_col <- firstItem_cols

} else {
print('You may want to check the Method you are trying to use is implemented.')
}

# Now update the results from each type of ID translation into one main table, based off the users input table
#note if we change the grouping variable this might go kaputt
Output_DataFrame <- merge(x=Output_DataFrame, y=new_col,
by.x=c(idcolname, groupvar),
by.y=c(from,groupvar),
all.x=TRUE)
DF_List$Translated_DataFrame <- Output_DataFrame

# Now Inspect the IDs to identify potential problems with mapping - let's do this even if the user chose the GetFirst method, because it is very important to show the limitations...
mapping_inspection = InspectID(new_col, SettingsInfo = list(OriginalIDcolumn = from, TranslatedCollapsedIDcolumn = paste0(to_singular, '_collapsed'), Pathway = groupvar))
# Now rename the columns so the user knows what exactly the Original and Translated IDs are for each DF...
mapping_inspection$Orig2Trans <- mapping_inspection$Orig2Trans %>%
rename(!!paste0("Original_ID_", from) := "Original_ID", !!paste0("Translated_IDs_", to_singular) := "Translated_IDs")
mapping_inspection$Trans2Orig <- mapping_inspection$Trans2Orig %>%
rename(!!paste0("Original_IDs_", from) := "Original_IDs", !!paste0("Translated_ID_", to_singular) := "Translated_ID")

DF_List <- c(DF_List, setNames(list(mapping_inspection$Orig2Trans), paste0("Mapping_Orig2Trans_", from, '_to_', to_singular)))
DF_List <- c(DF_List, setNames(list(mapping_inspection$Trans2Orig), paste0("Mapping_Trans2Orig_", to_singular, '_to_', from)))


}
# Create a final summary table of the mapping issues
# Create a function to get the counts
get_counts <- function(table) {
table %>%
count(Relationship) %>%
rename(n = n) # Rename to n for consistency
}
# Extract counts for each table and add the table name
summary_table <- map_dfr(
seq_along(DF_List)[-1],
function(i) {
table <- DF_List[[i]]
table_name <- names(DF_List)[i]
counts <- get_counts(table)
counts <- counts %>% mutate(Table = table_name) # Add table name as a new column
return(counts)
}
)
# Pivot to a wider format
final_table_summary <- summary_table %>%
pivot_wider(names_from = Relationship, values_from = n, values_fill = 0)
# Reorder safely
desired_order <- c("Table", "One-to-None","One-to-One","One-to-Many")
existing_columns <- names(final_table_summary) # Get the existing columns in final_table
safe_order <- intersect(desired_order, existing_columns) # Create a safe order by intersecting desired_order with existing_columns
# Check if "Table" is in the safe order and ensure it's at the beginning
if ("Table" %in% safe_order) {
safe_order <- c("Table", setdiff(safe_order, "Table"))
}
final_table_summary <- final_table_summary %>%
select(all_of(safe_order))

DF_List <- c(DF_List, setNames(list(final_table_summary), "TranslationSummary"))

return(DF_List)
}


##########################################################################################
### ### ### Inspect ID mapping to/from KEGG, PubChem, Chebi, HMDB ### ### ###
##########################################################################################
#'
#' @title Inspect ID
#' @description Inspect how well IDs map from the translated format (e.g. PubChem) to the original data format (e.g. KEGG), in terms of direct mapping, or one-to-many relationships.
#' @return Two data frames, the first of which is a summary of the mapping from Original to Translated, and the second of which is the reverse, from Translated to Original, with counts per unique ID and pathway.
#' @export
#'
InspectID <- function(Input_DataFrame,
SettingsInfo = list(OriginalIDcolumn="MetaboliteID", TranslatedCollapsedIDcolumn="chebi_collapsed", Pathway="term")
) {
# note that 'translated' and 'original' could be swapped if we want to inspect the reverse mapping - this difference is mostly semantic
# input data frame should be what we have after the translation has occurred, and include a column with collapsed values in the case of multi-mapping (e.g. C1, C2, C3), but could be all 1-to-1 in rare cases.
original <- SettingsInfo[['OriginalIDcolumn']]
translated <- SettingsInfo[['TranslatedCollapsedIDcolumn']]
pathway <- SettingsInfo[['Pathway']]

suppressMessages(library(tidyverse))

# Function to process collapsed field (e.g. "C1, C2, C3, C4")
process_collapsed <- function(collapsed, origin_info) {
if (length(collapsed) == 0 || is.null(collapsed) || is.na(collapsed)) {
return(list(None = origin_info))
} else {
# Split the string by commas and return a named list
ids <- strsplit(collapsed, ",\\s*")[[1]]
return(setNames(rep(list(origin_info), length(ids)), ids))
}
}

# Main function to map IDs
map_ids <- function(df, target_col = 'collapsed', term_colname = 'term', idcolname = 'id') {
mapping_list <- list()

# Iterate over rows of the dataframe
for (i in 1:nrow(df)) {
collapsed <- df[[target_col]][i]
term <- df[[term_colname]][i]
row_id <- df[[idcolname]][i]

# Process the collapsed field
processed_data <- process_collapsed(collapsed, list(term = term, id = row_id))

# Append data to the mapping list
for (item in names(processed_data)) {
if (is.null(mapping_list[[item]])) {
mapping_list[[item]] <- list(processed_data[[item]])
} else {
mapping_list[[item]] <- append(mapping_list[[item]], list(processed_data[[item]]))
}
}
}

return(mapping_list)
}

# Convert the mapping list into a data frame
list_to_df <- function(mapping_list) {
rows <- list() # Initialize a list to store the rows

# Iterate through the mapping list and extract data
for (key in names(mapping_list)) {
for (mapping in mapping_list[[key]]) {
# Ensure that mapping is not NULL and contains term/id
if (!is.null(mapping) && !is.null(mapping$term) && !is.null(mapping$id)) {
# Append each row as a list to the rows list
rows[[length(rows) + 1]] <- list(ID = key, term = mapping$term, id = mapping$id)
}
}
}

# Convert the list of rows into a data frame
df <- bind_rows(rows)

return(df)
}

inspect_mapping <- function(input_translated_df,
target_col_inp = 'chebi_collapsed',
term_colname_inp = 'term',
idcolname_inp = 'MetaboliteID') {
mapping_results <- map_ids(input_translated_df,
target_col = target_col_inp,
term_colname = term_colname_inp,
idcolname = idcolname_inp)
mapping_df <- list_to_df(mapping_results)
#print(colnames(mapping_df)) #usually results in "ID" "term" "id"

mapping_df <- mapping_df %>%
rename(ID_translated = ID, ID_original = id, pathway = term) # assumes that term is the name of the original pathway colname, might still break, todo: fix later

### Key result
# Count how many unique terms and ids are associated with each ID
mapping_df_summary <- mapping_df %>%
group_by(ID_translated) %>%
summarize(pathway_count = n_distinct(pathway), ID_original_count = n_distinct(ID_original)) %>%
filter(pathway_count > 0 | ID_original_count > 0) # was previously filter(pathway_count > 1 | ID_original_count > 1)
#print(mapping_df_summary)

### Nested results
mapping_df_nested <- mapping_df %>%
group_by(ID_translated) %>%
summarize(pathways = toString(unique(pathway)), ID_originals = toString(unique(ID_original)))

### Output
results <- list(
mapping = mapping_df,
mapping_summary = mapping_df_summary,
mapping_nested = mapping_df_nested
)
return(results)
}

# Inspect how the mapping performs going from the translated IDs to the original IDs
# Note that we do this before Orig2Trans, because the translated data is already in a nicely formatted manner that is already collapsed
# If we wanted to refactor all this to be simpler, perhaps we could just start with the Orig2Trans counts, then flip it. But end result should be the same.
mapping_translated_to_original <- inspect_mapping(Input_DataFrame,
target_col_inp = translated,
term_colname_inp = pathway,
idcolname_inp = original)
# Join the summary with the nested table for easier user inspection
mapping_translated_to_original$mapping_joined <- mapping_translated_to_original$mapping_nested %>%
left_join(mapping_translated_to_original$mapping_summary, by = "ID_translated") %>%
rename("Original_IDs" = "ID_originals", "Translated_ID" = "ID_translated", "Original_IDs_count" = "ID_original_count",
"Pathways" = "pathways", "Pathways_count" = "pathway_count")

# Now inspect the mapping going back from original to translated
mapping_original_to_translated <- mapping_translated_to_original$mapping_nested %>%
separate_rows("pathways", sep = ", ") %>%
inspect_mapping(target_col_inp = 'ID_originals',
term_colname_inp = "pathways",
idcolname_inp = 'ID_translated')
# Join the summary with the nested table for easier user inspection
# Also, correct instances of NA's that have counted towards the ID_original count...
# Note that the pathway count for these instances will still likely be a bit dodgy
# We also want to rename the columns, because the function named them anticipating we were going the other way
mapping_original_to_translated$mapping_joined <- mapping_original_to_translated$mapping_nested %>%
left_join(mapping_original_to_translated$mapping_summary, by = "ID_translated") %>%
mutate(
ID_original_count = if_else(is.na(ID_originals) | ID_originals == "NA",
ID_original_count - 1,
ID_original_count)
) %>%
mutate(
pathway_count = if_else(is.na(ID_originals) | ID_originals == "NA",
0,
pathway_count)
) %>%
mutate(
pathways = if_else(is.na(ID_originals) | ID_originals == "NA",
"NA",
pathways)
) %>%
rename("Translated_IDs" = "ID_originals", "Original_ID" = "ID_translated", "Translated_IDs_count" = "ID_original_count",
"Pathways" = "pathways", "Pathways_count" = "pathway_count")

# Now let's add a column describing the type of relationship present... This could definitely be refactored to save the double up
mapping_original_to_translated$mapping_joined <- mapping_original_to_translated$mapping_joined %>%
mutate(Relationship = case_when(
Translated_IDs_count == 0 ~ "One-to-None",
Translated_IDs_count == 1 ~ "One-to-One",
Translated_IDs_count > 1 ~ "One-to-Many"
))
mapping_translated_to_original$mapping_joined <- mapping_translated_to_original$mapping_joined %>%
mutate(Relationship = case_when(
Original_IDs_count == 0 ~ "One-to-None",
Original_IDs_count == 1 ~ "One-to-One",
Original_IDs_count > 1 ~ "One-to-Many"
))


return(list("Orig2Trans"=mapping_original_to_translated$mapping_joined, "Trans2Orig"=mapping_translated_to_original$mapping_joined))
}





##########################################################################################
### ### ### Get KEGG prior knowledge ### ### ###
##########################################################################################
Expand Down Expand Up @@ -795,12 +481,12 @@ LoadMetalinks <- function(types = NULL,

## Rearrange columns:
MetalinksDB <- MetalinksDB[,c(2,10:12, 1, 13:14, 3:9)]%>%
mutate(type = case_when(
dplyr::mutate(type = dplyr::case_when(
type == "lr" ~ "Ligand-Receptor",
type == "pd" ~ "Production-Degradation",
TRUE ~ type # this keeps the original value if it doesn't match any condition
))%>%
mutate(mode_of_regulation = case_when(
dplyr::mutate(mode_of_regulation = dplyr::case_when(
mor == -1 ~ "Inhibiting",
mor == 1 ~ "Activating",
mor == 0 ~ "Binding",
Expand All @@ -816,9 +502,9 @@ LoadMetalinks <- function(types = NULL,

MetalinksDB_Type <-MetalinksDB[, c(1:3, 7,13)]%>%
subset(!is.na(protein_type))%>%
mutate(term = gsub('\"', '', protein_type))%>%
dplyr::mutate(term = gsub('\"', '', protein_type))%>%
unite(term_specific, c("term", "type"), sep = "_", remove = FALSE)%>%
select(-"type", -"protein_type")
dplyr::select(-"type", -"protein_type")

#------------------------------------------------------------------
#Save results in folder
Expand All @@ -840,4 +526,9 @@ LoadMetalinks <- function(types = NULL,

}

##########################################################################################
### ### ### Helper: Load compound lists of ions, xenobiotics, cofactors (not detectable by LC-MS), etc ### ### ###
##########################################################################################

# This is needed to remove ions, xenobiotics, cofactors, etc. from the metabolite list for any of the functions above.

0 comments on commit 849586e

Please sign in to comment.