Imported QFeatures Reading from Norman

ipb-halle · Aug 22, 2024 · b47dad7 · b47dad7
1 parent 4617bdc
commit b47dad7
Show file tree

Hide file tree

Showing 9 changed files with 449 additions and 21 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,11 @@
 Package: MetFamily
 Type: Package
 Title: MetFamily: Discovering Regulated Metabolite Families in Untargeted Metabolomics Studies
-Version: 0.99.3
-Date: 2024-03-02
+Version: 0.99.4
+Date: 2024-08-22
 Author: c( person("Hendrik", "Treutler, role = c("aut"), email = "[email protected]"),
            person("Khabat", "Vahabi, role = c("aut"), email = "[email protected]"),
+           person("Norman", "Storz", role = c("aut"), email = "[email protected]"),
            person(given = "Steffen", family = "Neumann", email = "[email protected]",
            role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7899-7192")) )
 Depends:
@@ -37,6 +38,7 @@ Imports:
     graphics,
     grDevices,
     methods,
+    QFeatures,
     stats,
     utils
 Remotes: decisionpatterns/searchable
@@ -63,5 +65,8 @@ Collate:
     'TreeAlgorithms.R'
     'Plots.R'
     'R_packages.R'
+    parsePeakAbundanceMatrixQF.R
+    readMSDial.R
+    readMetaboScape.R
 RoxygenNote: 7.2.3
 Encoding: UTF-8
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,4 +1,26 @@
-# Generated by roxyXXXgen2: do not edit by hand
+# Generated by roxygen2: do not edit by hand
+
+export(calcPlotDendrogram_plotly)
+export(calcPlotHeatmapLegend)
+export(castListEntries)
+export(data.numericmatrix)
+export(metaboliteFamilyVersusClass)
+export(mzClustGeneric)
+export(parsePeakAbundanceMatrixQF)
+export(processMS1data)
+export(readClusterDataFromProjectFile)
+export(readMSDial)
+export(readMetaboscape)
+export(readProjectData)
+export(runMetFamily)
+importFrom(QFeatures,QFeatures)
+importFrom(S4Vectors,DataFrame)
+importFrom(SummarizedExperiment,SummarizedExperiment)
+importFrom(grDevices,colorRampPalette)
+importFrom(grDevices,rainbow)
+importFrom(grDevices,rgb)
+importFrom(openxlsx2,read_xlsx)
+
 
 ## Hardcoded:
 exportPattern("^[^\\.]")
@@ -14,15 +36,6 @@ importFrom("stats", "as.dendrogram", "cor", "dendrapply", "dist",
 importFrom("utils", "flush.console", "read.table")
 
 
-## From roxygen:
-export(calcPlotDendrogram_plotly)
-export(calcPlotHeatmapLegend)
-export(castListEntries)
-export(data.numericmatrix)
-export(metaboliteFamilyVersusClass)
-export(mzClustGeneric)
-export(processMS1data)
-export(readClusterDataFromProjectFile)
-export(readProjectData)
-export(runMetFamily)
-importFrom(grDevices,colorRampPalette)
+importFrom(SummarizedExperiment,colData)
+importFrom(SummarizedExperiment,rowData)
+importFrom(SummarizedExperiment,assay)
diff --git a/R/FragmentMatrixFunctions.R b/R/FragmentMatrixFunctions.R
@@ -1396,7 +1396,7 @@ mzClustGeneric <- function(p,
   return(list(mat=groupmat,idx=groupindex))
 }
 
-convertToProjectFile <- function(filePeakMatrix, 
+convertToProjectFile <- function(filePeakMatrixPath, 
                                  fileSpectra, 
                                  parameterSet, 
                                  progress = FALSE){
@@ -1434,8 +1434,11 @@ convertToProjectFile <- function(filePeakMatrix,
 
   rm(returnObj)
 
+
+  filePeakMatrixQF <- readMSDial(filePeakMatrixPath)
+
   returnObj <- convertToProjectFile2(
-    filePeakMatrix = filePeakMatrix, 
+    filePeakMatrixQF = filePeakMatrixQF, 
     spectraList = spectraList, precursorMz = precursorMz, precursorRt = precursorRt, 
     metaboliteFamilies = rep(x = "", times = numberOfSpectra), uniqueMetaboliteFamilies = NULL, metaboliteFamilyColors = NULL, 
     parameterSet = parameterSet, 
@@ -1454,7 +1457,7 @@ convertToProjectFile <- function(filePeakMatrix,
   return(returnObj)
 }
 
-convertToProjectFile2 <- function(filePeakMatrix, 
+convertToProjectFile2 <- function(filePeakMatrixQF, 
                                   spectraList, 
                                   precursorMz, 
                                   precursorRt, 
@@ -1471,9 +1474,9 @@ convertToProjectFile2 <- function(filePeakMatrix,
   ## metabolite profile
   if(!is.na(progress))  if(progress)  incProgress(amount = 0.1, detail = paste("Parsing MS1 file...", sep = "")) else print(paste("Parsing MS1 file...", sep = ""))
 
-  if(!is.null(filePeakMatrix)){
-    returnObj <- parsePeakAbundanceMatrix(
-      filePeakMatrix = filePeakMatrix, 
+  if(!is.null(filePeakMatrixQF)){
+    returnObj <- parsePeakAbundanceMatrixQF(
+      qfeatures = filePeakMatrixQF, 
       doPrecursorDeisotoping = parameterSet$doPrecursorDeisotoping, 
       mzDeviationInPPM_precursorDeisotoping = parameterSet$mzDeviationInPPM_precursorDeisotoping, 
       mzDeviationAbsolute_precursorDeisotoping = parameterSet$mzDeviationAbsolute_precursorDeisotoping, 

diff --git a/R/parsePeakAbundanceMatrixQF.R b/R/parsePeakAbundanceMatrixQF.R
@@ -0,0 +1,175 @@
+#' Title
+#'
+#' @param qfeatures Object of type QFeatures 
+#' @param doPrecursorDeisotoping 
+#' @param mzDeviationInPPM_precursorDeisotoping 
+#' @param mzDeviationAbsolute_precursorDeisotoping 
+#' @param maximumRtDifference 
+#' @param progress boolean whether or not to show a progress bar
+#'
+#' @return
+#' @export
+#'
+#' @examples
+parsePeakAbundanceMatrixQF <- function(qfeatures, 
+                                     doPrecursorDeisotoping, 
+                                     mzDeviationInPPM_precursorDeisotoping, 
+                                     mzDeviationAbsolute_precursorDeisotoping, 
+                                     maximumRtDifference, 
+                                     progress=FALSE)
+{
+  ## read file
+
+  if(!is.na(progress)) {
+    if(progress) {
+      incProgress(amount = 0.1, detail = paste("Parsing MS1 file content...", sep = ""))
+    } else {
+      print(paste("Parsing MS1 file content...", sep = ""))
+    } 
+  }
+
+  cols_to_exclude <- c("Reference RT","Reference m/z","Comment",
+                       "Manually modified for quantification",
+                       "Total score","RT similarity","Average","Stdev") 
+
+  cols_to_keep <- which(!colnames(rowData(qfeatures))[[1]] %in% cols_to_exclude)
+
+  dataFrame <- cbind(rowData(qfeatures)[[1]][,cols_to_keep] ,assay(qfeatures))
+
+  ncol(rowData(qfeatures)[[1]]) 
+  numberOfPrecursors <- nrow(dataFrame)
+  numberOfPrecursorsPrior <- numberOfPrecursors 
+
+
+
+  if(nrow(colData(qfeatures))>0){
+
+    dataColumnStartEndIndeces <- 1
+    numberOfDataColumns   <- nrow(colData(qfeatures))
+    sampleClass           <- colData(qfeatures)$Class
+    sampleType            <- colData(qfeatures)$Type
+    sampleInjectionOrder  <- colData(qfeatures)$"Injection order"
+    batchID               <- NULL
+    if(! is.null(colData(qfeatures)$BatchID))
+      batchID            <- colData(qfeatures)$BatchID
+
+    }   else {
+    dataColumnStartEndIndeces <- NULL
+    numberOfDataColumns <- 0
+    sampleClass          <- NULL
+    sampleType           <- NULL
+    sampleInjectionOrder <- NULL
+    batchID              <- NULL
+  }
+
+  commaNumbers <- sum(grepl(x = dataFrame$"Average Mz", pattern = "^(\\d+,\\d+$)|(^\\d+$)"))
+  decimalSeparatorIsComma <- commaNumbers == nrow(dataFrame)
+  if(decimalSeparatorIsComma){
+    if(!is.null(dataFrame$"Average Rt(min)"))     dataFrame$"Average Rt(min)"     <- gsub(x = gsub(x = dataFrame$"Average Rt(min)", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".")
+    if(!is.null(dataFrame$"Average Mz"))          dataFrame$"Average Mz"          <- gsub(x = gsub(x = dataFrame$"Average Mz", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".")
+    if(!is.null(dataFrame$"Fill %"))              dataFrame$"Fill %"              <- gsub(x = gsub(x = dataFrame$"Fill %", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".")
+    if(!is.null(dataFrame$"Dot product"))         dataFrame$"Dot product"         <- gsub(x = gsub(x = dataFrame$"Dot product", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".")
+    if(!is.null(dataFrame$"Reverse dot product")) dataFrame$"Reverse dot product" <- gsub(x = gsub(x = dataFrame$"Reverse dot product", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".")
+    if(!is.null(dataFrame$"Fragment presence %")) dataFrame$"Fragment presence %" <- gsub(x = gsub(x = dataFrame$"Fragment presence %", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".")
+
+    ## replace -1 by 0
+    if(numberOfDataColumns > 0) {
+      for(colIdx in dataColumnStartEndIndeces[[1]]:dataColumnStartEndIndeces[[2]]){
+        dataFrame[ , colIdx] <- gsub(x = gsub(x = dataFrame[ , colIdx], pattern = "\\.", replacement = ""), pattern = ",", replacement = ".")
+      }
+    }
+  }
+
+  ###################
+  ## column formats
+  if(!is.null(dataFrame$"Average Rt(min)"))     dataFrame$"Average Rt(min)"     <- as.numeric(dataFrame$"Average Rt(min)")
+  if(!is.null(dataFrame$"Average Mz"))          dataFrame$"Average Mz"          <- as.numeric(dataFrame$"Average Mz")
+  if(!is.null(dataFrame$"Fill %"))              dataFrame$"Fill %"              <- as.numeric(dataFrame$"Fill %")
+  if(!is.null(dataFrame$"MS/MS included"))      dataFrame$"MS/MS included"      <- as.logical(dataFrame$"MS/MS included")
+  if(!is.null(dataFrame$"Dot product"))         dataFrame$"Dot product"         <- as.numeric(dataFrame$"Dot product")
+  if(!is.null(dataFrame$"Reverse dot product")) dataFrame$"Reverse dot product" <- as.numeric(dataFrame$"Reverse dot product")
+  if(!is.null(dataFrame$"Fragment presence %")) dataFrame$"Fragment presence %" <- as.numeric(dataFrame$"Fragment presence %")
+
+  #####################
+  ## sorted by m/z (needed for deisotoping)
+  if(!is.null(dataFrame$"Average Mz"))
+    dataFrame <- dataFrame[order(dataFrame$"Average Mz"), ]
+
+  ## replace -1 by 0
+  if(numberOfDataColumns > 0){
+    for(colIdx in dataColumnStartEndIndeces[[1]]:dataColumnStartEndIndeces[[2]]){
+      dataFrame[ , colIdx] <- as.numeric(dataFrame[ , colIdx])
+      if(!is.na(sum(dataFrame[,colIdx] == -1)))
+        dataFrame[(dataFrame[,colIdx] == -1),colIdx] <- 0
+    }
+  }
+
+  ## deisotoping
+  numberOfRemovedIsotopePeaks <- 0
+  if(doPrecursorDeisotoping & !is.null(dataFrame$"Average Mz")){
+    if(!is.na(progress))  if(progress)  incProgress(amount = 0, detail = paste("Precursor deisotoping...", sep = "")) else print(paste("Precursor deisotoping...", sep = ""))
+    distance13Cminus12C <- 1.0033548378
+
+    ## mark isotope precursors
+    precursorsToRemove <- vector(mode = "logical", length = numberOfPrecursors)
+
+    if(numberOfDataColumns > 0){
+      intensities <- dataFrame[ , dataColumnStartEndIndeces[[1]]:dataColumnStartEndIndeces[[2]]]
+      medians <- apply(X = as.matrix(intensities), MARGIN = 1, FUN = median)
+    }
+
+    for(precursorIdx in seq_len(numberOfPrecursors)){
+      if((precursorIdx %% (as.integer(numberOfPrecursors/10))) == 0)
+        if(!is.na(progress))  if(progress)  incProgress(amount = 0.0, detail = paste("Precursor deisotoping ", precursorIdx, " / ", numberOfPrecursors, sep = "")) else print(paste("Precursor deisotoping ", precursorIdx, " / ", numberOfPrecursors, sep = ""))
+
+      mzError <- dataFrame$"Average Mz"[[precursorIdx]] * mzDeviationInPPM_precursorDeisotoping / 1000000
+      mzError <- max(mzError, mzDeviationAbsolute_precursorDeisotoping)
+
+      ## RT difference <= maximumRtDifference
+      validPrecursorsInRt <- abs(dataFrame$"Average Rt(min)"[[precursorIdx]] - dataFrame$"Average Rt(min)"[-precursorIdx]) <= maximumRtDifference
+
+      ## MZ difference around 1.0033548378 (first isotope) or 1.0033548378 * 2 (second isotope)
+      validPrecursorsInMz1 <- abs((dataFrame$"Average Mz"[[precursorIdx]] - distance13Cminus12C * 1) - dataFrame$"Average Mz"[-precursorIdx]) <= mzError
+      validPrecursorsInMz2 <- abs((dataFrame$"Average Mz"[[precursorIdx]] - distance13Cminus12C * 2) - dataFrame$"Average Mz"[-precursorIdx]) <= mzError
+      validPrecursorsInMz <- validPrecursorsInMz1 | validPrecursorsInMz2
+
+      ## intensity gets smaller in the isotope spectrum
+      if(numberOfDataColumns > 0){
+        validPrecursorsInIntensity <- (medians[-precursorIdx] - medians[[precursorIdx]]) > 0
+      } else {
+        validPrecursorsInIntensity <- TRUE
+      }
+
+      if(any(validPrecursorsInRt & validPrecursorsInMz & validPrecursorsInIntensity))
+        precursorsToRemove[[precursorIdx]] <- TRUE
+    }
+
+    ## remove isotopes
+    dataFrame <- dataFrame[!precursorsToRemove, ]
+
+    numberOfRemovedIsotopePeaks <- sum(precursorsToRemove)
+    numberOfPrecursors <- nrow(dataFrame)
+  }
+
+  if(!is.na(progress))  if(progress)  incProgress(amount = 0, detail = paste("Boxing...", sep = "")) else print(paste("Boxing...", sep = ""))
+  returnObj <- list()
+  returnObj$dataFrame <- dataFrame
+
+  ## meta
+  returnObj$oldFormat <- oldFormat
+  returnObj$numberOfPrecursors <- numberOfPrecursors
+  returnObj$dataColumnStartEndIndeces <- dataColumnStartEndIndeces
+  returnObj$numberOfDataColumns <- numberOfDataColumns
+
+  ## group anno
+  returnObj$sampleClass          <- sampleClass
+  returnObj$sampleType           <- sampleType
+  returnObj$sampleInjectionOrder <- sampleInjectionOrder
+  returnObj$batchID              <- batchID
+
+  ## misc
+  returnObj$numberOfPrecursorsPrior <- numberOfPrecursorsPrior
+  returnObj$numberOfRemovedIsotopePeaks <- numberOfRemovedIsotopePeaks
+
+  return (returnObj)
+}
diff --git a/R/readMSDial.R b/R/readMSDial.R
@@ -0,0 +1,91 @@
+#' Read MS-DIAL Output File into a QFeatures Object
+#'
+#' This function reads the output file from MS-DIAL and 
+#' converts it into a QFeatures object.
+#'
+#' @param file A string with the path to the MS-DIAL output file.
+#' @param version A character string specifying the version of MS-DIAL used to generate the file.
+#'   This parameter is currently not used.
+#'
+#' @return A QFeatures object containing:
+#'   \itemize{
+#'     \item An assay named "exampleAssay" with the metabolite counts.
+#'     \item Row data (feature metadata) extracted from the input file.
+#'     \item Column data (sample metadata) extracted from the input file.
+#'   }
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' # Assuming you have an MS-DIAL output file named "Metabolite_profile_showcase.txt" in a "data" directory:
+#' qf <- readMSDial("data/Metabolite_profile_showcase.txt")
+#' 
+#' # Examine the structure of the resulting QFeatures object
+#' qf
+#' 
+#' # Access the assay data
+#' assay(qf[["exampleAssay"]])
+#' 
+#' # Access the row data (feature metadata)
+#' rowData(qf[["exampleAssay"]])
+#' 
+#' # Access the column data (sample metadata)
+#' colData(qf)
+#' }
+#'
+#' @importFrom QFeatures QFeatures
+#' @importFrom SummarizedExperiment SummarizedExperiment
+#' @importFrom S4Vectors DataFrame
+#'
+#' @seealso 
+#' \code{\link[QFeatures]{QFeatures}} for more information on the QFeatures class.
+#' \code{\link[SummarizedExperiment]{SummarizedExperiment}} for details on the underlying data structure.
+#'
+#' @note 
+#'
+#'
+#' @references
+#' 
+readMSDial <- function(file, version){
+    table <- read.table(file, fill = TRUE, sep = "\t",
+                        quote = "", header = FALSE)
+
+    # Identify the starting row and column of the data
+    startRow <- which(table[, 1] != "")[1]
+    startCol <- which(table[1, ] != "")[1]
+    ##TODO: version dependent error message if startRow or startCol are not as expected.
+
+    # Split the table in parts
+    colDataRaw <- table[1:startRow, startCol:ncol(table)]
+    rowDataRaw <- table[startRow:nrow(table), 1:(startCol)]
+    countsRaw <- table[startRow:nrow(table), startCol:ncol(table)]
+
+    # Extract ids and counts data
+    ids <- rowDataRaw[-1, 1]
+    counts <- as.matrix(countsRaw[-1, -1])
+    counts <- matrix(as.numeric(counts), nrow = nrow(counts), ncol = ncol(counts))
+    colnames(counts) <- as.character(countsRaw[1, -1])
+    rownames(counts) <- ids
+
+    # Ensure row names of colData match counts column names
+    colData <- DataFrame(t(colDataRaw[-nrow(colDataRaw), -1]))
+    rownames(colData) <- as.character(colDataRaw[nrow(colDataRaw), -1])
+    colnames(colData) <- as.character(colDataRaw[-nrow(colDataRaw), 1])
+
+    # Ensure row names of rowData match counts row names
+    rowData <- DataFrame(rowDataRaw[-1, ], row.names = ids)
+    colnames(rowData) <- as.character(rowDataRaw[1,])
+
+    # Create SummarizedExperiment object
+
+    sumExp <- SummarizedExperiment(assays = list(counts = counts),
+                                    rowData = rowData,
+                                    colData = colData)
+    ##TODO: Metadata with data source and version 
+
+    # Create QFeatures object
+    qf <- QFeatures(list(exampleAssay = sumExp), colData = colData(sumExp))
+    qf
+    ##TODO: name
+  }