diff --git a/DESCRIPTION b/DESCRIPTION index 6662d44..c96135d 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,11 @@ Package: MetFamily Type: Package Title: MetFamily: Discovering Regulated Metabolite Families in Untargeted Metabolomics Studies -Version: 0.99.3 -Date: 2024-03-02 +Version: 0.99.4 +Date: 2024-08-22 Author: c( person("Hendrik", "Treutler, role = c("aut"), email = "hendrik.treutler@ipb-halle.de"), person("Khabat", "Vahabi, role = c("aut"), email = "khabat.vahabi@ipb-halle.de"), + person("Norman", "Storz", role = c("aut"), email = "nstorz@ipb-halle.de"), person(given = "Steffen", family = "Neumann", email = "sneumann@ipb-halle.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7899-7192")) ) Depends: @@ -37,6 +38,7 @@ Imports: graphics, grDevices, methods, + QFeatures, stats, utils Remotes: decisionpatterns/searchable @@ -63,5 +65,8 @@ Collate: 'TreeAlgorithms.R' 'Plots.R' 'R_packages.R' + parsePeakAbundanceMatrixQF.R + readMSDial.R + readMetaboScape.R RoxygenNote: 7.2.3 Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE old mode 100755 new mode 100644 index 0156efc..a72af06 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,26 @@ -# Generated by roxyXXXgen2: do not edit by hand +# Generated by roxygen2: do not edit by hand + +export(calcPlotDendrogram_plotly) +export(calcPlotHeatmapLegend) +export(castListEntries) +export(data.numericmatrix) +export(metaboliteFamilyVersusClass) +export(mzClustGeneric) +export(parsePeakAbundanceMatrixQF) +export(processMS1data) +export(readClusterDataFromProjectFile) +export(readMSDial) +export(readMetaboscape) +export(readProjectData) +export(runMetFamily) +importFrom(QFeatures,QFeatures) +importFrom(S4Vectors,DataFrame) +importFrom(SummarizedExperiment,SummarizedExperiment) +importFrom(grDevices,colorRampPalette) +importFrom(grDevices,rainbow) +importFrom(grDevices,rgb) +importFrom(openxlsx2,read_xlsx) + ## Hardcoded: exportPattern("^[^\\.]") @@ -14,15 +36,6 @@ importFrom("stats", "as.dendrogram", "cor", "dendrapply", "dist", importFrom("utils", "flush.console", "read.table") -## From roxygen: -export(calcPlotDendrogram_plotly) -export(calcPlotHeatmapLegend) -export(castListEntries) -export(data.numericmatrix) -export(metaboliteFamilyVersusClass) -export(mzClustGeneric) -export(processMS1data) -export(readClusterDataFromProjectFile) -export(readProjectData) -export(runMetFamily) -importFrom(grDevices,colorRampPalette) +importFrom(SummarizedExperiment,colData) +importFrom(SummarizedExperiment,rowData) +importFrom(SummarizedExperiment,assay) diff --git a/R/FragmentMatrixFunctions.R b/R/FragmentMatrixFunctions.R index c3864c8..44c03a8 100644 --- a/R/FragmentMatrixFunctions.R +++ b/R/FragmentMatrixFunctions.R @@ -1396,7 +1396,7 @@ mzClustGeneric <- function(p, return(list(mat=groupmat,idx=groupindex)) } -convertToProjectFile <- function(filePeakMatrix, +convertToProjectFile <- function(filePeakMatrixPath, fileSpectra, parameterSet, progress = FALSE){ @@ -1434,8 +1434,11 @@ convertToProjectFile <- function(filePeakMatrix, rm(returnObj) + + filePeakMatrixQF <- readMSDial(filePeakMatrixPath) + returnObj <- convertToProjectFile2( - filePeakMatrix = filePeakMatrix, + filePeakMatrixQF = filePeakMatrixQF, spectraList = spectraList, precursorMz = precursorMz, precursorRt = precursorRt, metaboliteFamilies = rep(x = "", times = numberOfSpectra), uniqueMetaboliteFamilies = NULL, metaboliteFamilyColors = NULL, parameterSet = parameterSet, @@ -1454,7 +1457,7 @@ convertToProjectFile <- function(filePeakMatrix, return(returnObj) } -convertToProjectFile2 <- function(filePeakMatrix, +convertToProjectFile2 <- function(filePeakMatrixQF, spectraList, precursorMz, precursorRt, @@ -1471,9 +1474,9 @@ convertToProjectFile2 <- function(filePeakMatrix, ## metabolite profile if(!is.na(progress)) if(progress) incProgress(amount = 0.1, detail = paste("Parsing MS1 file...", sep = "")) else print(paste("Parsing MS1 file...", sep = "")) - if(!is.null(filePeakMatrix)){ - returnObj <- parsePeakAbundanceMatrix( - filePeakMatrix = filePeakMatrix, + if(!is.null(filePeakMatrixQF)){ + returnObj <- parsePeakAbundanceMatrixQF( + qfeatures = filePeakMatrixQF, doPrecursorDeisotoping = parameterSet$doPrecursorDeisotoping, mzDeviationInPPM_precursorDeisotoping = parameterSet$mzDeviationInPPM_precursorDeisotoping, mzDeviationAbsolute_precursorDeisotoping = parameterSet$mzDeviationAbsolute_precursorDeisotoping, diff --git a/R/parsePeakAbundanceMatrixQF.R b/R/parsePeakAbundanceMatrixQF.R new file mode 100644 index 0000000..c5cb0b2 --- /dev/null +++ b/R/parsePeakAbundanceMatrixQF.R @@ -0,0 +1,175 @@ +#' Title +#' +#' @param qfeatures Object of type QFeatures +#' @param doPrecursorDeisotoping +#' @param mzDeviationInPPM_precursorDeisotoping +#' @param mzDeviationAbsolute_precursorDeisotoping +#' @param maximumRtDifference +#' @param progress boolean whether or not to show a progress bar +#' +#' @return +#' @export +#' +#' @examples +parsePeakAbundanceMatrixQF <- function(qfeatures, + doPrecursorDeisotoping, + mzDeviationInPPM_precursorDeisotoping, + mzDeviationAbsolute_precursorDeisotoping, + maximumRtDifference, + progress=FALSE) +{ + ## read file + + if(!is.na(progress)) { + if(progress) { + incProgress(amount = 0.1, detail = paste("Parsing MS1 file content...", sep = "")) + } else { + print(paste("Parsing MS1 file content...", sep = "")) + } + } + + cols_to_exclude <- c("Reference RT","Reference m/z","Comment", + "Manually modified for quantification", + "Total score","RT similarity","Average","Stdev") + + cols_to_keep <- which(!colnames(rowData(qfeatures))[[1]] %in% cols_to_exclude) + + dataFrame <- cbind(rowData(qfeatures)[[1]][,cols_to_keep] ,assay(qfeatures)) + + ncol(rowData(qfeatures)[[1]]) + numberOfPrecursors <- nrow(dataFrame) + numberOfPrecursorsPrior <- numberOfPrecursors + + + + if(nrow(colData(qfeatures))>0){ + + dataColumnStartEndIndeces <- 1 + numberOfDataColumns <- nrow(colData(qfeatures)) + sampleClass <- colData(qfeatures)$Class + sampleType <- colData(qfeatures)$Type + sampleInjectionOrder <- colData(qfeatures)$"Injection order" + batchID <- NULL + if(! is.null(colData(qfeatures)$BatchID)) + batchID <- colData(qfeatures)$BatchID + + } else { + dataColumnStartEndIndeces <- NULL + numberOfDataColumns <- 0 + sampleClass <- NULL + sampleType <- NULL + sampleInjectionOrder <- NULL + batchID <- NULL + } + + commaNumbers <- sum(grepl(x = dataFrame$"Average Mz", pattern = "^(\\d+,\\d+$)|(^\\d+$)")) + decimalSeparatorIsComma <- commaNumbers == nrow(dataFrame) + if(decimalSeparatorIsComma){ + if(!is.null(dataFrame$"Average Rt(min)")) dataFrame$"Average Rt(min)" <- gsub(x = gsub(x = dataFrame$"Average Rt(min)", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".") + if(!is.null(dataFrame$"Average Mz")) dataFrame$"Average Mz" <- gsub(x = gsub(x = dataFrame$"Average Mz", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".") + if(!is.null(dataFrame$"Fill %")) dataFrame$"Fill %" <- gsub(x = gsub(x = dataFrame$"Fill %", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".") + if(!is.null(dataFrame$"Dot product")) dataFrame$"Dot product" <- gsub(x = gsub(x = dataFrame$"Dot product", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".") + if(!is.null(dataFrame$"Reverse dot product")) dataFrame$"Reverse dot product" <- gsub(x = gsub(x = dataFrame$"Reverse dot product", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".") + if(!is.null(dataFrame$"Fragment presence %")) dataFrame$"Fragment presence %" <- gsub(x = gsub(x = dataFrame$"Fragment presence %", pattern = "\\.", replacement = ""), pattern = ",", replacement = ".") + + ## replace -1 by 0 + if(numberOfDataColumns > 0) { + for(colIdx in dataColumnStartEndIndeces[[1]]:dataColumnStartEndIndeces[[2]]){ + dataFrame[ , colIdx] <- gsub(x = gsub(x = dataFrame[ , colIdx], pattern = "\\.", replacement = ""), pattern = ",", replacement = ".") + } + } + } + + ################### + ## column formats + if(!is.null(dataFrame$"Average Rt(min)")) dataFrame$"Average Rt(min)" <- as.numeric(dataFrame$"Average Rt(min)") + if(!is.null(dataFrame$"Average Mz")) dataFrame$"Average Mz" <- as.numeric(dataFrame$"Average Mz") + if(!is.null(dataFrame$"Fill %")) dataFrame$"Fill %" <- as.numeric(dataFrame$"Fill %") + if(!is.null(dataFrame$"MS/MS included")) dataFrame$"MS/MS included" <- as.logical(dataFrame$"MS/MS included") + if(!is.null(dataFrame$"Dot product")) dataFrame$"Dot product" <- as.numeric(dataFrame$"Dot product") + if(!is.null(dataFrame$"Reverse dot product")) dataFrame$"Reverse dot product" <- as.numeric(dataFrame$"Reverse dot product") + if(!is.null(dataFrame$"Fragment presence %")) dataFrame$"Fragment presence %" <- as.numeric(dataFrame$"Fragment presence %") + + ##################### + ## sorted by m/z (needed for deisotoping) + if(!is.null(dataFrame$"Average Mz")) + dataFrame <- dataFrame[order(dataFrame$"Average Mz"), ] + + ## replace -1 by 0 + if(numberOfDataColumns > 0){ + for(colIdx in dataColumnStartEndIndeces[[1]]:dataColumnStartEndIndeces[[2]]){ + dataFrame[ , colIdx] <- as.numeric(dataFrame[ , colIdx]) + if(!is.na(sum(dataFrame[,colIdx] == -1))) + dataFrame[(dataFrame[,colIdx] == -1),colIdx] <- 0 + } + } + + ## deisotoping + numberOfRemovedIsotopePeaks <- 0 + if(doPrecursorDeisotoping & !is.null(dataFrame$"Average Mz")){ + if(!is.na(progress)) if(progress) incProgress(amount = 0, detail = paste("Precursor deisotoping...", sep = "")) else print(paste("Precursor deisotoping...", sep = "")) + distance13Cminus12C <- 1.0033548378 + + ## mark isotope precursors + precursorsToRemove <- vector(mode = "logical", length = numberOfPrecursors) + + if(numberOfDataColumns > 0){ + intensities <- dataFrame[ , dataColumnStartEndIndeces[[1]]:dataColumnStartEndIndeces[[2]]] + medians <- apply(X = as.matrix(intensities), MARGIN = 1, FUN = median) + } + + for(precursorIdx in seq_len(numberOfPrecursors)){ + if((precursorIdx %% (as.integer(numberOfPrecursors/10))) == 0) + if(!is.na(progress)) if(progress) incProgress(amount = 0.0, detail = paste("Precursor deisotoping ", precursorIdx, " / ", numberOfPrecursors, sep = "")) else print(paste("Precursor deisotoping ", precursorIdx, " / ", numberOfPrecursors, sep = "")) + + mzError <- dataFrame$"Average Mz"[[precursorIdx]] * mzDeviationInPPM_precursorDeisotoping / 1000000 + mzError <- max(mzError, mzDeviationAbsolute_precursorDeisotoping) + + ## RT difference <= maximumRtDifference + validPrecursorsInRt <- abs(dataFrame$"Average Rt(min)"[[precursorIdx]] - dataFrame$"Average Rt(min)"[-precursorIdx]) <= maximumRtDifference + + ## MZ difference around 1.0033548378 (first isotope) or 1.0033548378 * 2 (second isotope) + validPrecursorsInMz1 <- abs((dataFrame$"Average Mz"[[precursorIdx]] - distance13Cminus12C * 1) - dataFrame$"Average Mz"[-precursorIdx]) <= mzError + validPrecursorsInMz2 <- abs((dataFrame$"Average Mz"[[precursorIdx]] - distance13Cminus12C * 2) - dataFrame$"Average Mz"[-precursorIdx]) <= mzError + validPrecursorsInMz <- validPrecursorsInMz1 | validPrecursorsInMz2 + + ## intensity gets smaller in the isotope spectrum + if(numberOfDataColumns > 0){ + validPrecursorsInIntensity <- (medians[-precursorIdx] - medians[[precursorIdx]]) > 0 + } else { + validPrecursorsInIntensity <- TRUE + } + + if(any(validPrecursorsInRt & validPrecursorsInMz & validPrecursorsInIntensity)) + precursorsToRemove[[precursorIdx]] <- TRUE + } + + ## remove isotopes + dataFrame <- dataFrame[!precursorsToRemove, ] + + numberOfRemovedIsotopePeaks <- sum(precursorsToRemove) + numberOfPrecursors <- nrow(dataFrame) + } + + if(!is.na(progress)) if(progress) incProgress(amount = 0, detail = paste("Boxing...", sep = "")) else print(paste("Boxing...", sep = "")) + returnObj <- list() + returnObj$dataFrame <- dataFrame + + ## meta + returnObj$oldFormat <- oldFormat + returnObj$numberOfPrecursors <- numberOfPrecursors + returnObj$dataColumnStartEndIndeces <- dataColumnStartEndIndeces + returnObj$numberOfDataColumns <- numberOfDataColumns + + ## group anno + returnObj$sampleClass <- sampleClass + returnObj$sampleType <- sampleType + returnObj$sampleInjectionOrder <- sampleInjectionOrder + returnObj$batchID <- batchID + + ## misc + returnObj$numberOfPrecursorsPrior <- numberOfPrecursorsPrior + returnObj$numberOfRemovedIsotopePeaks <- numberOfRemovedIsotopePeaks + + return (returnObj) +} diff --git a/R/readMSDial.R b/R/readMSDial.R new file mode 100644 index 0000000..d3a06bb --- /dev/null +++ b/R/readMSDial.R @@ -0,0 +1,91 @@ +#' Read MS-DIAL Output File into a QFeatures Object +#' +#' This function reads the output file from MS-DIAL and +#' converts it into a QFeatures object. +#' +#' @param file A string with the path to the MS-DIAL output file. +#' @param version A character string specifying the version of MS-DIAL used to generate the file. +#' This parameter is currently not used. +#' +#' @return A QFeatures object containing: +#' \itemize{ +#' \item An assay named "exampleAssay" with the metabolite counts. +#' \item Row data (feature metadata) extracted from the input file. +#' \item Column data (sample metadata) extracted from the input file. +#' } +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' # Assuming you have an MS-DIAL output file named "Metabolite_profile_showcase.txt" in a "data" directory: +#' qf <- readMSDial("data/Metabolite_profile_showcase.txt") +#' +#' # Examine the structure of the resulting QFeatures object +#' qf +#' +#' # Access the assay data +#' assay(qf[["exampleAssay"]]) +#' +#' # Access the row data (feature metadata) +#' rowData(qf[["exampleAssay"]]) +#' +#' # Access the column data (sample metadata) +#' colData(qf) +#' } +#' +#' @importFrom QFeatures QFeatures +#' @importFrom SummarizedExperiment SummarizedExperiment +#' @importFrom S4Vectors DataFrame +#' +#' @seealso +#' \code{\link[QFeatures]{QFeatures}} for more information on the QFeatures class. +#' \code{\link[SummarizedExperiment]{SummarizedExperiment}} for details on the underlying data structure. +#' +#' @note +#' +#' +#' @references +#' +readMSDial <- function(file, version){ + table <- read.table(file, fill = TRUE, sep = "\t", + quote = "", header = FALSE) + + # Identify the starting row and column of the data + startRow <- which(table[, 1] != "")[1] + startCol <- which(table[1, ] != "")[1] + ##TODO: version dependent error message if startRow or startCol are not as expected. + + # Split the table in parts + colDataRaw <- table[1:startRow, startCol:ncol(table)] + rowDataRaw <- table[startRow:nrow(table), 1:(startCol)] + countsRaw <- table[startRow:nrow(table), startCol:ncol(table)] + + # Extract ids and counts data + ids <- rowDataRaw[-1, 1] + counts <- as.matrix(countsRaw[-1, -1]) + counts <- matrix(as.numeric(counts), nrow = nrow(counts), ncol = ncol(counts)) + colnames(counts) <- as.character(countsRaw[1, -1]) + rownames(counts) <- ids + + # Ensure row names of colData match counts column names + colData <- DataFrame(t(colDataRaw[-nrow(colDataRaw), -1])) + rownames(colData) <- as.character(colDataRaw[nrow(colDataRaw), -1]) + colnames(colData) <- as.character(colDataRaw[-nrow(colDataRaw), 1]) + + # Ensure row names of rowData match counts row names + rowData <- DataFrame(rowDataRaw[-1, ], row.names = ids) + colnames(rowData) <- as.character(rowDataRaw[1,]) + + # Create SummarizedExperiment object + + sumExp <- SummarizedExperiment(assays = list(counts = counts), + rowData = rowData, + colData = colData) + ##TODO: Metadata with data source and version + + # Create QFeatures object + qf <- QFeatures(list(exampleAssay = sumExp), colData = colData(sumExp)) + qf + ##TODO: name + } \ No newline at end of file diff --git a/R/readMetaboScape.R b/R/readMetaboScape.R new file mode 100644 index 0000000..5725627 --- /dev/null +++ b/R/readMetaboScape.R @@ -0,0 +1,99 @@ +library(openxlsx2) +library(dplyr) +library(purrr) +library(QFeatures) +library(SummarizedExperiment) + +#' Read Metaboscape Output File into a QFeatures Object +#' +#' This function reads a metabolite profile output file (.xlsx) from Metaboscape and +#' converts it into a QFeatures object. +#' +#' @param file A character string specifying the path to the Metaboscape output file (Excel format). +#' @param version A character string specifying the version of Metaboscape used to generate the file. +#' This parameter is currently not used. +#' +#' @return A QFeatures object containing: +#' \itemize{ +#' \item An assay named "exampleAssay" with the metabolite counts. +#' \item Row data (feature metadata) extracted from the input file. +#' \item Column data (sample metadata) extracted from the sample names, including injection order and sample name. +#' } +#' +#' @importFrom openxlsx2 read_xlsx +#' @importFrom QFeatures QFeatures +#' @importFrom SummarizedExperiment SummarizedExperiment +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' # Assuming you have a Metaboscape output file named "data.xlsx": +#' qf <- readMetaboscape("data.xlsx") #TODO: System file +#' +#' +#' # Examine the structure of the resulting QFeatures object +#' qf +#' +#' # Access the assay data +#' assay(qf[["exampleAssay"]]) +#' +#' # Access the row data (feature metadata) +#' rowData(qf[["exampleAssay"]]) +#' +#' # Access the column data (sample metadata) +#' colData(qf) +#' } +#' +#' +#' @details +#' @note +#' +#' @seealso +#' \code{\link[QFeatures]{QFeatures}} for more information on the QFeatures class. +#' \code{\link[SummarizedExperiment]{SummarizedExperiment}} for details on the underlying data structure. +#' +#' @references +#' #TODO: Bruker metaboscape site +#' #TODO: Ordering ? +#' + +readMetaboscape <- function(file, version){ + + table <- read_xlsx(file) + rownames(table) <- table[,1] + colnames <- colnames(table) + colIdsSamples <- grepl("\\d+$", colnames) + + startOfSamples <- which(colIdsSamples)[1] + + # Extract ids and counts data + ids <- table[,1] + countsRaw <- table[,colIdsSamples] + countsNumeric <- apply(countsRaw, 2, as.numeric) + counts <- as.matrix(countsNumeric) + rownames(counts) <- ids + + # Extract rowData + rowData <- table[,!colIdsSamples] + rownames(rowData) <- ids + + # Extract colData from sample Names + sampleNames <- colnames(table[,colIdsSamples]) + colDataRaw <- sapply(sampleNames, function(x) { + # find position of the first character before the Run number + pos <- max(gregexpr("[^0-9]", x)[[1]]) + c(substr(x, 1, (pos - 1)), substr(x, pos + 1, nchar(x))) + }) + colData <- data.frame("Injection order" = colDataRaw[2,], + "Sample name" = colDataRaw[1,]) + + # Create SummarizedExperiment object + sumExp <- SummarizedExperiment(assays = list(counts = counts), + rowData = rowData, + colData = colData) + rownames(sumExp) + # Create QFeatures object + qf <- QFeatures(list(exampleAssay = sumExp), colData = colData(sumExp)) + qf +} \ No newline at end of file diff --git a/inst/extdata/testdata/MetaboScape_rye_data.xlsx b/inst/extdata/testdata/MetaboScape_rye_data.xlsx new file mode 100644 index 0000000..7a6e146 Binary files /dev/null and b/inst/extdata/testdata/MetaboScape_rye_data.xlsx differ diff --git a/tests/testthat/test-readMSDial.R b/tests/testthat/test-readMSDial.R new file mode 100644 index 0000000..80c7da1 --- /dev/null +++ b/tests/testthat/test-readMSDial.R @@ -0,0 +1,23 @@ +filePeakMatrix <- system.file("extdata/showcase/Metabolite_profile_showcase.txt", package = "MetFamily") +qf <- readMSDial(filePeakMatrix) + +test_that("Sum of Intensities is correct", { + + sumQf <- sum(colSums(assay(qf))) + expect_equal(sumQf, 232301678) + +}) + +test_that("Number of Rows and Columns are correct", { + + # check if number of rows is identical + nrowQf <- nrow(assay(qf)) + expect_equal(nrowQf, 5823L) + + # check if number of cols is identical + ncolQf <- ncol(assay(qf))+ ncols(rowData(qf)) + expect_equal(as.integer(ncolQf), 20) + +}) + + diff --git a/tests/testthat/test-readMetaboScape.R b/tests/testthat/test-readMetaboScape.R new file mode 100644 index 0000000..c9d19a1 --- /dev/null +++ b/tests/testthat/test-readMetaboScape.R @@ -0,0 +1,19 @@ +filePath <- system.file("extdata/testdata/MetaboScape_rye_data.xlsx", package = "MetFamily") +qf <- readMetaboscape(filePath) + +test_that("Sum of Intensities is correct", { + sumQf <- sum(colSums(assay(qf))) + expect_equal(sumQf,307716661) +}) + +test_that("Number of Rows and Columns are correct", { + + # check if number of rows is identical + nrowQf <- nrow(assay(qf)) + expect_equal(nrowQf, 805L) + + # check if number of cols is identical + ncolQf <- ncol(assay(qf))+ ncols(rowData(qf)) + expect_equal(as.integer(ncolQf), 73) + #TODO: Alignment id 0 vs 1 +}) \ No newline at end of file