Merge pull request #83 from NorStorz/feature/qfeature

WIP: support for Sirius annotations
ipb-halle · Nov 7, 2024 · 7ae4c32 · 7ae4c32
2 parents eb5311a + 660541a
commit 7ae4c32
Show file tree

Hide file tree

Showing 11 changed files with 25,587 additions and 117 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -106,43 +106,13 @@ jobs:
           echo $sysreqs
           sudo -s eval "$sysreqs"
 
-      # Step 9: Install macOS system dependencies
-      - name: Install macOS system dependencies
-        if: matrix.config.os == 'macOS-latest'
-        run: |
-          ## Enable installing XML from source if needed
-          brew install libxml2
-          echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV
-
-          ## Required to install magick as noted at
-          ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2
-          brew install imagemagick@6
-
-          ## For textshaping, required by ragg, and required by pkgdown
-          brew install harfbuzz fribidi
-
-          ## For installing usethis's dependency gert
-          brew install libgit2
-
-          ## required for ncdf4
-          ## brew install netcdf ## Does not work as it is compiled with gcc
-          ## Use pre-compiled libraries from https://mac.r-project.org/libs-4/
-          curl -O https://mac.r-project.org/libs-4/netcdf-4.7.4-darwin.17-x86_64.tar.gz
-          tar fvxzm netcdf-4.7.4-darwin.17-x86_64.tar.gz -C /
-          rm netcdf-4.7.4-darwin.17-x86_64.tar.gz
-          curl -O https://mac.r-project.org/libs-4/hdf5-1.12.0-darwin.17-x86_64.tar.gz
-          tar fvxzm hdf5-1.12.0-darwin.17-x86_64.tar.gz -C /
-          rm hdf5-1.12.0-darwin.17-x86_64.tar.gz
-          curl -O https://mac.r-project.org/libs-4/szip-2.1.1-darwin.17-x86_64.tar.gz
-          tar fvxzm szip-2.1.1-darwin.17-x86_64.tar.gz -C /
-          rm szip-2.1.1-darwin.17-x86_64.tar.gz
-
-      # Step 10: Install Windows system dependencies
-      - name: Install Windows system dependencies
-        if: runner.os == 'Windows'
-        run: |
-          ## Edit below if you have any Windows system dependencies
-        shell: Rscript {0}
+      # Steps 9 and 10 (macOS and Windows dependencies) removed
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck, testthat, openxlsx2
+          needs: check
+          dependencies: '"hard"'
+
 
       # Step 11: Install BiocManager
       - name: Install BiocManager
@@ -206,12 +176,7 @@ jobs:
           BiocManager::install("BiocGenerics")
         shell: Rscript {0}
 
-      # Step 16: Install covr
-      - name: Install covr
-        if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux'
-        run: |
-          remotes::install_cran("covr")
-        shell: Rscript {0}
+      # Skipping Step 16: Install covr
 
       # Step 17: Install pkgdown
       - name: Install pkgdown

diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+repopack-output.txt
 .Rproj.user
 .Rhistory
 .RData

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -40,7 +40,10 @@ Imports:
     methods,
     QFeatures,
     stats,
-    utils
+    utils,
+    S4Vectors, 
+    SummarizedExperiment,
+    openxlsx2
 Remotes: 
     decisionpatterns/searchable
 Suggests:

diff --git a/R/DataProcessing.R b/R/DataProcessing.R
@@ -79,8 +79,8 @@ readClusterDataFromProjectFile <- function(file, progress = FALSE)
     fileLines <- readLines(con = file)
   )
   base::close(con = file)
-  
-  dataList <- readProjectData(fileLines = fileLines, progress = progress)
+
+  dataList <- readProjectData(fileLines = fileLines, progress = progress, qfeatures = qfeatures)
   fileLines <- NULL
 
   return(dataList)
@@ -98,7 +98,7 @@ readClusterDataFromProjectFile <- function(file, progress = FALSE)
 #' @export
 #'
 #' @examples
-readProjectData <- function(fileLines, progress = FALSE)
+readProjectData <- function(fileLines, progress = FALSE, qfeatures = NULL)
 {
   allowedTags <- c("ID")
   allowedTagPrefixes <- c("AnnotationColors=")
@@ -196,70 +196,43 @@ readProjectData <- function(fileLines, progress = FALSE)
   listMatrixRows <- NULL
   listMatrixCols <- NULL
 
-  ## Disable command line reading of answer
-  if (FALSE) {
+
   ################################################################################
   #Start of importing  annotation part1 from two
   # Display the message and give the user the option to choose whether to upload the annotation file or not. 
   #If Y shows selection window for annotation file. if N ignores annotation process
   #message("Do you want to upload the annotation file? (Y/N)")
   #user_choice <- readline()
-  user_choice <- "N"
 
-  if (toupper(user_choice) == "Y") {
-
-
-    # Read the annotation_file file (if needed)
-    annotation_file <- read.delim(file.choose(), header = TRUE, check.names = FALSE) # select interactively
-
-    # Display the available columns in annotation_file
-    message("Available columns in annotation_file:")
-    available_columns <- colnames(annotation_file)
-    for (i in 1:length(available_columns)) {
-      message(paste(i, "-", available_columns[i]))
+  ######debugging
+  tryCatch(
+    {
+      rowData(qfeatures)
+    },
+    error = function(e) {
+      message("Error: ", e$message)
+      traceback()
     }
+  )
+  ######debugging
+
+  if (!is.null(attr(rowData(qfeatures[[1]]), "annotation column"))) {
 
-    # Prompt the user to select the column containing IDs
-    message("Enter the number corresponding to the column containing IDs:")
-    selected_column_id <- as.integer(readline())
+    # Extract the relevant data: Alignment ID and the annotation column from qfeatures
+    annot_colname <- attr(rowData(qfeatures[[1]]), "annotation column")
+    annotation_data <- rowData(qfeatures[[1]])[[annot_colname]]
+    alignment_ids <- rowData(qfeatures[[1]])[["Alignment ID"]]
 
-    # Check if the selected column index is valid
-    if (selected_column_id >= 1 && selected_column_id <= length(available_columns)) {
-      id_column <- available_columns[selected_column_id]
-
-      # Prompt the user to select the Annotation column to use
-      message("Enter the number corresponding to the annotation column:")
-      selected_column_annot <- as.integer(readline())
-
-      # Check if the selected column index is valid
-      if (selected_column_annot >= 1 && selected_column_annot <= length(available_columns)) {
-        selected_column <- available_columns[selected_column_annot]
-
-        # Iterate through all values in the "Annotation" column of metaboliteProfile, excluding first row
-        for (i in 1:nrow(metaboliteProfile)) {
-          # Perform the lookup based on metaboliteProfile's "Alignment ID" column and annotation_file's selected ID column
-          matching_indices <- which(annotation_file[[id_column]] == metaboliteProfile$'Alignment ID'[i])
-
-          # Check data types and unique values of IDs column in annotation_file
-
-          # Check if any matches were found
-          if (length(matching_indices) > 0) {
-            # Update the specified column (Annotation) in metaboliteProfile with the corresponding value from annotation_file
-            metaboliteProfile[i, "Annotation"] <- annotation_file[matching_indices[1], selected_column]
-          } else {
-            # Handle the case where no match was found (you can add custom logic here)
-            warning(paste("No match found for row", i, "in metaboliteProfile"))
-          }
-        }
-      } else {
-        message("Invalid column selection. Skipping annotation step.")
-      }
-    }
+    # Find the matching indices between metaboliteProfile and annotation_data
+    matching_indices <- match(metaboliteProfile[["Alignment ID"]], alignment_ids)
+
+    metaboliteProfile$Annotation[!is.na(matching_indices)] <- annotation_data[matching_indices[!is.na(matching_indices)]] 
+    #eliminate NAs replace by "" so nchar(annoVals[[i]]) > 0 works in l. 597
+    metaboliteProfile$Annotation[is.na(metaboliteProfile$Annotation)] <- "" 
   }
 
   #####################################################################################################################################
   #end of importing  annotation part1 from two
-  }
 
   listMatrixVals <- NULL
 
@@ -319,13 +292,17 @@ readProjectData <- function(fileLines, progress = FALSE)
   }
 
   ## STN: Disabled. 
-  if (FALSE) {
+  if (!is.null(attr(rowData(qfeatures[[1]]), "annotation column"))) {
   #Start of importing annotation part2 from two
   ################################################################################
    #adding HEX color codes from external annotations to the annotationColorsMapInitValue of dataFrameHeader
-  if (toupper(user_choice) == "Y") {
+
       # Copy the selected column by user, Remove duplicates and exclude the first row
     uniqueAnnotations <- unique(unlist(strsplit(metaboliteProfile$Annotation, ",")))
+    ###Debug
+    print("Unique Annotations Before Filtering:")
+    print(uniqueAnnotations)
+    ###/Debug
     uniqueAnnotations <- paste0(uniqueAnnotations, "=")
     # Add a random string from the hex color list to each element of uniqueAnnotions
     # strings_list <- c("#000000", "#FFFFFF", "#FF0000", "#00FF00", "#0000FF", "#FFFF00", "#FF00FF", "#00FFFF", "#800000", "#008000", "#000080", "#808000", "#800080", "#008080", "#808080", "#C0C0C0", "#FFA500", "#FFC0CB", "#FFD700", "#A52A2A")
@@ -338,7 +315,7 @@ readProjectData <- function(fileLines, progress = FALSE)
     uniqueAnnotationsHexs <- gsub("AnnotationColors=\\{\\s+", "AnnotationColors={", paste("AnnotationColors={", paste(uniqueAnnotations1, collapse = ","), "}"))
     # Assuming dataFrameHeader is your data frame
     dataFrameHeader$Annotation[2] <- uniqueAnnotationsHexs
-  }
+
 ################################################################################
 #End of importing  annotation part2 from two
   }
@@ -612,9 +589,11 @@ readProjectData <- function(fileLines, progress = FALSE)
   annotationValueIgnore <- "Ignore"
   annotationColorIgnore <- "red"
 
+
   ## present annotations
   annotations    <- vector(mode='list', length=numberOfMS1features)
   annoVals <- metaboliteProfile[, annotationColumnName]
+
   for(i in seq_len(numberOfMS1features)){
     if(nchar(annoVals[[i]]) > 0){
       annotations[[i]] <- as.list(unlist(strsplit(x = annoVals[[i]], split = ", ")))

diff --git a/R/FragmentMatrixFunctions.R b/R/FragmentMatrixFunctions.R
@@ -1397,7 +1397,8 @@ mzClustGeneric <- function(p,
 }
 
 convertToProjectFile <- function(filePeakMatrixPath, 
-                                 fileSpectra, 
+                                 fileSpectra,
+                                 fileAnnotation,
                                  parameterSet, 
                                  progress = FALSE){
   ####################################################################################
@@ -1436,6 +1437,10 @@ convertToProjectFile <- function(filePeakMatrixPath,
 
 
   filePeakMatrixQF <- readMSDial(filePeakMatrixPath)
+  if (!is.null(fileAnnotation)){
+    # TODO: determine columns to merge by
+    filePeakMatrixQF <- addSiriusAnnotations(filePeakMatrixQF,fileAnnotation)
+  }
 
   returnObj <- convertToProjectFile2(
     filePeakMatrixQF = filePeakMatrixQF, 
@@ -1453,7 +1458,7 @@ convertToProjectFile <- function(filePeakMatrixPath,
   returnObj$numberOfSpectraDiscardedDueToNoPeaks <- numberOfSpectraDiscardedDueToNoPeaks
   returnObj$numberOfSpectraDiscardedDueToMaxIntensity <- numberOfSpectraDiscardedDueToMaxIntensity
   returnObj$numberOfSpectraDiscardedDueToTooHeavy <- numberOfSpectraDiscardedDueToTooHeavy
-
+  returnObj$qfeatures <- filePeakMatrixQF
   return(returnObj)
 }
 
@@ -1497,6 +1502,7 @@ convertToProjectFile2 <- function(filePeakMatrixQF,
     numberOfParsedMs1Features <- returnObj$numberOfPrecursorsPrior
     numberOfRemovedPrecursorIsotopePeaks <- returnObj$numberOfRemovedIsotopePeaks
 
+    qfeatures <- returnObj$qfeatures
     rm(returnObj)
   } else {
     propList <- list(
@@ -1589,7 +1595,7 @@ convertToProjectFile2 <- function(filePeakMatrixQF,
   #temporary fix
   #filePeakMatrix <- NULL
 
-  if(!is.null(filePeakMatrix)){
+  if(!is.null(filePeakMatrixQF)){
     ## allHits: dataFrame$"Average Mz" --> precursorMz; allHits indexes the spectraList
     diffAll <- abs(outer(X = precursorMz, Y = dataFrame$"Average Mz", FUN = function(x, y){abs(x-y)}))
     allHits <- apply(X = diffAll, MARGIN = 2, FUN = function(x){which(x == min(x[x < parameterSet$mzDeviationAbsolute_mapping], Inf))})
@@ -1850,7 +1856,8 @@ convertToProjectFile2 <- function(filePeakMatrixQF,
     numberOfUnmappedSpectra = numberOfUnmappedSpectra,
     numberOfUnmappedPrecursors = numberOfUnmappedPrecursors,
     numberOfUnmappedPrecursorsMz = numberOfUnmappedPrecursorsMz,
-    numberOfUnmappedPrecursorsRt = numberOfUnmappedPrecursorsRt
+    numberOfUnmappedPrecursorsRt = numberOfUnmappedPrecursorsRt,
+    qfeatures <- qfeatures
   )
 
   if(!is.na(progress))  if(progress)  setProgress(1) else print("Ready")

diff --git a/R/parsePeakAbundanceMatrixQF.R b/R/parsePeakAbundanceMatrixQF.R
@@ -40,7 +40,7 @@ parsePeakAbundanceMatrixQF <- function(qfeatures,
   dataFrame <- cbind(rowData(qfeatures)[[1]][,cols_to_keep] ,assay(qfeatures))
   #workaround for avoiding change in colnames during coercion
   cnames <- colnames(dataFrame)
-  dataFrame <- as.data.frame(dataFrame)
+  dataFrame <- as.data.frame(dataFrame, check.names = FALSE)
   colnames(dataFrame) <- cnames
   oldFormat <- ncol(colData(qfeatures))==3
   numRowDataCols <- ncol(rowData(qfeatures)[[1]])
@@ -161,6 +161,9 @@ parsePeakAbundanceMatrixQF <- function(qfeatures,
   returnObj$dataFrame <- dataFrame
   returnObj$vals <- vals
 
+  ## qfeatures
+  returnObj$qfeatures <- qfeatures
+
   ## meta
   returnObj$oldFormat <- oldFormat
   returnObj$numberOfPrecursors <- numberOfPrecursors
@@ -179,3 +182,49 @@ parsePeakAbundanceMatrixQF <- function(qfeatures,
 
   return (returnObj)
 }
+
+#' Add SIRIUS annotations to the feature metadata of a QFeatures object
+#'
+#' Reads a tab-delimited SIRIUS annotation table and left-joins it onto the
+#' `rowData` of the first element of `qfeatures`. The number and order of the
+#' `rowData` rows are preserved so that they stay aligned with the assay rows.
+#'
+#' @param qfeatures Object whose first element (`qfeatures[[1]]`) holds the
+#'   `rowData` to annotate.
+#' @param siriusFile Path of the annotation file, read with `read.delim()`
+#'   defaults.
+#' @param rowData_col Name of the key column in the `rowData`.
+#' @param sirius_col Name of the key column in the annotation table.
+#' @param annotation_col Name of the merged column that is recorded in the
+#'   `"annotation column"` attribute of the new `rowData`.
+#'
+#' @return `qfeatures` with the merged table assigned as `rowData` of its first
+#'   element. Merged columns carry a `"source"` attribute (`"sirius"` or
+#'   `"data"`).
+#' @export
+addSiriusAnnotations <- function(qfeatures,
+                                 siriusFile,
+                                 rowData_col = "Alignment ID",
+                                 sirius_col = "featureId",
+                                 annotation_col = "ClassyFire.subclass") {
+  #TODO: specify more parameters in read delim
+  annotation <- read.delim(siriusFile)
+
+  rowData <- rowData(qfeatures[[1]])
+
+  # Fail early with a readable message instead of an error from inside merge()
+  if (!rowData_col %in% colnames(rowData)) {
+    stop("Column '", rowData_col, "' not found in rowData", call. = FALSE)
+  }
+  if (!sirius_col %in% colnames(annotation)) {
+    stop("Column '", sirius_col, "' not found in the annotation file",
+         call. = FALSE)
+  }
+
+  # Diagnostics go to stderr so they do not pollute regular output
+  message("Merging by: ", sirius_col, " and ", rowData_col)
+
+  # A key occurring several times in the annotation table would multiply the
+  # matching rowData rows; keep the first occurrence only.
+  isDuplicate <- duplicated(annotation[[sirius_col]])
+  if (any(isDuplicate)) {
+    warning(sum(isDuplicate), " annotation rows with duplicated '", sirius_col,
+            "' ignored", call. = FALSE)
+    annotation <- annotation[!isDuplicate, , drop = FALSE]
+  }
+
+  # merge() does not guarantee the row order of its first argument, so tag
+  # every row with its position and restore that order after the merge.
+  rowIndexCol <- "addSiriusAnnotationsRowIndex"
+  indexedRowData <- rowData
+  indexedRowData[[rowIndexCol]] <- seq_len(nrow(rowData))
+  annotatedRowData <- S4Vectors::merge(indexedRowData, annotation,
+                                       by.x = rowData_col, by.y = sirius_col,
+                                       all.x = TRUE, sort = FALSE)
+  originalOrder <- order(annotatedRowData[[rowIndexCol]])
+  keepCols <- colnames(annotatedRowData) != rowIndexCol
+  annotatedRowData <- annotatedRowData[originalOrder, keepCols, drop = FALSE]
+
+  #TODO: ? check for duplicate columns ?
+  # The key column of the annotation table is dropped by merge(), so it is
+  # excluded here; the surviving key column belongs to the original data.
+  annotation_cols <- colnames(annotation)[colnames(annotation) != sirius_col]
+  rowData_cols <- colnames(rowData)
+
+  for (col in colnames(annotatedRowData)) {
+    if (col %in% annotation_cols) {
+      attr(annotatedRowData[[col]], "source") <- "sirius"
+    } else if (col %in% rowData_cols) {
+      attr(annotatedRowData[[col]], "source") <- "data"
+    }
+  }
+
+  # Only flag an annotation column that actually exists in the merged table
+  if (annotation_col %in% colnames(annotatedRowData)) {
+    attr(annotatedRowData, "annotation column") <- annotation_col
+  } else {
+    warning("Annotation column '", annotation_col,
+            "' not found; no annotation column set", call. = FALSE)
+  }
+
+  rowData(qfeatures[[1]]) <- annotatedRowData
+  return(qfeatures)
+}