Merge pull request #120 from cbielow/r1.0.14

Release preparation for 1.0.14
cbielow · Sep 21, 2022 · 4bbbb9f · 4bbbb9f
2 parents 97d5f87 + a33c38e
commit 4bbbb9f
Show file tree

Hide file tree

Showing 19 changed files with 174 additions and 99 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 inst/doc
 .Rhistory
+.Rproj.user/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: PTXQC
 Type: Package
 Title: Quality Report Generation for MaxQuant and mzTab Results
-Version: 1.0.13
-Date: 2022-03-22
+Version: 1.0.14
+Date: 2022-09-21
 Author: Chris Bielow [aut, cre],
   Juliane Schmachtenberg [ctb],
   Swenja Wagner [ctb],
@@ -23,6 +23,7 @@ Imports:
     ggplot2 (>= 2.2),
     ggdendro,
     grid,
+    gridExtra,
     grDevices,
     gtable,
     htmlTable,
@@ -49,6 +50,6 @@ VignetteBuilder: knitr
 License: BSD_3_clause + file LICENSE
 Encoding: UTF-8
 Roxygen: list()
-RoxygenNote: 7.1.0
+RoxygenNote: 7.2.1
 URL: https://github.com/cbielow/PTXQC
 BugReports: https://github.com/cbielow/PTXQC/issues
diff --git a/NAMESPACE b/NAMESPACE
@@ -58,6 +58,7 @@ export(plot_ContUser)
 export(plot_ContUserScore)
 export(plot_ContsPG)
 export(plot_CountData)
+export(plot_DataOverRT)
 export(plot_IDRate)
 export(plot_IDsOverRT)
 export(plot_IonInjectionTimeOverRT)

diff --git a/NEWS b/NEWS
@@ -7,6 +7,12 @@ Versions uploaded to CRAN are marked with [CRAN].
 #########   CHANGELOG  ##########
 #################################
 
+[CRAN] v1.00.14 -- 2022/09/21
+  - [FIX] crash when re-running PTX-QC on a txt folder where a (very old) PTX-QC was already run (issue #118)
+  - [FIX] msmsScans TopNoverN metric showing invalid data for MaxQuant 2.x (issue #119)
+  - [FEATURE] a better plot for TMT/iTRAQ reporter intensity (issue #123)
+  - a few minor things (mainly CRAN docs)
+
 [CRAN] v1.00.13 -- 2022/03/22
   - [FIX] Remove dependency 'kableExtra' and use 'htmlTable' package instead
 

diff --git a/R/PTXQC.R b/R/PTXQC.R
@@ -29,6 +29,7 @@
 #' @import ggplot2
 #' @import ggdendro
 #' @import grid
+#' @import gridExtra
 #' @import grDevices
 #' @import gtable
 #' @import knitr

diff --git a/R/createReport.R b/R/createReport.R
@@ -642,10 +642,11 @@ createReport = function(txt_folder = NULL,
   #####################################################################
   #####################################################################
   ## write mzQC file
-  writeMZQC(
+  try( ## if not enough metrics are produced, then writing will fail (e.g. one run or setQuality needs to be present)
+    writeMZQC(
     rprt_fns$mzQC_file, 
     assembleMZQC(lst_qcMetrics, raw_file_mapping = eval(expr_fn_map)$raw_file_mapping)
-  )
+  ))
 
 
   #####################################################################
@@ -676,14 +677,16 @@ createReport = function(txt_folder = NULL,
     stop("Output format(s) not supported: '", paste(out_formats[is.na(out_format_requested)], collapse="', '"), "'")
   }
 
+  ## a bit hacky, but we want gridExtra plots to plot when we call print() -- similar to ggplot's print
+  print.gtable = function(t) { plot(t)}
 
   if ("html" %in% out_format_requested)
   {
     if (rmarkdown::pandoc_available()) {
       ## HTML reports require Pandoc for converting Markdown to Html via the rmarkdown package
       if (DEBUG_PTXQC) {
         html_template = paste0(getwd(), "/inst/reportTemplate/PTXQC_report_template.Rmd")
-        if (!file.exists(html_template)) stop("Wrong working directroy. Please set your working directory to the PTXQC main dir such that 'paste0(getwd(), '/inst/reportTemplate/PTXQC_report_template.Rmd')' is a valid file.")
+        if (!file.exists(html_template)) stop("Wrong working directory. Please set your working directory to the PTXQC main dir such that 'paste0(getwd(), '/inst/reportTemplate/PTXQC_report_template.Rmd')' is a valid file.")
       } else {
         html_template = system.file("./reportTemplate/PTXQC_report_template.Rmd", package="PTXQC")
       }
@@ -693,8 +696,16 @@ createReport = function(txt_folder = NULL,
       out_template = file.path(out_dir, basename(html_template))
       ## Rmarkdown: convert to Markdown, and then to HTML (or PDF) ...
       ## Intermediates_dir is required if inputdir!=outputdir, since Shiny server might not allow write-access to input file directory
-      rmarkdown::render(out_template, output_file = rprt_fns$report_file_HTML) #, intermediates_dir = dirname(rprt_fns$report_file_HTML))
-    } else {
+      res_html = try(
+        rmarkdown::render(out_template, output_file = rprt_fns$report_file_HTML) #, intermediates_dir = dirname(rprt_fns$report_file_HTML))
+      )
+      if (inherits(res_html, "try-error")) {
+        warning("Creating the HTML template did not succeed, probably due to an outdated markdown template in the
+                txt folder. PTXQC will use the default template now instead. Fix or remove the broken/old .Rmd file from the ", txt_folder, 
+                " to avoid this warning.", immediate. = TRUE)
+        rmarkdown::render(html_template, output_file = rprt_fns$report_file_HTML) #, intermediates_dir = dirname(rprt_fns$report_file_HTML))
+      } 
+   } else {
       warning("The 'Pandoc' converter is not installed on your system or you do not have read-access to it!\n",
               "Pandoc is required for HTML reports.\n",
               "Please install Pandoc <http://pandoc.org/installing.html> or make sure you have access to pandoc(.exe).\n",

diff --git a/R/fcn_miscGGplot.R b/R/fcn_miscGGplot.R
@@ -29,7 +29,8 @@ ggText = function(title, text, col = "black") {
 #'
 printWithFooter = function(gg_obj, bottom_left = NULL, bottom_right = NULL) 
 {
-  print(gg_obj)
+  if ("gtable" %in% class(gg_obj)) plot(gg_obj) else print(gg_obj)
+
   if (!is.null(bottom_left))
   {
     label = grid::textGrob(bottom_left,

diff --git a/R/qcMetric_EVD.R b/R/qcMetric_EVD.R
@@ -248,8 +248,8 @@ qcMetric_EVD_ReporterInt =  setRefClass(
   contains = "qcMetric",
   methods = list(initialize=function() {  callSuper(
     helpText = 
-      "ITRAQ/TMT reporter intensity boxplots of all PSMs for each channel and Raw file.
-The opacity (alpha value) of the bar correlates to the number of PSMs with non-zero abundance (1.0 = full labeling; 0.0 = no reporter ions; see heatmap scoring below).
+      "ITRAQ/TMT reporter intensity violin plots of all PSMs for each channel and Raw file.
+The second subplot shows labeling efficiency (LE), i.e. the fraction of PSMs with non-zero abundance (100% = full labeling of all PSMs; 0% = no reporter ions at all). This is used for heatmap scoring. See below.
 
 There is a similar 'Experimental Group' based metric/plot based on proteins.txt.
 
@@ -262,8 +262,7 @@ Note: global labelling efficiency can only be judged indirectly with this metric
       Observing only very few peptides (see peptide count metric), is a good indicator.
       However, if only the labeling of a few channels failed, this will be noticable here!
 
-Labeling can still be poor, even though identification was successful. In this case, the boxplots will touch the left (0 intensity)
-side of the plot.
+Labeling can still be poor, even though identification was successful.
 
 A labeling efficiency (LE) is computed per Raw file AND channel as: the percentage of PSMs which have non-zero reporter intensity.
 Ideally LE reaches 100 percent (all peptides have an intensity in the channel; biological missingness ignored).
@@ -291,8 +290,7 @@ Each Raw file is now scored by the minimum LE of all its 4 channels.
         title_subtext = "";  
         title_color = "black"
       }
-
-
+      g_title = "EVD: Reporter Intensities"  
       ## use data.table for aggregation, its MUCH faster than ddply() and uses almost no extra memory
       df_reps = reshape2::melt(df_evd[, c("fc.raw.file", cols_reporter)], 
                      id.vars ="fc.raw.file", 
@@ -309,47 +307,74 @@ Each Raw file is now scored by the minimum LE of all its 4 channels.
       dt_reps$channel = factor(dt_reps$channel, levels = sort(unique(dt_reps$channel), decreasing = TRUE))
       head(dt_reps)
 
-      ## compute global boxplot stats (so we can fix min/max across plots)
-      ## also return labEff_PC (labeling efficiency in %)
-      ylims = dt_reps[, { limits = boxplot.stats(intensity + 1, coef = 0.7)$stats;
-                          list(imin = limits[1], lower = limits[2], middle = limits[3], upper = limits[4], imax = limits[5], labEff_PC = sum(intensity > 0, na.rm = TRUE) / (.N)) 
-                        },
-                        by=c("fc.raw.file", "channel")
-                     ]
-      ylims2 = range(ylims$imin, ylims$imax)
+      ylims_minmax = range(dt_reps$intensity)
+
       fcn_boxplot_internal = function(data, title_subtext = title_subtext, title_color = title_color) 
       {
         #require(ggplot2)
         #data = ylims
-        pl = ggplot(data=data) +
-          geom_boxplot(aes_string(x = "fc.raw.file", fill = "channel", ## do not use col="channel", since this will dodge bars and loose scaling
-                                  ymin = "imin", lower = "lower", middle = "middle", upper = "upper", ymax = "imax",
-                                  alpha = "labEff_PC"),
-                       position = "dodge", stat = "identity") +
+        ### first subplot (distribution of intensities)
+        data_noZero = data[data$intensity!=0,]
+        pl = ggplot(data=data_noZero) +
+          geom_violin(aes_string(x = "fc.raw.file", 
+                                 y = "intensity",  
+                                 color = "channel",
+                                 fill = "channel"
+                                 )) +
           xlab("") + 
-          ylab("reporter intensity (log10)") +
-          guides(alpha=guide_legend(title="Label Eff"), fill = guide_legend(reverse = TRUE)) + ## inverse label order, so that channel 0 is on top
-          theme(axis.text.x = element_text(angle=45, vjust = 0.5), legend.position="right", plot.title = element_text(color=title_color)) +
-          ggtitle("EVD: Reporter label intensities", title_subtext) + 
-          #geom_hline(size = 1, alpha = 0.5, yintercept = ref_median, colour = "black") +
-          scale_alpha(range = range(ylims$labEff_PC)) +
-          scale_x_discrete_reverse(unique(data$fc.raw.file)) +
-          scale_y_log10(limits = ylims2) +
+          ylab("reporter intensity (zeros removed)") +
+          guides(#alpha = guide_legend(title="Label Eff"), 
+                 fill = guide_legend(reverse = TRUE), ## inverse label order, so that channel 0 is on top
+                 color = guide_none()) + 
+          theme(axis.text.x = element_text(angle = 45, vjust = 0.5), 
+                legend.position = "right",
+                plot.title = element_text(color = title_color)) +
+          ggtitle(g_title, title_subtext) + 
+          #scale_alpha(range = range(ylims$labEff_PC)) +
+          PTXQC:::scale_x_discrete_reverse(unique(data$fc.raw.file)) +
+          scale_y_log10(limits = ylims_minmax + 1) + ## +1 to make sure that lower bound is not 0 (--> since log10(0) = -Inf, which is not a usable axis limit)
           coord_flip() 
-
+        #pl
+
+        ylims = dt_reps[, { #limits = boxplot.stats(intensity, coef = 0.7)$stats;
+                          list(labEff_PC = sum(intensity > 0, na.rm = TRUE) / (.N)) 
+                        }, by = c("fc.raw.file", "channel")]
+
+
+        ### second subplot (labeling efficiency)
+        pl_eff = ggplot(data = ylims) + geom_bar(aes_string(x = "fc.raw.file",
+                                                            y = "labEff_PC * 100",
+                                                            fill = "channel"), 
+                                                 stat = "identity",
+                                                 position = "dodge") + 
+          xlab("") + 
+          ylab("labelling efficiency (%)") +
+          ylim(0, 100) +
+          guides(fill = guide_legend(reverse = TRUE), ## inverse label order, so that channel 0 is on top
+                 color = guide_none()) + 
+          theme(legend.position = "right") +
+          ggtitle("Fraction of Non-Zero Intensities", "") + 
+          PTXQC:::scale_x_discrete_reverse(unique(ylims$fc.raw.file)) +
+          coord_flip() 
+        #pl_eff
+        pl_both = gridExtra::grid.arrange(pl, pl_eff, ncol=2)
         #print(pl)
-        return(pl)
+        return(pl_both)
       }
-      channel_count = length(unique(ylims$channel))
-      lpl = byXflex(data = ylims, indices = ylims$fc.raw.file, subset_size = round(40 / channel_count), 
+      channel_count = length(cols_reporter)
+      lpl = byXflex(data = dt_reps, indices = dt_reps$fc.raw.file, subset_size = round(40 / channel_count), 
                     sort_indices = TRUE, FUN = fcn_boxplot_internal, title_subtext = title_subtext, title_color = title_color)
+      lpl
       # heatmap scoring
       ## .. take min score over all channels
+      ylims = dt_reps[, { #limits = boxplot.stats(intensity, coef = 0.7)$stats;
+        list(labEff_PC = sum(intensity > 0, na.rm = TRUE) / (.N)) 
+      }, by = c("fc.raw.file", "channel")]
       qcScore = ylims[, list(score_min = min(labEff_PC)), by=c("fc.raw.file")]
       colnames(qcScore) = c("fc.raw.file", .self$qcName)
 
-
-      return(list(plots = lpl, qcScores = qcScore))
+      ## add manual title, since we return a grid.arrange() where automatic extraction is hard
+      return(list(plots = lpl, qcScores = qcScore, title = rep(list(g_title), length(lpl))))
     }, 
     qcCat = "prep", 
     qcName = "EVD:~Reporter~intensity", 

diff --git a/R/qcMetric_MSMS.R b/R/qcMetric_MSMS.R
@@ -140,13 +140,12 @@ Heatmap score [MSMS: MC Var]: each Raw file is scored for its deviation (score:
 current study. ",
     workerFcn = function(.self, df_any, df_evd = NULL)
     {
+      ## metric already ran... return result we have
+      if (length(.self$plots) != 0 ) return(list(plots = .self$plots, qcScores = .self$qcScores))
       ## completeness check
       if (!checkInput(c("fc.raw.file", "sequence", "missed.cleavages"), df_any)) return()
       if (!is.null(df_evd) && !checkInput(c("contaminant", "id"), df_evd)) {df_evd = NULL}
-      ## metric already ran... return result we have
-      if (length(.self$plots) != 0 ) return(list(plots = .self$plots, qcScores = .self$qcScores))
-
-
+
       max_mc = max(-Inf, df_any$missed.cleavages, na.rm = TRUE) ## will be -Inf iff enzyme was not specified and columns is 100% NA
       if (!is.infinite(max_mc))
       { ## MC's require an enzyme to be set

diff --git a/R/qcMetric_MSMSScans.R b/R/qcMetric_MSMSScans.R
@@ -307,10 +307,12 @@ Heatmap score [MS<sup>2</sup> Scans: TopN ID over N]: Rewards uniform identifica
       #   --> fail, 'D' and p-values are too low
       df.ratio = plyr::ddply(DF, c("scan.event.number", "fc.raw.file"), function(x)
       {
-        xp = xm = 0
+        xp = xm1 = xm2 = 0
         if ("+" %in% x$identified) xp = x$n[x$identified=="+"]
-        if ("-" %in% x$identified) xm = x$n[x$identified=="-"]
-        ratio = xp * 100 / sum(xp, xm)
+        if ("-" %in% x$identified) xm1 = x$n[x$identified=="-"]
+        if ("" %in% x$identified) xm2 = x$n[x$identified==""] # MQ 2.x leaves unidentified empty
+
+        ratio = xp * 100 / sum(xp, xm1, xm2)
         if (is.na(ratio)) 
         { # the whole 'identified' column is empty (no '+', no '-')
           ratio = 0 
@@ -365,7 +367,7 @@ Heatmap score [MS<sup>2</sup> Scans: DepPep]: No score.
     {
       ## completeness check
       if (!checkInput(c("fc.raw.file", "dp.modification", "dp.aa", "identified"), d_msmsScan)) return()
-      stopifnot(unique(d_msmsScan$identified) %in% c("-","+"))
+      stopifnot(unique(d_msmsScan$identified) %in% c("-","+",""))
 
       ## modified subset
       d_msmsScan$hasDP = (d_msmsScan$dp.modification != "") & (tolower(d_msmsScan$dp.modification) != "unmodified")

diff --git a/README.md b/README.md
@@ -9,16 +9,8 @@ PTXQC
 
 ### Latest changes / ChangeLog
 
-
-  - v1.00.13 - Mar 2022: internal changes (removed a deprecated dependency)
-  - v1.00.12 - Nov 2021: Documentation and cosmetic fixes (https://github.com/cbielow/PTXQC/pull/109); improved legends, better links in Html reports (https://github.com/cbielow/PTXQC/pull/111); avoid crash on datasets with only a single MS/MS spectrum (https://github.com/cbielow/PTXQC/pull/112)
-  - v1.00.11 - Sep 2021: initial support for mzQC Quality Control output (https://github.com/cbielow/PTXQC/pull/105); Missed cleavages can be computed from evidence.txt (https://github.com/cbielow/PTXQC/pull/104); minor fixes
-  - v1.00.10 - May 2021: detect non UK/US locale (https://github.com/cbielow/PTXQC/pull/99); Revive missing MBR metrics (https://github.com/cbielow/PTXQC/issues/97); restore full Parameter list (showing MaxQuant parameters (https://github.com/cbielow/PTXQC/issues/101); decoy distribution in MS/MS mass error plot restored (https://github.com/cbielow/PTXQC/issues/102)
-  - v1.00.09 - Jan 2021: output logging to external file (https://github.com/cbielow/PTXQC/issues/94); mzQC support (partial); less warnings;
-  - v1.00.08 - Dec 2020: fix issues with two metrics (https://github.com/cbielow/PTXQC/issues/90, https://github.com/cbielow/PTXQC/issues/91)
-  - v1.00.07 - Nov 2020: fix issues with creating intermediate Rplots.pdf
-  - v1.00.05 - Jun 2020: mzTab fixes introduced in v1.0.4
-
+v1.00.14 - September 2022
+
 See [NEWS][News_File] file for a version history.
 
 ### Platform support

diff --git a/inst/reportTemplate/PTXQC_report_template.Rmd b/inst/reportTemplate/PTXQC_report_template.Rmd
@@ -184,7 +184,7 @@ if(!is.null(pl_nameMapping) && (!any(is.na(pl_nameMapping)))) {
 
 ## Metrics
 ```{r metrics, echo=FALSE, results="asis"}
-  #for (qcm in lst_qcMetrics_ord[1:3])
+  #for (qcm in lst_qcMetrics[1:3])
   for (qcm in lst_qcMetrics)
   {
     if (length(qcm$plots) == 0) next; # skip empty metrics

diff --git a/man/CVDictionarySingleton.Rd b/man/CVDictionarySingleton.Rd
diff --git a/man/MzTabReader-class.Rd b/man/MzTabReader-class.Rd