read single metrics v9plus complete

samuel-marsh · Dec 4, 2024 · 7ea98f3 · 7ea98f3
1 parent 6e01614
commit 7ea98f3
Showing 1 changed file with 184 additions and 65 deletions.
diff --git a/R/Internal_Utilities.R b/R/Internal_Utilities.R
@@ -2135,85 +2135,204 @@ Metrics_Single_File_v9plus <- function(
     base_path,
     cellranger_mutli = FALSE
 ){
-  cli_inform(message = "Reading {.field Gene Expression} Metrics")
+  # read count metrics
+  if (isFALSE(x = cellranger_mutli)) {
+    cli_inform(message = "Reading {.field Gene Expression} Metrics")
+    raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)
 
-  raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)
+    # Change format to column based and select relevant metrics
+    GEX_metrics <- raw_data %>%
+      filter(.data[["Grouped.By"]] == "Physical library ID" & .data[["Library.Type"]] == "Gene Expression") %>%
+      select(all_of(c("Metric.Name", "Metric.Value"))) %>%
+      column_to_rownames("Metric.Name") %>%
+      t() %>%
+      data.frame()
 
-  # Change format to column based and select relevant metrics
-  GEX_metrics <- raw_data %>%
-    filter(.data[["Grouped.By"]] == "Physical library ID" & .data[["Library.Type"]] == "Gene Expression") %>%
-    select(all_of(c("Metric.Name", "Metric.Value"))) %>%
-    column_to_rownames("Metric.Name") %>%
-    t() %>%
-    data.frame()
+    GEX_metrics2 <- raw_data %>%
+      filter(.data[["Metric.Name"]] %in% c(c("Median UMI counts per cell", "Median genes per cell", "Median reads per cell", "Total genes detected"))) %>%
+      select(all_of(c("Metric.Name", "Metric.Value"))) %>%
+      column_to_rownames("Metric.Name") %>%
+      t() %>%
+      data.frame()
 
-  GEX_metrics2 <- raw_data %>%
-    filter(.data[["Metric.Name"]] %in% c(c("Median UMI counts per cell", "Median genes per cell", "Median reads per cell", "Total genes detected"))) %>%
-    select(all_of(c("Metric.Name", "Metric.Value"))) %>%
-    column_to_rownames("Metric.Name") %>%
-    t() %>%
-    data.frame()
+    raw_data_gex <- cbind(GEX_metrics, GEX_metrics2)
 
-  raw_data_gex <- cbind(GEX_metrics, GEX_metrics2)
+    # Change format of numeric columns due to commas in data csv output.
+    column_numbers <- grep(pattern = ",", x = raw_data_gex[1, ])
+    raw_data_gex[,c(column_numbers)] <- lapply(raw_data_gex[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})
 
-  # Change format of numeric columns to due commas in data csv output.
-  column_numbers <- grep(pattern = ",", x = raw_data_gex[1, ])
-  raw_data_gex[,c(column_numbers)] <- lapply(raw_data_gex[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})
+    if ("Estimated.number.of.cells" %in% colnames(x = raw_data_gex)) {
+      # Rename multi columns to match names from count
+      names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
+                            Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
+                            Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
+                            Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
+                            Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
+                            Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
+                            Estimated.Number.of.Cells = "Estimated.number.of.cells",
+                            Mean.Reads.per.Cell = "Mean.reads.per.cell",
+                            Median.Genes.per.Cell = "Median.genes.per.cell",
+                            Number.of.Reads = "Number.of.reads",
+                            Valid.Barcodes = "Valid.barcodes",
+                            Sequencing.Saturation = "Sequencing.saturation",
+                            Total.Genes.Detected = "Total.genes.detected",
+                            Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
+    } else {
+      # Rename multi columns to match names from count
+      names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
+                            Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
+                            Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
+                            Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
+                            Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
+                            Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
+                            Estimated.Number.of.Cells = "Cells",
+                            Mean.Reads.per.Cell = "Mean.reads.per.cell",
+                            Median.Genes.per.Cell = "Median.genes.per.cell",
+                            Number.of.Reads = "Number.of.reads",
+                            Valid.Barcodes = "Valid.barcodes",
+                            Sequencing.Saturation = "Sequencing.saturation",
+                            Total.Genes.Detected = "Total.genes.detected",
+                            Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
+    }
 
-  if ("Estimated.number.of.cells" %in% colnames(x = raw_data_gex)) {
-    # Rename multi columns to match names from count
-    names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
-                          Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
-                          Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
-                          Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
-                          Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
-                          Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
-                          Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
-                          Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
-                          Estimated.Number.of.Cells = "Estimated.number.of.cells",
-                          Mean.Reads.per.Cell = "Mean.reads.per.cell",
-                          Median.Genes.per.Cell = "Median.genes.per.cell",
-                          Number.of.Reads = "Number.of.reads",
-                          Valid.Barcodes = "Valid.barcodes",
-                          Sequencing.Saturation = "Sequencing.saturation",
-                          Total.Genes.Detected = "Total.genes.detected",
-                          Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
+    raw_data_gex <- raw_data_gex %>%
+      rename(all_of(names_to_replace))
+
+    column_numbers_pct <- grep(pattern = "%", x = raw_data_gex[1, ])
+    all_columns <- 1:ncol(x = raw_data_gex)
+
+    column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)
+
+    raw_data_gex[,c(column_numbers_numeric)] <- lapply(raw_data_gex[,c(column_numbers_numeric)],function(x){as.numeric(x)})
+
+    # Change column names to use "_" separator instead of "." for readability
+    colnames(x = raw_data_gex) <- gsub(pattern = "\\.", replacement = "_", x = colnames(x = raw_data_gex))
+
+    rownames(x = raw_data_gex) <- NULL
+
+    return(raw_data_gex)
   } else {
-    # Rename multi columns to match names from count
-    names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
-                          Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
-                          Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
-                          Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
-                          Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
-                          Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
-                          Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
-                          Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
-                          Estimated.Number.of.Cells = "Cells",
-                          Mean.Reads.per.Cell = "Mean.reads.per.cell",
-                          Median.Genes.per.Cell = "Median.genes.per.cell",
-                          Number.of.Reads = "Number.of.reads",
-                          Valid.Barcodes = "Valid.barcodes",
-                          Sequencing.Saturation = "Sequencing.saturation",
-                          Total.Genes.Detected = "Total.genes.detected",
-                          Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
-  }
+    # read GEX metrics
+    cli_inform(message = "Reading {.field Gene Expression} Metrics")
+    raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)
 
-  raw_data_gex <- raw_data_gex %>%
-    rename(all_of(names_to_replace))
+    # Change format to column based and select relevant metrics
+    GEX_metrics <- raw_data %>%
+      filter(.data[["Grouped.By"]] == "Physical library ID" & .data[["Library.Type"]] == "Gene Expression") %>%
+      select(all_of(c("Metric.Name", "Metric.Value"))) %>%
+      column_to_rownames("Metric.Name") %>%
+      t() %>%
+      data.frame()
 
-  column_numbers_pct <- grep(pattern = "%", x = raw_data_gex[1, ])
-  all_columns <- 1:ncol(x = raw_data_gex)
+    GEX_metrics2 <- raw_data %>%
+      filter(.data[["Metric.Name"]] %in% c(c("Median UMI counts per cell", "Median genes per cell", "Median reads per cell", "Total genes detected"))) %>%
+      select(all_of(c("Metric.Name", "Metric.Value"))) %>%
+      column_to_rownames("Metric.Name") %>%
+      t() %>%
+      data.frame()
 
-  column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)
+    raw_data_gex <- cbind(GEX_metrics, GEX_metrics2)
 
-  raw_data_gex[,c(column_numbers_numeric)] <- lapply(raw_data_gex[,c(column_numbers_numeric)],function(x){as.numeric(x)})
+    # Change format of numeric columns due to commas in data csv output.
+    column_numbers <- grep(pattern = ",", x = raw_data_gex[1, ])
+    raw_data_gex[,c(column_numbers)] <- lapply(raw_data_gex[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})
 
-  # Change column nams to use "_" separator instead of "." for readability
-  colnames(x = raw_data_gex) <- gsub(pattern = "\\.", replacement = "_", x = colnames(x = raw_data_gex))
+    if ("Estimated.number.of.cells" %in% colnames(x = raw_data_gex)) {
+      # Rename multi columns to match names from count
+      names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
+                            Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
+                            Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
+                            Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
+                            Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
+                            Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
+                            Estimated.Number.of.Cells = "Estimated.number.of.cells",
+                            Mean.Reads.per.Cell = "Mean.reads.per.cell",
+                            Median.Genes.per.Cell = "Median.genes.per.cell",
+                            Number.of.Reads = "Number.of.reads",
+                            Valid.Barcodes = "Valid.barcodes",
+                            Sequencing.Saturation = "Sequencing.saturation",
+                            Total.Genes.Detected = "Total.genes.detected",
+                            Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
+    } else {
+      # Rename multi columns to match names from count
+      names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
+                            Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
+                            Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
+                            Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
+                            Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
+                            Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
+                            Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
+                            Estimated.Number.of.Cells = "Cells",
+                            Mean.Reads.per.Cell = "Mean.reads.per.cell",
+                            Median.Genes.per.Cell = "Median.genes.per.cell",
+                            Number.of.Reads = "Number.of.reads",
+                            Valid.Barcodes = "Valid.barcodes",
+                            Sequencing.Saturation = "Sequencing.saturation",
+                            Total.Genes.Detected = "Total.genes.detected",
+                            Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
+    }
+
+    raw_data_gex <- raw_data_gex %>%
+      rename(all_of(names_to_replace))
+
+    column_numbers_pct <- grep(pattern = "%", x = raw_data_gex[1, ])
+    all_columns <- 1:ncol(x = raw_data_gex)
+
+    column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)
 
-  rownames(x = raw_data_gex) <- NULL
+    raw_data_gex[,c(column_numbers_numeric)] <- lapply(raw_data_gex[,c(column_numbers_numeric)],function(x){as.numeric(x)})
+
+    # Change column names to use "_" separator instead of "." for readability
+    colnames(x = raw_data_gex) <- gsub(pattern = "\\.", replacement = "_", x = colnames(x = raw_data_gex))
+
+    rownames(x = raw_data_gex) <- NULL
 
-  return(raw_data_gex)
+    # Get VDJT metrics
+    raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)
+
+    VDJ_T_Metrics <- raw_data %>%
+      filter(.data[["Grouped.By"]]== "Physical library ID" & .data[["Library.Type"]] == "VDJ T") %>%
+      select(all_of(c("Metric.Name", "Metric.Value"))) %>%
+      column_to_rownames("Metric.Name") %>%
+      t() %>%
+      data.frame()
+
+    VDJ_T_Metrics2 <- raw_data %>%
+      filter(.data[["Metric.Name"]] %in% c("Cells with productive TRA contig", "Cells with productive TRB contig", "Cells with productive V-J spanning (TRA, TRB) pair", "Cells with productive V-J spanning pair", "Median TRA UMIs per Cell", "Median TRB UMIs per Cell", "Number of cells with productive V-J spanning pair", "Paired clonotype diversity")
+      ) %>%
+      select(all_of(c("Metric.Name", "Metric.Value"))) %>%
+      column_to_rownames("Metric.Name") %>%
+      t() %>%
+      data.frame()
+
+    raw_data_vdjt <- cbind(VDJ_T_Metrics, VDJ_T_Metrics2)
+
+    column_numbers <- grep(pattern = ",", x = raw_data_vdjt[1, ])
+    raw_data_vdjt[,c(column_numbers)] <- lapply(raw_data_vdjt[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})
+
+    column_numbers_pct <- grep(pattern = "%", x = raw_data_vdjt[1, ])
+    all_columns <- 1:ncol(x = raw_data_vdjt)
+
+    column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)
+
+    raw_data_vdjt[,c(column_numbers_numeric)] <- lapply(raw_data_vdjt[,c(column_numbers_numeric)],function(x){as.numeric(x)})
+
+    # combine outputs into a list
+    data_list <- list(
+      multi_gex_metrics = raw_data_gex,
+      multi_vdjt_metrics = raw_data_vdjt
+    )
+
+    # return data list
+    return(data_list)
+  }
 }