Skip to content

Commit

Permalink
read single metrics v9plus complete
Browse files Browse the repository at this point in the history
  • Loading branch information
samuel-marsh committed Dec 4, 2024
1 parent 6e01614 commit 7ea98f3
Showing 1 changed file with 184 additions and 65 deletions.
249 changes: 184 additions & 65 deletions R/Internal_Utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -2135,85 +2135,204 @@ Metrics_Single_File_v9plus <- function(
base_path,
cellranger_mutli = FALSE
){
cli_inform(message = "Reading {.field Gene Expression} Metrics")
# read count metrics
if (isFALSE(x = cellranger_mutli)) {
cli_inform(message = "Reading {.field Gene Expression} Metrics")
raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)

raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)
# Change format to column based and select relevant metrics
GEX_metrics <- raw_data %>%
filter(.data[["Grouped.By"]] == "Physical library ID" & .data[["Library.Type"]] == "Gene Expression") %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()

# Change format to column based and select relevant metrics
GEX_metrics <- raw_data %>%
filter(.data[["Grouped.By"]] == "Physical library ID" & .data[["Library.Type"]] == "Gene Expression") %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()
GEX_metrics2 <- raw_data %>%
filter(.data[["Metric.Name"]] %in% c(c("Median UMI counts per cell", "Median genes per cell", "Median reads per cell", "Total genes detected"))) %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()

GEX_metrics2 <- raw_data %>%
filter(.data[["Metric.Name"]] %in% c(c("Median UMI counts per cell", "Median genes per cell", "Median reads per cell", "Total genes detected"))) %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()
raw_data_gex <- cbind(GEX_metrics, GEX_metrics2)

raw_data_gex <- cbind(GEX_metrics, GEX_metrics2)
# Convert columns whose values contain thousands-separator commas (from the csv output) to numeric.
column_numbers <- grep(pattern = ",", x = raw_data_gex[1, ])
raw_data_gex[,c(column_numbers)] <- lapply(raw_data_gex[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})

# Convert columns whose values contain thousands-separator commas (from the csv output) to numeric.
column_numbers <- grep(pattern = ",", x = raw_data_gex[1, ])
raw_data_gex[,c(column_numbers)] <- lapply(raw_data_gex[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})
if ("Estimated.number.of.cells" %in% colnames(x = raw_data_gex)) {
# Rename multi columns to match names from count
names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
Estimated.Number.of.Cells = "Estimated.number.of.cells",
Mean.Reads.per.Cell = "Mean.reads.per.cell",
Median.Genes.per.Cell = "Median.genes.per.cell",
Number.of.Reads = "Number.of.reads",
Valid.Barcodes = "Valid.barcodes",
Sequencing.Saturation = "Sequencing.saturation",
Total.Genes.Detected = "Total.genes.detected",
Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
} else {
# Rename multi columns to match names from count
names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
Estimated.Number.of.Cells = "Cells",
Mean.Reads.per.Cell = "Mean.reads.per.cell",
Median.Genes.per.Cell = "Median.genes.per.cell",
Number.of.Reads = "Number.of.reads",
Valid.Barcodes = "Valid.barcodes",
Sequencing.Saturation = "Sequencing.saturation",
Total.Genes.Detected = "Total.genes.detected",
Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
}

if ("Estimated.number.of.cells" %in% colnames(x = raw_data_gex)) {
# Rename multi columns to match names from count
names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
Estimated.Number.of.Cells = "Estimated.number.of.cells",
Mean.Reads.per.Cell = "Mean.reads.per.cell",
Median.Genes.per.Cell = "Median.genes.per.cell",
Number.of.Reads = "Number.of.reads",
Valid.Barcodes = "Valid.barcodes",
Sequencing.Saturation = "Sequencing.saturation",
Total.Genes.Detected = "Total.genes.detected",
Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
raw_data_gex <- raw_data_gex %>%
rename(all_of(names_to_replace))

column_numbers_pct <- grep(pattern = "%", x = raw_data_gex[1, ])
all_columns <- 1:ncol(x = raw_data_gex)

column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)

raw_data_gex[,c(column_numbers_numeric)] <- lapply(raw_data_gex[,c(column_numbers_numeric)],function(x){as.numeric(x)})

# Change column names to use "_" separator instead of "." for readability
colnames(x = raw_data_gex) <- gsub(pattern = "\\.", replacement = "_", x = colnames(x = raw_data_gex))

rownames(x = raw_data_gex) <- NULL

return(raw_data_gex)
} else {
# Rename multi columns to match names from count
names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
Estimated.Number.of.Cells = "Cells",
Mean.Reads.per.Cell = "Mean.reads.per.cell",
Median.Genes.per.Cell = "Median.genes.per.cell",
Number.of.Reads = "Number.of.reads",
Valid.Barcodes = "Valid.barcodes",
Sequencing.Saturation = "Sequencing.saturation",
Total.Genes.Detected = "Total.genes.detected",
Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
}
# read GEX metrics
cli_inform(message = "Reading {.field Gene Expression} Metrics")
raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)

raw_data_gex <- raw_data_gex %>%
rename(all_of(names_to_replace))
# Change format to column based and select relevant metrics
GEX_metrics <- raw_data %>%
filter(.data[["Grouped.By"]] == "Physical library ID" & .data[["Library.Type"]] == "Gene Expression") %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()

column_numbers_pct <- grep(pattern = "%", x = raw_data_gex[1, ])
all_columns <- 1:ncol(x = raw_data_gex)
GEX_metrics2 <- raw_data %>%
filter(.data[["Metric.Name"]] %in% c(c("Median UMI counts per cell", "Median genes per cell", "Median reads per cell", "Total genes detected"))) %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()

column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)
raw_data_gex <- cbind(GEX_metrics, GEX_metrics2)

raw_data_gex[,c(column_numbers_numeric)] <- lapply(raw_data_gex[,c(column_numbers_numeric)],function(x){as.numeric(x)})
# Convert columns whose values contain thousands-separator commas (from the csv output) to numeric.
column_numbers <- grep(pattern = ",", x = raw_data_gex[1, ])
raw_data_gex[,c(column_numbers)] <- lapply(raw_data_gex[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})

# Change column names to use "_" separator instead of "." for readability
colnames(x = raw_data_gex) <- gsub(pattern = "\\.", replacement = "_", x = colnames(x = raw_data_gex))
if ("Estimated.number.of.cells" %in% colnames(x = raw_data_gex)) {
# Rename multi columns to match names from count
names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
Estimated.Number.of.Cells = "Estimated.number.of.cells",
Mean.Reads.per.Cell = "Mean.reads.per.cell",
Median.Genes.per.Cell = "Median.genes.per.cell",
Number.of.Reads = "Number.of.reads",
Valid.Barcodes = "Valid.barcodes",
Sequencing.Saturation = "Sequencing.saturation",
Total.Genes.Detected = "Total.genes.detected",
Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
} else {
# Rename multi columns to match names from count
names_to_replace <- c(Reads.Mapped.to.Genome = "Mapped.to.genome",
Reads.Mapped.Confidently.to.Genome = "Confidently.mapped.to.genome",
Reads.Mapped.Confidently.to.Intergenic.Regions = "Confidently.mapped.to.intergenic.regions",
Reads.Mapped.Confidently.to.Intronic.Regions = "Confidently.mapped.to.intronic.regions",
Reads.Mapped.Confidently.to.Exonic.Regions = "Confidently.mapped.to.exonic.regions",
Reads.Mapped.Confidently.to.Transcriptome = "Confidently.mapped.to.transcriptome",
Reads.Mapped.Antisense.to.Gene = "Confidently.mapped.antisense",
Fraction.Reads.in.Cells = "Confidently.mapped.reads.in.cells",
Estimated.Number.of.Cells = "Cells",
Mean.Reads.per.Cell = "Mean.reads.per.cell",
Median.Genes.per.Cell = "Median.genes.per.cell",
Number.of.Reads = "Number.of.reads",
Valid.Barcodes = "Valid.barcodes",
Sequencing.Saturation = "Sequencing.saturation",
Total.Genes.Detected = "Total.genes.detected",
Median.UMI.Counts.per.Cell = "Median.UMI.counts.per.cell")
}

raw_data_gex <- raw_data_gex %>%
rename(all_of(names_to_replace))

column_numbers_pct <- grep(pattern = "%", x = raw_data_gex[1, ])
all_columns <- 1:ncol(x = raw_data_gex)

column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)

rownames(x = raw_data_gex) <- NULL
raw_data_gex[,c(column_numbers_numeric)] <- lapply(raw_data_gex[,c(column_numbers_numeric)],function(x){as.numeric(x)})

# Change column names to use "_" separator instead of "." for readability
colnames(x = raw_data_gex) <- gsub(pattern = "\\.", replacement = "_", x = colnames(x = raw_data_gex))

rownames(x = raw_data_gex) <- NULL

return(raw_data_gex)
# Get VDJT metrics
raw_data <- read.csv(file = base_path, stringsAsFactors = FALSE)

VDJ_T_Metrics <- raw_data %>%
filter(.data[["Grouped.By"]]== "Physical library ID" & .data[["Library.Type"]] == "VDJ T") %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()

VDJ_T_Metrics2 <- raw_data %>%
filter(.data[["Metric.Name"]] %in% c("Cells with productive TRA contig", "Cells with productive TRB contig", "Cells with productive V-J spanning (TRA, TRB) pair", "Cells with productive V-J spanning pair", "Median TRA UMIs per Cell", "Median TRB UMIs per Cell", "Number of cells with productive V-J spanning pair", "Paired clonotype diversity")
) %>%
select(all_of(c("Metric.Name", "Metric.Value"))) %>%
column_to_rownames("Metric.Name") %>%
t() %>%
data.frame()

raw_data_vdjt <- cbind(VDJ_T_Metrics, VDJ_T_Metrics2)

column_numbers <- grep(pattern = ",", x = raw_data_vdjt[1, ])
raw_data_vdjt[,c(column_numbers)] <- lapply(raw_data_vdjt[,c(column_numbers)],function(x){as.numeric(gsub(",", "", x))})

column_numbers_pct <- grep(pattern = "%", x = raw_data_vdjt[1, ])
all_columns <- 1:ncol(x = raw_data_vdjt)

column_numbers_numeric <- setdiff(x = all_columns, y = column_numbers_pct)

raw_data_vdjt[,c(column_numbers_numeric)] <- lapply(raw_data_vdjt[,c(column_numbers_numeric)],function(x){as.numeric(x)})

# combine outputs into a list
data_list <- list(
multi_gex_metrics = raw_data_gex,
multi_vdjt_metrics = raw_data_vdjt
)

# return data list
return(data_list)
}
}


Expand Down

0 comments on commit 7ea98f3

Please sign in to comment.