You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Parallel processing was added to the code of the existing Prep4DeepDEP. It was confirmed that the memory usage was not large and the speed increased dramatically. When checked with the provided data, the same results are shown. Please review it and let me know if there is a problem.
Prep4DeepDEP_custom <- function (exp.data = NULL, mut.data = NULL, meth.data = NULL, cna.data = NULL, dep.data = NULL,
mode = c("Training", "Prediction"), filename.out = "data"){
check.cellNames <- NULL
cat("Mode", mode, "\n\n")
path <- system.file("extdata/", package = "Prep4DeepDEP")
if (sum(is.null(exp.data), is.null(mut.data), is.null(meth.data),
is.null(cna.data)) == 4) {
stop(c("All genomic profiles are missing. Please provide at least one of mut.data, exp.data, meth.data, and cna.data."))
}
if (is.null(dep.data) & tolower(mode) == "prediction") {
cat("dep.data is not provided, running with the default 1298 DepOIs...",
"\n")
load(paste0(path, "default_dep_genes_1298.RData"))
}
if (is.null(dep.data) & tolower(mode) == "training") {
cat("dep.data is not provided. Please provide gene dependency scores for the training mode...",
"\n")
}
if (ncol(dep.data) == 1 & tolower(mode) == "training") {
stop(c("Only one column detected in dep.data. Please provide gene dependency symbols and scores for the training mode."),
call. = FALSE)
}
load(paste0(path, "gene_fingerprints_CGP.RData"))
list.genes <- .CheckGeneSymbol(dep.data = dep.data, filename.out = filename.out)
n <- nrow(list.genes)
# Gene expression
if (!is.null(exp.data)) {
if (!is.character(exp.data[1, 1])) {
stop("exp.data format error, please check!", call. = FALSE)
}
cat(c("Exp started..."), "\n")
colnames(exp.data)[1] <- "Gene"
ncell <- ncol(exp.data[, -1])
if (is.null(check.cellNames)) {
check.cellNames <- colnames(exp.data[, -1])
}else if (length(check.cellNames) != length(colnames(exp.data[, -1])) | sum(check.cellNames %in% colnames(exp.data[, -1])) != length(check.cellNames)) {
stop(c("Cell line names are inconsistent!"), call. = FALSE)
}
cat("Precessing", paste(length(check.cellNames), "cell lines..."), "\n")
inputData <- exp.data[!duplicated(exp.data$Gene), ]
######## TCGA - CCLE intersection
load(paste0(path, "ccle_exp_for_missing_value_6016.RData"))
# outputData <- merge(exp.index, inputData, by = "Gene", sort = FALSE, all.x = TRUE)
outputData <- merge(exp.index, inputData, by = "Gene", sort = FALSE, all.x = TRUE)
Gene <- outputData$Gene
rownames(outputData) <- outputData$Gene
value_NA <- rowSums(outputData[, -c(1, 2)])
cat(sum(is.na(value_NA)), "genes with NA values in exp.data. Substitute by mean values of CCLE.", "\n")
if (round((sum(is.na(value_NA))/nrow(outputData)), digits = 2) > 0.2) {
warning("NA found in >20% genes, please check if input format is correct!")
}
for (i in 1:nrow(outputData)) {
if (is.na(value_NA[i])) {
outputData[i, is.na(outputData[i, ])] <- outputData$Mean[i]
}
}
outputData <- round(as.matrix(outputData[, -1]), digits = 4)
outputData.final.exp <- cbind(Gene, as.data.frame(outputData[, -1], stringsAsFactors = FALSE))
outputData.final.exp <- outputData.final.exp[, c("Gene", check.cellNames)]
if (tolower(mode) == "prediction") {
data.table::fwrite(outputData.final.exp,
file = paste(filename.out, "exp_prediction.txt", sep = "_"), sep = "\t",
row.names = FALSE, col.names = TRUE, quote = FALSE)
}
if (tolower(mode) == "training") {
k = 2:ncol(outputData.final.exp)
rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
rep_col.1 <- do.call("cbind", replicate(n, outputData.final.exp[, index], simplify = FALSE)) #
colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
return(rep_col.1)
}, mc.cores = 4)
rep_col <- rep_col_list %>%
bind_cols(tibble(Gene = rownames(outputData.final.exp)), .)
data.table::fwrite(rep_col, file = paste(filename.out, "exp_training.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
cat("Exp completed!", "\n\n")
}
# Mutation
if (!is.null(mut.data)) {
if (!is.character(mut.data[1, 1])) {
stop("mut.data format error, please check!", call. = FALSE)
}
cat(c("Mut started..."), "\n")
colnames(mut.data)[1] <- "Gene"
ncell <- ncol(mut.data[, -1])
if (is.null(check.cellNames)) {
check.cellNames <- colnames(mut.data[, -1])
} else if (length(check.cellNames) != length(colnames(mut.data[, -1])) | sum(check.cellNames %in% colnames(mut.data[, -1])) != length(check.cellNames)) {
stop(c("Cell line names are inconsistent!"), call. = FALSE)
}
cat("Precessing", paste(length(check.cellNames), "cell lines..."), "\n")
inputData <- mut.data[!duplicated(mut.data$Gene), ]
######## TCGA - CCLE intersection
load(paste0(path, "ccle_mut_for_missing_value_4539.RData"))
outputData <- merge(mut.index, inputData, by = "Gene", sort = FALSE, all.x = TRUE)
Gene <- outputData$Gene
rownames(outputData) <- outputData$Gene
value_NA <- rowSums(outputData[, -c(1, 2)])
cat(sum(is.na(value_NA)), "genes with NA values in mut.data. Substitute by median values of CCLE.", "\n")
if (round((sum(is.na(value_NA))/nrow(outputData)), digits = 2) > 0.2) {
warning("NA found in >20% genes, please check if input format is correct!")
}
for (i in 1:nrow(outputData)) {
if (is.na(value_NA[i])) {
outputData[i, is.na(outputData[i, ])] <- outputData$Median[i]
}
}
outputData.final.mut <- cbind(Gene, as.data.frame(outputData[, -c(1, 2)]))
outputData.final.mut <- outputData.final.mut[, c("Gene", check.cellNames)]
if (tolower(mode) == "prediction") {
data.table::fwrite(outputData.final.mut,
file = paste(filename.out, "mut_prediction.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
if (tolower(mode) == "training") {
k = 2:ncol(outputData.final.mut)
rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
rep_col.1 <- do.call("cbind", replicate(n, outputData.final.mut[, index], simplify = FALSE)) #
colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
return(rep_col.1)
}, mc.cores = 4)
rep_col <- rep_col_list %>%
bind_cols(tibble(Gene = rownames(outputData.final.mut)), .)
data.table::fwrite(rep_col,
file = paste(filename.out, "mut_training.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
cat("Mut completed!", "\n\n")
}
# Methylation
if (!is.null(meth.data)) {
if (!is.character(meth.data[1, 1])) {
stop("meth.data format error, please check!", call. = FALSE)
}
cat(c("Meth started..."), "\n")
colnames(meth.data)[1] <- "Probe"
ncell <- ncol(meth.data[, -1])
if (is.null(check.cellNames)) {
check.cellNames <- colnames(meth.data[, -1])
} else if (length(check.cellNames) != length(colnames(meth.data[, -1])) | sum(check.cellNames %in% colnames(meth.data[, -1])) != length(check.cellNames)) {
stop(c("Cell line names are inconsistent!"), call. = FALSE)
}
cat("Precessing", paste(length(check.cellNames), "cell lines..."), "\n")
inputData <- meth.data[!duplicated(meth.data$Probe), ]
load(paste0(path, "ccle_meth_for_missing_value_6617.RData"))
outputData <- merge(meth.index, inputData, by = "Probe", sort = FALSE, all.x = TRUE)
Probe <- outputData$Probe
rownames(outputData) <- outputData$Probe
value_NA <- rowSums(outputData[, -c(1, 2)])
cat(sum(is.na(value_NA)), "genes with NA values in meth.data. Substitute by 0.", "\n")
if (round((sum(is.na(value_NA))/nrow(outputData)), digits = 2) > 0.2) {
warning("NA found in >20% genes, please check if input format is correct!")
}
for (i in 1:nrow(outputData)) {
if (is.na(value_NA[i])) {
if (sum(is.na(outputData[i, ])) == sum(ncol(outputData) - 2)) {
outputData[i, is.na(outputData[i, ])] <- 0
} else {
outputData[i, is.na(outputData[i, ])] <- 0
}
}
}
outputData <- round(as.matrix(outputData[, -1]), digits = 4)
outputData.final.meth <- cbind(Probe, as.data.frame(outputData[, -1]))
outputData.final.meth <- outputData.final.meth[, c("Probe", check.cellNames)]
if (tolower(mode) == "prediction") {
data.table::fwrite(outputData.final.meth,
file = paste(filename.out, "meth_prediction.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
if (tolower(mode) == "training") {
k = 2:ncol(outputData.final.meth)
rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
rep_col.1 <- do.call("cbind", replicate(n, outputData.final.meth[, index], simplify = FALSE)) #
colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
return(rep_col.1)
}, mc.cores = 4)
rep_col <- rep_col_list %>%
bind_cols(tibble(Probe = rownames(outputData.final.meth)), .)
data.table::fwrite(rep_col,
file = paste(filename.out, "meth_training.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
cat("Meth completed!", "\n\n")
}
# Gene dependency
if (!is.character(dep.data[1, 1])) {
stop("dep.data format error, please check!", call. = FALSE)
} else {
cat(c("Fingerprint started..."), "\n")
colnames(dep.data)[1] <- "Gene"
idx <- which(fingerprint[1, ] %in% c("GeneSet", list.genes$Gene))
outputData <- fingerprint[, idx]
if (tolower(mode) == "prediction") {
data.table::fwrite(outputData,
file = paste(filename.out, "fingerprint_prediction.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
}
if (tolower(mode) == "training") {
outputData.train <- cbind(outputData[, 1],
do.call("cbind", replicate(ncell, outputData[, -1], simplify = FALSE)))
outputData.train[1, ] <- c("GeneSet", paste0(paste0("C", rep(seq(1, ncell, 1), each = n)), "G", seq(1, n, 1)))
data.table::fwrite(outputData.train,
file = paste(filename.out, "fingerprint_training.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
}
cat("Fingerprint completed!", "\n\n")
}
if (!is.null(dep.data) & tolower(mode) == "training") {
if (is.null(check.cellNames)) {
check.cellNames <- colnames(dep.data[, -1])
}else if (length(check.cellNames) != length(colnames(dep.data[, -1])) | sum(check.cellNames %in% colnames(dep.data[, -1])) != length(check.cellNames)) {
stop(c("Cell line names are inconsistent!"), call. = FALSE)
}
cat("Gene dependency scores (training mode) start...", "\n")
crispr.input <- dep.data[which(dep.data$Gene %in% list.genes$Gene), # list.genes
which(colnames(dep.data) %in% c("Gene", check.cellNames))]
k = 2:ncol(crispr.input)
crispr.output <- pbmcapply::pbmclapply(X = k, FUN = function(index){
table <- as.data.frame(t(crispr.input[, index]))
colnames(table) <- paste0("C", index - 1, "G", seq(1, n, 1))
return(table)
}, mc.cores = 3) %>%
bind_cols()
crispr.output <- bind_cols(tibble(Dep_Score = "score"), crispr.output)
data.table::fwrite(crispr.output,
file = paste(filename.out, "DepScore_training.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
cat("Gene dependency scores (training) completed!", "\n\n")
}
if (!is.null(cna.data)) {
if (sum(colnames(cna.data) %in% c("CCLE_name", "Chromosome", "Start", "End", "Num_Probe", "Segment_Mean")) != 5) {
stop("cna.data format error, please check!", call. = FALSE)
}
cat(c("CNA started..."), "\n")
ncell <- length(unique(cna.data$CCLE_name))
if (is.null(check.cellNames)) {
outputData.cna <- .PrepCNA(cna.original = cna.data, filename.out, exportTable = FALSE)
} else {
idx <- which(cna.data$CCLE_name %in% check.cellNames)
if (length(check.cellNames) != length(unique(cna.data$CCLE_name[idx])) | sum(check.cellNames %in% unique(cna.data$CCLE_name[idx])) !=
length(check.cellNames)) {
stop(c("Cell line names are inconsistent!"),
call. = FALSE)
}
outputData.cna <- .PrepCNA_custom(cna.original = cna.data[idx, ], filename.out, exportTable = FALSE)
outputData.cna <- outputData.cna[, c("CNA", check.cellNames)]
}
if (tolower(mode) == "prediction") {
colnames(outputData.cna)[1] <- "Bin"
data.table::fwrite(outputData.cna,
file = paste(filename.out, "cna_prediction.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
if (tolower(mode) == "training") {
k = 2:ncol(outputData.cna)
rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
rep_col.1 <- do.call("cbind", replicate(n, outputData.cna[, index], simplify = FALSE)) #
colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
return(rep_col.1)
}, mc.cores = 4)
rep_col <- rep_col_list %>%
bind_cols(tibble(Bin = outputData.cna$CNA), .)
data.table::fwrite(rep_col,
file = paste(filename.out, "cna_training.txt", sep = "_"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
cat("CNA completed!", "\n\n")
}
}
The text was updated successfully, but these errors were encountered:
Parallel processing was added to the code of the existing Prep4DeepDEP. It was confirmed that the memory usage was not large and the speed increased dramatically. When checked with the provided data, the same results are shown. Please review it and let me know if there is a problem.
The text was updated successfully, but these errors were encountered: