Prep4DeepDEP custom #2

Jin0331 · 2022-07-28T06:47:08Z

Parallel processing was added to the code of the existing Prep4DeepDEP. It was confirmed that the memory usage was not large and the speed increased dramatically. When checked with the provided data, the same results are shown. Please review it and let me know if there is a problem.

Prep4DeepDEP_custom <- function (exp.data = NULL, mut.data = NULL, meth.data = NULL, cna.data = NULL, dep.data = NULL, 
                          mode = c("Training", "Prediction"), filename.out = "data"){
  
    check.cellNames <- NULL
    cat("Mode", mode, "\n\n")
    path <- system.file("extdata/", package = "Prep4DeepDEP")
    if (sum(is.null(exp.data), is.null(mut.data), is.null(meth.data), 
            is.null(cna.data)) == 4) {
      stop(c("All genomic profiles are missing. Please provide at least one of mut.data, exp.data, meth.data, and cna.data."))
    }
    if (is.null(dep.data) & tolower(mode) == "prediction") {
      cat("dep.data is not provided, running with the default 1298 DepOIs...", 
          "\n")
      load(paste0(path, "default_dep_genes_1298.RData"))
    }
    if (is.null(dep.data) & tolower(mode) == "training") {
      cat("dep.data is not provided. Please provide gene dependency scores for the training mode...", 
          "\n")
    }
    if (ncol(dep.data) == 1 & tolower(mode) == "training") {
      stop(c("Only one column detected in dep.data. Please provide gene dependency symbols and scores for the training mode."), 
           call. = FALSE)
    }
    load(paste0(path, "gene_fingerprints_CGP.RData"))
    list.genes <- .CheckGeneSymbol(dep.data = dep.data, filename.out = filename.out)
    n <- nrow(list.genes)
    
    # Gene expression
    if (!is.null(exp.data)) {
        if (!is.character(exp.data[1, 1])) {
            stop("exp.data format error, please check!", call. = FALSE)
        }
        cat(c("Exp started..."), "\n")
        colnames(exp.data)[1] <- "Gene"
        ncell <- ncol(exp.data[, -1])
        if (is.null(check.cellNames)) {
            check.cellNames <- colnames(exp.data[, -1])
        }else if (length(check.cellNames) != length(colnames(exp.data[, -1])) | sum(check.cellNames %in% colnames(exp.data[, -1])) != length(check.cellNames)) {
            stop(c("Cell line names are inconsistent!"), call. = FALSE)
        }
        cat("Precessing", paste(length(check.cellNames), "cell lines..."), "\n")
        inputData <- exp.data[!duplicated(exp.data$Gene), ]

        ######## TCGA - CCLE intersection        
        load(paste0(path, "ccle_exp_for_missing_value_6016.RData"))

        # outputData <- merge(exp.index, inputData, by = "Gene", sort = FALSE, all.x = TRUE)
        outputData <- merge(exp.index, inputData, by = "Gene", sort = FALSE, all.x = TRUE)

        Gene <- outputData$Gene
        rownames(outputData) <- outputData$Gene
        value_NA <- rowSums(outputData[, -c(1, 2)])
        cat(sum(is.na(value_NA)), "genes with NA values in exp.data. Substitute by mean values of CCLE.", "\n")
        if (round((sum(is.na(value_NA))/nrow(outputData)), digits = 2) > 0.2) {
            warning("NA found in >20% genes, please check if input format is correct!")
        }
        for (i in 1:nrow(outputData)) {
            if (is.na(value_NA[i])) {
                outputData[i, is.na(outputData[i, ])] <- outputData$Mean[i]
            }
        }
        outputData <- round(as.matrix(outputData[, -1]), digits = 4)
        outputData.final.exp <- cbind(Gene, as.data.frame(outputData[, -1], stringsAsFactors = FALSE))
        outputData.final.exp <- outputData.final.exp[, c("Gene", check.cellNames)]
        if (tolower(mode) == "prediction") {
            data.table::fwrite(outputData.final.exp, 
                               file = paste(filename.out, "exp_prediction.txt", sep = "_"), sep = "\t", 
                               row.names = FALSE, col.names = TRUE, quote = FALSE)
        }
        if (tolower(mode) == "training") {
          k = 2:ncol(outputData.final.exp)
          rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
            rep_col.1 <- do.call("cbind", replicate(n, outputData.final.exp[, index], simplify = FALSE)) # 
            colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
            return(rep_col.1)
          }, mc.cores = 4)
          
          rep_col <- rep_col_list %>% 
            bind_cols(tibble(Gene = rownames(outputData.final.exp)), .) 
            
          data.table::fwrite(rep_col, file = paste(filename.out, "exp_training.txt", sep = "_"), 
                      sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
        }
        cat("Exp completed!", "\n\n")
    }
    # Mutation
    if (!is.null(mut.data)) {
      if (!is.character(mut.data[1, 1])) {
        stop("mut.data format error, please check!", call. = FALSE)
      }
      cat(c("Mut started..."), "\n")
      colnames(mut.data)[1] <- "Gene"
      ncell <- ncol(mut.data[, -1])
      if (is.null(check.cellNames)) {
        check.cellNames <- colnames(mut.data[, -1])
      } else if (length(check.cellNames) != length(colnames(mut.data[, -1])) | sum(check.cellNames %in% colnames(mut.data[, -1])) != length(check.cellNames)) {
        stop(c("Cell line names are inconsistent!"), call. = FALSE)
      }
      cat("Precessing", paste(length(check.cellNames), "cell lines..."), "\n")
      inputData <- mut.data[!duplicated(mut.data$Gene), ]
      
      ######## TCGA - CCLE intersection        
      load(paste0(path, "ccle_mut_for_missing_value_4539.RData"))
      
      outputData <- merge(mut.index, inputData, by = "Gene", sort = FALSE, all.x = TRUE)
      Gene <- outputData$Gene
      rownames(outputData) <- outputData$Gene
      value_NA <- rowSums(outputData[, -c(1, 2)])
      cat(sum(is.na(value_NA)), "genes with NA values in mut.data. Substitute by median values of CCLE.", "\n")
      if (round((sum(is.na(value_NA))/nrow(outputData)), digits = 2) > 0.2) {
        warning("NA found in >20% genes, please check if input format is correct!")
      }
      for (i in 1:nrow(outputData)) {
        if (is.na(value_NA[i])) {
          outputData[i, is.na(outputData[i, ])] <- outputData$Median[i]
        }
      }
      outputData.final.mut <- cbind(Gene, as.data.frame(outputData[, -c(1, 2)]))
      outputData.final.mut <- outputData.final.mut[, c("Gene", check.cellNames)]
      if (tolower(mode) == "prediction") {
        data.table::fwrite(outputData.final.mut, 
                           file = paste(filename.out, "mut_prediction.txt", sep = "_"), 
                           sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
      }
      if (tolower(mode) == "training") {
        k = 2:ncol(outputData.final.mut)
        rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
          rep_col.1 <- do.call("cbind", replicate(n, outputData.final.mut[, index], simplify = FALSE)) # 
          colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
          return(rep_col.1)
        }, mc.cores = 4)
        
        rep_col <- rep_col_list %>% 
          bind_cols(tibble(Gene = rownames(outputData.final.mut)), .) 
        
        data.table::fwrite(rep_col, 
                           file = paste(filename.out, "mut_training.txt", sep = "_"), 
                           sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
        }
      cat("Mut completed!", "\n\n")
    }
    # Methylation
    if (!is.null(meth.data)) {
        if (!is.character(meth.data[1, 1])) {
            stop("meth.data format error, please check!", call. = FALSE)
        }
        cat(c("Meth started..."), "\n")
        colnames(meth.data)[1] <- "Probe"
        ncell <- ncol(meth.data[, -1])
        if (is.null(check.cellNames)) {
            check.cellNames <- colnames(meth.data[, -1])
        } else if (length(check.cellNames) != length(colnames(meth.data[, -1])) | sum(check.cellNames %in% colnames(meth.data[, -1])) != length(check.cellNames)) {
            stop(c("Cell line names are inconsistent!"), call. = FALSE)
        }
        
        cat("Precessing", paste(length(check.cellNames), "cell lines..."), "\n")
        inputData <- meth.data[!duplicated(meth.data$Probe), ]
        
        load(paste0(path, "ccle_meth_for_missing_value_6617.RData"))
        
        outputData <- merge(meth.index, inputData, by = "Probe", sort = FALSE, all.x = TRUE)
        Probe <- outputData$Probe
        rownames(outputData) <- outputData$Probe
        value_NA <- rowSums(outputData[, -c(1, 2)])
        cat(sum(is.na(value_NA)), "genes with NA values in meth.data. Substitute by 0.",  "\n")
        if (round((sum(is.na(value_NA))/nrow(outputData)), digits = 2) > 0.2) {
            warning("NA found in >20% genes, please check if input format is correct!")
        }
        for (i in 1:nrow(outputData)) {
            if (is.na(value_NA[i])) {
              if (sum(is.na(outputData[i, ])) == sum(ncol(outputData) - 2)) {
                  outputData[i, is.na(outputData[i, ])] <- 0
                } else {
                  outputData[i, is.na(outputData[i, ])] <- 0
                }
            }
        }
        outputData <- round(as.matrix(outputData[, -1]), digits = 4)
        outputData.final.meth <- cbind(Probe, as.data.frame(outputData[, -1]))
        outputData.final.meth <- outputData.final.meth[, c("Probe", check.cellNames)]
        if (tolower(mode) == "prediction") {
            data.table::fwrite(outputData.final.meth, 
                               file = paste(filename.out, "meth_prediction.txt", sep = "_"), 
                               sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
        }
        if (tolower(mode) == "training") {
          k = 2:ncol(outputData.final.meth)
          rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
            rep_col.1 <- do.call("cbind", replicate(n, outputData.final.meth[, index], simplify = FALSE)) # 
            colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
            return(rep_col.1)
          }, mc.cores = 4)
          
          rep_col <- rep_col_list %>% 
            bind_cols(tibble(Probe = rownames(outputData.final.meth)), .) 
          
          data.table::fwrite(rep_col, 
                             file = paste(filename.out, "meth_training.txt", sep = "_"), 
                             sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
        }
        cat("Meth completed!", "\n\n")
    }
    # Gene dependency
    if (!is.character(dep.data[1, 1])) {
        stop("dep.data format error, please check!", call. = FALSE)
    } else {
        cat(c("Fingerprint started..."), "\n")
        colnames(dep.data)[1] <- "Gene"
        idx <- which(fingerprint[1, ] %in% c("GeneSet", list.genes$Gene))
        outputData <- fingerprint[, idx]
        if (tolower(mode) == "prediction") {
            data.table::fwrite(outputData, 
                               file = paste(filename.out, "fingerprint_prediction.txt", sep = "_"), 
                               sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
        }
        if (tolower(mode) == "training") {
            outputData.train <- cbind(outputData[, 1], 
                                      do.call("cbind", replicate(ncell, outputData[, -1], simplify = FALSE)))
            outputData.train[1, ] <- c("GeneSet", paste0(paste0("C", rep(seq(1, ncell, 1), each = n)), "G", seq(1, n, 1)))
            data.table::fwrite(outputData.train, 
                               file = paste(filename.out, "fingerprint_training.txt", sep = "_"), 
                               sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
            
        }
        cat("Fingerprint completed!", "\n\n")
    }
    if (!is.null(dep.data) & tolower(mode) == "training") {
        if (is.null(check.cellNames)) {
            check.cellNames <- colnames(dep.data[, -1])
        }else if (length(check.cellNames) != length(colnames(dep.data[, -1])) | sum(check.cellNames %in% colnames(dep.data[, -1])) != length(check.cellNames)) {
            stop(c("Cell line names are inconsistent!"), call. = FALSE)
        }
        cat("Gene dependency scores (training mode) start...", "\n")
        crispr.input <- dep.data[which(dep.data$Gene %in% list.genes$Gene),  # list.genes
                                 which(colnames(dep.data) %in% c("Gene", check.cellNames))]
        k = 2:ncol(crispr.input)
        crispr.output <- pbmcapply::pbmclapply(X = k, FUN = function(index){
          table <- as.data.frame(t(crispr.input[, index]))
          colnames(table) <- paste0("C", index - 1, "G", seq(1, n, 1))
          return(table)
        }, mc.cores = 3) %>% 
          bind_cols()
          
        crispr.output <- bind_cols(tibble(Dep_Score = "score"), crispr.output)
        data.table::fwrite(crispr.output, 
                           file = paste(filename.out, "DepScore_training.txt", sep = "_"), 
                           sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
        
        cat("Gene dependency scores (training) completed!", "\n\n")
    }
    if (!is.null(cna.data)) {
        if (sum(colnames(cna.data) %in% c("CCLE_name", "Chromosome", "Start", "End", "Num_Probe", "Segment_Mean")) != 5) {
            stop("cna.data format error, please check!", call. = FALSE)
        }
        cat(c("CNA started..."), "\n")
        ncell <- length(unique(cna.data$CCLE_name))
        if (is.null(check.cellNames)) {
            outputData.cna <- .PrepCNA(cna.original = cna.data, filename.out, exportTable = FALSE)
        } else {
            idx <- which(cna.data$CCLE_name %in% check.cellNames)
            if (length(check.cellNames) != length(unique(cna.data$CCLE_name[idx])) | sum(check.cellNames %in% unique(cna.data$CCLE_name[idx])) != 
                  length(check.cellNames)) {
                stop(c("Cell line names are inconsistent!"), 
                  call. = FALSE)
            }
            outputData.cna <- .PrepCNA_custom(cna.original = cna.data[idx, ], filename.out, exportTable = FALSE)
            outputData.cna <- outputData.cna[, c("CNA", check.cellNames)]
        }
        if (tolower(mode) == "prediction") {
            colnames(outputData.cna)[1] <- "Bin"
            data.table::fwrite(outputData.cna, 
                               file = paste(filename.out, "cna_prediction.txt", sep = "_"), 
                               sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
        }
        if (tolower(mode) == "training") {
          k = 2:ncol(outputData.cna)
          rep_col_list <- pbmcapply::pbmclapply(X = k, FUN = function(index){
            rep_col.1 <- do.call("cbind", replicate(n, outputData.cna[, index], simplify = FALSE)) # 
            colnames(rep_col.1) <- paste0("C", index-1, "G", seq(1, n, 1))
            return(rep_col.1)
          }, mc.cores = 4)
          
          rep_col <- rep_col_list %>% 
            bind_cols(tibble(Bin = outputData.cna$CNA), .) 
          
          data.table::fwrite(rep_col, 
                             file = paste(filename.out, "cna_training.txt", sep = "_"), 
                             sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
        }
        cat("CNA completed!", "\n\n")
    }
}

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Prep4DeepDEP custom #2

Prep4DeepDEP custom #2

Jin0331 commented Jul 28, 2022

Prep4DeepDEP custom #2

Prep4DeepDEP custom #2

Comments

Jin0331 commented Jul 28, 2022