
merge dev
wbaopaul committed May 25, 2020
2 parents 9c04c31 + 30354eb commit aa56346
Showing 11 changed files with 126 additions and 68 deletions.
54 changes: 19 additions & 35 deletions README.md
@@ -46,35 +46,15 @@ Installation
Updates
------------
- Current version: 1.1.1
- March, 2020
* *get_mtx* requires two input files: a fragments.txt file and a peak file, separated by comma
  * annotate a peak as overlapping a gene TSS if the distance is <= 1000bp; mark a peak with a gene if their distance is <= 100kb
- Feb, 2020
* *integrate* module enables 3 options: seurat, harmony and pool
  * new module *visualize*, allowing interactive exploration and analysis of the data
  * *footprint* module supports one-vs-rest comparison and provides results as a heatmap
* module *runDA* changed to use group name as the input (e.g. "0:1,2" or "one,rest")
* installed rgt-hint (for footprinting analysis) using miniconda3
* added module *process_with_bam*, allowing processing from aggregated bam file
  * enabled data integration from peak files, assuming all data sets are processed using scATAC-pro. Outputs a matrix with the same merged peaks/features and the previously called cells, along with an integrated Seurat object
* added new parameters in the configuration file: Top_Variable_Features, REDUCTION, nREDUCTION
* enabled all clustering methods mentioned in the manuscript, along with kmeans clustering on principal components
  * file paths changed to downstream_analysis/PEAK_CALLER/CELL_CALLER/..., indicating the peak caller
- Jan, 2020
* added a new module *mergePeaks* to merge different peak files called from different data sets
* added a new module *reConstMtx* to reconstruct peak-by-cell matrix given a peak file, a fragment file and a barcodes.txt file
- Dec, 2019
* corrected an error due to using older version of chromVAR
* corrected a bug for demultiplexing multiple index files
* added a module *convert10xbam* to convert 10x position sorted bam file to scATAC-pro file format
* updated module *get_bam4Cells*, with the inputs as a bam file and a txt file of barcodes, separated by comma

- Current version: 1.1.2
- May, 2020
  * *integrate*: add a VFACS (Variable Features Across ClusterS) option to the integration module,
 **which reselects variable features across cell clusters after an initial clustering, followed by
 another round of dimension reduction and clustering**; specify *Integrate_By = VFACS* in the configure file (a usage sketch follows this list)
  * *clustering*: filter out peaks accessible in less than 0.5% of cells before clustering, and
 remove rare peaks (accessible in less than 1% of cells) from the variable features list
  * *reConstMtx*: enable specifying an optional path for saving the reconstructed matrix
- The complete update history can be viewed [here](complete_update_history.md)
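
A minimal usage sketch for the new options above (peaks.bed, fragments.txt, barcodes.txt and OUTPUT_MATRIX_DIR are placeholders, not files produced by this commit):

## in configure_user.txt, select the new integration method
Integrate_By = VFACS

## reconstruct a peak-by-cell matrix and save it to a custom location (the 4th field of -i is optional)
$ scATAC-pro -s reConstMtx
-i peaks.bed,fragments.txt,barcodes.txt,OUTPUT_MATRIX_DIR
-c configure_user.txt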

Dependencies
------------
@@ -227,6 +207,8 @@ Step by step guide to running scATAC-pro

## perform integrated analysis, assuming all data sets are processed by scATAC-pro
## which means the corresponding fragments.txt and barcodes.txt files can be located for each data set
## the integration methods include 'VFACS', 'pool', 'seurat', and 'harmony'; for instance,
## you can specify the integration method with 'Integrate_By = VFACS' in the configure file
$ scATAC-pro -s integrate
-i peak_file1,peak_file2,(peak_file3...) ##
-c configure_user.txt
@@ -263,7 +245,7 @@ Detailed Usage
usage : scATAC-pro -s STEP -i INPUT -c CONFIG [-o] [-h] [-v]
Use option -h|--help for more information

scATAC-pro 1.1.1
scATAC-pro 1.1.2
---------------
OPTIONS

@@ -368,11 +350,13 @@ Detailed Usage
input: peak files and a distance parameter separated by comma:
peakFile1,peakFile2,peakFile3,200
output: merged peaks saved in file output/peaks/merged.bed
reconstMtx: reconstruct peak-by-cell matrix given peak file, fragments.txt file, and barcodes.txt file
reConstMtx: reconstruct peak-by-cell matrix given a peak file, a fragments.txt file, a barcodes.txt file, and
an optional path for the reconstructed matrix
input: different files separated by comma:
peakFilePath,fragmentFilePath,barcodesPath
output: a sub-folder reConst_matrix for the reconstructed peak-by-cell matrix, saved in
the same path as the input barcodes.txt file
peakFilePath,fragmentFilePath,barcodesPath,reconstructMatrixPath
output: reconstructed peak-by-cell matrix saved in reconstructMatrixPath;
if reconstructMatrixPath is not specified, a sub-folder reConstruct_matrix will be created
under the same path as the input barcodes.txt file
integrate: perform integration of two or more data sets
input: peak/feature files, separated by comma: peak_file1,peak_file2
output: merged peaks, reconstructed matrix, integrated seurat obj and umap plot, saved in
@@ -417,7 +401,7 @@ $ singularity exec -H YOUR_WORK_DIR --cleanenv scatac-pro_latest.sif scATAC-pro
#!/bin/bash
module load singularity
singularity pull -F docker://wbaopaul/scatac-pro ## you just need run line this once
singularity pull -F docker://wbaopaul/scatac-pro ## you only need to run this line once
## will generate scatac-pro_latest.sif in the current directory
singularity exec --cleanenv -H /mnt/isilon/tan_lab/yuw1/run_scATAC-pro/PBMC10k scatac-pro_latest.sif \
39 changes: 39 additions & 0 deletions complete_update_history.md
@@ -0,0 +1,39 @@
## Complete Update History

- Current version: 1.1.2
- May, 2020
  * *integrate*: add a VFACS (Variable Features Across ClusterS) option to the integration module,
 **which reselects variable features across cell clusters after an initial clustering, followed by
 another round of dimension reduction and clustering**; specify *Integrate_By = VFACS* in the configure file
  * *clustering*: filter out peaks accessible in less than 0.5% of cells before clustering, and
 remove rare peaks (accessible in less than 1% of cells) from the variable features list
  * *reConstMtx*: enable specifying an optional path for saving the reconstructed matrix
- VERSION **1.1.1** released
- March/April, 2020
  * *get_mtx*: requires two input files, a fragments.txt file and a peak file, separated by comma
  * annotate a peak as overlapping a gene TSS if the distance is <= 1000bp; mark a peak with a gene if their distance is <= 100kb
  * updated DA analysis; fixed a bug in using a covariate
  * use mefa4::Melt instead of melt, which handles large sparse matrices better
  * added the PEAK_CALLER prefix to the qc_per_barcode.txt filename
  * fixed a bug in the file location of tmpJob when calling cells
- VERSION **1.1.0** released
- Feb, 2020
* *integrate* module enables 3 options: seurat, harmony and pool
  * new module *visualize*, allowing interactive exploration and analysis of the data
  * *footprint* module supports one-vs-rest comparison and provides results as a heatmap
* module *runDA* changed to use group name as the input (e.g. "0:1,2" or "one,rest")
* installed rgt-hint (for footprinting analysis) using miniconda3
* added module *process_with_bam*, allowing processing from aggregated bam file
  * enabled data integration from peak files, assuming all data sets are processed using scATAC-pro. Outputs a matrix with the same merged peaks/features and the previously called cells, along with an integrated Seurat object
* added new parameters in the configuration file: Top_Variable_Features, REDUCTION, nREDUCTION
* enabled all clustering methods mentioned in the manuscript, along with kmeans clustering on principal components
  * file paths changed to downstream_analysis/PEAK_CALLER/CELL_CALLER/..., indicating the peak caller
- Jan, 2020
* added a new module *mergePeaks* to merge different peak files called from different data sets
* added a new module *reConstMtx* to reconstruct peak-by-cell matrix given a peak file, a fragment file and a barcodes.txt file
- Dec, 2019
* corrected an error due to using older version of chromVAR
* corrected a bug for demultiplexing multiple index files
* added a module *convert10xbam* to convert 10x position sorted bam file to scATAC-pro file format
* updated module *get_bam4Cells*, with the inputs as a bam file and a txt file of barcodes, separated by comma

4 changes: 2 additions & 2 deletions configure_user.txt
@@ -157,9 +157,9 @@ Cicero_Plot_Region = chr5:140610000-140640000
############################################################################
## about integrate module ##
## integrate two or more samples
## by seurat cca integration or by pool
## by VFACS, seurat cca, harmony, or pool
############################################################################
Integrate_By = seurat ## seurat or pool or harmony or VFACS
Integrate_By = VFACS ## VFACS, seurat, pool or harmony
prepCello4Integration = TRUE ## prepare cello for integrated object


12 changes: 7 additions & 5 deletions scATAC-pro
@@ -9,7 +9,7 @@
#########################

SOFT="scATAC-pro"
VERSION="1.1.1"
VERSION="1.1.2"

function usage {
echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"
@@ -115,11 +115,13 @@ function help {
input: peak files and a distance parameter separated by comma:
peakFile1,peakFile2,peakFile3,200
output: merged peaks saved in file output/peaks/merged.bed"
echo " reConstMtx: reconstruct peak-by-cell matrix a given peak file, fragments.txt file, and barcodes.txt file
echo " reConstMtx: reconstruct peak-by-cell matrix given a peak file, a fragments.txt file, a barcodes.txt file, and
an optional path for the reconstructed matrix
input: different files separated by comma:
peakFilePath,fragmentFilePath,barcodesPath
output: a sub-folder reConstructed_matrix of reconstructed peak-cell matrix saved in the same
directory as the input barcodes.txt file"
peakFilePath,fragmentFilePath,barcodesPath,reconstructMatrixPath
output: reconstructed peak-by-cell matrix saved in reconstructMatrixPath;
if reconstructMatrixPath is not specified, a sub-folder reConstruct_matrix will be created
under the same path as the input barcodes.txt file"
echo " integrate: perform integration of two or more data sets
input: peak/feature files, separated by comma: peak_file1,peak_file2
output: merged peaks, reconstructed matrix, integrated seurat obj and umap plot, saved in
2 changes: 1 addition & 1 deletion scripts/process.sh
@@ -67,8 +67,8 @@ wait

## 6.generate matrix
echo "generating raw matrix and qc per barcode..."
feature_file=${OUTPUT_DIR}/peaks/${PEAK_CALLER}/${OUTPUT_PREFIX}_features_BlacklistRemoved.bed
frag_file=${OUTPUT_DIR}/summary/${OUTPUT_PREFIX}.fragments.txt
feature_file=${OUTPUT_DIR}/peaks/${PEAK_CALLER}/${OUTPUT_PREFIX}_features_BlacklistRemoved.bed
${curr_dir}/get_mtx.sh ${frag_file},${feature_file} $2 $3 &

echo "QC per cell ..."
8 changes: 6 additions & 2 deletions scripts/reConstMtx.sh
@@ -13,15 +13,19 @@ inputs=(${inputs//,/ })
peak_file=${inputs[0]}
frag_file=${inputs[1]}
bc_file=${inputs[2]}
outMat_dir=${inputs[3]}

# reading configure file
curr_dir=`dirname $0`
source ${curr_dir}/read_conf.sh
read_conf "$2"
read_conf "$3"

outMat_dir=`dirname $bc_file`
outMat_dir=${outMat_dir}/reConstruct_matrix
# default output directory
if [[ -z "${outMat_dir}" ]]; then
outMat_dir=`dirname $bc_file`
outMat_dir=${outMat_dir}/reConstruct_matrix
fi
mkdir -p ${outMat_dir}

echo "re-constructing peak-by-cell matrix for each sample ..."
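
For clarity, a standalone sketch of the default-output-path logic above (file names are placeholders; this snippet is illustrative and not part of the commit):

inputs="peaks.bed,fragments.txt,barcodes.txt"    ## a hypothetical -i value without the optional 4th field
inputs=(${inputs//,/ })                          ## split the comma-separated list into an array
outMat_dir=${inputs[3]}                          ## empty, since the optional path was omitted
if [[ -z "${outMat_dir}" ]]; then
    outMat_dir=$(dirname "${inputs[2]}")/reConstruct_matrix   ## default: next to barcodes.txt
fi
echo "${outMat_dir}"                             ## prints ./reConstruct_matrix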
7 changes: 7 additions & 0 deletions scripts/src/clustering.R
@@ -28,6 +28,13 @@ tss_ann <- fread(tss_path, header = F)
names(tss_ann)[c(1:4)] <- c('chr', 'start', 'end', 'gene_name')
mtx = assignGene2Peak(mtx, tss_ann)


## remove peaks that are accessible in less than 0.5% of cells
rfreqs = Matrix::rowMeans(mtx > 0)
mtx = mtx[rfreqs > 0.005, ]
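## also drop any cells left with no accessible peaks after the peak filter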
cfreqs = Matrix::colMeans(mtx > 0)
mtx = mtx[, cfreqs > 0]

seurat.obj = doBasicSeurat_new(mtx, npc = nREDUCTION, norm_by = norm_by,
top_variable_features = top_variable_features, reg.var = 'nCount_ATAC')
if(REDUCTION != 'lda'){
40 changes: 35 additions & 5 deletions scripts/src/dsAnalysis_utilities.R
@@ -161,15 +161,45 @@ doBasicSeurat_new <- function(mtx, npc = 50, top_variable_features = 0.2,

# top_variable_features -- use the top most variable features
seurat.obj = CreateSeuratObject(mtx, project = project, assay = assay,
names.delim = '-')
names.delim = '-', min.cells = 1,
min.features = 1)

if(norm_by == 'log') seurat.obj@assays$ATAC@data <- log1p(seurat.obj@assays$ATAC@data)/log(2)
if(norm_by == 'tf-idf') seurat.obj@assays$ATAC@data <- TF.IDF(seurat.obj@assays$ATAC@data)
nveg = ifelse(top_variable_features > 1, top_variable_features, floor(nrow(mtx) * top_variable_features))
if(norm_by == 'log') seurat.obj[[assay]]@data <- log1p(seurat.obj[[assay]]@counts)/log(2)
if(norm_by == 'tf-idf') seurat.obj[[assay]]@data <- TF.IDF(seurat.obj[[assay]]@counts)
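## top_variable_features > 1 is treated as an absolute number of features, otherwise as a fraction of all peaks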
nvap = ifelse(top_variable_features > 1, top_variable_features,
floor(nrow(mtx) * top_variable_features))

seurat.obj <- FindVariableFeatures(object = seurat.obj,
selection.method = 'vst',
nfeatures = nveg)
nfeatures = nvap)

## remove variable features only accessible in less than 1% of cells
mtx = seurat.obj[[assay]]@counts
rs = Matrix::rowMeans(mtx > 0)
rare.features = names(which(rs < 0.01))
vaps = VariableFeatures(seurat.obj)
vaps = setdiff(vaps, rare.features)
niter = 0
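## if fewer than 500 variable features survive the rarity filter, iteratively request more from FindVariableFeatures (give up after a few rounds)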
while(length(vaps) < 500 & nvap > 500){
niter = niter + 1
nvap = nvap + 2000
seurat.obj <- FindVariableFeatures(object = seurat.obj,
selection.method = 'vst',
nfeatures = min(nvap, nrow(seurat.obj)))
vaps = VariableFeatures(seurat.obj)
vaps = setdiff(vaps, rare.features)
if(niter > 5) stop('Too many variable features were filtered,
please specify a larger Top_Variable_Features
in the configure file!')
}
VariableFeatures(seurat.obj) <- vaps

## if normalizing by tf-idf, redo the normalization restricted to the selected variable features (vaps)
if(norm_by == 'tf-idf'){
mtx.norm = TF.IDF(mtx[vaps, ])
seurat.obj[[assay]]@data[vaps, ] = mtx.norm
}

seurat.obj <- ScaleData(object = seurat.obj,
features = VariableFeatures(seurat.obj),
vars.to.regress = NULL, do.scale = doScale,
8 changes: 4 additions & 4 deletions scripts/src/integrate_seu.R
@@ -30,7 +30,7 @@ tss_ann <- fread(tss_path, header = F)
names(tss_ann)[c(1:4)] <- c('chr', 'start', 'end', 'gene_name')


## do seurat individually
## run seurat individually
seu.all = mtx.all = list()
len = length(mtx_files)
for(i in 1:length(mtx_files)){
@@ -62,7 +62,7 @@ if(integrate_by == 'seurat'){
features = VariableFeatures(seurat.obj))
seurat.obj <- RunPCA(seurat.obj, npcs = nREDUCTION, verbose = FALSE)
}else{
## use pool and regress method
## pool the matrices first
nf = sapply(mtx.all, nrow)
nc = sapply(mtx.all, ncol)
if(length(unique(nf)) == 1) {
@@ -105,8 +105,8 @@ if(integrate_by == 'VFACS'){
VariableFeatures(seurat.obj) <- sele.features
seurat.obj <- RunPCA(seurat.obj, npcs = nREDUCTION, verbose = F)
seurat.obj <- regress_on_pca(seurat.obj, 'nCount_ATAC')
seurat.obj <- FindNeighbors(seurat.atac, dims = 1:nREDUCTION, reduction = 'pca')
seurat.obj <- FindClusters(seurat.atac, resl = 0.6)
seurat.obj <- FindNeighbors(seurat.obj, dims = 1:nREDUCTION, reduction = 'pca')
seurat.obj <- FindClusters(seurat.obj, resl = 0.6)

}
seurat.obj <- RunUMAP(seurat.obj, reduction = "pca", dims = 1:nREDUCTION)
7 changes: 6 additions & 1 deletion scripts/src/motif_analysis.R
@@ -35,8 +35,13 @@ if(T){

seurat.obj <- FindVariableFeatures(object = seurat.obj,
selection.method = 'vst',
nfeatures = floor(nrow(mtx) * 0.3))
nfeatures = floor(nrow(mtx) * 0.4))
vFeatures = VariableFeatures(seurat.obj)
## further filter peaks: keep only those accessible in more than 0.5% of cells
rs = Matrix::rowSums(mtx > 0)
filter.pks = names(which(rs > (0.005 * ncol(seurat.obj))))
vFeatures = intersect(vFeatures, filter.pks)

rm(seurat.obj)
rnames = rownames(mtx)
mtx = mtx[rnames %in% vFeatures, ]
13 changes: 0 additions & 13 deletions update_note.txt

This file was deleted.
