-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathprocess_single_cell_mm10_20170514.sh
66 lines (48 loc) · 3.31 KB
/
process_single_cell_mm10_20170514.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env bash
scripts_dir='/net/shendure/vol8/projects/BCBL/Mixing/bin'
dropseq_scripts_dir='/net/shendure/vol1/home/abros/bin/Drop-seq_tools-1.12'
dropseq_ref_dir='/net/shendure/vol8/projects/BCBL/Mixing/ref/mus10_ref/mm10'
fastq1=$1
fastq2=$2
output_dir=$3
mkdir $output_dir
# Convert fastq to unaligned bam:
java -jar ${dropseq_scripts_dir}/3rdParty/picard/picard.jar FastqToSam F1=$fastq1 F2=$fastq2 O=${output_dir}/single_cells.bam SM=1
cd ${output_dir}
# Copy header
echo "Trimming Read2"
samtools view -H single_cells.bam > single_cells.trimmed.sam
# Trim read2 to UMI_BC3_BC2_BC1 if bases 81-86=="CATTCG" (no dephasing):
samtools view single_cells.bam | awk 'NR%2==1{a=$0}NR%2==0 && substr($10,81,6)=="CATTCG"{print a"\n"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"substr($10,1,18)substr($10,49,8)substr($10,87,8)"\t"substr($11,1,18)substr($11,49,8)substr($11,87,8)"\t"$12}' >> single_cells.trimmed.sam
# Filter based on quality scores in barcode and UMI (allow only 1 base with phred<20 in each BC and UMI):
echo "Filtering UMIs and BCs based on quality scores"
python ${scripts_dir}/filter_qual_scores_umi.py single_cells.trimmed.sam single_cells_qs_filt.sam
rm single_cells.trimmed.sam
# Tag headers with BC3_BC2_BC1_UMI
echo "Tagging headers with BCs and UMIs"
samtools view -HS single_cells_qs_filt.sam > single_cells_barcoded_head.sam
samtools view -S single_cells_qs_filt.sam | awk 'NR%2==1{a=$0}NR%2==0 {print substr($10,11,24)"_"substr($10,1,10)"_"a}' >> single_cells_barcoded_head.sam
rm single_cells_qs_filt.sam
# Make reads unmated:
echo "Unmating reads"
sed -i -e 's/\t77/\t4/g' single_cells_barcoded_head.sam
# Convert sam to fastq
java -jar ${dropseq_scripts_dir}/3rdParty/picard/picard.jar SamToFastq INPUT=single_cells_barcoded_head.sam FASTQ=single_cells_barcoded_head.fastq
rm single_cells_barcoded_head.sam
# Run the STAR alignment
STAR --genomeDir ${dropseq_ref_dir}/ --runThreadN 8 --readFilesIn single_cells_barcoded_head.fastq --outFileNamePrefix single_cells_barcoded_head
rm single_cells_barcoded_head.fastq
# Sort the aligned bam file
java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx4000m -jar ${dropseq_scripts_dir}/3rdParty/picard/picard.jar SortSam INPUT=single_cells_barcoded_headAligned.out.sam OUTPUT=single_cells_barcoded_headAligned.sorted.bam SORT_ORDER=queryname
rm single_cells_barcoded_headAligned.out.sam
# Tag the reads with the gene and read type (only exonic get tagged with gene)
${dropseq_scripts_dir}/TagReadWithGeneExon O=star_gene_tagged.bam ANNOTATIONS_FILE=${dropseq_ref_dir}/mm10.refFlat TAG=GE CREATE_INDEX=true INPUT=single_cells_barcoded_headAligned.sorted.bam
# Tag the reads with the gene and read type (intronic also get tagged with gene)
${dropseq_scripts_dir}/TagReadWithGeneExon O=star_gene_tagged_full_gene.bam ANNOTATIONS_FILE=${dropseq_ref_dir}/mm10_full_genes.refFlat TAG=GE CREATE_INDEX=true INPUT=single_cells_barcoded_headAligned.sorted.bam
rm single_cells_barcoded_headAligned.sorted.bam
# Get the gene counts
python ${scripts_dir}/gene_counts_unique_align.py star_gene_tagged.bam star_gene_tagged_full_gene.bam
# Make the digital count matrix
python ${scripts_dir}/make_digital_count_matrix.py digital_gene_counts.mat ./
# Count the different read types (exon,intron,utr,intergenic)
python ${scripts_dir}/count_read_types.py star_gene_tagged.bam