Skip to content

Commit

Permalink
Merge pull request #79 from RBL-NCI/activeDev
Browse files Browse the repository at this point in the history
v0.6
  • Loading branch information
slsevilla authored Jun 22, 2021
2 parents 65365f0 + 7efe557 commit fb473c1
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 149 deletions.
7 changes: 3 additions & 4 deletions .tests/snakemake_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@ nt_merge: 50 #minimum distance of nucleotides to merge peaks [10,20,30,40,50,60]
peak_id: "all" #report peaks for unique peaks only or unique and fractional mm ["unique","all"]
DE_method: "manorm" #choose DE method ["manorm","none"]
splice_junction: "Y" #include splice junctions in peak calls ["Y","N"]
SY_flag: "Y" #if mm10, flag to run additional annotations with Soyeong's BED files

#modules, container parameters
container_dir: "/data/RBL_NCI/iCLIP/container"
container_dir: "/data/CCBR_Pipeliner/iCLIP/container"
bedtools: "bedtools/2.29.2"
bowtie2: "bowtie/2-2.3.4"
fastq_screen: "fastq_screen/0.14.0"
Expand All @@ -37,8 +36,8 @@ multiqc: "multiqc/1.9"
novocraft: "novocraft/4.03.01"
perl: "perl/5.24.3"
python: "python/3.7"
Qt: "Qt/5.14.2"
singularity: "singularity/3.7.0"
Qt: "Qt/5.13.2"
singularity: "singularity"
samtools: "samtools/1.11"
umitools: "umitools/1.1.1"
subread: "subread/2.0.1"
Expand Down
20 changes: 10 additions & 10 deletions config/index_config.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
hg38:
std: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_nosplicing.nix'
std: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/hg38.nix'
spliceaware_unmasked:
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_unmaskedexon_46.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_unmaskedexon_71.nix'
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc_46.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc_71.nix'
spliceaware_masked:
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_maskedexon_46.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_maskedexon_71.nix'
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc.maskedexon_46.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc.maskedexon_71.nix'
gencode_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/Gencode_V32/fromGencode/gencode.v32.annotation.gtf.txt'
refseq_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/NCBI_RefSeq/GCF_000001405.39_GRCh38.p13_genomic.gtf.txt'
canonical_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/Gencode_V32/fromUCSC/KnownCanonical/KnownCanonical_GencodeM32_GRCh38.txt'
Expand All @@ -14,13 +14,13 @@ hg38:
sy_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/additional_anno/'
alias_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/hg38.chromAlias.txt'
mm10:
std: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_nosplicing.nix'
std: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10.nix'
spliceaware_unmasked:
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_unmaskedexon_46.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_unmaskedexon_71.nix'
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice50bp_unmasked.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice75bp_unmasked.nix'
spliceaware_masked:
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_maskedexon_46.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_maskedexon_71.nix'
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice50bp_masked.nix'
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice75bp_masked.nix'
gencode_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/Gencode_VM23/fromGencode/gencode.vM23.annotation.gtf.txt'
refseq_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/NCBI_RefSeq/GCF_000001635.26_GRCm38.p6_genomic.gtf.txt'
canonical_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/Gencode_VM23/fromUCSC/KnownCanonical/KnownCanonical_GencodeM23_GRCm38.txt'
Expand Down
2 changes: 1 addition & 1 deletion run_snakemake.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ if [[ $pipeline = "cluster" ]] || [[ $pipeline = "local" ]]; then

#submit jobs to cluster
if [[ $pipeline = "cluster" ]]; then
sbatch --job-name="iCLIP" --gres=lscratch:200 --time=120:00:00 --output=${output_dir}/log/${log_time}_00_%j_%x.out --mail-type=BEGIN,END,FAIL \
sbatch --job-name="iCLIP" --gres=lscratch:200 --time=24:00:00 --output=${output_dir}/log/${log_time}_00_%j_%x.out --mail-type=BEGIN,END,FAIL \
snakemake --use-envmodules --latency-wait 120 -s ${output_dir}/workflow/${log_time}_Snakefile --configfile ${output_dir}/log/${log_time}_00_snakemake_config.yaml \
--printshellcmds --cluster-config ${output_dir}/log/${log_time}_00_cluster_config.yml --keep-going \
--restart-times 1 --cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} \
Expand Down
174 changes: 40 additions & 134 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ S. Sevilla
P. Homan
* Overview *
- Multiplexed samples are split based on provided barcodes and named using provided manifests
- Multiplexed samples are split based on provided barcodes and named using provided manifests, maximum 10 samples
- Adaptors are stripped from samples
- Samples are unzipped and split into smaller fastq files to increase speed
- Samples are aligned using Novoalign
Expand All @@ -14,8 +14,6 @@ P. Homan
* Requirements *
- Read specific input requirements, and execution information on the Wikipage
located at: https://github.com/RBL-NCI/iCLIP.git
Pipeline info: activeDev 05272021
'''

report: "report/workflow.rst"
Expand Down Expand Up @@ -89,7 +87,7 @@ else:
if (splice_aware == 'N'):
align_list = ["unaware"]
else:
align_list = ["unaware","masked","unmasked"]
align_list = ["unmasked"]

###############################################################
# create sample lists
Expand Down Expand Up @@ -279,14 +277,6 @@ def get_align_input(wildcards):
f1 = join(out_dir,'04_sam','03_genomic','{sp}.{al}.split.{n}.sam'),
return(f1)

#determine dedup input based on splice_aware flag
def input_mapq_corrected_bam(wildcards):
if (splice_aware=="N"):
f1 = join(out_dir,'09_dedup','01_bam','{sp}.unaware.dedup.bam'),
else:
f1 = join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),
return(f1)

###############################################################
# main code
###############################################################
Expand Down Expand Up @@ -318,10 +308,14 @@ else:

## if samples are spliced
if splice_aware == 'Y':
input_unmapped = expand(join(out_dir,'04_sam','04_unmapped','{sp}.{al}.complete.bam'), sp=sp_list, al=align_list)

input_splice = [expand(join(out_dir,'10_mapq_score','{sp}.readids.txt'), sp=sp_list),
expand(join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),sp=sp_list),
expand(join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),sp=sp_list)]
else:
input_unmapped = expand(join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'), sp=sp_list, al=align_list),

input_splice = [expand(join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'),sp=sp_list, al=align_list)]

#local rules
Expand Down Expand Up @@ -384,30 +378,27 @@ rule all:
join(out_dir,'qc','qc_report.html'),

#Unmapped read output
expand(join(out_dir,'04_sam','04_unmapped','{sp}.{al}.complete.bam'), sp=sp_list, al=align_list),
input_unmapped,

#Deduplicate
expand(join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam'), sp=sp_list),
expand(join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'), sp=sp_list, al=align_list),

#MapQ recalculation
#input_splice,

# #Bam processing
# expand(join(out_dir,'09_dedup','03_unique','{sp}.dedup.unique.i.bam'), sp=sp_list),
#Bam processing
expand(join(out_dir,'09_dedup','03_unique','{sp}.dedup.unique.i.bam'), sp=sp_list),

# #Bed files
# expand(join(out_dir,'11_bed','{sp}_all.bed'), sp=sp_list),
# expand(join(out_dir,'11_bed','{sp}_unique.bed'), sp=sp_list),
#Bed files
expand(join(out_dir,'11_bed','{sp}_all.bed'), sp=sp_list),
expand(join(out_dir,'11_bed','{sp}_unique.bed'), sp=sp_list),

# #SAF
# expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_all.SAF'), sp=sp_list),
# expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_unique.SAF'), sp=sp_list),
#SAF
expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_all.SAF'), sp=sp_list),
expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_unique.SAF'), sp=sp_list),

# #Count features
# expand(join(out_dir,'14_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'), sp=sp_list),
# expand(join(out_dir,'14_counts','uniquereadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
# expand(join(out_dir,'14_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
# expand(join(out_dir,'14_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
#Count features
expand(join(out_dir,'13_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'), sp=sp_list),
expand(join(out_dir,'13_counts','uniquereadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
expand(join(out_dir,'13_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
expand(join(out_dir,'13_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),

# #In progress
# join(out_dir,'15_annotation', 'project',,'annotations.txt'),
Expand All @@ -418,8 +409,13 @@ rule all:
# #expand(join(out_dir,'14_annotation','{sp}_'+ str(nt_merge) +'_MD15.txt'),zip, mp=mp_list, sp=sp_list),
#expand(join(out_dir,'14_annotation','{sp}_'+ str(nt_merge) +'_MD15.html'),zip, mp=mp_list, sp=sp_list),

include: join(source_dir,"workflow/rules/common.smk")
include: join(source_dir,"workflow/rules/other.smk")
#common and other SMK
if source_dir == "":
include: "rules/common.smk"
include: "rules/other.smk"
else:
include: join(source_dir,"workflow/rules/common.smk")
include: join(source_dir,"workflow/rules/other.smk")

###############################################################
# snakemake rules
Expand Down Expand Up @@ -1203,18 +1199,18 @@ else:

rule dedup:
"""
deduplicate merged.i.bam files
deduplicate
"""
input:
f1 = join(out_dir,'08_bam_merged','{sp}.unmasked.merged.si.bam'),
f1 = join(out_dir,'08_bam_merged','{sp}.{al}.merged.si.bam'),
params:
rname='23_dedup',
umi = umi_parameter
envmodules:
config['umitools']
output:
o1 = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam'),
o2 = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.log'),
o1 = join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'),
o2 = join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.log'),
shell:
"""
umi_tools dedup \
Expand All @@ -1224,102 +1220,12 @@ rule dedup:
--log2stderr -L {output.o2};
"""

#pipeline splits for mapq score recalculation on splice_aware samples
if (splice_aware == 'Y'):
rule generate_readids:
"""
generate readids from unmasked files
"""
input:
f1 = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam')
params:
rname='24a_readids',
envmodules:
config['samtools']
output:
o1 = join(out_dir,'10_mapq_score','{sp}.readids.txt'),
shell:
"""
samtools view {input.f1}|cut -f1|sort|uniq > {output.o1}
"""

rule subsample_reads:
"""
subsample:
A) splice unaware BAM
B) splice aware (masked exon) BAM
using the readids from splice aware (unmasked exon) in rule generate_readids
"""
input:
id = join(out_dir,'10_mapq_score','{sp}.readids.txt'),
unaware = join(out_dir,'08_bam_merged','{sp}.unaware.merged.si.bam'),
masked = join(out_dir,'08_bam_merged','{sp}.masked.merged.si.bam'),
params:
rname='24b_subsample',
script = join(source_dir,'workflow','scripts','06_filter_bam_by_readids.py'),
envmodules:
config['python']
output:
unaware = join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),
masked = join(out_dir,'10_mapq_score','{sp}.masked.subset.bam'),
shell:
"""
python {params.script} --inputBAM {input.unaware} --outputBAM {output.unaware} --readids {input.id};
python {params.script} --inputBAM {input.masked} --outputBAM {output.masked} --readids {input.id}
"""

rule mapq_recalc:
"""
input deduplicate (C) BAM file and the 2 BAMs subset A) and subset B) pysam script for MAPQ
correction - outputs D) updated mapq score bam file
"""
input:
unaware = join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),
masked = join(out_dir,'10_mapq_score','{sp}.masked.subset.bam'),
unmasked = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam'),
params:
rname='24c_recalc',
script = join(source_dir,'workflow','scripts','07_correct_mapq.py'),
unaware = join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),
masked = join(out_dir,'10_mapq_score','{sp}.masked.subset.bam'),
envmodules:
config['python']
output:
bam = join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),
tsv = join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.tsv'),
shell:
"""
python {params.script} \
--inputBAM1 {input.unaware} --inputBAM2 {input.masked} --inputBAM3 {input.unmasked} \
--outBAM {output.bam} --out {output.tsv}
"""

rule mapq_stats:
"""
TODO
**Use D) for visualization
"""
input:
f1 = expand(join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),sp=sp_list),
params:
rname='24c_mapq_stats',
script = join(source_dir,'workflow','scripts','.py'),
envmodules:
config['R']
output:
o1 = join(out_dir,'10_mapq_score','report.pdf'),
shell:
"""
Rscript {params.script} {input.f1} {output.o1}
"""

rule sort_index_dedup:
"""
sort dedup.bam file
"""
input:
f1 = input_mapq_corrected_bam
f1 = expand(join(out_dir,'09_dedup','01_bam','{{sp}}.{al}.dedup.bam'), al=align_list),
params:
rname='25_si_dedup',
envmodules:
Expand Down Expand Up @@ -1456,8 +1362,8 @@ rule feature_counts_allreads:
envmodules:
config['subread']
output:
out_unique = join(out_dir,'14_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
out_all = join(out_dir,'14_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
out_unique = join(out_dir,'13_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
out_all = join(out_dir,'13_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
shell:
"""
featureCounts -F SAF \
Expand Down Expand Up @@ -1498,8 +1404,8 @@ rule feature_counts_uniquereads:
envmodules:
config['subread']
output:
out_unique = join(out_dir,'14_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
out_all = join(out_dir,'14_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
out_unique = join(out_dir,'13_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
out_all = join(out_dir,'13_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
shell:
"""
featureCounts -F SAF \
Expand Down Expand Up @@ -1530,7 +1436,7 @@ rule feature_counts_uniquereads:
# generate annotation table once per project
# """
# input:
# expand(join(out_dir,'14_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
# expand(join(out_dir,'13_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
# params:
# rname='36_proj_anno',
# script = join(source_dir,'workflow','scripts','06_annotation.R'),
Expand Down Expand Up @@ -1578,8 +1484,8 @@ rule feature_counts_uniquereads:

# '''
# input:
# unique = join(out_dir,'14_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'),
# all = join(out_dir,'14_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'),
# unique = join(out_dir,'13_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'),
# all = join(out_dir,'13_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'),
# anno = join(out_dir,'15_annotation', 'project','annotations.txt'),
# params:
# rname = '37_peak_anno',
Expand Down

0 comments on commit fb473c1

Please sign in to comment.