-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
127 lines (115 loc) · 4.12 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
from gimmemotifs.config import CACHE_DIR
import os
configfile: "config.yaml"
IDS, = glob_wildcards(expand("{peak_dir}", peak_dir=config["peak_dir"])[0] + "/{id}.bed.gz")
DBS, = glob_wildcards(expand("{pfm_dir}", pfm_dir=config["pfm_dir"])[0] + "/{db}.pfm")
GC_INDEX = os.path.join(CACHE_DIR, "{}.gcfreq.100.feather".format(os.path.basename(config["genome"])))
localrules: all, final_tables, final_roc, fg, bg, figures, gc_index, check_db
rule all:
input:
"out/final.ROC_AUC.table.txt",
# "out/final.PR_AUC.table.txt",
"out/final.Enr_1FPR.table.txt",
"out/final.Recall_10FDR.table.txt",
"out/figure1a.png",
"out/figure1b.png",
rule figures:
input:
"out/final.Recall_10FDR.table.txt",
"out/final.ROC_AUC.table.txt",
# "out/final.PR_AUC.table.txt",
output: "out/figure1a.png", "out/figure1b.png"
conda: "envs/plot.yaml"
shell: """
python scripts/figure1_comparison_motif_databases.py {config[reference]} out/
"""
rule final_tables:
input:
expand("out/roc/{db}.{{metric}}.table.txt", db=DBS)
output:
"out/final.{metric}.table.txt"
run:
df = pd.read_csv(input[0], index_col=0, sep="\t")
for x in input[1:]:
df = df.join(pd.read_csv(x, index_col=0, sep="\t"))
df.to_csv(output[0], sep="\t")
rule gc_index:
output:
GC_INDEX
threads: 1
conda: "envs/gimme.yaml"
shell:
"""
python scripts/create_gc_index.py {config[genome]}
"""
rule final_roc:
input:
expand("out/roc/{id}_roc.{{db}}/gimme.roc.report.txt", id=IDS)
output:
"out/roc/{db}.{metric}.table.txt"
run:
remap = {
"ROC AUC":"ROC_AUC",
"PR AUC":"PR_AUC",
'Recall at 10% FDR':'Recall_10FDR',
'Enr. at 1% FPR':'Enr_1FPR'
}
with open(output[0], "w") as f:
f.write("\t{}\n".format(wildcards.db))
for fname in input:
name = os.path.split(os.path.split(fname)[0])[-1].split("_")[0]
df = pd.read_csv(fname,
index_col=0, sep="\t").rename(columns=remap)
m = df.max()[wildcards.metric]
f.write("{}\t{}\n".format(name, m))
rule check_db:
input:
GC_INDEX,
expand("{pfm_dir}/{{db}}.pfm", pfm_dir=config["pfm_dir"])[0]
output: "out/{db}.check"
threads: 24
resources:
mem_mb=4000
conda: "envs/gimme.yaml"
shell: """
gimme background out/{wildcards.db}.check.fa genomic -g {config[genome]} -n 10 -l 200
gimme scan out/{wildcards.db}.check.fa -g {config[genome]} --zscore --gc -T > {output}
"""
rule roc:
input:
fg = "out/fa/{id}.fa",
bg = "out/fa/{id}.bg.fa",
pfm = expand("{pfm_dir}/{{db}}.pfm",pfm_dir=config["pfm_dir"])[0],
index = GC_INDEX,
check = "out/{db}.check"
output:
"out/roc/{id}_roc.{db}/gimme.roc.report.txt"
threads: 24
conda: "envs/gimme.yaml"
shell: """
set +u
echo 'starting gimme roc {input.fg} {input.pfm}'
gimme roc {input.fg} out/roc/{wildcards.id}_roc.{wildcards.db} -b {input.bg} -p {input.pfm} -N 24 --known -g {config[genome]} --noreport
set -u
"""
rule fg:
input: expand("{peak_dir}/{{id}}.bed.gz", peak_dir=config["peak_dir"])
output: "out/fa/{id}.fa"
conda: "envs/bedtools.yaml"
shell: """
zcat {input} | shuf | tail -n {config[maxpeaks]} | sort -k1,1 -k2g,2 | bedtools getfasta -fi {config[genome]} -fo {output} -bed -
"""
rule bg:
input:
fa = "out/fa/{id}.fa",
bed = expand("{peak_dir}/{{id}}.bed.gz", peak_dir=config["peak_dir"]),
index = GC_INDEX
output: "out/fa/{id}.bg.fa"
conda: "envs/gimme.yaml"
resources:
mem_mb=4000
shell: """
gimme background {output}.bed gc -i {input.fa} -n {config[maxpeaks]} -g {config[genome]} -f BED
bedtools intersect -a {output}.bed -b {input.bed} -v | head -n {config[maxpeaks]} | bedtools getfasta -fi {config[genome]} -fo {output} -bed -
"""