motif_database_comparison/Snakefile

import pandas as pd
from gimmemotifs.config import CACHE_DIR
import os

configfile: "config.yaml"

IDS, = glob_wildcards(expand("{peak_dir}", peak_dir=config["peak_dir"])[0] + "/{id}.bed.gz")
DBS, = glob_wildcards(expand("{pfm_dir}", pfm_dir=config["pfm_dir"])[0] + "/{db}.pfm")

GC_INDEX =  os.path.join(CACHE_DIR, "{}.gcfreq.100.feather".format(os.path.basename(config["genome"])))

localrules: all, final_tables, final_roc, fg, bg, figures, gc_index, check_db

rule all:
    input: 
        "out/final.ROC_AUC.table.txt",
#        "out/final.PR_AUC.table.txt",
        "out/final.Enr_1FPR.table.txt",
        "out/final.Recall_10FDR.table.txt",
        "out/figure1a.png",
        "out/figure1b.png",

rule figures:
    input:  
        "out/final.Recall_10FDR.table.txt", 
        "out/final.ROC_AUC.table.txt",
#        "out/final.PR_AUC.table.txt",
    output: "out/figure1a.png", "out/figure1b.png"
    conda: "envs/plot.yaml"
    shell: """
           python scripts/figure1_comparison_motif_databases.py {config[reference]} out/ 
           """
rule final_tables:
    input:
        expand("out/roc/{db}.{{metric}}.table.txt", db=DBS)
    output:
        "out/final.{metric}.table.txt"
    run:
        df = pd.read_csv(input[0], index_col=0, sep="\t")
        for x in input[1:]:
            df = df.join(pd.read_csv(x, index_col=0, sep="\t"))
        df.to_csv(output[0], sep="\t")

rule gc_index:
    output:
         GC_INDEX
    threads: 1
    conda: "envs/gimme.yaml"
    shell:
        """
        python scripts/create_gc_index.py {config[genome]}
        """
    
rule final_roc:
    input:
        expand("out/roc/{id}_roc.{{db}}/gimme.roc.report.txt", id=IDS)
    output:
        "out/roc/{db}.{metric}.table.txt"
    run:
        remap = {
                    "ROC AUC":"ROC_AUC", 
                    "PR AUC":"PR_AUC", 
                    'Recall at 10% FDR':'Recall_10FDR', 
                    'Enr. at 1% FPR':'Enr_1FPR'
                    }
        with open(output[0], "w") as f:
            f.write("\t{}\n".format(wildcards.db))
            for fname in input:
                name = os.path.split(os.path.split(fname)[0])[-1].split("_")[0]
                df = pd.read_csv(fname,
                    index_col=0, sep="\t").rename(columns=remap)
                m = df.max()[wildcards.metric]
                f.write("{}\t{}\n".format(name, m))
rule check_db:
    input: 
        GC_INDEX,
        expand("{pfm_dir}/{{db}}.pfm", pfm_dir=config["pfm_dir"])[0]
    output: "out/{db}.check"
    threads: 24
    resources:
         mem_mb=4000
    conda: "envs/gimme.yaml"
    shell: """
    gimme background out/{wildcards.db}.check.fa genomic -g {config[genome]} -n 10 -l 200
    gimme scan out/{wildcards.db}.check.fa -g {config[genome]} --zscore --gc -T  > {output} 
    """

rule roc:
    input:
        fg = "out/fa/{id}.fa",
        bg = "out/fa/{id}.bg.fa",
        pfm = expand("{pfm_dir}/{{db}}.pfm",pfm_dir=config["pfm_dir"])[0],
        index = GC_INDEX,
        check = "out/{db}.check"
    output:
        "out/roc/{id}_roc.{db}/gimme.roc.report.txt"
    threads: 24
    conda: "envs/gimme.yaml"
    shell: """
    set +u
    echo 'starting gimme roc {input.fg} {input.pfm}'
    gimme roc {input.fg} out/roc/{wildcards.id}_roc.{wildcards.db} -b {input.bg} -p {input.pfm} -N 24 --known -g {config[genome]} --noreport
    set -u
    """

rule fg:
   input:  expand("{peak_dir}/{{id}}.bed.gz", peak_dir=config["peak_dir"])
   output: "out/fa/{id}.fa"
   conda: "envs/bedtools.yaml"
   shell:  """
       zcat {input} | shuf | tail -n {config[maxpeaks]} | sort -k1,1 -k2g,2 | bedtools getfasta -fi {config[genome]} -fo {output} -bed -
    """

rule bg:
    input:  
        fa = "out/fa/{id}.fa",
        bed = expand("{peak_dir}/{{id}}.bed.gz", peak_dir=config["peak_dir"]),
        index = GC_INDEX
    output: "out/fa/{id}.bg.fa"
    conda: "envs/gimme.yaml"
    resources:
         mem_mb=4000
    shell:  """
        gimme background {output}.bed gc -i {input.fa} -n {config[maxpeaks]} -g {config[genome]} -f BED
        bedtools intersect -a {output}.bed -b {input.bed} -v | head -n {config[maxpeaks]} | bedtools getfasta -fi {config[genome]} -fo {output} -bed -
    """