Skip to content

Commit

Permalink
Merge pull request #1 from RIVM-bioinformatics/create_rule
Browse files Browse the repository at this point in the history
Add PopPUNK
  • Loading branch information
wolthuisr authored Nov 16, 2022
2 parents 567bed3 + 081ffa4 commit 7337b16
Show file tree
Hide file tree
Showing 21 changed files with 191 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
id: release
with:
release-type: python # just keep a changelog, no version anywhere outside of git tags
package-name: juno_template
package-name: juno_population
lint:
name: Lint Code Base
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,5 @@ dmypy.json
envs/src
input
output
config/sample_sheet.yaml
config/user_parameters.yaml
5 changes: 3 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "juno-library"]
[submodule "base_juno_pipeline"]
path = base_juno_pipeline
url = https://github.com/RIVM-bioinformatics/base_juno_pipeline.git
url = https://github.com/RIVM-bioinformatics/juno-library.git
branch = v0.9.2
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Juno-Template
# Juno-Population
- [ ] TODO: Write readme
A template pipeline on which the other Juno pipelines are based.

Before running the pipeline be sure to initialize the submodules:
```bash
git submodule update --init --recursive
```
## Contribution guidelines
Juno pipelines use a [feature branch workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow). To work on features, create a branch from the `main` branch to make changes to. This branch can be merged to the main branch via a pull request. Hotfixes for bugs can be committed to the `main` branch.

Expand Down
11 changes: 6 additions & 5 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import yaml


sample_sheet=config["sample_sheet"]
sample_sheet = config["sample_sheet"]
with open(sample_sheet) as f:
SAMPLES = yaml.safe_load(f)

print(SAMPLES)

OUT = config["out"]


localrules:
all,


include: "workflow/rules/rule.smk"
include: "workflow/rules/aggregatePoppunkCsv.smk"
include: "workflow/rules/createQfileFasta.smk"
include: "workflow/rules/PopPUNK.smk"


rule all:
input:
expand(OUT + "/{sample}_combined.fastq", sample=SAMPLES),
expand(OUT + "/poppunk_clusters.csv"),
8 changes: 6 additions & 2 deletions config/pipeline_parameters.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
threads:
template_rule: 1
create_Qfile: 1
fasta_popPUNK: 8
aggregatePoppunkCsv: 1

mem_gb:
template_rule: 1
create_Qfile: 1
fasta_popPUNK: 1
aggregatePoppunkCsv: 1
3 changes: 0 additions & 3 deletions config/sample_sheet.yaml

This file was deleted.

2 changes: 0 additions & 2 deletions config/user_parameters.yaml

This file was deleted.

7 changes: 7 additions & 0 deletions database_locations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import pathlib

# Maps a species identifier (lowercase, underscore-separated — the value
# passed to the pipeline via -s/--species) to the on-disk location of its
# pre-built PopPUNK reference database. Used as a fallback when no explicit
# -b/--database path is given.
species_database_locations = {
    "streptococcus_pneumoniae": pathlib.Path(
        "/mnt/db/juno/poppunk/streptococcus/GPS_v4_references"
    ),
}
2 changes: 1 addition & 1 deletion envs/mamba.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ name: mamba
channels:
- conda-forge
dependencies:
- mamba
- mamba==0.27
11 changes: 11 additions & 0 deletions envs/population_master.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: juno_population
channels:
- bioconda
- conda-forge
- anaconda
- defaults
dependencies:
- git
- mamba==0.27
- pandas
- snakemake
14 changes: 0 additions & 14 deletions envs/template_master.yaml

This file was deleted.

60 changes: 56 additions & 4 deletions template.py → population.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pathlib
from pickle import DUP
import yaml
import argparse
import sys
Expand All @@ -10,16 +11,21 @@
helper_functions,
)

from database_locations import species_database_locations

class TemplateRun(PipelineStartup, RunSnakemake):

class PopulationRun(PipelineStartup, RunSnakemake):
def __init__(
self,
input_dir,
output_dir,
input_type="fastq",
species=None,
db_dir=None,
input_type="fasta",
unlock=False,
rerunincomplete=False,
dryrun=False,
queue="bio",
**kwargs
):
PipelineStartup.__init__(
Expand All @@ -29,22 +35,41 @@ def __init__(
)
RunSnakemake.__init__(
self,
pipeline_name="template",
pipeline_name="population",
pipeline_version="0.1.0",
output_dir=output_dir,
workdir=pathlib.Path(__file__).parent.resolve(),
queue=queue,
unlock=unlock,
rerunincomplete=rerunincomplete,
dryrun=dryrun,
**kwargs,
)

# Specific Juno-Population pipeline attributes
if not db_dir:
self.db_dir = species_database_locations.get(species)
if not self.db_dir:
raise KeyError(
"Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. Manually provide a db_dir via -b/--database, or ask for your species to be configured."
)

self.user_parameters = pathlib.Path("config/user_parameters.yaml")

# Start pipeline
self.start_juno_pipeline()

# Create user_parameters.yaml and sample_sheet.yaml files
self.config_params = {
"input_dir": str(self.input_dir),
"out": str(self.output_dir),
"db_dir": str(self.db_dir),
}
with open(self.user_parameters, "w") as f:
yaml.dump(self.config_params, f, default_flow_style=False)

with open(self.sample_sheet, "w") as f:
yaml.dump(self.sample_dict, f, default_flow_style=False)
self.run_snakemake()


Expand All @@ -68,12 +93,36 @@ def __init__(
default="output",
help="Relative or absolute path to the output directory. If non is given, an 'output' directory will be created in the current directory.",
)
parser.add_argument(
"-s",
"--species",
default=None,
required=False,
help="The species name, use an underscore instead of a space (e.g. streptococcus_pneumoniae). Check the publicly available popPUNK databases on www.poppunk.net/pages/databases.html",
)
parser.add_argument(
"-b",
"--database",
default=None,
required=False,
help="The path to the popPUNK database to use. This overrides information provide with the --species argument.",
)
parser.add_argument(
"-l",
"--local",
action="store_true",
help="Running pipeline locally (instead of in a computer cluster). Default is running it in a cluster.",
)
parser.add_argument(
"-q",
"--queue",
type=str,
required=False,
default="bio",
metavar="STR",
dest="queue",
help="Name of the queue that the job will be sumitted to if working on a cluster.",
)
# Snakemake arguments
parser.add_argument(
"-u",
Expand All @@ -100,10 +149,13 @@ def __init__(
help="Extra arguments to be passed to snakemake API (https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html).",
)
args = parser.parse_args()
TemplateRun(
PopulationRun(
input_dir=args.input,
output_dir=args.output,
species=args.species,
db_dir=args.database,
local=args.local,
queue=args.queue,
unlock=args.unlock,
rerunincomplete=args.rerunincomplete,
dryrun=args.dryrun,
Expand Down
27 changes: 18 additions & 9 deletions run_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,37 @@ set -euo pipefail

#----------------------------------------------#
# User parameters
if [ ! -z "${1}" ] || [ ! -z "${2}" ] #|| [ ! -z "${irods_input_projectID}" ]
if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ]
then
input_dir="${1}"
output_dir="${2}"
# PROJECT_NAME="${irods_input_projectID}"
input_dir="${1}"
output_dir="${2}"
PROJECT_NAME="${irods_input_projectID}"
else
echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)."
exit 1
fi

if [ ! -d "${input_dir}" ] || [ ! -d "${output_dir}" ]
then
echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist"
exit 1
echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist"
exit 1
else
input_fastq="${input_dir}/clean_fastq"
input_fastq="${input_dir}/clean_fastq"
fi

case $PROJECT_NAME in

rvp_spn)
GENUS_ALL="streptococcus_pneumoniae";;
*)
GENUS_ALL="other";;

esac

#----------------------------------------------#
# Create/update necessary environments
PATH_MAMBA_YAML="envs/mamba.yaml"
PATH_MASTER_YAML="envs/template_master.yaml"
PATH_MASTER_YAML="envs/population_master.yaml"
MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')

Expand Down Expand Up @@ -56,7 +65,7 @@ fi

set -euo pipefail

python template.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}"
python population.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}"

result=$?

Expand Down
Empty file removed workflow/envs/env.yaml
Empty file.
11 changes: 11 additions & 0 deletions workflow/envs/poppunk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: poppunk
channels:
- bioconda
- conda-forge
- anaconda
- defaults
dependencies:
- popPUNK
# Joblib 1.2.0 breaks HDBscan clustering that is used by popPUNK.
# Temporarily pin to v1.1, beware of vulnerability that triggered release of joblib v1.2 https://nvd.nist.gov/vuln/detail/CVE-2022-21797
- joblib==1.1.0
29 changes: 29 additions & 0 deletions workflow/rules/PopPUNK.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Assign a single sample's assembly to a PopPUNK cluster by querying a
# pre-built reference database (config["db_dir"]).
rule assign_popPUNK_cluster:
    input:
        # Per-sample query file: one "<sampleID>\t<fasta path>" line
        # (produced by rule create_Qfile_fasta).
        OUT + "/q_files/{sample}_qfile.txt",
    output:
        # poppunk_assign writes all of its result files into this directory.
        output_dir=directory(OUT + "/results_per_sample/{sample}_poppunk/"),
        # NOTE(review): the individual file names below assume poppunk_assign
        # names its outputs after the basename of --output — confirm against
        # the poppunk_assign version pinned in ../envs/poppunk.yaml.
        output_csv=OUT
        + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv",
        output_pkl=OUT
        + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl",
        output_npy=OUT
        + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy",
        output_h5=OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5",
    log:
        OUT + "/log/{sample}_poppunk.log",
    conda:
        "../envs/poppunk.yaml"
    message:
        "Running popPUNK clustering"
    params:
        db_dir=config["db_dir"],
    resources:
        mem_gb=config["mem_gb"]["fasta_popPUNK"],
    threads: config["threads"]["fasta_popPUNK"]
    shell:
        # Only stderr is captured in the log; stdout goes to the console.
        """
        poppunk_assign \
        --db {params.db_dir} \
        --threads {threads} --query {input} --output {output.output_dir} 2> {log}
        """
20 changes: 20 additions & 0 deletions workflow/rules/aggregatePoppunkCsv.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Concatenate the per-sample PopPUNK cluster CSVs into one pipeline-level
# summary CSV (the final target of rule all).
rule aggregate_poppunk_csv:
    input:
        expand(
            OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv",
            sample=SAMPLES,
        ),
    output:
        OUT + "/poppunk_clusters.csv",
    log:
        # NOTE(review): declared but never written to by the run block below.
        OUT + "/log/summarize.log",
    message:
        "Merging individual popPUNK output to one csv."
    resources:
        mem_gb=config["mem_gb"]["aggregatePoppunkCsv"],
    threads: config["threads"]["aggregatePoppunkCsv"]
    run:
        import pandas as pd

        # Row-wise concatenation of all per-sample CSVs; ignore_index renumbers
        # the rows continuously across samples.
        aggregated_csv = pd.concat([pd.read_csv(f) for f in input], ignore_index=True)
        # NOTE(review): to_csv also writes the integer index as an unnamed
        # first column — confirm downstream consumers expect that, otherwise
        # pass index=False.
        aggregated_csv.to_csv(output[0])
14 changes: 14 additions & 0 deletions workflow/rules/createQfileFasta.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rule create_Qfile_fasta:
    """Create popPUNKs required query file, a textfile containing sampleID and location of fasta"""
    # TODO: popPUNK can also work from fastqs. This approach completely ignores this.
    input:
        # Path to this sample's assembly, looked up in the sample sheet dict.
        lambda wc: SAMPLES[wc.sample]["assembly"],
    output:
        OUT + "/q_files/{sample}_qfile.txt",
    resources:
        mem_gb=config["mem_gb"]["create_Qfile"],
    threads: config["threads"]["create_Qfile"]
    shell:
        # Writes one tab-separated line: "<sampleID>\t<absolute fasta path>".
        # NOTE(review): basename only strips a literal ".fasta" suffix —
        # assemblies named *.fa / *.fna would keep their extension in the
        # sample ID; confirm the input naming convention.
        """
        printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output}
        """
17 changes: 0 additions & 17 deletions workflow/rules/rule.smk

This file was deleted.

Loading

0 comments on commit 7337b16

Please sign in to comment.