From 39eb86d91199c78920b61a03704d48d63eb55723 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 19 Oct 2022 10:32:53 +0200 Subject: [PATCH 01/23] Initial commit --- .github/workflows/release.yml | 2 +- .gitignore | 1 + README.md | 3 ++- Snakefile | 2 -- config/sample_sheet.yaml | 3 --- envs/{template_master.yaml => population_master.yaml} | 2 +- template.py => population.py | 11 ++++++++--- run_pipeline.sh | 4 ++-- 8 files changed, 15 insertions(+), 13 deletions(-) delete mode 100644 config/sample_sheet.yaml rename envs/{template_master.yaml => population_master.yaml} (90%) rename template.py => population.py (92%) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ff16024..373d5c1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,7 +15,7 @@ jobs: id: release with: release-type: python # just keep a changelog, no version anywhere outside of git tags - package-name: juno_template + package-name: juno_population lint: name: Lint Code Base runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 7b630f9..c769264 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,4 @@ dmypy.json envs/src input output +config/sample_sheet.yaml diff --git a/README.md b/README.md index a4802bc..4e41e80 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# Juno-Template +# Juno-Population +- [ ] TODO: Write readme A template pipeline where the other juno pipelines are based on. ## Contribution guidelines diff --git a/Snakefile b/Snakefile index 9b45640..9f4c43b 100644 --- a/Snakefile +++ b/Snakefile @@ -5,8 +5,6 @@ sample_sheet=config["sample_sheet"] with open(sample_sheet) as f: SAMPLES = yaml.safe_load(f) -print(SAMPLES) - OUT = config["out"] localrules: diff --git a/config/sample_sheet.yaml b/config/sample_sheet.yaml deleted file mode 100644 index b0a7513..0000000 --- a/config/sample_sheet.yaml +++ /dev/null @@ -1,3 +0,0 @@ -'1': - R1: 'input/1_R1.fastq' - R2: 'input/1_R2.fastq' diff --git a/envs/template_master.yaml b/envs/population_master.yaml similarity index 90% rename from envs/template_master.yaml rename to envs/population_master.yaml index 7606d21..f669f93 100644 --- a/envs/template_master.yaml +++ b/envs/population_master.yaml @@ -1,4 +1,4 @@ -name: template_master +name: population_master channels: - bioconda - conda-forge diff --git a/template.py b/population.py similarity index 92% rename from template.py rename to population.py index a7bee73..a6b212b 100644 --- a/template.py +++ b/population.py @@ -1,4 +1,5 @@ import pathlib +from pickle import DUP import yaml import argparse import sys @@ -11,7 +12,7 @@ ) -class TemplateRun(PipelineStartup, RunSnakemake): +class PopulationRun(PipelineStartup, RunSnakemake): def __init__( self, input_dir, @@ -29,7 +30,7 @@ def __init__( ) RunSnakemake.__init__( self, - pipeline_name="template", + pipeline_name="population", pipeline_version="0.1.0", output_dir=output_dir, workdir=pathlib.Path(__file__).parent.resolve(), @@ -45,6 +46,10 @@ def __init__( } with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) + + # print(self.sample_dict) + with open(self.sample_sheet, 'w') as f: + yaml.dump(self.sample_dict, f, default_flow_style=False) self.run_snakemake() @@ -100,7 +105,7 @@ def __init__( help="Extra arguments to be passed to snakemake API (https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html).", ) args = parser.parse_args() - TemplateRun( + PopulationRun( input_dir=args.input, output_dir=args.output, 
         local=args.local,
diff --git a/run_pipeline.sh b/run_pipeline.sh
index 56cb9f6..2edb207 100755
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -25,7 +25,7 @@ fi
 #----------------------------------------------#
 # Create/update necessary environments
 PATH_MAMBA_YAML="envs/mamba.yaml"
-PATH_MASTER_YAML="envs/template_master.yaml"
+PATH_MASTER_YAML="envs/population_master.yaml"
 MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
 MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')
 
@@ -56,7 +56,7 @@ fi
 
 set -euo pipefail
 
-python template.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}"
+python population.py -i "${input_dir}" -o "${output_dir}"
 
 result=$?

From e1a6d4d3e5ed20e063900e66dfe32d18eff55f7a Mon Sep 17 00:00:00 2001
From: Linda Visser
Date: Mon, 24 Oct 2022 20:35:23 +0200
Subject: [PATCH 02/23] feat:Create first version of rule for Q-file

---
 Snakefile                      |  3 ++-
 population.py                  |  2 +-
 workflow/rules/createQfile.smk | 13 +++++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 workflow/rules/createQfile.smk

diff --git a/Snakefile b/Snakefile
index 9f4c43b..977886b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -12,8 +12,9 @@ localrules:
 
 
 include: "workflow/rules/rule.smk"
+include: "workflow/rules/createQfile.smk"
 
 
 rule all:
     input:
-        expand(OUT + "/{sample}_combined.fastq", sample=SAMPLES),
+        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES),
diff --git a/population.py b/population.py
index a6b212b..c316175 100644
--- a/population.py
+++ b/population.py
@@ -17,7 +17,7 @@ def __init__(
         self,
         input_dir,
         output_dir,
-        input_type="fastq",
+        input_type="both",
         unlock=False,
         rerunincomplete=False,
         dryrun=False,
diff --git a/workflow/rules/createQfile.smk b/workflow/rules/createQfile.smk
new file mode 100644
index 0000000..58d5d97
--- /dev/null
+++ b/workflow/rules/createQfile.smk
@@ -0,0 +1,13 @@
+rule createQfile:
+    """Create popPUNKs required query file, a textfile containing sampleID and location of fasta"""
+    # TODO: popPUNK can also work from fastqs. This approach completely ignores this.
+    input:
+        lambda wc: SAMPLES[wc.sample]["assembly"]
+    output:
+        OUT + "/{sample}_qfile.txt"
+    resources:
+        mem_gb=config["mem_gb"]["template_rule"],
+    threads: config["threads"]["template_rule"]
+    shell:"""
+    printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output}
+    """
\ No newline at end of file

From 131de0379f8796c856a035ddc07706ebb14d2843 Mon Sep 17 00:00:00 2001
From: Linda Visser
Date: Wed, 26 Oct 2022 15:27:23 +0200
Subject: [PATCH 03/23] feat:Create hardcoded version of popPUNK rule

---
 Snakefile                       |  3 ++-
 config/pipeline_parameters.yaml |  4 ++++
 envs/mamba.yaml                 |  2 +-
 envs/poppunk.yaml               | 11 +++++++++++
 envs/population_master.yaml     |  2 +-
 run_pipeline.sh                 |  3 +++
 workflow/rules/createQfile.smk  |  6 +++---
 workflow/rules/fastaPopPUNK.smk | 17 +++++++++++++++
 8 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 envs/poppunk.yaml
 create mode 100644 workflow/rules/fastaPopPUNK.smk

diff --git a/Snakefile b/Snakefile
index 977886b..430f759 100644
--- a/Snakefile
+++ b/Snakefile
@@ -13,8 +13,9 @@ localrules:
 
 include: "workflow/rules/rule.smk"
 include: "workflow/rules/createQfile.smk"
+include: "workflow/rules/fastaPopPUNK.smk"
 
 
 rule all:
     input:
-        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES),
+        expand(OUT + "/{sample}_poppunk/", sample=SAMPLES),
diff --git a/config/pipeline_parameters.yaml b/config/pipeline_parameters.yaml
index d240a8f..0b6a093 100644
--- a/config/pipeline_parameters.yaml
+++ b/config/pipeline_parameters.yaml
@@ -1,5 +1,9 @@
 threads:
   template_rule: 1
+  create_Qfile: 1
+  fasta_popPUNK: 8
 
 mem_gb:
   template_rule: 1
+  create_Qfile: 1
+  fasta_popPUNK: 1
diff --git a/envs/mamba.yaml b/envs/mamba.yaml
index 57a4e0e..be30445 100644
--- a/envs/mamba.yaml
+++ b/envs/mamba.yaml
@@ -2,4 +2,4 @@ name: mamba
 channels:
   - conda-forge
 dependencies:
-  - mamba
+  - mamba==0.27
diff --git a/envs/poppunk.yaml b/envs/poppunk.yaml
new file mode 100644
index 0000000..dbed9fc
--- /dev/null
+++ b/envs/poppunk.yaml
@@ -0,0 +1,11 @@
+name: poppunk_test
+channels:
+  - bioconda
+  - conda-forge
+  - anaconda
+  - defaults
+dependencies:
+  - popPUNK
+  # Joblib 1.2.0 breaks HDBscan clustering that is used by popPUNK.
+  # Temporarily pin to v1.1, beware of vulnerability that triggered release of joblib v1.2 https://nvd.nist.gov/vuln/detail/CVE-2022-21797
+  - joblib==1.1.0
diff --git a/envs/population_master.yaml b/envs/population_master.yaml
index f669f93..484d607 100644
--- a/envs/population_master.yaml
+++ b/envs/population_master.yaml
@@ -6,7 +6,7 @@ channels:
   - defaults
 dependencies:
   - git
-  - mamba
+  - mamba==0.27
   - pandas
   - snakemake
   - pip
diff --git a/run_pipeline.sh b/run_pipeline.sh
index 2edb207..763e2b6 100755
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -26,8 +26,10 @@ fi
 # Create/update necessary environments
 PATH_MAMBA_YAML="envs/mamba.yaml"
 PATH_MASTER_YAML="envs/population_master.yaml"
+PATH_POPPUNK_YAML="envs/poppunk.yaml"
 MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
 MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')
+POPPUNK_NAME=$(head -n 1 ${PATH_POPPUNK_YAML} | cut -f2 -d ' ')
 
 echo -e "\nUpdating necessary environments to run the pipeline..."
@@ -40,6 +42,7 @@ conda env update -f "${PATH_MAMBA_YAML}"
 source activate "${MAMBA_NAME}"
 
 mamba env update -f "${PATH_MASTER_YAML}"
+mamba env update -f "${PATH_POPPUNK_YAML}"
 
 source activate "${MASTER_NAME}"
diff --git a/workflow/rules/createQfile.smk b/workflow/rules/createQfile.smk
index 58d5d97..b2feda6 100644
--- a/workflow/rules/createQfile.smk
+++ b/workflow/rules/createQfile.smk
@@ -1,4 +1,4 @@
-rule createQfile:
+rule create_Qfile:
     """Create popPUNKs required query file, a textfile containing sampleID and location of fasta"""
     # TODO: popPUNK can also work from fastqs. This approach completely ignores this.
     input:
@@ -6,8 +6,8 @@ rule createQfile:
     output:
         OUT + "/{sample}_qfile.txt"
     resources:
-        mem_gb=config["mem_gb"]["template_rule"],
-    threads: config["threads"]["template_rule"]
+        mem_gb=config["mem_gb"]["create_Qfile"],
+    threads: config["threads"]["create_Qfile"]
     shell:"""
     printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output}
    """
\ No newline at end of file
diff --git a/workflow/rules/fastaPopPUNK.smk b/workflow/rules/fastaPopPUNK.smk
new file mode 100644
index 0000000..4004dd6
--- /dev/null
+++ b/workflow/rules/fastaPopPUNK.smk
@@ -0,0 +1,17 @@
+rule fasta_popPUNK:
+    input:
+        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES)
+    output:
+        output_dir = directory(OUT + "/{sample}_poppunk/"),
+    log:
+        OUT + "/log/{sample}_poppunk.log"
+    conda:
+        "../../envs/poppunk.yaml"
+    message:
+        "Running popPUNK clustering"
+    resources:
+        mem_gb=config["mem_gb"]["fasta_popPUNK"],
+    threads: config["threads"]["fasta_popPUNK"]
+    shell: """
+        poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log}
+        """

From 2a5ab308b0e23ac548ffa1c6dc216598fddbb7de Mon Sep 17 00:00:00 2001
From: Linda Visser
Date: Mon, 31 Oct 2022 15:24:48 +0100
Subject: [PATCH 04/23] Attempt at incorporating --species argument.

--- .gitignore | 1 + Snakefile | 4 ++-- config/user_parameters.yaml | 2 -- population.py | 22 ++++++++++++++++++- run_pipeline.sh | 3 --- .../rules/{fastaPopPUNK.smk => PopPUNK.smk} | 4 +++- .../{createQfile.smk => createQfileFasta.smk} | 2 +- 7 files changed, 28 insertions(+), 10 deletions(-) delete mode 100644 config/user_parameters.yaml rename workflow/rules/{fastaPopPUNK.smk => PopPUNK.smk} (67%) rename workflow/rules/{createQfile.smk => createQfileFasta.smk} (95%) diff --git a/.gitignore b/.gitignore index c769264..7ef4c19 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,4 @@ envs/src input output config/sample_sheet.yaml +config/user_parameters.yaml diff --git a/Snakefile b/Snakefile index 430f759..c281e13 100644 --- a/Snakefile +++ b/Snakefile @@ -12,8 +12,8 @@ localrules: include: "workflow/rules/rule.smk" -include: "workflow/rules/createQfile.smk" -include: "workflow/rules/fastaPopPUNK.smk" +include: "workflow/rules/createQfileFasta.smk" +include: "workflow/rules/PopPUNK.smk" rule all: diff --git a/config/user_parameters.yaml b/config/user_parameters.yaml deleted file mode 100644 index 2311c9d..0000000 --- a/config/user_parameters.yaml +++ /dev/null @@ -1,2 +0,0 @@ -input_dir: input -out: output diff --git a/population.py b/population.py index c316175..3f9191e 100644 --- a/population.py +++ b/population.py @@ -17,6 +17,8 @@ def __init__( self, input_dir, output_dir, + species=None, + db_dir="/mnt/db/juno/poppunk/", input_type="both", unlock=False, rerunincomplete=False, @@ -39,15 +41,25 @@ def __init__( dryrun=dryrun, **kwargs, ) + + # Specific Juno-Population pipeline attributes + self.species = species + self.db_dir = db_dir + self.user_parameters = pathlib.Path("config/user_parameters.yaml") + + # Start pipeline self.start_juno_pipeline() + + # Create user_parameters.yaml and sample_sheet.yaml files self.config_params = { "input_dir": str(self.input_dir), "out": str(self.output_dir), + "species": str(self.species), + "db_dir": str(self.db_dir) } with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) - # print(self.sample_dict) with open(self.sample_sheet, 'w') as f: yaml.dump(self.sample_dict, f, default_flow_style=False) self.run_snakemake() @@ -73,6 +85,13 @@ def __init__( default="output", help="Relative or absolute path to the output directory. If non is given, an 'output' directory will be created in the current directory.", ) + parser.add_argument( + "-s", + "--species", + default=None, + required=False, + help="The species name. It should be consistent with the popPUNK databases as found on www.poppunk.net/pages/databases.html (e.g. Streptococcus_pneumoniae)", + ) parser.add_argument( "-l", "--local", @@ -108,6 +127,7 @@ def __init__( PopulationRun( input_dir=args.input, output_dir=args.output, + species=args.species, local=args.local, unlock=args.unlock, rerunincomplete=args.rerunincomplete, diff --git a/run_pipeline.sh b/run_pipeline.sh index 763e2b6..2edb207 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -26,10 +26,8 @@ fi # Create/update necessary environments PATH_MAMBA_YAML="envs/mamba.yaml" PATH_MASTER_YAML="envs/population_master.yaml" -PATH_POPPUNK_YAML="envs/poppunk.yaml" MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ') MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ') -POPPUNK_NAME=$(head -n 1 ${PATH_POPPUNK_YAML} | cut -f2 -d ' ') echo -e "\nUpdating necessary environments to run the pipeline..." 
@@ -42,7 +40,6 @@ conda env update -f "${PATH_MAMBA_YAML}" source activate "${MAMBA_NAME}" mamba env update -f "${PATH_MASTER_YAML}" -mamba env update -f "${PATH_POPPUNK_YAML}" source activate "${MASTER_NAME}" diff --git a/workflow/rules/fastaPopPUNK.smk b/workflow/rules/PopPUNK.smk similarity index 67% rename from workflow/rules/fastaPopPUNK.smk rename to workflow/rules/PopPUNK.smk index 4004dd6..ab2670f 100644 --- a/workflow/rules/fastaPopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -9,9 +9,11 @@ rule fasta_popPUNK: "../../envs/poppunk.yaml" message: "Running popPUNK clustering" + params: + species = config["species"] resources: mem_gb=config["mem_gb"]["fasta_popPUNK"], threads: config["threads"]["fasta_popPUNK"] shell: """ - poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log} + echo {species} & poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log} """ diff --git a/workflow/rules/createQfile.smk b/workflow/rules/createQfileFasta.smk similarity index 95% rename from workflow/rules/createQfile.smk rename to workflow/rules/createQfileFasta.smk index b2feda6..b9ed121 100644 --- a/workflow/rules/createQfile.smk +++ b/workflow/rules/createQfileFasta.smk @@ -1,4 +1,4 @@ -rule create_Qfile: +rule create_Qfile_fasta: """Create popPUNKs required query file, a textfile containing sampleID and location of fasta""" # TODO: popPUNK can also work from fastqs. This approach completely ignores this. input: From 6e6c95ce7b6f9e781ef61784ed994b4c98577ed0 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Fri, 4 Nov 2022 11:56:40 +0100 Subject: [PATCH 05/23] Feat: added --species and --database arguments and use these to set correct poppunk db_dir --- population.py | 37 +++++++++++++++++++++++++++++++------ workflow/rules/PopPUNK.smk | 10 ++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/population.py b/population.py index 3f9191e..5393e7e 100644 --- a/population.py +++ b/population.py @@ -18,7 +18,7 @@ def __init__( input_dir, output_dir, species=None, - db_dir="/mnt/db/juno/poppunk/", + db_dir=None, input_type="both", unlock=False, rerunincomplete=False, @@ -43,8 +43,7 @@ def __init__( ) # Specific Juno-Population pipeline attributes - self.species = species - self.db_dir = db_dir + self.db_dir = self.determine_db_dir(species, db_dir) self.user_parameters = pathlib.Path("config/user_parameters.yaml") # Start pipeline @@ -54,8 +53,7 @@ def __init__( self.config_params = { "input_dir": str(self.input_dir), "out": str(self.output_dir), - "species": str(self.species), - "db_dir": str(self.db_dir) + "db_dir": str(self.db_dir), } with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) @@ -65,6 +63,25 @@ def __init__( self.run_snakemake() + def determine_db_dir(self, species, db_dir=None): + """ + Provided the species and a db_dir optionally set by the user, determines the actual db_dir to use + """ + if db_dir is not None: + return db_dir + # Future feature: Import a yaml with species_db_dirs instead? + species_db_dirs = { + 'streptococcus_pneumoniae': pathlib.Path('/mnt/db/juno/poppunk/streptococcus/GPS_v4_references'), + } + + species_db_dir = species_db_dirs.get(species) + + if species_db_dir is None: + raise KeyError('Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. 
Manually provide a db_dir via -b/--database, or ask for your species to be configured.') + + return species_db_dir + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Template juno pipeline. If you see this message please change it to something appropriate" @@ -90,7 +107,14 @@ def __init__( "--species", default=None, required=False, - help="The species name. It should be consistent with the popPUNK databases as found on www.poppunk.net/pages/databases.html (e.g. Streptococcus_pneumoniae)", + help="The species name, use an underscore instead of a space (e.g. streptococcus_pneumoniae). Check the publicly available popPUNK databases on www.poppunk.net/pages/databases.html", + ) + parser.add_argument( + "-b", + "--database", + default=None, + required=False, + help="The path to the popPUNK database to use. This overrides information provide with the --species argument.", ) parser.add_argument( "-l", @@ -128,6 +152,7 @@ def __init__( input_dir=args.input, output_dir=args.output, species=args.species, + db_dir=args.database, local=args.local, unlock=args.unlock, rerunincomplete=args.rerunincomplete, diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index ab2670f..5635209 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -3,6 +3,10 @@ rule fasta_popPUNK: expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES) output: output_dir = directory(OUT + "/{sample}_poppunk/"), + output_csv = OUT + "/{sample}_poppunk/{sample}_poppunk_clusters.csv", + output_pkl = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.pkl", + output_npy = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.npy", + output_h5 = OUT + "/{sample}_poppunk/{sample}_poppunk.h5", log: OUT + "/log/{sample}_poppunk.log" conda: @@ -10,10 +14,12 @@ rule fasta_popPUNK: message: "Running popPUNK clustering" params: - species = config["species"] + db_dir = config["db_dir"], resources: mem_gb=config["mem_gb"]["fasta_popPUNK"], threads: config["threads"]["fasta_popPUNK"] shell: """ - echo {species} & poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log} + poppunk_assign \ + --db {params.db_dir} \ + --threads {threads} --query {input} --output {output.output_dir} 2> {log} """ From 0f3e21915f2a82cbe7bb7ec3aa68926b91812b4e Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Fri, 4 Nov 2022 16:34:40 +0100 Subject: [PATCH 06/23] Feat: Create summary rule. 
---
 Snakefile                            |  5 +++--
 config/pipeline_parameters.yaml      |  4 ++--
 envs/population_master.yaml          |  1 +
 workflow/rules/PopPUNK.smk           | 12 ++++++------
 workflow/rules/createQfileFasta.smk  |  2 +-
 workflow/rules/makeSummaryCsv.smk    | 16 ++++++++++++++++
 workflow/rules/rule.smk              | 17 -----------------
 workflow/scripts/make_summary_csv.py | 24 ++++++++++++++++++++++++
 8 files changed, 53 insertions(+), 28 deletions(-)
 create mode 100644 workflow/rules/makeSummaryCsv.smk
 delete mode 100644 workflow/rules/rule.smk
 create mode 100644 workflow/scripts/make_summary_csv.py

diff --git a/Snakefile b/Snakefile
index c281e13..2387146 100644
--- a/Snakefile
+++ b/Snakefile
@@ -11,11 +11,12 @@ localrules:
     all,
 
 
-include: "workflow/rules/rule.smk"
+include: "workflow/rules/makeSummaryCsv.smk"
 include: "workflow/rules/createQfileFasta.smk"
 include: "workflow/rules/PopPUNK.smk"
 
 
 rule all:
     input:
-        expand(OUT + "/{sample}_poppunk/", sample=SAMPLES),
+        expand(OUT + "/results_per_sample/{sample}_poppunk/", sample=SAMPLES),
+        expand(OUT + "/poppunk_clusters.csv"),
diff --git a/config/pipeline_parameters.yaml b/config/pipeline_parameters.yaml
index 0b6a093..16c1394 100644
--- a/config/pipeline_parameters.yaml
+++ b/config/pipeline_parameters.yaml
@@ -1,9 +1,9 @@
 threads:
-  template_rule: 1
   create_Qfile: 1
   fasta_popPUNK: 8
+  makeSummaryCsv: 1
 
 mem_gb:
-  template_rule: 1
   create_Qfile: 1
   fasta_popPUNK: 1
+  makeSummaryCsv: 1
diff --git a/envs/population_master.yaml b/envs/population_master.yaml
index 484d607..ab12647 100644
--- a/envs/population_master.yaml
+++ b/envs/population_master.yaml
@@ -9,6 +9,7 @@ dependencies:
   - mamba==0.27
   - pandas
   - snakemake
+  - pandas
   - pip
   - pip:
     - "--editable=git+https://github.com/RIVM-bioinformatics/base_juno_pipeline.git#egg=base_juno"
diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk
index 5635209..5ccf02e 100644
--- a/workflow/rules/PopPUNK.smk
+++ b/workflow/rules/PopPUNK.smk
@@ -1,12 +1,12 @@
 rule fasta_popPUNK:
     input:
-        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES)
+        OUT + "/q_files/{sample}_qfile.txt"
     output:
-        output_dir = directory(OUT + "/{sample}_poppunk/"),
-        output_csv = OUT + "/{sample}_poppunk/{sample}_poppunk_clusters.csv",
-        output_pkl = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.pkl",
-        output_npy = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.npy",
-        output_h5 = OUT + "/{sample}_poppunk/{sample}_poppunk.h5",
+        output_dir = directory(OUT + "/results_per_sample/{sample}_poppunk/"),
+        output_csv = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv",
+        output_pkl = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl",
+        output_npy = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy",
+        output_h5 = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5",
     log:
         OUT + "/log/{sample}_poppunk.log"
     conda:
diff --git a/workflow/rules/createQfileFasta.smk b/workflow/rules/createQfileFasta.smk
index b9ed121..388d200 100644
--- a/workflow/rules/createQfileFasta.smk
+++ b/workflow/rules/createQfileFasta.smk
@@ -4,7 +4,7 @@ rule create_Qfile_fasta:
     input:
         lambda wc: SAMPLES[wc.sample]["assembly"]
     output:
-        OUT + "/{sample}_qfile.txt"
+        OUT + "/q_files/{sample}_qfile.txt"
     resources:
         mem_gb=config["mem_gb"]["create_Qfile"],
     threads: config["threads"]["create_Qfile"]
diff --git a/workflow/rules/makeSummaryCsv.smk b/workflow/rules/makeSummaryCsv.smk
new file mode 100644
index 0000000..e999d22
--- /dev/null
+++ b/workflow/rules/makeSummaryCsv.smk
@@ -0,0 +1,16 @@
+rule makeSummaryCsv:
+    input:
OUT + "/results_per_sample/", + output: + OUT + "/poppunk_clusters.csv", + log: + OUT + "/log/summarize.log" + message: + "Merging individual popPUNK output to one csv." + resources: + mem_gb=config["mem_gb"]["makeSummaryCsv"], + params: script = "workflow/scripts/make_summary_csv.py" + threads: config["threads"]["makeSummaryCsv"] + shell: """ + python {params.script} -i {input} > {output} + """ diff --git a/workflow/rules/rule.smk b/workflow/rules/rule.smk deleted file mode 100644 index 1730f76..0000000 --- a/workflow/rules/rule.smk +++ /dev/null @@ -1,17 +0,0 @@ -rule template_rule: - input: - lambda wc: SAMPLES[wc.sample]["R1"], - lambda wc: SAMPLES[wc.sample]["R2"], - output: - OUT + "/{sample}_combined.fastq", - log: - OUT + "/log/{sample}_template_rule.log" - message: - "Merging {input}." - resources: - mem_gb=config["mem_gb"]["template_rule"], - params: script = "workflow/scripts/script.py" - threads: config["threads"]["template_rule"] - shell: """ - python {params.script} {input} > {output} - """ diff --git a/workflow/scripts/make_summary_csv.py b/workflow/scripts/make_summary_csv.py new file mode 100644 index 0000000..51d8b4f --- /dev/null +++ b/workflow/scripts/make_summary_csv.py @@ -0,0 +1,24 @@ +import sys +import glob +import argparse +import pathlib +import pandas as pd + + +def combine_csv(root_dir=None): + csv_files = glob.glob(f'{root_dir}/*/*.csv') + return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--input", + type=pathlib.Path, + required=True, + metavar="DIR", + help="Relative or absolute path to input directory from which all csv files should be merged." + ) + args = parser.parse_args() + sys.stdout.write(combine_csv(args.input).to_csv()) \ No newline at end of file From 0034d15d9fda3fbfc5ceb053c0f92b7b46da52af Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 10:39:53 +0100 Subject: [PATCH 07/23] style: Run snakefmt --- Snakefile | 3 ++- workflow/rules/PopPUNK.smk | 30 ++++++++++++++++------------- workflow/rules/createQfileFasta.smk | 11 ++++++----- workflow/rules/makeSummaryCsv.smk | 12 +++++++----- 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/Snakefile b/Snakefile index 2387146..a94c996 100644 --- a/Snakefile +++ b/Snakefile @@ -1,12 +1,13 @@ import yaml -sample_sheet=config["sample_sheet"] +sample_sheet = config["sample_sheet"] with open(sample_sheet) as f: SAMPLES = yaml.safe_load(f) OUT = config["out"] + localrules: all, diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index 5ccf02e..c03a52e 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -1,25 +1,29 @@ rule fasta_popPUNK: input: - OUT + "/q_files/{sample}_qfile.txt" + OUT + "/q_files/{sample}_qfile.txt", output: - output_dir = directory(OUT + "/results_per_sample/{sample}_poppunk/"), - output_csv = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", - output_pkl = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl", - output_npy = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy", - output_h5 = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5", + output_dir=directory(OUT + "/results_per_sample/{sample}_poppunk/"), + output_csv=OUT + + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", + output_pkl=OUT + + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl", + output_npy=OUT + + 
"/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy", + output_h5=OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5", log: - OUT + "/log/{sample}_poppunk.log" + OUT + "/log/{sample}_poppunk.log", conda: "../../envs/poppunk.yaml" message: "Running popPUNK clustering" params: - db_dir = config["db_dir"], + db_dir=config["db_dir"], resources: mem_gb=config["mem_gb"]["fasta_popPUNK"], threads: config["threads"]["fasta_popPUNK"] - shell: """ - poppunk_assign \ - --db {params.db_dir} \ - --threads {threads} --query {input} --output {output.output_dir} 2> {log} - """ + shell: + """ + poppunk_assign \ + --db {params.db_dir} \ + --threads {threads} --query {input} --output {output.output_dir} 2> {log} + """ diff --git a/workflow/rules/createQfileFasta.smk b/workflow/rules/createQfileFasta.smk index 388d200..849a7f3 100644 --- a/workflow/rules/createQfileFasta.smk +++ b/workflow/rules/createQfileFasta.smk @@ -2,12 +2,13 @@ rule create_Qfile_fasta: """Create popPUNKs required query file, a textfile containing sampleID and location of fasta""" # TODO: popPUNK can also work from fastqs. This approach completely ignores this. input: - lambda wc: SAMPLES[wc.sample]["assembly"] + lambda wc: SAMPLES[wc.sample]["assembly"], output: - OUT + "/q_files/{sample}_qfile.txt" + OUT + "/q_files/{sample}_qfile.txt", resources: mem_gb=config["mem_gb"]["create_Qfile"], threads: config["threads"]["create_Qfile"] - shell:""" - printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output} - """ \ No newline at end of file + shell: + """ + printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output} + """ diff --git a/workflow/rules/makeSummaryCsv.smk b/workflow/rules/makeSummaryCsv.smk index e999d22..43adf26 100644 --- a/workflow/rules/makeSummaryCsv.smk +++ b/workflow/rules/makeSummaryCsv.smk @@ -4,13 +4,15 @@ rule makeSummaryCsv: output: OUT + "/poppunk_clusters.csv", log: - OUT + "/log/summarize.log" + OUT + "/log/summarize.log", message: "Merging individual popPUNK output to one csv." resources: mem_gb=config["mem_gb"]["makeSummaryCsv"], - params: script = "workflow/scripts/make_summary_csv.py" + params: + script="workflow/scripts/make_summary_csv.py", threads: config["threads"]["makeSummaryCsv"] - shell: """ - python {params.script} -i {input} > {output} - """ + shell: + """ + python {params.script} -i {input} > {output} + """ From 4849624aaedb64e0f30ee8bed78e58d5044dd999 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 10:40:34 +0100 Subject: [PATCH 08/23] style: Run black --- workflow/scripts/make_summary_csv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/scripts/make_summary_csv.py b/workflow/scripts/make_summary_csv.py index 51d8b4f..d6e5ea9 100644 --- a/workflow/scripts/make_summary_csv.py +++ b/workflow/scripts/make_summary_csv.py @@ -6,11 +6,11 @@ def combine_csv(root_dir=None): - csv_files = glob.glob(f'{root_dir}/*/*.csv') + csv_files = glob.glob(f"{root_dir}/*/*.csv") return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-i", @@ -18,7 +18,7 @@ def combine_csv(root_dir=None): type=pathlib.Path, required=True, metavar="DIR", - help="Relative or absolute path to input directory from which all csv files should be merged." 
+ help="Relative or absolute path to input directory from which all csv files should be merged.", ) args = parser.parse_args() - sys.stdout.write(combine_csv(args.input).to_csv()) \ No newline at end of file + sys.stdout.write(combine_csv(args.input).to_csv()) From f15dca8904d177d2c0128d34c5805688eacc9c9f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 10:44:25 +0100 Subject: [PATCH 09/23] refactor: Move poppunk.yaml env specification --- {envs => workflow/envs}/poppunk.yaml | 0 workflow/rules/PopPUNK.smk | 2 +- workflow/scripts/script.py | 6 ------ 3 files changed, 1 insertion(+), 7 deletions(-) rename {envs => workflow/envs}/poppunk.yaml (100%) delete mode 100644 workflow/scripts/script.py diff --git a/envs/poppunk.yaml b/workflow/envs/poppunk.yaml similarity index 100% rename from envs/poppunk.yaml rename to workflow/envs/poppunk.yaml diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index c03a52e..06b4f43 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -13,7 +13,7 @@ rule fasta_popPUNK: log: OUT + "/log/{sample}_poppunk.log", conda: - "../../envs/poppunk.yaml" + "../envs/poppunk.yaml" message: "Running popPUNK clustering" params: diff --git a/workflow/scripts/script.py b/workflow/scripts/script.py deleted file mode 100644 index 81656b6..0000000 --- a/workflow/scripts/script.py +++ /dev/null @@ -1,6 +0,0 @@ -import subprocess -import sys - -subprocess.call( - ["cat"] + sys.argv[1:], -) From 3c4a5f7b97b5091f275e5b79d10dc56d7d7c1d2c Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:03:08 +0100 Subject: [PATCH 10/23] refactor: Remove juno-library from master env --- envs/population_master.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/envs/population_master.yaml b/envs/population_master.yaml index ab12647..e5a0dba 100644 --- a/envs/population_master.yaml +++ b/envs/population_master.yaml @@ -10,6 +10,3 @@ dependencies: - pandas - snakemake - pandas - - pip - - pip: - - "--editable=git+https://github.com/RIVM-bioinformatics/base_juno_pipeline.git#egg=base_juno" From dd3f02a563e4bb903afda97b9381c175fb353f5a Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:05:32 +0100 Subject: [PATCH 11/23] refactor: Remove duplicate pandas dependency --- envs/population_master.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/envs/population_master.yaml b/envs/population_master.yaml index e5a0dba..25581c5 100644 --- a/envs/population_master.yaml +++ b/envs/population_master.yaml @@ -9,4 +9,3 @@ dependencies: - mamba==0.27 - pandas - snakemake - - pandas From c901c55cdb4aac0d4a09399594bdabdc5002f799 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:10:47 +0100 Subject: [PATCH 12/23] doc: Add instruction for submodule initialization --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 4e41e80..ca6a559 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ - [ ] TODO: Write readme A template pipeline where the other juno pipelines are based on. +Before running the pipeline be sure to initialize the submodules: +```bash +git submodule update --init --recursive +``` ## Contribution guidelines Juno pipelines use a [feature branch workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow). To work on features, create a branch from the `main` branch to make changes to. This branch can be merged to the main branch via a pull request. Hotfixes for bugs can be committed to the `main` branch. 
From 5b647640458a9b488e580f4e70cb9165d16b602c Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:27:30 +0100 Subject: [PATCH 13/23] fix: Specify juno-library version --- .gitmodules | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index f73a69e..ef80eff 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ -[submodule "juno-library"] +[submodule "base_juno_pipeline"] path = base_juno_pipeline - url = https://github.com/RIVM-bioinformatics/base_juno_pipeline.git + url = https://github.com/RIVM-bioinformatics/juno-library.git + branch = v0.9.2 From c899f36a9c723b358946bd35a6869f0972b4540f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:36:49 +0100 Subject: [PATCH 14/23] refactor: Change database location determination --- database_locations.py | 7 +++++++ population.py | 31 ++++++++++--------------------- 2 files changed, 17 insertions(+), 21 deletions(-) create mode 100644 database_locations.py diff --git a/database_locations.py b/database_locations.py new file mode 100644 index 0000000..8ff47cd --- /dev/null +++ b/database_locations.py @@ -0,0 +1,7 @@ +import pathlib + +species_database_locations = { + "streptococcus_pneumoniae": pathlib.Path( + "/mnt/db/juno/poppunk/streptococcus/GPS_v4_references" + ), +} diff --git a/population.py b/population.py index 5393e7e..bc3e3b3 100644 --- a/population.py +++ b/population.py @@ -11,6 +11,8 @@ helper_functions, ) +from database_locations import species_database_locations + class PopulationRun(PipelineStartup, RunSnakemake): def __init__( @@ -43,7 +45,13 @@ def __init__( ) # Specific Juno-Population pipeline attributes - self.db_dir = self.determine_db_dir(species, db_dir) + if not db_dir: + self.db_dir = species_database_locations.get(species) + if not self.db_dir: + raise KeyError( + "Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. Manually provide a db_dir via -b/--database, or ask for your species to be configured." + ) + self.user_parameters = pathlib.Path("config/user_parameters.yaml") # Start pipeline @@ -58,30 +66,11 @@ def __init__( with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) - with open(self.sample_sheet, 'w') as f: + with open(self.sample_sheet, "w") as f: yaml.dump(self.sample_dict, f, default_flow_style=False) self.run_snakemake() - def determine_db_dir(self, species, db_dir=None): - """ - Provided the species and a db_dir optionally set by the user, determines the actual db_dir to use - """ - if db_dir is not None: - return db_dir - # Future feature: Import a yaml with species_db_dirs instead? - species_db_dirs = { - 'streptococcus_pneumoniae': pathlib.Path('/mnt/db/juno/poppunk/streptococcus/GPS_v4_references'), - } - - species_db_dir = species_db_dirs.get(species) - - if species_db_dir is None: - raise KeyError('Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. Manually provide a db_dir via -b/--database, or ask for your species to be configured.') - - return species_db_dir - - if __name__ == "__main__": parser = argparse.ArgumentParser( description="Template juno pipeline. 
If you see this message please change it to something appropriate" From 3ccc75f53a160d8e56f1d69e66c04b8ecf89e1f8 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:41:24 +0100 Subject: [PATCH 15/23] refactor: Remove "test" from poppunk env name --- workflow/envs/poppunk.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/poppunk.yaml b/workflow/envs/poppunk.yaml index dbed9fc..3cca6fa 100644 --- a/workflow/envs/poppunk.yaml +++ b/workflow/envs/poppunk.yaml @@ -1,4 +1,4 @@ -name: poppunk_test +name: poppunk channels: - bioconda - conda-forge From b629f4626b40ee43d0bf8363ad59c2eed8b570db Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:41:54 +0100 Subject: [PATCH 16/23] refactor: Remove placeholder env.yaml --- workflow/envs/env.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 workflow/envs/env.yaml diff --git a/workflow/envs/env.yaml b/workflow/envs/env.yaml deleted file mode 100644 index e69de29..0000000 From 35dc4554ca21d356485841b386ecb0684abd914b Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 13:49:09 +0100 Subject: [PATCH 17/23] refactor: Make rule names more descriptive --- Snakefile | 2 +- workflow/rules/PopPUNK.smk | 2 +- workflow/rules/{makeSummaryCsv.smk => aggregatePoppunkCsv.smk} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename workflow/rules/{makeSummaryCsv.smk => aggregatePoppunkCsv.smk} (94%) diff --git a/Snakefile b/Snakefile index a94c996..6a34473 100644 --- a/Snakefile +++ b/Snakefile @@ -12,7 +12,7 @@ localrules: all, -include: "workflow/rules/makeSummaryCsv.smk" +include: "workflow/rules/aggregatePoppunkCsv.smk" include: "workflow/rules/createQfileFasta.smk" include: "workflow/rules/PopPUNK.smk" diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index 06b4f43..1b82890 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -1,4 +1,4 @@ -rule fasta_popPUNK: +rule assign_popPUNK_cluster: input: OUT + "/q_files/{sample}_qfile.txt", output: diff --git a/workflow/rules/makeSummaryCsv.smk b/workflow/rules/aggregatePoppunkCsv.smk similarity index 94% rename from workflow/rules/makeSummaryCsv.smk rename to workflow/rules/aggregatePoppunkCsv.smk index 43adf26..c75ea87 100644 --- a/workflow/rules/makeSummaryCsv.smk +++ b/workflow/rules/aggregatePoppunkCsv.smk @@ -1,4 +1,4 @@ -rule makeSummaryCsv: +rule aggregate_poppunk_csv: input: OUT + "/results_per_sample/", output: From 15bc8c8f691d6834af4c55671b1d33e1c7449b63 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 15:13:38 +0100 Subject: [PATCH 18/23] refactor: Change the aggregatePoppunkCsv rule to make it more its input files more specific --- Snakefile | 1 - config/pipeline_parameters.yaml | 4 ++-- workflow/rules/aggregatePoppunkCsv.smk | 18 +++++++++--------- workflow/scripts/make_summary_csv.py | 24 ------------------------ 4 files changed, 11 insertions(+), 36 deletions(-) delete mode 100644 workflow/scripts/make_summary_csv.py diff --git a/Snakefile b/Snakefile index 6a34473..dd78dcc 100644 --- a/Snakefile +++ b/Snakefile @@ -19,5 +19,4 @@ include: "workflow/rules/PopPUNK.smk" rule all: input: - expand(OUT + "/results_per_sample/{sample}_poppunk/", sample=SAMPLES), expand(OUT + "/poppunk_clusters.csv"), diff --git a/config/pipeline_parameters.yaml b/config/pipeline_parameters.yaml index 16c1394..7a8a25a 100644 --- a/config/pipeline_parameters.yaml +++ b/config/pipeline_parameters.yaml @@ -1,9 +1,9 @@ threads: create_Qfile: 1 fasta_popPUNK: 8 - 
makeSummaryCsv: 1 + aggregatePoppunkCsv: 1 mem_gb: create_Qfile: 1 fasta_popPUNK: 1 - makeSummaryCsv: 1 + aggregatePoppunkCsv: 1 diff --git a/workflow/rules/aggregatePoppunkCsv.smk b/workflow/rules/aggregatePoppunkCsv.smk index c75ea87..e068447 100644 --- a/workflow/rules/aggregatePoppunkCsv.smk +++ b/workflow/rules/aggregatePoppunkCsv.smk @@ -1,6 +1,6 @@ rule aggregate_poppunk_csv: input: - OUT + "/results_per_sample/", + expand(OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", sample=SAMPLES), output: OUT + "/poppunk_clusters.csv", log: @@ -8,11 +8,11 @@ rule aggregate_poppunk_csv: message: "Merging individual popPUNK output to one csv." resources: - mem_gb=config["mem_gb"]["makeSummaryCsv"], - params: - script="workflow/scripts/make_summary_csv.py", - threads: config["threads"]["makeSummaryCsv"] - shell: - """ - python {params.script} -i {input} > {output} - """ + mem_gb=config["mem_gb"]["aggregatePoppunkCsv"], + threads: config["threads"]["aggregatePoppunkCsv"] + run: + import pandas as pd + + aggregated_csv = pd.concat([pd.read_csv(f) for f in input], ignore_index=True) + aggregated_csv.to_csv(output[0]) + diff --git a/workflow/scripts/make_summary_csv.py b/workflow/scripts/make_summary_csv.py deleted file mode 100644 index d6e5ea9..0000000 --- a/workflow/scripts/make_summary_csv.py +++ /dev/null @@ -1,24 +0,0 @@ -import sys -import glob -import argparse -import pathlib -import pandas as pd - - -def combine_csv(root_dir=None): - csv_files = glob.glob(f"{root_dir}/*/*.csv") - return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--input", - type=pathlib.Path, - required=True, - metavar="DIR", - help="Relative or absolute path to input directory from which all csv files should be merged.", - ) - args = parser.parse_args() - sys.stdout.write(combine_csv(args.input).to_csv()) From 4a2594bc611568652ea9fa5e92841ec2a6df3cfc Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 16:05:24 +0100 Subject: [PATCH 19/23] Feat: Add --species to run_pipeline.sh. Fix: Correct input type from both to fasta. --- population.py | 2 +- run_pipeline.sh | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/population.py b/population.py index bc3e3b3..faf2529 100644 --- a/population.py +++ b/population.py @@ -21,7 +21,7 @@ def __init__( output_dir, species=None, db_dir=None, - input_type="both", + input_type="fasta", unlock=False, rerunincomplete=False, dryrun=False, diff --git a/run_pipeline.sh b/run_pipeline.sh index 2edb207..2be1ce1 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -4,11 +4,11 @@ set -euo pipefail #----------------------------------------------# # User parameters -if [ ! -z "${1}" ] || [ ! -z "${2}" ] #|| [ ! -z "${irods_input_projectID}" ] +if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ] then input_dir="${1}" output_dir="${2}" -# PROJECT_NAME="${irods_input_projectID}" + PROJECT_NAME="${irods_input_projectID}" else echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)." 
exit 1 @@ -22,6 +22,14 @@ else input_fastq="${input_dir}/clean_fastq" fi +case $PROJECT_NAME in + + rvp_spn) + GENUS_ALL="streptococcus_pneumoniae" + ;; + +esac + #----------------------------------------------# # Create/update necessary environments PATH_MAMBA_YAML="envs/mamba.yaml" @@ -56,7 +64,7 @@ fi set -euo pipefail -python population.py -i "${input_dir}" -o "${output_dir}" +python population.py --queue "${QUEUE}"-i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}" result=$? From 7f0833a4f333c83230d986f4aa6a44658e3d12f7 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 17:09:36 +0100 Subject: [PATCH 20/23] Fix: Correctly handle --queue argument. --- population.py | 13 +++++++++++++ run_pipeline.sh | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/population.py b/population.py index faf2529..1102205 100644 --- a/population.py +++ b/population.py @@ -25,6 +25,7 @@ def __init__( unlock=False, rerunincomplete=False, dryrun=False, + queue="bio", **kwargs ): PipelineStartup.__init__( @@ -38,6 +39,7 @@ def __init__( pipeline_version="0.1.0", output_dir=output_dir, workdir=pathlib.Path(__file__).parent.resolve(), + queue=queue, unlock=unlock, rerunincomplete=rerunincomplete, dryrun=dryrun, @@ -111,6 +113,16 @@ def __init__( action="store_true", help="Running pipeline locally (instead of in a computer cluster). Default is running it in a cluster.", ) + parser.add_argument( + "-q", + "--queue", + type = str, + required=False, + default = "bio", + metavar = "STR", + dest="queue", + help = "Name of the queue that the job will be sumitted to if working on a cluster." + ) # Snakemake arguments parser.add_argument( "-u", @@ -143,6 +155,7 @@ def __init__( species=args.species, db_dir=args.database, local=args.local, + queue=args.queue, unlock=args.unlock, rerunincomplete=args.rerunincomplete, dryrun=args.dryrun, diff --git a/run_pipeline.sh b/run_pipeline.sh index 2be1ce1..f8dbd46 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -64,7 +64,7 @@ fi set -euo pipefail -python population.py --queue "${QUEUE}"-i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}" +python population.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}" result=$? From 106ac17a0d98e12180f550c1c78e3bf0d9fbfac0 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Tue, 15 Nov 2022 09:54:50 +0100 Subject: [PATCH 21/23] style: Format with black and snakefmt --- population.py | 8 ++++---- workflow/rules/aggregatePoppunkCsv.smk | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/population.py b/population.py index 1102205..2f61a34 100644 --- a/population.py +++ b/population.py @@ -116,12 +116,12 @@ def __init__( parser.add_argument( "-q", "--queue", - type = str, + type=str, required=False, - default = "bio", - metavar = "STR", + default="bio", + metavar="STR", dest="queue", - help = "Name of the queue that the job will be sumitted to if working on a cluster." 
+ help="Name of the queue that the job will be sumitted to if working on a cluster.", ) # Snakemake arguments parser.add_argument( diff --git a/workflow/rules/aggregatePoppunkCsv.smk b/workflow/rules/aggregatePoppunkCsv.smk index e068447..c901c2f 100644 --- a/workflow/rules/aggregatePoppunkCsv.smk +++ b/workflow/rules/aggregatePoppunkCsv.smk @@ -1,6 +1,9 @@ rule aggregate_poppunk_csv: input: - expand(OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", sample=SAMPLES), + expand( + OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", + sample=SAMPLES, + ), output: OUT + "/poppunk_clusters.csv", log: @@ -12,7 +15,6 @@ rule aggregate_poppunk_csv: threads: config["threads"]["aggregatePoppunkCsv"] run: import pandas as pd - + aggregated_csv = pd.concat([pd.read_csv(f) for f in input], ignore_index=True) aggregated_csv.to_csv(output[0]) - From 931882c50b3b08ecf61650461305095da3a4ce2f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Tue, 15 Nov 2022 10:03:11 +0100 Subject: [PATCH 22/23] fix: Add default species "other" in run_pipeline.sh --- run_pipeline.sh | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/run_pipeline.sh b/run_pipeline.sh index f8dbd46..abf5536 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -6,9 +6,9 @@ set -euo pipefail # User parameters if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ] then - input_dir="${1}" - output_dir="${2}" - PROJECT_NAME="${irods_input_projectID}" + input_dir="${1}" + output_dir="${2}" + PROJECT_NAME="${irods_input_projectID}" else echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)." exit 1 @@ -16,17 +16,18 @@ fi if [ ! -d "${input_dir}" ] || [ ! -d "${output_dir}" ] then - echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist" - exit 1 + echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist" + exit 1 else - input_fastq="${input_dir}/clean_fastq" + input_fastq="${input_dir}/clean_fastq" fi case $PROJECT_NAME in - rvp_spn) - GENUS_ALL="streptococcus_pneumoniae" - ;; + rvp_spn) + GENUS_ALL="streptococcus_pneumoniae";; + *) + GENUS_ALL="other";; esac From 081ffa477a1a3798a004247d21fd79987b951e8f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Tue, 15 Nov 2022 10:04:01 +0100 Subject: [PATCH 23/23] refactor: Rename master environment --- envs/population_master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/population_master.yaml b/envs/population_master.yaml index 25581c5..50cff10 100644 --- a/envs/population_master.yaml +++ b/envs/population_master.yaml @@ -1,4 +1,4 @@ -name: population_master +name: juno_population channels: - bioconda - conda-forge