From 39eb86d91199c78920b61a03704d48d63eb55723 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 19 Oct 2022 10:32:53 +0200 Subject: [PATCH 01/23] Initial commit --- .github/workflows/release.yml | 2 +- .gitignore | 1 + README.md | 3 ++- Snakefile | 2 -- config/sample_sheet.yaml | 3 --- envs/{template_master.yaml => population_master.yaml} | 2 +- template.py => population.py | 11 ++++++++--- run_pipeline.sh | 4 ++-- 8 files changed, 15 insertions(+), 13 deletions(-) delete mode 100644 config/sample_sheet.yaml rename envs/{template_master.yaml => population_master.yaml} (90%) rename template.py => population.py (92%) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ff16024..373d5c1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,7 +15,7 @@ jobs: id: release with: release-type: python # just keep a changelog, no version anywhere outside of git tags - package-name: juno_template + package-name: juno_population lint: name: Lint Code Base runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 7b630f9..c769264 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,4 @@ dmypy.json envs/src input output +config/sample_sheet.yaml diff --git a/README.md b/README.md index a4802bc..4e41e80 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# Juno-Template +# Juno-Population +- [ ] TODO: Write readme A template pipeline where the other juno pipelines are based on. ## Contribution guidelines diff --git a/Snakefile b/Snakefile index 9b45640..9f4c43b 100644 --- a/Snakefile +++ b/Snakefile @@ -5,8 +5,6 @@ sample_sheet=config["sample_sheet"] with open(sample_sheet) as f: SAMPLES = yaml.safe_load(f) -print(SAMPLES) - OUT = config["out"] localrules: diff --git a/config/sample_sheet.yaml b/config/sample_sheet.yaml deleted file mode 100644 index b0a7513..0000000 --- a/config/sample_sheet.yaml +++ /dev/null @@ -1,3 +0,0 @@ -'1': - R1: 'input/1_R1.fastq' - R2: 'input/1_R2.fastq' diff --git a/envs/template_master.yaml b/envs/population_master.yaml similarity index 90% rename from envs/template_master.yaml rename to envs/population_master.yaml index 7606d21..f669f93 100644 --- a/envs/template_master.yaml +++ b/envs/population_master.yaml @@ -1,4 +1,4 @@ -name: template_master +name: population_master channels: - bioconda - conda-forge diff --git a/template.py b/population.py similarity index 92% rename from template.py rename to population.py index a7bee73..a6b212b 100644 --- a/template.py +++ b/population.py @@ -1,4 +1,5 @@ import pathlib +from pickle import DUP import yaml import argparse import sys @@ -11,7 +12,7 @@ ) -class TemplateRun(PipelineStartup, RunSnakemake): +class PopulationRun(PipelineStartup, RunSnakemake): def __init__( self, input_dir, @@ -29,7 +30,7 @@ def __init__( ) RunSnakemake.__init__( self, - pipeline_name="template", + pipeline_name="population", pipeline_version="0.1.0", output_dir=output_dir, workdir=pathlib.Path(__file__).parent.resolve(), @@ -45,6 +46,10 @@ def __init__( } with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) + + # print(self.sample_dict) + with open(self.sample_sheet, 'w') as f: + yaml.dump(self.sample_dict, f, default_flow_style=False) self.run_snakemake() @@ -100,7 +105,7 @@ def __init__( help="Extra arguments to be passed to snakemake API (https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html).", ) args = parser.parse_args() - TemplateRun( + PopulationRun( input_dir=args.input, output_dir=args.output, 
         local=args.local,
diff --git a/run_pipeline.sh b/run_pipeline.sh
index 56cb9f6..2edb207 100755
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -25,7 +25,7 @@ fi
 #----------------------------------------------#
 # Create/update necessary environments
 PATH_MAMBA_YAML="envs/mamba.yaml"
-PATH_MASTER_YAML="envs/template_master.yaml"
+PATH_MASTER_YAML="envs/population_master.yaml"
 MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
 MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')
 
@@ -56,7 +56,7 @@ fi
 
 set -euo pipefail
 
-python template.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}"
+python population.py -i "${input_dir}" -o "${output_dir}"
 
 result=$?

From e1a6d4d3e5ed20e063900e66dfe32d18eff55f7a Mon Sep 17 00:00:00 2001
From: Linda Visser
Date: Mon, 24 Oct 2022 20:35:23 +0200
Subject: [PATCH 02/23] feat:Create first version of rule for Q-file

---
 Snakefile                      |  3 ++-
 population.py                  |  2 +-
 workflow/rules/createQfile.smk | 13 +++++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 workflow/rules/createQfile.smk

diff --git a/Snakefile b/Snakefile
index 9f4c43b..977886b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -12,8 +12,9 @@ localrules:
 
 
 include: "workflow/rules/rule.smk"
+include: "workflow/rules/createQfile.smk"
 
 
 rule all:
     input:
-        expand(OUT + "/{sample}_combined.fastq", sample=SAMPLES),
+        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES),
diff --git a/population.py b/population.py
index a6b212b..c316175 100644
--- a/population.py
+++ b/population.py
@@ -17,7 +17,7 @@ def __init__(
         self,
         input_dir,
         output_dir,
-        input_type="fastq",
+        input_type="both",
         unlock=False,
         rerunincomplete=False,
         dryrun=False,
diff --git a/workflow/rules/createQfile.smk b/workflow/rules/createQfile.smk
new file mode 100644
index 0000000..58d5d97
--- /dev/null
+++ b/workflow/rules/createQfile.smk
@@ -0,0 +1,13 @@
+rule createQfile:
+    """Create popPUNKs required query file, a textfile containing sampleID and location of fasta"""
+    # TODO: popPUNK can also work from fastqs. This approach completely ignores this.
+    input:
+        lambda wc: SAMPLES[wc.sample]["assembly"]
+    output:
+        OUT + "/{sample}_qfile.txt"
+    resources:
+        mem_gb=config["mem_gb"]["template_rule"],
+    threads: config["threads"]["template_rule"]
+    shell:"""
+    printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output}
+    """
\ No newline at end of file

From 131de0379f8796c856a035ddc07706ebb14d2843 Mon Sep 17 00:00:00 2001
From: Linda Visser
Date: Wed, 26 Oct 2022 15:27:23 +0200
Subject: [PATCH 03/23] feat:Create hardcoded version of popPUNK rule

---
 Snakefile                       |  3 ++-
 config/pipeline_parameters.yaml |  4 ++++
 envs/mamba.yaml                 |  2 +-
 envs/poppunk.yaml               | 11 +++++++++++
 envs/population_master.yaml     |  2 +-
 run_pipeline.sh                 |  3 +++
 workflow/rules/createQfile.smk  |  6 +++---
 workflow/rules/fastaPopPUNK.smk | 17 +++++++++++++++
 8 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 envs/poppunk.yaml
 create mode 100644 workflow/rules/fastaPopPUNK.smk

diff --git a/Snakefile b/Snakefile
index 977886b..430f759 100644
--- a/Snakefile
+++ b/Snakefile
@@ -13,8 +13,9 @@ localrules:
 
 include: "workflow/rules/rule.smk"
 include: "workflow/rules/createQfile.smk"
+include: "workflow/rules/fastaPopPUNK.smk"
 
 
 rule all:
     input:
-        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES),
+        expand(OUT + "/{sample}_poppunk/", sample=SAMPLES),
diff --git a/config/pipeline_parameters.yaml b/config/pipeline_parameters.yaml
index d240a8f..0b6a093 100644
--- a/config/pipeline_parameters.yaml
+++ b/config/pipeline_parameters.yaml
@@ -1,5 +1,9 @@
 threads:
   template_rule: 1
+  create_Qfile: 1
+  fasta_popPUNK: 8
 
 mem_gb:
   template_rule: 1
+  create_Qfile: 1
+  fasta_popPUNK: 1
diff --git a/envs/mamba.yaml b/envs/mamba.yaml
index 57a4e0e..be30445 100644
--- a/envs/mamba.yaml
+++ b/envs/mamba.yaml
@@ -2,4 +2,4 @@ name: mamba
 channels:
   - conda-forge
 dependencies:
-  - mamba
+  - mamba==0.27
diff --git a/envs/poppunk.yaml b/envs/poppunk.yaml
new file mode 100644
index 0000000..dbed9fc
--- /dev/null
+++ b/envs/poppunk.yaml
@@ -0,0 +1,11 @@
+name: poppunk_test
+channels:
+  - bioconda
+  - conda-forge
+  - anaconda
+  - defaults
+dependencies:
+  - popPUNK
+  # Joblib 1.2.0 breaks HDBscan clustering that is used by popPUNK.
+  # Temporarily pin to v1.1, beware of vulnerability that triggered release of joblib v1.2 https://nvd.nist.gov/vuln/detail/CVE-2022-21797
+  - joblib==1.1.0
diff --git a/envs/population_master.yaml b/envs/population_master.yaml
index f669f93..484d607 100644
--- a/envs/population_master.yaml
+++ b/envs/population_master.yaml
@@ -6,7 +6,7 @@ channels:
   - defaults
 dependencies:
   - git
-  - mamba
+  - mamba==0.27
   - pandas
   - snakemake
   - pip
diff --git a/run_pipeline.sh b/run_pipeline.sh
index 2edb207..763e2b6 100755
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -26,8 +26,10 @@ fi
 # Create/update necessary environments
 PATH_MAMBA_YAML="envs/mamba.yaml"
 PATH_MASTER_YAML="envs/population_master.yaml"
+PATH_POPPUNK_YAML="envs/poppunk.yaml"
 MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
 MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')
+POPPUNK_NAME=$(head -n 1 ${PATH_POPPUNK_YAML} | cut -f2 -d ' ')
 
 echo -e "\nUpdating necessary environments to run the pipeline..."
@@ -40,6 +42,7 @@ conda env update -f "${PATH_MAMBA_YAML}"
 source activate "${MAMBA_NAME}"
 
 mamba env update -f "${PATH_MASTER_YAML}"
+mamba env update -f "${PATH_POPPUNK_YAML}"
 
 source activate "${MASTER_NAME}"
diff --git a/workflow/rules/createQfile.smk b/workflow/rules/createQfile.smk
index 58d5d97..b2feda6 100644
--- a/workflow/rules/createQfile.smk
+++ b/workflow/rules/createQfile.smk
@@ -1,4 +1,4 @@
-rule createQfile:
+rule create_Qfile:
     """Create popPUNKs required query file, a textfile containing sampleID and location of fasta"""
     # TODO: popPUNK can also work from fastqs. This approach completely ignores this.
     input:
@@ -6,8 +6,8 @@ rule createQfile:
     output:
         OUT + "/{sample}_qfile.txt"
     resources:
-        mem_gb=config["mem_gb"]["template_rule"],
-    threads: config["threads"]["template_rule"]
+        mem_gb=config["mem_gb"]["create_Qfile"],
+    threads: config["threads"]["create_Qfile"]
     shell:"""
     printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output}
    """
\ No newline at end of file
diff --git a/workflow/rules/fastaPopPUNK.smk b/workflow/rules/fastaPopPUNK.smk
new file mode 100644
index 0000000..4004dd6
--- /dev/null
+++ b/workflow/rules/fastaPopPUNK.smk
@@ -0,0 +1,17 @@
+rule fasta_popPUNK:
+    input:
+        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES)
+    output:
+        output_dir = directory(OUT + "/{sample}_poppunk/"),
+    log:
+        OUT + "/log/{sample}_poppunk.log"
+    conda:
+        "../../envs/poppunk.yaml"
+    message:
+        "Running popPUNK clustering"
+    resources:
+        mem_gb=config["mem_gb"]["fasta_popPUNK"],
+    threads: config["threads"]["fasta_popPUNK"]
+    shell: """
+        poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log}
+        """

From 2a5ab308b0e23ac548ffa1c6dc216598fddbb7de Mon Sep 17 00:00:00 2001
From: Linda Visser
Date: Mon, 31 Oct 2022 15:24:48 +0100
Subject: [PATCH 04/23] Attempt at incorporating --species argument.

--- .gitignore | 1 + Snakefile | 4 ++-- config/user_parameters.yaml | 2 -- population.py | 22 ++++++++++++++++++- run_pipeline.sh | 3 --- .../rules/{fastaPopPUNK.smk => PopPUNK.smk} | 4 +++- .../{createQfile.smk => createQfileFasta.smk} | 2 +- 7 files changed, 28 insertions(+), 10 deletions(-) delete mode 100644 config/user_parameters.yaml rename workflow/rules/{fastaPopPUNK.smk => PopPUNK.smk} (67%) rename workflow/rules/{createQfile.smk => createQfileFasta.smk} (95%) diff --git a/.gitignore b/.gitignore index c769264..7ef4c19 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,4 @@ envs/src input output config/sample_sheet.yaml +config/user_parameters.yaml diff --git a/Snakefile b/Snakefile index 430f759..c281e13 100644 --- a/Snakefile +++ b/Snakefile @@ -12,8 +12,8 @@ localrules: include: "workflow/rules/rule.smk" -include: "workflow/rules/createQfile.smk" -include: "workflow/rules/fastaPopPUNK.smk" +include: "workflow/rules/createQfileFasta.smk" +include: "workflow/rules/PopPUNK.smk" rule all: diff --git a/config/user_parameters.yaml b/config/user_parameters.yaml deleted file mode 100644 index 2311c9d..0000000 --- a/config/user_parameters.yaml +++ /dev/null @@ -1,2 +0,0 @@ -input_dir: input -out: output diff --git a/population.py b/population.py index c316175..3f9191e 100644 --- a/population.py +++ b/population.py @@ -17,6 +17,8 @@ def __init__( self, input_dir, output_dir, + species=None, + db_dir="/mnt/db/juno/poppunk/", input_type="both", unlock=False, rerunincomplete=False, @@ -39,15 +41,25 @@ def __init__( dryrun=dryrun, **kwargs, ) + + # Specific Juno-Population pipeline attributes + self.species = species + self.db_dir = db_dir + self.user_parameters = pathlib.Path("config/user_parameters.yaml") + + # Start pipeline self.start_juno_pipeline() + + # Create user_parameters.yaml and sample_sheet.yaml files self.config_params = { "input_dir": str(self.input_dir), "out": str(self.output_dir), + "species": str(self.species), + "db_dir": str(self.db_dir) } with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) - # print(self.sample_dict) with open(self.sample_sheet, 'w') as f: yaml.dump(self.sample_dict, f, default_flow_style=False) self.run_snakemake() @@ -73,6 +85,13 @@ def __init__( default="output", help="Relative or absolute path to the output directory. If non is given, an 'output' directory will be created in the current directory.", ) + parser.add_argument( + "-s", + "--species", + default=None, + required=False, + help="The species name. It should be consistent with the popPUNK databases as found on www.poppunk.net/pages/databases.html (e.g. Streptococcus_pneumoniae)", + ) parser.add_argument( "-l", "--local", @@ -108,6 +127,7 @@ def __init__( PopulationRun( input_dir=args.input, output_dir=args.output, + species=args.species, local=args.local, unlock=args.unlock, rerunincomplete=args.rerunincomplete, diff --git a/run_pipeline.sh b/run_pipeline.sh index 763e2b6..2edb207 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -26,10 +26,8 @@ fi # Create/update necessary environments PATH_MAMBA_YAML="envs/mamba.yaml" PATH_MASTER_YAML="envs/population_master.yaml" -PATH_POPPUNK_YAML="envs/poppunk.yaml" MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ') MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ') -POPPUNK_NAME=$(head -n 1 ${PATH_POPPUNK_YAML} | cut -f2 -d ' ') echo -e "\nUpdating necessary environments to run the pipeline..." 
@@ -42,7 +40,6 @@ conda env update -f "${PATH_MAMBA_YAML}" source activate "${MAMBA_NAME}" mamba env update -f "${PATH_MASTER_YAML}" -mamba env update -f "${PATH_POPPUNK_YAML}" source activate "${MASTER_NAME}" diff --git a/workflow/rules/fastaPopPUNK.smk b/workflow/rules/PopPUNK.smk similarity index 67% rename from workflow/rules/fastaPopPUNK.smk rename to workflow/rules/PopPUNK.smk index 4004dd6..ab2670f 100644 --- a/workflow/rules/fastaPopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -9,9 +9,11 @@ rule fasta_popPUNK: "../../envs/poppunk.yaml" message: "Running popPUNK clustering" + params: + species = config["species"] resources: mem_gb=config["mem_gb"]["fasta_popPUNK"], threads: config["threads"]["fasta_popPUNK"] shell: """ - poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log} + echo {species} & poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log} """ diff --git a/workflow/rules/createQfile.smk b/workflow/rules/createQfileFasta.smk similarity index 95% rename from workflow/rules/createQfile.smk rename to workflow/rules/createQfileFasta.smk index b2feda6..b9ed121 100644 --- a/workflow/rules/createQfile.smk +++ b/workflow/rules/createQfileFasta.smk @@ -1,4 +1,4 @@ -rule create_Qfile: +rule create_Qfile_fasta: """Create popPUNKs required query file, a textfile containing sampleID and location of fasta""" # TODO: popPUNK can also work from fastqs. This approach completely ignores this. input: From 6e6c95ce7b6f9e781ef61784ed994b4c98577ed0 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Fri, 4 Nov 2022 11:56:40 +0100 Subject: [PATCH 05/23] Feat: added --species and --database arguments and use these to set correct poppunk db_dir --- population.py | 37 +++++++++++++++++++++++++++++++------ workflow/rules/PopPUNK.smk | 10 ++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/population.py b/population.py index 3f9191e..5393e7e 100644 --- a/population.py +++ b/population.py @@ -18,7 +18,7 @@ def __init__( input_dir, output_dir, species=None, - db_dir="/mnt/db/juno/poppunk/", + db_dir=None, input_type="both", unlock=False, rerunincomplete=False, @@ -43,8 +43,7 @@ def __init__( ) # Specific Juno-Population pipeline attributes - self.species = species - self.db_dir = db_dir + self.db_dir = self.determine_db_dir(species, db_dir) self.user_parameters = pathlib.Path("config/user_parameters.yaml") # Start pipeline @@ -54,8 +53,7 @@ def __init__( self.config_params = { "input_dir": str(self.input_dir), "out": str(self.output_dir), - "species": str(self.species), - "db_dir": str(self.db_dir) + "db_dir": str(self.db_dir), } with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) @@ -65,6 +63,25 @@ def __init__( self.run_snakemake() + def determine_db_dir(self, species, db_dir=None): + """ + Provided the species and a db_dir optionally set by the user, determines the actual db_dir to use + """ + if db_dir is not None: + return db_dir + # Future feature: Import a yaml with species_db_dirs instead? + species_db_dirs = { + 'streptococcus_pneumoniae': pathlib.Path('/mnt/db/juno/poppunk/streptococcus/GPS_v4_references'), + } + + species_db_dir = species_db_dirs.get(species) + + if species_db_dir is None: + raise KeyError('Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. 
Manually provide a db_dir via -b/--database, or ask for your species to be configured.') + + return species_db_dir + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Template juno pipeline. If you see this message please change it to something appropriate" @@ -90,7 +107,14 @@ def __init__( "--species", default=None, required=False, - help="The species name. It should be consistent with the popPUNK databases as found on www.poppunk.net/pages/databases.html (e.g. Streptococcus_pneumoniae)", + help="The species name, use an underscore instead of a space (e.g. streptococcus_pneumoniae). Check the publicly available popPUNK databases on www.poppunk.net/pages/databases.html", + ) + parser.add_argument( + "-b", + "--database", + default=None, + required=False, + help="The path to the popPUNK database to use. This overrides information provide with the --species argument.", ) parser.add_argument( "-l", @@ -128,6 +152,7 @@ def __init__( input_dir=args.input, output_dir=args.output, species=args.species, + db_dir=args.database, local=args.local, unlock=args.unlock, rerunincomplete=args.rerunincomplete, diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index ab2670f..5635209 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -3,6 +3,10 @@ rule fasta_popPUNK: expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES) output: output_dir = directory(OUT + "/{sample}_poppunk/"), + output_csv = OUT + "/{sample}_poppunk/{sample}_poppunk_clusters.csv", + output_pkl = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.pkl", + output_npy = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.npy", + output_h5 = OUT + "/{sample}_poppunk/{sample}_poppunk.h5", log: OUT + "/log/{sample}_poppunk.log" conda: @@ -10,10 +14,12 @@ rule fasta_popPUNK: message: "Running popPUNK clustering" params: - species = config["species"] + db_dir = config["db_dir"], resources: mem_gb=config["mem_gb"]["fasta_popPUNK"], threads: config["threads"]["fasta_popPUNK"] shell: """ - echo {species} & poppunk_assign --db /mnt/db/juno/poppunk/streptococcus/GPS_v4_references --threads {threads} --query {input} --output {output.output_dir} 2> {log} + poppunk_assign \ + --db {params.db_dir} \ + --threads {threads} --query {input} --output {output.output_dir} 2> {log} """ From 0f3e21915f2a82cbe7bb7ec3aa68926b91812b4e Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Fri, 4 Nov 2022 16:34:40 +0100 Subject: [PATCH 06/23] Feat: Create summary rule. 
---
 Snakefile                            |  5 +++--
 config/pipeline_parameters.yaml      |  4 ++--
 envs/population_master.yaml          |  1 +
 workflow/rules/PopPUNK.smk           | 12 ++++++------
 workflow/rules/createQfileFasta.smk  |  2 +-
 workflow/rules/makeSummaryCsv.smk    | 16 ++++++++++++++++
 workflow/rules/rule.smk              | 17 -----------------
 workflow/scripts/make_summary_csv.py | 24 ++++++++++++++++++++++++
 8 files changed, 53 insertions(+), 28 deletions(-)
 create mode 100644 workflow/rules/makeSummaryCsv.smk
 delete mode 100644 workflow/rules/rule.smk
 create mode 100644 workflow/scripts/make_summary_csv.py

diff --git a/Snakefile b/Snakefile
index c281e13..2387146 100644
--- a/Snakefile
+++ b/Snakefile
@@ -11,11 +11,12 @@ localrules:
     all,
 
 
-include: "workflow/rules/rule.smk"
+include: "workflow/rules/makeSummaryCsv.smk"
 include: "workflow/rules/createQfileFasta.smk"
 include: "workflow/rules/PopPUNK.smk"
 
 
 rule all:
     input:
-        expand(OUT + "/{sample}_poppunk/", sample=SAMPLES),
+        expand(OUT + "/results_per_sample/{sample}_poppunk/", sample=SAMPLES),
+        expand(OUT + "/poppunk_clusters.csv"),
diff --git a/config/pipeline_parameters.yaml b/config/pipeline_parameters.yaml
index 0b6a093..16c1394 100644
--- a/config/pipeline_parameters.yaml
+++ b/config/pipeline_parameters.yaml
@@ -1,9 +1,9 @@
 threads:
-  template_rule: 1
   create_Qfile: 1
   fasta_popPUNK: 8
+  makeSummaryCsv: 1
 
 mem_gb:
-  template_rule: 1
   create_Qfile: 1
   fasta_popPUNK: 1
+  makeSummaryCsv: 1
diff --git a/envs/population_master.yaml b/envs/population_master.yaml
index 484d607..ab12647 100644
--- a/envs/population_master.yaml
+++ b/envs/population_master.yaml
@@ -9,6 +9,7 @@ dependencies:
   - mamba==0.27
   - pandas
   - snakemake
+  - pandas
   - pip
   - pip:
     - "--editable=git+https://github.com/RIVM-bioinformatics/base_juno_pipeline.git#egg=base_juno"
diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk
index 5635209..5ccf02e 100644
--- a/workflow/rules/PopPUNK.smk
+++ b/workflow/rules/PopPUNK.smk
@@ -1,12 +1,12 @@
 rule fasta_popPUNK:
     input:
-        expand(OUT + "/{sample}_qfile.txt", sample=SAMPLES)
+        OUT + "/q_files/{sample}_qfile.txt"
     output:
-        output_dir = directory(OUT + "/{sample}_poppunk/"),
-        output_csv = OUT + "/{sample}_poppunk/{sample}_poppunk_clusters.csv",
-        output_pkl = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.pkl",
-        output_npy = OUT + "/{sample}_poppunk/{sample}_poppunk.dists.npy",
-        output_h5 = OUT + "/{sample}_poppunk/{sample}_poppunk.h5",
+        output_dir = directory(OUT + "/results_per_sample/{sample}_poppunk/"),
+        output_csv = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv",
+        output_pkl = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl",
+        output_npy = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy",
+        output_h5 = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5",
     log:
         OUT + "/log/{sample}_poppunk.log"
     conda:
diff --git a/workflow/rules/createQfileFasta.smk b/workflow/rules/createQfileFasta.smk
index b9ed121..388d200 100644
--- a/workflow/rules/createQfileFasta.smk
+++ b/workflow/rules/createQfileFasta.smk
@@ -4,7 +4,7 @@ rule create_Qfile_fasta:
     input:
         lambda wc: SAMPLES[wc.sample]["assembly"]
     output:
-        OUT + "/{sample}_qfile.txt"
+        OUT + "/q_files/{sample}_qfile.txt"
     resources:
         mem_gb=config["mem_gb"]["create_Qfile"],
     threads: config["threads"]["create_Qfile"]
diff --git a/workflow/rules/makeSummaryCsv.smk b/workflow/rules/makeSummaryCsv.smk
new file mode 100644
index 0000000..e999d22
--- /dev/null
+++ b/workflow/rules/makeSummaryCsv.smk
@@ -0,0 +1,16 @@
+rule makeSummaryCsv:
+    input:
OUT + "/results_per_sample/", + output: + OUT + "/poppunk_clusters.csv", + log: + OUT + "/log/summarize.log" + message: + "Merging individual popPUNK output to one csv." + resources: + mem_gb=config["mem_gb"]["makeSummaryCsv"], + params: script = "workflow/scripts/make_summary_csv.py" + threads: config["threads"]["makeSummaryCsv"] + shell: """ + python {params.script} -i {input} > {output} + """ diff --git a/workflow/rules/rule.smk b/workflow/rules/rule.smk deleted file mode 100644 index 1730f76..0000000 --- a/workflow/rules/rule.smk +++ /dev/null @@ -1,17 +0,0 @@ -rule template_rule: - input: - lambda wc: SAMPLES[wc.sample]["R1"], - lambda wc: SAMPLES[wc.sample]["R2"], - output: - OUT + "/{sample}_combined.fastq", - log: - OUT + "/log/{sample}_template_rule.log" - message: - "Merging {input}." - resources: - mem_gb=config["mem_gb"]["template_rule"], - params: script = "workflow/scripts/script.py" - threads: config["threads"]["template_rule"] - shell: """ - python {params.script} {input} > {output} - """ diff --git a/workflow/scripts/make_summary_csv.py b/workflow/scripts/make_summary_csv.py new file mode 100644 index 0000000..51d8b4f --- /dev/null +++ b/workflow/scripts/make_summary_csv.py @@ -0,0 +1,24 @@ +import sys +import glob +import argparse +import pathlib +import pandas as pd + + +def combine_csv(root_dir=None): + csv_files = glob.glob(f'{root_dir}/*/*.csv') + return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--input", + type=pathlib.Path, + required=True, + metavar="DIR", + help="Relative or absolute path to input directory from which all csv files should be merged." + ) + args = parser.parse_args() + sys.stdout.write(combine_csv(args.input).to_csv()) \ No newline at end of file From 0034d15d9fda3fbfc5ceb053c0f92b7b46da52af Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 10:39:53 +0100 Subject: [PATCH 07/23] style: Run snakefmt --- Snakefile | 3 ++- workflow/rules/PopPUNK.smk | 30 ++++++++++++++++------------- workflow/rules/createQfileFasta.smk | 11 ++++++----- workflow/rules/makeSummaryCsv.smk | 12 +++++++----- 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/Snakefile b/Snakefile index 2387146..a94c996 100644 --- a/Snakefile +++ b/Snakefile @@ -1,12 +1,13 @@ import yaml -sample_sheet=config["sample_sheet"] +sample_sheet = config["sample_sheet"] with open(sample_sheet) as f: SAMPLES = yaml.safe_load(f) OUT = config["out"] + localrules: all, diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index 5ccf02e..c03a52e 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -1,25 +1,29 @@ rule fasta_popPUNK: input: - OUT + "/q_files/{sample}_qfile.txt" + OUT + "/q_files/{sample}_qfile.txt", output: - output_dir = directory(OUT + "/results_per_sample/{sample}_poppunk/"), - output_csv = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", - output_pkl = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl", - output_npy = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy", - output_h5 = OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5", + output_dir=directory(OUT + "/results_per_sample/{sample}_poppunk/"), + output_csv=OUT + + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", + output_pkl=OUT + + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl", + output_npy=OUT + + 
"/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy", + output_h5=OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5", log: - OUT + "/log/{sample}_poppunk.log" + OUT + "/log/{sample}_poppunk.log", conda: "../../envs/poppunk.yaml" message: "Running popPUNK clustering" params: - db_dir = config["db_dir"], + db_dir=config["db_dir"], resources: mem_gb=config["mem_gb"]["fasta_popPUNK"], threads: config["threads"]["fasta_popPUNK"] - shell: """ - poppunk_assign \ - --db {params.db_dir} \ - --threads {threads} --query {input} --output {output.output_dir} 2> {log} - """ + shell: + """ + poppunk_assign \ + --db {params.db_dir} \ + --threads {threads} --query {input} --output {output.output_dir} 2> {log} + """ diff --git a/workflow/rules/createQfileFasta.smk b/workflow/rules/createQfileFasta.smk index 388d200..849a7f3 100644 --- a/workflow/rules/createQfileFasta.smk +++ b/workflow/rules/createQfileFasta.smk @@ -2,12 +2,13 @@ rule create_Qfile_fasta: """Create popPUNKs required query file, a textfile containing sampleID and location of fasta""" # TODO: popPUNK can also work from fastqs. This approach completely ignores this. input: - lambda wc: SAMPLES[wc.sample]["assembly"] + lambda wc: SAMPLES[wc.sample]["assembly"], output: - OUT + "/q_files/{sample}_qfile.txt" + OUT + "/q_files/{sample}_qfile.txt", resources: mem_gb=config["mem_gb"]["create_Qfile"], threads: config["threads"]["create_Qfile"] - shell:""" - printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output} - """ \ No newline at end of file + shell: + """ + printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output} + """ diff --git a/workflow/rules/makeSummaryCsv.smk b/workflow/rules/makeSummaryCsv.smk index e999d22..43adf26 100644 --- a/workflow/rules/makeSummaryCsv.smk +++ b/workflow/rules/makeSummaryCsv.smk @@ -4,13 +4,15 @@ rule makeSummaryCsv: output: OUT + "/poppunk_clusters.csv", log: - OUT + "/log/summarize.log" + OUT + "/log/summarize.log", message: "Merging individual popPUNK output to one csv." resources: mem_gb=config["mem_gb"]["makeSummaryCsv"], - params: script = "workflow/scripts/make_summary_csv.py" + params: + script="workflow/scripts/make_summary_csv.py", threads: config["threads"]["makeSummaryCsv"] - shell: """ - python {params.script} -i {input} > {output} - """ + shell: + """ + python {params.script} -i {input} > {output} + """ From 4849624aaedb64e0f30ee8bed78e58d5044dd999 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 10:40:34 +0100 Subject: [PATCH 08/23] style: Run black --- workflow/scripts/make_summary_csv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/scripts/make_summary_csv.py b/workflow/scripts/make_summary_csv.py index 51d8b4f..d6e5ea9 100644 --- a/workflow/scripts/make_summary_csv.py +++ b/workflow/scripts/make_summary_csv.py @@ -6,11 +6,11 @@ def combine_csv(root_dir=None): - csv_files = glob.glob(f'{root_dir}/*/*.csv') + csv_files = glob.glob(f"{root_dir}/*/*.csv") return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-i", @@ -18,7 +18,7 @@ def combine_csv(root_dir=None): type=pathlib.Path, required=True, metavar="DIR", - help="Relative or absolute path to input directory from which all csv files should be merged." 
+ help="Relative or absolute path to input directory from which all csv files should be merged.", ) args = parser.parse_args() - sys.stdout.write(combine_csv(args.input).to_csv()) \ No newline at end of file + sys.stdout.write(combine_csv(args.input).to_csv()) From f15dca8904d177d2c0128d34c5805688eacc9c9f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 10:44:25 +0100 Subject: [PATCH 09/23] refactor: Move poppunk.yaml env specification --- {envs => workflow/envs}/poppunk.yaml | 0 workflow/rules/PopPUNK.smk | 2 +- workflow/scripts/script.py | 6 ------ 3 files changed, 1 insertion(+), 7 deletions(-) rename {envs => workflow/envs}/poppunk.yaml (100%) delete mode 100644 workflow/scripts/script.py diff --git a/envs/poppunk.yaml b/workflow/envs/poppunk.yaml similarity index 100% rename from envs/poppunk.yaml rename to workflow/envs/poppunk.yaml diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index c03a52e..06b4f43 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -13,7 +13,7 @@ rule fasta_popPUNK: log: OUT + "/log/{sample}_poppunk.log", conda: - "../../envs/poppunk.yaml" + "../envs/poppunk.yaml" message: "Running popPUNK clustering" params: diff --git a/workflow/scripts/script.py b/workflow/scripts/script.py deleted file mode 100644 index 81656b6..0000000 --- a/workflow/scripts/script.py +++ /dev/null @@ -1,6 +0,0 @@ -import subprocess -import sys - -subprocess.call( - ["cat"] + sys.argv[1:], -) From 3c4a5f7b97b5091f275e5b79d10dc56d7d7c1d2c Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:03:08 +0100 Subject: [PATCH 10/23] refactor: Remove juno-library from master env --- envs/population_master.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/envs/population_master.yaml b/envs/population_master.yaml index ab12647..e5a0dba 100644 --- a/envs/population_master.yaml +++ b/envs/population_master.yaml @@ -10,6 +10,3 @@ dependencies: - pandas - snakemake - pandas - - pip - - pip: - - "--editable=git+https://github.com/RIVM-bioinformatics/base_juno_pipeline.git#egg=base_juno" From dd3f02a563e4bb903afda97b9381c175fb353f5a Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:05:32 +0100 Subject: [PATCH 11/23] refactor: Remove duplicate pandas dependency --- envs/population_master.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/envs/population_master.yaml b/envs/population_master.yaml index e5a0dba..25581c5 100644 --- a/envs/population_master.yaml +++ b/envs/population_master.yaml @@ -9,4 +9,3 @@ dependencies: - mamba==0.27 - pandas - snakemake - - pandas From c901c55cdb4aac0d4a09399594bdabdc5002f799 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:10:47 +0100 Subject: [PATCH 12/23] doc: Add instruction for submodule initialization --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 4e41e80..ca6a559 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ - [ ] TODO: Write readme A template pipeline where the other juno pipelines are based on. +Before running the pipeline be sure to initialize the submodules: +```bash +git submodule update --init --recursive +``` ## Contribution guidelines Juno pipelines use a [feature branch workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow). To work on features, create a branch from the `main` branch to make changes to. This branch can be merged to the main branch via a pull request. Hotfixes for bugs can be committed to the `main` branch. 
From 5b647640458a9b488e580f4e70cb9165d16b602c Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:27:30 +0100 Subject: [PATCH 13/23] fix: Specify juno-library version --- .gitmodules | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index f73a69e..ef80eff 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ -[submodule "juno-library"] +[submodule "base_juno_pipeline"] path = base_juno_pipeline - url = https://github.com/RIVM-bioinformatics/base_juno_pipeline.git + url = https://github.com/RIVM-bioinformatics/juno-library.git + branch = v0.9.2 From c899f36a9c723b358946bd35a6869f0972b4540f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:36:49 +0100 Subject: [PATCH 14/23] refactor: Change database location determination --- database_locations.py | 7 +++++++ population.py | 31 ++++++++++--------------------- 2 files changed, 17 insertions(+), 21 deletions(-) create mode 100644 database_locations.py diff --git a/database_locations.py b/database_locations.py new file mode 100644 index 0000000..8ff47cd --- /dev/null +++ b/database_locations.py @@ -0,0 +1,7 @@ +import pathlib + +species_database_locations = { + "streptococcus_pneumoniae": pathlib.Path( + "/mnt/db/juno/poppunk/streptococcus/GPS_v4_references" + ), +} diff --git a/population.py b/population.py index 5393e7e..bc3e3b3 100644 --- a/population.py +++ b/population.py @@ -11,6 +11,8 @@ helper_functions, ) +from database_locations import species_database_locations + class PopulationRun(PipelineStartup, RunSnakemake): def __init__( @@ -43,7 +45,13 @@ def __init__( ) # Specific Juno-Population pipeline attributes - self.db_dir = self.determine_db_dir(species, db_dir) + if not db_dir: + self.db_dir = species_database_locations.get(species) + if not self.db_dir: + raise KeyError( + "Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. Manually provide a db_dir via -b/--database, or ask for your species to be configured." + ) + self.user_parameters = pathlib.Path("config/user_parameters.yaml") # Start pipeline @@ -58,30 +66,11 @@ def __init__( with open(self.user_parameters, "w") as f: yaml.dump(self.config_params, f, default_flow_style=False) - with open(self.sample_sheet, 'w') as f: + with open(self.sample_sheet, "w") as f: yaml.dump(self.sample_dict, f, default_flow_style=False) self.run_snakemake() - def determine_db_dir(self, species, db_dir=None): - """ - Provided the species and a db_dir optionally set by the user, determines the actual db_dir to use - """ - if db_dir is not None: - return db_dir - # Future feature: Import a yaml with species_db_dirs instead? - species_db_dirs = { - 'streptococcus_pneumoniae': pathlib.Path('/mnt/db/juno/poppunk/streptococcus/GPS_v4_references'), - } - - species_db_dir = species_db_dirs.get(species) - - if species_db_dir is None: - raise KeyError('Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. Manually provide a db_dir via -b/--database, or ask for your species to be configured.') - - return species_db_dir - - if __name__ == "__main__": parser = argparse.ArgumentParser( description="Template juno pipeline. 
If you see this message please change it to something appropriate" From 3ccc75f53a160d8e56f1d69e66c04b8ecf89e1f8 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:41:24 +0100 Subject: [PATCH 15/23] refactor: Remove "test" from poppunk env name --- workflow/envs/poppunk.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/poppunk.yaml b/workflow/envs/poppunk.yaml index dbed9fc..3cca6fa 100644 --- a/workflow/envs/poppunk.yaml +++ b/workflow/envs/poppunk.yaml @@ -1,4 +1,4 @@ -name: poppunk_test +name: poppunk channels: - bioconda - conda-forge From b629f4626b40ee43d0bf8363ad59c2eed8b570db Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Mon, 7 Nov 2022 11:41:54 +0100 Subject: [PATCH 16/23] refactor: Remove placeholder env.yaml --- workflow/envs/env.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 workflow/envs/env.yaml diff --git a/workflow/envs/env.yaml b/workflow/envs/env.yaml deleted file mode 100644 index e69de29..0000000 From 35dc4554ca21d356485841b386ecb0684abd914b Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 13:49:09 +0100 Subject: [PATCH 17/23] refactor: Make rule names more descriptive --- Snakefile | 2 +- workflow/rules/PopPUNK.smk | 2 +- workflow/rules/{makeSummaryCsv.smk => aggregatePoppunkCsv.smk} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename workflow/rules/{makeSummaryCsv.smk => aggregatePoppunkCsv.smk} (94%) diff --git a/Snakefile b/Snakefile index a94c996..6a34473 100644 --- a/Snakefile +++ b/Snakefile @@ -12,7 +12,7 @@ localrules: all, -include: "workflow/rules/makeSummaryCsv.smk" +include: "workflow/rules/aggregatePoppunkCsv.smk" include: "workflow/rules/createQfileFasta.smk" include: "workflow/rules/PopPUNK.smk" diff --git a/workflow/rules/PopPUNK.smk b/workflow/rules/PopPUNK.smk index 06b4f43..1b82890 100644 --- a/workflow/rules/PopPUNK.smk +++ b/workflow/rules/PopPUNK.smk @@ -1,4 +1,4 @@ -rule fasta_popPUNK: +rule assign_popPUNK_cluster: input: OUT + "/q_files/{sample}_qfile.txt", output: diff --git a/workflow/rules/makeSummaryCsv.smk b/workflow/rules/aggregatePoppunkCsv.smk similarity index 94% rename from workflow/rules/makeSummaryCsv.smk rename to workflow/rules/aggregatePoppunkCsv.smk index 43adf26..c75ea87 100644 --- a/workflow/rules/makeSummaryCsv.smk +++ b/workflow/rules/aggregatePoppunkCsv.smk @@ -1,4 +1,4 @@ -rule makeSummaryCsv: +rule aggregate_poppunk_csv: input: OUT + "/results_per_sample/", output: From 15bc8c8f691d6834af4c55671b1d33e1c7449b63 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 15:13:38 +0100 Subject: [PATCH 18/23] refactor: Change the aggregatePoppunkCsv rule to make it more its input files more specific --- Snakefile | 1 - config/pipeline_parameters.yaml | 4 ++-- workflow/rules/aggregatePoppunkCsv.smk | 18 +++++++++--------- workflow/scripts/make_summary_csv.py | 24 ------------------------ 4 files changed, 11 insertions(+), 36 deletions(-) delete mode 100644 workflow/scripts/make_summary_csv.py diff --git a/Snakefile b/Snakefile index 6a34473..dd78dcc 100644 --- a/Snakefile +++ b/Snakefile @@ -19,5 +19,4 @@ include: "workflow/rules/PopPUNK.smk" rule all: input: - expand(OUT + "/results_per_sample/{sample}_poppunk/", sample=SAMPLES), expand(OUT + "/poppunk_clusters.csv"), diff --git a/config/pipeline_parameters.yaml b/config/pipeline_parameters.yaml index 16c1394..7a8a25a 100644 --- a/config/pipeline_parameters.yaml +++ b/config/pipeline_parameters.yaml @@ -1,9 +1,9 @@ threads: create_Qfile: 1 fasta_popPUNK: 8 - 
makeSummaryCsv: 1 + aggregatePoppunkCsv: 1 mem_gb: create_Qfile: 1 fasta_popPUNK: 1 - makeSummaryCsv: 1 + aggregatePoppunkCsv: 1 diff --git a/workflow/rules/aggregatePoppunkCsv.smk b/workflow/rules/aggregatePoppunkCsv.smk index c75ea87..e068447 100644 --- a/workflow/rules/aggregatePoppunkCsv.smk +++ b/workflow/rules/aggregatePoppunkCsv.smk @@ -1,6 +1,6 @@ rule aggregate_poppunk_csv: input: - OUT + "/results_per_sample/", + expand(OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", sample=SAMPLES), output: OUT + "/poppunk_clusters.csv", log: @@ -8,11 +8,11 @@ rule aggregate_poppunk_csv: message: "Merging individual popPUNK output to one csv." resources: - mem_gb=config["mem_gb"]["makeSummaryCsv"], - params: - script="workflow/scripts/make_summary_csv.py", - threads: config["threads"]["makeSummaryCsv"] - shell: - """ - python {params.script} -i {input} > {output} - """ + mem_gb=config["mem_gb"]["aggregatePoppunkCsv"], + threads: config["threads"]["aggregatePoppunkCsv"] + run: + import pandas as pd + + aggregated_csv = pd.concat([pd.read_csv(f) for f in input], ignore_index=True) + aggregated_csv.to_csv(output[0]) + diff --git a/workflow/scripts/make_summary_csv.py b/workflow/scripts/make_summary_csv.py deleted file mode 100644 index d6e5ea9..0000000 --- a/workflow/scripts/make_summary_csv.py +++ /dev/null @@ -1,24 +0,0 @@ -import sys -import glob -import argparse -import pathlib -import pandas as pd - - -def combine_csv(root_dir=None): - csv_files = glob.glob(f"{root_dir}/*/*.csv") - return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--input", - type=pathlib.Path, - required=True, - metavar="DIR", - help="Relative or absolute path to input directory from which all csv files should be merged.", - ) - args = parser.parse_args() - sys.stdout.write(combine_csv(args.input).to_csv()) From 4a2594bc611568652ea9fa5e92841ec2a6df3cfc Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 16:05:24 +0100 Subject: [PATCH 19/23] Feat: Add --species to run_pipeline.sh. Fix: Correct input type from both to fasta. --- population.py | 2 +- run_pipeline.sh | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/population.py b/population.py index bc3e3b3..faf2529 100644 --- a/population.py +++ b/population.py @@ -21,7 +21,7 @@ def __init__( output_dir, species=None, db_dir=None, - input_type="both", + input_type="fasta", unlock=False, rerunincomplete=False, dryrun=False, diff --git a/run_pipeline.sh b/run_pipeline.sh index 2edb207..2be1ce1 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -4,11 +4,11 @@ set -euo pipefail #----------------------------------------------# # User parameters -if [ ! -z "${1}" ] || [ ! -z "${2}" ] #|| [ ! -z "${irods_input_projectID}" ] +if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ] then input_dir="${1}" output_dir="${2}" -# PROJECT_NAME="${irods_input_projectID}" + PROJECT_NAME="${irods_input_projectID}" else echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)." 
exit 1 @@ -22,6 +22,14 @@ else input_fastq="${input_dir}/clean_fastq" fi +case $PROJECT_NAME in + + rvp_spn) + GENUS_ALL="streptococcus_pneumoniae" + ;; + +esac + #----------------------------------------------# # Create/update necessary environments PATH_MAMBA_YAML="envs/mamba.yaml" @@ -56,7 +64,7 @@ fi set -euo pipefail -python population.py -i "${input_dir}" -o "${output_dir}" +python population.py --queue "${QUEUE}"-i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}" result=$? From 7f0833a4f333c83230d986f4aa6a44658e3d12f7 Mon Sep 17 00:00:00 2001 From: Linda Visser Date: Wed, 9 Nov 2022 17:09:36 +0100 Subject: [PATCH 20/23] Fix: Correctly handle --queue argument. --- population.py | 13 +++++++++++++ run_pipeline.sh | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/population.py b/population.py index faf2529..1102205 100644 --- a/population.py +++ b/population.py @@ -25,6 +25,7 @@ def __init__( unlock=False, rerunincomplete=False, dryrun=False, + queue="bio", **kwargs ): PipelineStartup.__init__( @@ -38,6 +39,7 @@ def __init__( pipeline_version="0.1.0", output_dir=output_dir, workdir=pathlib.Path(__file__).parent.resolve(), + queue=queue, unlock=unlock, rerunincomplete=rerunincomplete, dryrun=dryrun, @@ -111,6 +113,16 @@ def __init__( action="store_true", help="Running pipeline locally (instead of in a computer cluster). Default is running it in a cluster.", ) + parser.add_argument( + "-q", + "--queue", + type = str, + required=False, + default = "bio", + metavar = "STR", + dest="queue", + help = "Name of the queue that the job will be sumitted to if working on a cluster." + ) # Snakemake arguments parser.add_argument( "-u", @@ -143,6 +155,7 @@ def __init__( species=args.species, db_dir=args.database, local=args.local, + queue=args.queue, unlock=args.unlock, rerunincomplete=args.rerunincomplete, dryrun=args.dryrun, diff --git a/run_pipeline.sh b/run_pipeline.sh index 2be1ce1..f8dbd46 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -64,7 +64,7 @@ fi set -euo pipefail -python population.py --queue "${QUEUE}"-i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}" +python population.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}" result=$? From 106ac17a0d98e12180f550c1c78e3bf0d9fbfac0 Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Tue, 15 Nov 2022 09:54:50 +0100 Subject: [PATCH 21/23] style: Format with black and snakefmt --- population.py | 8 ++++---- workflow/rules/aggregatePoppunkCsv.smk | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/population.py b/population.py index 1102205..2f61a34 100644 --- a/population.py +++ b/population.py @@ -116,12 +116,12 @@ def __init__( parser.add_argument( "-q", "--queue", - type = str, + type=str, required=False, - default = "bio", - metavar = "STR", + default="bio", + metavar="STR", dest="queue", - help = "Name of the queue that the job will be sumitted to if working on a cluster." 
+ help="Name of the queue that the job will be sumitted to if working on a cluster.", ) # Snakemake arguments parser.add_argument( diff --git a/workflow/rules/aggregatePoppunkCsv.smk b/workflow/rules/aggregatePoppunkCsv.smk index e068447..c901c2f 100644 --- a/workflow/rules/aggregatePoppunkCsv.smk +++ b/workflow/rules/aggregatePoppunkCsv.smk @@ -1,6 +1,9 @@ rule aggregate_poppunk_csv: input: - expand(OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", sample=SAMPLES), + expand( + OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv", + sample=SAMPLES, + ), output: OUT + "/poppunk_clusters.csv", log: @@ -12,7 +15,6 @@ rule aggregate_poppunk_csv: threads: config["threads"]["aggregatePoppunkCsv"] run: import pandas as pd - + aggregated_csv = pd.concat([pd.read_csv(f) for f in input], ignore_index=True) aggregated_csv.to_csv(output[0]) - From 931882c50b3b08ecf61650461305095da3a4ce2f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Tue, 15 Nov 2022 10:03:11 +0100 Subject: [PATCH 22/23] fix: Add default species "other" in run_pipeline.sh --- run_pipeline.sh | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/run_pipeline.sh b/run_pipeline.sh index f8dbd46..abf5536 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -6,9 +6,9 @@ set -euo pipefail # User parameters if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ] then - input_dir="${1}" - output_dir="${2}" - PROJECT_NAME="${irods_input_projectID}" + input_dir="${1}" + output_dir="${2}" + PROJECT_NAME="${irods_input_projectID}" else echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)." exit 1 @@ -16,17 +16,18 @@ fi if [ ! -d "${input_dir}" ] || [ ! -d "${output_dir}" ] then - echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist" - exit 1 + echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist" + exit 1 else - input_fastq="${input_dir}/clean_fastq" + input_fastq="${input_dir}/clean_fastq" fi case $PROJECT_NAME in - rvp_spn) - GENUS_ALL="streptococcus_pneumoniae" - ;; + rvp_spn) + GENUS_ALL="streptococcus_pneumoniae";; + *) + GENUS_ALL="other";; esac From 081ffa477a1a3798a004247d21fd79987b951e8f Mon Sep 17 00:00:00 2001 From: Karim Hajji Date: Tue, 15 Nov 2022 10:04:01 +0100 Subject: [PATCH 23/23] refactor: Rename master environment --- envs/population_master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/population_master.yaml b/envs/population_master.yaml index 25581c5..50cff10 100644 --- a/envs/population_master.yaml +++ b/envs/population_master.yaml @@ -1,4 +1,4 @@ -name: population_master +name: juno_population channels: - bioconda - conda-forge