Skip to content

Commit

Permalink
Merge pull request #1 from RIVM-bioinformatics/create_rule
Browse files Browse the repository at this point in the history
Add PopPUNK
  • Loading branch information
wolthuisr authored Nov 16, 2022
2 parents 567bed3 + 081ffa4 commit 7337b16
Show file tree
Hide file tree
Showing 21 changed files with 191 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
id: release
with:
release-type: python # just keep a changelog, no version anywhere outside of git tags
package-name: juno_template
package-name: juno_population
lint:
name: Lint Code Base
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,5 @@ dmypy.json
envs/src
input
output
config/sample_sheet.yaml
config/user_parameters.yaml
5 changes: 3 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "juno-library"]
[submodule "base_juno_pipeline"]
path = base_juno_pipeline
url = https://github.com/RIVM-bioinformatics/base_juno_pipeline.git
url = https://github.com/RIVM-bioinformatics/juno-library.git
branch = v0.9.2
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Juno-Template
# Juno-Population
- [ ] TODO: Write readme
A template pipeline on which the other Juno pipelines are based.

Before running the pipeline be sure to initialize the submodules:
```bash
git submodule update --init --recursive
```
## Contribution guidelines
Juno pipelines use a [feature branch workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow). To work on features, create a branch from the `main` branch to make changes to. This branch can be merged to the main branch via a pull request. Hotfixes for bugs can be committed to the `main` branch.

Expand Down
11 changes: 6 additions & 5 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import yaml


sample_sheet=config["sample_sheet"]
sample_sheet = config["sample_sheet"]
with open(sample_sheet) as f:
SAMPLES = yaml.safe_load(f)

print(SAMPLES)

OUT = config["out"]


localrules:
all,


include: "workflow/rules/rule.smk"
include: "workflow/rules/aggregatePoppunkCsv.smk"
include: "workflow/rules/createQfileFasta.smk"
include: "workflow/rules/PopPUNK.smk"


rule all:
input:
expand(OUT + "/{sample}_combined.fastq", sample=SAMPLES),
expand(OUT + "/poppunk_clusters.csv"),
8 changes: 6 additions & 2 deletions config/pipeline_parameters.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
threads:
template_rule: 1
create_Qfile: 1
fasta_popPUNK: 8
aggregatePoppunkCsv: 1

mem_gb:
template_rule: 1
create_Qfile: 1
fasta_popPUNK: 1
aggregatePoppunkCsv: 1
3 changes: 0 additions & 3 deletions config/sample_sheet.yaml

This file was deleted.

2 changes: 0 additions & 2 deletions config/user_parameters.yaml

This file was deleted.

7 changes: 7 additions & 0 deletions database_locations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import pathlib

# Maps a species identifier (lowercase, underscore-separated — the value
# passed to the pipeline via -s/--species) to the on-disk location of its
# pre-built PopPUNK reference database. Used as a fallback when no explicit
# -b/--database path is given.
species_database_locations = {
    "streptococcus_pneumoniae": pathlib.Path(
        "/mnt/db/juno/poppunk/streptococcus/GPS_v4_references"
    ),
}
2 changes: 1 addition & 1 deletion envs/mamba.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ name: mamba
channels:
- conda-forge
dependencies:
- mamba
- mamba==0.27
11 changes: 11 additions & 0 deletions envs/population_master.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: juno_population
channels:
- bioconda
- conda-forge
- anaconda
- defaults
dependencies:
- git
- mamba==0.27
- pandas
- snakemake
14 changes: 0 additions & 14 deletions envs/template_master.yaml

This file was deleted.

60 changes: 56 additions & 4 deletions template.py → population.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pathlib
from pickle import DUP
import yaml
import argparse
import sys
Expand All @@ -10,16 +11,21 @@
helper_functions,
)

from database_locations import species_database_locations

class TemplateRun(PipelineStartup, RunSnakemake):

class PopulationRun(PipelineStartup, RunSnakemake):
def __init__(
self,
input_dir,
output_dir,
input_type="fastq",
species=None,
db_dir=None,
input_type="fasta",
unlock=False,
rerunincomplete=False,
dryrun=False,
queue="bio",
**kwargs
):
PipelineStartup.__init__(
Expand All @@ -29,22 +35,41 @@ def __init__(
)
RunSnakemake.__init__(
self,
pipeline_name="template",
pipeline_name="population",
pipeline_version="0.1.0",
output_dir=output_dir,
workdir=pathlib.Path(__file__).parent.resolve(),
queue=queue,
unlock=unlock,
rerunincomplete=rerunincomplete,
dryrun=dryrun,
**kwargs,
)

# Specific Juno-Population pipeline attributes
if not db_dir:
self.db_dir = species_database_locations.get(species)
if not self.db_dir:
raise KeyError(
"Cannot determine db_dir: This species is currently not configured AND no db_dir was provided. Manually provide a db_dir via -b/--database, or ask for your species to be configured."
)

self.user_parameters = pathlib.Path("config/user_parameters.yaml")

# Start pipeline
self.start_juno_pipeline()

# Create user_parameters.yaml and sample_sheet.yaml files
self.config_params = {
"input_dir": str(self.input_dir),
"out": str(self.output_dir),
"db_dir": str(self.db_dir),
}
with open(self.user_parameters, "w") as f:
yaml.dump(self.config_params, f, default_flow_style=False)

with open(self.sample_sheet, "w") as f:
yaml.dump(self.sample_dict, f, default_flow_style=False)
self.run_snakemake()


Expand All @@ -68,12 +93,36 @@ def __init__(
default="output",
help="Relative or absolute path to the output directory. If non is given, an 'output' directory will be created in the current directory.",
)
parser.add_argument(
"-s",
"--species",
default=None,
required=False,
help="The species name, use an underscore instead of a space (e.g. streptococcus_pneumoniae). Check the publicly available popPUNK databases on www.poppunk.net/pages/databases.html",
)
parser.add_argument(
"-b",
"--database",
default=None,
required=False,
help="The path to the popPUNK database to use. This overrides information provide with the --species argument.",
)
parser.add_argument(
"-l",
"--local",
action="store_true",
help="Running pipeline locally (instead of in a computer cluster). Default is running it in a cluster.",
)
parser.add_argument(
"-q",
"--queue",
type=str,
required=False,
default="bio",
metavar="STR",
dest="queue",
help="Name of the queue that the job will be sumitted to if working on a cluster.",
)
# Snakemake arguments
parser.add_argument(
"-u",
Expand All @@ -100,10 +149,13 @@ def __init__(
help="Extra arguments to be passed to snakemake API (https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html).",
)
args = parser.parse_args()
TemplateRun(
PopulationRun(
input_dir=args.input,
output_dir=args.output,
species=args.species,
db_dir=args.database,
local=args.local,
queue=args.queue,
unlock=args.unlock,
rerunincomplete=args.rerunincomplete,
dryrun=args.dryrun,
Expand Down
27 changes: 18 additions & 9 deletions run_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,37 @@ set -euo pipefail

#----------------------------------------------#
# User parameters
if [ ! -z "${1}" ] || [ ! -z "${2}" ] #|| [ ! -z "${irods_input_projectID}" ]
if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ]
then
input_dir="${1}"
output_dir="${2}"
# PROJECT_NAME="${irods_input_projectID}"
input_dir="${1}"
output_dir="${2}"
PROJECT_NAME="${irods_input_projectID}"
else
echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)."
exit 1
fi

if [ ! -d "${input_dir}" ] || [ ! -d "${output_dir}" ]
then
echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist"
exit 1
echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist"
exit 1
else
input_fastq="${input_dir}/clean_fastq"
input_fastq="${input_dir}/clean_fastq"
fi

case $PROJECT_NAME in

rvp_spn)
GENUS_ALL="streptococcus_pneumoniae";;
*)
GENUS_ALL="other";;

esac

#----------------------------------------------#
# Create/update necessary environments
PATH_MAMBA_YAML="envs/mamba.yaml"
PATH_MASTER_YAML="envs/template_master.yaml"
PATH_MASTER_YAML="envs/population_master.yaml"
MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')

Expand Down Expand Up @@ -56,7 +65,7 @@ fi

set -euo pipefail

python template.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}"
python population.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}" -s "${GENUS_ALL}"

result=$?

Expand Down
Empty file removed workflow/envs/env.yaml
Empty file.
11 changes: 11 additions & 0 deletions workflow/envs/poppunk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: poppunk
channels:
- bioconda
- conda-forge
- anaconda
- defaults
dependencies:
- popPUNK
# Joblib 1.2.0 breaks HDBscan clustering that is used by popPUNK.
# Temporarily pin to v1.1, beware of vulnerability that triggered release of joblib v1.2 https://nvd.nist.gov/vuln/detail/CVE-2022-21797
- joblib==1.1.0
29 changes: 29 additions & 0 deletions workflow/rules/PopPUNK.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Assign a single sample's assembly to a PopPUNK cluster by querying a
# pre-built reference database (config["db_dir"]).
rule assign_popPUNK_cluster:
    input:
        # Per-sample query file: one "<sampleID>\t<fasta path>" line
        # (produced by rule create_Qfile_fasta).
        OUT + "/q_files/{sample}_qfile.txt",
    output:
        # poppunk_assign writes all of its result files into this directory.
        output_dir=directory(OUT + "/results_per_sample/{sample}_poppunk/"),
        # NOTE(review): the individual file names below assume poppunk_assign
        # names its outputs after the basename of --output — confirm against
        # the poppunk_assign version pinned in ../envs/poppunk.yaml.
        output_csv=OUT
        + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv",
        output_pkl=OUT
        + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.pkl",
        output_npy=OUT
        + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.dists.npy",
        output_h5=OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk.h5",
    log:
        OUT + "/log/{sample}_poppunk.log",
    conda:
        "../envs/poppunk.yaml"
    message:
        "Running popPUNK clustering"
    params:
        db_dir=config["db_dir"],
    resources:
        mem_gb=config["mem_gb"]["fasta_popPUNK"],
    threads: config["threads"]["fasta_popPUNK"]
    shell:
        # Only stderr is captured in the log; stdout goes to the console.
        """
        poppunk_assign \
        --db {params.db_dir} \
        --threads {threads} --query {input} --output {output.output_dir} 2> {log}
        """
20 changes: 20 additions & 0 deletions workflow/rules/aggregatePoppunkCsv.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Concatenate the per-sample PopPUNK cluster CSVs into one pipeline-level
# summary CSV (the final target of rule all).
rule aggregate_poppunk_csv:
    input:
        expand(
            OUT + "/results_per_sample/{sample}_poppunk/{sample}_poppunk_clusters.csv",
            sample=SAMPLES,
        ),
    output:
        OUT + "/poppunk_clusters.csv",
    log:
        # NOTE(review): declared but never written to by the run block below.
        OUT + "/log/summarize.log",
    message:
        "Merging individual popPUNK output to one csv."
    resources:
        mem_gb=config["mem_gb"]["aggregatePoppunkCsv"],
    threads: config["threads"]["aggregatePoppunkCsv"]
    run:
        import pandas as pd

        # Row-wise concatenation of all per-sample CSVs; ignore_index renumbers
        # the rows continuously across samples.
        aggregated_csv = pd.concat([pd.read_csv(f) for f in input], ignore_index=True)
        # NOTE(review): to_csv also writes the integer index as an unnamed
        # first column — confirm downstream consumers expect that, otherwise
        # pass index=False.
        aggregated_csv.to_csv(output[0])
14 changes: 14 additions & 0 deletions workflow/rules/createQfileFasta.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rule create_Qfile_fasta:
    """Create popPUNKs required query file, a textfile containing sampleID and location of fasta"""
    # TODO: popPUNK can also work from fastqs. This approach completely ignores this.
    input:
        # Path to this sample's assembly, looked up in the sample sheet dict.
        lambda wc: SAMPLES[wc.sample]["assembly"],
    output:
        OUT + "/q_files/{sample}_qfile.txt",
    resources:
        mem_gb=config["mem_gb"]["create_Qfile"],
    threads: config["threads"]["create_Qfile"]
    shell:
        # Writes one tab-separated line: "<sampleID>\t<absolute fasta path>".
        # NOTE(review): basename only strips a literal ".fasta" suffix —
        # assemblies named *.fa / *.fna would keep their extension in the
        # sample ID; confirm the input naming convention.
        """
        printf "$(basename {input} .fasta)\t$(realpath {input})\n" > {output}
        """
17 changes: 0 additions & 17 deletions workflow/rules/rule.smk

This file was deleted.

Loading

0 comments on commit 7337b16

Please sign in to comment.