diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index ff16024..8a71a1b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -15,7 +15,7 @@ jobs:
         id: release
         with:
           release-type: python # just keep a changelog, no version anywhere outside of git tags
-          package-name: juno_template
+          package-name: juno_assembly_verification
   lint:
     name: Lint Code Base
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index a4802bc..6d8c030 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,28 @@
-# Juno-Template
-A template pipeline where the other juno pipelines are based on.
+# Juno-assembly verification
+
+This pipeline aims to make the verification of Juno-assembly versions easier.
+
+Validation and verification are organised around pipeline functionalities, which are managed by different groups. Each group has its own criteria for validation and verification, and this workflow is organised around that concept: each group has its own verification workflow, separate from the others.
+
+Configuration files and reference data are stored in /mnt/db/juno and are under version control of a local (not pushed to a remote) git repository.
+
+## General components for verification workflows
+
+General components used by workflows (an illustrative example of both config files follows this list):
+
+- `config/config_GROUP.tsv`: config file indicating which columns should be compared with which operator. E.g. "# contigs" from the results file (`result_col`) should be less than or equal to (`operator`) 150 contigs for sample X (`criterium_col`). Thresholds/values per sample are defined in `config/verification_criteria_GROUP.tsv`.
+- `config/verification_criteria_GROUP.tsv`: defines per sample which thresholds/values the results should be compared to. The `criterium_col` from `config/config_GROUP.tsv` should be present in this file.
+- `rules/GROUP/verify_qc.smk`: Snakemake rule which calls `workflow/scripts/verify_accuracy.py`. The Python script uses `config/config_GROUP.tsv`, `config/verification_criteria_GROUP.tsv` and result files, and compares these to assign TRUE/FALSE indicating whether a threshold/value was met. TO DO: separate the parsing/summarising of results from the actual comparison; this now happens in a single script.
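+
+An illustrative example of both files (threshold column names such as `max_contigs` are hypothetical; columns are tab-separated):
+
+```
+# config_GROUP.tsv
+comparison_name  result_col        operator  criterium_col
+contigs_ok       # contigs         <=        max_contigs
+coverage_ok      Average coverage  >=        min_coverage
+
+# verification_criteria_GROUP.tsv
+sample   organism          max_contigs  min_coverage
+sampleX  escherichia_coli  150          30
+```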
+
+Specific components used:
+
+- The AMR workflow also compares typing results. PlasmidFinder and ResFinder containers (including fixed database versions) are used to type assemblies. Relevant data (gene name, nucleotide identity, reference coverage and accession number) are written to a sorted table, of which hashes are compared with reference hashes. A hash of the raw output file is not used, as the table order can differ between runs, as can the positions of hits within an assembly.
+- The AMR workflow also outputs a folder `de_novo_assembly_filtered` with assemblies renamed to reflect the Juno-assembly version in the file name. This folder can be used by an automated SeqSphere script to type assemblies according to MLST and wgMLST.
 
 ## Contribution guidelines
+This pipeline is based on Juno-template.
+
 Juno pipelines use a [feature branch workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow). To work on features, create a branch from the `main` branch to make changes to. This branch can be merged to the main branch via a pull request. Hotfixes for bugs can be committed to the `main` branch.
 
 Please adhere to the [conventional commits](https://www.conventionalcommits.org/) specification for commit messages. These commit messages can be picked up by [release please](https://github.com/googleapis/release-please) to create meaningful release messages.
diff --git a/Snakefile b/Snakefile
index cd1fde2..b081079 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,25 +1,115 @@
 import yaml
+import pandas as pd
+
+def select_samples_per_group(verification_table, sample_sheet_dict):
+    """
+    Select a subset of samples based on a verification table.
+
+    Args:
+        verification_table: path to the verification table to parse sample names from.
+        sample_sheet_dict: dict with sample names as keys, generated by juno-library.
+
+    Returns:
+        List of sample names present in both sample_sheet_dict and the verification table.
+    """
+    df = pd.read_csv(verification_table, sep='\t')
+    selected_samples = []
+    for sample in df['sample']:
+        if sample in sample_sheet_dict.keys():
+            selected_samples.append(sample)
+    return selected_samples
 
 sample_sheet=config["sample_sheet"]
 with open(sample_sheet) as f:
     SAMPLES = yaml.safe_load(f)
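+# The sample sheet maps sample names to file paths. An illustrative entry (the
+# exact keys are generated by juno-library; the rules below use 'R1', 'R2' and
+# 'assembly'):
+#   sample1:
+#     R1: input/sample1_R1.fastq.gz
+#     R2: input/sample1_R2.fastq.gz
+#     assembly: input/sample1.fasta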
 
-for param in ["threads", "mem_gb"]:
-    for k in config[param]:
-        config[param][k] = int(config[param][k])
-
-# print(SAMPLES)
+INPUT = config["input_dir"]
+OUT = config["out"]
 
-OUT = config["output_dir"]
 localrules:
     all,
+    verify_accuracy_BVI,
+    verify_accuracy_AMR,
+    collect_and_rename,
+    select_cols_resfinder,
+    select_cols_plasmidfinder,
+    get_hashes_resfinder,
+    get_hashes_plasmidfinder,
+    typing_report,
+    display_typing_discrepancies,
+    generate_assembly_hash,
+    generate_clean_reads_hash,
+    generate_id_species_hash,
+    generate_qc_assembly_hash,
+
+
+####################################################
+### BVI verification                             ###
+####################################################
+
+BVI_SAMPLES = select_samples_per_group("config/verification_criteria_BVI.tsv", SAMPLES)  # has to be implemented in main wrapper
+
+# Create an empty list and append to it.
+# If no samples from the input folder are also in the verification table,
+# Snakemake will not attempt to make output for this group of samples.
+BVI_OUTPUT = []
+
+if len(BVI_SAMPLES) > 0:
+    BVI_OUTPUT.append(OUT + "/verification_subreports/verify_accuracy_BVI.tsv")
+    BVI_OUTPUT.append(OUT + "/verification_subreports/BVI_assembly_hashes.txt")
+    BVI_OUTPUT.append(OUT + "/verification_subreports/BVI_clean_reads_hashes.txt")
+    BVI_OUTPUT.append(OUT + "/verification_subreports/BVI_id_species_hashes.txt")
+    BVI_OUTPUT.append(OUT + "/verification_subreports/BVI_qc_assembly_hashes.txt")
+
+# Include BVI rules
+include: "workflow/rules/BVI/verify_qc.smk"
+include: "workflow/rules/BVI/generate_hashes.smk"
+
+####################################################
+### AMR verification                             ###
+####################################################
-include: "workflow/rules/rule.smk"
+
+AMR_SAMPLES = select_samples_per_group("config/verification_criteria_AMR.tsv", SAMPLES)  # has to be implemented in main wrapper
+
+# Create an empty list and append to it.
+# If no samples from the input folder are also in the verification table,
+# Snakemake will not attempt to make output for this group of samples.
+AMR_OUTPUT = []
+
+if len(AMR_SAMPLES) > 0:
+    AMR_OUTPUT.append(OUT + "/verification_subreports/verify_accuracy_AMR.tsv")
+    AMR_OUTPUT.append(OUT + "/de_novo_assembly_filtered")
+    AMR_OUTPUT.append(OUT + "/discrepancies")
+
+# Include AMR rules
+include: "workflow/rules/AMR/typing.smk"
+include: "workflow/rules/AMR/copy_assemblies.smk"
+include: "workflow/rules/AMR/verify_qc.smk"
+
+####################################################
+### RVP verification                             ###
+####################################################
+
+# RVP_SAMPLES = select_samples_per_group("config/verification_criteria_RVP.tsv", SAMPLES)  # has to be implemented in main wrapper
+
+# RVP_OUTPUT = []
+
+# if len(RVP_SAMPLES) > 0:
+#     RVP_OUTPUT.append(OUT + "/verification_subreports/verify_qc_RVP.tsv")
+
+# # Include rules
+# include: "workflow/rules/RVP/verify_qc.smk"
 
 rule all:
     input:
-        expand(OUT + "/{sample}_combined.fastq", sample=SAMPLES),
+        BVI_OUTPUT,
+        AMR_OUTPUT,
+        OUT + "/git_version.txt"
+
+rule get_git_version:
+    output:
+        OUT + "/git_version.txt"
+    params:
+        reference_dir = config["reference_dir"],
+        config_dir = config["config_dir"],
+    shell:
+        """
+date | tr '\n' '\t' > {output}
+git --git-dir {params.reference_dir}/../.git log -n 1 --pretty=format:"%H" >> {output}
+        """
\ No newline at end of file
diff --git a/config/pipeline_parameters.yaml b/config/pipeline_parameters.yaml
index d240a8f..a909f5a 100644
--- a/config/pipeline_parameters.yaml
+++ b/config/pipeline_parameters.yaml
@@ -1,5 +1,5 @@
 threads:
-  template_rule: 1
+  typing: 2
 mem_gb:
-  template_rule: 1
+  typing: 4
diff --git a/config/sample_sheet.yaml b/config/sample_sheet.yaml
deleted file mode 100644
index b0a7513..0000000
--- a/config/sample_sheet.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-'1':
-  R1: 'input/1_R1.fastq'
-  R2: 'input/1_R2.fastq'
diff --git a/config/user_parameters.yaml b/config/user_parameters.yaml
deleted file mode 100644
index 2311c9d..0000000
--- a/config/user_parameters.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-input_dir: input
-out: output
diff --git a/envs/juno_assembly_verification.yaml b/envs/juno_assembly_verification.yaml
new file mode 100644
index 0000000..376b213
--- /dev/null
+++ b/envs/juno_assembly_verification.yaml
@@ -0,0 +1,16 @@
+name: juno_assembly_verification
+channels:
+  - bioconda
+  - conda-forge
+  - anaconda
+  - defaults
+dependencies:
+  - git
+  - mamba
+  - pandas
+  - snakemake
+  - pyyaml
+  - pip
+  - openpyxl
+  - pip:
+    - "--editable=git+https://github.com/RIVM-bioinformatics/base_juno_pipeline.git@v0.2.9#egg=base_juno"
diff --git a/envs/template_master.yaml b/envs/template_master.yaml
deleted file mode 100644
index c75d997..0000000
--- a/envs/template_master.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-name: juno_template
-channels:
-  - conda-forge
-  - bioconda
-  - anaconda
-  - biocore
-  - defaults
-dependencies:
-  - git=2.40.*
-  - mamba=1.3.*
-  - pandas=1.5.*
-  - snakemake=7.18.*
-  - pip=23.*
-  - python=3.11.*
-  - pip:
-    - "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.0.0#egg=juno_library"
diff --git a/juno_assembly_verification.py b/juno_assembly_verification.py
new file mode 100644
index 0000000..d5a8d03
--- /dev/null
+++ b/juno_assembly_verification.py
@@ -0,0 +1,225 @@
+from base_juno_pipeline import *
+import argparse
+import os
+import pathlib
+import sys
+import yaml
+
+from version import __package_name__, __version__, __description__
+
+class JunoAssemblyVerificationRun(base_juno_pipeline.PipelineStartup,
+                                  base_juno_pipeline.RunSnakemake):
+    def __init__(
+        self,
+        input_dir,
+        output_dir,
+        input_type="fasta",
+        cores=300,
+        time_limit=60,
+        local=False,
+        queue='bio',
+        unlock=False,
+        rerunincomplete=False,
+        dryrun=False,
+        run_in_container=False,
+        prefix=None,
+        **kwargs
+    ):
+        output_dir = pathlib.Path(output_dir).resolve()
+        workdir = pathlib.Path(__file__).parent.resolve()
+        self.path_to_audit = output_dir.joinpath('audit_trail')
+        base_juno_pipeline.PipelineStartup.__init__(
+            self,
+            input_dir=pathlib.Path(input_dir).resolve(),
+            input_type="fasta",
+        )
+        base_juno_pipeline.RunSnakemake.__init__(
+            self,
+            pipeline_name=__package_name__,
+            pipeline_version=__version__,
+            output_dir=output_dir,
+            workdir=pathlib.Path(__file__).parent.resolve(),
+            cores=cores,
+            local=local,
+            time_limit=time_limit,
+            queue=queue,
+            unlock=unlock,
+            rerunincomplete=rerunincomplete,
+            dryrun=dryrun,
+            useconda=not run_in_container,
+            conda_prefix=prefix if not run_in_container else None,
+            usesingularity=run_in_container,
+            singularityargs=f"--bind {self.input_dir}:{self.input_dir} --bind {output_dir}:{output_dir}" if run_in_container else "",
+            singularity_prefix=prefix if run_in_container else None,
+            restarttimes=1,
+            latency_wait=60,
+            name_snakemake_report=str(
+                self.path_to_audit.joinpath("juno_assembly_verification_report.html")
+            ),
+            **kwargs,
+        )
+
+        self.run_juno_assembly_verification_pipeline()
+
+    # def update_sample_dict_with_metadata(self):
+    #     self.__check_genus_is_supported()
+    #     self.get_metadata_from_csv_file(filepath=self.metadata_file, expected_colnames=['sample', 'genus'])
+    #     for sample in self.sample_dict:
+    #         try:
+    #             self.sample_dict[sample]['genus'] = self.juno_metadata[sample]['genus'].strip().lower()
+    #         except (KeyError, TypeError):
+    #             self.sample_dict[sample]['genus'] = self.genus
+
+    def start_juno_assembly_pipeline(self):
+        """
+        Start the pipeline (some steps from PipelineStartup need to be
+        modified for the Juno-assembly pipeline to accept metadata).
+        """
+        self.start_juno_pipeline()
+        # self.update_sample_dict_with_metadata()
+        with open(self.sample_sheet, 'w') as file:
+            yaml.dump(self.sample_dict, file, default_flow_style=False)
+
+    def write_userparameters(self):
+        config_params = {'input_dir': str(self.input_dir),
+                         'out': str(self.output_dir),
+                         'run_in_container': self.usesingularity}
+
+        with open(self.user_parameters, 'w') as file:
+            yaml.dump(config_params, file, default_flow_style=False)
+
+        return config_params
+
+    def run_juno_assembly_verification_pipeline(self):
+        self.start_juno_assembly_pipeline()
+        self.user_params = self.write_userparameters()
+        self.get_run_info()
+        self.successful_run = self.run_snakemake()
+        assert self.successful_run, 'Please check the log files'
+        if not self.dryrun or self.unlock:
+            self.make_snakemake_report()
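+
+# write_userparameters() dumps the config consumed by the Snakefile as YAML,
+# e.g. (illustrative values):
+#   input_dir: /data/juno_assembly_output
+#   out: output
+#   run_in_container: false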
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Juno assembly verification pipeline."
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        type=pathlib.Path,
+        required=True,
+        metavar="DIR",
+        help="Relative or absolute path to the input directory. It must either be the output directory of the Juno-assembly pipeline or contain all the raw read (fastq) and assembly (fasta) files for the samples to be processed.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=pathlib.Path,
+        metavar="DIR",
+        default="output",
+        help="Relative or absolute path to the output directory. If none is given, an 'output' directory will be created in the current directory.",
+    )
+    parser.add_argument(
+        "-l",
+        "--local",
+        action="store_true",
+        help="Run the pipeline locally (instead of on a computer cluster). Default is running it on a cluster.",
+    )
+    parser.add_argument(
+        "--no-containers",
+        action='store_false',
+        help="Use conda environments instead of containers."
+    )
+    parser.add_argument(
+        "-p",
+        "--prefix",
+        type=str,
+        metavar="PATH",
+        default=None,
+        help="Conda or singularity prefix: a path to the place where you want to store the conda environments or the singularity images."
+    )
+    parser.add_argument(
+        "-q",
+        "--queue",
+        type=str,
+        metavar="STR",
+        default='bio',
+        help='Name of the queue that the job will be submitted to if working on a cluster.'
+    )
+    parser.add_argument(
+        "-c",
+        "--cores",
+        type=int,
+        metavar="INT",
+        default=4 if '-l' in sys.argv or '--local' in sys.argv else 300,
+        help="Number of cores to use. Default is 4 if running locally (--local) or 300 otherwise."
+    )
+    parser.add_argument(
+        "-tl",
+        "--time-limit",
+        type=int,
+        metavar="INT",
+        default=60,
+        help="Time limit per job in minutes (passed as -W argument to bsub). Jobs will be killed if not finished in this time."
+    )
+    parser.add_argument(
+        "-rd",
+        "--reference-directory",
+        type=pathlib.Path,
+        metavar="DIR",
+        default=pathlib.Path("/mnt/db/juno/assembly_verification/ref_typing"),
+        help="Directory where reference typing data is stored."
+    )
+    parser.add_argument(
+        "-cd",
+        "--config-directory",
+        type=pathlib.Path,
+        metavar="DIR",
+        default=pathlib.Path("/mnt/db/juno/assembly_verification/config"),
+        help="Directory where configuration files are stored."
+    )
+    # Snakemake arguments
+    parser.add_argument(
+        "-u",
+        "--unlock",
+        action="store_true",
+        help="Unlock output directory (passed to snakemake).",
+    )
+    parser.add_argument(
+        "-n",
+        "--dryrun",
+        action="store_true",
+        help="Dry run printing steps to be taken in the pipeline without actually running it (passed to snakemake).",
+    )
+    parser.add_argument(
+        "--rerunincomplete",
+        action="store_true",
+        help="Re-run jobs if they are marked as incomplete (passed to snakemake).",
+    )
+    parser.add_argument(
+        "--snakemake-args",
+        nargs="*",
+        default={},
+        action=helper_functions.SnakemakeKwargsAction,
+        help="Extra arguments to be passed to the snakemake API (https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html).",
+    )
+    args = parser.parse_args()
+    juno_assembly_verification_run = JunoAssemblyVerificationRun(
+        input_dir=args.input,
+        output_dir=args.output,
+        reference_dir=args.reference_directory,
+        config_dir=args.config_directory,
+        cores=args.cores,
+        time_limit=args.time_limit,
+        local=args.local,
+        queue=args.queue,
+        unlock=args.unlock,
+        rerunincomplete=args.rerunincomplete,
+        run_in_container=args.no_containers,
+        prefix=args.prefix,
+        dryrun=args.dryrun,
+        **args.snakemake_args,
+    )
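+
+# Example invocation (illustrative paths):
+#   python juno_assembly_verification.py \
+#     -i /data/juno_assembly_output \
+#     -o verification_output \
+#     --local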
diff --git a/run_pipeline.sh b/run_pipeline.sh
index 56cb9f6..a23dd47 100755
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -23,25 +23,35 @@ else
 fi
 
 #----------------------------------------------#
-# Create/update necessary environments
-PATH_MAMBA_YAML="envs/mamba.yaml"
-PATH_MASTER_YAML="envs/template_master.yaml"
-MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
-MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')
+## make sure conda works
 
-echo -e "\nUpdating necessary environments to run the pipeline..."
+# >>> conda initialize >>>
+# !! Contents within this block are managed by 'conda init' !!
+__conda_setup="$('/mnt/miniconda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
+if [ $? -eq 0 ]; then
+    eval "$__conda_setup"
+else
+    if [ -f "/mnt/miniconda/etc/profile.d/conda.sh" ]; then
+        . "/mnt/miniconda/etc/profile.d/conda.sh"
+    else
+        export PATH="/mnt/miniconda/bin:$PATH"
+    fi
+fi
+unset __conda_setup
+# <<< conda initialize <<<
-def main() -> None:
-    juno_template = JunoTemplate()
-    juno_template.run()
-
-@dataclass
-class JunoTemplate(Pipeline):
-    pipeline_name: str = __package_name__
-    pipeline_version: str = __version__
-    input_type: str = "fastq"
-
-    def _add_args_to_parser(self) -> None:
-        super()._add_args_to_parser()
-
-        self.parser.description = "Template juno pipeline. If you see this message please change it to something appropriate"
-
-        self.add_argument(
-            "--example-option",
-            dest="example",
-            type=str,
-            required=False,
-            metavar="STR",
-            help="This is an optional argument, specific for this pipeline. General options are included in juno-library.",
-        )
-
-    def _parse_args(self) -> argparse.Namespace:
-        args = super()._parse_args()
-
-        # Optional arguments are loaded into self here
-        self.example: bool = args.example
-
-        return args
-
-    # Extra class methods for this pipeline can be defined here
-    def example_class_method(self):
-        print(f"example option is set to {self.example}")
-
-    def setup(self) -> None:
-        super().setup()
-
-        if self.snakemake_args["use_singularity"]:
-            self.snakemake_args["singularity_args"] = " ".join(
-                [
-                    self.snakemake_args["singularity_args"]
-                ]  # paths that singularity should be able to read from can be bound by adding to the above list
-            )
-
-        # Extra class methods for this pipeline can be invoked here
-        if self.example:
-            self.example_class_method()
-
-        with open(
-            Path(__file__).parent.joinpath("config/pipeline_parameters.yaml")
-        ) as f:
-            parameters_dict = yaml.safe_load(f)
-        self.snakemake_config.update(parameters_dict)
-
-        self.user_parameters = {
-            "input_dir": str(self.input_dir),
-            "output_dir": str(self.output_dir),
-            "exclusion_file": str(self.exclusion_file),
-            "example": str(self.example),  # other user parameters can be included in user_parameters.yaml here
-        }
-
-
-if __name__ == "__main__":
-    main()
diff --git a/version.py b/version.py
index e5d8a19..c125731 100644
--- a/version.py
+++ b/version.py
@@ -1,6 +1,6 @@
-__package_name__ = "juno_template"
-__authors__ = "author"
-__email__ = "author@rivm.nl"
+__package_name__ = "juno_assembly_verification"
+__authors__ = "Boas van der Putten"
+__email__ = "ids-bioinformatics@rivm.nl"
 __license__ = "AGPLv3"
-__version__ = "1.0"
-__description__ = "Template juno pipeline. If you see this message please change it to something appropriate"
\ No newline at end of file
+__version__ = "0.0.1"
+__description__ = "Juno-assembly verification pipeline, used to verify updates in Juno-assembly"
\ No newline at end of file
diff --git a/workflow/rules/AMR/copy_assemblies.smk b/workflow/rules/AMR/copy_assemblies.smk
new file mode 100644
index 0000000..eea6f7a
--- /dev/null
+++ b/workflow/rules/AMR/copy_assemblies.smk
@@ -0,0 +1,16 @@
+rule collect_and_rename:
+    input:
+        assemblies = [SAMPLES[sample]['assembly'] for sample in AMR_SAMPLES],
+        log_pipeline = INPUT + "/audit_trail/log_pipeline.yaml"
+    output:
+        directory(OUT + "/de_novo_assembly_filtered")
+    log:
+        OUT + "/log/collect_and_rename.log",
+    shell:
+        """
+python workflow/scripts/batch_rename.py \
+    --input {input.assemblies} \
+    --output {output} \
+    --log-pipeline {input.log_pipeline} > {log} 2>&1
+        """
\ No newline at end of file
diff --git a/workflow/rules/AMR/typing.smk b/workflow/rules/AMR/typing.smk
new file mode 100644
index 0000000..9a819da
--- /dev/null
+++ b/workflow/rules/AMR/typing.smk
@@ -0,0 +1,127 @@
+rule resfinder:
+    input:
+        assembly = lambda wildcards: SAMPLES[wildcards.sample]["assembly"],
+    output:
+        OUT + "/typing/resfinder/{sample}/ResFinder_results_tab.txt"
+    message:
+        "Running ResFinder on {wildcards.sample}."
+    container:
+        "docker://genomicepidemiology/resfinder:4.3.1"
+    threads: config["threads"]["typing"]
+    resources:
+        mem_gb=config["mem_gb"]["typing"],
+    log:
+        OUT + "/log/typing/resfinder/{sample}.log",
+    params:
+        outdir = OUT + "/typing/resfinder/{sample}"
+    shell:
+        """
+python -m resfinder -ifa {input.assembly} \
+    -o {params.outdir} \
+    -acq \
+    -l 0.9 \
+    -t 0.9 > {log} 2>&1
+        """
+
+rule plasmidfinder:
+    input:
+        assembly = lambda wildcards: SAMPLES[wildcards.sample]["assembly"],
+    output:
+        OUT + "/typing/plasmidfinder/{sample}/results_tab.tsv"
+    message:
+        "Running PlasmidFinder on {wildcards.sample}."
+    container:
+        "docker://staphb/plasmidfinder:2.1.6"
+    threads: config["threads"]["typing"]
+    resources:
+        mem_gb=config["mem_gb"]["typing"],
+    params:
+        outdir = OUT + "/typing/plasmidfinder/{sample}"
+    log:
+        OUT + "/log/typing/plasmidfinder/{sample}.log",
+    shell:
+        """
+mkdir -p {params.outdir}
+plasmidfinder.py -i {input.assembly} \
+    -o {params.outdir} \
+    -x \
+    -l 0.9 \
+    -t 0.9 > {log} 2>&1
+        """
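+
+# NOTE: the select_cols_* rules below keep only the columns relevant for
+# verification and sort on them (see workflow/scripts/select_relevant_cols.py),
+# so that differences in hit order between runs do not change the hashes
+# computed downstream.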
+
+rule select_cols_resfinder:
+    input:
+        OUT + "/typing/resfinder/{sample}/ResFinder_results_tab.txt"
+    output:
+        OUT + "/typing/resfinder_data/{sample}.tsv"
+    params:
+        columns = "Resistance gene,Identity,Coverage,Accession no."
+    log:
+        OUT + "/log/typing/select_cols_resfinder/{sample}.log",
+    shell:
+        """
+python workflow/scripts/select_relevant_cols.py \
+    -i {input} \
+    -o {output} \
+    -c "{params.columns}" > {log} 2>&1
+        """
+
+rule select_cols_plasmidfinder:
+    input:
+        OUT + "/typing/plasmidfinder/{sample}/results_tab.tsv"
+    output:
+        OUT + "/typing/plasmidfinder_data/{sample}.tsv"
+    params:
+        columns = "Plasmid,Identity,Query / Template length,Accession number"
+    log:
+        OUT + "/log/typing/select_cols_plasmidfinder/{sample}.log",
+    shell:
+        """
+python workflow/scripts/select_relevant_cols.py \
+    -i {input} \
+    -o {output} \
+    -c "{params.columns}" > {log} 2>&1
+        """
+
+rule get_hashes_resfinder:
+    input:
+        resfinder = expand(OUT + "/typing/resfinder_data/{sample}.tsv", sample=AMR_SAMPLES),
+    output:
+        OUT + "/typing/resfinder_hashes.txt"
+    log:
+        OUT + "/log/get_hashes_resfinder.log",
+    shell:
+        """
+sha256sum {input} > {output} 2>{log}
+        """
+
+rule get_hashes_plasmidfinder:
+    input:
+        plasmidfinder = expand(OUT + "/typing/plasmidfinder_data/{sample}.tsv", sample=AMR_SAMPLES),
+    output:
+        OUT + "/typing/plasmidfinder_hashes.txt"
+    log:
+        OUT + "/log/get_hashes_plasmidfinder.log",
+    shell:
+        """
+sha256sum {input} > {output} 2>{log}
+        """
+
+rule typing_report:
+    input:
+        rf = OUT + "/typing/resfinder_hashes.txt",
+        pf = OUT + "/typing/plasmidfinder_hashes.txt",
+    output:
+        OUT + "/typing/typing_report.tsv"
+    message:
+        "Combining typing results"
+    log:
+        OUT + "/log/typing_report.log",
+    shell:
+        """
+python workflow/scripts/compare_typing.py \
+    --resfinder {input.rf} \
+    --plasmidfinder {input.pf} \
+    --output {output} > {log} 2>&1
+        """
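+
+# The resulting typing_report.tsv is expected to look like this (hash values
+# are illustrative; columns are tab-separated):
+#   sample   plasmidfinder_result   resfinder_result
+#   sampleX  3c0d1a...              9f2e4b...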
diff --git a/workflow/rules/AMR/verify_qc.smk b/workflow/rules/AMR/verify_qc.smk
new file mode 100644
index 0000000..394a742
--- /dev/null
+++ b/workflow/rules/AMR/verify_qc.smk
@@ -0,0 +1,37 @@
+rule verify_accuracy_AMR:
+    input:
+        verification = config["config_dir"] + "/verification_criteria_AMR.tsv",
+        quast = INPUT + "/qc_de_novo_assembly/quast/transposed_report.tsv",
+        bbtools = INPUT + "/qc_de_novo_assembly/bbtools_scaffolds/bbtools_summary_report.tsv",
+        qc_report = INPUT + "/Juno_assembly_QC_report/QC_report.xlsx",
+        typing_report = OUT + "/typing/typing_report.tsv",
+        config = config["config_dir"] + "/config_AMR.tsv",
+    output:
+        tsv = OUT + "/verification_subreports/verify_accuracy_AMR.tsv"
+    shell:
+        """
+python workflow/scripts/verify_accuracy.py \
+    --verification {input.verification} \
+    --qc-report {input.qc_report} \
+    --typing-report {input.typing_report} \
+    --config {input.config} \
+    --output {output.tsv}
+        """
+
+rule display_typing_discrepancies:
+    input:
+        typing = OUT + "/typing/typing_report.tsv",
+        ref_typing = config["reference_dir"],
+        tsv = OUT + "/verification_subreports/verify_accuracy_AMR.tsv",
+    output:
+        directory(OUT + "/discrepancies")
+    params:
+        typing_dir = OUT + "/typing"
+    shell:
+        """
+python workflow/scripts/display_typing_discrepancies.py \
+    --input {params.typing_dir} \
+    --ref-input {input.ref_typing} \
+    --verification {input.tsv} \
+    --output {output}
+        """
\ No newline at end of file
diff --git a/workflow/rules/BVI/generate_hashes.smk b/workflow/rules/BVI/generate_hashes.smk
new file mode 100644
index 0000000..c267193
--- /dev/null
+++ b/workflow/rules/BVI/generate_hashes.smk
@@ -0,0 +1,40 @@
+rule generate_assembly_hash:
+    input:
+        [SAMPLES[sample]['assembly'] for sample in BVI_SAMPLES]
+    output:
+        OUT + "/verification_subreports/BVI_assembly_hashes.txt"
+    shell:
+        """
+sha256sum {input} > {output}
+        """
+
+rule generate_clean_reads_hash:
+    input:
+        r1 = [SAMPLES[sample]['R1'] for sample in BVI_SAMPLES],
+        r2 = [SAMPLES[sample]['R2'] for sample in BVI_SAMPLES],
+    output:
+        OUT + "/verification_subreports/BVI_clean_reads_hashes.txt"
+    shell:
+        """
+sha256sum {input} > {output}
+        """
+
+rule generate_id_species_hash:
+    input:
+        expand(INPUT + "/identify_species/{sample}", sample=BVI_SAMPLES),
+    output:
+        OUT + "/verification_subreports/BVI_id_species_hashes.txt"
+    shell:
+        """
+sha256sum {input} > {output}
+        """
+
+rule generate_qc_assembly_hash:
+    input:
+        INPUT + "/qc_de_novo_assembly/quast/report.tsv"
+    output:
+        OUT + "/verification_subreports/BVI_qc_assembly_hashes.txt"
+    shell:
+        """
+sha256sum {input} > {output}
+        """
\ No newline at end of file
diff --git a/workflow/rules/BVI/verify_qc.smk b/workflow/rules/BVI/verify_qc.smk
new file mode 100644
index 0000000..3881a07
--- /dev/null
+++ b/workflow/rules/BVI/verify_qc.smk
@@ -0,0 +1,17 @@
+rule verify_accuracy_BVI:
+    input:
+        verification = config["config_dir"] + "/verification_criteria_BVI.tsv",
+        qc_report = INPUT + "/Juno_assembly_QC_report/QC_report.xlsx",
+        config = config["config_dir"] + "/config_BVI.tsv",
+    output:
+        tsv = OUT + "/verification_subreports/verify_accuracy_BVI.tsv"
+    log:
+        OUT + "/log/verify_qc_BVI.log",
+    shell:
+        """
+python workflow/scripts/verify_accuracy.py \
+    --verification {input.verification} \
+    --qc-report {input.qc_report} \
+    --config {input.config} \
+    --output {output.tsv} > {log} 2>&1
+        """
\ No newline at end of file
diff --git a/workflow/rules/rule.smk b/workflow/rules/rule.smk
deleted file mode 100644
index 1730f76..0000000
--- a/workflow/rules/rule.smk
+++ /dev/null
@@ -1,17 +0,0 @@
-rule template_rule:
-    input:
-        lambda wc: SAMPLES[wc.sample]["R1"],
-        lambda wc: SAMPLES[wc.sample]["R2"],
-    output:
-        OUT + "/{sample}_combined.fastq",
-    log:
-        OUT + "/log/{sample}_template_rule.log"
-    message:
-        "Merging {input}."
-    resources:
-        mem_gb=config["mem_gb"]["template_rule"],
-    params: script = "workflow/scripts/script.py"
-    threads: config["threads"]["template_rule"]
-    shell: """
-        python {params.script} {input} > {output}
-        """
diff --git a/workflow/scripts/batch_rename.py b/workflow/scripts/batch_rename.py
new file mode 100644
index 0000000..a5020d8
--- /dev/null
+++ b/workflow/scripts/batch_rename.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+import pathlib
+import shutil
+
+import argparse
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--input",
+                    help="Input assemblies",
+                    type=pathlib.Path,
+                    nargs="+",
+                    required=True)
+parser.add_argument("--output",
+                    help="Output directory with renamed assemblies",
+                    type=pathlib.Path,
+                    default="de_novo_assembly_filtered")
+parser.add_argument("--log-pipeline",
+                    help="log_pipeline.yaml from Juno-assembly",
+                    type=pathlib.Path,
+                    required=True)
+
+args = parser.parse_args()
+
+def get_version(log_pipeline_path : pathlib.Path) -> str:
+    """
+    Get the version from a Juno-assembly run.
+
+    Args:
+        log_pipeline_path: path to log_pipeline.yaml in the audit_trail dir.
+
+    Returns:
+        Juno-assembly version as str.
+    """
+    with open(log_pipeline_path) as log_pipeline:
+        for line in log_pipeline.readlines():
+            if line.startswith("pipeline_version:"):
+                return line.split(' ')[1].rstrip('\n')
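+
+# get_version expects a line like this in log_pipeline.yaml (version value is
+# illustrative):
+#   pipeline_version: v2.1.0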
+
+def produce_new_name(assembly_path : pathlib.Path, pipeline_version : str) -> str:
+    """
+    Add the Juno-assembly version to an assembly file name.
+
+    Args:
+        assembly_path: path to the assembly to rename.
+        pipeline_version: Juno-assembly version.
+
+    Returns:
+        Assembly file name with the Juno-assembly version added.
+
+    Raises:
+        AssertionError: if no underscore is found in the assembly name
+        (AMR group-specific convention indicating organism).
+    """
+    assembly_name = assembly_path.stem
+    before_first_underscore = assembly_name.split('_')[0]
+    after_first_underscore = "_".join(assembly_name.split('_')[1:])
+    assert len(after_first_underscore) > 0, f"Expected an underscore in {assembly_name}, but did not find any"
+    new_assembly_name = f"{before_first_underscore}-version{pipeline_version}_{after_first_underscore}.fasta"
+    return new_assembly_name
+
+def copy_assembly(input_path : pathlib.Path, new_name : str, output_directory : pathlib.Path) -> None:
+    """
+    Copy a renamed assembly to another dir.
+
+    Args:
+        input_path: path to the assembly to rename.
+        new_name: file name (incl. Juno-assembly version).
+        output_directory: dir where the renamed assembly is saved.
+
+    Returns:
+        None
+    """
+    output_path = output_directory.joinpath(new_name)
+    shutil.copy2(input_path, output_path)
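+
+# Example (illustrative file name): with pipeline_version "v2.1.0",
+# "1234_Ecoli.fasta" is renamed to "1234-versionv2.1.0_Ecoli.fasta".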
+ """ + df = pd.read_csv(input_path, delim_whitespace=True, + header=None, names=[col_name, "file"]) + df['sample'] = df['file'].apply(lambda x: pathlib.Path(x).stem) + df = df[['sample', col_name]] + return df + +df_pf = read_df(args.plasmidfinder, "plasmidfinder_result") +df_rf = read_df(args.resfinder, "resfinder_result") +df = df_pf.merge(df_rf, on="sample") +df.to_csv(args.output, sep='\t', index=False) diff --git a/workflow/scripts/display_typing_discrepancies.py b/workflow/scripts/display_typing_discrepancies.py new file mode 100644 index 0000000..aaf42aa --- /dev/null +++ b/workflow/scripts/display_typing_discrepancies.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import argparse +from pathlib import Path +import pandas as pd + +parser = argparse.ArgumentParser() + +parser.add_argument("--verification", + help="Verification result to find typing discrepancies from", + type=Path) +parser.add_argument("--input", + help="Reference plasmidfinder typing hashes", + type=Path) +parser.add_argument("--ref-input", + help="Reference plasmidfinder typing hashes", + type=Path) +parser.add_argument("--output", + help="Output directory", + type=Path) + +args = parser.parse_args() + + +def compare_typing(sample, comparison, ref_input, test_input, output): + ref_path = ref_input.joinpath(f"{comparison}_data", f"{sample}.tsv") + test_path = test_input.joinpath(f"{comparison}_data", f"{sample}.tsv") + df_ref = pd.read_csv(ref_path, sep='\t') + df_test = pd.read_csv(test_path, sep='\t') + print(f"df_ref shape: {df_ref.shape}. df_test shape: {df_test.shape}") + df_comp = df_ref.merge(df_test, indicator=True, how="outer").loc[lambda x: x['_merge']!='both'] + df_comp["Result"] = df_comp["_merge"].map({"left_only": "Reference", "right_only": "Test"}) + df_comp.drop("_merge", axis=1, inplace=True) + print(f"df_comp shape: {df_comp.shape}") + output_path = output.joinpath(f"{sample}_{comparison}.tsv") + df_comp.to_csv(output_path, sep='\t', index=False) + +def main(args): + df = pd.read_csv(args.verification, sep='\t', dtype={'plasmidfinder': str, 'resfinder': str}) + list_discrepant_samples_pf = list(df[df['plasmidfinder'] == "False"]['sample']) + list_discrepant_samples_rf = list(df[df['resfinder'] == "False"]['sample']) + + Path(args.output).mkdir(parents=True, exist_ok=True) + + print(f"list_discrepant_samples_pf is {list_discrepant_samples_pf}") + for sample in list_discrepant_samples_pf: + compare_typing(sample, "plasmidfinder", args.ref_input, args.input, args.output) + + print(f"list_discrepant_samples_rf is {list_discrepant_samples_rf}") + for sample in list_discrepant_samples_rf: + compare_typing(sample, "resfinder", args.ref_input, args.input, args.output) + +if __name__ == "__main__": + main(args) \ No newline at end of file diff --git a/workflow/scripts/script.py b/workflow/scripts/script.py deleted file mode 100644 index 81656b6..0000000 --- a/workflow/scripts/script.py +++ /dev/null @@ -1,6 +0,0 @@ -import subprocess -import sys - -subprocess.call( - ["cat"] + sys.argv[1:], -) diff --git a/workflow/scripts/select_relevant_cols.py b/workflow/scripts/select_relevant_cols.py new file mode 100644 index 0000000..0308962 --- /dev/null +++ b/workflow/scripts/select_relevant_cols.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import pandas as pd +import pathlib +import argparse + +parser = argparse.ArgumentParser() + +parser.add_argument("-i", "--input", + type=pathlib.Path, + help="Input file to select columns from", + metavar="FILE") +parser.add_argument("-o", "--output", + type=pathlib.Path, + 
help="Output table", + metavar="FILE") +parser.add_argument("-c", "--columns", + type=str, + metavar="STR") + +args = parser.parse_args() + +list_relevant_columns = args.columns.split(',') + +# Read df, subset relevant columns and sort by these columns +df = pd.read_csv(args.input, dtype='str', sep='\t') +df_selection = df[list_relevant_columns] +df_selection_sorted = df_selection.sort_values(list_relevant_columns) + +df_selection_sorted.to_csv(args.output, sep='\t', index=False) \ No newline at end of file diff --git a/workflow/scripts/verify_accuracy.py b/workflow/scripts/verify_accuracy.py new file mode 100644 index 0000000..f2b555d --- /dev/null +++ b/workflow/scripts/verify_accuracy.py @@ -0,0 +1,181 @@ +import pandas as pd +import openpyxl +import pathlib + +import argparse + +parser = argparse.ArgumentParser() + +parser.add_argument("-V", "--verification", + help=" Verification criteria per sample", + type=pathlib.Path, + required=True) +parser.add_argument("-qc", "--qc-report", + help=" Juno-assembly QC_report.xlsx", + type=pathlib.Path) +parser.add_argument("--quast", + help=" Juno-assembly quast transposed_report.tsv", + type=pathlib.Path) +parser.add_argument("--bbtools", + help=" Juno-assembly bbtools multireport", + type=pathlib.Path) +parser.add_argument("--typing-report", + help="Typing report to include", + type=pathlib.Path) +parser.add_argument("--config", + help="Table describing comparisons to make", + type=pathlib.Path, + required=True) +parser.add_argument("-o", "--output", + help="Output verification result", + type=pathlib.Path, + default="verification_result.tsv") + +args = parser.parse_args() + +def compare_criterium(result_col : str, operator : str, criterium_col : str, dataframe : pd.DataFrame) -> pd.Series: + """ + Compare columns from dataframe using specified operator. + + Args: + result_col: column describing result values. + operator: comparison to make between result_col and criterium_col. + criterium_col: column describing threshold or reference value. + dataframe: dataframe to use for comparisons. + + Returns: + pd.Series of comparisons, expressed as boolean. + + Raises: + ValueError: if operator is not one of ">", ">=", "<", "<=", "==". + """ + if operator == ">": + return dataframe[result_col] > dataframe[criterium_col] + if operator == ">=": + return dataframe[result_col] >= dataframe[criterium_col] + if operator == "<": + return dataframe[result_col] < dataframe[criterium_col] + if operator == "<=": + return dataframe[result_col] <= dataframe[criterium_col] + elif operator == "==": + return dataframe[result_col] == dataframe[criterium_col] + else: + raise ValueError("Did not recognise operator") + +def read_verification_df(path_to_verification : pathlib.Path) -> pd.DataFrame: + """ + Read in verification table. + + Args: + path_to_verification: path to verification table. + + Returns: + Dataframe with verification criteria. + """ + df_verification = pd.read_csv(path_to_verification, sep='\t') + df_verification['sample'] = df_verification['sample'].astype(str) + return df_verification + +def use_qc_report(path_qc_report : pathlib.Path, df_verification : pd.DataFrame) -> pd.DataFrame: + """ + Read QC report and merge with verification table. + + Args: + path_qc_report: path to QC_report.xlsx from Juno-assembly (v2.1.0 and later). + df_verification: dataframe with verification criteria. + + Returns: + dataframe with QC report and verification data. 
+ """ + dict_qc_report = pd.read_excel(path_qc_report, None) + df_results = pd.concat(dict_qc_report.values()) + df_results['sample'] = df_results['sample'].astype(str) + df_merged = df_verification.merge(df_results, on="sample") + return df_merged + +def use_quast_and_bbtools(path_quast : pathlib.Path, path_bbtools : pathlib.Path, df_verification : pd.DataFrame) -> pd.DataFrame: + """ + Read Quast and bbtools summaries and merge with verification table. + + Args: + path_quast: path to Quast transposed_report.tsv from Juno-assembly. + path_bbtools: path to bbtools summary_report.tsv from Juno-assembly. + df_verification: dataframe with verification criteria. + + Returns: + dataframe with Quast, bbtools and verification data. + """ + df_quast = pd.read_csv(path_quast, sep='\t') + df_bbtools = pd.read_csv(path_bbtools, sep='\t') + df_verification['sample_truncated'] = df_verification['sample'].apply(lambda x: str(x).split('_')[0]) + df_out = df_verification.merge(df_quast[['Assembly', '# contigs']], left_on="sample", right_on="Assembly") + df_out = df_out.merge(df_bbtools[['Sample', 'Average coverage']], left_on="sample_truncated", right_on="Sample") + return df_out + +def add_typing_report(df_results : pd.DataFrame, path_to_typing : pathlib.Path) -> pd.DataFrame: + """ + Read typing results and merge with verification data. + + Args: + df_results: dataframe with verification data (can include other data). + path_to_typing: path to typing report (e.g. result hashes). + + Returns: + dataframe with typing data added. + """ + df_typing = pd.read_csv(path_to_typing, sep='\t') + df_results = df_results.merge(df_typing, on="sample") + return df_results + +def compare_to_criteria(df_results : pd.DataFrame, path_to_config : pathlib.Path) -> pd.DataFrame: + """ + Compare results with verification thresholds/reference values. + + Args: + df_results: dataframe with all results and verification data needed for comparisons. + path_to_config: path to config table, outlining which comparisons to make. + + Returns: + dataframe with only the comparisons specified in path_to_config. + """ + df_config = pd.read_csv(path_to_config, sep="\t") + + df_output = df_results[['sample', 'organism']].copy() + + # Compare columns in df_results by looping over df_config lines + for index, row in df_config.iterrows(): + comp_name = row['comparison_name'] + operator_name = '_'.join(["operator", str(row['result_col'])]) + df_output[comp_name] = compare_criterium(row['result_col'], + row['operator'], + row['criterium_col'], + df_results) + df_output[row['result_col']] = df_results[row['result_col']] + df_output[operator_name] = row['operator'] + df_output[row['criterium_col']] = df_results[row['criterium_col']] + + return df_output + +def main(args): + df_verification = read_verification_df(args.verification) + + # Currently, need either QC_report.xlsx or otherwise Quast AND bbtools + if args.qc_report is not None: + df_merged = use_qc_report(args.qc_report, df_verification) + elif (args.quast is not None) and (args.bbtools is not None): + df_merged = use_quast_and_bbtools(args.quast, args.bbtools, df_verification) + else: + raise ValueError("QC report, quast multireport and bbtools multireport were not provided. 
+
+def main(args):
+    df_verification = read_verification_df(args.verification)
+
+    # Currently, either QC_report.xlsx is needed, or otherwise Quast AND bbtools
+    if args.qc_report is not None:
+        df_merged = use_qc_report(args.qc_report, df_verification)
+    elif (args.quast is not None) and (args.bbtools is not None):
+        df_merged = use_quast_and_bbtools(args.quast, args.bbtools, df_verification)
+    else:
+        raise ValueError("Provide either the QC report, or both the quast and bbtools reports")
+
+    # Add typing if available
+    if args.typing_report is not None:
+        df_merged = add_typing_report(df_merged, args.typing_report)
+
+    df_output = compare_to_criteria(df_merged, args.config)
+    df_output.to_csv(args.output, sep='\t', index=False)
+
+if __name__ == "__main__":
+    main(args)