From 610ae8aceef75ce63e9de0e64be05d2a827c604a Mon Sep 17 00:00:00 2001 From: Ulthran Date: Tue, 12 Mar 2024 15:47:52 -0600 Subject: [PATCH 01/22] Copy instead of symlink for sample intake --- workflow/scripts/sample_intake.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/sample_intake.py b/workflow/scripts/sample_intake.py index 6fbf088c..4e7d8321 100755 --- a/workflow/scripts/sample_intake.py +++ b/workflow/scripts/sample_intake.py @@ -1,6 +1,8 @@ +import shutil from pathlib import Path with open(snakemake.log[0], "w") as log: assert snakemake.input[0].endswith(".fastq.gz") log.write("Creating symlink\n") - Path(snakemake.output[0]).symlink_to(snakemake.input[0]) + # Path(snakemake.output[0]).symlink_to(snakemake.input[0]) + shutil.copy(snakemake.input[0], snakemake.output[0]) From 9bbd4130adf27a6e1bd9df6fde93d2492113cc51 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Tue, 12 Mar 2024 15:48:16 -0600 Subject: [PATCH 02/22] Don't use paths in cutadapt params --- workflow/Snakefile | 4 ++-- workflow/rules/qc.smk | 10 ++++------ workflow/scripts/adapter_removal_paired.py | 10 ++++------ workflow/scripts/adapter_removal_unpaired.py | 9 ++++----- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index e78c6674..4eecf5f2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -110,13 +110,13 @@ sys.stderr.write("done.\n") QC_FP = output_subdir(Cfg, "qc") BENCHMARK_FP = output_subdir(Cfg, "benchmarks") LOG_FP = output_subdir(Cfg, "logs") -# ---- DEPRECATED +# ---- BEGIN DEPRECATED # These paths will be moved to their respective extensions in a future version ASSEMBLY_FP = output_subdir(Cfg, "assembly") ANNOTATION_FP = output_subdir(Cfg, "annotation") CLASSIFY_FP = output_subdir(Cfg, "classify") MAPPING_FP = output_subdir(Cfg, "mapping") -# ---- DEPRECATED +# ---- END DEPRECATED # ---- Targets rules diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 
3944991f..21703c38 100755 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -36,13 +36,12 @@ rule adapter_removal_unpaired: input: QC_FP / "00_samples" / "{sample}_1.fastq.gz", output: - QC_FP / "01_cutadapt" / "{sample}_1.fastq.gz", + r=QC_FP / "01_cutadapt" / "{sample}_1.fastq.gz", + ngz=temp(QC_FP / "01_cutadapt" / "{sample}_1.fastq"), log: LOG_FP / "adapter_removal_unpaired_{sample}.log", benchmark: BENCHMARK_FP / "adapter_removal_unpaired_{sample}.tsv" - params: - str(QC_FP / "01_cutadapt" / "{sample}_1.fastq"), resources: runtime=lambda wc, input: max(MIN_RUNTIME, input.size_mb / 5), threads: 4 @@ -61,13 +60,12 @@ rule adapter_removal_paired: output: r1=QC_FP / "01_cutadapt" / "{sample}_1.fastq.gz", r2=QC_FP / "01_cutadapt" / "{sample}_2.fastq.gz", + ngz1=temp(QC_FP / "01_cutadapt" / "{sample}_1.fastq"), + ngz2=temp(QC_FP / "01_cutadapt" / "{sample}_2.fastq"), log: LOG_FP / "adapter_removal_paired_{sample}.log", benchmark: BENCHMARK_FP / "adapter_removal_paired_{sample}.tsv" - params: - r1=str(QC_FP / "01_cutadapt" / "{sample}_1.fastq"), - r2=str(QC_FP / "01_cutadapt" / "{sample}_2.fastq"), resources: runtime=lambda wc, input: max(MIN_RUNTIME, input.size_mb / 10), threads: 4 diff --git a/workflow/scripts/adapter_removal_paired.py b/workflow/scripts/adapter_removal_paired.py index 0631ab65..92105f17 100755 --- a/workflow/scripts/adapter_removal_paired.py +++ b/workflow/scripts/adapter_removal_paired.py @@ -38,9 +38,9 @@ args += rev_adapter_str.split(" ") args += [ "-o", - f"{snakemake.params.r1}", + f"{snakemake.output.ngz1}", "-p", - f"{snakemake.params.r2}", + f"{snakemake.output.ngz2}", f"{snakemake.input.r1}", f"{snakemake.input.r2}", ] @@ -53,16 +53,14 @@ sys.exit(e.returncode) log.write(cutadapt_output.decode()) - with open(snakemake.params.r1, "rb") as f_in, gzip.open( + with open(snakemake.output.ngz1, "rb") as f_in, gzip.open( snakemake.output.r1, "wb" ) as f_out: shutil.copyfileobj(f_in, f_out) - with open(snakemake.params.r2, "rb") as f_in, 
gzip.open( + with open(snakemake.output.ngz2, "rb") as f_in, gzip.open( snakemake.output.r2, "wb" ) as f_out: shutil.copyfileobj(f_in, f_out) - os.remove(snakemake.params.r1) - os.remove(snakemake.params.r2) else: log.write("Adapters not found, skipping adapter removal...") os.symlink(snakemake.input.r1, snakemake.output.r1) diff --git a/workflow/scripts/adapter_removal_unpaired.py b/workflow/scripts/adapter_removal_unpaired.py index 2906332c..e3e9f47c 100755 --- a/workflow/scripts/adapter_removal_unpaired.py +++ b/workflow/scripts/adapter_removal_unpaired.py @@ -36,7 +36,7 @@ args += rev_adapter_str.split(" ") args += [ "-o", - snakemake.params[0], + snakemake.output.ngz, snakemake.input[0], ] cutadapt_output = sp.check_output( @@ -48,11 +48,10 @@ sys.exit(e.returncode) log.write(cutadapt_output.decode()) - with open(snakemake.params[0]) as f_in, gzip.open( - snakemake.output[0], "wt" + with open(snakemake.output.ngz) as f_in, gzip.open( + snakemake.output.r, "wt" ) as f_out: f_out.writelines(f_in.readlines()) - os.remove(snakemake.params[0]) else: log.write("Adapters not found, skipping adapter removal...") - os.symlink(snakemake.input[0], snakemake.output[0]) + os.symlink(snakemake.input[0], snakemake.output.r) From 2a5fad42455c5b091ee3626b63e825cbd4092a10 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Tue, 12 Mar 2024 16:06:03 -0600 Subject: [PATCH 03/22] Don't use paths in params --- workflow/rules/decontaminate.smk | 13 +++++-------- workflow/rules/qc.smk | 22 ++++++++++++---------- workflow/scripts/sample_intake.py | 3 --- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/workflow/rules/decontaminate.smk b/workflow/rules/decontaminate.smk index 554ead49..5cd4d229 100644 --- a/workflow/rules/decontaminate.smk +++ b/workflow/rules/decontaminate.smk @@ -158,16 +158,13 @@ rule clean_decontam: rp=Pairs, ), QC_FP / ".qc_cleaned", - params: - cutadapt_fp=QC_FP / "01_cutadapt", - trimmomatic_fp=QC_FP / "02_trimmomatic", - komplexity_fp=QC_FP / 
"03_komplexity", - clean_qc_fp=QC_FP / "cleaned", - intermediates_fp=QC_FP / "decontam" / "intermediates", output: touch(QC_FP / ".decontam_cleaned"), shell: """ - rm -r {params.clean_qc} || true - rm -r {params.intermediates} || true + cleaned_dir=$(dirname {input[0]}) + qc_dir=$(dirname $cleaned_dir) + + rm -r $qc_dir/cleaned || true + rm -r $qc_dir/decontam/intermediates || true """ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 21703c38..6e28c64f 100755 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -167,8 +167,6 @@ rule fastqc: LOG_FP / "fastqc_{sample}.log", benchmark: BENCHMARK_FP / "fastqc_{sample}.tsv" - params: - outdir=QC_FP / "reports", resources: runtime=lambda wc: max(MIN_RUNTIME, 120), conda: @@ -176,7 +174,12 @@ rule fastqc: container: get_docker_str("qc") shell: - "fastqc -o {params.outdir} {input.reads} -extract 2>&1 | tee {log}" + """ + sample_dir=$(dirname {output[0]}) + outdir=$(dirname $sample_dir) + + fastqc -o $outdir {input.reads} -extract 2>&1 | tee {log} + """ rule fastqc_report: @@ -260,15 +263,14 @@ rule clean_qc: sample=Samples.keys(), rp=Pairs, ), - params: - cutadapt_fp=QC_FP / "01_cutadapt", - trimmomatic_fp=QC_FP / "02_trimmomatic", - komplexity_fp=QC_FP / "03_komplexity", output: touch(QC_FP / ".qc_cleaned"), shell: """ - rm -r {params.cutadapt_fp} || true - rm -r {params.trimmomatic_fp} || true - rm -r {params.komplexity_fp} || true + cleaned_dir=$(dirname {input[0]}) + qc_dir=$(dirname $cleaned_dir) + + rm -r $qc_dir/01_cutadapt || true + rm -r $qc_dir/02_trimmomatic || true + rm -r $qc_dir/03_komplexity || true """ diff --git a/workflow/scripts/sample_intake.py b/workflow/scripts/sample_intake.py index 4e7d8321..d8755afe 100755 --- a/workflow/scripts/sample_intake.py +++ b/workflow/scripts/sample_intake.py @@ -1,8 +1,5 @@ import shutil -from pathlib import Path with open(snakemake.log[0], "w") as log: assert snakemake.input[0].endswith(".fastq.gz") - log.write("Creating symlink\n") - # 
Path(snakemake.output[0]).symlink_to(snakemake.input[0]) shutil.copy(snakemake.input[0], snakemake.output[0]) From 3bd3f1dc184f309b8ae49c7ab309ccc1c794b8ed Mon Sep 17 00:00:00 2001 From: Ulthran Date: Thu, 14 Mar 2024 09:17:39 -0600 Subject: [PATCH 04/22] Add FAQs page --- docs/faqs.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 docs/faqs.rst diff --git a/docs/faqs.rst b/docs/faqs.rst new file mode 100644 index 00000000..e3881f07 --- /dev/null +++ b/docs/faqs.rst @@ -0,0 +1,19 @@ +.. _faqs: + +==== +FAQs +==== + +A collection of common questions, issues, or points of confusion. + +**I'm getting ``snakemake: error: argument --executor/-e: invalid choice: '_____' (choose from 'local', 'dryrun', 'touch')``. Why can't I use the ``--executor`` option?** + +You're using the executor option properly, it's just that you haven't installed the executor plugin. Use ``pip`` to install it and you should be good to go. + +**I'm trying to use singularity but it keeps failing and complaining about running out of space. I know I have plenty of open disk space. Why is it running out?** + +This is a known issue with singularity. It's not actually running out of space, it's just that the default location for the temporary directory is on a partition that is too small. You can change the location of the temporary directory by setting the ``SINGULARITY_TMPDIR`` and ``TMPDIR`` environment variables to a location with more space. + +**A rule keeps failing with an error like "perl: error while loading shared libraries: libcrypt.so.1: cannot open shared object file: No such file or directory". What's going on?** + +This is unfortunately a common issue with conda where shared libraries are either not installed or not properly loaded for packages that depend on them. There can be many causes and many fixes. You can start by searching the exact error message and seeing if there are any suggestions for how to solve it. 
Often it will involve installing the missing library with conda or installing the missing library with the system package manager. For example, the solution to the example error for me running sunbeam on a standard Amazon machine image (AMI) was to install the library using ``sudo yum install libxcrypt-compat``. \ No newline at end of file From 45678333db5953b999297140f3fc9318d1c4451b Mon Sep 17 00:00:00 2001 From: Ulthran Date: Tue, 2 Apr 2024 15:44:57 -0400 Subject: [PATCH 05/22] Allow non-local paths through _verify_path --- src/sunbeamlib/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sunbeamlib/__init__.py b/src/sunbeamlib/__init__.py index b1dd6c71..a693e287 100755 --- a/src/sunbeamlib/__init__.py +++ b/src/sunbeamlib/__init__.py @@ -131,7 +131,8 @@ def _verify_path(fp: str) -> str: raise ValueError("Missing filename") path = Path(fp) if not path.is_file(): - raise ValueError("File not found") + sys.stderr.write(f"WARNING: File {str(path)} does not exist locally\n") + return str(path) return str(path.resolve()) From 3b6c064b0083db1d822373b5e37d6eaee211faf2 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Tue, 2 Apr 2024 16:19:07 -0400 Subject: [PATCH 06/22] Allow empty output_fp --- src/sunbeamlib/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sunbeamlib/config.py b/src/sunbeamlib/config.py index 7ac27565..f563184a 100644 --- a/src/sunbeamlib/config.py +++ b/src/sunbeamlib/config.py @@ -35,7 +35,9 @@ def validate_paths(cfg: Dict[str, str], root: Path) -> Dict[str, Union[str, Path """ new_cfg = dict() for k, v in cfg.items(): - if k.endswith("_fp"): + if k == "output_fp" and not v: + v = "" + elif k.endswith("_fp"): try: v = makepath(v) except TypeError as e: From 7b74b05d19a893e64650d97362bb1c3f08e52287 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 3 Apr 2024 08:09:15 -0400 Subject: [PATCH 07/22] Ignore host_fp too, TEMP FIX --- src/sunbeamlib/config.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/src/sunbeamlib/config.py b/src/sunbeamlib/config.py index f563184a..3f4e7bd3 100644 --- a/src/sunbeamlib/config.py +++ b/src/sunbeamlib/config.py @@ -35,8 +35,8 @@ def validate_paths(cfg: Dict[str, str], root: Path) -> Dict[str, Union[str, Path """ new_cfg = dict() for k, v in cfg.items(): - if k == "output_fp" and not v: - v = "" + if k == "output_fp" or k == "host_fp": + v = Path(v) elif k.endswith("_fp"): try: v = makepath(v) From 4e8c1774374341a60162b8dce80e39e00de15db6 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 3 Apr 2024 09:10:59 -0400 Subject: [PATCH 08/22] Reconfigure HostGenomes to allow for remotes --- src/sunbeamlib/default_config.yml | 1 + workflow/Snakefile | 17 ++++++++++++++--- workflow/rules/decontaminate.smk | 7 ++----- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/sunbeamlib/default_config.yml b/src/sunbeamlib/default_config.yml index 4ea22e16..b9c9c275 100644 --- a/src/sunbeamlib/default_config.yml +++ b/src/sunbeamlib/default_config.yml @@ -43,6 +43,7 @@ qc: pct_id: 0.5 frac: 0.6 host_fp: "" + host_list: [] # Taxonomic classifications classify: diff --git a/workflow/Snakefile b/workflow/Snakefile index 4eecf5f2..23106429 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -88,6 +88,7 @@ if Cfg["qc"]["host_fp"] == Cfg["all"]["root"]: HostGenomeFiles = [] else: HostGenomeFiles = [f for f in Cfg["qc"]["host_fp"].glob("*.fasta")] + print(HostGenomeFiles) if not HostGenomeFiles: sys.stderr.write( "\n\nWARNING: No files detected in host genomes folder ({}). 
" @@ -96,9 +97,19 @@ else: Cfg["qc"]["host_fp"] ) ) -HostGenomes = { - Path(g.name).stem: read_seq_ids(Cfg["qc"]["host_fp"] / g) for g in HostGenomeFiles -} + +# Once this change has been implemented for a while we can remove the try/except +# and just use an if/else, using the try/except for now to avoid migration pains +# with old sunbeam configs being copied over +try: + if Cfg["qc"]["host_list"]: + HostGenomes = Cfg["qc"]["host_list"] + else: + raise KeyError +except KeyError: + HostGenomes = [Path(g.name).stem for g in HostGenomeFiles] + print(HostGenomes) + sys.stderr.write("done.\n") diff --git a/workflow/rules/decontaminate.smk b/workflow/rules/decontaminate.smk index 5cd4d229..02c02971 100644 --- a/workflow/rules/decontaminate.smk +++ b/workflow/rules/decontaminate.smk @@ -24,9 +24,6 @@ rule build_host_index: LOG_FP / "build_host_index_{host}.log", benchmark: BENCHMARK_FP / "build_host_index_{host}.tsv" - params: - host="{host}", - index_fp=Cfg["qc"]["host_fp"], conda: "../envs/qc.yml" container: @@ -84,7 +81,7 @@ rule aggregate_reads: input: expand( QC_FP / "decontam" / "intermediates" / "{host}" / "{{sample}}.ids", - host=HostGenomes.keys(), + host=HostGenomes, ), output: temp(QC_FP / "decontam" / "intermediates" / "{sample}_hostreads.ids"), @@ -102,7 +99,7 @@ rule filter_reads: reads=QC_FP / "cleaned" / "{sample}_{rp}.fastq.gz", hostids=expand( QC_FP / "decontam" / "intermediates" / "{host}" / "{{sample}}.ids", - host=HostGenomes.keys(), + host=HostGenomes, ), output: reads=QC_FP / "decontam" / "{sample}_{rp}.fastq.gz", From fce0b34e67b3f5de7791b10b6a0ab10a08e9bdfe Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 3 Apr 2024 09:42:23 -0400 Subject: [PATCH 09/22] Change dir in reference to input/output rather than Cfg --- workflow/rules/decontaminate.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/decontaminate.smk b/workflow/rules/decontaminate.smk index 02c02971..2638acf9 100644 --- 
a/workflow/rules/decontaminate.smk +++ b/workflow/rules/decontaminate.smk @@ -29,7 +29,7 @@ rule build_host_index: container: get_docker_str("qc") shell: - "cd {Cfg[qc][host_fp]} && bwa index {input} 2>&1 | tee {log}" + "cd $(dirname {input}) && bwa index {input} 2>&1 | tee {log}" rule align_to_host: From f7d01ee5901b6614a2a9e26fff3babbe4e539016 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 3 Apr 2024 10:00:52 -0400 Subject: [PATCH 10/22] Don't need to direct bwa index output? --- workflow/rules/decontaminate.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/rules/decontaminate.smk b/workflow/rules/decontaminate.smk index 2638acf9..b8942b90 100644 --- a/workflow/rules/decontaminate.smk +++ b/workflow/rules/decontaminate.smk @@ -29,7 +29,9 @@ rule build_host_index: container: get_docker_str("qc") shell: - "cd $(dirname {input}) && bwa index {input} 2>&1 | tee {log}" + """ + bwa index {input} 2>&1 | tee {log} + """ rule align_to_host: From 7ca8cd68ab501b98095485cd239a0865b8d56bdc Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 3 Apr 2024 10:19:46 -0400 Subject: [PATCH 11/22] Fix tests for _verify_path --- tests/unit/sunbeamlib/test__init__.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/unit/sunbeamlib/test__init__.py b/tests/unit/sunbeamlib/test__init__.py index 2af759ee..2859d363 100755 --- a/tests/unit/sunbeamlib/test__init__.py +++ b/tests/unit/sunbeamlib/test__init__.py @@ -37,7 +37,7 @@ def test_version(): assert __version__ != "0.0.0" -def test_load_sample_list(init): +def test_load_sample_list(init, capsys): output_dir = init samples_fp = output_dir / "samples" samples_fp.mkdir() @@ -50,7 +50,10 @@ def test_load_sample_list(init): try: load_sample_list(sample_list_fp) - assert False + assert ( + capsys.readouterr().err + == f"WARNING: File {sample1.resolve()} does not exist locally\nWARNING: File {sample2.resolve()} does not exist locally\n" + ) except ValueError as e: 
pass @@ -59,7 +62,10 @@ def test_load_sample_list(init): try: load_sample_list(sample_list_fp) - assert False + assert ( + capsys.readouterr().err + == f"WARNING: File {sample2.resolve()} does not exist locally\n" + ) except ValueError as e: pass @@ -96,7 +102,7 @@ def test_guess_format_string_single_end(): assert ret == "{sample}.fastq.gz" -def test_verify_path(init): +def test_verify_path(init, capsys): output_dir = init try: @@ -105,11 +111,11 @@ def test_verify_path(init): except ValueError as e: pass - try: - _verify_path("thisdoesnotexist") - assert False - except ValueError as e: - pass + _verify_path("thisdoesnotexist") + assert ( + capsys.readouterr().err + == "WARNING: File thisdoesnotexist does not exist locally\n" + ) with open(output_dir / "test", "w") as f: f.write(" ") From ed5020e345d0fa81eaa0c124883e783366536a5b Mon Sep 17 00:00:00 2001 From: Ulthran Date: Thu, 4 Apr 2024 11:44:21 -0400 Subject: [PATCH 12/22] Add --ignore_local_fs arg --- src/sunbeamlib/script_run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/sunbeamlib/script_run.py b/src/sunbeamlib/script_run.py index 036fdbf5..44da6237 100644 --- a/src/sunbeamlib/script_run.py +++ b/src/sunbeamlib/script_run.py @@ -33,7 +33,7 @@ def main(argv=sys.argv): "-m", "--mamba", action="store_true", - help="Use mamba instead of conda to manage environments", + help="Use mamba instead of conda to create environments", ) parser.add_argument( "--target_list", @@ -58,6 +58,11 @@ def main(argv=sys.argv): default=__version__, help="The tag to use when pulling docker images for the core pipeline environments, defaults to sunbeam's current version ($SUNBEAM_VER), a good alternative is 'latest' for the latest stable release", ) + parser.add_argument( + "--ignore_local_fs", + action="store_true", + help="Ignore local filesystem performing checks for input files", + ) # The remaining args (after --) are passed to Snakemake args, remaining = parser.parse_known_args(argv) From 
84ec42c8709981261e16ebf457473a1056336a0a Mon Sep 17 00:00:00 2001 From: Ulthran Date: Mon, 10 Jun 2024 15:10:34 -0400 Subject: [PATCH 13/22] Add dev doc --- docs/dev.rst | 46 ++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + docs/structure.rst | 2 +- 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 docs/dev.rst diff --git a/docs/dev.rst b/docs/dev.rst new file mode 100644 index 00000000..fee6b385 --- /dev/null +++ b/docs/dev.rst @@ -0,0 +1,46 @@ +.. _dev: + +==== +Dev +==== + +Getting involved with developing Sunbeam can be a little daunting at first. This doc will try to break down the constituent parts from a developer's perspective. For starters, check out the structure_ doc to get a sense of how the code is organized. + +sunbeamlib +========== + +The core of Sunbeam's configuration, setup, and execution is in the ``sunbeamlib`` module. This module is located in ``src/sunbeamlib/`` with the root ``pyproject.toml`` configuring it. It has a number of different scripts, each located in its own file prefixed by ``script_``, and also has utility functions and classes for both the scripts and some portions of the pipeline that are particularly common. + +workflow +======== + +The core of the work done by Sunbeam is handled by the Snakemake workflow, located in ``workflow/``. Once a project is set up properly by sunbeamlib, the workflow can be run with all the reproducibility benefits of Snakemake. The core of the workflow is defined in ``workflow/Snakefile``. Reference the Snakemake docs for help understanding Snakemake things better; they're very good. From this core Snakefile, we import more ``.smk`` files from ``workflow/rules/`` and ``extensions/sbx_*/``. + +Important Variables +------------------- + +Variables defined in the main Snakefile can be accessed throughout the workflow. 
Some important variables include: + +- ``Samples``: Dict[str, Dict[str, str]] - A dictionary where keys are sample names and values are dictionaries of read pairs mapping to file paths (``Samples[sample] = {"1": r1, "2": r2}``). +- ``Pairs``: List[str] - Either ``["1", "2"]`` or ``["1"]`` depending on if the project is paired end or not. +- ``Cfg``: Dict[str, Dict[str, str]] - The YAML config converted into dictionary form. +- ``MIN_MEM_MB``: int - A minimum value of the number of megabytes of memory to request for each job. This will only apply for jobs that rely on Sunbeam to guess their memory requirements. +- ``MIN_RUNTIME``: int - A minimum value of the number of minutes to request for each job. This will only apply for jobs that rely on Sunbeam to guess their runtime requirements. +- ``HostGenomes``: List[str] - A list of host genomes that are used for decontaminating reads. +- ``HostGenomeFiles``: List[str] - A list of files with host genomes that are used for decontaminating reads (not to be confused with ``sbx_mapping``'s ``GenomeFiles`` variable, which it uses to track reference genome files). +- ``QC_FP``: Path - The Path to the project's quality control output directory. +- ``ASSEMBLY_FP``: Path - The Path to the project's assembly output directory. +- ``CLASSIFY_FP``: Path - The Path to the project's classification output directory. +- ``MAPPING_FP``: Path - The Path to the project's mapping output directory. +- ``BENCHMARK_FP``: Path - The Path to the project's benchmarking output directory. +- ``LOG_FP``: Path - The Path to the project's log output directory. + +tests +===== + +All tests are located in the ``tests/`` directory. The tests are run with pytest, and the tests are organized into subdirectories based on the module they are testing. + +.github +======= + +The ``.github/`` directory contains the configuration for GitHub Actions, which are used to run the tests on every push to the repository and manage releases. 
The configuration is in ``.github/workflows/``. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 81987589..81550684 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -37,5 +37,6 @@ EL Clarke, LJ Taylor, C Zhao *et al.* Sunbeam: an extensible pipeline for analyz extensions.rst examples.rst install.rst + dev.rst citation.rst diff --git a/docs/structure.rst b/docs/structure.rst index e3ff5b9d..2cc31aaf 100755 --- a/docs/structure.rst +++ b/docs/structure.rst @@ -2,8 +2,8 @@ ================== Software Structure - ================== + Overview ======== Sunbeam is a snakemake pipeline with a python library acting as a wrapper (``sunbeamlib``). Calling ``sunbeam run [args] [options]`` is a call to this wrapper library which then invokes the necessary snakemake commands. The main Snakefile can be found in the ``workflow/`` directory and it makes use of rules from ``workflow/rules/`` and ``extensions/``, scripts from ``workflow/scripts/``, and environments from ``workflow/envs/``. Tests are run with pytest and live in the ``tests/`` directory. Documentation lives in ``docs/`` and is served by ReadTheDocs. 
From 735594f409e9578a56db1be13d85dfa83aa74233 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Mon, 10 Jun 2024 15:11:08 -0400 Subject: [PATCH 14/22] Don't need new arg --- src/sunbeamlib/script_run.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/sunbeamlib/script_run.py b/src/sunbeamlib/script_run.py index 44da6237..961b9401 100644 --- a/src/sunbeamlib/script_run.py +++ b/src/sunbeamlib/script_run.py @@ -58,11 +58,6 @@ def main(argv=sys.argv): default=__version__, help="The tag to use when pulling docker images for the core pipeline environments, defaults to sunbeam's current version ($SUNBEAM_VER), a good alternative is 'latest' for the latest stable release", ) - parser.add_argument( - "--ignore_local_fs", - action="store_true", - help="Ignore local filesystem performing checks for input files", - ) # The remaining args (after --) are passed to Snakemake args, remaining = parser.parse_known_args(argv) @@ -85,7 +80,7 @@ def main(argv=sys.argv): ) if args.include and args.exclude: - sys.stderr.write("Error: cannot pass both --include and --exclude\n") + sys.stderr.write("Error: cannot use both --include and --exclude\n") sys.exit(1) os.environ["SUNBEAM_EXTS_INCLUDE"] = "" From 8502a18da4c9fb990a9389b011045e172f2628f6 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Mon, 10 Jun 2024 17:11:03 -0400 Subject: [PATCH 15/22] Add skip qc and skip decontam options for sunbeam run --- src/sunbeamlib/config.py | 6 ++---- src/sunbeamlib/script_run.py | 11 ++++++++++ workflow/Snakefile | 42 ++++++++++++++++++++++++++++++++---- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/src/sunbeamlib/config.py b/src/sunbeamlib/config.py index 3f4e7bd3..367065b3 100644 --- a/src/sunbeamlib/config.py +++ b/src/sunbeamlib/config.py @@ -35,13 +35,11 @@ def validate_paths(cfg: Dict[str, str], root: Path) -> Dict[str, Union[str, Path """ new_cfg = dict() for k, v in cfg.items(): - if k == "output_fp" or k == "host_fp": - v = Path(v) - elif 
k.endswith("_fp"): + if k.endswith("_fp"): try: v = makepath(v) except TypeError as e: - raise TypeError(f"Missing value for key: {k}") + sys.stderr.write(f"Warning: Missing value for key: {k}") if not v.is_absolute(): v = root / v if k != "output_fp": diff --git a/src/sunbeamlib/script_run.py b/src/sunbeamlib/script_run.py index 961b9401..3630fe53 100644 --- a/src/sunbeamlib/script_run.py +++ b/src/sunbeamlib/script_run.py @@ -53,6 +53,11 @@ def main(argv=sys.argv): default=[], help="List of extensions to exclude from run, use 'all' to exclude all extensions", ) + parser.add_argument( + "--skip", + default="", + help="Workflow to skip. Either 'qc' to skip the quality control steps or 'decontam' to skip everything in sunbeam core (QC and decontamination).", + ) parser.add_argument( "--docker_tag", default=__version__, @@ -90,6 +95,12 @@ def main(argv=sys.argv): if args.exclude: os.environ["SUNBEAM_EXTS_EXCLUDE"] = ", ".join(args.exclude) + if args.skip not in ["", "qc", "decontam"]: + sys.stderr.write("Error: --skip must be either 'qc' or 'decontam'\n") + sys.exit(1) + + os.environ["SUNBEAM_SKIP"] = args.skip + os.environ["SUNBEAM_DOCKER_TAG"] = args.docker_tag snakemake_args = ( diff --git a/workflow/Snakefile b/workflow/Snakefile index 23106429..2ba5bfd5 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -130,11 +130,45 @@ MAPPING_FP = output_subdir(Cfg, "mapping") # ---- END DEPRECATED -# ---- Targets rules +# ---- Import rules include: "rules/targets.smk" -# ---- Quality control rules -include: "rules/qc.smk" -include: "rules/decontaminate.smk" + + +# Skip QC and/or decontam +if os.environ.get("SUNBEAM_SKIP", "").lower() == "decontam": + + rule skip_decontam: + input: + lambda wildcards: Samples[wildcards.sample][wildcards.rp], + output: + QC_FP / "decontam" / "{sample}_{rp}.fastq.gz", + log: + LOG_FP / "skip_decontam_{sample}_{rp}.log", + shell: + """ + cp {input} {output} + """ + +elif os.environ.get("SUNBEAM_SKIP", "").lower() == "qc": + + rule 
skip_qc: + input: + lambda wildcards: Samples[wildcards.sample][wildcards.rp], + output: + QC_FP / "cleaned" / "{sample}_{rp}.fastq.gz", + log: + LOG_FP / "skip_qc_{sample}_{rp}.log", + shell: + """ + cp {input} {output} + """ + + include: "rules/decontaminate.smk" + +else: + + include: "rules/qc.smk" + include: "rules/decontaminate.smk" for sbx_path, wildcards in sbxs: From b58d55efc7f6331965a5fda0503a82fd8cbbda7a Mon Sep 17 00:00:00 2001 From: Ulthran Date: Tue, 11 Jun 2024 11:34:19 -0400 Subject: [PATCH 16/22] Add chop.yml --- src/sunbeamlib/chop.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 src/sunbeamlib/chop.yml diff --git a/src/sunbeamlib/chop.yml b/src/sunbeamlib/chop.yml new file mode 100644 index 00000000..64633bb5 --- /dev/null +++ b/src/sunbeamlib/chop.yml @@ -0,0 +1,6 @@ +# Template for running on CHOP HPC + +qc: + host_fp: /mnt/isilon/microbiome/analysis/biodata/hosts +sbx_kraken: + kraken_db_fp: '/mnt/isilon/microbiome/analysis/biodata/kraken2db/standard_20200204' From 64f183ae7eafb6d2c646b3131af1190735947d68 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 12 Jun 2024 13:02:00 -0400 Subject: [PATCH 17/22] Add --skip to commands doc --- docs/commands.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/commands.rst b/docs/commands.rst index d1f89589..ecdc7d4f 100755 --- a/docs/commands.rst +++ b/docs/commands.rst @@ -46,7 +46,7 @@ Sunbeam Commands Executes the Sunbeam pipeline by calling Snakemake. .. code-block:: shell - sunbeam run [-h] [-m] [-s PATH] [--target_list [TARGETS, ...]] [--include [INCLUDES, ...]] [--exclude [EXCLUDE, ...]] [--docker_tag TAG] + sunbeam run [-h] [-m] [-s PATH] [--target_list [TARGETS, ...]] [--include [INCLUDES, ...]] [--exclude [EXCLUDE, ...]] [--skip SKIP] [--docker_tag TAG] .. tip:: The ``--target_list`` option is deprecated. Pass the targets directly to ``sunbeam run`` instead. 
@@ -58,6 +58,8 @@ Sunbeam Commands ``sunbeam run --profile /path/to/project/ all_decontam all_assembly all_annotation`` 3. The equivalent of 2, using the deprecated ``--target_list`` option: ``sunbeam run --profile /path/to/project/ --target_list all_decontam all_assembly all_annotation`` + 4. To run assembly on samples that have already been decontaminated: + ``sunbeam run --profile /path/to/project/ --skip decontam all_assembly`` .. code-block:: shell -h/--help: Display help. @@ -66,6 +68,7 @@ Sunbeam Commands --target_list: A list of targets to run successively. (DEPRECATED) --include: List of extensions to include in run. --exclude: List of extensions to exclude from run, use 'all' to exclude all extensions. + --skip: Either 'qc' to skip the quality control steps or 'decontam' to skip the quality control and decontamination. --docker_tag: Tag to use for internal environment docker images. Try 'latest' if the default tag doesn't work. : You can pass further arguments to Snakemake, e.g: ``$ sunbeam run --cores 12``. See http://snakemake.readthedocs.io for more information. 
From 5b0b39354fae968ac93a0da81b6c3bc321356bc6 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 12 Jun 2024 14:01:15 -0400 Subject: [PATCH 18/22] Add fields to CHOP template --- src/sunbeamlib/chop.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sunbeamlib/chop.yml b/src/sunbeamlib/chop.yml index 64633bb5..3a1c5c23 100644 --- a/src/sunbeamlib/chop.yml +++ b/src/sunbeamlib/chop.yml @@ -4,3 +4,7 @@ qc: host_fp: /mnt/isilon/microbiome/analysis/biodata/hosts sbx_kraken: kraken_db_fp: '/mnt/isilon/microbiome/analysis/biodata/kraken2db/standard_20200204' +sbx_gene_clusters: + genes_fp: /mnt/isilon/microbiome/analysis/biodata/diamondIndexes/v2.1.6.160 +sbx_mapping: + genomes_fp: /mnt/isilon/microbiome/analysis/biodata/bwa_and_bowtie2/six_fungal_genomes \ No newline at end of file From c57723fe22b9ddb1d61df4b930c6dbf1ac06bf1a Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 10 Jul 2024 11:54:09 -0400 Subject: [PATCH 19/22] Update chop.yml --- src/sunbeamlib/chop.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sunbeamlib/chop.yml b/src/sunbeamlib/chop.yml index 3a1c5c23..608bf2bd 100644 --- a/src/sunbeamlib/chop.yml +++ b/src/sunbeamlib/chop.yml @@ -7,4 +7,7 @@ sbx_kraken: sbx_gene_clusters: genes_fp: /mnt/isilon/microbiome/analysis/biodata/diamondIndexes/v2.1.6.160 sbx_mapping: - genomes_fp: /mnt/isilon/microbiome/analysis/biodata/bwa_and_bowtie2/six_fungal_genomes \ No newline at end of file + genomes_fp: /mnt/isilon/microbiome/analysis/biodata/bwa_and_bowtie2/six_fungal_genomes +sbx_metaphlan4: + dbdir: "/mnt/isilon/microbiome/analysis/biodata/metaphlan_databases/v4" + dbname: "mpa_vOct22_CHOCOPhlAnSGB_202212" \ No newline at end of file From c27d3fad28cc65a7e93f079b25e08528dc449fc4 Mon Sep 17 00:00:00 2001 From: Ulthran Date: Fri, 9 Aug 2024 13:19:21 -0400 Subject: [PATCH 20/22] Add sunbeam env vars to docs --- docs/dev.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/dev.rst b/docs/dev.rst index 
fee6b385..fc541f5e 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -35,6 +35,19 @@ Variables defined in the main Snakefile can be accessed throughout the workflow. - ``BENCHMARK_FP``: Path - The Path to the project's benchmarking output directory. - ``LOG_FP``: Path - The Path to the project's log output directory. +Environment Variables +--------------------- + +- ``SUNBEAM_DIR``: str - The path to the Sunbeam installation directory. +- ``SUNBEAM_VER``: str - The version of Sunbeam being run. +- ``SUNBEAM_EXTS_INCLUDE``: str - If set, will include the given extension(s) in the workflow (and exclude the rest). This is useful for testing individual extensions. +- ``SUNBEAM_EXTS_EXCLUDE``: str - If set, will exclude the given extension(s) from the workflow. This is useful when namespaces between extensions collide (same rule name multiple times). +- ``SUNBEAM_SKIP``: str - Set to 'qc' to skip the quality control steps, or to 'decontam' to skip both quality control and decontamination. If unset or empty, nothing is skipped. +- ``SUNBEAM_DOCKER_TAG``: str - If set, will use the given tag for the Docker image instead of the default. +- ``SUNBEAM_MIN_MEM_MB``: int - If set, will override the default minimum memory value. +- ``SUNBEAM_MIN_RUNTIME``: int - If set, will override the default minimum runtime value. +- ``SUNBEAM_NO_ADAPTER``: bool - If set, will not check that the adapter template file exists. + tests ===== From 76ec41b71095dc9f985083630b1bda800ed3b2cd Mon Sep 17 00:00:00 2001 From: Ulthran Date: Mon, 12 Aug 2024 14:12:16 -0400 Subject: [PATCH 21/22] Add skip example --- docs/examples.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/examples.rst b/docs/examples.rst index b57fcdf5..4f5fdc57 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -159,6 +159,19 @@ Then you submit the job: sbatch run_sunbeam.sh +Skipping the QC and Decontamination +=================================== + +This time you're coming to Sunbeam with a data set that you have already run QC on and removed host reads from.
You want to run the assembly pipeline on this data. Your data is paired-end and lives in a directory called ``/data``. Run: + +.. code-block:: bash + + sunbeam extend https://github.com/sunbeam-labs/sbx_assembly + sunbeam init --data_fp /data/ /projects/my_project/ + sunbeam run --profile /projects/my_project --skip decontam all_assembly + +Once this run completes, you will have a directory called ``/projects/my_project/sunbeam_output/`` that contains all of the output from the run. Look in ``/projects/my_project/sunbeam_output/assembly/contigs/`` for the assembled contigs. + Running on AWS Batch with AWS S3 Data ====================================== From d9d2184a71c39fb862b379995c9d30b65a7f91be Mon Sep 17 00:00:00 2001 From: Ulthran Date: Mon, 12 Aug 2024 14:21:04 -0400 Subject: [PATCH 22/22] Remove debug --- workflow/Snakefile | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 2ba5bfd5..afe9ce29 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -51,15 +51,9 @@ MIN_RUNTIME = int(os.getenv("SUNBEAM_MIN_RUNTIME", 15)) # Check for major version compatibility pkg_major, cfg_major = check_compatibility(config) -if pkg_major > cfg_major: +if pkg_major != cfg_major: raise SystemExit( - "\nThis config file was created with an older version of Sunbeam" - " and may not be compatible. Create a new config file using" - "`sunbeam init` or update this one using `sunbeam config update -i /path/to/sunbeam_config.yml`\n" - ) -elif pkg_major < cfg_major: - raise SystemExit( - "\nThis config file was created with an older version of Sunbeam" + "\nThis config file was created with a different version of Sunbeam" " and may not be compatible. 
Create a new config file using" "`sunbeam init` or update this one using `sunbeam config update -i /path/to/sunbeam_config.yml`\n" ) @@ -88,7 +82,6 @@ if Cfg["qc"]["host_fp"] == Cfg["all"]["root"]: HostGenomeFiles = [] else: HostGenomeFiles = [f for f in Cfg["qc"]["host_fp"].glob("*.fasta")] - print(HostGenomeFiles) if not HostGenomeFiles: sys.stderr.write( "\n\nWARNING: No files detected in host genomes folder ({}). "