From 50b9f531f93401c09fcbdbc79738999e5f684e22 Mon Sep 17 00:00:00 2001 From: Bede Constantinides Date: Sun, 23 Jul 2023 13:01:41 +0100 Subject: [PATCH] 0.1.0; fixes issues 10,19,20,22,23,25 --- README.md | 22 +- src/hostile/__init__.py | 2 +- src/hostile/aligner.py | 45 +-- src/hostile/cli.py | 51 ++-- src/hostile/lib.py | 55 ++-- src/hostile/util.py | 51 ++-- tests/data/h37rv_10.r1.fastq | 40 --- tests/data/h37rv_10.r1.fastq.gz | Bin 1508 -> 0 bytes tests/data/h37rv_10.r2.fastq | 40 --- tests/data/h37rv_10.r2.fastq.gz | Bin 1501 -> 0 bytes tests/data/human.1k.fa.gz | Bin 19004 -> 0 bytes tests/data/mixed_human_100_1.fastq.gz | Bin 3497 -> 0 bytes tests/data/mixed_human_100_2.fastq.gz | Bin 3798 -> 0 bytes .../partial-for-mask-testing.fa.gz | Bin .../sars-cov-2.1.bt2} | Bin .../sars-cov-2.2.bt2} | Bin .../sars-cov-2.3.bt2} | Bin .../sars-cov-2.4.bt2} | Bin .../sars-cov-2.fasta.gz} | Bin .../sars-cov-2.rev.1.bt2} | Bin .../sars-cov-2.rev.2.bt2} | Bin tests/data/sars-cov-2_100_1.fastq.gz | Bin 0 -> 4104 bytes tests/data/sars-cov-2_100_2.fastq.gz | Bin 0 -> 4263 bytes tests/data/sars-cov-2_1_1.fastq | 4 + tests/data/sars-cov-2_1_2.fastq | 4 + tests/data/tuberculosis_1_1.fastq | 4 + tests/data/tuberculosis_1_2.fastq | 4 + tests/test_all.py | 277 ++++++++++++++---- 28 files changed, 365 insertions(+), 234 deletions(-) delete mode 100644 tests/data/h37rv_10.r1.fastq delete mode 100644 tests/data/h37rv_10.r1.fastq.gz delete mode 100644 tests/data/h37rv_10.r2.fastq delete mode 100644 tests/data/h37rv_10.r2.fastq.gz delete mode 100644 tests/data/human.1k.fa.gz delete mode 100644 tests/data/mixed_human_100_1.fastq.gz delete mode 100644 tests/data/mixed_human_100_2.fastq.gz rename tests/data/{MN908947 => sars-cov-2}/partial-for-mask-testing.fa.gz (100%) rename tests/data/{MN908947/MN908947.1.bt2 => sars-cov-2/sars-cov-2.1.bt2} (100%) rename tests/data/{MN908947/MN908947.2.bt2 => sars-cov-2/sars-cov-2.2.bt2} (100%) rename tests/data/{MN908947/MN908947.3.bt2 => sars-cov-2/sars-cov-2.3.bt2} (100%) rename tests/data/{MN908947/MN908947.4.bt2 => sars-cov-2/sars-cov-2.4.bt2} (100%) rename tests/data/{MN908947/MN908947.fasta.gz => sars-cov-2/sars-cov-2.fasta.gz} (100%) rename tests/data/{MN908947/MN908947.rev.1.bt2 => sars-cov-2/sars-cov-2.rev.1.bt2} (100%) rename tests/data/{MN908947/MN908947.rev.2.bt2 => sars-cov-2/sars-cov-2.rev.2.bt2} (100%) create mode 100644 tests/data/sars-cov-2_100_1.fastq.gz create mode 100644 tests/data/sars-cov-2_100_2.fastq.gz create mode 100644 tests/data/sars-cov-2_1_1.fastq create mode 100644 tests/data/sars-cov-2_1_2.fastq create mode 100644 tests/data/tuberculosis_1_1.fastq create mode 100644 tests/data/tuberculosis_1_2.fastq diff --git a/README.md b/README.md index b8520c2..7834e00 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,18 @@ -[![Tests](https://github.com/bede/hostile/actions/workflows/test.yml/badge.svg)](https://github.com/bede/hostile/actions/workflows/test.yml) [![PyPI version](https://img.shields.io/pypi/v/hostile)](https://pypi.org/project/hostile/) [![Bioconda version](https://anaconda.org/bioconda/hostile/badges/version.svg)](https://anaconda.org/bioconda/hostile/) [![Install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square&logo=anaconda)](https://biocontainers.pro/tools/hostile) [![Install with Docker](https://img.shields.io/badge/install%20with-docker-important.svg?style=flat-square&logo=docker)](https://biocontainers.pro/tools/hostile) [![DOI:10.1101/2023.07.04.547735](http://img.shields.io/badge/BioRxiv-10.1101/2023.07.04.547735-bd2736.svg)](https://doi.org/10.1101/2023.07.04.547735) +[![Tests](https://github.com/bede/hostile/actions/workflows/test.yml/badge.svg)](https://github.com/bede/hostile/actions/workflows/test.yml) [![PyPI version](https://img.shields.io/pypi/v/hostile)](https://pypi.org/project/hostile/) [![Bioconda version](https://anaconda.org/bioconda/hostile/badges/version.svg)](https://anaconda.org/bioconda/hostile/) [![Install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square&logo=anaconda)](https://biocontainers.pro/tools/hostile) [![Install with Docker](https://img.shields.io/badge/install%20with-docker-important.svg?style=flat-square&logo=docker)](https://biocontainers.pro/tools/hostile) [![DOI:10.1101/2023.07.04.547735](http://img.shields.io/badge/BioRxiv-10.1101/2023.07.04.547735-bd2736.svg)](https://www.biorxiv.org/content/10.1101/2023.07.04.547735) # Hostile -Hostile removes host sequences from short and long reads, consuming paired or unpaired `fastq[.gz]` input. Batteries are included – Hostile downloads and saves a human T2T-CHM13v2.0 + HLA reference when run for the first time. Read headers can be replaced with integers (using `--rename`) for privacy and more compressible FASTQs. Hostile is implemented as a Python package with a CLI and Python API, but heavy lifting is all done by compiled code (Minimap2/Bowtie2 and Samtools). Bowtie2 is the default and recommended aligner for short (paired) reads while Minimap2 is default and recommended aligner for long reads. When used with a masked reference genome, Hostile achieves near-perfect retention of microbial reads while removing >99.5% of human reads. Please read the [BioRxiv preprint](https://www.biorxiv.org/content/10.1101/2023.07.04.547735) for further information and open a GitHub issue, [tweet](https://twitter.com/beconsta) or [toot](https://mstdn.science/@bede) me to report issues or suggest improvements. - - +Hostile removes host sequences from short and long reads, consuming paired or unpaired `fastq[.gz]` input. Batteries are included – a human reference genome is downloaded when run for the first time. For maximum retention of microbial reads, an existing masked reference genome can be downloaded, or a new one created for target organisms. When used with a masked reference genome, Hostile achieves near-perfect retention of microbial reads while removing >99.6% of human reads. Read headers can be replaced with integers (using `--rename`) for privacy and smaller FASTQs. Heavy lifting is done with fast existing tools (Minimap2/Bowtie2 and Samtools). Bowtie2 is the default aligner for short (paired) reads while Minimap2 is default aligner for long reads. Benchmarks and further info can be found in the [BioRxiv preprint](https://www.biorxiv.org/content/10.1101/2023.07.04.547735) (please cite if useful!). Feel free open an issue, [tweet](https://twitter.com/beconsta) or [toot](https://mstdn.science/@bede) me to report problems or suggest improvements. ## Reference genomes -The default `human-t2t-hla` reference is downloaded when running Hostile for the first time. This can be overriden by specifying a custom `--index`. Bowtie2 indexes need to be untarred before use. The databases `human-t2t-hla` and `human-t2t-hla-argos985-mycob140` were compared in the [paper](https://www.biorxiv.org/content/10.1101/2023.07.04.547735). +The default `human-t2t-hla` reference is downloaded when running Hostile for the first time. This can be overriden by specifying a custom `--index`. Bowtie2 indexes need to be untarred before use. The databases `human-t2t-hla` and `human-t2t-hla-argos985-mycob140` were compared in the [paper](https://www.biorxiv.org/content/10.1101/2023.07.04.547735). -| Name | Composition | Genome (Minimap2) | Bowtie2 index | -| :-------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | -| `human-t2t-hla` **(default)** | [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) + [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51 | [human-t2t-hla.fa.gz](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla.fa.gz) | [human-t2t-hla.tar](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla.tar) | -| `human-t2t-hla-argos985` | [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) & [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51; masked with [985](https://github.com/bede/hostile/blob/main/paper/supplementary-table-2.tsv) [FDA-ARGOS](https://www.ncbi.nlm.nih.gov/bioproject/231221) 150mers | [human-t2t-hla-argos985.fa.gz](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985.fa.gz) | [human-t2t-hla-argos985.tar](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985.tar) | -| `human-t2t-hla-argos985-mycob140` | [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) & [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51; masked with [985](https://github.com/bede/hostile/blob/main/paper/supplementary-table-2.tsv) [FDA-ARGOS](https://www.ncbi.nlm.nih.gov/bioproject/231221) & [140](https://github.com/bede/hostile/blob/main/paper/supplementary-table-2.tsv) mycobacterial 150mers | [human-t2t-hla-argos985-mycob140.fa.gz](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985-mycob140.fa.gz) | [human-t2t-hla-argos985-mycob140.tar](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985-mycob140.tar) | +| Name | Composition | Genome (Minimap2) | Bowtie2 index | Date | +| :-------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | ------- | +| `human-t2t-hla` **(default)** | [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) + [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51 | [human-t2t-hla.fa.gz](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla.fa.gz) | [human-t2t-hla.tar](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla.tar) | 2023-07 | +| `human-t2t-hla-argos985` | [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) & [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51; masked with [985](https://github.com/bede/hostile/blob/main/paper/supplementary-table-2.tsv) [FDA-ARGOS](https://www.ncbi.nlm.nih.gov/bioproject/231221) 150mers | [human-t2t-hla-argos985.fa.gz](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985.fa.gz) | [human-t2t-hla-argos985.tar](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985.tar) | 2023-07 | +| `human-t2t-hla-argos985-mycob140` | [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) & [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51; masked with [985](https://github.com/bede/hostile/blob/main/paper/supplementary-table-2.tsv) [FDA-ARGOS](https://www.ncbi.nlm.nih.gov/bioproject/231221) & [140](https://github.com/bede/hostile/blob/main/paper/supplementary-table-2.tsv) mycobacterial 150mers | [human-t2t-hla-argos985-mycob140.fa.gz](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985-mycob140.fa.gz) | [human-t2t-hla-argos985-mycob140.tar](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o/human-t2t-hla-argos985-mycob140.tar) | 2023-07 | @@ -25,7 +23,7 @@ Installation with conda/mamba or Docker is recommended due to non-Python depende **Conda/mamba** ```bash -conda create -n hostile -c conda-forge -c bioconda hostile # mamba/micromamba are faster +conda create -n hostile -c conda-forge -c bioconda hostile # Mamba/Micromamba are faster conda activate hostile ``` @@ -50,7 +48,7 @@ pip install hostile # Requires python >= 3.10 ```bash git clone https://github.com/bede/hostile.git cd hostile -conda env create -f environment.yml # Mamba/micromamba are faster +conda env create -f environment.yml # Mamba/Micromamba are faster conda activate hostile pip install --editable '.[dev]' pytest diff --git a/src/hostile/__init__.py b/src/hostile/__init__.py index 486e22f..2f969cb 100644 --- a/src/hostile/__init__.py +++ b/src/hostile/__init__.py @@ -1,2 +1,2 @@ """Accurate host read removal""" -__version__ = "0.0.3" +__version__ = "0.1.0" diff --git a/src/hostile/aligner.py b/src/hostile/aligner.py index 2760439..de6a4e5 100644 --- a/src/hostile/aligner.py +++ b/src/hostile/aligner.py @@ -1,5 +1,7 @@ import logging +import shutil import subprocess +import tempfile from dataclasses import dataclass from pathlib import Path @@ -13,43 +15,48 @@ class Aligner: short_name: str bin_path: Path cdn_base_url: str - working_dir: Path + data_dir: Path cmd: str paired_cmd: str idx_archive_fn: str = "" ref_archive_fn: str = "" idx_name: str = "" - idx_paths: tuple[Path] = tuple() + idx_paths: tuple[Path, ...] = tuple() def __post_init__(self): self.ref_archive_url = f"{self.cdn_base_url}/{self.ref_archive_fn}" self.idx_archive_url = f"{self.cdn_base_url}/{self.idx_archive_fn}" - self.ref_archive_path = self.working_dir / self.ref_archive_fn - self.idx_archive_path = self.working_dir / self.idx_archive_fn - self.idx_path = self.working_dir / self.idx_name - Path(self.working_dir).mkdir(exist_ok=True, parents=True) + self.ref_archive_path = self.data_dir / self.ref_archive_fn + self.idx_archive_path = self.data_dir / self.idx_archive_fn + self.idx_path = self.data_dir / self.idx_name + Path(self.data_dir).mkdir(exist_ok=True, parents=True) def check(self, using_custom_index: bool): """Test aligner and check/download a ref/index if necessary""" if not using_custom_index: # Check for and if necessary fetch a genome/index if self.name == "Bowtie2": if not all(path.exists() for path in self.idx_paths): - self.working_dir.mkdir(exist_ok=True, parents=True) - logging.info(f"Fetching human index") - util.download(self.idx_archive_url, self.idx_archive_path) - util.untar_file(self.idx_archive_path, self.working_dir) - self.idx_archive_path.unlink() + self.data_dir.mkdir(exist_ok=True, parents=True) + logging.info(f"Fetching human index ({self.idx_archive_url})") + with tempfile.NamedTemporaryFile() as temporary_file: + tmp_path = Path(temporary_file.name) + util.download(self.idx_archive_url, tmp_path) + logging.info("Extracting index…") + util.untar_file(tmp_path, self.data_dir) logging.info(f"Saved human index ({self.idx_path})") else: logging.info(f"Found cached index ({self.idx_path})") elif self.name == "Minimap2": if not self.ref_archive_path.exists(): - util.download(self.ref_archive_url, self.ref_archive_path) + with tempfile.NamedTemporaryFile(delete=False) as temporary_file: + tmp_path = Path(temporary_file.name) + util.download(self.ref_archive_url, tmp_path) + shutil.move(tmp_path, self.ref_archive_path) logging.info(f"Saved human reference ({self.ref_archive_path})") else: logging.info(f"Found cached reference ({self.ref_archive_path})") try: - util.run(f"{self.bin_path} --version", cwd=self.working_dir) + util.run(f"{self.bin_path} --version", cwd=self.data_dir) except subprocess.CalledProcessError: logging.warning(f"Failed to execute {self.bin_path}") raise RuntimeError(f"Failed to execute {self.bin_path}") @@ -84,8 +91,9 @@ def gen_clean_cmd( "{FASTQ}": str(fastq), "{THREADS}": str(threads), } + alignment_cmd = self.cmd for k in cmd_template.keys(): - self.cmd = self.cmd.replace(k, cmd_template[k]) + alignment_cmd = alignment_cmd.replace(k, cmd_template[k]) rename_cmd = ( ' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int((NR+1)/2)" "; print $0}}\'' if rename @@ -93,7 +101,7 @@ def gen_clean_cmd( ) cmd = ( # Align, stream reads to stdout in SAM format - f"{self.cmd}" + f"{alignment_cmd}" # Count reads in stream before filtering (2048 + 256 = 2304) f" | tee >(samtools view -F 2304 -c - > '{count_before_path}')" # Discard mapped reads @@ -105,7 +113,6 @@ def gen_clean_cmd( # Stream remaining records into fastq files f" | samtools fastq --threads 2 -c 6 -0 '{fastq_out_path}'" ) - logging.debug(f"{cmd}") return cmd def gen_paired_clean_cmd( @@ -142,8 +149,9 @@ def gen_paired_clean_cmd( "{FASTQ2}": str(fastq2), "{THREADS}": str(threads), } + alignment_cmd = self.paired_cmd for k in cmd_template.keys(): - self.paired_cmd = self.paired_cmd.replace(k, cmd_template[k]) + alignment_cmd = alignment_cmd.replace(k, cmd_template[k]) rename_cmd = ( f' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int((NR+1)/2)" "; print $0}}\'' if rename @@ -151,7 +159,7 @@ def gen_paired_clean_cmd( ) cmd = ( # Align, stream reads to stdout in SAM format - f"{self.paired_cmd}" + f"{alignment_cmd}" # Count reads in stream before filtering (2048 + 256 = 2304) f" | tee >(samtools view -F 2304 -c - > '{count_before_path}')" # Discard mapped reads and reads with mapped mates @@ -163,5 +171,4 @@ def gen_paired_clean_cmd( # Stream remaining records into fastq files f" | samtools fastq --threads 2 -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}'" ) - logging.debug(f"{cmd}") return cmd diff --git a/src/hostile/cli.py b/src/hostile/cli.py index 765665c..4f1325e 100644 --- a/src/hostile/cli.py +++ b/src/hostile/cli.py @@ -1,4 +1,5 @@ import json +import logging from enum import Enum from pathlib import Path @@ -41,7 +42,8 @@ def clean( :arg debug: show debug messages """ - # Choose a sensible aligner + if debug: + logging.getLogger().setLevel(logging.DEBUG) aligner_paired = ( lib.ALIGNER.bowtie2 if aligner == ALIGNER.auto or aligner == ALIGNER.bowtie2 @@ -52,7 +54,6 @@ def clean( if aligner == ALIGNER.auto or aligner == ALIGNER.minimap2 else lib.ALIGNER.bowtie2 ) - if fastq2: stats = lib.clean_paired_fastqs( [(fastq1, fastq2)], @@ -76,6 +77,29 @@ def clean( print(json.dumps(stats, indent=4)) +def mask( + reference: Path, target: Path, out_dir: Path = Path("masked"), threads: int = 1 +) -> None: + """ + Mask reference genome against target genome[s] + + :arg reference: path to reference genome in fasta[.gz] format + :arg target: path to target genome(s) in fasta[.gz] format + :arg out_dir: path to output directory + :arg threads: number of threads to use + """ + lib.mask(reference=reference, target=target, out_dir=out_dir, threads=threads) + + +def main(): + defopt.run( + {"clean": clean, "mask": mask}, + no_negated_flags=True, + strict_kwonly=False, + short={}, + ) + + # def clean_many( # *fastqs: str, # aligner: lib.ALIGNER = lib.ALIGNER.bowtie2, @@ -109,26 +133,3 @@ def clean( # raise NotImplementedError( # "Forward and reverse fastq(.gz) paths should be separated with a comma" # ) - - -def mask( - reference: Path, target: Path, out_dir: Path = Path("masked"), threads: int = 1 -) -> None: - """ - Mask reference genome against target genome[s] - - :arg reference: path to reference genome in fasta[.gz] format - :arg target: path to target genome(s) in fasta[.gz] format - :arg out_dir: path to output directory - :arg threads: number of threads to use - """ - lib.mask(reference=reference, target=target, out_dir=out_dir, threads=threads) - - -def main(): - defopt.run( - {"clean": clean, "mask": mask}, - no_negated_flags=True, - strict_kwonly=False, - short={}, - ) diff --git a/src/hostile/lib.py b/src/hostile/lib.py index b92ab7c..f341bce 100644 --- a/src/hostile/lib.py +++ b/src/hostile/lib.py @@ -1,8 +1,8 @@ import logging import gzip -import json import multiprocessing import shutil +import subprocess from enum import Enum from dataclasses import dataclass @@ -14,7 +14,11 @@ from hostile.aligner import Aligner -logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO) +logging.basicConfig( + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%H:%M:%S", + level=logging.INFO, +) CWD = Path.cwd().resolve() @@ -29,8 +33,9 @@ name="Bowtie2", short_name="bt2", bin_path=Path("bowtie2"), + # cdn_base_url="http://localhost:8000", # python -m http.server cdn_base_url=f"https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o", - working_dir=XDG_DATA_DIR, + data_dir=XDG_DATA_DIR, cmd=("{BIN_PATH} -x '{INDEX_PATH}' -U '{FASTQ}'" " -k 1 --mm -p {THREADS}"), paired_cmd=( "{BIN_PATH} -x '{INDEX_PATH}' -1 '{FASTQ1}' -2 '{FASTQ2}'" @@ -51,8 +56,9 @@ name="Minimap2", short_name="mm2", bin_path=Path("minimap2"), + # cdn_base_url="http://localhost:8000", # python -m http.server cdn_base_url=f"https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o", - working_dir=XDG_DATA_DIR, + data_dir=XDG_DATA_DIR, cmd="{BIN_PATH} -ax map-ont --secondary no -t {THREADS} '{REF_ARCHIVE_PATH}' '{FASTQ}'", paired_cmd="{BIN_PATH} -ax sr -m 40 --secondary no -t {THREADS} '{REF_ARCHIVE_PATH}' '{FASTQ1}' '{FASTQ2}'", ref_archive_fn="human-t2t-hla.fa.gz", @@ -81,8 +87,8 @@ class SampleReport: def gather_stats( - fastqs: list[Path, Path], out_dir: Path, aligner: str, index: Path | None -) -> dict[str, dict[str : str | int | float]]: + fastqs: list[Path], out_dir: Path, aligner: str, index: Path | None +) -> list[dict[str, str | int | float]]: stats = [] for fastq1 in fastqs: fastq1_stem = util.fastq_path_to_stem(fastq1) @@ -101,7 +107,7 @@ def gather_stats( index_fmt = ( index if index - else Path(ALIGNER[aligner].value.working_dir) + else Path(ALIGNER[aligner].value.data_dir) / Path(ALIGNER[aligner].value.idx_name) ) report = SampleReport( @@ -122,7 +128,7 @@ def gather_stats( def gather_stats_paired( fastqs: list[tuple[Path, Path]], out_dir: Path, aligner: str, index: Path | None -) -> dict[str, dict[str : str | int | float]]: +) -> list[dict[str, str | int | float]]: stats = [] for fastq1, fastq2 in fastqs: fastq1_stem = util.fastq_path_to_stem(fastq1) @@ -143,7 +149,7 @@ def gather_stats_paired( index_fmt = ( index if index - else Path(ALIGNER[aligner].value.working_dir) + else Path(ALIGNER[aligner].value.data_dir) / Path(ALIGNER[aligner].value.idx_name) ) stats.append( @@ -176,7 +182,7 @@ def choose_aligner(preferred_aligner: ALIGNER, using_custom_index: bool) -> ALIG if aligner == ALIGNER.bowtie2: aligner = ALIGNER.minimap2 logging.warning(f"Using Minimap2 instead of Bowtie2") - aligner.value.check() + aligner.value.check(using_custom_index=using_custom_index) else: raise e return aligner @@ -194,15 +200,15 @@ def clean_fastqs( if aligner == ALIGNER.bowtie2: logging.info("Using Bowtie2") elif aligner == ALIGNER.minimap2: - logging.info("Using Minimap2's long read preset (map-ont)") + logging.info("Using Minimap2's long read preset") fastqs = [Path(path).resolve() for path in fastqs] if not all(fastq.is_file() for fastq in fastqs): raise FileNotFoundError("One or more fastq files do not exist") Path(out_dir).mkdir(exist_ok=True, parents=True) aligner = choose_aligner(aligner, using_custom_index=bool(index)) - backend_cmds = { - fastq: aligner.value.gen_clean_cmd( - Path(fastq), + backend_cmds = [ + aligner.value.gen_clean_cmd( + fastq=fastq, out_dir=out_dir, index=index, rename=rename, @@ -210,10 +216,11 @@ def clean_fastqs( force=force, ) for fastq in fastqs - } + ] logging.info("Cleaning…") util.run_bash_parallel(backend_cmds, description="Cleaning") stats = gather_stats(fastqs, out_dir=out_dir, aligner=aligner.name, index=index) + logging.info("Complete") return stats @@ -227,31 +234,33 @@ def clean_paired_fastqs( force: bool = False, ): if aligner == ALIGNER.bowtie2: - logging.info("Using Bowtie2") + logging.info("Using Bowtie2 (paired reads)") elif aligner == ALIGNER.minimap2: - logging.info("Using Minimap2's short read preset (sr)") + logging.info("Using Minimap2's short read preset (paired reads)") fastqs = [(Path(path1).resolve(), Path(path2).resolve()) for path1, path2 in fastqs] if not all(path.is_file() for fastq_pair in fastqs for path in fastq_pair): raise FileNotFoundError("One or more fastq files do not exist") Path(out_dir).mkdir(exist_ok=True, parents=True) aligner = choose_aligner(aligner, using_custom_index=bool(index)) - backend_cmds = { - p: aligner.value.gen_paired_clean_cmd( - Path(p[0]), - Path(p[1]), + backend_cmds = [ + aligner.value.gen_paired_clean_cmd( + fastq1=pair[0], + fastq2=pair[1], out_dir=out_dir, index=index, rename=rename, threads=threads, force=force, ) - for p in fastqs - } + for pair in fastqs + ] + logging.debug(f"{backend_cmds=}") logging.info("Cleaning…") util.run_bash_parallel(backend_cmds, description="Cleaning") stats = gather_stats_paired( fastqs, out_dir=out_dir, aligner=aligner.name, index=index ) + logging.info("Complete") return stats diff --git a/src/hostile/util.py b/src/hostile/util.py index 4c4db93..e484381 100644 --- a/src/hostile/util.py +++ b/src/hostile/util.py @@ -1,8 +1,8 @@ import concurrent.futures +import logging import subprocess import tarfile -from functools import partial from pathlib import Path import httpx @@ -23,13 +23,30 @@ def run_bash(cmd: str, cwd: Path | None = None) -> subprocess.CompletedProcess: ) +def handle_alignment_exceptions(exception: subprocess.CalledProcessError) -> None: + """Catch samtools view's non-zero exit if all input reads are contaminated""" + alignment_successful = False + stream_empty = False + if 'Failed to read header for "-"' in exception.stderr: + stream_empty = True + if "overall alignment rate" in exception.stderr: # Bowtie2 + alignment_successful = True + if "Peak RSS" in exception.stderr: # Minimap2 + alignment_successful = True + logging.debug(f"{stream_empty=} {alignment_successful=}") + if alignment_successful and stream_empty: # Non zero exit but actually fine + pass + else: + print(f"Hostile encountered a problem. Stderr below") + print(f"{exception.stderr}") + raise exception + + def run_bash_parallel( - cmds: dict[str, str], cwd: Path | None = None, description: str = "Processing tasks" -) -> dict[str, subprocess.CompletedProcess]: + cmds: list[str], description: str = "Processing" +) -> dict[int, subprocess.CompletedProcess]: with concurrent.futures.ThreadPoolExecutor(max_workers=1) as x: - futures = { - x.submit(partial(run_bash, cwd=cwd), cmd): k for k, cmd in cmds.items() - } + futures = [x.submit(run_bash, cmd) for cmd in cmds] results = {} for future in tqdm( concurrent.futures.as_completed(futures), @@ -37,32 +54,30 @@ def run_bash_parallel( desc=description, disable=len(cmds) == 1, ): - key = futures[future] + i = futures.index(future) try: - results[key] = future.result() - except Exception as e: - print(f"Exception occurred during executing command:") - print(f"{cmds[key]}") - print(f"stderr:") - print(f"{e.stderr}") + results[i] = future.result() + except subprocess.CalledProcessError as e: + handle_alignment_exceptions(e) return results def fastq_path_to_stem(fastq_path: Path) -> str: fastq_path = Path(fastq_path) - return fastq_path.name.removesuffix(fastq_path.suffixes[-1]).removesuffix( - fastq_path.suffixes[-2] - ) + stem = fastq_path.name.removesuffix(".gz") + for suffix in (".fastq", ".fq"): + stem = stem.removesuffix(suffix) + return stem def parse_count_file(path: Path) -> int: - # logging.info(f"{path=}") try: with open(path, "r") as fh: - print() count = int(fh.read().strip()) except ValueError: # file is empty and count is zero + logging.debug(f"Count file missing: {path}") count = 0 + logging.debug(f"{path=} {count=}") return count diff --git a/tests/data/h37rv_10.r1.fastq b/tests/data/h37rv_10.r1.fastq deleted file mode 100644 index c14b93f..0000000 --- a/tests/data/h37rv_10.r1.fastq +++ /dev/null @@ -1,40 +0,0 @@ -@NC_000962.3_3000195_3000563_0_1_0_0_1:0:0_0:0:0_0/1 -TCCCGTCGTAAGCATCGATTCCGACGCGCTGGATGCTGCCCGCATGCTCGCAGAGCATCGTCTGCCTGGACTATTGGTCACCGCCGGAGCGGGCAAACAGTATGCGGTACTCCCTGCCTCACAGGTCGTGCGCTTCATCGTGCCCCGCTG -+ -5430361214022-5224244425513232342544661102331222222/0231322106422524012261/22022/32253121.4240226412221023435/3264252222142422302224322324/222224/2322 -@NC_000962.3_593134_593433_0_1_0_0_5:0:0_4:0:0_1/1 -TCGTACGACCCTTGCTGCGATTCTTCTTCAACTCATGGTTTCGGGTCGAAGTCAGTGGTGTCGAGAATATCCCGCGCGATGGTGCGGCGCTGGTGGTGGCCAAACAAGCAGGTGTGTTGCCGTTTGACGGGTTCATGTTGTCGGTGGCCG -+ -713021.42322232223/61042/3132022222120424222140213104.023.26122232232133252222/23023322223222/3223225510234315352235020321253211.2423.205334523/320/23 -@NC_000962.3_916222_916482_0_1_0_0_3:0:0_3:0:0_2/1 -GGGCGGTACCGTGATCAATCGGTCAACACCATCGGGGAATGCTGACTCGGCGAGCCGTGCCCACGGCGCGGCGCTCTCGCCGTCACTCATGTCCTACCGGCCTCCGAGAGTCTAGGTGTCGGACGCCCGCGGTGTTGGCTGCGTGTCCTA -+ -212263322332/221322232212535283242242233320262663245242642022353011/2/001222502222422433132/43230250222222/022/2153322/24252.2142622234301126214523522 -@NC_000962.3_3968020_3967634_1_0_0_0_1:0:0_3:0:0_3/1 -TGGGCAGCATGTGGTACATCTCGCTGGACTATATCAATCATCAGACCTCGTTGAATGCCAGCCAGGCCCAAGCGGACCCGGATGGCAAGGTGCGCATCGTGGTCGCCGAGCAGAATCCGGGTGTGACCAACTGGGTGGAAACGGTCGGCC -+ -201322611220121332142123140-32236331434203261222.251532212142332125012332/62/30312142320362951206322202753/2503252420233322223222214122252232342413240 -@NC_000962.3_909008_909293_0_1_0_0_3:0:0_1:0:0_4/1 -TCATGGCCGTAGAGCTTGAAATACCAGTACGCGTAGGCGGCGAACCAATTGTTGTTGCCGCCGTACAGGATCACCGTGTCCTCGTTGGCGATGCCACACTCGGACAGGAGCTTGGAGAATTGCTGGGCGTCGACGAAGTCACGTTAGACC -+ -2211332234121232214422/10505/2552/22/2323121226452224524472521/146413045220230/3223222331361323223425-2522212213224205413/4572023222245531232014403212 -@NC_000962.3_244959_244622_1_0_0_0_4:0:0_3:0:0_5/1 -AAGACGCCTGCAGACCCGCATCGGGCTGGGCGAGATCCACCTGCCCGACGAGCGCAAGCGGCCCGTCAGCAACGGGCGTCCCGCACGTGATCTGGTCACAGCTGGGCTGGTTGCGGCGCGCGCCGCTGGGGACCCGCGCCCACCGCACGA -+ -32331422826212301272122223/2232323.42442420223/2233/3/13111133734352133242142/2322.4222/22330212242222443232224030332220033032412/212333211323/41216,2 -@NC_000962.3_1353795_1353364_1_0_0_0_3:0:0_3:0:0_6/1 -TTCGGAACCGTCGACGGGGCGGGGCCGAGGCGGTGGCGCGAACGACCGCAGCAGCTCGGATAGGTCTTCGGACGACCCCAGCTCGGCAGCTGCGCGCGCGTTGGCGATCGTCGCCTCAATCTTGGCGCGGTTGCGCACGATTCCGTCATC -+ -23331230236030022532153322403222202106221136222255422154126224323232337313234402201222353212225432302313424434323334022,/22241142342213214326242543213 -@NC_000962.3_2027307_2027649_0_1_0_0_5:0:0_0:0:0_7/1 -TTGGCAACCCGACCGCTCAGGGTACGCAGGCCGTGGCCGTGGCCACCGCCGCCGGTACCGCCCAGTCGACGCTGACGGAGATGATCACCGGGCTACCCAACGCGCTGCAAAGCCTCACCTCACGTCTGTTGCAGTCATCTAATGGTCCGC -+ -23356223141-3220226125241410121212725153222221402/13214262145122223213421224103424412424416..4322424482.401223/2/4340143231213321232310534224220264220 -@NC_000962.3_1684205_1684573_0_1_0_0_0:0:0_6:0:0_8/1 -GGCCGCCGCCGAAGCCGAGGGCTACCCGCTGCACAGCTTCCCGGGCGAGCCCCCCTTTGTGCGCGGCCCCTATCCGACGATGTATGTGAACCAGCCGTGGACCATCCGCCAGTACGCCGGGTTTTCCACCGCCGCGGATTCCAATGCGTT -+ -32223334024424421211/434423512362232413312582422024453155332464422022220222242-22422/262022-52535212505651332/3322331312321222202/240320247/3306132242 -@NC_000962.3_3102065_3102388_0_1_0_0_2:0:0_1:0:0_9/1 -GCCTTCCGCGCCGGCACCGGCAACAACCCCGCATGAAACCAGCGATACGCAGTCACCCGCGTAACACCGTTGCGCTCAGCCCACACCGCCAGATTCATACTGTTGTTCCTACAGCACGTCACTGACAACTACCGACCACTCAGACCGCAA -+ -132231234122402522260433723/13223232222222435335233.06243234140362/42122112/421114062211314/3222456,6220303213223632224152453430.242312230221251354420 diff --git a/tests/data/h37rv_10.r1.fastq.gz b/tests/data/h37rv_10.r1.fastq.gz deleted file mode 100644 index 429fc13c9f33912c8e7a74bec4e87313114ca2fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1508 zcmVkQb z@lpMEnBDyOJHAs&SF>CBF1X59Ra;y6PTv{7`2L-KqmAz)f2=XaT=?`pN5@59=$d^@ z{H%#vHXJl`$#(9cOHVjxEKHl|=o~mEN6d*)y<_;|OgzP~F=EbOzxLHWE7s#a+VW4qW4@<<#7)nkPJH`7py02Od<6%r`8A z`{ta*>0T-xVoK#A8IR2hMwnwtF+a388r**T`3bpR*w5K`c_FTX>-x?@%vT2_ym!zP z12nSDI|!*A~oFT79AicGMYLCZO11*BS&18?J~LjE zJn&-id$gP!s6a94t+OTZP#AK-(;JB0KsBZmbd~^j5L)3fTIGwxM||am#n|}Wo`{R!#iBZ6~ zA~|9nBh(<`bQo6ITtGwwr zK)bA>$LU-$S@G+XUzdfbKZM!^WW}M@Vb?5D?qS0r!V#*R5rz@sk6~Ir>_rEI!|zyuw3TN1kgE^Tk1bHn0` zj0B4Pl9A*%CO}*<=kR)Rj-+2{;0O#hmy7(57`t8dVgu1XRqQ`hT!FavIJ0YpOt`FuH_WH5M_5Lf&)N$Ic%fB2BU_@53#$btss;ebl!)Y+RFsZ6st2i^&Ix6MT80p5 zq(a9qo%G-0EFVamgEggcV&g@>|NI)s%FqZGOGg z6;ZmKf-L(Cl`3`hIm0Jb|8l$CyhJH@ zhr9_NCjprdmmcmTKj$UNipZ{s>D1o*IJY|;Mr}P`jvRQKzVE6mp5%mp`rUK_j6^9L zcoY>wh2Dy+CeIkE-z4XyYIucz@9;#4GUn6fqhwp_6}K5WRyM^W*@bO&xoiQio(TdH z14F8phni}y_n_bo`>Yf@+*m!fzudk=>Ocsh+Gcsev{+ju(IZNjEc?wd@6eIKvL~LV z7M}f9S)leyt*R~e`xCPLNP8C{4>TyxUyiGCcAfk1FZ(L^!ST2K#yO8i{kX2R^x@%(HFY)diG{Bb z7e15sIj{or=j61B!I*nFt|x|YmyhD6s#_V}N@?C&?FEBgOKSzUtZ6EKaHnF_Ioc~G zVyGF0Z(jMhZp}Jpy4P+v_rth7rrr9-pYM?S>G&mhYW@##7u=>}0b;!L1qMrRNs2%R zDtLFH6aaz-A0Z`=oSj%jn9DGIiY|hp3!Su%LE<&8$d8LO7fLj>W4BQi)XtcALD-JoTKkFkECVZfYLEIt?1G!eDixF8hDt2Lw}@!2*n*5CEfz~t zFRml0=QyX38^38c4kYWj_8>Q6pi;*Q5I2_G7gK-%(Oy(BfZgg1l&L)TPpi-*z(av4 zZ(fupOwDxcR5_K5NnQ~dEx`ul*8#4S8(z9m{~T~-Mm;d;z!zqMVgdF8O2+2iP2wNU zP}cGjv7h5%)p))meKz8!RczGMG4ajy^atXI4K8?<>c*@P>8abX&0t4iX%l6?9Dv7H z4$vU-uH5H}9k1Nzdg(edC3(v#>QP#6eRKyr2-2yUVTD{Ucqqp~dJqmsn6NblNBNH| zI5eY@IF`dh+!B1JNmR=P)b3E0bgG<>=?sE7426T4M!fbn9ouR4;HjSersD+NsAHvn zJh>7{5NXgXiHax|jq{Rs#`e%qZcECVARiFi*q$mWhr);K2>I>QKCu_f$m~&wX2?}8 zg#?I=nu3z>qIAM;>6}Mv{oor9WqI!|h&&hrL9;bj?|>77Xt8u~SS&SrIB#jtH#u*E zPr+6^&>uL4A56McJN&cKm^^3yM2z3)>?co%%S*-Wn~FPCESbhm|EC|y2+|fdrR0$C z)4e0IQtphOF`urnvg)O*x4j7G%M9dCIkL22$i=stq>2kO*s@4<6)Gnn)h)g)zK(?L zu;pSbjE$Z!Q~}`~hFUjN2eq6Pjbao>$ul($+QoX|Gcb0Og*1j!cMt8@68)Q1z^wg_ zu7YP|$qVv#KOTTgVabq1R@$5CrwN$_NF_~)AL$RltWR3$-g6heNSoEyD_f4ap({kH ziYwAP(t754n#NQ`Wbi_M+^pPN`Y-;Y)b+bC6!iu^ScT}rBHn48U2?{nPpTJ^cPW+p zl;~)q5E|@9o8H1lxhe)n!Xwl=07}EQo3WY|{07+P_zzEd)3KpS+Fj?-&bPiITemwQ zr~1~j2ao~GE#51wnx;WNre~~IH!)+o>D(@}_Nvz}SNf?$cx9WslJ}I&MEcEE{2qyIDDrW-%~IAd5)+V-&^^7@ncJ{{(t4T& zm^JhXMK%A%o|eLcWn+kc4A2K2>OP2`h77vuvuL{NOI$5PmPU1UdR%Ym9ju$`zZr!} z_l3}aKdq#rX=Ux(5JdI*K zokBCSu*uV}{F@{V{Zf70Tju7wSpj6c8eaEcu>yco)`?VS^55E_Ru*^G%VNuH?!E#g9pXw)^9M^v%8P>J0z@ DKED6f diff --git a/tests/data/human.1k.fa.gz b/tests/data/human.1k.fa.gz deleted file mode 100644 index 29c9e961c41d338c8cac39aeb3b048e14153bb89..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 19004 zcmV)6K*+xziwFpgXm?})188+^VQwxlYc6JC0PUU2jwQ)*UFZH4{Q(j|Qjh>m1UP)) zoJb>^3!%V8hycke@CF3v?-g!qnV*wcS>077K>`6*b!TPfiHC4n}7G)`QQKf z`S34){>MN4_Ba3Lx8on~KmGGRtxu1?{N+#o_}~8FfBhDJ8~-*B{hxpPSAX3`A7k9d zah&IQ=U4yv{jckvqy6B)Uw-}dlb?UuFW&T5PX52u?0-G^U%<)xoA%4M{iT!7*Vp#` z!_R*G|9SHN(+>NWc=Ervh4Iw~Us%syy7TYc;>$ancjKQgzP=7RzW23#{?(7%_-8xp z%TL)yWd+wXp@AKLF8_4f7n z=KJs6`A2R~Z-2V}^>K3i-jla4kD@dGu+Q3uU;R{ne?0jIPkH_DM>FhqJK*cy{=vp$ zhR%D$9DUBQzVh#No$K$J{~mr_fAjbGrTy-wfAC@Z^;56s&cA%>YW6Rm`WV1pKed|u zOQ+K8Up#d+`^QebPt3sjbMnt|wd=l4Cr5tWlVryD;`rygu@7$?AM5M$tCc9tIoGeA zYphGGKfwmz8wy7co2|E`8SKdc+JzP^8*+w<0ur#!c0xvY;PuN!%}$S28@ZT9+zHcW5p ziBTu6+dplNJbT@vWjD=zZs)e^f#aLo(znyII~_JYpFZjF^{c@%e|h?9_8*kFNhW8T+U**PO0dku2Wf7ZHx zPQIT%PjqrWhUT#v`O}%pi*s_$7>6ccW+z0=j@9pdO^-AQ zJ)4qdh-X>%VUBnDn#mA3_eP$*#&Pw~pk*!QeyGTf&0&gM#5qmYo$>^x>13s>v5Gcw zLU;}rn4G+B&}liVd>qGptR8xTpJVk-TR-ZQ^AbI{e5b!1xWGh&hpmTr?%9|)>n5}y z&+j?5YuGzeWHkw+mNvTQoQ|B^BPU4qQy!DEeO)HMS@U-tAKbqt+w;rj5BXFv8Cymx z=Jf<0&jaT39V?w%Kix;(>p9|X>!&9G%a0zHLno*b3r~m6YA< zLGL5)b&a%GYiw{b>p{jIvT@ondUc(=E{_Rd!E?gE)C8t9J9itBJZ7vrXIEsmW{6KO z9rsK}P6My6j(NJZojECDua8LM(fhR??#_|nDUa;;I}?F4c8~ScvTC9_=4|50BQSRt zLww(Ke_mz%?s?{?CF|LuCh5QDE+kD`i?r^Y<+c`ddXBXw(=TGn)2I$yqu;NNX_F<8AwwU` z3>?fl%~&t(V>*%Q$edSA_d{f0(6W~Yo-f$;tgr7jBPwZv%t}EU)056z9AX&)syOv!ygN!4mp(ZU?!{}?C zpZ9iFL`~`JdO})ZCPe}fq*Ozjr9;fIz@J6Bmc$H>ql^u1?JTo{hr7}SJm28Y)(j`W zTz%OHce>qW{c@Cu z-VO$4T`9vyqp$QWES>ya#hq!oMi6?iFtaVI!$_;ov=%i=XP%eV$VZx5y?kC>Lt~kd z69>;iE;MSU^vaT^*@vO@$f)_(8azCHDGv!0+EZ^mRHgBWD|iN^@@^>jK8%ef+mw?9vx zE?bwgkje1$Hh<>x(Y=>LKErLqchoifexa=ygg%arqn4G&>_%pjX+%s*N$8IBh*w=F zW6$N=&RmrDRl)BL2o?f!OrJa#o1p`VVsp@fY%@J8H^#L2fN23)fcyA0J#!J0JmryN z5UXR52>Ll9NIZJZy^OapJz$q0V!`9?{o-BPVyH1rL zYrMsX%K)C43S@w8u=xC!&U_HjMS6|V?KR^j{BwG`6_e(?-a)-FdHuiQpQrzsg*(<4qKt;tTxJI^p>a?z69X1tUAd}WBb%ry4I1;4_v*$g{Yay#jw zG_2_}pWia2j`v4eokL%=FiIRE2K1&WBH#o*BOl6PNM{LJ5ts?is}uv5ILc(jmD!w0 zn&z{wTXAgN$h1C_AwWG9e>w%nhI;%l74xa?s9pm^_{t_2vIp2H4A( zE)Sc?>3tR&kQrfi+!Jq;-x-OMy^s-;yXODv(kDTI9LEaDkGq)|W*^*nqbn0d`tD5Q z6d+;*$l_+JiyFRxfEnk~GmgJ1NyZ%RB=XGMGjEm5_vJFbFz(M^=S)UVId7a`&Vs2G zZw^S>OtMzM3QtKBv%Y%&vl6BCr5&a*&wo=$%m^=+paBTM3Asg74EV&JC+p-sK_&9v zgVtcfNfoi+y@0c)$L`FZ@e3F^h5<-r@^IOcS7Kdvh5tD#ACGYiq7p3YFtldAfhMoy za7{5D+Gk%*rje(nh0d(4tOZ&2GqdPQx;F~uO4IdN&&*W-y!>uV;shcbM}~arWzv=z zY*>j5?^3A_Q5VErRmeFK8y0mkmmyzdofPa6f05woSU6!ETg;u5CL|) zED53g07!c~k?%97tly+9HTjG@8Q+>Y>#WnfTnAF;*1EpD5Z1lb9L|J5v)#oW)3?vq z3O$TRy64oHUSfV7OcVjDBBlaF`u^g;4ZC@q4?f+G9sb`JP9 zfe!P8WLJ>38=%aHnx@#ayipt6V(LsPbP0M(44q&`T&D1p4AzO$golYG$X!E-dx|4o z$19$EewAszK!O11YlV#Pwt$PYM#K~dw-MW1Ab;2)`}B3S)n{BA8kD5S_f5vIu8S(j6zIt z@T3wlq-s1)H&ZTaGK+~4uE72nQsTREhRJtySc}BHyBy4{gZEJ!t^Dww|}=A&?r za4cey<{cvIc*)0f1nF>8{<&eQ=OoZFqF9mnMS@iBb}#`FCN#6bbJiLs=Oar1Y-y$! zq>5xNk_87PCoaD6~9>Ci)rYeBLw&lZbefkHKsVe2{Q>aUKBXttgSf5gMK_+${WB zy4%c6c7HU<6-~&Vwy`N$a*H9nDBDY0o7O$_P@sNbG}sbh)O@aVO;Zg(&-^(=NTQ-M zN%&!qOqQ$xw0Rk|HRswuI3u`CCm|FV;eVi_x`%9klf0Onk>QZ2*^rD>o3(zsP%Ch!>gR+^$SS z*tW=lyeMJACJCl0oLr<0PKCS08_^@7727|M_Js0g^CW~i@nXv{HxlEa zv3ssryUb*Osm#OQ%^BdbVwvBky6M$S zQ=vfmBws=-)ruY2=GUw_MOTd9^G&XH3<3aq%0Jq5mjegR6rkuzi}Jhe#D=o1{MDSP zGnfQCQt7t{-sQv$7qaPDK|kqk`XvqSFjp9fY2lIuHQ$GJBfEB_e?EYs@{nt@-`6ca zk_Kx%eu_9xEb;2(29J>Uz7WY&E$wq?vwVk|P1iL!#h9C?Xtxzm{>O6$^$ij$dI#5bdBy zxY5mAK>PuQ^S=Jc%B^MP0CNkSgmW)kocv9K2sF5 zT|COOQITjFwnTH9B>ar(&a%w#X)G`n+|=%7G`UjKmU*?>j2u#V`}J(j#T}q@8TBa+ z)2xu_B;bMSE?9=dXd9F&RKxfrGxsQjFlDw~Z)8$M6a*(JCqDE}D}SC^SdSdI83>J8 za3O$&U^GPSiU31xEw)~%tqDa2`I&_i-l_Fe*OO9#t@-O9{rFm%YmH-T19jr321DYY)s@X2|Uybq!zqP`^@oiCh+0E_%T^1^ls+GJ-C7DbHeuCfMIkr07Y-6Jyp^#;> zFSH4x#17LOc~%+#3zMkS(|q!Tv8qg9)zPnO_ty2C3!=lkt9gnH-ST7 zX`GP@rSEK6IL#|XVndic)NbI`Gcl-xw9dbYTVKQ$8zFCdwdnIxPMF$OynLpiI_u2Y z5VU5-){aa}QzL760g}HfDFl(3C;A(@0Ig`s+WbK^O0A2VXfHEd z11q)8na(CVn9LPNbGtLO!CN^F@aB3IB26QhA_eg8!9*$)4&rp~kYSOrmv~_19Y=ER z*}kXICWIvp(Si)VOcY*7!=xCOyMZJ({v+c)jJa!cj4oyNRAXjs!hWL5hT>O3M;V*9t31+Lme?Kn^(a~B40PZ;MK0I0 z*K2O8mS^>EoW>!}HJ4p78!>wqtTUg;)b-2Pmoq=BW@Pm zGC@~9^Rh;8$PmjP5v}7BqGjfS?`s>;%9Q28m5~&lSh4$sXerafldilfuZ~5m)_wXP z8%e)${TnE-eQfVrzc~QZ{(S)jUb(7HRfE-UK!NvXC~zZz_pN$oW$qXM@r0_VU;TD| zMglXtJ2Mq(b{tmG12cI13K2ZEH^`B03f=9$>b~Oz37&8NZ+MV0gLd>%KBt3D-X@2o zxSlgq@2(KyMzMr-I}=zU>t7k_MPy0&2tXQ&{WOWE1#3f4tPN6$P7vI6DaJ}gQG8K zy<2QlDq7fr3QVDiR`S<^NLbKP|FqgfMQRRtS>pV9-6Bhi1>Rzz?1NnFnMU*Yx$tu) ztPbzW%hAN8Na6YS#61a5oZA|%tCSERD+UnlE%oP#N^3ixqsp0{^U$`I&XLp$_-Sk=?hu=<2;fSOv88>4OqPKh zsB+R3Vdf2duUdX*1H3u+`pfw66Bv&_J6(DN{cEwbdEP>(nymW9o1K21wnE(uBC*WU(5WK1YGdmk2w%)@ znIa@nqf8aN!0>POHleG0AAd^R%waLD7?KwuD~WHrIBtAbrVE+?d!n;X@&xP3g%v(F%ZGA&dYVqqJc(A zup@y|L+gsJC$yy6nwO738_t?rDlI~i#Iwa#T(!B&MP8&=^$T(Ob9j=l?PZVCFe%Qv zDavB7O4_>08iApdH&RBz&s#x2D607@%{ePn=@kyhqFS;9uH$;{y*MVdhf{hb?E6)~ zk|TuT(5RF#!-k`BSv^s8YUG3&|F?tEhgm?eO25)<_g?iV<@|^I$r|SpHpyBDE(>QfWCYa~$5zI;~+u6s96Mq$!UOm7eMvoy=2tQ~pe$<&@>Zpe-{r z@yHc)Af7FWLbtFbmwAE*iXGW%p;1wCS{=Ab^Gd>TDN>X#rgEL_2~;$;RlrTQl9^@-iiy$N&M5gp&t&ohLAwgbemXIENxd+kiCThQ zm(-D)ks0-C_*+WpVvY(RE529sH7RHsMz(4ndxaIlD1!$j%nhTe9eR)X)j{p|F3wTg z$)i*u@5*J@SPf}HGfW05!eJDb@R+9_rO-x9mvKj9^-Zkebk&9&Wf)Z14H#(jqa!mq zt8055gMY_fS{Xbfx7S(k^nOE&v24pI-lZEq_7_uAS&Cs~Mn#Je#b9S=4mK(3#dsGJ zlwPQ#xRjXAo@!$&2gWpH+T2%Vf(FJWNL~(tFXQxSIy9JZyYzwy{9yMFq`wg!Tw{{|(XI=b$`!Gi$OIu}oR(Bl4-*t;)7Z~Iej%sjr)ZO0h!M+>I zhYHakX&h;+K^)$Snil_jB^zB-?k%#uw`M6R2lcvZd4cz2RMNHrLp(YxzpB`s40Wc6 z0U&snr6==!6=8*xMb7)hCChosz9e!2an(-6oOj80PHSu7nr~_D7w3J#2WBC0am@$3 zHQP>|S5YX1;3611B)tjUMT3>XrO zOX~mJTLavZiMeIa{n~Tk!ygHE(RB+&KvUr(~~>6t_4^~b$G>+tqt$A8_N_MPthq2{+= zeB3;|{oLbzynOqk+3d~dS7U%vg)TJ-StQe7`xb$!;Bx{S4Gn1Vze0w0>Jx}SK$B8mqr!GJ$fT-@3EE_1^snyVJL zTL?G2Q9zr)$qlbGLh_2uyThG1chUO2zu8~j|0n!kn)Os$+FF9mEnr~^~{`<|9u1r?VT_heI{n>MTj46*L{(Uf=F7w-0E3c)s$( z^LQHqo|)&mOW9b;QkBnW=kP@5$AM_yB#2uRt^MH=NOpV+Mzz109J@WzQ3+b6>uZxS z>XrH`z65PBhdVR*e*QqHoH;%2>zluBaG5q`|5h&vQc;D`Dm;X2&EjMcq8MFSlLmf! zCRR{+|1OiF=g(R%N0r3eFYR#m<&(-fy1n*Y3$Czf)vvEjM5YGUUywiqbYeJDJG z`waKF8q({MylFipy;S%Q3tghHck~;U<|avEkHIM_|FLj5|PU?|H*2Uh=x$AX7t_P@e^zohR#VTxo2`hoqf%N|C>%Ej; z{L_%>pTB$3_ZrdOTJh73Xm72^YI>|Q#bGc@5miw{4N}LYBZs7^GC@HRe3?`cJ_xou#LH7a)-Kb z>V?9FC^-lZlD~#8nlA^NwMK=83)@N0OgT+?yZY9R>V`tK_SRRF-%~kB6EMiL*Ebsp zfXf!x zxr81dgNEWblZ6)F?h-K6BTZ?VQelh|o*_WJ;|KOELS_-_$JO z$DaGVo*45+IPq9-Xt!MoS~FrQt9mo3$>-(noynyY#9`}X$3eZDUGzG(C+Xr038=D4eHa19 zoFxOHGr~TTmBA+&o=_vbdON=l0%W4u`${Ta<(!}M=rYp?4UFt7dG|c>`SO@bTg@ZP}hAUe&U#oG# zwt5F8pOoI^mIS!ebWPA@w`B#H5e&WM)?*kuurRVJL6y zp5=}!dHh`d_Xj&#omCCC%h=A^(F*^3|HO_~ukydc@;~$JY{>O0V}tSiYD25*uIjaB z6Yq~3T3KiJc|$8P0CrXZ-nk1>{DM&3V8>y2MIw!ytBt5~jEy&zBy6Z`lWVn@BT8_E zqcahj0+DmqQ&2)-jY(m5qf9?Is(C6!F-kGY|5cvrrSXu&=-rYF;JFI9+b*jkd(xlgIz50=UP|WEB=Yh- zR_!W?!n#_2pi=6JF!dWkM!O^$+WbVWTRQ`gF$~LUDONzi#mdWPxv2bXyW%3T8Bjj6 zV!NeN7k^XRc6-|^nQy1oDo+8`^e#i;Fd3471%7T?)z%DB26+jikYhsRQX2J29K!r0 zhr8|46}H33Sf8SXVdCBWe;^mFP&5mbQ1~k)tcf}1sKO*O80?~Q6husGCxxx#=S zA3}cyn&z>UjDb_LUoR1ImDyz};enUFuDj~^9JSSPIa${SP1O%Gh?j!U3QcM^SW%Ti zHBjC#_Zz}7t1zNHBeX{Gkt*b3WNU@Q zIi5~MwAHLLVMCR{>RQLYBu3vXR|=iZBl)UyHHY>3uRo^|##!_P5=rc7%rd>qq`|hz zed8#@)Em>>i{3%pnK_0j)!4#T#i7-?y<#(?`u6NNiWy09#d(r;wbDxWXAv zcXS82U^GUT#nvaG`Arp!nErLL7`sqUuyU!m(G`Yi>fB*@8*Id_>Q-|BEJtE&_F~Hn z<|%gYMNqyjl$fno9%_(ZXcJ}X0xzTULXr7)KnLZ{XNhZvM9$@P6egYep zeEb}rfo01y!Q9f0aw;&*+-B+Z0|tYIl`qR7OwpRi;!P{prVZ*x?p{+{>t>{}6iWfl zs@?3$meH{5BIYa&G#y}gV;RYF^V$pSy4)urnPgFuEu$7+!{j!5#NH?frBY<#v#wKp zAA!`w^;rqr1dFpI)QKcwF)MLpHUSnT3p^xBg+M@6yAY_jD4)@~uzUh_Z6)))$YGXA zrPMB?!0U)DtjFdB1T>x40gO63#_II*ZL58-MdG9kTzl=9CtyAcEQ-o(A#w;4dmX?_ z&JwpJ?!!hAo1SiaV#0nL?ijfD{J2~Aifvq({f+^_T<1d-?Av9!$i`dNfv|^4VWlQ{ z0)4N=&nm`@X7iiOG{H{@xl)bRkLy*Man=q`Yr|+Un1pM4g##fl&?`Kj6>F4r@3uHw z*l_kQT;5#KB|MY}6vgnmBvlb&(p^;m!rYtpCcvZmtuNwPo}H~Wmk>Hvi900Jp(Ot2 zO$p^7R2c5bvES#@d}B|2ZqDpCgc^J&z6DvI7hUyLwCX%wcV!cYx8I)`#B|TfG0!4A za`}XRj}K>M(%B1;1S5wV-0k8`JA)TSxW(F8XvD$ynG>e1BD$W~@GPE=LN|18x-KgW zICSH>lZxyxo@!oz)UrM-N!zFfnSB5a|TzJ%AA7n~KzErWqpGt@yYm`g$DHA#bG)Yx|vz$R+1q9guU^#}g|VQuReN4Vfr{ zJUKdUG>vTmdv{e8P{w4x5M@Gn`6j3)nXt!qkRTBhf#wxDqW@kyf2SI3M)m#;cGY%%X~Q(65MV%>y|#uy5f7!b3>Xn?XcPnThAX{OB_NPa0Ma}RecQJjGUHJU z<9Mm4!q5%QMpAPKX|Zv3pVCWMGoF%#kdxm;VuaZ>;xhLfU?3UBje+h3JY%_>vT65VgN2OfBYh? zuJS7vX`h=?&+{V}Y2SOEuP+iGF4F#}Q-1Gp+K=|^S1oBjJ3Q|fX@A_ayta+tyMLG{ z?MHj|tCqB%9iI1#v>zGc=esW-hjg$P;$vdjb;f=LQoyQ@+Ik^l640rM3DAnF*qg5P zs(Y>Iv6fw{SQ5h{1~})&3Zbc3xYHNQl$mUESd(r*)Hh!MWJOtqpevV!!&-$T(Y}zlFRJ`t(8&6l8GmNx0T+>o8ETA$gJE$C(7l}D=-jA`! z2W&+aY2d^=5}zmwT<@Qv?2*4;)Rsat^`|Yb8v!;v^+RuC(ydcbeLNE0y0wnsW&~4`? zYL43G6K3!zMsSNyA@5TBOz@g}QGY`XE@1D~6!}WtSs+87t+T-Ny7%qx!%$zXn(@^Z zOcC9Z_UGcvi@jA_%2Mri*V}jZ1HksQyUJ_*&`mv#!bk^P&}7U3erq?WhV&IkJ#PlZGQpyNw-qF0Qo@PsD{P;tcfXGp zcMLV0l_0N|5S_kSqP9SFsJO^|%Mvq37Tf>63F`OR><;hNM@Z;u!owac8P?i!Nr^3n@%7{v?q zs0tY#1^9snCte_WQy_;Xt``kW5trs05W5enrPOEYk zh|W-u&J9)z8V81qwkW=EtU3eGDN}prTMkhJM;N7c8fN(jD!LnRRRlb&%Qt7PV13A_ zhamhcz`7Sk7^N_cy36oQcp7`Sm+o8(p(tj5AEk?9;q=@EyG+By@)z~n!YtDZ`_$1^ zr_XdC)GBS#@r^d=3u#goXQksm@hYGF%lSd~bf2HxODq4fE!$bk@3Vus%;jDYNc&=w z?YkVzu^o)j(zX3bMj+W?myYENbyD6^^1LL^?h>7QIG{W2mG}s2nNyU&fvJQYHw()g zW0u5TD2m(!l$qs%annBLR3mU`wW;F9;@B^20O02!uUq0wt%pePJfopr6_76xlrz*D zj$+K<2jIN+d8Ar5MQ7F-j{X^=d9Pa(3PIdbc-;dhW!iZ_ZWN zif0<;yjLTlVY9fA~FY-U|WgK#JP3?`VCAqhUg$QW<1JI1Rh^ddt? zXu2nx52ES-WdvTsIMJV48{=H?@tEq%dy>~%b5PV$;@HV#r`e%}6(qduXbzQH6}Q5o zpN46CfW8hh^5_MbVs|KPD1>Co%O^Vk7TuecRqS1rWdewZJ2W;T!`uETGZ5gYBAR+} zD+^;5%qcBc^gC6+WM)H+NC^24?yRj2c<3m)1`}E07(5J9E7?kX)i4gQk4g@L@!dJS z7aJu2eLlQ04%6Blq!@d@9`StYL)j!0>u6j|YUHMtMle+tV$@hn5yeviP=Fo@IIwoN zJ;{;oC7vHe8x=wgIU>|i(G@jLU8OGB&2W|DGq|{QJeekt+8=UqmGk?ujwZ6Civ(!eYkc=cPuDiBs)8z2k$8~Au`YxN!w?Lfd&P%M}ES8J7 zAgsh|U3g(cWyQTrjuv-R?8S6|?qzRzYXdDN#aqNyQw&4k&4tbi`4a;y6EO>>DSk!<#bzxZ3KADAKJyWT0u_MKbX;45B#W@d-_#3f`Bre49%)gPRwcDq zpUL|fLW3p}@H-)WmM+iO^L55jnX~Ljp^F5~Me|I%eU|is_qk)v<_@-H)uo2|yI!~$ zF&Lr-Ut@lv1$+0?L*rKFxGH)gC0dkt2f z2k!~G9naK)t_Ncq!!55fgWX=hxhZB_T(;(F#hwYWCiX@s(kWt^cC;XfLDQ(LHT$RS zSk`5BHSZq zS-SB<*96Q+nC(`2xv^dqu%A^8o=~%(6k7`hyaYQH9_&GU)eFB$o_FuOh}NB@t6$1G zCUPbz>;T|S{#1r$OaVGh`a^jfLV7F4kB!Q$W_#t-(cc?Att~LvE8_-^b&HDS0?E4D zy;H>9r~4&UZZMecZnfz19v;n?8r|@I|yqPTm#@av3H73+Bb_@M{N z!IUf#$%f@hElL$xRKu9=6QH=nSY!6mR+LsYe2dg9IUBs~Fc#Q`cNKQ3W-vEsn#7!M1 zu0+^OVT@U6A^(*GlVmh=+t>k^7>XFg6vz|ONcXL00w)1a282-J18itQ35n?`9PrBn>L zfEps$Q+~fC%hx?D%V6kJj~<1x_j(&hZ2VWXUgN5yOkIsxknMH)f3q@!*of!oU_*?g ziA`n?q#NXEn?5y1J z2FuhK-f0ZTQB8SdqKA;O6TT~aoT{101Tx-M1V&cwp3nFkO*qv-XPF}`MeuTsP)W?z z-M|dt`Y~0CFhZ)HB!p~4;tb}x<&~hapem+*nF~jLl!;0U*u+A~)qy2Q*LxxR=49a8 z@M?jH@=H`}Uqcevay+o7Ql2)|utYt7mVQEnX!vr|P;#pRCJ!n&|T4uyJ zd;&0XyD=(*B@GS@8*do5Q$w}7z>XY7nf*P%W4IYpE=lN_ZV^Dil%YaeQ=kHgL4!7$ z6du|MVa*$97a7{2qE&wR+^;}gLaY9w%GkVX?fw;ObIS{62Vcb2X0)!aG$a06byNG9Ky|a9qLmgk&((B_+Wi&g@1{AtFCp4C{peewYSCfqDzRI5Wsv8 zY`nB13w=DUa+pJL!8>uIbnd?`%+QHKcPcT5BI=3lT?U7U)C*@eM#mJPe=&ItG zq2STFzFzk;{#lcu_097LrW-N6h;ZS=-36)MOPMh&;79f%t6 zIQPhZZ8*0K#59=f6}*#QAf8&?-PGdfPrE^S!Iiaa^p0Kif5)jc_>p=FYWAf+IxHf$ zdJ*$;)>3J`fcCwJouM1{jJoK?O@|#p2{XCZj1mP7_Gl|mIEFywT|dSB8KIJ5Qmtpasn+c1gh<}9r6D6F@vF^FM<4;IUG^99unvqT+6 z?o?5hIyskQg{K5#W@lTRQqO8&NSPv!74$aU_w>O9VuCo5(1566pcbq$5CQ=*_i52D zV`fS*OHkc4#5w}B9U!i+iVw4gq9L%t)7*IM1sO=1I=Q4QZLcW3V=bq*#F>yGI&L%e zWh6G}) zD%7HNJtfI4Sp2yljM))BT%F%BAn@~z?2{q%uZ$fRhWcPpQLkG+c16R ztr6IX9Rs<`su%`bZJ5tzcGGpgIOd#4ND};n+KXwdhu1V+Y34eeWmNyrje2Ym6jTGH zMM(c_%+CPMP~~3^B1q#U;%DFQz(fYqeFpI78OXOREs*4t_NDTM0*9oS+NL;<*J}!7 zIEnOM45u<{L{H0#WzkMioTm(D32{FXi}l{XdWBkU>G$CsYJ$9nHux$w`8cmlah^qS zwvTTs;B9Va(Wo3R7h2^`ua_0H&2b)O=?Rm$ZS89EDrU&Ll31l$KB_P|Mn!*nQ@UH7 z{&P#vQgZ^`&0c?C8UQ&&!XxHuaT^}7u!t_$a@S&y0+f04aS$^a;15@mlWig*#Ey64 zg4iJm%WBpfC{BO5hano;rgpaUJz~5j^m?IAn#J-GiORrrM|VS?y<>6#sUY7N*f;^} z%x78!jE2x9(;~j@#boHI0AGjTSG0<2yZA0Z$_pUMl{f^Zi7&~FGU|oiGl4H*dp2w; z?2>cr*7fTw)AWu4)(n*lEa$~W_t(3sW3SM}D1A}m)*#eNI*#iuu=kR_uOIM7DXM5% zxV~N=iPuNIF2R1@-~Isw3i|aa-;zO`^`ga_@`C$YBXMIRPHY5F^IaC=ED=O26JdjHW&NdssP}nz znn5XQhNZ5}Pj74o@5tOCD;D*T;vxYT8(NV>s|{GB`;RT60mEL{6q{&G$r8Lvnvf>^ zXE~tZF5pqQw;?=50}3YY$)I2k1T@lj7vn4K-7jl3*j)3j0%ebhFVoi zR{85R1Zl6ZPUSpZNuo|Yx@qfx}my#H04K3 zqlUQW=xQl@8P|NK6UC7xL^my=%u;Z#OR%*U-A~?K;)9r*31NUs2hh54u_*kmzmnLq z`l*}9sx3i#bk9TWc?>Gzg>bOx)9i7U_N+dN1@bmwFiK zO_|ilgj`KtCB?U8*Gr(6I7yfL>_rxhZlaIt0<#z;^B@_{UTx)WyE>5qS6$c~K9pJK zL*r;8A?jgRn|&#`YZ&bworm$+wUttoYbFM0+#t-Ucv9*HIR$Y>2=vXi^tX4714vbxfB+2o;8b(3AuLiZe^- z5C%M0d;cYYlR-i+T(u-i;nEouv!vj?GP=ClaqZmfE-j|j973y%OXe6XkSXZQED5$Q zWtoFb4aEf6;M1bj_R^l6N4*K^>2-AS*xXrR5K&qbp^?S_^EyY7<3Tkp{zA;6G*GNq zy0tgDxhAB%A}5(|6D{H>v?()#d(P?`0sUQxc@>Hx- zRK6c-+d3Q0Xka{XvMsZvno6b}f&m+tL{YE?Cs!H+qX0BlkTyx0ddw&DjyO{%b>8tQi4YNa# z{X0}+C3X|0I&T!9lBs<6O9ASS>NLMK&*RYXikC1-DsR?uB- z8cLuN-(U>6;C=CI%DgBdS#SOJ~9A$Gss4pvsUl|hMeHA5V;?u78 z7bYCX-gobC^!COm*V`r8PhxL+QLhnG4E*h;7w&%k64~5eZ+h{rvN6?x*TJ9GPAMk> zSzwX-+Ap6(82-f@6Wp_x_UG$cN{#;ZKMwnICnlN*!IcBu95|gNCEEq3lari=grXsa z&13Mku+b3~@vuj!!!H`ayRPn-#VJC^u76r+^`kO@3_A3&_G#YMQGjX%1uH-Q`$v-j?{vSD6&+5)macl%EMjE^9;WG;r=9wf=qE z0?e(wPJ>u@PWf2s-#)yhD7UNJ8-a)-{`fvq`F37*YP)BZ+BZhH3O zjVrEkN#J-tZ3fcHg19}eo;RK9Q7h*CTXMO4&y5@RYt$LIJD6jR@Cbi7s(F-5sUc!i!*7N~QZ7zVmvh zGYm8mf`om47n@C-ssT>7x4YejOb0nxtX$vq7E;HRJ*+33N#vRbD7RDkq(BSAaFk?87J`Lfs}N?k%URQ0(p&DoHmk8ZILJSwD4Sm;6-$&y(B9#PkK zSjcynG-=1JygObTTziSSFaKL{kD{hth6~Oeq_GQ^%xM`(PEA>!f(T`r6<~>;3p&Bm zwJ7~UmuwcNR=F^0%ffq$JT9hFMnhn}K>59lCBB>p4C)|i!WeO^PH&*v0^Fgp%Bj#r zM+*w&k&3NmoS#d>WZY;R_6B2VP{N5Kz1P*H49IH`j;QemRA6D*s4Wl~rD3A;S(0v> zI)z1HBaEq}0ZK%Pc!6A@^{o)BIlmRguN@dEdR|ttvwk56nhe8M)p^;u(I!L#61Gw3 zR-#62c$RV!1r<9EJkHm~j!@*JABlT3$E=GlitA5=2tJIJj*`Mbgu}Pl+>HK{{C$Q{W@{b{F*(awjNfX}lOwQ!JK6xIJi?WOJ#6Z=}S}J%83F zuo-kwioBV1_oT_e0SfQs^kWnh+%}vTRYd6OlfCIBGu3j>Q~L7G*BRf`J|8)t6mP`X zYM98^`@L0!lW1i{c+247esQ9kV7%%Li8oBA?&@7G(T>_9=8KAOPmsvn1>rJ~?jW0w zaUin3j?Z!ln9%DeIvH?a#TAVZXbU;^o|bGpL-F=Ut;DMwynS5{eklFOD`zv#0w5oY nOroRUedMAmvO=GJ}Y6c1I*%r7}~ZfBW~Je|DyGUDy9Z|IN3p(c?Uh^UvS^^S|H!_20k# zn)R%8XJbgWB1#(?UnXo_#4{T^_$<<@Zz^0>c{f@p^kmEH06jYuFS8m ze}R>EKYq(#HdUC-Z2D2p#<*dcJJKT=|N_ zQJ~x*%5IqZZvMIc@$(NUX4{XUjKj<9OiL)gT#kD&;&&-XzpXy}*+UcFh911a(7vr< z6bctU9j**-_xKYA`~hW~Rns-PDj&-#RxMUBhARJ*ZNjizrxaYH>_S7f!Lcv^|2BQn z9#QVADEG7OwVvJ3j)6Qvtpm#MV~gX^_Q1!sqUD$Ei1McZKYCm+MCJR6D;0tcccDPp zR#EO|GXOC=*8<>XFac&Lt59a2w3Pwuub+SP^aX4BeBQ3qP)jgh!Eqe788rwnDvdL> zsyH6$8XK%usb{`GU`Eb>0pt#k zJmQ&0woT3kj(KtZyMG|C0kI|m8~Tw{!c?hn2v0OBf{aH~z&u8nhXC^*dnYjWfs=JR znkjVqwgZ{*)Wo-al~J~fexT{lN`qew$7e7{;yo5Nolx1?yu6WYm&n76^FlBy2A2rC zQH*w^Z=~%4=IKJsnXWano}3Oc!&o<-x%nHkn)Fv}Y0C5QgvWn##YuAoki#VSoX&Xg zV0af0cnF~oLI4=2N+k&6fg}7GYfG5j)eLBLfZ5ui57c3%?jfP^9=QFXpR1E3mRXy; z4)guVG10VAx%S|)jU{|KYyq=wugZDo#@UP($l10Nkn_{D>$wH0CBhZj-I0o!aEY3s znt^qp3*?JZE{eG|(FGsZIA069BoC~B1!_$3{0F5O`2ySuk}Z9z!EX4ug^Lrl!c zO6~-Iu*A7;0->IVAf88ShItq}P;^X{v3;Ug_d6qXZJh`ujsuabMEc>xv*lb`jySC? zB{|0_jaj2niH4ys%)CfhThSW_m`!yO4CNeVzB#xP6Ud|M+VOd<;id1P-B&7Bkq#RP z39RQ8&f+-H0$jVoMQ}UNpoDT>rJVQA%K0Irl@&x8EyCC?mWLExp>oUv^@iidbWEq! z&EDeOz+a^j=k5w;+kN^- z1I}(T_A9Z*=j42H+*RtXv{7yBUr`%U&P)@T4@@IAiRt)y$8?{Sc=Cn7Do3?3T;Uve zBXIQ4hqf_woE` zN+?mTTS1ur+AtI9p?7mX3_TU|b^+wqEA}yK_yFp^0%pcigh;|R9=Gz!5)GicVq6XUg@H_9EIh8uI3;?abVnNe`sy-h{0$q@BnL3FLC2k;ER{4N}=F z*~*pf%nS9bY=*E^f^KSQL;|BGCQ8w)G$cGe$R7&ybTdkv^+mK{dJB1I8>-s1&VE-M z8iP*sD(J!q*p&hI;#zQlMQR^Z_YNy855Qb^-tx` ztg)1&5t+iqc$v}tAmpS@8x|CeR5(^{!|}$4xWtpBGj%5zq=xPxdk)lL1~rd(m9SsFXk+6}T=>xV5 z_f3D5?hipZckN)i&`#RChVyy7DUKfuUtBPemlf>@A`jLN%9$|7m#^V$%Qm+vi*T~o zYT;&^5I!78$=t!dm3x30UWs$v2?o79m=PDN+&py-a*YS8b2|)gkE~63`KXuHZ<*}C z{zgIs`m9a-<1j1KT`E#_gYTQubxmwwC5qzDWkK#(KCmDC^&$&|AGq3zSAJN={s{AM zrRmUcx3G&~!iCqzoXPur;aNX>pY-#?#e7bF^5ZZQ-c9^t-UM*pipXDaBpUi9%5@tE zDBEc^ZE)J5q$h20=h1uFc`Zx%94(zIe>?utypV?4Q$HKY1C2cJNWTkG%2d|3q)=`P zReOo^bdiQ?(*;>)jd8Pat)1#iukWo}^x?QuW-rX2FU%i^49}MP6|M2W5fcm|aba>= zVzDtfrOSdlkrh~+PfMKZi-O&3%wXrC1%}pc(v;G@k2S77c`vKu+aOm{`&de+vcy#Oz`$BViMI2S!MZ|js7RIefW|20=@cLg9^&A*JbR-|} zjU+KBaE4n@;#}Vt3dG#C5bnF6qPxLWLg&|wpidz^5HD~t=E~S9wky)bWC3w~5bBw} z?ak~QIask52cZ^e8CSPyHWXfIBx{Zv4pb%A=&sfTwdwnLw6In+8QoE?!GYP9Zy_~bMW>xl5BMLb#f5%j#i6IeRr)al^=no8)j@4|hwv&f)4g$J0Hfp+eu=B63bvwNy&$Xn>cZ$n3!CYdgKQ85!_m@-1vBgL_zQXx0;avDOl4`v1h~~UT z_7ZSm)9fb^GncjOilJ<-A+Y1-(3pFaIM=-(OnnyAMldQ}Z@cMax!*S(A|>zCV&EV7 zA>9s`HCLQCDHX+i`=ITxoQoQ@r}hZ(L9m^uA_1);vJK6DA88~7r(nAq<`K~#$8xT( z2?A}5Q=pBpcL8VZs*6?YgDg?o@Uot`;xNK7^oFl89rB`g+@km33ilzIJNK8|x$qBj z1*iN;=EWd7Diez5diLGd9;tYAlC%SHQh_?f4^!h5@*FiPw);ZW5V=8;fpHXZn9a?#7 z;*R@1QLs^;BrzvQ%+xC}F(T{*VLrYJbEW*2xJt6fW+WxNOv_a`{WS%1qXDD|QYOk^ z00YRfwY-G+YImqcavohf1d2AceOIz^SKpnr-i9}zuq#`%D=_rZnqXPcD%CG%ZB)zD za)=Zo4p=LpB8EEy9{YxzjQ7AE*980eLxKjLkpP_S2#^o$dvQJve0AT^hIX75317E@ z_Sj9EH;SiU;SFU(OJe5Y!&YJvYY&rPU$=t6o^-GacW@fFAvyPkZHsx5kiA&c=k-TJudZ`5Np|_u=lOXC6k0Raj`}$Rg263+6ebESw{MFDg-abSEE3 z(YF%k`jdje3l8wO&(!K}o;vI9PJ^$<99+k1tYS_#;8_S9fnNs9GjL?5qTlX6T6M^h z(>h4KjkTUKi$(Ss#ph0Fun|a?*o&R273{~Wa@MAugEuiY?j?AW34CG?=timzuA?jz zHOg~x+P6k*vEia%nb8q^k;gNYj`V zu1z75=(>etxQQr>5? zTiTv@$lxl+7F+n;pqzv1nbjD>ZN+RXQ3)?gT!PKE;L5~C@HR5jHS}CyBR)lLl*%W+ XblZx*4gS9b*!}T;_gq9wFh&3Xtb^;Z diff --git a/tests/data/mixed_human_100_2.fastq.gz b/tests/data/mixed_human_100_2.fastq.gz deleted file mode 100644 index 2ffbdbae4af82592dc181d1a75ab13bd90a43308..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3798 zcmV;{4k_^;iwFP!000001HD|!vZS~X-S<~`QFU1;!UjR|+Asw6(BYWXm`Q3hv6Ug$F?2ox?#J8?)SEZ!0y@|Kkm=I?)wk(WBJ+!$({-Eidw)E zfa4YRc$bcv%HSE5#Ek12vw*p~u4~*0u>ANfg4q^fwzKVqX0~?f$GM%l&Y}D^9iR4odwfHqJI#Gz zc)#4fp34`Tp}z)`*pau4{ZCKTmd7|3x2)hs9M20$^F>Pvt;K^clDx3le$n5U^mz-@04X3VOC+o+qj zdmKUP+}GrsExk<=KVQpf9Q~_tb{|nPgKv@z6lH=lu&*ZyK8Jvhu4eeEXupatB)*`n z(jenZKL9?Gz}y#!xt$x^)Q+9op`P2i9(o7!C9toyhxGArtlU4CPXua;_%n~=i0<%q z&WJYtX@IAZN`DCVg$V=G*XZ-}#KFIm1m?lR+&h^6I9i3dbDZ4NgB@MBtJ_3oyuilH z3aj*DX4zM!5Qi>m3Ac+MA_yjE%=azHXWQD&%{VnqW>}bpZy5n&)xdCayYCD(qD?!;b{+$d=V69Z9|GFU8LHuk za+zXrC|elMih!>ous!54x~wxW(IF#^>1qUo#=#8GcaA#jNZI@Xr0a9yE(@HMpL-C8 zDiKQVPszE$W=^euUnLI(BtXQ=>|x|#0eI@Z^IhEO?hz$0H}zFI_w6v-0g#-b9$zG9 zCRL&Xmzvna&p~uRfAIti9DXF-!^lX`9(y#xN~`I78v0B&*p&h1kqZgKku9cXrOa!p zCE^4l1DlHib1^I?&T1ge<|`^yI4REMA`tVrcj9>f+t2;bHt(xWJSpe7u3B0XdDqK! zrInrN8tHm%{Ya)>fYdv8L>N9`bjVL(!+y(4AFlk`>;qp0!*%k*u zm(FhFn_YEc9eK2EJ;dq!qP$p7{nd*JN?ZPjexn3Gya42#<1Fb)dhW@MRIo~DQ5KS) zOQfthqfdT%mAmh2z7k$E?rw9vXpF!>OY5YL?olsYHy0T5XahnEQ9=y7D%i}xntEibJP ze@=OeaSj#5d0RqJoQ1M@@#>2^a6EFD0Jw0>D{fm~So^j?zj>@%JGXXhdZLXW30}A7 zj;M{?PeM95cqGqq&!0q|&sRiX9`>UM7?H(l;2V+5@({g{Q~7K}>*?vl>RCS9=EB;r z9iSgtyR4q-37)ICkI&(a;~H6@k@?BsdaX>gc;q*);kUx4dYFqin)Ex<%>^ANgiD2> z9EJO0R5=@QL`Am+3zVM=NWMcp23mXNAfcvfdo=f;bZVrEQ|%kin2JYV`RhQ+Kk&2+ESiJ20cuLeL*pX- z9OrT+==5@L2b!#!=CPgX80N<{_a3#|>K}?k7<`UzyMGY*efU_B8K%-u^$Tjm=>SBe2&ODhM9-PTx;}KnvM$Y8dnjFJn{NHRd&4W zWsg?d>~eJ5#K_<1mk?9O6*r&ML$YJa5i%~M9ydZIAYu5BgBbbn1L6k~w~KxludwCH zB_9IIWgwyq<7{hJX}gpjn}XE%5+8r{qGFsXmNU**cK*XHHu91ULJ1=2ZcO+csB$J@ zvdGh+2aw@BUbvx+ZJn$GYp2;xHH_agCn(ipl)@{!_-fM;EvsdxV(y4X!S_{yCV0*S zFc%x7qd%4)p;^9RhJ#AE=zHNA!4qqgCk5Nt+8*Yr}Ma$0XTK2F? zX2t32J-A1kOqSrhYe0Ktw0E6{-S;5&9>E+HDpEa=EV##m9x;O-#!`)Ti*|3)kPTp&~ZkGguoWs3v(N3TJ)K35ug4v(#U5it>6 zl;)~_v7O}zV^?W0C@WtWEVAoS5KF`>3FQr*hOLs)Tm&@F1f)2Z7ltA+w>6sklT*=c zKfT{{cDzFKs7Mbqp7BbJZH+YH^_G?>@7xR`e`k;W&f3^(?IlQK@AtE+)*lLUi8+-h z?bV!MW4mr10OzVk+qCb8nY){|2Q6S0mm_tanU6QWVDyL|!P{VlIN>3D(?qIF3VJws z(Tpg*q$<+eWH6U2K}Q=-ZOm;0Jx;yCcdS2Jb!sa0I##L*r3m+a{uWr{2nl_>0cHqa zpk~{`nweI(FbQZvD?{*{;G(slFDXH%P%@a|YB{+}XD6U*3trO|829B-E*8_?)i_ue zt1G%@Jh4Zv-PI(@dAKX~&u#emW1!LzZV`fBc-;*1w63@fiwl&%&#Y0N60~d`HKVl- z@?oMH<#K$>hOw_=&Jb>DXz1j?Ax?BW2Kw%S`F!agozM#fa=OJUY(Z;gV|*zBlHtK5 zQfwJrZyW`aIvWMcwIB|CfNH}PDs2bt_~w#3VhowCqBkOlkwqi(n7HwpP;smZ&giFU zqcj0%eB5xMN39XAxGQ0Z@|Ka07l~G;w4KtqJST{>F^rBjhHi2=H&gc@ogdfKg(~~l z3yDuZi+B}tX*lF~^8UoLIOnus5;rMI_QYt47J;ROVpNLbJ1Is@)S$pqoG(kk9#A&d zeK(J;pS>aBlTh|HP0>x>deC~$cfvam8d|xh`~He_(+l*Xj%J`@5en@0!^s4quHwbovN<9JQP3dn`#WNjS zo`s&GUmg{uU3m$?MJDf($wJOgW%HM}kf9*XhE8V7VUxL`@|;tcn=5y}ajLngrwJFw z*tcoAmEQ5w_tCf{@ryn8qrkqu*xoc(Wny1Go{b7aoKBR0Utz?Q!qp70TNtD%^f*Xc^Kp6<+x%cb}1RYh*X8xQiDpMz07`yV81cc^PQesgy@GN(mQ`)ffUD zXa>jr8<1%Xpf94i*oSJ5pyp)QTiAZaMdTmiM1SZgn&~Tu*L}viY3H$p>9v<6@>O$dma&^It=qkc7uMZw8~aO$I@x-= zUyb`TZJbvrCU5ps8mZ(h6gE0(o}^krbyCSo5+Q=?aixZ#i4{(7b4usoDxDi!&#pGH z;MWqo$(wtYi+BCa=vI~@@1o~I-ELxHR`*`y1=M z0H-~^WbiYRrBRQi!s)6&4|@_}KHg4f|0 zUNT-e?b$+t98fR2JV*P=FH&+^Bv+x>QwPYiu#uy(nuaeAY$2?yFTELA$dSJ;U!E64 zZ=*xTn6UDmTIkbZaAoKiBC?8Qe~sF(7)e`-AO4(`BfLP7VkOez1DU!xA?*L);jV)f_0_ zn!i-IAYWQu%U!s)(ifTSzj7V&=heUV#~&~8oi9_ywC*lfcr!-jAAnTw%D+A&{Lr9X z1!x1t0J8v{*-|(*W^pfg$3UN z5hAwiEkSDud&>|+j7s1@SsA4i@O6x5D<3#pZo~K0_Ng&Ao?JcL`{J_d4Buv~+vPI1 z^Nt?!4~?A4h^wgZN(e|sUSjYeydonQ_(TF^iBlkd2p+uU>VT2KwS=zFL;%Eqzt9r5 zeD-1c*7&{N)vT&2bD-D!zN*&36BYJX75`q(Kf!mbn`s$_y1T+#cu))glH^ZkBnF?q zCjMo>fh`I%2T%owAc0T|?g;XNF8H?)cElJ(UBf-NLX69CEue#BGVnhQ@8_?y7oU7v z&%(EFntmDkJv|^?@r%KUh|9oV&Oin<5i1n1N#s#NLJ9!=f|DbbIK06K;a@OJiBZHj z{*u7KEoE?gy2Qd6`IdWpzY6mGb@l?f*U!rX)7lw85?=xO=8zpf zK0Xu2&f<4yry;)|=R*KX6qptzNU0>!C}9hkz?IC1AB9;7>oOt=R@*KG=VG}7m*9<* z1lC9cw@_)2I!F}w2C(&S)DOGws2=S+d^ii>zVGX$Y1?t!>)(WxD`2uiA89t1I154@ zM@62HYz%6{B~Th%{1vo;yo<1esO4!s6n3^Yem}quZ~b^@QOaiP8vwU$)65y*VVDjS z9H?8%gfC*N012ItM6F4>w*X?K3`zjL3)>n;90M~GCM^(rk>f`0PCz_~B7$g!ES$84 z{u+R9FaOzf?^Hpw_0Z(^LpL7RI4o0?Uy(`_hyxm}{hmf7DL7-iD={fofWAFjGu_aM zoWeMfaw8@~q(b4F06N0*ikWi<@E!SH53dLxPsexPjmR0m z%`~?e;CXD05uET7!Hk%*SpT;KERcpNSOI+#Q*I<6h7rywt^8^7v(X_cp$7 z<5msVPwR{GuJ3HW(`dP!;5%Hh2=3>jhqg%o{!m02LWyuh_=c1do5Cd@Z0R%(h7_cZ zX>>9_!)=4}f>LTkbYdnFMMn|yfP8U;_p?#_bF8v~f6w2;S@@2ZW*ORH&%T(ek*Z0t zh!^^1ffE>qLUJ-@eirCcBn5Sl!{BqACQgjfZSK^8f1)p$<%R^6X+}gCe})~lu-w7% zFmJd^)~fCBZZTW!2y$Nc%F+j``HX$r{*qtEy4z7HYS_wlg@Q9(o9z-1d2!wxIQZ;w`mLpZ52 z2(HRK_WY;XX}$8+ssc3GTH^r zt3kVACZd0;9`d1j8T)ZK;;0_5=;w9p0Cj9?f`U%(WK;uMyA zlE#RUOr=Zdq3>nfYYF)762{4Wt96C>oBw=Nc$=ZE-obb1`^z%-dv!+pinalYVufI1 zBwNr@SMZ4~Q;<-gxr@Xa3!tfp$I4OQwR9!}sXB;lWZ`c<8Ym~|+H@#kybJNLhiw~} z^BWw`xJ`&ZO5(P9ln-b4UWPV*X!?2BJ3LszA{}5#2`*_A5OJdkX1`dzf(8V+7Oe&! z3tE^Q3zH{AYgm@LZAnLRNR`R3zL`F`zWs)cbdj0(EPm&C%%9qBu5aWo=Xl)mF;y$m z4v3&t&FO?5BrC^(m1U+JCIcAKRuvJB0GBo_Va#W?3=kB-RTkv^YXN?Ko73pFZ52g& zcP4-9zTi7u#sfhh>8Ef)c&ehQYP}|YboItMJoH&gdS&LU5?k8x40`Rnzw!?WzU?YZ zn4* zqCTop^|*@seQ3W^CMrkmEQ4#tOFL&FG0y#=OvLVvAag89+~Vi59YjSSrT}Unff!(g z(v%Uao~eciQ|->n`N8Qucuv8hDw0E*A4vs{Z+FmkzVRK5*Pfkwgijrw{7;6ttB(}n z_CaLSigB3)u+1bYr;tcsS*Jv%D#ggX0;>@!Aeri?R3ac74Wn8pls?M}if&~5a)8eb zowv60LT9(MSp|<-N_0Ds#xm(rjgaTbR;;A-&JG&l3x(rv+|KYF>S4$z?=@PmZK0yeo8%m_WJ%(<7T`S_ zK@y>)%~6W3^L(MCd!t#ea>f>L8+YFW;>?HpGWn9&F1@5(X`5Uy)JjG!%cbgEPC5^z6`}??+*IRWM-wUA4hdd zleG}Fz|$1l7*^DrSl-7+wXrehqcwzZH@82dH>LUjhA46}?_7JL&z^wr_3dPGJlOwQ zK4g5mzP&7C))2=Yh^+%BXthrpmD04)+J@d68Vl_Hi^>y7*{p8_9|TVaaVzF*YBpxF z!ZsoHz|r}XMfBMMXLV)h8-PAF2G{e~!vtBK%}*?dF2k7@w{aTk1H5dWR;|a^bQOLR zTXv|6-9=}&1U<4qtBnb-@)7UYmLujfL{$nFGZ65en^RP81M&OvH@mflZ;pPr&HqM$ zzFRW7V_xC*9X#}EgF9bn*7fq3I@l$48PMcWIk@bNHrF1E;3RoV6)fk`)X~F6ROe!& z51g*$XOQrhwC`Id?&6L_1ZZi=kWs8W?1qGzq&61-=$I^XXusGv+YlDr<<-j^@5-xZD%lt{ZY6WvpcSq02p}XYF zqHZrc^8H;sW#$GZkGLecR*{p|(D=ph{V7L#dO0d|# zE`b`<65=@rOZ8DjPbD46dm$LI=w>H#hu{t`bZ}5Y;k{Q{Bl$p0@*)0?06sBpp6+*v z!Ctg8^-xbGg4?M(tZ!V~m;eN~dei_Sf71Nn7xngpq$e!6fCaZD-EV_3Amr-2mdVNT zX*|o`z69{Y_KU69SsaBC#9}6-LRZyn%;sV9sQk4ur z#|DS3aK_ATYoijeQVG-5@$A_jx?szWr5nUEzSVPlZ*mKdvF-HXfYn1xUyq#kUAlfy zg=jqs;JNGSr5T!L-V+4ZG;b?cB9Vlt`9d!zhtK-}PVWlzDw~pNHSt)K3B-jG@yoJ@ zq}o>X=j2cyCdJDCSp{wz!?XCEo3_;9hVCFC%)Q0n%_O!hZfv&!Oq|~=7vs5!a8=E$ zQpo3wbTcsJsYc39wX3P%$y>5`9ds`r)bNu}89oZ!Pv7Zra`zJ8OO_D*u(aczACeLY ztl(0xNx8B=`06>Er(9cQT~BBQ|Exa1sJ1Jw+M6oG!dG9XCk)NCw;C&502hK-Wr$T~ zdWsbj799!s^YAeYXs&TTtek{)95z2~Jv(@qZC%w9r6=3)VilSj{K;5!JHVnt$HPVw z#c``nI(RP2l<{5KdGAf?q2EaHs(ufpbLuz2ietWmi~&vwUjAg+0S|gPi?SaxL>5Cg ztKI{9>ri7V6R3`P&-j!chGR%QOp3RydJup1u>pGd~Gl>A&5WfB%^DLs9Z~*T8Z|R_oR(%Y$I8#wPJM$SFUBE+qL`l-x!p9sZBvy8Xp+q^6st!L>4nWb^P+dDrq{Fh*hX`k3j`GuOGPsZ)$c= zOh&GiQ6?lCz7IcAWhG_<$GdVFT6kPoLSpTisq0!PCw5 zZy(Xc?zo|V0cBPqd|*^_81P_nQNU;T)^)-3n7+lW5l7w6G6=jO#EoF7UM_Yl8N1MM zgy`)?@9NuGLCKjmdKyko$)h&v(G3dVQ}OUsG4Pdp7)wVbyiedAF73En>O=G<8oKOp z3~HBVJCtm#=%}gg7F@@v;GoQeGQmZMXVPysR_wD2hxa$B-)CR5?o*bs{rDeZWjI~I GL;wH>Mhn#d literal 0 HcmV?d00001 diff --git a/tests/data/sars-cov-2_100_2.fastq.gz b/tests/data/sars-cov-2_100_2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..3e43e47da544d258132ba49bbc2b8c4ecd4d6699 GIT binary patch literal 4263 zcmV;Y5LoXYiwFR!1ifSc1I=5@lH<4$yw6wo?gK?q1Stj&TSmxm*b#fe!TDQyrI=$Zzm{@N(X70&Y&n&m7*me99M0LwDV#_tS-G_AmmAC3a(~3V zWsGvN{FWcHa%H`|TvDztSCwNJsvIkuVdQ#6{VA816A6RW56bb!-+x%$?uX6vvL4

8M_M2s~5Ecw`Uz#6(ytMD_Iu)B< z`s-$1mWSi9P{p0aAIdO?!Ot=b^Ll;u=CUhP*d!TItFlj2CRSUqa>anzlR zd+RmT&@gy86=CsM?<%LOadUob-o2H-U|T*9XZs`g1MsUiW+&z??t8h-eR1NQ`|_(+ z7c_q51dGPZ1J$$Yad@4<*{WH~hef<5T$bpR^=+LA)w1>CIuGicNObTz1#q-_Y>GHq zZ?1Nae8lymExokw*O~Xl__FGLij?ilzU_GF=IgDy9qgO%#Ui|per0s6I-{#O62Etn z4_8aUhpVr`abdQM5$fSG`7!CMQ__klYqWY}st4=S0a(y8>W6IGx7yd5$A?$L(y^Ng z3Jo{9@BPew$4fiSAzb>+yl_qT-s))8a0awZ)et-=3&Pzvm+R+MFiCXu1VqU~f@w#Am^1$k-4x3H zcss_o%{Xvgk*)fb39n#ez=5b>52N5V1S=pmU^cq{0}mpZpWGWSjA(dn6?WwHek(q@ z9Qy(XeniNQ*xLmA_$=(WXCKa;0Yp0t!)0!}ZX7oQXSWB*Ru*9bD1skE#wjp|1hSRL zu=EfR7XcvP9yC%S%4S}N_dxWD%|w-#vGI2hh$oianb%gxD7%FRTR8LJu5E9{z#&Zg z5SKGa0!S5L009hi)a0BX!I@A|!7|JJNZK-B6j?=5$l`Dr*qQ;y2oO!!!2WC9o@!6dRbMmZ4H?CD0r z?Z_zTp$RT&d^(JOS*GLoiTpyTh(FyFMPc5Ahy&Hy&{Ic){LjMjm^qZZ%(mNX4RP?+ zJ8oJG*C3&*%nhb_?Y^>F~s_poMLc0-MV7n+t}_|nvphv z2**fe!GyL(&r~P_b|%C(m_N~4xB@=Pe3=fMgsOCF_&wnwGWRn^(M0EFOihS=CrR>O z-JT7*9R{}uxU&9K2j+>F2JR~QPS?2``@O~kC?$P$faEmB4~Q$Amy=jhLH5vS42aqM zMr09*$h=FOk!dR-2fk}a%z!L9VhV#l<-lL`5RQc5$)G3p?ZPEgk>{YHNpHy#UYKst zMco{cv0i+aG%d?I2!*EjNRz>GlH?NeH65<(*PK}xMK`hxW_uvt`PcmS)5T$ThNNIQ zaAenS8h~v*f|shK=~7?{kQhnE{WXW^=5Q?}c)0B~H0G)~DUT(38tDP~)Jn@@RLV=n zOB|_p;TAYGbjGNZPHA=Cur?;fOpBf3++1y?ZoG?O-jG=ejgH!6>b@WDqyQ?3L7Ba?&~ zIU1aslfqSGQEi#`RY0<5doL4X$;5KmyX2gOr(BBM=NAxv8|KN)YyMEoJ6y}_c=V}rj@%f4N>k^!yq47=6!@@qo6Dh#^iAK zwZ;wBJh3&cP-U(V`~WP(JDUT-nA?7-3YjdsKxDv5icxPfdoQgIi|g3G#6CZX69396bTw#02^C0QmG=DLdPT;eVCZSI@bZO1k| zINl8Y>bJp;tEmC6VcGfYoka}y<524umna5%8zi!5;v>4x%pdb(snzF8g zB2ZLLw}w1XfFxNmb22zDqr$1;1yx_n6;1`ZzpxsGs4vsdeqr169KA0y!-e3J3Q@6d z7>4V78}{ssZpARK9|gHgd?h~69!P0nZ_A0$TeVP9*m~INXO@op7vg_{l%k(N?5j`<;ra*44o6AF~g;kmHtWO{HX{`(z% z{b)?vira*za*hOBM|m*7u!*yI9zDELi4GC!)2)UovsH0p&E`Q zWeohXVohZKC~0`D!Mv7ERS0%6%0FJsNRp6M4>IYP)Yia0=(sdIC^##kiOKly2Vi0m zS;fasmSA~q-{QHqp@g|hzpKKD5m6kHt^gfC000D+F#3<)HqZ#Vq%<_n^2#SJj#;MD+8bAYj0olWLs9I zOm%0w-*X~w`#CRJ%)Hyj#1-Vey$swrH3L1^FWRm1lbxLZubUT2|mYuMc$GDd~ zZ#GF5mc$*9cw}`(sOlO28bo}$>D4H!n!YQ@;oaiAVzv7H4Tvb|aW!+lFVwTe21-JT z!V0Fsps=tH$U`E8K#@ckt8-rpJDoRgrDf{9k@QT};po|#L>;GRtl2TWNK}u1&cLt5 z-Cy5)T+u~4>Er3=uCU;-P>DVq5JGG>skOuylZ+9oiUOLxNz$cQ7CkkpYMhl%7`v{b z>+LYI(mW}f3XZ7TOCA;`@y6E(ac+fr3ji0i;jftbp&OS@vwTr8j zM!aoEb{8lONZF7jmw#oeF9kBfawhWh^E6Jy$Xz!a1DvVk=owj3vLUpfOVOd(n8$pX z3XS^L1kmaUqz1{r#RiMS{LcHVP!|=;h>C2O8(=1Xi9x)|;f@dI;oYC2q-i_D-+msh zw_ACcj_6<>*_173cQ!bho7wZUm^GP`_lP9KLnRV%RV-PHj?oMALJ8$}=BZx>=x^gy zTfe0M+lzPa7}!4i&uwWwU9&#uQ}pd_*HC5lU35cxsGN<3oy>j*v_|E)t5~qLQr3w| zXjhTVOGpkLUd9ZGXiX0WS9KhweI$ zg+uI0^ps%KtZS>h%USMe0BICcyOh0T<_K%zZLGy9-;5JU-a;P6C_25QRWj9K9@TvkEq@!f9tg<_+K}O6q zFCQamU_DxT1ZZAlw~xuAoi}hiax>;Fh8*R>k=nACYqgQ87V`z8C+Bw-pC97J`Usf9%A1x)e?8Xp*Upn@ZTU`Zi5HH(bzBESmu-Z~=p zkL+L|OeB@x)CHw>Al${Npd8V)VS0Od1P;Gp+f_AJ$YloSYVd3Qw}3-`Et$ooyLNl~ zdX@M8s15CF?fdpH2UfS-kL0DkJ9FQ0s2zUo)a`SNB(KS;>qd#SzQ`r1a<0zxc9&Bs5^Q-}_u!qZ3J-D0t zy@3_gK{G)iV$7r`n=A9V$fhTh)rW9wmFLwMh{-p(u>%e2jF1F6n(yQ8gy5@p2Yt65A<2 zSaK#PfpXUt%vPbzzTqC4-FIzl_h#EwsjjX@Su^Tzw|c9J%8M4O++9n@x;gghXp{v5 z+(TfNUF&&`((aN~*9CEB_PzD}Wo|>;-j4lev}shzDw##e1Amp{aXM4CjOr#O8f)p1 z7Pz=htU~>&*hiJ{xL8R^3Zoy3Bh2M~5`iDFte@SUD#xaU7&v_9zqhulb+~Zokm72K zxO!T4nKvsbPKtV2ODc2;ThL5B{#nhZTvMQ8k+JBmQmDGlqF_5hlYsh3ddBP;5Pmf9 zw=s$4u*;8I_6o%DA)h$#wNQv5%-y)h2ik?KcU?;4OIb}6i#1@lV9Mebj?P&|C$ZlcR~7RHG&*y(*AANUVtn6#G&NGXDtiFv1D-?_UTwaH)97um_f zbk8`yC0Ce=5#Yfj#hz1}*1egcsl<0@%UqlI%}X%Oa_&zD{Ry39CSu%o(_RT})7% z+)f%yihcVk{`TFm{Wq#Sp49NHrgSO;uKFY`6j-UQE5$LpPiMBiM-h(1b+T!#&GSO@ zUY)!mJ0;|(#-aGMKHog`XM30hzj9KV@V-64wFobY0Xjk&!3S3t9iG{D7{+4M;h_E| zVJ@?+{Cc4~`_7XyS94uVIUnouDypf?0a;&kQ3!!838XijNXVpR2Jh0}PxW)(UYYyZ zC?A69KE(O)Q~O>n)uxB^&DqqJx|uIL7LBl-KtpXjxenpRA}}+tAfH{Et)iG;^8R&- z58fy2Ej(KF!Uy-Bc|V?h&aj>w|G0wlqY5nlU9WzfzFp^V?Z)|f*?TY6tcU?iCHKyo z=yaAit7O)Ep*?eDb1iIR2BzA3T-BhMU2oIU3-)5KPa=KD=g!Rg-MY_%@$uLE_#f(m J9dp4%001K1jCue7 literal 0 HcmV?d00001 diff --git a/tests/data/sars-cov-2_1_1.fastq b/tests/data/sars-cov-2_1_1.fastq new file mode 100644 index 0000000..0e76031 --- /dev/null +++ b/tests/data/sars-cov-2_1_1.fastq @@ -0,0 +1,4 @@ +@SARS-CoV-2 /1 +TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACC ++ +?????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? diff --git a/tests/data/sars-cov-2_1_2.fastq b/tests/data/sars-cov-2_1_2.fastq new file mode 100644 index 0000000..1571c20 --- /dev/null +++ b/tests/data/sars-cov-2_1_2.fastq @@ -0,0 +1,4 @@ +@SARS-CoV-2 /2 +TGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTG ++ +?????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? diff --git a/tests/data/tuberculosis_1_1.fastq b/tests/data/tuberculosis_1_1.fastq new file mode 100644 index 0000000..6496345 --- /dev/null +++ b/tests/data/tuberculosis_1_1.fastq @@ -0,0 +1,4 @@ +@Mycobacterium_tuberculosis /1 +TCCCGTCGTAAGCATCGATTCCGACGCGCTGGATGCTGCCCGCATGCTCGCAGAGCATCGTCTGCCTGGACTATTGGTCACCGCCGGAGCGGGCAAACAGTATGCGGTACTCCCTGCCTCACAGGTCGTGCGCTTCATCGTGCCCCGCTG ++ +5430361214022-5224244425513232342544661102331222222/0231322106422524012261/22022/32253121.4240226412221023435/3264252222142422302224322324/222224/2322 diff --git a/tests/data/tuberculosis_1_2.fastq b/tests/data/tuberculosis_1_2.fastq new file mode 100644 index 0000000..da0bda4 --- /dev/null +++ b/tests/data/tuberculosis_1_2.fastq @@ -0,0 +1,4 @@ +@Mycobacterium_tuberculosis /2 +GGCCGCCGCGCCGGTCAGCGCCACCATCGTCTTGTTGACGCGATCGCTGGCAATCAGAACGTAGGCCGCCACGAAGATGGTGACCGCGACGACGCTCACGAGGTCAAGTCTTCAGTGCCGCAGCAAGCAGGCGCGATGCGGTGACCACCC ++ +20222334212/220142113242254215421213422322201223526230163411434425123532144126204232223224252214312625244222440242.323421504052232332502325/02352224.5 diff --git a/tests/test_all.py b/tests/test_all.py index 06d2e50..3eae4ca 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -8,9 +8,10 @@ from hostile import lib data_dir = Path("tests/data") +out_dir = Path("test_data") -def run(cmd, cwd=data_dir): # Helper for CLI testing +def run(cmd: str, cwd: Path = Path()): # Helper for CLI testing return subprocess.run( cmd, cwd=cwd, shell=True, check=True, text=True, capture_output=True ) @@ -26,101 +27,201 @@ def test_version_cli(): def test_minimal_fastq(): - lib.clean_fastqs( - fastqs=[data_dir / "h37rv_10.r1.fastq.gz"], + stats = lib.clean_fastqs( + fastqs=[data_dir / "tuberculosis_1_1.fastq.gz"], + aligner=lib.ALIGNER.bowtie2, + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, + ) + assert stats[0]["reads_out"] == 1 + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_multiple_fastqs_bowtie2(): + stats = lib.clean_fastqs( + fastqs=[ + data_dir / "sars-cov-2_1_1.fastq", + data_dir / "human_1_1.fastq.gz", + data_dir / "tuberculosis_1_1.fastq", + ], + aligner=lib.ALIGNER.bowtie2, + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, + ) + assert stats[0]["reads_out"] == 0 + assert stats[1]["reads_out"] == 1 + assert stats[2]["reads_out"] == 1 + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_multiple_fastqs_minimap2(): + stats = lib.clean_fastqs( + fastqs=[ + data_dir / "sars-cov-2_1_1.fastq", + data_dir / "human_1_1.fastq.gz", + data_dir / "tuberculosis_1_1.fastq", + ], + aligner=lib.ALIGNER.minimap2, + index=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", + out_dir=out_dir, + force=True, + ) + assert stats[0]["reads_out"] == 0 + assert stats[1]["reads_out"] == 1 + assert stats[2]["reads_out"] == 1 + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_multiple_paired_fastqs_bowtie2(): + stats = lib.clean_paired_fastqs( + fastqs=[ + (data_dir / "sars-cov-2_1_1.fastq", data_dir / "sars-cov-2_1_2.fastq"), + (data_dir / "human_1_1.fastq.gz", data_dir / "human_1_2.fastq.gz"), + (data_dir / "tuberculosis_1_1.fastq", data_dir / "tuberculosis_1_2.fastq"), + ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", - out_dir=Path("test_minimal_fastq"), + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, ) - shutil.rmtree("test_minimal_fastq") + assert stats[0]["reads_out"] == 0 + assert stats[1]["reads_out"] == 2 + assert stats[2]["reads_out"] == 2 + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_multiple_paired_fastqs_minimap2(): + stats = lib.clean_paired_fastqs( + fastqs=[ + (data_dir / "sars-cov-2_1_1.fastq", data_dir / "sars-cov-2_1_2.fastq"), + (data_dir / "human_1_1.fastq.gz", data_dir / "human_1_2.fastq.gz"), + (data_dir / "tuberculosis_1_1.fastq", data_dir / "tuberculosis_1_2.fastq"), + ], + aligner=lib.ALIGNER.minimap2, + index=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", + out_dir=out_dir, + force=True, + ) + assert stats[0]["reads_out"] == 0 + assert stats[1]["reads_out"] == 2 + assert stats[2]["reads_out"] == 2 + shutil.rmtree(out_dir, ignore_errors=True) def test_minimal_paired_fastqs(): - lib.clean_paired_fastqs( - fastqs=[(data_dir / "h37rv_10.r1.fastq.gz", data_dir / "h37rv_10.r2.fastq.gz")], + stats = lib.clean_paired_fastqs( + fastqs=[ + ( + data_dir / "tuberculosis_1_1.fastq.gz", + data_dir / "tuberculosis_1_2.fastq.gz", + ) + ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", - out_dir=Path("test_minimal_fastqs"), + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, ) - shutil.rmtree("test_minimal_fastqs") + assert stats[0]["reads_out"] == 2 + shutil.rmtree(out_dir, ignore_errors=True) def test_minimal_uncompressed_paired_fastqs(): + shutil.rmtree(out_dir, ignore_errors=True) lib.clean_paired_fastqs( - fastqs=[(data_dir / "h37rv_10.r1.fastq", data_dir / "h37rv_10.r2.fastq")], + fastqs=[ + (data_dir / "tuberculosis_1_1.fastq", data_dir / "tuberculosis_1_2.fastq") + ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", - out_dir=Path("test_minimal_fastqs"), + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, ) - shutil.rmtree("test_minimal_fastqs") + shutil.rmtree(out_dir, ignore_errors=True) def test_minimal_paired_fastqs_cli(): run( - f"hostile clean --index MN908947/MN908947 --fastq1 h37rv_10.r1.fastq.gz --fastq2 h37rv_10.r2.fastq.gz --out-dir test_minimal_fastqs" + f"hostile clean --index {data_dir}/sars-cov-2/sars-cov-2 --fastq1 {data_dir}/tuberculosis_1_1.fastq.gz --fastq2 {data_dir}/tuberculosis_1_2.fastq.gz --out-dir {out_dir} --force" ) - shutil.rmtree(f"{data_dir}/test_minimal_fastqs") + shutil.rmtree(out_dir) def test_custom_index(): lib.clean_paired_fastqs( - fastqs=[(data_dir / "h37rv_10.r1.fastq.gz", data_dir / "h37rv_10.r2.fastq.gz")], - index=data_dir / "MN908947/MN908947", - out_dir=Path("test_minimal_fastqs"), + fastqs=[ + ( + data_dir / "tuberculosis_1_1.fastq.gz", + data_dir / "tuberculosis_1_2.fastq.gz", + ) + ], + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, ) - shutil.rmtree("test_minimal_fastqs") + shutil.rmtree(out_dir, ignore_errors=True) def test_both_aligners_paired_and_unpaired(): stats = lib.clean_paired_fastqs( - fastqs=[(data_dir / "h37rv_10.r1.fastq.gz", data_dir / "h37rv_10.r2.fastq.gz")], + fastqs=[ + ( + data_dir / "tuberculosis_1_1.fastq.gz", + data_dir / "tuberculosis_1_2.fastq.gz", + ) + ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", - out_dir=Path("tst"), + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, force=True, ) assert ( stats[0]["aligner"] == "bowtie2" - and stats[0]["fastq2_out_name"] == "h37rv_10.r2.clean_2.fastq.gz" + and stats[0]["fastq2_out_name"] == "tuberculosis_1_2.clean_2.fastq.gz" ) stats = lib.clean_paired_fastqs( - fastqs=[(data_dir / "h37rv_10.r1.fastq.gz", data_dir / "h37rv_10.r2.fastq.gz")], + fastqs=[ + ( + data_dir / "tuberculosis_1_1.fastq.gz", + data_dir / "tuberculosis_1_2.fastq.gz", + ) + ], aligner=lib.ALIGNER.minimap2, - index=data_dir / "MN908947/MN908947.fasta.gz", - out_dir=Path("tst"), + index=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", + out_dir=out_dir, force=True, ) assert ( stats[0]["aligner"] == "minimap2" - and stats[0]["fastq2_out_name"] == "h37rv_10.r2.clean_2.fastq.gz" + and stats[0]["fastq2_out_name"] == "tuberculosis_1_2.clean_2.fastq.gz" ) stats = lib.clean_fastqs( - fastqs=[data_dir / "h37rv_10.r1.fastq.gz"], + fastqs=[data_dir / "tuberculosis_1_1.fastq.gz"], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", - out_dir=Path("tst"), + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, force=True, ) assert ( stats[0]["aligner"] == "bowtie2" - and stats[0]["fastq1_out_name"] == "h37rv_10.r1.clean.fastq.gz" + and stats[0]["fastq1_out_name"] == "tuberculosis_1_1.clean.fastq.gz" ) stats = lib.clean_fastqs( - fastqs=[data_dir / "h37rv_10.r1.fastq.gz"], + fastqs=[data_dir / "tuberculosis_1_1.fastq.gz"], aligner=lib.ALIGNER.minimap2, - index=data_dir / "MN908947/MN908947.fasta.gz", - out_dir=Path("tst"), + index=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", + out_dir=out_dir, force=True, ) assert ( stats[0]["aligner"] == "minimap2" - and stats[0]["fastq1_out_name"] == "h37rv_10.r1.clean.fastq.gz" + and stats[0]["fastq1_out_name"] == "tuberculosis_1_1.clean.fastq.gz" ) - - shutil.rmtree(Path("tst")) + shutil.rmtree(out_dir, ignore_errors=True) def test_rename(): @@ -132,18 +233,19 @@ def test_rename(): ) ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", + index=data_dir / "sars-cov-2/sars-cov-2", rename=True, - out_dir=Path("tst"), + out_dir=out_dir, ) first_line = get_first_line_of_gzip_file( - (Path("tst") / "tuberculosis_1_2.clean_1.fastq.gz").resolve() + out_dir / "tuberculosis_1_2.clean_1.fastq.gz" ) assert first_line == "@1 /1" - shutil.rmtree(Path("tst")) + shutil.rmtree(out_dir, ignore_errors=True) def test_with_and_without_force(): + shutil.rmtree(out_dir, ignore_errors=True) stats = lib.clean_paired_fastqs( fastqs=[ ( @@ -152,9 +254,9 @@ def test_with_and_without_force(): ) ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", + index=data_dir / "sars-cov-2/sars-cov-2", rename=True, - out_dir=Path("tst"), + out_dir=out_dir, ) with pytest.raises(FileExistsError): stats = lib.clean_paired_fastqs( @@ -165,11 +267,11 @@ def test_with_and_without_force(): ) ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", + index=data_dir / "sars-cov-2/sars-cov-2", rename=True, - out_dir=Path("tst"), + out_dir=out_dir, ) - shutil.rmtree(Path("tst")) + shutil.rmtree(out_dir, ignore_errors=True) def test_no_rename(): @@ -181,14 +283,15 @@ def test_no_rename(): ) ], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", - out_dir=Path("tst"), + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, ) first_line = get_first_line_of_gzip_file( - (Path("tst") / "tuberculosis_1_2.clean_1.fastq.gz").resolve() + (out_dir / "tuberculosis_1_2.clean_1.fastq.gz").resolve() ) - assert first_line == "@NC_000962.3_3000195_3000563_0_1_0_0_1:0:0_0:0:0_0/1" - shutil.rmtree(Path("tst")) + assert first_line == "@Mycobacterium_tuberculosis/1" + shutil.rmtree(out_dir, ignore_errors=True) def test_broken_fastq_path(): @@ -196,14 +299,76 @@ def test_broken_fastq_path(): stats = lib.clean_fastqs( fastqs=[Path("invalid_path.fastq.gz")], aligner=lib.ALIGNER.bowtie2, - index=data_dir / "MN908947/MN908947", + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, ) + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_no_reads_remaining_after_decontamination(): + stats = lib.clean_fastqs( + fastqs=[ + data_dir / "sars-cov-2_1_1.fastq", + ], + index=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", + out_dir=out_dir, + force=True, + ) + assert stats[0]["reads_out"] == 0 + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_no_reads_remaining_after_decontamination_paired(): + stats = lib.clean_paired_fastqs( + fastqs=[ + ( + data_dir / "sars-cov-2_1_1.fastq", + data_dir / "sars-cov-2_1_2.fastq", + ) + ], + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + force=True, + ) + assert stats[0]["reads_out"] == 0 + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_decontamination_performance_sars2_bowtie2(): + stats = lib.clean_paired_fastqs( + fastqs=[ + ( + data_dir / "sars-cov-2_100_1.fastq.gz", + data_dir / "sars-cov-2_100_2.fastq.gz", + ) + ], + index=data_dir / "sars-cov-2/sars-cov-2", + out_dir=out_dir, + ) + assert stats[0]["reads_out"] == 6 + shutil.rmtree(out_dir, ignore_errors=True) + + +def test_decontamination_performance_sars2_minimap2(): + stats = lib.clean_paired_fastqs( + fastqs=[ + ( + data_dir / "sars-cov-2_100_1.fastq.gz", + data_dir / "sars-cov-2_100_2.fastq.gz", + ) + ], + index=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", + aligner=lib.ALIGNER.minimap2, + out_dir=out_dir, + ) + assert stats[0]["reads_out"] == 0 + shutil.rmtree(out_dir, ignore_errors=True) def test_mask(): lib.mask( - reference=data_dir / "MN908947/MN908947.fasta.gz", - target=data_dir / "MN908947/partial-for-mask-testing.fa.gz", + reference=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", + target=data_dir / "sars-cov-2/partial-for-mask-testing.fa.gz", ) assert Path("masked/mask.bed").exists() and Path("masked/masked.fa").exists() shutil.rmtree("masked")