Merge pull request #9 from bluenote-1577/0.6-fix

v0.6.0
bluenote-1577 · Apr 6, 2024 · 2a01389 · 2a01389
2 parents fce8f4a + c032e5b
commit 2a01389
Show file tree

Hide file tree

Showing 21 changed files with 20,349 additions and 186,295 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## sylph v0.6.0 release: New output column, lazy raw paired fastq profiling: 2024-04-06 
+
+### Major
+
+* A new column called `kmers_reassigned` is now in the profile output. This states how many k-mers are lost due to reassignment for that particular genome. 
+* `-1, -2` options are now available for `sylph profile`. You can now do `sylph profile database.syldb -1 1.fq -2 2.fq ...`
+
 ## sylph v0.5.1 release: **Memory improvement and bug fixes** : Dec 27 2023
 
 ### Major

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sylph"
-version = "0.5.1"
+version = "0.6.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 
@@ -17,14 +17,13 @@ bincode = "1"
 fxhash = "0"
 clap = { version = "3", features = ["derive"] }
 flate2 = { version = "1.0.17", features = ["zlib-ng"], default-features = false }
-#nlopt="*"
 statrs="0.16"
 nalgebra="0"
 rand = "0"
 regex = "1"
 fastrand = "2"
 memory-stats = "1"
-scalable_cuckoo_filter = "*"
+scalable_cuckoo_filter = "0.2"
 
 [target.'cfg(target_env = "musl")'.dependencies]
 tikv-jemallocator = "0"

diff --git a/README.md b/README.md
@@ -15,35 +15,42 @@
    </i>
 </p>
 
+
 ### Why sylph?
 
-1. **Accurate (containment) ANIs down to 0.1x effective coverage**: for bacterial ANI queries of > 90% ANI, sylph can often give accurate ANI estimates down to 0.1x coverage.
+1. **Precise species-level profiling**: Our tests show that sylph is more precise than Kraken and about as precise and sensitive as marker gene methods (MetaPhlAn, mOTUs). 
 
-2. **Precise species-level profiling**: Our tests show that sylph is more precise than Kraken and about as precise and sensitive as marker gene methods (MetaPhlAn, mOTUs). 
+2. **Ultrafast, multithreaded, multi-sample**: sylph can be > 50x faster than MetaPhlAn for multi-sample processing. sylph only takes 13GB of RAM for profiling against the entire GTDB-R214 database (85k genomes).
 
-3. **Ultrafast, multithreaded, multi-sample**: sylph can be > 100x faster than MetaPhlAn for multi-sample processing. sylph only takes 13GB of RAM for profiling against the entire GTDB-R214 database (85k genomes).
+3. **Accurate (containment) ANIs down to 0.1x effective coverage**: for bacterial ANI queries of > 90% ANI, sylph can often give accurate ANI estimates down to 0.1x coverage.
 
-4. **Easily customized databases**: sylph does not require taxonomic information, so you can profile against [metagenome-assembled genomes (MAGs), viruses, eukaryotes](https://github.com/bluenote-1577/sylph/wiki/Pre%E2%80%90built-databases), even assembled contigs, etc. Taxonomic information can be incorporated downstream for traditional profiling reports. 
+4. **Easily customized databases**: sylph can profile against [metagenome-assembled genomes (MAGs), viruses, eukaryotes](https://github.com/bluenote-1577/sylph/wiki/Pre%E2%80%90built-databases), and more. Taxonomic information can be incorporated downstream for traditional profiling reports. 
 
 ### How does sylph work?
 
-sylph uses a k-mer containment method, similar to sourmash or Mash. sylph's novelty lies in **using a statistical technique to correct ANI for low coverage genomes** within the sample, allowing accurate ANI queries for even low abundance genomes. See [here for more information on what sylph can and can not do](https://github.com/bluenote-1577/sylph/wiki/Introduction:-what-is-sylph-and-how-does-it-work%3F). 
+sylph uses a k-mer containment method, similar to sourmash or Mash. sylph's novelty lies in **using a statistical technique to correct ANI for low coverage genomes** within the sample, allowing accurate ANI for low abundance genomes. See [here for more information on what sylph can and cannot do](https://github.com/bluenote-1577/sylph/wiki/Introduction:-what-is-sylph-and-how-does-it-work%3F). 
 
-## Changelog
+See below for more comprehensive usage information/tutorials/manuals. 
 
-### Version v0.5.0 and v0.5.1 - Dec 27, 2023. Major breaking updates.
+## Very quick start
 
-#### IMPORTANT
+#### Profile metagenome sample against [GTDB-R214](https://gtdb.ecogenomic.org/) (85,205 bacterial/archaeal genomes) 
 
-* Big sensitivity boost for real Illumina profiling in v0.5 versus v0.4.
-* Breaking change: *.sylsp files are now in a new format. Old sketches will no longer work.
-* Shorter reads (>= 32bp) now usable
-* New probabilistic data structures for read deduplication -- lower memory usage 
+```sh
+# see below for install options
+conda install -c bioconda sylph
 
-See the [CHANGELOG](https://github.com/bluenote-1577/sylph/blob/main/CHANGELOG.md) for complete details.
+# download GTDB-R214 pre-built database (~10 GB)
+wget https://storage.googleapis.com/sylph-stuff/v0.3-c200-gtdb-r214.syldb
 
+# multi-sample paired-end profiling (sylph version >= 0.6)
+sylph profile v0.3-c200-gtdb-r214.syldb -1 *_1.fastq.gz -2 *_2.fastq.gz -t (threads) > profiling.tsv
 
-##  Install (current version v0.5.1)
+# multi-sample single-end profiling
+sylph profile v0.3-c200-gtdb-r214.syldb *.fastq -t (threads) > profiling.tsv
+```
+
+##  Install (current version v0.6.0)
 
 #### Option 1: conda install 
 [![Anaconda-Server Badge](https://anaconda.org/bioconda/sylph/badges/version.svg)](https://anaconda.org/bioconda/sylph)
@@ -85,7 +92,9 @@ chmod +x sylph
 
 Note: the binary is compiled with a different set of libraries (musl instead of glibc), probably impacting performance. 
 
-## Quick start
+## Standard usage
+
+#### Sketching reads/genomes (indexing)
 
 ```sh
 # all fasta -> one *.syldb; fasta are assumed to be genomes
@@ -98,19 +107,22 @@ sylph sketch -1 A_1.fq B_1.fq -2 A_2.fq B_2.fq -d read_sketch_folder
 # multi-sample sketching for single end reads, fastq are assumed to be reads
 sylph sketch reads.fq 
 #EQUIVALENT: sylph sketch -r reads.fq
+```
 
+#### Profiling or querying
+```sh
 # ANI querying 
 sylph query database.syldb read_sketch_folder/*.sylsp -t (threads) > ani_queries.tsv
 
 # taxonomic profiling 
 sylph profile database.syldb read_sketch_folder/*.sylsp -t (threads) > profiling.tsv
 ```
 
-## [Pre-built databases](https://github.com/bluenote-1577/sylph/wiki/Pre%E2%80%90built-databases)
+## Tutorials, manuals, and pre-built databases
 
-The pre-built databases [available here](https://github.com/bluenote-1577/sylph/wiki/Pre%E2%80%90built-databases) can be downloaded and used with sylph for profiling and containment querying. 
+### [Pre-built databases](https://github.com/bluenote-1577/sylph/wiki/Pre%E2%80%90built-databases)
 
-## Tutorials and manuals
+The pre-built databases [available here](https://github.com/bluenote-1577/sylph/wiki/Pre%E2%80%90built-databases) can be downloaded and used with sylph for profiling and containment querying. 
 
 ### [Cookbook](https://github.com/bluenote-1577/sylph/wiki/sylph-cookbook)
 
@@ -126,7 +138,16 @@ For common use cases and fast explanations, see the above [cookbook](https://git
 
 ### [sylph-utils](https://github.com/bluenote-1577/sylph-utils) 
 
-For incoporating taxonomy and manipulating output formats, see the [sylph-utils repository](https://github.com/bluenote-1577/sylph-utils).
+For incorporating taxonomy and manipulating output formats, see the [sylph-utils repository](https://github.com/bluenote-1577/sylph-utils).
+
+### Changelog
+
+#### Version v0.6.0 - 2024-04-06. New input/output options.
+
+* `-1` and `-2` options are now available in `sylph profile` for profiling raw paired-end fastq files. 
+* Output slightly changed. See the documentation below. 
+
+See the [CHANGELOG](https://github.com/bluenote-1577/sylph/blob/main/CHANGELOG.md) for complete details.
 
 ## Citing sylph
 

diff --git a/src/cmdline.rs b/src/cmdline.rs
@@ -1,7 +1,8 @@
 use clap::{Args, Parser, Subcommand};
+use crate::constants::*;
 
 #[derive(Parser)]
-#[clap(author, version, about = "Ultrafast genome ANI queries and taxonomic profiling for metagenomic shotgun samples.\n\n--- Preparing inputs by sketching (indexing)\n## fastq (reads) and fasta (genomes all at once\n## *.sylsp found in -d; *.syldb given by -o\nsylph sketch -t 5 sample1.fq sample2.fq genome1.fa genome2.fa -o genome1+genome2 -d sample_dir\n\n## paired-end reads\nsylph sketch -1 a_1.fq b_1.fq -2 b_2.fq b_2.fq -d paired_sketches\n\n--- Nearest neighbour containment ANI\nsylph query *.syldb *.sylsp > all-to-all-query.tsv\n\n--- Taxonomic profiling with relative abundances and ANI\nsylph profile *.syldb *.sylsp > all-to-all-profile.tsv", arg_required_else_help = true, disable_help_subcommand = true)]
+#[clap(author, version, about = "Ultrafast genome ANI queries and taxonomic profiling for metagenomic shotgun samples.\n\n--- Preparing inputs by sketching (indexing)\n## fastq (reads) and fasta (genomes all at once)\n## *.sylsp found in -d; *.syldb given by -o\nsylph sketch -t 5 sample1.fq sample2.fq genome1.fa genome2.fa -o genome1+genome2 -d sample_dir\n\n## paired-end reads\nsylph sketch -1 a_1.fq b_1.fq -2 b_2.fq b_2.fq -d paired_sketches\n\n--- Nearest neighbour containment ANI\nsylph query *.syldb *.sylsp > all-to-all-query.tsv\n\n--- Taxonomic profiling with relative abundances and ANI\nsylph profile *.syldb *.sylsp > all-to-all-profile.tsv", arg_required_else_help = true, disable_help_subcommand = true)]
 pub struct Cli {
     #[clap(subcommand,)]
     pub mode: Mode,
@@ -67,7 +68,7 @@ pub struct SketchArgs {
     pub no_pseudotax: bool,
     #[clap(long="min-spacing", default_value_t = 30, help_heading = "ALGORITHM", help = "Minimum spacing between selected k-mers on the genomes")]
     pub min_spacing_kmer: usize,
-    #[clap(long="fpr", default_value_t = 0.0001, help_heading = "ALGORITHM", help = "False positive rate for read deduplicate hashing; valid values in [0,1).")]
+    #[clap(long="fpr", default_value_t = DEFAULT_FPR, help_heading = "ALGORITHM", help = "False positive rate for read deduplicate hashing; valid values in [0,1).")]
     pub fpr: f64,
     #[clap(short='1',long="first-pairs", multiple=true, help_heading = "PAIRED-END INPUT", help = "First pairs for paired end reads")]
     pub first_pair: Vec<String>,
@@ -77,10 +78,11 @@ pub struct SketchArgs {
 
 #[derive(Args)]
 pub struct ContainArgs {
-    #[clap(multiple=true, help = "Pre-sketched *.syldb/*.sylsp files. Raw fastq/fasta are allowed and will be automatically sketched to .sylsp/.syldb")]
+    #[clap(multiple=true, help = "Pre-sketched *.syldb/*.sylsp files. Raw single-end fastq/fasta are allowed and will be automatically sketched to .sylsp/.syldb")]
     pub files: Vec<String>,
 
-    #[clap(short='l',long="list", help = "Newline delimited file of file inputs")]
+
+    #[clap(short='l',long="list", help = "Newline delimited file of file inputs",help_heading = "INPUT/OUTPUT")]
     pub file_list: Option<String>,
 
     #[clap(long,default_value_t = 3., help_heading = "ALGORITHM", help = "Minimum k-mer multiplicity needed for coverage correction. Higher values gives more precision but lower sensitivity")]
@@ -98,9 +100,10 @@ pub struct ContainArgs {
     #[clap(long="debug", help = "Debug output")]
     pub debug: bool,
 
-    #[clap(short='u', long="estimate-unknown", help_heading = "ALGORITHM", help = "Estimates true coverage and scales sequence abundance in `profile` by estimated unknown sequence percentage" )]
+    #[clap(short='u', long="estimate-unknown", help_heading = "ALGORITHM", help = "Estimate true coverage and scale sequence abundance in `profile` by estimated unknown sequence percentage" )]
     pub estimate_unknown: bool,
 
+
     #[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Mean sequence identity of reads (0-100). Only used if --estimate-unknown is toggled. Consider this if automatic identity estimate fails" )]
     pub seq_id: Option<f64>,
 
@@ -110,6 +113,12 @@ pub struct ContainArgs {
     #[clap(short='R', long="redundancy-threshold", help_heading = "ALGORITHM", help = "Removes redundant genomes up to a rough ANI percentile when profiling", default_value_t = 99.0, hidden=true)]
     pub redundant_ani: f64,
 
+    #[clap(short='1', long="first-pairs", multiple=true, help = "First pairs for raw paired-end reads (fastx/gzip)",help_heading = "SKETCHING")]
+    pub first_pair: Vec<String>,
+
+    #[clap(short='2', long="second-pairs", multiple=true, help = "Second pairs for raw paired-end reads (fastx/gzip)",help_heading = "SKETCHING")]
+    pub second_pair: Vec<String>,
+
     #[clap(short, default_value_t = 200, help_heading = "SKETCHING", help = "Subsampling rate. Does nothing for pre-sketched files")]
     pub c: usize,
     #[clap(short, default_value_t = 31, help_heading = "SKETCHING", help = "Value of k. Only k = 21, 31 are currently supported. Does nothing for pre-sketched files")]
@@ -119,6 +128,12 @@ pub struct ContainArgs {
     #[clap(long="min-spacing", default_value_t = 30, help_heading = "SKETCHING", help = "Minimum spacing between selected k-mers on the database genomes. Does nothing for pre-sketched files")]
     pub min_spacing_kmer: usize,
 
+    #[clap(short='o',long="output-file", help = "Output to this file (TSV format). [default: stdout]", help_heading="INPUT/OUTPUT")]
+    pub out_file_name: Option<String>,
+    #[clap(long="log-reassignments", help = "Output information for how k-mers for genomes are reassigned during `profile`. Caution: can be verbose and slows down computation.")]
+    pub log_reassignments: bool,
+
+
     //Hidden options that are embedded in the args but no longer used... 
     #[clap(short, hidden=true, long="pseudotax", help_heading = "ALGORITHM", help = "Pseudo taxonomic classification mode. This removes shared k-mers between species by assigning k-mers to the highest ANI species. Requires sketches with --enable-pseudotax option" )]
     pub pseudotax: bool,
@@ -134,9 +149,11 @@ pub struct ContainArgs {
     pub no_ci: bool,
     #[clap(long="no-adjust", hidden=true)]
     pub no_adj: bool,
+    #[clap(long="mean-coverage", help_heading = "ALGORITHM", help = "Use the robust mean coverage estimator instead of median estimator", hidden=true )]
+    pub mean_coverage: bool,
 
-    #[clap(short='o',long="output-file", help = "Output to this file instead of stdout")]
-    pub out_file_name: Option<String>,
+
+
 
 
 }
diff --git a/src/constants.rs b/src/constants.rs
@@ -13,3 +13,4 @@ pub const MAX_MEDIAN_FOR_MEAN_FINAL_EST: f64 = 15.;
 pub const DEREP_PROFILE_ANI: f64 = 0.975;
 pub const MAX_DEDUP_COUNT: u32 = 4;
 pub const MAX_DEDUP_LEN: usize = 10000000;
+pub const DEFAULT_FPR: f64 = 0.0001;