diff --git a/README.md b/README.md index cd4f245..a4cc9db 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Rust implementation of [NanoFilt](https://github.com/wdecoster/nanofilt)+[NanoLyse](https://github.com/wdecoster/nanolyse), both originally written in Python. This tool, intended for long read sequencing such as PacBio or ONT, filters and trims a fastq file. Filtering is done on average read quality and minimal or maximal read length, and applying a headcrop (start of read) and tailcrop (end of read) while printing the reads passing the filter. -Compared to the Python implementation the scope is to deliver the same results, almost the same functionality, at much faster execution times. At the moment this tool does not support filtering using a sequencing_summary file or filtering on GC content. If those features are of interest then please reach out. +Compared to the Python implementation the scope is to deliver the same results, almost the same functionality, at much faster execution times. At the moment this tool does not support filtering using a sequencing_summary file. If that feature is of interest then please reach out. ## Installation @@ -31,6 +31,8 @@ OPTIONS: --threads Number of parallel threads to use [default: 4] --contam Fasta file with reference to check potential contaminants against [default None] -i, --input Input filename [default: read from stdin] + --maxgc Sets a maximum GC content [default: 1.0] + --mingc Sets a minimum GC content [default: 0.0] ``` EXAMPLES: @@ -41,6 +43,8 @@ chopper -q 10 -l 500 -i reads.fastq > filtered_reads.fastq chopper -q 10 -l 500 -i reads.fastq.gz | gzip > filtered_reads.fastq.gz ``` +Note that the tool may be substantially slower in the third example above, and piping while decompressing is recommended (as in the first example). + ## CITATION If you use this tool, please consider citing our [publication](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911). 
diff --git a/src/main.rs b/src/main.rs index d3b106b..59b54e9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -54,6 +54,14 @@ struct Cli { /// Input filename [default: read from stdin] #[arg(short = 'i', long = "input", value_parser)] input: Option, + + /// Filter max GC content + #[arg(long, value_parser, default_value_t = 1.0)] + maxgc: f64, + + /// Filter min GC content + #[arg(long, value_parser, default_value_t = 0.0)] + mingc: f64, } @@ -128,6 +136,14 @@ where if !record.is_empty() { let read_len = record.seq().len(); // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) + + // Check if a GC content filter is set; if not, use 0.5 so the read passes the following GC filters + let read_gc = if args.mingc != 0.0 || args.maxgc != 1.0 { + cal_gc(record.seq()) + } else { + 0.5 + }; + if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -135,6 +151,8 @@ where if (!args.inverse && average_quality >= args.minqual && average_quality <= args.maxqual + && read_gc >= args.mingc + && read_gc <= args.maxgc && read_len >= args.minlength && read_len <= args.maxlength && !is_contamination(&record.seq(), &aligner)) @@ -170,6 +188,14 @@ where if !record.is_empty() { let read_len = record.seq().len(); // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) + + // Check if a GC content filter is set; if not, use 0.5 so the read passes the following GC filters + let read_gc = if args.mingc != 0.0 || args.maxgc != 1.0 { + cal_gc(record.seq()) + } else { + 0.5 + }; + if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -177,6 +203,8 @@ where if (!args.inverse && average_quality >= args.minqual && average_quality <= args.maxqual + && read_gc >= args.mingc + && read_gc <= args.maxgc && read_len >= args.minlength && read_len 
<= args.maxlength) || (args.inverse @@ -253,6 +281,11 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool { } } +fn cal_gc(readseq: &[u8]) -> f64 { + let gc_count = readseq.iter().filter(|&&base| base == b'G' || base == b'g' || base == b'C' || base == b'c').count(); + (gc_count as f64) / (readseq.len() as f64) +} + #[test] fn test_ave_qual() { assert_eq!(ave_qual(&[10]), 10.0); @@ -289,6 +322,8 @@ fn test_filter() { contam: None, inverse: false, input: None, + mingc: 0.0, + maxgc: 1.0, }, ); } @@ -331,7 +366,9 @@ fn test_filter_with_contam() { threads: 1, contam: Some("test-data/random_contam.fa".to_owned()), inverse: false, - input: None, + input: None, + mingc: 0.0, + maxgc: 1.0, }, ); } diff --git a/test-data/testGC.fastq b/test-data/testGC.fastq new file mode 100644 index 0000000..7e5e835 --- /dev/null +++ b/test-data/testGC.fastq @@ -0,0 +1,44 @@ +@GC0 +AAAAAAAAAA ++ +OOOOOOOOOO +@GC10 +GAAAAAAAAA ++ +OOOOOOOOOO +@GC20 +GCAAAAAAAA ++ +OOOOOOOOOO +@GC30 +GCGAAAAAAA ++ +OOOOOOOOOO +@GC40 +GCGCAAAAAA ++ +OOOOOOOOOO +@GC50 +GCGCGAAAAA ++ +OOOOOOOOOO +@GC60 +GCGCGCAAAA ++ +OOOOOOOOOO +@GC70 +GCGCGCGAAA ++ +OOOOOOOOOO +@GC80 +GCGCGCGCAA ++ +OOOOOOOOOO +@GC90 +GCGCGCGCGA ++ +OOOOOOOOOO +@GC100 +GCGCGCGCGC ++ +OOOOOOOOOO