From f50e54652f32856dff3386b53dc3afa50e02db48 Mon Sep 17 00:00:00 2001 From: JMencius <553843771@qq.com> Date: Wed, 24 Apr 2024 19:55:44 +0800 Subject: [PATCH 1/4] Add GC content parameter --mingc --maxgc --- README.md | 4 +++- src/main.rs | 41 ++++++++++++++++++++++++++++++++++++++- test-data/testGC.fastq | 44 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 test-data/testGC.fastq diff --git a/README.md b/README.md index cd4f245..04d6e0c 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Rust implementation of [NanoFilt](https://github.com/wdecoster/nanofilt)+[NanoLyse](https://github.com/wdecoster/nanolyse), both originally written in Python. This tool, intended for long read sequencing such as PacBio or ONT, filters and trims a fastq file. Filtering is done on average read quality and minimal or maximal read length, and applying a headcrop (start of read) and tailcrop (end of read) while printing the reads passing the filter. -Compared to the Python implementation the scope is to deliver the same results, almost the same functionality, at much faster execution times. At the moment this tool does not support filtering using a sequencing_summary file or filtering on GC content. If those features are of interest then please reach out. +Compared to the Python implementation the scope is to deliver the same results, almost the same functionality, at much faster execution times. At the moment this tool does not support filtering using a sequencing_summary file. If those features are of interest then please reach out. ## Installation @@ -31,6 +31,8 @@ OPTIONS: --threads Number of parallel threads to use [default: 4] --contam Fasta file with reference to check potential contaminants against [default None] -i, --input Input filename [default: read from stdin] + --maxgc Sets a maximum GC content [default: 1.0] + --mingc Sets a minimum GC content [default: 0.0] ``` EXAMPLES: diff --git a/src/main.rs b/src/main.rs index d3b106b..8fe4b90 100644 --- a/src/main.rs +++ b/src/main.rs @@ -54,6 +54,13 @@ struct Cli { /// Input filename [default: read from stdin] #[arg(short = 'i', long = "input", value_parser)] input: Option, + + /// Filter GC content [default : 0] + #[arg(long, value_parser, default_value_t = 1.0)] + maxgc : f64, + + #[arg(long, value_parser, default_value_t = 0.0)] + mingc : f64, } @@ -128,6 +135,7 @@ where if !record.is_empty() { let read_len = record.seq().len(); // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) + let read_gc = cal_gc(record.seq()); if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -135,6 +143,8 @@ where if (!args.inverse && average_quality >= args.minqual && average_quality <= args.maxqual + && read_gc >= args.mingc + && read_gc <= args.maxgc && read_len >= args.minlength && read_len <= args.maxlength && !is_contamination(&record.seq(), &aligner)) @@ -170,6 +180,7 @@ where if !record.is_empty() { let read_len = record.seq().len(); // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) + let read_gc = cal_gc(record.seq()); if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -177,6 +188,8 @@ where if (!args.inverse && average_quality >= args.minqual && average_quality <= args.maxqual + && read_gc >= args.mingc + && read_gc <= args.maxgc && read_len >= args.minlength && read_len <= args.maxlength) || (args.inverse @@ -253,6 +266,28 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool { } } +fn cal_gc(readseq: &[u8]) -> f64 { + let mut gc_count = 0; + let mut total_bases = 0; + + for &base in readseq { + match base { + b'G' | b'g' | b'C' | b'c' => gc_count += 1, + _ => {}, // Ignore non-GC bases + } + total_bases += 1; + } + + // Return 0 if sequence is empty + if total_bases == 0 { + return 0.0; + } + + let gc_content = (gc_count as f64) / (total_bases as f64); + // Return GC content as absolute value + gc_content +} + #[test] fn test_ave_qual() { assert_eq!(ave_qual(&[10]), 10.0); @@ -289,6 +324,8 @@ fn test_filter() { contam: None, inverse: false, input: None, + mingc: 0, + maxgc: 1, }, ); } @@ -331,7 +368,9 @@ fn test_filter_with_contam() { threads: 1, contam: Some("test-data/random_contam.fa".to_owned()), inverse: false, - input: None, + input: None, + mingc: 0.0, + maxgc: 1.0, }, ); } diff --git a/test-data/testGC.fastq b/test-data/testGC.fastq new file mode 100644 index 0000000..7e5e835 --- /dev/null +++ b/test-data/testGC.fastq @@ -0,0 +1,44 @@ +@GC0 +AAAAAAAAAA ++ +OOOOOOOOOO +@GC10 +GAAAAAAAAA ++ +OOOOOOOOOO +@GC20 +GCAAAAAAAA ++ +OOOOOOOOOO +@GC30 +GCGAAAAAAA ++ +OOOOOOOOOO +@GC40 +GCGCAAAAAA ++ +OOOOOOOOOO +@GC50 +GCGCGAAAAA ++ +OOOOOOOOOO +@GC60 +GCGCGCAAAA ++ +OOOOOOOOOO +@GC40 +GCGCGCGAAA ++ +OOOOOOOOOO +@GC80 +GCGCGCGCAA ++ +OOOOOOOOOO +@GC90 +GCGCGCGCGA ++ +OOOOOOOOOO +@GC100 +GCGCGCGCGC ++ +OOOOOOOOOO From d684ace20b0aae931f0744f54a6eec735941a47d Mon Sep 17 00:00:00 2001 From: JMencius <553843771@qq.com> Date: Wed, 24 Apr 2024 21:41:11 +0800 Subject: [PATCH 2/4] Change to optional gc content filter --- src/main.rs | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/main.rs b/src/main.rs index 8fe4b90..b0fbfa7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -55,12 +55,13 @@ struct Cli { #[arg(short = 'i', long = "input", value_parser)] input: Option, - /// Filter GC content [default : 0] + /// Filter max GC content #[arg(long, value_parser, default_value_t = 1.0)] - maxgc : f64, + maxgc: f64, + /// Filter min GC content #[arg(long, value_parser, default_value_t = 0.0)] - mingc : f64, + mingc: f64, } @@ -135,7 +136,13 @@ where if !record.is_empty() { let read_len = record.seq().len(); // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) - let read_gc = cal_gc(record.seq()); + + // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter + let mut read_gc: f64 = 0.5; + if args.mingc != 0.0 || args.maxgc != 1.0 { + read_gc = cal_gc(record.seq()); + } + if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -180,7 +187,13 @@ where if !record.is_empty() { let read_len = record.seq().len(); // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) - let read_gc = cal_gc(record.seq()); + + // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter + let mut read_gc: f64 = 0.5; + if args.mingc != 0.0 || args.maxgc != 1.0 { + read_gc = cal_gc(record.seq()); + } + if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -268,22 +281,16 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool { fn cal_gc(readseq: &[u8]) -> f64 { let mut gc_count = 0; - let mut total_bases = 0; for &base in readseq { match base { b'G' | b'g' | b'C' | b'c' => gc_count += 1, _ => {}, // Ignore non-GC bases } - total_bases += 1; - } - // Return 0 if sequence is empty - if total_bases == 0 { - return 0.0; } - let gc_content = (gc_count as f64) / (total_bases as f64); + let gc_content = (gc_count as f64) / (readseq.len() as f64); // Return GC content as absolute value gc_content } @@ -324,8 +331,8 @@ fn test_filter() { contam: None, inverse: false, input: None, - mingc: 0, - maxgc: 1, + mingc: 0.0, + maxgc: 1.0, }, ); } From 9962fcbd0951a31678bbc741380573921dc233dc Mon Sep 17 00:00:00 2001 From: wdecoster Date: Thu, 25 Apr 2024 09:50:53 +0200 Subject: [PATCH 3/4] minor changes in implementation --- src/main.rs | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/src/main.rs b/src/main.rs index b0fbfa7..59b54e9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -138,10 +138,11 @@ where // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter - let mut read_gc: f64 = 0.5; - if args.mingc != 0.0 || args.maxgc != 1.0 { - read_gc = cal_gc(record.seq()); - } + let read_gc = if args.mingc != 0.0 || args.maxgc != 1.0 { + cal_gc(record.seq()) + } else { + 0.5 + }; if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( @@ -189,10 +190,11 @@ where // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter - let mut read_gc: f64 = 0.5; - if args.mingc != 0.0 || args.maxgc != 1.0 { - read_gc = cal_gc(record.seq()); - } + let read_gc = if args.mingc != 0.0 || args.maxgc != 1.0 { + cal_gc(record.seq()) + } else { + 0.5 + }; if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( @@ -280,19 +282,8 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool { } fn cal_gc(readseq: &[u8]) -> f64 { - let mut gc_count = 0; - - for &base in readseq { - match base { - b'G' | b'g' | b'C' | b'c' => gc_count += 1, - _ => {}, // Ignore non-GC bases - } - - } - - let gc_content = (gc_count as f64) / (readseq.len() as f64); - // Return GC content as absolute value - gc_content + let gc_count = readseq.iter().filter(|&&base| base == b'G' || base == b'g' || base == b'C' || base == b'c').count(); + (gc_count as f64) / (readseq.len() as f64) } #[test] From 184e8d5b8a384b18394e4f7ffb20aea575e5a8d2 Mon Sep 17 00:00:00 2001 From: Wouter De Coster Date: Fri, 26 Apr 2024 15:04:56 +0200 Subject: [PATCH 4/4] Update README.md add note on speed of decompression while piping --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 04d6e0c..a4cc9db 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ chopper -q 10 -l 500 -i reads.fastq > filtered_reads.fastq chopper -q 10 -l 500 -i reads.fastq.gz | gzip > filtered_reads.fastq.gz ``` +Note that the tool may be substantially slower in the third example above, and piping while decompressing is recommended (as in the first example). + ## CITATION If you use this tool, please consider citing our [publication](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911).