From f50e54652f32856dff3386b53dc3afa50e02db48 Mon Sep 17 00:00:00 2001
From: JMencius <553843771@qq.com>
Date: Wed, 24 Apr 2024 19:55:44 +0800
Subject: [PATCH 1/4] Add GC content parameter --mingc --maxgc

---
 README.md              |  4 +++-
 src/main.rs            | 41 ++++++++++++++++++++++++++++++++++++++-
 test-data/testGC.fastq | 44 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 test-data/testGC.fastq
diff --git a/README.md b/README.md
index cd4f245..04d6e0c 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 Rust implementation of [NanoFilt](https://github.com/wdecoster/nanofilt)+[NanoLyse](https://github.com/wdecoster/nanolyse), both originally written in Python. This tool, intended for long read sequencing such as PacBio or ONT, filters and trims a fastq file.  
 Filtering is done on average read quality and minimal or maximal read length, and applying a headcrop (start of read) and tailcrop (end of read) while printing the reads passing the filter.
 
-Compared to the Python implementation the scope is to deliver the same results, almost the same functionality, at much faster execution times. At the moment this tool does not support filtering using a sequencing_summary file or filtering on GC content. If those features are of interest then please reach out.  
+Compared to the Python implementation the scope is to deliver the same results, almost the same functionality, at much faster execution times. At the moment this tool does not support filtering using a sequencing_summary file. If those features are of interest then please reach out.  
 
 ## Installation
 
@@ -31,6 +31,8 @@ OPTIONS:
         --threads       Number of parallel threads to use [default: 4]
         --contam        Fasta file with reference to check potential contaminants against [default None]
     -i, --input         Input filename [default: read from stdin]
+        --maxgc         Sets a maximum GC content [default: 1.0]
+        --mingc         Sets a minimum GC content [default: 0.0]
 ```
 
 EXAMPLES:
diff --git a/src/main.rs b/src/main.rs
index d3b106b..8fe4b90 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -54,6 +54,13 @@ struct Cli {
     /// Input filename [default: read from stdin]
     #[arg(short = 'i', long = "input", value_parser)]
     input: Option<String>,
+
+    /// Filter GC content [default : 0]
+    #[arg(long, value_parser, default_value_t = 1.0)]
+    maxgc : f64,
+
+    #[arg(long, value_parser, default_value_t = 0.0)]
+    mingc : f64,
 }
 
 
@@ -128,6 +135,7 @@ where
                     if !record.is_empty() {
                         let read_len = record.seq().len();
                         // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out)
+                        let read_gc = cal_gc(record.seq());
                         if args.headcrop + args.tailcrop < read_len {
                             let average_quality = ave_qual(
                                 &record.qual().iter().map(|i| i - 33).collect::<Vec<u8>>(),
@@ -135,6 +143,8 @@ where
                             if (!args.inverse
                                 && average_quality >= args.minqual
                                 && average_quality <= args.maxqual
+                                && read_gc >= args.mingc
+                                && read_gc <= args.maxgc
                                 && read_len >= args.minlength
                                 && read_len <= args.maxlength
                                 && !is_contamination(&record.seq(), &aligner))
@@ -170,6 +180,7 @@ where
                     if !record.is_empty() {
                         let read_len = record.seq().len();
                         // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out)
+                        let read_gc = cal_gc(record.seq());
                         if args.headcrop + args.tailcrop < read_len {
                             let average_quality = ave_qual(
                                 &record.qual().iter().map(|i| i - 33).collect::<Vec<u8>>(),
@@ -177,6 +188,8 @@ where
                             if (!args.inverse
                                 && average_quality >= args.minqual
                                 && average_quality <= args.maxqual
+                                && read_gc >= args.mingc
+                                && read_gc <= args.maxgc
                                 && read_len >= args.minlength
                                 && read_len <= args.maxlength)
                                 || (args.inverse
@@ -253,6 +266,28 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool {
     }
 }
 
+fn cal_gc(readseq: &[u8]) -> f64 {
+    let mut gc_count = 0;
+    let mut total_bases = 0;
+
+    for &base in readseq {
+        match base {
+            b'G' | b'g' | b'C' | b'c' => gc_count += 1,
+            _ => {}, // Ignore non-GC bases
+        }
+        total_bases += 1;
+    }
+
+    // Return 0 if sequence is empty
+    if total_bases == 0 {
+        return 0.0; 
+    }
+
+    let gc_content = (gc_count as f64) / (total_bases as f64);
+    // Return GC content as absolute value
+    gc_content 
+}
+
 #[test]
 fn test_ave_qual() {
     assert_eq!(ave_qual(&[10]), 10.0);
@@ -289,6 +324,8 @@ fn test_filter() {
             contam: None,
             inverse: false,
             input: None,
+            mingc: 0,
+            maxgc: 1,
 	},
     );
 }
@@ -331,7 +368,9 @@ fn test_filter_with_contam() {
             threads: 1,
             contam: Some("test-data/random_contam.fa".to_owned()),
             inverse: false,
-	    input: None,
+	        input: None,
+            mingc: 0.0,
+            maxgc: 1.0,
         },
     );
 }
diff --git a/test-data/testGC.fastq b/test-data/testGC.fastq
new file mode 100644
index 0000000..7e5e835
--- /dev/null
+++ b/test-data/testGC.fastq
@@ -0,0 +1,44 @@
+@GC0
+AAAAAAAAAA
++
+OOOOOOOOOO
+@GC10
+GAAAAAAAAA
++
+OOOOOOOOOO
+@GC20
+GCAAAAAAAA
++
+OOOOOOOOOO
+@GC30
+GCGAAAAAAA
++
+OOOOOOOOOO
+@GC40
+GCGCAAAAAA
++
+OOOOOOOOOO
+@GC50
+GCGCGAAAAA
++
+OOOOOOOOOO
+@GC60
+GCGCGCAAAA
++
+OOOOOOOOOO
+@GC40
+GCGCGCGAAA
++
+OOOOOOOOOO
+@GC80
+GCGCGCGCAA
++
+OOOOOOOOOO
+@GC90
+GCGCGCGCGA
++
+OOOOOOOOOO
+@GC100
+GCGCGCGCGC
++
+OOOOOOOOOO

From d684ace20b0aae931f0744f54a6eec735941a47d Mon Sep 17 00:00:00 2001
From: JMencius <553843771@qq.com>
Date: Wed, 24 Apr 2024 21:41:11 +0800
Subject: [PATCH 2/4] Change to optional gc content filter

---
 src/main.rs | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 8fe4b90..b0fbfa7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -55,12 +55,13 @@ struct Cli {
     #[arg(short = 'i', long = "input", value_parser)]
     input: Option<String>,
 
-    /// Filter GC content [default : 0]
+    /// Filter max GC content
     #[arg(long, value_parser, default_value_t = 1.0)]
-    maxgc : f64,
+    maxgc: f64,
 
+    /// Filter min GC content
     #[arg(long, value_parser, default_value_t = 0.0)]
-    mingc : f64,
+    mingc: f64,
 }
 
 
@@ -135,7 +136,13 @@ where
                     if !record.is_empty() {
                         let read_len = record.seq().len();
                         // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out)
-                        let read_gc = cal_gc(record.seq());
+                        
+                        // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter
+                        let mut read_gc: f64 = 0.5;
+                        if args.mingc != 0.0 || args.maxgc != 1.0 {
+                            read_gc = cal_gc(record.seq());
+                        }
+                        
                         if args.headcrop + args.tailcrop < read_len {
                             let average_quality = ave_qual(
                                 &record.qual().iter().map(|i| i - 33).collect::<Vec<u8>>(),
@@ -180,7 +187,13 @@ where
                     if !record.is_empty() {
                         let read_len = record.seq().len();
                         // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out)
-                        let read_gc = cal_gc(record.seq());
+
+                        // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter
+                        let mut read_gc: f64 = 0.5;
+                        if args.mingc != 0.0 || args.maxgc != 1.0 {
+                            read_gc = cal_gc(record.seq());
+                        }
+                        
                         if args.headcrop + args.tailcrop < read_len {
                             let average_quality = ave_qual(
                                 &record.qual().iter().map(|i| i - 33).collect::<Vec<u8>>(),
@@ -268,22 +281,16 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool {
 
 fn cal_gc(readseq: &[u8]) -> f64 {
     let mut gc_count = 0;
-    let mut total_bases = 0;
 
     for &base in readseq {
         match base {
             b'G' | b'g' | b'C' | b'c' => gc_count += 1,
             _ => {}, // Ignore non-GC bases
         }
-        total_bases += 1;
-    }
 
-    // Return 0 if sequence is empty
-    if total_bases == 0 {
-        return 0.0; 
     }
 
-    let gc_content = (gc_count as f64) / (total_bases as f64);
+    let gc_content = (gc_count as f64) / (readseq.len() as f64);
     // Return GC content as absolute value
     gc_content 
 }
@@ -324,8 +331,8 @@ fn test_filter() {
             contam: None,
             inverse: false,
             input: None,
-            mingc: 0,
-            maxgc: 1,
+            mingc: 0.0,
+            maxgc: 1.0,
 	},
     );
 }

From 9962fcbd0951a31678bbc741380573921dc233dc Mon Sep 17 00:00:00 2001
From: wdecoster <decosterwouter@gmail.com>
Date: Thu, 25 Apr 2024 09:50:53 +0200
Subject: [PATCH 3/4] minor changes in implementation

---
 src/main.rs | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index b0fbfa7..59b54e9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -138,10 +138,11 @@ where
                         // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out)
                         
                         // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter
-                        let mut read_gc: f64 = 0.5;
-                        if args.mingc != 0.0 || args.maxgc != 1.0 {
-                            read_gc = cal_gc(record.seq());
-                        }
+                        let read_gc = if args.mingc != 0.0 || args.maxgc != 1.0 {
+                            cal_gc(record.seq())
+                        } else {
+                            0.5
+                        };
                         
                         if args.headcrop + args.tailcrop < read_len {
                             let average_quality = ave_qual(
@@ -189,10 +190,11 @@ where
                         // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out)
 
                         // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter
-                        let mut read_gc: f64 = 0.5;
-                        if args.mingc != 0.0 || args.maxgc != 1.0 {
-                            read_gc = cal_gc(record.seq());
-                        }
+                        let read_gc = if args.mingc != 0.0 || args.maxgc != 1.0 {
+                            cal_gc(record.seq())
+                        } else {
+                            0.5
+                        };
                         
                         if args.headcrop + args.tailcrop < read_len {
                             let average_quality = ave_qual(
@@ -280,19 +282,8 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool {
 }
 
 fn cal_gc(readseq: &[u8]) -> f64 {
-    let mut gc_count = 0;
-
-    for &base in readseq {
-        match base {
-            b'G' | b'g' | b'C' | b'c' => gc_count += 1,
-            _ => {}, // Ignore non-GC bases
-        }
-
-    }
-
-    let gc_content = (gc_count as f64) / (readseq.len() as f64);
-    // Return GC content as absolute value
-    gc_content 
+    let gc_count = readseq.iter().filter(|&&base| base == b'G' || base == b'g' || base == b'C' || base == b'c').count();
+    (gc_count as f64) / (readseq.len() as f64)
 }
 
 #[test]

From 184e8d5b8a384b18394e4f7ffb20aea575e5a8d2 Mon Sep 17 00:00:00 2001
From: Wouter De Coster <decosterwouter@gmail.com>
Date: Fri, 26 Apr 2024 15:04:56 +0200
Subject: [PATCH 4/4] Update README.md

add note on speed of decompression while piping
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 04d6e0c..a4cc9db 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,8 @@ chopper -q 10 -l 500 -i reads.fastq > filtered_reads.fastq
 chopper -q 10 -l 500 -i reads.fastq.gz | gzip > filtered_reads.fastq.gz
 ```
 
+Note that the tool may be substantially slower in the third example above, and piping while decompressing is recommended (as in the first example). 
+
 ## CITATION
 
 If you use this tool, please consider citing our [publication](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911).