From a2eb1c05559ad4bb522ad9ef0081dbfa185e533d Mon Sep 17 00:00:00 2001
From: Trent Hauck <trent@trenthauck.com>
Date: Fri, 24 May 2024 09:52:46 -0700
Subject: [PATCH] fix: better handling for ambiguous AAs

---
 .../sqllogictests/slt/fasta-scan-tests.slt    | 11 ++-
 exon/exon-fasta/src/array_builder.rs          | 98 ++++++++++++-------
 exon/exon-fasta/src/batch_reader.rs           | 18 ++--
 exon/exon-fasta/src/config.rs                 | 14 +++
 exon/exon-fasta/src/error.rs                  |  6 +-
 5 files changed, 95 insertions(+), 52 deletions(-)

diff --git a/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt b/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt
index 9eb489a3..41235779 100644
--- a/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt
+++ b/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt
@@ -142,8 +142,8 @@ CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa')
 query T
 SELECT id, description, sequence FROM exon_table;
 ----
-a description [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-b description2 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
+a description [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]
+b description2 [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]
 
 statement ok
 DROP TABLE exon_table;
@@ -154,5 +154,8 @@ CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa',
 query T
 SELECT id, description, sequence FROM exon_table;
 ----
-a description [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-b description2 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
+a description [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]
+b description2 [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]
+
+statement ok
+DROP TABLE exon_table;
diff --git a/exon/exon-fasta/src/array_builder.rs b/exon/exon-fasta/src/array_builder.rs
index 94d61eb7..45d68b79 100644
--- a/exon/exon-fasta/src/array_builder.rs
+++ b/exon/exon-fasta/src/array_builder.rs
@@ -15,7 +15,7 @@
 use std::{str::FromStr, sync::Arc};
 
 use arrow::{
-    array::{ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder},
+    array::{ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder},
     datatypes::SchemaRef,
     error::ArrowError,
 };
@@ -29,6 +29,8 @@ pub struct FASTAArrayBuilder {
     descriptions: GenericStringBuilder<i32>,
     sequences: SequenceBuilder,
     projection: Vec<usize>,
+    append_name: bool,
+    append_description: bool,
     append_sequence: bool,
     rows: usize,
 }
@@ -67,10 +69,16 @@ impl FASTAArrayBuilder {
                 GenericStringBuilder::<i64>::with_capacity(capacity, capacity),
             ),
             SequenceDataType::IntegerEncodeProtein => SequenceBuilder::IntegerEncodeProtein(
-                GenericListBuilder::<i32, Int32Builder>::new(Int32Builder::with_capacity(capacity)),
+                GenericListBuilder::<i32, Int32Builder>::with_capacity(
+                    Int32Builder::with_capacity(60),
+                    capacity,
+                ),
             ),
             SequenceDataType::IntegerEncodeDNA => SequenceBuilder::IntegerEncodeDNA(
-                GenericListBuilder::<i32, Int32Builder>::new(Int32Builder::with_capacity(capacity)),
+                GenericListBuilder::<i32, Int32Builder>::with_capacity(
+                    Int32Builder::with_capacity(60),
+                    capacity,
+                ),
             ),
         };
 
@@ -79,7 +87,9 @@ impl FASTAArrayBuilder {
             None => (0..schema.fields().len()).collect(),
         };
 
-        let append_sequence = true;
+        let append_name = projection.contains(&0);
+        let append_description = projection.contains(&1);
+        let append_sequence = projection.contains(&2);
 
         Ok(Self {
             names: GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
@@ -88,11 +98,13 @@ impl FASTAArrayBuilder {
             projection,
             rows: 0,
             append_sequence,
+            append_name,
+            append_description,
         })
     }
 
     pub fn len(&self) -> usize {
-        self.names.len()
+        self.rows
     }
 
     pub fn is_empty(&self) -> bool {
@@ -100,17 +112,23 @@ impl FASTAArrayBuilder {
     }
 
     pub fn append(&mut self, definition: &str, sequence: &[u8]) -> Result<(), ArrowError> {
-        let definition =
-            Definition::from_str(definition).map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
+        if self.append_name || self.append_description {
+            let definition = Definition::from_str(definition)
+                .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
 
-        let name = std::str::from_utf8(definition.name())?;
-        self.names.append_value(name);
+            if self.append_name {
+                let name = std::str::from_utf8(definition.name())?;
+                self.names.append_value(name);
+            }
 
-        if let Some(description) = definition.description() {
-            let description = std::str::from_utf8(description)?;
-            self.descriptions.append_value(description);
-        } else {
-            self.descriptions.append_null();
+            if self.append_description {
+                if let Some(description) = definition.description() {
+                    let description = std::str::from_utf8(description)?;
+                    self.descriptions.append_value(description);
+                } else {
+                    self.descriptions.append_null();
+                }
+            }
         }
 
         if self.append_sequence {
@@ -129,25 +147,30 @@ impl FASTAArrayBuilder {
                     for aa in sequence {
                         let aa = match aa {
                             b'A' => 1,
-                            b'C' => 2,
-                            b'D' => 3,
-                            b'E' => 4,
-                            b'F' => 5,
-                            b'G' => 6,
-                            b'H' => 7,
-                            b'I' => 8,
-                            b'K' => 9,
-                            b'L' => 10,
-                            b'M' => 11,
-                            b'N' => 12,
-                            b'P' => 13,
-                            b'Q' => 14,
-                            b'R' => 15,
-                            b'S' => 16,
-                            b'T' => 17,
-                            b'V' => 18,
-                            b'W' => 19,
-                            b'Y' => 20,
+                            b'B' => 2,
+                            b'C' => 3,
+                            b'D' => 4,
+                            b'E' => 5,
+                            b'F' => 6,
+                            b'G' => 7,
+                            b'H' => 8,
+                            b'I' => 9,
+                            b'K' => 10,
+                            b'L' => 11,
+                            b'M' => 12,
+                            b'N' => 13,
+                            b'O' => 14,
+                            b'P' => 15,
+                            b'Q' => 16,
+                            b'R' => 17,
+                            b'S' => 18,
+                            b'T' => 19,
+                            b'U' => 20,
+                            b'V' => 21,
+                            b'W' => 22,
+                            b'Y' => 23,
+                            b'X' => 24,
+                            b'Z' => 25,
                             _ => {
                                 return Err(ExonFastaError::InvalidAminoAcid(*aa).into());
                             }
@@ -190,8 +213,13 @@ impl FASTAArrayBuilder {
     fn finish_inner(&mut self) -> Vec<ArrayRef> {
         let mut arrays: Vec<ArrayRef> = Vec::with_capacity(self.projection.len());
 
-        arrays.push(Arc::new(self.names.finish()));
-        arrays.push(Arc::new(self.descriptions.finish()));
+        if self.append_name {
+            arrays.push(Arc::new(self.names.finish()));
+        }
+
+        if self.append_description {
+            arrays.push(Arc::new(self.descriptions.finish()));
+        }
 
         if self.append_sequence {
             arrays.push(self.sequences.finish());
diff --git a/exon/exon-fasta/src/batch_reader.rs b/exon/exon-fasta/src/batch_reader.rs
index e1d05103..51b489ab 100644
--- a/exon/exon-fasta/src/batch_reader.rs
+++ b/exon/exon-fasta/src/batch_reader.rs
@@ -70,7 +70,7 @@ where
     }
 
     async fn read_batch(&mut self) -> ExonFastaResult<Option<RecordBatch>> {
-        let mut record_batch = FASTAArrayBuilder::create(
+        let mut array_builder = FASTAArrayBuilder::create(
             self.config.file_schema.clone(),
             self.config.projection.clone(),
             self.config.batch_size,
@@ -82,26 +82,20 @@ where
             self.sequence_buffer.clear();
 
             if (self.read_record().await?).is_some() {
-                record_batch.append(&self.buf, &self.sequence_buffer)?;
+                array_builder.append(&self.buf, &self.sequence_buffer)?;
             } else {
                 break;
             }
         }
 
-        if record_batch.is_empty() {
+        if array_builder.is_empty() {
             return Ok(None);
         }
 
-        let batch = RecordBatch::try_new(self.config.file_schema.clone(), record_batch.finish())?;
+        let schema = self.config.projected_schema()?;
+        let batch = array_builder.try_into_record_batch(schema)?;
 
-        match &self.config.projection {
-            Some(projection) => {
-                let projected_batch = batch.project(projection)?;
-
-                Ok(Some(projected_batch))
-            }
-            None => Ok(Some(batch)),
-        }
+        Ok(Some(batch))
     }
 
     /// Converts the reader into a stream of batches.
diff --git a/exon/exon-fasta/src/config.rs b/exon/exon-fasta/src/config.rs
index 26cfb698..36ca6f31 100644
--- a/exon/exon-fasta/src/config.rs
+++ b/exon/exon-fasta/src/config.rs
@@ -102,6 +102,20 @@ impl FASTAConfig {
         self
     }
 
+    /// Get the projection, returning the identity projection if none is set.
+    pub fn projection(&self) -> Vec<usize> {
+        self.projection
+            .clone()
+            .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect())
+    }
+
+    /// Get the projected schema.
+    pub fn projected_schema(&self) -> arrow::error::Result<SchemaRef> {
+        let schema = self.file_schema.project(&self.projection())?;
+
+        Ok(Arc::new(schema))
+    }
+
     /// Create a new FASTA configuration with a given projection.
     pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
         // Only include fields that are in the file schema.
diff --git a/exon/exon-fasta/src/error.rs b/exon/exon-fasta/src/error.rs
index 6f51f24a..41ef334a 100644
--- a/exon/exon-fasta/src/error.rs
+++ b/exon/exon-fasta/src/error.rs
@@ -42,7 +42,11 @@ impl Display for ExonFastaError {
                 write!(f, "Invalid nucleotide: {}", nucleotide)
             }
             ExonFastaError::InvalidAminoAcid(amino_acid) => {
-                write!(f, "Invalid amino acid: {}", amino_acid)
+                write!(
+                    f,
+                    "Invalid amino acid: {}",
+                    std::char::from_u32(*amino_acid as u32).unwrap()
+                )
             }
         }
     }