From 896a5e18ffc31e74640b6b03262586c357aa38ac Mon Sep 17 00:00:00 2001
From: Patrick Meredith
Date: Tue, 17 Dec 2024 17:28:39 -0500
Subject: [PATCH] New PR with Devin's complete changes (#507)

Mostly generated by Devin; the serialization test was updated personally,
and redundant fuzz tests were removed.
---
 .evergreen/config.yml                |  31 +++++-
 .evergreen/run-fuzzer.sh             |  37 ++++++-
 fuzz/Cargo.toml                      |  30 +++++-
 fuzz/fuzz_targets/serialization.rs   |  55 +++++++++++
 fuzz/fuzz_targets/string_handling.rs |  23 +++++
 fuzz/fuzz_targets/type_markers.rs    |  14 +++
 fuzz/generate_corpus.rs              | 143 +++++++++++++++++++++++++++
 src/spec.rs                          |   9 +-
 8 files changed, 330 insertions(+), 12 deletions(-)
 create mode 100644 fuzz/fuzz_targets/serialization.rs
 create mode 100644 fuzz/fuzz_targets/string_handling.rs
 create mode 100644 fuzz/fuzz_targets/type_markers.rs
 create mode 100644 fuzz/generate_corpus.rs

diff --git a/.evergreen/config.yml b/.evergreen/config.yml
index 956e50c0..673b5ffc 100644
--- a/.evergreen/config.yml
+++ b/.evergreen/config.yml
@@ -13,15 +13,18 @@ stepback: true
 command_type: system
 
 # Protect ourself against rogue test case, or curl gone wild, that runs forever
-# 12 minutes is the longest we'll ever run
-exec_timeout_secs: 3600 # 12 minutes is the longest we'll ever run
+# 60 minutes is the longest we'll ever run
+exec_timeout_secs: 3600 # 1 hour total for security-focused fuzzing
 
 # What to do when evergreen hits the timeout (`post:` tasks are run automatically)
 timeout:
   - command: shell.exec
     params:
       script: |
-        ls -la
+        echo "Fuzzing timed out. Collecting any available artifacts..."
+        if [ -d "src/fuzz/artifacts" ]; then
+          tar czf "${PROJECT_DIRECTORY}/crash-artifacts.tar.gz" src/fuzz/artifacts/
+        fi
 
 functions:
   "fetch source":
@@ -154,7 +157,25 @@
     - command: shell.exec
       params:
         script: |
-          # Nothing needs to be done here
+          # Archive crash artifacts if they exist and contain crashes
+          if [ -d "src/fuzz/artifacts" ] && [ "$(ls -A src/fuzz/artifacts)" ]; then
+            echo "Creating artifacts archive..."
+            tar czf "${PROJECT_DIRECTORY}/crash-artifacts.tar.gz" src/fuzz/artifacts/
+          else
+            echo "No crashes found in artifacts directory. Skipping archive creation."
+          fi
+    # Upload crash artifacts if they exist
+    - command: s3.put
+      params:
+        aws_key: ${aws_key}
+        aws_secret: ${aws_secret}
+        local_file: ${PROJECT_DIRECTORY}/crash-artifacts.tar.gz
+        remote_file: ${CURRENT_VERSION}/crash-artifacts.tar.gz
+        bucket: mciuploads
+        permissions: public-read
+        content_type: application/x-gzip
+        optional: true
+
 pre:
   - func: "fetch source"
   - func: "install dependencies"
@@ -259,4 +280,4 @@ buildvariants:
   run_on:
     - ubuntu1804-test
   tasks:
-    - name: "wasm-test"
\ No newline at end of file
+    - name: "wasm-test"
diff --git a/.evergreen/run-fuzzer.sh b/.evergreen/run-fuzzer.sh
index 511799cb..b66abf1b 100755
--- a/.evergreen/run-fuzzer.sh
+++ b/.evergreen/run-fuzzer.sh
@@ -6,7 +6,36 @@ set -o errexit
 
 cd fuzz
 
-# each runs for a minute
-cargo +nightly fuzz run deserialize -- -rss_limit_mb=4096 -max_total_time=60
-cargo +nightly fuzz run raw_deserialize -- -rss_limit_mb=4096 -max_total_time=60
-cargo +nightly fuzz run iterate -- -rss_limit_mb=4096 -max_total_time=60
+# Create directories for crashes and corpus
+mkdir -p artifacts
+mkdir -p corpus
+
+# Generate initial corpus if directory is empty
+if [ -z "$(ls -A corpus)" ]; then
+    echo "Generating initial corpus..."
+    cargo run --bin generate_corpus
+fi
+
+# Function to run fuzzer and collect crashes
+run_fuzzer() {
+    target=$1
+    echo "Running fuzzer for $target"
+    # Run fuzzer and redirect crashes to artifacts directory
+    RUST_BACKTRACE=1 cargo +nightly fuzz run $target -- \
+        -rss_limit_mb=4096 \
+        -max_total_time=60 \
+        -artifact_prefix=artifacts/ \
+        -print_final_stats=1 \
+        corpus/
+}
+
+# Run existing targets
+run_fuzzer "deserialize"
+run_fuzzer "raw_deserialize"
+run_fuzzer "iterate"
+
+# Run new security-focused targets
+run_fuzzer "malformed_length"
+run_fuzzer "type_markers"
+run_fuzzer "string_handling"
+run_fuzzer "serialization"
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 25c60712..abdcffb5 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -1,19 +1,25 @@
-
 [package]
 name = "bson-fuzz"
 version = "0.0.1"
 authors = ["Automatically generated"]
 publish = false
+edition = "2021"
 
 [package.metadata]
 cargo-fuzz = true
 
 [dependencies.bson]
 path = ".."
+
 [dependencies.libfuzzer-sys]
 version = "0.4.0"
 
-# Prevent this from interfering with workspaces
+[dependencies.serde]
+version = "1.0"
+
+[dependencies.serde_json]
+version = "1.0"
+
 [workspace]
 members = ["."]
 
@@ -32,3 +38,23 @@ path = "fuzz_targets/raw_deserialize.rs"
 [[bin]]
 name = "raw_deserialize_utf8_lossy"
 path = "fuzz_targets/raw_deserialize_utf8_lossy.rs"
+
+[[bin]]
+name = "malformed_length"
+path = "fuzz_targets/malformed_length.rs"
+
+[[bin]]
+name = "type_markers"
+path = "fuzz_targets/type_markers.rs"
+
+[[bin]]
+name = "string_handling"
+path = "fuzz_targets/string_handling.rs"
+
+[[bin]]
+name = "serialization"
+path = "fuzz_targets/serialization.rs"
+
+[[bin]]
+name = "generate_corpus"
+path = "generate_corpus.rs"
diff --git a/fuzz/fuzz_targets/serialization.rs b/fuzz/fuzz_targets/serialization.rs
new file mode 100644
index 00000000..e5ba621d
--- /dev/null
+++ b/fuzz/fuzz_targets/serialization.rs
@@ -0,0 +1,55 @@
+#![no_main]
+use bson::{
+    raw::{RawDocument, RawDocumentBuf},
+    Bson,
+    Document,
+};
+use libfuzzer_sys::fuzz_target;
+
+fn compare_docs(doc1: &Document, doc2: &Document) -> bool {
+    if doc1.len() != doc2.len() {
+        return false;
+    }
+    for (key, value) in doc1 {
+        if !doc2.contains_key(key) {
+            return false;
+        }
+        if let Some(val2) = doc2.get(key) {
+            match (value, val2) {
+                (Bson::Double(d1), Bson::Double(d2)) => {
+                    if (!d1.is_nan() || !d2.is_nan()) && d1 != d2 {
+                        return false;
+                    }
+                }
+                (v1, v2) => {
+                    if v1 != v2 {
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+    true
+}
+
+fuzz_target!(|input: &[u8]| {
+    if let Ok(rawdoc) = RawDocument::from_bytes(&input) {
+        if let Ok(doc) = Document::try_from(rawdoc) {
+            let out = RawDocumentBuf::try_from(&doc).unwrap();
+            let out_bytes = out.as_bytes();
+            if input != out_bytes {
+                let reserialized = RawDocument::from_bytes(&out_bytes).unwrap();
+                let reserialized_doc = Document::try_from(reserialized).unwrap();
+                // Ensure that the reserialized document is the same as the original
+                // document; the bytes can differ while still resulting in the same Document.
+                if !compare_docs(&doc, &reserialized_doc) {
+                    panic!(
+                        "Reserialized document is not the same as the original document: {:?} != \
+                         {:?}",
+                        doc, reserialized_doc
+                    );
+                }
+            }
+        }
+    }
+});
diff --git a/fuzz/fuzz_targets/string_handling.rs b/fuzz/fuzz_targets/string_handling.rs
new file mode 100644
index 00000000..090d132f
--- /dev/null
+++ b/fuzz/fuzz_targets/string_handling.rs
@@ -0,0 +1,23 @@
+#![no_main]
+#[macro_use]
+extern crate libfuzzer_sys;
+extern crate bson;
+use bson::{RawBsonRef, RawDocument};
+use std::convert::TryInto;
+
+fuzz_target!(|buf: &[u8]| {
+    if let Ok(doc) = RawDocument::from_bytes(buf) {
+        for elem in doc.iter_elements().flatten() {
+            // Convert to RawBsonRef and check string-related types
+            if let Ok(bson) = elem.try_into() {
+                match bson {
+                    RawBsonRef::String(s) => {
+                        let _ = s.len();
+                        let _ = s.chars().count();
+                    }
+                    _ => {}
+                }
+            }
+        }
+    }
+});
diff --git a/fuzz/fuzz_targets/type_markers.rs b/fuzz/fuzz_targets/type_markers.rs
new file mode 100644
index 00000000..3506b96e
--- /dev/null
+++ b/fuzz/fuzz_targets/type_markers.rs
@@ -0,0 +1,14 @@
+#![no_main]
+#[macro_use]
+extern crate libfuzzer_sys;
+extern crate bson;
+use bson::{RawBsonRef, RawDocument};
+use std::convert::TryInto;
+
+fuzz_target!(|buf: &[u8]| {
+    if let Ok(doc) = RawDocument::from_bytes(buf) {
+        for elem in doc.iter_elements().flatten() {
+            let _: Result<RawBsonRef, _> = elem.try_into();
+        }
+    }
+});
diff --git a/fuzz/generate_corpus.rs b/fuzz/generate_corpus.rs
new file mode 100644
index 00000000..c67c4cfc
--- /dev/null
+++ b/fuzz/generate_corpus.rs
@@ -0,0 +1,143 @@
+use bson::{doc, Bson, Decimal128};
+use std::{
+    fs,
+    io::{Error, ErrorKind},
+    path::Path,
+    str::FromStr,
+};
+
+fn main() -> std::io::Result<()> {
+    let corpus_dir = Path::new("corpus"); // run-fuzzer.sh invokes this from within fuzz/
+    fs::create_dir_all(corpus_dir)?;
+
+    // Generate edge cases for each fuzz target
+    generate_length_edge_cases(corpus_dir)?;
+    generate_type_marker_cases(corpus_dir)?;
+    generate_string_edge_cases(corpus_dir)?;
+    generate_serialization_cases(corpus_dir)?;
+    Ok(())
+}
+
+fn generate_length_edge_cases(dir: &Path) -> std::io::Result<()> {
+    let target_dir = dir.join("malformed_length");
+    fs::create_dir_all(&target_dir)?;
+
+    // Invalid length
+    fs::write(target_dir.join("invalid_len"), vec![4, 5])?;
+
+    // Minimal valid document
+    let min_doc = doc! {};
+    fs::write(
+        target_dir.join("min_doc"),
+        bson::to_vec(&min_doc).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))?,
+    )?;
+
+    // Document with length near i32::MAX
+    let large_doc = doc! { "a": "b".repeat(i32::MAX as usize / 2) };
+    fs::write(
+        target_dir.join("large_doc"),
+        bson::to_vec(&large_doc).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))?,
+    )?;
+
+    Ok(())
+}
+
+fn generate_type_marker_cases(dir: &Path) -> std::io::Result<()> {
+    let target_dir = dir.join("type_markers");
+    fs::create_dir_all(&target_dir)?;
+
+    // Document with all BSON types
+    let all_types = doc! {
+        "double": 1.0f64,
+        "double_nan": f64::NAN,
+        "double_infinity": f64::INFINITY,
+        "double_neg_infinity": f64::NEG_INFINITY,
+        "string": "test",
+        "document": doc! {},
+        "array": vec![1, 2, 3],
+        "binary": Bson::Binary(bson::Binary { subtype: bson::spec::BinarySubtype::Generic, bytes: vec![1, 2, 3] }),
+        "object_id": bson::oid::ObjectId::new(),
+        "bool": true,
+        "date": bson::DateTime::now(),
+        "null": Bson::Null,
+        "regex": Bson::RegularExpression(bson::Regex { pattern: "pattern".into(), options: "i".into() }),
+        "int32": 123i32,
+        "timestamp": bson::Timestamp { time: 12345, increment: 1 },
+        "int64": 123i64,
+        "decimal128_nan": Decimal128::from_str("NaN").unwrap(),
+        "decimal128_infinity": Decimal128::from_str("Infinity").unwrap(),
+        "decimal128_neg_infinity": Decimal128::from_str("-Infinity").unwrap(),
+        "min_key": Bson::MinKey,
+        "max_key": Bson::MaxKey,
+        "undefined": Bson::Undefined
+    };
+    fs::write(
+        target_dir.join("all_types"),
+        bson::to_vec(&all_types).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))?,
+    )?;
+
+    Ok(())
+}
+
+fn generate_string_edge_cases(dir: &Path) -> std::io::Result<()> {
+    let target_dir = dir.join("string_handling");
+    fs::create_dir_all(&target_dir)?;
+
+    // UTF-8 edge cases
+    let utf8_cases = doc! {
+        "empty": "",
+        "null_bytes": "hello\0world",
+        "unicode": "🦀💻🔒",
+        "high_surrogate": "\u{10000}",
+        "invalid_continuation": Bson::Binary(bson::Binary {
+            subtype: bson::spec::BinarySubtype::Generic,
+            bytes: vec![0x80u8, 0x80u8, 0x80u8]
+        }),
+        "overlong": Bson::Binary(bson::Binary {
+            subtype: bson::spec::BinarySubtype::Generic,
+            bytes: vec![0xC0u8, 0x80u8]
+        })
+    };
+    fs::write(
+        target_dir.join("utf8_cases"),
+        bson::to_vec(&utf8_cases).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))?,
+    )?;
+
+    Ok(())
+}
+
+fn generate_serialization_cases(dir: &Path) -> std::io::Result<()> {
+    let target_dir = dir.join("serialization");
+    fs::create_dir_all(&target_dir)?;
+
+    // Deeply nested document
+    let mut nested_doc = doc! {};
+    let mut current = &mut nested_doc;
+    for i in 0..100 {
+        let next_doc = doc! {};
+        current.insert(i.to_string(), next_doc);
+        current = current
+            .get_mut(&i.to_string())
+            .unwrap()
+            .as_document_mut()
+            .unwrap();
+    }
+    fs::write(
+        target_dir.join("nested_doc"),
+        bson::to_vec(&nested_doc).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))?,
+    )?;
+
+    // Document with large binary data
+    let large_binary = doc! {
+        "binary": Bson::Binary(bson::Binary {
+            subtype: bson::spec::BinarySubtype::Generic,
+            bytes: vec![0xFF; 1024 * 1024] // 1MB of data
+        })
+    };
+    fs::write(
+        target_dir.join("large_binary"),
+        bson::to_vec(&large_binary).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))?,
+    )?;
+
+    Ok(())
+}
diff --git a/src/spec.rs b/src/spec.rs
index 04ec7708..e853c150 100644
--- a/src/spec.rs
+++ b/src/spec.rs
@@ -21,7 +21,14 @@
 
 //! Constants derived from the [BSON Specification Version 1.1](http://bsonspec.org/spec.html).
 
-use std::convert::From;
+use std::{convert::From, fmt};
+
+impl fmt::LowerHex for BinarySubtype {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let value: u8 = (*self).into();
+        fmt::LowerHex::fmt(&value, f)
+    }
+}
 
 const ELEMENT_TYPE_FLOATING_POINT: u8 = 0x01;
 const ELEMENT_TYPE_UTF8_STRING: u8 = 0x02;
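
Local repro sketch (illustrative, not part of the patch; assumes cargo-fuzz and a
nightly toolchain are installed, and <hash> stands in for an actual libFuzzer
crash file name):

    cd fuzz
    cargo run --bin generate_corpus    # seed corpus/ with the generated edge cases
    cargo +nightly fuzz run serialization -- -rss_limit_mb=4096 -max_total_time=60 corpus/
    # Failing inputs land in artifacts/ (via -artifact_prefix); replay one directly:
    cargo +nightly fuzz run serialization artifacts/crash-<hash>

The same invocation works for any of the targets wired up in run-fuzzer.sh.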