Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Add a fuzz test for index operations (clear, import, delete, settings)
Browse files Browse the repository at this point in the history
It is very limited so far. It is meant to catch bugs with soft-deleted
document ids.
  • Loading branch information
loiclec committed Dec 12, 2022
1 parent e44ca84 commit cf5c934
Show file tree
Hide file tree
Showing 4 changed files with 356 additions and 3 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ milli/target/
## ... unreviewed
*.snap.new

# Fuzzcheck data for the facet indexing fuzz test
milli/fuzz/update::facet::incremental::fuzz::fuzz/
# Fuzzcheck data
milli/fuzz/*
2 changes: 1 addition & 1 deletion milli/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
#![cfg_attr(all(test, fuzzing), feature(no_coverage, once_cell))]
#[macro_use]
pub mod documents;

Expand Down
351 changes: 351 additions & 0 deletions milli/src/update/fuzz.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,351 @@
// Things tested by this fuzz test
//
// - A few different document identifiers only
// - Simple setting updates (searchable and filterable attributes only)
// - Document Deletion (given existing or nonexistent external document ids)
// - Clear Documents
// - Batched document imports
// - Update/Replacement of existing documents
// - Each operation with and without soft deletion
// - Empty document imports
// - No crash should ever happen

// A small sample of what isn't tested:
//
// - The correctness of the indexing operations
// - Indexing mistakes that happen when many different documents are inserted
// - Long batches of document imports
// - Nested fields (not tested well anyway)
// - Any search result
// - Arbitrary document contents
// (instead, the components of the documents are pre-written manually)
// - Index creation / Deletion
// - Autogenerated docids
// - Indexing for geosearch
// - Documents with too many field ids or too many words in a field id
// - Anything related to the prefix databases
// - Incorrect setting updates
// - The logic that chooses between soft and hard deletion
// (the choice is instead set manually for each operation)
// - Different IndexerConfig parameters

// Efficiency tips:
//
// - Use a RAM disk (see https://stackoverflow.com/questions/46224103/create-apfs-ram-disk-on-macos-high-sierra)
// - change the value of the TMPDIR environment variable to a folder in the RAM disk

// Quality:
// - finds issue 2945 if any of the last two fixes are not present (within a few minutes)
// - issue 2945: https://github.com/meilisearch/meilisearch/issues/2945
// - fix 1: https://github.com/meilisearch/milli/pull/723
// - fix 2: https://github.com/meilisearch/milli/pull/734
// - but doesn't detect anything wrong if this fix is not included: https://github.com/meilisearch/milli/pull/690
// - because it doesn't cause any crash, I think
// - each fuzz test iteration is quite slow
// - for this fuzz test in particular, it is good to let it run for a few hours, or even a day

use std::hash::Hash;
use std::sync::LazyLock;

use fuzzcheck::mutators::integer_within_range::U8WithinRangeMutator;
use fuzzcheck::mutators::option::OptionMutator;
use fuzzcheck::mutators::unique::UniqueMutator;
use fuzzcheck::mutators::vector::VecMutator;
use fuzzcheck::DefaultMutator;
use heed::{EnvOpenOptions, RwTxn};
use serde::{Deserialize, Serialize};
use tempfile::TempDir;

use super::{
ClearDocuments, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
};
use crate::Index;

/// The list of document identifiers that we choose to test
static DOCUMENT_IDENTIFIERS: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
let mut ids = vec![];
for i in 0..10 {
ids.push(serde_json::json!(i));
ids.push(serde_json::json!(format!("{i}")));
}
ids.push(serde_json::json!("complex-ID-1_2"));
ids.push(serde_json::json!("1-2-3-4"));
ids.push(serde_json::json!("invalidsupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious"));
ids.push(serde_json::json!("invalid.id"));
ids
});

/// The list of field values that we choose to test.
///
/// Mixes integers, floats (including large magnitudes), strings of varying
/// lengths, arrays, nested objects, and `_geo`-shaped objects with both
/// in-range and out-of-range `lat`/`lon` values.
static FIELD_VALUES: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
    let mut vals = vec![];
    for i in 0..10i32 {
        vals.push(serde_json::json!(i));
        vals.push(serde_json::json!((i as f64) / 3.4));
        vals.push(serde_json::json!(111.1_f32.powi(i)));
        vals.push(serde_json::json!(format!("{i}")));
        vals.push(serde_json::json!([i - 1, format!("{i}"), i + 1, format!("{}", i - 1), i - 2]));
        // Strings of increasing length, 0 to 9 characters.
        // (`json!` accepts the `String` directly; the former `format!("{}", ..)`
        // wrapper was a no-op — clippy `useless_format`.)
        vals.push(serde_json::json!("a".repeat(i as usize)));
    }
    vals.push(serde_json::json!({ "nested": ["value", { "nested": ["value", "value", "the quick brown fox jumps over the lazy dog, wow!"] }], "value": 0}));
    vals.push(serde_json::json!("the quick brown fox jumps over the lazy dog, wow!"));
    vals.push(serde_json::json!("the quick brown supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious fox jumps over the lazy dog"));
    vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1 }));
    vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1, "other": 10.0 }));
    vals.push(serde_json::json!({ "lat": -23.0, "lon": -22.1 }));
    vals.push(serde_json::json!({ "lat": 93.0, "lon": 22.1 }));
    vals.push(serde_json::json!({ "lat": 90.0, "lon": 221.1 }));
    vals
});
/// The list of field keys that we choose to test.
///
/// For each root key we also generate every `root.subkey` combination, so
/// nested-looking paths are exercised as well.
static FIELD_KEYS: LazyLock<Vec<String>> = LazyLock::new(|| {
    const ROOTS: [&str; 4] = ["identifier", "field1", "field2", "_geo"];
    const SUBKEYS: [&str; 8] = [
        "nested",
        "value",
        "nested.value",
        "nested.value.nested",
        "_geo",
        "lat",
        "lon",
        "other",
    ];
    ROOTS
        .into_iter()
        .flat_map(|root| {
            // The bare root key, followed by every dotted combination.
            std::iter::once(root.to_owned())
                .chain(SUBKEYS.into_iter().map(move |sub| format!("{root}.{sub}")))
        })
        .collect()
});
/// Returns a clone of the test document identifier stored at index `i`.
fn document_identifier(i: u8) -> serde_json::Value {
    DOCUMENT_IDENTIFIERS[usize::from(i)].clone()
}
/// Returns a clone of the test field key stored at index `i`.
fn field_key(i: u8) -> String {
    FIELD_KEYS[usize::from(i)].clone()
}
/// Returns a clone of the test field value stored at index `i`.
fn field_value(i: u8) -> serde_json::Value {
    FIELD_VALUES[usize::from(i)].clone()
}
/// Mutator producing in-bounds indexes into `DOCUMENT_IDENTIFIERS`.
fn document_identifier_index_mutator() -> U8WithinRangeMutator {
    U8WithinRangeMutator::new(0..DOCUMENT_IDENTIFIERS.len() as u8)
}
/// Mutator producing in-bounds indexes into `FIELD_KEYS`.
fn field_key_index_mutator() -> U8WithinRangeMutator {
    U8WithinRangeMutator::new(0..FIELD_KEYS.len() as u8)
}
/// Mutator producing in-bounds indexes into `FIELD_VALUES`.
fn field_value_index_mutator() -> U8WithinRangeMutator {
    U8WithinRangeMutator::new(0..FIELD_VALUES.len() as u8)
}

/// One step of a fuzzed sequence of index operations.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
enum Operation {
    /// Change the index settings (currently only the searchable attributes).
    SettingsUpdate(SettingsUpdate),
    /// Import one or two batches of documents.
    DocumentImport(DocumentImport),
    /// Delete documents given their external ids.
    DocumentDeletion(DocumentDeletion),
    /// Remove all documents from the index.
    Clear,
}
/// How an import treats a document whose external id already exists.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
enum Method {
    /// Maps to `IndexDocumentsMethod::UpdateDocuments`.
    Update,
    /// Maps to `IndexDocumentsMethod::ReplaceDocuments`.
    Replace,
}
/// A fuzzed document-import operation.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct DocumentImport {
    /// Whether soft deletion is disabled for this operation.
    disable_soft_deletion: bool,
    /// Update or replace documents that share an external id.
    method: Method,
    /// The documents to index, split into two batches.
    documents: DocumentImportBatch,
}
/// A fuzzed settings update.
///
/// Fields hold *indexes* into [`FIELD_KEYS`], not the keys themselves;
/// `None` means "reset the setting".
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct SettingsUpdate {
    // Adding filterable fields slows down the fuzzer a lot
    // #[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
    //     OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
    // })]
    // filterable_fields: Option<Vec<u8>>,
    #[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
        OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
    })]
    searchable_fields: Option<Vec<u8>>,
}
/// A fuzzed document-deletion operation.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct DocumentDeletion {
    /// Whether soft deletion is disabled for this operation.
    disable_soft_deletion: bool,
    /// Indexes into [`DOCUMENT_IDENTIFIERS`]; the targeted ids may or may
    /// not exist in the index at deletion time.
    #[field_mutator(VecMutator<u8, U8WithinRangeMutator> = {
        VecMutator::new(document_identifier_index_mutator(), 0..=10)
    })]
    external_document_ids: Vec<u8>,
}
/// A fuzzed document, described by indexes into the pre-built value lists
/// (instead of arbitrary contents, see the header comment of this file).
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct Document {
    /// Index into [`DOCUMENT_IDENTIFIERS`].
    #[field_mutator(U8WithinRangeMutator = { document_identifier_index_mutator() })]
    identifier: u8,
    /// Index into [`FIELD_VALUES`], or `None` to omit the field.
    #[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
        OptionMutator::new(field_value_index_mutator())
    })]
    field1: Option<u8>,
    /// Index into [`FIELD_VALUES`], or `None` to omit the field.
    #[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
        OptionMutator::new(field_value_index_mutator())
    })]
    field2: Option<u8>,
}
/// Two successive batches of documents added to the same `IndexDocuments`
/// builder before it is executed.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct DocumentImportBatch {
    /// First batch: up to 10 documents.
    #[field_mutator(VecMutator<Document, DocumentMutator> = {
        VecMutator::new(Document::default_mutator(), 0..=10)
    })]
    docs1: Vec<Document>,
    /// Second batch: up to 5 documents.
    #[field_mutator(VecMutator<Document, DocumentMutator> = {
        VecMutator::new(Document::default_mutator(), 0..=5)
    })]
    docs2: Vec<Document>,
}

/// Applies a fuzzed [`DocumentDeletion`] to the index inside `wtxn`.
fn apply_document_deletion<'i>(
    wtxn: &mut RwTxn<'i, '_>,
    index: &'i Index,
    deletion: &DocumentDeletion,
) {
    let DocumentDeletion { disable_soft_deletion, external_document_ids } = deletion;
    let mut delete_op = DeleteDocuments::new(wtxn, index).unwrap();
    delete_op.disable_soft_deletion(*disable_soft_deletion);
    for &id_index in external_document_ids {
        // Resolve the index into a concrete external id, stringifying
        // numeric identifiers.
        let external_id = match document_identifier(id_index) {
            serde_json::Value::String(s) => s,
            serde_json::Value::Number(n) => n.to_string(),
            _ => panic!(),
        };
        // The id may not exist in the index; that case is deliberately fuzzed,
        // so the result is ignored.
        let _ = delete_op.delete_external_id(&external_id);
    }
    delete_op.execute().unwrap();
}

fn apply_document_import<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, import: &DocumentImport) {
let DocumentImport {
disable_soft_deletion,
method,
documents: DocumentImportBatch { docs1, docs2 },
} = import;
let indexer_config = IndexerConfig::default();
let mut builder = IndexDocuments::new(
wtxn,
index,
&indexer_config,
IndexDocumentsConfig {
update_method: match method {
Method::Update => super::IndexDocumentsMethod::UpdateDocuments,
Method::Replace => super::IndexDocumentsMethod::ReplaceDocuments,
},
disable_soft_deletion: *disable_soft_deletion,
autogenerate_docids: false,
..IndexDocumentsConfig::default()
},
|_| {},
|| false,
)
.unwrap();

let make_real_docs = |docs: &Vec<Document>| {
docs.iter()
.map(|doc| {
let Document { identifier, field1, field2 } = doc;
let mut object = crate::Object::new();
let identifier = document_identifier(*identifier);
object.insert("identifier".to_owned(), serde_json::json!(identifier));
if let Some(field1) = field1 {
let field1 = field_value(*field1);
object.insert("field1".to_owned(), field1);
}
if let Some(field2) = field2 {
let field2 = field_value(*field2);
object.insert("field2".to_owned(), field2);
}
object
})
.collect::<Vec<_>>()
};

let docs1 = make_real_docs(docs1);

let (new_builder, _user_error) = builder.add_documents(documents!(docs1)).unwrap();
builder = new_builder;

let docs2 = make_real_docs(docs2);

let (new_builder, _user_error) = builder.add_documents(documents!(docs2)).unwrap();
builder = new_builder;

let _ = builder.execute().unwrap();
}

/// Applies a fuzzed [`SettingsUpdate`] to the index inside `wtxn`.
fn apply_settings_update<'i>(
    wtxn: &mut RwTxn<'i, '_>,
    index: &'i Index,
    settings: &SettingsUpdate,
) {
    let SettingsUpdate { searchable_fields /* , filterable_fields */ } = settings;
    let indexer_config = IndexerConfig::default();
    let mut builder = Settings::new(wtxn, index, &indexer_config);
    // Filterable fields are currently disabled because they slow down the
    // fuzzer a lot; see the commented-out field in `SettingsUpdate`.
    // match filterable_fields {
    //     Some(fields) => {
    //         let fields = fields.iter().map(|f| field_key(*f)).collect();
    //         builder.set_filterable_fields(fields);
    //     }
    //     None => builder.reset_filterable_fields(),
    // }
    if let Some(field_indexes) = searchable_fields {
        let field_names = field_indexes.iter().map(|&i| field_key(i)).collect();
        builder.set_searchable_fields(field_names);
    } else {
        builder.reset_searchable_fields();
    }
    builder.execute(|_| {}, || false).unwrap();
}

/// Dispatches one fuzz-generated [`Operation`] to its dedicated handler.
fn apply_operation<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, operation: &Operation) {
    match operation {
        Operation::Clear => {
            let _result = ClearDocuments::new(wtxn, index).execute().unwrap();
        }
        Operation::SettingsUpdate(update) => apply_settings_update(wtxn, index, update),
        Operation::DocumentDeletion(deletion) => apply_document_deletion(wtxn, index, deletion),
        Operation::DocumentImport(import) => apply_document_import(wtxn, index, import),
    }
}

/// Fuzzes sequences of index operations (settings updates, imports,
/// deletions, clears) and asserts that none of them crashes.
#[test]
fn fuzz() {
    // Create the index in the default temporary directory instead of a
    // hard-coded macOS `/Volumes/Ramdisk` path, so the test runs on any
    // machine. `TempDir::new()` honours the TMPDIR environment variable,
    // which can still be pointed at a RAM disk for speed (see the
    // "Efficiency tips" at the top of this file).
    let tempdir = TempDir::new().unwrap();

    let mut options = EnvOpenOptions::new();
    // ~4 GB LMDB map size.
    options.map_size(4096 * 1000 * 1000);

    // Set the primary key once, up front: primary-key inference and
    // autogenerated docids are deliberately not fuzzed.
    let index = {
        let index = Index::new(options, tempdir.path()).unwrap();
        let mut wtxn = index.write_txn().unwrap();
        let indexer_config = IndexerConfig::default();
        let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
        settings.set_primary_key("identifier".to_owned());
        settings.execute(|_| {}, || false).unwrap();
        wtxn.commit().unwrap();
        index
    };

    let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| {
        let mut wtxn = index.write_txn().unwrap();
        for operation in operations {
            apply_operation(&mut wtxn, &index, operation);
        }
        // Abort the transaction so every iteration starts from the committed
        // initial state (primary key set, no documents).
        wtxn.abort().unwrap();
    })
    // We use a bloom filter (through UniqueMutator) to prevent the same test input from being tested too many times
    .mutator(UniqueMutator::new(VecMutator::new(Operation::default_mutator(), 0..=20), |x| x))
    .serde_serializer()
    .default_sensor_and_pool()
    .arguments_from_cargo_fuzzcheck()
    .launch();
    assert!(!result.found_test_failure);
}
2 changes: 2 additions & 0 deletions milli/src/update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ mod available_documents_ids;
mod clear_documents;
mod delete_documents;
pub(crate) mod facet;
#[cfg(all(fuzzing, test))]
mod fuzz;
mod index_documents;
mod indexer_config;
mod prefix_word_pairs;
Expand Down

0 comments on commit cf5c934

Please sign in to comment.