Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Add a fuzz test for index operations (clear, import, delete, settings)
Browse files Browse the repository at this point in the history
It is very limited so far. It is meant to catch bugs with soft-deleted
document ids.
  • Loading branch information
loiclec committed Dec 12, 2022
1 parent e44ca84 commit cf5c934
Show file tree
Hide file tree
Showing 4 changed files with 356 additions and 3 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ milli/target/
## ... unreviewed
*.snap.new

# Fuzzcheck data for the facet indexing fuzz test
milli/fuzz/update::facet::incremental::fuzz::fuzz/
# Fuzzcheck data
milli/fuzz/*
2 changes: 1 addition & 1 deletion milli/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
#![cfg_attr(all(test, fuzzing), feature(no_coverage, once_cell))]
#[macro_use]
pub mod documents;

Expand Down
351 changes: 351 additions & 0 deletions milli/src/update/fuzz.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,351 @@
// Things tested by this fuzz test
//
// - A few different document identifiers only
// - Simple setting updates (searchable and filterable attributes only)
// - Document Deletion (given existing or nonexistent external document ids)
// - Clear Documents
// - Batched document imports
// - Update/Replacement of existing documents
// - Each operation with and without soft deletion
// - Empty document imports
// - No crash should ever happen

// A small sample of what isn't tested:
//
// - The correctness of the indexing operations
// - Indexing mistakes that happen when many different documents are inserted
// - Long batches of document imports
// - Nested fields (not tested well anyway)
// - Any search result
// - Arbitrary document contents
// (instead, the components of the documents are pre-written manually)
// - Index creation / Deletion
// - Autogenerated docids
// - Indexing for geosearch
// - Documents with too many field ids or too many words in a field id
// - Anything related to the prefix databases
// - Incorrect setting updates
// - The logic that chooses between soft and hard deletion
// (the choice is instead set manually for each operation)
// - Different IndexerConfig parameters

// Efficiency tips:
//
// - Use a RAM disk (see https://stackoverflow.com/questions/46224103/create-apfs-ram-disk-on-macos-high-sierra)
// - change the value of the TMPDIR environment variable to a folder in the RAM disk

// Quality:
// - finds issue 2945 if any of the last two fixes are not present (within a few minutes)
// - issue 2945: https://github.com/meilisearch/meilisearch/issues/2945
// - fix 1: https://github.com/meilisearch/milli/pull/723
// - fix 2: https://github.com/meilisearch/milli/pull/734
// - but doesn't detect anything wrong if this fix is not included: https://github.com/meilisearch/milli/pull/690
// - because it doesn't cause any crash, I think
// - each fuzz test iteration is quite slow
// - for this fuzz test in particular, it is good to let it run for a few hours, or even a day

use std::hash::Hash;
use std::sync::LazyLock;

use fuzzcheck::mutators::integer_within_range::U8WithinRangeMutator;
use fuzzcheck::mutators::option::OptionMutator;
use fuzzcheck::mutators::unique::UniqueMutator;
use fuzzcheck::mutators::vector::VecMutator;
use fuzzcheck::DefaultMutator;
use heed::{EnvOpenOptions, RwTxn};
use serde::{Deserialize, Serialize};
use tempfile::TempDir;

use super::{
ClearDocuments, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
};
use crate::Index;

/// The list of document identifiers that we choose to test
static DOCUMENT_IDENTIFIERS: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
let mut ids = vec![];
for i in 0..10 {
ids.push(serde_json::json!(i));
ids.push(serde_json::json!(format!("{i}")));
}
ids.push(serde_json::json!("complex-ID-1_2"));
ids.push(serde_json::json!("1-2-3-4"));
ids.push(serde_json::json!("invalidsupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious"));
ids.push(serde_json::json!("invalid.id"));
ids
});

/// The list of field values that we choose to test.
///
/// Mixes integers, floats (including large magnitudes), strings of varying
/// lengths, arrays, nested objects, and `_geo`-shaped objects with both
/// in-range and out-of-range `lat`/`lon` values.
static FIELD_VALUES: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
    let mut vals = vec![];
    for i in 0..10i32 {
        vals.push(serde_json::json!(i));
        vals.push(serde_json::json!((i as f64) / 3.4));
        vals.push(serde_json::json!(111.1_f32.powi(i)));
        vals.push(serde_json::json!(format!("{i}")));
        vals.push(serde_json::json!([i - 1, format!("{i}"), i + 1, format!("{}", i - 1), i - 2]));
        // Strings of increasing length, 0 to 9 characters.
        // (`json!` accepts the `String` directly; the former `format!("{}", ..)`
        // wrapper was a no-op — clippy `useless_format`.)
        vals.push(serde_json::json!("a".repeat(i as usize)));
    }
    vals.push(serde_json::json!({ "nested": ["value", { "nested": ["value", "value", "the quick brown fox jumps over the lazy dog, wow!"] }], "value": 0}));
    vals.push(serde_json::json!("the quick brown fox jumps over the lazy dog, wow!"));
    vals.push(serde_json::json!("the quick brown supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious fox jumps over the lazy dog"));
    vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1 }));
    vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1, "other": 10.0 }));
    vals.push(serde_json::json!({ "lat": -23.0, "lon": -22.1 }));
    vals.push(serde_json::json!({ "lat": 93.0, "lon": 22.1 }));
    vals.push(serde_json::json!({ "lat": 90.0, "lon": 221.1 }));
    vals
});
/// The list of field keys that we choose to test.
///
/// For each root key we also generate every `root.subkey` combination, so
/// nested-looking paths are exercised as well.
static FIELD_KEYS: LazyLock<Vec<String>> = LazyLock::new(|| {
    const ROOTS: [&str; 4] = ["identifier", "field1", "field2", "_geo"];
    const SUBKEYS: [&str; 8] = [
        "nested",
        "value",
        "nested.value",
        "nested.value.nested",
        "_geo",
        "lat",
        "lon",
        "other",
    ];
    ROOTS
        .into_iter()
        .flat_map(|root| {
            // The bare root key, followed by every dotted combination.
            std::iter::once(root.to_owned())
                .chain(SUBKEYS.into_iter().map(move |sub| format!("{root}.{sub}")))
        })
        .collect()
});
/// Returns a clone of the test document identifier stored at index `i`.
fn document_identifier(i: u8) -> serde_json::Value {
    DOCUMENT_IDENTIFIERS[usize::from(i)].clone()
}
/// Returns a clone of the test field key stored at index `i`.
fn field_key(i: u8) -> String {
    FIELD_KEYS[usize::from(i)].clone()
}
/// Returns a clone of the test field value stored at index `i`.
fn field_value(i: u8) -> serde_json::Value {
    FIELD_VALUES[usize::from(i)].clone()
}
/// Mutator producing in-bounds indexes into `DOCUMENT_IDENTIFIERS`.
fn document_identifier_index_mutator() -> U8WithinRangeMutator {
    U8WithinRangeMutator::new(0..DOCUMENT_IDENTIFIERS.len() as u8)
}
/// Mutator producing in-bounds indexes into `FIELD_KEYS`.
fn field_key_index_mutator() -> U8WithinRangeMutator {
    U8WithinRangeMutator::new(0..FIELD_KEYS.len() as u8)
}
/// Mutator producing in-bounds indexes into `FIELD_VALUES`.
fn field_value_index_mutator() -> U8WithinRangeMutator {
    U8WithinRangeMutator::new(0..FIELD_VALUES.len() as u8)
}

/// One step of a fuzzed sequence of index operations.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
enum Operation {
    /// Change the index settings (currently only the searchable attributes).
    SettingsUpdate(SettingsUpdate),
    /// Import one or two batches of documents.
    DocumentImport(DocumentImport),
    /// Delete documents given their external ids.
    DocumentDeletion(DocumentDeletion),
    /// Remove all documents from the index.
    Clear,
}
/// How an import treats a document whose external id already exists.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
enum Method {
    /// Maps to `IndexDocumentsMethod::UpdateDocuments`.
    Update,
    /// Maps to `IndexDocumentsMethod::ReplaceDocuments`.
    Replace,
}
/// A fuzzed document-import operation.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct DocumentImport {
    /// Whether soft deletion is disabled for this operation.
    disable_soft_deletion: bool,
    /// Update or replace documents that share an external id.
    method: Method,
    /// The documents to index, split into two batches.
    documents: DocumentImportBatch,
}
/// A fuzzed settings update.
///
/// Fields hold *indexes* into [`FIELD_KEYS`], not the keys themselves;
/// `None` means "reset the setting".
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct SettingsUpdate {
    // Adding filterable fields slows down the fuzzer a lot
    // #[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
    //     OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
    // })]
    // filterable_fields: Option<Vec<u8>>,
    #[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
        OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
    })]
    searchable_fields: Option<Vec<u8>>,
}
/// A fuzzed document-deletion operation.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct DocumentDeletion {
    /// Whether soft deletion is disabled for this operation.
    disable_soft_deletion: bool,
    /// Indexes into [`DOCUMENT_IDENTIFIERS`]; the targeted ids may or may
    /// not exist in the index at deletion time.
    #[field_mutator(VecMutator<u8, U8WithinRangeMutator> = {
        VecMutator::new(document_identifier_index_mutator(), 0..=10)
    })]
    external_document_ids: Vec<u8>,
}
/// A fuzzed document, described by indexes into the pre-built value lists
/// (instead of arbitrary contents, see the header comment of this file).
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct Document {
    /// Index into [`DOCUMENT_IDENTIFIERS`].
    #[field_mutator(U8WithinRangeMutator = { document_identifier_index_mutator() })]
    identifier: u8,
    /// Index into [`FIELD_VALUES`], or `None` to omit the field.
    #[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
        OptionMutator::new(field_value_index_mutator())
    })]
    field1: Option<u8>,
    /// Index into [`FIELD_VALUES`], or `None` to omit the field.
    #[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
        OptionMutator::new(field_value_index_mutator())
    })]
    field2: Option<u8>,
}
/// Two successive batches of documents added to the same `IndexDocuments`
/// builder before it is executed.
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
struct DocumentImportBatch {
    /// First batch: up to 10 documents.
    #[field_mutator(VecMutator<Document, DocumentMutator> = {
        VecMutator::new(Document::default_mutator(), 0..=10)
    })]
    docs1: Vec<Document>,
    /// Second batch: up to 5 documents.
    #[field_mutator(VecMutator<Document, DocumentMutator> = {
        VecMutator::new(Document::default_mutator(), 0..=5)
    })]
    docs2: Vec<Document>,
}

/// Applies a fuzzed [`DocumentDeletion`] to the index inside `wtxn`.
fn apply_document_deletion<'i>(
    wtxn: &mut RwTxn<'i, '_>,
    index: &'i Index,
    deletion: &DocumentDeletion,
) {
    let DocumentDeletion { disable_soft_deletion, external_document_ids } = deletion;
    let mut delete_op = DeleteDocuments::new(wtxn, index).unwrap();
    delete_op.disable_soft_deletion(*disable_soft_deletion);
    for &id_index in external_document_ids {
        // Resolve the index into a concrete external id, stringifying
        // numeric identifiers.
        let external_id = match document_identifier(id_index) {
            serde_json::Value::String(s) => s,
            serde_json::Value::Number(n) => n.to_string(),
            _ => panic!(),
        };
        // The id may not exist in the index; that case is deliberately fuzzed,
        // so the result is ignored.
        let _ = delete_op.delete_external_id(&external_id);
    }
    delete_op.execute().unwrap();
}

fn apply_document_import<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, import: &DocumentImport) {
let DocumentImport {
disable_soft_deletion,
method,
documents: DocumentImportBatch { docs1, docs2 },
} = import;
let indexer_config = IndexerConfig::default();
let mut builder = IndexDocuments::new(
wtxn,
index,
&indexer_config,
IndexDocumentsConfig {
update_method: match method {
Method::Update => super::IndexDocumentsMethod::UpdateDocuments,
Method::Replace => super::IndexDocumentsMethod::ReplaceDocuments,
},
disable_soft_deletion: *disable_soft_deletion,
autogenerate_docids: false,
..IndexDocumentsConfig::default()
},
|_| {},
|| false,
)
.unwrap();

let make_real_docs = |docs: &Vec<Document>| {
docs.iter()
.map(|doc| {
let Document { identifier, field1, field2 } = doc;
let mut object = crate::Object::new();
let identifier = document_identifier(*identifier);
object.insert("identifier".to_owned(), serde_json::json!(identifier));
if let Some(field1) = field1 {
let field1 = field_value(*field1);
object.insert("field1".to_owned(), field1);
}
if let Some(field2) = field2 {
let field2 = field_value(*field2);
object.insert("field2".to_owned(), field2);
}
object
})
.collect::<Vec<_>>()
};

let docs1 = make_real_docs(docs1);

let (new_builder, _user_error) = builder.add_documents(documents!(docs1)).unwrap();
builder = new_builder;

let docs2 = make_real_docs(docs2);

let (new_builder, _user_error) = builder.add_documents(documents!(docs2)).unwrap();
builder = new_builder;

let _ = builder.execute().unwrap();
}

/// Applies a fuzzed [`SettingsUpdate`] to the index inside `wtxn`.
fn apply_settings_update<'i>(
    wtxn: &mut RwTxn<'i, '_>,
    index: &'i Index,
    settings: &SettingsUpdate,
) {
    let SettingsUpdate { searchable_fields /* , filterable_fields */ } = settings;
    let indexer_config = IndexerConfig::default();
    let mut builder = Settings::new(wtxn, index, &indexer_config);
    // Filterable fields are currently disabled because they slow down the
    // fuzzer a lot; see the commented-out field in `SettingsUpdate`.
    // match filterable_fields {
    //     Some(fields) => {
    //         let fields = fields.iter().map(|f| field_key(*f)).collect();
    //         builder.set_filterable_fields(fields);
    //     }
    //     None => builder.reset_filterable_fields(),
    // }
    if let Some(field_indexes) = searchable_fields {
        let field_names = field_indexes.iter().map(|&i| field_key(i)).collect();
        builder.set_searchable_fields(field_names);
    } else {
        builder.reset_searchable_fields();
    }
    builder.execute(|_| {}, || false).unwrap();
}

/// Dispatches one fuzz-generated [`Operation`] to its dedicated handler.
fn apply_operation<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, operation: &Operation) {
    match operation {
        Operation::Clear => {
            let _result = ClearDocuments::new(wtxn, index).execute().unwrap();
        }
        Operation::SettingsUpdate(update) => apply_settings_update(wtxn, index, update),
        Operation::DocumentDeletion(deletion) => apply_document_deletion(wtxn, index, deletion),
        Operation::DocumentImport(import) => apply_document_import(wtxn, index, import),
    }
}

/// Fuzzes sequences of index operations (settings updates, imports,
/// deletions, clears) and asserts that none of them crashes.
#[test]
fn fuzz() {
    // Create the index in the default temporary directory instead of a
    // hard-coded macOS `/Volumes/Ramdisk` path, so the test runs on any
    // machine. `TempDir::new()` honours the TMPDIR environment variable,
    // which can still be pointed at a RAM disk for speed (see the
    // "Efficiency tips" at the top of this file).
    let tempdir = TempDir::new().unwrap();

    let mut options = EnvOpenOptions::new();
    // ~4 GB LMDB map size.
    options.map_size(4096 * 1000 * 1000);

    // Set the primary key once, up front: primary-key inference and
    // autogenerated docids are deliberately not fuzzed.
    let index = {
        let index = Index::new(options, tempdir.path()).unwrap();
        let mut wtxn = index.write_txn().unwrap();
        let indexer_config = IndexerConfig::default();
        let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
        settings.set_primary_key("identifier".to_owned());
        settings.execute(|_| {}, || false).unwrap();
        wtxn.commit().unwrap();
        index
    };

    let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| {
        let mut wtxn = index.write_txn().unwrap();
        for operation in operations {
            apply_operation(&mut wtxn, &index, operation);
        }
        // Abort the transaction so every iteration starts from the committed
        // initial state (primary key set, no documents).
        wtxn.abort().unwrap();
    })
    // We use a bloom filter (through UniqueMutator) to prevent the same test input from being tested too many times
    .mutator(UniqueMutator::new(VecMutator::new(Operation::default_mutator(), 0..=20), |x| x))
    .serde_serializer()
    .default_sensor_and_pool()
    .arguments_from_cargo_fuzzcheck()
    .launch();
    assert!(!result.found_test_failure);
}
2 changes: 2 additions & 0 deletions milli/src/update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ mod available_documents_ids;
mod clear_documents;
mod delete_documents;
pub(crate) mod facet;
#[cfg(all(fuzzing, test))]
mod fuzz;
mod index_documents;
mod indexer_config;
mod prefix_word_pairs;
Expand Down

0 comments on commit cf5c934

Please sign in to comment.