document search after (#4265)

* document search_after support * more ideas of partial cache-based optimization * fix regression for sort by doc_id
quickwit-oss · Dec 14, 2023 · 59d5498 · 59d5498
1 parent 281bd01
commit 59d5498
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 6 deletions.
diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md
@@ -133,7 +133,7 @@ If a parameter appears both as a query string parameter and in the JSON payload,
 | `from`             | `Integer`     | The rank of the first hit to return. This is useful for pagination.              | 0             |
 | `q`                | `String`      | The search query.                                                                | (Optional)    |
 | `size`             | `Integer`     | Number of hits to return.                                                        | 10            |
-| `sort`             | `String`      | Describes how documents should be ranked. See [Sort order](#sort-order)          | `[]`          | (Optional) |
+| `sort`             | `String`      | Describes how documents should be ranked. See [Sort order](#sort-order)          | (Optional)    |
 | `scroll`           | `Duration`    | Creates a scroll context for "time to live". See [Scroll](#_scroll--scroll-api). | (Optional)    |
 
 #### Supported Request Body parameters
@@ -145,7 +145,8 @@ If a parameter appears both as a query string parameter and in the JSON payload,
 | `query`            | `Json object`     | Describe the search query. See [Query DSL](#query-dsl)                         | (Optional)    |
 | `size`             | `Integer`         | Number of hits to return.                                                      | 10            |
 | `sort`             | `JsonObject[]`    | Describes how documents should be ranked. See [Sort order](#sort-order)        | `[]`          |
-| `aggs`             | `Json object`     | Aggregation definition. See [Aggregations](aggregation.md).                    | `{}`          | ` |
+| `search_after`     | `Any[]`           | Ignore documents with a SortingValue preceding or equal to the parameter       | (Optional)    |
+| `aggs`             | `Json object`     | Aggregation definition. See [Aggregations](aggregation.md).                    | `{}`          |
 
 
 #### Sort order
@@ -168,7 +169,7 @@ following syntax.
 {
   // ...
   "sort" : [
-    { "timestamp" : {"order" : "asc"}},
+    { "timestamp" : {"format": "epoch_millis_as_int","order" : "asc"}},
     { "serial_number" : "desc" }
   ]
   // ...
@@ -185,6 +186,44 @@ It is also possible to not supply an order and rely on the default order using t
 }
 ```
 
+If no format is provided for timestamps, timestamps are returned with nanosecond precision. Beware:
+this means the resulting JSON may contain large numbers that lose precision when parsed in
+languages where all numbers are floats, such as JavaScript.
+
+#### Search after
+
+When results are sorted, each hit in the response carries a `sort` array holding its sort values:
+
+```json
+{
+  // ...
+  "hits": {
+    // ...
+    "hits": [
+      // ...
+      {
+        // ...
+        "sort": [
+          1701962929199000000
+        ]
+      }
+    ]
+  }
+}
+```
+
+You can pass the `sort` value of the last hit in a subsequent request where other fields are kept unchanged:
+```json
+{
+  // keep all fields from the original request
+  "search_after": [
+    1701962929199000000
+  ]
+}
+```
+
+This allows you to paginate your results.
+
 ### `_msearch` &nbsp; Multi search API
 
 ```
@@ -219,7 +258,7 @@ GET api/v1/_elastic/_search/scroll
 
 | Variable    | Type                                        | Description | Default value |
 | ----------- | ------------------------------------------- | ----------- | ------------- |
-| `scroll_id` | Scroll id (obtained from a search response) | Required    |
+| `scroll_id` | Scroll id (obtained from a search response) | Required    |               |
 
 
 The `_search/scroll` endpoint, in combination with the `_search` API makes it possible to request successive pages of search results.
@@ -632,4 +671,4 @@ GET api/v1/_elastic/stackoverflow*/_search
     }
   }
 }
-```
+```
diff --git a/quickwit/quickwit-search/src/leaf_cache.rs b/quickwit/quickwit-search/src/leaf_cache.rs
@@ -38,6 +38,12 @@ pub struct LeafSearchCache {
 // match, the merged_time_range is strictly smaller, and every hit we had fits in the new
 // timebound, we can reply from cache, saying we hit only result.partial_hits.len() res. It always
 // undercounts, and necessarily returns the right hits.
+// TODO if we stored a result for X hits, but a subsequent request asks for Y < X hits, we can
+// modify the answer and serve from cache.
+// TODO mix of 1 and 3.
+// TODO this means given a request for X documents, we could search for k*X docs in each split,
+// truncate to X while merging, and get free results from cache for at least the next k subsequent
+// queries which vary only by search_after.
 
 impl LeafSearchCache {
     pub fn new(capacity: usize) -> LeafSearchCache {

diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs
@@ -353,7 +353,7 @@ fn validate_sort_by_field(
     has_timestamp_format: bool,
     schema: &Schema,
 ) -> crate::Result<()> {
-    if field_name == "_score" {
+    if ["_score", "_shard_doc", "_doc"].contains(&field_name) {
         return Ok(());
     }
     let dynamic_field_opt = schema.get_field(DYNAMIC_FIELD_NAME).ok();
@@ -1728,6 +1728,27 @@ mod tests {
             .unwrap();
     }
 
+    // Checks that sorting by the special keys `_doc` and `_shard_doc` passes validation even
+    // though the schema built below declares no field with either name.
+    #[test]
+    fn test_validate_sort_by_docid() {
+        let sort_fields = vec![
+            SortField {
+                field_name: "_doc".to_string(),
+                sort_order: 0,
+                sort_datetime_format: None,
+            },
+            SortField {
+                field_name: "_shard_doc".to_string(),
+                sort_order: 0,
+                sort_datetime_format: None,
+            },
+        ];
+        // The schema only contains `timestamp` and `id`; neither sort key above is a schema field.
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_date_field("timestamp", FAST);
+        schema_builder.add_u64_field("id", FAST);
+        let schema = schema_builder.build();
+        // The second argument (presumably the `search_after` value) is `None`. The `unwrap`
+        // makes the test panic if validation returns an error.
+        validate_sort_by_fields_and_search_after(&sort_fields, &None, &schema).unwrap();
+    }
+
     #[test]
     fn test_validate_sort_by_fields_and_search_after_invalid_1() {
         // 2 sort fields + search after with only one sort value is invalid.

diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0008-sort_by.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0008-sort_by.yaml
@@ -63,3 +63,19 @@ expected:
       - _source:
           actor:
             id: 5688
+---
+json:
+  size: 1
+  query:
+      match_all: {}
+  sort:
+    _doc: {}
+expected:
+  hits:
+    total:
+      value: 100
+      relation: eq
+    hits:
+      - _source:
+          actor:
+            id: 1762355