# Avoid unnecessary hfield Creation/Deletion on updates in hashTypeSet. HSET updates improvement of ~10% (redis#13655)

This PR eliminates unnecessary creation and destruction of hfield
objects, ensuring only the required updates or insertions are performed.
This reduces overhead and improves performance by streamlining field
management in hash dictionaries, particularly in update-heavy scenarios
such as the benchmarks below (a simplified sketch of the resulting HSET
flow follows the links):
- [memtier_benchmark-100Kkeys-load-hash-50-fields-with-100B-values](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-100Kkeys-load-hash-50-fields-with-100B-values.yml)
- [memtier_benchmark-10Mkeys-load-hash-5-fields-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-10Mkeys-load-hash-5-fields-with-100B-values-pipeline-10.yml)
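
Conceptually, the HSET write path now looks up the field by a precomputed hash first and only allocates an hfield when the field is missing. Below is a minimal sketch of that flow, not the actual t_hash.c change (which is not expanded in the diff on this page): `createHashField()` and `freeHashValue()` are hypothetical helpers, while `dictGetHash()`, `dictFindByHash()` and `dictAddNonExistsByHash()` are the dict APIs introduced by this commit.

```
#include <assert.h>
#include <stdint.h>
#include "dict.h"   /* dictGetHash, dictFindByHash, dictAddNonExistsByHash */
#include "sds.h"

/* Hypothetical helpers standing in for the real hfield/value management. */
void *createHashField(const sds field);
void freeHashValue(void *value);

/* Sketch only: set `field` to `value` in a hash dict without creating a
 * temporary hfield when the field already exists. */
static int hashSetSketch(dict *ht, sds field, sds value) {
    /* Hash the lookup key once and reuse it for both find and insert. */
    const uint64_t hash = dictGetHash(ht, field);

    dictEntry *de = dictFindByHash(ht, field, hash);
    if (de) {
        /* Field exists: update the value in place. No hfield object is
         * created or destroyed on this path. */
        freeHashValue(dictGetVal(de));        /* hypothetical helper */
        dictSetVal(ht, de, sdsdup(value));
        return 0;                             /* updated */
    }

    /* Field is known not to exist: allocate the hfield only now and
     * insert it without a second lookup or duplicate-key check. */
    void *hf = createHashField(field);        /* hypothetical helper */
    de = dictAddNonExistsByHash(ht, hf, hash);
    assert(de != NULL);
    dictSetVal(ht, de, sdsdup(value));
    return 1;                                 /* inserted */
}
```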


To test it, we can focus on the hfield-related tests:

```
tclsh tests/test_helper.tcl --single unit/type/hash-field-expire
tclsh tests/test_helper.tcl --single unit/type/hash
tclsh tests/test_helper.tcl --dump-logs --single unit/other
```

Extra check on full CI:
- [x] https://github.com/filipecosta90/redis/actions/runs/12225788759

## microbenchmark results 
16.7% improvement (drop in time) in `dictAddNonExistingRaw()` vs `dictAddRaw()`:
```
make REDIS_CFLAGS="-g -fno-omit-frame-pointer -O3 -DREDIS_TEST" -j
$ ./src/redis-server test dict --accurate
(...)
Inserting via dictAddRaw() non existing: 5000000 items in 2592 ms
(...)
Inserting via dictAddNonExistingRaw() non existing: 5000000 items in 2160 ms
```

8% improvement (drop in time) in finding a non-existing key and adding it via
`dictGetHash()+dictFindWithHash()+dictAddNonExistingRaw()` vs
`dictFind()+dictAddRaw()`:
```
make REDIS_CFLAGS="-g -fno-omit-frame-pointer -O3 -DREDIS_TEST" -j
$ ./src/redis-server test dict --accurate
(...)
Find() and inserting via dictFind()+dictAddRaw() non existing: 5000000 items in 2983 ms
Find() and inserting via dictGetHash()+dictFindWithHash()+dictAddNonExistingRaw() non existing: 5000000 items in 2740 ms

```
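
For reference, here is a minimal sketch of the two insertion patterns the microbenchmark compares, assuming a dict keyed by heap-allocated strings as in the `src/dict.c` benchmark (the helper names printed in the output above differ slightly from the final API names in the diff below, `dictFindByHash()` and `dictAddNonExistsByHash()`; the sketch uses the final names).

```
#include <assert.h>
#include <stdint.h>
#include "dict.h"

/* Baseline: the key is hashed twice, once inside dictFind() and once
 * inside dictAddRaw(), which also re-scans the bucket for an existing key. */
void insertBaseline(dict *d, void *key) {
    dictEntry *entry = dictFind(d, key);
    assert(entry == NULL);                    /* key known not to exist */
    entry = dictAddRaw(d, key, NULL);
    assert(entry != NULL);
}

/* Optimized: the key is hashed once; the same hash is reused for the
 * lookup and the insertion, and the duplicate-key scan is skipped. */
void insertWithPrecomputedHash(dict *d, void *key) {
    const uint64_t hash = dictGetHash(d, key);
    dictEntry *entry = dictFindByHash(d, key, hash);
    assert(entry == NULL);                    /* key known not to exist */
    entry = dictAddNonExistsByHash(d, key, hash);
    assert(entry != NULL);
}
```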

## benchmark results 


To benchmark:

```
pip3 install redis-benchmarks-specification==0.1.250
taskset -c 0 ./src/redis-server --save '' --protected-mode no --daemonize yes
redis-benchmarks-spec-client-runner --tests-regexp ".*load-hash.*" --flushall_on_every_test_start --flushall_on_every_test_end  --cpuset_start_pos 2 --override-memtier-test-time 60
```

Improvements in achievable throughput:

test | ops/sec unstable (59953d2) | ops/sec this PR (24af719) | % change
-- | -- | -- | --
memtier_benchmark-1key-load-hash-1K-fields-with-5B-values | 4097 | 5032 | 22.8%
memtier_benchmark-100Kkeys-load-hash-50-fields-with-100B-values | 37658 | 44688 | 18.7%
memtier_benchmark-100Kkeys-load-hash-50-fields-with-1000B-values | 14736 | 17350 | 17.7%
memtier_benchmark-1Mkeys-load-hash-5-fields-with-1000B-values-pipeline-10 | 131848 | 143485 | 8.8%
memtier_benchmark-1Mkeys-load-hash-hmset-5-fields-with-1000B-values | 82071 | 85681 | 4.4%
memtier_benchmark-1Mkeys-load-hash-5-fields-with-1000B-values | 82882 | 86336 | 4.2%
memtier_benchmark-10Mkeys-load-hash-5-fields-with-100B-values-pipeline-10 | 262502 | 273376 | 4.1%
memtier_benchmark-10Kkeys-load-hash-50-fields-with-10000B-values | 2821 | 2936 | 4.1%

---------

Co-authored-by: Moti Cohen <[email protected]>
fcostaoliveira and moticless authored Dec 12, 2024
1 parent c51c966 commit f8942f9
Showing 6 changed files with 205 additions and 63 deletions.
4 changes: 2 additions & 2 deletions src/defrag.c
@@ -296,7 +296,7 @@ void activeDefragHfieldDictCallback(void *privdata, const dictEntry *de) {
dictUseStoredKeyApi(d, 1);
uint64_t hash = dictGetHash(d, newhf);
dictUseStoredKeyApi(d, 0);
dictEntry *de = dictFindEntryByPtrAndHash(d, hf, hash);
dictEntry *de = dictFindByHashAndPtr(d, hf, hash);
serverAssert(de);
dictSetKey(d, de, newhf);
}
@@ -753,7 +753,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) {
* the pointer it holds, since it won't be able to do the string
* compare, but we can find the entry using key hash and pointer. */
uint64_t hash = kvstoreGetHash(db->expires, newsds);
dictEntry *expire_de = kvstoreDictFindEntryByPtrAndHash(db->expires, slot, keysds, hash);
dictEntry *expire_de = kvstoreDictFindByHashAndPtr(db->expires, slot, keysds, hash);
if (expire_de) kvstoreDictSetKey(db->expires, slot, expire_de, newsds);
}

224 changes: 183 additions & 41 deletions src/dict.c
@@ -62,6 +62,7 @@ typedef struct {

static void _dictExpandIfNeeded(dict *d);
static void _dictShrinkIfNeeded(dict *d);
static void _dictRehashStepIfNeeded(dict *d, uint64_t visitedIdx);
static signed char _dictNextExp(unsigned long size);
static int _dictInit(dict *d, dictType *type);
static dictEntry *dictGetNext(const dictEntry *de);
@@ -509,6 +510,39 @@ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
return dictInsertAtPosition(d, key, position);
}

/* Low-level add function for non-existing keys:
* This function adds a new entry to the dictionary, assuming the key does not
* already exist.
* Parameters:
* - `dict *d`: Pointer to the dictionary structure.
* - `void *key`: Pointer to the key being added.
* - `const uint64_t hash`: hash of the key being added.
* Guarantees:
* - The key is assumed to be non-existing.
* Note:
* Ensure that the key's uniqueness is managed externally before calling this function. */
dictEntry *dictAddNonExistsByHash(dict *d, void *key, const uint64_t hash) {
/* Get the position for the new key, it should never be NULL. */
unsigned long idx, table;
idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]);

/* Rehash the hash table if needed */
_dictRehashStepIfNeeded(d,idx);

/* Expand the hash table if needed */
_dictExpandIfNeeded(d);

table = dictIsRehashing(d) ? 1 : 0;
idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
void *position = &d->ht_table[table][idx];
assert(position!=NULL);

/* Dup the key if necessary. */
if (d->type->keyDup) key = d->type->keyDup(d, key);

return dictInsertAtPosition(d, key, position);
}

/* Adds a key in the dict's hashtable at the position returned by a preceding
* call to dictFindPositionForInsert. This is a low level function which allows
* splitting dictAddRaw in two parts. Normally, dictAddRaw or dictAdd should be
@@ -608,17 +642,8 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
h = dictHashKey(d, key, d->useStoredKeyApi);
idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]);

if (dictIsRehashing(d)) {
if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) {
/* If we have a valid hash entry at `idx` in ht0, we perform
* rehash on the bucket at `idx` (being more CPU cache friendly) */
_dictBucketRehash(d, idx);
} else {
/* If the hash entry is not in ht0, we rehash the buckets based
* on the rehashidx (not CPU cache friendly). */
_dictRehashStep(d);
}
}
/* Rehash the hash table if needed */
_dictRehashStepIfNeeded(d,idx);

keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d);

@@ -734,32 +759,21 @@ void dictRelease(dict *d)
zfree(d);
}

dictEntry *dictFind(dict *d, const void *key)
{
dictEntry *dictFindByHash(dict *d, const void *key, const uint64_t hash) {
dictEntry *he;
uint64_t h, idx, table;
uint64_t idx, table;

if (dictSize(d) == 0) return NULL; /* dict is empty */

h = dictHashKey(d, key, d->useStoredKeyApi);
idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]);
idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]);
keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d);

if (dictIsRehashing(d)) {
if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) {
/* If we have a valid hash entry at `idx` in ht0, we perform
* rehash on the bucket at `idx` (being more CPU cache friendly) */
_dictBucketRehash(d, idx);
} else {
/* If the hash entry is not in ht0, we rehash the buckets based
* on the rehashidx (not CPU cache friendly). */
_dictRehashStep(d);
}
}
/* Rehash the hash table if needed */
_dictRehashStepIfNeeded(d,idx);

for (table = 0; table <= 1; table++) {
if (table == 0 && (long)idx < d->rehashidx) continue;
idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);

/* Prefetch the bucket at the calculated index */
redis_prefetch_read(&d->ht_table[table][idx]);
@@ -781,6 +795,13 @@ dictEntry *dictFind(dict *d, const void *key)
return NULL;
}

dictEntry *dictFind(dict *d, const void *key)
{
if (dictSize(d) == 0) return NULL; /* dict is empty */
const uint64_t hash = dictHashKey(d, key, d->useStoredKeyApi);
return dictFindByHash(d,key,hash);
}

void *dictFetchValue(dict *d, const void *key) {
dictEntry *he;

@@ -1566,6 +1587,21 @@ static void _dictShrinkIfNeeded(dict *d)
dictShrinkIfNeeded(d);
}

static void _dictRehashStepIfNeeded(dict *d, uint64_t visitedIdx) {
if ((!dictIsRehashing(d)) || (d->pauserehash != 0))
return;
/* rehashing not in progress if rehashidx == -1 */
if ((long)visitedIdx >= d->rehashidx && d->ht_table[0][visitedIdx]) {
/* If we have a valid hash entry at `idx` in ht0, we perform
* rehash on the bucket at `idx` (being more CPU cache friendly) */
_dictBucketRehash(d, visitedIdx);
} else {
/* If the hash entry is not in ht0, we rehash the buckets based
* on the rehashidx (not CPU cache friendly). */
dictRehash(d,1);
}
}

/* Our hash table capability is a power of two */
static signed char _dictNextExp(unsigned long size)
{
@@ -1586,17 +1622,8 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing)
if (existing) *existing = NULL;
idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]);

if (dictIsRehashing(d)) {
if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) {
/* If we have a valid hash entry at `idx` in ht0, we perform
* rehash on the bucket at `idx` (being more CPU cache friendly) */
_dictBucketRehash(d, idx);
} else {
/* If the hash entry is not in ht0, we rehash the buckets based
* on the rehashidx (not CPU cache friendly). */
_dictRehashStep(d);
}
}
/* Rehash the hash table if needed */
_dictRehashStepIfNeeded(d,idx);

/* Expand the hash table if needed */
_dictExpandIfNeeded(d);
@@ -1624,6 +1651,7 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing)
return bucket;
}


void dictEmpty(dict *d, void(callback)(dict*)) {
/* Someone may be monitoring a dict that started rehashing, before
* destroying the dict fake completion. */
@@ -1649,7 +1677,7 @@ uint64_t dictGetHash(dict *d, const void *key) {
* the hash value should be provided using dictGetHash.
* no string / key comparison is performed.
* return value is a pointer to the dictEntry if found, or NULL if not found. */
dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) {
dictEntry *dictFindByHashAndPtr(dict *d, const void *oldptr, const uint64_t hash) {
dictEntry *he;
unsigned long idx, table;

@@ -1831,6 +1859,32 @@ char *stringFromLongLong(long long value) {
return s;
}

char *stringFromSubstring(void) {
#define LARGE_STRING_SIZE 10000
#define MIN_STRING_SIZE 100
#define MAX_STRING_SIZE 500
static char largeString[LARGE_STRING_SIZE + 1];
static int init = 0;
if (init == 0) {
/* Generate a large string */
for (size_t i = 0; i < LARGE_STRING_SIZE; i++) {
/* Random printable ASCII character (33 to 126) */
largeString[i] = 33 + (rand() % 94);
}
/* Null-terminate the large string */
largeString[LARGE_STRING_SIZE] = '\0';
init = 1;
}
/* Randomly choose a size between minSize and maxSize */
size_t substringSize = MIN_STRING_SIZE + (rand() % (MAX_STRING_SIZE - MIN_STRING_SIZE + 1));
size_t startIndex = rand() % (LARGE_STRING_SIZE - substringSize + 1);
/* Allocate memory for the substring (+1 for null terminator) */
char *s = zmalloc(substringSize + 1);
memcpy(s, largeString + startIndex, substringSize); // Copy the substring
s[substringSize] = '\0'; // Null-terminate the string
return s;
}

dictType BenchmarkDictType = {
hashCallback,
NULL,
@@ -1853,6 +1907,8 @@ int dictTest(int argc, char **argv, int flags) {
long long start, elapsed;
int retval;
dict *dict = dictCreate(&BenchmarkDictType);
dictEntry* de = NULL;
dictEntry* existing = NULL;
long count = 0;
unsigned long new_dict_size, current_dict_used, remain_keys;
int accurate = (flags & REDIS_TEST_ACCURATE);
@@ -1992,13 +2048,99 @@ int dictTest(int argc, char **argv, int flags) {
dictEmpty(dict, NULL);
dictSetResizeEnabled(DICT_RESIZE_ENABLE);
}
srand(12345);
start_benchmark();
for (j = 0; j < count; j++) {
/* Create a dynamically allocated substring */
char *key = stringFromSubstring();

/* Insert the range directly from the large string */
de = dictAddRaw(dict, key, &existing);
assert(de != NULL || existing != NULL);
/* If key already exists NULL is returned so we need to free the temp key string */
if (de == NULL) zfree(key);
}
end_benchmark("Inserting random substrings (100-500B) from large string with symbols");
assert((long)dictSize(dict) <= count);
dictEmpty(dict, NULL);

start_benchmark();
for (j = 0; j < count; j++) {
retval = dictAdd(dict,stringFromLongLong(j),(void*)j);
assert(retval == DICT_OK);
}
end_benchmark("Inserting");
end_benchmark("Inserting via dictAdd() non existing");
assert((long)dictSize(dict) == count);

dictEmpty(dict, NULL);

start_benchmark();
for (j = 0; j < count; j++) {
de = dictAddRaw(dict,stringFromLongLong(j),NULL);
assert(de != NULL);
}
end_benchmark("Inserting via dictAddRaw() non existing");
assert((long)dictSize(dict) == count);

start_benchmark();
for (j = 0; j < count; j++) {
void *key = stringFromLongLong(j);
de = dictAddRaw(dict,key,&existing);
assert(existing != NULL);
zfree(key);
}
end_benchmark("Inserting via dictAddRaw() existing (no insertion)");
assert((long)dictSize(dict) == count);

dictEmpty(dict, NULL);

start_benchmark();
for (j = 0; j < count; j++) {
void *key = stringFromLongLong(j);
const uint64_t hash = dictGetHash(dict, key);
de = dictAddNonExistsByHash(dict,key,hash);
assert(de != NULL);
}
end_benchmark("Inserting via dictAddNonExistsByHash() non existing");
assert((long)dictSize(dict) == count);

/* Wait for rehashing. */
while (dictIsRehashing(dict)) {
dictRehashMicroseconds(dict,100*1000);
}

dictEmpty(dict, NULL);

start_benchmark();
for (j = 0; j < count; j++) {
/* Create a key */
void *key = stringFromLongLong(j);

/* Check if the key exists */
dictEntry *entry = dictFind(dict, key);
assert(entry == NULL);

/* Add the key */
dictEntry *de = dictAddRaw(dict, key, NULL);
assert(de != NULL);
}
end_benchmark("Find() and inserting via dictFind()+dictAddRaw() non existing");

dictEmpty(dict, NULL);

start_benchmark();
for (j = 0; j < count; j++) {
/* Create a key */
void *key = stringFromLongLong(j);
uint64_t hash = dictGetHash(dict, key);

/* Check if the key exists */
dictEntry *entry = dictFindByHash(dict, key, hash);
assert(entry == NULL);
de = dictAddNonExistsByHash(dict, key, hash);
assert(de != NULL);
}
end_benchmark("Find() and inserting via dictGetHash()+dictFindByHash()+dictAddNonExistsByHash() non existing");
assert((long)dictSize(dict) == count);

/* Wait for rehashing. */
4 changes: 3 additions & 1 deletion src/dict.h
@@ -196,6 +196,7 @@ int dictTryExpand(dict *d, unsigned long size);
int dictShrink(dict *d, unsigned long size);
int dictAdd(dict *d, void *key, void *val);
dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing);
dictEntry *dictAddNonExistsByHash(dict *d, void *key, const uint64_t hash);
void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing);
dictEntry *dictInsertAtPosition(dict *d, void *key, void *position);
dictEntry *dictAddOrFind(dict *d, void *key);
@@ -207,6 +208,8 @@ dictEntry *dictTwoPhaseUnlinkFind(dict *d, const void *key, dictEntry ***plink,
void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table_index);
void dictRelease(dict *d);
dictEntry * dictFind(dict *d, const void *key);
dictEntry *dictFindByHash(dict *d, const void *key, const uint64_t hash);
dictEntry *dictFindByHashAndPtr(dict *d, const void *oldptr, const uint64_t hash);
void *dictFetchValue(dict *d, const void *key);
int dictShrinkIfNeeded(dict *d);
int dictExpandIfNeeded(dict *d);
@@ -249,7 +252,6 @@ uint8_t *dictGetHashFunctionSeed(void);
unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata);
unsigned long dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata);
uint64_t dictGetHash(dict *d, const void *key);
dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash);
void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size);

size_t dictGetStatsMsg(char *buf, size_t bufsize, dictStats *stats, int full);
4 changes: 2 additions & 2 deletions src/kvstore.c
@@ -766,12 +766,12 @@ dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx)
return dictGetFairRandomKey(d);
}

dictEntry *kvstoreDictFindEntryByPtrAndHash(kvstore *kvs, int didx, const void *oldptr, uint64_t hash)
dictEntry *kvstoreDictFindByHashAndPtr(kvstore *kvs, int didx, const void *oldptr, uint64_t hash)
{
dict *d = kvstoreGetDict(kvs, didx);
if (!d)
return NULL;
return dictFindEntryByPtrAndHash(d, oldptr, hash);
return dictFindByHashAndPtr(d, oldptr, hash);
}

unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count)
2 changes: 1 addition & 1 deletion src/kvstore.h
@@ -73,7 +73,7 @@ void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_id);
dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di);
dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx);
dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx);
dictEntry *kvstoreDictFindEntryByPtrAndHash(kvstore *kvs, int didx, const void *oldptr, uint64_t hash);
dictEntry *kvstoreDictFindByHashAndPtr(kvstore *kvs, int didx, const void *oldptr, uint64_t hash);
unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count);
int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size);
unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata);
[Diff for the sixth changed file was not loaded on this page.]
