Introduce thread-local storage variable to update thread's own used_memory and sum when reading to reduce atomic contention. (#674)

#### Description 
This patch introduces a thread-local storage (TLS) variable for each
thread to track its own `used_memory`, and sums the per-thread values
when reading in `zmalloc_used_memory`. This removes unnecessary `lock
add` contention on the shared atomic variable. As a protection, if so
many threads are created that the total exceeds 132, any thread with
index >= 132 falls back to the atomic operation.
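
In short, the change works as in the following condensed sketch (distilled from the diff below, which remains authoritative; `stat_alloc` and `used_memory_total` are abbreviated names used here for illustration):

```c
#include <stdatomic.h>
#include <stddef.h>

#define MAX_THREADS_NUM 132 /* IO_THREADS_MAX_NUM (128) + 3 + 1 in the patch */

static _Thread_local int thread_index = -1;        /* this thread's slot */
static size_t used_memory_thread[MAX_THREADS_NUM]; /* one writer per slot */
static atomic_int total_active_threads;
static atomic_size_t used_memory_for_additional_threads; /* overflow path */

static inline void stat_alloc(size_t n) {
    if (thread_index == -1) /* first allocation on this thread: claim a slot */
        thread_index = atomic_fetch_add_explicit(&total_active_threads, 1,
                                                 memory_order_relaxed);
    if (thread_index >= MAX_THREADS_NUM) /* too many threads: atomic fallback */
        atomic_fetch_add_explicit(&used_memory_for_additional_threads, n,
                                  memory_order_relaxed);
    else
        used_memory_thread[thread_index] += n; /* plain add, no lock prefix */
}

static size_t used_memory_total(void) { /* reader sums all slots */
    size_t um = atomic_load_explicit(&used_memory_for_additional_threads,
                                     memory_order_relaxed);
    int n = atomic_load_explicit(&total_active_threads, memory_order_relaxed);
    if (n > MAX_THREADS_NUM) n = MAX_THREADS_NUM;
    for (int i = 0; i < n; i++) um += used_memory_thread[i];
    return um;
}
```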

#### Problem Statement
The `zmalloc` and `zfree` family of functions updates `used_memory`
atomically on every operation, and these functions are called very
frequently. In the benchmark
[memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml),
the cycle share of `zmalloc` and `zfree` is high; since they are thin
wrappers around the underlying allocator library, they should not
consume so many cycles. Most of those cycles come from `lock add` and
`lock sub`, which are expensive instructions. Profiling shows the
metric updates come mainly from the main thread, so a TLS variable
removes most of the contention.
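
For context, a standard `perf` session of the kind that surfaces these hotspots looks roughly like this (illustrative commands, not the exact methodology behind the numbers above):

```sh
# Sample on-CPU cycles of the running server with call graphs for ~30s.
perf record -g -p "$(pidof valkey-server)" -- sleep 30
# Rank symbols by cycles; zmalloc/zfree rank unusually high when contended.
perf report
# Drill into the hot instructions (the lock add / lock sub) inside zmalloc.
perf annotate zmalloc
```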

#### Performance Boost

**Note:** This optimization should benefit common benchmarks broadly. I
chose the two scenarios below to validate the performance boost in my
local environment.

| Test Suites | Performance Boost |
|-|-|
|[memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml)|8%|
|[memtier_benchmark-1Mkeys-load-string-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-string-with-100B-values-pipeline-10.yml)|4%|

##### Test Env
- OS: Ubuntu 22.04.4 LTS
- Platform: Intel Xeon Platinum 8380
- Server and client pinned to the same socket

##### Start Server 
```sh
taskset -c 0-3 ~/valkey/src/valkey-server /tmp/valkey_1.conf

# Contents of /tmp/valkey_1.conf:
port 9001
bind * -::*
daemonize yes
protected-mode no
save ""
```
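
For reference, a client invocation in the spirit of the pipelined suites above might look like the following, pinned to cores on the same socket as the server (parameters are illustrative; the linked YAML files define the exact workloads):

```sh
taskset -c 4-7 memtier_benchmark -s 127.0.0.1 -p 9001 \
    --pipeline=10 --data-size=100 --key-maximum=1000000 \
    --ratio=1:0 --threads=4 --clients=50
```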

---------

Signed-off-by: Lipeng Zhu <[email protected]>
Co-authored-by: Wangyang Guo <[email protected]>
lipzhu and guowangy authored Jul 2, 2024
1 parent 0cc16d0 commit 3323e42
Showing 3 changed files with 65 additions and 13 deletions.
10 changes: 10 additions & 0 deletions src/config.h
@@ -329,4 +329,14 @@ void setcpuaffinity(const char *cpulist);
#define HAVE_FADVISE
#endif

#define IO_THREADS_MAX_NUM 128

#ifndef CACHE_LINE_SIZE
#if defined(__aarch64__) && defined(__APPLE__)
#define CACHE_LINE_SIZE 128
#else
#define CACHE_LINE_SIZE 64
#endif
#endif

#endif
9 changes: 0 additions & 9 deletions src/networking.c
@@ -4222,15 +4222,6 @@ void processEventsWhileBlocked(void) {
* Threaded I/O
* ========================================================================== */

#define IO_THREADS_MAX_NUM 128
#ifndef CACHE_LINE_SIZE
#if defined(__aarch64__) && defined(__APPLE__)
#define CACHE_LINE_SIZE 128
#else
#define CACHE_LINE_SIZE 64
#endif
#endif

typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) threads_pending {
_Atomic unsigned long value;
} threads_pending;
59 changes: 55 additions & 4 deletions src/zmalloc.c
@@ -88,10 +88,53 @@ void zlibc_free(void *ptr) {
#define dallocx(ptr, flags) je_dallocx(ptr, flags)
#endif

#define update_zmalloc_stat_alloc(__n) atomic_fetch_add_explicit(&used_memory, (__n), memory_order_relaxed)
#define update_zmalloc_stat_free(__n) atomic_fetch_sub_explicit(&used_memory, (__n), memory_order_relaxed)
#if __STDC_NO_THREADS__
#define thread_local __thread
#else
#include <threads.h>
#endif

#define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1)
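/* Presumably sized for the IO threads (128) plus the background bio threads (3)
 * plus the main thread (1). */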
/* Thread-local storage that keeps the current thread's index into the used_memory_thread array. */
static thread_local int thread_index = -1;
/* Each element of the used_memory_thread array should only be written by a single thread,
 * identified by the thread-local thread_index. However, an element may be read by another
 * thread while it is being written, and on non-x86 architectures the reader could observe
 * a torn (inconsistent) value.
 * On ARM and PowerPC we can avoid this by keeping the memory properly aligned.
 * On other architectures, fall back to atomic operations to stay safe. */
#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || defined(__POWERPC__) || defined(__arm__) || \
defined(__arm64__)
static __attribute__((aligned(sizeof(size_t)))) size_t used_memory_thread[MAX_THREADS_NUM];
#else
static _Atomic size_t used_memory_thread[MAX_THREADS_NUM];
#endif
static atomic_int total_active_threads = 0;
/* This is a simple protection. It's used only if some modules create a lot of threads. */
static atomic_size_t used_memory_for_additional_threads = 0;

/* Register the thread index in start_routine. */
static inline void zmalloc_register_thread_index(void) {
thread_index = atomic_fetch_add_explicit(&total_active_threads, 1, memory_order_relaxed);
}

static inline void update_zmalloc_stat_alloc(size_t size) {
if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
if (unlikely(thread_index >= MAX_THREADS_NUM)) {
atomic_fetch_add_explicit(&used_memory_for_additional_threads, size, memory_order_relaxed);
} else {
used_memory_thread[thread_index] += size;
}
}

static _Atomic size_t used_memory = 0;
static inline void update_zmalloc_stat_free(size_t size) {
if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
if (unlikely(thread_index >= MAX_THREADS_NUM)) {
atomic_fetch_sub_explicit(&used_memory_for_additional_threads, size, memory_order_relaxed);
} else {
used_memory_thread[thread_index] -= size;
}
}

static void zmalloc_default_oom(size_t size) {
fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n", size);
@@ -415,7 +458,15 @@ char *zstrdup(const char *s) {
}

size_t zmalloc_used_memory(void) {
size_t um = atomic_load_explicit(&used_memory, memory_order_relaxed);
size_t um = 0;
int threads_num = total_active_threads;
if (unlikely(total_active_threads > MAX_THREADS_NUM)) {
um += atomic_load_explicit(&used_memory_for_additional_threads, memory_order_relaxed);
threads_num = MAX_THREADS_NUM;
}
for (int i = 0; i < threads_num; i++) {
um += used_memory_thread[i];
}
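/* Best-effort snapshot: slots can change while being summed, matching the
 * relaxed-ordering guarantees of the previous single atomic counter. */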
return um;
}

