Introduce thread-local storage variable to update thread's own used_memory and sum when reading to reduce atomic contention. #674

Merged · 9 commits · Jul 2, 2024
11 changes: 11 additions & 0 deletions src/config.h
@@ -329,4 +329,15 @@ void setcpuaffinity(const char *cpulist);
#define HAVE_FADVISE
#endif

#define IO_THREADS_MAX_NUM 128
#define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1)
Contributor:

(Minor) CACHE_LINE_SIZE is good in this file, but IO_THREADS_MAX_NUM is a little misplaced here, IMHO.

Now that zmalloc has a protection against too many threads, can we just define a limit within zmalloc.c that is independent of IO threads? That's my preference, but I will not insist if others disagree.
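A minimal sketch of that decoupling (the name and value here are illustrative, not from this PR):

/* Hypothetical zmalloc-internal cap, independent of IO_THREADS_MAX_NUM.
 * Any value comfortably above the number of long-lived server threads works. */
#define ZMALLOC_MAX_THREADS_NUM 132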


#ifndef CACHE_LINE_SIZE
#if defined(__aarch64__) && defined(__APPLE__)
#define CACHE_LINE_SIZE 128
#else
#define CACHE_LINE_SIZE 64
#endif
#endif

#endif
9 changes: 0 additions & 9 deletions src/networking.c
@@ -4204,15 +4204,6 @@ void processEventsWhileBlocked(void) {
* Threaded I/O
* ========================================================================== */

#define IO_THREADS_MAX_NUM 128
#ifndef CACHE_LINE_SIZE
#if defined(__aarch64__) && defined(__APPLE__)
#define CACHE_LINE_SIZE 128
#else
#define CACHE_LINE_SIZE 64
#endif
#endif

typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) threads_pending {
_Atomic unsigned long value;
} threads_pending;
46 changes: 42 additions & 4 deletions src/zmalloc.c
@@ -88,10 +88,44 @@ void zlibc_free(void *ptr) {
#define dallocx(ptr, flags) je_dallocx(ptr, flags)
#endif

#define update_zmalloc_stat_alloc(__n) atomic_fetch_add_explicit(&used_memory, (__n), memory_order_relaxed)
#define update_zmalloc_stat_free(__n) atomic_fetch_sub_explicit(&used_memory, (__n), memory_order_relaxed)
#if __STDC_NO_THREADS__
#define thread_local __thread
#else
#include <threads.h>
#endif

/* A thread-local variable that keeps the current thread's index in the used_memory_thread array. */
static thread_local int thread_index = -1;
/* Each element of the used_memory_thread array should be written only by a single
 * thread, identified by the thread-local thread_index. However, while an element is
 * being written it may be read simultaneously by another thread, and on non-x86
 * architectures the reader could observe inconsistent memory. On ARM and PowerPC we
 * can solve this by keeping the memory aligned; on other architectures we fall back
 * to atomic operations to stay safe. */
#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || defined(__POWERPC__) || defined(__arm__) || \
defined(__arm64__)
static __attribute__((aligned(sizeof(size_t)))) size_t used_memory_thread[MAX_THREADS_NUM];
#else
static _Atomic size_t used_memory_thread[MAX_THREADS_NUM];
#endif
static atomic_int total_active_threads;

static _Atomic size_t used_memory = 0;
/* Register the thread index in start_routine. */
static inline void zmalloc_register_thread_index(void) {
thread_index = atomic_fetch_add_explicit(&total_active_threads, 1, memory_order_relaxed);
Contributor:

Are you assuming threads are launched once and never terminated and relaunched? Because otherwise this can easily reach the max over time.

Contributor:

Good point. It's not very good to assume that.

I wonder if we can let this index overflow to zero. Worst case, if two threads use the same index, the implementation is still correct, right?

If we can do that, then zmalloc doesn't need to know the server's exact max number of threads. I'd prefer to avoid that coupling.

Member:

> Good point. It's not very good to assume that.

+1.

I don't think the engine dynamically creates/destroys threads today. That said, agreed that we could start doing that in the future, so there is indeed a risk. Moreover, I wonder if this assumption could already be broken by a module allocating/freeing memory from dynamically created, short-lived threads.

Contributor:

What about HelloBlock_ValkeyCommand (I didn't look at its logic, TBH)? In general, what about modules?

Contributor:

BTW, my own preference is to wrap the thread-create function and plug any initialization we want into it. I can imagine other uses for that. I guess you discussed this and I'm late to the party.

Contributor (zuiderkwast, Jun 28, 2024):

OK, I understand now why two threads can't add to the same variable: add is not atomic. It is a read-modify-write. A simple read or write of one word is atomic, which is why we can avoid _Atomic when we have only one writer. (@lipzhu explained this to me in a chat.)
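For readers following along, here is a minimal standalone sketch (not part of this PR) of the lost-update race that makes += unsafe with two writers: each += is a separate load, add, and store, so concurrent increments can overwrite each other.

#include <pthread.h>
#include <stdio.h>

static size_t counter = 0; /* plain, non-atomic */

static void *worker(void *arg) {
    (void)arg;
    for (int i = 0; i < 1000000; i++) counter += 1; /* read-modify-write: racy */
    return NULL;
}

int main(void) {
    pthread_t t1, t2;
    pthread_create(&t1, NULL, worker, NULL);
    pthread_create(&t2, NULL, worker, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    printf("%zu\n", counter); /* often prints less than 2000000 */
    return 0;
}

With a single writer per array slot, as in this PR, the read-modify-write stays confined to one thread, and a concurrent reader only needs an aligned word-sized load.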

Contributor:

Another idea, a simple protection: if the thread index exceeds MAX_THREADS_NUM, just use one special shared atomic variable for those threads, something like _Atomic size_t used_memory_for_additional_threads. These last threads will not benefit from the optimization, but it's a protection, used only if some modules create a lot of threads.
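A rough sketch of that protection, reusing the PR's thread_index and used_memory_thread names (the shared fallback counter is hypothetical):

static _Atomic size_t used_memory_for_additional_threads;

static inline void update_zmalloc_stat_alloc(size_t size) {
    if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
    if (unlikely(thread_index >= MAX_THREADS_NUM)) {
        /* Overflow threads share one atomic counter: still correct, just slower. */
        atomic_fetch_add_explicit(&used_memory_for_additional_threads, size, memory_order_relaxed);
    } else {
        used_memory_thread[thread_index] += size;
    }
}

zmalloc_used_memory() would then add used_memory_for_additional_threads to the per-thread sum.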

Member:

So a module, or a future dynamic io-threads implementation that spawns and kills threads all the time, will converge to using the atomic? We will probably need to address this limitation in future work then.
I would also ask: why is it so bad to keep a free-IDs list and guard it with a mutex when we allocate a thread ID? It's a once-per-thread-lifecycle operation, so I'm not sure it would cause much performance impact.
For deregistration when a thread is destroyed, I would suggest using pthread_key_t so we can assign a destructor that returns the ID.

Contributor:

Yes, the module threads will use atomic in that case, but the main thread and IO threads use their own variable if they're spawned first and stay alive. We can improve this when/if we see a problem with modules. (Lipeng's idea is to keep a bitmap to flag each entry in the array as used or free.)
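For reference, that bitmap could look roughly like this (names and layout are illustrative, not from the PR); note that before freeing a slot, the departing thread's residual count would have to be folded into some shared counter so the total stays correct:

#include <pthread.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned char slot_used[(MAX_THREADS_NUM + 7) / 8]; /* one bit per slot */

static int zmalloc_acquire_thread_index(void) {
    pthread_mutex_lock(&slot_lock);
    for (int i = 0; i < MAX_THREADS_NUM; i++) {
        if (!(slot_used[i / 8] & (1 << (i % 8)))) {
            slot_used[i / 8] |= (1 << (i % 8));
            pthread_mutex_unlock(&slot_lock);
            return i;
        }
    }
    pthread_mutex_unlock(&slot_lock);
    return -1; /* no free slot: caller can fall back to a shared atomic counter */
}

static void zmalloc_release_thread_index(int i) {
    pthread_mutex_lock(&slot_lock);
    slot_used[i / 8] &= ~(1 << (i % 8));
    pthread_mutex_unlock(&slot_lock);
}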

Member:

I think this is fine. Server threads are created early and are long-lived. I don't think they will ever use the overflow atomic counter.

/* TODO: Handle the case when we exceed MAX_THREADS_NUM (may rarely happen). */
assert(total_active_threads < MAX_THREADS_NUM);
}

static inline void update_zmalloc_stat_alloc(size_t size) {
if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
used_memory_thread[thread_index] += size;
}

static inline void update_zmalloc_stat_free(size_t size) {
if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
used_memory_thread[thread_index] -= size;
}

static void zmalloc_default_oom(size_t size) {
fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n", size);
@@ -415,7 +449,11 @@ char *zstrdup(const char *s) {
}

size_t zmalloc_used_memory(void) {
size_t um = atomic_load_explicit(&used_memory, memory_order_relaxed);
assert(total_active_threads < MAX_THREADS_NUM);
size_t um = 0;
for (int i = 0; i < total_active_threads; i++) {
um += used_memory_thread[i];
}
Member:

I am not sure I get this part. Without atomic operations on both readers and writers, I think we will get stale data?

Contributor (author):

@PingXie Yes, the worst case is that the reading thread reads while the writing thread is updating its thread-local variable, but the data is not lost, just stale.

Member:

That makes sense. So we are essentially saying/assuming the staleness is (tightly) bounded, which I think is reasonable.

return um;
}
