Numa benchmarking code

marius-team · Sep 11, 2024 · 08af4bb · 08af4bb
1 parent 46954c2
commit 08af4bb
Show file tree

Hide file tree

Showing 5 changed files with 147 additions and 4 deletions.
diff --git a/.bash_history b/.bash_history
@@ -0,0 +1,25 @@
+ls
+pwd
+cd /root/
+ls
+numact --hardware
+apt install -y numactl
+numactl --hardware
+apt-get install -y libnuma-dev
+ls
+cd numa_benchmarking/
+g++ -o numa_test_runner numa_test.cpp -lnuma -std=c++9
+g++ -o numa_test_runner numa_test.cpp -lnuma -std=c++11
+ls
+./numa_test_runner 
+exit
+exit
+ls
+clear
+cd numa_benchmarking/
+g++ -o numa_test_runner num_test.cpp -lnuma
+ls
+g++ -o numa_test_runner numa_test.cpp -lnuma
+chmod ugo+x ./numa_test_runner 
+./numa_test_runner 
+exit
diff --git a/.gitconfig b/.gitconfig
@@ -0,0 +1,2 @@
+[safe]
+	directory = *
diff --git a/examples/docker/cpu_ubuntu/dockerfile b/examples/docker/cpu_ubuntu/dockerfile
@@ -1,5 +1,5 @@
 FROM ubuntu:22.04
-RUN apt update
+RUN apt-get update && apt update -y && apt upgrade -y
 
 RUN apt install -y g++ \
          make \
@@ -8,7 +8,8 @@ RUN apt install -y g++ \
          vim \
          git \
          dstat \
-         python3-pip
+         numactl \
+         python3-pip --fix-missing
 
 # install gcc-9
 RUN apt install -y software-properties-common
@@ -27,5 +28,7 @@ RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
 # install pytorch
 RUN python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
 
-RUN mkdir /working_dir
-WORKDIR /working_dir
+RUN mkdir -p /root
+WORKDIR /root
+
+RUN apt-get install -y libnuma-dev
diff --git a/numa_benchmarking/CMakeLists.txt b/numa_benchmarking/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Build script for the NUMA scan benchmark (main.cpp).
+# Fetches SimSIMD, locates libnuma through pkg-config and builds `main`.
+cmake_minimum_required(VERSION 3.20)
+project(SimSIMDExample)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# A benchmark built without optimisation measures the compiler, not the memory
+# system. Default to Release when the user did not choose a build type
+# (single-config generators only; multi-config generators pick it at build time).
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+endif()
+
+include(FetchContent)
+
+# Declare and fetch SimSIMD
+# NOTE(review): GIT_TAG main tracks a moving branch, so builds are not
+# reproducible; consider pinning a release tag or commit hash.
+FetchContent_Declare(
+    simsimd
+    GIT_REPOSITORY https://github.com/ashvardanian/simsimd.git
+    GIT_SHALLOW TRUE
+    GIT_TAG main
+)
+FetchContent_MakeAvailable(simsimd)
+
+# Find libnuma
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(NUMA REQUIRED numa)
+
+# Add your executable
+add_executable(main main.cpp)
+
+# Link SimSIMD and libnuma to your executable
+target_link_libraries(main PRIVATE simsimd ${NUMA_LIBRARIES})
+
+# Include SimSIMD and libnuma headers
+target_include_directories(main PRIVATE 
+    ${simsimd_SOURCE_DIR}/include
+    ${NUMA_INCLUDE_DIRS}
+)
+
+# Add compile options for libnuma
+target_compile_options(main PRIVATE ${NUMA_CFLAGS_OTHER})
+
+# Add preprocessor definitions to handle missing types
+target_compile_definitions(main PRIVATE
+    SIMSIMD_NATIVE_F16=0
+    SIMSIMD_NATIVE_BF16=0
+)
diff --git a/numa_benchmarking/main.cpp b/numa_benchmarking/main.cpp
@@ -0,0 +1,72 @@
+#include <iostream>
+#include <chrono>
+#include <cstring>
+#include <numa.h>
+#include <simsimd/simsimd.h>
+
+
+// Benchmark configuration constants shared by benchmark_scan_list and main
+const int vector_size = 128; // number of float elements in one vector
+const size_t ONE_GB = 1024 * 1024 * 1024;  // 2^30 bytes; the product is evaluated in int before widening
+const float NUM_CHUNKS = 10; // buffer size in GB; also printed verbatim in main's banner
+const size_t BUFFER_SIZE = NUM_CHUNKS * ONE_GB; // Bytes allocated per NUMA node (10 GB); float product converted to size_t
+
+/**
+ * Scans a contiguous list of vectors, computing the squared L2 distance from
+ * the query to each one, then prints the average distance and the throughput.
+ *
+ * @param query_vec       Query vector holding vector_size floats.
+ * @param search_vectors  Buffer of num_vectors consecutive vectors, each
+ *                        vector_size floats long.
+ * @param num_vectors     Number of vectors to scan.
+ * @param benchmark_name  Label printed with the result line.
+ */
+void benchmark_scan_list(float* query_vec, float* search_vectors, size_t num_vectors, std::string benchmark_name) {
+    // Accumulate in double: a float running sum over tens of millions of
+    // distances loses precision and eventually stops growing.
+    double total_distance = 0.0;
+    double dist_result = 0.0;
+
+    // Run the actual benchmark
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t j = 0; j < num_vectors; j++) {
+        float* curr_search_vec = search_vectors + j * vector_size;
+        simsimd_l2sq_f32(query_vec, curr_search_vec, vector_size, &dist_result);
+        total_distance += dist_result;
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+
+    // Keep fractional milliseconds; truncating to an integer count turns short
+    // runs into a division by zero and skews the throughput of longer ones.
+    const double time_taken = std::chrono::duration<double, std::milli>(end - start).count();
+
+    // Guard both divisions so an empty scan or a zero-length interval reports 0
+    // instead of NaN/inf.
+    const double average_distance = num_vectors > 0 ? total_distance / num_vectors : 0.0;
+    const double throughput = time_taken > 0.0 ? num_vectors / time_taken : 0.0;
+
+    std::cout << "Benchmark " << benchmark_name << ": Got average distance of " << average_distance << " at throughput of " << throughput << " vectors/ms" << std::endl;
+}
+
+/**
+ * Compares scan throughput over a buffer allocated on NUMA node 1 against an
+ * identical buffer on node 0, with the scanning thread asked to run on node 0.
+ *
+ * @return 0 on success; 1 if NUMA is unavailable, fewer than two nodes exist,
+ *         or either buffer cannot be allocated.
+ */
+int main() {
+    if (numa_available() < 0) {
+        std::cerr << "NUMA is not available on this system" << std::endl;
+        return 1;
+    }
+
+    // Ensure we have at least 2 NUMA nodes
+    if (numa_max_node() < 1) {
+        std::cerr << "This system doesn't have at least 2 NUMA nodes" << std::endl;
+        return 1;
+    }
+
+    // Best effort: a failure is reported but the run continues, in which case
+    // the "Node0"/"Node1" labels no longer say which scan was the remote one.
+    if(numa_run_on_node(0) != 0) {
+        std::cerr << "Failed to force worker to run on node 0" << std::endl;
+    }
+
+    // Initialize the query vector (it was previously read uninitialised)
+    float query_vector[vector_size];
+    for (int i = 0; i < vector_size; i++) {
+        query_vector[i] = 1.0f;
+    }
+
+    // Create the target vector on both nodes
+    float* node_zero_vectors = static_cast<float*>(numa_alloc_onnode(BUFFER_SIZE, 0));
+    float* node_one_vectors = static_cast<float*>(numa_alloc_onnode(BUFFER_SIZE, 1));
+    if (!node_zero_vectors || !node_one_vectors) {
+        std::cerr << "Failed to allocate memory on numa nodes" << std::endl;
+        // Only release what was actually allocated; numa_free on a null pointer
+        // would try to unmap the address range starting at 0.
+        if (node_zero_vectors) {
+            numa_free(node_zero_vectors, BUFFER_SIZE);
+        }
+        if (node_one_vectors) {
+            numa_free(node_one_vectors, BUFFER_SIZE);
+        }
+        return 1;
+    }
+
+    // Write to every page before timing. Physical pages are only placed on a
+    // node when first written; an untouched buffer is served from the shared
+    // zero page, so the scan would not measure node-local vs remote memory.
+    std::memset(node_zero_vectors, 0, BUFFER_SIZE);
+    std::memset(node_one_vectors, 0, BUFFER_SIZE);
+
+    size_t single_vector_size = vector_size * sizeof(float);
+    size_t num_vectors = BUFFER_SIZE/single_vector_size;
+
+    // Run the cross node benchmark
+    std::cout << "Running benchmarking for buffer size of " << NUM_CHUNKS << " GB" << std::endl;
+    benchmark_scan_list(query_vector, node_one_vectors, num_vectors, "Node1 Vectors");
+    numa_free(node_one_vectors, BUFFER_SIZE);
+
+    // Run the same node benchmark
+    benchmark_scan_list(query_vector, node_zero_vectors, num_vectors, "Node0 Vectors");
+    numa_free(node_zero_vectors, BUFFER_SIZE);
+
+    return 0;
+}