Add thread manager crate to agave #3890

Open
wants to merge 8 commits into base: master
287 changes: 244 additions & 43 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -198,6 +198,7 @@ members = [
"svm-transaction",
"test-validator",
"thin-client",
"thread-manager",
"timings",
"tls-utils",
"tokens",
@@ -446,6 +447,7 @@ solana-bucket-map = { path = "bucket_map", version = "=2.2.0" }
solana-builtins = { path = "builtins", version = "=2.2.0" }
solana-builtins-default-costs = { path = "builtins-default-costs", version = "=2.2.0" }
agave-cargo-registry = { path = "cargo-registry", version = "=2.2.0" }
agave-thread-manager = { path = "thread-manager", version = "=2.2.0" }
solana-clap-utils = { path = "clap-utils", version = "=2.2.0" }
solana-clap-v3-utils = { path = "clap-v3-utils", version = "=2.2.0" }
solana-cli = { path = "cli", version = "=2.2.0" }
31 changes: 31 additions & 0 deletions thread-manager/Cargo.toml
@@ -0,0 +1,31 @@
[package]
name = "agave-thread-manager"
description = "Thread pool manager for agave"

version = { workspace = true }
authors = { workspace = true }
repository = { workspace = true }
homepage = { workspace = true }
license = { workspace = true }
edition = { workspace = true }

publish = false

[dependencies]
anyhow = { workspace = true }
log = { workspace = true }
num_cpus = { workspace = true }
rayon = { workspace = true }
serde = { workspace = true, features = ["derive"] }
solana-metrics = { workspace = true }
thread-priority = "1.2.0"
tokio = { workspace = true, features = ["time", "rt-multi-thread"] }

[target.'cfg(target_os = "linux")'.dependencies]
affinity = "0.1.2"

[dev-dependencies]
axum = "0.7.9"
env_logger = { workspace = true }
serde_json = { workspace = true }
toml = { workspace = true }
43 changes: 43 additions & 0 deletions thread-manager/README.md
@@ -0,0 +1,43 @@
# thread-manager
Balances machine resources between multiple threaded runtimes. The purpose is to manage thread contention
between different parts of the code that may benefit from a diverse set of management options. For example,
we may want cores 1-4 handling networking via Tokio, core 5 handling file IO via Tokio, cores 9-16 allocated
to a Rayon thread pool, and cores 6-8 available for general use by std::thread. This minimizes the contention
for CPU caches and the context switches that would occur if Rayon were entirely unaware it was running
side-by-side with Tokio, and each were to spawn as many threads as there are cores.
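The oversubscription this avoids can be illustrated with a std-only sketch (no thread-manager APIs involved): each runtime that is unaware of its neighbors sizes its pool to the machine's full core count.

```rust
use std::thread;

fn main() {
    // A runtime that is unaware of its neighbors defaults its pool size
    // to the machine's full parallelism:
    let cores = thread::available_parallelism().map(|n| n.get()).unwrap_or(1);
    // Two such runtimes (e.g. Tokio + Rayon) together oversubscribe the CPU:
    let total_threads = 2 * cores;
    assert!(total_threads > cores);
    println!("cores={cores}, threads spawned by two unaware runtimes={total_threads}");
}
```

With more threads runnable than cores available, the OS scheduler time-slices them, evicting each other's cache lines on every switch; pinning each runtime to a disjoint core set avoids that.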

# Supported threading models
## Tokio
Multiple Tokio runtimes can be created, and each may be assigned its own pool of CPU cores to run on.
The numbers of worker and blocking threads are configurable, as are thread priorities for the pool.

## Native
Native threads (std::thread) can be spawned from managed pools. This allows them to inherit a particular
affinity from the pool, as well as to cap the total number of threads created in every pool.
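The real pool API lives in the crate itself; as a hypothetical std-only sketch of the capping behavior (`CappedPool` and its methods are invented here for illustration):

```rust
use std::{
    sync::{
        atomic::{AtomicUsize, Ordering},
        Arc,
    },
    thread,
    time::Duration,
};

// Hypothetical sketch: a pool that refuses to spawn past a fixed limit,
// mirroring how a managed pool caps its total thread count.
struct CappedPool {
    limit: usize,
    live: Arc<AtomicUsize>,
}

impl CappedPool {
    fn new(limit: usize) -> Self {
        Self { limit, live: Arc::new(AtomicUsize::new(0)) }
    }

    fn spawn<F: FnOnce() + Send + 'static>(&self, f: F) -> Option<thread::JoinHandle<()>> {
        // Reserve a slot; back out if the pool is already full.
        if self.live.fetch_add(1, Ordering::SeqCst) >= self.limit {
            self.live.fetch_sub(1, Ordering::SeqCst);
            return None;
        }
        let live = Arc::clone(&self.live);
        Some(thread::spawn(move || {
            f();
            live.fetch_sub(1, Ordering::SeqCst);
        }))
    }
}

fn main() {
    let pool = CappedPool::new(2);
    let a = pool.spawn(|| thread::sleep(Duration::from_millis(50)));
    let b = pool.spawn(|| thread::sleep(Duration::from_millis(50)));
    let c = pool.spawn(|| {}); // third spawn exceeds the cap
    println!("{} {} {}", a.is_some(), b.is_some(), c.is_some()); // true true false
}
```

The crate's managed pools additionally apply the configured affinity and priority inside the spawned thread before running the user closure.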

## Rayon
Rayon already manages thread pools well enough; all thread-manager adds on top is enforcing affinity and
priority for Rayon threads. Normally one would only ever have one Rayon pool, but for priority allocations
one may want to spawn several.

# Limitations

* Thread pools can only be created at process startup
* Once a thread pool is created, its policy cannot be modified at runtime
* Thread affinity is not supported outside of Linux

# TODO:

* support tracing
* better metrics integration
* proper error handling everywhere
* even more tests


# Examples
All examples need `wrk` for workload generation. Please install it before running.

* core_contention_basics demonstrates why core contention is bad, and how thread configs can help
* core_contention_sweep sweeps across a range of core counts to show how the benefits scale
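Assuming the standard Cargo example layout (example names match the files in examples/), the benchmarks can be invoked as:

```shell
# From the thread-manager crate directory; wrk must be on PATH first.
cargo run --release --example core_contention_basics
cargo run --release --example core_contention_sweep
```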
137 changes: 137 additions & 0 deletions thread-manager/examples/core_contention_basics.rs
@@ -0,0 +1,137 @@
use {
agave_thread_manager::*,
log::{debug, info},
std::{
future::IntoFuture,
io::{Read, Write},
net::{IpAddr, Ipv4Addr, SocketAddr},
path::PathBuf,
time::Duration,
},
};

async fn axum_main(port: u16) {
use axum::{routing::get, Router};

// basic handler that responds with a static string
async fn root() -> &'static str {
tokio::time::sleep(Duration::from_millis(1)).await;
"Hello, World!"
}

// build our application with a route
let app = Router::new().route("/", get(root));

    // run our app with hyper, listening globally on the given port
let listener =
tokio::net::TcpListener::bind(SocketAddr::new(IpAddr::V4(Ipv4Addr::UNSPECIFIED), port))
.await
.unwrap();
let timeout = tokio::time::timeout(
Duration::from_secs(11),
axum::serve(listener, app).into_future(),
)
.await;
match timeout {
Ok(v) => v.unwrap(),
Err(_) => {
info!("Terminating server on port {port}");
}
}
}

fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
let experiments = [
"examples/core_contention_dedicated_set.toml",
"examples/core_contention_contending_set.toml",
];

for exp in experiments {
info!("===================");
info!("Running {exp}");
let mut conf_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
conf_file.push(exp);
let mut buf = String::new();
std::fs::File::open(conf_file)?.read_to_string(&mut buf)?;
let cfg: RuntimeManagerConfig = toml::from_str(&buf)?;

let manager = ThreadManager::new(cfg).unwrap();
let tokio1 = manager
.get_tokio("axum1")
.expect("Expecting runtime named axum1");
tokio1.start_metrics_sampling(Duration::from_secs(1));
let tokio2 = manager
.get_tokio("axum2")
.expect("Expecting runtime named axum2");
tokio2.start_metrics_sampling(Duration::from_secs(1));

let wrk_cores: Vec<_> = (32..64).collect();
let results = std::thread::scope(|scope| {
scope.spawn(|| {
tokio1.tokio.block_on(axum_main(8888));
});
scope.spawn(|| {
tokio2.tokio.block_on(axum_main(8889));
});
let join_handle =
scope.spawn(|| run_wrk(&[8888, 8889], &wrk_cores, wrk_cores.len(), 1000).unwrap());
join_handle.join().expect("WRK crashed!")
});
//print out the results of the bench run
println!("Results are: {:?}", results);
}
Ok(())
}

fn run_wrk(
ports: &[u16],
cpus: &[usize],
threads: usize,
connections: usize,
) -> anyhow::Result<(Vec<Duration>, Vec<f32>)> {
let mut script = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
script.push("examples/report.lua");
let cpus: Vec<String> = cpus.iter().map(|c| c.to_string()).collect();
let cpus = cpus.join(",");

let mut children: Vec<_> = ports
.iter()
.map(|p| {
std::process::Command::new("taskset")
.arg("-c")
.arg(&cpus)
.arg("wrk")
.arg(format!("http://localhost:{}", p))
.arg("-d10")
.arg(format!("-s{}", script.to_str().unwrap()))
.arg(format!("-t{threads}"))
.arg(format!("-c{connections}"))
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.unwrap()
})
.collect();

use std::str;
let outs = children.drain(..).map(|c| c.wait_with_output().unwrap());
let mut all_latencies = vec![];
let mut all_rps = vec![];
for (out, port) in outs.zip(ports.iter()) {
debug!("=========================");
std::io::stdout().write_all(&out.stderr)?;
let res = str::from_utf8(&out.stdout)?;
let mut res = res.lines().last().unwrap().split(' ');

let latency_us: u64 = res.next().unwrap().parse()?;
let latency = Duration::from_micros(latency_us);

let requests: usize = res.next().unwrap().parse()?;
let rps = requests as f32 / 10.0;
debug!("WRK results for port {port}: {latency:?} {rps}");
        all_latencies.push(latency);
all_rps.push(rps);
}
Ok((all_latencies, all_rps))
}
13 changes: 13 additions & 0 deletions thread-manager/examples/core_contention_contending_set.toml
@@ -0,0 +1,13 @@
[native_configs]

[rayon_configs]

[tokio_configs.axum1]
worker_threads = 8
max_blocking_threads = 1
core_allocation.DedicatedCoreSet = { min = 0, max = 8 }

[tokio_configs.axum2]
worker_threads = 8
max_blocking_threads = 1
core_allocation.DedicatedCoreSet = { min = 0, max = 8 }
13 changes: 13 additions & 0 deletions thread-manager/examples/core_contention_dedicated_set.toml
@@ -0,0 +1,13 @@
[native_configs]

[rayon_configs]

[tokio_configs.axum1]
worker_threads = 4
max_blocking_threads = 1
core_allocation.DedicatedCoreSet = { min = 0, max = 4 }

[tokio_configs.axum2]
worker_threads = 4
max_blocking_threads = 1
core_allocation.DedicatedCoreSet = { min = 4, max = 8 }