Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve CPU performance by 50x #4

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
14 changes: 14 additions & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,20 @@ pub fn build(b: *std.Build) !void {
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);

const tests = b.addTest(.{
.name = "slimytest",
.root_source_file = b.path("src/main.zig"),
.target = target,
.optimize = optimize,
.single_threaded = singlethread,
.strip = strip,
});
tests.linkLibC();

const run_tests = b.addRunArtifact(tests);
const test_step = b.step("test", "Run unit tests");
test_step.dependOn(&run_tests.step);

const wasm = b.addSharedLibrary(.{
.name = "slimy",
.root_source_file = b.path("src/web.zig"),
Expand Down
15 changes: 4 additions & 11 deletions src/bench.zig
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fn mainInternal() !void {
// - check that slimy is generating the expected results
// - get a rough estimate of how fast things are running, to base later benchmarks on

var collector = Collector{};
var collector: Collector = .{};
var params = warmup_params;
var timer = try std.time.Timer.start();

Expand Down Expand Up @@ -100,7 +100,7 @@ fn formatIntGrouped(
}

const test_seed: i64 = -2152535657050944081;
const warmup_params = slimy.SearchParams{
const warmup_params: slimy.SearchParams = .{
.world_seed = test_seed,
.threshold = 39,

Expand All @@ -111,7 +111,7 @@ const warmup_params = slimy.SearchParams{

.method = undefined,
};
const warmup_results = &[_]slimy.Result{
const warmup_results: []const slimy.Result = &.{
.{ .x = 949, .z = -923, .count = 43 },
.{ .x = 950, .z = -924, .count = 42 },
.{ .x = 245, .z = 481, .count = 40 },
Expand Down Expand Up @@ -183,18 +183,11 @@ const Collector = struct {
}

/// Compares the results collected so far against `expected`.
///
/// The collected results (`self.buf[0..self.n]`) are sorted in place with
/// `slimy.Result.sortLessThan` before comparison; `expected` must already be
/// sorted with the same ordering (checked with `std.debug.assert`).
///
/// Returns `error.IncorrectResults` when the lengths differ or when any element
/// differs, and propagates whatever `std.testing.expectEqualSlices` returns.
pub fn check(self: *Collector, expected: []const slimy.Result) !void {
// Different result counts can never match; bail out before sorting.
if (expected.len != self.n) {
return error.IncorrectResults;
}
// Sort the collected results so the comparison does not depend on report order.
std.sort.block(slimy.Result, self.buf[0..self.n], {}, slimy.Result.sortLessThan);
// Invariant on the caller-supplied data, not a runtime error path.
std.debug.assert(
std.sort.isSorted(slimy.Result, expected, {}, slimy.Result.sortLessThan),
);
// Element-wise comparison; indexing `self.buf[i]` is in range because the
// lengths were checked to be equal above.
for (expected, 0..) |r, i| {
if (!std.meta.eql(r, self.buf[i])) {
return error.IncorrectResults;
}
}
// NOTE(review): the loop above has already compared every element, so this
// call looks redundant here. This span appears to contain both the old manual
// comparison and its `expectEqualSlices` replacement — confirm which is intended.
try std.testing.expectEqualSlices(slimy.Result, expected, self.buf[0..self.n]);
}
};

Expand Down
291 changes: 102 additions & 189 deletions src/cpu.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2,210 +2,123 @@ const std = @import("std");
const builtin = @import("builtin");
const common = @import("common.zig");
const slimy = @import("slimy.zig");
const SearchBlock = @import("cpu/SearchBlock.zig");

fn isSlime(world_seed: i64, x: i32, z: i32) bool {
@setRuntimeSafety(false);

// Init slime seed
var seed = world_seed +%
@as(i64, x * x *% 4987142) +
@as(i64, x *% 5947611) +
@as(i64, z * z) * 4392871 +
@as(i64, z *% 389711);
seed ^= 987234911;

// Init LCG seed
const magic = 0x5DEECE66D;
const mask = (1 << 48) - 1;
seed = (seed ^ magic) & mask;

// Calculate random result
seed = (seed *% magic +% 0xB) & mask;
const bits: i32 = @intCast(seed >> 48 - 31);
const val = @mod(bits, 10);

std.debug.assert(bits >= val - 9);
return val == 0;
/// Entry point for a CPU search: picks the single- or multi-threaded
/// implementation based on `params.method.cpu` (the requested thread count).
///
/// `params.method` must be `.cpu` with a non-zero thread count. Results are
/// delivered through `resultCallback`; `progressCallback`, when non-null, is
/// forwarded to the chosen implementation.
///
/// Requesting more than one thread in a build compiled with
/// `builtin.single_threaded` is a programming error and hits `unreachable`.
pub fn search(
    params: slimy.SearchParams,
    context: anytype,
    comptime resultCallback: fn (@TypeOf(context), slimy.Result) void,
    comptime progressCallback: ?fn (@TypeOf(context), completed: u64, total: u64) void,
) !void {
    std.debug.assert(params.method == .cpu);
    const requested_threads = params.method.cpu;
    std.debug.assert(requested_threads > 0);

    if (requested_threads == 1) {
        searchSinglethread(params, context, resultCallback, progressCallback);
        return;
    }

    // Keep this as an if/else on the comptime-known flag so the threaded path
    // is only compiled when threads are actually available.
    if (builtin.single_threaded) {
        unreachable;
    } else {
        try searchMultithread(params, context, resultCallback, progressCallback);
    }
}

test "isSlime" {
const expected = [10][10]u1{
.{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
.{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
.{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 },
.{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
.{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 1 },
.{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 },
.{ 0, 1, 0, 0, 1, 0, 0, 1, 0, 0 },
.{ 1, 0, 0, 0, 0, 1, 0, 0, 0, 0 },
.{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
.{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
};

for (expected, 0..) |row, y| {
for (row, 0..) |e, x| {
try std.testing.expectEqual(
e != 0,
isSlime(1, @intCast(x), @intCast(y)),
);
/// Searches the rectangle `[x0, x1) x [z0, z1)` from `params` on the calling
/// thread, walking it in steps of `SearchBlock.tested_size` along both axes.
///
/// `params.method` must be `.cpu` with exactly one thread, and the rectangle
/// must be non-empty. Results are delivered through `resultCallback` by
/// `SearchBlock.calculateSliminess`.
///
/// When `progressCallback` is non-null it is invoked after every block with the
/// number of chunks of the requested rectangle covered so far and the total
/// number of chunks (`width * height`). Blocks that stick out past `x1`/`z1`
/// only count the part inside the rectangle, so `completed` never exceeds
/// `total` and reaches it exactly after the last block.
pub fn searchSinglethread(
    params: slimy.SearchParams,
    context: anytype,
    comptime resultCallback: fn (@TypeOf(context), slimy.Result) void,
    comptime progressCallback: ?fn (@TypeOf(context), completed: u64, total: u64) void,
) void {
    std.debug.assert(params.method == .cpu);
    std.debug.assert(params.method.cpu == 1);
    std.debug.assert(params.x0 < params.x1);
    std.debug.assert(params.z0 < params.z1);
    const block_size = SearchBlock.tested_size;

    const width: u64 = @intCast(params.x1 - params.x0);
    const height: u64 = @intCast(params.z1 - params.z0);
    const total_chunks = width * height;
    // u64 to match the callback signature (usize is only 32 bits on some targets).
    var completed_chunks: u64 = 0;

    var x = params.x0;
    while (x < params.x1) : (x += block_size) {
        // Part of this block column that lies inside the requested rectangle.
        const covered_x: u64 = @intCast(@min(x + block_size, params.x1) - x);

        var z = params.z0;
        while (z < params.z1) : (z += block_size) {
            const covered_z: u64 = @intCast(@min(z + block_size, params.z1) - z);

            var block = SearchBlock.initSimd(params.world_seed, x, z);
            block.preprocess();
            _ = block.calculateSliminess(params, context, resultCallback);

            completed_chunks += covered_x * covered_z;
            if (progressCallback) |callback| {
                callback(context, completed_chunks, total_chunks);
            }
        }
    }
}
// Checks a single chunk with a negative X coordinate: for world seed 1 the
// chunk at (-1, 23) must not be reported as a slime chunk.
test "isSlime with Z 23" {
    const is_slime_chunk = isSlime(1, -1, 23);
    try std.testing.expect(!is_slime_chunk);
}

fn checkLocation(world_seed: i64, cx: i32, cz: i32) u32 {
@setRuntimeSafety(false);

var count: u32 = 0;
for (common.mask, 0..) |row, mz| {
for (row, 0..) |bit, mx| {
const x = @as(i32, @intCast(mx)) + cx - @as(i32, @intCast(row.len / 2));
const z = @as(i32, @intCast(mz)) + cz - @as(i32, @intCast(common.mask.len / 2));
count += @intFromBool(bit and isSlime(world_seed, x, z));
}
/// Searches the rectangle described by `params` using `params.method.cpu`
/// worker threads and returns once every worker has finished.
///
/// `params.method` must be `.cpu` with more than one thread, and the rectangle
/// must be non-empty. Each thread runs `worker` with its own index; results are
/// delivered through `resultCallback` from the worker threads.
///
/// The global `chunks_searched` counter is reset at the start of every call.
///
/// Returns the error from `std.Thread.spawn` if a worker cannot be started. In
/// that case the workers that were already started are joined before the error
/// is returned, so none of them can outlive `context`.
pub fn searchMultithread(
    params: slimy.SearchParams,
    context: anytype,
    comptime resultCallback: fn (@TypeOf(context), slimy.Result) void,
    comptime progressCallback: ?fn (@TypeOf(context), completed: u64, total: u64) void,
) !void {
    std.debug.assert(params.method == .cpu);
    std.debug.assert(params.method.cpu > 1);
    std.debug.assert(params.x0 < params.x1);
    std.debug.assert(params.z0 < params.z1);

    // Reset chunk search counter
    chunks_searched = std.atomic.Value(usize).init(0);

    var threads = std.BoundedArray(std.Thread, 255).init(0) catch unreachable;
    // A failed spawn must not leave earlier workers running unjoined.
    errdefer {
        for (threads.slice()) |thread| {
            thread.join();
        }
    }

    const thread_count = params.method.cpu;
    for (0..thread_count) |thread_index| {
        const thread = try std.Thread.spawn(
            .{ .stack_size = 64 * 1024 },
            worker,
            .{
                params,
                context,
                resultCallback,
                progressCallback,
                thread_index,
                thread_count,
            },
        );
        threads.append(thread) catch unreachable;
    }

    // Best effort only: let the workers run; a failed yield is harmless.
    std.Thread.yield() catch {};
    for (threads.slice()) |thread| {
        thread.join();
    }
}

pub fn search(
/// Body of one search thread started by `searchMultithread`.
///
/// The search rectangle is divided into a grid of `blocks_x * blocks_z` blocks
/// of `SearchBlock.tested_size` chunks per side (rounded up). The blocks are
/// numbered row by row along X, and thread `thread_id` of `thread_count`
/// processes the contiguous index range
/// `[total * thread_id / thread_count, total * (thread_id + 1) / thread_count)`.
///
/// Completed blocks are added to the global `chunks_searched` counter in
/// batches of 20, with the remainder added on exit. Only thread 0 calls
/// `progressCallback`, passing the global counter and the total block count.
///
/// Returns an error only if `std.math.divCeil` fails.
pub fn worker(
    params: slimy.SearchParams,
    context: anytype,
    comptime resultCallback: fn (@TypeOf(context), slimy.Result) void,
    comptime progressCallback: ?fn (@TypeOf(context), completed: u64, total: u64) void,
    thread_id: usize,
    thread_count: usize,
) !void {
    const block_size = SearchBlock.tested_size;
    // Number of finished blocks to accumulate locally before touching the
    // shared counter.
    const report_interval = 20;

    const blocks_x = try std.math.divCeil(usize, @intCast(params.x1 - params.x0), block_size);
    const blocks_z = try std.math.divCeil(usize, @intCast(params.z1 - params.z0), block_size);
    const total_blocks = blocks_x * blocks_z;

    // split blocks as evenly as possible
    const start_block = total_blocks * thread_id / thread_count;
    const end_block = total_blocks * (thread_id + 1) / thread_count;

    var unreported: usize = 0;
    for (start_block..end_block) |block_index| {
        // X varies fastest: the remainder is in [0, blocks_x) and the quotient
        // in [0, blocks_z). Swapping them transposes the grid and searches the
        // wrong area whenever blocks_x != blocks_z.
        const rel_block_x = block_index % blocks_x;
        const rel_block_z = block_index / blocks_x;
        const block_x = params.x0 + @as(i32, @intCast(rel_block_x * block_size));
        const block_z = params.z0 + @as(i32, @intCast(rel_block_z * block_size));

        var chunk = SearchBlock.initSimd(params.world_seed, block_x, block_z);
        chunk.preprocess();
        _ = chunk.calculateSliminess(params, context, resultCallback);

        unreported += 1;
        if (unreported == report_interval) {
            _ = chunks_searched.fetchAdd(unreported, .monotonic);
            unreported = 0;
            // Review note (Owner): "this is a... creative use of continue" —
            // the suggested explicit null check is used instead of
            // `(progressCallback orelse continue)(...)`.
            if (thread_id == 0) {
                if (progressCallback) |callback| {
                    // Atomic load; other workers update the counter concurrently.
                    callback(context, chunks_searched.load(.monotonic), total_blocks);
                }
            }
        }
    }
    // Flush whatever is left of the last partial batch.
    _ = chunks_searched.fetchAdd(unreported, .monotonic);
}

/// Returns a searcher type that scans a rectangle of chunk coordinates and
/// reports every location whose `checkLocation` count reaches the threshold.
///
/// Context should have the following functions:
///   pub fn reportResult(self: Context, result: slimy.Result) void;
///   pub fn reportProgress(self: Context, completed: u64, total: u64) void;
pub fn Searcher(comptime Context: type) type {
return struct {
world_seed: i64,
threshold: i32,

// Search rectangle; the loops below treat x1/z1 as exclusive upper bounds.
x0: i32,
z0: i32,
x1: i32,
z1: i32,

// Number of threads to use, copied from `params.method.cpu`.
threads: u8,
ctx: Context,

const Self = @This();

/// Copies the search parameters out of `params` and stores `context`.
/// Asserts that the requested thread count is non-zero.
pub fn init(params: slimy.SearchParams, context: Context) Self {
std.debug.assert(params.method.cpu > 0);
return .{
.world_seed = params.world_seed,
.threshold = params.threshold,

.x0 = params.x0,
.x1 = params.x1,
.z0 = params.z0,
.z1 = params.z1,

.threads = params.method.cpu,
.ctx = context,
};
}

/// Runs the search single-threaded when `threads == 1`, otherwise
/// multi-threaded. More than one thread in a `builtin.single_threaded`
/// build hits `unreachable`.
pub fn search(self: Self) !void {
if (self.threads == 1) {
self.searchSinglethread();
} else if (builtin.single_threaded) {
unreachable;
} else {
try self.searchMultithread();
}
}

/// Searches the whole rectangle on the calling thread in tiles of at most
/// 100 x 100 locations, calling `ctx.reportProgress` after every tile with
/// the number of locations covered so far and the total (`width * height`).
pub fn searchSinglethread(self: Self) void {
const width: u64 = @intCast(self.x1 - self.x0);
const height: u64 = @intCast(self.z1 - self.z0);
const total_chunks = width * height;
var completed_chunks: u64 = 0;
const step = 100;

var z0 = self.z0;
while (z0 < self.z1) : (z0 += step) {
// Clamp the tile to the rectangle so the last row/column may be smaller.
const z1 = @min(z0 + step, self.z1);

var x0 = self.x0;
while (x0 < self.x1) : (x0 += step) {
const x1 = @min(x0 + step, self.x1);
self.searchArea(x0, x1, z0, z1);
completed_chunks += @as(u64, @intCast(x1 - x0)) * @as(u64, @intCast(z1 - z0));

self.ctx.reportProgress(completed_chunks, total_chunks);
}
}
}

/// Spawns `threads` workers, passing each one the handle of the worker
/// spawned before it, and joins only the last one (see `searchWorker`).
///
/// NOTE(review): if a later `spawn` fails, `try` returns before the join,
/// so workers that were already started are not joined here — confirm
/// whether that is acceptable.
pub fn searchMultithread(
self: Self,
) !void {
var i: u8 = 0;
var thr: ?std.Thread = null;
while (i < self.threads) : (i += 1) {
thr = try std.Thread.spawn(.{}, searchWorker, .{ self, i, thr });
}
thr.?.join();
}
/// Thread body: searches this thread's horizontal band of the rectangle and
/// then joins the previously spawned thread, if any.
///
/// The Z range is split into `threads` bands of `(z1 - z0) / threads` rows;
/// the last thread also takes the rows left over by the integer division.
fn searchWorker(self: Self, thread_idx: u8, prev_thread: ?std.Thread) void {
// TODO: work stealing

const thread_width = @as(u31, @intCast(self.z1 - self.z0)) / self.threads;
const z0 = self.z0 + thread_idx * thread_width;
const z1 = if (thread_idx == self.threads - 1)
self.z1 // Last thread, consume all remaining area
else
z0 + thread_width;

// TODO: progress reporting
self.searchArea(self.x0, self.x1, z0, z1);

// This creates a linked list of threads, so we can just join the last one from the main thread
if (prev_thread) |thr| thr.join();
}

// TODO: cache isSlime results
/// Calls `checkLocation` for every (x, z) with `x0 <= x < x1` and
/// `z0 <= z < z1`, and reports each location whose count is at least
/// `threshold` through `ctx.reportResult`.
fn searchArea(
self: Self,
x0: i32,
x1: i32,
z0: i32,
z1: i32,
) void {
var z = z0;
while (z < z1) : (z += 1) {
var x = x0;
while (x < x1) : (x += 1) {
const count = checkLocation(self.world_seed, x, z);
if (count >= self.threshold) {
self.ctx.reportResult(.{
.x = x,
.z = z,
.count = count,
});
}
}
}
}
};
}
/// File-level counter of search blocks completed by the `worker` threads.
/// `searchMultithread` resets it to zero at the start of each search, workers
/// add to it with `fetchAdd`, and its value is passed to the progress callback.
var chunks_searched = std.atomic.Value(usize).init(0);
Loading