From 4038f5eb391d67e8544779f28b66dcce66d2d32f Mon Sep 17 00:00:00 2001 From: Joakim Hassila Date: Tue, 14 Nov 2023 19:05:02 +0100 Subject: [PATCH] chore(minor): Fix sampling for virtual/resident memory peaks (#200) As reported here: https://forums.swift.org/t/compiler-optimisations-for-functional-style-collection-algorithms/68291/12 the peak resident memory counter showed invalid results - this PR enables the missing sampling for these two peak counters (virtual/resident) so it provides expected results. --- Sources/Benchmark/BenchmarkExecutor.swift | 14 +++--- Sources/Benchmark/BenchmarkMetric.swift | 2 +- .../MallocStatsProducer+jemalloc.swift | 7 +-- .../OperatingSystemStatsProducer+Darwin.swift | 36 +++++++++++-- .../OperatingSystemStatsProducer+Linux.swift | 50 ++++++++++++++++--- 5 files changed, 85 insertions(+), 24 deletions(-) diff --git a/Sources/Benchmark/BenchmarkExecutor.swift b/Sources/Benchmark/BenchmarkExecutor.swift index 9b2dec48..36a54278 100644 --- a/Sources/Benchmark/BenchmarkExecutor.swift +++ b/Sources/Benchmark/BenchmarkExecutor.swift @@ -100,10 +100,8 @@ final class BenchmarkExecutor { // swiftlint:disable:this type_body_length var iterations = 0 let initialStartTime = BenchmarkClock.now - // 'Warmup' to remove initial mallocs from stats in p100 - if mallocStatsRequested { - _ = MallocStatsProducer.makeMallocStats() - } + // 'Warmup' to remove initial mallocs from stats in p100, also used as base for some metrics + _ = MallocStatsProducer.makeMallocStats() // baselineMallocStats // Calculate typical sys call check overhead and deduct that to get 'clean' stats for the actual benchmark var operatingSystemStatsOverhead = OperatingSystemStats() @@ -200,10 +198,10 @@ final class BenchmarkExecutor { // swiftlint:disable:this type_body_length delta = stopMallocStats.mallocCountLarge - startMallocStats.mallocCountLarge statistics[.mallocCountLarge]?.add(Int(delta)) - delta = stopMallocStats.allocatedResidentMemory - - 
startMallocStats.allocatedResidentMemory + delta = stopMallocStats.allocatedResidentMemory - startMallocStats.allocatedResidentMemory statistics[.memoryLeaked]?.add(Int(delta)) +// delta = stopMallocStats.allocatedResidentMemory - baselineMallocStats.allocatedResidentMemory // baselineMallocStats! statistics[.allocatedResidentMemory]?.add(Int(stopMallocStats.allocatedResidentMemory)) } @@ -269,7 +267,9 @@ final class BenchmarkExecutor { // swiftlint:disable:this type_body_length } if benchmark.configuration.metrics.contains(.threads) || - benchmark.configuration.metrics.contains(.threadsRunning) { + benchmark.configuration.metrics.contains(.threadsRunning) || + benchmark.configuration.metrics.contains(.peakMemoryResident) || + benchmark.configuration.metrics.contains(.peakMemoryVirtual) { operatingSystemStatsProducer.startSampling(5_000) // ~5 ms } diff --git a/Sources/Benchmark/BenchmarkMetric.swift b/Sources/Benchmark/BenchmarkMetric.swift index 282539d8..7cd3e1db 100644 --- a/Sources/Benchmark/BenchmarkMetric.swift +++ b/Sources/Benchmark/BenchmarkMetric.swift @@ -172,7 +172,7 @@ public extension BenchmarkMetric { case .mallocCountTotal: return "Malloc (total)" case .allocatedResidentMemory: - return "Memory (allocated)" + return "Memory (allocated resident)" case .memoryLeaked: return "Malloc / free Δ" case .syscalls: diff --git a/Sources/Benchmark/MallocStats/MallocStatsProducer+jemalloc.swift b/Sources/Benchmark/MallocStats/MallocStatsProducer+jemalloc.swift index 9af88f52..521a36ed 100644 --- a/Sources/Benchmark/MallocStats/MallocStatsProducer+jemalloc.swift +++ b/Sources/Benchmark/MallocStats/MallocStatsProducer+jemalloc.swift @@ -33,7 +33,7 @@ import ExtrasJSON // var largeNMallocMIB = setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).large.nmalloc") // var smallNDallocMIB = setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).small.ndalloc") // var largeNDallocMIB = setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).large.ndalloc") -// var smallAlloctedMIB 
= setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).small.allocated") +// var smallAllocatedMIB = setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).small.allocated") // var largeAllocatedMIB = setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).large.allocated") // var smallNFillsMIB = setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).small.nfills") // var largeNFillsMIB = setupMIB(name: "stats.arenas.\(MALLCTL_ARENAS_ALL).large.nfills") @@ -53,8 +53,6 @@ import ExtrasJSON // Update jemalloc internal statistics, this is the magic incantation to do it static func updateEpoch() { - var allocated = 0 - var size = MemoryLayout.size var epoch = 0 let epochSize = MemoryLayout.size var result: Int32 = 0 @@ -66,11 +64,10 @@ import ExtrasJSON } // Then update epoch - result = mallctlbymib(epochMIB, epochMIB.count, &allocated, &size, &epoch, epochSize) + result = mallctlbymib(epochMIB, epochMIB.count, nil, nil, &epoch, epochSize) if result != 0 { print("mallctlbymib epochMIB returned \(result)") } -// return epoch } // Read the actual stats using a cached MIB as the key diff --git a/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Darwin.swift b/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Darwin.swift index 1256855d..ad15ea9d 100644 --- a/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Darwin.swift +++ b/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Darwin.swift @@ -22,6 +22,8 @@ let semaphore = DispatchSemaphore(value: 0) var peakThreads: Int = 0 var peakThreadsRunning: Int = 0 + var peakMemoryResident: Int = 0 + var peakMemoryVirtual: Int = 0 var runState: RunState = .running var sampleRate: Int = 10_000 var metrics: Set? 
@@ -89,11 +91,16 @@ func startSampling(_: Int = 10_000) { // sample rate in microseconds #if os(macOS) + let sampleSemaphore = DispatchSemaphore(value: 0) + DispatchQueue.global(qos: .userInitiated).async { self.lock.lock() let rate = self.sampleRate self.peakThreads = 0 self.peakThreadsRunning = 0 + self.peakMemoryResident = 0 + self.peakMemoryVirtual = 0 + self.runState = .running self.lock.unlock() @@ -109,6 +116,14 @@ self.peakThreadsRunning = Int(procTaskInfo.pti_numrunning) } + if procTaskInfo.pti_resident_size > self.peakMemoryResident { + self.peakMemoryResident = Int(procTaskInfo.pti_resident_size) + } + + if procTaskInfo.pti_virtual_size > self.peakMemoryVirtual { + self.peakMemoryVirtual = Int(procTaskInfo.pti_virtual_size) + } + if self.runState == .shuttingDown { self.runState = .done self.semaphore.signal() @@ -117,6 +132,8 @@ let quit = self.runState self.lock.unlock() + sampleSemaphore.signal() + if quit == .done { return } @@ -124,8 +141,9 @@ usleep(UInt32.random(in: UInt32(Double(rate) * 0.9) ... 
UInt32(Double(rate) * 1.1))) } } - // We'll sleep just a little bit to let the sampler thread get going so we don't get 0 samples - usleep(1_000) + + // We'll need to wait for a single sample from the sampler thread so we don't get 0 samples + sampleSemaphore.wait() #endif } @@ -154,13 +172,21 @@ let totalTime = userTime + systemTime var threads = 0 var threadsRunning = 0 + var peakResident = 0 + var peakVirtual = 0 - if metrics.contains(.threads) || metrics.contains(.threadsRunning) { + if metrics.contains(.threads) || + metrics.contains(.threadsRunning) || + metrics.contains(.peakMemoryResident) || + metrics.contains(.peakMemoryVirtual) { lock.lock() threads = peakThreads threadsRunning = peakThreadsRunning + peakResident = peakMemoryResident + peakVirtual = peakMemoryVirtual lock.unlock() } + var ioStats = IOStats() if metrics.contains(.writeBytesPhysical) || metrics.contains(.writeBytesPhysical) { @@ -170,8 +196,8 @@ let stats = OperatingSystemStats(cpuUser: userTime, cpuSystem: systemTime, cpuTotal: totalTime, - peakMemoryResident: Int(procTaskInfo.pti_resident_size), - peakMemoryVirtual: Int(procTaskInfo.pti_virtual_size), + peakMemoryResident: peakResident, + peakMemoryVirtual: peakVirtual, syscalls: Int(procTaskInfo.pti_syscalls_unix) + Int(procTaskInfo.pti_syscalls_mach), contextSwitches: Int(procTaskInfo.pti_csw), diff --git a/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Linux.swift b/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Linux.swift index 9e5e906d..c6c5728f 100644 --- a/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Linux.swift +++ b/Sources/Benchmark/OperatingSystemStats/OperatingSystemStatsProducer+Linux.swift @@ -22,6 +22,9 @@ let lock = NIOLock() let semaphore = DispatchSemaphore(value: 0) var peakThreads: Int = 0 + var peakThreadsRunning: Int = 0 + var peakMemoryResident: Int = 0 + var peakMemoryVirtual: Int = 0 var sampleRate: Int = 10_000 var runState: RunState = .running var metrics: Set? 
@@ -98,18 +101,39 @@ } func makeOperatingSystemStats() -> OperatingSystemStats { + guard let metrics else { + return .init() + } + let ioStats = readIOStats() let processStats = readProcessStats() + var threads = 0 + var threadsRunning = 0 + var peakResident = 0 + var peakVirtual = 0 + + if metrics.contains(.threads) || + metrics.contains(.threadsRunning) || + metrics.contains(.peakMemoryResident) || + metrics.contains(.peakMemoryVirtual) { + lock.lock() + threads = peakThreads + threadsRunning = peakThreadsRunning + peakResident = peakMemoryResident + peakVirtual = peakMemoryVirtual + lock.unlock() + } + return OperatingSystemStats(cpuUser: Int(processStats.cpuUser), cpuSystem: Int(processStats.cpuSystem), cpuTotal: Int(processStats.cpuTotal), - peakMemoryResident: Int(processStats.peakMemoryResident), - peakMemoryVirtual: Int(processStats.peakMemoryVirtual), + peakMemoryResident: peakResident, + peakMemoryVirtual: peakVirtual, syscalls: 0, contextSwitches: 0, - threads: Int(processStats.threads), - threadsRunning: 0, // we can go dig in /proc/self/task/ later if want this + threads: threads, + threadsRunning: threadsRunning, // we can go dig in /proc/self/task/ later if want this readSyscalls: Int(ioStats.readSyscalls), writeSyscalls: Int(ioStats.writeSyscalls), readBytesLogical: Int(ioStats.readBytesLogical), @@ -132,11 +156,15 @@ } func startSampling(_: Int = 10_000) { // sample rate in microseconds + let sampleSemaphore = DispatchSemaphore(value: 0) + DispatchQueue.global(qos: .userInitiated).async { self.lock.lock() let rate = self.sampleRate self.peakThreads = 0 + self.peakMemoryResident = 0 + self.peakMemoryVirtual = 0 self.runState = .running self.lock.unlock() @@ -150,6 +178,14 @@ self.peakThreads = processStats.threads } + if processStats.peakMemoryResident > self.peakMemoryResident { + self.peakMemoryResident = processStats.peakMemoryResident + } + + if processStats.peakMemoryVirtual > self.peakMemoryVirtual { + self.peakMemoryVirtual = 
processStats.peakMemoryVirtual + } + if self.runState == .shuttingDown { self.runState = .done self.semaphore.signal() @@ -159,6 +195,8 @@ self.lock.unlock() + sampleSemaphore.signal() + if quit == .done { return } @@ -166,8 +204,8 @@ usleep(UInt32.random(in: UInt32(Double(rate) * 0.9) ... UInt32(Double(rate) * 1.1))) } } - // We'll sleep just a little bit to let the sampler thread get going so we try to avoid 0 samples - usleep(1_000) + // We'll need to wait for a single sample from the sampler thread so we don't get 0 samples + sampleSemaphore.wait() } func stopSampling() {