JuliaGPU · tgymnich · Sep 14, 2024 · Oct 1, 2024 · Jan 27, 2025 · Jan 27, 2025
diff --git a/docs/src/api/kernel.md b/docs/src/api/kernel.md
@@ -53,4 +53,13 @@ MtlThreadGroupArray
 MemoryFlags
 threadgroup_barrier
 simdgroup_barrier
+```
+
+## Printing
+
+```@docs
+@mtlprintf
+@mtlprint
+@mtlprintln
+@mtlshow
 ```
diff --git a/docs/src/usage/kernel.md b/docs/src/usage/kernel.md
@@ -84,6 +84,34 @@ Additional notes:
 - Kernels must always return nothing
 - Kernels are asynchronous. To synchronize, use the `Metal.@sync` macro.
 
+## Printing
+
+When debugging, it's not uncommon to want to print some values. This is achieved with `@mtlprintf`:
+
+```julia
+function gpu_add2_print!(y, x)
+    index = thread_position_in_grid_1d()
+    @mtlprintf("thread %d", index)
+    @inbounds y[index] += x[index]  # each thread updates the element at its own index
+    return nothing
+end
+
+A = Metal.ones(Float32, 8);
+B = Metal.rand(Float32, 8);
+
+@metal threads=length(A) gpu_add2_print!(A, B)
+```
+
+`@mtlprintf` is supported on macOS 15 and later. `@mtlprintf` supports most of the format specifiers that `printf`
+supports in C with the following exceptions:
+ - `%n` and `%s` conversion specifiers are not supported
+ - Default argument promotion applies to arguments of half type, which are promoted to the `double` type
+ - The format string must be a string literal
+
+Metal places output from `@mtlprintf` into a log buffer. The system only removes the messages from the log buffer when the command buffer completes. When the log buffer becomes full, the system drops all subsequent messages.
+
+See also: `@mtlprint`, `@mtlprintln` and `@mtlshow`
+
 ## Other Helpful Links
 
 [Metal Shading Language Specification](https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf)
diff --git a/lib/mtl/MTL.jl b/lib/mtl/MTL.jl
@@ -34,6 +34,7 @@ include("events.jl")
 include("fences.jl")
 include("heap.jl")
 include("buffer.jl")
+include("log_state.jl")
 include("command_queue.jl")
 include("command_buf.jl")
 include("compute_pipeline.jl")

diff --git a/lib/mtl/command_queue.jl b/lib/mtl/command_queue.jl
@@ -1,3 +1,24 @@
+
+export MTLCommandQueueDescriptor
+
+# @objcwrapper immutable=false MTLCommandQueueDescriptor <: NSObject
+
+function MTLCommandQueueDescriptor()
+    ptr = @objc [MTLCommandQueueDescriptor alloc]::id{MTLCommandQueueDescriptor}
+    desc = MTLCommandQueueDescriptor(ptr)  # wrap the freshly allocated handle
+    finalizer(release, desc)  # run `release` once `desc` is garbage-collected
+    @objc [desc::id{MTLCommandQueueDescriptor} init]::id{MTLCommandQueueDescriptor}
+    return desc
+end
+
+function MTLCommandQueue(dev::MTLDevice, descriptor::MTLCommandQueueDescriptor)
+    ptr = @objc [dev::id{MTLDevice} newCommandQueueWithDescriptor:descriptor::id{MTLCommandQueueDescriptor}]::id{MTLCommandQueue}
+    queue = MTLCommandQueue(ptr)  # wrap the handle returned by the device
+    finalizer(release, queue)  # run `release` once `queue` is garbage-collected
+    return queue
+end
+
+
 export MTLCommandQueue
 
 # @objcwrapper immutable=false MTLCommandQueue <: NSObject
@@ -8,3 +29,4 @@ function MTLCommandQueue(dev::MTLDevice)
     finalizer(release, obj)
     return obj
 end
+
diff --git a/lib/mtl/libmtl.jl b/lib/mtl/libmtl.jl
@@ -1395,7 +1395,7 @@ end
     @autoproperty dispatchType::MTLDispatchType
 end
 
-@objcwrapper immutable = true availability = macos(v"15.0.0") MTLCommandQueueDescriptor <: NSObject
+@objcwrapper immutable = false availability = macos(v"15.0.0") MTLCommandQueueDescriptor <: NSObject
 
 @objcproperties MTLCommandQueueDescriptor begin
     @autoproperty maxCommandBufferCount::UInt64 setter = setMaxCommandBufferCount
@@ -2675,7 +2675,7 @@ end
     MTLLogLevelFault = 5
 end
 
-@objcwrapper immutable = true availability = macos(v"15.0.0") MTLLogStateDescriptor <: NSObject
+@objcwrapper immutable = false availability = macos(v"15.0.0") MTLLogStateDescriptor <: NSObject
 
 @objcproperties MTLLogStateDescriptor begin
     @autoproperty level::MTLLogLevel setter = setLevel

diff --git a/lib/mtl/log_state.jl b/lib/mtl/log_state.jl
@@ -0,0 +1,26 @@
+export MTLLogLevel
+
+export MTLLogStateDescriptor
+
+# @objcwrapper immutable = false MTLLogStateDescriptor <: NSObject
+
+function MTLLogStateDescriptor()
+    ptr = @objc [MTLLogStateDescriptor alloc]::id{MTLLogStateDescriptor}
+    desc = MTLLogStateDescriptor(ptr)  # wrap the freshly allocated handle
+    finalizer(release, desc)  # run `release` once `desc` is garbage-collected
+    @objc [desc::id{MTLLogStateDescriptor} init]::id{MTLLogStateDescriptor}
+    return desc
+end
+
+
+export MTLLogState
+
+# @objcwrapper immutable = true MTLLogState <: NSObject
+
+function MTLLogState(dev::MTLDevice, descriptor::MTLLogStateDescriptor)
+    nserr = Ref{id{NSError}}(nil)  # error out-parameter handed to the call below
+    ptr = @objc [dev::id{MTLDevice} newLogStateWithDescriptor:descriptor::id{MTLLogStateDescriptor}
+        error:nserr::Ptr{id{NSError}}]::id{MTLLogState}
+    nserr[] == nil || throw(NSError(nserr[]))  # a non-nil error is rethrown as an NSError
+    return MTLLogState(ptr)  # NOTE(review): no finalizer is registered for this wrapper — confirm ownership
+end
diff --git a/res/wrap/libmtl.toml b/res/wrap/libmtl.toml
@@ -50,6 +50,9 @@ immutable=false
 [api.MTLCommandQueue]
 immutable=false
 
+[api.MTLCommandQueueDescriptor]
+immutable=false
+
 [api.MTLCompileOptions]
 immutable=false
     [api.MTLCompileOptions.proptype]
@@ -85,6 +88,9 @@ immutable=false
 [api.MTLLibrary]
 immutable=false
 
+[api.MTLLogStateDescriptor]
+immutable=false
+
 [api.MTLSharedEvent]
 immutable=false
 

diff --git a/src/Metal.jl b/src/Metal.jl
@@ -35,6 +35,7 @@ include("device/intrinsics/synchronization.jl")
 include("device/intrinsics/memory.jl")
 include("device/intrinsics/simd.jl")
 include("device/intrinsics/atomics.jl")
+include("device/intrinsics/output.jl")
 include("device/quirks.jl")
 
 # array essentials

diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
@@ -104,9 +104,9 @@ function compile(@nospecialize(job::CompilerJob))
 
     @signpost_interval log=log_compiler() "Generate LLVM IR" begin
         # TODO: on 1.9, this actually creates a context. cache those.
-        ir, entry = JuliaContext() do ctx
+        ir, entry, loggingEnabled = JuliaContext() do ctx
             mod, meta = GPUCompiler.compile(:llvm, job)
-            string(mod), LLVM.name(meta.entry)
+            string(mod), LLVM.name(meta.entry), haskey(functions(mod), "air.os_log")
         end
     end
 
@@ -172,7 +172,7 @@ function compile(@nospecialize(job::CompilerJob))
         end
     end
 
-    return (; ir, air, metallib, entry)
+    return (; ir, air, metallib, entry, loggingEnabled)
 end
 
 # link into an executable kernel
@@ -210,5 +210,5 @@ end
         end
     end
 
-    pipeline_state
+    pipeline_state, compiled.loggingEnabled
 end
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
@@ -161,6 +161,7 @@ mtlconvert(arg, cce=nothing) = adapt(Adaptor(cce), arg)
 struct HostKernel{F,TT}
     f::F
     pipeline::MTLComputePipelineState
+    loggingEnabled::Bool
 end
 
 const mtlfunction_lock = ReentrantLock()
@@ -186,15 +187,15 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
         cache = compiler_cache(dev)
         source = methodinstance(F, tt)
         config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
-        pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)
+        pipeline, loggingEnabled = GPUCompiler.cached_compilation(cache, source, config, compile, link)
 
         # create a callable object that captures the function instance. we don't need to think
         # about world age here, as GPUCompiler already does and will return a different object
         h = hash(pipeline, hash(f, hash(tt)))
         kernel = get(_kernel_instances, h, nothing)
         if kernel === nothing
             # create the kernel state object
-            kernel = HostKernel{F,tt}(f, pipeline)
+            kernel = HostKernel{F, tt}(f, pipeline, loggingEnabled)
             _kernel_instances[h] = kernel
         end
         return kernel::HostKernel{F,tt}
@@ -275,7 +276,35 @@ end
     (threads.width * threads.height * threads.depth) > kernel.pipeline.maxTotalThreadsPerThreadgroup &&
         throw(ArgumentError("Number of threads in group ($(threads.width * threads.height * threads.depth)) should not exceed $(kernel.pipeline.maxTotalThreadsPerThreadgroup)"))
 
-    cmdbuf = MTLCommandBuffer(queue)
+    cmdbuf = if kernel.loggingEnabled
+        # TODO: make this a dynamic error, i.e., from the kernel (JuliaGPU/Metal.jl#433)
+        @static if !is_macos(v"15.0.0")
+            error("Logging is only supported on macOS 15 or higher")
+        end
+
+        if MTLCaptureManager().isCapturing
+            error("Logging is not supported while GPU frame capturing")
+        end
+
+        log_state_descriptor = MTLLogStateDescriptor()
+        log_state_descriptor.level = MTL.MTLLogLevelDebug
+        log_state = MTLLogState(queue.device, log_state_descriptor)
+
+        function log_handler(subSystem, category, logLevel, message)
+            Core.print(String(NSString(message)))
+            return nothing
+        end
+
+        block = @objcblock(log_handler, Nothing, (id{NSString}, id{NSString}, NSInteger, id{NSString}))
+        @objc [log_state::id{MTLLogState} addLogHandler:block::id{NSBlock}]::Nothing
+
+        cmdbuf_descriptor = MTLCommandBufferDescriptor()
+        cmdbuf_descriptor.logState = log_state
+        MTLCommandBuffer(queue, cmdbuf_descriptor)
+    else
+        MTLCommandBuffer(queue)
+    end
+
     cmdbuf.label = "MTLCommandBuffer($(nameof(kernel.f)))"
     cce = MTLComputeCommandEncoder(cmdbuf)
     argument_buffers = try