From 455f28c59107a58b910418e10f4591cf65b3a6d1 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 25 Jul 2024 10:06:42 +0200 Subject: [PATCH] WIP: Exception handling. --- src/compiler/execution.jl | 20 +++++- src/device/runtime.jl | 141 +++++++++++++++++++++++++++++++++----- 2 files changed, 141 insertions(+), 20 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 4808c6224..9fb25b0d5 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -15,7 +15,7 @@ High-level interface for executing code on a GPU. The `@metal` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a Metal function upon first use, and to a certain extent arguments will be converted and managed automatically using `mtlconvert`. -Finally, a call to `mtlcall` is performed, creating a command buffer in the current global +Finally, the kernel will be called, creating a command buffer in the current global command queue then committing it. There is one supported keyword argument that influences the behavior of `@metal`: @@ -271,12 +271,28 @@ end (threads.width * threads.height * threads.depth) > kernel.pipeline.maxTotalThreadsPerThreadgroup && throw(ArgumentError("Number of threads in group ($(threads.width * threads.height * threads.depth)) should not exceed $(kernel.pipeline.maxTotalThreadsPerThreadgroup)")) + + # TODO: cache kernel state as MTLBuffer to lower launch cost? + exception_info, exception_buffer = begin + ref = Ref{ExceptionInfo}() + sz = cld(sizeof(ExceptionInfo_st), MTL.PAGESIZE) * MTL.PAGESIZE + status = @ccall posix_memalign(ref::Ptr{ExceptionInfo}, + MTL.PAGESIZE::Csize_t, + sz::Csize_t)::Cint + status == 0 || throw(OutOfMemoryError()) + cpu = unsafe_wrap(Vector{ExceptionInfo_st}, ref[], sz; own=true) + gpu = MTLBuffer(device(), sz, ref[]; nocopy=true) + cpu, gpu + end + state = KernelState(exception_buffer.gpuAddress) + # XXX: adapt() informs the cce about r/w usage of this buffer + cmdbuf = MTLCommandBuffer(queue) cmdbuf.label = "MTLCommandBuffer($(nameof(kernel.f)))" cce = MTLComputeCommandEncoder(cmdbuf) argument_buffers = try MTL.set_function!(cce, kernel.pipeline) - bufs = encode_arguments!(cce, kernel, kernel.f, args...) + bufs = encode_arguments!(cce, kernel, kernel.f, state, args...) MTL.append_current_function!(cce, groups, threads) bufs finally diff --git a/src/device/runtime.jl b/src/device/runtime.jl index dcd82933d..2d8063e38 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -1,34 +1,139 @@ -# device runtime libraries - - -## Julia library +# GPU runtime library # reset the runtime cache from global scope, so that any change triggers recompilation GPUCompiler.reset_runtime() -function signal_exception() - return + +## exception handling + +struct ExceptionInfo_st + # whether an exception has been encountered (0 -> 1) + status::Int32 + + # whether an exception is in the process of being reported (0 -> 1 -> 2) + output_lock::Int32 + + # who is reporting the exception + thread::@NamedTuple{x::Int32,y::Int32,z::Int32} + threadgroup::@NamedTuple{x::Int32,y::Int32,z::Int32} + + ExceptionInfo_st() = new(0, 0, + (; x=Int32(0), y=Int32(0), z=Int32(0)), + (; x=Int32(0), y=Int32(0), z=Int32(0))) end -function report_exception(ex) - # @cuprintf(""" - # ERROR: a %s was thrown during kernel execution. - # Run Julia on debug level 2 for device stack traces. - # """, ex) - return + +# to simplify use of this struct, which is passed by-reference, use property overloading +const ExceptionInfo = Ptr{ExceptionInfo_st} +@inline function Base.getproperty(info::ExceptionInfo, sym::Symbol) + if sym === :status + unsafe_load(convert(Ptr{Int32}, info)) + elseif sym === :output_lock + # XXX: atomic_load_explicit? + unsafe_load(convert(Ptr{Int32}, info + 4)) + elseif sym === :output_lock_ptr + reinterpret(LLVMPtr{Int32,AS.Device}, info + 4) + elseif sym === :thread + unsafe_load(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 8)) + elseif sym === :threadgroup + unsafe_load(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 20)) + else + getfield(info, sym) + end +end +@inline function Base.setproperty!(info::ExceptionInfo, sym::Symbol, value) + if sym === :status + unsafe_store!(convert(Ptr{Int32}, info), value) + elseif sym === :output_lock + # XXX: atomic_store_explicit? + unsafe_store!(convert(Ptr{Int32}, info + 4), value) + elseif sym === :thread + unsafe_store!(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 8), value) + elseif sym === :threadgroup + unsafe_store!(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 20), value) + else + setfield!(info, sym, value) + end end -report_oom(sz) = return #@cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) +# it's not useful to have several threads report exceptions, so use an output +# lock to only have a single thread write an exception message +@inline function lock_output!(info::ExceptionInfo) + if atomic_compare_exchange_weak_explicit(info.output_lock_ptr, Int32(0), Int32(1)) == Int32(0) + # we just took the lock, so note our position + info.thread, info.threadgroup = thread_position_in_threadgroup_3d(), + threadgroup_position_in_grid_3d() + #threadfence() + return true + elseif info.output_lock == 1 && + info.thread == thread_position_in_threadgroup_3d() && + info.threadgroup == threadgroup_position_in_grid_3d() + # we already have the lock + return true + else + # somebody else has the lock + return false + end +end function report_exception_name(ex) - # @cuprintf(""" - # ERROR: a %s was thrown during kernel execution. - # Stacktrace: - # """, ex) + info = kernel_state().exception_info + + # this is the first reporting function being called, so claim the exception + if lock_output!(info) + #@cuprintf("ERROR: a %s was thrown during kernel execution on thread (%d, %d, %d) in block (%d, %d, %d).\n", + # ex, threadIdx().x, threadIdx().y, threadIdx().z, blockIdx().x, blockIdx().y, blockIdx().z) + #@cuprintf("Stacktrace:\n") + end return end function report_exception_frame(idx, func, file, line) - # @cuprintf(" [%i] %s at %s:%i\n", idx, func, file, line) + info = kernel_state().exception_info + + if lock_output!(info) + #@cuprintf(" [%d] %s at %s:%d\n", idx, func, file, line) + end + return +end + +function signal_exception() + info = kernel_state().exception_info + + # finalize output + if lock_output!(info) + #@cuprintf("\n") + info.output_lock = 2 + end + + # inform the host + info.status = 1 + + # XXX: threadgroup_barrier(MemoryFlagDevice) expects all threads to execute + # the barrier, so would deadlock + #threadfence_system() + + # stop executing + # XXX: we don't have a way to stop execution, so just return + # (GPUCompiler.jl will emit a trap instruction anyway) + #exit() + + return +end + + +## kernel state + +struct KernelState + exception_info::ExceptionInfo +end + +@inline @generated kernel_state() = GPUCompiler.kernel_state_value(KernelState) + + +## other + +function report_oom(sz) + #@cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %d bytes)\n", sz) return end