Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Exception handling. #416

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ High-level interface for executing code on a GPU.
The `@metal` macro should prefix a call, with `func` a callable function or object that
should return nothing. It will be compiled to a Metal function upon first use, and to a
certain extent arguments will be converted and managed automatically using `mtlconvert`.
Finally, a call to `mtlcall` is performed, creating a command buffer in the current global
Finally, the kernel will be called, creating a command buffer in the current global
command queue then committing it.

There is one supported keyword argument that influences the behavior of `@metal`:
Expand Down Expand Up @@ -271,12 +271,28 @@ end
(threads.width * threads.height * threads.depth) > kernel.pipeline.maxTotalThreadsPerThreadgroup &&
throw(ArgumentError("Number of threads in group ($(threads.width * threads.height * threads.depth)) should not exceed $(kernel.pipeline.maxTotalThreadsPerThreadgroup)"))


# TODO: cache kernel state as MTLBuffer to lower launch cost?
exception_info, exception_buffer = begin
ref = Ref{ExceptionInfo}()
sz = cld(sizeof(ExceptionInfo_st), MTL.PAGESIZE) * MTL.PAGESIZE
status = @ccall posix_memalign(ref::Ptr{ExceptionInfo},
MTL.PAGESIZE::Csize_t,
sz::Csize_t)::Cint
status == 0 || throw(OutOfMemoryError())
cpu = unsafe_wrap(Vector{ExceptionInfo_st}, ref[], sz; own=true)
gpu = MTLBuffer(device(), sz, ref[]; nocopy=true)
cpu, gpu
end
state = KernelState(exception_buffer.gpuAddress)
# XXX: adapt() informs the cce about r/w usage of this buffer

cmdbuf = MTLCommandBuffer(queue)
cmdbuf.label = "MTLCommandBuffer($(nameof(kernel.f)))"
cce = MTLComputeCommandEncoder(cmdbuf)
argument_buffers = try
MTL.set_function!(cce, kernel.pipeline)
bufs = encode_arguments!(cce, kernel, kernel.f, args...)
bufs = encode_arguments!(cce, kernel, kernel.f, state, args...)
MTL.append_current_function!(cce, groups, threads)
bufs
finally
Expand Down
141 changes: 123 additions & 18 deletions src/device/runtime.jl
Original file line number Diff line number Diff line change
@@ -1,34 +1,139 @@
# device runtime libraries


## Julia library
# GPU runtime library

# reset the runtime cache from global scope, so that any change to this file
# triggers recompilation of the device runtime
GPUCompiler.reset_runtime()

function signal_exception()
return

## exception handling

# Record describing an in-flight kernel exception. The host launch code
# allocates one of these in a page-aligned buffer that is also wrapped as an
# MTLBuffer, so both host and device see the same memory. On-device it is
# accessed through the `ExceptionInfo` pointer wrapper; the property accessors
# hardcode this struct's byte offsets (0, 4, 8, 20), so do not reorder or
# retype fields without updating them.
struct ExceptionInfo_st
    # whether an exception has been encountered (0 -> 1)
    status::Int32

    # whether an exception is in the process of being reported (0 -> 1 -> 2)
    output_lock::Int32

    # who is reporting the exception: thread position within the threadgroup,
    # and threadgroup position within the grid
    thread::@NamedTuple{x::Int32,y::Int32,z::Int32}
    threadgroup::@NamedTuple{x::Int32,y::Int32,z::Int32}

    # all-zeros initial state: no exception, lock free, no reporter recorded
    ExceptionInfo_st() = new(0, 0,
                             (; x=Int32(0), y=Int32(0), z=Int32(0)),
                             (; x=Int32(0), y=Int32(0), z=Int32(0)))
end

function report_exception(ex)
# @cuprintf("""
# ERROR: a %s was thrown during kernel execution.
# Run Julia on debug level 2 for device stack traces.
# """, ex)
return

# To simplify use of this struct, which is passed by-reference, use property
# overloading on a raw pointer to it.
const ExceptionInfo = Ptr{ExceptionInfo_st}

# Read fields straight through the pointer. The byte offsets mirror the
# ExceptionInfo_st layout: status at 0, output_lock at 4, thread at 8,
# threadgroup at 20 (two Int32s followed by two 3xInt32 named tuples).
@inline function Base.getproperty(info::ExceptionInfo, sym::Symbol)
    if sym === :status
        unsafe_load(convert(Ptr{Int32}, info))
    elseif sym === :output_lock
        # XXX: atomic_load_explicit?
        unsafe_load(convert(Ptr{Int32}, info + 4))
    elseif sym === :output_lock_ptr
        # device-address-space pointer to the lock word, suitable for atomics
        reinterpret(LLVMPtr{Int32,AS.Device}, info + 4)
    elseif sym === :thread
        unsafe_load(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 8))
    elseif sym === :threadgroup
        unsafe_load(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 20))
    else
        # fall back to regular Ptr fields/behavior
        getfield(info, sym)
    end
end
# Write fields straight through the pointer; offsets must stay in sync with
# the ExceptionInfo_st layout and with getproperty above.
@inline function Base.setproperty!(info::ExceptionInfo, sym::Symbol, value)
    if sym === :status
        unsafe_store!(convert(Ptr{Int32}, info), value)
    elseif sym === :output_lock
        # XXX: atomic_store_explicit?
        unsafe_store!(convert(Ptr{Int32}, info + 4), value)
    elseif sym === :thread
        unsafe_store!(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 8), value)
    elseif sym === :threadgroup
        unsafe_store!(convert(Ptr{@NamedTuple{x::Int32,y::Int32,z::Int32}}, info + 20), value)
    else
        setfield!(info, sym, value)
    end
end

# NOTE(review): stale pre-diff remnant of report_oom; superseded by the
# expanded definition later in this file.
report_oom(sz) = return #@cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz)
# It's not useful to have several threads report exceptions, so use an output
# lock to only have a single thread write an exception message.
#
# Returns `true` when the calling thread holds the output lock — either by
# acquiring it just now, or because it acquired it during an earlier call (the
# lock is re-entrant for the owner, identified by the thread/threadgroup
# coordinates recorded in `info`). Returns `false` when another thread holds it.
@inline function lock_output!(info::ExceptionInfo)
    if atomic_compare_exchange_weak_explicit(info.output_lock_ptr, Int32(0), Int32(1)) == Int32(0)
        # we just took the lock, so note our position
        info.thread, info.threadgroup = thread_position_in_threadgroup_3d(),
                                        threadgroup_position_in_grid_3d()
        # NOTE(review): the recorded position is stored without a fence, so
        # other threads may read stale coordinates — confirm whether a
        # threadfence is required here (WIP)
        #threadfence()
        return true
    elseif info.output_lock == 1 &&
           info.thread == thread_position_in_threadgroup_3d() &&
           info.threadgroup == threadgroup_position_in_grid_3d()
        # we already have the lock
        return true
    else
        # somebody else has the lock
        return false
    end
end

# First reporting function called for a thrown exception: claims the output
# lock so only one thread reports, and would print the exception header
# (device-side printing is still disabled in this WIP).
function report_exception_name(ex)
    info = kernel_state().exception_info

    # this is the first reporting function being called, so claim the exception
    if lock_output!(info)
        #@cuprintf("ERROR: a %s was thrown during kernel execution on thread (%d, %d, %d) in block (%d, %d, %d).\n",
        #          ex, threadIdx().x, threadIdx().y, threadIdx().z, blockIdx().x, blockIdx().y, blockIdx().z)
        #@cuprintf("Stacktrace:\n")
    end
    return
end

# Report a single stack frame of the exception backtrace.
#
# Only the thread that owns the output lock may emit anything; every other
# thread returns immediately. Actual device-side printing is still disabled
# in this WIP (placeholder kept below).
function report_exception_frame(idx, func, file, line)
    exc_info = kernel_state().exception_info

    if lock_output!(exc_info)
        #@cuprintf(" [%d] %s at %s:%d\n", idx, func, file, line)
    end
    return
end

# Called by the GPU runtime when an exception is thrown on-device.
#
# If this thread owns the output lock, reporting is finalized by advancing the
# lock to state 2; in all cases the host-visible status word is then set to 1.
# Execution is not aborted here (see the notes below).
function signal_exception()
    info = kernel_state().exception_info

    # finalize output
    if lock_output!(info)
        #@cuprintf("\n")
        info.output_lock = 2
    end

    # inform the host by flipping the shared status word (0 -> 1)
    info.status = 1

    # XXX: threadgroup_barrier(MemoryFlagDevice) expects all threads to execute
    #      the barrier, so would deadlock
    #threadfence_system()

    # stop executing
    # XXX: we don't have a way to stop execution, so just return
    #      (GPUCompiler.jl will emit a trap instruction anyway)
    #exit()

    return
end


## kernel state

# Per-launch state passed to device code by the host: the launch path wraps
# the GPU address of the shared exception buffer in a KernelState, which
# device code retrieves via kernel_state().
struct KernelState
    # device pointer to the shared ExceptionInfo_st
    exception_info::ExceptionInfo
end

# Fetch the current kernel's KernelState through GPUCompiler's kernel-state
# mechanism; @generated so the lookup is resolved at compile time.
@inline @generated kernel_state() = GPUCompiler.kernel_state_value(KernelState)


## other

# Report an out-of-memory condition during dynamic GPU allocation.
#
# Currently a no-op returning `nothing`: device-side printing is not available
# yet, so the intended message is kept as a placeholder.
function report_oom(sz)
    #@cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %d bytes)\n", sz)
    return nothing
end