Hello,
I have a simple example which computes the mean of dirichlet distribution using quadrature. When I set ind_gpu = 1 to run on the gpu I get an error. When I set ind_gpu = 0, the code runs just fine and produces a reasonable answer for the mean of the distribution. Code and error message below:
Code:
Pkg.add(Pkg.PackageSpec(;name="Distributions", version="0.24.15"))
using Quadrature, Cuba, Cubature, Base.Threads
using Distributions, Random
using DataFrames, CSV
using Flux, CUDA
## User Inputs
N = 3
tol = 2
ind_gpu = 1 # indicator whether to use gpu
alg = CubaDivonne() #CubatureJLh() # CubaDivonne() #works for CubaCuhre, CubaDivonne CubaSUAVE, fails for cubavega
# Setting up Variables
if ind_gpu == 1
α0 = 10 .* (1 .- rand(N)) |> gpu
else
α0 = 10 .* (1 .- rand(N))
end
reltol_val = 10.0^(-tol)
abstol_val = 10.0^(-tol)
# Setting up function
dist_dirichlet_pdf(x,p) = Distributions.pdf(Distributions.Dirichlet(p),x)
function f_dirichlet(dx,x,p)
Threads.@threads for i in 1:N
dx[i] = (dist_dirichlet_pdf([x;1.00-sum(x)],p) .* [x;1.00-sum(x)])[i]
end
end
# Solving Integral
prob = QuadratureProblem(f_dirichlet,zeros(N-1),ones(N-1), α0, nout = N)
time_start = time()
mem_usage = @allocated sol_mean = Quadrature.solve(prob,alg,reltol=reltol_val,abstol=abstol_val)
total_mem = mem_usage/1000/2^20
# Checking Answer
mean_dirichlet(p) = p./sum(p)
display(mean_dirichlet(α0))
display(sol_mean)
'''
Error:
TaskFailedException
nested task error: InvalidIRError: compiling kernel partial_mapreduce_grid(typeof(identity), typeof(Base.add_sum), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(SpecialFunctions.loggamma), Tuple{CuDeviceVector{Float32, 1}}}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to lgammaf_r)
Stacktrace:
[1] logabsgamma
@ ~/.julia/packages/SpecialFunctions/mFAQ4/src/gamma.jl:627
[2] loggamma
@ ~/.julia/packages/SpecialFunctions/mFAQ4/src/gamma.jl:670
[3] _broadcast_getindex_evalf
@ broadcast.jl:648
[4] _broadcast_getindex
@ broadcast.jl:621
[5] getindex
@ broadcast.jl:575
[6] _map_getindex
@ ~/.julia/packages/CUDA/M4jkK/src/mapreduce.jl:80
[7] partial_mapreduce_grid
@ ~/.julia/packages/CUDA/M4jkK/src/mapreduce.jl:117
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(CUDA.partial_mapreduce_grid), Tuple{typeof(identity), typeof(Base.add_sum), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(SpecialFunctions.loggamma), Tuple{CuDeviceVector{Float32, 1}}}}}}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/validation.jl:123
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/XwWPj/src/driver.jl:288 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/4QAIk/src/TimerOutput.jl:206 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/XwWPj/src/driver.jl:286 [inlined]
[5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module, kernel::LLVM.Function; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/utils.jl:62
[6] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/M4jkK/src/compiler/execution.jl:306
[7] check_cache
@ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:44 [inlined]
[8] cached_compilation
@ ~/.julia/packages/CUDA/M4jkK/src/mapreduce.jl:87 [inlined]
[9] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(CUDA.partial_mapreduce_grid), Tuple{typeof(identity), typeof(Base.add_sum), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(SpecialFunctions.loggamma), Tuple{CuDeviceVector{Float32, 1}}}}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0
[10] cufunction(f::typeof(CUDA.partial_mapreduce_grid), tt::Type{Tuple{typeof(identity), typeof(Base.add_sum), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(SpecialFunctions.loggamma), Tuple{CuDeviceVector{Float32, 1}}}}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA ~/.julia/packages/CUDA/M4jkK/src/compiler/execution.jl:294
[11] cufunction
@ ~/.julia/packages/CUDA/M4jkK/src/compiler/execution.jl:288 [inlined]
[12] macro expansion
@ ~/.julia/packages/CUDA/M4jkK/src/compiler/execution.jl:102 [inlined]
[13] mapreducedim!(f::typeof(identity), op::typeof(Base.add_sum), R::CuArray{Float32, 1}, A::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(SpecialFunctions.loggamma), Tuple{CuArray{Float32, 1}}}; init::Float32)
@ CUDA ~/.julia/packages/CUDA/M4jkK/src/mapreduce.jl:192
[14] #_mapreduce#19
@ ~/.julia/packages/GPUArrays/bjw3g/src/host/mapreduce.jl:62 [inlined]
[15] #mapreduce#17
@ ~/.julia/packages/GPUArrays/bjw3g/src/host/mapreduce.jl:28 [inlined]
[16] mapreduce
@ ~/.julia/packages/GPUArrays/bjw3g/src/host/mapreduce.jl:28 [inlined]
[17] #_sum#682
@ ./reducedim.jl:878 [inlined]
[18] _sum
@ ./reducedim.jl:878 [inlined]
[19] #sum#680
@ ./reducedim.jl:874 [inlined]
[20] sum
@ ./reducedim.jl:874 [inlined]
[21] (Dirichlet{Float32, Ts, S} where {Ts<:AbstractVector{Float32}, S<:Real})(alpha::CuArray{Float32, 1}; check_args::Bool)
@ Distributions ~/.julia/packages/Distributions/cNe2C/src/multivariate/dirichlet.jl:33
[22] #Dirichlet#143
@ ~/.julia/packages/Distributions/cNe2C/src/multivariate/dirichlet.jl:39 [inlined]
[23] Dirichlet
@ ~/.julia/packages/Distributions/cNe2C/src/multivariate/dirichlet.jl:39 [inlined]
[24] dist_dirichlet_pdf(x::Vector{Float64}, p::CuArray{Float32, 1})
@ Main ./In[26]:26
[25] macro expansion
@ ./In[26]:29 [inlined]
[26] (::var"#483#threadsfor_fun#14"{Vector{Float64}, Vector{Float64}, CuArray{Float32, 1}, UnitRange{Int64}})(onethread::Bool)
@ Main ./threadingconstructs.jl:81
[27] (::var"#483#threadsfor_fun#14"{Vector{Float64}, Vector{Float64}, CuArray{Float32, 1}, UnitRange{Int64}})()
@ Main ./threadingconstructs.jl:48
Stacktrace:
[1] wait
@ ./task.jl:317 [inlined]
[2] threading_run(func::Function)
@ Base.Threads ./threadingconstructs.jl:34
[3] macro expansion
@ ./threadingconstructs.jl:93 [inlined]
[4] f_dirichlet(dx::Vector{Float64}, x::Vector{Float64}, p::CuArray{Float32, 1})
@ Main ./In[26]:28
[5] (::Quadrature.var"#77#93"{QuadratureProblem{true, CuArray{Float32, 1}, typeof(f_dirichlet), Vector{Float64}, Vector{Float64}, Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, Vector{Float64}, Vector{Float64}, CuArray{Float32, 1}})(x::Vector{Float64}, dx::Vector{Float64})
@ Quadrature ~/.julia/packages/Quadrature/NPUfc/src/Quadrature.jl:403
[6] generic_integrand!(ndim::Int32, x_::Ptr{Float64}, ncomp::Int32, f_::Ptr{Float64}, func!::Quadrature.var"#77#93"{QuadratureProblem{true, CuArray{Float32, 1}, typeof(f_dirichlet), Vector{Float64}, Vector{Float64}, Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, Vector{Float64}, Vector{Float64}, CuArray{Float32, 1}})
@ Cuba ~/.julia/packages/Cuba/KIQTB/src/Cuba.jl:92
[7] dointegrate!
@ ~/.julia/packages/Cuba/KIQTB/src/divonne.jl:52 [inlined]
[8] dointegrate
@ ~/.julia/packages/Cuba/KIQTB/src/Cuba.jl:195 [inlined]
[9] divonne(integrand::Quadrature.var"#77#93"{QuadratureProblem{true, CuArray{Float32, 1}, typeof(f_dirichlet), Vector{Float64}, Vector{Float64}, Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, Vector{Float64}, Vector{Float64}, CuArray{Float32, 1}}, ndim::Int64, ncomp::Int64; nvec::Int64, rtol::Float64, atol::Float64, flags::Int64, seed::Int64, minevals::Int64, maxevals::Int64, key1::Int64, key2::Int64, key3::Int64, maxpass::Int64, border::Float64, maxchisq::Float64, mindeviation::Float64, ngiven::Int64, ldxgiven::Int64, xgiven::Matrix{Float64}, nextra::Int64, peakfinder::Ptr{Nothing}, statefile::String, spin::Ptr{Nothing}, reltol::Missing, abstol::Missing)
@ Cuba ~/.julia/packages/Cuba/KIQTB/src/divonne.jl:145
[10] __solvebp_call(::QuadratureProblem{true, CuArray{Float32, 1}, typeof(f_dirichlet), Vector{Float64}, Vector{Float64}, Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, ::CubaDivonne, ::Quadrature.ReCallVJP{Quadrature.ZygoteVJP}, ::Vector{Float64}, ::Vector{Float64}, ::CuArray{Float32, 1}; reltol::Float64, abstol::Float64, maxiters::Int64, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Quadrature ~/.julia/packages/Quadrature/NPUfc/src/Quadrature.jl:463
[11] #__solvebp#11
@ ~/.julia/packages/Quadrature/NPUfc/src/Quadrature.jl:153 [inlined]
[12] solve(::QuadratureProblem{true, CuArray{Float32, 1}, typeof(f_dirichlet), Vector{Float64}, Vector{Float64}, Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, ::CubaDivonne; sensealg::Quadrature.ReCallVJP{Quadrature.ZygoteVJP}, kwargs::Base.Iterators.Pairs{Symbol, Float64, Tuple{Symbol, Symbol}, NamedTuple{(:reltol, :abstol), Tuple{Float64, Float64}}})
@ Quadrature ~/.julia/packages/Quadrature/NPUfc/src/Quadrature.jl:149
[13] top-level scope
@ ./timing.jl:321 [inlined]
[14] top-level scope
@ ./In[26]:0
[15] eval
@ ./boot.jl:360 [inlined]
[16] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base ./loading.jl:1094
'''