Skip to content

InvalidIRError when compiling TrixiCUDA.boundary_flux_kernel! #142

@LeeLizuoLiu

Description

@LeeLizuoLiu

Thanks for reminding!

Yes this is fixed for both Julia 1.10 and 1.11 with latest Trixi.jl

(@v1.10) pkg> st
Status `C:\Users\huiyu\.julia\environments\v1.10\Project.toml`
  [052768ef] CUDA v5.6.1
  [a7f1ee26] Trixi v0.10.1
(@v1.11) pkg> st
Status `C:\Users\huiyu\.julia\environments\v1.11\Project.toml`
  [052768ef] CUDA v5.6.1
  [a7f1ee26] Trixi v0.10.1

Originally posted by @huiyuxie in #70

Hi, the latest Trixi.jl and TrixiCUDA.jl are working — thanks for your work! But I get the following error when running the test for the problem below.

MWE

# Load the CPU reference solver (Trixi) and its CUDA port, plus CUDA.jl itself
using Trixi, TrixiCUDA
using CUDA
# Currently we need to allow scalar indexing on GPU arrays for the tests to pass,
# once the issues are resolved, this line can be removed.
CUDA.allowscalar(true)

# 1D shallow water equations; H0 is the constant total water height of the
# lake-at-rest state used by the initial condition below
equations = ShallowWaterEquations1D(gravity_constant = 9.812, H0 = 1.75)

"""
    initial_condition_stone_throw_discontinuous_bottom(x, t, equations)

Initial condition for `ShallowWaterEquations1D`: a flat lake at total height
`equations.H0` with a piecewise-constant (discontinuous) velocity profile and a
bottom topography that is forced to be discontinuous on part of the domain.
Returns the conservative variables via `prim2cons`.
"""
function initial_condition_stone_throw_discontinuous_bottom(x, t,
                                                            equations::ShallowWaterEquations1D)
    # Flat lake: total water height is constant everywhere
    H = equations.H0

    xi = x[1]

    # Discontinuous velocity: -1 on [-0.75, 0], +1 on [0, 0.75] (the first
    # branch wins at xi == 0), and 0 elsewhere
    v = 0.0
    if -0.75 <= xi <= 0.0
        v = -1.0
    elseif 0.0 <= xi <= 0.75
        v = 1.0
    end

    # Smooth two-bump bottom topography centered at xi = 1 and xi = -1
    b = (1.5 / exp(0.5 * ((xi - 1.0)^2)) +
         0.75 / exp(0.5 * ((xi + 1.0)^2)))

    # Force a discontinuous bottom topography on [-1.5, 0]
    if -1.5 <= xi <= 0.0
        b = 0.5
    end

    return prim2cons(SVector(H, v, b), equations)
end

initial_condition = initial_condition_stone_throw_discontinuous_bottom

# Slip-wall boundaries on both ends of the (non-periodic) 1D domain
boundary_condition = boundary_condition_slip_wall

# Flux pairs are (conservative, nonconservative) tuples, as required for the
# shallow water equations' nonconservative bottom-topography terms
volume_flux = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal)
surface_flux = (FluxHydrostaticReconstruction(flux_lax_friedrichs,
                                              hydrostatic_reconstruction_audusse_etal),
                flux_nonconservative_audusse_etal)
# Polynomial basis of degree 4; a CPU basis for the indicator and a GPU basis
# for the GPU solver
basis = LobattoLegendreBasis(4)
basis_gpu = LobattoLegendreBasisGPU(4)

# Shock-capturing volume integral blending DG with FV, driven by the
# Hennemann-Gassner indicator on waterheight_pressure
indicator_sc = IndicatorHennemannGassner(equations, basis,
                                         alpha_max = 0.5,
                                         alpha_min = 0.001,
                                         alpha_smooth = true,
                                         variable = waterheight_pressure)
volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
                                                 volume_flux_dg = volume_flux,
                                                 volume_flux_fv = surface_flux)

solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)

# Non-periodic TreeMesh on [-3, 3]; periodicity = false activates the
# boundary conditions above
coordinates_min = -3.0
coordinates_max = 3.0
mesh = TreeMesh(coordinates_min, coordinates_max,
                initial_refinement_level = 3,
                n_cells_max = 10_000,
                periodicity = false)

semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu,
                                           boundary_conditions = boundary_condition)

tspan = tspan_gpu = (0.0, 3.0)
t = t_gpu = 0.0

# Semi on GPU — unpack the fields the kernel launcher needs
equations_gpu, mesh_gpu, solver_gpu = semi_gpu.equations, semi_gpu.mesh, semi_gpu.solver
cache_gpu, cache_cpu = semi_gpu.cache_gpu, semi_gpu.cache_cpu
boundary_conditions_gpu, source_terms_gpu = semi_gpu.boundary_conditions, semi_gpu.source_terms

# ODE on GPU — build the ODE problem and wrap the flat state vectors into the
# solver's multidimensional array layout
ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
u_gpu_ = copy(ode_gpu.u0)
du_gpu_ = similar(u_gpu_)
u_gpu = TrixiCUDA.wrap_array(u_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)

# Direct call to the boundary-flux kernel launcher; this is the call that
# triggers the InvalidIRError reported below
TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
                              Trixi.have_nonconservative_terms(equations_gpu),
                              equations_gpu, solver_gpu, cache_gpu)

error info

(base) jovyan@workspace-0 ~/f/D/TrixiCUDA.jl> julia debug_shallow.jl 
┌ Warning: It's not recommended to use allowscalar([true]) to allow scalar indexing.
│ Instead, use `allowscalar() do end` or `@allowscalar` to denote exactly which operations can use scalar operations.
└ @ GPUArraysCore ~/.julia/packages/GPUArraysCore/aNaXo/src/GPUArraysCore.jl:184
ERROR: LoadError: InvalidIRError: compiling MethodInstance for TrixiCUDA.boundary_flux_kernel!(::CuDeviceArray{Float64, 3, 1}, ::CuDeviceArray{Float64, 3, 1}, ::CuDeviceMatrix{Float64, 1}, ::Float64, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{Int64, 1}, ::@NamedTuple{x_neg::typeof(boundary_condition_slip_wall), x_pos::typeof(boundary_condition_slip_wall)}, ::ShallowWaterEquations1D{Float64}, ::FluxHydrostaticReconstruction{FluxLaxFriedrichs{typeof(max_abs_speed_naive)}, typeof(hydrostatic_reconstruction_audusse_etal)}, ::typeof(flux_nonconservative_audusse_etal)) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to iterate)
Stacktrace:
 [1] indexed_iterate
   @ ./tuple.jl:162
 [2] boundary_flux_kernel!
   @ ~/DGCFN.jl/TrixiCUDA.jl/src/solvers/dg_1d_kernel.jl:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/validation.jl:167
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/OGnEB/src/driver.jl:381 [inlined]
  [3] emit_llvm(job::GPUCompiler.CompilerJob; toplevel::Bool, libraries::Bool, optimize::Bool, cleanup::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/utils.jl:110
  [4] emit_llvm
    @ ~/.julia/packages/GPUCompiler/OGnEB/src/utils.jl:108 [inlined]
  [5] codegen(output::Symbol, job::GPUCompiler.CompilerJob; toplevel::Bool, libraries::Bool, optimize::Bool, cleanup::Bool, validate::Bool, strip::Bool, only_entry::Bool, parent_job::Nothing)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/driver.jl:100
  [6] codegen
    @ ~/.julia/packages/GPUCompiler/OGnEB/src/driver.jl:82 [inlined]
  [7] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/driver.jl:79
  [8] compile
    @ ~/.julia/packages/GPUCompiler/OGnEB/src/driver.jl:74 [inlined]
  [9] #1171
    @ ~/.julia/packages/CUDA/sWPBr/src/compiler/compilation.jl:255 [inlined]
 [10] JuliaContext(f::CUDA.var"#1171#1174"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/driver.jl:34
 [11] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/driver.jl:25
 [12] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/sWPBr/src/compiler/compilation.jl:254
 [13] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/execution.jl:237
 [14] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/OGnEB/src/execution.jl:151
 [15] macro expansion
    @ ~/.julia/packages/CUDA/sWPBr/src/compiler/execution.jl:373 [inlined]
 [16] macro expansion
    @ ./lock.jl:273 [inlined]
 [17] cufunction(f::typeof(TrixiCUDA.boundary_flux_kernel!), tt::Type{Tuple{CuDeviceArray{Float64, 3, 1}, CuDeviceArray{Float64, 3, 1}, CuDeviceMatrix{Float64, 1}, Float64, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, @NamedTuple{x_neg::typeof(boundary_condition_slip_wall), x_pos::typeof(boundary_condition_slip_wall)}, ShallowWaterEquations1D{Float64}, FluxHydrostaticReconstruction{FluxLaxFriedrichs{typeof(max_abs_speed_naive)}, typeof(hydrostatic_reconstruction_audusse_etal)}, typeof(flux_nonconservative_audusse_etal)}}; kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/sWPBr/src/compiler/execution.jl:368
 [18] cufunction(f::typeof(TrixiCUDA.boundary_flux_kernel!), tt::Type{Tuple{CuDeviceArray{Float64, 3, 1}, CuDeviceArray{Float64, 3, 1}, CuDeviceMatrix{Float64, 1}, Float64, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, @NamedTuple{x_neg::typeof(boundary_condition_slip_wall), x_pos::typeof(boundary_condition_slip_wall)}, ShallowWaterEquations1D{Float64}, FluxHydrostaticReconstruction{FluxLaxFriedrichs{typeof(max_abs_speed_naive)}, typeof(hydrostatic_reconstruction_audusse_etal)}, typeof(flux_nonconservative_audusse_etal)}})
    @ CUDA ~/.julia/packages/CUDA/sWPBr/src/compiler/execution.jl:365
 [19] macro expansion
    @ ~/.julia/packages/CUDA/sWPBr/src/compiler/execution.jl:112 [inlined]
 [20] cuda_boundary_flux!(t::Float64, mesh::TreeMesh{1, Trixi.SerialTree{1, Float64}, Float64}, boundary_conditions::@NamedTuple{x_neg::typeof(boundary_condition_slip_wall), x_pos::typeof(boundary_condition_slip_wall)}, nonconservative_terms::Static.True, equations::ShallowWaterEquations1D{Float64}, dg::DGSEM{LobattoLegendreBasis{Float64, 5, CuArray{Float64, 1, CUDA.DeviceMemory}, Matrix{Float64}, Matrix{Float64}, CuArray{Float64, 2, CUDA.DeviceMemory}}, Trixi.LobattoLegendreMortarL2{Float64, 5, CuArray{Float64, 2, CUDA.DeviceMemory}, CuArray{Float64, 2, CUDA.DeviceMemory}}, SurfaceIntegralWeakForm{Tuple{FluxHydrostaticReconstruction{FluxLaxFriedrichs{typeof(max_abs_speed_naive)}, typeof(hydrostatic_reconstruction_audusse_etal)}, typeof(flux_nonconservative_audusse_etal)}}, VolumeIntegralShockCapturingHG{Tuple{typeof(flux_wintermeyer_etal), typeof(flux_nonconservative_wintermeyer_etal)}, Tuple{FluxHydrostaticReconstruction{FluxLaxFriedrichs{typeof(max_abs_speed_naive)}, typeof(hydrostatic_reconstruction_audusse_etal)}, typeof(flux_nonconservative_audusse_etal)}, IndicatorHennemannGassner{Float64, typeof(waterheight_pressure), @NamedTuple{alpha::Vector{Float64}, alpha_tmp::Vector{Float64}, indicator_threaded::Vector{Vector{Float64}}, modal_threaded::Vector{Vector{Float64}}}}}}, cache::@NamedTuple{elements::TrixiCUDA.ElementContainerGPU1D{Float64, Float64}, interfaces::TrixiCUDA.InterfaceContainerGPU1D{Float64}, boundaries::TrixiCUDA.BoundaryContainerGPU1D{Float64, Float64}, fstar1_L::CuArray{Float64, 3, CUDA.DeviceMemory}, fstar1_R::CuArray{Float64, 3, CUDA.DeviceMemory}})
    @ TrixiCUDA ~/DGCFN.jl/TrixiCUDA.jl/src/solvers/dg_1d.jl:451
 [21] top-level scope
    @ ~/DGCFN.jl/TrixiCUDA.jl/debug_shallow.jl:80
in expression starting at /home/jovyan/DGCFN.jl/TrixiCUDA.jl/debug_shallow.jl:80

Below is the Project.toml

name = "TrixiCUDA"
uuid = "5056661c-942a-472c-b3bc-01507f717a60"
authors = ["Huiyu Xie"]
version = "1.0.0-DEV"

[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"

[compat]
CUDA = "5.7"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions