From 90f018b4c23023e4a4025ef883e40599b694b46a Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Jul 2025 16:49:31 -0700 Subject: [PATCH 01/10] offload wrapper generation --- compiler/rustc_codegen_llvm/src/builder.rs | 1 + .../src/builder/gpu_offload.rs | 4 +- .../src/builder/gpu_wrapper.rs | 119 ++++++++++++++++++ compiler/rustc_codegen_llvm/src/context.rs | 17 +++ compiler/rustc_codegen_llvm/src/lib.rs | 15 +++ compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 9 +- 6 files changed, 162 insertions(+), 3 deletions(-) create mode 100644 compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index da2a153d819f9..68f972f65d822 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -4,6 +4,7 @@ use std::{iter, ptr}; pub(crate) mod autodiff; pub(crate) mod gpu_offload; +pub(crate) mod gpu_wrapper; use libc::{c_char, c_uint, size_t}; use rustc_abi as abi; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 1280ab1442a09..da7bde86463bc 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -12,7 +12,7 @@ use crate::llvm::{self, Linkage, Type, Value}; use crate::{LlvmCodegenBackend, SimpleCx, attributes}; pub(crate) fn handle_gpu_code<'ll>( - _cgcx: &CodegenContext, + cgcx: &CodegenContext, cx: &'ll SimpleCx<'_>, ) { // The offload memory transfer type for each kernel @@ -26,8 +26,8 @@ pub(crate) fn handle_gpu_code<'ll>( kernels.push(kernel); } } - gen_call_handling(&cx, &kernels, &o_types); + crate::builder::gpu_wrapper::gen_image_wrapper_module(&cgcx); } // What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper: diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs new file mode 100644 index 0000000000000..037208d656a81 --- /dev/null +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs @@ -0,0 +1,119 @@ +use std::ffi::CString; + +use llvm::Linkage::*; +use rustc_abi::Align; +use rustc_codegen_ssa::back::write::CodegenContext; +use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; + +use crate::builder::gpu_offload::*; +use crate::llvm::{self, Visibility}; +use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; + +pub(crate) fn create_struct_ty<'ll>( + cx: &'ll SimpleCx<'_>, + name: &str, + tys: &[&'ll llvm::Type], +) -> &'ll llvm::Type { + let entry_struct_name = CString::new(name).unwrap(); + unsafe { + let entry_struct = llvm::LLVMStructCreateNamed(cx.llcx, entry_struct_name.as_ptr()); + llvm::LLVMStructSetBody(entry_struct, tys.as_ptr(), tys.len() as u32, 0); + entry_struct + } +} + +// We don't copy types from other functions because we generate a new module and context. +// Bringing in types from other contexts would likely cause issues. 
+pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext) { + let dl_cstr = CString::new("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9").unwrap(); + let target_cstr = CString::new("amdgcn-amd-amdhsa").unwrap(); + let name = "offload.wrapper.module"; + let m: crate::ModuleLlvm = + ModuleLlvm::new_simple(name, dl_cstr.into_raw(), target_cstr.into_raw(), &cgcx).unwrap(); + let cx = SimpleCx::new(m.llmod(), m.llcx, cgcx.pointer_size); + let tptr = cx.type_ptr(); + let ti64 = cx.type_i64(); + let ti32 = cx.type_i32(); + let ti16 = cx.type_i16(); + + let entry_fields = [ti64, ti16, ti16, ti32, tptr, tptr, ti64, ti64, tptr]; + create_struct_ty(&cx, "__tgt_offload_entry", &entry_fields); + create_struct_ty(&cx, "__tgt_device_image", &[tptr, tptr, tptr, tptr]); + create_struct_ty(&cx, "__tgt_bin_desc", &[ti32, tptr, tptr, tptr]); + + let offload_entry_ty = add_tgt_offload_entry(&cx); + let offload_entry_arr = cx.type_array(offload_entry_ty, 0); + + let c_name = CString::new("__start_omp_offloading_entries").unwrap(); + let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, ExternalLinkage); + llvm::set_visibility(llglobal, Visibility::Hidden); + let c_name = CString::new("__stop_omp_offloading_entries").unwrap(); + let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, ExternalLinkage); + llvm::set_visibility(llglobal, Visibility::Hidden); + + let c_name = CString::new("__dummy.omp_offloading_entries").unwrap(); + let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, InternalLinkage); + let c_section_name = CString::new("omp_offloading_entries").unwrap(); + llvm::set_section(llglobal, &c_section_name); + let zeroinit = cx.const_null(offload_entry_arr); + llvm::set_initializer(llglobal, zeroinit); + + CString::new("llvm.compiler.used").unwrap(); + let arr_val = cx.const_array(tptr, &[llglobal]); + let c_section_name = CString::new("llvm.metadata").unwrap(); + let llglobal = add_global(&cx, "llvm.compiler.used", arr_val, AppendingLinkage); + llvm::set_section(llglobal, &c_section_name); + llvm::set_global_constant(llglobal, false); + + //@llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.omp_offloading_entries], section "llvm.metadata" + + let mapper_fn_ty = cx.type_func(&[tptr], cx.type_void()); + crate::declare::declare_simple_fn( + &cx, + &"__tgt_unregister_lib", + llvm::CallConv::CCallConv, + llvm::UnnamedAddr::No, + llvm::Visibility::Default, + mapper_fn_ty, + ); + crate::declare::declare_simple_fn( + &cx, + &"__tgt_register_lib", + llvm::CallConv::CCallConv, + llvm::UnnamedAddr::No, + llvm::Visibility::Default, + mapper_fn_ty, + ); + crate::declare::declare_simple_fn( + &cx, + &"atexit", + llvm::CallConv::CCallConv, + llvm::UnnamedAddr::No, + llvm::Visibility::Default, + cx.type_func(&[tptr], ti32), + ); + + let unknown_txt = "11111111111111"; + let c_entry_name = CString::new(unknown_txt).unwrap(); + let c_val = c_entry_name.as_bytes_with_nul(); + let initializer = crate::common::bytes_in_context(cx.llcx, c_val); + let llglobal = + add_unnamed_global(&cx, &".omp_offloading.device_image", initializer, InternalLinkage); + let 
c_section_name = CString::new(".llvm.offloading").unwrap(); + llvm::set_section(llglobal, &c_section_name); + llvm::set_alignment(llglobal, Align::EIGHT); + + unsafe { + llvm::LLVMPrintModuleToFile( + cx.llmod, + CString::new("rustmagic.openmp.image.wrapper.ll").unwrap().as_ptr(), + std::ptr::null_mut(), + ); + } +} diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs index ee77774c68832..2563d2e18e99d 100644 --- a/compiler/rustc_codegen_llvm/src/context.rs +++ b/compiler/rustc_codegen_llvm/src/context.rs @@ -159,6 +159,23 @@ fn to_llvm_tls_model(tls_model: TlsModel) -> llvm::ThreadLocalMode { } } +// FIXME(offload): This method is not relying on a tcx. We might still want to try to share some of +// the logic with create_module, e.g. the target_data_layout handling. +pub(crate) unsafe fn create_simple_module<'ll>( + llcx: &'ll llvm::Context, + target_data_layout: *const i8, + target_triple: *const i8, + mod_name: &str, +) -> &'ll llvm::Module { + let mod_name = SmallCStr::new(mod_name); + let llmod = unsafe { llvm::LLVMModuleCreateWithNameInContext(mod_name.as_ptr(), llcx) }; + unsafe { + llvm::LLVMSetDataLayout(llmod, target_data_layout); + llvm::LLVMSetTarget(llmod, target_triple); + } + llmod +} + pub(crate) unsafe fn create_module<'ll>( tcx: TyCtxt<'_>, llcx: &'ll llvm::Context, diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index ca84b6de8b11a..f25f7a3106176 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ b/compiler/rustc_codegen_llvm/src/lib.rs @@ -392,6 +392,21 @@ unsafe impl Send for ModuleLlvm {} unsafe impl Sync for ModuleLlvm {} impl ModuleLlvm { + fn new_simple( + name: &str, + dl_cstr: *const i8, + target_cstr: *const i8, + cgcx: &CodegenContext, + ) -> Result { + unsafe { + let llcx = llvm::LLVMRustContextCreate(false); + let llmod_raw = context::create_simple_module(llcx, dl_cstr, target_cstr, name); + let dcx = cgcx.create_dcx(); + let tm = ModuleLlvm::tm_from_cgcx(cgcx, name, dcx.handle())?; + Ok(ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) }) + } + } + fn new(tcx: TyCtxt<'_>, mod_name: &str) -> Self { unsafe { let llcx = llvm::LLVMRustContextCreate(tcx.sess.fewer_names()); diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 2443194ff4832..7bb2c6fcee658 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1015,16 +1015,23 @@ unsafe extern "C" { ) -> MetadataKindId; // Create modules. + pub(crate) fn LLVMCloneModule(M: &Module) -> &Module; pub(crate) fn LLVMModuleCreateWithNameInContext( ModuleID: *const c_char, C: &Context, ) -> &Module; - pub(crate) safe fn LLVMCloneModule(M: &Module) -> &Module; + pub(crate) fn LLVMPrintModuleToFile( + M: &Module, + Name: *const c_char, + Error_message: *mut c_char, + ); /// Data layout. See Module::getDataLayout. pub(crate) fn LLVMGetDataLayoutStr(M: &Module) -> *const c_char; pub(crate) fn LLVMSetDataLayout(M: &Module, Triple: *const c_char); + pub(crate) fn LLVMSetTarget(M: &Module, Name: *const c_char); + /// Append inline assembly to a module. See `Module::appendModuleInlineAsm`. 
pub(crate) fn LLVMAppendModuleInlineAsm( M: &Module, From 91bf2a9ae2dfb435c879c632e81eb6e54c5b6329 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Jul 2025 16:53:37 -0700 Subject: [PATCH 02/10] postpone device side generation --- compiler/rustc_codegen_llvm/src/builder.rs | 1 + .../src/builder/gpu_device.rs | 113 ++++++++++++++++++ compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 8 ++ compiler/rustc_codegen_llvm/src/llvm/mod.rs | 9 ++ 4 files changed, 131 insertions(+) create mode 100644 compiler/rustc_codegen_llvm/src/builder/gpu_device.rs diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 68f972f65d822..ad52e3ff931eb 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -3,6 +3,7 @@ use std::ops::Deref; use std::{iter, ptr}; pub(crate) mod autodiff; +pub(crate) mod gpu_device; pub(crate) mod gpu_offload; pub(crate) mod gpu_wrapper; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs new file mode 100644 index 0000000000000..63416743ca322 --- /dev/null +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs @@ -0,0 +1,113 @@ +use std::ffi::{CString, c_uint}; + +use llvm::Linkage::*; +use rustc_codegen_ssa::back::write::CodegenContext; + +use crate::llvm::{self, Linkage}; +use crate::{LlvmCodegenBackend, SimpleCx}; + +fn add_unnamed_global_in_addrspace<'ll>( + cx: &SimpleCx<'ll>, + name: &str, + initializer: &'ll llvm::Value, + l: Linkage, + addrspace: u32, +) -> &'ll llvm::Value { + let llglobal = add_global_in_addrspace(cx, name, initializer, l, addrspace); + unsafe { llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global) }; + llglobal +} + +pub(crate) fn add_global_in_addrspace<'ll>( + cx: &SimpleCx<'ll>, + name: &str, + initializer: &'ll llvm::Value, + l: Linkage, + addrspace: u32, +) -> &'ll llvm::Value { + let c_name = CString::new(name).unwrap(); + let llglobal: &'ll llvm::Value = llvm::add_global_in_addrspace( + cx.llmod, + cx.val_ty(initializer), + &c_name, + addrspace as c_uint, + ); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, l); + llvm::set_initializer(llglobal, initializer); + llglobal +} + +#[allow(unused)] +pub(crate) fn gen_asdf<'ll>(cgcx: &CodegenContext, _old_cx: &SimpleCx<'ll>) { + let llcx = unsafe { llvm::LLVMRustContextCreate(false) }; + let module_name = CString::new("offload.wrapper.module").unwrap(); + let llmod = unsafe { llvm::LLVMModuleCreateWithNameInContext(module_name.as_ptr(), llcx) }; + let cx = SimpleCx::new(llmod, llcx, cgcx.pointer_size); + let initializer = cx.get_const_i32(0); + add_unnamed_global_in_addrspace(&cx, "__omp_rtl_debug_kind", initializer, WeakODRLinkage, 1); + add_unnamed_global_in_addrspace( + &cx, + "__omp_rtl_assume_teams_oversubscription", + initializer, + WeakODRLinkage, + 1, + ); + add_unnamed_global_in_addrspace( + &cx, + "__omp_rtl_assume_threads_oversubscription", + initializer, + WeakODRLinkage, + 1, + ); + add_unnamed_global_in_addrspace( + &cx, + "__omp_rtl_assume_no_thread_state", + initializer, + WeakODRLinkage, + 1, + ); + add_unnamed_global_in_addrspace( + &cx, + "__oclc_ABI_version", + cx.get_const_i32(500), + WeakODRLinkage, + 4, + ); + unsafe { + llvm::LLVMPrintModuleToFile( + llmod, + CString::new("rustmagic-openmp-amdgcn-amd-amdhsa-gfx90a.ll").unwrap().as_ptr(), + std::ptr::null_mut(), + ); + + // Clean up + llvm::LLVMDisposeModule(llmod); + llvm::LLVMContextDispose(llcx); + } + // TODO: 
addressspace 1 or 4 +} +// source_filename = "mem.cpp" +// GPU: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +// CPU: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +// target triple = "amdgcn-amd-amdhsa" +// +// @__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_teams_oversubscription = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_threads_oversubscription = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 +// +// !llvm.module.flags = !{!0, !1, !2, !3, !4} +// !opencl.ocl.version = !{!5} +// !llvm.ident = !{!6, !7} +// +// !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +// !1 = !{i32 1, !"wchar_size", i32 4} +// !2 = !{i32 7, !"openmp", i32 51} +// !3 = !{i32 7, !"openmp-device", i32 51} +// !4 = !{i32 8, !"PIC Level", i32 2} +// !5 = !{i32 2, i32 0} +// !6 = !{!"clang version 20.1.5-rust-1.89.0-nightly (https://github.com/rust-lang/llvm-project.git c1118fdbb3024157df7f4cfe765f2b0b4339e8a2)"} +// !7 = !{!"AMD clang version 19.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.4.0 25133 c7fe45cf4b819c5991fe208aaa96edf142730f1d)"} diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 7bb2c6fcee658..6d35a911572f5 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1025,6 +1025,8 @@ unsafe extern "C" { Name: *const c_char, Error_message: *mut c_char, ); + pub(crate) fn LLVMCloneModule(M: &Module) -> &Module; + pub(crate) fn LLVMDisposeModule(M: &Module); /// Data layout. See Module::getDataLayout. 
pub(crate) fn LLVMGetDataLayoutStr(M: &Module) -> *const c_char; @@ -1192,6 +1194,12 @@ unsafe extern "C" { // Operations on global variables pub(crate) safe fn LLVMIsAGlobalVariable(GlobalVar: &Value) -> Option<&Value>; pub(crate) fn LLVMAddGlobal<'a>(M: &'a Module, Ty: &'a Type, Name: *const c_char) -> &'a Value; + pub(crate) fn LLVMAddGlobalInAddressSpace<'a>( + M: &'a Module, + Ty: &'a Type, + Name: *const c_char, + addrspace: c_uint, + ) -> &'a Value; pub(crate) fn LLVMGetNamedGlobal(M: &Module, Name: *const c_char) -> Option<&Value>; pub(crate) fn LLVMGetFirstGlobal(M: &Module) -> Option<&Value>; pub(crate) fn LLVMGetNextGlobal(GlobalVar: &Value) -> Option<&Value>; diff --git a/compiler/rustc_codegen_llvm/src/llvm/mod.rs b/compiler/rustc_codegen_llvm/src/llvm/mod.rs index 154ba4fd69018..a9035ef231f2d 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/mod.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/mod.rs @@ -251,6 +251,15 @@ pub(crate) fn add_global<'a>(llmod: &'a Module, ty: &'a Type, name_cstr: &CStr) unsafe { LLVMAddGlobal(llmod, ty, name_cstr.as_ptr()) } } +pub(crate) fn add_global_in_addrspace<'a>( + llmod: &'a Module, + ty: &'a Type, + name_cstr: &CStr, + addrspace: c_uint, +) -> &'a Value { + unsafe { LLVMAddGlobalInAddressSpace(llmod, ty, name_cstr.as_ptr(), addrspace) } +} + pub(crate) fn set_initializer(llglobal: &Value, constant_val: &Value) { unsafe { LLVMSetInitializer(llglobal, constant_val); From 36fa060ad0b7ab0e518a01615c00648e4d933615 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 10 Jul 2025 15:09:34 -0700 Subject: [PATCH 03/10] fix device code --- .../src/builder/gpu_offload.rs | 2 +- .../src/builder/gpu_wrapper.rs | 73 ++++++++----------- 2 files changed, 33 insertions(+), 42 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index da7bde86463bc..b87806b8807b3 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -248,7 +248,7 @@ fn gen_define_handling<'ll>( o_types } -fn declare_offload_fn<'ll>( +pub(crate) fn declare_offload_fn<'ll>( cx: &'ll SimpleCx<'_>, name: &str, ty: &'ll llvm::Type, diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs index 037208d656a81..22aed90f8c188 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs @@ -1,12 +1,12 @@ use std::ffi::CString; use llvm::Linkage::*; -use rustc_abi::Align; +use rustc_abi::{AddressSpace, Align}; use rustc_codegen_ssa::back::write::CodegenContext; use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; use crate::builder::gpu_offload::*; -use crate::llvm::{self, Visibility}; +use crate::llvm::{self, Linkage, Type, Value, Visibility}; use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; pub(crate) fn create_struct_ty<'ll>( @@ -22,6 +22,23 @@ pub(crate) fn create_struct_ty<'ll>( } } +pub(crate) fn add_global_decl<'ll>( + cx: &SimpleCx<'ll>, + ty: &'ll Type, + name: &str, + l: Linkage, + hidden: bool, +) -> &'ll llvm::Value { + let c_name = CString::new(name).unwrap(); + let llglobal: &'ll llvm::Value = llvm::add_global(cx.llmod, ty, &c_name); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, l); + if hidden { + llvm::set_visibility(llglobal, Visibility::Hidden); + } + llglobal +} + // We don't copy types from other functions because we generate a new module and 
context. // Bringing in types from other contexts would likely cause issues. pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext) { @@ -32,6 +49,7 @@ pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext ModuleLlvm::new_simple(name, dl_cstr.into_raw(), target_cstr.into_raw(), &cgcx).unwrap(); let cx = SimpleCx::new(m.llmod(), m.llcx, cgcx.pointer_size); let tptr = cx.type_ptr(); + let tptr1 = cx.type_ptr_ext(AddressSpace(1)); let ti64 = cx.type_i64(); let ti32 = cx.type_i32(); let ti16 = cx.type_i16(); @@ -44,28 +62,22 @@ pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext let offload_entry_ty = add_tgt_offload_entry(&cx); let offload_entry_arr = cx.type_array(offload_entry_ty, 0); - let c_name = CString::new("__start_omp_offloading_entries").unwrap(); - let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, ExternalLinkage); - llvm::set_visibility(llglobal, Visibility::Hidden); - let c_name = CString::new("__stop_omp_offloading_entries").unwrap(); - let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, ExternalLinkage); - llvm::set_visibility(llglobal, Visibility::Hidden); + let name = "__start_omp_offloading_entries"; + add_global_decl(&cx, offload_entry_arr, name, ExternalLinkage, true); + + let name = "__stop_omp_offloading_entries"; + add_global_decl(&cx, offload_entry_arr, name, ExternalLinkage, true); + + let name = "__dummy.omp_offloading_entries"; + let llglobal = add_global_decl(&cx, offload_entry_arr, name, InternalLinkage, false); - let c_name = CString::new("__dummy.omp_offloading_entries").unwrap(); - let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, InternalLinkage); let c_section_name = CString::new("omp_offloading_entries").unwrap(); llvm::set_section(llglobal, &c_section_name); let zeroinit = cx.const_null(offload_entry_arr); llvm::set_initializer(llglobal, zeroinit); CString::new("llvm.compiler.used").unwrap(); - let arr_val = cx.const_array(tptr, &[llglobal]); + let arr_val = cx.const_array(tptr1, &[llglobal]); let c_section_name = CString::new("llvm.metadata").unwrap(); let llglobal = add_global(&cx, "llvm.compiler.used", arr_val, AppendingLinkage); llvm::set_section(llglobal, &c_section_name); @@ -74,30 +86,9 @@ pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext //@llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.omp_offloading_entries], section "llvm.metadata" let mapper_fn_ty = cx.type_func(&[tptr], cx.type_void()); - crate::declare::declare_simple_fn( - &cx, - &"__tgt_unregister_lib", - llvm::CallConv::CCallConv, - llvm::UnnamedAddr::No, - llvm::Visibility::Default, - mapper_fn_ty, - ); - crate::declare::declare_simple_fn( - &cx, - &"__tgt_register_lib", - llvm::CallConv::CCallConv, - llvm::UnnamedAddr::No, - llvm::Visibility::Default, - mapper_fn_ty, - ); - crate::declare::declare_simple_fn( - &cx, - &"atexit", - llvm::CallConv::CCallConv, - llvm::UnnamedAddr::No, - llvm::Visibility::Default, - cx.type_func(&[tptr], ti32), - ); + declare_offload_fn(&cx, &"__tgt_register_lib", mapper_fn_ty); + declare_offload_fn(&cx, &"__tgt_unregister_lib", mapper_fn_ty); + declare_offload_fn(&cx, &"atexit", cx.type_func(&[tptr], ti32)); let unknown_txt = "11111111111111"; let c_entry_name = CString::new(unknown_txt).unwrap(); From 
915b7d10ecaead61ca82a5fe8de22b55e104c923 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 22 Jul 2025 12:46:35 -0700 Subject: [PATCH 04/10] fixup --- compiler/rustc_codegen_llvm/src/back/write.rs | 2 +- compiler/rustc_codegen_llvm/src/builder/gpu_device.rs | 2 +- compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs | 2 +- compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs index 85a06f457ebea..2cd45f48ffc17 100644 --- a/compiler/rustc_codegen_llvm/src/back/write.rs +++ b/compiler/rustc_codegen_llvm/src/back/write.rs @@ -926,7 +926,7 @@ pub(crate) fn codegen( // binaries. So we must clone the module to produce the asm output // if we are also producing object code. let llmod = if let EmitObj::ObjectCode(_) = config.emit_obj { - llvm::LLVMCloneModule(llmod) + unsafe { llvm::LLVMCloneModule(llmod) } } else { llmod }; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs index 63416743ca322..cb957872fec05 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs @@ -14,7 +14,7 @@ fn add_unnamed_global_in_addrspace<'ll>( addrspace: u32, ) -> &'ll llvm::Value { let llglobal = add_global_in_addrspace(cx, name, initializer, l, addrspace); - unsafe { llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global) }; + llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global); llglobal } diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs index 22aed90f8c188..f7cf6c906392f 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs @@ -6,7 +6,7 @@ use rustc_codegen_ssa::back::write::CodegenContext; use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; use crate::builder::gpu_offload::*; -use crate::llvm::{self, Linkage, Type, Value, Visibility}; +use crate::llvm::{self, Linkage, Type, Visibility}; use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; pub(crate) fn create_struct_ty<'ll>( diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 6d35a911572f5..7501447ec0d69 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1015,7 +1015,6 @@ unsafe extern "C" { ) -> MetadataKindId; // Create modules. 
- pub(crate) fn LLVMCloneModule(M: &Module) -> &Module; pub(crate) fn LLVMModuleCreateWithNameInContext( ModuleID: *const c_char, C: &Context, From 70f0fe3fb23d4b4168dfbf8966ef5c159c631614 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 30 Jul 2025 17:28:51 -0700 Subject: [PATCH 05/10] add tgt_target_kernel helper and test update --- .../src/builder/gpu_offload.rs | 26 ++++++++++++++++--- tests/codegen-llvm/gpu_offload/gpu_host.rs | 7 +++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index b87806b8807b3..9da171ef15e93 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -27,9 +27,25 @@ pub(crate) fn handle_gpu_code<'ll>( } } gen_call_handling(&cx, &kernels, &o_types); + generate_launcher(&cx); crate::builder::gpu_wrapper::gen_image_wrapper_module(&cgcx); } +// ; Function Attrs: nounwind +// declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2 +fn generate_launcher<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value { + let tptr = cx.type_ptr(); + let ti64 = cx.type_i64(); + let ti32 = cx.type_i32(); + let args = vec![tptr, ti64, ti32, ti32, tptr, tptr]; + let tgt_fn_ty = cx.type_func(&args, ti32); + let name = "__tgt_target_kernel"; + let tgt_decl = declare_offload_fn(&cx, name, tgt_fn_ty); + let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx); + attributes::apply_to_llfn(tgt_decl, Function, &[nounwind]); + tgt_decl +} + // What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper: // @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 // @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 @@ -83,7 +99,7 @@ pub(crate) fn add_tgt_offload_entry<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Ty offload_entry_ty } -fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) { +fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type { let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments"); let tptr = cx.type_ptr(); let ti64 = cx.type_i64(); @@ -118,9 +134,10 @@ fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) { vec![ti32, ti32, tptr, tptr, tptr, tptr, tptr, tptr, ti64, ti64, tarr, tarr, ti32]; cx.set_struct_body(kernel_arguments_ty, &kernel_elements, false); + kernel_arguments_ty // For now we don't handle kernels, so for now we just add a global dummy // to make sure that the __tgt_offload_entry is defined and handled correctly. - cx.declare_global("my_struct_global2", kernel_arguments_ty); + //cx.declare_global("my_struct_global2", kernel_arguments_ty); } fn gen_tgt_data_mappers<'ll>( @@ -295,7 +312,7 @@ fn gen_call_handling<'ll>( let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc"); cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false); - gen_tgt_kernel_global(&cx); + let tgt_kernel_decl = gen_tgt_kernel_global(&cx); let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx); let main_fn = cx.get_function("main"); @@ -329,6 +346,9 @@ fn gen_call_handling<'ll>( // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16. 
let ty2 = cx.type_array(cx.type_i64(), num_args); let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes"); + + let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args"); + //%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 // Now we allocate once per function param, a copy to be passed to one of our maps. let mut vals = vec![]; let mut geps = vec![]; diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs index 513e27426bc0e..d34502a730862 100644 --- a/tests/codegen-llvm/gpu_offload/gpu_host.rs +++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs @@ -21,16 +21,15 @@ fn main() { } // CHECK: %struct.__tgt_offload_entry = type { i64, i16, i16, i32, ptr, ptr, i64, i64, ptr } -// CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 } // CHECK: %struct.ident_t = type { i32, i32, i32, i32, ptr } // CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } +// CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 } // CHECK: @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 1024] // CHECK: @.offload_maptypes.1 = private unnamed_addr constant [1 x i64] [i64 3] // CHECK: @.kernel_1.region_id = weak unnamed_addr constant i8 0 // CHECK: @.offloading.entry_name.1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00", section ".llvm.rodata.offloading", align 1 // CHECK: @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section ".omp_offloading_entries", align 1 -// CHECK: @my_struct_global2 = external global %struct.__tgt_kernel_arguments // CHECK: @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 // CHECK: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 @@ -43,6 +42,7 @@ fn main() { // CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8 // CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8 // CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8 +// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 // CHECK-NEXT: %x.addr = alloca ptr, align 8 // CHECK-NEXT: store ptr %x, ptr %x.addr, align 8 // CHECK-NEXT: %1 = load ptr, ptr %x.addr, align 8 @@ -71,6 +71,9 @@ fn main() { // CHECK: ret void // CHECK-NEXT: } +// CHECK: Function Attrs: nounwind +// CHECK: declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) + #[unsafe(no_mangle)] #[inline(never)] pub fn kernel_1(x: &mut [f32; 256]) { From cb5250876415089bec4e94ba86b81d87517f2648 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 30 Jul 2025 17:48:11 -0700 Subject: [PATCH 06/10] wip --- .../src/builder/gpu_offload.rs | 49 ++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 9da171ef15e93..505e5285968de 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -26,8 +26,8 @@ pub(crate) fn handle_gpu_code<'ll>( kernels.push(kernel); } } - gen_call_handling(&cx, &kernels, &o_types); generate_launcher(&cx); + gen_call_handling(&cx, &kernels, &o_types); crate::builder::gpu_wrapper::gen_image_wrapper_module(&cgcx); } @@ -347,8 +347,9 @@ 
fn gen_call_handling<'ll>( let ty2 = cx.type_array(cx.type_i64(), num_args); let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes"); - let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args"); //%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 + let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args"); + // Now we allocate once per function param, a copy to be passed to one of our maps. let mut vals = vec![]; let mut geps = vec![]; @@ -441,7 +442,51 @@ fn gen_call_handling<'ll>( // Step 3) // Here we will add code for the actual kernel launches in a follow-up PR. + //%28 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 0 + //store i32 3, ptr %28, align 4 + //%29 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 1 + //store i32 3, ptr %29, align 4 + //%30 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 2 + //store ptr %26, ptr %30, align 8 + //%31 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 3 + //store ptr %27, ptr %31, align 8 + //%32 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 4 + //store ptr @.offload_sizes, ptr %32, align 8 + //%33 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 5 + //store ptr @.offload_maptypes, ptr %33, align 8 + //%34 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 6 + //store ptr null, ptr %34, align 8 + //%35 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 7 + //store ptr null, ptr %35, align 8 + //%36 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 8 + //store i64 0, ptr %36, align 8 + //%37 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 9 + //store i64 0, ptr %37, align 8 + //%38 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 10 + //store [3 x i32] [i32 2097152, i32 0, i32 0], ptr %38, align 4 + //%39 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 11 + //store [3 x i32] [i32 256, i32 0, i32 0], ptr %39, align 4 + //%40 = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 12 + //store i32 0, ptr %40, align 4 // FIXME(offload): launch kernels + let mut values = vec![]; + values.push(cx.get_const_i32(3)); + values.push(cx.get_const_i32(3)); + values.push(geps.0); + values.push(geps.1); + values.push(geps.2); + values.push(o_types[0]); + values.push(cx.const_null(cx.type_ptr())); + values.push(cx.const_null(cx.type_ptr())); + values.push(cx.get_const_i64(0)); + values.push(cx.get_const_i64(0)); + values.push(); + values.push(); + values.push(cx.get_const_i32(0)); + for (value, i) in values.enumerate() { + let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, cx.get_const_i32(i)]); + builder.store(p, alloca, Align::EIGHT); + } // Step 4) unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, kernel_call) }; From cc13fc32b378f476dc633c444ec27f002a5fbd1f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 30 Jul 2025 18:29:33 -0700 Subject: [PATCH 07/10] generate more geps and stores for kernel_args --- .../src/builder/gpu_offload.rs | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git 
a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 505e5285968de..0159783492e77 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -99,7 +99,7 @@ pub(crate) fn add_tgt_offload_entry<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Ty offload_entry_ty } -fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type { +fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) -> (&'ll llvm::Type, Vec<&'ll llvm::Type>) { let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments"); let tptr = cx.type_ptr(); let ti64 = cx.type_i64(); @@ -134,7 +134,7 @@ fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type { vec![ti32, ti32, tptr, tptr, tptr, tptr, tptr, tptr, ti64, ti64, tarr, tarr, ti32]; cx.set_struct_body(kernel_arguments_ty, &kernel_elements, false); - kernel_arguments_ty + (kernel_arguments_ty, kernel_elements) // For now we don't handle kernels, so for now we just add a global dummy // to make sure that the __tgt_offload_entry is defined and handled correctly. //cx.declare_global("my_struct_global2", kernel_arguments_ty); @@ -312,7 +312,7 @@ fn gen_call_handling<'ll>( let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc"); cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false); - let tgt_kernel_decl = gen_tgt_kernel_global(&cx); + let (tgt_kernel_decl, tgt_kernel_types) = gen_tgt_kernel_global(&cx); let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx); let main_fn = cx.get_function("main"); @@ -470,22 +470,25 @@ fn gen_call_handling<'ll>( //store i32 0, ptr %40, align 4 // FIXME(offload): launch kernels let mut values = vec![]; - values.push(cx.get_const_i32(3)); - values.push(cx.get_const_i32(3)); - values.push(geps.0); - values.push(geps.1); - values.push(geps.2); - values.push(o_types[0]); - values.push(cx.const_null(cx.type_ptr())); - values.push(cx.const_null(cx.type_ptr())); - values.push(cx.get_const_i64(0)); - values.push(cx.get_const_i64(0)); - values.push(); - values.push(); - values.push(cx.get_const_i32(0)); - for (value, i) in values.enumerate() { - let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, cx.get_const_i32(i)]); - builder.store(p, alloca, Align::EIGHT); + values.push((4, cx.get_const_i32(3))); + values.push((4, cx.get_const_i32(3))); + values.push((8, geps.0)); + values.push((8, geps.1)); + values.push((8, geps.2)); + values.push((8, o_types[0])); + values.push((8, cx.const_null(cx.type_ptr()))); + values.push((8, cx.const_null(cx.type_ptr()))); + values.push((8, cx.get_const_i64(0))); + values.push((8, cx.get_const_i64(0))); + let ti32 = cx.type_i32(); + let ci32_0 = cx.get_const_i32(0); + values.push((8, cx.const_array(ti32, &vec![cx.get_const_i32(2097152), ci32_0, ci32_0]))); + values.push((8, cx.const_array(ti32, &vec![cx.get_const_i32(256), ci32_0, ci32_0]))); + values.push((4, cx.get_const_i32(0))); + + for (i, value) in values.iter().enumerate() { + let ptr = builder.inbounds_gep(tgt_kernel_decl, a5, &[i32_0, cx.get_const_i32(i as u64)]); + builder.store(value.1, ptr, Align::from_bytes(value.0).unwrap()); } // Step 4) From d60482e4530f261c6a32ab475225d7c668fdd8bf Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 4 Aug 2025 14:09:42 -0700 Subject: [PATCH 08/10] host might be ready? 
--- .../src/builder/gpu_offload.rs | 38 +++++++++++++++---- compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 2 + tests/codegen-llvm/gpu_offload/gpu_host.rs | 36 +++++++++++++++--- 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 0159783492e77..3b0f5f20f00d3 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -18,22 +18,24 @@ pub(crate) fn handle_gpu_code<'ll>( // The offload memory transfer type for each kernel let mut o_types = vec![]; let mut kernels = vec![]; + let mut region_ids = vec![]; let offload_entry_ty = add_tgt_offload_entry(&cx); for num in 0..9 { let kernel = cx.get_function(&format!("kernel_{num}")); if let Some(kernel) = kernel { - o_types.push(gen_define_handling(&cx, kernel, offload_entry_ty, num)); + let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num); + o_types.push(o); + region_ids.push(k); kernels.push(kernel); } } - generate_launcher(&cx); - gen_call_handling(&cx, &kernels, &o_types); + gen_call_handling(&cx, &kernels, &o_types, ®ion_ids); crate::builder::gpu_wrapper::gen_image_wrapper_module(&cgcx); } // ; Function Attrs: nounwind // declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2 -fn generate_launcher<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value { +fn generate_launcher<'ll>(cx: &'ll SimpleCx<'_>) -> (&'ll llvm::Value, &'ll llvm::Type) { let tptr = cx.type_ptr(); let ti64 = cx.type_i64(); let ti32 = cx.type_i32(); @@ -43,7 +45,7 @@ fn generate_launcher<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value { let tgt_decl = declare_offload_fn(&cx, name, tgt_fn_ty); let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx); attributes::apply_to_llfn(tgt_decl, Function, &[nounwind]); - tgt_decl + (tgt_decl, tgt_fn_ty) } // What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper: @@ -204,7 +206,7 @@ fn gen_define_handling<'ll>( kernel: &'ll llvm::Value, offload_entry_ty: &'ll llvm::Type, num: i64, -) -> &'ll llvm::Value { +) -> (&'ll llvm::Value, &'ll llvm::Value) { let types = cx.func_params_types(cx.get_type_of_global(kernel)); // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. 
@@ -262,7 +264,7 @@ fn gen_define_handling<'ll>( llvm::set_alignment(llglobal, Align::ONE); let c_section_name = CString::new(".omp_offloading_entries").unwrap(); llvm::set_section(llglobal, &c_section_name); - o_types + (o_types, region_id) } pub(crate) fn declare_offload_fn<'ll>( @@ -304,7 +306,9 @@ fn gen_call_handling<'ll>( cx: &'ll SimpleCx<'_>, _kernels: &[&'ll llvm::Value], o_types: &[&'ll llvm::Value], + region_ids: &[&'ll llvm::Value], ) { + let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } let tptr = cx.type_ptr(); let ti32 = cx.type_i32(); @@ -491,8 +495,26 @@ fn gen_call_handling<'ll>( builder.store(value.1, ptr, Align::from_bytes(value.0).unwrap()); } + let args = vec![ + s_ident_t, + // MAX == -1 + cx.get_const_i64(u64::MAX), + cx.get_const_i32(2097152), + cx.get_const_i32(256), + region_ids[0], + a5, + ]; + let offload_success = builder.call(tgt_target_kernel_ty, tgt_decl, &args, None); + // %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args) + unsafe { + let next = llvm::LLVMGetNextInstruction(offload_success).unwrap(); + dbg!(&next); + llvm::LLVMRustPositionAfter(builder.llbuilder, next); + llvm::LLVMInstructionEraseFromParent(next); + } + // Step 4) - unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, kernel_call) }; + //unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, kernel_call) }; let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4); generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t); diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 7501447ec0d69..b6fe83ff3a88f 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1252,6 +1252,8 @@ unsafe extern "C" { pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>; pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock; pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>; + pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>; + pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value); // Operations on call sites pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint); diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs index d34502a730862..4f7d2b4f7783c 100644 --- a/tests/codegen-llvm/gpu_offload/gpu_host.rs +++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs @@ -60,11 +60,37 @@ fn main() { // CHECK-NEXT: %7 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 // CHECK-NEXT: %8 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 // CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %6, ptr %7, ptr %8, ptr @.offload_maptypes.1, ptr null, ptr null) -// CHECK-NEXT: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x) -// CHECK-NEXT: %9 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 -// CHECK-NEXT: %10 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 -// CHECK-NEXT: %11 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 -// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %9, ptr %10, ptr %11, ptr @.offload_maptypes.1, ptr null, ptr null) +// CHECK-NEXT: %9 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr 
%kernel_args, i32 0, i32 0 +// CHECK-NEXT: store i32 3, ptr %9, align 4 +// CHECK-NEXT: %10 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 1 +// CHECK-NEXT: store i32 3, ptr %10, align 4 +// CHECK-NEXT: %11 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 2 +// CHECK-NEXT: store ptr %6, ptr %11, align 8 +// CHECK-NEXT: %12 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 3 +// CHECK-NEXT: store ptr %7, ptr %12, align 8 +// CHECK-NEXT: %13 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 4 +// CHECK-NEXT: store ptr %8, ptr %13, align 8 +// CHECK-NEXT: %14 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 5 +// CHECK-NEXT: store ptr @.offload_maptypes.1, ptr %14, align 8 +// CHECK-NEXT: %15 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 6 +// CHECK-NEXT: store ptr null, ptr %15, align 8 +// CHECK-NEXT: %16 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 7 +// CHECK-NEXT: store ptr null, ptr %16, align 8 +// CHECK-NEXT: %17 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 8 +// CHECK-NEXT: store i64 0, ptr %17, align 8 +// CHECK-NEXT: %18 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 9 +// CHECK-NEXT: store i64 0, ptr %18, align 8 +// CHECK-NEXT: %19 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 10 +// CHECK-NEXT: store [3 x i32] [i32 2097152, i32 0, i32 0], ptr %19, align 8 +// CHECK-NEXT: %20 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 11 +// CHECK-NEXT: store [3 x i32] [i32 256, i32 0, i32 0], ptr %20, align 8 +// CHECK-NEXT: %21 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 12 +// CHECK-NEXT: store i32 0, ptr %21, align 4 +// CHECK-NEXT: %22 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args) +// CHECK-NEXT: %23 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 +// CHECK-NEXT: %24 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 +// CHECK-NEXT: %25 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 +// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %23, ptr %24, ptr %25, ptr @.offload_maptypes.1, ptr null, ptr null) // CHECK-NEXT: call void @__tgt_unregister_lib(ptr %EmptyDesc) // CHECK: store ptr %x, ptr %0, align 8 // CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) From 9d28bd2d0bcb5b8c4030a440c9067ada7b6dc785 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 4 Aug 2025 16:26:37 -0700 Subject: [PATCH 09/10] fix illegal usage of parameter of kernel function in main function --- compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 3b0f5f20f00d3..738fcd022382e 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -366,7 +366,8 @@ fn gen_call_handling<'ll>( let arg_name = format!("{name}.addr"); let alloca = builder.direct_alloca(in_ty, Align::EIGHT, &arg_name); - builder.store(p, 
alloca, Align::EIGHT); + let v = unsafe { llvm::LLVMGetOperand(kernel_call, index as u32).unwrap() }; + builder.store(v, alloca, Align::EIGHT); let val = builder.load(in_ty, alloca, Align::EIGHT); let gep = builder.inbounds_gep(cx.type_f32(), val, &[i32_0]); vals.push(val); @@ -510,7 +511,9 @@ fn gen_call_handling<'ll>( let next = llvm::LLVMGetNextInstruction(offload_success).unwrap(); dbg!(&next); llvm::LLVMRustPositionAfter(builder.llbuilder, next); + let called_kernel = llvm::LLVMGetCalledValue(next).unwrap(); llvm::LLVMInstructionEraseFromParent(next); + dbg!(&called_kernel); } // Step 4) @@ -521,6 +524,10 @@ fn gen_call_handling<'ll>( builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None); + drop(builder); + unsafe { llvm::LLVMDeleteFunction(called) }; + dbg!("survived"); + // With this we generated the following begin and end mappers. We could easily generate the // update mapper in an update. // call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 3, ptr %27, ptr %28, ptr %29, ptr @.offload_maptypes, ptr null, ptr null) From a4a0cb7822450078055cb795dd2199c12a0e3beb Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 4 Aug 2025 16:30:36 -0700 Subject: [PATCH 10/10] fixup --- compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index b6fe83ff3a88f..f454f3a4771ce 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1223,6 +1223,7 @@ unsafe extern "C" { // Operations on functions pub(crate) fn LLVMSetFunctionCallConv(Fn: &Value, CC: c_uint); + pub(crate) fn LLVMDeleteFunction(Fn: &Value); // Operations about llvm intrinsics pub(crate) fn LLVMLookupIntrinsicID(Name: *const c_char, NameLen: size_t) -> c_uint;
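
For reference, the host-side launch assembled in patches 05 through 08 follows the shape of clang's OpenMP offload codegen: the per-launch state is packed into the 13-field %struct.__tgt_kernel_arguments and handed to libomptarget through __tgt_target_kernel. Below is a minimal C-side sketch of the interface being targeted. The struct layout is taken from gen_tgt_kernel_global in the patches; the field names and the entry-point signature are assumptions based on LLVM's KernelArgsTy and libomptarget, and are not spelled out in the patches themselves.

    #include <stdint.h>

    /* Hypothetical C view; only the layout
       { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 }
       comes verbatim from the patch. Field names are assumed from LLVM's KernelArgsTy. */
    struct KernelArgsTy {
      uint32_t Version;         /* the patch stores 3 */
      uint32_t NumArgs;         /* currently hard-coded to 3 */
      void **ArgBasePtrs;       /* gep into %.offload_baseptrs */
      void **ArgPtrs;           /* gep into %.offload_ptrs */
      int64_t *ArgSizes;        /* gep into %.offload_sizes */
      int64_t *ArgTypes;        /* @.offload_maptypes.1 */
      void **ArgNames;          /* null */
      void **ArgMappers;        /* null */
      uint64_t Tripcount;       /* 0 */
      uint64_t Flags;           /* 0 */
      uint32_t NumTeams[3];     /* { 2097152, 0, 0 } */
      uint32_t ThreadLimit[3];  /* { 256, 0, 0 } */
      uint32_t DynCGroupMem;    /* 0 */
    };

    /* Runtime entry point declared by generate_launcher(); the generated IR calls it as
       %r = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256,
                                          ptr @.kernel_1.region_id, ptr %kernel_args)
       where i64 -1 selects the default device. */
    int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
                            int32_t ThreadLimit, void *HostPtr,
                            struct KernelArgsTy *Args);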