Rollup merge of #153411 - Sa4dUs:offload-slices, r=ZuseZ4
Offload slice support

This PR adds support for slice type arguments in offload.

~NOTE: this is built on top of https://github.com/rust-lang/rust/pull/152283~

r? @ZuseZ4
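For context, a minimal sketch of how a slice argument is passed to the offload intrinsic, mirroring the `-Zoffload=Test` LTO test added in this PR (the intrinsic and attributes are unstable, so treat this as an illustration rather than a stable API):

```rust
// Sketch adapted from the LTO test added in this PR; illustrative only.
#![feature(abi_gpu_kernel, rustc_attrs, core_intrinsics)]
#![no_main]

#[unsafe(no_mangle)]
fn main() {
    let mut x = [0.0, 0.0, 0.0, 0.0];
    // A slice argument is a fat pointer; with this PR, codegen derives the
    // transfer size from its length at runtime instead of rejecting it.
    core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [1, 1, 1], ((&mut x) as &mut [f64],));
}

unsafe extern "C" {
    pub fn foo(x: &mut [f32]);
}
```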
@@ -448,14 +448,19 @@ pub(crate) fn gen_define_handling<'ll>(
        transfer.iter().map(|m| m.intersection(valid_begin_mappings).bits()).collect();
    let transfer_from: Vec<u64> =
        transfer.iter().map(|m| m.intersection(MappingFlags::FROM).bits()).collect();
    let valid_kernel_mappings = MappingFlags::LITERAL | MappingFlags::IMPLICIT;
    // FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
    let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];
    let transfer_kernel: Vec<u64> = transfer
        .iter()
        .map(|m| (m.intersection(valid_kernel_mappings) | MappingFlags::TARGET_PARAM).bits())
        .collect();

    let actual_sizes = sizes
        .iter()
        .map(|s| match s {
            OffloadSize::Static(sz) => *sz,
            OffloadSize::Dynamic => 0,
            // NOTE(Sa4dUs): set `.offload_sizes` entry to 0 for sizes that we determine at runtime, just like clang
            _ => 0,
        })
        .collect::<Vec<_>>();
    let offload_sizes =
@@ -542,12 +547,20 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
}

fn get_runtime_size<'ll, 'tcx>(
    _cx: &CodegenCx<'ll, 'tcx>,
    _val: &'ll Value,
    _meta: &OffloadMetadata,
    builder: &mut Builder<'_, 'll, 'tcx>,
    args: &[&'ll Value],
    index: usize,
    meta: &OffloadMetadata,
) -> &'ll Value {
    // FIXME(Sa4dUs): handle dynamic-size data (e.g. slices)
    bug!("offload does not support dynamic sizes yet");
    match meta.payload_size {
        OffloadSize::Slice { element_size } => {
            let length_idx = index + 1;
            let length = args[length_idx];
            let length_i64 = builder.intcast(length, builder.cx.type_i64(), false);
            builder.mul(length_i64, builder.cx.get_const_i64(element_size))
        }
        _ => bug!("unexpected offload size {:?}", meta.payload_size),
    }
}

// For each kernel *call*, we now use some of our previous declared globals to move data to and from
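With this change, the transfer size of a slice is no longer a compile-time constant: `get_runtime_size` multiplies the slice length (read from the argument slot right after the data pointer) by the element size. A hypothetical host-side illustration of that arithmetic, not compiler code:

```rust
// Hypothetical illustration (not compiler code): the byte count emitted by
// `get_runtime_size` for a slice is `length * element_size`.
fn slice_transfer_size(length: u64, element_size: u64) -> u64 {
    length * element_size
}

fn main() {
    // A 4-element f32 slice transfers 4 * 4 = 16 bytes, consistent with the
    // `store i64 16, ptr %.offload_sizes` check in the LTO test below.
    assert_eq!(slice_transfer_size(4, core::mem::size_of::<f32>() as u64), 16);
}
```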
@@ -588,7 +601,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
    let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
        offload_dims;

    let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));
    let has_dynamic = metadata.iter().any(|m| !matches!(m.payload_size, OffloadSize::Static(_)));

    let tgt_decl = offload_globals.launcher_fn;
    let tgt_target_kernel_ty = offload_globals.launcher_ty;
@@ -683,9 +696,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
        builder.store(geps[i as usize], gep2, Align::EIGHT);

        if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
        if !matches!(metadata[i as usize].payload_size, OffloadSize::Static(_)) {
            let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
            let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
            let size_val = get_runtime_size(builder, args, i as usize, &metadata[i as usize]);
            builder.store(size_val, gep3, Align::EIGHT);
        }
    }

@@ -1813,9 +1813,20 @@ fn codegen_offload<'ll, 'tcx>(
    let sig = tcx.instantiate_bound_regions_with_erased(sig);
    let inputs = sig.inputs();

    let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
    let fn_abi = cx.fn_abi_of_instance(fn_target, ty::List::empty());

    let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
    let mut metadata = Vec::new();
    let mut types = Vec::new();

    for (i, arg_abi) in fn_abi.args.iter().enumerate() {
        let ty = inputs[i];
        let decomposed = OffloadMetadata::handle_abi(cx, tcx, ty, arg_abi);

        for (meta, entry_ty) in decomposed {
            metadata.push(meta);
            types.push(bx.cx.layout_of(entry_ty).llvm_type(bx.cx));
        }
    }

    let offload_globals_ref = cx.offload_globals.borrow();
    let offload_globals = match offload_globals_ref.as_ref() {

@@ -1,7 +1,10 @@
use bitflags::bitflags;
use rustc_abi::{BackendRepr, TyAbiInterface};
use rustc_target::callconv::ArgAbi;

use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};

#[derive(Debug, Copy, Clone)]
pub struct OffloadMetadata {
    pub payload_size: OffloadSize,
    pub mode: MappingFlags,
@@ -9,13 +12,13 @@ pub struct OffloadMetadata {

#[derive(Debug, Copy, Clone)]
pub enum OffloadSize {
    Dynamic,
    Static(u64),
    Slice { element_size: u64 },
}

bitflags! {
    /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
    #[derive(Debug, Copy, Clone)]
    #[derive(Debug, Copy, Clone, PartialEq, Eq)]
    #[repr(transparent)]
    pub struct MappingFlags: u64 {
        /// No flags.
@@ -62,11 +65,38 @@ pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
            mode: MappingFlags::from_ty(tcx, ty),
        }
    }

    pub fn handle_abi<'tcx, C>(
        cx: &C,
        tcx: TyCtxt<'tcx>,
        ty: Ty<'tcx>,
        arg_abi: &ArgAbi<'tcx, Ty<'tcx>>,
    ) -> Vec<(Self, Ty<'tcx>)>
    where
        Ty<'tcx>: TyAbiInterface<'tcx, C>,
    {
        match arg_abi.layout.backend_repr {
            BackendRepr::ScalarPair(_, _) => (0..2)
                .map(|i| {
                    let ty = arg_abi.layout.field(cx, i).ty;
                    (OffloadMetadata::from_ty(tcx, ty), ty)
                })
                .collect(),
            _ => vec![(OffloadMetadata::from_ty(tcx, ty), ty)],
        }
    }
}

// FIXME(Sa4dUs): implement a solid logic to determine the payload size
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
    match ty.kind() {
        ty::Slice(elem_ty) => {
            let layout = tcx.layout_of(PseudoCanonicalInput {
                typing_env: TypingEnv::fully_monomorphized(),
                value: *elem_ty,
            });
            OffloadSize::Slice { element_size: layout.unwrap().size.bytes() }
        }
        ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
        _ => OffloadSize::Static(
            tcx.layout_of(PseudoCanonicalInput {
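`handle_abi` splits a fat-pointer argument (a `BackendRepr::ScalarPair` layout) into two offload entries, one per field; for a slice reference those fields are the data pointer and the length, which is also why `get_runtime_size` can read the length at `index + 1`. A hypothetical host-side sketch of that decomposition, not compiler code:

```rust
// Hypothetical sketch (not compiler code): a slice reference is a fat pointer,
// so it decomposes into the same (data pointer, length) pair that `handle_abi`
// turns into two separate kernel arguments.
fn decompose_slice(x: &[f32]) -> (*const f32, usize) {
    (x.as_ptr(), x.len())
}

fn main() {
    let data = [1.0f32, 2.0, 3.0, 4.0];
    let (ptr, len) = decompose_slice(&data);
    assert!(!ptr.is_null());
    assert_eq!(len, 4);
}
```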
@@ -0,0 +1,27 @@
//@ add-minicore
//@ revisions: amdgpu nvptx
//@[nvptx] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target nvptx64-nvidia-cuda --crate-type=rlib
//@[nvptx] needs-llvm-components: nvptx
//@[amdgpu] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 --crate-type=rlib
//@[amdgpu] needs-llvm-components: amdgpu
//@ no-prefer-dynamic
//@ needs-offload

#![feature(abi_gpu_kernel, rustc_attrs, no_core)]
#![no_core]

extern crate minicore;

// CHECK: ; Function Attrs
// nvptx-NEXT: define ptx_kernel void @foo
// amdgpu-NEXT: define amdgpu_kernel void @foo
// CHECK-SAME: ptr readnone captures(none) %dyn_ptr
// nvptx-SAME: [2 x i64] %0
// amdgpu-SAME: ptr noalias {{.*}} %0, i64 {{.*}} %1
// CHECK-NEXT: entry:
// CHECK-NEXT: ret void
// CHECK-NEXT: }

#[unsafe(no_mangle)]
#[rustc_offload_kernel]
pub unsafe extern "gpu-kernel" fn foo(x: &[f32]) {}
@@ -0,0 +1,35 @@
//@ compile-flags: -Zoffload=Test -Zunstable-options -C opt-level=1 -Clto=fat
//@ no-prefer-dynamic
//@ needs-offload

// This test verifies that offload handles slices correctly, passing them properly to the device.

#![feature(abi_gpu_kernel)]
#![feature(rustc_attrs)]
#![feature(core_intrinsics)]
#![no_main]

// CHECK: @anon.[[ID:.*]].0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1

// CHECK-DAG: @.offload_sizes.[[K:[^ ]*foo]] = private unnamed_addr constant [2 x i64] [i64 0, i64 8]
// CHECK-DAG: @.offload_maptypes.[[K]].begin = private unnamed_addr constant [2 x i64] [i64 1, i64 768]
// CHECK-DAG: @.offload_maptypes.[[K]].kernel = private unnamed_addr constant [2 x i64] [i64 32, i64 800]
// CHECK-DAG: @.offload_maptypes.[[K]].end = private unnamed_addr constant [2 x i64] [i64 2, i64 0]

// CHECK: define{{( dso_local)?}} void @main()
// CHECK: %.offload_sizes = alloca [2 x i64], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}} %.offload_sizes, ptr {{.*}} @.offload_sizes.foo, i64 16, i1 false)
// CHECK: store i64 16, ptr %.offload_sizes, align 8
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
// CHECK: %11 = call i32 @__tgt_target_kernel(ptr nonnull @anon.[[ID]].1, i64 -1, i32 1, i32 1, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)

#[unsafe(no_mangle)]
fn main() {
    let mut x = [0.0, 0.0, 0.0, 0.0];
    core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [1, 1, 1], ((&mut x) as &mut [f64],));
}

unsafe extern "C" {
    pub fn foo(x: &mut [f32]);
}