mirror of
https://github.com/rust-lang/rust.git
synced 2026-04-27 18:57:42 +03:00
Avoid alloca for fully static sizes
This commit is contained in:
@@ -3,11 +3,12 @@
|
||||
use bitflags::Flags;
|
||||
use llvm::Linkage::*;
|
||||
use rustc_abi::Align;
|
||||
use rustc_codegen_ssa::MemFlags;
|
||||
use rustc_codegen_ssa::common::TypeKind;
|
||||
use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
|
||||
use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
|
||||
use rustc_middle::bug;
|
||||
use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata};
|
||||
use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize};
|
||||
|
||||
use crate::builder::Builder;
|
||||
use crate::common::CodegenCx;
|
||||
@@ -450,7 +451,15 @@ pub(crate) fn gen_define_handling<'ll>(
|
||||
// FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
|
||||
let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];
|
||||
|
||||
let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &sizes);
|
||||
let actual_sizes = sizes
|
||||
.iter()
|
||||
.map(|s| match s {
|
||||
OffloadSize::Static(sz) => *sz,
|
||||
OffloadSize::Dynamic => 0,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let offload_sizes =
|
||||
add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &actual_sizes);
|
||||
let memtransfer_begin =
|
||||
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.begin"), &transfer_to);
|
||||
let memtransfer_kernel =
|
||||
@@ -499,9 +508,6 @@ pub(crate) fn gen_define_handling<'ll>(
|
||||
region_id,
|
||||
};
|
||||
|
||||
// FIXME(Sa4dUs): use this global for constant offload sizes
|
||||
cx.add_compiler_used_global(result.offload_sizes);
|
||||
|
||||
cx.offload_kernel_cache.borrow_mut().insert(symbol, result);
|
||||
|
||||
result
|
||||
@@ -535,6 +541,15 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Placeholder for computing, at runtime, the size of a dynamically sized offload argument.
///
/// `gen_call_handling` calls this for each argument whose `payload_size` is
/// `OffloadSize::Dynamic` and stores the returned value into that argument's slot of the
/// `.offload_sizes` array.
///
/// Not implemented yet: all three parameters are unused (hence the `_` prefixes) and the body
/// consists solely of a `bug!` invocation, so no value is ever produced.
fn get_runtime_size<'ll, 'tcx>(
|
||||
_cx: &CodegenCx<'ll, 'tcx>,
|
||||
_val: &'ll Value,
|
||||
_meta: &OffloadMetadata,
|
||||
) -> &'ll Value {
|
||||
// FIXME(Sa4dUs): handle dynamic-size data (e.g. slices)
|
||||
bug!("offload does not support dynamic sizes yet");
|
||||
}
|
||||
|
||||
// For each kernel *call*, we now use some of our previously declared globals to move data to and from
|
||||
// the gpu. For now, we only handle the data transfer part of it.
|
||||
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
|
||||
@@ -564,15 +579,17 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
|
||||
) {
|
||||
let cx = builder.cx;
|
||||
let OffloadKernelGlobals {
|
||||
offload_sizes,
|
||||
memtransfer_begin,
|
||||
memtransfer_kernel,
|
||||
memtransfer_end,
|
||||
region_id,
|
||||
..
|
||||
} = offload_data;
|
||||
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
|
||||
offload_dims;
|
||||
|
||||
let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));
|
||||
|
||||
let tgt_decl = offload_globals.launcher_fn;
|
||||
let tgt_target_kernel_ty = offload_globals.launcher_ty;
|
||||
|
||||
@@ -596,7 +613,24 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
|
||||
let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
|
||||
// These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
|
||||
let ty2 = cx.type_array(cx.type_i64(), num_args);
|
||||
let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
|
||||
|
||||
let a4 = if has_dynamic {
|
||||
let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
|
||||
|
||||
builder.memcpy(
|
||||
alloc,
|
||||
Align::EIGHT,
|
||||
offload_sizes,
|
||||
Align::EIGHT,
|
||||
cx.get_const_i64(8 * args.len() as u64),
|
||||
MemFlags::empty(),
|
||||
None,
|
||||
);
|
||||
|
||||
alloc
|
||||
} else {
|
||||
offload_sizes
|
||||
};
|
||||
|
||||
//%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
|
||||
let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
|
||||
@@ -648,9 +682,12 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
|
||||
builder.store(vals[i as usize], gep1, Align::EIGHT);
|
||||
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
|
||||
builder.store(geps[i as usize], gep2, Align::EIGHT);
|
||||
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
|
||||
// FIXME(offload): write an offload frontend and handle arbitrary types.
|
||||
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
|
||||
|
||||
if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
|
||||
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
|
||||
let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
|
||||
builder.store(size_val, gep3, Align::EIGHT);
|
||||
}
|
||||
}
|
||||
|
||||
// For now we have a very simplistic indexing scheme into our
|
||||
@@ -662,13 +699,14 @@ fn get_geps<'ll, 'tcx>(
|
||||
a1: &'ll Value,
|
||||
a2: &'ll Value,
|
||||
a4: &'ll Value,
|
||||
is_dynamic: bool,
|
||||
) -> [&'ll Value; 3] {
|
||||
let cx = builder.cx;
|
||||
let i32_0 = cx.get_const_i32(0);
|
||||
|
||||
let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]);
|
||||
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]);
|
||||
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]);
|
||||
let gep3 = if is_dynamic { builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]) } else { a4 };
|
||||
[gep1, gep2, gep3]
|
||||
}
|
||||
|
||||
@@ -692,7 +730,7 @@ fn generate_mapper_call<'ll, 'tcx>(
|
||||
|
||||
// Step 2)
|
||||
let s_ident_t = offload_globals.ident_t_global;
|
||||
let geps = get_geps(builder, ty, ty2, a1, a2, a4);
|
||||
let geps = get_geps(builder, ty, ty2, a1, a2, a4, has_dynamic);
|
||||
generate_mapper_call(
|
||||
builder,
|
||||
geps,
|
||||
@@ -725,7 +763,7 @@ fn generate_mapper_call<'ll, 'tcx>(
|
||||
// %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
|
||||
|
||||
// Step 4)
|
||||
let geps = get_geps(builder, ty, ty2, a1, a2, a4);
|
||||
let geps = get_geps(builder, ty, ty2, a1, a2, a4, has_dynamic);
|
||||
generate_mapper_call(
|
||||
builder,
|
||||
geps,
|
||||
|
||||
@@ -3,10 +3,16 @@
|
||||
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
|
||||
|
||||
/// Metadata the offload codegen consumes for a single kernel argument.
pub struct OffloadMetadata {
|
||||
// NOTE(review): `payload_size` is listed twice in this view (`u64` and `OffloadSize`);
// presumably the `u64` line is the pre-change declaration — confirm against the real file.
/// Number of bytes to transfer for the argument.
pub payload_size: u64,
|
||||
/// Size of the argument's payload, either known statically or marked dynamic.
pub payload_size: OffloadSize,
|
||||
/// Mapping flags for the argument.
pub mode: MappingFlags,
|
||||
}
|
||||
|
||||
/// Size of the payload transferred for one offloaded argument.
#[derive(Debug, Copy, Clone)]
|
||||
pub enum OffloadSize {
|
||||
/// The size is not available as a constant. `gen_define_handling` writes `0` for such an
/// entry into the `.offload_sizes.<symbol>` table, and `gen_call_handling` overwrites the
/// slot with the value returned by `get_runtime_size`.
// NOTE(review): `get_payload_size`, as visible here, only ever produces `Static`.
Dynamic,
|
||||
/// The size in bytes, known at compile time (taken from the type's layout).
Static(u64),
|
||||
}
|
||||
|
||||
bitflags! {
|
||||
/// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
@@ -59,17 +65,18 @@ pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
|
||||
}
|
||||
|
||||
// FIXME(Sa4dUs): implement a solid logic to determine the payload size
|
||||
/// Computes the payload size for an argument of type `ty`.
///
/// Raw pointers and references are looked through recursively, so the reported size is that of
/// the innermost pointee. Any other type yields its layout size in bytes, computed under
/// `TypingEnv::fully_monomorphized()`.
///
/// The result of `layout_of` is unwrapped, so a layout error is not handled here.
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
|
||||
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
|
||||
match ty.kind() {
|
||||
// Pointer-like types: measure what they point to, not the pointer itself.
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
|
||||
_ => tcx
|
||||
.layout_of(PseudoCanonicalInput {
|
||||
// Everything else: the size is taken from the type's layout and reported as `Static`.
_ => OffloadSize::Static(
|
||||
tcx.layout_of(PseudoCanonicalInput {
|
||||
typing_env: TypingEnv::fully_monomorphized(),
|
||||
value: ty,
|
||||
})
|
||||
.unwrap()
|
||||
.size
|
||||
.bytes(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,14 +14,13 @@
|
||||
// CHECK-NOT: define
|
||||
// CHECK: %.offload_baseptrs = alloca [1 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
|
||||
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
|
||||
// CHECK: br label %bb3
|
||||
// CHECK-NOT: define
|
||||
// CHECK: bb3
|
||||
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
|
||||
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
|
||||
// CHECK: %10 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
|
||||
#[unsafe(no_mangle)]
|
||||
unsafe fn main() {
|
||||
let A = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
|
||||
|
||||
@@ -58,18 +58,14 @@ fn main() {
|
||||
// CHECK-NEXT: %x = alloca [1024 x i8], align 16
|
||||
// CHECK-NEXT: %.offload_baseptrs = alloca [2 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_ptrs = alloca [2 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
|
||||
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
|
||||
// CHECK: store ptr %x, ptr %.offload_baseptrs, align 8
|
||||
// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
|
||||
// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
|
||||
// CHECK-NEXT: [[BPTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_baseptrs, i64 8
|
||||
// CHECK-NEXT: store ptr %y, ptr [[BPTRS_1]], align 8
|
||||
// CHECK-NEXT: [[PTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
|
||||
// CHECK-NEXT: store ptr %y, ptr [[PTRS_1]], align 8
|
||||
// CHECK-NEXT: [[SIZES_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
|
||||
// CHECK-NEXT: store i64 1024, ptr [[SIZES_1]], align 8
|
||||
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
|
||||
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
|
||||
// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
|
||||
// CHECK-NEXT: [[P4:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
|
||||
// CHECK-NEXT: store i32 2, ptr [[P4]], align 4
|
||||
@@ -78,7 +74,7 @@ fn main() {
|
||||
// CHECK-NEXT: [[P16:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
|
||||
// CHECK-NEXT: store ptr %.offload_ptrs, ptr [[P16]], align 8
|
||||
// CHECK-NEXT: [[P24:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
|
||||
// CHECK-NEXT: store ptr %.offload_sizes, ptr [[P24]], align 8
|
||||
// CHECK-NEXT: store ptr @.offload_sizes.[[K]], ptr [[P24]], align 8
|
||||
// CHECK-NEXT: [[P32:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
|
||||
// CHECK-NEXT: store ptr @.offload_maptypes.[[K]].kernel, ptr [[P32]], align 8
|
||||
// CHECK-NEXT: [[P40:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
|
||||
@@ -92,7 +88,7 @@ fn main() {
|
||||
// CHECK-NEXT: [[P96:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
|
||||
// CHECK-NEXT: store i32 0, ptr [[P96]], align 8
|
||||
// CHECK-NEXT: [[TGT_RET:%.*]] = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.[[K]].region_id, ptr nonnull %kernel_args)
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
|
||||
// CHECK: ret void
|
||||
// CHECK-NEXT: }
|
||||
|
||||
|
||||
@@ -20,8 +20,6 @@
|
||||
// CHECK-NEXT: store double %_0.i, ptr %1, align 8
|
||||
// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
|
||||
// CHECK-NEXT: store ptr %addr, ptr %2, align 8
|
||||
// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
|
||||
// CHECK-NEXT: store i64 4, ptr %3, align 8
|
||||
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper
|
||||
|
||||
#[unsafe(no_mangle)]
|
||||
|
||||
Reference in New Issue
Block a user