stage2-wasm: vector, std tests

2026-06-01 13:56:18 +03:00 · 2026-04-20 20:49:35 +02:00
parent 36faf76fe1
commit 22945fbbdc
30 changed files with 554 additions and 460 deletions
@@ -43,6 +43,83 @@ pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features {
        .expand_packed_store,
        .expand_packed_struct_field_val,
        .expand_packed_aggregate_init,
+
+        .scalarize_add,
+        .scalarize_add_optimized,
+        .scalarize_add_wrap,
+        .scalarize_add_sat,
+        .scalarize_sub,
+        .scalarize_sub_optimized,
+        .scalarize_sub_wrap,
+        .scalarize_sub_sat,
+        .scalarize_mul,
+        .scalarize_mul_optimized,
+        .scalarize_mul_wrap,
+        .scalarize_mul_sat,
+        .scalarize_div_float,
+        .scalarize_div_float_optimized,
+        .scalarize_div_trunc,
+        .scalarize_div_trunc_optimized,
+        .scalarize_div_floor,
+        .scalarize_div_floor_optimized,
+        .scalarize_div_exact,
+        .scalarize_div_exact_optimized,
+        .scalarize_rem,
+        .scalarize_rem_optimized,
+        .scalarize_mod,
+        .scalarize_mod_optimized,
+        .scalarize_max,
+        .scalarize_min,
+        .scalarize_add_with_overflow,
+        .scalarize_sub_with_overflow,
+        .scalarize_mul_with_overflow,
+        .scalarize_shl_with_overflow,
+        .scalarize_bit_and,
+        .scalarize_bit_or,
+        .scalarize_shr,
+        .scalarize_shr_exact,
+        .scalarize_shl,
+        .scalarize_shl_exact,
+        .scalarize_shl_sat,
+        .scalarize_xor,
+        .scalarize_not,
+        .scalarize_bitcast,
+        .scalarize_clz,
+        .scalarize_ctz,
+        .scalarize_popcount,
+        .scalarize_byte_swap,
+        .scalarize_bit_reverse,
+        .scalarize_sqrt,
+        .scalarize_sin,
+        .scalarize_cos,
+        .scalarize_tan,
+        .scalarize_exp,
+        .scalarize_exp2,
+        .scalarize_log,
+        .scalarize_log2,
+        .scalarize_log10,
+        .scalarize_abs,
+        .scalarize_floor,
+        .scalarize_ceil,
+        .scalarize_round,
+        .scalarize_trunc_float,
+        .scalarize_neg,
+        .scalarize_neg_optimized,
+        .scalarize_cmp_vector,
+        .scalarize_cmp_vector_optimized,
+        .scalarize_fptrunc,
+        .scalarize_fpext,
+        .scalarize_intcast,
+        .scalarize_trunc,
+        .scalarize_int_from_float,
+        .scalarize_int_from_float_optimized,
+        .scalarize_float_from_int,
+        .scalarize_reduce,
+        .scalarize_reduce_optimized,
+        .scalarize_shuffle_one,
+        .scalarize_shuffle_two,
+        .scalarize_select,
+        .scalarize_mul_add,
    });
 }

@@ -217,8 +294,7 @@ const WValue = union(enum) {
                try gen.addLocal(.local_set, new_local.local.value);
                return new_local;
            },
-            .local, .stack_offset => return value,
-            else => unreachable,
+            else => return value,
        }
    }

@@ -765,7 +841,8 @@ fn generateInner(cg: *CodeGen, any_returns: bool) InnerError!Mir {
    // In case we have a return value, but the last instruction is a noreturn (such as a while loop)
    // we emit an unreachable instruction to tell the stack validator that part will never be reached.
    if (any_returns and cg.air.instructions.len > 0) {
-        const inst: Air.Inst.Index = @enumFromInt(cg.air.instructions.len - 1);
+        const main_body = cg.air.getMainBody();
+        const inst: Air.Inst.Index = main_body[main_body.len - 1];
        const last_inst_ty = cg.typeOfIndex(inst);
        if (!last_inst_ty.hasRuntimeBits(zcu)) {
            try cg.addTag(.@"unreachable");
@@ -1032,10 +1109,6 @@ fn allocStackPtr(cg: *CodeGen, inst: Air.Inst.Index) !WValue {
        try cg.initializeStack();
    }

-    if (!pointee_ty.hasRuntimeBits(zcu)) {
-        return cg.allocStack(Type.usize); // create a value containing just the stack pointer.
-    }
-
    const abi_alignment = ptr_ty.ptrAlignment(zcu);
    const abi_size = std.math.cast(u32, pointee_ty.abiSize(zcu)) orelse {
        return cg.fail("Type {f} with ABI size of {d} exceeds stack frame size", .{
@@ -1050,157 +1123,67 @@ fn allocStackPtr(cg: *CodeGen, inst: Air.Inst.Index) !WValue {
    return .{ .stack_offset = .{ .value = offset, .references = 1 } };
 }

-/// Performs a copy of bytes for a given type. Copying all bytes
-/// from rhs to lhs.
-fn memcpy(cg: *CodeGen, dst: WValue, src: WValue, len: WValue) !void {
+fn emitMemoryCopy(cg: *CodeGen, dst: WValue, src: WValue, len: WValue) !void {
    const len_known_neq_0 = switch (len) {
        .imm32 => |val| if (val != 0) true else return,
        .imm64 => |val| if (val != 0) true else return,
        else => false,
    };
-    // When bulk_memory is enabled, we lower it to wasm's memcpy instruction.
-    // If not, we lower it ourselves manually
-    if (cg.target.cpu.has(.wasm, .bulk_memory)) {
-        const len0_ok = cg.target.cpu.has(.wasm, .nontrapping_bulk_memory_len0);
-        const emit_check = !(len0_ok or len_known_neq_0);
+    const len0_ok = cg.target.cpu.has(.wasm, .nontrapping_bulk_memory_len0);
+    const emit_check = !(len0_ok or len_known_neq_0);

-        if (emit_check) {
-            try cg.startBlock(.block, .empty);
+    if (emit_check) {
+        try cg.startBlock(.block, .empty);

-            // Even if `len` is zero, the spec requires an implementation to trap if `src + len` or
-            // `dst + len` are out of memory bounds. This can easily happen in Zig in a case such
-            // as:
-            //
-            // const dst: [*]u8 = undefined;
-            // const src: [*]u8 = undefined;
-            // var len: usize = runtime_zero();
-            // @memcpy(dst[0..len], src[0..len]);
-            //
-            // So explicitly avoid using `memory.copy` in the `len == 0` case. Lovely design.
-            try cg.emitWValue(len);
-            try cg.addTag(.i32_eqz);
-            try cg.addLabel(.br_if, 0);
-        }
-
-        try cg.lowerToStack(dst);
-        try cg.lowerToStack(src);
+        // Even if `len` is zero, the spec requires an implementation to trap if `src + len` or
+        // `dst + len` are out of memory bounds. This can easily happen in Zig in a case such
+        // as:
+        //
+        // const dst: [*]u8 = undefined;
+        // const src: [*]u8 = undefined;
+        // var len: usize = runtime_zero();
+        // @memcpy(dst[0..len], src[0..len]);
+        //
+        // So explicitly avoid using `memory.copy` in the `len == 0` case. Lovely design.
        try cg.emitWValue(len);
-        try cg.addExtended(.memory_copy);
+        try cg.addTag(.i32_eqz);
+        try cg.addLabel(.br_if, 0);
+    }

-        if (emit_check) {
-            try cg.endBlock();
-        }
+    try cg.lowerToStack(dst);
+    try cg.lowerToStack(src);
+    try cg.emitWValue(len);
+    try cg.addExtended(.memory_copy);

+    if (emit_check) {
+        try cg.endBlock();
+    }
+}
+
+fn memcpy(cg: *CodeGen, dst: WValue, src: WValue, len: WValue) !void {
+    if (cg.target.cpu.has(.wasm, .bulk_memory)) {
+        try cg.emitMemoryCopy(dst, src, len);
        return;
    }

-    // when the length is comptime-known, rather than a runtime value, we can optimize the generated code by having
-    // the loop during codegen, rather than inserting a runtime loop into the binary.
-    switch (len) {
-        .imm32, .imm64 => blk: {
-            const length = switch (len) {
-                .imm32 => |val| val,
-                .imm64 => |val| val,
-                else => unreachable,
-            };
-            // if the size (length) is more than 32 bytes, we use a runtime loop instead to prevent
-            // binary size bloat.
-            if (length > 32) break :blk;
-            var offset: u32 = 0;
-            const lhs_base = dst.offset();
-            const rhs_base = src.offset();
-            while (offset < length) : (offset += 1) {
-                // get dst's address to store the result
-                try cg.emitWValue(dst);
-                // load byte from src's address
-                try cg.emitWValue(src);
-                switch (cg.ptr_size) {
-                    .wasm32 => {
-                        try cg.addMemArg(.i32_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
-                        try cg.addMemArg(.i32_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
-                    },
-                    .wasm64 => {
-                        try cg.addMemArg(.i64_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
-                        try cg.addMemArg(.i64_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
-                    },
-                }
-            }
-            return;
-        },
-        else => {},
+    try cg.lowerToStack(dst);
+    try cg.lowerToStack(src);
+    try cg.emitWValue(len);
+    try cg.addCallIntrinsic(.memcpy);
+    try cg.addTag(.drop);
+}
+
+fn memmove(cg: *CodeGen, dst: WValue, src: WValue, len: WValue) !void {
+    if (cg.target.cpu.has(.wasm, .bulk_memory)) {
+        try cg.emitMemoryCopy(dst, src, len);
+        return;
    }

-    // allocate a local for the offset, and set it to 0.
-    // This to ensure that inside loops we correctly re-set the counter.
-    var offset = try cg.allocLocal(Type.usize); // local for counter
-    defer offset.free(cg);
-    switch (cg.ptr_size) {
-        .wasm32 => try cg.addImm32(0),
-        .wasm64 => try cg.addImm64(0),
-    }
-    try cg.addLocal(.local_set, offset.local.value);
-
-    // outer block to jump to when loop is done
-    try cg.startBlock(.block, .empty);
-    try cg.startBlock(.loop, .empty);
-
-    // loop condition (offset == length -> break)
-    {
-        try cg.emitWValue(offset);
-        try cg.emitWValue(len);
-        switch (cg.ptr_size) {
-            .wasm32 => try cg.addTag(.i32_eq),
-            .wasm64 => try cg.addTag(.i64_eq),
-        }
-        try cg.addLabel(.br_if, 1); // jump out of loop into outer block (finished)
-    }
-
-    // get dst ptr
-    {
-        try cg.emitWValue(dst);
-        try cg.emitWValue(offset);
-        switch (cg.ptr_size) {
-            .wasm32 => try cg.addTag(.i32_add),
-            .wasm64 => try cg.addTag(.i64_add),
-        }
-    }
-
-    // get src value and also store in dst
-    {
-        try cg.emitWValue(src);
-        try cg.emitWValue(offset);
-        switch (cg.ptr_size) {
-            .wasm32 => {
-                try cg.addTag(.i32_add);
-                try cg.addMemArg(.i32_load8_u, .{ .offset = src.offset(), .alignment = 1 });
-                try cg.addMemArg(.i32_store8, .{ .offset = dst.offset(), .alignment = 1 });
-            },
-            .wasm64 => {
-                try cg.addTag(.i64_add);
-                try cg.addMemArg(.i64_load8_u, .{ .offset = src.offset(), .alignment = 1 });
-                try cg.addMemArg(.i64_store8, .{ .offset = dst.offset(), .alignment = 1 });
-            },
-        }
-    }
-
-    // increment loop counter
-    {
-        try cg.emitWValue(offset);
-        switch (cg.ptr_size) {
-            .wasm32 => {
-                try cg.addImm32(1);
-                try cg.addTag(.i32_add);
-            },
-            .wasm64 => {
-                try cg.addImm64(1);
-                try cg.addTag(.i64_add);
-            },
-        }
-        try cg.addLocal(.local_set, offset.local.value);
-        try cg.addLabel(.br, 0); // jump to start of loop
-    }
-    try cg.endBlock(); // close off loop block
-    try cg.endBlock(); // close off outer block
+    try cg.lowerToStack(dst);
+    try cg.lowerToStack(src);
+    try cg.emitWValue(len);
+    try cg.addCallIntrinsic(.memmove);
+    try cg.addTag(.drop);
 }

 fn ptrSize(cg: *const CodeGen) u16 {
@@ -1310,14 +1293,34 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    const zcu = cg.pt.zcu;
    const air_tags = cg.air.instructions.items(.tag);
    return switch (air_tags[@intFromEnum(inst)]) {
-        // No "scalarize" legalizations are enabled, so these instructions never appear.
-        .legalize_vec_elem_val => unreachable,
-        .legalize_vec_store_elem => unreachable,
        // No soft float legalizations are enabled.
        .legalize_compiler_rt_call => unreachable,

        .inferred_alloc, .inferred_alloc_comptime => unreachable,

+        .legalize_vec_elem_val => cg.airArrayElemVal(inst),
+        .legalize_vec_store_elem => {
+            const pl_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
+            const bin_op = cg.air.extraData(Air.Bin, pl_op.payload).data;
+            const vec_ptr = try cg.resolveInst(pl_op.operand);
+            const elem_idx = try cg.resolveInst(bin_op.lhs);
+            const elem_val = try cg.resolveInst(bin_op.rhs);
+
+            const elem_ty = cg.typeOf(bin_op.rhs);
+            const elem_size = elem_ty.abiSize(zcu);
+
+            try cg.lowerToStack(vec_ptr);
+            try cg.emitWValue(elem_idx);
+            try cg.addImm32(@intCast(elem_size));
+            try cg.addTag(.i32_mul);
+            try cg.addTag(.i32_add);
+            const ptr = try WValue.toLocal(.stack, cg, Type.usize);
+
+            try cg.store(ptr, elem_val, elem_ty, 0);
+
+            return cg.finishAir(inst, .none, &.{ pl_op.operand, bin_op.lhs, bin_op.rhs });
+        },
+
        .add,
        .sub,
        .mul,
@@ -1818,7 +1821,8 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
        .wasm_memory_size => cg.airWasmMemorySize(inst),
        .wasm_memory_grow => cg.airWasmMemoryGrow(inst),

-        .memcpy, .memmove => cg.airMemcpy(inst),
+        .memcpy => cg.airMemcpy(inst),
+        .memmove => cg.airMemmove(inst),

        .ret_addr => cg.airRetAddr(inst),
        .tag_name => cg.airTagName(inst),
@@ -2105,7 +2109,7 @@ fn airStore(cg: *CodeGen, inst: Air.Inst.Index, safety: bool) InnerError!void {
        return cg.finishAir(inst, .none, &.{ bin_op.lhs, bin_op.rhs });
    }

-    assert(ptr_info.packed_offset.host_size == 0); // legalize .expand_packed_store
+    assert(!(ptr_info.packed_offset.host_size > 0 and ptr_info.flags.vector_index == .none)); // legalize .expand_packed_store

    try cg.store(lhs, rhs, ty, 0);

@@ -2180,35 +2184,22 @@ fn airLoad(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {

    if (!ty.hasRuntimeBits(zcu)) return cg.finishAir(inst, .none, &.{ty_op.operand});

-    assert(ptr_info.packed_offset.host_size == 0); // legalize .expand_packed_load
+    assert(!(ptr_info.packed_offset.host_size > 0 and ptr_info.flags.vector_index == .none)); // legalize .expand_packed_load

-    const result = result: {
-        if (isByRef(ty, zcu, cg.target)) {
-            const new_local = try cg.allocStack(ty);
-            try cg.store(new_local, operand, ty, 0);
-            break :result new_local;
-        }
-
-        const loaded = try cg.load(operand, ty, 0);
-        const ty_size = ty.abiSize(zcu);
-        if (ty.isAbiInt(zcu) and ty_size * 8 > ty.bitSize(zcu)) {
-            const int_info = ty.intInfo(zcu);
-            const loaded_int_ty: IntType = .{
-                .is_signed = int_info.signedness == .signed,
-                .bits = @intCast(ty_size * 8),
-            };
-            break :result try cg.intTrunc(.fromType(cg, ty), loaded_int_ty, loaded);
-        } else {
-            break :result loaded;
-        }
-    };
+    const result = try cg.load(operand, ty, 0);
    return cg.finishAir(inst, result, &.{ty_op.operand});
 }

 /// Loads an operand from the linear memory section.
-/// NOTE: Leaves the value on the stack.
+/// NOTE: Leaves the value on the stack, if isByRef == false.
 fn load(cg: *CodeGen, operand: WValue, ty: Type, offset: u32) InnerError!WValue {
    const zcu = cg.pt.zcu;
+    if (isByRef(ty, zcu, cg.target)) {
+        const val = try cg.allocStack(ty);
+        try cg.store(val, try operand.toLocal(cg, .usize), ty, 0);
+        return val;
+    }
+
    // load local's value from memory by its stack position
    try cg.emitWValue(operand);

@@ -2254,6 +2245,14 @@ fn load(cg: *CodeGen, operand: WValue, ty: Type, offset: u32) InnerError!WValue
        },
    );

+    if (ty.isAbiInt(zcu)) {
+        const int_info: IntType = .fromType(cg, ty);
+        switch (int_info.bits) {
+            8, 16, 32, 64 => {},
+            else => _ = try cg.intWrap(int_info, .stack),
+        }
+    }
+
    return .stack;
 }

@@ -2321,7 +2320,7 @@ const IntType = struct {
                .hasRuntimeBits(zcu)) .{ .is_signed = false, .bits = zcu.errorSetBits() } else unreachable,
            .simple_type => |simple_type| return switch (simple_type) {
                .bool => .{ .is_signed = false, .bits = 1 },
-                .anyerror => .{ .is_signed = false, .bits = zcu.errorSetBits() },
+                .anyerror, .adhoc_inferred_error_set => .{ .is_signed = false, .bits = zcu.errorSetBits() },
                .isize => .{ .is_signed = true, .bits = cg.target.ptrBitWidth() },
                .usize => .{ .is_signed = false, .bits = cg.target.ptrBitWidth() },
                .c_char => .{ .is_signed = cg.target.cCharSignedness() == .signed, .bits = cg.target.cTypeBitSize(.char) },
@@ -2334,7 +2333,7 @@ const IntType = struct {
                .c_longlong => .{ .is_signed = true, .bits = cg.target.cTypeBitSize(.longlong) },
                .c_ulonglong => .{ .is_signed = false, .bits = cg.target.cTypeBitSize(.longlong) },
                .f16, .f32, .f64, .f80, .f128, .c_longdouble => unreachable,
-                .anyopaque, .void, .type, .comptime_int, .comptime_float, .noreturn, .null, .undefined, .enum_literal, .adhoc_inferred_error_set, .generic_poison => unreachable,
+                .anyopaque, .void, .type, .comptime_int, .comptime_float, .noreturn, .null, .undefined, .enum_literal, .generic_poison => unreachable,
            },
            .struct_type => {
                const loaded_struct = ip.loadStructType(ty_index);
@@ -3069,7 +3068,13 @@ fn intClz(cg: *CodeGen, ty: IntType, operand: WValue) InnerError!WValue {
            var msb = try (try cg.load(operand, Type.u64, 8)).toLocal(cg, Type.u64);
            defer msb.free(cg);

-            try cg.emitWValue(msb);
+            if (ty.is_signed and ty.bits < 128) {
+                const mask: u64 = ~@as(u64, 0) >> @intCast(128 - ty.bits);
+                _ = try cg.intAnd(.u64, msb, .{ .imm64 = mask });
+            } else {
+                try cg.emitWValue(msb);
+            }
+
            try cg.addTag(.i64_clz);
            _ = try cg.load(operand, Type.u64, 0);
            try cg.addTag(.i64_clz);
@@ -3078,6 +3083,12 @@ fn intClz(cg: *CodeGen, ty: IntType, operand: WValue) InnerError!WValue {
            _ = try cg.intCmp(.u64, .neq, msb, .{ .imm64 = 0 });
            try cg.addTag(.select);
            try cg.addTag(.i32_wrap_i64);
+
+            if (ty.bits < 128) {
+                try cg.addImm32(128 - ty.bits);
+                try cg.addTag(.i32_sub);
+            }
+
            return .stack;
        },
        else => {
@@ -4622,11 +4633,18 @@ fn floatFromInt(cg: *CodeGen, dest_ty: FloatType, src_ty: IntType, operand: WVal
 fn lowerPtr(cg: *CodeGen, ptr_val: InternPool.Index, prev_offset: u64) InnerError!WValue {
    const pt = cg.pt;
    const zcu = pt.zcu;
+    const ip = &zcu.intern_pool;
    const ptr = zcu.intern_pool.indexToKey(ptr_val).ptr;
    const offset: u64 = prev_offset + ptr.byte_offset;
    return switch (ptr.base_addr) {
-        .nav => |nav| return .{ .nav_ref = .{ .nav_index = nav, .offset = @intCast(offset) } },
-        .uav => |uav| return .{ .uav_ref = .{ .ip_index = uav.val, .offset = @intCast(offset), .orig_ptr_ty = uav.orig_ty } },
+        .nav => |nav| return if (Type.fromInterned(ip.getNav(nav).resolved.?.type).isRuntimeFnOrHasRuntimeBits(zcu))
+            .{ .nav_ref = .{ .nav_index = nav, .offset = @intCast(offset) } }
+        else
+            .{ .imm32 = @intCast(zcu.navAlignment(nav).forward(@as(u32, 0xaaaaaaaa))) },
+        .uav => |uav| return if (Type.fromInterned(ip.typeOf(uav.val)).isRuntimeFnOrHasRuntimeBits(zcu))
+            .{ .uav_ref = .{ .ip_index = uav.val, .offset = @intCast(offset), .orig_ptr_ty = uav.orig_ty } }
+        else
+            .{ .imm32 = @intCast(Type.fromInterned(uav.orig_ty).ptrAlignment(zcu).forward(@as(u32, 0xaaaaaaaa))) },
        .int => return cg.lowerConstant(try pt.intValue(.usize, offset)),
        .eu_payload => |eu_ptr| try cg.lowerPtr(
            eu_ptr,
@@ -4960,9 +4978,9 @@ fn airCmp(cg: *CodeGen, inst: Air.Inst.Index, op: std.math.CompareOperator) Inne
            try cg.addImm32(if (op == .eq) 0 else 1);
            try cg.addLocal(.local_set, result.local.value);

-            _ = try cg.isNull(lhs, operand_ty, .i32_eq);
+            _ = try cg.isNull(lhs, operand_ty, .i32_eq, .value);
            try cg.addLocal(.local_tee, lhs_null.local.value);
-            _ = try cg.isNull(rhs, operand_ty, .i32_eq);
+            _ = try cg.isNull(rhs, operand_ty, .i32_eq, .value);
            try cg.addTag(.i32_ne);
            try cg.addLabel(.br_if, 0);

@@ -5234,26 +5252,56 @@ fn airBitcast(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
 fn bitcast(cg: *CodeGen, dest_ty: Type, src_ty: Type, operand: WValue) InnerError!?WValue {
    const zcu = cg.pt.zcu;
    const bit_size = src_ty.bitSize(zcu);
-    const needs_wrapping = (src_ty.isSignedInt(zcu) != dest_ty.isSignedInt(zcu)) and
+    const dest_signed = if (dest_ty.isAbiInt(zcu)) IntType.fromType(cg, dest_ty).is_signed else false;
+    const src_signed = if (src_ty.isAbiInt(zcu)) IntType.fromType(cg, src_ty).is_signed else false;
+    const needs_wrapping = (src_signed != dest_signed) and
        bit_size != 32 and bit_size != 64 and bit_size != 128;

-    if (src_ty.isAnyFloat() or dest_ty.isAnyFloat()) {
-        if (dest_ty.ip_index == .f16_type or src_ty.ip_index == .f16_type) return null;
-        if (dest_ty.bitSize(zcu) > 64) return null;
-        assert((dest_ty.isInt(zcu) and src_ty.isAnyFloat()) or (dest_ty.isAnyFloat() and src_ty.isInt(zcu)));
+    if (src_ty.isAnyFloat()) {
+        const float_ty: FloatType = .fromType(cg, src_ty);
+        switch (float_ty) {
+            .f16, .f80, .f128 => {
+                if (dest_signed) {
+                    const int_ty: IntType = .fromType(cg, dest_ty);
+                    return try cg.intWrap(int_ty, operand);
+                } else {
+                    return null;
+                }
+            },
+            .f32 => {
+                try cg.emitWValue(operand);
+                try cg.addTag(.i32_reinterpret_f32);
+                return .stack;
+            },
+            .f64 => {
+                try cg.emitWValue(operand);
+                try cg.addTag(.i64_reinterpret_f64);
+                return .stack;
+            },
+        }
+    }

-        const dest_valtype = typeToValtype(dest_ty, zcu, cg.target);
-        const opcode: Mir.Inst.Tag = switch (dest_valtype) {
-            .i32 => .i32_reinterpret_f32,
-            .i64 => .i64_reinterpret_f64,
-            .f32 => .f32_reinterpret_i32,
-            .f64 => .f64_reinterpret_i64,
-            else => unreachable,
-        };
-
-        try cg.emitWValue(operand);
-        try cg.addTag(opcode);
-        return .stack;
+    if (dest_ty.isAnyFloat()) {
+        const float_ty: FloatType = .fromType(cg, dest_ty);
+        switch (float_ty) {
+            .f16, .f80, .f128 => {
+                if (src_signed) {
+                    return try cg.intWrap(.{ .bits = @intCast(bit_size), .is_signed = false }, operand);
+                } else {
+                    return null;
+                }
+            },
+            .f32 => {
+                try cg.emitWValue(operand);
+                try cg.addTag(.f32_reinterpret_i32);
+                return .stack;
+            },
+            .f64 => {
+                try cg.emitWValue(operand);
+                try cg.addTag(.f64_reinterpret_i64);
+                return .stack;
+            },
+        }
    }

    if (isByRef(src_ty, zcu, cg.target) and !isByRef(dest_ty, zcu, cg.target)) {
@@ -5751,23 +5799,27 @@ fn airWrapErrUnionErr(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    return cg.finishAir(inst, result, &.{ty_op.operand});
 }

-fn airIsNull(cg: *CodeGen, inst: Air.Inst.Index, opcode: std.wasm.Opcode, op_kind: enum { value, ptr }) InnerError!void {
-    const zcu = cg.pt.zcu;
+const OpKind = enum { value, ptr };
+
+fn airIsNull(cg: *CodeGen, inst: Air.Inst.Index, opcode: std.wasm.Opcode, op_kind: OpKind) InnerError!void {
    const un_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].un_op;
    const operand = try cg.resolveInst(un_op);

    const op_ty = cg.typeOf(un_op);
-    const optional_ty = if (op_kind == .ptr) op_ty.childType(zcu) else op_ty;
-    const result = try cg.isNull(operand, optional_ty, opcode);
+    const result = try cg.isNull(operand, op_ty, opcode, op_kind);
    return cg.finishAir(inst, result, &.{un_op});
 }

 /// For a given type and operand, checks if it's considered `null`.
 /// NOTE: Leaves the result on the stack
-fn isNull(cg: *CodeGen, operand: WValue, optional_ty: Type, opcode: std.wasm.Opcode) InnerError!WValue {
+fn isNull(cg: *CodeGen, operand: WValue, op_ty: Type, opcode: std.wasm.Opcode, op_kind: OpKind) InnerError!WValue {
    const pt = cg.pt;
    const zcu = pt.zcu;
    try cg.emitWValue(operand);
+    const optional_ty = switch (op_kind) {
+        .value => op_ty,
+        .ptr => op_ty.childType(zcu),
+    };
    const payload_ty = optional_ty.optionalChild(zcu);
    if (!optional_ty.optionalReprIsPayload(zcu)) {
        // When payload is zero-bits, we can treat operand as a value, rather than
@@ -5783,6 +5835,13 @@ fn isNull(cg: *CodeGen, operand: WValue, optional_ty: Type, opcode: std.wasm.Opc
            .wasm32 => try cg.addMemArg(.i32_load, .{ .offset = operand.offset(), .alignment = 4 }),
            .wasm64 => try cg.addMemArg(.i64_load, .{ .offset = operand.offset(), .alignment = 8 }),
        }
+    } else {
+        if (op_kind == .ptr) {
+            try cg.addMemArg(.i32_load, .{
+                .offset = operand.offset(),
+                .alignment = 4,
+            });
+        }
    }

    // Compare the null value with '0'
@@ -5934,10 +5993,7 @@ fn airSliceElemVal(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    try cg.addTag(.i32_mul);
    try cg.addTag(.i32_add);

-    const elem_result = if (isByRef(elem_ty, zcu, cg.target))
-        .stack
-    else
-        try cg.load(.stack, elem_ty, 0);
+    const elem_result = try cg.load(.stack, elem_ty, 0);

    return cg.finishAir(inst, elem_result, &.{ bin_op.lhs, bin_op.rhs });
 }
@@ -5991,10 +6047,7 @@ fn airArrayToSlice(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    // create a slice on the stack
    const slice_local = try cg.allocStack(slice_ty);

-    // store the array ptr in the slice
-    if (array_ty.hasRuntimeBits(zcu)) {
-        try cg.store(slice_local, operand, Type.usize, 0);
-    }
+    try cg.store(slice_local, operand, Type.usize, 0);

    // store the length of the array in the slice
    const array_len: u32 = @intCast(array_ty.arrayLen(zcu));
@@ -6026,10 +6079,7 @@ fn airPtrElemVal(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    try cg.addTag(.i32_mul);
    try cg.addTag(.i32_add);

-    const elem_result = if (isByRef(elem_ty, zcu, cg.target))
-        .stack
-    else
-        try cg.load(.stack, elem_ty, 0);
+    const elem_result = try cg.load(.stack, elem_ty, 0);

    return cg.finishAir(inst, elem_result, &.{ bin_op.lhs, bin_op.rhs });
 }
@@ -6349,12 +6399,8 @@ fn airSplat(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
            else => unreachable,
        }
    }
-    const elem_size = elem_ty.bitSize(zcu);
-    const vector_len = @as(usize, @intCast(ty.vectorLen(zcu)));
-    if ((!std.math.isPowerOfTwo(elem_size) or elem_size % 8 != 0) and vector_len > 1) {
-        return cg.fail("TODO: WebAssembly `@splat` for arbitrary element bitsize {d}", .{elem_size});
-    }

+    const vector_len = @as(usize, @intCast(ty.vectorLen(zcu)));
    const result = try cg.allocStack(ty);
    const elem_byte_size = @as(u32, @intCast(elem_ty.abiSize(zcu)));
    var index: usize = 0;
@@ -6494,7 +6540,7 @@ fn airAggregateInit(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {

    const result: WValue = result_value: {
        switch (result_ty.zigTypeTag(zcu)) {
-            .array => {
+            .array, .vector => {
                const result = try cg.allocStack(result_ty);
                const elem_ty = result_ty.childType(zcu);
                const elem_size = @as(u32, @intCast(elem_ty.abiSize(zcu)));
@@ -6554,7 +6600,6 @@ fn airAggregateInit(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
                    break :result_value result;
                },
            },
-            .vector => return cg.fail("TODO: Wasm backend: implement airAggregateInit for vectors", .{}),
            else => unreachable,
        }
    };
@@ -6792,6 +6837,37 @@ fn airMemcpy(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    return cg.finishAir(inst, .none, &.{ bin_op.lhs, bin_op.rhs });
 }

+fn airMemmove(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
+    const zcu = cg.pt.zcu;
+    const bin_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].bin_op;
+    const dst = try cg.resolveInst(bin_op.lhs);
+    const dst_ty = cg.typeOf(bin_op.lhs);
+    const ptr_elem_ty = dst_ty.childType(zcu);
+    const src = try cg.resolveInst(bin_op.rhs);
+    const src_ty = cg.typeOf(bin_op.rhs);
+    const len = switch (dst_ty.ptrSize(zcu)) {
+        .slice => blk: {
+            const slice_len = try cg.sliceLen(dst);
+            if (ptr_elem_ty.abiSize(zcu) != 1) {
+                try cg.emitWValue(slice_len);
+                try cg.emitWValue(.{ .imm32 = @as(u32, @intCast(ptr_elem_ty.abiSize(zcu))) });
+                try cg.addTag(.i32_mul);
+                try cg.addLocal(.local_set, slice_len.local.value);
+            }
+            break :blk slice_len;
+        },
+        .one => @as(WValue, .{
+            .imm32 = @as(u32, @intCast(ptr_elem_ty.arrayLen(zcu) * ptr_elem_ty.childType(zcu).abiSize(zcu))),
+        }),
+        .c, .many => unreachable,
+    };
+    const dst_ptr = try cg.sliceOrArrayPtr(dst, dst_ty);
+    const src_ptr = try cg.sliceOrArrayPtr(src, src_ty);
+    try cg.memmove(dst_ptr, src_ptr, len);
+
+    return cg.finishAir(inst, .none, &.{ bin_op.lhs, bin_op.rhs });
+}
+
 fn airRetAddr(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    // TODO: Implement this properly once stack serialization is solved
    return cg.finishAir(inst, switch (cg.ptr_size) {
@@ -6992,7 +7068,7 @@ fn airTagName(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {

    const result_ptr = try cg.allocStack(cg.typeOfIndex(inst));
    try cg.lowerToStack(result_ptr);
-    try cg.emitWValue(operand);
+    try cg.lowerToStack(operand);
    try cg.addInst(.{ .tag = .call_tag_name, .data = .{ .ip_index = enum_ty.toIntern() } });

    return cg.finishAir(inst, result_ptr, &.{un_op});
@@ -7195,9 +7271,8 @@ fn airAtomicRmw(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    const ty = cg.typeOfIndex(inst);
    const op: std.builtin.AtomicRmwOp = extra.op();

-    const int_ty: IntType = .fromType(cg, ty);
-
    if (cg.useAtomicFeature()) {
+        const int_ty: IntType = .fromType(cg, ty);
        switch (op) {
            .Max,
            .Min,
@@ -7312,35 +7387,65 @@ fn airAtomicRmw(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
            },
            .Add,
            .Sub,
+            => {
+                if (ty.isAnyFloat()) {
+                    const float_ty: FloatType = .fromType(cg, ty);
+                    try cg.emitWValue(ptr);
+                    _ = switch (op) {
+                        .Add => try cg.floatAdd(float_ty, result, operand),
+                        .Sub => try cg.floatSub(float_ty, result, operand),
+                        else => unreachable,
+                    };
+                    try cg.store(.stack, .stack, ty, ptr.offset());
+                } else {
+                    const int_ty: IntType = .fromType(cg, ty);
+                    try cg.emitWValue(ptr);
+                    _ = switch (op) {
+                        .Add => try cg.intAdd(int_ty, result, operand),
+                        .Sub => try cg.intSub(int_ty, result, operand),
+                        else => unreachable,
+                    };
+                    _ = try cg.intWrap(int_ty, .stack);
+                    try cg.store(.stack, .stack, ty, ptr.offset());
+                }
+            },
            .And,
            .Or,
            .Xor,
            => {
+                const int_ty: IntType = .fromType(cg, ty);
                try cg.emitWValue(ptr);
                _ = switch (op) {
-                    .Add => try cg.intAdd(int_ty, result, operand),
-                    .Sub => try cg.intSub(int_ty, result, operand),
                    .And => try cg.intAnd(int_ty, result, operand),
                    .Or => try cg.intOr(int_ty, result, operand),
                    .Xor => try cg.intXor(int_ty, result, operand),
                    else => unreachable,
                };
-                if (ty.isInt(zcu) and (op == .Add or op == .Sub)) {
-                    _ = try cg.intWrap(int_ty, .stack);
-                }
                try cg.store(.stack, .stack, ty, ptr.offset());
            },
            .Max,
            .Min,
            => {
-                try cg.emitWValue(ptr);
-                try cg.emitWValue(result);
-                try cg.emitWValue(operand);
-                _ = try cg.intCmp(int_ty, if (op == .Max) .gt else .lt, result, operand);
-                try cg.addTag(.select);
-                try cg.store(.stack, .stack, ty, ptr.offset());
+                if (ty.isAnyFloat()) {
+                    const float_ty: FloatType = .fromType(cg, ty);
+                    try cg.emitWValue(ptr);
+                    try cg.emitWValue(result);
+                    try cg.emitWValue(operand);
+                    _ = try cg.floatCmp(float_ty, if (op == .Max) .gt else .lt, result, operand);
+                    try cg.addTag(.select);
+                    try cg.store(.stack, .stack, ty, ptr.offset());
+                } else {
+                    const int_ty: IntType = .fromType(cg, ty);
+                    try cg.emitWValue(ptr);
+                    try cg.emitWValue(result);
+                    try cg.emitWValue(operand);
+                    _ = try cg.intCmp(int_ty, if (op == .Max) .gt else .lt, result, operand);
+                    try cg.addTag(.select);
+                    try cg.store(.stack, .stack, ty, ptr.offset());
+                }
            },
            .Nand => {
+                const int_ty: IntType = .fromType(cg, ty);
                try cg.emitWValue(ptr);
                const and_res = try cg.intAnd(int_ty, result, operand);
                if (int_ty.bits <= 32) {
@@ -7423,49 +7528,51 @@ fn airAsm(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
    else
        .none;

-    var local_map: assembly.LocalMap = .empty;
-    defer local_map.deinit(cg.gpa);
+    if (unwrapped_asm.source.len != 0) {
+        var local_map: assembly.LocalMap = .empty;
+        defer local_map.deinit(cg.gpa);

-    {
-        var it = unwrapped_asm.iterateOutputs();
-        if (it.next()) |output| {
-            const constraint = output.constraint;
-            assert(output.operand == .none);
-            const name = output.name;
+        {
+            var it = unwrapped_asm.iterateOutputs();
+            if (it.next()) |output| {
+                const constraint = output.constraint;
+                assert(output.operand == .none);
+                const name = output.name;

-            if (!mem.eql(u8, constraint, "=r")) {
-                return cg.fail("Self-hosted wasm backend requires output constraint to be equal \"=r\"", .{});
-            }
+                if (!mem.eql(u8, constraint, "=r")) {
+                    return cg.fail("Self-hosted wasm backend requires output constraint to be equal \"=r\"", .{});
+                }

-            const gop = try local_map.getOrPutValue(cg.gpa, name, result.local.value);
-            assert(!gop.found_existing); // first value
+                const gop = try local_map.getOrPutValue(cg.gpa, name, result.local.value);
+                assert(!gop.found_existing); // first value

-            assert(it.next() == null);
-        }
-    }
-
-    {
-        var it = unwrapped_asm.iterateInputs();
-        while (it.next()) |input| {
-            const constraint = input.constraint;
-            const operand = try cg.resolveInst(input.operand);
-            const name = input.name;
-
-            if (!mem.eql(u8, constraint, "r")) {
-                return cg.fail("Self-hosted wasm backend requires input constraint to be equal \"r\"", .{});
-            }
-
-            try cg.lowerToStack(operand);
-            const op_local = try WValue.toLocal(.stack, cg, cg.typeOf(input.operand));
-
-            const gop = try local_map.getOrPutValue(cg.gpa, name, op_local.local.value);
-            if (gop.found_existing) {
-                return cg.fail("Duplicate asm variable name \"{s}\"", .{name});
+                assert(it.next() == null);
            }
        }
-    }

-    try assembly.assemble(cg, unwrapped_asm.source, &local_map);
+        {
+            var it = unwrapped_asm.iterateInputs();
+            while (it.next()) |input| {
+                const constraint = input.constraint;
+                const operand = try cg.resolveInst(input.operand);
+                const name = input.name;
+
+                if (!mem.eql(u8, constraint, "r")) {
+                    return cg.fail("Self-hosted wasm backend requires input constraint to be equal \"r\"", .{});
+                }
+
+                try cg.lowerToStack(operand);
+                const op_local = try WValue.toLocal(.stack, cg, cg.typeOf(input.operand));
+
+                const gop = try local_map.getOrPutValue(cg.gpa, name, op_local.local.value);
+                if (gop.found_existing) {
+                    return cg.fail("Duplicate asm variable name \"{s}\"", .{name});
+                }
+            }
+        }
+
+        try assembly.assemble(cg, unwrapped_asm.source, &local_map);
+    }

    var bt = cg.liveness.iterateBigTomb(inst);
    for (outputs) |output| if (output != .none) cg.feed(&bt, output);
@@ -1028,6 +1028,9 @@ pub const Intrinsic = enum(u32) {
    tanf,
    tanq,
    truncq,
+    memcpy,
+    memmove,
+    memset,
    __addo_limb64,
    __subo_limb64,
    __cmp_limb64,