diff --git a/lib/compiler_rt/limb64.zig b/lib/compiler_rt/limb64.zig
index 4c0126d4b3..a5456b548c 100644
--- a/lib/compiler_rt/limb64.zig
+++ b/lib/compiler_rt/limb64.zig
@@ -839,3 +839,141 @@ test __byteswap_limb64 {
     try test__byteswap_limb64(i128, 1 << 56, 1 << 64);
     try test__byteswap_limb64(i248, minInt(i248), 128);
 }
+
+comptime {
+    symbol(&__mulo_limb64, "__mulo_limb64"); // passes the function and its name to symbol() (declared outside this hunk; presumably the export helper — confirm)
+}
+
+inline fn add3(x: *[3]u64, start: usize, v0: u64) void { // x[start..3] += v0 with carry rippling upward; a carry out of x[2] is dropped
+    var addend = v0; // v0 for the first limb, afterwards the carry bit
+    var idx = start;
+    while (idx < 3) : (idx += 1) {
+        const sum, const carry_out = @addWithOverflow(x[idx], addend);
+        x[idx] = sum;
+        if (carry_out == 0) return; // nothing left to propagate
+        addend = 1;
+    }
+}
+
+fn mulwide(a: u64, b: u64) [2]u64 { // full 128-bit product of a and b, returned as .{ low 64 bits, high 64 bits }
+    const wide: u128 = @bitCast(@import("mulXi3.zig").muldXi(u64, a, b)); // same 128 bits, viewed as unsigned
+    return .{ @as(u64, @truncate(wide)), @as(u64, @truncate(wide >> 64)) }; // split by shifting: an array bit-cast would swap the halves on big-endian targets
+}
+
+fn __mulo_limb64(out_ptr: [*]u64, a_ptr: [*]const u64, b_ptr: [*]const u64, is_signed: bool, bits: u16) callconv(.c) bool { // writes a * b (wrapped) to out; returns true when the exact product does not fit in `bits` bits
+    const limb_cnt = limbCount(bits); // all three buffers are accessed as limb_cnt u64 limbs
+
+    const out = out_ptr[0..limb_cnt];
+    const a = a_ptr[0..limb_cnt];
+    const b = b_ptr[0..limb_cnt];
+
+    @memset(out, 0); // NOTE(review): runs before a/b are read, so an out buffer aliasing an input would destroy that input
+
+    const all_ones = ~@as(u64, 0);
+    const a_neg = is_signed and ((limbGet(a, limb_cnt - 1) >> 63) != 0); // bit 63 of the top limb; assumes the input is sign-extended to a whole limb — TODO confirm
+    const b_neg = is_signed and ((limbGet(b, limb_cnt - 1) >> 63) != 0); // always false for unsigned operands
+
+    var carry: [3]u64 = @splat(0); // part of the previous column sum that spills into later columns
+    var hi_zero = true; // every high-half limb seen so far is 0
+    var hi_ones = true; // every high-half limb seen so far is all ones
+    var hi_borrow: u1 = 0; // borrow between high-half limbs of the signed correction
+    var raw_last: u64 = 0; // top result limb before it is wrapped to `bits`
+
+    var k: usize = 0;
+    while (k < 2 * limb_cnt) : (k += 1) { // one pass per limb of the double-width product
+        var acc = carry; // column accumulator: acc[0] is this limb, acc[1] and acc[2] carry upward
+
+        var i: usize = if (k < limb_cnt) 0 else k - (limb_cnt - 1); // smallest i that keeps j = k - i below limb_cnt
+        while (i < limb_cnt and i <= k) : (i += 1) {
+            const j = k - i;
+            if (j >= limb_cnt) continue; // defensive: the starting i above already keeps j in range
+
+            const p = mulwide(limbGet(a, i), limbGet(b, j)); // 128-bit partial product for column k
+            add3(&acc, 0, p[0]); // p[0] is used as the low half
+            add3(&acc, 1, p[1]); // p[1] is used as the high half, one limb up
+        }
+
+        var limb = acc[0];
+        if (k < limb_cnt) { // low half: this limb is part of the result
+            limbSet(out, k, limb);
+            if (k == limb_cnt - 1) raw_last = limb;
+        } else { // high half: only inspected to decide overflow, never stored
+            if (is_signed) {
+                const h = k - limb_cnt; // limb index within the high half
+
+                const s0 = @subWithOverflow(limb, if (a_neg) limbGet(b, h) else 0); // the limbs were multiplied as unsigned: a negative a added b << (64 * limb_cnt)
+                const s1 = @subWithOverflow(s0[0], if (b_neg) limbGet(a, h) else 0); // likewise a negative b added a << (64 * limb_cnt)
+                const s2 = @subWithOverflow(s1[0], hi_borrow); // borrow from the previous high limb
+
+                limb = s2[0];
+                hi_borrow = @intFromBool(s0[1] != 0 or s1[1] != 0 or s2[1] != 0); // NOTE(review): the subtractions can borrow 2 in total but one bit is kept — confirm this cannot change the verdict
+            }
+
+            hi_zero = hi_zero and limb == 0;
+            hi_ones = hi_ones and limb == all_ones;
+        }
+
+        carry = .{ acc[1], acc[2], 0 }; // shift the accumulator down one limb for the next column
+    }
+
+    const last = if (bits % 64 == 0) raw_last else limbWrap(raw_last, is_signed, bits); // limbWrap is defined outside this hunk; presumably wraps the partial top limb to `bits`
+    if (bits % 64 != 0) {
+        limbSet(out, limb_cnt - 1, last); // store the wrapped top limb in place of the raw one
+    }
+
+    if (!is_signed) {
+        return !hi_zero or raw_last != last; // overflow: a non-zero high limb, or wrapping changed the top limb
+    }
+
+    const sign_extend: u64 = if ((last >> 63) == 1) all_ones else 0; // value every high limb must have for the result to fit
+    return (raw_last != last) or if (sign_extend == 0) !hi_zero else !hi_ones; // overflow: wrapping changed the top limb, or the high half is not its sign extension
+}
+
+fn test__mulo_limb64(comptime T: type, a: T, b: T, expected: struct { T, bool }) !void { // runs __mulo_limb64 on a and b and compares against expected
+    const want_value, const want_overflow = expected; // .{ expected result value, expected overflow flag }
+    const info = @typeInfo(T).int;
+
+    var lhs = asLimbs(a);
+    var rhs = asLimbs(b);
+    var product: Limbs(T) = undefined; // passed to the call below as its output buffer
+    const signed = info.signedness == .signed;
+    const got_overflow = __mulo_limb64(&product, &lhs, &rhs, signed, info.bits);
+
+    try testing.expectEqual(asLimbs(want_value), product);
+    try testing.expectEqual(want_overflow, got_overflow);
+}
+
+test __mulo_limb64 { // each case: type, a, b, .{ expected wrapped product, expected overflow flag }
+    try test__mulo_limb64(u64, 3, 5, .{ 15, false }); // unsigned, widths 64 to 255
+    try test__mulo_limb64(u64, maxInt(u64), 2, .{ maxInt(u64) - 1, true });
+    try test__mulo_limb64(u65, 1 << 32, 1 << 32, .{ 1 << 64, false }); // result uses exactly the top bit of u65
+    try test__mulo_limb64(u65, 1 << 64, 2, .{ 0, true }); // product is 1 << 65, one bit too wide
+    try test__mulo_limb64(u128, 1 << 80, 1 << 40, .{ 1 << 120, false });
+    try test__mulo_limb64(u128, 1 << 100, 1 << 40, .{ 0, true });
+    try test__mulo_limb64(u255, 7, 9, .{ 63, false });
+    try test__mulo_limb64(u255, maxInt(u255), 2, .{ maxInt(u255) - 1, true });
+
+    try test__mulo_limb64(i64, -3, 2, .{ -6, false }); // signed, widths 64 to 255
+    try test__mulo_limb64(i64, maxInt(i64), 2, .{ -2, true });
+    try test__mulo_limb64(i65, 1 << 63, 2, .{ minInt(i65), true }); // 1 << 64 is one past maxInt(i65)
+    try test__mulo_limb64(i65, -1 << 32, 1 << 16, .{ -1 << 48, false });
+    try test__mulo_limb64(i128, 1 << 100, 1 << 27, .{ minInt(i128), true });
+    try test__mulo_limb64(i128, -1 << 80, 1 << 40, .{ -1 << 120, false });
+    try test__mulo_limb64(i255, -3, 2, .{ -6, false });
+    try test__mulo_limb64(i255, maxInt(i255), 2, .{ -2, true });
+
+    try test__mulo_limb64(u200, 0, maxInt(u200), .{ 0, false }); // u200: zero, identity, and the fit/overflow boundary
+    try test__mulo_limb64(u200, 1, maxInt(u200), .{ maxInt(u200), false });
+    try test__mulo_limb64(u200, 1 << 100, 1 << 99, .{ 1 << 199, false });
+    try test__mulo_limb64(u200, 1 << 100, 1 << 100, .{ 0, true });
+    try test__mulo_limb64(u200, maxInt(u200), maxInt(u200), .{ 1, true });
+
+    try test__mulo_limb64(i200, 0, -1, .{ 0, false }); // i200: negative operands and extreme values
+    try test__mulo_limb64(i200, -1, -1, .{ 1, false });
+    try test__mulo_limb64(i200, -1, minInt(i200), .{ minInt(i200), true }); // -minInt is not representable
+    try test__mulo_limb64(i200, maxInt(i200), 2, .{ -2, true });
+    try test__mulo_limb64(i200, 1 << 100, 1 << 98, .{ 1 << 198, false });
+    try test__mulo_limb64(i200, 1 << 100, 1 << 99, .{ minInt(i200), true });
+    try test__mulo_limb64(i200, maxInt(i200), maxInt(i200), .{ 1, true });
+    try test__mulo_limb64(i200, minInt(i200), minInt(i200), .{ 0, true });
+}
diff --git a/lib/compiler_rt/mulXi3.zig b/lib/compiler_rt/mulXi3.zig
index 41df7283ea..9fce9754f3 100644
--- a/lib/compiler_rt/mulXi3.zig
+++ b/lib/compiler_rt/mulXi3.zig
@@ -63,7 +63,7 @@ fn DoubleInt(comptime T: type) type {
     };
 }
 
-fn muldXi(comptime T: type, a: T, b: T) DoubleInt(T) {
+pub fn muldXi(comptime T: type, a: T, b: T) DoubleInt(T) {
     const DT = DoubleInt(T);
     const word_t = compiler_rt.HalveInt(DT, false);
     const bits_in_word_2 = @sizeOf(T) * 8 / 2;
diff --git a/src/codegen/wasm/CodeGen.zig b/src/codegen/wasm/CodeGen.zig
index 19fb654ae8..a2750fa02c 100644
--- a/src/codegen/wasm/CodeGen.zig
+++ b/src/codegen/wasm/CodeGen.zig
@@ -2480,7 +2480,19 @@ fn intMul(cg: *CodeGen, ty: IntType, lhs: WValue, rhs: WValue) InnerError!WValue
             return .stack;
         },
         65...128 => return cg.callIntrinsic(.__multi3, &.{ .i128_type, .i128_type }, Type.i128, &.{ lhs, rhs }),
-        else => return cg.fail("TODO: Support intMul for integer bitsize: {d}", .{ty.bits}),
+        else => {
+            const result = try cg.allocInt(ty);
+
+            try cg.lowerToStack(result);
+            try cg.lowerToStack(lhs);
+            try cg.lowerToStack(rhs);
+            try cg.addImm32(@intFromBool(ty.is_signed));
+            try cg.addImm32(ty.bits);
+            try cg.addCallIntrinsic(.__mulo_limb64);
+            try cg.addTag(.drop);
+
+            return result;
+        },
     }
 }
 
@@ -3680,68 +3692,6 @@ fn intMulOverflow(cg: *CodeGen, int_ty: IntType, lhs: WValue, rhs: WValue) Inner
         _ = try cg.intCmp(new_ty, .neq, res_upcast, bin_op);
         try cg.addLocal(.local_set, overflow_bit.local.value);
         break :blk res_tmp;
-    } else if (int_ty.bits == 128 and !int_ty.is_signed) blk: {
-        var lhs_lsb = try (try cg.load(lhs, Type.u64, 0)).toLocal(cg, Type.u64);
-        defer lhs_lsb.free(cg);
-        var lhs_msb = try (try cg.load(lhs, Type.u64, 8)).toLocal(cg, Type.u64);
-        defer lhs_msb.free(cg);
-        var rhs_lsb = try (try cg.load(rhs, Type.u64, 0)).toLocal(cg, Type.u64);
-        defer rhs_lsb.free(cg);
-        var rhs_msb = try (try cg.load(rhs, Type.u64, 8)).toLocal(cg, Type.u64);
-        defer rhs_msb.free(cg);
-
-        const zero: WValue = .{ .imm64 = 0 };
-
-        const cross_1 = try cg.callIntrinsic(
-            .__multi3,
-            &[_]InternPool.Index{.i64_type} ** 4,
-            Type.i128,
-            &.{ lhs_msb, zero, rhs_lsb, zero },
-        );
-        const cross_2 = try cg.callIntrinsic(
-            .__multi3,
-            &[_]InternPool.Index{.i64_type} ** 4,
-            Type.i128,
-            &.{ rhs_msb, zero, lhs_lsb, zero },
-        );
-        const mul_lsb = try cg.callIntrinsic(
-            .__multi3,
-            &[_]InternPool.Index{.i64_type} ** 4,
-            Type.i128,
-            &.{ rhs_lsb, zero, lhs_lsb, zero },
-        );
-
-        const rhs_msb_not_zero = try cg.intCmp(.u64, .neq, rhs_msb, zero);
-        const lhs_msb_not_zero = try cg.intCmp(.u64, .neq, lhs_msb, zero);
-        const both_msb_not_zero = try cg.intAnd(.u32, rhs_msb_not_zero, lhs_msb_not_zero);
-
-        const cross_1_msb = try cg.load(cross_1, .u64, 8);
-        const cross_1_msb_not_zero = try cg.intCmp(.u64, .neq, cross_1_msb, zero);
-        const cond_1 = try cg.intOr(.u32, both_msb_not_zero, cross_1_msb_not_zero);
-
-        const cross_2_msb = try cg.load(cross_2, Type.u64, 8);
-        const cross_2_msb_not_zero = try cg.intCmp(.u64, .neq, cross_2_msb, zero);
-        const cond_2 = try cg.intOr(.u32, cond_1, cross_2_msb_not_zero);
-
-        const cross_1_lsb = try cg.load(cross_1, Type.u64, 0);
-        const cross_2_lsb = try cg.load(cross_2, Type.u64, 0);
-        const cross_add = try cg.intAdd(.u64, cross_1_lsb, cross_2_lsb);
-
-        var mul_lsb_msb = try (try cg.load(mul_lsb, Type.u64, 8)).toLocal(cg, Type.u64);
-        defer mul_lsb_msb.free(cg);
-        var all_add = try (try cg.intAdd(.u64, cross_add, mul_lsb_msb)).toLocal(cg, Type.u64);
-        defer all_add.free(cg);
-        const add_overflow = try cg.intCmp(.u64, .lt, all_add, mul_lsb_msb);
-
-        _ = try cg.intOr(.u32, cond_2, add_overflow);
-        try cg.addLocal(.local_set, overflow_bit.local.value);
-
-        const tmp_result = try cg.allocStack(Type.u128);
-        try cg.emitWValue(tmp_result);
-        const mul_lsb_lsb = try cg.load(mul_lsb, Type.u64, 0);
-        try cg.store(.stack, mul_lsb_lsb, Type.u64, tmp_result.offset());
-        try cg.store(tmp_result, all_add, Type.u64, 8);
-        break :blk tmp_result;
     } else if (int_ty.bits == 128 and int_ty.is_signed) blk: {
         const overflow_ret = try cg.allocStack(Type.i32);
         const res = try cg.callIntrinsic(
@@ -3753,7 +3703,18 @@ fn intMulOverflow(cg: *CodeGen, int_ty: IntType, lhs: WValue, rhs: WValue) Inner
         _ = try cg.load(overflow_ret, Type.i32, 0);
         try cg.addLocal(.local_set, overflow_bit.local.value);
         break :blk res;
-    } else return cg.fail("TODO: intMulOverflow for bitsize {d}", .{int_ty.bits});
+    } else {
+        const result = try cg.allocInt(int_ty);
+
+        try cg.lowerToStack(result);
+        try cg.lowerToStack(lhs);
+        try cg.lowerToStack(rhs);
+        try cg.addImm32(@intFromBool(int_ty.is_signed));
+        try cg.addImm32(int_ty.bits);
+        try cg.addCallIntrinsic(.__mulo_limb64);
+
+        return .{ .result = result, .ov = .stack };
+    };
 
     return .{ .result = result_val, .ov = .{ .local = overflow_bit.local } };
 }
diff --git a/src/codegen/wasm/Mir.zig b/src/codegen/wasm/Mir.zig
index d7297af253..74b6c33020 100644
--- a/src/codegen/wasm/Mir.zig
+++ b/src/codegen/wasm/Mir.zig
@@ -1018,4 +1018,5 @@ pub const Intrinsic = enum(u32) {
     __popcount_limb64,
     __bitreverse_limb64,
     __byteswap_limb64,
+    __mulo_limb64,
 };
diff --git a/test/behavior/math.zig b/test/behavior/math.zig
index 085a72f390..20a1c4ed3e 100644
--- a/test/behavior/math.zig
+++ b/test/behavior/math.zig
@@ -1100,10 +1100,37 @@ test "@mulWithOverflow bitsize 128 bits" {
     try testMulWithOverflow(i128, -1 << 63, -1 << 64, -1 << 127, 1);
 }
 
+test "@mulWithOverflow > 128 bits" { // widths that are not a multiple of 64 (140) and that are not a power of two (200)
+    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // NOTE(review): the 256-bit test below skips more backends — confirm this one passes on them
+
+    try testMulWithOverflow(u140, 0, maxInt(u140), 0, 0); // arguments: type, a, b, expected result, expected overflow bit
+    try testMulWithOverflow(u140, 1, maxInt(u140), maxInt(u140), 0);
+    try testMulWithOverflow(u140, 1 << 70, 1 << 69, 1 << 139, 0); // largest power-of-two product that still fits
+    try testMulWithOverflow(u140, 1 << 70, 1 << 70, 0, 1); // 1 << 140 wraps to 0
+
+    try testMulWithOverflow(u200, 1 << 100, 1 << 99, 1 << 199, 0);
+    try testMulWithOverflow(u200, 1 << 100, 1 << 100, 0, 1);
+    try testMulWithOverflow(u200, maxInt(u200), maxInt(u200), 1, 1);
+    try testMulWithOverflow(u200, maxInt(u200) - 1, 2, maxInt(u200) - 3, 1);
+
+    try testMulWithOverflow(i140, 0, -1, 0, 0); // signed cases start here
+    try testMulWithOverflow(i140, -1, -1, 1, 0);
+    try testMulWithOverflow(i140, 1 << 69, 1 << 69, 1 << 138, 0);
+    try testMulWithOverflow(i140, 1 << 69, 1 << 70, minInt(i140), 1); // 1 << 139 is one past maxInt(i140)
+    try testMulWithOverflow(i140, -1 << 70, 1 << 20, -1 << 90, 0);
+    try testMulWithOverflow(i140, minInt(i140), -1, minInt(i140), 1); // -minInt is not representable
+
+    try testMulWithOverflow(i200, 1 << 100, 1 << 98, 1 << 198, 0);
+    try testMulWithOverflow(i200, 1 << 100, 1 << 99, minInt(i200), 1);
+    try testMulWithOverflow(i200, -1 << 120, 1 << 30, -1 << 150, 0);
+    try testMulWithOverflow(i200, minInt(i200), minInt(i200), 0, 1);
+    try testMulWithOverflow(i200, maxInt(i200), 2, -2, 1);
+    try testMulWithOverflow(i200, maxInt(i200), maxInt(i200), 1, 1);
+}
+
 test "@mulWithOverflow bitsize 256 bits" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_spirv) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;