zig/lib/std/math/float.zig

const std = @import("../std.zig");
const builtin = @import("builtin");
const assert = std.debug.assert;
const expect = std.testing.expect;
const expectEqual = std.testing.expectEqual;

/// A namespace for functions that deal with floats that provide greater than
/// double precision (`f80`, `f128`, `c_longdouble`). Commonly referred to as
/// `long double` in C.
pub const long_double = struct {
    const U80 = @Int(.unsigned, 80);

    inline fn bitWidth(x: anytype) u16 {
        const T = @TypeOf(x);
        return switch (T) {
            f80, f128, c_longdouble => @typeInfo(T).float.bits,
            else => @compileError("Unsupported type: " ++ @typeName(T) ++ "\nPass a `f80`, `f128`, or `c_longdouble`."),
        };
    }

    /// Returns the sign + exponent bits of a `long double`.
    pub fn signExponent(x: anytype) u16 {
        const bit_width = bitWidth(x);
        switch (bit_width) {
            80 => {
                const bits: U80 = @bitCast(x);
                return @intCast(bits >> 64);
            },
            128 => {
                const bits: u128 = @bitCast(x);
                return @intCast(bits >> 112);
            },
            // `c_longdouble` can have <80 bits on some targets, we want to error on that
            else => @compileError(std.fmt.comptimePrint("`signExponent` supports floats of only `80` and `128` bit width, got bit width: {d}", .{bit_width})),
        }
    }

    test "signExponent" {
        try expectEqual(signExponent(@as(f80, -0.0)), 0x8000);
        try expectEqual(signExponent(@as(f128, 0.0)), 0x0000);
        try expectEqual(signExponent(@as(f128, 42.0)), 0x4004);
        try expectEqual(signExponent(nan(c_longdouble)), 0x7FFF);
    }

    /// Takes the top 16 bits of a `long double`'s mantissa.
    pub fn mantissaTop(x: anytype) u16 {
        const bit_width = bitWidth(x);
        switch (bit_width) {
            80 => {
                const bits: U80 = @bitCast(x);
                return @intCast((bits >> 48) & 0xFFFF);
            },
            128 => {
                const bits: u128 = @bitCast(x);
                return @intCast((bits >> 96) & 0xFFFF);
            },
            // `c_longdouble` can have <80 bits on some targets, we want to error on that
            else => @compileError(std.fmt.comptimePrint("`mantissaTop` supports floats of only `80` and `128` bit width, got bit width: {d}", .{bit_width})),
        }
    }

    test "mantissaTop" {
        try expectEqual(mantissaTop(@as(f80, -0.0)), 0x0000);
        try expectEqual(mantissaTop(nan(f128)), 0x8000);
        try expectEqual(mantissaTop(@as(f128, 42.0)), 0x5000);
    }
};

pub fn FloatRepr(comptime Float: type) type {
    const fractional_bits = floatFractionalBits(Float);
    const exponent_bits = floatExponentBits(Float);
    return packed struct {
        const Repr = @This();

        mantissa: StoredMantissa,
        exponent: BiasedExponent,
        sign: std.math.Sign,

        pub const StoredMantissa = @Int(.unsigned, floatMantissaBits(Float));
        pub const Mantissa = @Int(.unsigned, 1 + fractional_bits);
        pub const Exponent = @Int(.signed, exponent_bits);
        pub const BiasedExponent = enum(@Int(.unsigned, exponent_bits)) {
            denormal = 0,
            min_normal = 1,
            zero = (1 << (exponent_bits - 1)) - 1,
            max_normal = (1 << exponent_bits) - 2,
            infinite = (1 << exponent_bits) - 1,
            _,

            pub const Int = @typeInfo(BiasedExponent).@"enum".tag_type;

            pub fn unbias(biased: BiasedExponent) Exponent {
                switch (biased) {
                    .denormal => unreachable,
                    else => return @bitCast(@intFromEnum(biased) -% @intFromEnum(BiasedExponent.zero)),
                    .infinite => unreachable,
                }
            }

            pub fn bias(unbiased: Exponent) BiasedExponent {
                return @enumFromInt(@intFromEnum(BiasedExponent.zero) +% @as(Int, @bitCast(unbiased)));
            }
        };

        pub const Normalized = struct {
            fraction: Fraction,
            exponent: Normalized.Exponent,

            pub const Fraction = @Int(.unsigned, fractional_bits);
            pub const Exponent = @Int(.signed, 1 + exponent_bits);

            /// This currently truncates denormal values, which needs to be fixed before this can be used to
            /// produce a rounded value.
            pub fn reconstruct(normalized: Normalized, sign: std.math.Sign) Float {
                if (normalized.exponent > BiasedExponent.max_normal.unbias()) return @bitCast(Repr{
                    .mantissa = 0,
                    .exponent = .infinite,
                    .sign = sign,
                });
                const mantissa = @as(Mantissa, 1 << fractional_bits) | normalized.fraction;
                if (normalized.exponent < BiasedExponent.min_normal.unbias()) return @bitCast(Repr{
                    .mantissa = @truncate(std.math.shr(
                        Mantissa,
                        mantissa,
                        BiasedExponent.min_normal.unbias() - normalized.exponent,
                    )),
                    .exponent = .denormal,
                    .sign = sign,
                });
                return @bitCast(Repr{
                    .mantissa = @truncate(mantissa),
                    .exponent = .bias(@intCast(normalized.exponent)),
                    .sign = sign,
                });
            }
        };

        pub const Classified = union(enum) { normalized: Normalized, infinity, nan, invalid };
        fn classify(repr: Repr) Classified {
            return switch (repr.exponent) {
                .denormal => {
                    const mantissa: Mantissa = repr.mantissa;
                    const shift = @clz(mantissa);
                    return .{ .normalized = .{
                        .fraction = @truncate(mantissa << shift),
                        .exponent = @as(Normalized.Exponent, comptime BiasedExponent.min_normal.unbias()) - shift,
                    } };
                },
                else => if (repr.mantissa <= std.math.maxInt(Normalized.Fraction)) .{ .normalized = .{
                    .fraction = @intCast(repr.mantissa),
                    .exponent = repr.exponent.unbias(),
                } } else .invalid,
                .infinite => switch (repr.mantissa) {
                    0 => .infinity,
                    else => .nan,
                },
            };
        }
    };
}

/// Creates a raw "1.0" mantissa for floating point type T. Used to dedupe f80 logic.
inline fn mantissaOne(comptime T: type) comptime_int {
    return if (@typeInfo(T).float.bits == 80) 1 << floatFractionalBits(T) else 0;
}

/// Creates floating point type T from an unbiased exponent and raw mantissa.
inline fn reconstructFloat(comptime T: type, comptime exponent: comptime_int, comptime mantissa: comptime_int) T {
    const TBits = @Int(.unsigned, @bitSizeOf(T));
    const biased_exponent = @as(TBits, exponent + floatExponentMax(T));
    return @as(T, @bitCast((biased_exponent << floatMantissaBits(T)) | @as(TBits, mantissa)));
}

/// Returns the number of bits in the exponent of floating point type T.
pub inline fn floatExponentBits(comptime T: type) comptime_int {
    comptime assert(@typeInfo(T) == .float);

    return switch (@typeInfo(T).float.bits) {
        16 => 5,
        32 => 8,
        64 => 11,
        80 => 15,
        128 => 15,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}

/// Returns the number of bits in the mantissa of floating point type T.
pub inline fn floatMantissaBits(comptime T: type) comptime_int {
    comptime assert(@typeInfo(T) == .float);

    return switch (@typeInfo(T).float.bits) {
        16 => 10,
        32 => 23,
        64 => 52,
        80 => 64,
        128 => 112,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}

/// Returns the number of fractional bits in the mantissa of floating point type T.
pub inline fn floatFractionalBits(comptime T: type) comptime_int {
    comptime assert(@typeInfo(T) == .float);

    // standard IEEE floats have an implicit 0.m or 1.m integer part
    // f80 is special and has an explicitly stored bit in the MSB
    // this function corresponds to `MANT_DIG - 1' from C
    return switch (@typeInfo(T).float.bits) {
        16 => 10,
        32 => 23,
        64 => 52,
        80 => 63,
        128 => 112,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}

/// Returns the minimum exponent that can represent
/// a normalised value in floating point type T.
pub inline fn floatExponentMin(comptime T: type) comptime_int {
    return -floatExponentMax(T) + 1;
}

/// Returns the maximum exponent that can represent
/// a normalised value in floating point type T.
pub inline fn floatExponentMax(comptime T: type) comptime_int {
    return (1 << (floatExponentBits(T) - 1)) - 1;
}

/// Returns the smallest subnormal number representable in floating point type T.
pub inline fn floatTrueMin(comptime T: type) T {
    return reconstructFloat(T, floatExponentMin(T) - 1, 1);
}

/// Returns the smallest normal number representable in floating point type T.
pub inline fn floatMin(comptime T: type) T {
    return reconstructFloat(T, floatExponentMin(T), mantissaOne(T));
}

/// Returns the largest normal number representable in floating point type T.
pub inline fn floatMax(comptime T: type) T {
    const all1s_mantissa = (1 << floatMantissaBits(T)) - 1;
    return reconstructFloat(T, floatExponentMax(T), all1s_mantissa);
}

/// Returns the machine epsilon of floating point type T.
pub inline fn floatEps(comptime T: type) T {
    return reconstructFloat(T, -floatFractionalBits(T), mantissaOne(T));
}

/// Returns the local epsilon of floating point type T.
pub inline fn floatEpsAt(comptime T: type, x: T) T {
    switch (@typeInfo(T)) {
        .float => |F| {
            const U: type = @Int(.unsigned, F.bits);
            const u: U = @bitCast(x);
            const y: T = @bitCast(u ^ 1);
            return @abs(x - y);
        },
        else => @compileError("floatEpsAt only supports floats"),
    }
}

/// Returns the inf value for a floating point `Type`.
pub inline fn inf(comptime Type: type) Type {
    const RuntimeType = switch (Type) {
        else => Type,
        comptime_float => f128, // any float type will do
    };
    return reconstructFloat(RuntimeType, floatExponentMax(RuntimeType) + 1, mantissaOne(RuntimeType));
}

/// Returns the canonical quiet NaN representation for a floating point `Type`.
pub inline fn nan(comptime Type: type) Type {
    const RuntimeType = switch (Type) {
        else => Type,
        comptime_float => f128, // any float type will do
    };
    return reconstructFloat(
        RuntimeType,
        floatExponentMax(RuntimeType) + 1,
        mantissaOne(RuntimeType) | 1 << (floatFractionalBits(RuntimeType) - 1),
    );
}

/// Returns a signalling NaN representation for a floating point `Type`.
///
/// TODO: LLVM is known to miscompile on some architectures to quiet NaN -
///       this is tracked by https://github.com/ziglang/zig/issues/14366
pub inline fn snan(comptime Type: type) Type {
    const RuntimeType = switch (Type) {
        else => Type,
        comptime_float => f128, // any float type will do
    };
    return reconstructFloat(
        RuntimeType,
        floatExponentMax(RuntimeType) + 1,
        mantissaOne(RuntimeType) | 1 << (floatFractionalBits(RuntimeType) - 2),
    );
}

fn floatBits(comptime Type: type) !void {
    // (1 +) for the sign bit, since it is separate from the other bits
    const size = 1 + floatExponentBits(Type) + floatMantissaBits(Type);
    try expect(@bitSizeOf(Type) == size);
    try expect(floatFractionalBits(Type) <= floatMantissaBits(Type));

    // for machine epsilon, assert expmin <= -prec <= expmax
    try expect(floatExponentMin(Type) <= -floatFractionalBits(Type));
    try expect(-floatFractionalBits(Type) <= floatExponentMax(Type));
}
test floatBits {
    try floatBits(f16);
    try floatBits(f32);
    try floatBits(f64);
    try floatBits(f80);
    try floatBits(f128);
    try floatBits(c_longdouble);
}

test inf {
    const inf_u16: u16 = 0x7C00;
    const inf_u32: u32 = 0x7F800000;
    const inf_u64: u64 = 0x7FF0000000000000;
    const inf_u80: u80 = 0x7FFF8000000000000000;
    const inf_u128: u128 = 0x7FFF0000000000000000000000000000;
    try expectEqual(inf_u16, @as(u16, @bitCast(inf(f16))));
    try expectEqual(inf_u32, @as(u32, @bitCast(inf(f32))));
    try expectEqual(inf_u64, @as(u64, @bitCast(inf(f64))));
    try expectEqual(inf_u80, @as(u80, @bitCast(inf(f80))));
    try expectEqual(inf_u128, @as(u128, @bitCast(inf(f128))));
}

test nan {
    const qnan_u16: u16 = 0x7E00;
    const qnan_u32: u32 = 0x7FC00000;
    const qnan_u64: u64 = 0x7FF8000000000000;
    const qnan_u80: u80 = 0x7FFFC000000000000000;
    const qnan_u128: u128 = 0x7FFF8000000000000000000000000000;
    try expectEqual(qnan_u16, @as(u16, @bitCast(nan(f16))));
    try expectEqual(qnan_u32, @as(u32, @bitCast(nan(f32))));
    try expectEqual(qnan_u64, @as(u64, @bitCast(nan(f64))));
    try expectEqual(qnan_u80, @as(u80, @bitCast(nan(f80))));
    try expectEqual(qnan_u128, @as(u128, @bitCast(nan(f128))));
}

test snan {
    const snan_u16: u16 = 0x7D00;
    const snan_u32: u32 = 0x7FA00000;
    const snan_u64: u64 = 0x7FF4000000000000;
    const snan_u80: u80 = 0x7FFFA000000000000000;
    const snan_u128: u128 = 0x7FFF4000000000000000000000000000;
    try expectEqual(snan_u16, @as(u16, @bitCast(snan(f16))));
    try expectEqual(snan_u32, @as(u32, @bitCast(snan(f32))));
    try expectEqual(snan_u64, @as(u64, @bitCast(snan(f64))));
    try expectEqual(snan_u80, @as(u80, @bitCast(snan(f80))));
    try expectEqual(snan_u128, @as(u128, @bitCast(snan(f128))));
}