zig/deps/aro/Tokenizer.zig

const std = @import("std");
const assert = std.debug.assert;
const Compilation = @import("Compilation.zig");
const Source = @import("Source.zig");
const LangOpts = @import("LangOpts.zig");
const CharInfo = @import("CharInfo.zig");
const unicode = @import("unicode.zig");

const Tokenizer = @This();

pub const Token = struct {
    id: Id,
    source: Source.Id,
    start: u32 = 0,
    end: u32 = 0,
    line: u32 = 0,

    pub const Id = enum(u8) {
        invalid,
        nl,
        whitespace,
        eof,
        /// identifier containing solely basic character set characters
        identifier,
        /// identifier with at least one extended character
        extended_identifier,

        // string literals with prefixes
        string_literal,
        string_literal_utf_16,
        string_literal_utf_8,
        string_literal_utf_32,
        string_literal_wide,

        // <foobar> only generated by preprocessor
        macro_string,

        // char literals with prefixes
        char_literal,
        char_literal_utf_8,
        char_literal_utf_16,
        char_literal_utf_32,
        char_literal_wide,

        /// Integer literal tokens generated by preprocessor.
        one,
        zero,

        bang,
        bang_equal,
        pipe,
        pipe_pipe,
        pipe_equal,
        equal,
        equal_equal,
        l_paren,
        r_paren,
        l_brace,
        r_brace,
        l_bracket,
        r_bracket,
        period,
        ellipsis,
        caret,
        caret_equal,
        plus,
        plus_plus,
        plus_equal,
        minus,
        minus_minus,
        minus_equal,
        asterisk,
        asterisk_equal,
        percent,
        percent_equal,
        arrow,
        colon,
        colon_colon,
        semicolon,
        slash,
        slash_equal,
        comma,
        ampersand,
        ampersand_ampersand,
        ampersand_equal,
        question_mark,
        angle_bracket_left,
        angle_bracket_left_equal,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_equal,
        angle_bracket_right,
        angle_bracket_right_equal,
        angle_bracket_angle_bracket_right,
        angle_bracket_angle_bracket_right_equal,
        tilde,
        hash,
        hash_hash,

        /// Special token to speed up preprocessing, `end` will be an index to the param list.
        macro_param,
        /// Special token to signal that the argument must be replaced without expansion (e.g. in concatenation)
        macro_param_no_expand,
        /// Special token to speed up preprocessing, `end` will be an index to the param list.
        stringify_param,
        /// Same as stringify_param, but for var args
        stringify_va_args,
        /// Special macro whitespace, always equal to a single space
        macro_ws,
        /// Special token for implementing __has_attribute
        macro_param_has_attribute,
        /// Special token for implementing __has_warning
        macro_param_has_warning,
        /// Special token for implementing __has_feature
        macro_param_has_feature,
        /// Special token for implementing __has_extension
        macro_param_has_extension,
        /// Special token for implementing __has_builtin
        macro_param_has_builtin,
        /// Special token for implementing __has_include
        macro_param_has_include,
        /// Special token for implementing __has_include_next
        macro_param_has_include_next,
        /// Special token for implementing __is_identifier
        macro_param_is_identifier,
        /// Special token for implementing __FILE__
        macro_file,
        /// Special token for implementing __LINE__
        macro_line,
        /// Special token for implementing __COUNTER__
        macro_counter,
        /// Special token for implementing _Pragma
        macro_param_pragma_operator,

        /// Special identifier for implementing __func__
        macro_func,
        /// Special identifier for implementing __FUNCTION__
        macro_function,
        /// Special identifier for implementing __PRETTY_FUNCTION__
        macro_pretty_func,

        keyword_auto,
        keyword_auto_type,
        keyword_break,
        keyword_case,
        keyword_char,
        keyword_const,
        keyword_continue,
        keyword_default,
        keyword_do,
        keyword_double,
        keyword_else,
        keyword_enum,
        keyword_extern,
        keyword_float,
        keyword_for,
        keyword_goto,
        keyword_if,
        keyword_int,
        keyword_long,
        keyword_register,
        keyword_return,
        keyword_short,
        keyword_signed,
        keyword_sizeof,
        keyword_static,
        keyword_struct,
        keyword_switch,
        keyword_typedef,
        keyword_typeof1,
        keyword_typeof2,
        keyword_union,
        keyword_unsigned,
        keyword_void,
        keyword_volatile,
        keyword_while,

        // ISO C99
        keyword_bool,
        keyword_complex,
        keyword_imaginary,
        keyword_inline,
        keyword_restrict,

        // ISO C11
        keyword_alignas,
        keyword_alignof,
        keyword_atomic,
        keyword_generic,
        keyword_noreturn,
        keyword_static_assert,
        keyword_thread_local,

        // ISO C23
        keyword_bit_int,
        keyword_c23_alignas,
        keyword_c23_alignof,
        keyword_c23_bool,
        keyword_c23_static_assert,
        keyword_c23_thread_local,
        keyword_constexpr,
        keyword_true,
        keyword_false,
        keyword_nullptr,

        // Preprocessor directives
        keyword_include,
        keyword_include_next,
        keyword_embed,
        keyword_define,
        keyword_defined,
        keyword_undef,
        keyword_ifdef,
        keyword_ifndef,
        keyword_elif,
        keyword_elifdef,
        keyword_elifndef,
        keyword_endif,
        keyword_error,
        keyword_warning,
        keyword_pragma,
        keyword_line,
        keyword_va_args,

        // gcc keywords
        keyword_const1,
        keyword_const2,
        keyword_inline1,
        keyword_inline2,
        keyword_volatile1,
        keyword_volatile2,
        keyword_restrict1,
        keyword_restrict2,
        keyword_alignof1,
        keyword_alignof2,
        keyword_typeof,
        keyword_attribute1,
        keyword_attribute2,
        keyword_extension,
        keyword_asm,
        keyword_asm1,
        keyword_asm2,
        keyword_float80,
        keyword_float128,
        keyword_int128,
        keyword_imag1,
        keyword_imag2,
        keyword_real1,
        keyword_real2,
        keyword_float16,

        // clang keywords
        keyword_fp16,

        // ms keywords
        keyword_declspec,
        keyword_int64,
        keyword_int64_2,
        keyword_int32,
        keyword_int32_2,
        keyword_int16,
        keyword_int16_2,
        keyword_int8,
        keyword_int8_2,
        keyword_stdcall,
        keyword_stdcall2,
        keyword_thiscall,
        keyword_thiscall2,
        keyword_vectorcall,
        keyword_vectorcall2,

        // builtins that require special parsing
        builtin_choose_expr,
        builtin_va_arg,
        builtin_offsetof,
        builtin_bitoffsetof,
        builtin_types_compatible_p,

        /// Generated by #embed directive
        /// Decimal value with no prefix or suffix
        embed_byte,

        /// preprocessor number
        /// An optional period, followed by a digit 0-9, followed by any number of letters,
        /// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-)
        pp_num,

        /// preprocessor placemarker token
        /// generated if `##` is used with a zero-token argument
        /// removed after substitution, so the parser should never see this
        /// See C99 6.10.3.3.2
        placemarker,

        /// Report whether `id` is an identifier or a keyword.
        /// This covers plain and extended identifiers, the `__func__`-style names,
        /// every language/extension keyword, the preprocessor directive keywords and
        /// the builtins that get their own token id. Everything else yields false.
        pub fn isMacroIdentifier(id: Id) bool {
            return switch (id) {
                // identifiers
                .identifier,
                .extended_identifier,

                // predefined function-name identifiers
                .macro_func,
                .macro_function,
                .macro_pretty_func,

                // base keywords
                .keyword_auto,
                .keyword_auto_type,
                .keyword_break,
                .keyword_case,
                .keyword_char,
                .keyword_const,
                .keyword_continue,
                .keyword_default,
                .keyword_do,
                .keyword_double,
                .keyword_else,
                .keyword_enum,
                .keyword_extern,
                .keyword_float,
                .keyword_for,
                .keyword_goto,
                .keyword_if,
                .keyword_int,
                .keyword_long,
                .keyword_register,
                .keyword_return,
                .keyword_short,
                .keyword_signed,
                .keyword_sizeof,
                .keyword_static,
                .keyword_struct,
                .keyword_switch,
                .keyword_typedef,
                .keyword_typeof1,
                .keyword_typeof2,
                .keyword_union,
                .keyword_unsigned,
                .keyword_void,
                .keyword_volatile,
                .keyword_while,

                // ISO C99
                .keyword_bool,
                .keyword_complex,
                .keyword_imaginary,
                .keyword_inline,
                .keyword_restrict,

                // ISO C11
                .keyword_alignas,
                .keyword_alignof,
                .keyword_atomic,
                .keyword_generic,
                .keyword_noreturn,
                .keyword_static_assert,
                .keyword_thread_local,

                // ISO C23
                .keyword_bit_int,
                .keyword_c23_alignas,
                .keyword_c23_alignof,
                .keyword_c23_bool,
                .keyword_c23_static_assert,
                .keyword_c23_thread_local,
                .keyword_constexpr,
                .keyword_true,
                .keyword_false,
                .keyword_nullptr,

                // preprocessor directives
                .keyword_include,
                .keyword_include_next,
                .keyword_embed,
                .keyword_define,
                .keyword_defined,
                .keyword_undef,
                .keyword_ifdef,
                .keyword_ifndef,
                .keyword_elif,
                .keyword_elifdef,
                .keyword_elifndef,
                .keyword_endif,
                .keyword_error,
                .keyword_warning,
                .keyword_pragma,
                .keyword_line,
                .keyword_va_args,

                // gcc keywords
                .keyword_const1,
                .keyword_const2,
                .keyword_inline1,
                .keyword_inline2,
                .keyword_volatile1,
                .keyword_volatile2,
                .keyword_restrict1,
                .keyword_restrict2,
                .keyword_alignof1,
                .keyword_alignof2,
                .keyword_typeof,
                .keyword_attribute1,
                .keyword_attribute2,
                .keyword_extension,
                .keyword_asm,
                .keyword_asm1,
                .keyword_asm2,
                .keyword_float80,
                .keyword_float128,
                .keyword_int128,
                .keyword_imag1,
                .keyword_imag2,
                .keyword_real1,
                .keyword_real2,
                .keyword_float16,

                // clang keywords
                .keyword_fp16,

                // ms keywords
                .keyword_declspec,
                .keyword_int64,
                .keyword_int64_2,
                .keyword_int32,
                .keyword_int32_2,
                .keyword_int16,
                .keyword_int16_2,
                .keyword_int8,
                .keyword_int8_2,
                .keyword_stdcall,
                .keyword_stdcall2,
                .keyword_thiscall,
                .keyword_thiscall2,
                .keyword_vectorcall,
                .keyword_vectorcall2,

                // builtins that require special parsing
                .builtin_choose_expr,
                .builtin_va_arg,
                .builtin_offsetof,
                .builtin_bitoffsetof,
                .builtin_types_compatible_p,
                => true,

                else => false,
            };
        }

        /// Rewrite a preprocessor-directive keyword id into a plain `.identifier`.
        /// `keyword_defined` is only rewritten when `defined_to_identifier` is true;
        /// callers are expected to pass true when they are *not* inside an #if or
        /// #elif expression. Any other id is left untouched.
        pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void {
            const becomes_identifier = switch (id.*) {
                // `defined` keeps its keyword status unless the caller opts in.
                .keyword_defined => defined_to_identifier,

                .keyword_va_args,
                .keyword_line,
                .keyword_pragma,
                .keyword_warning,
                .keyword_error,
                .keyword_endif,
                .keyword_elifndef,
                .keyword_elifdef,
                .keyword_elif,
                .keyword_ifndef,
                .keyword_ifdef,
                .keyword_undef,
                .keyword_define,
                .keyword_embed,
                .keyword_include_next,
                .keyword_include,
                => true,

                else => false,
            };
            if (becomes_identifier) id.* = .identifier;
        }

        /// Rewrite a preprocessor-directive keyword id into `.identifier`, leaving
        /// `keyword_defined` as it is (the `defined_to_identifier` flag is false).
        pub fn simplifyMacroKeyword(id: *Id) void {
            id.simplifyMacroKeywordExtra(false);
        }

        /// Return the fixed source spelling associated with `id`.
        ///
        /// * `null` for ids whose text differs from token to token (identifiers,
        ///   string/char literals, whitespace, preprocessor numbers, embed bytes)
        ///   and for `.invalid`.
        /// * `""` for newline, eof and the preprocessor-internal marker tokens.
        /// * Otherwise the exact punctuator or keyword text.
        pub fn lexeme(id: Id) ?[]const u8 {
            return switch (id) {
                // No fixed spelling: the text has to come from the source buffer.
                .invalid,
                .identifier,
                .extended_identifier,
                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,
                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                .macro_string,
                .whitespace,
                .pp_num,
                .embed_byte,
                => null,

                .zero => "0",
                .one => "1",

                // Tokens that map to an empty spelling.
                .nl,
                .eof,
                .macro_param,
                .macro_param_no_expand,
                .stringify_param,
                .stringify_va_args,
                .macro_param_has_attribute,
                .macro_param_has_warning,
                .macro_param_has_feature,
                .macro_param_has_extension,
                .macro_param_has_builtin,
                .macro_param_has_include,
                .macro_param_has_include_next,
                .macro_param_is_identifier,
                .macro_file,
                .macro_line,
                .macro_counter,
                .macro_param_pragma_operator,
                .placemarker,
                => "",
                .macro_ws => " ",

                .macro_func => "__func__",
                .macro_function => "__FUNCTION__",
                .macro_pretty_func => "__PRETTY_FUNCTION__",

                // punctuators
                .bang => "!",
                .bang_equal => "!=",
                .pipe => "|",
                .pipe_pipe => "||",
                .pipe_equal => "|=",
                .equal => "=",
                .equal_equal => "==",
                .l_paren => "(",
                .r_paren => ")",
                .l_brace => "{",
                .r_brace => "}",
                .l_bracket => "[",
                .r_bracket => "]",
                .period => ".",
                .ellipsis => "...",
                .caret => "^",
                .caret_equal => "^=",
                .plus => "+",
                .plus_plus => "++",
                .plus_equal => "+=",
                .minus => "-",
                .minus_minus => "--",
                .minus_equal => "-=",
                .asterisk => "*",
                .asterisk_equal => "*=",
                .percent => "%",
                .percent_equal => "%=",
                .arrow => "->",
                .colon => ":",
                .colon_colon => "::",
                .semicolon => ";",
                .slash => "/",
                .slash_equal => "/=",
                .comma => ",",
                .ampersand => "&",
                .ampersand_ampersand => "&&",
                .ampersand_equal => "&=",
                .question_mark => "?",
                .angle_bracket_left => "<",
                .angle_bracket_left_equal => "<=",
                .angle_bracket_angle_bracket_left => "<<",
                .angle_bracket_angle_bracket_left_equal => "<<=",
                .angle_bracket_right => ">",
                .angle_bracket_right_equal => ">=",
                .angle_bracket_angle_bracket_right => ">>",
                .angle_bracket_angle_bracket_right_equal => ">>=",
                .tilde => "~",
                .hash => "#",
                .hash_hash => "##",

                // base keywords
                .keyword_auto => "auto",
                .keyword_auto_type => "__auto_type",
                .keyword_break => "break",
                .keyword_case => "case",
                .keyword_char => "char",
                .keyword_const => "const",
                .keyword_continue => "continue",
                .keyword_default => "default",
                .keyword_do => "do",
                .keyword_double => "double",
                .keyword_else => "else",
                .keyword_enum => "enum",
                .keyword_extern => "extern",
                .keyword_float => "float",
                .keyword_for => "for",
                .keyword_goto => "goto",
                .keyword_if => "if",
                .keyword_int => "int",
                .keyword_long => "long",
                .keyword_register => "register",
                .keyword_return => "return",
                .keyword_short => "short",
                .keyword_signed => "signed",
                .keyword_sizeof => "sizeof",
                .keyword_static => "static",
                .keyword_struct => "struct",
                .keyword_switch => "switch",
                .keyword_typedef => "typedef",
                .keyword_typeof => "typeof",
                .keyword_union => "union",
                .keyword_unsigned => "unsigned",
                .keyword_void => "void",
                .keyword_volatile => "volatile",
                .keyword_while => "while",

                // ISO C99
                .keyword_bool => "_Bool",
                .keyword_complex => "_Complex",
                .keyword_imaginary => "_Imaginary",
                .keyword_inline => "inline",
                .keyword_restrict => "restrict",

                // ISO C11
                .keyword_alignas => "_Alignas",
                .keyword_alignof => "_Alignof",
                .keyword_atomic => "_Atomic",
                .keyword_generic => "_Generic",
                .keyword_noreturn => "_Noreturn",
                .keyword_static_assert => "_Static_assert",
                .keyword_thread_local => "_Thread_local",

                // ISO C23
                .keyword_bit_int => "_BitInt",
                .keyword_c23_alignas => "alignas",
                .keyword_c23_alignof => "alignof",
                .keyword_c23_bool => "bool",
                .keyword_c23_static_assert => "static_assert",
                .keyword_c23_thread_local => "thread_local",
                .keyword_constexpr => "constexpr",
                .keyword_true => "true",
                .keyword_false => "false",
                .keyword_nullptr => "nullptr",

                // preprocessor directives
                .keyword_include => "include",
                .keyword_include_next => "include_next",
                .keyword_embed => "embed",
                .keyword_define => "define",
                .keyword_defined => "defined",
                .keyword_undef => "undef",
                .keyword_ifdef => "ifdef",
                .keyword_ifndef => "ifndef",
                .keyword_elif => "elif",
                .keyword_elifdef => "elifdef",
                .keyword_elifndef => "elifndef",
                .keyword_endif => "endif",
                .keyword_error => "error",
                .keyword_warning => "warning",
                .keyword_pragma => "pragma",
                .keyword_line => "line",
                .keyword_va_args => "__VA_ARGS__",

                // gcc keywords
                .keyword_const1 => "__const",
                .keyword_const2 => "__const__",
                .keyword_inline1 => "__inline",
                .keyword_inline2 => "__inline__",
                .keyword_volatile1 => "__volatile",
                .keyword_volatile2 => "__volatile__",
                .keyword_restrict1 => "__restrict",
                .keyword_restrict2 => "__restrict__",
                .keyword_alignof1 => "__alignof",
                .keyword_alignof2 => "__alignof__",
                .keyword_typeof1 => "__typeof",
                .keyword_typeof2 => "__typeof__",
                .builtin_choose_expr => "__builtin_choose_expr",
                .builtin_va_arg => "__builtin_va_arg",
                .builtin_offsetof => "__builtin_offsetof",
                .builtin_bitoffsetof => "__builtin_bitoffsetof",
                .builtin_types_compatible_p => "__builtin_types_compatible_p",
                .keyword_attribute1 => "__attribute",
                .keyword_attribute2 => "__attribute__",
                .keyword_extension => "__extension__",
                .keyword_asm => "asm",
                .keyword_asm1 => "__asm",
                .keyword_asm2 => "__asm__",
                .keyword_float80 => "__float80",
                // Must match the keyword-table spelling of this id ("__float128").
                .keyword_float128 => "__float128",
                .keyword_int128 => "__int128",
                .keyword_imag1 => "__imag",
                .keyword_imag2 => "__imag__",
                .keyword_real1 => "__real",
                .keyword_real2 => "__real__",
                .keyword_float16 => "_Float16",

                // clang keywords
                .keyword_fp16 => "__fp16",

                // ms keywords
                .keyword_declspec => "__declspec",
                .keyword_int64 => "__int64",
                .keyword_int64_2 => "_int64",
                .keyword_int32 => "__int32",
                .keyword_int32_2 => "_int32",
                .keyword_int16 => "__int16",
                .keyword_int16_2 => "_int16",
                .keyword_int8 => "__int8",
                .keyword_int8_2 => "_int8",
                .keyword_stdcall => "__stdcall",
                .keyword_stdcall2 => "_stdcall",
                .keyword_thiscall => "__thiscall",
                .keyword_thiscall2 => "_thiscall",
                .keyword_vectorcall => "__vectorcall",
                .keyword_vectorcall2 => "_vectorcall",
            };
        }

        /// Describe `id` with a short phrase: a generic noun phrase for the
        /// variable-text categories ("an identifier", "a string literal",
        /// "a character literal", "a number"), otherwise the fixed spelling
        /// returned by `lexeme`.
        ///
        /// Must not be called with `.macro_string` or `.invalid` (unreachable).
        /// NOTE(review): `.whitespace` falls into the `else` branch, where `lexeme`
        /// returns null and the `.?` unwrap fails — confirm callers never pass it.
        pub fn symbol(id: Id) []const u8 {
            return switch (id) {
                .macro_string, .invalid => unreachable,
                .identifier,
                .extended_identifier,
                .macro_func,
                .macro_function,
                .macro_pretty_func,
                .builtin_choose_expr,
                .builtin_va_arg,
                .builtin_offsetof,
                .builtin_bitoffsetof,
                .builtin_types_compatible_p,
                => "an identifier",
                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,
                => "a string literal",
                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                => "a character literal",
                // Lower-case like the sibling phrases above.
                .pp_num, .embed_byte => "a number",
                else => id.lexeme().?,
            };
        }

        /// Report whether `id` is accepted as the first token of an expression
        /// parsed by Preprocessor.expr.
        /// eof, r_paren and string literals cannot really begin a preprocessor
        /// expression; they are accepted here anyway so that the parser can
        /// produce a nicer error message for them.
        pub fn validPreprocessorExprStart(id: Id) bool {
            return switch (id) {
                // accepted only so the parser can report a better error
                .eof,
                .r_paren,
                .string_literal,
                .string_literal_utf_8,
                .string_literal_utf_16,
                .string_literal_utf_32,
                .string_literal_wide,
                => true,

                // literals and names
                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                .pp_num,
                .zero,
                .one,
                .keyword_true,
                .keyword_false,
                .identifier,
                .extended_identifier,
                .keyword_defined,
                => true,

                // parenthesis and unary operators
                .l_paren,
                .plus,
                .minus,
                .tilde,
                .bang,
                => true,

                else => false,
            };
        }

        /// Report whether `id` may be written as a digraph: only brackets, braces,
        /// `#` and `##` qualify, and only when `comp.langopts.hasDigraphs()` says so.
        pub fn allowsDigraphs(id: Id, comp: *const Compilation) bool {
            switch (id) {
                .hash,
                .hash_hash,
                .l_brace,
                .r_brace,
                .l_bracket,
                .r_bracket,
                => {},
                else => return false,
            }
            return comp.langopts.hasDigraphs();
        }

        /// Report whether `id` is a `volatile`/`inline` spelling, `goto`, or `(`.
        /// NOTE(review): presumably these are the tokens allowed right after the
        /// asm keyword in a GCC asm statement — confirm against the parser.
        pub fn canOpenGCCAsmStmt(id: Id) bool {
            switch (id) {
                .l_paren,
                .keyword_goto,
                .keyword_inline,
                .keyword_inline1,
                .keyword_inline2,
                .keyword_volatile,
                .keyword_volatile1,
                .keyword_volatile2,
                => return true,
                else => return false,
            }
        }

        /// Report whether `id` is one of the string literal ids (plain, UTF-8,
        /// UTF-16, UTF-32 or wide). `macro_string` is not included.
        pub fn isStringLiteral(id: Id) bool {
            switch (id) {
                .string_literal,
                .string_literal_utf_8,
                .string_literal_utf_16,
                .string_literal_utf_32,
                .string_literal_wide,
                => return true,
                else => return false,
            }
        }
    };

    /// Classify the spelling `str` as a keyword id or `.identifier`.
    ///
    /// Spellings missing from the keyword table are identifiers. Spellings in the
    /// table are keywords unless they are gated by the language standard or an
    /// extension option in `comp.langopts` that is not enabled, in which case they
    /// degrade to `.identifier`. Double underscore and underscore + capital letter
    /// spellings belong to the implementation namespace and are therefore not
    /// gated on the standard.
    pub fn getTokenId(comp: *const Compilation, str: []const u8) Token.Id {
        const keyword = all_kws.get(str) orelse return .identifier;
        const std_version = comp.langopts.standard;

        const enabled = switch (keyword) {
            .keyword_inline => std_version.isGNU() or std_version.atLeast(.c99),
            .keyword_restrict => std_version.atLeast(.c99),
            .keyword_typeof => std_version.isGNU() or std_version.atLeast(.c2x),
            .keyword_asm => std_version.isGNU(),
            .keyword_declspec => comp.langopts.declspec_attrs,

            // Only keywords from C23 (c2x) onwards.
            .keyword_c23_alignas,
            .keyword_c23_alignof,
            .keyword_c23_bool,
            .keyword_c23_static_assert,
            .keyword_c23_thread_local,
            .keyword_constexpr,
            .keyword_true,
            .keyword_false,
            .keyword_nullptr,
            .keyword_elifdef,
            .keyword_elifndef,
            => std_version.atLeast(.c2x),

            // Only keywords when Microsoft extensions are on.
            .keyword_int64,
            .keyword_int64_2,
            .keyword_int32,
            .keyword_int32_2,
            .keyword_int16,
            .keyword_int16_2,
            .keyword_int8,
            .keyword_int8_2,
            .keyword_stdcall2,
            .keyword_thiscall2,
            .keyword_vectorcall2,
            => comp.langopts.ms_extensions,

            else => true,
        };
        return if (enabled) keyword else .identifier;
    }

    /// Decide whether `codepoint` is allowed at the given position of an identifier.
    ///
    /// `$` is governed by `comp.langopts.dollars_in_identifiers`. Every other
    /// codepoint up to 0x7F is rejected: basic character set characters are handled
    /// by the tokenizer itself so the common case stays on the fast path. Anything
    /// above that is checked against the C11 tables when the standard is at least
    /// C11 and against the C99 tables otherwise; at `.start` the codepoint must
    /// additionally not be a disallowed initial character.
    pub fn mayAppearInIdent(comp: *const Compilation, codepoint: u21, where: enum { start, inside }) bool {
        if (codepoint == '$') return comp.langopts.dollars_in_identifiers;
        if (codepoint <= 0x7F) return false;

        if (comp.langopts.standard.atLeast(.c11)) {
            if (!CharInfo.isC11IdChar(codepoint)) return false;
            return switch (where) {
                .inside => true,
                .start => !CharInfo.isC11DisallowedInitialIdChar(codepoint),
            };
        }

        if (!CharInfo.isC99IdChar(codepoint)) return false;
        return switch (where) {
            .inside => true,
            .start => !CharInfo.isC99DisallowedInitialIDChar(codepoint),
        };
    }

    const all_kws = std.ComptimeStringMap(Id, .{
        .{ "auto", auto: {
            @setEvalBranchQuota(3000);
            break :auto .keyword_auto;
        } },
        .{ "break", .keyword_break },
        .{ "case", .keyword_case },
        .{ "char", .keyword_char },
        .{ "const", .keyword_const },
        .{ "continue", .keyword_continue },
        .{ "default", .keyword_default },
        .{ "do", .keyword_do },
        .{ "double", .keyword_double },
        .{ "else", .keyword_else },
        .{ "enum", .keyword_enum },
        .{ "extern", .keyword_extern },
        .{ "float", .keyword_float },
        .{ "for", .keyword_for },
        .{ "goto", .keyword_goto },
        .{ "if", .keyword_if },
        .{ "int", .keyword_int },
        .{ "long", .keyword_long },
        .{ "register", .keyword_register },
        .{ "return", .keyword_return },
        .{ "short", .keyword_short },
        .{ "signed", .keyword_signed },
        .{ "sizeof", .keyword_sizeof },
        .{ "static", .keyword_static },
        .{ "struct", .keyword_struct },
        .{ "switch", .keyword_switch },
        .{ "typedef", .keyword_typedef },
        .{ "union", .keyword_union },
        .{ "unsigned", .keyword_unsigned },
        .{ "void", .keyword_void },
        .{ "volatile", .keyword_volatile },
        .{ "while", .keyword_while },
        .{ "__typeof__", .keyword_typeof2 },
        .{ "__typeof", .keyword_typeof1 },

        // ISO C99
        .{ "_Bool", .keyword_bool },
        .{ "_Complex", .keyword_complex },
        .{ "_Imaginary", .keyword_imaginary },
        .{ "inline", .keyword_inline },
        .{ "restrict", .keyword_restrict },

        // ISO C11
        .{ "_Alignas", .keyword_alignas },
        .{ "_Alignof", .keyword_alignof },
        .{ "_Atomic", .keyword_atomic },
        .{ "_Generic", .keyword_generic },
        .{ "_Noreturn", .keyword_noreturn },
        .{ "_Static_assert", .keyword_static_assert },
        .{ "_Thread_local", .keyword_thread_local },

        // ISO C23
        .{ "_BitInt", .keyword_bit_int },
        .{ "alignas", .keyword_c23_alignas },
        .{ "alignof", .keyword_c23_alignof },
        .{ "bool", .keyword_c23_bool },
        .{ "static_assert", .keyword_c23_static_assert },
        .{ "thread_local", .keyword_c23_thread_local },
        .{ "constexpr", .keyword_constexpr },
        .{ "true", .keyword_true },
        .{ "false", .keyword_false },
        .{ "nullptr", .keyword_nullptr },

        // Preprocessor directives
        .{ "include", .keyword_include },
        .{ "include_next", .keyword_include_next },
        .{ "embed", .keyword_embed },
        .{ "define", .keyword_define },
        .{ "defined", .keyword_defined },
        .{ "undef", .keyword_undef },
        .{ "ifdef", .keyword_ifdef },
        .{ "ifndef", .keyword_ifndef },
        .{ "elif", .keyword_elif },
        .{ "elifdef", .keyword_elifdef },
        .{ "elifndef", .keyword_elifndef },
        .{ "endif", .keyword_endif },
        .{ "error", .keyword_error },
        .{ "warning", .keyword_warning },
        .{ "pragma", .keyword_pragma },
        .{ "line", .keyword_line },
        .{ "__VA_ARGS__", .keyword_va_args },
        .{ "__func__", .macro_func },
        .{ "__FUNCTION__", .macro_function },
        .{ "__PRETTY_FUNCTION__", .macro_pretty_func },

        // gcc keywords
        .{ "__auto_type", .keyword_auto_type },
        .{ "__const", .keyword_const1 },
        .{ "__const__", .keyword_const2 },
        .{ "__inline", .keyword_inline1 },
        .{ "__inline__", .keyword_inline2 },
        .{ "__volatile", .keyword_volatile1 },
        .{ "__volatile__", .keyword_volatile2 },
        .{ "__restrict", .keyword_restrict1 },
        .{ "__restrict__", .keyword_restrict2 },
        .{ "__alignof", .keyword_alignof1 },
        .{ "__alignof__", .keyword_alignof2 },
        .{ "typeof", .keyword_typeof },
        .{ "__attribute", .keyword_attribute1 },
        .{ "__attribute__", .keyword_attribute2 },
        .{ "__extension__", .keyword_extension },
        .{ "asm", .keyword_asm },
        .{ "__asm", .keyword_asm1 },
        .{ "__asm__", .keyword_asm2 },
        .{ "__float80", .keyword_float80 },
        .{ "__float128", .keyword_float128 },
        .{ "__int128", .keyword_int128 },
        .{ "__imag", .keyword_imag1 },
        .{ "__imag__", .keyword_imag2 },
        .{ "__real", .keyword_real1 },
        .{ "__real__", .keyword_real2 },
        .{ "_Float16", .keyword_float16 },

        // clang keywords
        .{ "__fp16", .keyword_fp16 },

        // ms keywords
        .{ "__declspec", .keyword_declspec },
        .{ "__int64", .keyword_int64 },
        .{ "_int64", .keyword_int64_2 },
        .{ "__int32", .keyword_int32 },
        .{ "_int32", .keyword_int32_2 },
        .{ "__int16", .keyword_int16 },
        .{ "_int16", .keyword_int16_2 },
        .{ "__int8", .keyword_int8 },
        .{ "_int8", .keyword_int8_2 },
        .{ "__stdcall", .keyword_stdcall },
        .{ "_stdcall", .keyword_stdcall2 },
        .{ "__thiscall", .keyword_thiscall },
        .{ "_thiscall", .keyword_thiscall2 },
        .{ "__vectorcall", .keyword_vectorcall },
        .{ "_vectorcall", .keyword_vectorcall2 },

        // builtins that require special parsing
        .{ "__builtin_choose_expr", .builtin_choose_expr },
        .{ "__builtin_va_arg", .builtin_va_arg },
        .{ "__builtin_offsetof", .builtin_offsetof },
        .{ "__builtin_bitoffsetof", .builtin_bitoffsetof },
        .{ "__builtin_types_compatible_p", .builtin_types_compatible_p },
    });
};

/// Bytes of the source being tokenized; `next` decodes them as UTF-8.
buf: []const u8,
/// Byte offset into `buf` of the next character `next` will look at.
index: u32 = 0,
/// Id copied into the `source` field of every token produced by `next`.
source: Source.Id,
/// Compilation consulted for `langopts` and handed to the identifier helpers
/// (`Token.mayAppearInIdent`, `Token.getTokenId`).
comp: *const Compilation,
/// Current line counter; starts at 1 and is incremented by `next` for every
/// newline it consumes.
line: u32 = 1,

/// Scans one token starting at `self.index` and advances `self.index` past it.
///
/// The tokenizer is a state machine that is fed one UTF-8 code point at a time.
/// Runs of blanks are returned as `.whitespace` and every newline as `.nl`.
/// A `//` comment is skipped up to (not including) its newline. A `/* */`
/// comment is reported as `.whitespace`, unless it is directly followed by
/// blanks or a newline, in which case that following token is returned instead.
/// Malformed input (unterminated or badly escaped literals, unterminated block
/// comments, code points that cannot start a token) yields `.invalid`.
/// Once the buffer is exhausted the returned id is `.eof`.
///
/// `self.line` is incremented for every newline consumed, and the returned
/// token carries the value `self.line` has after scanning.
pub fn next(self: *Tokenizer) Token {
    var state: enum {
        start,
        whitespace,
        u,
        u8,
        U,
        L,
        string_literal,
        char_literal_start,
        char_literal,
        escape_sequence,
        octal_escape,
        hex_escape,
        unicode_escape,
        identifier,
        extended_identifier,
        equal,
        bang,
        pipe,
        colon,
        percent,
        asterisk,
        plus,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        caret,
        period,
        period2,
        minus,
        slash,
        ampersand,
        hash,
        hash_digraph,
        hash_hash_digraph_partial,
        line_comment,
        multi_line_comment,
        multi_line_comment_asterisk,
        multi_line_comment_done,
        pp_num,
        pp_num_exponent,
        pp_num_digit_separator,
    } = .start;

    // First byte of the token being built (moved forward only after a block comment).
    var start = self.index;
    // Stays `.eof` when nothing is scanned before the end of the buffer.
    var id: Token.Id = .eof;

    // State to resume once an escape sequence inside a literal has been consumed.
    var return_state = state;
    // Digit counter shared by the octal and unicode escape states.
    var counter: u32 = 0;
    // Byte length of the current code point. The loop's continue expression
    // advances by it, so a state that sets it to 0 makes the same character be
    // examined again in the newly selected state.
    var codepoint_len: u3 = undefined;
    while (self.index < self.buf.len) : (self.index += codepoint_len) {
        // Source files get checked for valid utf-8 before being tokenized so it is safe to use
        // these versions.
        codepoint_len = unicode.utf8ByteSequenceLength_unsafe(self.buf[self.index]);
        const c: u21 = switch (codepoint_len) {
            1 => @as(u21, self.buf[self.index]),
            2 => unicode.utf8Decode2_unsafe(self.buf[self.index..]),
            3 => unicode.utf8Decode3_unsafe(self.buf[self.index..]),
            4 => unicode.utf8Decode4_unsafe(self.buf[self.index..]),
            else => unreachable,
        };
        // Note: `break` does not run the continue expression, so every branch that
        // finishes a token *including* the current character bumps `self.index`
        // by hand, while branches that leave the character for the next call do not.
        switch (state) {
            .start => switch (c) {
                '\n' => {
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                '"' => {
                    id = .string_literal;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal;
                    state = .char_literal_start;
                },
                // `u`, `U` and `L` may begin a prefixed literal, so they get their own states.
                'u' => state = .u,
                'U' => state = .U,
                'L' => state = .L,
                'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
                '=' => state = .equal,
                '!' => state = .bang,
                '|' => state = .pipe,
                '(' => {
                    id = .l_paren;
                    self.index += 1;
                    break;
                },
                ')' => {
                    id = .r_paren;
                    self.index += 1;
                    break;
                },
                '[' => {
                    id = .l_bracket;
                    self.index += 1;
                    break;
                },
                ']' => {
                    id = .r_bracket;
                    self.index += 1;
                    break;
                },
                ';' => {
                    id = .semicolon;
                    self.index += 1;
                    break;
                },
                ',' => {
                    id = .comma;
                    self.index += 1;
                    break;
                },
                '?' => {
                    id = .question_mark;
                    self.index += 1;
                    break;
                },
                ':' => state = .colon,
                '%' => state = .percent,
                '*' => state = .asterisk,
                '+' => state = .plus,
                '<' => state = .angle_bracket_left,
                '>' => state = .angle_bracket_right,
                '^' => state = .caret,
                '{' => {
                    id = .l_brace;
                    self.index += 1;
                    break;
                },
                '}' => {
                    id = .r_brace;
                    self.index += 1;
                    break;
                },
                '~' => {
                    id = .tilde;
                    self.index += 1;
                    break;
                },
                '.' => state = .period,
                '-' => state = .minus,
                '/' => state = .slash,
                '&' => state = .ampersand,
                '#' => state = .hash,
                '0'...'9' => state = .pp_num,
                '\t', '\x0B', '\x0C', ' ' => state = .whitespace,
                // Anything else either starts an extended identifier or is a
                // one-code-point `.invalid` token.
                else => if (Token.mayAppearInIdent(self.comp, c, .start)) {
                    state = .extended_identifier;
                } else {
                    id = .invalid;
                    self.index += codepoint_len;
                    break;
                },
            },
            .whitespace => switch (c) {
                '\t', '\x0B', '\x0C', ' ' => {},
                else => {
                    id = .whitespace;
                    break;
                },
            },
            // Seen `u`: may be `u8`, a UTF-16 literal prefix, or just an identifier.
            .u => switch (c) {
                '8' => {
                    state = .u8;
                },
                '\'' => {
                    id = .char_literal_utf_16;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_16;
                    state = .string_literal;
                },
                else => {
                    // Not a literal prefix: re-examine this character as identifier text.
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            // Seen `u8`: a UTF-8 literal prefix or an identifier.
            .u8 => switch (c) {
                '\"' => {
                    id = .string_literal_utf_8;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal_utf_8;
                    state = .char_literal_start;
                },
                else => {
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            // Seen `U`: a UTF-32 literal prefix or an identifier.
            .U => switch (c) {
                '\'' => {
                    id = .char_literal_utf_32;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_32;
                    state = .string_literal;
                },
                else => {
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            // Seen `L`: a wide literal prefix or an identifier.
            .L => switch (c) {
                '\'' => {
                    id = .char_literal_wide;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_wide;
                    state = .string_literal;
                },
                else => {
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            // Inside a string literal; `id` was already chosen when the quote was seen.
            .string_literal => switch (c) {
                '\\' => {
                    return_state = .string_literal;
                    state = .escape_sequence;
                },
                '"' => {
                    self.index += 1;
                    break;
                },
                // An unescaped newline ends the literal as `.invalid`; the newline
                // itself is left for the next call.
                '\n' => {
                    id = .invalid;
                    break;
                },
                '\r' => unreachable,
                else => {},
            },
            // Directly after the opening quote of a character literal: an
            // immediate closing quote (empty literal) or newline is invalid.
            .char_literal_start => switch (c) {
                '\\' => {
                    return_state = .char_literal;
                    state = .escape_sequence;
                },

                '\'', '\n' => {
                    id = .invalid;
                    break;
                },
                else => {
                    state = .char_literal;
                },
            },
            .char_literal => switch (c) {
                '\\' => {
                    return_state = .char_literal;
                    state = .escape_sequence;
                },
                '\'' => {
                    self.index += 1;
                    break;
                },
                '\n' => {
                    id = .invalid;
                    break;
                },
                else => {},
            },
            // Character following a backslash inside a string or character literal.
            .escape_sequence => switch (c) {
                '\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => {
                    state = return_state;
                },
                // Backslash-newline inside a literal: keep scanning, but count the line.
                '\n' => {
                    state = return_state;
                    self.line += 1;
                },
                '0'...'7' => {
                    counter = 1;
                    state = .octal_escape;
                },
                'x' => state = .hex_escape,
                // `\u` takes exactly 4 hex digits, `\U` exactly 8 (see `.unicode_escape`).
                'u' => {
                    counter = 4;
                    state = .unicode_escape;
                },
                'U' => {
                    counter = 8;
                    state = .unicode_escape;
                },
                else => {
                    id = .invalid;
                    break;
                },
            },
            // At most three octal digits; the first one was counted in `.escape_sequence`.
            .octal_escape => switch (c) {
                '0'...'7' => {
                    counter += 1;
                    if (counter == 3) state = return_state;
                },
                else => {
                    // Escape is over; hand this character back to the literal state.
                    codepoint_len = 0;
                    state = return_state;
                },
            },
            // Any number of hex digits (including none) is accepted here.
            .hex_escape => switch (c) {
                '0'...'9', 'a'...'f', 'A'...'F' => {},
                else => {
                    codepoint_len = 0;
                    state = return_state;
                },
            },
            // Counts down the required hex digits; anything else makes the token invalid.
            .unicode_escape => switch (c) {
                '0'...'9', 'a'...'f', 'A'...'F' => {
                    counter -= 1;
                    if (counter == 0) state = return_state;
                },
                else => {
                    id = .invalid;
                    break;
                },
            },
            .identifier, .extended_identifier => switch (c) {
                'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                else => {
                    if (!Token.mayAppearInIdent(self.comp, c, .inside)) {
                        // End of the identifier. Plain identifiers get their id from
                        // `Token.getTokenId`; extended ones are never looked up.
                        id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
                        break;
                    }
                    // An accepted non-basic character turns this into an extended identifier.
                    state = .extended_identifier;
                },
            },
            .equal => switch (c) {
                '=' => {
                    id = .equal_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .equal;
                    break;
                },
            },
            .bang => switch (c) {
                '=' => {
                    id = .bang_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .bang;
                    break;
                },
            },
            .pipe => switch (c) {
                '=' => {
                    id = .pipe_equal;
                    self.index += 1;
                    break;
                },
                '|' => {
                    id = .pipe_pipe;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .pipe;
                    break;
                },
            },
            .colon => switch (c) {
                // `:>` is the digraph for `]` when digraphs are enabled.
                '>' => {
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .r_bracket;
                        self.index += 1;
                    } else {
                        id = .colon;
                    }
                    break;
                },
                // `::` is only a single token when the standard is at least `.c2x`.
                ':' => {
                    if (self.comp.langopts.standard.atLeast(.c2x)) {
                        id = .colon_colon;
                        self.index += 1;
                        break;
                    } else {
                        id = .colon;
                        break;
                    }
                },
                else => {
                    id = .colon;
                    break;
                },
            },
            .percent => switch (c) {
                '=' => {
                    id = .percent_equal;
                    self.index += 1;
                    break;
                },
                // `%>` is the digraph for `}`.
                '>' => {
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .r_brace;
                        self.index += 1;
                    } else {
                        id = .percent;
                    }
                    break;
                },
                // `%:` is the digraph for `#`; it may continue into `%:%:` (`##`).
                ':' => {
                    if (self.comp.langopts.hasDigraphs()) {
                        state = .hash_digraph;
                    } else {
                        id = .percent;
                        break;
                    }
                },
                else => {
                    id = .percent;
                    break;
                },
            },
            .asterisk => switch (c) {
                '=' => {
                    id = .asterisk_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .asterisk;
                    break;
                },
            },
            .plus => switch (c) {
                '=' => {
                    id = .plus_equal;
                    self.index += 1;
                    break;
                },
                '+' => {
                    id = .plus_plus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .plus;
                    break;
                },
            },
            .angle_bracket_left => switch (c) {
                '<' => state = .angle_bracket_angle_bracket_left,
                '=' => {
                    id = .angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                // `<:` is the digraph for `[`.
                ':' => {
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .l_bracket;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                // `<%` is the digraph for `{`.
                '%' => {
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .l_brace;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                else => {
                    id = .angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_angle_bracket_left => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_right => switch (c) {
                '>' => state = .angle_bracket_angle_bracket_right,
                '=' => {
                    id = .angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_right;
                    break;
                },
            },
            .angle_bracket_angle_bracket_right => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_right;
                    break;
                },
            },
            .caret => switch (c) {
                '=' => {
                    id = .caret_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .caret;
                    break;
                },
            },
            .period => switch (c) {
                '.' => state = .period2,
                // A period followed by a digit starts a preprocessing number (e.g. `.0`).
                '0'...'9' => state = .pp_num,
                else => {
                    id = .period;
                    break;
                },
            },
            .period2 => switch (c) {
                '.' => {
                    id = .ellipsis;
                    self.index += 1;
                    break;
                },
                else => {
                    // `..` is not a token: emit only the first period and back up so
                    // the second one is tokenized again by the next call.
                    id = .period;
                    self.index -= 1;
                    break;
                },
            },
            .minus => switch (c) {
                '>' => {
                    id = .arrow;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .minus_equal;
                    self.index += 1;
                    break;
                },
                '-' => {
                    id = .minus_minus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .minus;
                    break;
                },
            },
            .ampersand => switch (c) {
                '&' => {
                    id = .ampersand_ampersand;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .ampersand_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .ampersand;
                    break;
                },
            },
            .hash => switch (c) {
                '#' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    break;
                },
            },
            // Seen the `%:` digraph; a following `%` may begin a second `%:`.
            .hash_digraph => switch (c) {
                '%' => state = .hash_hash_digraph_partial,
                else => {
                    id = .hash;
                    break;
                },
            },
            // Seen `%:%`; only a following `:` completes the `%:%:` (`##`) digraph.
            .hash_hash_digraph_partial => switch (c) {
                ':' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    self.index -= 1; // re-tokenize the percent
                    break;
                },
            },
            .slash => switch (c) {
                '/' => state = .line_comment,
                '*' => state = .multi_line_comment,
                '=' => {
                    id = .slash_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .slash;
                    break;
                },
            },
            .line_comment => switch (c) {
                '\n' => {
                    // Step back one byte so that the continue expression lands on the
                    // newline again and `.start` emits it as `.nl`. `start` is not
                    // reset here, so that token's range begins at the comment.
                    self.index -= 1;
                    state = .start;
                },
                else => {},
            },
            .multi_line_comment => switch (c) {
                '*' => state = .multi_line_comment_asterisk,
                '\n' => self.line += 1,
                else => {},
            },
            // Inside a block comment, just after a `*`.
            .multi_line_comment_asterisk => switch (c) {
                '/' => state = .multi_line_comment_done,
                '\n' => {
                    self.line += 1;
                    state = .multi_line_comment;
                },
                '*' => {},
                else => state = .multi_line_comment,
            },
            // First character after the closing `*/`.
            .multi_line_comment_done => switch (c) {
                // A newline right after the comment is returned as its own `.nl`
                // token whose range excludes the comment.
                '\n' => {
                    start = self.index;
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                '\r' => unreachable,
                // Blanks after the comment: restart the token at the blanks and
                // continue as an ordinary whitespace run.
                '\t', '\x0B', '\x0C', ' ' => {
                    start = self.index;
                    state = .whitespace;
                },
                // Otherwise the comment itself is reported as a `.whitespace` token.
                else => {
                    id = .whitespace;
                    break;
                },
            },
            // Preprocessing number: digits, letters, `_` and `.` are all absorbed.
            .pp_num => switch (c) {
                'a'...'d',
                'A'...'D',
                'f'...'o',
                'F'...'O',
                'q'...'z',
                'Q'...'Z',
                '0'...'9',
                '_',
                '.',
                => {},
                // Exponent markers may be followed by a sign, handled in `.pp_num_exponent`.
                'e', 'E', 'p', 'P' => state = .pp_num_exponent,
                // `'` is only treated as a digit separator when the standard is at least `.c2x`.
                '\'' => if (self.comp.langopts.standard.atLeast(.c2x)) {
                    state = .pp_num_digit_separator;
                } else {
                    id = .pp_num;
                    break;
                },
                else => {
                    id = .pp_num;
                    break;
                },
            },
            // Just after a `'` inside a number: it must be followed by a digit,
            // letter or `_` to belong to the number.
            .pp_num_digit_separator => switch (c) {
                'a'...'d',
                'A'...'D',
                'f'...'o',
                'F'...'O',
                'q'...'z',
                'Q'...'Z',
                '0'...'9',
                '_',
                => state = .pp_num,
                else => {
                    // Back up so the `'` is excluded from the number and scanned again.
                    self.index -= 1;
                    id = .pp_num;
                    break;
                },
            },
            // Just after `e`, `E`, `p` or `P`: a sign is accepted in addition to the
            // usual number characters.
            .pp_num_exponent => switch (c) {
                'a'...'z',
                'A'...'Z',
                '0'...'9',
                '_',
                '.',
                '+',
                '-',
                => state = .pp_num,
                else => {
                    id = .pp_num;
                    break;
                },
            },
        }
    } else if (self.index == self.buf.len) {
        // Reached only when the loop ran out of input without a `break`: decide
        // what the partially scanned token is from the state we stopped in.
        switch (state) {
            // Nothing pending: `id` keeps its initial `.eof`.
            .start, .line_comment => {},
            .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]),
            .extended_identifier => id = .extended_identifier,
            // Unterminated literals, escapes and block comments, and a trailing `..`.
            .period2,
            .string_literal,
            .char_literal_start,
            .char_literal,
            .escape_sequence,
            .octal_escape,
            .hex_escape,
            .unicode_escape,
            .multi_line_comment,
            .multi_line_comment_asterisk,
            => id = .invalid,

            .whitespace => id = .whitespace,
            .multi_line_comment_done => id = .whitespace,

            .equal => id = .equal,
            .bang => id = .bang,
            .minus => id = .minus,
            .slash => id = .slash,
            .ampersand => id = .ampersand,
            .hash => id = .hash,
            .period => id = .period,
            .pipe => id = .pipe,
            .angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right,
            .angle_bracket_right => id = .angle_bracket_right,
            .angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left,
            .angle_bracket_left => id = .angle_bracket_left,
            .plus => id = .plus,
            .colon => id = .colon,
            .percent => id = .percent,
            .caret => id = .caret,
            .asterisk => id = .asterisk,
            .hash_digraph => id = .hash,
            .hash_hash_digraph_partial => {
                id = .hash;
                self.index -= 1; // re-tokenize the percent
            },
            .pp_num, .pp_num_exponent, .pp_num_digit_separator => id = .pp_num,
        }
    }

    return .{
        .id = id,
        .start = start,
        .end = self.index,
        .line = self.line,
        .source = self.source,
    };
}

/// Returns the next token whose id is not `.whitespace`, discarding any
/// whitespace tokens that precede it.
pub fn nextNoWS(self: *Tokenizer) Token {
    while (true) {
        const candidate = self.next();
        if (candidate.id != .whitespace) return candidate;
    }
}

test "operators" {
    // Every punctuator, single- and multi-character; note that `..` is
    // expected to come out as two separate periods.
    const input =
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\  <<= > >= >> >>= ~ # ##
        \\
    ;
    const Id = Token.Id;
    // One row of expected ids per input line.
    const expected =
        [_]Id{ .bang, .bang_equal, .pipe, .pipe_pipe, .pipe_equal, .equal, .equal_equal, .nl } ++
        [_]Id{ .l_paren, .r_paren, .l_brace, .r_brace, .l_bracket, .r_bracket, .period, .period, .period, .ellipsis, .nl } ++
        [_]Id{ .caret, .caret_equal, .plus, .plus_plus, .plus_equal, .minus, .minus_minus, .minus_equal, .nl } ++
        [_]Id{ .asterisk, .asterisk_equal, .percent, .percent_equal, .arrow, .colon, .semicolon, .slash, .slash_equal, .nl } ++
        [_]Id{ .comma, .ampersand, .ampersand_ampersand, .ampersand_equal, .question_mark, .angle_bracket_left, .angle_bracket_left_equal, .angle_bracket_angle_bracket_left, .nl } ++
        [_]Id{ .angle_bracket_angle_bracket_left_equal, .angle_bracket_right, .angle_bracket_right_equal, .angle_bracket_angle_bracket_right, .angle_bracket_angle_bracket_right_equal, .tilde, .hash, .hash_hash, .nl };
    try expectTokens(input, &expected);
}

test "keywords" {
    // C89 through C11 keywords plus a few GNU spellings.
    const input =
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    ;
    const Id = Token.Id;
    // One row of expected ids per input line.
    const expected =
        [_]Id{ .keyword_auto, .keyword_auto_type, .keyword_break, .keyword_case, .keyword_char, .keyword_const, .keyword_continue, .keyword_default, .keyword_do, .nl } ++
        [_]Id{ .keyword_double, .keyword_else, .keyword_enum, .keyword_extern, .keyword_float, .keyword_for, .keyword_goto, .keyword_if, .keyword_int, .nl } ++
        [_]Id{ .keyword_long, .keyword_register, .keyword_return, .keyword_short, .keyword_signed, .keyword_sizeof, .keyword_static, .nl } ++
        [_]Id{ .keyword_struct, .keyword_switch, .keyword_typedef, .keyword_union, .keyword_unsigned, .keyword_void, .keyword_volatile, .nl } ++
        [_]Id{ .keyword_while, .keyword_bool, .keyword_complex, .keyword_imaginary, .keyword_inline, .keyword_restrict, .keyword_alignas, .nl } ++
        [_]Id{ .keyword_alignof, .keyword_atomic, .keyword_generic, .keyword_noreturn, .keyword_static_assert, .keyword_thread_local, .nl } ++
        [_]Id{ .keyword_attribute1, .keyword_attribute2, .nl };
    try expectTokens(input, &expected);
}

test "preprocessor keywords" {
    // Each directive line must lex as `#`, the directive keyword, then a newline.
    const input =
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    ;
    const Id = Token.Id;
    const expected =
        [_]Id{ .hash, .keyword_include, .nl } ++
        [_]Id{ .hash, .keyword_include_next, .nl } ++
        [_]Id{ .hash, .keyword_embed, .nl } ++
        [_]Id{ .hash, .keyword_define, .nl } ++
        [_]Id{ .hash, .keyword_ifdef, .nl } ++
        [_]Id{ .hash, .keyword_ifndef, .nl } ++
        [_]Id{ .hash, .keyword_error, .nl } ++
        [_]Id{ .hash, .keyword_pragma, .nl };
    try expectTokens(input, &expected);
}

test "line continuation" {
    // A backslash at the end of a line must not produce a `.nl` token,
    // neither between tokens nor inside a string literal.
    const input =
        \\#define foo \
        \\  bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    ;
    const Id = Token.Id;
    // One row per logical line (continued lines are merged into one row).
    const expected =
        [_]Id{ .hash, .keyword_define, .identifier, .identifier, .nl } ++
        [_]Id{ .string_literal, .nl } ++
        [_]Id{ .hash, .keyword_define, .string_literal, .nl } ++
        [_]Id{ .string_literal, .nl } ++
        [_]Id{ .hash, .keyword_define, .string_literal, .string_literal };
    try expectTokens(input, &expected);
}

test "string prefix" {
    // Every encoding prefix for both string and character literals.
    const input =
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    ;
    const Id = Token.Id;
    // One literal followed by its newline per input line.
    const expected =
        [_]Id{ .string_literal, .nl } ++
        [_]Id{ .string_literal_utf_16, .nl } ++
        [_]Id{ .string_literal_utf_8, .nl } ++
        [_]Id{ .string_literal_utf_32, .nl } ++
        [_]Id{ .string_literal_wide, .nl } ++
        [_]Id{ .char_literal, .nl } ++
        [_]Id{ .char_literal_utf_8, .nl } ++
        [_]Id{ .char_literal_utf_16, .nl } ++
        [_]Id{ .char_literal_utf_32, .nl } ++
        [_]Id{ .char_literal_wide, .nl };
    try expectTokens(input, &expected);
}

test "num suffixes" {
    // Whatever the suffix, each number must lex as exactly one `.pp_num`.
    const input =
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    ;
    const Id = Token.Id;
    // One row of expected ids per input line.
    const expected =
        [_]Id{ .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .nl } ++
        [_]Id{ .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .nl } ++
        [_]Id{ .pp_num, .pp_num, .pp_num, .pp_num, .nl } ++
        [_]Id{ .pp_num, .pp_num, .nl } ++
        [_]Id{ .pp_num, .pp_num, .pp_num, .pp_num, .nl } ++
        [_]Id{ .pp_num, .pp_num, .pp_num, .pp_num, .nl };
    try expectTokens(input, &expected);
}

test "comments" {
    // A line comment contributes nothing but the newline that ends it.
    const input =
        \\//foo
        \\#foo
    ;
    const expected = [_]Token.Id{ .nl, .hash, .identifier };
    try expectTokens(input, &expected);
}

test "extended identifiers" {
    const Case = struct {
        input: []const u8,
        ids: []const Token.Id,
    };
    const cases = [_]Case{
        // Identifiers made of, or continued by, non-basic characters.
        .{ .input = "𝓪𝓻𝓸𝓬𝓬", .ids = &.{.extended_identifier} },
        .{ .input = "u𝓪𝓻𝓸𝓬𝓬", .ids = &.{.extended_identifier} },
        .{ .input = "u8𝓪𝓻𝓸𝓬𝓬", .ids = &.{.extended_identifier} },
        .{ .input = "U𝓪𝓻𝓸𝓬𝓬", .ids = &.{.extended_identifier} },
        .{ .input = "L𝓪𝓻𝓸𝓬𝓬", .ids = &.{.extended_identifier} },
        // An extended character ends a number (or a period run) and starts a new token.
        .{ .input = "1™", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "1.™", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "..™", .ids = &.{ .period, .period, .extended_identifier } },
        .{ .input = "0™", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0b\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0b0\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "01\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "010\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0x\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0x0\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
        // Extended characters directly after an escape inside a string literal.
        .{ .input = "\"\\0\u{E0000}\"", .ids = &.{.string_literal} },
        .{ .input = "\"\\x\u{E0000}\"", .ids = &.{.string_literal} },
        .{ .input = "\"\\u\u{E0000}\"", .ids = &.{ .invalid, .extended_identifier, .invalid } },
        // Extended characters after an exponent marker.
        .{ .input = "1e\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
        .{ .input = "1e1\u{E0000}", .ids = &.{ .pp_num, .extended_identifier } },
    };
    for (cases) |case| {
        try expectTokens(case.input, case.ids);
    }
}

test "digraphs" {
    const Case = struct {
        input: []const u8,
        ids: []const Token.Id,
    };
    const cases = [_]Case{
        // Each digraph maps onto the id of the punctuator it stands for.
        .{
            .input = "%:<::><%%>%:%:",
            .ids = &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash },
        },
        // Inside a string literal the same characters are plain text.
        .{
            .input = "\"%:<::><%%>%:%:\"",
            .ids = &.{.string_literal},
        },
        // An incomplete `%:%:` falls back to `#` followed by `%`.
        .{
            .input = "%:%42 %:%",
            .ids = &.{ .hash, .percent, .pp_num, .hash, .percent },
        },
    };
    for (cases) |case| {
        try expectTokens(case.input, case.ids);
    }
}

test "C23 keywords" {
    // Tokenized with the standard forced to `.c2x`.
    const input = "true false alignas alignof bool static_assert thread_local nullptr";
    const expected = [_]Token.Id{
        .keyword_true,
        .keyword_false,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_nullptr,
    };
    try expectTokensExtra(input, &expected, .c2x);
}

/// Tokenizes `contents` and checks that the ids of its non-whitespace tokens
/// are exactly `expected_tokens`, followed by `.eof`.
///
/// `standard`, when non-null, replaces the language standard of the temporary
/// `Compilation` before tokenizing; `null` leaves it untouched.
/// Whitespace tokens are skipped everywhere, including between the last
/// expected token and the end of the input.
///
/// On the first mismatch the expected and actual tag names are printed and
/// `error.TokensDoNotEqual` is returned. Tokens left over after the expected
/// list fail the final `std.testing.expect`.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, standard: ?LangOpts.Standard) !void {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();
    if (standard) |provided| {
        comp.langopts.standard = provided;
    }
    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer = Tokenizer{
        .buf = source.buf,
        .source = source.id,
        .comp = &comp,
    };
    for (expected_tokens) |expected_token_id| {
        const token = tokenizer.nextNoWS();
        if (token.id != expected_token_id) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(expected_token_id), @tagName(token.id) });
            return error.TokensDoNotEqual;
        }
    }
    // Skip whitespace here too: trailing blanks after the last expected token
    // are not a mismatch, consistent with how they are ignored in the loop above.
    const last_token = tokenizer.nextNoWS();
    try std.testing.expect(last_token.id == .eof);
}

/// Shorthand for `expectTokensExtra` that passes no language-standard
/// override, so the compilation keeps whatever standard it starts with.
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    const no_standard_override: ?LangOpts.Standard = null;
    try expectTokensExtra(contents, expected_tokens, no_standard_override);
}