mirror of
https://codeberg.org/ziglang/zig.git
synced 2026-04-30 06:42:48 +03:00
5792570197
ref: 5688dbccfb58216468267a0f46b96bed7013715a
2141 lines
67 KiB
Zig
Vendored
2141 lines
67 KiB
Zig
Vendored
const std = @import("std");
|
|
const assert = std.debug.assert;
|
|
const Compilation = @import("Compilation.zig");
|
|
const Source = @import("Source.zig");
|
|
const LangOpts = @import("LangOpts.zig");
|
|
const CharInfo = @import("CharInfo.zig");
|
|
const unicode = @import("unicode.zig");
|
|
|
|
const Tokenizer = @This();
|
|
|
|
pub const Token = struct {
|
|
id: Id,
|
|
source: Source.Id,
|
|
start: u32 = 0,
|
|
end: u32 = 0,
|
|
line: u32 = 0,
|
|
|
|
pub const Id = enum(u8) {
|
|
invalid,
|
|
nl,
|
|
whitespace,
|
|
eof,
|
|
/// identifier containing solely basic character set characters
|
|
identifier,
|
|
/// identifier with at least one extended character
|
|
extended_identifier,
|
|
|
|
// string literals with prefixes
|
|
string_literal,
|
|
string_literal_utf_16,
|
|
string_literal_utf_8,
|
|
string_literal_utf_32,
|
|
string_literal_wide,
|
|
|
|
// <foobar> only generated by preprocessor
|
|
macro_string,
|
|
|
|
// char literals with prefixes
|
|
char_literal,
|
|
char_literal_utf_8,
|
|
char_literal_utf_16,
|
|
char_literal_utf_32,
|
|
char_literal_wide,
|
|
|
|
/// Integer literal tokens generated by preprocessor.
|
|
one,
|
|
zero,
|
|
|
|
bang,
|
|
bang_equal,
|
|
pipe,
|
|
pipe_pipe,
|
|
pipe_equal,
|
|
equal,
|
|
equal_equal,
|
|
l_paren,
|
|
r_paren,
|
|
l_brace,
|
|
r_brace,
|
|
l_bracket,
|
|
r_bracket,
|
|
period,
|
|
ellipsis,
|
|
caret,
|
|
caret_equal,
|
|
plus,
|
|
plus_plus,
|
|
plus_equal,
|
|
minus,
|
|
minus_minus,
|
|
minus_equal,
|
|
asterisk,
|
|
asterisk_equal,
|
|
percent,
|
|
percent_equal,
|
|
arrow,
|
|
colon,
|
|
colon_colon,
|
|
semicolon,
|
|
slash,
|
|
slash_equal,
|
|
comma,
|
|
ampersand,
|
|
ampersand_ampersand,
|
|
ampersand_equal,
|
|
question_mark,
|
|
angle_bracket_left,
|
|
angle_bracket_left_equal,
|
|
angle_bracket_angle_bracket_left,
|
|
angle_bracket_angle_bracket_left_equal,
|
|
angle_bracket_right,
|
|
angle_bracket_right_equal,
|
|
angle_bracket_angle_bracket_right,
|
|
angle_bracket_angle_bracket_right_equal,
|
|
tilde,
|
|
hash,
|
|
hash_hash,
|
|
|
|
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
|
|
macro_param,
|
|
/// Special token to signal that the argument must be replaced without expansion (e.g. in concatenation)
|
|
macro_param_no_expand,
|
|
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
|
|
stringify_param,
|
|
/// Same as stringify_param, but for var args
|
|
stringify_va_args,
|
|
/// Special macro whitespace, always equal to a single space
|
|
macro_ws,
|
|
/// Special token for implementing __has_attribute
|
|
macro_param_has_attribute,
|
|
/// Special token for implementing __has_warning
|
|
macro_param_has_warning,
|
|
/// Special token for implementing __has_feature
|
|
macro_param_has_feature,
|
|
/// Special token for implementing __has_extension
|
|
macro_param_has_extension,
|
|
/// Special token for implementing __has_builtin
|
|
macro_param_has_builtin,
|
|
/// Special token for implementing __has_include
|
|
macro_param_has_include,
|
|
/// Special token for implementing __has_include_next
|
|
macro_param_has_include_next,
|
|
/// Special token for implementing __is_identifier
|
|
macro_param_is_identifier,
|
|
/// Special token for implementing __FILE__
|
|
macro_file,
|
|
/// Special token for implementing __LINE__
|
|
macro_line,
|
|
/// Special token for implementing __COUNTER__
|
|
macro_counter,
|
|
/// Special token for implementing _Pragma
|
|
macro_param_pragma_operator,
|
|
|
|
/// Special identifier for implementing __func__
|
|
macro_func,
|
|
/// Special identifier for implementing __FUNCTION__
|
|
macro_function,
|
|
/// Special identifier for implementing __PRETTY_FUNCTION__
|
|
macro_pretty_func,
|
|
|
|
keyword_auto,
|
|
keyword_auto_type,
|
|
keyword_break,
|
|
keyword_case,
|
|
keyword_char,
|
|
keyword_const,
|
|
keyword_continue,
|
|
keyword_default,
|
|
keyword_do,
|
|
keyword_double,
|
|
keyword_else,
|
|
keyword_enum,
|
|
keyword_extern,
|
|
keyword_float,
|
|
keyword_for,
|
|
keyword_goto,
|
|
keyword_if,
|
|
keyword_int,
|
|
keyword_long,
|
|
keyword_register,
|
|
keyword_return,
|
|
keyword_short,
|
|
keyword_signed,
|
|
keyword_sizeof,
|
|
keyword_static,
|
|
keyword_struct,
|
|
keyword_switch,
|
|
keyword_typedef,
|
|
keyword_typeof1,
|
|
keyword_typeof2,
|
|
keyword_union,
|
|
keyword_unsigned,
|
|
keyword_void,
|
|
keyword_volatile,
|
|
keyword_while,
|
|
|
|
// ISO C99
|
|
keyword_bool,
|
|
keyword_complex,
|
|
keyword_imaginary,
|
|
keyword_inline,
|
|
keyword_restrict,
|
|
|
|
// ISO C11
|
|
keyword_alignas,
|
|
keyword_alignof,
|
|
keyword_atomic,
|
|
keyword_generic,
|
|
keyword_noreturn,
|
|
keyword_static_assert,
|
|
keyword_thread_local,
|
|
|
|
// ISO C23
|
|
keyword_bit_int,
|
|
keyword_c23_alignas,
|
|
keyword_c23_alignof,
|
|
keyword_c23_bool,
|
|
keyword_c23_static_assert,
|
|
keyword_c23_thread_local,
|
|
keyword_constexpr,
|
|
keyword_true,
|
|
keyword_false,
|
|
keyword_nullptr,
|
|
|
|
// Preprocessor directives
|
|
keyword_include,
|
|
keyword_include_next,
|
|
keyword_embed,
|
|
keyword_define,
|
|
keyword_defined,
|
|
keyword_undef,
|
|
keyword_ifdef,
|
|
keyword_ifndef,
|
|
keyword_elif,
|
|
keyword_elifdef,
|
|
keyword_elifndef,
|
|
keyword_endif,
|
|
keyword_error,
|
|
keyword_warning,
|
|
keyword_pragma,
|
|
keyword_line,
|
|
keyword_va_args,
|
|
|
|
// gcc keywords
|
|
keyword_const1,
|
|
keyword_const2,
|
|
keyword_inline1,
|
|
keyword_inline2,
|
|
keyword_volatile1,
|
|
keyword_volatile2,
|
|
keyword_restrict1,
|
|
keyword_restrict2,
|
|
keyword_alignof1,
|
|
keyword_alignof2,
|
|
keyword_typeof,
|
|
keyword_attribute1,
|
|
keyword_attribute2,
|
|
keyword_extension,
|
|
keyword_asm,
|
|
keyword_asm1,
|
|
keyword_asm2,
|
|
keyword_float80,
|
|
keyword_float128,
|
|
keyword_int128,
|
|
keyword_imag1,
|
|
keyword_imag2,
|
|
keyword_real1,
|
|
keyword_real2,
|
|
keyword_float16,
|
|
|
|
// clang keywords
|
|
keyword_fp16,
|
|
|
|
// ms keywords
|
|
keyword_declspec,
|
|
keyword_int64,
|
|
keyword_int64_2,
|
|
keyword_int32,
|
|
keyword_int32_2,
|
|
keyword_int16,
|
|
keyword_int16_2,
|
|
keyword_int8,
|
|
keyword_int8_2,
|
|
keyword_stdcall,
|
|
keyword_stdcall2,
|
|
keyword_thiscall,
|
|
keyword_thiscall2,
|
|
keyword_vectorcall,
|
|
keyword_vectorcall2,
|
|
|
|
// builtins that require special parsing
|
|
builtin_choose_expr,
|
|
builtin_va_arg,
|
|
builtin_offsetof,
|
|
builtin_bitoffsetof,
|
|
builtin_types_compatible_p,
|
|
|
|
/// Generated by #embed directive
|
|
/// Decimal value with no prefix or suffix
|
|
embed_byte,
|
|
|
|
/// preprocessor number
|
|
/// An optional period, followed by a digit 0-9, followed by any number of letters
|
|
/// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-)
|
|
pp_num,
|
|
|
|
/// preprocessor placemarker token
|
|
/// generated if `##` is used with a zero-token argument
|
|
/// removed after substitution, so the parser should never see this
|
|
/// See C99 6.10.3.3.2
|
|
placemarker,
|
|
|
|
/// Reports whether `id` is an identifier or any keyword-like token id
/// (language keywords, preprocessor directive names, the `__func__`
/// family and the specially parsed builtins).
pub fn isMacroIdentifier(id: Id) bool {
    return switch (id) {
        // preprocessor directive names
        .keyword_include,
        .keyword_include_next,
        .keyword_embed,
        .keyword_define,
        .keyword_defined,
        .keyword_undef,
        .keyword_ifdef,
        .keyword_ifndef,
        .keyword_elif,
        .keyword_elifdef,
        .keyword_elifndef,
        .keyword_endif,
        .keyword_error,
        .keyword_warning,
        .keyword_pragma,
        .keyword_line,
        .keyword_va_args,
        // function-name identifiers
        .macro_func,
        .macro_function,
        .macro_pretty_func,
        // base language keywords
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_sizeof,
        .keyword_static,
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .keyword_while,
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        .keyword_alignas,
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        // plain identifiers
        .identifier,
        .extended_identifier,
        // gcc spellings and extensions
        .keyword_typeof,
        .keyword_typeof1,
        .keyword_typeof2,
        .keyword_const1,
        .keyword_const2,
        .keyword_inline1,
        .keyword_inline2,
        .keyword_volatile1,
        .keyword_volatile2,
        .keyword_restrict1,
        .keyword_restrict2,
        .keyword_alignof1,
        .keyword_alignof2,
        .builtin_choose_expr,
        .builtin_va_arg,
        .builtin_offsetof,
        .builtin_bitoffsetof,
        .builtin_types_compatible_p,
        .keyword_attribute1,
        .keyword_attribute2,
        .keyword_extension,
        .keyword_asm,
        .keyword_asm1,
        .keyword_asm2,
        .keyword_float80,
        .keyword_float128,
        .keyword_int128,
        .keyword_imag1,
        .keyword_imag2,
        .keyword_real1,
        .keyword_real2,
        .keyword_float16,
        .keyword_fp16,
        // ms keywords
        .keyword_declspec,
        .keyword_int64,
        .keyword_int64_2,
        .keyword_int32,
        .keyword_int32_2,
        .keyword_int16,
        .keyword_int16_2,
        .keyword_int8,
        .keyword_int8_2,
        .keyword_stdcall,
        .keyword_stdcall2,
        .keyword_thiscall,
        .keyword_thiscall2,
        .keyword_vectorcall,
        .keyword_vectorcall2,
        // C23 keywords
        .keyword_bit_int,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_constexpr,
        .keyword_true,
        .keyword_false,
        .keyword_nullptr,
        => true,
        else => false,
    };
}
|
|
|
|
/// Rewrites `id` in place to `.identifier` when it is one of the
/// preprocessor-directive keywords (or `__VA_ARGS__`).
/// `keyword_defined` is only rewritten when `defined_to_identifier` is set,
/// i.e. when we are *not* in an #if or #elif expression.
/// Every other id is left untouched.
pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void {
    const becomes_identifier = switch (id.*) {
        .keyword_include,
        .keyword_include_next,
        .keyword_embed,
        .keyword_define,
        .keyword_undef,
        .keyword_ifdef,
        .keyword_ifndef,
        .keyword_elif,
        .keyword_elifdef,
        .keyword_elifndef,
        .keyword_endif,
        .keyword_error,
        .keyword_warning,
        .keyword_pragma,
        .keyword_line,
        .keyword_va_args,
        => true,
        // `defined` keeps its keyword status unless the caller opts in.
        .keyword_defined => defined_to_identifier,
        else => false,
    };
    if (becomes_identifier) id.* = .identifier;
}
|
|
|
|
/// Shorthand for `simplifyMacroKeywordExtra` that never rewrites
/// `keyword_defined` (passes `defined_to_identifier = false`).
pub fn simplifyMacroKeyword(id: *Id) void {
    const defined_to_identifier = false;
    return simplifyMacroKeywordExtra(id, defined_to_identifier);
}
|
|
|
|
/// Returns the fixed spelling associated with `id`.
///
/// * `null` for ids whose text is not determined by the id alone
///   (identifiers, string/char literals, whitespace, numbers, ...).
/// * `""` for newline, eof and the preprocessor-internal marker tokens.
/// * The literal spelling for punctuators and keywords.
pub fn lexeme(id: Id) ?[]const u8 {
    return switch (id) {
        .invalid,
        .identifier,
        .extended_identifier,
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        .macro_string,
        .whitespace,
        .pp_num,
        .embed_byte,
        => null,

        .zero => "0",
        .one => "1",

        .nl,
        .eof,
        .macro_param,
        .macro_param_no_expand,
        .stringify_param,
        .stringify_va_args,
        .macro_param_has_attribute,
        .macro_param_has_warning,
        .macro_param_has_feature,
        .macro_param_has_extension,
        .macro_param_has_builtin,
        .macro_param_has_include,
        .macro_param_has_include_next,
        .macro_param_is_identifier,
        .macro_file,
        .macro_line,
        .macro_counter,
        .macro_param_pragma_operator,
        .placemarker,
        => "",
        .macro_ws => " ",

        .macro_func => "__func__",
        .macro_function => "__FUNCTION__",
        .macro_pretty_func => "__PRETTY_FUNCTION__",

        .bang => "!",
        .bang_equal => "!=",
        .pipe => "|",
        .pipe_pipe => "||",
        .pipe_equal => "|=",
        .equal => "=",
        .equal_equal => "==",
        .l_paren => "(",
        .r_paren => ")",
        .l_brace => "{",
        .r_brace => "}",
        .l_bracket => "[",
        .r_bracket => "]",
        .period => ".",
        .ellipsis => "...",
        .caret => "^",
        .caret_equal => "^=",
        .plus => "+",
        .plus_plus => "++",
        .plus_equal => "+=",
        .minus => "-",
        .minus_minus => "--",
        .minus_equal => "-=",
        .asterisk => "*",
        .asterisk_equal => "*=",
        .percent => "%",
        .percent_equal => "%=",
        .arrow => "->",
        .colon => ":",
        .colon_colon => "::",
        .semicolon => ";",
        .slash => "/",
        .slash_equal => "/=",
        .comma => ",",
        .ampersand => "&",
        .ampersand_ampersand => "&&",
        .ampersand_equal => "&=",
        .question_mark => "?",
        .angle_bracket_left => "<",
        .angle_bracket_left_equal => "<=",
        .angle_bracket_angle_bracket_left => "<<",
        .angle_bracket_angle_bracket_left_equal => "<<=",
        .angle_bracket_right => ">",
        .angle_bracket_right_equal => ">=",
        .angle_bracket_angle_bracket_right => ">>",
        .angle_bracket_angle_bracket_right_equal => ">>=",
        .tilde => "~",
        .hash => "#",
        .hash_hash => "##",

        .keyword_auto => "auto",
        .keyword_auto_type => "__auto_type",
        .keyword_break => "break",
        .keyword_case => "case",
        .keyword_char => "char",
        .keyword_const => "const",
        .keyword_continue => "continue",
        .keyword_default => "default",
        .keyword_do => "do",
        .keyword_double => "double",
        .keyword_else => "else",
        .keyword_enum => "enum",
        .keyword_extern => "extern",
        .keyword_float => "float",
        .keyword_for => "for",
        .keyword_goto => "goto",
        .keyword_if => "if",
        .keyword_int => "int",
        .keyword_long => "long",
        .keyword_register => "register",
        .keyword_return => "return",
        .keyword_short => "short",
        .keyword_signed => "signed",
        .keyword_sizeof => "sizeof",
        .keyword_static => "static",
        .keyword_struct => "struct",
        .keyword_switch => "switch",
        .keyword_typedef => "typedef",
        .keyword_typeof => "typeof",
        .keyword_union => "union",
        .keyword_unsigned => "unsigned",
        .keyword_void => "void",
        .keyword_volatile => "volatile",
        .keyword_while => "while",
        .keyword_bool => "_Bool",
        .keyword_complex => "_Complex",
        .keyword_imaginary => "_Imaginary",
        .keyword_inline => "inline",
        .keyword_restrict => "restrict",
        .keyword_alignas => "_Alignas",
        .keyword_alignof => "_Alignof",
        .keyword_atomic => "_Atomic",
        .keyword_generic => "_Generic",
        .keyword_noreturn => "_Noreturn",
        .keyword_static_assert => "_Static_assert",
        .keyword_thread_local => "_Thread_local",
        .keyword_bit_int => "_BitInt",
        .keyword_c23_alignas => "alignas",
        .keyword_c23_alignof => "alignof",
        .keyword_c23_bool => "bool",
        .keyword_c23_static_assert => "static_assert",
        .keyword_c23_thread_local => "thread_local",
        .keyword_constexpr => "constexpr",
        .keyword_true => "true",
        .keyword_false => "false",
        .keyword_nullptr => "nullptr",
        .keyword_include => "include",
        .keyword_include_next => "include_next",
        .keyword_embed => "embed",
        .keyword_define => "define",
        .keyword_defined => "defined",
        .keyword_undef => "undef",
        .keyword_ifdef => "ifdef",
        .keyword_ifndef => "ifndef",
        .keyword_elif => "elif",
        .keyword_elifdef => "elifdef",
        .keyword_elifndef => "elifndef",
        .keyword_endif => "endif",
        .keyword_error => "error",
        .keyword_warning => "warning",
        .keyword_pragma => "pragma",
        .keyword_line => "line",
        .keyword_va_args => "__VA_ARGS__",
        .keyword_const1 => "__const",
        .keyword_const2 => "__const__",
        .keyword_inline1 => "__inline",
        .keyword_inline2 => "__inline__",
        .keyword_volatile1 => "__volatile",
        .keyword_volatile2 => "__volatile__",
        .keyword_restrict1 => "__restrict",
        .keyword_restrict2 => "__restrict__",
        .keyword_alignof1 => "__alignof",
        .keyword_alignof2 => "__alignof__",
        .keyword_typeof1 => "__typeof",
        .keyword_typeof2 => "__typeof__",
        .builtin_choose_expr => "__builtin_choose_expr",
        .builtin_va_arg => "__builtin_va_arg",
        .builtin_offsetof => "__builtin_offsetof",
        .builtin_bitoffsetof => "__builtin_bitoffsetof",
        .builtin_types_compatible_p => "__builtin_types_compatible_p",
        .keyword_attribute1 => "__attribute",
        .keyword_attribute2 => "__attribute__",
        .keyword_extension => "__extension__",
        .keyword_asm => "asm",
        .keyword_asm1 => "__asm",
        .keyword_asm2 => "__asm__",
        .keyword_float80 => "__float80",
        // Must match the spelling registered in the keyword table.
        .keyword_float128 => "__float128",
        .keyword_int128 => "__int128",
        .keyword_imag1 => "__imag",
        .keyword_imag2 => "__imag__",
        .keyword_real1 => "__real",
        .keyword_real2 => "__real__",
        .keyword_float16 => "_Float16",
        .keyword_fp16 => "__fp16",
        .keyword_declspec => "__declspec",
        .keyword_int64 => "__int64",
        .keyword_int64_2 => "_int64",
        .keyword_int32 => "__int32",
        .keyword_int32_2 => "_int32",
        .keyword_int16 => "__int16",
        .keyword_int16_2 => "_int16",
        .keyword_int8 => "__int8",
        .keyword_int8_2 => "_int8",
        .keyword_stdcall => "__stdcall",
        .keyword_stdcall2 => "_stdcall",
        .keyword_thiscall => "__thiscall",
        .keyword_thiscall2 => "_thiscall",
        .keyword_vectorcall => "__vectorcall",
        .keyword_vectorcall2 => "_vectorcall",
    };
}
|
|
|
|
/// Returns a short textual description of `id`.
///
/// Identifier-like ids, string literals, character literals and numbers map
/// to a generic phrase ("an identifier", "a string literal", ...); every
/// other id maps to its `lexeme()`.
///
/// Must not be called with `.macro_string` or `.invalid` (unreachable).
/// Any other id for which `lexeme()` returns null and which is not listed
/// here (e.g. `.whitespace`) fails the `.?` unwrap in the fallback prong.
pub fn symbol(id: Id) []const u8 {
    return switch (id) {
        .macro_string, .invalid => unreachable,
        .identifier,
        .extended_identifier,
        .macro_func,
        .macro_function,
        .macro_pretty_func,
        .builtin_choose_expr,
        .builtin_va_arg,
        .builtin_offsetof,
        .builtin_bitoffsetof,
        .builtin_types_compatible_p,
        => "an identifier",
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        => "a string literal",
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        => "a character literal",
        // Lowercase article, consistent with the other phrases above.
        .pp_num, .embed_byte => "a number",
        else => id.lexeme().?,
    };
}
|
|
|
|
/// Reports whether `id` is accepted as the first token of an expression
/// parsed by Preprocessor.expr.
/// eof, r_paren, and string literals cannot actually start a preprocessor
/// expression; they are accepted here anyway so that the parser can
/// generate a nicer error message.
pub fn validPreprocessorExprStart(id: Id) bool {
    switch (id) {
        // accepted only for better diagnostics
        .eof,
        .r_paren,
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        // character constants
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        // grouping and unary operators
        .l_paren,
        .plus,
        .minus,
        .tilde,
        .bang,
        // names, `defined`, and numeric / boolean constants
        .identifier,
        .extended_identifier,
        .keyword_defined,
        .one,
        .zero,
        .pp_num,
        .keyword_true,
        .keyword_false,
        => return true,
        else => return false,
    }
}
|
|
|
|
/// Reports whether `id` may be spelled as a digraph: true only for
/// brackets, braces, `#` and `##`, and only when
/// `comp.langopts.hasDigraphs()` says digraphs are enabled.
pub fn allowsDigraphs(id: Id, comp: *const Compilation) bool {
    switch (id) {
        .l_bracket,
        .r_bracket,
        .l_brace,
        .r_brace,
        .hash,
        .hash_hash,
        => return comp.langopts.hasDigraphs(),
        else => return false,
    }
}
|
|
|
|
/// Reports whether `id` is one of the tokens this tokenizer treats as able
/// to open a GCC asm statement: a `volatile`, `inline` or `goto` qualifier
/// (in any of their spellings) or an opening parenthesis.
pub fn canOpenGCCAsmStmt(id: Id) bool {
    switch (id) {
        .keyword_volatile,
        .keyword_volatile1,
        .keyword_volatile2,
        .keyword_inline,
        .keyword_inline1,
        .keyword_inline2,
        .keyword_goto,
        .l_paren,
        => return true,
        else => return false,
    }
}
|
|
|
|
/// Reports whether `id` is a string literal of any prefix kind
/// (plain, UTF-8, UTF-16, UTF-32 or wide).
pub fn isStringLiteral(id: Id) bool {
    switch (id) {
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        => return true,
        else => return false,
    }
}
|
|
};
|
|
|
|
/// Maps the spelling `str` to a token id.
///
/// Spellings not present in the keyword table are plain identifiers.
/// Double underscore and underscore + capital letter identifiers belong to
/// the implementation namespace, so they are always converted to keywords.
/// The remaining keywords are only recognised when the active language
/// standard or the relevant extension option in `comp.langopts` enables
/// them; otherwise they fall back to `.identifier`.
pub fn getTokenId(comp: *const Compilation, str: []const u8) Token.Id {
    const keyword = all_kws.get(str) orelse return .identifier;
    const standard = comp.langopts.standard;

    const enabled = switch (keyword) {
        .keyword_inline => standard.isGNU() or standard.atLeast(.c99),
        .keyword_restrict => standard.atLeast(.c99),
        .keyword_typeof => standard.isGNU() or standard.atLeast(.c2x),
        .keyword_asm => standard.isGNU(),
        .keyword_declspec => comp.langopts.declspec_attrs,

        // Gated on at least the .c2x standard.
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_constexpr,
        .keyword_true,
        .keyword_false,
        .keyword_nullptr,
        .keyword_elifdef,
        .keyword_elifndef,
        => standard.atLeast(.c2x),

        // Gated on the ms_extensions language option.
        .keyword_int64,
        .keyword_int64_2,
        .keyword_int32,
        .keyword_int32_2,
        .keyword_int16,
        .keyword_int16_2,
        .keyword_int8,
        .keyword_int8_2,
        .keyword_stdcall2,
        .keyword_thiscall2,
        .keyword_vectorcall2,
        => comp.langopts.ms_extensions,

        else => true,
    };
    return if (enabled) keyword else .identifier;
}
|
|
|
|
/// Reports whether `codepoint` may appear at the given position (`where`)
/// of an identifier.
/// `$` is governed solely by `comp.langopts.dollars_in_identifiers`. All
/// other basic character set chars (<= 0x7F) are rejected here because the
/// tokenizer handles them separately to keep the common case on the fast
/// path. Everything else is checked against the C11 tables when the
/// standard is at least C11, and against the C99 tables otherwise.
pub fn mayAppearInIdent(comp: *const Compilation, codepoint: u21, where: enum { start, inside }) bool {
    if (codepoint == '$') return comp.langopts.dollars_in_identifiers;
    if (codepoint <= 0x7F) return false;

    if (comp.langopts.standard.atLeast(.c11)) {
        if (!CharInfo.isC11IdChar(codepoint)) return false;
        // Only the first character is subject to the extra restriction.
        return where == .inside or !CharInfo.isC11DisallowedInitialIdChar(codepoint);
    }

    if (!CharInfo.isC99IdChar(codepoint)) return false;
    return where == .inside or !CharInfo.isC99DisallowedInitialIDChar(codepoint);
}
|
|
|
|
const all_kws = std.ComptimeStringMap(Id, .{
|
|
.{ "auto", auto: {
|
|
@setEvalBranchQuota(3000);
|
|
break :auto .keyword_auto;
|
|
} },
|
|
.{ "break", .keyword_break },
|
|
.{ "case", .keyword_case },
|
|
.{ "char", .keyword_char },
|
|
.{ "const", .keyword_const },
|
|
.{ "continue", .keyword_continue },
|
|
.{ "default", .keyword_default },
|
|
.{ "do", .keyword_do },
|
|
.{ "double", .keyword_double },
|
|
.{ "else", .keyword_else },
|
|
.{ "enum", .keyword_enum },
|
|
.{ "extern", .keyword_extern },
|
|
.{ "float", .keyword_float },
|
|
.{ "for", .keyword_for },
|
|
.{ "goto", .keyword_goto },
|
|
.{ "if", .keyword_if },
|
|
.{ "int", .keyword_int },
|
|
.{ "long", .keyword_long },
|
|
.{ "register", .keyword_register },
|
|
.{ "return", .keyword_return },
|
|
.{ "short", .keyword_short },
|
|
.{ "signed", .keyword_signed },
|
|
.{ "sizeof", .keyword_sizeof },
|
|
.{ "static", .keyword_static },
|
|
.{ "struct", .keyword_struct },
|
|
.{ "switch", .keyword_switch },
|
|
.{ "typedef", .keyword_typedef },
|
|
.{ "union", .keyword_union },
|
|
.{ "unsigned", .keyword_unsigned },
|
|
.{ "void", .keyword_void },
|
|
.{ "volatile", .keyword_volatile },
|
|
.{ "while", .keyword_while },
|
|
.{ "__typeof__", .keyword_typeof2 },
|
|
.{ "__typeof", .keyword_typeof1 },
|
|
|
|
// ISO C99
|
|
.{ "_Bool", .keyword_bool },
|
|
.{ "_Complex", .keyword_complex },
|
|
.{ "_Imaginary", .keyword_imaginary },
|
|
.{ "inline", .keyword_inline },
|
|
.{ "restrict", .keyword_restrict },
|
|
|
|
// ISO C11
|
|
.{ "_Alignas", .keyword_alignas },
|
|
.{ "_Alignof", .keyword_alignof },
|
|
.{ "_Atomic", .keyword_atomic },
|
|
.{ "_Generic", .keyword_generic },
|
|
.{ "_Noreturn", .keyword_noreturn },
|
|
.{ "_Static_assert", .keyword_static_assert },
|
|
.{ "_Thread_local", .keyword_thread_local },
|
|
|
|
// ISO C23
|
|
.{ "_BitInt", .keyword_bit_int },
|
|
.{ "alignas", .keyword_c23_alignas },
|
|
.{ "alignof", .keyword_c23_alignof },
|
|
.{ "bool", .keyword_c23_bool },
|
|
.{ "static_assert", .keyword_c23_static_assert },
|
|
.{ "thread_local", .keyword_c23_thread_local },
|
|
.{ "constexpr", .keyword_constexpr },
|
|
.{ "true", .keyword_true },
|
|
.{ "false", .keyword_false },
|
|
.{ "nullptr", .keyword_nullptr },
|
|
|
|
// Preprocessor directives
|
|
.{ "include", .keyword_include },
|
|
.{ "include_next", .keyword_include_next },
|
|
.{ "embed", .keyword_embed },
|
|
.{ "define", .keyword_define },
|
|
.{ "defined", .keyword_defined },
|
|
.{ "undef", .keyword_undef },
|
|
.{ "ifdef", .keyword_ifdef },
|
|
.{ "ifndef", .keyword_ifndef },
|
|
.{ "elif", .keyword_elif },
|
|
.{ "elifdef", .keyword_elifdef },
|
|
.{ "elifndef", .keyword_elifndef },
|
|
.{ "endif", .keyword_endif },
|
|
.{ "error", .keyword_error },
|
|
.{ "warning", .keyword_warning },
|
|
.{ "pragma", .keyword_pragma },
|
|
.{ "line", .keyword_line },
|
|
.{ "__VA_ARGS__", .keyword_va_args },
|
|
.{ "__func__", .macro_func },
|
|
.{ "__FUNCTION__", .macro_function },
|
|
.{ "__PRETTY_FUNCTION__", .macro_pretty_func },
|
|
|
|
// gcc keywords
|
|
.{ "__auto_type", .keyword_auto_type },
|
|
.{ "__const", .keyword_const1 },
|
|
.{ "__const__", .keyword_const2 },
|
|
.{ "__inline", .keyword_inline1 },
|
|
.{ "__inline__", .keyword_inline2 },
|
|
.{ "__volatile", .keyword_volatile1 },
|
|
.{ "__volatile__", .keyword_volatile2 },
|
|
.{ "__restrict", .keyword_restrict1 },
|
|
.{ "__restrict__", .keyword_restrict2 },
|
|
.{ "__alignof", .keyword_alignof1 },
|
|
.{ "__alignof__", .keyword_alignof2 },
|
|
.{ "typeof", .keyword_typeof },
|
|
.{ "__attribute", .keyword_attribute1 },
|
|
.{ "__attribute__", .keyword_attribute2 },
|
|
.{ "__extension__", .keyword_extension },
|
|
.{ "asm", .keyword_asm },
|
|
.{ "__asm", .keyword_asm1 },
|
|
.{ "__asm__", .keyword_asm2 },
|
|
.{ "__float80", .keyword_float80 },
|
|
.{ "__float128", .keyword_float128 },
|
|
.{ "__int128", .keyword_int128 },
|
|
.{ "__imag", .keyword_imag1 },
|
|
.{ "__imag__", .keyword_imag2 },
|
|
.{ "__real", .keyword_real1 },
|
|
.{ "__real__", .keyword_real2 },
|
|
.{ "_Float16", .keyword_float16 },
|
|
|
|
// clang keywords
|
|
.{ "__fp16", .keyword_fp16 },
|
|
|
|
// ms keywords
|
|
.{ "__declspec", .keyword_declspec },
|
|
.{ "__int64", .keyword_int64 },
|
|
.{ "_int64", .keyword_int64_2 },
|
|
.{ "__int32", .keyword_int32 },
|
|
.{ "_int32", .keyword_int32_2 },
|
|
.{ "__int16", .keyword_int16 },
|
|
.{ "_int16", .keyword_int16_2 },
|
|
.{ "__int8", .keyword_int8 },
|
|
.{ "_int8", .keyword_int8_2 },
|
|
.{ "__stdcall", .keyword_stdcall },
|
|
.{ "_stdcall", .keyword_stdcall2 },
|
|
.{ "__thiscall", .keyword_thiscall },
|
|
.{ "_thiscall", .keyword_thiscall2 },
|
|
.{ "__vectorcall", .keyword_vectorcall },
|
|
.{ "_vectorcall", .keyword_vectorcall2 },
|
|
|
|
// builtins that require special parsing
|
|
.{ "__builtin_choose_expr", .builtin_choose_expr },
|
|
.{ "__builtin_va_arg", .builtin_va_arg },
|
|
.{ "__builtin_offsetof", .builtin_offsetof },
|
|
.{ "__builtin_bitoffsetof", .builtin_bitoffsetof },
|
|
.{ "__builtin_types_compatible_p", .builtin_types_compatible_p },
|
|
});
|
|
};
|
|
|
|
buf: []const u8,
|
|
index: u32 = 0,
|
|
source: Source.Id,
|
|
comp: *const Compilation,
|
|
line: u32 = 1,
|
|
|
|
/// Scans and returns the next token of `buf`, starting at `index`.
///
/// The scanner is a state machine that looks at one UTF-8 codepoint per loop
/// iteration. A state that wants the current codepoint to be examined again by
/// another state sets `codepoint_len = 0`, so the loop's continue expression
/// does not advance `index`.
///
/// On return `index` is the offset just past the token (`Token.end`), and
/// `Token.line` is the value of `line` after the token was scanned. When the
/// input is exhausted in the `.start` state (or inside a line comment) the
/// token id is `.eof`. String/char literals cut short by a newline or by the
/// end of input, malformed escape sequences, and unterminated block comments
/// produce `.invalid`.
pub fn next(self: *Tokenizer) Token {
    var state: enum {
        start,
        whitespace,
        u,
        u8,
        U,
        L,
        string_literal,
        char_literal_start,
        char_literal,
        escape_sequence,
        octal_escape,
        hex_escape,
        unicode_escape,
        identifier,
        extended_identifier,
        equal,
        bang,
        pipe,
        colon,
        percent,
        asterisk,
        plus,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        caret,
        period,
        period2,
        minus,
        slash,
        ampersand,
        hash,
        hash_digraph,
        hash_hash_digraph_partial,
        line_comment,
        multi_line_comment,
        multi_line_comment_asterisk,
        multi_line_comment_done,
        pp_num,
        pp_num_exponent,
        pp_num_digit_separator,
    } = .start;

    // Offset of the first byte of the token; moved forward when a block
    // comment is followed by a newline or whitespace.
    var start = self.index;
    var id: Token.Id = .eof;

    // State to resume once an escape sequence has been consumed.
    var return_state = state;
    // Digits seen so far (octal escapes) or digits still required (\u, \U).
    var counter: u32 = 0;
    var codepoint_len: u3 = undefined;
    while (self.index < self.buf.len) : (self.index += codepoint_len) {
        // Source files get checked for valid utf-8 before being tokenized so it is safe to use
        // these versions.
        codepoint_len = unicode.utf8ByteSequenceLength_unsafe(self.buf[self.index]);
        const c: u21 = switch (codepoint_len) {
            1 => @as(u21, self.buf[self.index]),
            2 => unicode.utf8Decode2_unsafe(self.buf[self.index..]),
            3 => unicode.utf8Decode3_unsafe(self.buf[self.index..]),
            4 => unicode.utf8Decode4_unsafe(self.buf[self.index..]),
            else => unreachable,
        };
        switch (state) {
            .start => switch (c) {
                '\n' => {
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                '"' => {
                    id = .string_literal;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal;
                    state = .char_literal_start;
                },
                // `u`, `U` and `L` may begin a prefixed literal, so they get
                // their own states instead of going straight to `.identifier`.
                'u' => state = .u,
                'U' => state = .U,
                'L' => state = .L,
                'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
                '=' => state = .equal,
                '!' => state = .bang,
                '|' => state = .pipe,
                '(' => {
                    id = .l_paren;
                    self.index += 1;
                    break;
                },
                ')' => {
                    id = .r_paren;
                    self.index += 1;
                    break;
                },
                '[' => {
                    id = .l_bracket;
                    self.index += 1;
                    break;
                },
                ']' => {
                    id = .r_bracket;
                    self.index += 1;
                    break;
                },
                ';' => {
                    id = .semicolon;
                    self.index += 1;
                    break;
                },
                ',' => {
                    id = .comma;
                    self.index += 1;
                    break;
                },
                '?' => {
                    id = .question_mark;
                    self.index += 1;
                    break;
                },
                ':' => state = .colon,
                '%' => state = .percent,
                '*' => state = .asterisk,
                '+' => state = .plus,
                '<' => state = .angle_bracket_left,
                '>' => state = .angle_bracket_right,
                '^' => state = .caret,
                '{' => {
                    id = .l_brace;
                    self.index += 1;
                    break;
                },
                '}' => {
                    id = .r_brace;
                    self.index += 1;
                    break;
                },
                '~' => {
                    id = .tilde;
                    self.index += 1;
                    break;
                },
                '.' => state = .period,
                '-' => state = .minus,
                '/' => state = .slash,
                '&' => state = .ampersand,
                '#' => state = .hash,
                '0'...'9' => state = .pp_num,
                '\t', '\x0B', '\x0C', ' ' => state = .whitespace,
                else => if (Token.mayAppearInIdent(self.comp, c, .start)) {
                    state = .extended_identifier;
                } else {
                    // Consume the whole codepoint so the next call makes progress.
                    id = .invalid;
                    self.index += codepoint_len;
                    break;
                },
            },
            .whitespace => switch (c) {
                '\t', '\x0B', '\x0C', ' ' => {},
                else => {
                    id = .whitespace;
                    break;
                },
            },
            .u => switch (c) {
                '8' => {
                    state = .u8;
                },
                '\'' => {
                    id = .char_literal_utf_16;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_16;
                    state = .string_literal;
                },
                else => {
                    // Not a literal prefix: re-examine `c` as part of an identifier.
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            .u8 => switch (c) {
                '\"' => {
                    id = .string_literal_utf_8;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal_utf_8;
                    state = .char_literal_start;
                },
                else => {
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            .U => switch (c) {
                '\'' => {
                    id = .char_literal_utf_32;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_32;
                    state = .string_literal;
                },
                else => {
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            .L => switch (c) {
                '\'' => {
                    id = .char_literal_wide;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_wide;
                    state = .string_literal;
                },
                else => {
                    codepoint_len = 0;
                    state = .identifier;
                },
            },
            .string_literal => switch (c) {
                '\\' => {
                    return_state = .string_literal;
                    state = .escape_sequence;
                },
                '"' => {
                    self.index += 1;
                    break;
                },
                '\n' => {
                    // Unterminated literal; the newline is left for the next call.
                    id = .invalid;
                    break;
                },
                '\r' => unreachable,
                else => {},
            },
            .char_literal_start => switch (c) {
                '\\' => {
                    return_state = .char_literal;
                    state = .escape_sequence;
                },

                // An empty literal (`''`) or a newline right after the opening quote.
                '\'', '\n' => {
                    id = .invalid;
                    break;
                },
                else => {
                    state = .char_literal;
                },
            },
            .char_literal => switch (c) {
                '\\' => {
                    return_state = .char_literal;
                    state = .escape_sequence;
                },
                '\'' => {
                    self.index += 1;
                    break;
                },
                '\n' => {
                    id = .invalid;
                    break;
                },
                else => {},
            },
            .escape_sequence => switch (c) {
                '\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => {
                    state = return_state;
                },
                '\n' => {
                    state = return_state;
                    self.line += 1;
                },
                '0'...'7' => {
                    counter = 1;
                    state = .octal_escape;
                },
                'x' => state = .hex_escape,
                'u' => {
                    counter = 4;
                    state = .unicode_escape;
                },
                'U' => {
                    counter = 8;
                    state = .unicode_escape;
                },
                else => {
                    id = .invalid;
                    break;
                },
            },
            .octal_escape => switch (c) {
                '0'...'7' => {
                    // At most three octal digits belong to the escape.
                    counter += 1;
                    if (counter == 3) state = return_state;
                },
                else => {
                    codepoint_len = 0;
                    state = return_state;
                },
            },
            .hex_escape => switch (c) {
                '0'...'9', 'a'...'f', 'A'...'F' => {},
                else => {
                    codepoint_len = 0;
                    state = return_state;
                },
            },
            .unicode_escape => switch (c) {
                '0'...'9', 'a'...'f', 'A'...'F' => {
                    // Exactly `counter` hex digits are required.
                    counter -= 1;
                    if (counter == 0) state = return_state;
                },
                else => {
                    id = .invalid;
                    break;
                },
            },
            .identifier, .extended_identifier => switch (c) {
                'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                else => {
                    if (!Token.mayAppearInIdent(self.comp, c, .inside)) {
                        id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
                        break;
                    }
                    state = .extended_identifier;
                },
            },
            .equal => switch (c) {
                '=' => {
                    id = .equal_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .equal;
                    break;
                },
            },
            .bang => switch (c) {
                '=' => {
                    id = .bang_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .bang;
                    break;
                },
            },
            .pipe => switch (c) {
                '=' => {
                    id = .pipe_equal;
                    self.index += 1;
                    break;
                },
                '|' => {
                    id = .pipe_pipe;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .pipe;
                    break;
                },
            },
            .colon => switch (c) {
                '>' => {
                    // `:>` is the digraph spelling of `]`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .r_bracket;
                        self.index += 1;
                    } else {
                        id = .colon;
                    }
                    break;
                },
                ':' => {
                    if (self.comp.langopts.standard.atLeast(.c2x)) {
                        id = .colon_colon;
                        self.index += 1;
                        break;
                    } else {
                        id = .colon;
                        break;
                    }
                },
                else => {
                    id = .colon;
                    break;
                },
            },
            .percent => switch (c) {
                '=' => {
                    id = .percent_equal;
                    self.index += 1;
                    break;
                },
                '>' => {
                    // `%>` is the digraph spelling of `}`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .r_brace;
                        self.index += 1;
                    } else {
                        id = .percent;
                    }
                    break;
                },
                ':' => {
                    // `%:` is the digraph spelling of `#`.
                    if (self.comp.langopts.hasDigraphs()) {
                        state = .hash_digraph;
                    } else {
                        id = .percent;
                        break;
                    }
                },
                else => {
                    id = .percent;
                    break;
                },
            },
            .asterisk => switch (c) {
                '=' => {
                    id = .asterisk_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .asterisk;
                    break;
                },
            },
            .plus => switch (c) {
                '=' => {
                    id = .plus_equal;
                    self.index += 1;
                    break;
                },
                '+' => {
                    id = .plus_plus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .plus;
                    break;
                },
            },
            .angle_bracket_left => switch (c) {
                '<' => state = .angle_bracket_angle_bracket_left,
                '=' => {
                    id = .angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                ':' => {
                    // `<:` is the digraph spelling of `[`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .l_bracket;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                '%' => {
                    // `<%` is the digraph spelling of `{`.
                    if (self.comp.langopts.hasDigraphs()) {
                        id = .l_brace;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                else => {
                    id = .angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_angle_bracket_left => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_right => switch (c) {
                '>' => state = .angle_bracket_angle_bracket_right,
                '=' => {
                    id = .angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_right;
                    break;
                },
            },
            .angle_bracket_angle_bracket_right => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_right;
                    break;
                },
            },
            .caret => switch (c) {
                '=' => {
                    id = .caret_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .caret;
                    break;
                },
            },
            .period => switch (c) {
                '.' => state = .period2,
                '0'...'9' => state = .pp_num,
                else => {
                    id = .period;
                    break;
                },
            },
            .period2 => switch (c) {
                '.' => {
                    id = .ellipsis;
                    self.index += 1;
                    break;
                },
                else => {
                    // Only two periods: emit the first and re-tokenize the second.
                    id = .period;
                    self.index -= 1;
                    break;
                },
            },
            .minus => switch (c) {
                '>' => {
                    id = .arrow;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .minus_equal;
                    self.index += 1;
                    break;
                },
                '-' => {
                    id = .minus_minus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .minus;
                    break;
                },
            },
            .ampersand => switch (c) {
                '&' => {
                    id = .ampersand_ampersand;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .ampersand_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .ampersand;
                    break;
                },
            },
            .hash => switch (c) {
                '#' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    break;
                },
            },
            .hash_digraph => switch (c) {
                '%' => state = .hash_hash_digraph_partial,
                else => {
                    id = .hash;
                    break;
                },
            },
            .hash_hash_digraph_partial => switch (c) {
                ':' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    self.index -= 1; // re-tokenize the percent
                    break;
                },
            },
            .slash => switch (c) {
                '/' => state = .line_comment,
                '*' => state = .multi_line_comment,
                '=' => {
                    id = .slash_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .slash;
                    break;
                },
            },
            .line_comment => switch (c) {
                '\n' => {
                    // Step back so the loop's increment lands on the newline
                    // again, this time in the `.start` state.
                    self.index -= 1;
                    state = .start;
                },
                else => {},
            },
            .multi_line_comment => switch (c) {
                '*' => state = .multi_line_comment_asterisk,
                '\n' => self.line += 1,
                else => {},
            },
            .multi_line_comment_asterisk => switch (c) {
                '/' => state = .multi_line_comment_done,
                '\n' => {
                    self.line += 1;
                    state = .multi_line_comment;
                },
                '*' => {},
                else => state = .multi_line_comment,
            },
            .multi_line_comment_done => switch (c) {
                '\n' => {
                    start = self.index;
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                '\r' => unreachable,
                '\t', '\x0B', '\x0C', ' ' => {
                    start = self.index;
                    state = .whitespace;
                },
                else => {
                    // The comment itself is reported as a whitespace token.
                    id = .whitespace;
                    break;
                },
            },
            .pp_num => switch (c) {
                'a'...'d',
                'A'...'D',
                'f'...'o',
                'F'...'O',
                'q'...'z',
                'Q'...'Z',
                '0'...'9',
                '_',
                '.',
                => {},
                'e', 'E', 'p', 'P' => state = .pp_num_exponent,
                '\'' => if (self.comp.langopts.standard.atLeast(.c2x)) {
                    state = .pp_num_digit_separator;
                } else {
                    id = .pp_num;
                    break;
                },
                else => {
                    id = .pp_num;
                    break;
                },
            },
            .pp_num_digit_separator => switch (c) {
                // Any digit or nondigit may follow a separator, including
                // `e`/`E`/`p`/`P` (e.g. the hex digit in `0xffff'eeee`). These go
                // to `.pp_num` rather than `.pp_num_exponent` because the
                // pp-number grammar has no `' e sign` production.
                'a'...'z',
                'A'...'Z',
                '0'...'9',
                '_',
                => state = .pp_num,
                else => {
                    // The `'` was not a separator: hand it back.
                    self.index -= 1;
                    id = .pp_num;
                    break;
                },
            },
            .pp_num_exponent => switch (c) {
                'a'...'z',
                'A'...'Z',
                '0'...'9',
                '_',
                '.',
                '+',
                '-',
                => state = .pp_num,
                // A separator may directly follow `e`/`E`/`p`/`P`, e.g. `0x1e'5`;
                // treat it exactly as the `.pp_num` state does.
                '\'' => if (self.comp.langopts.standard.atLeast(.c2x)) {
                    state = .pp_num_digit_separator;
                } else {
                    id = .pp_num;
                    break;
                },
                else => {
                    id = .pp_num;
                    break;
                },
            },
        }
    } else if (self.index == self.buf.len) {
        // Input ran out: finish whatever token was in progress.
        switch (state) {
            .start, .line_comment => {},
            .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]),
            .extended_identifier => id = .extended_identifier,
            .period2,
            .string_literal,
            .char_literal_start,
            .char_literal,
            .escape_sequence,
            .octal_escape,
            .hex_escape,
            .unicode_escape,
            .multi_line_comment,
            .multi_line_comment_asterisk,
            => id = .invalid,

            .whitespace => id = .whitespace,
            .multi_line_comment_done => id = .whitespace,

            .equal => id = .equal,
            .bang => id = .bang,
            .minus => id = .minus,
            .slash => id = .slash,
            .ampersand => id = .ampersand,
            .hash => id = .hash,
            .period => id = .period,
            .pipe => id = .pipe,
            .angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right,
            .angle_bracket_right => id = .angle_bracket_right,
            .angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left,
            .angle_bracket_left => id = .angle_bracket_left,
            .plus => id = .plus,
            .colon => id = .colon,
            .percent => id = .percent,
            .caret => id = .caret,
            .asterisk => id = .asterisk,
            .hash_digraph => id = .hash,
            .hash_hash_digraph_partial => {
                id = .hash;
                self.index -= 1; // re-tokenize the percent
            },
            .pp_num, .pp_num_exponent => id = .pp_num,
            .pp_num_digit_separator => {
                // Same as the in-loop fallback: a trailing `'` is not part of
                // the number, so it is handed back to the next call.
                id = .pp_num;
                self.index -= 1;
            },
        }
    }

    return .{
        .id = id,
        .start = start,
        .end = self.index,
        .line = self.line,
        .source = self.source,
    };
}
|
|
|
|
/// Returns the next token whose id is not `.whitespace`, discarding any
/// whitespace tokens that `next` produces before it.
pub fn nextNoWS(self: *Tokenizer) Token {
    while (true) {
        const candidate = self.next();
        if (candidate.id != .whitespace) return candidate;
    }
}
|
|
|
|
// Every operator/punctuator spelling; the expected ids are grouped one row
// per input line, each row ending with that line's `.nl`.
test "operators" {
    try expectTokens(
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\ <<= > >= >> >>= ~ # ##
        \\
    , &.{
        // ! != | || |= = ==
        .bang, .bang_equal, .pipe, .pipe_pipe, .pipe_equal, .equal, .equal_equal, .nl,
        // ( ) { } [ ] . .. ...   (".." is two separate periods)
        .l_paren, .r_paren, .l_brace, .r_brace, .l_bracket, .r_bracket, .period, .period, .period, .ellipsis, .nl,
        // ^ ^= + ++ += - -- -=
        .caret, .caret_equal, .plus, .plus_plus, .plus_equal, .minus, .minus_minus, .minus_equal, .nl,
        // * *= % %= -> : ; / /=
        .asterisk, .asterisk_equal, .percent, .percent_equal, .arrow, .colon, .semicolon, .slash, .slash_equal, .nl,
        // , & && &= ? < <= <<
        .comma, .ampersand, .ampersand_ampersand, .ampersand_equal, .question_mark, .angle_bracket_left, .angle_bracket_left_equal, .angle_bracket_angle_bracket_left, .nl,
        // <<= > >= >> >>= ~ # ##
        .angle_bracket_angle_bracket_left_equal, .angle_bracket_right, .angle_bracket_right_equal, .angle_bracket_angle_bracket_right, .angle_bracket_angle_bracket_right_equal, .tilde, .hash, .hash_hash, .nl,
    });
}
|
|
|
|
// Keyword spellings recognized under the default language options; expected
// ids are grouped one row per input line, each row ending with `.nl`.
test "keywords" {
    try expectTokens(
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    , &.{
        .keyword_auto, .keyword_auto_type, .keyword_break, .keyword_case, .keyword_char, .keyword_const, .keyword_continue, .keyword_default, .keyword_do, .nl,
        .keyword_double, .keyword_else, .keyword_enum, .keyword_extern, .keyword_float, .keyword_for, .keyword_goto, .keyword_if, .keyword_int, .nl,
        .keyword_long, .keyword_register, .keyword_return, .keyword_short, .keyword_signed, .keyword_sizeof, .keyword_static, .nl,
        .keyword_struct, .keyword_switch, .keyword_typedef, .keyword_union, .keyword_unsigned, .keyword_void, .keyword_volatile, .nl,
        .keyword_while, .keyword_bool, .keyword_complex, .keyword_imaginary, .keyword_inline, .keyword_restrict, .keyword_alignas, .nl,
        .keyword_alignof, .keyword_atomic, .keyword_generic, .keyword_noreturn, .keyword_static_assert, .keyword_thread_local, .nl,
        .keyword_attribute1, .keyword_attribute2, .nl,
    });
}
|
|
|
|
// Each directive line is expected to yield `#`, the directive keyword and a newline.
test "preprocessor keywords" {
    try expectTokens(
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    , &.{
        .hash, .keyword_include, .nl,
        .hash, .keyword_include_next, .nl,
        .hash, .keyword_embed, .nl,
        .hash, .keyword_define, .nl,
        .hash, .keyword_ifdef, .nl,
        .hash, .keyword_ifndef, .nl,
        .hash, .keyword_error, .nl,
        .hash, .keyword_pragma, .nl,
    });
}
|
|
|
|
// A backslash at the end of a line must not produce a `.nl` token, both
// between tokens and inside a string literal.
test "line continuation" {
    try expectTokens(
        \\#define foo \
        \\ bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    , &.{
        // #define foo \<newline> bar
        .hash, .keyword_define, .identifier, .identifier, .nl,
        // "foo\<newline> bar"
        .string_literal, .nl,
        // #define "foo" <newline> "bar"   (no continuation: two lines)
        .hash, .keyword_define, .string_literal, .nl,
        .string_literal, .nl,
        // #define "foo" \<newline> "bar"  (input ends without a newline)
        .hash, .keyword_define, .string_literal, .string_literal,
    });
}
|
|
|
|
// String and character literals with every encoding prefix, one per line.
test "string prefix" {
    try expectTokens(
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    , &.{
        // string literals
        .string_literal, .nl,
        .string_literal_utf_16, .nl,
        .string_literal_utf_8, .nl,
        .string_literal_utf_32, .nl,
        .string_literal_wide, .nl,
        // character literals
        .char_literal, .nl,
        .char_literal_utf_8, .nl,
        .char_literal_utf_16, .nl,
        .char_literal_utf_32, .nl,
        .char_literal_wide, .nl,
    });
}
|
|
|
|
// Every numeric spelling, whatever its suffix, is expected to come out as a
// single `.pp_num`; rows mirror the input lines.
test "num suffixes" {
    try expectTokens(
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    , &.{
        .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        .pp_num, .pp_num, .nl,
        .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        .pp_num, .pp_num, .pp_num, .pp_num, .nl,
    });
}
|
|
|
|
// A line comment produces no token of its own; only the newline that ends it
// is reported, followed by the tokens of the next line.
test "comments" {
    try expectTokens(
        \\//foo
        \\#foo
    , &.{ .nl, .hash, .identifier });
}
|
|
|
|
// Identifiers containing characters outside the basic character set, on their
// own and directly after literal prefixes, numbers and escape sequences.
test "extended identifiers" {
    const ext_only: []const Token.Id = &.{.extended_identifier};
    const num_then_ext: []const Token.Id = &.{ .pp_num, .extended_identifier };
    const one_string: []const Token.Id = &.{.string_literal};

    // Plain and after the `u`, `u8`, `U`, `L` prefix letters.
    try expectTokens("𝓪𝓻𝓸𝓬𝓬", ext_only);
    try expectTokens("u𝓪𝓻𝓸𝓬𝓬", ext_only);
    try expectTokens("u8𝓪𝓻𝓸𝓬𝓬", ext_only);
    try expectTokens("U𝓪𝓻𝓸𝓬𝓬", ext_only);
    try expectTokens("L𝓪𝓻𝓸𝓬𝓬", ext_only);

    // An extended character ends a preceding number or run of periods.
    try expectTokens("1™", num_then_ext);
    try expectTokens("1.™", num_then_ext);
    try expectTokens("..™", &.{ .period, .period, .extended_identifier });
    try expectTokens("0™", num_then_ext);
    try expectTokens("0b\u{E0000}", num_then_ext);
    try expectTokens("0b0\u{E0000}", num_then_ext);
    try expectTokens("01\u{E0000}", num_then_ext);
    try expectTokens("010\u{E0000}", num_then_ext);
    try expectTokens("0x\u{E0000}", num_then_ext);
    try expectTokens("0x0\u{E0000}", num_then_ext);

    // Inside string literals: after octal/hex escapes the character is just
    // string content, but it cannot stand in for the digits of `\u`.
    try expectTokens("\"\\0\u{E0000}\"", one_string);
    try expectTokens("\"\\x\u{E0000}\"", one_string);
    try expectTokens("\"\\u\u{E0000}\"", &.{ .invalid, .extended_identifier, .invalid });

    // After an exponent marker.
    try expectTokens("1e\u{E0000}", num_then_ext);
    try expectTokens("1e1\u{E0000}", num_then_ext);
}
|
|
|
|
// Digraph spellings map to their punctuator ids, are inert inside string
// literals, and an incomplete `%:%` falls back to `#` followed by `%`.
test "digraphs" {
    const all_digraphs: []const Token.Id = &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash };
    try expectTokens("%:<::><%%>%:%:", all_digraphs);

    try expectTokens("\"%:<::><%%>%:%:\"", &.{.string_literal});

    const partial_hash_hash: []const Token.Id = &.{ .hash, .percent, .pp_num, .hash, .percent };
    try expectTokens("%:%42 %:%", partial_hash_hash);
}
|
|
|
|
// Spellings that are only keywords when the standard is set to `.c2x`.
test "C23 keywords" {
    const input = "true false alignas alignof bool static_assert thread_local nullptr";
    const expected: []const Token.Id = &.{
        .keyword_true,       .keyword_false,
        .keyword_c23_alignas, .keyword_c23_alignof,
        .keyword_c23_bool,   .keyword_c23_static_assert,
        .keyword_c23_thread_local, .keyword_nullptr,
    };
    try expectTokensExtra(input, expected, .c2x);
}
|
|
|
|
/// Test helper: tokenizes `contents` and checks that the ids of its
/// non-whitespace tokens equal `expected_tokens`, in order, and that the very
/// next token after them is `.eof`.
///
/// `standard`, when non-null, replaces the language standard of the temporary
/// `Compilation` used for the run. On the first mismatch the expected and
/// actual tag names are printed and `error.TokensDoNotEqual` is returned.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, standard: ?LangOpts.Standard) !void {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();
    if (standard) |provided| comp.langopts.standard = provided;

    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer: Tokenizer = .{
        .buf = source.buf,
        .source = source.id,
        .comp = &comp,
    };

    for (expected_tokens) |expected_id| {
        // Whitespace tokens are never part of the expectation list.
        var token = tokenizer.next();
        while (token.id == .whitespace) token = tokenizer.next();

        if (token.id != expected_id) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(expected_id), @tagName(token.id) });
            return error.TokensDoNotEqual;
        }
    }

    // Nothing may follow the expected tokens.
    const last_token = tokenizer.next();
    try std.testing.expect(last_token.id == .eof);
}
|
|
|
|
/// Test helper: same as `expectTokensExtra` with no language-standard
/// override (`standard` is passed as `null`).
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    try expectTokensExtra(contents, expected_tokens, null);
}
|