std.zig.tokenizer: simplify

I pointed a fuzzer at the tokenizer and it crashed immediately. Upon
inspection, I was dissatisfied with the implementation. This commit
removes several mechanisms:
* Removes the "invalid byte" compile error note.
* Dramatically simplifies tokenizer recovery by making recovery always
  occur at newlines, and never otherwise.
* Removes UTF-8 validation.
* Moves some character validation logic to `std.zig.parseCharLiteral`.

Removing UTF-8 validation is a regression of #663, however, the existing
implementation was already buggy. When adding this functionality back,
it must be fuzz-tested while checking the property that it matches an
independent Unicode validation implementation on the same file. While
we're at it, fuzzing should check the other properties of that proposal,
such as no ASCII control characters existing inside the source code.

Other changes included in this commit:

* Deprecate `std.unicode.utf8Decode` and its WTF-8 counterpart. This
  function has an awkward API that is too easy to misuse.
* Make `utf8Decode2` and friends use arrays as parameters, eliminating a
  runtime assertion in favor of using the type system.

After this commit, the crashing input found by fuzzing, which was
"\x07\xd5\x80\xc3=o\xda|a\xfc{\x9a\xec\x91\xdf\x0f\\\x1a^\xbe;\x8c\xbf\xee\xea"
no longer causes a crash. However, I did not feel the need to add this
test case because the simplified logic eradicates most crashes of this
nature.
This commit is contained in:
Andrew Kelley
2024-07-31 11:51:19 -07:00
parent c08effc20a
commit 377e8579f9
12 changed files with 246 additions and 404 deletions
+6 -12
View File
@@ -42,8 +42,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
":1:13: error: expected expression, found 'invalid bytes'",
":1:19: note: invalid byte: '\\r'",
":1:13: error: expected expression, found 'invalid token'",
});
}
@@ -179,8 +178,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
\\ return true;
\\}
, &[_][]const u8{
":1:1: error: expected type expression, found 'invalid bytes'",
":1:1: note: invalid byte: '\\xff'",
":1:1: error: expected type expression, found 'invalid token'",
});
}
@@ -222,8 +220,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte in string", b.graph.host);
case.addError("_ = \"\x01Q\";", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:6: note: invalid byte: '\\x01'",
":1:5: error: expected expression, found 'invalid token'",
});
}
@@ -231,8 +228,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte in comment", b.graph.host);
case.addError("//\x01Q", &[_][]const u8{
":1:1: error: expected type expression, found 'invalid bytes'",
":1:3: note: invalid byte: '\\x01'",
":1:1: error: expected type expression, found 'invalid token'",
});
}
@@ -240,8 +236,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("control character in character literal", b.graph.host);
case.addError("const c = '\x01';", &[_][]const u8{
":1:11: error: expected expression, found 'invalid bytes'",
":1:12: note: invalid byte: '\\x01'",
":1:11: error: expected expression, found 'invalid token'",
});
}
@@ -249,8 +244,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte at start of token", b.graph.host);
case.addError("x = \x00Q", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:5: note: invalid byte: '\\x00'",
":1:5: error: expected expression, found 'invalid token'",
});
}
}