From db2cc31279b2552c9ba7aa1ac79122a53a5d5fa0 Mon Sep 17 00:00:00 2001 From: Juliano Martinez Date: Mon, 11 May 2026 10:01:41 -0700 Subject: [PATCH] extend simple load fast path coverage Handle scalar tags and nested block mappings in the strict simple load path while preserving fallback diagnostics, tag validation, and alias fallback behavior. Cap parser event preallocation to avoid over-allocating for token-heavy inputs. --- src/api/load.zig | 468 ++++++++++++++++++++++++++++++++++-------- src/parser/parser.zig | 2 +- 2 files changed, 389 insertions(+), 81 deletions(-) diff --git a/src/api/load.zig b/src/api/load.zig index 44f406f7..a29191da 100644 --- a/src/api/load.zig +++ b/src/api/load.zig @@ -15,7 +15,9 @@ const parse = @import("parse.zig"); const loader = @import("../loader/loader.zig"); const scanner = @import("../scanner/scanner.zig"); const schema = @import("../schema/schema.zig"); +const schema_tag = @import("../schema/tag.zig"); const simple_fast_path = @import("../parser/simple_fast_path.zig"); +const parser_tag = @import("../parser/tag.zig"); const value = @import("../value/value.zig"); const Error = diagnostics.Error; @@ -183,6 +185,26 @@ const StrictShapeStats = struct { max_nesting_depth: usize, max_scalar_bytes: usize, node_count: usize, + root_child_count: usize = 0, + nested_mapping_values: bool = false, +}; + +const StrictScalarToken = struct { + value: []const u8, + tag: ?[]const u8 = null, +}; + +const StrictScalarRole = enum { + key, + value, +}; + +const StrictNodeStats = struct { + event_count: usize, + max_nesting_depth: usize, + max_scalar_bytes: usize = 0, + node_count: usize, + root_child_count: usize = 0, }; fn mayBeStrictSimpleInput(input: []const u8) bool { @@ -198,8 +220,7 @@ fn mayBeStrictSimpleInput(input: []const u8) bool { if (trimmed[0] == '#') return false; if (trimmed[0] == '%') return false; if (std.mem.startsWith(u8, trimmed, "---") or std.mem.startsWith(u8, trimmed, "...")) return false; - if (std.mem.indexOfAny(u8, trimmed, "&*!'\"[]{}|>%#") != null) return false; - if (mappingValueIsOmitted(trimmed)) return false; + if (std.mem.indexOfAny(u8, trimmed, "&*'\"[]{}|>%#") != null) return false; if (line.len != trimmed.len and !std.mem.containsAtLeast(u8, trimmed, 1, ":")) return false; } @@ -208,15 +229,6 @@ fn mayBeStrictSimpleInput(input: []const u8) bool { return true; } -fn mappingValueIsOmitted(line: []const u8) bool { - if (std.mem.indexOfScalar(u8, line, ':')) |colon| { - var cursor = colon + 1; - while (cursor < line.len and (line[cursor] == ' ' or line[cursor] == '\t')) : (cursor += 1) {} - return cursor == line.len; - } - return false; -} - fn strictShapeStats(shape: StrictSimpleShape) StrictShapeStats { return switch (shape) { inline else => |stats| stats, @@ -231,18 +243,28 @@ fn strictSimpleShape(tokens: []const scanner.Token) ?StrictSimpleShape { } fn strictSimpleScalarShape(tokens: []const scanner.Token) ?StrictShapeStats { - if (tokens.len != 4 or tokens[0] != .stream_start or tokens[1] != .indent or tokens[1].indent != 0 or tokens[2] != .scalar or tokens[3] != .stream_end) return null; - if (!simple_fast_path.isSimplePlainScalarToken(tokens[2].scalar)) return null; + if (tokens.len < 4 or tokens[0] != .stream_start or tokens[tokens.len - 1] != .stream_end) return null; + var index: usize = 1; + if (tokens[index] != .indent or tokens[index].indent != 0) return null; + index += 1; + + const scalar_token = readStrictScalarToken(tokens, &index, tokens.len - 1, .value) orelse return null; + if (index != tokens.len - 1) return null; return .{ .event_count = 5, .max_nesting_depth = 0, - .max_scalar_bytes = tokens[2].scalar.len, + .max_scalar_bytes = scalar_token.value.len, .node_count = 1, }; } fn strictSimpleBlockMappingShape(tokens: []const scanner.Token) ?StrictShapeStats { + if (strictFlatBlockMappingShape(tokens)) |stats| return stats; + return strictNestedBlockMappingShape(tokens); +} + +fn strictFlatBlockMappingShape(tokens: []const scanner.Token) ?StrictShapeStats { if (tokens.len < 6 or tokens[0] != .stream_start or tokens[tokens.len - 1] != .stream_end) return null; var pair_count: usize = 0; @@ -252,16 +274,14 @@ fn strictSimpleBlockMappingShape(tokens: []const scanner.Token) ?StrictShapeStat if (tokens[index] != .indent or tokens[index].indent != 0) return null; index += 1; - if (index >= tokens.len - 1 or tokens[index] != .scalar or !simple_fast_path.isSimpleBlockMappingKeyToken(tokens[index].scalar)) return null; - max_scalar_bytes = @max(max_scalar_bytes, tokens[index].scalar.len); - index += 1; + const key = readStrictScalarToken(tokens, &index, tokens.len - 1, .key) orelse return null; + max_scalar_bytes = @max(max_scalar_bytes, key.value.len); if (index >= tokens.len - 1 or tokens[index] != .block_mapping_value) return null; index += 1; - if (index >= tokens.len - 1 or tokens[index] != .scalar or !simple_fast_path.isSimplePlainScalarToken(tokens[index].scalar)) return null; - max_scalar_bytes = @max(max_scalar_bytes, tokens[index].scalar.len); - index += 1; + const scalar_value = readStrictScalarToken(tokens, &index, tokens.len - 1, .value) orelse return null; + max_scalar_bytes = @max(max_scalar_bytes, scalar_value.value.len); pair_count += 1; } @@ -272,9 +292,85 @@ fn strictSimpleBlockMappingShape(tokens: []const scanner.Token) ?StrictShapeStat .max_nesting_depth = 1, .max_scalar_bytes = max_scalar_bytes, .node_count = 1 + pair_count * 2, + .root_child_count = pair_count, }; } +fn strictNestedBlockMappingShape(tokens: []const scanner.Token) ?StrictShapeStats { + if (tokens.len < 6 or tokens[0] != .stream_start or tokens[tokens.len - 1] != .stream_end) return null; + + var index: usize = 1; + const mapping = strictBlockMappingNodeStats(tokens, &index, tokens.len - 1, 0, 1) orelse return null; + if (index != tokens.len - 1) return null; + + return .{ + .event_count = 4 + mapping.event_count, + .max_nesting_depth = mapping.max_nesting_depth, + .max_scalar_bytes = mapping.max_scalar_bytes, + .node_count = mapping.node_count, + .root_child_count = mapping.root_child_count, + .nested_mapping_values = true, + }; +} + +fn strictBlockMappingNodeStats( + tokens: []const scanner.Token, + index: *usize, + end: usize, + mapping_indent: usize, + depth: usize, +) ?StrictNodeStats { + var stats: StrictNodeStats = .{ + .event_count = 2, + .max_nesting_depth = depth, + .node_count = 1, + }; + var pair_count: usize = 0; + + while (index.* < end) { + if (tokens[index.*] != .indent) break; + const indent = tokens[index.*].indent; + if (indent < mapping_indent) break; + if (indent != mapping_indent) return null; + index.* += 1; + + const key = readStrictScalarToken(tokens, index, end, .key) orelse return null; + includeStrictScalarStats(&stats, key); + + if (index.* >= end or tokens[index.*] != .block_mapping_value) return null; + index.* += 1; + + var value_index = index.*; + if (readStrictScalarToken(tokens, &value_index, end, .value)) |scalar_value| { + index.* = value_index; + includeStrictScalarStats(&stats, scalar_value); + } else { + if (index.* >= end or tokens[index.*] != .indent or tokens[index.*].indent <= mapping_indent) return null; + const nested = strictBlockMappingNodeStats(tokens, index, end, tokens[index.*].indent, depth + 1) orelse return null; + includeStrictNodeStats(&stats, nested); + } + + pair_count += 1; + } + + if (pair_count == 0) return null; + stats.root_child_count = pair_count; + return stats; +} + +fn includeStrictScalarStats(stats: *StrictNodeStats, scalar_token: StrictScalarToken) void { + stats.event_count += 1; + stats.node_count += 1; + stats.max_scalar_bytes = @max(stats.max_scalar_bytes, scalar_token.value.len); +} + +fn includeStrictNodeStats(stats: *StrictNodeStats, node_stats: StrictNodeStats) void { + stats.event_count += node_stats.event_count; + stats.node_count += node_stats.node_count; + stats.max_scalar_bytes = @max(stats.max_scalar_bytes, node_stats.max_scalar_bytes); + stats.max_nesting_depth = @max(stats.max_nesting_depth, node_stats.max_nesting_depth); +} + fn strictSimpleBlockSequenceShape(tokens: []const scanner.Token) ?StrictShapeStats { if (tokens.len < 4 or tokens[0] != .stream_start or tokens[tokens.len - 1] != .stream_end) return null; @@ -290,19 +386,18 @@ fn strictSimpleBlockSequenceShape(tokens: []const scanner.Token) ?StrictShapeSta if (index >= tokens.len - 1 or tokens[index] != .block_sequence_entry) return null; index += 1; - if (index >= tokens.len - 1 or tokens[index] != .scalar or !simple_fast_path.isSimplePlainScalarToken(tokens[index].scalar)) return null; - if (index + 1 < tokens.len - 1 and tokens[index + 1] == .block_mapping_value) { - if (!simple_fast_path.isSimpleBlockMappingKeyToken(tokens[index].scalar)) return null; + const first = readStrictScalarToken(tokens, &index, tokens.len - 1, .value) orelse return null; + if (index < tokens.len - 1 and tokens[index] == .block_mapping_value) { + if (!simple_fast_path.isSimpleBlockMappingKeyToken(first.value)) return null; max_nesting_depth = 2; item_event_count += 2; node_count += 3; - max_scalar_bytes = @max(max_scalar_bytes, tokens[index].scalar.len); - index += 2; + max_scalar_bytes = @max(max_scalar_bytes, first.value.len); + index += 1; - if (index >= tokens.len - 1 or tokens[index] != .scalar or !simple_fast_path.isSimplePlainScalarToken(tokens[index].scalar)) return null; + const first_value = readStrictScalarToken(tokens, &index, tokens.len - 1, .value) orelse return null; item_event_count += 2; - max_scalar_bytes = @max(max_scalar_bytes, tokens[index].scalar.len); - index += 1; + max_scalar_bytes = @max(max_scalar_bytes, first_value.value.len); var mapping_indent: ?usize = null; while (index < tokens.len - 1) { @@ -316,22 +411,19 @@ fn strictSimpleBlockSequenceShape(tokens: []const scanner.Token) ?StrictShapeSta } index += 1; - if (index >= tokens.len - 1 or tokens[index] != .scalar or !simple_fast_path.isSimpleBlockMappingKeyToken(tokens[index].scalar)) return null; - max_scalar_bytes = @max(max_scalar_bytes, tokens[index].scalar.len); - index += 1; + const key = readStrictScalarToken(tokens, &index, tokens.len - 1, .key) orelse return null; + max_scalar_bytes = @max(max_scalar_bytes, key.value.len); if (index >= tokens.len - 1 or tokens[index] != .block_mapping_value) return null; index += 1; - if (index >= tokens.len - 1 or tokens[index] != .scalar or !simple_fast_path.isSimplePlainScalarToken(tokens[index].scalar)) return null; - max_scalar_bytes = @max(max_scalar_bytes, tokens[index].scalar.len); + const scalar_value = readStrictScalarToken(tokens, &index, tokens.len - 1, .value) orelse return null; + max_scalar_bytes = @max(max_scalar_bytes, scalar_value.value.len); item_event_count += 2; node_count += 2; - index += 1; } } else { - max_scalar_bytes = @max(max_scalar_bytes, tokens[index].scalar.len); + max_scalar_bytes = @max(max_scalar_bytes, first.value.len); item_event_count += 1; node_count += 1; - index += 1; } item_count += 1; @@ -343,9 +435,45 @@ fn strictSimpleBlockSequenceShape(tokens: []const scanner.Token) ?StrictShapeSta .max_nesting_depth = max_nesting_depth, .max_scalar_bytes = max_scalar_bytes, .node_count = node_count, + .root_child_count = item_count, }; } +fn readStrictScalarToken(tokens: []const scanner.Token, index: *usize, end: usize, role: StrictScalarRole) ?StrictScalarToken { + var cursor = index.*; + if (cursor >= end) return null; + + const tag_value = if (tokens[cursor] == .tag) value: { + if (!isStrictSimpleTagToken(tokens[cursor].tag)) return null; + const raw = tokens[cursor].tag; + cursor += 1; + break :value raw; + } else null; + + if (cursor >= end or tokens[cursor] != .scalar) return null; + const scalar_value = tokens[cursor].scalar; + const supported = switch (role) { + .key => simple_fast_path.isSimpleBlockMappingKeyToken(scalar_value), + .value => simple_fast_path.isSimplePlainScalarToken(scalar_value), + }; + if (!supported) return null; + + index.* = cursor + 1; + return .{ .value = scalar_value, .tag = tag_value }; +} + +fn isStrictSimpleTagToken(raw: []const u8) bool { + if (raw.len == 0 or raw[0] != '!') return false; + if (std.mem.eql(u8, raw, "!")) return true; + if (std.mem.indexOfAny(u8, raw, "<>%")) |_| return false; + + if (std.mem.startsWith(u8, raw, "!!")) { + return raw.len > 2 and std.mem.indexOfScalar(u8, raw[2..], '!') == null; + } + + return std.mem.indexOfScalar(u8, raw[1..], '!') == null; +} + fn checkStrictShapeLimits(input: []const u8, options: LoadOptions, shape: StrictSimpleShape) Error!void { const stats = strictShapeStats(shape); @@ -379,24 +507,34 @@ const StrictSimpleBuilder = struct { fn construct(self: *StrictSimpleBuilder, tokens: []const scanner.Token, shape: StrictSimpleShape) Error!*const value.Node { return switch (shape) { - .scalar => self.constructPlainScalar(tokens[2].scalar), - .mapping => self.constructBlockMapping(tokens), - .sequence => self.constructBlockSequence(tokens), + .scalar => self.constructRootScalar(tokens), + .mapping => |stats| if (stats.nested_mapping_values) + self.constructNestedBlockMapping(tokens, stats.root_child_count) + else + self.constructFlatBlockMapping(tokens, stats.root_child_count), + .sequence => |stats| self.constructBlockSequence(tokens, stats.root_child_count), }; } - fn constructBlockMapping(self: *StrictSimpleBuilder, tokens: []const scanner.Token) Error!*const value.Node { - const pair_count = (tokens.len - 2) / 4; + fn constructRootScalar(self: *StrictSimpleBuilder, tokens: []const scanner.Token) Error!*const value.Node { + var index: usize = 2; + const scalar_token = readStrictScalarToken(tokens, &index, tokens.len - 1, .value) orelse return ParseError.InvalidSyntax; + return self.constructPlainScalar(scalar_token); + } + + fn constructFlatBlockMapping(self: *StrictSimpleBuilder, tokens: []const scanner.Token, pair_count: usize) Error!*const value.Node { const pairs = try self.arena_allocator.alloc(value.MappingPair, pair_count); var pair_index: usize = 0; var token_index: usize = 1; while (token_index < tokens.len - 1) : (pair_index += 1) { token_index += 1; - const key = try self.constructPlainScalar(tokens[token_index].scalar); - token_index += 2; - const node_value = try self.constructPlainScalar(tokens[token_index].scalar); + const key_token = readStrictScalarToken(tokens, &token_index, tokens.len - 1, .key) orelse return ParseError.InvalidSyntax; + const key = try self.constructPlainScalar(key_token); + if (token_index >= tokens.len - 1 or tokens[token_index] != .block_mapping_value) return ParseError.InvalidSyntax; token_index += 1; + const value_token = readStrictScalarToken(tokens, &token_index, tokens.len - 1, .value) orelse return ParseError.InvalidSyntax; + const node_value = try self.constructPlainScalar(value_token); pairs[pair_index] = .{ .key = key, .value = node_value }; } @@ -407,19 +545,71 @@ const StrictSimpleBuilder = struct { return node; } - fn constructBlockSequence(self: *StrictSimpleBuilder, tokens: []const scanner.Token) Error!*const value.Node { - const item_count = strictSequenceItemCount(tokens); + fn constructNestedBlockMapping(self: *StrictSimpleBuilder, tokens: []const scanner.Token, pair_count: usize) Error!*const value.Node { + var token_index: usize = 1; + return self.constructBlockMappingAt(tokens, &token_index, tokens.len - 1, 0, pair_count); + } + + fn constructBlockMappingAt( + self: *StrictSimpleBuilder, + tokens: []const scanner.Token, + index: *usize, + end: usize, + mapping_indent: usize, + pair_capacity: usize, + ) Error!*const value.Node { + var pairs: std.ArrayList(value.MappingPair) = .empty; + errdefer pairs.deinit(self.arena_allocator); + try pairs.ensureTotalCapacity(self.arena_allocator, pair_capacity); + + while (index.* < end) { + if (tokens[index.*] != .indent) break; + const indent = tokens[index.*].indent; + if (indent < mapping_indent) break; + if (indent != mapping_indent) return ParseError.InvalidSyntax; + index.* += 1; + + const key_token = readStrictScalarToken(tokens, index, end, .key) orelse return ParseError.InvalidSyntax; + const key = try self.constructPlainScalar(key_token); + + if (index.* >= end or tokens[index.*] != .block_mapping_value) return ParseError.InvalidSyntax; + index.* += 1; + + var value_index = index.*; + const node_value = if (readStrictScalarToken(tokens, &value_index, end, .value)) |value_token| value: { + index.* = value_index; + break :value try self.constructPlainScalar(value_token); + } else value: { + if (index.* >= end or tokens[index.*] != .indent or tokens[index.*].indent <= mapping_indent) return ParseError.InvalidSyntax; + break :value try self.constructBlockMappingAt(tokens, index, end, tokens[index.*].indent, 0); + }; + + try pairs.append(self.arena_allocator, .{ .key = key, .value = node_value }); + } + + const owned_pairs = try pairs.toOwnedSlice(self.arena_allocator); + try self.validateMappingPairs(owned_pairs); + + const node = try self.nodes.create(); + node.* = .{ .mapping = .{ .pairs = owned_pairs } }; + return node; + } + + fn constructBlockSequence(self: *StrictSimpleBuilder, tokens: []const scanner.Token, item_count: usize) Error!*const value.Node { const items = try self.arena_allocator.alloc(*const value.Node, item_count); var item_index: usize = 0; var token_index: usize = 1; while (token_index < tokens.len - 1) : (item_index += 1) { token_index += 2; - if (token_index + 1 < tokens.len - 1 and tokens[token_index + 1] == .block_mapping_value) { + const item_start = token_index; + const first = readStrictScalarToken(tokens, &token_index, tokens.len - 1, .value) orelse return ParseError.InvalidSyntax; + if (token_index < tokens.len - 1 and tokens[token_index] == .block_mapping_value) { + if (!simple_fast_path.isSimpleBlockMappingKeyToken(first.value)) return ParseError.InvalidSyntax; + token_index = item_start; items[item_index] = try self.constructCompactMappingItem(tokens, &token_index); } else { - items[item_index] = try self.constructPlainScalar(tokens[token_index].scalar); - token_index += 1; + items[item_index] = try self.constructPlainScalar(first); } } @@ -433,10 +623,12 @@ const StrictSimpleBuilder = struct { errdefer pairs.deinit(self.arena_allocator); while (true) { - const key = try self.constructPlainScalar(tokens[index.*].scalar); - index.* += 2; - const node_value = try self.constructPlainScalar(tokens[index.*].scalar); + const key_token = readStrictScalarToken(tokens, index, tokens.len - 1, .key) orelse return ParseError.InvalidSyntax; + const key = try self.constructPlainScalar(key_token); + if (index.* >= tokens.len - 1 or tokens[index.*] != .block_mapping_value) return ParseError.InvalidSyntax; index.* += 1; + const value_token = readStrictScalarToken(tokens, index, tokens.len - 1, .value) orelse return ParseError.InvalidSyntax; + const node_value = try self.constructPlainScalar(value_token); try pairs.append(self.arena_allocator, .{ .key = key, .value = node_value }); if (index.* >= tokens.len - 1 or tokens[index.*] != .indent or tokens[index.*].indent == 0) break; @@ -451,32 +643,63 @@ const StrictSimpleBuilder = struct { return node; } - fn constructPlainScalar(self: *StrictSimpleBuilder, scalar_value: []const u8) Error!*const value.Node { + fn constructPlainScalar(self: *StrictSimpleBuilder, scalar_token: StrictScalarToken) Error!*const value.Node { const node = try self.nodes.create(); - node.* = try self.constructPlainScalarNode(scalar_value); + node.* = try self.constructPlainScalarNode(scalar_token); return node; } - fn constructPlainScalarNode(self: *StrictSimpleBuilder, scalar_value: []const u8) Error!value.Node { - const resolved = schema.resolveScalar(self.options.schema, scalar_value, true, null) catch |err| { + fn constructPlainScalarNode(self: *StrictSimpleBuilder, scalar_token: StrictScalarToken) Error!value.Node { + const resolved_tag = try self.resolveTag(scalar_token.tag); + try self.validateScalarTag(resolved_tag, scalar_token.value); + + const resolved = schema.resolveScalar(self.options.schema, scalar_token.value, true, resolved_tag) catch |err| { setLoadFailureDiagnostic(self.input, self.options, .invalid_scalar_tag, err); return err; }; if (resolved) |resolved_scalar| { return switch (resolved_scalar) { - .null_value => .{ .null_value = .{} }, - .bool_value => |bool_value| .{ .bool_value = .{ .value = bool_value } }, - .int_value => |int_value| .{ .int_value = .{ .value = int_value } }, - .float_value => |float_value| .{ .float_value = .{ .value = float_value } }, + .null_value => .{ .null_value = .{ .tag = resolved_tag } }, + .bool_value => |bool_value| .{ .bool_value = .{ .value = bool_value, .tag = resolved_tag } }, + .int_value => |int_value| .{ .int_value = .{ .value = int_value, .tag = resolved_tag } }, + .float_value => |float_value| .{ .float_value = .{ .value = float_value, .tag = resolved_tag } }, }; } return .{ .scalar = .{ - .value = try self.arena_allocator.dupe(u8, scalar_value), + .value = try self.arena_allocator.dupe(u8, scalar_token.value), + .tag = resolved_tag, } }; } + fn resolveTag(self: *StrictSimpleBuilder, raw_tag: ?[]const u8) Error!?[]const u8 { + const tag_value = raw_tag orelse return null; + return parser_tag.resolve(self.arena_allocator, &.{}, tag_value) catch |err| { + setLoadFailureDiagnostic(self.input, self.options, .invalid_graph, err); + return err; + }; + } + + fn validateScalarTag(self: *StrictSimpleBuilder, resolved_tag: ?[]const u8, scalar_value: []const u8) Error!void { + schema_tag.validateStandardTagKind(resolved_tag, .scalar) catch |err| { + setLoadFailureDiagnostic(self.input, self.options, .invalid_standard_tag, err); + return err; + }; + if (self.options.unknown_tag_behavior == .reject and schema_tag.isUnknownTag(resolved_tag)) { + setLoadFailureDiagnostic(self.input, self.options, .unknown_tag, ParseError.InvalidSyntax); + return ParseError.InvalidSyntax; + } + schema_tag.validateStandardBinaryContent(resolved_tag, scalar_value) catch |err| { + setLoadFailureDiagnostic(self.input, self.options, .invalid_standard_tag, err); + return err; + }; + schema_tag.validateStandardTimestampContent(resolved_tag, scalar_value) catch |err| { + setLoadFailureDiagnostic(self.input, self.options, .invalid_standard_tag, err); + return err; + }; + } + fn validateMappingPairs(self: *StrictSimpleBuilder, pairs: []const value.MappingPair) Error!void { if (self.options.duplicate_key_behavior == .allow) return; duplicate_key.validateUniqueMappingKeys(self.temporary_allocator, pairs) catch |err| { @@ -486,23 +709,6 @@ const StrictSimpleBuilder = struct { } }; -fn strictSequenceItemCount(tokens: []const scanner.Token) usize { - var count: usize = 0; - var index: usize = 1; - while (index < tokens.len - 1) : (count += 1) { - index += 2; - if (index + 1 < tokens.len - 1 and tokens[index + 1] == .block_mapping_value) { - index += 3; - while (index < tokens.len - 1 and tokens[index] == .indent and tokens[index].indent != 0) { - index += 4; - } - } else { - index += 1; - } - } - return count; -} - fn setLimitDiagnostic(input: []const u8, offset: usize, options: LoadOptions, message: []const u8) void { if (options.diagnostic) |diagnostic| { diagnostic.* = common_diagnostic.atOffset(input, offset, message); @@ -596,10 +802,39 @@ test "load fast path declines aliases for fallback" { try std.testing.expect(loaded == null); } +test "load fast path declines alias-heavy input for fallback" { + const input = + \\- &a0 value0 + \\- *a0 + \\- &a1 value1 + \\- *a1 + \\ + ; + + const fast = try tryStrictSimpleLoadStream(std.testing.allocator, input, .{}); + if (fast) |loaded| { + var owned = loaded; + defer owned.deinit(); + return error.ExpectedFallback; + } + + var stream = try loadStreamWithOptions(std.testing.allocator, input, .{}); + defer stream.deinit(); + + try std.testing.expectEqual(@as(usize, 1), stream.documents.len); + const root = stream.documents[0]; + try std.testing.expect(root.* == .sequence); + try std.testing.expectEqual(@as(usize, 4), root.sequence.items.len); + try std.testing.expectEqual(root.sequence.items[0], root.sequence.items[1]); + try std.testing.expectEqual(root.sequence.items[2], root.sequence.items[3]); +} + test "strict simple load fast path matches forced fallback" { const cases = [_][]const u8{ "plain\n", + "!!str value\n", "name: yaml\nversion: 1\n", + "name: yaml\nfeatures:\n parser: true\n loader: true\n", "- one\n- two\n", "- id: 1\n name: record-1\n- id: 2\n name: record-2\n", }; @@ -619,7 +854,6 @@ test "strict simple load fast path declines unsupported shapes" { const cases = [_][]const u8{ "*anchor\n", "&anchor value\n", - "!!str value\n", "%YAML 1.2\n---\nvalue\n", "--- value\n", "key: value # comment\n", @@ -628,12 +862,16 @@ test "strict simple load fast path declines unsupported shapes" { "[one, two]\n", "key: first\n second\n", "key:\n", - "outer:\n inner: value\n", "-\n - nested\n", }; for (cases) |input| { const fast = try tryStrictSimpleLoadStream(std.testing.allocator, input, .{}); + if (fast) |loaded| { + var owned = loaded; + defer owned.deinit(); + return error.ExpectedFallback; + } try std.testing.expect(fast == null); } } @@ -664,6 +902,9 @@ fn checkStrictSimpleLoadAllocationFailure(failing_allocator: std.mem.Allocator) var accepted = (try tryStrictSimpleLoadStream(failing_allocator, "name: yaml\nversion: 1\n", .{})).?; defer accepted.deinit(); + var tagged = (try tryStrictSimpleLoadStream(failing_allocator, "name: !local yaml\nversion: !!int 1\n", .{})).?; + defer tagged.deinit(); + const declined = try tryStrictSimpleLoadStream(failing_allocator, "name: [yaml]\n", .{}); try std.testing.expect(declined == null); } @@ -733,6 +974,73 @@ test "strict simple load fast path preserves schema duplicate key and limit beha try std.testing.expectEqualStrings("loader exceeded configured document count limit", diagnostic.message); } +test "strict simple load fast path preserves unknown tag behavior" { + var preserved = (try tryStrictSimpleLoadStream(std.testing.allocator, "!local value\n", .{})) orelse return error.ExpectedDirectLoad; + defer preserved.deinit(); + try std.testing.expect(preserved.documents[0].* == .scalar); + try std.testing.expectEqualStrings("value", preserved.documents[0].scalar.value); + try std.testing.expectEqualStrings("!local", preserved.documents[0].scalar.tag.?); + + var standard = (try tryStrictSimpleLoadStream(std.testing.allocator, "!!str true\n", .{})) orelse return error.ExpectedDirectLoad; + defer standard.deinit(); + try std.testing.expect(standard.documents[0].* == .scalar); + try std.testing.expectEqualStrings("true", standard.documents[0].scalar.value); + try std.testing.expectEqualStrings("tag:yaml.org,2002:str", standard.documents[0].scalar.tag.?); + + var diagnostic: diagnostics.Diagnostic = .{}; + try std.testing.expectError(ParseError.InvalidSyntax, tryStrictSimpleLoadStream(std.testing.allocator, "!local value\n", .{ + .unknown_tag_behavior = .reject, + .diagnostic = &diagnostic, + })); + try std.testing.expectEqualStrings("loader rejected unknown tag", diagnostic.message); +} + +test "strict simple load fast path matches forced fallback malformed diagnostics" { + const cases = [_][]const u8{ + "!!int not-int\n", + "!!seq value\n", + "!!binary SGVsbG8\n", + }; + + for (cases) |input| { + const direct = try expectStrictSimpleLoadError(input); + const fallback = try expectFallbackLoadError(input); + + try std.testing.expectEqual(fallback.err, direct.err); + try std.testing.expectEqualStrings(fallback.diagnostic.message, direct.diagnostic.message); + try std.testing.expectEqual(fallback.diagnostic.offset, direct.diagnostic.offset); + try std.testing.expectEqual(fallback.diagnostic.line, direct.diagnostic.line); + try std.testing.expectEqual(fallback.diagnostic.column, direct.diagnostic.column); + } +} + +const LoadErrorResult = struct { + err: Error, + diagnostic: diagnostics.Diagnostic, +}; + +fn expectStrictSimpleLoadError(input: []const u8) !LoadErrorResult { + var diagnostic: diagnostics.Diagnostic = .{}; + if (tryStrictSimpleLoadStream(std.testing.allocator, input, .{ .diagnostic = &diagnostic })) |loaded| { + var owned = loaded orelse return error.ExpectedDirectLoad; + owned.deinit(); + return error.ExpectedLoadFailure; + } else |err| { + return .{ .err = err, .diagnostic = diagnostic }; + } +} + +fn expectFallbackLoadError(input: []const u8) !LoadErrorResult { + var diagnostic: diagnostics.Diagnostic = .{}; + if (loadStreamViaEvents(std.testing.allocator, input, .{ .diagnostic = &diagnostic })) |loaded| { + var owned = loaded; + owned.deinit(); + return error.ExpectedLoadFailure; + } else |err| { + return .{ .err = err, .diagnostic = diagnostic }; + } +} + fn expectLoadedStreamsEqual(actual: *const LoadedStream, expected: *const LoadedStream) !void { try std.testing.expectEqual(expected.documents.len, actual.documents.len); for (actual.documents, expected.documents) |actual_document, expected_document| { diff --git a/src/parser/parser.zig b/src/parser/parser.zig index 9ba1a526..616822a5 100644 --- a/src/parser/parser.zig +++ b/src/parser/parser.zig @@ -195,7 +195,7 @@ pub fn parseTokensWithStats(allocator: std.mem.Allocator, tokens: []const scanne ScalarDocumentTokens{ .scalar = null }; var events: EventBuilder = .{}; - try events.ensureTotalCapacity(arena_allocator, tokens.len); + try events.ensureTotalCapacity(arena_allocator, @min(tokens.len, 32)); try events.append(arena_allocator, .stream_start);