From ced5545caf9953890eda59fb6012fb23f1ed78f0 Mon Sep 17 00:00:00 2001 From: Asger F Date: Thu, 7 May 2026 13:55:36 +0200 Subject: [PATCH 01/12] Yeast: add reachable_node_ids() --- shared/yeast/src/lib.rs | 33 +++++++++++++++++++++++++++++++++ shared/yeast/tests/test.rs | 22 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 281f44a98b26..c06029a86260 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -193,10 +193,43 @@ impl Ast { AstCursor::new(self) } + /// Return all nodes currently allocated in the AST arena. + /// + /// This includes nodes that are no longer reachable from `get_root()` + /// after desugaring rewrites. Use `reachable_node_ids()` for output-level + /// validation/traversal semantics. pub fn nodes(&self) -> &[Node] { &self.nodes } + /// Return node ids reachable from `get_root()` by following child edges. + /// + /// This reflects the effective AST after desugaring and excludes orphaned + /// arena nodes left behind by rewrite operations. 
+ pub fn reachable_node_ids(&self) -> Vec { + let mut reachable = Vec::new(); + let mut stack = vec![self.root]; + let mut seen = vec![false; self.nodes.len()]; + + while let Some(id) = stack.pop() { + if id >= self.nodes.len() || seen[id] { + continue; + } + seen[id] = true; + reachable.push(id); + + if let Some(node) = self.get_node(id) { + for children in node.fields.values() { + for &child in children { + stack.push(child); + } + } + } + } + + reachable + } + pub fn get_root(&self) -> Id { self.root } diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index ed4202493a46..e058e6b1eb07 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -166,6 +166,28 @@ fn test_query_no_match() { assert!(!matched); } +#[test] +fn test_reachable_nodes_excludes_orphaned_rewrite_nodes() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang) + .unwrap(); + let rules = vec![yeast::rule!((integer) => (identifier "replaced"))]; + let runner = Runner::with_schema(lang, &schema, &rules); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let reachable_ids = ast.reachable_node_ids(); + + assert!( + ast.nodes().len() > reachable_ids.len(), + "expected rewrite to leave orphaned arena nodes" + ); + + let dump = dump_ast(&ast, ast.get_root(), input); + assert!(dump.contains("identifier \"replaced\"")); + assert!(!dump.contains("integer \"1\"")); +} + #[test] fn test_query_repeated_capture() { let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); From 5b8cd37e910e31e493307983ea30355d0f50803d Mon Sep 17 00:00:00 2001 From: Asger F Date: Thu, 7 May 2026 22:50:30 +0200 Subject: [PATCH 02/12] Yeast: add type-checking errors in AST dump --- shared/yeast/src/dump.rs | 201 +++++++++++++++++++++++++++- shared/yeast/src/node_types_yaml.rs | 123 +++++++++++------ shared/yeast/src/schema.rs | 76 ++++++++++- 
shared/yeast/tests/test.rs | 110 ++++++++++++++- 4 files changed, 459 insertions(+), 51 deletions(-) diff --git a/shared/yeast/src/dump.rs b/shared/yeast/src/dump.rs index 99ba019cc3ea..07ee134e058c 100644 --- a/shared/yeast/src/dump.rs +++ b/shared/yeast/src/dump.rs @@ -1,6 +1,6 @@ use std::fmt::Write; -use crate::{Ast, Node, NodeContent, CHILD_FIELD}; +use crate::{schema::Schema, Ast, Node, NodeContent, CHILD_FIELD}; /// Options for controlling AST dump output. pub struct DumpOptions { @@ -45,16 +45,143 @@ pub fn dump_ast_with_options( options: &DumpOptions, ) -> String { let mut out = String::new(); - dump_node(ast, root, source, options, 0, &mut out); + dump_node(ast, root, source, options, 0, None, &mut out); out } +/// Dump an AST and annotate type mismatches against a schema inline. +/// +/// Any node that does not match the expected type set for its parent field is +/// rendered with a trailing `" <-- ERROR: ..."` annotation on the same line. +pub fn dump_ast_with_type_errors( + ast: &Ast, + root: usize, + source: &str, + schema: &Schema, +) -> String { + dump_ast_with_type_errors_and_options(ast, root, source, schema, &DumpOptions::default()) +} + +/// Dump an AST and annotate type mismatches against a schema inline. +/// +/// Any node that does not match the expected type set for its parent field is +/// rendered with a trailing `" <-- ERROR: ..."` annotation on the same line. 
+pub fn dump_ast_with_type_errors_and_options( + ast: &Ast, + root: usize, + source: &str, + schema: &Schema, + options: &DumpOptions, +) -> String { + let mut out = String::new(); + dump_node(ast, root, source, options, 0, Some((schema, None, None)), &mut out); + out +} + +fn format_node_types(node_types: &[crate::schema::NodeType]) -> String { + node_types + .iter() + .map(|t| { + if t.named { + t.kind.clone() + } else { + format!("\"{}\"", t.kind) + } + }) + .collect::>() + .join(" | ") +} + +const EMPTY_NODE_TYPES: &[crate::schema::NodeType] = &[]; + +/// Generate a type-checking error message for a node if it doesn't match expected types. +/// +/// # Arguments +/// - `schema`: The AST schema to validate against. +/// - `node`: The node being checked. +/// - `expected`: The set of allowed types for this node, or `None` if type-checking is disabled. +/// - `parent_field`: Optional tuple of (parent_kind, field_name) for context in error messages. +/// +/// # Returns +/// `Some(error_message)` if the node violates the schema (e.g., wrong kind, missing field declaration). +/// `None` if the node matches the expected types or if type-checking is disabled. 
+fn type_error_for_node( + schema: &Schema, + node: &Node, + expected: Option<&[crate::schema::NodeType]>, + parent_field: Option<(&str, &str)>, +) -> Option { + if schema.id_for_node_kind(node.kind_name()).is_none() + && schema.id_for_unnamed_node_kind(node.kind_name()).is_none() + { + return Some(format!("node kind '{}' not in schema", node.kind_name())); + } + + let expected = expected?; + if expected.is_empty() { + if let Some((kind, field)) = parent_field { + return Some(format!("the node '{kind}' has no field '{field}'")); + } + return Some("field not declared in schema for this parent node".to_string()); + } + if schema.node_matches_types(node.kind_name(), node.is_named(), expected) { + None + } else { + let actual = if node.is_named() { + node.kind_name().to_string() + } else { + format!("\"{}\"", node.kind_name()) + }; + + if let Some((kind, field)) = parent_field { + Some(format!( + "The field {}.{} should contain {}, but got {}", + kind, + field, + format_node_types(expected), + actual + )) + } else { + Some(format!( + "expected {}, got {}", + format_node_types(expected), + actual + )) + } + } +} + +/// Look up the allowed types for a field in the schema. +/// +/// # Arguments +/// - `schema`: The AST schema to query. +/// - `parent_kind`: The node kind of the parent that contains this field. +/// - `field_id`: The field ID within that parent node. +/// +/// # Returns +/// `Some(&[NodeType])` if the field is declared in the schema and has type constraints. +/// `None` if the field is not declared or has no constraints (undeclared field). 
+fn expected_for_field<'a>( + schema: &'a Schema, + parent_kind: &str, + field_id: u16, +) -> Option<&'a [crate::schema::NodeType]> { + schema + .field_types(parent_kind, field_id) + .map(|v| v.as_slice()) +} + fn dump_node( ast: &Ast, id: usize, source: &str, options: &DumpOptions, indent: usize, + type_check: Option<( + &Schema, + Option<&[crate::schema::NodeType]>, + Option<(&str, &str)>, + )>, out: &mut String, ) { let node = match ast.get_node(id) { @@ -90,6 +217,12 @@ fn dump_node( } } + if let Some((schema, expected, parent_field)) = type_check { + if let Some(err) = type_error_for_node(schema, node, expected, parent_field) { + write!(out, " <-- ERROR: {err}").unwrap(); + } + } + writeln!(out).unwrap(); // Named fields first @@ -98,31 +231,68 @@ fn dump_node( continue; // Handle unnamed children last } let field_name = ast.field_name_for_id(field_id).unwrap_or("?"); + let child_type_check = type_check.map(|(schema, _, _)| { + let expected = expected_for_field(schema, node.kind_name(), field_id) + .or(Some(EMPTY_NODE_TYPES)); + let parent_field = Some((node.kind_name(), field_name)); + (schema, expected, parent_field) + }); + if children.len() == 1 { write!(out, "{prefix} {field_name}:").unwrap(); // Inline single child let child = ast.get_node(children[0]); if child.is_some_and(is_leaf) { write!(out, " ").unwrap(); - dump_node_inline(ast, children[0], source, options, out); + dump_node_inline(ast, children[0], source, options, child_type_check, out); } else { writeln!(out).unwrap(); - dump_node(ast, children[0], source, options, indent + 2, out); + dump_node( + ast, + children[0], + source, + options, + indent + 2, + child_type_check, + out, + ); } } else { writeln!(out, "{prefix} {field_name}:").unwrap(); for &child_id in children { - dump_node(ast, child_id, source, options, indent + 2, out); + dump_node( + ast, + child_id, + source, + options, + indent + 2, + child_type_check, + out, + ); } } } // Unnamed children — skip unnamed tokens (keywords, 
punctuation) if let Some(children) = node.fields.get(&CHILD_FIELD) { + let child_type_check = type_check.map(|(schema, _, _)| { + let expected = expected_for_field(schema, node.kind_name(), CHILD_FIELD) + .or(Some(EMPTY_NODE_TYPES)); + let parent_field = Some((node.kind_name(), "children")); + (schema, expected, parent_field) + }); for &child_id in children { if let Some(child) = ast.get_node(child_id) { if child.is_named() { - dump_node(ast, child_id, source, options, indent + 1, out); + dump_node( + ast, + child_id, + source, + options, + indent + 1, + child_type_check, + out, + ); } } } @@ -130,7 +300,18 @@ fn dump_node( } /// Dump a leaf node inline (no newline prefix, caller provides context). -fn dump_node_inline(ast: &Ast, id: usize, source: &str, options: &DumpOptions, out: &mut String) { +fn dump_node_inline( + ast: &Ast, + id: usize, + source: &str, + options: &DumpOptions, + type_check: Option<( + &Schema, + Option<&[crate::schema::NodeType]>, + Option<(&str, &str)>, + )>, + out: &mut String, +) { let node = match ast.get_node(id) { Some(n) => n, None => return, @@ -159,6 +340,12 @@ fn dump_node_inline(ast: &Ast, id: usize, source: &str, options: &DumpOptions, o } } + if let Some((schema, expected, parent_field)) = type_check { + if let Some(err) = type_error_for_node(schema, node, expected, parent_field) { + write!(out, " <-- ERROR: {err}").unwrap(); + } + } + writeln!(out).unwrap(); } diff --git a/shared/yeast/src/node_types_yaml.rs b/shared/yeast/src/node_types_yaml.rs index d321ba8a2cf0..eb191076be48 100644 --- a/shared/yeast/src/node_types_yaml.rs +++ b/shared/yeast/src/node_types_yaml.rs @@ -23,6 +23,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::fmt::Write; +use crate::CHILD_FIELD; use serde::Deserialize; use serde_json::json; @@ -100,32 +101,38 @@ fn parse_field_name(raw: &str) -> FieldSpec { /// Resolve a TypeRef to a (type, named) pair, given the sets of known named /// and unnamed types. 
-fn resolve_type_ref( +fn resolve_type_ref_pair( type_ref: &TypeRef, named_types: &BTreeSet, unnamed_types: &BTreeSet, -) -> serde_json::Value { +) -> (String, bool) { match type_ref { - TypeRef::Explicit { unnamed } => { - json!({"type": unnamed, "named": false}) - } + TypeRef::Explicit { unnamed } => (unnamed.clone(), false), TypeRef::Name(name) => { let is_named = named_types.contains(name); let is_unnamed = unnamed_types.contains(name); - if is_named && is_unnamed { - // Ambiguous: default to named - json!({"type": name, "named": true}) + (name.clone(), true) } else if is_unnamed { - json!({"type": name, "named": false}) + (name.clone(), false) } else { - // Named, or unknown (assume named) - json!({"type": name, "named": true}) + (name.clone(), true) } } } } +/// Resolve a TypeRef to a {type, named} JSON record, given the sets of known named +/// and unnamed types. +fn resolve_type_ref( + type_ref: &TypeRef, + named_types: &BTreeSet, + unnamed_types: &BTreeSet, +) -> serde_json::Value { + let (kind, named) = resolve_type_ref_pair(type_ref, named_types, unnamed_types); + json!({"type": kind, "named": named}) +} + /// Convert YAML string to node-types JSON string. pub fn convert(yaml_input: &str) -> Result { let yaml: YamlNodeTypes = @@ -233,14 +240,12 @@ pub fn convert(yaml_input: &str) -> Result { serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}")) } -/// Build a Schema from a YAML node-types string. -/// Registers all node kinds and field names found in the YAML. -pub fn schema_from_yaml(yaml_input: &str) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - let mut schema = crate::schema::Schema::new(); - +/// Apply YAML node-type definitions to a mutable Schema. +/// Registers all types, fields, and allowed types from the YAML into the schema. 
+fn apply_yaml_to_schema( + yaml: &YamlNodeTypes, + schema: &mut crate::schema::Schema, +) { // Register all supertypes as node kinds for name in yaml.supertypes.keys() { schema.register_kind(name); @@ -264,6 +269,62 @@ pub fn schema_from_yaml(yaml_input: &str) -> Result = yaml.unnamed.iter().cloned().collect(); + + for (supertype, members) in &yaml.supertypes { + let node_types = members + .iter() + .map(|m| { + let (kind, named) = resolve_type_ref_pair(m, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect(); + schema.set_supertype_members(supertype, node_types); + } + + // Register allowed field child types for type checking. + for (parent_kind, fields_opt) in &yaml.named { + let Some(fields) = fields_opt else { + continue; + }; + + for (raw_field_name, type_refs) in fields { + let spec = parse_field_name(raw_field_name); + let field_id = match &spec.name { + Some(name) => schema.register_field(name), + None => CHILD_FIELD, + }; + + let mut node_types = type_refs + .clone() + .into_vec() + .into_iter() + .map(|type_ref| { + let (kind, named) = resolve_type_ref_pair(&type_ref, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect::>(); + node_types.sort_by(|a, b| a.kind.cmp(&b.kind).then(a.named.cmp(&b.named))); + node_types.dedup_by(|a, b| a.kind == b.kind && a.named == b.named); + schema.set_field_types(parent_kind, field_id, node_types); + } + } +} + +pub fn schema_from_yaml(yaml_input: &str) -> Result { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + + let mut schema = crate::schema::Schema::new(); + apply_yaml_to_schema(&yaml, &mut schema); + Ok(schema) } @@ -278,29 +339,7 @@ pub fn schema_from_yaml_with_language( serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; let mut schema = crate::schema::Schema::from_language(language); - - // Register supertypes - for name in 
yaml.supertypes.keys() { - schema.register_kind(name); - } - - // Register named node kinds and their fields - for (name, fields_opt) in &yaml.named { - schema.register_kind(name); - if let Some(fields) = fields_opt { - for raw_field_name in fields.keys() { - let spec = parse_field_name(raw_field_name); - if let Some(field_name) = &spec.name { - schema.register_field(field_name); - } - } - } - } - - // Register unnamed tokens - for name in &yaml.unnamed { - schema.register_unnamed_kind(name); - } + apply_yaml_to_schema(&yaml, &mut schema); Ok(schema) } diff --git a/shared/yeast/src/schema.rs b/shared/yeast/src/schema.rs index 12554d9c8692..c832a57b23ad 100644 --- a/shared/yeast/src/schema.rs +++ b/shared/yeast/src/schema.rs @@ -1,7 +1,13 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use crate::{FieldId, KindId, CHILD_FIELD}; +#[derive(Clone, Debug)] +pub struct NodeType { + pub kind: String, + pub named: bool, +} + /// A schema defining node kinds and field names for the output AST. /// Built from a node-types.yml file, independent of any tree-sitter grammar. 
/// @@ -25,6 +31,8 @@ pub struct Schema { unnamed_kind_ids: BTreeMap, kind_names: BTreeMap, next_kind_id: KindId, + field_types: BTreeMap<(String, FieldId), Vec>, + supertypes: BTreeMap>, } impl Default for Schema { @@ -43,6 +51,8 @@ impl Schema { unnamed_kind_ids: BTreeMap::new(), kind_names: BTreeMap::new(), next_kind_id: 1, // 0 is reserved + field_types: BTreeMap::new(), + supertypes: BTreeMap::new(), } } @@ -166,4 +176,68 @@ impl Schema { pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> { self.kind_names.get(&id).copied() } + + pub fn set_field_types( + &mut self, + parent_kind: &str, + field_id: FieldId, + node_types: Vec, + ) { + self.field_types + .insert((parent_kind.to_string(), field_id), node_types); + } + + pub fn field_types( + &self, + parent_kind: &str, + field_id: FieldId, + ) -> Option<&Vec> { + self.field_types + .get(&(parent_kind.to_string(), field_id)) + } + + pub fn set_supertype_members(&mut self, supertype: &str, node_types: Vec) { + self.supertypes.insert(supertype.to_string(), node_types); + } + + fn allows_node( + &self, + node_type: &NodeType, + node_kind: &str, + node_named: bool, + active: &mut BTreeSet, + ) -> bool { + if node_type.kind == node_kind && node_type.named == node_named { + return true; + } + + if !node_type.named { + return false; + } + + let Some(members) = self.supertypes.get(&node_type.kind) else { + return false; + }; + + if !active.insert(node_type.kind.clone()) { + return false; + } + + let matched = members + .iter() + .any(|member| self.allows_node(member, node_kind, node_named, active)); + active.remove(&node_type.kind); + matched + } + + pub fn node_matches_types( + &self, + node_kind: &str, + node_named: bool, + node_types: &[NodeType], + ) -> bool { + node_types.iter().any(|node_type| { + self.allows_node(node_type, node_kind, node_named, &mut BTreeSet::new()) + }) + } } diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index e058e6b1eb07..05fd19981656 100644 --- 
a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -1,6 +1,6 @@ #![cfg(test)] -use yeast::dump::dump_ast; +use yeast::dump::{dump_ast, dump_ast_with_type_errors}; use yeast::*; const OUTPUT_SCHEMA_YAML: &str = include_str!("node-types.yml"); @@ -42,6 +42,35 @@ fn run_and_get_error(input: &str, rules: Vec) -> String { .expect_err("expected runner to return an error") } +/// Helper: parse Ruby source with no rules and dump with schema type errors. +fn parse_and_dump_typed(input: &str, schema_yaml: &str) -> String { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run(input).unwrap(); + let schema = yeast::node_types_yaml::schema_from_yaml(schema_yaml).unwrap(); + dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) +} + +/// Helper: parse Ruby source with no rules and dump with schema type errors, +/// building schema with language IDs so field checks align with parser fields. +fn parse_and_dump_typed_with_language(input: &str, schema_yaml: &str) -> String { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let runner = Runner::new(lang.clone(), &[]); + let ast = runner.run(input).unwrap(); + let schema = yeast::node_types_yaml::schema_from_yaml_with_language(schema_yaml, &lang) + .unwrap(); + dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) +} + +/// Helper: parse Ruby source with custom rules and dump with schema type errors. 
+fn run_and_dump_typed(input: &str, rules: Vec, schema_yaml: &str) -> String { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = yeast::node_types_yaml::schema_from_yaml(schema_yaml).unwrap(); + let phases = vec![Phase::new("test", rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + let ast = runner.run(input).unwrap(); + dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) +} + /// Assert that a dump equals the expected string, treating the expected /// string as an indented multiline literal: leading/trailing blank lines /// are stripped, and the common leading indentation is removed from every @@ -125,6 +154,85 @@ fn test_parse_for_loop() { ); } +#[test] +fn test_dump_highlights_type_errors_inline() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + right: identifier + identifier: +"#; + + let dump = parse_and_dump_typed("x = 1", schema_yaml); + assert!(dump.contains("integer \"1\" <-- ERROR:")); +} + +#[test] +fn test_dump_reports_preserved_unknown_kind_after_transformation() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + right: identifier + identifier: +"#; + + // This rewrite runs and preserves the RHS node kind via capture. + // With schema above, preserving `integer` should be reported inline. 
+ let rules = vec![yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + (assignment + left: {left} + right: {right} + ) + )]; + + let dump = run_and_dump_typed("x = 1", rules, schema_yaml); + assert!(dump.contains("integer \"1\" <-- ERROR:")); + assert!(dump.contains("node kind 'integer' not in schema")); +} + +#[test] +fn test_dump_reports_undeclared_field_on_node() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + identifier: +"#; + + let dump = parse_and_dump_typed_with_language("x = y", schema_yaml); + assert!(dump.contains("right: identifier \"y\" <-- ERROR:")); + assert!(dump.contains("the node 'assignment' has no field 'right'")); +} + +#[test] +fn test_dump_reports_disallowed_kind_in_field_type() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + right: identifier + identifier: + integer: +"#; + + let dump = parse_and_dump_typed_with_language("x = 1", schema_yaml); + assert!(dump.contains("right: integer \"1\" <-- ERROR:")); + assert!(dump.contains("should contain")); + assert!(dump.contains("but got integer")); +} + // ---- Query tests ---- #[test] From 0b59c41e5ffa0b4cd62c405b9675298c519a8f52 Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 8 May 2026 12:02:56 +0200 Subject: [PATCH 03/12] Yeast: Add one-shot phase kind --- shared/yeast/doc/yeast.md | 13 ++- shared/yeast/src/captures.rs | 15 +++ shared/yeast/src/lib.rs | 138 ++++++++++++++++++++++---- shared/yeast/tests/test.rs | 186 +++++++++++++++++++++++++++++++++-- 4 files changed, 323 insertions(+), 29 deletions(-) diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index 893cdea24dde..df5085272338 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -349,8 +349,8 @@ to enable rewriting: ```rust let desugar = yeast::DesugaringConfig::new() - .add_phase("cleanup", cleanup_rules()) - .add_phase("desugar", desugar_rules()) + 
.add_phase("cleanup", yeast::PhaseKind::Repeating, cleanup_rules()) + .add_phase("translate", yeast::PhaseKind::OneShot, translate_rules()) .with_output_node_types_yaml(include_str!("output-node-types.yml")); let lang = simple::LanguageSpec { @@ -365,6 +365,15 @@ let lang = simple::LanguageSpec { A single-phase config is just `.add_phase(...)` called once. Phase names appear in error messages so you can tell which phase failed. +There are two kinds of phases: +- **Repeating**: + Each node is re-processed until none of the rules in the phase matches. + When a node no longer matches any rules, its children are recursively processed. In practice this is used to desugar or simplify an AST, while staying mostly within the same schema. +- **One-shot**: + Each node is processed by the first matching rule, and the rewrite fails if no rule matches. + Rules are then recursively applied to every captured node. + In practice this is used when translating from one AST schema to another, where an exhaustive match is required. + The same YAML node-types is used for both the runtime yeast `Schema` (so rules can refer to output-only kinds and fields) and TRAP validation (it is converted to JSON internally). diff --git a/shared/yeast/src/captures.rs b/shared/yeast/src/captures.rs index a92c5096e94e..ef184cd9f69a 100644 --- a/shared/yeast/src/captures.rs +++ b/shared/yeast/src/captures.rs @@ -61,6 +61,21 @@ impl Captures { } } } + + /// Apply a fallible function to every captured id (across all keys), + /// replacing each id with the result. Stops and returns the error on + /// the first failure. 
+ pub fn try_map_all_captures( + &mut self, + mut f: impl FnMut(Id) -> Result, + ) -> Result<(), E> { + for ids in self.captures.values_mut() { + for id in ids { + *id = f(*id)?; + } + } + Ok(()) + } pub fn map_captures_to(&mut self, from: &str, to: &'static str, f: &mut impl FnMut(Id) -> Id) { if let Some(from_ids) = self.captures.get(from) { let new_values = from_ids.iter().copied().map(f).collect(); diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index c06029a86260..d45fe4980dfd 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -526,18 +526,39 @@ impl Rule { node: Id, fresh: &tree_builder::FreshScope, ) -> Result>, String> { + match self.try_match(ast, node)? { + Some(captures) => Ok(Some(self.run_transform(ast, captures, node, fresh))), + None => Ok(None), + } + } + + /// Attempt to match this rule's query against `node`, returning the + /// resulting captures on success. Does not invoke the transform. + fn try_match(&self, ast: &Ast, node: Id) -> Result, String> { let mut captures = Captures::new(); if self.query.do_match(ast, node, &mut captures)? { - fresh.next_scope(); - let source_range = ast.get_node(node).and_then(|n| match n.content { - NodeContent::Range(r) => Some(r), - _ => n.source_range, - }); - Ok(Some((self.transform)(ast, captures, fresh, source_range))) + Ok(Some(captures)) } else { Ok(None) } } + + /// Run this rule's transform with the given captures, using `node`'s + /// source range as the source range of the produced nodes. 
+ fn run_transform( + &self, + ast: &mut Ast, + captures: Captures, + node: Id, + fresh: &tree_builder::FreshScope, + ) -> Vec { + fresh.next_scope(); + let source_range = ast.get_node(node).and_then(|n| match n.content { + NodeContent::Range(r) => Some(r), + _ => n.source_range, + }); + (self.transform)(ast, captures, fresh, source_range) + } } const MAX_REWRITE_DEPTH: usize = 100; @@ -572,17 +593,17 @@ impl<'a> RuleIndex<'a> { } } -fn apply_rules( +fn apply_repeating_rules( rules: &[Rule], ast: &mut Ast, id: Id, fresh: &tree_builder::FreshScope, ) -> Result, String> { let index = RuleIndex::new(rules); - apply_rules_inner(&index, ast, id, fresh, 0, None) + apply_repeating_rules_inner(&index, ast, id, fresh, 0, None) } -fn apply_rules_inner( +fn apply_repeating_rules_inner( index: &RuleIndex, ast: &mut Ast, id: Id, @@ -611,7 +632,7 @@ fn apply_rules_inner( let next_skip = if rule.repeated { None } else { Some(rule_ptr) }; let mut results = Vec::new(); for node in result_node { - results.extend(apply_rules_inner( + results.extend(apply_repeating_rules_inner( index, ast, node, @@ -636,7 +657,7 @@ fn apply_rules_inner( for children in fields.values_mut() { let mut new_children: Option> = None; for (i, &child_id) in children.iter().enumerate() { - let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?; + let result = apply_repeating_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?; let unchanged = result.len() == 1 && result[0] == child_id; match (&mut new_children, unchanged) { (None, true) => {} // unchanged so far, no allocation needed @@ -661,6 +682,75 @@ fn apply_rules_inner( Ok(vec![id]) } +/// Apply rules using `OneShot` semantics: the first matching rule fires on +/// each visited node, recursion proceeds only through captured nodes (not +/// through the input node's children directly), and an error is returned if +/// no rule matches a visited node. 
+fn apply_one_shot_rules( + rules: &[Rule], + ast: &mut Ast, + id: Id, + fresh: &tree_builder::FreshScope, +) -> Result, String> { + let index = RuleIndex::new(rules); + apply_one_shot_rules_inner(&index, ast, id, fresh, 0) +} + +fn apply_one_shot_rules_inner( + index: &RuleIndex, + ast: &mut Ast, + id: Id, + fresh: &tree_builder::FreshScope, + rewrite_depth: usize, +) -> Result, String> { + if rewrite_depth > MAX_REWRITE_DEPTH { + return Err(format!( + "Desugaring exceeded maximum rewrite depth ({MAX_REWRITE_DEPTH}). \ + This likely indicates a non-terminating rule cycle." + )); + } + + let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or(""); + for rule in index.rules_for_kind(node_kind) { + if let Some(mut captures) = rule.try_match(ast, id)? { + // Recursively translate every captured node before invoking the + // transform. The transform's output uses output-schema kinds, so + // we must translate captured input-schema nodes to their + // output-schema equivalents first. + captures.try_map_all_captures(|captured_id| { + let result = + apply_one_shot_rules_inner(index, ast, captured_id, fresh, rewrite_depth + 1)?; + if result.len() != 1 { + return Err(format!( + "OneShot: recursion on captured node produced {} results, expected exactly 1", + result.len() + )); + } + Ok(result[0]) + })?; + return Ok(rule.run_transform(ast, captures, id, fresh)); + } + } + + Err(format!( + "OneShot: no rule matched node of kind '{node_kind}'" + )) +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum PhaseKind { + /// A node is re-processed until none of the rules in the phase matches, + /// albeit a single rule cannot be applied twice in a row unless that rule is also marked as repeating. + /// When a node no longer matches any rules, its children are recursively processed (top down). + Repeating, + + /// A node is processed by the first matching rule, and the rewrite fails if no rule matches. + /// Rules are then recursively applied to every captured node. 
+ /// In practice this is used when translating from one AST schema to another, where every node must be rewritten, + /// and it would be a type error to match the rule patterns (based on the input schema) against the output nodes (which conform to the output schema). + OneShot, +} + /// One phase of a desugaring pass: a named bundle of rules that runs to /// completion (a full traversal applying its rules) before the next phase /// starts. Rules within a phase compete for matches as usual; rules in @@ -670,13 +760,15 @@ pub struct Phase { /// Name used in error messages. pub name: String, pub rules: Vec, + pub kind: PhaseKind, } impl Phase { - pub fn new(name: impl Into, rules: Vec) -> Self { + pub fn new(name: impl Into, kind: PhaseKind, rules: Vec) -> Self { Self { name: name.into(), rules, + kind, } } } @@ -694,8 +786,8 @@ impl Phase { /// /// ```ignore /// let config = yeast::DesugaringConfig::new() -/// .add_phase("cleanup", cleanup_rules) -/// .add_phase("desugar", desugar_rules) +/// .add_phase("cleanup", PhaseKind::Repeating, cleanup_rules) +/// .add_phase("desugar", PhaseKind::Repeating, desugar_rules) /// .with_output_node_types_yaml(yaml); /// ``` #[derive(Default)] @@ -715,9 +807,14 @@ impl DesugaringConfig { Self::default() } - /// Append a new phase with the given name and rules. - pub fn add_phase(mut self, name: impl Into, rules: Vec) -> Self { - self.phases.push(Phase::new(name, rules)); + /// Append a new phase with the given name, kind, and rules. 
+ pub fn add_phase( + mut self, + name: impl Into, + kind: PhaseKind, + rules: Vec, + ) -> Self { + self.phases.push(Phase::new(name, kind, rules)); self } @@ -806,8 +903,11 @@ impl<'a> Runner<'a> { let fresh = tree_builder::FreshScope::new(); let mut root = ast.get_root(); for phase in self.phases { - let res = apply_rules(&phase.rules, ast, root, &fresh) - .map_err(|e| format!("Phase `{}`: {e}", phase.name))?; + let res = match phase.kind { + PhaseKind::Repeating => apply_repeating_rules(&phase.rules, ast, root, &fresh), + PhaseKind::OneShot => apply_one_shot_rules(&phase.rules, ast, root, &fresh), + } + .map_err(|e| format!("Phase `{}`: {e}", phase.name))?; if res.len() != 1 { return Err(format!( "Phase `{}`: expected exactly one result node, got {}", diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index 05fd19981656..5e5c97c9ccb2 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -15,7 +15,7 @@ fn parse_and_dump(input: &str) -> String { /// Helper: parse Ruby source with a custom output schema and a single /// phase of rules, return dump. 
fn run_and_dump(input: &str, rules: Vec) -> String { - run_phased_and_dump(input, vec![Phase::new("test", rules)]) + run_phased_and_dump(input, vec![Phase::new("test", PhaseKind::Repeating, rules)]) } /// Helper: parse Ruby source with a custom output schema and multiple @@ -35,7 +35,7 @@ fn run_and_get_error(input: &str, rules: Vec) -> String { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); - let phases = vec![Phase::new("test", rules)]; + let phases = vec![Phase::new("test", PhaseKind::Repeating, rules)]; let runner = Runner::with_schema(lang, &schema, &phases); runner .run(input) @@ -65,7 +65,7 @@ fn parse_and_dump_typed_with_language(input: &str, schema_yaml: &str) -> String fn run_and_dump_typed(input: &str, rules: Vec, schema_yaml: &str) -> String { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml(schema_yaml).unwrap(); - let phases = vec![Phase::new("test", rules)]; + let phases = vec![Phase::new("test", PhaseKind::Repeating, rules)]; let runner = Runner::with_schema(lang, &schema, &phases); let ast = runner.run(input).unwrap(); dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) @@ -279,8 +279,12 @@ fn test_reachable_nodes_excludes_orphaned_rewrite_nodes() { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang) .unwrap(); - let rules = vec![yeast::rule!((integer) => (identifier "replaced"))]; - let runner = Runner::with_schema(lang, &schema, &rules); + let phases = vec![Phase::new( + "test", + PhaseKind::Repeating, + vec![yeast::rule!((integer) => (identifier "replaced"))], + )]; + let runner = Runner::with_schema(lang, &schema, &phases); let input = "x = 1"; let ast = runner.run(input).unwrap(); @@ -783,8 +787,8 @@ fn 
test_phased_desugaring() { let dump = run_phased_and_dump( "x = 1", vec![ - Phase::new("cleanup", cleanup), - Phase::new("desugar", desugar), + Phase::new("cleanup", PhaseKind::Repeating, cleanup), + Phase::new("desugar", PhaseKind::Repeating, desugar), ], ); assert_dump_eq( @@ -805,7 +809,11 @@ fn test_phase_error_includes_phase_name() { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); - let phases = vec![Phase::new("buggy", vec![swap_assignment_rule().repeated()])]; + let phases = vec![Phase::new( + "buggy", + PhaseKind::Repeating, + vec![swap_assignment_rule().repeated()], + )]; let runner = Runner::with_schema(lang, &schema, &phases); let err = runner .run("x = 1") @@ -820,6 +828,168 @@ fn test_phase_error_includes_phase_name() { ); } +/// Helper: an exhaustive set of OneShot rules covering every node reachable +/// (via captures) when translating `"x = 1"`. +fn one_shot_xeq1_rules() -> Vec { + vec![ + yeast::rule!( + (program (_)* @stmts) + => + (program stmt: {..stmts}) + ), + yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + (first_node left: {left} right: {right}) + ), + yeast::rule!((identifier) => (identifier "ID")), + yeast::rule!((integer) => (integer "INT")), + ] +} + +#[test] +fn test_one_shot_phase() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let phases = vec![Phase::new( + "translate", + PhaseKind::OneShot, + one_shot_xeq1_rules(), + )]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let dump = dump_ast(&ast, ast.get_root(), input); + assert_dump_eq( + &dump, + r#" + program + stmt: + first_node + left: identifier "ID" + right: integer "INT" + "#, + ); +} + +#[test] +fn 
test_one_shot_phase_errors_when_no_rule_matches() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + // Drop the `integer` rule so the recursion has no rule for `integer`. + let mut rules = one_shot_xeq1_rules(); + rules.pop(); + let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let err = runner + .run("x = 1") + .expect_err("expected OneShot to error on unmatched node"); + assert!( + err.contains("Phase `translate`"), + "error should name the phase, got: {err}" + ); + assert!( + err.contains("no rule matched") && err.contains("integer"), + "error should describe the unmatched node kind, got: {err}" + ); +} + +/// OneShot recursion must apply rules to *captured* nodes, even if the rule +/// returns a captured child verbatim. A buggy implementation that only +/// recurses into the children of the rule's output (rather than into the +/// captures) would leave the returned capture untransformed. +#[test] +fn test_one_shot_recurses_into_returned_capture() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let rules = vec![ + yeast::rule!( + (program (_)* @stmts) + => + (program stmt: {..stmts}) + ), + // Returns the captured `left` verbatim, discarding `right`. 
+ yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + {left} + ), + yeast::rule!((identifier) => (identifier "ID")), + yeast::rule!((integer) => (integer "INT")), + ]; + let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let dump = dump_ast(&ast, ast.get_root(), input); + // `left` is an `identifier`; OneShot must apply the identifier rule to + // it before the assignment transform returns it verbatim. + assert_dump_eq( + &dump, + r#" + program + stmt: identifier "ID" + "#, + ); +} + +/// OneShot recursion must NOT descend into the children of the rule's output. +/// A rule may legitimately wrap a captured node in fresh output-schema nodes +/// that have no matching rule of their own (since rule patterns target the +/// input schema). Recursing into the output would erroneously try to find +/// rules for those wrapper kinds and fail. +#[test] +fn test_one_shot_does_not_recurse_into_wrapper_output() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let rules = vec![ + yeast::rule!( + (program (_)* @stmts) + => + (program stmt: {..stmts}) + ), + // Wraps `left` in nested `first_node`/`second_node` output kinds. + // Neither wrapper kind has a matching rule, so a buggy implementation + // that recurses into the wrapper's children would error. 
+ yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + (first_node + left: (second_node left: {left} right: {right}) + right: {left} + ) + ), + yeast::rule!((identifier) => (identifier "ID")), + yeast::rule!((integer) => (integer "INT")), + ]; + let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let dump = dump_ast(&ast, ast.get_root(), input); + assert_dump_eq( + &dump, + r#" + program + stmt: + first_node + left: + second_node + left: identifier "ID" + right: integer "INT" + right: identifier "ID" + "#, + ); +} + // ---- Cursor tests ---- #[test] From 4b44753603bac1e40efb7f3bd95bce68f2e40781 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 11:37:59 +0200 Subject: [PATCH 04/12] Yeast one-shot: fix infinite loop when root is captured --- shared/yeast/src/lib.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index d45fe4980dfd..159287df119e 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -718,6 +718,13 @@ fn apply_one_shot_rules_inner( // we must translate captured input-schema nodes to their // output-schema equivalents first. captures.try_map_all_captures(|captured_id| { + // Avoid infinite recursion when a capture refers to the root + // node of the matched tree (e.g. an `@_` capture on the + // pattern root): re-analyzing it would match the same rule + // again indefinitely. + if captured_id == id { + return Ok(captured_id); + } let result = apply_one_shot_rules_inner(index, ast, captured_id, fresh, rewrite_depth + 1)?; if result.len() != 1 { From 4e5122e3cb1393b922f14ba985293caf1e5c41a9 Mon Sep 17 00:00:00 2001 From: Asger F Date: Thu, 7 May 2026 11:33:04 +0200 Subject: [PATCH 05/12] Add support for tree-sitter-style corpus tests This adds tests consisting of source code and a printout of its rewritten AST. 
---
 unified/AGENTS.md                             | 11 +-
 unified/extractor/src/extractor.rs            |  8 +-
 unified/extractor/src/languages/mod.rs        |  8 +
 unified/extractor/src/main.rs                 |  1 +
 .../extractor/tests/corpus/swift/desugar.txt  | 23 +++
 unified/extractor/tests/corpus_tests.rs       | 182 ++++++++++++++++++
 6 files changed, 224 insertions(+), 9 deletions(-)
 create mode 100644 unified/extractor/src/languages/mod.rs
 create mode 100644 unified/extractor/tests/corpus/swift/desugar.txt
 create mode 100644 unified/extractor/tests/corpus_tests.rs

diff --git a/unified/AGENTS.md b/unified/AGENTS.md
index 488a94f44bd4..132f61e269d3 100644
--- a/unified/AGENTS.md
+++ b/unified/AGENTS.md
@@ -20,10 +20,15 @@ grammar source), run `scripts/regenerate-grammar.sh` to:
 it shows the impact of a grammar tweak on the named node kinds, fields, and
 child types in a form much easier to read than the raw JSON.
 
-## Testing
-- If you changed the extractor code, always rebuild it before running tests.
+## Extractor Testing
+- To run extractor tests, run `cargo test` in the `extractor` directory.
 
-- To run all tests, run `codeql test run --search-path extractor-pack ql/test`
+- Do not edit the printed ASTs in `extractor/tests/corpus` directly. To regenerate the ASTs, run tests with the environment variable `YEAST_UPDATE_CORPUS=1`.
+
+## CodeQL Testing
+- If you changed the extractor code, always rebuild it before running CodeQL tests.
+
+- To run all CodeQL tests, run `codeql test run --search-path extractor-pack ql/test`
 
 - Do not edit `.expected` files manually. To update the expected output, pass
 `--learn` to the `codeql test run` command.
diff --git a/unified/extractor/src/extractor.rs b/unified/extractor/src/extractor.rs index eb6f06eb259b..ae3c1e78715b 100644 --- a/unified/extractor/src/extractor.rs +++ b/unified/extractor/src/extractor.rs @@ -3,9 +3,7 @@ use std::path::PathBuf; use codeql_extractor::extractor::simple; use codeql_extractor::trap; - -#[path = "languages/swift/swift.rs"] -mod swift; +use crate::languages; #[derive(Args)] pub struct Options { @@ -27,9 +25,7 @@ pub fn run(options: Options) -> std::io::Result<()> { let extractor = simple::Extractor { prefix: "unified".to_string(), - languages: vec![ - swift::language_spec(), - ], + languages: languages::all_language_specs(), trap_dir: options.output_dir, trap_compression: trap::Compression::from_env("CODEQL_EXTRACTOR_UNIFIED_OPTION_TRAP_COMPRESSION"), source_archive_dir: options.source_archive_dir, diff --git a/unified/extractor/src/languages/mod.rs b/unified/extractor/src/languages/mod.rs new file mode 100644 index 000000000000..4d5c945cb9b3 --- /dev/null +++ b/unified/extractor/src/languages/mod.rs @@ -0,0 +1,8 @@ +use codeql_extractor::extractor::simple; + +#[path = "swift/swift.rs"] +mod swift; + +pub fn all_language_specs() -> Vec { + vec![swift::language_spec()] +} diff --git a/unified/extractor/src/main.rs b/unified/extractor/src/main.rs index e6721d4e2243..5a3407c37a29 100644 --- a/unified/extractor/src/main.rs +++ b/unified/extractor/src/main.rs @@ -3,6 +3,7 @@ use clap::Parser; mod autobuilder; mod extractor; mod generator; +mod languages; #[derive(Parser)] #[command(author, version, about)] diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt new file mode 100644 index 000000000000..1ea0e260aad2 --- /dev/null +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -0,0 +1,23 @@ +=== +Additive expression is desugared +=== + +1 + 2 + +--- + +source_file + simple_identifier "blah" + + +=== +Another additive expression is desugared +=== + +foo + bar + +--- + 
+source_file + simple_identifier "blah" + diff --git a/unified/extractor/tests/corpus_tests.rs b/unified/extractor/tests/corpus_tests.rs new file mode 100644 index 000000000000..ea7bf7b11ca8 --- /dev/null +++ b/unified/extractor/tests/corpus_tests.rs @@ -0,0 +1,182 @@ +use std::fs; +use std::path::Path; + +use codeql_extractor::extractor::simple; +use yeast::{dump::dump_ast, Runner}; + +#[path = "../src/languages/mod.rs"] +mod languages; + +#[derive(Debug)] +struct CorpusCase { + name: String, + input: String, + expected: String, +} + +fn update_mode_enabled() -> bool { + std::env::var("YEAST_UPDATE_CORPUS") + .map(|v| matches!(v.to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "on")) + .unwrap_or(false) +} + +fn is_header_rule(line: &str) -> bool { + let trimmed = line.trim(); + trimmed.len() >= 3 && trimmed.chars().all(|c| c == '=') +} + +fn parse_corpus(content: &str) -> Vec { + let lines: Vec<&str> = content.lines().collect(); + let mut i = 0; + let mut cases = Vec::new(); + + while i < lines.len() { + while i < lines.len() && lines[i].trim().is_empty() { + i += 1; + } + if i >= lines.len() { + break; + } + + assert!( + is_header_rule(lines[i]), + "Expected header delimiter at line {}", + i + 1 + ); + i += 1; + + assert!(i < lines.len(), "Missing test name at line {}", i + 1); + let name = lines[i].trim().to_string(); + i += 1; + + assert!( + i < lines.len() && is_header_rule(lines[i]), + "Missing closing header delimiter for case {name}" + ); + i += 1; + + let input_start = i; + while i < lines.len() && lines[i].trim() != "---" { + i += 1; + } + assert!(i < lines.len(), "Missing --- separator for case {name}"); + let input = lines[input_start..i].join("\n").trim_end().to_string(); + i += 1; + + let expected_start = i; + while i < lines.len() { + if is_header_rule(lines[i]) + && i + 2 < lines.len() + && !lines[i + 1].trim().is_empty() + && is_header_rule(lines[i + 2]) + { + break; + } + i += 1; + } + let expected = 
lines[expected_start..i].join("\n").trim().to_string(); + + cases.push(CorpusCase { + name, + input, + expected, + }); + } + + cases +} + +fn render_corpus(cases: &[CorpusCase]) -> String { + let mut out = String::new(); + + for (idx, case) in cases.iter().enumerate() { + if idx > 0 { + out.push('\n'); + } + out.push_str("===\n"); + out.push_str(case.name.trim()); + out.push_str("\n===\n"); + out.push('\n'); + out.push_str(case.input.trim()); + out.push_str("\n\n---\n"); + out.push('\n'); + out.push_str(case.expected.trim()); + out.push_str("\n\n"); + } + + out +} + +fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> String { + let runner = match lang.desugar.as_ref() { + Some(config) => Runner::from_config(lang.ts_language.clone(), config) + .expect("Failed to create yeast runner from desugaring config"), + None => Runner::new(lang.ts_language.clone(), &[]), + }; + let ast = runner + .run(input) + .unwrap_or_else(|e| panic!("Failed to parse corpus input: {e}")); + dump_ast(&ast, ast.get_root(), input) +} + +#[test] +fn test_corpus() { + let update_mode = update_mode_enabled(); + let all_languages = languages::all_language_specs(); + let corpus_dir = Path::new("tests/corpus"); + + for lang in all_languages { + let lang_corpus_dir = corpus_dir.join(&lang.prefix); + if !lang_corpus_dir.exists() { + continue; + } + + let mut corpus_files: Vec<_> = fs::read_dir(&lang_corpus_dir) + .unwrap_or_else(|e| { + panic!( + "Failed to read corpus directory {}: {e}", + lang_corpus_dir.display() + ) + }) + .map(|entry| entry.expect("Failed to read corpus entry").path()) + .filter(|path| path.extension().is_some_and(|ext| ext == "txt")) + .collect(); + corpus_files.sort(); + + for corpus_path in corpus_files { + let content = fs::read_to_string(&corpus_path) + .unwrap_or_else(|e| panic!("Failed to read {}: {e}", corpus_path.display())); + let mut cases = parse_corpus(&content); + assert!( + !cases.is_empty(), + "No corpus cases found in {}", + corpus_path.display() + ); 
+ + for case in &mut cases { + let actual = run_desugaring(&lang, &case.input); + if update_mode { + case.expected = actual.trim().to_string(); + } else { + assert_eq!( + case.expected.trim(), + actual.trim(), + "Corpus case failed in {}: {}", + corpus_path.display(), + case.name + ); + } + } + + if update_mode { + let updated = render_corpus(&cases); + fs::write(&corpus_path, updated).unwrap_or_else(|e| { + panic!( + "Failed to update corpus file {}: {e}", + corpus_path.display() + ) + }); + } + } + } +} From ef34a2cce78b7dc91fa4bfc2daf4bb7021975838 Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 12 May 2026 16:15:55 +0200 Subject: [PATCH 06/12] Switch to one-shot The test output becomes useless because we can't map the root node to anything at the moment --- unified/extractor/src/languages/swift/swift.rs | 9 +++++++-- unified/extractor/tests/corpus/swift/desugar.txt | 6 ++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/unified/extractor/src/languages/swift/swift.rs b/unified/extractor/src/languages/swift/swift.rs index c3843a5979c5..34fdc82b499c 100644 --- a/unified/extractor/src/languages/swift/swift.rs +++ b/unified/extractor/src/languages/swift/swift.rs @@ -1,5 +1,5 @@ use codeql_extractor::extractor::simple; -use yeast::{rule, DesugaringConfig}; +use yeast::{rule, DesugaringConfig, PhaseKind}; fn desugaring_rules() -> Vec { vec![ @@ -8,11 +8,16 @@ fn desugaring_rules() -> Vec { => (simple_identifier "blah") ), + rule!( + _ + => + (simple_identifier "not supported") + ) ] } pub fn language_spec() -> simple::LanguageSpec { - let desugar = DesugaringConfig::new().add_phase("desugar", desugaring_rules()); + let desugar = DesugaringConfig::new().add_phase("desugar", PhaseKind::OneShot, desugaring_rules()); simple::LanguageSpec { prefix: "swift", ts_language: tree_sitter_swift::LANGUAGE.into(), diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt index 1ea0e260aad2..76e6afe1fd84 100644 
--- a/unified/extractor/tests/corpus/swift/desugar.txt +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -6,8 +6,7 @@ Additive expression is desugared --- -source_file - simple_identifier "blah" +simple_identifier "not supported" === @@ -18,6 +17,5 @@ foo + bar --- -source_file - simple_identifier "blah" +simple_identifier "not supported" From 16b1eb5cddfb3951d966017e17df4b80144ccbe3 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 13:07:03 +0200 Subject: [PATCH 07/12] Add 3-section corpus test --- .../extractor/tests/corpus/swift/desugar.txt | 17 ++++- unified/extractor/tests/corpus_tests.rs | 62 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt index 76e6afe1fd84..442985dd18dd 100644 --- a/unified/extractor/tests/corpus/swift/desugar.txt +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -6,6 +6,14 @@ Additive expression is desugared --- +source_file + additive_expression + lhs: integer_literal "1" + op: + + rhs: integer_literal "2" + +--- + simple_identifier "not supported" @@ -17,5 +25,12 @@ foo + bar --- -simple_identifier "not supported" +source_file + additive_expression + lhs: simple_identifier "foo" + op: + + rhs: simple_identifier "bar" +--- + +simple_identifier "not supported" diff --git a/unified/extractor/tests/corpus_tests.rs b/unified/extractor/tests/corpus_tests.rs index ea7bf7b11ca8..587800172b8b 100644 --- a/unified/extractor/tests/corpus_tests.rs +++ b/unified/extractor/tests/corpus_tests.rs @@ -11,6 +11,7 @@ mod languages; struct CorpusCase { name: String, input: String, + raw: String, expected: String, } @@ -63,6 +64,30 @@ fn parse_corpus(content: &str) -> Vec { let input = lines[input_start..i].join("\n").trim_end().to_string(); i += 1; + // Raw tree-sitter parse section. New-format files have a second + // `---` separator between the raw tree and the mapped AST. 
Legacy + // files (with only one separator) have no raw section — in that + // case `raw` stays empty and update mode will populate it. + let raw_start = i; + let mut next_sep = i; + while next_sep < lines.len() && lines[next_sep].trim() != "---" { + if is_header_rule(lines[next_sep]) + && next_sep + 2 < lines.len() + && !lines[next_sep + 1].trim().is_empty() + && is_header_rule(lines[next_sep + 2]) + { + break; + } + next_sep += 1; + } + let raw = if next_sep < lines.len() && lines[next_sep].trim() == "---" { + let raw_text = lines[raw_start..next_sep].join("\n").trim().to_string(); + i = next_sep + 1; + raw_text + } else { + String::new() + }; + let expected_start = i; while i < lines.len() { if is_header_rule(lines[i]) @@ -79,6 +104,7 @@ fn parse_corpus(content: &str) -> Vec { cases.push(CorpusCase { name, input, + raw, expected, }); } @@ -100,6 +126,9 @@ fn render_corpus(cases: &[CorpusCase]) -> String { out.push_str(case.input.trim()); out.push_str("\n\n---\n"); out.push('\n'); + out.push_str(case.raw.trim()); + out.push_str("\n\n---\n"); + out.push('\n'); out.push_str(case.expected.trim()); out.push_str("\n\n"); } @@ -119,6 +148,20 @@ fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> String { dump_ast(&ast, ast.get_root(), input) } +/// Produce the raw tree-sitter parse tree dump for `input`, with no +/// desugaring rules applied. Uses a `Runner` with an empty phase list and +/// the input grammar's own schema. 
+fn dump_raw_parse( + lang: &simple::LanguageSpec, + input: &str, +) -> Result { + let runner = Runner::new(lang.ts_language.clone(), &[]); + let ast = runner + .run(input) + .map_err(|e| format!("Failed to parse input: {e}"))?; + Ok(dump_ast(&ast, ast.get_root(), input)) +} + #[test] fn test_corpus() { let update_mode = update_mode_enabled(); @@ -154,6 +197,25 @@ fn test_corpus() { ); for case in &mut cases { + let actual_raw = dump_raw_parse(&lang, &case.input) + .unwrap_or_else(|e| panic!( + "Raw parse failed for {} in {}: {}", + case.name, + corpus_path.display(), + e + )); + if update_mode { + case.raw = actual_raw.trim().to_string(); + } else { + assert_eq!( + case.raw.trim(), + actual_raw.trim(), + "Raw parse mismatch in {}: {}", + corpus_path.display(), + case.name + ); + } + let actual = run_desugaring(&lang, &case.input); if update_mode { case.expected = actual.trim().to_string(); From 3d20fdec05fbe1572ecfc01ac68cc03ce516048e Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 12 May 2026 16:31:08 +0200 Subject: [PATCH 08/12] Wire up type checking to corpus tests --- unified/extractor/ast_types.yml | 21 +++ unified/extractor/src/languages/mod.rs | 5 +- .../extractor/src/languages/swift/swift.rs | 21 ++- .../extractor/tests/corpus/swift/desugar.txt | 7 +- unified/extractor/tests/corpus_tests.rs | 133 +++++++++++------- 5 files changed, 131 insertions(+), 56 deletions(-) create mode 100644 unified/extractor/ast_types.yml diff --git a/unified/extractor/ast_types.yml b/unified/extractor/ast_types.yml new file mode 100644 index 000000000000..832bd1a6c447 --- /dev/null +++ b/unified/extractor/ast_types.yml @@ -0,0 +1,21 @@ +supertypes: + expr: + - identifier + - unsupported_node + + stmt: + - unsupported_node + +named: + # Top of a translated source file. The body is a list of expressions or + # statements. + top_level: + body*: [expr, stmt] + + # An identifier in expression position. + identifier: + + # A node that we don't (yet) translate. 
+ unsupported_node: + + wrong_type: diff --git a/unified/extractor/src/languages/mod.rs b/unified/extractor/src/languages/mod.rs index 4d5c945cb9b3..20ad599edfb6 100644 --- a/unified/extractor/src/languages/mod.rs +++ b/unified/extractor/src/languages/mod.rs @@ -3,6 +3,9 @@ use codeql_extractor::extractor::simple; #[path = "swift/swift.rs"] mod swift; +/// Shared YEAST output AST schema for all languages. +pub(crate) const OUTPUT_AST_SCHEMA: &str = include_str!("../../ast_types.yml"); + pub fn all_language_specs() -> Vec { - vec![swift::language_spec()] + vec![swift::language_spec(OUTPUT_AST_SCHEMA)] } diff --git a/unified/extractor/src/languages/swift/swift.rs b/unified/extractor/src/languages/swift/swift.rs index 34fdc82b499c..c7e1b7954860 100644 --- a/unified/extractor/src/languages/swift/swift.rs +++ b/unified/extractor/src/languages/swift/swift.rs @@ -1,23 +1,34 @@ use codeql_extractor::extractor::simple; use yeast::{rule, DesugaringConfig, PhaseKind}; -fn desugaring_rules() -> Vec { +/// Output AST schema. The corpus tests load this via the language spec to +/// type-check the desugared AST. 
+pub const OUTPUT_NODE_TYPES_YAML: &str = include_str!("../../../ast_types.yml"); + +fn translation_rules() -> Vec { vec![ + rule!( + (source_file (_)* @body) + => + (top_level body: {..body}) + ), rule!( (additive_expression) => - (simple_identifier "blah") + (name_expr "test") ), rule!( _ => - (simple_identifier "not supported") + (unsupported_node) ) ] } -pub fn language_spec() -> simple::LanguageSpec { - let desugar = DesugaringConfig::new().add_phase("desugar", PhaseKind::OneShot, desugaring_rules()); +pub fn language_spec(desugared_ast_schema: &'static str) -> simple::LanguageSpec { + let desugar = DesugaringConfig::new() + .add_phase("translate", PhaseKind::OneShot, translation_rules()) + .with_output_node_types_yaml(desugared_ast_schema); simple::LanguageSpec { prefix: "swift", ts_language: tree_sitter_swift::LANGUAGE.into(), diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt index 442985dd18dd..00ab759580e2 100644 --- a/unified/extractor/tests/corpus/swift/desugar.txt +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -14,8 +14,8 @@ source_file --- -simple_identifier "not supported" - +top_level + body: name_expr "test" === Another additive expression is desugared @@ -33,4 +33,5 @@ source_file --- -simple_identifier "not supported" +top_level + body: name_expr "test" diff --git a/unified/extractor/tests/corpus_tests.rs b/unified/extractor/tests/corpus_tests.rs index 587800172b8b..abe03161be5b 100644 --- a/unified/extractor/tests/corpus_tests.rs +++ b/unified/extractor/tests/corpus_tests.rs @@ -2,7 +2,7 @@ use std::fs; use std::path::Path; use codeql_extractor::extractor::simple; -use yeast::{dump::dump_ast, Runner}; +use yeast::{dump::dump_ast, dump::dump_ast_with_type_errors, Runner}; #[path = "../src/languages/mod.rs"] mod languages; @@ -117,35 +117,38 @@ fn render_corpus(cases: &[CorpusCase]) -> String { for (idx, case) in cases.iter().enumerate() { if idx > 0 { + // Blank line between 
cases. out.push('\n'); } out.push_str("===\n"); out.push_str(case.name.trim()); - out.push_str("\n===\n"); - out.push('\n'); + out.push_str("\n===\n\n"); out.push_str(case.input.trim()); - out.push_str("\n\n---\n"); - out.push('\n'); + out.push_str("\n\n---\n\n"); out.push_str(case.raw.trim()); - out.push_str("\n\n---\n"); - out.push('\n'); + out.push_str("\n\n---\n\n"); out.push_str(case.expected.trim()); - out.push_str("\n\n"); + // Single trailing newline per case; the inter-case blank line is + // added by the prefix above, and the file ends with exactly one `\n`. + out.push('\n'); } out } -fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> String { +fn run_desugaring( + lang: &simple::LanguageSpec, + input: &str, +) -> Result { let runner = match lang.desugar.as_ref() { Some(config) => Runner::from_config(lang.ts_language.clone(), config) - .expect("Failed to create yeast runner from desugaring config"), + .map_err(|e| format!("Failed to create yeast runner: {e}"))?, None => Runner::new(lang.ts_language.clone(), &[]), }; - let ast = runner + + runner .run(input) - .unwrap_or_else(|e| panic!("Failed to parse corpus input: {e}")); - dump_ast(&ast, ast.get_root(), input) + .map_err(|e| format!("Failed to parse input: {e}")) } /// Produce the raw tree-sitter parse tree dump for `input`, with no @@ -169,6 +172,12 @@ fn test_corpus() { let corpus_dir = Path::new("tests/corpus"); for lang in all_languages { + let output_schema = yeast::node_types_yaml::schema_from_yaml_with_language( + languages::OUTPUT_AST_SCHEMA, + &lang.ts_language, + ) + .expect("Failed to parse OUTPUT_AST_SCHEMA YAML"); + let lang_corpus_dir = corpus_dir.join(&lang.prefix); if !lang_corpus_dir.exists() { continue; @@ -190,6 +199,7 @@ fn test_corpus() { let content = fs::read_to_string(&corpus_path) .unwrap_or_else(|e| panic!("Failed to read {}: {e}", corpus_path.display())); let mut cases = parse_corpus(&content); + let mut failures = Vec::new(); assert!( !cases.is_empty(), "No 
corpus cases found in {}", @@ -197,47 +207,76 @@ fn test_corpus() { ); for case in &mut cases { - let actual_raw = dump_raw_parse(&lang, &case.input) - .unwrap_or_else(|e| panic!( - "Raw parse failed for {} in {}: {}", - case.name, - corpus_path.display(), - e - )); - if update_mode { - case.raw = actual_raw.trim().to_string(); - } else { - assert_eq!( - case.raw.trim(), - actual_raw.trim(), - "Raw parse mismatch in {}: {}", - corpus_path.display(), - case.name - ); + match dump_raw_parse(&lang, &case.input) { + Err(e) => { + failures.push(format!( + "Raw parse failed for {} in {}: {}", + case.name, + corpus_path.display(), + e + )); + } + Ok(actual_raw) => { + if update_mode { + case.raw = actual_raw.trim().to_string(); + } else if case.raw.trim() != actual_raw.trim() { + failures.push(format!( + "Raw parse mismatch in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}", + corpus_path.display(), + case.name, + case.raw.trim(), + actual_raw.trim() + )); + } + } } - let actual = run_desugaring(&lang, &case.input); - if update_mode { - case.expected = actual.trim().to_string(); - } else { - assert_eq!( - case.expected.trim(), - actual.trim(), - "Corpus case failed in {}: {}", - corpus_path.display(), - case.name - ); + match run_desugaring(&lang, &case.input) { + Err(e) => { + failures.push(format!( + "Desugaring failed for {} in {}: {}", + case.name, + corpus_path.display(), + e + )); + } + Ok(actual) => { + let actual_dump = dump_ast_with_type_errors( + &actual, + actual.get_root(), + &case.input, + &output_schema, + ); + if update_mode { + case.expected = actual_dump.trim().to_string(); + } else if case.expected.trim() != actual_dump.trim() { + failures.push(format!( + "Test failed in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}", + corpus_path.display(), + case.name, + case.expected.trim(), + actual_dump.trim() + )); + } + } } } + assert!( + failures.is_empty(), + "{}", + failures.join("\n\n") + "\n\n" + ); + if update_mode { let updated = render_corpus(&cases); - 
fs::write(&corpus_path, updated).unwrap_or_else(|e| { - panic!( - "Failed to update corpus file {}: {e}", - corpus_path.display() - ) - }); + let write_result = fs::write(&corpus_path, updated); + assert!( + write_result.is_ok(), + "Failed to update corpus file {}: {}", + corpus_path.display(), + write_result.err().map_or_else(String::new, |e| e.to_string()) + ); } } } From a5d5364ab289e65a2cd18beb11817118571ce163 Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 12 May 2026 16:45:39 +0200 Subject: [PATCH 09/12] Add some more output types --- unified/extractor/ast_types.yml | 139 ++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 8 deletions(-) diff --git a/unified/extractor/ast_types.yml b/unified/extractor/ast_types.yml index 832bd1a6c447..22a5e8b19fb8 100644 --- a/unified/extractor/ast_types.yml +++ b/unified/extractor/ast_types.yml @@ -1,21 +1,144 @@ supertypes: expr: - - identifier + - name_expr + - int_literal + - string_literal + - binary_expr + - unary_expr + - call_expr + - member_access_expr + - lambda_expr - unsupported_node - stmt: + - empty_stmt + - block_stmt + - expr_stmt + - if_stmt + - variable_declaration_stmt + - guard_if_stmt + - unsupported_node + condition: + - expr_condition + - let_pattern_condition + - sequence_condition + - unsupported_node + pattern: + - var_pattern + - apply_pattern + - tuple_pattern + - ignore_pattern - unsupported_node - named: - # Top of a translated source file. The body is a list of expressions or - # statements. + # Top-level is the root node, currently containing a list of expressions top_level: body*: [expr, stmt] - # An identifier in expression position. 
+ # An identifier used in the context of an expression + name_expr: + identifier: identifier + + # An integer literal + int_literal: + + # A string literal + string_literal: + + # Application of a binary operator, such as `a + b` + binary_expr: + left: expr + operator: operator + right: expr + + # Application of a unary operator, such as `!x` + unary_expr: + operand: expr + operator: operator + + # A function or method call, such as `f(x)` or `obj.m(x)`. Method calls + # are represented as a call whose `function` is a `member_access_expr`. + call_expr: + function: expr + argument*: expr + + # Member access, such as `obj.member`. + member_access_expr: + target: expr + member: identifier + + lambda_expr: + parameter*: parameter + body: [expr, stmt] + + # A parameter + parameter: + pattern: pattern + + empty_stmt: + + block_stmt: + body*: stmt + + expr_stmt: + expr: expr + + if_stmt: + condition: condition + then?: stmt + else?: stmt + + variable_declaration_stmt: + variable_declarator+: variable_declarator + + # A variable declaration, or assignment to a pattern. + # The initializer is optional (but typically only possible in combination with a simple variable pattern). + variable_declarator: + pattern: pattern + value?: expr + + # Evaluate 'condition', and if false, execute 'else' which must break from the enclosing block scope (return, break, etc). + # Any variables bound by 'condition' will be in scope for the remainder of the enclosing block scope + # (which differs from how if_stmt works). + guard_if_stmt: + condition: condition + else: stmt + + # Evaluates the given condition and interprets it as a boolean (by language conventions) + expr_condition: + expr: expr + + # A series of statements that are executed before evaluating the trailing condition. + # Useful for languages where a conditional clause may be preceded by side-effecting + # syntactic elements (e.g. binding clauses) that don't themselves form a condition. 
+ sequence_condition: + stmt*: stmt + condition: condition + + # Evaluate 'expr' and match its result against 'pattern', and return true if it matches. + # Variables bound by the pattern will be in scope within the 'true' branch controlled by this condition. + let_pattern_condition: + pattern: pattern + value: expr + + # A pattern matching anything, binding its value to the given variable + var_pattern: + identifier: identifier + + # A pattern matching anything, binding no variables, usually using the syntax "_" + ignore_pattern: + + # A pattern such as `Some(x)` where `Some` is the constructor and `x` is an argument + apply_pattern: + constructor: expr + argument*: pattern + + # A tuple pattern such as `(a, b)` in `let (a, b) = pair`. + tuple_pattern: + element*: pattern + + # An simple unqualified identifier token identifier: - # A node that we don't (yet) translate. + # A node that we don't yet translate unsupported_node: - wrong_type: + operator: From d2eba5cc07fcfdc42f3f58ba2454e77680bb2fef Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 13:55:14 +0200 Subject: [PATCH 10/12] Fix something about synthesized nodes having the wrong text --- shared/yeast/src/lib.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 159287df119e..aa813e19a68a 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -339,7 +339,18 @@ impl Ast { let content = match &node.content { NodeContent::Range(range) => source[range.start_byte..range.end_byte].to_string(), NodeContent::String(s) => s.to_string(), - NodeContent::DynamicString(s) => s.clone(), + NodeContent::DynamicString(s) if !s.is_empty() => s.clone(), + // Synthesized nodes (from rule transforms) carry an empty + // `DynamicString`; resolve them against the inherited source + // range so `#{capture}` after a translation still yields the + // original source text. 
+ NodeContent::DynamicString(_) => match node.source_range { + Some(range) => source + .get(range.start_byte..range.end_byte) + .map(|s| s.to_string()) + .unwrap_or_default(), + None => String::new(), + }, }; if fields.is_empty() { value.insert(kind, json!(content)); From fae82a47380d3af6aaa2b1182f9e99f5a07d18f5 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 14:11:16 +0200 Subject: [PATCH 11/12] Change how patterns with repetition are parsed --- shared/yeast-macros/src/parse.rs | 39 ++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 70bd46d5b6f6..3926ad381d72 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -122,10 +122,41 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { expect_punct(tokens, ':', "expected `:` after field name")?; - let child = parse_query_node(tokens)?; - fields.push(quote! { - (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) - }); + // Parse the field's pattern. To support repetition like + // `field: (kind)* @cap`, parse the atom first, then check for + // a quantifier, and lastly handle a trailing `@capture`. + let atom = parse_query_atom(tokens)?; + if peek_is_repetition(tokens) { + let rep = expect_repetition(tokens)?; + let elem = quote! { + yeast::query::QueryListElem::Repeated { + children: vec![yeast::query::QueryListElem::SingleNode(#atom)], + rep: #rep, + } + }; + let elem = maybe_wrap_list_capture(tokens, elem)?; + fields.push(quote! { + (#field_str, vec![#elem]) + }); + } else { + let child = if peek_is_at(tokens) { + tokens.next(); + let capture_name = + expect_ident(tokens, "expected capture name after @")?; + let name_str = capture_name.to_string(); + quote! { + yeast::query::QueryNode::Capture { + capture: #name_str, + node: Box::new(#atom), + } + } + } else { + atom + }; + fields.push(quote! 
{ + (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) + }); + } } else { // Bare patterns — accumulate into the implicit `child` field. // We don't break here, so we can interleave with named fields. From b1098c54119c6d7b06c8019c470a662f5e53ffc6 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 14:39:03 +0200 Subject: [PATCH 12/12] yeast-macros: merge repeated field declarations and support repetition in field patterns Two changes to parse_query_fields: - Allow `field: (kind)* @cap` (repetition + optional capture) in field position, mirroring how it works for bare children. - When the same field name is declared multiple times in a query (e.g. `condition: (foo) condition: (bar)`), merge them into a single ordered list of children rather than emitting duplicate field entries (which at runtime restart the iterator for the field and cause the second declaration to re-match from the first child). --- shared/yeast-macros/src/parse.rs | 36 +++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 3926ad381d72..4095600cf438 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -113,8 +113,24 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result { /// appear in any order; bare patterns are accumulated and emitted as a /// single `("child", ...)` entry. fn parse_query_fields(tokens: &mut Tokens) -> Result> { - let mut fields = Vec::new(); + // Accumulate per-field elems in declaration order; multiple uses of the + // same field name extend the same list (so e.g. `cond: (foo) cond: (bar)` + // matches a `cond` field whose first child is `foo` and second is `bar`). 
+ let mut field_order: Vec = Vec::new(); + let mut field_elems: std::collections::HashMap> = + std::collections::HashMap::new(); let mut bare_children: Vec = Vec::new(); + let mut push_field_elem = |order: &mut Vec, + map: &mut std::collections::HashMap>, + name: String, + elem: TokenStream| { + if !map.contains_key(&name) { + order.push(name.clone()); + map.insert(name, vec![elem]); + } else { + map.get_mut(&name).unwrap().push(elem); + } + }; while tokens.peek().is_some() { if peek_is_field(tokens) { let field_name = expect_ident(tokens, "expected field name")?; @@ -135,9 +151,7 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { } }; let elem = maybe_wrap_list_capture(tokens, elem)?; - fields.push(quote! { - (#field_str, vec![#elem]) - }); + push_field_elem(&mut field_order, &mut field_elems, field_str, elem); } else { let child = if peek_is_at(tokens) { tokens.next(); @@ -153,9 +167,10 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { } else { atom }; - fields.push(quote! { - (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) - }); + let elem = quote! { + yeast::query::QueryListElem::SingleNode(#child) + }; + push_field_elem(&mut field_order, &mut field_elems, field_str, elem); } } else { // Bare patterns — accumulate into the implicit `child` field. @@ -168,6 +183,13 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { bare_children.extend(elems); } } + let mut fields: Vec = Vec::new(); + for name in field_order { + let elems = field_elems.remove(&name).unwrap(); + fields.push(quote! { + (#name, vec![#(#elems),*]) + }); + } if !bare_children.is_empty() { fields.push(quote! { ("child", vec![#(#bare_children),*])