From ced5545caf9953890eda59fb6012fb23f1ed78f0 Mon Sep 17 00:00:00 2001 From: Asger F Date: Thu, 7 May 2026 13:55:36 +0200 Subject: [PATCH 01/12] Yeast: add reachable_node_ids() --- shared/yeast/src/lib.rs | 33 +++++++++++++++++++++++++++++++++ shared/yeast/tests/test.rs | 22 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 281f44a98b26..c06029a86260 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -193,10 +193,43 @@ impl Ast { AstCursor::new(self) } + /// Return all nodes currently allocated in the AST arena. + /// + /// This includes nodes that are no longer reachable from `get_root()` + /// after desugaring rewrites. Use `reachable_node_ids()` for output-level + /// validation/traversal semantics. pub fn nodes(&self) -> &[Node] { &self.nodes } + /// Return node ids reachable from `get_root()` by following child edges. + /// + /// This reflects the effective AST after desugaring and excludes orphaned + /// arena nodes left behind by rewrite operations. 
+ pub fn reachable_node_ids(&self) -> Vec { + let mut reachable = Vec::new(); + let mut stack = vec![self.root]; + let mut seen = vec![false; self.nodes.len()]; + + while let Some(id) = stack.pop() { + if id >= self.nodes.len() || seen[id] { + continue; + } + seen[id] = true; + reachable.push(id); + + if let Some(node) = self.get_node(id) { + for children in node.fields.values() { + for &child in children { + stack.push(child); + } + } + } + } + + reachable + } + pub fn get_root(&self) -> Id { self.root } diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index ed4202493a46..e058e6b1eb07 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -166,6 +166,28 @@ fn test_query_no_match() { assert!(!matched); } +#[test] +fn test_reachable_nodes_excludes_orphaned_rewrite_nodes() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang) + .unwrap(); + let rules = vec![yeast::rule!((integer) => (identifier "replaced"))]; + let runner = Runner::with_schema(lang, &schema, &rules); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let reachable_ids = ast.reachable_node_ids(); + + assert!( + ast.nodes().len() > reachable_ids.len(), + "expected rewrite to leave orphaned arena nodes" + ); + + let dump = dump_ast(&ast, ast.get_root(), input); + assert!(dump.contains("identifier \"replaced\"")); + assert!(!dump.contains("integer \"1\"")); +} + #[test] fn test_query_repeated_capture() { let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); From 5b8cd37e910e31e493307983ea30355d0f50803d Mon Sep 17 00:00:00 2001 From: Asger F Date: Thu, 7 May 2026 22:50:30 +0200 Subject: [PATCH 02/12] Yeast: add type-checking errors in AST dump --- shared/yeast/src/dump.rs | 201 +++++++++++++++++++++++++++- shared/yeast/src/node_types_yaml.rs | 123 +++++++++++------ shared/yeast/src/schema.rs | 76 ++++++++++- 
shared/yeast/tests/test.rs | 110 ++++++++++++++- 4 files changed, 459 insertions(+), 51 deletions(-) diff --git a/shared/yeast/src/dump.rs b/shared/yeast/src/dump.rs index 99ba019cc3ea..07ee134e058c 100644 --- a/shared/yeast/src/dump.rs +++ b/shared/yeast/src/dump.rs @@ -1,6 +1,6 @@ use std::fmt::Write; -use crate::{Ast, Node, NodeContent, CHILD_FIELD}; +use crate::{schema::Schema, Ast, Node, NodeContent, CHILD_FIELD}; /// Options for controlling AST dump output. pub struct DumpOptions { @@ -45,16 +45,143 @@ pub fn dump_ast_with_options( options: &DumpOptions, ) -> String { let mut out = String::new(); - dump_node(ast, root, source, options, 0, &mut out); + dump_node(ast, root, source, options, 0, None, &mut out); out } +/// Dump an AST and annotate type mismatches against a schema inline. +/// +/// Any node that does not match the expected type set for its parent field is +/// rendered with a trailing `" <-- ERROR: ..."` annotation on the same line. +pub fn dump_ast_with_type_errors( + ast: &Ast, + root: usize, + source: &str, + schema: &Schema, +) -> String { + dump_ast_with_type_errors_and_options(ast, root, source, schema, &DumpOptions::default()) +} + +/// Dump an AST and annotate type mismatches against a schema inline. +/// +/// Any node that does not match the expected type set for its parent field is +/// rendered with a trailing `" <-- ERROR: ..."` annotation on the same line. 
+pub fn dump_ast_with_type_errors_and_options( + ast: &Ast, + root: usize, + source: &str, + schema: &Schema, + options: &DumpOptions, +) -> String { + let mut out = String::new(); + dump_node(ast, root, source, options, 0, Some((schema, None, None)), &mut out); + out +} + +fn format_node_types(node_types: &[crate::schema::NodeType]) -> String { + node_types + .iter() + .map(|t| { + if t.named { + t.kind.clone() + } else { + format!("\"{}\"", t.kind) + } + }) + .collect::>() + .join(" | ") +} + +const EMPTY_NODE_TYPES: &[crate::schema::NodeType] = &[]; + +/// Generate a type-checking error message for a node if it doesn't match expected types. +/// +/// # Arguments +/// - `schema`: The AST schema to validate against. +/// - `node`: The node being checked. +/// - `expected`: The set of allowed types for this node, or `None` if type-checking is disabled. +/// - `parent_field`: Optional tuple of (parent_kind, field_name) for context in error messages. +/// +/// # Returns +/// `Some(error_message)` if the node violates the schema (e.g., wrong kind, missing field declaration). +/// `None` if the node matches the expected types or if type-checking is disabled. 
+fn type_error_for_node( + schema: &Schema, + node: &Node, + expected: Option<&[crate::schema::NodeType]>, + parent_field: Option<(&str, &str)>, +) -> Option { + if schema.id_for_node_kind(node.kind_name()).is_none() + && schema.id_for_unnamed_node_kind(node.kind_name()).is_none() + { + return Some(format!("node kind '{}' not in schema", node.kind_name())); + } + + let expected = expected?; + if expected.is_empty() { + if let Some((kind, field)) = parent_field { + return Some(format!("the node '{kind}' has no field '{field}'")); + } + return Some("field not declared in schema for this parent node".to_string()); + } + if schema.node_matches_types(node.kind_name(), node.is_named(), expected) { + None + } else { + let actual = if node.is_named() { + node.kind_name().to_string() + } else { + format!("\"{}\"", node.kind_name()) + }; + + if let Some((kind, field)) = parent_field { + Some(format!( + "The field {}.{} should contain {}, but got {}", + kind, + field, + format_node_types(expected), + actual + )) + } else { + Some(format!( + "expected {}, got {}", + format_node_types(expected), + actual + )) + } + } +} + +/// Look up the allowed types for a field in the schema. +/// +/// # Arguments +/// - `schema`: The AST schema to query. +/// - `parent_kind`: The node kind of the parent that contains this field. +/// - `field_id`: The field ID within that parent node. +/// +/// # Returns +/// `Some(&[NodeType])` if the field is declared in the schema and has type constraints. +/// `None` if the field is not declared or has no constraints (undeclared field). 
+fn expected_for_field<'a>( + schema: &'a Schema, + parent_kind: &str, + field_id: u16, +) -> Option<&'a [crate::schema::NodeType]> { + schema + .field_types(parent_kind, field_id) + .map(|v| v.as_slice()) +} + fn dump_node( ast: &Ast, id: usize, source: &str, options: &DumpOptions, indent: usize, + type_check: Option<( + &Schema, + Option<&[crate::schema::NodeType]>, + Option<(&str, &str)>, + )>, out: &mut String, ) { let node = match ast.get_node(id) { @@ -90,6 +217,12 @@ fn dump_node( } } + if let Some((schema, expected, parent_field)) = type_check { + if let Some(err) = type_error_for_node(schema, node, expected, parent_field) { + write!(out, " <-- ERROR: {err}").unwrap(); + } + } + writeln!(out).unwrap(); // Named fields first @@ -98,31 +231,68 @@ fn dump_node( continue; // Handle unnamed children last } let field_name = ast.field_name_for_id(field_id).unwrap_or("?"); + let child_type_check = type_check.map(|(schema, _, _)| { + let expected = expected_for_field(schema, node.kind_name(), field_id) + .or(Some(EMPTY_NODE_TYPES)); + let parent_field = Some((node.kind_name(), field_name)); + (schema, expected, parent_field) + }); + if children.len() == 1 { write!(out, "{prefix} {field_name}:").unwrap(); // Inline single child let child = ast.get_node(children[0]); if child.is_some_and(is_leaf) { write!(out, " ").unwrap(); - dump_node_inline(ast, children[0], source, options, out); + dump_node_inline(ast, children[0], source, options, child_type_check, out); } else { writeln!(out).unwrap(); - dump_node(ast, children[0], source, options, indent + 2, out); + dump_node( + ast, + children[0], + source, + options, + indent + 2, + child_type_check, + out, + ); } } else { writeln!(out, "{prefix} {field_name}:").unwrap(); for &child_id in children { - dump_node(ast, child_id, source, options, indent + 2, out); + dump_node( + ast, + child_id, + source, + options, + indent + 2, + child_type_check, + out, + ); } } } // Unnamed children — skip unnamed tokens (keywords, 
punctuation) if let Some(children) = node.fields.get(&CHILD_FIELD) { + let child_type_check = type_check.map(|(schema, _, _)| { + let expected = expected_for_field(schema, node.kind_name(), CHILD_FIELD) + .or(Some(EMPTY_NODE_TYPES)); + let parent_field = Some((node.kind_name(), "children")); + (schema, expected, parent_field) + }); for &child_id in children { if let Some(child) = ast.get_node(child_id) { if child.is_named() { - dump_node(ast, child_id, source, options, indent + 1, out); + dump_node( + ast, + child_id, + source, + options, + indent + 1, + child_type_check, + out, + ); } } } @@ -130,7 +300,18 @@ fn dump_node( } /// Dump a leaf node inline (no newline prefix, caller provides context). -fn dump_node_inline(ast: &Ast, id: usize, source: &str, options: &DumpOptions, out: &mut String) { +fn dump_node_inline( + ast: &Ast, + id: usize, + source: &str, + options: &DumpOptions, + type_check: Option<( + &Schema, + Option<&[crate::schema::NodeType]>, + Option<(&str, &str)>, + )>, + out: &mut String, +) { let node = match ast.get_node(id) { Some(n) => n, None => return, @@ -159,6 +340,12 @@ fn dump_node_inline(ast: &Ast, id: usize, source: &str, options: &DumpOptions, o } } + if let Some((schema, expected, parent_field)) = type_check { + if let Some(err) = type_error_for_node(schema, node, expected, parent_field) { + write!(out, " <-- ERROR: {err}").unwrap(); + } + } + writeln!(out).unwrap(); } diff --git a/shared/yeast/src/node_types_yaml.rs b/shared/yeast/src/node_types_yaml.rs index d321ba8a2cf0..eb191076be48 100644 --- a/shared/yeast/src/node_types_yaml.rs +++ b/shared/yeast/src/node_types_yaml.rs @@ -23,6 +23,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::fmt::Write; +use crate::CHILD_FIELD; use serde::Deserialize; use serde_json::json; @@ -100,32 +101,38 @@ fn parse_field_name(raw: &str) -> FieldSpec { /// Resolve a TypeRef to a (type, named) pair, given the sets of known named /// and unnamed types. 
-fn resolve_type_ref( +fn resolve_type_ref_pair( type_ref: &TypeRef, named_types: &BTreeSet, unnamed_types: &BTreeSet, -) -> serde_json::Value { +) -> (String, bool) { match type_ref { - TypeRef::Explicit { unnamed } => { - json!({"type": unnamed, "named": false}) - } + TypeRef::Explicit { unnamed } => (unnamed.clone(), false), TypeRef::Name(name) => { let is_named = named_types.contains(name); let is_unnamed = unnamed_types.contains(name); - if is_named && is_unnamed { - // Ambiguous: default to named - json!({"type": name, "named": true}) + (name.clone(), true) } else if is_unnamed { - json!({"type": name, "named": false}) + (name.clone(), false) } else { - // Named, or unknown (assume named) - json!({"type": name, "named": true}) + (name.clone(), true) } } } } +/// Resolve a TypeRef to a {type, named} JSON record, given the sets of known named +/// and unnamed types. +fn resolve_type_ref( + type_ref: &TypeRef, + named_types: &BTreeSet, + unnamed_types: &BTreeSet, +) -> serde_json::Value { + let (kind, named) = resolve_type_ref_pair(type_ref, named_types, unnamed_types); + json!({"type": kind, "named": named}) +} + /// Convert YAML string to node-types JSON string. pub fn convert(yaml_input: &str) -> Result { let yaml: YamlNodeTypes = @@ -233,14 +240,12 @@ pub fn convert(yaml_input: &str) -> Result { serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}")) } -/// Build a Schema from a YAML node-types string. -/// Registers all node kinds and field names found in the YAML. -pub fn schema_from_yaml(yaml_input: &str) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - let mut schema = crate::schema::Schema::new(); - +/// Apply YAML node-type definitions to a mutable Schema. +/// Registers all types, fields, and allowed types from the YAML into the schema. 
+fn apply_yaml_to_schema( + yaml: &YamlNodeTypes, + schema: &mut crate::schema::Schema, +) { // Register all supertypes as node kinds for name in yaml.supertypes.keys() { schema.register_kind(name); @@ -264,6 +269,62 @@ pub fn schema_from_yaml(yaml_input: &str) -> Result = yaml.unnamed.iter().cloned().collect(); + + for (supertype, members) in &yaml.supertypes { + let node_types = members + .iter() + .map(|m| { + let (kind, named) = resolve_type_ref_pair(m, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect(); + schema.set_supertype_members(supertype, node_types); + } + + // Register allowed field child types for type checking. + for (parent_kind, fields_opt) in &yaml.named { + let Some(fields) = fields_opt else { + continue; + }; + + for (raw_field_name, type_refs) in fields { + let spec = parse_field_name(raw_field_name); + let field_id = match &spec.name { + Some(name) => schema.register_field(name), + None => CHILD_FIELD, + }; + + let mut node_types = type_refs + .clone() + .into_vec() + .into_iter() + .map(|type_ref| { + let (kind, named) = resolve_type_ref_pair(&type_ref, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect::>(); + node_types.sort_by(|a, b| a.kind.cmp(&b.kind).then(a.named.cmp(&b.named))); + node_types.dedup_by(|a, b| a.kind == b.kind && a.named == b.named); + schema.set_field_types(parent_kind, field_id, node_types); + } + } +} + +pub fn schema_from_yaml(yaml_input: &str) -> Result { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + + let mut schema = crate::schema::Schema::new(); + apply_yaml_to_schema(&yaml, &mut schema); + Ok(schema) } @@ -278,29 +339,7 @@ pub fn schema_from_yaml_with_language( serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; let mut schema = crate::schema::Schema::from_language(language); - - // Register supertypes - for name in 
yaml.supertypes.keys() { - schema.register_kind(name); - } - - // Register named node kinds and their fields - for (name, fields_opt) in &yaml.named { - schema.register_kind(name); - if let Some(fields) = fields_opt { - for raw_field_name in fields.keys() { - let spec = parse_field_name(raw_field_name); - if let Some(field_name) = &spec.name { - schema.register_field(field_name); - } - } - } - } - - // Register unnamed tokens - for name in &yaml.unnamed { - schema.register_unnamed_kind(name); - } + apply_yaml_to_schema(&yaml, &mut schema); Ok(schema) } diff --git a/shared/yeast/src/schema.rs b/shared/yeast/src/schema.rs index 12554d9c8692..c832a57b23ad 100644 --- a/shared/yeast/src/schema.rs +++ b/shared/yeast/src/schema.rs @@ -1,7 +1,13 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use crate::{FieldId, KindId, CHILD_FIELD}; +#[derive(Clone, Debug)] +pub struct NodeType { + pub kind: String, + pub named: bool, +} + /// A schema defining node kinds and field names for the output AST. /// Built from a node-types.yml file, independent of any tree-sitter grammar. 
/// @@ -25,6 +31,8 @@ pub struct Schema { unnamed_kind_ids: BTreeMap, kind_names: BTreeMap, next_kind_id: KindId, + field_types: BTreeMap<(String, FieldId), Vec>, + supertypes: BTreeMap>, } impl Default for Schema { @@ -43,6 +51,8 @@ impl Schema { unnamed_kind_ids: BTreeMap::new(), kind_names: BTreeMap::new(), next_kind_id: 1, // 0 is reserved + field_types: BTreeMap::new(), + supertypes: BTreeMap::new(), } } @@ -166,4 +176,68 @@ impl Schema { pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> { self.kind_names.get(&id).copied() } + + pub fn set_field_types( + &mut self, + parent_kind: &str, + field_id: FieldId, + node_types: Vec, + ) { + self.field_types + .insert((parent_kind.to_string(), field_id), node_types); + } + + pub fn field_types( + &self, + parent_kind: &str, + field_id: FieldId, + ) -> Option<&Vec> { + self.field_types + .get(&(parent_kind.to_string(), field_id)) + } + + pub fn set_supertype_members(&mut self, supertype: &str, node_types: Vec) { + self.supertypes.insert(supertype.to_string(), node_types); + } + + fn allows_node( + &self, + node_type: &NodeType, + node_kind: &str, + node_named: bool, + active: &mut BTreeSet, + ) -> bool { + if node_type.kind == node_kind && node_type.named == node_named { + return true; + } + + if !node_type.named { + return false; + } + + let Some(members) = self.supertypes.get(&node_type.kind) else { + return false; + }; + + if !active.insert(node_type.kind.clone()) { + return false; + } + + let matched = members + .iter() + .any(|member| self.allows_node(member, node_kind, node_named, active)); + active.remove(&node_type.kind); + matched + } + + pub fn node_matches_types( + &self, + node_kind: &str, + node_named: bool, + node_types: &[NodeType], + ) -> bool { + node_types.iter().any(|node_type| { + self.allows_node(node_type, node_kind, node_named, &mut BTreeSet::new()) + }) + } } diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index e058e6b1eb07..05fd19981656 100644 --- 
a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -1,6 +1,6 @@ #![cfg(test)] -use yeast::dump::dump_ast; +use yeast::dump::{dump_ast, dump_ast_with_type_errors}; use yeast::*; const OUTPUT_SCHEMA_YAML: &str = include_str!("node-types.yml"); @@ -42,6 +42,35 @@ fn run_and_get_error(input: &str, rules: Vec) -> String { .expect_err("expected runner to return an error") } +/// Helper: parse Ruby source with no rules and dump with schema type errors. +fn parse_and_dump_typed(input: &str, schema_yaml: &str) -> String { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run(input).unwrap(); + let schema = yeast::node_types_yaml::schema_from_yaml(schema_yaml).unwrap(); + dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) +} + +/// Helper: parse Ruby source with no rules and dump with schema type errors, +/// building schema with language IDs so field checks align with parser fields. +fn parse_and_dump_typed_with_language(input: &str, schema_yaml: &str) -> String { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let runner = Runner::new(lang.clone(), &[]); + let ast = runner.run(input).unwrap(); + let schema = yeast::node_types_yaml::schema_from_yaml_with_language(schema_yaml, &lang) + .unwrap(); + dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) +} + +/// Helper: parse Ruby source with custom rules and dump with schema type errors. 
+fn run_and_dump_typed(input: &str, rules: Vec, schema_yaml: &str) -> String { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = yeast::node_types_yaml::schema_from_yaml(schema_yaml).unwrap(); + let phases = vec![Phase::new("test", rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + let ast = runner.run(input).unwrap(); + dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) +} + /// Assert that a dump equals the expected string, treating the expected /// string as an indented multiline literal: leading/trailing blank lines /// are stripped, and the common leading indentation is removed from every @@ -125,6 +154,85 @@ fn test_parse_for_loop() { ); } +#[test] +fn test_dump_highlights_type_errors_inline() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + right: identifier + identifier: +"#; + + let dump = parse_and_dump_typed("x = 1", schema_yaml); + assert!(dump.contains("integer \"1\" <-- ERROR:")); +} + +#[test] +fn test_dump_reports_preserved_unknown_kind_after_transformation() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + right: identifier + identifier: +"#; + + // This rewrite runs and preserves the RHS node kind via capture. + // With schema above, preserving `integer` should be reported inline. 
+ let rules = vec![yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + (assignment + left: {left} + right: {right} + ) + )]; + + let dump = run_and_dump_typed("x = 1", rules, schema_yaml); + assert!(dump.contains("integer \"1\" <-- ERROR:")); + assert!(dump.contains("node kind 'integer' not in schema")); +} + +#[test] +fn test_dump_reports_undeclared_field_on_node() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + identifier: +"#; + + let dump = parse_and_dump_typed_with_language("x = y", schema_yaml); + assert!(dump.contains("right: identifier \"y\" <-- ERROR:")); + assert!(dump.contains("the node 'assignment' has no field 'right'")); +} + +#[test] +fn test_dump_reports_disallowed_kind_in_field_type() { + let schema_yaml = r#" +named: + program: + $children*: assignment + assignment: + left: identifier + right: identifier + identifier: + integer: +"#; + + let dump = parse_and_dump_typed_with_language("x = 1", schema_yaml); + assert!(dump.contains("right: integer \"1\" <-- ERROR:")); + assert!(dump.contains("should contain")); + assert!(dump.contains("but got integer")); +} + // ---- Query tests ---- #[test] From 0b59c41e5ffa0b4cd62c405b9675298c519a8f52 Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 8 May 2026 12:02:56 +0200 Subject: [PATCH 03/12] Yeast: Add one-shot phase kind --- shared/yeast/doc/yeast.md | 13 ++- shared/yeast/src/captures.rs | 15 +++ shared/yeast/src/lib.rs | 138 ++++++++++++++++++++++---- shared/yeast/tests/test.rs | 186 +++++++++++++++++++++++++++++++++-- 4 files changed, 323 insertions(+), 29 deletions(-) diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index 893cdea24dde..df5085272338 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -349,8 +349,8 @@ to enable rewriting: ```rust let desugar = yeast::DesugaringConfig::new() - .add_phase("cleanup", cleanup_rules()) - .add_phase("desugar", desugar_rules()) + 
.add_phase("cleanup", yeast::PhaseKind::Repeating, cleanup_rules()) + .add_phase("translate", yeast::PhaseKind::OneShot, translate_rules()) .with_output_node_types_yaml(include_str!("output-node-types.yml")); let lang = simple::LanguageSpec { @@ -365,6 +365,15 @@ let lang = simple::LanguageSpec { A single-phase config is just `.add_phase(...)` called once. Phase names appear in error messages so you can tell which phase failed. +There are two kinds of phases: +- **Repeating**: + Each node is re-processed until none of the rules in the phase matches. + When a node no longer matches any rules, its children are recursively processed. In practice this is used to desugar or simplify an AST, while staying mostly within the same schema. +- **One-shot**: + Each node is processed by the first matching rule, and the rewrite fails if no rule matches. + Rules are then recursively applied to every captured node. + In practice this is used when translating from one AST schema to another, where an exhaustive match is required. + The same YAML node-types is used for both the runtime yeast `Schema` (so rules can refer to output-only kinds and fields) and TRAP validation (it is converted to JSON internally). diff --git a/shared/yeast/src/captures.rs b/shared/yeast/src/captures.rs index a92c5096e94e..ef184cd9f69a 100644 --- a/shared/yeast/src/captures.rs +++ b/shared/yeast/src/captures.rs @@ -61,6 +61,21 @@ impl Captures { } } } + + /// Apply a fallible function to every captured id (across all keys), + /// replacing each id with the result. Stops and returns the error on + /// the first failure. 
+ pub fn try_map_all_captures( + &mut self, + mut f: impl FnMut(Id) -> Result, + ) -> Result<(), E> { + for ids in self.captures.values_mut() { + for id in ids { + *id = f(*id)?; + } + } + Ok(()) + } pub fn map_captures_to(&mut self, from: &str, to: &'static str, f: &mut impl FnMut(Id) -> Id) { if let Some(from_ids) = self.captures.get(from) { let new_values = from_ids.iter().copied().map(f).collect(); diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index c06029a86260..d45fe4980dfd 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -526,18 +526,39 @@ impl Rule { node: Id, fresh: &tree_builder::FreshScope, ) -> Result>, String> { + match self.try_match(ast, node)? { + Some(captures) => Ok(Some(self.run_transform(ast, captures, node, fresh))), + None => Ok(None), + } + } + + /// Attempt to match this rule's query against `node`, returning the + /// resulting captures on success. Does not invoke the transform. + fn try_match(&self, ast: &Ast, node: Id) -> Result, String> { let mut captures = Captures::new(); if self.query.do_match(ast, node, &mut captures)? { - fresh.next_scope(); - let source_range = ast.get_node(node).and_then(|n| match n.content { - NodeContent::Range(r) => Some(r), - _ => n.source_range, - }); - Ok(Some((self.transform)(ast, captures, fresh, source_range))) + Ok(Some(captures)) } else { Ok(None) } } + + /// Run this rule's transform with the given captures, using `node`'s + /// source range as the source range of the produced nodes. 
+ fn run_transform( + &self, + ast: &mut Ast, + captures: Captures, + node: Id, + fresh: &tree_builder::FreshScope, + ) -> Vec { + fresh.next_scope(); + let source_range = ast.get_node(node).and_then(|n| match n.content { + NodeContent::Range(r) => Some(r), + _ => n.source_range, + }); + (self.transform)(ast, captures, fresh, source_range) + } } const MAX_REWRITE_DEPTH: usize = 100; @@ -572,17 +593,17 @@ impl<'a> RuleIndex<'a> { } } -fn apply_rules( +fn apply_repeating_rules( rules: &[Rule], ast: &mut Ast, id: Id, fresh: &tree_builder::FreshScope, ) -> Result, String> { let index = RuleIndex::new(rules); - apply_rules_inner(&index, ast, id, fresh, 0, None) + apply_repeating_rules_inner(&index, ast, id, fresh, 0, None) } -fn apply_rules_inner( +fn apply_repeating_rules_inner( index: &RuleIndex, ast: &mut Ast, id: Id, @@ -611,7 +632,7 @@ fn apply_rules_inner( let next_skip = if rule.repeated { None } else { Some(rule_ptr) }; let mut results = Vec::new(); for node in result_node { - results.extend(apply_rules_inner( + results.extend(apply_repeating_rules_inner( index, ast, node, @@ -636,7 +657,7 @@ fn apply_rules_inner( for children in fields.values_mut() { let mut new_children: Option> = None; for (i, &child_id) in children.iter().enumerate() { - let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?; + let result = apply_repeating_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?; let unchanged = result.len() == 1 && result[0] == child_id; match (&mut new_children, unchanged) { (None, true) => {} // unchanged so far, no allocation needed @@ -661,6 +682,75 @@ fn apply_rules_inner( Ok(vec![id]) } +/// Apply rules using `OneShot` semantics: the first matching rule fires on +/// each visited node, recursion proceeds only through captured nodes (not +/// through the input node's children directly), and an error is returned if +/// no rule matches a visited node. 
+fn apply_one_shot_rules( + rules: &[Rule], + ast: &mut Ast, + id: Id, + fresh: &tree_builder::FreshScope, +) -> Result, String> { + let index = RuleIndex::new(rules); + apply_one_shot_rules_inner(&index, ast, id, fresh, 0) +} + +fn apply_one_shot_rules_inner( + index: &RuleIndex, + ast: &mut Ast, + id: Id, + fresh: &tree_builder::FreshScope, + rewrite_depth: usize, +) -> Result, String> { + if rewrite_depth > MAX_REWRITE_DEPTH { + return Err(format!( + "Desugaring exceeded maximum rewrite depth ({MAX_REWRITE_DEPTH}). \ + This likely indicates a non-terminating rule cycle." + )); + } + + let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or(""); + for rule in index.rules_for_kind(node_kind) { + if let Some(mut captures) = rule.try_match(ast, id)? { + // Recursively translate every captured node before invoking the + // transform. The transform's output uses output-schema kinds, so + // we must translate captured input-schema nodes to their + // output-schema equivalents first. + captures.try_map_all_captures(|captured_id| { + let result = + apply_one_shot_rules_inner(index, ast, captured_id, fresh, rewrite_depth + 1)?; + if result.len() != 1 { + return Err(format!( + "OneShot: recursion on captured node produced {} results, expected exactly 1", + result.len() + )); + } + Ok(result[0]) + })?; + return Ok(rule.run_transform(ast, captures, id, fresh)); + } + } + + Err(format!( + "OneShot: no rule matched node of kind '{node_kind}'" + )) +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum PhaseKind { + /// A node is re-processed until none of the rules in the phase matches, + /// albeit a single rule cannot be applied twice in a row unless that rule is also marked as repeating. + /// When a node no longer matches any rules, its children are recursively processed (top down). + Repeating, + + /// A node is processed by the first matching rule, and the rewrite fails if no rule matches. + /// Rules are then recursively applied to every captured node. 
+ /// In practice this is used when translating from one AST schema to another, where every node must be rewritten, + /// and it would be a type error to match the rule patterns (based on the input schema) against the output nodes (which conform to the output schema). + OneShot, +} + /// One phase of a desugaring pass: a named bundle of rules that runs to /// completion (a full traversal applying its rules) before the next phase /// starts. Rules within a phase compete for matches as usual; rules in @@ -670,13 +760,15 @@ pub struct Phase { /// Name used in error messages. pub name: String, pub rules: Vec, + pub kind: PhaseKind, } impl Phase { - pub fn new(name: impl Into, rules: Vec) -> Self { + pub fn new(name: impl Into, kind: PhaseKind, rules: Vec) -> Self { Self { name: name.into(), rules, + kind, } } } @@ -694,8 +786,8 @@ impl Phase { /// /// ```ignore /// let config = yeast::DesugaringConfig::new() -/// .add_phase("cleanup", cleanup_rules) -/// .add_phase("desugar", desugar_rules) +/// .add_phase("cleanup", PhaseKind::Repeating, cleanup_rules) +/// .add_phase("desugar", PhaseKind::Repeating, desugar_rules) /// .with_output_node_types_yaml(yaml); /// ``` #[derive(Default)] @@ -715,9 +807,14 @@ impl DesugaringConfig { Self::default() } - /// Append a new phase with the given name and rules. - pub fn add_phase(mut self, name: impl Into, rules: Vec) -> Self { - self.phases.push(Phase::new(name, rules)); + /// Append a new phase with the given name, kind, and rules. 
+ pub fn add_phase( + mut self, + name: impl Into, + kind: PhaseKind, + rules: Vec, + ) -> Self { + self.phases.push(Phase::new(name, kind, rules)); self } @@ -806,8 +903,11 @@ impl<'a> Runner<'a> { let fresh = tree_builder::FreshScope::new(); let mut root = ast.get_root(); for phase in self.phases { - let res = apply_rules(&phase.rules, ast, root, &fresh) - .map_err(|e| format!("Phase `{}`: {e}", phase.name))?; + let res = match phase.kind { + PhaseKind::Repeating => apply_repeating_rules(&phase.rules, ast, root, &fresh), + PhaseKind::OneShot => apply_one_shot_rules(&phase.rules, ast, root, &fresh), + } + .map_err(|e| format!("Phase `{}`: {e}", phase.name))?; if res.len() != 1 { return Err(format!( "Phase `{}`: expected exactly one result node, got {}", diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index 05fd19981656..5e5c97c9ccb2 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -15,7 +15,7 @@ fn parse_and_dump(input: &str) -> String { /// Helper: parse Ruby source with a custom output schema and a single /// phase of rules, return dump. 
fn run_and_dump(input: &str, rules: Vec) -> String { - run_phased_and_dump(input, vec![Phase::new("test", rules)]) + run_phased_and_dump(input, vec![Phase::new("test", PhaseKind::Repeating, rules)]) } /// Helper: parse Ruby source with a custom output schema and multiple @@ -35,7 +35,7 @@ fn run_and_get_error(input: &str, rules: Vec) -> String { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); - let phases = vec![Phase::new("test", rules)]; + let phases = vec![Phase::new("test", PhaseKind::Repeating, rules)]; let runner = Runner::with_schema(lang, &schema, &phases); runner .run(input) @@ -65,7 +65,7 @@ fn parse_and_dump_typed_with_language(input: &str, schema_yaml: &str) -> String fn run_and_dump_typed(input: &str, rules: Vec, schema_yaml: &str) -> String { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml(schema_yaml).unwrap(); - let phases = vec![Phase::new("test", rules)]; + let phases = vec![Phase::new("test", PhaseKind::Repeating, rules)]; let runner = Runner::with_schema(lang, &schema, &phases); let ast = runner.run(input).unwrap(); dump_ast_with_type_errors(&ast, ast.get_root(), input, &schema) @@ -279,8 +279,12 @@ fn test_reachable_nodes_excludes_orphaned_rewrite_nodes() { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang) .unwrap(); - let rules = vec![yeast::rule!((integer) => (identifier "replaced"))]; - let runner = Runner::with_schema(lang, &schema, &rules); + let phases = vec![Phase::new( + "test", + PhaseKind::Repeating, + vec![yeast::rule!((integer) => (identifier "replaced"))], + )]; + let runner = Runner::with_schema(lang, &schema, &phases); let input = "x = 1"; let ast = runner.run(input).unwrap(); @@ -783,8 +787,8 @@ fn 
test_phased_desugaring() { let dump = run_phased_and_dump( "x = 1", vec![ - Phase::new("cleanup", cleanup), - Phase::new("desugar", desugar), + Phase::new("cleanup", PhaseKind::Repeating, cleanup), + Phase::new("desugar", PhaseKind::Repeating, desugar), ], ); assert_dump_eq( @@ -805,7 +809,11 @@ fn test_phase_error_includes_phase_name() { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); - let phases = vec![Phase::new("buggy", vec![swap_assignment_rule().repeated()])]; + let phases = vec![Phase::new( + "buggy", + PhaseKind::Repeating, + vec![swap_assignment_rule().repeated()], + )]; let runner = Runner::with_schema(lang, &schema, &phases); let err = runner .run("x = 1") @@ -820,6 +828,168 @@ fn test_phase_error_includes_phase_name() { ); } +/// Helper: an exhaustive set of OneShot rules covering every node reachable +/// (via captures) when translating `"x = 1"`. +fn one_shot_xeq1_rules() -> Vec { + vec![ + yeast::rule!( + (program (_)* @stmts) + => + (program stmt: {..stmts}) + ), + yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + (first_node left: {left} right: {right}) + ), + yeast::rule!((identifier) => (identifier "ID")), + yeast::rule!((integer) => (integer "INT")), + ] +} + +#[test] +fn test_one_shot_phase() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let phases = vec![Phase::new( + "translate", + PhaseKind::OneShot, + one_shot_xeq1_rules(), + )]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let dump = dump_ast(&ast, ast.get_root(), input); + assert_dump_eq( + &dump, + r#" + program + stmt: + first_node + left: identifier "ID" + right: integer "INT" + "#, + ); +} + +#[test] +fn 
test_one_shot_phase_errors_when_no_rule_matches() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + // Drop the `integer` rule so the recursion has no rule for `integer`. + let mut rules = one_shot_xeq1_rules(); + rules.pop(); + let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let err = runner + .run("x = 1") + .expect_err("expected OneShot to error on unmatched node"); + assert!( + err.contains("Phase `translate`"), + "error should name the phase, got: {err}" + ); + assert!( + err.contains("no rule matched") && err.contains("integer"), + "error should describe the unmatched node kind, got: {err}" + ); +} + +/// OneShot recursion must apply rules to *captured* nodes, even if the rule +/// returns a captured child verbatim. A buggy implementation that only +/// recurses into the children of the rule's output (rather than into the +/// captures) would leave the returned capture untransformed. +#[test] +fn test_one_shot_recurses_into_returned_capture() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let rules = vec![ + yeast::rule!( + (program (_)* @stmts) + => + (program stmt: {..stmts}) + ), + // Returns the captured `left` verbatim, discarding `right`. 
+ yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + {left} + ), + yeast::rule!((identifier) => (identifier "ID")), + yeast::rule!((integer) => (integer "INT")), + ]; + let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let dump = dump_ast(&ast, ast.get_root(), input); + // `left` is an `identifier`; OneShot must apply the identifier rule to + // it before the assignment transform returns it verbatim. + assert_dump_eq( + &dump, + r#" + program + stmt: identifier "ID" + "#, + ); +} + +/// OneShot recursion must NOT descend into the children of the rule's output. +/// A rule may legitimately wrap a captured node in fresh output-schema nodes +/// that have no matching rule of their own (since rule patterns target the +/// input schema). Recursing into the output would erroneously try to find +/// rules for those wrapper kinds and fail. +#[test] +fn test_one_shot_does_not_recurse_into_wrapper_output() { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let rules = vec![ + yeast::rule!( + (program (_)* @stmts) + => + (program stmt: {..stmts}) + ), + // Wraps `left` in nested `first_node`/`second_node` output kinds. + // Neither wrapper kind has a matching rule, so a buggy implementation + // that recurses into the wrapper's children would error. 
+ yeast::rule!( + (assignment left: (_) @left right: (_) @right) + => + (first_node + left: (second_node left: {left} right: {right}) + right: {left} + ) + ), + yeast::rule!((identifier) => (identifier "ID")), + yeast::rule!((integer) => (integer "INT")), + ]; + let phases = vec![Phase::new("translate", PhaseKind::OneShot, rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + + let input = "x = 1"; + let ast = runner.run(input).unwrap(); + let dump = dump_ast(&ast, ast.get_root(), input); + assert_dump_eq( + &dump, + r#" + program + stmt: + first_node + left: + second_node + left: identifier "ID" + right: integer "INT" + right: identifier "ID" + "#, + ); +} + // ---- Cursor tests ---- #[test] From 4b44753603bac1e40efb7f3bd95bce68f2e40781 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 11:37:59 +0200 Subject: [PATCH 04/12] Yeast one-shot: fix infinite loop when root is captured --- shared/yeast/src/lib.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index d45fe4980dfd..159287df119e 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -718,6 +718,13 @@ fn apply_one_shot_rules_inner( // we must translate captured input-schema nodes to their // output-schema equivalents first. captures.try_map_all_captures(|captured_id| { + // Avoid infinite recursion when a capture refers to the root + // node of the matched tree (e.g. an `@_` capture on the + // pattern root): re-analyzing it would match the same rule + // again indefinitely. + if captured_id == id { + return Ok(captured_id); + } let result = apply_one_shot_rules_inner(index, ast, captured_id, fresh, rewrite_depth + 1)?; if result.len() != 1 { From 4e5122e3cb1393b922f14ba985293caf1e5c41a9 Mon Sep 17 00:00:00 2001 From: Asger F Date: Thu, 7 May 2026 11:33:04 +0200 Subject: [PATCH 05/12] Add support for tree-sitter-style corpus tests This adds tests consisting of source code and a printout of its rewritten AST. 
---
 unified/AGENTS.md                             | 11 +-
 unified/extractor/src/extractor.rs            |  8 +-
 unified/extractor/src/languages/mod.rs        |  8 +
 unified/extractor/src/main.rs                 |  1 +
 .../extractor/tests/corpus/swift/desugar.txt  | 23 +++
 unified/extractor/tests/corpus_tests.rs       | 182 ++++++++++++++++++
 6 files changed, 224 insertions(+), 9 deletions(-)
 create mode 100644 unified/extractor/src/languages/mod.rs
 create mode 100644 unified/extractor/tests/corpus/swift/desugar.txt
 create mode 100644 unified/extractor/tests/corpus_tests.rs

diff --git a/unified/AGENTS.md b/unified/AGENTS.md
index 488a94f44bd4..132f61e269d3 100644
--- a/unified/AGENTS.md
+++ b/unified/AGENTS.md
@@ -20,10 +20,15 @@ grammar source), run `scripts/regenerate-grammar.sh` to:
 it shows the impact of a grammar tweak on the named node kinds, fields, and
 child types in a form much easier to read than the raw JSON.
 
-## Testing
-- If you changed the extractor code, always rebuild it before running tests.
+## Extractor Testing
+- To run extractor tests, run `cargo test` in the `extractor` directory.
 
-- To run all tests, run `codeql test run --search-path extractor-pack ql/test`
+- Do not edit the printed ASTs in `extractor/tests/corpus` directly. To regenerate the ASTs, run tests with the environment variable `YEAST_UPDATE_CORPUS=1`.
+
+## CodeQL Testing
+- If you changed the extractor code, always rebuild it before running CodeQL tests.
+
+- To run all CodeQL tests, run `codeql test run --search-path extractor-pack ql/test`
 
 - Do not edit `.expected` files manually. To update the expected output, pass
 `--learn` to the `codeql test run` command.
diff --git a/unified/extractor/src/extractor.rs b/unified/extractor/src/extractor.rs index eb6f06eb259b..ae3c1e78715b 100644 --- a/unified/extractor/src/extractor.rs +++ b/unified/extractor/src/extractor.rs @@ -3,9 +3,7 @@ use std::path::PathBuf; use codeql_extractor::extractor::simple; use codeql_extractor::trap; - -#[path = "languages/swift/swift.rs"] -mod swift; +use crate::languages; #[derive(Args)] pub struct Options { @@ -27,9 +25,7 @@ pub fn run(options: Options) -> std::io::Result<()> { let extractor = simple::Extractor { prefix: "unified".to_string(), - languages: vec![ - swift::language_spec(), - ], + languages: languages::all_language_specs(), trap_dir: options.output_dir, trap_compression: trap::Compression::from_env("CODEQL_EXTRACTOR_UNIFIED_OPTION_TRAP_COMPRESSION"), source_archive_dir: options.source_archive_dir, diff --git a/unified/extractor/src/languages/mod.rs b/unified/extractor/src/languages/mod.rs new file mode 100644 index 000000000000..4d5c945cb9b3 --- /dev/null +++ b/unified/extractor/src/languages/mod.rs @@ -0,0 +1,8 @@ +use codeql_extractor::extractor::simple; + +#[path = "swift/swift.rs"] +mod swift; + +pub fn all_language_specs() -> Vec { + vec![swift::language_spec()] +} diff --git a/unified/extractor/src/main.rs b/unified/extractor/src/main.rs index e6721d4e2243..5a3407c37a29 100644 --- a/unified/extractor/src/main.rs +++ b/unified/extractor/src/main.rs @@ -3,6 +3,7 @@ use clap::Parser; mod autobuilder; mod extractor; mod generator; +mod languages; #[derive(Parser)] #[command(author, version, about)] diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt new file mode 100644 index 000000000000..1ea0e260aad2 --- /dev/null +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -0,0 +1,23 @@ +=== +Additive expression is desugared +=== + +1 + 2 + +--- + +source_file + simple_identifier "blah" + + +=== +Another additive expression is desugared +=== + +foo + bar + +--- + 
+source_file + simple_identifier "blah" + diff --git a/unified/extractor/tests/corpus_tests.rs b/unified/extractor/tests/corpus_tests.rs new file mode 100644 index 000000000000..ea7bf7b11ca8 --- /dev/null +++ b/unified/extractor/tests/corpus_tests.rs @@ -0,0 +1,182 @@ +use std::fs; +use std::path::Path; + +use codeql_extractor::extractor::simple; +use yeast::{dump::dump_ast, Runner}; + +#[path = "../src/languages/mod.rs"] +mod languages; + +#[derive(Debug)] +struct CorpusCase { + name: String, + input: String, + expected: String, +} + +fn update_mode_enabled() -> bool { + std::env::var("YEAST_UPDATE_CORPUS") + .map(|v| matches!(v.to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "on")) + .unwrap_or(false) +} + +fn is_header_rule(line: &str) -> bool { + let trimmed = line.trim(); + trimmed.len() >= 3 && trimmed.chars().all(|c| c == '=') +} + +fn parse_corpus(content: &str) -> Vec { + let lines: Vec<&str> = content.lines().collect(); + let mut i = 0; + let mut cases = Vec::new(); + + while i < lines.len() { + while i < lines.len() && lines[i].trim().is_empty() { + i += 1; + } + if i >= lines.len() { + break; + } + + assert!( + is_header_rule(lines[i]), + "Expected header delimiter at line {}", + i + 1 + ); + i += 1; + + assert!(i < lines.len(), "Missing test name at line {}", i + 1); + let name = lines[i].trim().to_string(); + i += 1; + + assert!( + i < lines.len() && is_header_rule(lines[i]), + "Missing closing header delimiter for case {name}" + ); + i += 1; + + let input_start = i; + while i < lines.len() && lines[i].trim() != "---" { + i += 1; + } + assert!(i < lines.len(), "Missing --- separator for case {name}"); + let input = lines[input_start..i].join("\n").trim_end().to_string(); + i += 1; + + let expected_start = i; + while i < lines.len() { + if is_header_rule(lines[i]) + && i + 2 < lines.len() + && !lines[i + 1].trim().is_empty() + && is_header_rule(lines[i + 2]) + { + break; + } + i += 1; + } + let expected = 
lines[expected_start..i].join("\n").trim().to_string(); + + cases.push(CorpusCase { + name, + input, + expected, + }); + } + + cases +} + +fn render_corpus(cases: &[CorpusCase]) -> String { + let mut out = String::new(); + + for (idx, case) in cases.iter().enumerate() { + if idx > 0 { + out.push('\n'); + } + out.push_str("===\n"); + out.push_str(case.name.trim()); + out.push_str("\n===\n"); + out.push('\n'); + out.push_str(case.input.trim()); + out.push_str("\n\n---\n"); + out.push('\n'); + out.push_str(case.expected.trim()); + out.push_str("\n\n"); + } + + out +} + +fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> String { + let runner = match lang.desugar.as_ref() { + Some(config) => Runner::from_config(lang.ts_language.clone(), config) + .expect("Failed to create yeast runner from desugaring config"), + None => Runner::new(lang.ts_language.clone(), &[]), + }; + let ast = runner + .run(input) + .unwrap_or_else(|e| panic!("Failed to parse corpus input: {e}")); + dump_ast(&ast, ast.get_root(), input) +} + +#[test] +fn test_corpus() { + let update_mode = update_mode_enabled(); + let all_languages = languages::all_language_specs(); + let corpus_dir = Path::new("tests/corpus"); + + for lang in all_languages { + let lang_corpus_dir = corpus_dir.join(&lang.prefix); + if !lang_corpus_dir.exists() { + continue; + } + + let mut corpus_files: Vec<_> = fs::read_dir(&lang_corpus_dir) + .unwrap_or_else(|e| { + panic!( + "Failed to read corpus directory {}: {e}", + lang_corpus_dir.display() + ) + }) + .map(|entry| entry.expect("Failed to read corpus entry").path()) + .filter(|path| path.extension().is_some_and(|ext| ext == "txt")) + .collect(); + corpus_files.sort(); + + for corpus_path in corpus_files { + let content = fs::read_to_string(&corpus_path) + .unwrap_or_else(|e| panic!("Failed to read {}: {e}", corpus_path.display())); + let mut cases = parse_corpus(&content); + assert!( + !cases.is_empty(), + "No corpus cases found in {}", + corpus_path.display() + ); 
+ + for case in &mut cases { + let actual = run_desugaring(&lang, &case.input); + if update_mode { + case.expected = actual.trim().to_string(); + } else { + assert_eq!( + case.expected.trim(), + actual.trim(), + "Corpus case failed in {}: {}", + corpus_path.display(), + case.name + ); + } + } + + if update_mode { + let updated = render_corpus(&cases); + fs::write(&corpus_path, updated).unwrap_or_else(|e| { + panic!( + "Failed to update corpus file {}: {e}", + corpus_path.display() + ) + }); + } + } + } +} From ef34a2cce78b7dc91fa4bfc2daf4bb7021975838 Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 12 May 2026 16:15:55 +0200 Subject: [PATCH 06/12] Switch to one-shot The test output becomes useless because we can't map the root node to anything at the moment --- unified/extractor/src/languages/swift/swift.rs | 9 +++++++-- unified/extractor/tests/corpus/swift/desugar.txt | 6 ++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/unified/extractor/src/languages/swift/swift.rs b/unified/extractor/src/languages/swift/swift.rs index c3843a5979c5..34fdc82b499c 100644 --- a/unified/extractor/src/languages/swift/swift.rs +++ b/unified/extractor/src/languages/swift/swift.rs @@ -1,5 +1,5 @@ use codeql_extractor::extractor::simple; -use yeast::{rule, DesugaringConfig}; +use yeast::{rule, DesugaringConfig, PhaseKind}; fn desugaring_rules() -> Vec { vec![ @@ -8,11 +8,16 @@ fn desugaring_rules() -> Vec { => (simple_identifier "blah") ), + rule!( + _ + => + (simple_identifier "not supported") + ) ] } pub fn language_spec() -> simple::LanguageSpec { - let desugar = DesugaringConfig::new().add_phase("desugar", desugaring_rules()); + let desugar = DesugaringConfig::new().add_phase("desugar", PhaseKind::OneShot, desugaring_rules()); simple::LanguageSpec { prefix: "swift", ts_language: tree_sitter_swift::LANGUAGE.into(), diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt index 1ea0e260aad2..76e6afe1fd84 100644 
--- a/unified/extractor/tests/corpus/swift/desugar.txt +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -6,8 +6,7 @@ Additive expression is desugared --- -source_file - simple_identifier "blah" +simple_identifier "not supported" === @@ -18,6 +17,5 @@ foo + bar --- -source_file - simple_identifier "blah" +simple_identifier "not supported" From 16b1eb5cddfb3951d966017e17df4b80144ccbe3 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 13:07:03 +0200 Subject: [PATCH 07/12] Add 3-section corpus test --- .../extractor/tests/corpus/swift/desugar.txt | 17 ++++- unified/extractor/tests/corpus_tests.rs | 62 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt index 76e6afe1fd84..442985dd18dd 100644 --- a/unified/extractor/tests/corpus/swift/desugar.txt +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -6,6 +6,14 @@ Additive expression is desugared --- +source_file + additive_expression + lhs: integer_literal "1" + op: + + rhs: integer_literal "2" + +--- + simple_identifier "not supported" @@ -17,5 +25,12 @@ foo + bar --- -simple_identifier "not supported" +source_file + additive_expression + lhs: simple_identifier "foo" + op: + + rhs: simple_identifier "bar" +--- + +simple_identifier "not supported" diff --git a/unified/extractor/tests/corpus_tests.rs b/unified/extractor/tests/corpus_tests.rs index ea7bf7b11ca8..587800172b8b 100644 --- a/unified/extractor/tests/corpus_tests.rs +++ b/unified/extractor/tests/corpus_tests.rs @@ -11,6 +11,7 @@ mod languages; struct CorpusCase { name: String, input: String, + raw: String, expected: String, } @@ -63,6 +64,30 @@ fn parse_corpus(content: &str) -> Vec { let input = lines[input_start..i].join("\n").trim_end().to_string(); i += 1; + // Raw tree-sitter parse section. New-format files have a second + // `---` separator between the raw tree and the mapped AST. 
Legacy + // files (with only one separator) have no raw section — in that + // case `raw` stays empty and update mode will populate it. + let raw_start = i; + let mut next_sep = i; + while next_sep < lines.len() && lines[next_sep].trim() != "---" { + if is_header_rule(lines[next_sep]) + && next_sep + 2 < lines.len() + && !lines[next_sep + 1].trim().is_empty() + && is_header_rule(lines[next_sep + 2]) + { + break; + } + next_sep += 1; + } + let raw = if next_sep < lines.len() && lines[next_sep].trim() == "---" { + let raw_text = lines[raw_start..next_sep].join("\n").trim().to_string(); + i = next_sep + 1; + raw_text + } else { + String::new() + }; + let expected_start = i; while i < lines.len() { if is_header_rule(lines[i]) @@ -79,6 +104,7 @@ fn parse_corpus(content: &str) -> Vec { cases.push(CorpusCase { name, input, + raw, expected, }); } @@ -100,6 +126,9 @@ fn render_corpus(cases: &[CorpusCase]) -> String { out.push_str(case.input.trim()); out.push_str("\n\n---\n"); out.push('\n'); + out.push_str(case.raw.trim()); + out.push_str("\n\n---\n"); + out.push('\n'); out.push_str(case.expected.trim()); out.push_str("\n\n"); } @@ -119,6 +148,20 @@ fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> String { dump_ast(&ast, ast.get_root(), input) } +/// Produce the raw tree-sitter parse tree dump for `input`, with no +/// desugaring rules applied. Uses a `Runner` with an empty phase list and +/// the input grammar's own schema. 
+fn dump_raw_parse( + lang: &simple::LanguageSpec, + input: &str, +) -> Result { + let runner = Runner::new(lang.ts_language.clone(), &[]); + let ast = runner + .run(input) + .map_err(|e| format!("Failed to parse input: {e}"))?; + Ok(dump_ast(&ast, ast.get_root(), input)) +} + #[test] fn test_corpus() { let update_mode = update_mode_enabled(); @@ -154,6 +197,25 @@ fn test_corpus() { ); for case in &mut cases { + let actual_raw = dump_raw_parse(&lang, &case.input) + .unwrap_or_else(|e| panic!( + "Raw parse failed for {} in {}: {}", + case.name, + corpus_path.display(), + e + )); + if update_mode { + case.raw = actual_raw.trim().to_string(); + } else { + assert_eq!( + case.raw.trim(), + actual_raw.trim(), + "Raw parse mismatch in {}: {}", + corpus_path.display(), + case.name + ); + } + let actual = run_desugaring(&lang, &case.input); if update_mode { case.expected = actual.trim().to_string(); From 3d20fdec05fbe1572ecfc01ac68cc03ce516048e Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 12 May 2026 16:31:08 +0200 Subject: [PATCH 08/12] Wire up type checking to corpus tests --- unified/extractor/ast_types.yml | 21 +++ unified/extractor/src/languages/mod.rs | 5 +- .../extractor/src/languages/swift/swift.rs | 21 ++- .../extractor/tests/corpus/swift/desugar.txt | 7 +- unified/extractor/tests/corpus_tests.rs | 133 +++++++++++------- 5 files changed, 131 insertions(+), 56 deletions(-) create mode 100644 unified/extractor/ast_types.yml diff --git a/unified/extractor/ast_types.yml b/unified/extractor/ast_types.yml new file mode 100644 index 000000000000..832bd1a6c447 --- /dev/null +++ b/unified/extractor/ast_types.yml @@ -0,0 +1,21 @@ +supertypes: + expr: + - identifier + - unsupported_node + + stmt: + - unsupported_node + +named: + # Top of a translated source file. The body is a list of expressions or + # statements. + top_level: + body*: [expr, stmt] + + # An identifier in expression position. + identifier: + + # A node that we don't (yet) translate. 
+ unsupported_node: + + wrong_type: diff --git a/unified/extractor/src/languages/mod.rs b/unified/extractor/src/languages/mod.rs index 4d5c945cb9b3..20ad599edfb6 100644 --- a/unified/extractor/src/languages/mod.rs +++ b/unified/extractor/src/languages/mod.rs @@ -3,6 +3,9 @@ use codeql_extractor::extractor::simple; #[path = "swift/swift.rs"] mod swift; +/// Shared YEAST output AST schema for all languages. +pub(crate) const OUTPUT_AST_SCHEMA: &str = include_str!("../../ast_types.yml"); + pub fn all_language_specs() -> Vec { - vec![swift::language_spec()] + vec![swift::language_spec(OUTPUT_AST_SCHEMA)] } diff --git a/unified/extractor/src/languages/swift/swift.rs b/unified/extractor/src/languages/swift/swift.rs index 34fdc82b499c..c7e1b7954860 100644 --- a/unified/extractor/src/languages/swift/swift.rs +++ b/unified/extractor/src/languages/swift/swift.rs @@ -1,23 +1,34 @@ use codeql_extractor::extractor::simple; use yeast::{rule, DesugaringConfig, PhaseKind}; -fn desugaring_rules() -> Vec { +/// Output AST schema. The corpus tests load this via the language spec to +/// type-check the desugared AST. 
+pub const OUTPUT_NODE_TYPES_YAML: &str = include_str!("../../../ast_types.yml"); + +fn translation_rules() -> Vec { vec![ + rule!( + (source_file (_)* @body) + => + (top_level body: {..body}) + ), rule!( (additive_expression) => - (simple_identifier "blah") + (name_expr "test") ), rule!( _ => - (simple_identifier "not supported") + (unsupported_node) ) ] } -pub fn language_spec() -> simple::LanguageSpec { - let desugar = DesugaringConfig::new().add_phase("desugar", PhaseKind::OneShot, desugaring_rules()); +pub fn language_spec(desugared_ast_schema: &'static str) -> simple::LanguageSpec { + let desugar = DesugaringConfig::new() + .add_phase("translate", PhaseKind::OneShot, translation_rules()) + .with_output_node_types_yaml(desugared_ast_schema); simple::LanguageSpec { prefix: "swift", ts_language: tree_sitter_swift::LANGUAGE.into(), diff --git a/unified/extractor/tests/corpus/swift/desugar.txt b/unified/extractor/tests/corpus/swift/desugar.txt index 442985dd18dd..00ab759580e2 100644 --- a/unified/extractor/tests/corpus/swift/desugar.txt +++ b/unified/extractor/tests/corpus/swift/desugar.txt @@ -14,8 +14,8 @@ source_file --- -simple_identifier "not supported" - +top_level + body: name_expr "test" === Another additive expression is desugared @@ -33,4 +33,5 @@ source_file --- -simple_identifier "not supported" +top_level + body: name_expr "test" diff --git a/unified/extractor/tests/corpus_tests.rs b/unified/extractor/tests/corpus_tests.rs index 587800172b8b..abe03161be5b 100644 --- a/unified/extractor/tests/corpus_tests.rs +++ b/unified/extractor/tests/corpus_tests.rs @@ -2,7 +2,7 @@ use std::fs; use std::path::Path; use codeql_extractor::extractor::simple; -use yeast::{dump::dump_ast, Runner}; +use yeast::{dump::dump_ast, dump::dump_ast_with_type_errors, Runner}; #[path = "../src/languages/mod.rs"] mod languages; @@ -117,35 +117,38 @@ fn render_corpus(cases: &[CorpusCase]) -> String { for (idx, case) in cases.iter().enumerate() { if idx > 0 { + // Blank line between 
cases. out.push('\n'); } out.push_str("===\n"); out.push_str(case.name.trim()); - out.push_str("\n===\n"); - out.push('\n'); + out.push_str("\n===\n\n"); out.push_str(case.input.trim()); - out.push_str("\n\n---\n"); - out.push('\n'); + out.push_str("\n\n---\n\n"); out.push_str(case.raw.trim()); - out.push_str("\n\n---\n"); - out.push('\n'); + out.push_str("\n\n---\n\n"); out.push_str(case.expected.trim()); - out.push_str("\n\n"); + // Single trailing newline per case; the inter-case blank line is + // added by the prefix above, and the file ends with exactly one `\n`. + out.push('\n'); } out } -fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> String { +fn run_desugaring( + lang: &simple::LanguageSpec, + input: &str, +) -> Result { let runner = match lang.desugar.as_ref() { Some(config) => Runner::from_config(lang.ts_language.clone(), config) - .expect("Failed to create yeast runner from desugaring config"), + .map_err(|e| format!("Failed to create yeast runner: {e}"))?, None => Runner::new(lang.ts_language.clone(), &[]), }; - let ast = runner + + runner .run(input) - .unwrap_or_else(|e| panic!("Failed to parse corpus input: {e}")); - dump_ast(&ast, ast.get_root(), input) + .map_err(|e| format!("Failed to parse input: {e}")) } /// Produce the raw tree-sitter parse tree dump for `input`, with no @@ -169,6 +172,12 @@ fn test_corpus() { let corpus_dir = Path::new("tests/corpus"); for lang in all_languages { + let output_schema = yeast::node_types_yaml::schema_from_yaml_with_language( + languages::OUTPUT_AST_SCHEMA, + &lang.ts_language, + ) + .expect("Failed to parse OUTPUT_AST_SCHEMA YAML"); + let lang_corpus_dir = corpus_dir.join(&lang.prefix); if !lang_corpus_dir.exists() { continue; @@ -190,6 +199,7 @@ fn test_corpus() { let content = fs::read_to_string(&corpus_path) .unwrap_or_else(|e| panic!("Failed to read {}: {e}", corpus_path.display())); let mut cases = parse_corpus(&content); + let mut failures = Vec::new(); assert!( !cases.is_empty(), "No 
corpus cases found in {}", @@ -197,47 +207,76 @@ fn test_corpus() { ); for case in &mut cases { - let actual_raw = dump_raw_parse(&lang, &case.input) - .unwrap_or_else(|e| panic!( - "Raw parse failed for {} in {}: {}", - case.name, - corpus_path.display(), - e - )); - if update_mode { - case.raw = actual_raw.trim().to_string(); - } else { - assert_eq!( - case.raw.trim(), - actual_raw.trim(), - "Raw parse mismatch in {}: {}", - corpus_path.display(), - case.name - ); + match dump_raw_parse(&lang, &case.input) { + Err(e) => { + failures.push(format!( + "Raw parse failed for {} in {}: {}", + case.name, + corpus_path.display(), + e + )); + } + Ok(actual_raw) => { + if update_mode { + case.raw = actual_raw.trim().to_string(); + } else if case.raw.trim() != actual_raw.trim() { + failures.push(format!( + "Raw parse mismatch in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}", + corpus_path.display(), + case.name, + case.raw.trim(), + actual_raw.trim() + )); + } + } } - let actual = run_desugaring(&lang, &case.input); - if update_mode { - case.expected = actual.trim().to_string(); - } else { - assert_eq!( - case.expected.trim(), - actual.trim(), - "Corpus case failed in {}: {}", - corpus_path.display(), - case.name - ); + match run_desugaring(&lang, &case.input) { + Err(e) => { + failures.push(format!( + "Desugaring failed for {} in {}: {}", + case.name, + corpus_path.display(), + e + )); + } + Ok(actual) => { + let actual_dump = dump_ast_with_type_errors( + &actual, + actual.get_root(), + &case.input, + &output_schema, + ); + if update_mode { + case.expected = actual_dump.trim().to_string(); + } else if case.expected.trim() != actual_dump.trim() { + failures.push(format!( + "Test failed in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}", + corpus_path.display(), + case.name, + case.expected.trim(), + actual_dump.trim() + )); + } + } } } + assert!( + failures.is_empty(), + "{}", + failures.join("\n\n") + "\n\n" + ); + if update_mode { let updated = render_corpus(&cases); - 
fs::write(&corpus_path, updated).unwrap_or_else(|e| { - panic!( - "Failed to update corpus file {}: {e}", - corpus_path.display() - ) - }); + let write_result = fs::write(&corpus_path, updated); + assert!( + write_result.is_ok(), + "Failed to update corpus file {}: {}", + corpus_path.display(), + write_result.err().map_or_else(String::new, |e| e.to_string()) + ); } } } From a5d5364ab289e65a2cd18beb11817118571ce163 Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 12 May 2026 16:45:39 +0200 Subject: [PATCH 09/12] Add some more output types --- unified/extractor/ast_types.yml | 139 ++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 8 deletions(-) diff --git a/unified/extractor/ast_types.yml b/unified/extractor/ast_types.yml index 832bd1a6c447..22a5e8b19fb8 100644 --- a/unified/extractor/ast_types.yml +++ b/unified/extractor/ast_types.yml @@ -1,21 +1,144 @@ supertypes: expr: - - identifier + - name_expr + - int_literal + - string_literal + - binary_expr + - unary_expr + - call_expr + - member_access_expr + - lambda_expr - unsupported_node - stmt: + - empty_stmt + - block_stmt + - expr_stmt + - if_stmt + - variable_declaration_stmt + - guard_if_stmt + - unsupported_node + condition: + - expr_condition + - let_pattern_condition + - sequence_condition + - unsupported_node + pattern: + - var_pattern + - apply_pattern + - tuple_pattern + - ignore_pattern - unsupported_node - named: - # Top of a translated source file. The body is a list of expressions or - # statements. + # Top-level is the root node, currently containing a list of expressions top_level: body*: [expr, stmt] - # An identifier in expression position. 
+ # An identifier used in the context of an expression + name_expr: + identifier: identifier + + # An integer literal + int_literal: + + # A string literal + string_literal: + + # Application of a binary operator, such as `a + b` + binary_expr: + left: expr + operator: operator + right: expr + + # Application of a unary operator, such as `!x` + unary_expr: + operand: expr + operator: operator + + # A function or method call, such as `f(x)` or `obj.m(x)`. Method calls + # are represented as a call whose `function` is a `member_access_expr`. + call_expr: + function: expr + argument*: expr + + # Member access, such as `obj.member`. + member_access_expr: + target: expr + member: identifier + + lambda_expr: + parameter*: parameter + body: [expr, stmt] + + # A parameter + parameter: + pattern: pattern + + empty_stmt: + + block_stmt: + body*: stmt + + expr_stmt: + expr: expr + + if_stmt: + condition: condition + then?: stmt + else?: stmt + + variable_declaration_stmt: + variable_declarator+: variable_declarator + + # A variable declaration, or assignment to a pattern. + # The initializer is optional (but typically only possible in combination with a simple variable pattern). + variable_declarator: + pattern: pattern + value?: expr + + # Evaluate 'condition', and if false, execute 'else' which must break from the enclosing block scope (return, break, etc). + # Any variables bound by 'condition' will be in scope for the remainder of the enclosing block scope + # (which differs from how if_stmt works). + guard_if_stmt: + condition: condition + else: stmt + + # Evaluates the given condition and interprets it as a boolean (by language conventions) + expr_condition: + expr: expr + + # A series of statements that are executed before evaluating the trailing condition. + # Useful for languages where a conditional clause may be preceded by side-effecting + # syntactic elements (e.g. binding clauses) that don't themselves form a condition. 
+ sequence_condition: + stmt*: stmt + condition: condition + + # Evaluate 'expr' and match its result against 'pattern', and return true if it matches. + # Variables bound by the pattern will be in scope within the 'true' branch controlled by this condition. + let_pattern_condition: + pattern: pattern + value: expr + + # A pattern matching anything, binding its value to the given variable + var_pattern: + identifier: identifier + + # A pattern matching anything, binding no variables, usually using the syntax "_" + ignore_pattern: + + # A pattern such as `Some(x)` where `Some` is the constructor and `x` is an argument + apply_pattern: + constructor: expr + argument*: pattern + + # A tuple pattern such as `(a, b)` in `let (a, b) = pair`. + tuple_pattern: + element*: pattern + + # An simple unqualified identifier token identifier: - # A node that we don't (yet) translate. + # A node that we don't yet translate unsupported_node: - wrong_type: + operator: From d2eba5cc07fcfdc42f3f58ba2454e77680bb2fef Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 13:55:14 +0200 Subject: [PATCH 10/12] Fix something about synthesized nodes having the wrong text --- shared/yeast/src/lib.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 159287df119e..aa813e19a68a 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -339,7 +339,18 @@ impl Ast { let content = match &node.content { NodeContent::Range(range) => source[range.start_byte..range.end_byte].to_string(), NodeContent::String(s) => s.to_string(), - NodeContent::DynamicString(s) => s.clone(), + NodeContent::DynamicString(s) if !s.is_empty() => s.clone(), + // Synthesized nodes (from rule transforms) carry an empty + // `DynamicString`; resolve them against the inherited source + // range so `#{capture}` after a translation still yields the + // original source text. 
+ NodeContent::DynamicString(_) => match node.source_range { + Some(range) => source + .get(range.start_byte..range.end_byte) + .map(|s| s.to_string()) + .unwrap_or_default(), + None => String::new(), + }, }; if fields.is_empty() { value.insert(kind, json!(content)); From fae82a47380d3af6aaa2b1182f9e99f5a07d18f5 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 14:11:16 +0200 Subject: [PATCH 11/12] Change how patterns with repetition are parsed --- shared/yeast-macros/src/parse.rs | 39 ++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 70bd46d5b6f6..3926ad381d72 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -122,10 +122,41 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { expect_punct(tokens, ':', "expected `:` after field name")?; - let child = parse_query_node(tokens)?; - fields.push(quote! { - (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) - }); + // Parse the field's pattern. To support repetition like + // `field: (kind)* @cap`, parse the atom first, then check for + // a quantifier, and lastly handle a trailing `@capture`. + let atom = parse_query_atom(tokens)?; + if peek_is_repetition(tokens) { + let rep = expect_repetition(tokens)?; + let elem = quote! { + yeast::query::QueryListElem::Repeated { + children: vec![yeast::query::QueryListElem::SingleNode(#atom)], + rep: #rep, + } + }; + let elem = maybe_wrap_list_capture(tokens, elem)?; + fields.push(quote! { + (#field_str, vec![#elem]) + }); + } else { + let child = if peek_is_at(tokens) { + tokens.next(); + let capture_name = + expect_ident(tokens, "expected capture name after @")?; + let name_str = capture_name.to_string(); + quote! { + yeast::query::QueryNode::Capture { + capture: #name_str, + node: Box::new(#atom), + } + } + } else { + atom + }; + fields.push(quote! 
{ + (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) + }); + } } else { // Bare patterns — accumulate into the implicit `child` field. // We don't break here, so we can interleave with named fields. From b1098c54119c6d7b06c8019c470a662f5e53ffc6 Mon Sep 17 00:00:00 2001 From: Asger F Date: Mon, 11 May 2026 14:39:03 +0200 Subject: [PATCH 12/12] yeast-macros: merge repeated field declarations and support repetition in field patterns Two changes to parse_query_fields: - Allow `field: (kind)* @cap` (repetition + optional capture) in field position, mirroring how it works for bare children. - When the same field name is declared multiple times in a query (e.g. `condition: (foo) condition: (bar)`), merge them into a single ordered list of children rather than emitting duplicate field entries (which at runtime restart the iterator for the field and cause the second declaration to re-match from the first child). --- shared/yeast-macros/src/parse.rs | 36 +++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 3926ad381d72..4095600cf438 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -113,8 +113,24 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result { /// appear in any order; bare patterns are accumulated and emitted as a /// single `("child", ...)` entry. fn parse_query_fields(tokens: &mut Tokens) -> Result> { - let mut fields = Vec::new(); + // Accumulate per-field elems in declaration order; multiple uses of the + // same field name extend the same list (so e.g. `cond: (foo) cond: (bar)` + // matches a `cond` field whose first child is `foo` and second is `bar`). 
+ let mut field_order: Vec = Vec::new(); + let mut field_elems: std::collections::HashMap> = + std::collections::HashMap::new(); let mut bare_children: Vec = Vec::new(); + let mut push_field_elem = |order: &mut Vec, + map: &mut std::collections::HashMap>, + name: String, + elem: TokenStream| { + if !map.contains_key(&name) { + order.push(name.clone()); + map.insert(name, vec![elem]); + } else { + map.get_mut(&name).unwrap().push(elem); + } + }; while tokens.peek().is_some() { if peek_is_field(tokens) { let field_name = expect_ident(tokens, "expected field name")?; @@ -135,9 +151,7 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { } }; let elem = maybe_wrap_list_capture(tokens, elem)?; - fields.push(quote! { - (#field_str, vec![#elem]) - }); + push_field_elem(&mut field_order, &mut field_elems, field_str, elem); } else { let child = if peek_is_at(tokens) { tokens.next(); @@ -153,9 +167,10 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { } else { atom }; - fields.push(quote! { - (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) - }); + let elem = quote! { + yeast::query::QueryListElem::SingleNode(#child) + }; + push_field_elem(&mut field_order, &mut field_elems, field_str, elem); } } else { // Bare patterns — accumulate into the implicit `child` field. @@ -168,6 +183,13 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { bare_children.extend(elems); } } + let mut fields: Vec = Vec::new(); + for name in field_order { + let elems = field_elems.remove(&name).unwrap(); + fields.push(quote! { + (#name, vec![#(#elems),*]) + }); + } if !bare_children.is_empty() { fields.push(quote! { ("child", vec![#(#bare_children),*])