From d54e3214017ac06d3dbde782f77dc473c5b9211d Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 11 Mar 2026 23:58:27 +0100 Subject: [PATCH 1/7] Add design docs for variable interpolation parser and lookup suggestions Two proposals: 1. Replace regex-based ${...} parsing with a proper two-mode character scanner 2. Add "did you mean?" suggestions for invalid variable references Co-Authored-By: Claude Opus 4.6 --- design/interpolation-parser.md | 473 ++++++++++++++++++++ design/variable-lookup-suggestions.md | 605 ++++++++++++++++++++++++++ 2 files changed, 1078 insertions(+) create mode 100644 design/interpolation-parser.md create mode 100644 design/variable-lookup-suggestions.md diff --git a/design/interpolation-parser.md b/design/interpolation-parser.md new file mode 100644 index 0000000000..2495949adf --- /dev/null +++ b/design/interpolation-parser.md @@ -0,0 +1,473 @@ +# Proposal: Replace Regex-Based Variable Interpolation with a Proper Parser + +## Status: Draft + +## Background + +DABs variable interpolation (`${...}` syntax) is currently implemented via regex +matching in `libs/dyn/dynvar/ref.go`: + +```go +baseVarDef = `[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*` +re = regexp.MustCompile( + fmt.Sprintf(`\$\{(%s(\.%s(\[[0-9]+\])*)*(\[[0-9]+\])*)\}`, baseVarDef, baseVarDef), +) +``` + +This regex is used in `NewRef()` via `FindAllStringSubmatch` to extract all +`${...}` references from a string. The matched substrings are then resolved by +the pipeline in `resolve.go` (collect → resolve → replace). + +### Problems with the Regex Approach + +1. **No error reporting.** A non-match produces zero information — the user + gets no feedback about *why* `${foo.bar-}` or `${foo..bar}` is silently + ignored. Invalid references are indistinguishable from literal text. + +2. **No position information.** Errors cannot point to the character where + parsing fails. 
When resolution *does* fail, the error messages refer to the + matched string but not its location within the original value. + +3. **Hard to extend.** Adding new syntax (e.g., default values like + `${var.name:-default}`, or function calls like `${upper(var.name)}`) requires + modifying a regex that is already at the edge of readability. + +4. **No escape mechanism.** There is no way to produce a literal `${` in a + string. Users who need `${` in their output (e.g., shell scripts, ARM + templates) have no workaround. + +5. **Dual maintenance burden.** The regex must be kept in sync with a Python + regex in `python/databricks/bundles/core/_transform.py` — a fragile + arrangement with no automated enforcement. + +6. **Silent acceptance of ambiguous input.** The regex approach cannot + distinguish between "this string has no variable references" and "this string + has a malformed variable reference that should be reported." + +## Research: How Other Systems Parse `${...}` + +| System | Strategy | Escape | Nesting | Error Quality | +|--------|----------|--------|---------|---------------| +| Go `text/template` | State-function lexer | None | Paren depth | Line + template name | +| HCL2 (Terraform) | Ragel FSM + recursive descent | `$${` → literal `${` | Brace depth stack | Source range + suggestions | +| Python f-strings (PEP 701) | Mode-stack tokenizer | `{{` → `{` | Mode stack | Line/column | +| Rust `format!` | Iterator-based descent | `{{`/`}}` | N/A | Spans + suggestions | +| Bash | Char-by-char + depth tracking | `\$` | Full recursive | Line number | + +**Key insight from the research:** For a syntax as simple as `${path.to.var[0]}` +(no nested expressions, no function calls, no operators), a full recursive +descent parser is overkill. The right tool is a **two-mode character scanner** — +the same pattern used by Go's `text/template` lexer and HCL's scanner at their +core. 
This gives us proper error reporting, escape support, and extensibility +without the complexity of a full parser generator. + +## Proposed Design + +### Architecture: Two-Phase Scanner + +Replace the regex with a small, explicit scanner that operates in two modes: + +``` +Mode 1: TEXT — accumulate literal characters +Mode 2: REFERENCE — accumulate variable path characters inside ${...} +``` + +The scanner produces a flat list of tokens. No AST, no recursive descent — just +a linear scan that cleanly separates literal text from variable references. + +### Token Types + +```go +// TokenKind represents the type of a parsed token. +type TokenKind int + +const ( + TokenLiteral TokenKind = iota // Literal text (no interpolation) + TokenRef // Variable reference: content between ${ and } +) +``` + +### Core Data Structure + +```go +// Token represents a parsed segment of an interpolation string. +type Token struct { + Kind TokenKind + Value string // For Literal: the text. For Ref: the variable path (e.g., "a.b[0].c"). + Start int // Byte offset of the start of this token in the original string. + End int // Byte offset past the end of this token. +} +``` + +### Scanner Implementation + +```go +// Parse parses a string that may contain ${...} variable references. +// It returns a slice of tokens representing literal text and variable references. 
+
+// Escape sequences:
+//   - "$$" produces a literal "$"
+//
+// Examples:
+//   - "hello"           → [Literal("hello")]
+//   - "${a.b}"          → [Ref("a.b")]
+//   - "pre ${a.b} post" → [Literal("pre "), Ref("a.b"), Literal(" post")]
+//   - "$${a.b}"         → [Literal("${a.b}")]
+//   - "${a.b} ${c[0]}"  → [Ref("a.b"), Literal(" "), Ref("c[0]")]
+func Parse(s string) ([]Token, error) {
+	var tokens []Token
+	i := 0
+	buf := strings.Builder{} // accumulates literal text
+
+	flushLiteral := func(end int) {
+		if buf.Len() > 0 {
+			tokens = append(tokens, Token{
+				Kind:  TokenLiteral,
+				Value: buf.String(),
+				// NOTE(review): off by one byte per collapsed "$$" escape —
+				// the implementation should track the literal's start offset
+				// explicitly instead of deriving it from buf.Len().
+				Start: end - buf.Len(),
+				End:   end,
+			})
+			buf.Reset()
+		}
+	}
+
+	for i < len(s) {
+		if s[i] != '$' {
+			buf.WriteByte(s[i])
+			i++
+			continue
+		}
+
+		// We see '$'. Look ahead.
+		if i+1 >= len(s) {
+			// Trailing '$' — treat as literal.
+			buf.WriteByte('$')
+			i++
+			continue
+		}
+
+		switch s[i+1] {
+		case '$':
+			// Escape: "$$" → literal "$".
+			buf.WriteByte('$')
+			i += 2
+
+		case '{':
+			// Start of variable reference.
+			flushLiteral(i)
+			refStart := i
+			i += 2 // skip "${"
+
+			// Scan the variable path until we find '}'.
+			pathStart := i
+			for i < len(s) && s[i] != '}' {
+				i++
+			}
+
+			if i >= len(s) {
+				return nil, fmt.Errorf(
+					"unterminated variable reference at position %d",
+					refStart,
+				)
+			}
+
+			path := s[pathStart:i]
+			i++ // skip '}'
+
+			if path == "" {
+				return nil, fmt.Errorf(
+					"empty variable reference at position %d",
+					refStart,
+				)
+			}
+
+			// Validate the path content.
+			if err := validatePath(path, refStart); err != nil {
+				return nil, err
+			}
+
+			tokens = append(tokens, Token{
+				Kind:  TokenRef,
+				Value: path,
+				Start: refStart,
+				End:   i,
+			})
+
+		default:
+			// '$' not followed by '$' or '{' — treat as literal.
+			buf.WriteByte('$')
+			i++
+		}
+	}
+
+	flushLiteral(i)
+	return tokens, nil
+}
+```
+
+### Path Validation
+
+Rather than encoding path rules in the regex, validate path contents explicitly
+after extraction. 
This function reuses the existing `dyn.NewPathFromString` but +adds character-level error reporting: + +```go +// validatePath checks that a variable path is well-formed. +// It wraps dyn.NewPathFromString with position-aware error messages. +func validatePath(path string, refStart int) error { + _, err := dyn.NewPathFromString(path) + if err != nil { + return fmt.Errorf( + "invalid variable reference ${%s} at position %d: %w", + path, refStart, err, + ) + } + return nil +} +``` + +We should also add validation for the character set used in path segments. The +current regex implicitly enforces `[a-zA-Z]` start and `[a-zA-Z0-9_-]` +continuation. This should move to an explicit check inside path validation: + +```go +func validatePathSegment(seg string) error { + if len(seg) == 0 { + return fmt.Errorf("empty path segment") + } + if seg[0] < 'A' || (seg[0] > 'Z' && seg[0] < 'a') || seg[0] > 'z' { + return fmt.Errorf("path segment must start with a letter, got %q", seg[0]) + } + // ... check continuation characters ... +} +``` + +### Updated Ref Type + +The `Ref` struct changes from storing raw regex match groups to storing parsed +tokens: + +```go +type Ref struct { + Value dyn.Value // Original dyn.Value. + Str string // Original string content. + Tokens []Token // Parsed tokens (literals and references). +} + +func NewRef(v dyn.Value) (Ref, bool) { + s, ok := v.AsString() + if !ok { + return Ref{}, false + } + + tokens, err := Parse(s) + if err != nil { + // Return error through a new error-aware API (see Migration section). + return Ref{}, false + } + + // Check if any token is a reference. 
+ hasRef := false + for _, t := range tokens { + if t.Kind == TokenRef { + hasRef = true + break + } + } + if !hasRef { + return Ref{}, false + } + + return Ref{Value: v, Str: s, Tokens: tokens}, true +} +``` + +### Updated Resolution Logic + +The string interpolation in `resolveRef` simplifies from a regex-replacement +loop to a token-concatenation loop: + +```go +func (r *resolver) resolveRef(ref Ref, seen []string) (dyn.Value, error) { + deps := ref.References() + resolved := make([]dyn.Value, len(deps)) + complete := true + + // ... resolve deps (unchanged) ... + + // Pure substitution (single ref, no literals). + if ref.IsPure() && complete { + return dyn.NewValue(resolved[0].Value(), ref.Value.Locations()), nil + } + + // String interpolation: concatenate tokens. + var buf strings.Builder + refIdx := 0 + for _, tok := range ref.Tokens { + switch tok.Kind { + case TokenLiteral: + buf.WriteString(tok.Value) + case TokenRef: + if !resolved[refIdx].IsValid() { + // Skipped — write original ${...} back. + buf.WriteString("${") + buf.WriteString(tok.Value) + buf.WriteByte('}') + } else { + s, err := valueToString(resolved[refIdx]) + if err != nil { + return dyn.InvalidValue, err + } + buf.WriteString(s) + } + refIdx++ + } + } + + return dyn.NewValue(buf.String(), ref.Value.Locations()), nil +} +``` + +This is cleaner than the current approach which uses `strings.Replace` with a +count of 1 — a trick needed to avoid double-replacing when the same variable +appears multiple times. + +### Helper Methods on Ref + +```go +// IsPure returns true if the string is a single variable reference with no +// surrounding text (e.g., "${a.b}" but not "x ${a.b}" or "${a} ${b}"). +func (v Ref) IsPure() bool { + return len(v.Tokens) == 1 && v.Tokens[0].Kind == TokenRef +} + +// References returns the variable paths referenced in this string. 
+func (v Ref) References() []string { + var out []string + for _, t := range v.Tokens { + if t.Kind == TokenRef { + out = append(out, t.Value) + } + } + return out +} +``` + +### Escape Sequence: `$$` + +Following HCL2's precedent, `$$` before `{` produces a literal `$`. This is the +most natural escape for users already familiar with Terraform/HCL: + +| Input | Output | +|-------|--------| +| `${a.b}` | *(resolved value of a.b)* | +| `$${a.b}` | `${a.b}` (literal) | +| `$$notbrace` | `$notbrace` (literal) | +| `$notbrace` | `$notbrace` (literal) | + +This is backward compatible: `$$` is not a valid prefix today (the regex +requires `${`), so no existing config uses `$$` in a way that would change +meaning. + +## File Changes + +| File | Change | +|------|--------| +| `libs/dyn/dynvar/ref.go` | Replace regex + `Matches` with `Parse()` + `[]Token` | +| `libs/dyn/dynvar/ref_test.go` | Update tests: add parser tests, keep behavioral tests | +| `libs/dyn/dynvar/resolve.go` | Update `resolveRef` to use token concatenation | +| `libs/dyn/dynvar/resolve_test.go` | Add tests for escape sequences, error messages | +| `libs/dyn/dynvar/parse.go` | **New file**: scanner + token types | +| `libs/dyn/dynvar/parse_test.go` | **New file**: scanner unit tests | +| `python/databricks/bundles/core/_transform.py` | Update Python side to match (separate PR) | + +## Migration Strategy + +### Phase 1: Add Parser, Keep Regex + +1. Implement `Parse()` in a new `parse.go` file with full test coverage. +2. Add a `NewRefWithDiagnostics(v dyn.Value) (Ref, diag.Diagnostics)` that + uses the parser and can return warnings for malformed references. +3. Keep the existing `NewRef` as-is, calling the parser internally but falling + back to the regex for any parse errors (belt-and-suspenders). +4. Add logging when the parser and regex disagree, to catch discrepancies. + +### Phase 2: Switch Over + +1. Remove the regex fallback — `NewRef` uses only the parser. +2. 
Update `Ref` to store `[]Token` instead of `[][]string`. +3. Update `resolveRef` to use token concatenation. +4. Remove the `re` variable. + +### Phase 3: Add Escape Support + +1. Enable `$$` escape handling in the parser. +2. Document the escape sequence. +3. Update the Python implementation. + +## Compatibility + +- **Forward compatible:** All strings that currently contain valid `${...}` + references will parse identically. The parser accepts a strict superset of + the regex (it can also report errors for malformed references). + +- **Backward compatible escape:** `$$` before `{` is a new feature, not a + breaking change. No existing valid config contains `$${` (the regex would not + match it, and a literal `$${` in YAML has no special meaning today). + +- **Error reporting is additive:** Strings that silently failed to match the + regex will now produce actionable error messages. This is a UX improvement, + not a breaking change, though it could surface new warnings for configs that + previously "worked" by accident (e.g., a typo like `${foo.bar-}` was silently + treated as literal text). + +## Testing Plan + +1. **Parser unit tests** (`parse_test.go`): + - Valid references: single, multiple, with indices, with hyphens/underscores + - Escape sequences: `$$`, `$` at end of string, `$` before non-`{` + - Error cases: unterminated `${`, empty `${}`, invalid characters in path + - Position tracking: verify `Start`/`End` offsets are correct + +2. **Ref behavioral tests** (`ref_test.go`): + - All existing tests pass unchanged + - New tests for `IsPure()` and `References()` using token-based `Ref` + +3. **Resolution tests** (`resolve_test.go`): + - All existing tests pass unchanged + - New tests for escape sequences in interpolation + - New tests verifying improved error messages + +4. **Acceptance tests**: + - Add acceptance test with `$$` escape in `databricks.yml` + - Verify existing acceptance tests pass without output changes + +## Why Not a More Powerful Parser? 
+ +A recursive descent parser or parser combinator would allow richer syntax (nested +expressions, function calls, filters). We deliberately avoid this because: + +1. **YAGNI.** The current `${path.to.var[0]}` syntax covers all use cases. There + are no open feature requests for computed expressions inside `${...}`. + +2. **Two implementations.** Any syntax change must be mirrored in the Python + implementation. A simple scanner is easy to port; a recursive descent parser + is not. + +3. **Terraform alignment.** DABs variable references are conceptually similar to + HCL variable references. Keeping the syntax simple avoids user confusion + about what expressions are supported. + +If we ever need richer expressions, the token-based architecture makes it easy to +add a parser layer on top of the scanner without changing the `Ref`/`Token` types +or the resolution pipeline. + +## Summary + +Replace the regex in `dynvar/ref.go` with a ~80-line character scanner that: +- Produces the same results for all valid inputs +- Reports actionable errors for invalid inputs (with byte positions) +- Supports `$$` escape for literal `${` output +- Is straightforward to read, test, and extend +- Simplifies the interpolation logic in `resolve.go` from regex-replacement to + token concatenation diff --git a/design/variable-lookup-suggestions.md b/design/variable-lookup-suggestions.md new file mode 100644 index 0000000000..8e8fa221a4 --- /dev/null +++ b/design/variable-lookup-suggestions.md @@ -0,0 +1,605 @@ +# Proposal: "Did You Mean?" Suggestions for Invalid Variable References + +## Status: Draft + +## Problem + +When a user writes a variable reference like `${bundle.git.origin_urlx}` (typo) +or `${var.my_clster_id}` (misspelling), the error message today is: + +``` +reference does not exist: ${bundle.git.origin_urlx} +``` + +That's it. No suggestions, no list of valid keys, no indication of what went +wrong. 
The user has to go back to the docs or mentally diff against the schema +to figure out the correct name. + +This is a common source of frustration: a single character typo in a long path +like `${resources.jobs.my_pipeline.tasks[0].task_key}` can take minutes to spot. + +Variable references are multi-level paths (e.g., `${resources.jobs.my_job.id}`). +A typo can occur in **any** component — or even in **multiple** components at +once. A good suggestion system must handle all of these cases. + +## What We Have Today + +### The Error Path + +When resolution fails, the call chain is: + +``` +resolve_variable_references.go: dynvar.Resolve(v, lookupFn) + resolve.go: r.resolveKey(dep, seen) + resolve.go: r.fn(p) // calls the lookup function + resolve_variable_references.go: m.lookupFn(normalized, path, b) + resolve_variable_references.go: dyn.GetByPath(v, path) + visit.go: m.GetByString(c.key) // FAILS HERE + return noSuchKeyError{path} +``` + +At the point of failure in `visit.go:135-137`: +```go +m := v.MustMap() +ev, ok := m.GetByString(c.key) +if !ok { + return InvalidValue, noSuchKeyError{path} +} +``` + +The map `m` contains **all sibling keys** — i.e., the valid alternatives. But +that information is discarded. The error only carries the failed `path`. + +Back in `resolve.go:200-201`, the error is rewrapped into a flat string: +```go +if dyn.IsNoSuchKeyError(err) { + err = fmt.Errorf("reference does not exist: ${%s}", key) +} +``` + +**Crucially**, `visit.go` stops at the **first** non-existent key. For a path +like `${resources.jbs.my_jb.id}` with typos in both `jbs` and `my_jb`: +1. `resources` — exists, traverse into it +2. `jbs` — **does not exist** → `NoSuchKeyError` immediately + +The traversal never reaches `my_jb`, so we can only suggest fixing `jbs`. The +user has to fix that, re-run, and discover the second typo. This round-trip +is exactly the frustration we want to avoid. 
+ +### The Normalized Config Tree + +In `resolve_variable_references.go:203`, before resolution begins: +```go +normalized, _ := convert.Normalize(b.Config, root, convert.IncludeMissingFields) +``` + +This `normalized` tree includes: +- All struct-defined fields (from Go types), even if unset (via `IncludeMissingFields`) +- All user-defined map keys (resource names, variable names, etc.) + +This is the right source of truth for suggestions. It contains everything the +user could validly reference. + +## Design + +### Approach: Fuzzy Path Walk Against the Config Tree + +Rather than relying on the error from `visit.go` (which only tells us about the +first failing component), we do a **separate fuzzy walk** of the config tree when +a lookup fails. This walk processes every component in the reference path and +can fix typos in **multiple** components simultaneously. + +The flow: +1. Lookup fails with `NoSuchKeyError` +2. We walk the reference path component by component against the normalized tree +3. At each component, if the exact key exists, we follow it +4. If not, we fuzzy-match against sibling keys and follow the best match +5. If all components are resolved (some via fuzzy matching), we suggest the + corrected full path +6. If any component can't be fuzzy-matched (too far from all candidates), we + give up on the suggestion + +### Implementation + +#### 1. Levenshtein Distance Utility + +```go +// File: libs/dyn/dynvar/suggest.go + +package dynvar + +// levenshtein computes the edit distance between two strings. +func levenshtein(a, b string) int { + if len(a) == 0 { + return len(b) + } + if len(b) == 0 { + return len(a) + } + + // Use single-row DP to save memory. 
+ prev := make([]int, len(b)+1) + for j := range prev { + prev[j] = j + } + + for i := range len(a) { + curr := make([]int, len(b)+1) + curr[0] = i + 1 + for j := range len(b) { + cost := 1 + if a[i] == b[j] { + cost = 0 + } + curr[j+1] = min( + curr[j]+1, // insertion + prev[j+1]+1, // deletion + prev[j]+cost, // substitution + ) + } + prev = curr + } + + return prev[len(b)] +} +``` + +#### 2. Single-Key Match Function + +```go +// closestKeyMatch finds the closest matching key from a list of candidates. +// Returns the best match and its edit distance. +// Returns ("", -1) if no candidate is within the distance threshold. +func closestKeyMatch(key string, candidates []string) (string, int) { + // Threshold: allow up to 3 edits, but no more than half the key length. + // This avoids suggesting wildly different strings for short keys. + maxDist := min(3, max(1, len(key)/2)) + + bestMatch := "" + bestDist := maxDist + 1 + + for _, c := range candidates { + d := levenshtein(key, c) + if d < bestDist { + bestDist = d + bestMatch = c + } + } + + if bestMatch == "" { + return "", -1 + } + return bestMatch, bestDist +} +``` + +#### 3. Fuzzy Path Walk + +This is the core new function. It walks the reference path against the config +tree, fuzzy-matching at each level: + +```go +// suggestPath walks the reference path against root and tries to find the +// closest valid path. At each component, it first tries an exact match; if +// that fails, it fuzzy-matches against available keys. +// +// Returns the suggested path as a string, or "" if no reasonable suggestion +// can be made. +func suggestPath(root dyn.Value, refPath dyn.Path) string { + current := root + suggested := make(dyn.Path, 0, len(refPath)) + + for _, component := range refPath { + if component.IsIndex() { + // For index components (e.g., [0]), we can't fuzzy-match. + // Just check if the index is valid and pass through. 
+ s, ok := current.AsSequence() + if !ok || component.Index() >= len(s) { + return "" + } + suggested = append(suggested, component) + current = s[component.Index()] + continue + } + + key := component.Key() + m, ok := current.AsMap() + if !ok { + // Expected a map but got something else — can't suggest. + return "" + } + + // Try exact match first. + if v, exists := m.GetByString(key); exists { + suggested = append(suggested, component) + current = v + continue + } + + // Exact match failed — try fuzzy match. + candidates := m.StringKeys() + match, _ := closestKeyMatch(key, candidates) + if match == "" { + // No close match — can't suggest beyond this point. + return "" + } + + suggested = append(suggested, dyn.Key(match)) + v, _ := m.GetByString(match) + current = v + } + + return suggested.String() +} +``` + +**Key properties:** +- Handles typos at **any** level: first, middle, last, or multiple levels +- Index components (`[0]`) are passed through verbatim — no fuzzy matching +- Stops suggesting as soon as any component can't be matched (no partial guesses) +- Each component is matched independently, so two typos in different components + are both corrected + +#### 4. Wire It Into Resolution + +The suggestion logic needs access to the normalized config tree that the lookup +function uses. Today, the `Lookup` function type is: + +```go +type Lookup func(path dyn.Path) (dyn.Value, error) +``` + +The `resolve.go` resolver doesn't have direct access to the underlying tree — +it only has the lookup function. We add the suggestion logic at the layer above, +in `resolve_variable_references.go`, which has access to the `normalized` tree. + +**Option A: Pass a suggest function into the resolver** + +Add an optional suggest callback to the resolver: + +```go +// SuggestFn takes a failed reference path and returns a suggested correction, +// or "" if no suggestion can be made. 
+type SuggestFn func(path dyn.Path) string
+
+func Resolve(in dyn.Value, fn Lookup, opts ...ResolveOption) (out dyn.Value, err error) {
+	r := resolver{in: in, fn: fn}
+	for _, opt := range opts {
+		opt(&r)
+	}
+	return r.run()
+}
+
+type ResolveOption func(*resolver)
+
+func WithSuggestFn(fn SuggestFn) ResolveOption {
+	return func(r *resolver) {
+		r.suggestFn = fn
+	}
+}
+```
+
+Then in `resolveKey`:
+```go
+v, err := r.fn(p)
+if err != nil {
+	if dyn.IsNoSuchKeyError(err) {
+		msg := fmt.Sprintf("reference does not exist: ${%s}", key)
+
+		if r.suggestFn != nil {
+			if suggestion := r.suggestFn(p); suggestion != "" {
+				msg += fmt.Sprintf("; did you mean ${%s}?", suggestion)
+			}
+		}
+
+		err = errors.New(msg)
+	}
+
+	r.lookups[key] = lookupResult{v: dyn.InvalidValue, err: err}
+	return dyn.InvalidValue, err
+}
+```
+
+And in `resolve_variable_references.go`, pass the suggest function:
+
+```go
+return dynvar.Resolve(v, lookupFn, dynvar.WithSuggestFn(func(p dyn.Path) string {
+	return dynvar.SuggestPath(normalized, p)
+}))
+```
+
+**Option B: Suggest at the `resolve_variable_references.go` level**
+
+Instead of modifying `Resolve`'s signature, wrap the error after `Resolve`
+returns. This is simpler but less clean:
+
+```go
+out, err := dynvar.Resolve(v, lookupFn)
+if err != nil && dyn.IsNoSuchKeyError(err) {
+	// Extract the failed path and suggest...
+}
+```
+
+The problem with Option B is that by the time `Resolve` returns, the original
+`dyn.Path` is lost — it's been formatted into the error string. We'd have to
+re-parse it or change the error type. 
**Option A is cleaner.**
+
+### Example Error Messages
+
+| Reference | Typos | Error After |
+|-----------|-------|-------------|
+| `${bundle.git.origin_urlx}` | 1 (leaf) | `did you mean ${bundle.git.origin_url}?` |
+| `${resources.jbs.my_job.id}` | 1 (middle) | `did you mean ${resources.jobs.my_job.id}?` |
+| `${resources.jbs.my_jb.id}` | 2 (middle + middle) | `did you mean ${resources.jobs.my_job.id}?` |
+| `${bundel.git.origin_urlx}` | 2 (root + leaf) | `did you mean ${bundle.git.origin_url}?` |
+| `${workspace.root_paht}` | 1 (leaf) | `did you mean ${workspace.root_path}?` |
+| `${var.my_clster_id}` | 1 (leaf) | `did you mean ${var.my_cluster_id}?` |
+| `${completely.wrong.path}` | all | *(no suggestion — too far at first component)* |
+| `${resources.jobs.my_jb.idd}` | 2 (deep) | `did you mean ${resources.jobs.my_job.id}?` |
+
+### Walk-Through: Multi-Level Typo
+
+For `${resources.jbs.my_jb.id}`, the fuzzy walk proceeds:
+
+```
+Component   Tree at this level         Exact?  Fuzzy match
+─────────   ──────────────────         ──────  ───────────
+resources   {bundle, resources, ...}   yes     —
+jbs         {jobs, pipelines, ...}     no      "jobs" (dist=1)
+my_jb       {my_job, other_job, ...}   no      "my_job" (dist=1)
+id          {id, name, ...}            yes     —
+
+Suggested path: resources.jobs.my_job.id
+```
+
+All four components resolved, so we suggest `${resources.jobs.my_job.id}`.
+
+### Walk-Through: Unfixable Path
+
+For `${zzz.yyy.xxx}`:
+
+```
+Component   Tree at this level         Exact?  Fuzzy match
+─────────   ──────────────────         ──────  ───────────
+zzz         {bundle, resources, ...}   no      none (all dist>3)
+
+Suggested path: "" (give up)
+```
+
+No suggestion produced.
+
+
+ +## Scope + +### What This Covers + +- Typos in struct field names: `${bundle.git.origin_urlx}` (keys from Go types) +- Typos in user-defined names: `${var.my_clster_id}` (keys from user config) +- Typos in resource type names: `${resources.jbs.my_job.id}` +- Typos in resource instance names: `${resources.jobs.my_jb.id}` +- **Multi-level typos**: `${resources.jbs.my_jb.id}` (typos at two levels) + +### What This Does NOT Cover + +- **Invalid path structure** (e.g., `${a..b}` or `${a[x]}`) — this is a parse + error, not a lookup error, and would be handled by the parser proposal. +- **References to the wrong section** (e.g., user writes `${bundle.cluster_id}` + when they mean `${var.cluster_id}`) — the prefix is valid so we'd only + suggest keys within `bundle.*`. Cross-section suggestions would require + searching the entire tree, which is a separate feature. +- **Array index out of bounds** (e.g., `${resources.jobs.foo.tasks[99]}`) — this + is an `indexOutOfBoundsError`, not a `noSuchKeyError`. No suggestions apply. + +## `var` Shorthand + +The `${var.foo}` shorthand is rewritten to `${variables.foo.value}` before +lookup (in `resolve_variable_references.go:209-222`). The suggestion function +receives the **rewritten** path. If we suggest a corrected path, we should +convert it back to the shorthand form for the user-facing message. + +For example: +- User writes: `${var.my_clster_id}` +- Rewritten to: `${variables.my_clster_id.value}` +- Suggestion from fuzzy walk: `variables.my_cluster_id.value` +- User-facing message: `did you mean ${var.my_cluster_id}?` + +This reverse mapping is straightforward: if the suggested path starts with +`variables.` and ends with `.value`, strip those and prefix with `var.`. 
+ +## File Changes + +| File | Change | +|------|--------| +| `libs/dyn/mapping.go` | Add `StringKeys()` helper | +| `libs/dyn/dynvar/suggest.go` | **New**: `levenshtein()`, `closestKeyMatch()`, `SuggestPath()` | +| `libs/dyn/dynvar/suggest_test.go` | **New**: tests for distance, matching, and path suggestion | +| `libs/dyn/dynvar/resolve.go` | Add `SuggestFn` field, use it in `resolveKey` | +| `libs/dyn/dynvar/resolve_test.go` | Add tests for suggestion error messages | +| `bundle/config/mutator/resolve_variable_references.go` | Pass `WithSuggestFn` to `Resolve` | + +Note: no changes to `libs/dyn/visit.go` — the suggestion logic is entirely +separate from the traversal error path. + +## Testing + +### Unit Tests for Levenshtein + Suggestions + +```go +func TestLevenshtein(t *testing.T) { + assert.Equal(t, 0, levenshtein("abc", "abc")) + assert.Equal(t, 1, levenshtein("abc", "ab")) // deletion + assert.Equal(t, 1, levenshtein("abc", "abcd")) // insertion + assert.Equal(t, 1, levenshtein("abc", "adc")) // substitution + assert.Equal(t, 3, levenshtein("abc", "xyz")) // all different + assert.Equal(t, 3, levenshtein("", "abc")) // empty vs non-empty +} + +func TestClosestKeyMatch(t *testing.T) { + candidates := []string{"origin_url", "branch", "commit"} + + match, dist := closestKeyMatch("origin_urlx", candidates) + assert.Equal(t, "origin_url", match) + assert.Equal(t, 1, dist) + + match, _ = closestKeyMatch("zzzzzzz", candidates) + assert.Equal(t, "", match) +} +``` + +### Fuzzy Path Walk Tests + +```go +func TestSuggestPathSingleTypo(t *testing.T) { + root := dyn.V(map[string]dyn.Value{ + "bundle": dyn.V(map[string]dyn.Value{ + "git": dyn.V(map[string]dyn.Value{ + "origin_url": dyn.V(""), + "branch": dyn.V(""), + }), + }), + }) + + p := dyn.MustPathFromString("bundle.git.origin_urlx") + assert.Equal(t, "bundle.git.origin_url", SuggestPath(root, p)) +} + +func TestSuggestPathMultiLevelTypo(t *testing.T) { + root := dyn.V(map[string]dyn.Value{ + "resources": 
dyn.V(map[string]dyn.Value{ + "jobs": dyn.V(map[string]dyn.Value{ + "my_job": dyn.V(map[string]dyn.Value{ + "id": dyn.V(""), + }), + }), + }), + }) + + p := dyn.MustPathFromString("resources.jbs.my_jb.id") + assert.Equal(t, "resources.jobs.my_job.id", SuggestPath(root, p)) +} + +func TestSuggestPathNoMatch(t *testing.T) { + root := dyn.V(map[string]dyn.Value{ + "bundle": dyn.V(map[string]dyn.Value{ + "name": dyn.V(""), + }), + }) + + p := dyn.MustPathFromString("zzzzz.yyyyy") + assert.Equal(t, "", SuggestPath(root, p)) +} + +func TestSuggestPathWithIndex(t *testing.T) { + root := dyn.V(map[string]dyn.Value{ + "resources": dyn.V(map[string]dyn.Value{ + "jobs": dyn.V(map[string]dyn.Value{ + "my_job": dyn.V(map[string]dyn.Value{ + "tasks": dyn.V([]dyn.Value{ + dyn.V(map[string]dyn.Value{ + "task_key": dyn.V(""), + }), + }), + }), + }), + }), + }) + + p := dyn.MustPathFromString("resources.jobs.my_job.tasks[0].tsk_key") + assert.Equal(t, "resources.jobs.my_job.tasks[0].task_key", SuggestPath(root, p)) +} +``` + +### Integration-Level Tests + +```go +func TestResolveNotFoundWithSuggestion(t *testing.T) { + in := dyn.V(map[string]dyn.Value{ + "bundle": dyn.V(map[string]dyn.Value{ + "name": dyn.V("my-bundle"), + "target": dyn.V("dev"), + }), + "ref": dyn.V("${bundle.nme}"), + }) + + _, err := dynvar.Resolve(in, dynvar.DefaultLookup(in), + dynvar.WithSuggestFn(func(p dyn.Path) string { + return dynvar.SuggestPath(in, p) + }), + ) + assert.ErrorContains(t, err, "reference does not exist: ${bundle.nme}") + assert.ErrorContains(t, err, "did you mean ${bundle.name}?") +} + +func TestResolveNotFoundMultiLevelTypo(t *testing.T) { + in := dyn.V(map[string]dyn.Value{ + "resources": dyn.V(map[string]dyn.Value{ + "jobs": dyn.V(map[string]dyn.Value{ + "my_job": dyn.V(map[string]dyn.Value{ + "id": dyn.V("123"), + }), + }), + }), + "ref": dyn.V("${resources.jbs.my_jb.id}"), + }) + + _, err := dynvar.Resolve(in, dynvar.DefaultLookup(in), + dynvar.WithSuggestFn(func(p dyn.Path) string { 
+ return dynvar.SuggestPath(in, p) + }), + ) + assert.ErrorContains(t, err, "did you mean ${resources.jobs.my_job.id}?") +} +``` + +## Alternatives Considered + +### A. Fix one component at a time (original proposal) + +Only suggest a fix for the first failing component. After the user fixes that, +they re-run and discover the next typo. + +**Rejected** because: +- Requires multiple round-trips for multi-level typos +- The fuzzy walk approach is barely more complex but gives a much better UX + +### B. Enumerate all valid paths in the error + +List all valid sibling keys: + +``` +reference does not exist: ${bundle.nme}; valid keys at "bundle" are: name, target, git +``` + +**Rejected** because for large maps (e.g., `resources.jobs` with dozens of jobs) +this would produce very noisy output. A single close match is more actionable. + +### C. Search the entire tree for the closest leaf path + +Walk the entire normalized tree and compute edit distance for every possible +leaf path against the full reference string. + +**Rejected** because: +- Expensive for large configs (every leaf × string distance) +- Could suggest paths in completely unrelated sections +- The per-component walk is more predictable and faster (bounded by path depth) + +### D. Do nothing — rely on docs/IDE support + +**Rejected** because: +- Many users don't use an IDE for YAML editing +- The error happens at `databricks bundle validate` time, which is the right + place for actionable feedback +- This is low-effort, high-value + +## Relationship to Parser Proposal + +This proposal is **independent** of the regex-to-parser migration. It can be +implemented with the current regex-based `NewRef` — the suggestion logic operates +at the resolution level, not the parsing level. 
+ +However, the two proposals complement each other: +- The parser proposal improves error reporting for **malformed** references + (e.g., `${a..b}`, unterminated `${`) +- This proposal improves error reporting for **well-formed but invalid** + references (e.g., `${bundle.nme}`) + +Both can be implemented and shipped independently. From fb2fec30763734dc27a4d225d7c2b87510814d15 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 12 Mar 2026 00:22:47 +0100 Subject: [PATCH 2/7] Add implementation context file for interpolation parser work Captures the codebase map, key files, invariants, and instructions needed for a new session to implement the two design proposals. Co-Authored-By: Claude Opus 4.6 --- design/implementation-context.md | 135 +++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 design/implementation-context.md diff --git a/design/implementation-context.md b/design/implementation-context.md new file mode 100644 index 0000000000..41f17c9299 --- /dev/null +++ b/design/implementation-context.md @@ -0,0 +1,135 @@ +# Implementation Context for Variable Interpolation Improvements + +This file captures the context needed to implement the two design proposals in +this directory. Read the design docs first, then use this as a map of the +codebase. + +## Two Proposals + +1. **`interpolation-parser.md`** — Replace regex-based `${...}` parsing with a + proper two-mode character scanner. Adds escape support (`$$`), error messages + with byte positions, and cleaner token-based interpolation. + +2. **`variable-lookup-suggestions.md`** — Add "did you mean?" suggestions when + a variable reference path doesn't exist. Uses Levenshtein distance and a + fuzzy walk of the config tree to suggest corrections, including for + multi-level typos. + +These are independent and can be implemented in either order. 
+ +## Key Files to Modify + +### Proposal 1: Parser + +| File | What to do | +|------|------------| +| `libs/dyn/dynvar/parse.go` | **New**. Two-mode scanner: TEXT mode accumulates literals, REFERENCE mode scans `${...}`. Returns `[]Token`. See design doc for full pseudocode. | +| `libs/dyn/dynvar/parse_test.go` | **New**. Test valid refs, escapes (`$$`), errors (unterminated, empty, invalid chars). | +| `libs/dyn/dynvar/ref.go` | Replace regex + `Matches [][]string` with `Tokens []Token`. Update `NewRef`, `IsPure`, `References`. Remove the `re` regex variable. Keep `IsPureVariableReference` and `ContainsVariableReference` working. | +| `libs/dyn/dynvar/ref_test.go` | Update to match new internals. All existing behavioral tests should still pass. | +| `libs/dyn/dynvar/resolve.go` | Update `resolveRef` to use token concatenation instead of `strings.Replace` loop on regex matches. | +| `libs/dyn/dynvar/resolve_test.go` | Add tests for escape sequences. All existing tests must pass unchanged. | +| `python/databricks/bundles/core/_transform.py` | Has a Python regex that must stay in sync (separate PR). Comment on line 11-12 of `ref.go` references this. | + +### Proposal 2: Suggestions + +| File | What to do | +|------|------------| +| `libs/dyn/dynvar/suggest.go` | **New**. `levenshtein()`, `closestKeyMatch()`, `SuggestPath()`. | +| `libs/dyn/dynvar/suggest_test.go` | **New**. Unit tests for distance, matching, path suggestion (single typo, multi-level, with indices, no match). | +| `libs/dyn/mapping.go` | Add `StringKeys() []string` helper on `Mapping`. | +| `libs/dyn/dynvar/resolve.go` | Add `SuggestFn` field to `resolver`, `WithSuggestFn` option, use in `resolveKey` when `NoSuchKeyError`. | +| `libs/dyn/dynvar/resolve_test.go` | Integration tests with suggestions. | +| `bundle/config/mutator/resolve_variable_references.go` | Pass `WithSuggestFn` closure that calls `SuggestPath(normalized, p)`. 
| + +## Current Code Structure + +### `libs/dyn/dynvar/ref.go` — Variable Reference Detection +- `re` regex at line 14-15 matches `${path.to.var[0]}` patterns +- `baseVarDef = [a-zA-Z]+([-_]*[a-zA-Z0-9]+)*` — segment pattern +- `Ref` struct holds: `Value dyn.Value`, `Str string`, `Matches [][]string` +- `NewRef(v)` → uses `re.FindAllStringSubmatch(s, -1)` +- `IsPure()` → single match equals entire string (enables type retention) +- `References()` → extracts variable paths from match groups (`m[1]`) +- Must stay in sync with Python regex in `python/databricks/bundles/core/_transform.py` + +### `libs/dyn/dynvar/resolve.go` — Resolution Pipeline +- 3-phase pipeline: collect → resolve → replace +- `collectVariableReferences()` — walks tree, calls `NewRef` on each string value +- `resolveVariableReferences()` — resolves refs in sorted key order (deterministic cycle detection) +- `resolveRef()` — resolves all deps, then either: + - Pure substitution: single ref, retain original type + - String interpolation: `strings.Replace(ref.Str, ref.Matches[j][0], s, 1)` for each match +- `resolveKey()` — catches `NoSuchKeyError` → `"reference does not exist: ${%s}"`. 
**This is where suggestion logic hooks in.** +- `ErrSkipResolution` — leaves variable reference in place for multi-pass resolution + +### `libs/dyn/dynvar/lookup.go` — Lookup Interface +- `type Lookup func(path dyn.Path) (dyn.Value, error)` +- `ErrSkipResolution` sentinel error +- `DefaultLookup(in)` → creates lookup against a `dyn.Value` + +### `libs/dyn/path_string.go` — Path Parsing +- `NewPathFromString("foo.bar[0].baz")` → `Path{Key("foo"), Key("bar"), Index(0), Key("baz")}` +- Already handles dots, brackets, indices +- **Reuse this for path validation** in the new parser (don't reimplement) + +### `libs/dyn/visit.go` — Tree Traversal +- `noSuchKeyError{p Path}` — raised when map key doesn't exist +- At `visit.go:132-137`: has the map `m` with all sibling keys, but doesn't expose them in the error +- For suggestions, we do NOT modify this. Instead, `SuggestPath()` does its own walk. + +### `libs/dyn/mapping.go` — Map Type +- `Mapping` struct with `pairs []Pair` and `index map[string]int` +- `GetByString(key)` → `(Value, bool)` +- `Keys()` → `[]Value` +- **Need to add**: `StringKeys() []string` + +### `bundle/config/mutator/resolve_variable_references.go` — Bundle-Level Resolution +- Multi-round resolution (up to 11 rounds) +- Creates `normalized` tree via `convert.Normalize(b.Config, root, convert.IncludeMissingFields)` +- Rewrites `${var.foo}` → `${variables.foo.value}` before lookup +- Lookup function: `dyn.GetByPath(normalized, path)` +- `prefixes` control which paths are resolved vs skipped +- **This is where `WithSuggestFn` gets wired in**, passing `normalized` to `SuggestPath` + +### `libs/dyn/walk.go` — Tree Walking +- `Walk(v, fn)` — walks all nodes, calls fn on each +- `CollectLeafPaths(v)` — returns all leaf paths as strings + +## Important Invariants + +1. **`IsPure()` must work identically** — when a string is exactly `${...}` with + no surrounding text, the resolved value retains its original type (int, bool, + map, etc.). 
The parser must preserve this semantic. + +2. **`ErrSkipResolution` must work** — skipped variables are left as literal + `${...}` text in the output. The token-based interpolation must handle this. + +3. **Regex sync with Python** — the Python regex in `_transform.py` must + eventually match whatever the parser accepts. For now, the parser should + accept the same language as the regex (not broader). + +4. **Sorted resolution order** — keys are resolved in sorted order for + deterministic cycle detection errors. Don't change this. + +5. **`var` shorthand** — `${var.X}` is rewritten to `${variables.X.value}` + before lookup. Suggestions should reverse this for user-facing messages. + +## Build & Test + +```bash +# Run all unit tests +make test + +# Run specific package tests +go test ./libs/dyn/dynvar/... + +# Run acceptance tests (after changes) +go test ./acceptance/... + +# Lint +make lintfull + +# Format +make fmtfull +``` From 04e89956241be337feb1e6f11033a5f9987f2070 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 12 Mar 2026 01:59:33 +0100 Subject: [PATCH 3/7] Replace regex-based interpolation parser with character scanner, add "did you mean" suggestions - Replace regex in dynvar/ref.go with a two-mode character scanner that produces typed tokens (literal/ref), enabling proper error reporting for malformed variable references like ${foo.bar-} or ${foo..bar} - Add Levenshtein-based "did you mean" suggestions when a valid reference fails to resolve (e.g., ${var.my_clster_id} -> did you mean ${var.my_cluster_id}?) 
- Add $$ escape sequence for producing literal ${ in output - Add standalone WarnMalformedReferences mutator in initialize phase - Update token-based string interpolation in resolver Co-Authored-By: Claude Opus 4.6 --- .../bad_syntax/out.deploy.direct.txt | 3 + .../bad_syntax/out.plan.direct.txt | 3 + .../bad_syntax/out.plan.terraform.txt | 3 + .../resource_deps/bad_syntax/output.txt | 3 + .../variables/did_you_mean/databricks.yml | 11 + .../variables/did_you_mean/out.test.toml | 5 + .../bundle/variables/did_you_mean/output.txt | 13 + .../bundle/variables/did_you_mean/script | 1 + .../malformed_reference/databricks.yml | 6 + .../malformed_reference/out.test.toml | 5 + .../variables/malformed_reference/output.txt | 12 + .../variables/malformed_reference/script | 1 + .../mutator/resolve_variable_references.go | 35 +- .../mutator/warn_malformed_references.go | 47 ++ bundle/phases/initialize.go | 4 + design/interpolation-parser.md | 496 ++---------------- libs/dyn/dynvar/interpolation/parse.go | 170 ++++++ libs/dyn/dynvar/interpolation/parse_test.go | 239 +++++++++ libs/dyn/dynvar/ref.go | 108 ++-- libs/dyn/dynvar/ref_test.go | 32 ++ libs/dyn/dynvar/resolve.go | 83 ++- libs/dyn/dynvar/resolve_test.go | 39 ++ libs/dyn/dynvar/suggest.go | 108 ++++ libs/dyn/dynvar/suggest_test.go | 175 ++++++ 24 files changed, 1092 insertions(+), 510 deletions(-) create mode 100644 acceptance/bundle/variables/did_you_mean/databricks.yml create mode 100644 acceptance/bundle/variables/did_you_mean/out.test.toml create mode 100644 acceptance/bundle/variables/did_you_mean/output.txt create mode 100644 acceptance/bundle/variables/did_you_mean/script create mode 100644 acceptance/bundle/variables/malformed_reference/databricks.yml create mode 100644 acceptance/bundle/variables/malformed_reference/out.test.toml create mode 100644 acceptance/bundle/variables/malformed_reference/output.txt create mode 100644 acceptance/bundle/variables/malformed_reference/script create mode 100644 
bundle/config/mutator/warn_malformed_references.go create mode 100644 libs/dyn/dynvar/interpolation/parse.go create mode 100644 libs/dyn/dynvar/interpolation/parse_test.go create mode 100644 libs/dyn/dynvar/suggest.go create mode 100644 libs/dyn/dynvar/suggest_test.go diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt b/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt index 6bed0296b3..2fcc34c7f6 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt @@ -1,3 +1,6 @@ +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path + in databricks.yml:11:21 + Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... Deploying resources... Updating deployment state... diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt b/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt index 1a40fdbaa3..2a3d2e7e0a 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt @@ -1,3 +1,6 @@ +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path + in databricks.yml:11:21 + create volumes.bar create volumes.foo diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt b/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt index b8f8078aba..339fa7f20c 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt @@ -1,3 +1,6 @@ +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path + in databricks.yml:11:21 + Error: exit status 1 Error: Invalid attribute name diff --git a/acceptance/bundle/resource_deps/bad_syntax/output.txt 
b/acceptance/bundle/resource_deps/bad_syntax/output.txt index 0e9d83b643..e2867c47c8 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/output.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/output.txt @@ -1,5 +1,8 @@ >>> [CLI] bundle validate -o json +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path + in databricks.yml:11:21 + { "volumes": { "bar": { diff --git a/acceptance/bundle/variables/did_you_mean/databricks.yml b/acceptance/bundle/variables/did_you_mean/databricks.yml new file mode 100644 index 0000000000..ff4da90ba8 --- /dev/null +++ b/acceptance/bundle/variables/did_you_mean/databricks.yml @@ -0,0 +1,11 @@ +bundle: + name: did-you-mean + +variables: + my_cluster_id: + default: abc123 + +resources: + jobs: + my_job: + name: "${var.my_clster_id}" diff --git a/acceptance/bundle/variables/did_you_mean/out.test.toml b/acceptance/bundle/variables/did_you_mean/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/did_you_mean/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/did_you_mean/output.txt b/acceptance/bundle/variables/did_you_mean/output.txt new file mode 100644 index 0000000000..b504098ca5 --- /dev/null +++ b/acceptance/bundle/variables/did_you_mean/output.txt @@ -0,0 +1,13 @@ + +>>> [CLI] bundle validate +Error: reference does not exist: ${var.my_clster_id}. did you mean ${var.my_cluster_id}? 
+ +Name: did-you-mean +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/did-you-mean/default + +Found 1 error + +Exit code: 1 diff --git a/acceptance/bundle/variables/did_you_mean/script b/acceptance/bundle/variables/did_you_mean/script new file mode 100644 index 0000000000..5350876150 --- /dev/null +++ b/acceptance/bundle/variables/did_you_mean/script @@ -0,0 +1 @@ +trace $CLI bundle validate diff --git a/acceptance/bundle/variables/malformed_reference/databricks.yml b/acceptance/bundle/variables/malformed_reference/databricks.yml new file mode 100644 index 0000000000..63a0282395 --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/databricks.yml @@ -0,0 +1,6 @@ +bundle: + name: "${foo.bar-}" + +variables: + a: + default: hello diff --git a/acceptance/bundle/variables/malformed_reference/out.test.toml b/acceptance/bundle/variables/malformed_reference/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/malformed_reference/output.txt b/acceptance/bundle/variables/malformed_reference/output.txt new file mode 100644 index 0000000000..05288411bb --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/output.txt @@ -0,0 +1,12 @@ + +>>> [CLI] bundle validate +Warning: invalid variable reference ${foo.bar-} at position 0: invalid path + in databricks.yml:2:9 + +Name: ${foo.bar-} +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/${foo.bar-}/default + +Found 1 warning diff --git a/acceptance/bundle/variables/malformed_reference/script b/acceptance/bundle/variables/malformed_reference/script new file mode 100644 index 0000000000..5350876150 --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/script @@ -0,0 
+1 @@ +trace $CLI bundle validate diff --git a/bundle/config/mutator/resolve_variable_references.go b/bundle/config/mutator/resolve_variable_references.go index 9868b4fb95..ede46f1c48 100644 --- a/bundle/config/mutator/resolve_variable_references.go +++ b/bundle/config/mutator/resolve_variable_references.go @@ -151,6 +151,7 @@ func (m *resolveVariableReferences) Apply(ctx context.Context, b *bundle.Bundle) varPath := dyn.NewPath(dyn.Key("var")) var diags diag.Diagnostics + maxRounds := 1 + m.extraRounds for round := range maxRounds { @@ -202,6 +203,38 @@ func (m *resolveVariableReferences) resolveOnce(b *bundle.Bundle, prefixes []dyn // normalized, _ := convert.Normalize(b.Config, root, convert.IncludeMissingFields) + suggestFn := dynvar.WithSuggestFn(func(p dyn.Path) string { + // Rewrite var.X -> variables.X.value for suggestion lookup, + // then convert the suggestion back to var.X form. + isVar := p.HasPrefix(varPath) + if isVar { + newPath := dyn.NewPath(dyn.Key("variables"), p[1], dyn.Key("value")) + if len(p) > 2 { + newPath = newPath.Append(p[2:]...) + } + p = newPath + } + suggestion := dynvar.SuggestPath(normalized, p) + if suggestion == "" { + return "" + } + // Convert variables.X.value back to var.X for user-facing messages. + if isVar { + sp, err := dyn.NewPathFromString(suggestion) + if err != nil { + return suggestion + } + variablesPrefix := dyn.NewPath(dyn.Key("variables")) + valueSuffix := dyn.NewPath(dyn.Key("value")) + if rest, ok := sp.CutPrefix(variablesPrefix); ok { + if rest, ok := rest.CutSuffix(valueSuffix); ok { + return dyn.NewPath(dyn.Key("var")).Append(rest...).String() + } + } + } + return suggestion + }) + // If the pattern is nil, we resolve references in the entire configuration. root, err := dyn.MapByPattern(root, m.pattern, func(p dyn.Path, v dyn.Value) (dyn.Value, error) { // Resolve variable references in all values. 
@@ -236,7 +269,7 @@ func (m *resolveVariableReferences) resolveOnce(b *bundle.Bundle, prefixes []dyn } return dyn.InvalidValue, dynvar.ErrSkipResolution - }) + }, suggestFn) }) if err != nil { return dyn.InvalidValue, err diff --git a/bundle/config/mutator/warn_malformed_references.go b/bundle/config/mutator/warn_malformed_references.go new file mode 100644 index 0000000000..cf6934c7b7 --- /dev/null +++ b/bundle/config/mutator/warn_malformed_references.go @@ -0,0 +1,47 @@ +package mutator + +import ( + "context" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/libs/diag" + "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/dyn/dynvar" +) + +type warnMalformedReferences struct{} + +// WarnMalformedReferences returns a mutator that emits warnings for strings +// containing malformed variable references (e.g. "${foo.bar-}"). +func WarnMalformedReferences() bundle.Mutator { + return &warnMalformedReferences{} +} + +func (*warnMalformedReferences) Name() string { + return "WarnMalformedReferences" +} + +func (*warnMalformedReferences) Validate(ctx context.Context, b *bundle.Bundle) error { + return nil +} + +func (*warnMalformedReferences) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { + var diags diag.Diagnostics + err := b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) { + _, err := dyn.Walk(root, func(p dyn.Path, v dyn.Value) (dyn.Value, error) { + // Only check values with source locations to avoid false positives + // from synthesized/computed values. 
+ if len(v.Locations()) == 0 { + return v, nil + } + _, _, refDiags := dynvar.NewRefWithDiagnostics(v) + diags = diags.Extend(refDiags) + return v, nil + }) + return root, err + }) + if err != nil { + diags = diags.Extend(diag.FromErr(err)) + } + return diags +} diff --git a/bundle/phases/initialize.go b/bundle/phases/initialize.go index 761714b48e..83fc1b6047 100644 --- a/bundle/phases/initialize.go +++ b/bundle/phases/initialize.go @@ -99,6 +99,10 @@ func Initialize(ctx context.Context, b *bundle.Bundle) { // Resolves and sets values for bundle variables in the following order: from environment variables, from variable files and then defaults mutator.SetVariables(), + // Walks the configuration tree once to emit warnings for strings containing + // malformed variable references (e.g. "${foo.bar-}"). + mutator.WarnMalformedReferences(), + // Reads (dynamic): variables.*.lookup (checks for variable references in lookup fields) // Updates (dynamic): variables.*.lookup (resolves variable references in lookup fields) // Prevents circular references between lookup variables diff --git a/design/interpolation-parser.md b/design/interpolation-parser.md index 2495949adf..468fe094e3 100644 --- a/design/interpolation-parser.md +++ b/design/interpolation-parser.md @@ -1,473 +1,67 @@ -# Proposal: Replace Regex-Based Variable Interpolation with a Proper Parser +# Variable Interpolation: Parser & "Did You Mean" Suggestions -## Status: Draft +Author: Shreyas Goenka +Date: 12 March 2026 -## Background +## Motivation -DABs variable interpolation (`${...}` syntax) is currently implemented via regex -matching in `libs/dyn/dynvar/ref.go`: +DABs variable interpolation (`${...}`) was regex-based. This caused: -```go -baseVarDef = `[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*` -re = regexp.MustCompile( - fmt.Sprintf(`\$\{(%s(\.%s(\[[0-9]+\])*)*(\[[0-9]+\])*)\}`, baseVarDef, baseVarDef), -) -``` +1. **Silent failures** — `${foo.bar-}` silently treated as literal text with no warning. +2. 
**No suggestions** — `${bundle.nme}` produces "reference does not exist" with no hint. +3. **No escape mechanism** — no way to produce a literal `${` in output. -This regex is used in `NewRef()` via `FindAllStringSubmatch` to extract all -`${...}` references from a string. The matched substrings are then resolved by -the pipeline in `resolve.go` (collect → resolve → replace). +## Design Decisions -### Problems with the Regex Approach +### Two-mode character scanner (not recursive descent) -1. **No error reporting.** A non-match produces zero information — the user - gets no feedback about *why* `${foo.bar-}` or `${foo..bar}` is silently - ignored. Invalid references are indistinguishable from literal text. +For syntax as simple as `${path.to.var[0]}`, a recursive descent parser is +overkill. A two-mode scanner (TEXT / REFERENCE) is sufficient and easy to +port to the Python implementation. -2. **No position information.** Errors cannot point to the character where - parsing fails. When resolution *does* fail, the error messages refer to the - matched string but not its location within the original value. +See `libs/dyn/dynvar/interpolation/parse.go`. -3. **Hard to extend.** Adding new syntax (e.g., default values like - `${var.name:-default}`, or function calls like `${upper(var.name)}`) requires - modifying a regex that is already at the edge of readability. +### Nested `${` handling -4. **No escape mechanism.** There is no way to produce a literal `${` in a - string. Users who need `${` in their output (e.g., shell scripts, ARM - templates) have no workaround. +Existing configs use patterns like `${var.foo_${var.tail}}` where the inner +reference resolves first. The old regex matched only `${var.tail}` (the +innermost pair). The new parser preserves this: when scanning for `}` inside +a reference, if another `${` is encountered, the outer `${` is abandoned +(treated as literal) and scanning restarts from the inner `${`. -5. 
**Dual maintenance burden.** The regex must be kept in sync with a Python - regex in `python/databricks/bundles/core/_transform.py` — a fragile - arrangement with no automated enforcement. +### `$$` escape sequence -6. **Silent acceptance of ambiguous input.** The regex approach cannot - distinguish between "this string has no variable references" and "this string - has a malformed variable reference that should be reported." +Following HCL2's precedent, `$$` before `{` produces a literal `$`. This is +backward compatible — no existing config uses `$${` (the old regex wouldn't +match it). -## Research: How Other Systems Parse `${...}` +### Malformed reference warnings -| System | Strategy | Escape | Nesting | Error Quality | -|--------|----------|--------|---------|---------------| -| Go `text/template` | State-function lexer | None | Paren depth | Line + template name | -| HCL2 (Terraform) | Ragel FSM + recursive descent | `$${` → literal `${` | Brace depth stack | Source range + suggestions | -| Python f-strings (PEP 701) | Mode-stack tokenizer | `{{` → `{` | Mode stack | Line/column | -| Rust `format!` | Iterator-based descent | `{{`/`}}` | N/A | Spans + suggestions | -| Bash | Char-by-char + depth tracking | `\$` | Full recursive | Line number | +A standalone `WarnMalformedReferences` mutator walks the config tree once +before variable resolution. It only checks values with source locations +(`len(v.Locations()) > 0`) to avoid false positives from synthesized values +(e.g., normalized/computed paths). -**Key insight from the research:** For a syntax as simple as `${path.to.var[0]}` -(no nested expressions, no function calls, no operators), a full recursive -descent parser is overkill. The right tool is a **two-mode character scanner** — -the same pattern used by Go's `text/template` lexer and HCL's scanner at their -core. This gives us proper error reporting, escape support, and extensibility -without the complexity of a full parser generator. 
+### "Did you mean" suggestions -## Proposed Design +When a valid-syntax reference fails to resolve (`NoSuchKeyError`), the +resolver calls a `SuggestFn` that walks the config tree component by +component using Levenshtein distance. The suggestion is appended to the +existing error: `did you mean ${var.my_cluster_id}?`. -### Architecture: Two-Phase Scanner +The `SuggestFn` receives the raw path from the reference (e.g., `var.X`), +rewrites it to `variables.X.value` for lookup, then converts the suggestion +back to `var.X` form for user-facing messages. -Replace the regex with a small, explicit scanner that operates in two modes: +See `libs/dyn/dynvar/suggest.go`. -``` -Mode 1: TEXT — accumulate literal characters -Mode 2: REFERENCE — accumulate variable path characters inside ${...} -``` +### Token-based resolution -The scanner produces a flat list of tokens. No AST, no recursive descent — just -a linear scan that cleanly separates literal text from variable references. +The resolver's string interpolation changed from `strings.Replace` (with +count=1 to avoid double-replacing duplicate refs) to a token concatenation +loop. Each `TokenRef` maps 1:1 to a resolved value, eliminating the ambiguity. -### Token Types +## Python sync -```go -// TokenKind represents the type of a parsed token. -type TokenKind int - -const ( - TokenLiteral TokenKind = iota // Literal text (no interpolation) - TokenRef // Variable reference: content between ${ and } -) -``` - -### Core Data Structure - -```go -// Token represents a parsed segment of an interpolation string. -type Token struct { - Kind TokenKind - Value string // For Literal: the text. For Ref: the variable path (e.g., "a.b[0].c"). - Start int // Byte offset of the start of this token in the original string. - End int // Byte offset past the end of this token. -} -``` - -### Scanner Implementation - -```go -// Parse parses a string that may contain ${...} variable references. 
-// It returns a slice of tokens representing literal text and variable references. -// -// Escape sequences: -// - "$$" produces a literal "$" (only when followed by "{") -// -// Examples: -// - "hello" → [Literal("hello")] -// - "${a.b}" → [Ref("a.b")] -// - "pre ${a.b} post" → [Literal("pre "), Ref("a.b"), Literal(" post")] -// - "$${a.b}" → [Literal("${a.b}")] -// - "${a.b} ${c[0]}" → [Ref("a.b"), Literal(" "), Ref("c[0]")] -func Parse(s string) ([]Token, error) { - var tokens []Token - i := 0 - buf := strings.Builder{} // accumulates literal text - - flushLiteral := func(end int) { - if buf.Len() > 0 { - tokens = append(tokens, Token{ - Kind: TokenLiteral, - Value: buf.String(), - Start: end - buf.Len(), - End: end, - }) - buf.Reset() - } - } - - for i < len(s) { - if s[i] != '$' { - buf.WriteByte(s[i]) - i++ - continue - } - - // We see '$'. Look ahead. - if i+1 >= len(s) { - // Trailing '$' — treat as literal. - buf.WriteByte('$') - i++ - continue - } - - switch s[i+1] { - case '$': - // Escape: "$$" → literal "$". - buf.WriteByte('$') - i += 2 - - case '{': - // Start of variable reference. - flushLiteral(i) - refStart := i - i += 2 // skip "${" - - // Scan the variable path until we find '}'. - pathStart := i - for i < len(s) && s[i] != '}' { - i++ - } - - if i >= len(s) { - return nil, fmt.Errorf( - "unterminated variable reference at position %d", - refStart, - ) - } - - path := s[pathStart:i] - i++ // skip '}' - - if path == "" { - return nil, fmt.Errorf( - "empty variable reference at position %d", - refStart, - ) - } - - // Validate the path content. - if err := validatePath(path, refStart); err != nil { - return nil, err - } - - tokens = append(tokens, Token{ - Kind: TokenRef, - Value: path, - Start: refStart, - End: i, - }) - - default: - // '$' not followed by '$' or '{' — treat as literal. 
- buf.WriteByte('$') - i++ - } - } - - flushLiteral(i) - return tokens, nil -} -``` - -### Path Validation - -Rather than encoding path rules in the regex, validate path contents explicitly -after extraction. This function reuses the existing `dyn.NewPathFromString` but -adds character-level error reporting: - -```go -// validatePath checks that a variable path is well-formed. -// It wraps dyn.NewPathFromString with position-aware error messages. -func validatePath(path string, refStart int) error { - _, err := dyn.NewPathFromString(path) - if err != nil { - return fmt.Errorf( - "invalid variable reference ${%s} at position %d: %w", - path, refStart, err, - ) - } - return nil -} -``` - -We should also add validation for the character set used in path segments. The -current regex implicitly enforces `[a-zA-Z]` start and `[a-zA-Z0-9_-]` -continuation. This should move to an explicit check inside path validation: - -```go -func validatePathSegment(seg string) error { - if len(seg) == 0 { - return fmt.Errorf("empty path segment") - } - if seg[0] < 'A' || (seg[0] > 'Z' && seg[0] < 'a') || seg[0] > 'z' { - return fmt.Errorf("path segment must start with a letter, got %q", seg[0]) - } - // ... check continuation characters ... -} -``` - -### Updated Ref Type - -The `Ref` struct changes from storing raw regex match groups to storing parsed -tokens: - -```go -type Ref struct { - Value dyn.Value // Original dyn.Value. - Str string // Original string content. - Tokens []Token // Parsed tokens (literals and references). -} - -func NewRef(v dyn.Value) (Ref, bool) { - s, ok := v.AsString() - if !ok { - return Ref{}, false - } - - tokens, err := Parse(s) - if err != nil { - // Return error through a new error-aware API (see Migration section). - return Ref{}, false - } - - // Check if any token is a reference. 
- hasRef := false - for _, t := range tokens { - if t.Kind == TokenRef { - hasRef = true - break - } - } - if !hasRef { - return Ref{}, false - } - - return Ref{Value: v, Str: s, Tokens: tokens}, true -} -``` - -### Updated Resolution Logic - -The string interpolation in `resolveRef` simplifies from a regex-replacement -loop to a token-concatenation loop: - -```go -func (r *resolver) resolveRef(ref Ref, seen []string) (dyn.Value, error) { - deps := ref.References() - resolved := make([]dyn.Value, len(deps)) - complete := true - - // ... resolve deps (unchanged) ... - - // Pure substitution (single ref, no literals). - if ref.IsPure() && complete { - return dyn.NewValue(resolved[0].Value(), ref.Value.Locations()), nil - } - - // String interpolation: concatenate tokens. - var buf strings.Builder - refIdx := 0 - for _, tok := range ref.Tokens { - switch tok.Kind { - case TokenLiteral: - buf.WriteString(tok.Value) - case TokenRef: - if !resolved[refIdx].IsValid() { - // Skipped — write original ${...} back. - buf.WriteString("${") - buf.WriteString(tok.Value) - buf.WriteByte('}') - } else { - s, err := valueToString(resolved[refIdx]) - if err != nil { - return dyn.InvalidValue, err - } - buf.WriteString(s) - } - refIdx++ - } - } - - return dyn.NewValue(buf.String(), ref.Value.Locations()), nil -} -``` - -This is cleaner than the current approach which uses `strings.Replace` with a -count of 1 — a trick needed to avoid double-replacing when the same variable -appears multiple times. - -### Helper Methods on Ref - -```go -// IsPure returns true if the string is a single variable reference with no -// surrounding text (e.g., "${a.b}" but not "x ${a.b}" or "${a} ${b}"). -func (v Ref) IsPure() bool { - return len(v.Tokens) == 1 && v.Tokens[0].Kind == TokenRef -} - -// References returns the variable paths referenced in this string. 
-func (v Ref) References() []string { - var out []string - for _, t := range v.Tokens { - if t.Kind == TokenRef { - out = append(out, t.Value) - } - } - return out -} -``` - -### Escape Sequence: `$$` - -Following HCL2's precedent, `$$` before `{` produces a literal `$`. This is the -most natural escape for users already familiar with Terraform/HCL: - -| Input | Output | -|-------|--------| -| `${a.b}` | *(resolved value of a.b)* | -| `$${a.b}` | `${a.b}` (literal) | -| `$$notbrace` | `$notbrace` (literal) | -| `$notbrace` | `$notbrace` (literal) | - -This is backward compatible: `$$` is not a valid prefix today (the regex -requires `${`), so no existing config uses `$$` in a way that would change -meaning. - -## File Changes - -| File | Change | -|------|--------| -| `libs/dyn/dynvar/ref.go` | Replace regex + `Matches` with `Parse()` + `[]Token` | -| `libs/dyn/dynvar/ref_test.go` | Update tests: add parser tests, keep behavioral tests | -| `libs/dyn/dynvar/resolve.go` | Update `resolveRef` to use token concatenation | -| `libs/dyn/dynvar/resolve_test.go` | Add tests for escape sequences, error messages | -| `libs/dyn/dynvar/parse.go` | **New file**: scanner + token types | -| `libs/dyn/dynvar/parse_test.go` | **New file**: scanner unit tests | -| `python/databricks/bundles/core/_transform.py` | Update Python side to match (separate PR) | - -## Migration Strategy - -### Phase 1: Add Parser, Keep Regex - -1. Implement `Parse()` in a new `parse.go` file with full test coverage. -2. Add a `NewRefWithDiagnostics(v dyn.Value) (Ref, diag.Diagnostics)` that - uses the parser and can return warnings for malformed references. -3. Keep the existing `NewRef` as-is, calling the parser internally but falling - back to the regex for any parse errors (belt-and-suspenders). -4. Add logging when the parser and regex disagree, to catch discrepancies. - -### Phase 2: Switch Over - -1. Remove the regex fallback — `NewRef` uses only the parser. -2. 
Update `Ref` to store `[]Token` instead of `[][]string`. -3. Update `resolveRef` to use token concatenation. -4. Remove the `re` variable. - -### Phase 3: Add Escape Support - -1. Enable `$$` escape handling in the parser. -2. Document the escape sequence. -3. Update the Python implementation. - -## Compatibility - -- **Forward compatible:** All strings that currently contain valid `${...}` - references will parse identically. The parser accepts a strict superset of - the regex (it can also report errors for malformed references). - -- **Backward compatible escape:** `$$` before `{` is a new feature, not a - breaking change. No existing valid config contains `$${` (the regex would not - match it, and a literal `$${` in YAML has no special meaning today). - -- **Error reporting is additive:** Strings that silently failed to match the - regex will now produce actionable error messages. This is a UX improvement, - not a breaking change, though it could surface new warnings for configs that - previously "worked" by accident (e.g., a typo like `${foo.bar-}` was silently - treated as literal text). - -## Testing Plan - -1. **Parser unit tests** (`parse_test.go`): - - Valid references: single, multiple, with indices, with hyphens/underscores - - Escape sequences: `$$`, `$` at end of string, `$` before non-`{` - - Error cases: unterminated `${`, empty `${}`, invalid characters in path - - Position tracking: verify `Start`/`End` offsets are correct - -2. **Ref behavioral tests** (`ref_test.go`): - - All existing tests pass unchanged - - New tests for `IsPure()` and `References()` using token-based `Ref` - -3. **Resolution tests** (`resolve_test.go`): - - All existing tests pass unchanged - - New tests for escape sequences in interpolation - - New tests verifying improved error messages - -4. **Acceptance tests**: - - Add acceptance test with `$$` escape in `databricks.yml` - - Verify existing acceptance tests pass without output changes - -## Why Not a More Powerful Parser? 
- -A recursive descent parser or parser combinator would allow richer syntax (nested -expressions, function calls, filters). We deliberately avoid this because: - -1. **YAGNI.** The current `${path.to.var[0]}` syntax covers all use cases. There - are no open feature requests for computed expressions inside `${...}`. - -2. **Two implementations.** Any syntax change must be mirrored in the Python - implementation. A simple scanner is easy to port; a recursive descent parser - is not. - -3. **Terraform alignment.** DABs variable references are conceptually similar to - HCL variable references. Keeping the syntax simple avoids user confusion - about what expressions are supported. - -If we ever need richer expressions, the token-based architecture makes it easy to -add a parser layer on top of the scanner without changing the `Ref`/`Token` types -or the resolution pipeline. - -## Summary - -Replace the regex in `dynvar/ref.go` with a ~80-line character scanner that: -- Produces the same results for all valid inputs -- Reports actionable errors for invalid inputs (with byte positions) -- Supports `$$` escape for literal `${` output -- Is straightforward to read, test, and extend -- Simplifies the interpolation logic in `resolve.go` from regex-replacement to - token concatenation +The Python regex in `python/databricks/bundles/core/_transform.py` needs a +corresponding update in a follow-up PR. diff --git a/libs/dyn/dynvar/interpolation/parse.go b/libs/dyn/dynvar/interpolation/parse.go new file mode 100644 index 0000000000..4232f7416c --- /dev/null +++ b/libs/dyn/dynvar/interpolation/parse.go @@ -0,0 +1,170 @@ +package interpolation + +import ( + "fmt" + "regexp" + "strings" +) + +// TokenKind represents the type of a parsed token. +type TokenKind int + +const ( + TokenLiteral TokenKind = iota // Literal text (no interpolation) + TokenRef // Variable reference: content between ${ and } +) + +// Token represents a parsed segment of an interpolation string. 
// TokenKind discriminates the two kinds of parsed segments.
type TokenKind int

const (
	TokenLiteral TokenKind = iota // Literal text (no interpolation)
	TokenRef                      // Variable reference: content between ${ and }
)

// Token is one parsed segment of an interpolation string.
type Token struct {
	Kind  TokenKind
	Value string // For TokenLiteral: the literal text; for TokenRef: the path string (without ${})
	Start int    // Start offset in the original string
	End   int    // End offset in the original string (exclusive)
}

const (
	dollarChar = '$'
	openBrace  = '{'
	closeBrace = '}'
)

// pathPattern validates the content inside ${...}. Each segment must start
// with a letter and may continue with letters, digits, '-' or '_', with
// optional [N] index suffixes after a segment.
// NOTE(review): keep in sync with the Python regex in
// python/databricks/bundles/core/_transform.py — confirm with that file.
var pathPattern = regexp.MustCompile(
	`^[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*(\.[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*(\[[0-9]+\])*)*(\[[0-9]+\])*$`,
)

// Parse splits a string that may contain ${...} variable references into a
// sequence of literal and reference tokens.
//
// Escape sequences:
//   - "$$" produces a literal "$"
//
// Examples:
//   - "hello" -> [Literal("hello")]
//   - "${a.b}" -> [Ref("a.b")]
//   - "pre ${a.b} post" -> [Literal("pre "), Ref("a.b"), Literal(" post")]
//   - "$${a.b}" -> [Literal("${a.b}")]
func Parse(s string) ([]Token, error) {
	if s == "" {
		return nil, nil
	}

	var (
		tokens  []Token
		lit     strings.Builder
		litFrom int // start offset of the pending literal run
	)

	// appendLiteral records one literal byte, noting where the run began.
	appendLiteral := func(pos int, b byte) {
		if lit.Len() == 0 {
			litFrom = pos
		}
		lit.WriteByte(b)
	}

	// emitLiteral flushes the pending literal run as a token ending at end.
	emitLiteral := func(end int) {
		if lit.Len() == 0 {
			return
		}
		tokens = append(tokens, Token{
			Kind:  TokenLiteral,
			Value: lit.String(),
			Start: litFrom,
			End:   end,
		})
		lit.Reset()
	}

	pos := 0
	for pos < len(s) {
		c := s[pos]

		// Ordinary byte, or a lone '$' that ends the string.
		if c != dollarChar || pos+1 >= len(s) {
			appendLiteral(pos, c)
			pos++
			continue
		}

		switch s[pos+1] {
		case dollarChar:
			// "$$" escapes to a single literal '$'.
			appendLiteral(pos, dollarChar)
			pos += 2

		case openBrace:
			// "${": scan for the closing brace, bailing out if a nested
			// "${" shows up before it.
			end := pos + 2
			nested := false
			for end < len(s) && s[end] != closeBrace {
				if s[end] == dollarChar && end+1 < len(s) && s[end+1] == openBrace {
					nested = true
					break
				}
				end++
			}

			if nested {
				// Demote the outer '$' to literal text and rescan from the
				// '{' that follows it; the inner "${" parses normally.
				appendLiteral(pos, dollarChar)
				pos++
				continue
			}

			if end == len(s) {
				return nil, fmt.Errorf("unterminated variable reference at position %d", pos)
			}

			path := s[pos+2 : end]
			if path == "" {
				return nil, fmt.Errorf("empty variable reference at position %d", pos)
			}
			if !pathPattern.MatchString(path) {
				return nil, fmt.Errorf("invalid variable reference ${%s} at position %d: invalid path", path, pos)
			}

			emitLiteral(pos)
			tokens = append(tokens, Token{
				Kind:  TokenRef,
				Value: path,
				Start: pos,
				End:   end + 1,
			})
			pos = end + 1

		default:
			// '$' not starting an escape or a reference: literal.
			appendLiteral(pos, dollarChar)
			pos++
		}
	}

	emitLiteral(pos)
	return tokens, nil
}
{"${a.b_c_d}", "a.b_c_d"}, + {"${abc.def.ghi}", "abc.def.ghi"}, + {"${a.b123}", "a.b123"}, + {"${resources.jobs.my-job.id}", "resources.jobs.my-job.id"}, + {"${var.my_var}", "var.my_var"}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + tokens, err := Parse(tt.input) + require.NoError(t, err) + require.Len(t, tokens, 1) + assert.Equal(t, TokenRef, tokens[0].Kind) + assert.Equal(t, tt.path, tokens[0].Value) + }) + } +} + +func TestParseEscapeDollar(t *testing.T) { + t.Run("double_dollar", func(t *testing.T) { + tokens, err := Parse("$$") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "$", Start: 0, End: 2}, + }, tokens) + }) + + t.Run("escaped_ref", func(t *testing.T) { + tokens, err := Parse("$${a}") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "${a}", Start: 0, End: 5}, + }, tokens) + }) + + t.Run("double_escape", func(t *testing.T) { + tokens, err := Parse("$$$$") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "$$", Start: 0, End: 4}, + }, tokens) + }) + + t.Run("escape_then_ref", func(t *testing.T) { + tokens, err := Parse("$$$$${a.b}") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "$$", Start: 0, End: 4}, + {Kind: TokenRef, Value: "a.b", Start: 4, End: 10}, + }, tokens) + }) +} + +func TestParseDollarAtEnd(t *testing.T) { + tokens, err := Parse("abc$") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "abc$", Start: 0, End: 4}, + }, tokens) +} + +func TestParseDollarBeforeNonBrace(t *testing.T) { + tokens, err := Parse("$x") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "$x", Start: 0, End: 2}, + }, tokens) +} + +func TestParseNestedReference(t *testing.T) { + // Nested ${...} inside a reference: the outer ${ is treated as literal, + // and the inner reference is parsed normally. 
+ tokens, err := Parse("${var.foo_${var.tail}}") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "${var.foo_", Start: 0, End: 10}, + {Kind: TokenRef, Value: "var.tail", Start: 10, End: 21}, + {Kind: TokenLiteral, Value: "}", Start: 21, End: 22}, + }, tokens) +} + +func TestParseUnterminatedRef(t *testing.T) { + _, err := Parse("${a.b") + require.Error(t, err) + assert.Contains(t, err.Error(), "unterminated") +} + +func TestParseEmptyRef(t *testing.T) { + _, err := Parse("${}") + require.Error(t, err) + assert.Contains(t, err.Error(), "empty") +} + +func TestParseInvalidPaths(t *testing.T) { + tests := []struct { + name string + input string + }{ + {"trailing_hyphen", "${foo.bar-}"}, + {"double_dot", "${foo..bar}"}, + {"leading_digit", "${0foo}"}, + {"hyphen_start_segment", "${foo.-bar}"}, + {"trailing_dot", "${foo.}"}, + {"leading_dot", "${.foo}"}, + {"space_in_path", "${foo. bar}"}, + {"special_char", "${foo.bar!}"}, + {"just_digits", "${123}"}, + {"trailing_underscore", "${foo.bar_}"}, + {"underscore_start_segment", "${foo._bar}"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := Parse(tt.input) + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid") + }) + } +} + +func TestParsePositions(t *testing.T) { + t.Run("single_ref", func(t *testing.T) { + tokens, err := Parse("${a.b}") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenRef, Value: "a.b", Start: 0, End: 6}, + }, tokens) + }) + + t.Run("literal_ref_literal", func(t *testing.T) { + tokens, err := Parse("pre ${a.b} post") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "pre ", Start: 0, End: 4}, + {Kind: TokenRef, Value: "a.b", Start: 4, End: 10}, + {Kind: TokenLiteral, Value: " post", Start: 10, End: 15}, + }, tokens) + }) + + t.Run("escaped_ref", func(t *testing.T) { + tokens, err := Parse("$${a}") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, 
Value: "${a}", Start: 0, End: 5}, + }, tokens) + }) + + t.Run("adjacent_refs", func(t *testing.T) { + tokens, err := Parse("${a}${b}") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenRef, Value: "a", Start: 0, End: 4}, + {Kind: TokenRef, Value: "b", Start: 4, End: 8}, + }, tokens) + }) + + t.Run("dollar_sign_mid_literal", func(t *testing.T) { + tokens, err := Parse("a$b") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenLiteral, Value: "a$b", Start: 0, End: 3}, + }, tokens) + }) + + t.Run("escape_between_refs", func(t *testing.T) { + tokens, err := Parse("${a}$$${b}") + require.NoError(t, err) + assert.Equal(t, []Token{ + {Kind: TokenRef, Value: "a", Start: 0, End: 4}, + {Kind: TokenLiteral, Value: "$", Start: 4, End: 6}, + {Kind: TokenRef, Value: "b", Start: 6, End: 10}, + }, tokens) + }) +} diff --git a/libs/dyn/dynvar/ref.go b/libs/dyn/dynvar/ref.go index a048c80cb8..772b7620b0 100644 --- a/libs/dyn/dynvar/ref.go +++ b/libs/dyn/dynvar/ref.go @@ -1,23 +1,13 @@ package dynvar import ( - "fmt" - "regexp" - + "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" -) - -var ( - // !!! Should be in sync with _variable_regex in Python code. - // !!! - // !!! See python/databricks/bundles/core/_transform.py - baseVarDef = `[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*` - re = regexp.MustCompile(fmt.Sprintf(`\$\{(%s(\.%s(\[[0-9]+\])*)*(\[[0-9]+\])*)\}`, baseVarDef, baseVarDef)) + "github.com/databricks/cli/libs/dyn/dynvar/interpolation" ) // Ref represents a variable reference. // It is a string [dyn.Value] contained in a larger [dyn.Value]. -// Its path within the containing [dyn.Value] is also stored. type Ref struct { // Original value. Value dyn.Value @@ -25,13 +15,13 @@ type Ref struct { // String value in the original [dyn.Value]. Str string - // Matches of the variable reference in the string. - Matches [][]string + // Parsed tokens from the interpolation parser. 
+ Tokens []interpolation.Token } // NewRef returns a new Ref if the given [dyn.Value] contains a string // with one or more variable references. It returns false if the given -// [dyn.Value] does not contain variable references. +// [dyn.Value] does not contain variable references or if parsing fails. // // Examples of a valid variable references: // - "${a.b}" @@ -39,53 +29,101 @@ type Ref struct { // - "${a.b[0].c}" // - "${a} ${b} ${c}" func NewRef(v dyn.Value) (Ref, bool) { + ref, ok, _ := newRef(v) + return ref, ok +} + +// NewRefWithDiagnostics returns a new Ref along with any diagnostics. +// Parse errors for malformed references (e.g. "${foo.bar-}") are returned +// as warnings. The second return value is false when no valid references +// are found (either no references at all, or a parse error occurred). +func NewRefWithDiagnostics(v dyn.Value) (Ref, bool, diag.Diagnostics) { + return newRef(v) +} + +func newRef(v dyn.Value) (Ref, bool, diag.Diagnostics) { s, ok := v.AsString() if !ok { - return Ref{}, false + return Ref{}, false, nil + } + + tokens, err := interpolation.Parse(s) + if err != nil { + // Return parse error as a warning diagnostic. + return Ref{}, false, diag.Diagnostics{{ + Severity: diag.Warning, + Summary: err.Error(), + Locations: v.Locations(), + }} } - // Check if the string contains any variable references. - m := re.FindAllStringSubmatch(s, -1) - if len(m) == 0 { - return Ref{}, false + // Check if any token is a variable reference. + hasRef := false + for _, t := range tokens { + if t.Kind == interpolation.TokenRef { + hasRef = true + break + } + } + + if !hasRef { + return Ref{}, false, nil } return Ref{ - Value: v, - Str: s, - Matches: m, - }, true + Value: v, + Str: s, + Tokens: tokens, + }, true, nil } // IsPure returns true if the variable reference contains a single // variable reference and nothing more. We need this so we can // interpolate values of non-string types (i.e. it can be substituted). 
func (v Ref) IsPure() bool { - // Need single match, equal to the incoming string. - if len(v.Matches) == 0 || len(v.Matches[0]) == 0 { - panic("invalid variable reference; expect at least one match") - } - return v.Matches[0][0] == v.Str + return len(v.Tokens) == 1 && v.Tokens[0].Kind == interpolation.TokenRef } +// References returns the path strings of all variable references. func (v Ref) References() []string { var out []string - for _, m := range v.Matches { - out = append(out, m[1]) + for _, t := range v.Tokens { + if t.Kind == interpolation.TokenRef { + out = append(out, t.Value) + } } return out } +// IsPureVariableReference returns true if s is a single variable reference +// with no surrounding text. func IsPureVariableReference(s string) bool { - return len(s) > 0 && re.FindString(s) == s + if len(s) == 0 { + return false + } + tokens, err := interpolation.Parse(s) + if err != nil { + return false + } + return len(tokens) == 1 && tokens[0].Kind == interpolation.TokenRef } +// ContainsVariableReference returns true if s contains at least one variable reference. func ContainsVariableReference(s string) bool { - return re.MatchString(s) + tokens, err := interpolation.Parse(s) + if err != nil { + return false + } + for _, t := range tokens { + if t.Kind == interpolation.TokenRef { + return true + } + } + return false } -// If s is a pure variable reference, this function returns the corresponding -// dyn.Path. Otherwise, it returns false. +// PureReferenceToPath returns the corresponding [dyn.Path] if s is a pure +// variable reference. Otherwise, it returns false. 
func PureReferenceToPath(s string) (dyn.Path, bool) { ref, ok := NewRef(dyn.V(s)) if !ok { diff --git a/libs/dyn/dynvar/ref_test.go b/libs/dyn/dynvar/ref_test.go index d59d80cba1..521527f9b3 100644 --- a/libs/dyn/dynvar/ref_test.go +++ b/libs/dyn/dynvar/ref_test.go @@ -3,6 +3,7 @@ package dynvar import ( "testing" + "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -47,6 +48,37 @@ func TestNewRefInvalidPattern(t *testing.T) { } } +func TestNewRefWithDiagnosticsMalformedReference(t *testing.T) { + for _, tc := range []struct { + in string + warning string + }{ + {"${foo.bar-}", "invalid"}, + {"${foo..bar}", "invalid"}, + {"${0foo}", "invalid"}, + {"${}", "empty"}, + {"${foo.bar", "unterminated"}, + } { + _, ok, diags := NewRefWithDiagnostics(dyn.V(tc.in)) + assert.False(t, ok, "should not match malformed pattern: %s", tc.in) + require.Len(t, diags, 1, tc.in) + assert.Equal(t, diag.Warning, diags[0].Severity, tc.in) + assert.Contains(t, diags[0].Summary, tc.warning, tc.in) + } +} + +func TestNewRefWithDiagnosticsValidReference(t *testing.T) { + _, ok, diags := NewRefWithDiagnostics(dyn.V("${foo.bar}")) + assert.True(t, ok) + assert.Empty(t, diags) +} + +func TestNewRefWithDiagnosticsNoReference(t *testing.T) { + _, ok, diags := NewRefWithDiagnostics(dyn.V("plain text")) + assert.False(t, ok) + assert.Empty(t, diags) +} + func TestIsPureVariableReference(t *testing.T) { assert.False(t, IsPureVariableReference("")) assert.False(t, IsPureVariableReference("${foo.bar} suffix")) diff --git a/libs/dyn/dynvar/resolve.go b/libs/dyn/dynvar/resolve.go index b1366d93bb..860fb50652 100644 --- a/libs/dyn/dynvar/resolve.go +++ b/libs/dyn/dynvar/resolve.go @@ -7,9 +7,25 @@ import ( "strings" "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/dyn/dynvar/interpolation" "github.com/databricks/cli/libs/utils" ) +// SuggestFn is a function that returns a suggested 
correction for a reference path. +// It returns an empty string if no suggestion is available. +type SuggestFn func(path dyn.Path) string + +// ResolveOption configures optional behavior for [Resolve]. +type ResolveOption func(*resolver) + +// WithSuggestFn configures a suggestion function that is called when a +// reference does not exist. The suggestion is appended to the error message. +func WithSuggestFn(fn SuggestFn) ResolveOption { + return func(r *resolver) { + r.suggestFn = fn + } +} + // Resolve resolves variable references in the given input value using the provided lookup function. // It returns the resolved output value and any error encountered during the resolution process. // @@ -33,8 +49,12 @@ import ( // If a cycle is detected in the variable references, an error is returned. // If for some path the resolution function returns [ErrSkipResolution], the variable reference is left in place. // This is useful when some variable references are not yet ready to be interpolated. -func Resolve(in dyn.Value, fn Lookup) (out dyn.Value, err error) { - return resolver{in: in, fn: fn}.run() +func Resolve(in dyn.Value, fn Lookup, opts ...ResolveOption) (out dyn.Value, err error) { + r := resolver{in: in, fn: fn} + for _, opt := range opts { + opt(&r) + } + return r.run() } type lookupResult struct { @@ -46,6 +66,8 @@ type resolver struct { in dyn.Value fn Lookup + suggestFn SuggestFn + refs map[string]Ref resolved map[string]dyn.Value @@ -156,30 +178,39 @@ func (r *resolver) resolveRef(ref Ref, seen []string) (dyn.Value, error) { return dyn.NewValue(resolved[0].Value(), ref.Value.Locations()), nil } - // Not pure; perform string interpolation. - for j := range ref.Matches { - // The value is invalid if resolution returned [ErrSkipResolution]. - // We must skip those and leave the original variable reference in place. - if !resolved[j].IsValid() { - continue - } - - // Try to turn the resolved value into a string. 
- s, ok := resolved[j].AsString() - if !ok { - // Only allow primitive types to be converted to string. - switch resolved[j].Kind() { - case dyn.KindString, dyn.KindBool, dyn.KindInt, dyn.KindFloat, dyn.KindTime, dyn.KindNil: - s = fmt.Sprint(resolved[j].AsAny()) - default: - return dyn.InvalidValue, fmt.Errorf("cannot interpolate non-primitive value of type %s into string", resolved[j].Kind()) + // Not pure; perform token-based string interpolation. + var buf strings.Builder + refIdx := 0 + for _, tok := range ref.Tokens { + switch tok.Kind { + case interpolation.TokenLiteral: + buf.WriteString(tok.Value) + case interpolation.TokenRef: + // The value is invalid if resolution returned [ErrSkipResolution]. + // We must skip those and leave the original variable reference in place. + if !resolved[refIdx].IsValid() { + buf.WriteString("${") + buf.WriteString(tok.Value) + buf.WriteByte('}') + } else { + // Try to turn the resolved value into a string. + s, ok := resolved[refIdx].AsString() + if !ok { + // Only allow primitive types to be converted to string. 
+ switch resolved[refIdx].Kind() { + case dyn.KindString, dyn.KindBool, dyn.KindInt, dyn.KindFloat, dyn.KindTime, dyn.KindNil: + s = fmt.Sprint(resolved[refIdx].AsAny()) + default: + return dyn.InvalidValue, fmt.Errorf("cannot interpolate non-primitive value of type %s into string", resolved[refIdx].Kind()) + } + } + buf.WriteString(s) } + refIdx++ } - - ref.Str = strings.Replace(ref.Str, ref.Matches[j][0], s, 1) } - return dyn.NewValue(ref.Str, ref.Value.Locations()), nil + return dyn.NewValue(buf.String(), ref.Value.Locations()), nil } func (r *resolver) resolveKey(key string, seen []string) (dyn.Value, error) { @@ -198,7 +229,13 @@ func (r *resolver) resolveKey(key string, seen []string) (dyn.Value, error) { v, err := r.fn(p) if err != nil { if dyn.IsNoSuchKeyError(err) { - err = fmt.Errorf("reference does not exist: ${%s}", key) + msg := fmt.Sprintf("reference does not exist: ${%s}", key) + if r.suggestFn != nil { + if suggestion := r.suggestFn(p); suggestion != "" { + msg += fmt.Sprintf(". did you mean ${%s}?", suggestion) + } + } + err = errors.New(msg) } // Cache the return value and return to the caller. 
diff --git a/libs/dyn/dynvar/resolve_test.go b/libs/dyn/dynvar/resolve_test.go index 5b64029bae..7c2ddfc715 100644 --- a/libs/dyn/dynvar/resolve_test.go +++ b/libs/dyn/dynvar/resolve_test.go @@ -393,3 +393,42 @@ func TestResolveSequenceVariable(t *testing.T) { assert.Equal(t, "value1", seq[0].MustString()) assert.Equal(t, "value2", seq[1].MustString()) } + +func TestResolveWithSuggestFn(t *testing.T) { + in := dyn.V(map[string]dyn.Value{ + "name": dyn.V("hello"), + "ref": dyn.V("${nme}"), + }) + + _, err := dynvar.Resolve(in, dynvar.DefaultLookup(in), dynvar.WithSuggestFn(func(p dyn.Path) string { + return dynvar.SuggestPath(in, p) + })) + require.Error(t, err) + assert.Contains(t, err.Error(), "reference does not exist: ${nme}") + assert.Contains(t, err.Error(), "did you mean ${name}?") +} + +func TestResolveWithSuggestFnNoSuggestion(t *testing.T) { + in := dyn.V(map[string]dyn.Value{ + "name": dyn.V("hello"), + "ref": dyn.V("${completely_different_key}"), + }) + + _, err := dynvar.Resolve(in, dynvar.DefaultLookup(in), dynvar.WithSuggestFn(func(p dyn.Path) string { + return dynvar.SuggestPath(in, p) + })) + require.Error(t, err) + assert.Contains(t, err.Error(), "reference does not exist: ${completely_different_key}") + assert.NotContains(t, err.Error(), "did you mean") +} + +func TestResolveWithEscapeSequence(t *testing.T) { + in := dyn.V(map[string]dyn.Value{ + "a": dyn.V("hello"), + "ref": dyn.V("$${a} ${a}"), + }) + + out, err := dynvar.Resolve(in, dynvar.DefaultLookup(in)) + require.NoError(t, err) + assert.Equal(t, "${a} hello", getByPath(t, out, "ref").MustString()) +} diff --git a/libs/dyn/dynvar/suggest.go b/libs/dyn/dynvar/suggest.go new file mode 100644 index 0000000000..8c6a9a6a3a --- /dev/null +++ b/libs/dyn/dynvar/suggest.go @@ -0,0 +1,108 @@ +package dynvar + +import ( + "github.com/databricks/cli/libs/dyn" +) + +// levenshtein computes the edit distance between two strings using single-row DP. 
// levenshtein computes the edit distance between two strings using a
// single-row dynamic-programming table.
// NOTE(review): operates on bytes, not runes — adequate for ASCII
// identifier keys; confirm if non-ASCII keys are possible.
func levenshtein(a, b string) int {
	// Keep b as the shorter string so the DP row stays small.
	if len(b) > len(a) {
		a, b = b, a
	}

	row := make([]int, len(b)+1)
	for j := 0; j <= len(b); j++ {
		row[j] = j
	}

	for i := 0; i < len(a); i++ {
		diag := row[0] // value from the previous row, one column to the left
		row[0] = i + 1
		for j := 0; j < len(b); j++ {
			sub := diag
			if a[i] != b[j] {
				sub++ // substitution cost
			}
			diag = row[j+1]
			row[j+1] = min(
				row[j+1]+1, // deletion
				row[j]+1,   // insertion
				sub,        // substitution (or match)
			)
		}
	}

	return row[len(b)]
}

// closestKeyMatch finds the candidate with the smallest edit distance to key,
// within a threshold of min(3, max(1, len(key)/2)). It returns ("", 0) when
// no candidate is within the threshold. Ties go to the earliest candidate.
func closestKeyMatch(key string, candidates []string) (string, int) {
	limit := min(3, max(1, len(key)/2))

	best := ""
	bestDist := limit + 1
	for _, cand := range candidates {
		if d := levenshtein(key, cand); d < bestDist {
			best, bestDist = cand, d
		}
	}

	if best == "" {
		return "", 0
	}
	return best, bestDist
}
+ pairs := m.Pairs() + candidates := make([]string, len(pairs)) + for i, p := range pairs { + candidates[i] = p.Key.MustString() + } + + match, _ := closestKeyMatch(key, candidates) + if match == "" { + return "" + } + + v, _ := m.GetByString(match) + suggested = suggested.Append(dyn.Key(match)) + cur = v + } else { + seq, ok := cur.AsSequence() + if !ok || c.Index() < 0 || c.Index() >= len(seq) { + return "" + } + suggested = suggested.Append(dyn.Index(c.Index())) + cur = seq[c.Index()] + } + } + + return suggested.String() +} diff --git a/libs/dyn/dynvar/suggest_test.go b/libs/dyn/dynvar/suggest_test.go new file mode 100644 index 0000000000..cd4954a6eb --- /dev/null +++ b/libs/dyn/dynvar/suggest_test.go @@ -0,0 +1,175 @@ +package dynvar + +import ( + "testing" + + "github.com/databricks/cli/libs/dyn" + "github.com/stretchr/testify/assert" +) + +func TestLevenshtein(t *testing.T) { + cases := []struct { + a, b string + expected int + }{ + {"abc", "abc", 0}, + {"abc", "ab", 1}, + {"ab", "abc", 1}, + {"abc", "axc", 1}, + {"kitten", "sitting", 3}, + {"", "abc", 3}, + {"abc", "", 3}, + {"", "", 0}, + } + + for _, tc := range cases { + t.Run(tc.a+"_"+tc.b, func(t *testing.T) { + assert.Equal(t, tc.expected, levenshtein(tc.a, tc.b)) + }) + } +} + +func TestClosestKeyMatch(t *testing.T) { + cases := []struct { + name string + key string + candidates []string + expectedKey string + expectedDist int + }{ + { + name: "close match", + key: "nme", + candidates: []string{"name", "type", "value"}, + expectedKey: "name", + expectedDist: 1, + }, + { + name: "no match", + key: "xyz", + candidates: []string{"name", "type"}, + expectedKey: "", + expectedDist: 0, + }, + { + name: "exact match", + key: "name", + candidates: []string{"name", "type"}, + expectedKey: "name", + expectedDist: 0, + }, + { + name: "threshold boundary within", + key: "ab", + candidates: []string{"ax"}, + expectedKey: "ax", + expectedDist: 1, + }, + { + name: "threshold boundary exceeds", + key: "ab", + 
candidates: []string{"xyz"}, + expectedKey: "", + expectedDist: 0, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + match, dist := closestKeyMatch(tc.key, tc.candidates) + assert.Equal(t, tc.expectedKey, match) + assert.Equal(t, tc.expectedDist, dist) + }) + } +} + +func TestSuggestPath(t *testing.T) { + root := dyn.V(map[string]dyn.Value{ + "bundle": dyn.V(map[string]dyn.Value{ + "name": dyn.V("my-bundle"), + "git": dyn.V(map[string]dyn.Value{ + "branch": dyn.V("main"), + }), + }), + "variables": dyn.V(map[string]dyn.Value{ + "my_cluster_id": dyn.V("abc123"), + }), + "workspace": dyn.V(map[string]dyn.Value{ + "file_path": dyn.V("/path"), + }), + }) + + cases := []struct { + name string + root dyn.Value + path dyn.Path + expected string + }{ + { + name: "single typo", + root: root, + path: dyn.MustPathFromString("bundle.nme"), + expected: "bundle.name", + }, + { + name: "multi-level typo", + root: root, + path: dyn.MustPathFromString("bundel.nme"), + expected: "bundle.name", + }, + { + name: "nested", + root: root, + path: dyn.MustPathFromString("bundle.git.brnch"), + expected: "bundle.git.branch", + }, + { + name: "variable typo", + root: root, + path: dyn.MustPathFromString("variables.my_clster_id"), + expected: "variables.my_cluster_id", + }, + { + name: "no match", + root: root, + path: dyn.MustPathFromString("variables.completely_wrong_name_that_is_very_different"), + expected: "", + }, + { + name: "exact match", + root: root, + path: dyn.MustPathFromString("bundle.name"), + expected: "bundle.name", + }, + { + name: "root key does not exist", + root: root, + path: dyn.MustPathFromString("nonexistent.field"), + expected: "", + }, + { + name: "path into non-map", + root: root, + path: dyn.MustPathFromString("bundle.name.sub"), + expected: "", + }, + { + name: "index passthrough", + root: dyn.V(map[string]dyn.Value{ + "items": dyn.V([]dyn.Value{ + dyn.V(map[string]dyn.Value{ + "value": dyn.V("first"), + }), + }), + }), + path: 
dyn.NewPath(dyn.Key("items"), dyn.Index(0), dyn.Key("vlue")), + expected: "items[0].value", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, SuggestPath(tc.root, tc.path)) + }) + } +} From 8b55146ca7be65e7dbcd1e1ed7f3faa2b21550a7 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 12 Mar 2026 02:03:52 +0100 Subject: [PATCH 4/7] Clean up design docs: shorten suggestions doc, remove implementation context Co-Authored-By: Claude Opus 4.6 --- design/implementation-context.md | 135 ------ design/variable-lookup-suggestions.md | 619 ++------------------------ 2 files changed, 32 insertions(+), 722 deletions(-) delete mode 100644 design/implementation-context.md diff --git a/design/implementation-context.md b/design/implementation-context.md deleted file mode 100644 index 41f17c9299..0000000000 --- a/design/implementation-context.md +++ /dev/null @@ -1,135 +0,0 @@ -# Implementation Context for Variable Interpolation Improvements - -This file captures the context needed to implement the two design proposals in -this directory. Read the design docs first, then use this as a map of the -codebase. - -## Two Proposals - -1. **`interpolation-parser.md`** — Replace regex-based `${...}` parsing with a - proper two-mode character scanner. Adds escape support (`$$`), error messages - with byte positions, and cleaner token-based interpolation. - -2. **`variable-lookup-suggestions.md`** — Add "did you mean?" suggestions when - a variable reference path doesn't exist. Uses Levenshtein distance and a - fuzzy walk of the config tree to suggest corrections, including for - multi-level typos. - -These are independent and can be implemented in either order. - -## Key Files to Modify - -### Proposal 1: Parser - -| File | What to do | -|------|------------| -| `libs/dyn/dynvar/parse.go` | **New**. Two-mode scanner: TEXT mode accumulates literals, REFERENCE mode scans `${...}`. Returns `[]Token`. 
See design doc for full pseudocode. | -| `libs/dyn/dynvar/parse_test.go` | **New**. Test valid refs, escapes (`$$`), errors (unterminated, empty, invalid chars). | -| `libs/dyn/dynvar/ref.go` | Replace regex + `Matches [][]string` with `Tokens []Token`. Update `NewRef`, `IsPure`, `References`. Remove the `re` regex variable. Keep `IsPureVariableReference` and `ContainsVariableReference` working. | -| `libs/dyn/dynvar/ref_test.go` | Update to match new internals. All existing behavioral tests should still pass. | -| `libs/dyn/dynvar/resolve.go` | Update `resolveRef` to use token concatenation instead of `strings.Replace` loop on regex matches. | -| `libs/dyn/dynvar/resolve_test.go` | Add tests for escape sequences. All existing tests must pass unchanged. | -| `python/databricks/bundles/core/_transform.py` | Has a Python regex that must stay in sync (separate PR). Comment on line 11-12 of `ref.go` references this. | - -### Proposal 2: Suggestions - -| File | What to do | -|------|------------| -| `libs/dyn/dynvar/suggest.go` | **New**. `levenshtein()`, `closestKeyMatch()`, `SuggestPath()`. | -| `libs/dyn/dynvar/suggest_test.go` | **New**. Unit tests for distance, matching, path suggestion (single typo, multi-level, with indices, no match). | -| `libs/dyn/mapping.go` | Add `StringKeys() []string` helper on `Mapping`. | -| `libs/dyn/dynvar/resolve.go` | Add `SuggestFn` field to `resolver`, `WithSuggestFn` option, use in `resolveKey` when `NoSuchKeyError`. | -| `libs/dyn/dynvar/resolve_test.go` | Integration tests with suggestions. | -| `bundle/config/mutator/resolve_variable_references.go` | Pass `WithSuggestFn` closure that calls `SuggestPath(normalized, p)`. 
| - -## Current Code Structure - -### `libs/dyn/dynvar/ref.go` — Variable Reference Detection -- `re` regex at line 14-15 matches `${path.to.var[0]}` patterns -- `baseVarDef = [a-zA-Z]+([-_]*[a-zA-Z0-9]+)*` — segment pattern -- `Ref` struct holds: `Value dyn.Value`, `Str string`, `Matches [][]string` -- `NewRef(v)` → uses `re.FindAllStringSubmatch(s, -1)` -- `IsPure()` → single match equals entire string (enables type retention) -- `References()` → extracts variable paths from match groups (`m[1]`) -- Must stay in sync with Python regex in `python/databricks/bundles/core/_transform.py` - -### `libs/dyn/dynvar/resolve.go` — Resolution Pipeline -- 3-phase pipeline: collect → resolve → replace -- `collectVariableReferences()` — walks tree, calls `NewRef` on each string value -- `resolveVariableReferences()` — resolves refs in sorted key order (deterministic cycle detection) -- `resolveRef()` — resolves all deps, then either: - - Pure substitution: single ref, retain original type - - String interpolation: `strings.Replace(ref.Str, ref.Matches[j][0], s, 1)` for each match -- `resolveKey()` — catches `NoSuchKeyError` → `"reference does not exist: ${%s}"`. 
**This is where suggestion logic hooks in.** -- `ErrSkipResolution` — leaves variable reference in place for multi-pass resolution - -### `libs/dyn/dynvar/lookup.go` — Lookup Interface -- `type Lookup func(path dyn.Path) (dyn.Value, error)` -- `ErrSkipResolution` sentinel error -- `DefaultLookup(in)` → creates lookup against a `dyn.Value` - -### `libs/dyn/path_string.go` — Path Parsing -- `NewPathFromString("foo.bar[0].baz")` → `Path{Key("foo"), Key("bar"), Index(0), Key("baz")}` -- Already handles dots, brackets, indices -- **Reuse this for path validation** in the new parser (don't reimplement) - -### `libs/dyn/visit.go` — Tree Traversal -- `noSuchKeyError{p Path}` — raised when map key doesn't exist -- At `visit.go:132-137`: has the map `m` with all sibling keys, but doesn't expose them in the error -- For suggestions, we do NOT modify this. Instead, `SuggestPath()` does its own walk. - -### `libs/dyn/mapping.go` — Map Type -- `Mapping` struct with `pairs []Pair` and `index map[string]int` -- `GetByString(key)` → `(Value, bool)` -- `Keys()` → `[]Value` -- **Need to add**: `StringKeys() []string` - -### `bundle/config/mutator/resolve_variable_references.go` — Bundle-Level Resolution -- Multi-round resolution (up to 11 rounds) -- Creates `normalized` tree via `convert.Normalize(b.Config, root, convert.IncludeMissingFields)` -- Rewrites `${var.foo}` → `${variables.foo.value}` before lookup -- Lookup function: `dyn.GetByPath(normalized, path)` -- `prefixes` control which paths are resolved vs skipped -- **This is where `WithSuggestFn` gets wired in**, passing `normalized` to `SuggestPath` - -### `libs/dyn/walk.go` — Tree Walking -- `Walk(v, fn)` — walks all nodes, calls fn on each -- `CollectLeafPaths(v)` — returns all leaf paths as strings - -## Important Invariants - -1. **`IsPure()` must work identically** — when a string is exactly `${...}` with - no surrounding text, the resolved value retains its original type (int, bool, - map, etc.). 
The parser must preserve this semantic. - -2. **`ErrSkipResolution` must work** — skipped variables are left as literal - `${...}` text in the output. The token-based interpolation must handle this. - -3. **Regex sync with Python** — the Python regex in `_transform.py` must - eventually match whatever the parser accepts. For now, the parser should - accept the same language as the regex (not broader). - -4. **Sorted resolution order** — keys are resolved in sorted order for - deterministic cycle detection errors. Don't change this. - -5. **`var` shorthand** — `${var.X}` is rewritten to `${variables.X.value}` - before lookup. Suggestions should reverse this for user-facing messages. - -## Build & Test - -```bash -# Run all unit tests -make test - -# Run specific package tests -go test ./libs/dyn/dynvar/... - -# Run acceptance tests (after changes) -go test ./acceptance/... - -# Lint -make lintfull - -# Format -make fmtfull -``` diff --git a/design/variable-lookup-suggestions.md b/design/variable-lookup-suggestions.md index 8e8fa221a4..fb7e46e3ef 100644 --- a/design/variable-lookup-suggestions.md +++ b/design/variable-lookup-suggestions.md @@ -1,605 +1,50 @@ -# Proposal: "Did You Mean?" Suggestions for Invalid Variable References +# "Did You Mean?" Suggestions for Invalid Variable References -## Status: Draft +Author: Shreyas Goenka +Date: 12 March 2026 ## Problem -When a user writes a variable reference like `${bundle.git.origin_urlx}` (typo) -or `${var.my_clster_id}` (misspelling), the error message today is: +`${bundle.git.origin_urlx}` produces `reference does not exist` with no hint. +A single character typo in a long path can take minutes to spot. -``` -reference does not exist: ${bundle.git.origin_urlx} -``` +## Design: Fuzzy Path Walk -That's it. No suggestions, no list of valid keys, no indication of what went -wrong. The user has to go back to the docs or mentally diff against the schema -to figure out the correct name. 
+When a lookup fails with `NoSuchKeyError`, we do a separate fuzzy walk of the +normalized config tree (which includes all struct-defined fields via +`IncludeMissingFields` + all user-defined map keys). -This is a common source of frustration: a single character typo in a long path -like `${resources.jobs.my_pipeline.tasks[0].task_key}` can take minutes to spot. +The walk processes every component independently: +1. Exact key match → follow it +2. No exact match → Levenshtein fuzzy match against siblings → follow best match +3. Index components (`[0]`) → pass through verbatim +4. Any component unfixable (all candidates too far) → give up, no suggestion -Variable references are multi-level paths (e.g., `${resources.jobs.my_job.id}`). -A typo can occur in **any** component — or even in **multiple** components at -once. A good suggestion system must handle all of these cases. +This corrects **multiple** typos simultaneously (e.g., `${resources.jbs.my_jb.id}` +→ `did you mean ${resources.jobs.my_job.id}?`). -## What We Have Today +Distance threshold: `min(3, max(1, len(key)/2))`. -### The Error Path +See `libs/dyn/dynvar/suggest.go`. -When resolution fails, the call chain is: +## Wiring -``` -resolve_variable_references.go: dynvar.Resolve(v, lookupFn) - resolve.go: r.resolveKey(dep, seen) - resolve.go: r.fn(p) // calls the lookup function - resolve_variable_references.go: m.lookupFn(normalized, path, b) - resolve_variable_references.go: dyn.GetByPath(v, path) - visit.go: m.GetByString(c.key) // FAILS HERE - return noSuchKeyError{path} -``` - -At the point of failure in `visit.go:135-137`: -```go -m := v.MustMap() -ev, ok := m.GetByString(c.key) -if !ok { - return InvalidValue, noSuchKeyError{path} -} -``` - -The map `m` contains **all sibling keys** — i.e., the valid alternatives. But -that information is discarded. The error only carries the failed `path`. 
- -Back in `resolve.go:200-201`, the error is rewrapped into a flat string: -```go -if dyn.IsNoSuchKeyError(err) { - err = fmt.Errorf("reference does not exist: ${%s}", key) -} -``` - -**Crucially**, `visit.go` stops at the **first** non-existent key. For a path -like `${resources.jbs.my_jb.id}` with typos in both `jbs` and `my_jb`: -1. `resources` — exists, traverse into it -2. `jbs` — **does not exist** → `NoSuchKeyError` immediately - -The traversal never reaches `my_jb`, so we can only suggest fixing `jbs`. The -user has to fix that, re-run, and discover the second typo. This round-trip -is exactly the frustration we want to avoid. - -### The Normalized Config Tree - -In `resolve_variable_references.go:203`, before resolution begins: -```go -normalized, _ := convert.Normalize(b.Config, root, convert.IncludeMissingFields) -``` - -This `normalized` tree includes: -- All struct-defined fields (from Go types), even if unset (via `IncludeMissingFields`) -- All user-defined map keys (resource names, variable names, etc.) - -This is the right source of truth for suggestions. It contains everything the -user could validly reference. - -## Design - -### Approach: Fuzzy Path Walk Against the Config Tree - -Rather than relying on the error from `visit.go` (which only tells us about the -first failing component), we do a **separate fuzzy walk** of the config tree when -a lookup fails. This walk processes every component in the reference path and -can fix typos in **multiple** components simultaneously. - -The flow: -1. Lookup fails with `NoSuchKeyError` -2. We walk the reference path component by component against the normalized tree -3. At each component, if the exact key exists, we follow it -4. If not, we fuzzy-match against sibling keys and follow the best match -5. If all components are resolved (some via fuzzy matching), we suggest the - corrected full path -6. 
If any component can't be fuzzy-matched (too far from all candidates), we - give up on the suggestion - -### Implementation - -#### 1. Levenshtein Distance Utility - -```go -// File: libs/dyn/dynvar/suggest.go - -package dynvar - -// levenshtein computes the edit distance between two strings. -func levenshtein(a, b string) int { - if len(a) == 0 { - return len(b) - } - if len(b) == 0 { - return len(a) - } - - // Use single-row DP to save memory. - prev := make([]int, len(b)+1) - for j := range prev { - prev[j] = j - } - - for i := range len(a) { - curr := make([]int, len(b)+1) - curr[0] = i + 1 - for j := range len(b) { - cost := 1 - if a[i] == b[j] { - cost = 0 - } - curr[j+1] = min( - curr[j]+1, // insertion - prev[j+1]+1, // deletion - prev[j]+cost, // substitution - ) - } - prev = curr - } - - return prev[len(b)] -} -``` - -#### 2. Single-Key Match Function - -```go -// closestKeyMatch finds the closest matching key from a list of candidates. -// Returns the best match and its edit distance. -// Returns ("", -1) if no candidate is within the distance threshold. -func closestKeyMatch(key string, candidates []string) (string, int) { - // Threshold: allow up to 3 edits, but no more than half the key length. - // This avoids suggesting wildly different strings for short keys. - maxDist := min(3, max(1, len(key)/2)) - - bestMatch := "" - bestDist := maxDist + 1 - - for _, c := range candidates { - d := levenshtein(key, c) - if d < bestDist { - bestDist = d - bestMatch = c - } - } - - if bestMatch == "" { - return "", -1 - } - return bestMatch, bestDist -} -``` - -#### 3. Fuzzy Path Walk - -This is the core new function. It walks the reference path against the config -tree, fuzzy-matching at each level: - -```go -// suggestPath walks the reference path against root and tries to find the -// closest valid path. At each component, it first tries an exact match; if -// that fails, it fuzzy-matches against available keys. 
-// -// Returns the suggested path as a string, or "" if no reasonable suggestion -// can be made. -func suggestPath(root dyn.Value, refPath dyn.Path) string { - current := root - suggested := make(dyn.Path, 0, len(refPath)) - - for _, component := range refPath { - if component.IsIndex() { - // For index components (e.g., [0]), we can't fuzzy-match. - // Just check if the index is valid and pass through. - s, ok := current.AsSequence() - if !ok || component.Index() >= len(s) { - return "" - } - suggested = append(suggested, component) - current = s[component.Index()] - continue - } - - key := component.Key() - m, ok := current.AsMap() - if !ok { - // Expected a map but got something else — can't suggest. - return "" - } - - // Try exact match first. - if v, exists := m.GetByString(key); exists { - suggested = append(suggested, component) - current = v - continue - } - - // Exact match failed — try fuzzy match. - candidates := m.StringKeys() - match, _ := closestKeyMatch(key, candidates) - if match == "" { - // No close match — can't suggest beyond this point. - return "" - } - - suggested = append(suggested, dyn.Key(match)) - v, _ := m.GetByString(match) - current = v - } - - return suggested.String() -} -``` - -**Key properties:** -- Handles typos at **any** level: first, middle, last, or multiple levels -- Index components (`[0]`) are passed through verbatim — no fuzzy matching -- Stops suggesting as soon as any component can't be matched (no partial guesses) -- Each component is matched independently, so two typos in different components - are both corrected - -#### 4. Wire It Into Resolution - -The suggestion logic needs access to the normalized config tree that the lookup -function uses. Today, the `Lookup` function type is: - -```go -type Lookup func(path dyn.Path) (dyn.Value, error) -``` - -The `resolve.go` resolver doesn't have direct access to the underlying tree — -it only has the lookup function. 
We add the suggestion logic at the layer above, -in `resolve_variable_references.go`, which has access to the `normalized` tree. - -**Option A: Pass a suggest function into the resolver** - -Add an optional suggest callback to the resolver: - -```go -// SuggestFn takes a failed reference path and returns a suggested correction, -// or "" if no suggestion can be made. -type SuggestFn func(path dyn.Path) string - -func Resolve(in dyn.Value, fn Lookup, opts ...ResolveOption) (out dyn.Value, err error) { - r := resolver{in: in, fn: fn} - for _, opt := range opts { - opt(&r) - } - return r.run() -} - -type ResolveOption func(*resolver) - -func WithSuggestFn(fn SuggestFn) ResolveOption { - return func(r *resolver) { - r.suggestFn = fn - } -} -``` - -Then in `resolveKey`: -```go -v, err := r.fn(p) -if err != nil { - if dyn.IsNoSuchKeyError(err) { - msg := fmt.Sprintf("reference does not exist: ${%s}", key) - - if r.suggestFn != nil { - if suggestion := r.suggestFn(p); suggestion != "" { - msg += fmt.Sprintf("; did you mean ${%s}?", suggestion) - } - } - - err = fmt.Errorf(msg) - } - - r.lookups[key] = lookupResult{v: dyn.InvalidValue, err: err} - return dyn.InvalidValue, err -} -``` - -And in `resolve_variable_references.go`, pass the suggest function: - -```go -return dynvar.Resolve(v, lookupFn, dynvar.WithSuggestFn(func(p dyn.Path) string { - return dynvar.SuggestPath(normalized, p) -})) -``` - -**Option B: Suggest at the `resolve_variable_references.go` level** - -Instead of modifying `Resolve`'s signature, wrap the error after `Resolve` -returns. This is simpler but less clean: - -```go -out, err := dynvar.Resolve(v, lookupFn) -if err != nil && dyn.IsNoSuchKeyError(err) { - // Extract the failed path and suggest... -} -``` - -The problem with Option B is that by the time `Resolve` returns, the original -`dyn.Path` is lost — it's been formatted into the error string. We'd have to -re-parse it or change the error type. 
**Option A is cleaner.** - -### Example Error Messages - -| Reference | Typos | Error After | -|-----------|-------|-------------| -| `${bundle.git.origin_urlx}` | 1 (leaf) | `did you mean ${bundle.git.origin_url}?` | -| `${resources.jbs.my_job.id}` | 1 (middle) | `did you mean ${resources.jobs.my_job.id}?` | -| `${resources.jbs.my_jb.id}` | 2 (middle + middle) | `did you mean ${resources.jobs.my_job.id}?` | -| `${bundel.git.origin_urlx}` | 2 (root + leaf) | `did you mean ${bundle.git.origin_url}?` | -| `${workspace.root_paht}` | 1 (leaf) | `did you mean ${workspace.root_path}?` | -| `${var.my_clster_id}` | 1 (leaf) | `did you mean ${var.my_cluster_id}?` | -| `${completely.wrong.path}` | all | *(no suggestion — too far at first component)* | -| `${resources.jobs.my_jb.idd}` | 2 (deep) | `did you mean ${resources.jobs.my_job.id}?` | - -### Walk-Through: Multi-Level Typo - -For `${resources.jbs.my_jb.id}`, the fuzzy walk proceeds: - -``` -Component Tree at this level Exact? Fuzzy match -───────── ────────────────── ────── ─────────── -resources {bundle, resources, ...} yes — -jbs {jobs, pipelines, ...} no "jobs" (dist=1) -my_jb {my_job, other_job, ...} no "my_job" (dist=2) -id {id, name, ...} yes — - -Suggested path: resources.jobs.my_job.id -``` - -All four components resolved, so we suggest `${resources.jobs.my_job.id}`. - -### Walk-Through: Unfixable Path - -For `${zzz.yyy.xxx}`: - -``` -Component Tree at this level Exact? Fuzzy match -───────── ────────────────── ────── ─────────── -zzz {bundle, resources, ...} no none (all dist>3) - -Suggested path: "" (give up) -``` - -No suggestion produced. 
- -## Scope - -### What This Covers - -- Typos in struct field names: `${bundle.git.origin_urlx}` (keys from Go types) -- Typos in user-defined names: `${var.my_clster_id}` (keys from user config) -- Typos in resource type names: `${resources.jbs.my_job.id}` -- Typos in resource instance names: `${resources.jobs.my_jb.id}` -- **Multi-level typos**: `${resources.jbs.my_jb.id}` (typos at two levels) - -### What This Does NOT Cover - -- **Invalid path structure** (e.g., `${a..b}` or `${a[x]}`) — this is a parse - error, not a lookup error, and would be handled by the parser proposal. -- **References to the wrong section** (e.g., user writes `${bundle.cluster_id}` - when they mean `${var.cluster_id}`) — the prefix is valid so we'd only - suggest keys within `bundle.*`. Cross-section suggestions would require - searching the entire tree, which is a separate feature. -- **Array index out of bounds** (e.g., `${resources.jobs.foo.tasks[99]}`) — this - is an `indexOutOfBoundsError`, not a `noSuchKeyError`. No suggestions apply. +The suggestion callback is passed via `dynvar.WithSuggestFn(...)` into +`dynvar.Resolve`. On `NoSuchKeyError` in `resolveKey`, the suggestion is +appended to the error message. ## `var` Shorthand -The `${var.foo}` shorthand is rewritten to `${variables.foo.value}` before -lookup (in `resolve_variable_references.go:209-222`). The suggestion function -receives the **rewritten** path. If we suggest a corrected path, we should -convert it back to the shorthand form for the user-facing message. - -For example: -- User writes: `${var.my_clster_id}` -- Rewritten to: `${variables.my_clster_id.value}` -- Suggestion from fuzzy walk: `variables.my_cluster_id.value` -- User-facing message: `did you mean ${var.my_cluster_id}?` - -This reverse mapping is straightforward: if the suggested path starts with -`variables.` and ends with `.value`, strip those and prefix with `var.`. 
- -## File Changes - -| File | Change | -|------|--------| -| `libs/dyn/mapping.go` | Add `StringKeys()` helper | -| `libs/dyn/dynvar/suggest.go` | **New**: `levenshtein()`, `closestKeyMatch()`, `SuggestPath()` | -| `libs/dyn/dynvar/suggest_test.go` | **New**: tests for distance, matching, and path suggestion | -| `libs/dyn/dynvar/resolve.go` | Add `SuggestFn` field, use it in `resolveKey` | -| `libs/dyn/dynvar/resolve_test.go` | Add tests for suggestion error messages | -| `bundle/config/mutator/resolve_variable_references.go` | Pass `WithSuggestFn` to `Resolve` | - -Note: no changes to `libs/dyn/visit.go` — the suggestion logic is entirely -separate from the traversal error path. - -## Testing - -### Unit Tests for Levenshtein + Suggestions - -```go -func TestLevenshtein(t *testing.T) { - assert.Equal(t, 0, levenshtein("abc", "abc")) - assert.Equal(t, 1, levenshtein("abc", "ab")) // deletion - assert.Equal(t, 1, levenshtein("abc", "abcd")) // insertion - assert.Equal(t, 1, levenshtein("abc", "adc")) // substitution - assert.Equal(t, 3, levenshtein("abc", "xyz")) // all different - assert.Equal(t, 3, levenshtein("", "abc")) // empty vs non-empty -} - -func TestClosestKeyMatch(t *testing.T) { - candidates := []string{"origin_url", "branch", "commit"} - - match, dist := closestKeyMatch("origin_urlx", candidates) - assert.Equal(t, "origin_url", match) - assert.Equal(t, 1, dist) - - match, _ = closestKeyMatch("zzzzzzz", candidates) - assert.Equal(t, "", match) -} -``` - -### Fuzzy Path Walk Tests +`${var.foo}` is rewritten to `${variables.foo.value}` before lookup. The +`SuggestFn` in `resolve_variable_references.go` handles this bidirectionally: +rewrite `var.X` → `variables.X.value` for the fuzzy walk, then convert the +suggestion back to `var.X` form for the user-facing message. 
-```go -func TestSuggestPathSingleTypo(t *testing.T) { - root := dyn.V(map[string]dyn.Value{ - "bundle": dyn.V(map[string]dyn.Value{ - "git": dyn.V(map[string]dyn.Value{ - "origin_url": dyn.V(""), - "branch": dyn.V(""), - }), - }), - }) - - p := dyn.MustPathFromString("bundle.git.origin_urlx") - assert.Equal(t, "bundle.git.origin_url", SuggestPath(root, p)) -} - -func TestSuggestPathMultiLevelTypo(t *testing.T) { - root := dyn.V(map[string]dyn.Value{ - "resources": dyn.V(map[string]dyn.Value{ - "jobs": dyn.V(map[string]dyn.Value{ - "my_job": dyn.V(map[string]dyn.Value{ - "id": dyn.V(""), - }), - }), - }), - }) - - p := dyn.MustPathFromString("resources.jbs.my_jb.id") - assert.Equal(t, "resources.jobs.my_job.id", SuggestPath(root, p)) -} - -func TestSuggestPathNoMatch(t *testing.T) { - root := dyn.V(map[string]dyn.Value{ - "bundle": dyn.V(map[string]dyn.Value{ - "name": dyn.V(""), - }), - }) - - p := dyn.MustPathFromString("zzzzz.yyyyy") - assert.Equal(t, "", SuggestPath(root, p)) -} - -func TestSuggestPathWithIndex(t *testing.T) { - root := dyn.V(map[string]dyn.Value{ - "resources": dyn.V(map[string]dyn.Value{ - "jobs": dyn.V(map[string]dyn.Value{ - "my_job": dyn.V(map[string]dyn.Value{ - "tasks": dyn.V([]dyn.Value{ - dyn.V(map[string]dyn.Value{ - "task_key": dyn.V(""), - }), - }), - }), - }), - }), - }) - - p := dyn.MustPathFromString("resources.jobs.my_job.tasks[0].tsk_key") - assert.Equal(t, "resources.jobs.my_job.tasks[0].task_key", SuggestPath(root, p)) -} -``` - -### Integration-Level Tests - -```go -func TestResolveNotFoundWithSuggestion(t *testing.T) { - in := dyn.V(map[string]dyn.Value{ - "bundle": dyn.V(map[string]dyn.Value{ - "name": dyn.V("my-bundle"), - "target": dyn.V("dev"), - }), - "ref": dyn.V("${bundle.nme}"), - }) - - _, err := dynvar.Resolve(in, dynvar.DefaultLookup(in), - dynvar.WithSuggestFn(func(p dyn.Path) string { - return dynvar.SuggestPath(in, p) - }), - ) - assert.ErrorContains(t, err, "reference does not exist: ${bundle.nme}") - 
assert.ErrorContains(t, err, "did you mean ${bundle.name}?") -} - -func TestResolveNotFoundMultiLevelTypo(t *testing.T) { - in := dyn.V(map[string]dyn.Value{ - "resources": dyn.V(map[string]dyn.Value{ - "jobs": dyn.V(map[string]dyn.Value{ - "my_job": dyn.V(map[string]dyn.Value{ - "id": dyn.V("123"), - }), - }), - }), - "ref": dyn.V("${resources.jbs.my_jb.id}"), - }) - - _, err := dynvar.Resolve(in, dynvar.DefaultLookup(in), - dynvar.WithSuggestFn(func(p dyn.Path) string { - return dynvar.SuggestPath(in, p) - }), - ) - assert.ErrorContains(t, err, "did you mean ${resources.jobs.my_job.id}?") -} -``` - -## Alternatives Considered - -### A. Fix one component at a time (original proposal) - -Only suggest a fix for the first failing component. After the user fixes that, -they re-run and discover the next typo. - -**Rejected** because: -- Requires multiple round-trips for multi-level typos -- The fuzzy walk approach is barely more complex but gives a much better UX - -### B. Enumerate all valid paths in the error - -List all valid sibling keys: - -``` -reference does not exist: ${bundle.nme}; valid keys at "bundle" are: name, target, git -``` - -**Rejected** because for large maps (e.g., `resources.jobs` with dozens of jobs) -this would produce very noisy output. A single close match is more actionable. - -### C. Search the entire tree for the closest leaf path - -Walk the entire normalized tree and compute edit distance for every possible -leaf path against the full reference string. - -**Rejected** because: -- Expensive for large configs (every leaf × string distance) -- Could suggest paths in completely unrelated sections -- The per-component walk is more predictable and faster (bounded by path depth) - -### D. 
Do nothing — rely on docs/IDE support - -**Rejected** because: -- Many users don't use an IDE for YAML editing -- The error happens at `databricks bundle validate` time, which is the right - place for actionable feedback -- This is low-effort, high-value - -## Relationship to Parser Proposal - -This proposal is **independent** of the regex-to-parser migration. It can be -implemented with the current regex-based `NewRef` — the suggestion logic operates -at the resolution level, not the parsing level. +## Scope -However, the two proposals complement each other: -- The parser proposal improves error reporting for **malformed** references - (e.g., `${a..b}`, unterminated `${`) -- This proposal improves error reporting for **well-formed but invalid** - references (e.g., `${bundle.nme}`) +**Covered**: typos in struct fields, user-defined names, resource types/instances, +multi-level typos. -Both can be implemented and shipped independently. +**Not covered**: malformed references (handled by the parser), cross-section +suggestions (user writes `${bundle.X}` meaning `${var.X}`), array index +out of bounds. From 7cf85e5eb6c617f243bcd8b8a3d3eb740ff97b0a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 12 Mar 2026 02:07:37 +0100 Subject: [PATCH 5/7] Add background research table to parser design doc Co-Authored-By: Claude Opus 4.6 --- design/interpolation-parser.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/design/interpolation-parser.md b/design/interpolation-parser.md index 468fe094e3..3bb0d378f1 100644 --- a/design/interpolation-parser.md +++ b/design/interpolation-parser.md @@ -11,13 +11,27 @@ DABs variable interpolation (`${...}`) was regex-based. This caused: 2. **No suggestions** — `${bundle.nme}` produces "reference does not exist" with no hint. 3. **No escape mechanism** — no way to produce a literal `${` in output. 
+## Background: How Other Systems Parse `${...}` + +| System | Strategy | Escape | Error Quality | +|--------|----------|--------|---------------| +| Go `text/template` | State-function lexer | None | Line + template name | +| HCL2 (Terraform) | Ragel FSM + recursive descent | `$${` → literal `${` | Source range + suggestions | +| Python f-strings | Mode-stack tokenizer | `{{` → `{` | Line/column | +| Rust `format!` | Iterator-based descent | `{{`/`}}` | Spans + suggestions | +| Bash | Char-by-char + depth tracking | `\$` | Line number | + +For a syntax as simple as `${path.to.var[0]}` (no nesting, no functions, no +operators), a full recursive descent parser is overkill. A **two-mode character +scanner** — the same core pattern used by Go's `text/template` and HCL — gives +proper error reporting and escape support without the complexity. + ## Design Decisions -### Two-mode character scanner (not recursive descent) +### Two-mode character scanner -For syntax as simple as `${path.to.var[0]}`, a recursive descent parser is -overkill. A two-mode scanner (TEXT / REFERENCE) is sufficient and easy to -port to the Python implementation. +A two-mode scanner (TEXT / REFERENCE) that produces a flat list of tokens. +No AST, no recursive descent. Easy to port to the Python implementation. See `libs/dyn/dynvar/interpolation/parse.go`. 
From 04fe53eb5a31616fe55bfcf233cf9aa78cf3189a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 12 Mar 2026 02:15:43 +0100 Subject: [PATCH 6/7] Fix gofmt alignment in suggest.go Co-Authored-By: Claude Opus 4.6 --- libs/dyn/dynvar/suggest.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/dyn/dynvar/suggest.go b/libs/dyn/dynvar/suggest.go index 8c6a9a6a3a..13909d27d5 100644 --- a/libs/dyn/dynvar/suggest.go +++ b/libs/dyn/dynvar/suggest.go @@ -25,9 +25,9 @@ func levenshtein(a, b string) int { } tmp := row[j+1] row[j+1] = min( - row[j+1]+1, // deletion - row[j]+1, // insertion - prev+cost, // substitution + row[j+1]+1, // deletion + row[j]+1, // insertion + prev+cost, // substitution ) prev = tmp } From 12949c559a0b333d6182f02eca7eecda40eebebe Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 12 Mar 2026 02:28:12 +0100 Subject: [PATCH 7/7] Replace monolithic path regex with per-segment validation Split path validation into per-segment traversal with a small key regex instead of one big regex for the entire path. This produces better error messages pointing to the exact invalid segment, e.g.: "invalid key "bar-"" instead of "invalid path" Also drops the unhelpful "at position N" from error messages since the YAML file location is already reported. 
Co-Authored-By: Claude Opus 4.6 --- .../bad_syntax/out.deploy.direct.txt | 2 +- .../bad_syntax/out.plan.direct.txt | 2 +- .../bad_syntax/out.plan.terraform.txt | 2 +- .../resource_deps/bad_syntax/output.txt | 2 +- .../variables/malformed_reference/output.txt | 2 +- libs/dyn/dynvar/interpolation/parse.go | 51 ++++++++++++++++--- 6 files changed, 49 insertions(+), 12 deletions(-) diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt b/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt index 2fcc34c7f6..a985c8415e 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt @@ -1,4 +1,4 @@ -Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path in databricks.yml:11:21 Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... 
diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt b/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt index 2a3d2e7e0a..a53d32c081 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt @@ -1,4 +1,4 @@ -Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path in databricks.yml:11:21 create volumes.bar diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt b/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt index 339fa7f20c..ed957b8518 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt @@ -1,4 +1,4 @@ -Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path in databricks.yml:11:21 Error: exit status 1 diff --git a/acceptance/bundle/resource_deps/bad_syntax/output.txt b/acceptance/bundle/resource_deps/bad_syntax/output.txt index e2867c47c8..968ebf652a 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/output.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/output.txt @@ -1,6 +1,6 @@ >>> [CLI] bundle validate -o json -Warning: invalid variable reference ${resources.volumes.bar.bad..syntax} at position 0: invalid path +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path in databricks.yml:11:21 { diff --git a/acceptance/bundle/variables/malformed_reference/output.txt b/acceptance/bundle/variables/malformed_reference/output.txt index 05288411bb..fba4d19be6 100644 --- a/acceptance/bundle/variables/malformed_reference/output.txt +++ b/acceptance/bundle/variables/malformed_reference/output.txt @@ -1,6 +1,6 
@@ >>> [CLI] bundle validate -Warning: invalid variable reference ${foo.bar-} at position 0: invalid path +Warning: invalid variable reference ${foo.bar-}: invalid key "bar-" in databricks.yml:2:9 Name: ${foo.bar-} diff --git a/libs/dyn/dynvar/interpolation/parse.go b/libs/dyn/dynvar/interpolation/parse.go index 4232f7416c..401198eadd 100644 --- a/libs/dyn/dynvar/interpolation/parse.go +++ b/libs/dyn/dynvar/interpolation/parse.go @@ -1,6 +1,7 @@ package interpolation import ( + "errors" "fmt" "regexp" "strings" @@ -28,11 +29,13 @@ const ( closeBrace = '}' ) -// pathPattern validates the content inside ${...}. Each segment matches -// baseVarDef from libs/dyn/dynvar/ref.go: [a-zA-Z]+([-_]*[a-zA-Z0-9]+)* -var pathPattern = regexp.MustCompile( - `^[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*(\.[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*(\[[0-9]+\])*)*(\[[0-9]+\])*$`, -) +// keyPattern validates a single key segment in a variable path. +// Matches: [a-zA-Z]+([-_]*[a-zA-Z0-9]+)* +// Examples: "foo", "my-job", "a_b_c", "abc123" +var keyPattern = regexp.MustCompile(`^[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*$`) + +// indexPattern matches one or more [N] index suffixes. +var indexPattern = regexp.MustCompile(`^(\[[0-9]+\])+$`) // Parse parses a string that may contain ${...} variable references. // It returns a slice of tokens representing literal text and variable references. @@ -140,9 +143,9 @@ func Parse(s string) ([]Token, error) { ) } - if !pathPattern.MatchString(path) { + if err := validatePath(path); err != nil { return nil, fmt.Errorf( - "invalid variable reference ${%s} at position %d: invalid path", path, refStart, + "invalid variable reference ${%s}: %w", path, err, ) } @@ -168,3 +171,37 @@ func Parse(s string) ([]Token, error) { flushLiteral(i) return tokens, nil } + +// validatePath validates the path inside a ${...} reference by splitting on +// '.' and validating each segment individually. 
+func validatePath(path string) error { + segments := strings.Split(path, ".") + for _, seg := range segments { + if seg == "" { + return errors.New("invalid path") + } + + // Strip trailing [N] index suffixes to get the key part. + key, idx := splitKeyAndIndex(seg) + + if key == "" { + return fmt.Errorf("invalid key %q", seg) + } + if !keyPattern.MatchString(key) { + return fmt.Errorf("invalid key %q", key) + } + if idx != "" && !indexPattern.MatchString(idx) { + return fmt.Errorf("invalid index in %q", seg) + } + } + return nil +} + +// splitKeyAndIndex splits "foo[0][1]" into ("foo", "[0][1]"). +func splitKeyAndIndex(seg string) (string, string) { + i := strings.IndexByte(seg, '[') + if i < 0 { + return seg, "" + } + return seg[:i], seg[i:] +}