From e68a4b84c2559d7c0a3c8129a4e3c6fe61a19a45 Mon Sep 17 00:00:00 2001 From: Mauro DiBenedetto Date: Fri, 29 May 2026 12:50:25 -0400 Subject: [PATCH] feat(extraction): add Common Lisp support via hand-rolled s-expression parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds Common Lisp / Emacs Lisp (.lisp, .lsp, .cl, .asd, .el) as a custom extractor — a dedicated tokenizer + recursive-descent s-expression parser, no tree-sitter grammar and no source preprocessing. The atom reader consumes up to whitespace/delimiter, so `^`, `{}[]`, backslash escapes, package-qualified symbols, reader conditionals, `#'`/`#\`/`#x` dispatch, nested `#|...|#` comments, format-directive strings, and mid-symbol `#` all parse correctly with zero special-casing. Extracts: defun/defmacro/defgeneric (functions); defmethod (method with CLOS receiver-typed qualified name + :before/:after/:around qualifier disambiguation + contains-edge from its class); defclass/define-condition (class + slot fields + :accessor/:reader/:writer functions + extends edges); defstruct; deftype; defvar/defparameter/defconstant; defpackage (namespace + :use/:import-from imports + :export exports); (require ...) imports; and context-aware call edges that suppress binding-form names (let/do/dolist/multiple-value-bind/...), declaration specifiers, cond/case literal keys, and resolve funcall/apply/CCL's (! vinsn) indirection to the real target. Top-level def* DSL macros (def-x86-opcode, define-arm-vinsn, defcommand, deftest, ...) surface their defined symbol. Validated on the Clozure ANSI regression suite (ccl-tests, 867 files / 170k lines): zero parse errors, 24,751 nodes, 99.97% of top-level deftest forms extracted. Cross-checked against CCL's own tools/vinsn-xref.py on the ARM64 port: identical caller results (e.g. save-values -> its 3 call sites). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 24 + README.md | 3 +- __tests__/extraction.test.ts | 1017 ++++++++++++++++++++++ scripts/add-lang/validate-deftests.mjs | 90 ++ src/extraction/grammars.ts | 15 +- src/extraction/lisp-extractor.ts | 1079 ++++++++++++++++++++++++ src/extraction/tree-sitter.ts | 6 + src/types.ts | 1 + 8 files changed, 2231 insertions(+), 4 deletions(-) create mode 100644 scripts/add-lang/validate-deftests.mjs create mode 100644 src/extraction/lisp-extractor.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index eb1897a2e..3f6918763 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,30 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### New Features + +- CodeGraph now indexes **Common Lisp** (`.lisp`, `.lsp`, `.cl`, `.asd`, and `.el` for Emacs Lisp). Coverage spans the full standard surface plus the common CCL/CL extensions: + - `defun`/`defmacro`/`defgeneric` → function nodes with lambda-list signatures; `defmethod` → method with CLOS receiver-typed qualified name (`account::deposit`), contains-edge from its class, and qualifier-aware qnames for `:before`/`:after`/`:around` overloads (`account::deposit::before`); `(defmethod (setf foo) …)` → separate method named `(setf foo)`. + - `defclass`/`define-condition` → class node with `extends` edges to superclasses, slot definitions as `field` nodes contained by the class, `:accessor`/`:reader`/`:writer` synthesized as `function` nodes (so `account-id` is searchable), `:initform` value forms walked for nested calls, and `(:default-initargs …)` option values walked too. + - `defstruct` → struct with slot fields (bare and `(name default)` forms; defaults walked for calls). + - `deftype`, `defvar`/`defparameter`/`defglobal`, `defconstant`/`defconst`/`define-constant`. 
+ - `defsetf` / `define-setf-expander` / `define-modify-macro` / `define-symbol-macro` / `define-compiler-macro` / `define-method-combination` → all surface their named symbol as searchable function nodes. + - `defpackage` → namespace (with `#:`/`:` decoration stripped) plus full option decomposition: `(:use …)` and `(:import-from …)` become `import` nodes with `imports` edges from the namespace; `(:export …)` becomes `export` nodes; `(:nicknames …)` is preserved in the namespace signature. + - **Function-to-function call edges** with context-aware suppression: + - Binding-form names in `let`/`let*`/`do`/`do*`/`dolist`/`dotimes`/`do-symbols`/`multiple-value-bind`/`destructuring-bind`/`with-slots`/`with-accessors`/`with-open-file`/`with-output-to-string`/`prog`/`prog*`/… do not appear as calls, while their value subforms ARE walked. + - Declaration specifiers in `(declare …)`/`(declaim …)`/`(proclaim …)` (`fixnum`/`type`/`ignore`/`optimize`/`special`/…) are suppressed entirely. + - `(cond …)` and `(case …)` clauses correctly distinguish literal keys (incl. the `t` default) from the test/form positions. + - `(funcall #'name …)` and `(apply #'name …)` resolve to the underlying `name` rather than `funcall`/`apply` itself; bodies of inline `(lambda …)` forms are walked. + - CCL's vinsn-emission macros `(! vinsn-name …)` and `(!! vinsn-name …)` resolve to the underlying vinsn target. + - `(check-type place TYPE-SPEC)` and `(assert test (places…) …)` walk the test/place/datum forms but suppress the type-spec and places-list as call sources. + - `catch`/`throw`/`progv`/`with-condition-restarts` and other control forms don't emit head-as-call edges. + - **Local functions**: `flet`/`labels`/`macrolet` bindings promoted to `function` nodes scoped under the enclosing function. 
+ - **Package-qualified call resolution**: `(pkg:fn …)` and `(pkg::fn …)` (parsed as `package_lit`) resolve to the bare-named target in the index, matching how nodes are stored without their package prefix. Class supers (`(defclass dog (animals:mammal) …)`) and defmethod specializers (`(defmethod feed ((d animals:dog) …))`) are normalized the same way. + - **User-defined DSL macros**: top-level `def*` forms (CCL examples: `def-x86-opcode`, `define-arm-vinsn`, `defcommand`, `defarm64-p2`, `deftest`, `define-arm64-subprim-call-vinsn`) surface their named symbol as a `function` node. Both `(def-foo NAME …)` and `(def-foo (NAME :opt) …)` shapes are recognized. Scoped to top-level positions so `(defer-action handler)` inside a function body remains a regular call. The leading lambda-list / spec list is skipped during body walking so spec elements aren't mistaken for calls; remaining body forms walk normally. + - **Hand-rolled s-expression parser (no tree-sitter grammar).** Lisp is parsed by a dedicated tokenizer + recursive-descent s-expression parser rather than a tree-sitter grammar. S-expressions are trivially tokenisable, so this needs **zero source preprocessing**: the atom reader consumes everything up to whitespace or a structural delimiter, which handles `^`, `{ } [ ]`, backslash escapes (`\+`, `\:`, `\(`), package-qualified symbols (`pkg:sym`), reader conditionals (`#+`/`#-`), `#'`/`#\`/`#x` dispatch, nested `#| … |#` block comments, format-directive strings, and mid-symbol `#` (`ccl.bug#252a`) — all correctly, with no special-casing. This replaces an earlier tree-sitter-based path that required ~12 grammar-workaround preprocessing rules. + + **Validated on the Clozure ANSI regression suite (`ccl-tests`, 867 files / 170k lines): zero parse errors, 24,751 nodes, and 99.97% of top-level `deftest` forms extracted** (the remaining 0.03% are `deftest` forms nested inside test-framework fixtures or quasiquote macro templates — not top-level definitions). 
Cross-validated against CCL's own diagnostic tools (`tools/vinsn-xref.py`, `tools/parity-audit.py`) on the ARM64 port: identical caller results for sampled compiler-backend vinsns (e.g. `save-values` → its 3 call sites). + ### Fixes - Indexing a project that contains only config-style files (YAML, Twig, or `.properties`) no longer misleadingly reports "No files found to index" — these files are tracked at the file level and are now counted as indexed. Thanks @luojiyin1987 (#357). diff --git a/README.md b/README.md index 08f2838ce..a7dc52cc5 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ The gains scale with codebase size: on large repos the agent answers from the in | **Full-Text Search** | Find code by name instantly across your entire codebase, powered by FTS5 | | **Impact Analysis** | Trace callers, callees, and the full impact radius of any symbol before making changes | | **Always Fresh** | File watcher uses native OS events (FSEvents/inotify/ReadDirectoryChangesW) with debounced auto-sync — the graph stays current as you code, zero config | -| **20+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Objective-C, Swift, Kotlin, Dart, Lua, Luau, Svelte, Liquid, Pascal/Delphi | +| **20+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Objective-C, Swift, Kotlin, Dart, Lua, Luau, Common Lisp, Svelte, Liquid, Pascal/Delphi | | **Framework-aware Routes** | Recognizes web-framework routing files and links URL patterns to their handlers across 14 frameworks | | **Mixed iOS / React Native / Expo** | Closes cross-language flows that static parsing misses: Swift ↔ ObjC bridging, React Native legacy bridge + TurboModules + Fabric view components, native → JS event emitters, Expo Modules | | **100% Local** | No data leaves your machine. No API keys. No external services. 
SQLite database only | @@ -532,6 +532,7 @@ is written): | Pascal / Delphi | `.pas`, `.dpr`, `.dpk`, `.lpr` | Full support (classes, records, interfaces, enums, DFM/FMX form files) | | Lua | `.lua` | Full support (functions, methods with receivers, local variables, `require` imports, call edges) | | Luau | `.luau` | Full support (everything in Lua, plus `type`/`export type` aliases, typed signatures, and Roblox instance-path `require`) | +| Common Lisp | `.lisp`, `.lsp`, `.cl`, `.asd`, `.el` | Full support: `defun`/`defmacro`/`defgeneric` as functions; `defmethod` as method with CLOS receiver-typed qualified name (`account::deposit`) and contains-edge from its class; `defclass`/`define-condition` slots as `field` nodes (with `:accessor`/`:reader`/`:writer` surfaced as function nodes, `:initform` walked for nested calls) and `extends` edges to superclasses; `defstruct` slots as fields with defaults walked; `deftype`, `defvar`/`defparameter`/`defconstant`; `defpackage` namespaces with `#:`/`:` decoration stripped; `(require …)` / `(use-package …)` / `(asdf:load-system …)` imports; context-aware function-to-function call edges suppressing binding-form names (`let`/`let*`/`do`/`dolist`/`multiple-value-bind`/`with-slots`/`with-accessors`/`with-open-file`/…) and declaration specifiers (`fixnum`/`type`/`ignore` inside `declare`); `cond`/`case` clauses with literal-key recognition (including the `t` default); `(funcall #'name …)` and `(apply #'name …)` resolved to the underlying target; `flet`/`labels` local functions promoted to function nodes; and a heuristic for user DSL macros (top-level `def*` forms like `def-x86-opcode`, `define-arm-vinsn`, `defcommand`, `deftest` surface their named symbol). Parsed by a dedicated hand-rolled s-expression tokenizer + recursive-descent parser (no tree-sitter grammar, no source preprocessing) — handles all CL/elisp reader syntax directly. Validated on the Clozure ANSI suite: 0 parse errors, 99.97% of top-level `deftest` forms extracted. 
| ## Troubleshooting diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index b497af6a9..1dc6a18be 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -4259,6 +4259,1023 @@ local count = 0 }); }); +// ============================================================================= +// Common Lisp (covers .lisp, .lsp, .cl, .asd, and .el) +// ============================================================================= + +describe('Common Lisp Extraction', () => { + describe('Language detection', () => { + it('should detect Common Lisp files', () => { + expect(detectLanguage('foo.lisp')).toBe('lisp'); + expect(detectLanguage('foo.lsp')).toBe('lisp'); + expect(detectLanguage('foo.cl')).toBe('lisp'); + expect(detectLanguage('my-system.asd')).toBe('lisp'); + expect(detectLanguage('init.el')).toBe('lisp'); + }); + + it('should report Lisp as supported', () => { + expect(isLanguageSupported('lisp')).toBe(true); + expect(getSupportedLanguages()).toContain('lisp'); + }); + }); + + describe('Defining-form extraction', () => { + it('should extract defun / defmacro / defgeneric as functions, with signatures', () => { + const code = ` +(defun greet (name &key (greeting "Hello")) + "Greet NAME." + (format t "~A, ~A!" 
greeting name)) + +(defmacro with-config ((var) &body body) + \`(let ((,var *config*)) ,@body)) + +(defgeneric deposit (account amount)) +`; + const result = extractFromSource('app.lisp', code); + const funcs = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(funcs).toContain('greet'); + expect(funcs).toContain('with-config'); + expect(funcs).toContain('deposit'); + + const greet = result.nodes.find((n) => n.name === 'greet'); + expect(greet?.language).toBe('lisp'); + expect(greet?.signature).toBe('(name &key (greeting "Hello"))'); + }); + + it('should extract defmethod as a method with receiver-typed qualified name', () => { + const code = ` +(defclass account () ()) +(defmethod deposit ((account account) amount) + (incf (slot-value account 'balance) amount)) +`; + const result = extractFromSource('clos.lisp', code); + const method = result.nodes.find((n) => n.kind === 'method' && n.name === 'deposit'); + expect(method).toBeDefined(); + expect(method?.qualifiedName).toBe('account::deposit'); + }); + + it('should extract defvar / defparameter as variables, defconstant as a constant', () => { + const code = ` +(defvar *config* nil) +(defparameter *default-port* 8080) +(defconstant +max-retries+ 5) +`; + const result = extractFromSource('vars.lisp', code); + const vars = result.nodes.filter((n) => n.kind === 'variable').map((n) => n.name); + const consts = result.nodes.filter((n) => n.kind === 'constant').map((n) => n.name); + expect(vars).toContain('*config*'); + expect(vars).toContain('*default-port*'); + expect(consts).toContain('+max-retries+'); + }); + + it('should extract defclass with extends references to superclasses', () => { + const code = ` +(defclass animal () ()) +(defclass dog (animal) ()) +`; + const result = extractFromSource('clos.lisp', code); + const classes = result.nodes.filter((n) => n.kind === 'class').map((n) => n.name); + expect(classes).toContain('animal'); + expect(classes).toContain('dog'); + + const 
extendsRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'extends' && r.referenceName === 'animal' + ); + expect(extendsRef).toBeDefined(); + }); + + it('should extract defstruct (both `defstruct NAME` and `defstruct (NAME ...)` forms)', () => { + const code = ` +(defstruct point x y) +(defstruct (rect (:conc-name r-)) width height) +`; + const result = extractFromSource('structs.lisp', code); + const structs = result.nodes.filter((n) => n.kind === 'struct').map((n) => n.name); + expect(structs).toContain('point'); + expect(structs).toContain('rect'); + }); + + it('should extract defpackage as a namespace, stripping the #: / : prefix', () => { + const code = ` +(defpackage #:my-app + (:use #:cl) + (:export #:run)) +`; + const result = extractFromSource('package.lisp', code); + const pkgs = result.nodes.filter((n) => n.kind === 'namespace').map((n) => n.name); + expect(pkgs).toContain('my-app'); + }); + }); + + describe('Call extraction', () => { + it('should record function-to-function calls as unresolved references on the caller', () => { + const code = ` +(defun helper (x) (* x 2)) +(defun run (y) (helper y)) +`; + const result = extractFromSource('calls.lisp', code); + const call = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'helper' + ); + expect(call).toBeDefined(); + // The caller should be the `run` function, not the file node. 
+ const run = result.nodes.find((n) => n.name === 'run'); + expect(call?.fromNodeId).toBe(run?.id); + }); + + it('should skip control-flow special forms (let / if / when / cond / loop) as calls', () => { + const code = ` +(defun foo (x) + (let ((y (+ x 1))) + (if (> y 10) y (* y 2)))) +`; + const result = extractFromSource('control.lisp', code); + const heads = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + expect(heads.has('let')).toBe(false); + expect(heads.has('if')).toBe(false); + }); + + it('should not treat let / let* binding NAMES as calls, but still walk their VALUES', () => { + const code = ` +(defun run () + (let ((acc (make-instance 'account :id 1)) + (port (compute-port))) + (deposit acc 100) + acc)) +`; + const result = extractFromSource('bindings.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + + // Binding names are not calls. + expect(callNames).not.toContain('acc'); + expect(callNames).not.toContain('port'); + // But their value forms ARE walked — make-instance, compute-port, deposit + // must all surface as call edges from `run`. 
+ expect(callNames).toContain('make-instance'); + expect(callNames).toContain('compute-port'); + expect(callNames).toContain('deposit'); + }); + + it('should not treat dolist / dotimes binding names as calls', () => { + const code = ` +(defun loop-demo (items) + (dolist (item items) + (process item)) + (dotimes (i 10) + (record i))) +`; + const result = extractFromSource('loops.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).not.toContain('item'); + expect(callNames).not.toContain('i'); + expect(callNames).toContain('process'); + expect(callNames).toContain('record'); + }); + }); + + describe('Slot extraction (defclass / defstruct)', () => { + it('should extract defclass slots as field nodes contained by the class', () => { + const code = ` +(defclass account () + ((id :initarg :id :accessor account-id) + (owner :initarg :owner :accessor account-owner) + (amount :initform 0 :accessor account-amount))) +`; + const result = extractFromSource('clos.lisp', code); + const fields = result.nodes.filter((n) => n.kind === 'field'); + const fieldNames = fields.map((f) => f.name); + expect(fieldNames).toContain('id'); + expect(fieldNames).toContain('owner'); + expect(fieldNames).toContain('amount'); + + // Each field's qualifiedName should carry the class scope, and a + // `contains` edge must run from the class to the field. 
+ const accountClass = result.nodes.find((n) => n.kind === 'class' && n.name === 'account'); + const id = fields.find((f) => f.name === 'id'); + expect(id?.qualifiedName).toBe('account::id'); + const containsEdge = result.edges.find( + (e) => e.kind === 'contains' && e.source === accountClass?.id && e.target === id?.id + ); + expect(containsEdge).toBeDefined(); + }); + + it('should not emit spurious call edges for defclass slot subforms', () => { + const code = ` +(defclass account () + ((id :initarg :id :accessor account-id) + (amount :initform 0))) +`; + const result = extractFromSource('clos.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + // Slot names and option keywords must never look like calls. + expect(callNames.has('id')).toBe(false); + expect(callNames.has('amount')).toBe(false); + }); + + it('should walk :initform values for nested calls', () => { + const code = ` +(defclass widget () + ((created-at :initform (current-time)))) +`; + const result = extractFromSource('clos.lisp', code); + const ref = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'current-time' + ); + expect(ref).toBeDefined(); + }); + + it('should extract defstruct slots as field nodes (bare and (name default) forms)', () => { + const code = ` +(defstruct point + x + (y 0) + (z (compute-default-z))) +`; + const result = extractFromSource('struct.lisp', code); + const fields = result.nodes.filter((n) => n.kind === 'field').map((f) => f.name); + expect(fields).toContain('x'); + expect(fields).toContain('y'); + expect(fields).toContain('z'); + + // The default form (compute-default-z) must produce a call edge. 
+ const ref = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'compute-default-z' + ); + expect(ref).toBeDefined(); + }); + }); + + describe('Local function extraction (flet / labels)', () => { + it('should promote flet local-function bindings to function nodes', () => { + const code = ` +(defun outer (x) + (flet ((double (n) (* n 2)) + (triple (n) (* n 3))) + (+ (double x) (triple x)))) +`; + const result = extractFromSource('flet.lisp', code); + const funcs = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(funcs).toContain('outer'); + expect(funcs).toContain('double'); + expect(funcs).toContain('triple'); + + // Locals must be contained by the enclosing function, not by the file. + const outer = result.nodes.find((n) => n.name === 'outer'); + const double = result.nodes.find((n) => n.name === 'double'); + const edge = result.edges.find( + (e) => e.kind === 'contains' && e.source === outer?.id && e.target === double?.id + ); + expect(edge).toBeDefined(); + }); + + it('should also handle labels (the mutually-recursive flet variant)', () => { + const code = ` +(defun outer () + (labels ((helper (n) (if (zerop n) 0 (1+ (helper (1- n)))))) + (helper 10))) +`; + const result = extractFromSource('labels.lisp', code); + const funcs = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(funcs).toContain('helper'); + }); + }); + + describe('Method-to-class containment', () => { + it('should attach the defmethod contains edge to the class when both are in the same file', () => { + const code = ` +(defclass account () ()) +(defmethod deposit ((account account) amount) + (incf (slot-value account 'balance) amount)) +`; + const result = extractFromSource('clos.lisp', code); + const klass = result.nodes.find((n) => n.kind === 'class' && n.name === 'account'); + const method = result.nodes.find((n) => n.kind === 'method' && n.name === 'deposit'); + expect(klass).toBeDefined(); 
+ expect(method).toBeDefined(); + + // Crucially: the contains edge runs from the CLASS to the method, not + // from the file. (Mirrors how Go/Rust receiver-typed methods are wired.) + const containsFromClass = result.edges.find( + (e) => e.kind === 'contains' && e.source === klass!.id && e.target === method!.id + ); + expect(containsFromClass).toBeDefined(); + + // No double-containment: the file should not also list the method. + const fileNode = result.nodes.find((n) => n.kind === 'file'); + const containsFromFile = result.edges.find( + (e) => e.kind === 'contains' && e.source === fileNode?.id && e.target === method!.id + ); + expect(containsFromFile).toBeUndefined(); + }); + }); + + describe('Import extraction', () => { + it('should extract (require ...) as import nodes with parent-scoped references', () => { + const code = ` +(require :cl-ppcre) +(require "sb-bsd-sockets") +`; + const result = extractFromSource('deps.lisp', code); + const imports = result.nodes.filter((n) => n.kind === 'import').map((n) => n.name); + expect(imports).toContain('cl-ppcre'); + expect(imports).toContain('sb-bsd-sockets'); + + const ref = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'cl-ppcre' + ); + expect(ref).toBeDefined(); + }); + }); + + describe('Funcall / apply target resolution', () => { + it('should resolve (funcall #\'name ...) to the wrapped target, not "funcall"', () => { + const code = ` +(defun caller () + (funcall #'my-fn 1 2)) +`; + const result = extractFromSource('funcall.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).toContain('my-fn'); + expect(callNames).not.toContain('funcall'); + }); + + it('should resolve (apply #\'name ...) 
the same way', () => { + const code = ` +(defun caller (args) + (apply #'my-fn args)) +`; + const result = extractFromSource('apply.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).toContain('my-fn'); + expect(callNames).not.toContain('apply'); + }); + + it('should NOT emit a call edge when funcall target is a variable (unknown static target)', () => { + const code = ` +(defun caller (handler) + (funcall handler 1)) +`; + const result = extractFromSource('funcall-var.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).not.toContain('funcall'); + expect(callNames).not.toContain('handler'); + }); + + it('should walk inline-lambda bodies so their calls reach the enclosing function', () => { + const code = ` +(defun caller () + (funcall (lambda (n) (compute-it n)) 5)) +`; + const result = extractFromSource('funcall-lambda.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).toContain('compute-it'); + expect(callNames).not.toContain('funcall'); + }); + }); + + describe('Declarations (declare / declaim / proclaim)', () => { + it('should not emit call edges for declaration specifiers', () => { + const code = ` +(defun foo (x y z) + (declare (fixnum x y z) + (type integer x) + (ignore z) + (optimize speed (safety 0)) + (special *config*)) + (+ x y)) +`; + const result = extractFromSource('declare.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + // None of the declaration heads should appear as a call. 
+ for (const head of ['declare', 'fixnum', 'type', 'ignore', 'optimize', 'safety', 'special']) { + expect(callNames.has(head)).toBe(false); + } + // The body's `+` is a real call though. + expect(callNames.has('+')).toBe(true); + }); + + it('should also skip (declaim ...) and (proclaim ...)', () => { + const code = ` +(declaim (ftype (function (integer) integer) my-fn)) +(proclaim '(special *my-var*)) +`; + const result = extractFromSource('declaim.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).not.toContain('ftype'); + expect(callNames).not.toContain('function'); + expect(callNames).not.toContain('special'); + }); + }); + + describe('Cond / case clauses', () => { + it('should not treat `t` as a call inside cond clauses', () => { + const code = ` +(defun classify (x) + (cond ((> x 0) (positive)) + ((< x 0) (negative)) + (t (zero)))) +`; + const result = extractFromSource('cond.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).not.toContain('t'); + // The real calls — including test expressions — must still surface. + expect(callNames).toContain('>'); + expect(callNames).toContain('<'); + expect(callNames).toContain('positive'); + expect(callNames).toContain('negative'); + expect(callNames).toContain('zero'); + }); + + it('should not treat case clause keys as calls (literals or lists of literals)', () => { + const code = ` +(defun dispatch (op) + (case op + (:add (do-add)) + (:sub (do-sub)) + ((:mul :div) (do-mul-div)) + (t (default)))) +`; + const result = extractFromSource('case.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + // Keys must not be calls. + expect(callNames.has('t')).toBe(false); + // Forms inside each clause must be. 
+ expect(callNames.has('do-add')).toBe(true); + expect(callNames.has('do-sub')).toBe(true); + expect(callNames.has('do-mul-div')).toBe(true); + expect(callNames.has('default')).toBe(true); + }); + }); + + describe('User-defined DSL macros (def-* / define-* / def*)', () => { + it('should surface (def-x86-opcode NAME ...) as a function node named NAME', () => { + const code = ` +(def-x86-opcode mov (dest src) #x88) +(define-arm-vinsn jump-known-function (() ((target :lisp))) :pseudo) +(defcommand foo (args) "doc" (do-it args)) +`; + const result = extractFromSource('dsl.lisp', code); + const funcs = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(funcs).toContain('mov'); + expect(funcs).toContain('jump-known-function'); + expect(funcs).toContain('foo'); + }); + + it('should not emit spurious call edges from inside DSL spec lists', () => { + const code = ` +(define-arm-vinsn jump (() ((target :lisp))) :pseudo) +`; + const result = extractFromSource('dsl-spec.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + // `target` lives only in the spec list — it's not a call. + expect(callNames.has('target')).toBe(false); + // Neither is the type keyword's value. + expect(callNames.has(':lisp')).toBe(false); + }); + + it('should NOT apply the def-fallback inside a function body (would corrupt real calls)', () => { + // `(defer-action handler)` is a regular function call whose name happens + // to start with `def`. Inside a defun body, it must stay a call — not be + // misinterpreted as a defining macro that would create a node named + // "handler". + const code = ` +(defun outer (handler) + (defer-action handler)) +`; + const result = extractFromSource('def-in-body.lisp', code); + const fnNames = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + // Should NOT have created a function named "handler". 
+ expect(fnNames).not.toContain('handler'); + // The defer-action call edge should be there. + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).toContain('defer-action'); + }); + }); + + describe('Defclass accessor / option extraction', () => { + it('should emit a function node for each :accessor / :reader / :writer', () => { + const code = ` +(defclass account () + ((id :initarg :id :accessor account-id) + (owner :initarg :owner :reader account-owner) + (amount :initform 0 :writer set-account-amount))) +`; + const result = extractFromSource('clos-accessors.lisp', code); + const fns = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(fns).toContain('account-id'); + expect(fns).toContain('account-owner'); + expect(fns).toContain('set-account-amount'); + }); + + it('should walk (:default-initargs :key (compute-form)) option values for nested calls', () => { + const code = ` +(defclass widget () + ((id :initarg :id)) + (:default-initargs :id (next-id)) + (:documentation "A widget.")) +`; + const result = extractFromSource('clos-options.lisp', code); + const ref = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'next-id' + ); + expect(ref).toBeDefined(); + }); + }); + + describe('Print-unreadable-object data list', () => { + it('should not treat the data-list args (obj, stream) as calls', () => { + const code = ` +(defmethod print-object ((w widget) stream) + (print-unreadable-object (w stream :type t :identity t) + (format stream "widget ~A" (widget-id w)))) +`; + const result = extractFromSource('print-unreadable.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + // Object/stream variables in the data list are NOT calls. 
+ expect(callNames.has('w')).toBe(false); + // But body calls still register. + expect(callNames.has('format')).toBe(true); + expect(callNames.has('widget-id')).toBe(true); + }); + }); + + describe('Extra control / special forms (catch / throw / progv / prog / prog*)', () => { + it('should not emit calls for catch / throw / progv / with-condition-restarts heads', () => { + const code = ` +(defun foo (x) + (catch 'tag + (throw 'tag (compute x)))) + +(defun bar (vars vals) + (progv vars vals + (do-it))) +`; + const result = extractFromSource('catch-throw.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + expect(callNames.has('catch')).toBe(false); + expect(callNames.has('throw')).toBe(false); + expect(callNames.has('progv')).toBe(false); + // Genuine inner calls still surface. + expect(callNames.has('compute')).toBe(true); + expect(callNames.has('do-it')).toBe(true); + }); + + it('should treat prog / prog* like let (binding clause + body)', () => { + const code = ` +(defun foo () + (prog ((a (init-a)) (b (init-b))) + (use a b))) +`; + const result = extractFromSource('prog.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + // Binding names suppressed, init forms walked, body forms walked. 
+ expect(callNames.has('a')).toBe(false); + expect(callNames.has('b')).toBe(false); + expect(callNames.has('prog')).toBe(false); + expect(callNames.has('init-a')).toBe(true); + expect(callNames.has('init-b')).toBe(true); + expect(callNames.has('use')).toBe(true); + }); + }); + + describe('Previously silent-skipped standard def-forms (A3)', () => { + it('should surface defsetf / define-modify-macro / define-symbol-macro / define-compiler-macro / define-setf-expander as function nodes', () => { + const code = ` +(defsetf my-field set-my-field) +(define-modify-macro my-incf () 1+) +(define-symbol-macro pi-x2 (* 2 pi)) +(define-compiler-macro fast-add (a b) (list '+ a b)) +(define-setf-expander my-place (obj) (values)) +`; + const result = extractFromSource('def-misc.lisp', code); + const funcs = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(funcs).toContain('my-field'); + expect(funcs).toContain('my-incf'); + expect(funcs).toContain('pi-x2'); + expect(funcs).toContain('fast-add'); + expect(funcs).toContain('my-place'); + }); + }); + + describe('Setf-methods and CLOS qualifiers', () => { + it('should extract (defmethod (setf foo) …) as a separately-named method', () => { + const code = ` +(defclass widget () ((id :initarg :id))) + +(defmethod widget-id ((w widget)) + (slot-value w 'id)) + +(defmethod (setf widget-id) (new-val (w widget)) + (setf (slot-value w 'id) new-val)) +`; + const result = extractFromSource('setf-method.lisp', code); + const methods = result.nodes.filter((n) => n.kind === 'method'); + const getter = methods.find((m) => m.name === 'widget-id'); + const setter = methods.find((m) => m.name === '(setf widget-id)'); + expect(getter).toBeDefined(); + expect(setter).toBeDefined(); + expect(getter?.qualifiedName).toBe('widget::widget-id'); + expect(setter?.qualifiedName).toBe('widget::(setf widget-id)'); + }); + + it('should distinguish :before / :after / :around method qualifiers in the qualifiedName', () => { + const 
code = ` +(defclass account () ()) + +(defmethod deposit ((a account) amount) (primary a amount)) +(defmethod deposit :before ((a account) amount) (before-hook a)) +(defmethod deposit :after ((a account) amount) (after-hook a)) +(defmethod deposit :around ((a account) amount) (around-hook (call-next-method))) +`; + const result = extractFromSource('qualified.lisp', code); + const qns = result.nodes + .filter((n) => n.kind === 'method' && n.name === 'deposit') + .map((n) => n.qualifiedName) + .sort(); + expect(qns).toEqual([ + 'account::deposit', + 'account::deposit::after', + 'account::deposit::around', + 'account::deposit::before', + ]); + }); + }); + + describe('Defpackage option decomposition (F)', () => { + it('should decompose :use / :import-from / :export into import and export nodes', () => { + const code = ` +(defpackage #:my-app + (:nicknames #:app) + (:use #:cl #:alexandria) + (:import-from #:cl-ppcre #:scan #:scan-to-strings) + (:export #:run #:greet)) +`; + const result = extractFromSource('package.lisp', code); + const ns = result.nodes.find((n) => n.kind === 'namespace' && n.name === 'my-app'); + expect(ns).toBeDefined(); + // Nicknames stashed on the signature so they're visible at a glance. + expect(ns?.signature).toContain('app'); + + const imports = result.nodes.filter((n) => n.kind === 'import').map((n) => n.name).sort(); + // :use packages + expect(imports).toContain('cl'); + expect(imports).toContain('alexandria'); + // :import-from package + each imported symbol + expect(imports).toContain('cl-ppcre'); + expect(imports).toContain('scan'); + expect(imports).toContain('scan-to-strings'); + + const exports = result.nodes.filter((n) => n.kind === 'export').map((n) => n.name).sort(); + expect(exports).toEqual(['greet', 'run']); + + // Imports references should connect the namespace to its dependencies. 
+ const refToCl = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'cl' && r.fromNodeId === ns?.id + ); + expect(refToCl).toBeDefined(); + }); + }); + + describe('Assert / check-type spurious-edge suppression (E)', () => { + it('should NOT emit a call to type-spec heads inside check-type', () => { + const code = ` +(defun foo (x) + (check-type x (integer 0 100)) + (check-type x (or null string))) +`; + const result = extractFromSource('checktype.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + expect(callNames.has('integer')).toBe(false); + // `or` is in CONTROL_HEADS so it was already filtered, but double-check. + expect(callNames.has('or')).toBe(false); + }); + + it('should NOT treat assert places-list members as calls; should walk test and message args', () => { + const code = ` +(defun foo (x) + (assert (> x 0) (x) "x must be positive, got: ~A" (some-side-effect x)) + (+ x 1)) +`; + const result = extractFromSource('assert.lisp', code); + const callNames = new Set( + result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName) + ); + // `x` in the places list is NOT a call. + expect(callNames.has('x')).toBe(false); + // Test expression IS walked. + expect(callNames.has('>')).toBe(true); + // Datum/arg forms ARE walked. + expect(callNames.has('some-side-effect')).toBe(true); + // Outer body call still surfaces. + expect(callNames.has('+')).toBe(true); + }); + }); + + describe('CCL vinsn-emission macros (!)', () => { + it('should resolve (! vinsn-name args) to a call to vinsn-name (not "!")', () => { + const code = ` +(defun emitter () + (! save-values) + (!! 
make-stack-block 10)) +`; + const result = extractFromSource('emit.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).toContain('save-values'); + expect(callNames).toContain('make-stack-block'); + expect(callNames).not.toContain('!'); + expect(callNames).not.toContain('!!'); + }); + }); + + describe('User-defmacro with body walking', () => { + it('should walk body forms of a def-macro (skipping only the leading lambda-list)', () => { + // CCL-style: (defarm64-p2 NAME ALIAS (params) body…) — name comes + // first, then an alias sym, then a params list, then real body forms. + // The extractor must skip only the (params) list (first list_lit) so + // body call edges are captured. + const code = ` +(defarm64-p2 my-emitter my-alias (seg vreg xfer forms) + (process-forms seg forms) + (! save-values)) +`; + const result = extractFromSource('arm64-p2.lisp', code); + const fn = result.nodes.find((n) => n.name === 'my-emitter'); + expect(fn).toBeDefined(); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls' && r.fromNodeId === fn?.id) + .map((r) => r.referenceName); + // Body calls land on the macro node. + expect(callNames).toContain('process-forms'); + expect(callNames).toContain('save-values'); // via (! save-values) + // Param names from the SKIPPED first list_lit are NOT calls. 
+ expect(callNames).not.toContain('seg'); + expect(callNames).not.toContain('vreg'); + expect(callNames).not.toContain('forms'); + }); + + it('should also handle the (def-foo (NAME :opt) value) variant where NAME lives inside a list', () => { + // CCL: (define-arm64-subprim-call-vinsn (save-values) .SPsave-values) + const code = ` +(define-arm64-subprim-call-vinsn (foo) .SPfoo) +(define-arm64-vinsn (bar :options ((:fpr-bb))) (() ()) :pseudo) +`; + const result = extractFromSource('subprim.lisp', code); + const funcs = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(funcs).toContain('foo'); + expect(funcs).toContain('bar'); + }); + }); + + describe('Grammar workaround: `^` as a sym head (G)', () => { + it('should parse and extract a defun that uses (^ …) as a macrolet shorthand', () => { + // A grammar-based parser (tree-sitter-commonlisp) treats bare `^` as a + // reader-macro prefix, so files using `^` as a sym head — common in + // CCL's compiler backend macrolets — cascade into ERROR spans. The + // hand-rolled reader must take `^` as an ordinary atom instead. + const code = ` +(defun outer () + (macrolet ((^ (x) (do-it x))) + (^ 42))) +`; + const result = extractFromSource('caret.lisp', code); + // The defun must parse cleanly (no errors) and surface as a function + // node; the call edge to do-it is not asserted here. + expect(result.errors).toEqual([]); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'outer'); + expect(fn).toBeDefined(); + }); + + it('should parse `lambda`/`defun` used as parameter names', () => { + // The ANSI conformance suite (misc.lsp) passes `lambda` as an ordinary + // parameter; a grammar-based parser reads `(lambda` as a new lambda + // form and cascades errors through the file. 
+ const code = ` +(defun call-compiled (lambda &rest args) + (apply-compiled lambda args)) +(defun apply-compiled (lambda args) + (apply lambda args)) +`; + const result = extractFromSource('lambda-param.lisp', code); + expect(result.errors).toEqual([]); + const fns = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(fns).toContain('call-compiled'); + expect(fns).toContain('apply-compiled'); + }); + + it('should parse reader conditionals (#+/#-) without dropping the wrapped form', () => { + const code = ` +(defun pick () + #+sbcl (sbcl-path) + #-sbcl (other-path) + #+(or ccl clisp) (ccl-path)) +`; + const result = extractFromSource('readercond.lisp', code); + expect(result.errors).toEqual([]); + const calls = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + // Both branches' forms are walked (we index all feature branches). + expect(calls).toContain('sbcl-path'); + expect(calls).toContain('other-path'); + expect(calls).toContain('ccl-path'); + }); + + it('should parse `loop` clauses that contain reader conditionals', () => { + const code = ` +(defun looper (lst) + (loop for x in lst + when (oddp x) + #+sbcl collect #-sbcl append (double x) + finally (return :done))) +`; + const result = extractFromSource('loop.lisp', code); + expect(result.errors).toEqual([]); + const calls = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(calls).toContain('double'); + }); + + it('should parse array literals (#0a / #2a) and symbol names with { } [ ]', () => { + const code = ` +(deftest array.1 (typep #0aX (quote array)) t) +(deftest array.2 (typep #2a((1 2)(3 4)) (quote array)) t) +(deftest format.^.{.1 (do-fmt) t) +(deftest format.^.[.2 (do-fmt2) t) +`; + const result = extractFromSource('arrays.lisp', code); + expect(result.errors).toEqual([]); + const fns = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + // deftest names 
preserved verbatim (atoms keep their original source text). + expect(fns).toContain('array.1'); + expect(fns).toContain('format.^.{.1'); + expect(fns).toContain('format.^.[.2'); + }); + + it('should parse format-directive strings (~{ ~1{ ~^ ~}) as plain literals', () => { + const code = ` +(deftest fmt.1 (format nil "~{X ~A~^ Y ~A~}" args) "") +(deftest fmt.2 (format nil "~1{~A~^~A~}" args) "") +(deftest fmt.3 (format nil "~{~[X~;Y~;~0^~]~}" args) "") +`; + const result = extractFromSource('fmt.lisp', code); + expect(result.errors).toEqual([]); + const fns = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(fns).toContain('fmt.1'); + expect(fns).toContain('fmt.2'); + expect(fns).toContain('fmt.3'); + }); + + it('should parse block comments containing parens before closing parens', () => { + // A grammar-based parser mishandles `(or X #|(a (b))|#))))`; the tokenizer + // emits `#| … |#` as a comment token, so the surrounding forms still parse. + const code = ` +(defun memoize-p (val) + (if (or (truthy val) + (and (acode-p val) + (let* ((op (operator val))) + (or (eq op (kind fixnum)) #|(eq op (kind immediate))|#)))) + nil t)) +(defun next-fn (x) x) +`; + const result = extractFromSource('blockcomment.lisp', code); + expect(result.errors).toEqual([]); + const fns = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(fns).toContain('memoize-p'); + expect(fns).toContain('next-fn'); + }); + + it('should parse backslash-escaped symbol names and not strip at an escaped colon', () => { + const code = ` +(deftest \\+.1 (foo) t) +(deftest format.\\:{.6 (bar) t) +`; + const result = extractFromSource('escaped.lisp', code); + expect(result.errors).toEqual([]); + const fns = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + // Names preserved verbatim; the escaped colon is NOT a package separator. + expect(fns).toContain('\\+.1'); + expect(fns).toContain('format.\\:{.6'); + }); + + it('should keep a mid-symbol `#` (e.g. 
ccl.bug#252a) instead of eating it as a dispatch macro', () => { + const code = ` +(deftest ccl.bug#252a (foo) t) +`; + const result = extractFromSource('hash.lisp', code); + expect(result.errors).toEqual([]); + const fns = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name); + expect(fns).toContain('ccl.bug#252a'); + }); + }); + + describe('Package-qualified names (B)', () => { + it('should normalize (pkg:fn …) call sites to the base symbol', () => { + const code = ` +(defun caller () + (myapp:do-it 1) + (cl:format t "hi")) +`; + const result = extractFromSource('pkg.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + // Base symbol after stripping the `pkg:` prefix. + expect(callNames).toContain('do-it'); + expect(callNames).toContain('format'); + // Should NOT carry the prefix. + expect(callNames).not.toContain('myapp:do-it'); + expect(callNames).not.toContain('cl:format'); + }); + + it('should also strip the prefix inside (funcall #\'pkg:fn …)', () => { + const code = ` +(defun caller () + (funcall #'myapp:helper 1)) +`; + const result = extractFromSource('pkg-funcall.lisp', code); + const callNames = result.unresolvedReferences + .filter((r) => r.referenceKind === 'calls') + .map((r) => r.referenceName); + expect(callNames).toContain('helper'); + expect(callNames).not.toContain('myapp:helper'); + }); + + it('should strip the prefix on defmethod receiver specialisers and class supers', () => { + const code = ` +(defclass dog (animals:mammal) ()) +(defmethod feed ((d animals:dog) food) (do-feed d food)) +`; + const result = extractFromSource('pkg-clos.lisp', code); + const extendsRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'extends' && r.referenceName === 'mammal' + ); + expect(extendsRef).toBeDefined(); + // Receiver type was stripped to `dog`, so the method's qn uses bare class name. 
+ const method = result.nodes.find((n) => n.kind === 'method' && n.name === 'feed'); + expect(method?.qualifiedName).toBe('dog::feed'); + }); + }); +}); + // ============================================================================= // Objective-C // ============================================================================= diff --git a/scripts/add-lang/validate-deftests.mjs b/scripts/add-lang/validate-deftests.mjs new file mode 100644 index 000000000..1e2e0b09b --- /dev/null +++ b/scripts/add-lang/validate-deftests.mjs @@ -0,0 +1,90 @@ +#!/usr/bin/env node +// Validate that every (deftest NAME …) form in a corpus is extracted as a +// node in the codegraph index. Counts source deftests (via the same +// preprocessing the extractor uses, then walking the parse tree the SAME way +// the extractor would identify a top-level def* form) and compares against +// the indexed function nodes. +// +// Usage: node scripts/add-lang/validate-deftests.mjs + +import { readFileSync, statSync, readdirSync } from 'node:fs'; +import { join } from 'node:path'; +import { LispExtractor } from '../../dist/extraction/lisp-extractor.js'; + +const repo = process.argv[2]; +if (!repo) { console.error('usage: validate-deftests.mjs '); process.exit(1); } + +function* walk(dir) { + for (const e of readdirSync(dir)) { + if (e === '.git' || e === 'node_modules' || e === '.codegraph') continue; + const full = join(dir, e); + let st; try { st = statSync(full); } catch { continue; } + if (st.isDirectory()) yield* walk(full); + else if (/\.(lisp|lsp|cl|asd|el)$/i.test(e)) yield full; + } +} + +// Count `(deftest NAME` occurrences in raw source via a tolerant regex — the +// ground-truth "how many tests are defined". deftest names can contain almost +// any char (dots, digits, escapes), so match up to the next whitespace. 
+const DEFTEST_RE = /\(deftest\s+(\S+)/g; + +let srcDeftests = 0; +let extractedDeftests = 0; +const missingByFile = []; + +for (const file of walk(repo)) { + const src = readFileSync(file, 'utf8'); + + // Ground-truth deftest names from raw source. Strip BOTH block comments + // (`#| … |#`, nestable) and line comments (`;`-to-EOL) first, so a + // commented-out `(deftest …)` doesn't inflate the count. Then skip + // computed names — a deftest whose name begins with `,` / `(` / `` ` `` is + // generated by an enclosing macro (e.g. `(deftest ,(make-name …) …)`) and + // has no static symbol to extract. + let cleaned = src; + // Remove nested block comments by repeated innermost-first replacement. + let prev; + do { prev = cleaned; cleaned = cleaned.replace(/#\|(?:[^#|]|#(?!\|)|\|(?!#))*?\|#/g, ' '); } + while (cleaned !== prev); + cleaned = cleaned.replace(/;[^\n]*/g, ''); + const srcNames = new Set(); + let m; + DEFTEST_RE.lastIndex = 0; + while ((m = DEFTEST_RE.exec(cleaned)) !== null) { + const name = m[1]; + if (/^[,(`]/.test(name)) continue; // computed / macro-generated name + srcNames.add(name); + } + srcDeftests += srcNames.size; + + // Extracted names from the index pipeline. 
+ const ex = new LispExtractor(file, src); + const r = ex.extract(); + const nodeNames = new Set( + r.nodes.filter((n) => n.kind === 'function' || n.kind === 'method').map((n) => n.name) + ); + + let fileMissing = 0; + for (const name of srcNames) { + if (nodeNames.has(name)) extractedDeftests++; + else { fileMissing++; } + } + if (fileMissing > 0) { + missingByFile.push({ file: file.replace(repo, ''), missing: fileMissing, total: srcNames.size }); + } +} + +console.log(`\nDeftest extraction validation — ${repo}`); +console.log(` deftests in source: ${srcDeftests}`); +console.log(` extracted as nodes: ${extractedDeftests}`); +console.log(` coverage: ${(100 * extractedDeftests / srcDeftests).toFixed(2)}%`); +if (missingByFile.length) { + console.log(`\n Files with missing deftests (top 15):`); + missingByFile.sort((a, b) => b.missing - a.missing); + for (const f of missingByFile.slice(0, 15)) { + console.log(` ${String(f.missing).padStart(4)} / ${String(f.total).padStart(4)} ${f.file}`); + } +} else { + console.log(`\n ✓ Every deftest in every file was extracted.`); +} diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts index c9a2bcb37..2d8766e50 100644 --- a/src/extraction/grammars.ts +++ b/src/extraction/grammars.ts @@ -10,7 +10,7 @@ import * as path from 'path'; import { Parser, Language as WasmLanguage } from 'web-tree-sitter'; import { Language } from '../types'; -export type GrammarLanguage = Exclude; +export type GrammarLanguage = Exclude; /** * WASM filename map — maps each language to its .wasm grammar file @@ -93,6 +93,13 @@ export const EXTENSION_MAP: Record = { '.sc': 'scala', '.lua': 'lua', '.luau': 'luau', + // Common Lisp (parser also covers `.el` reasonably well — the + // tree-sitter-commonlisp grammar accepts Emacs-Lisp surface syntax). 
+ '.lisp': 'lisp', + '.lsp': 'lisp', + '.cl': 'lisp', + '.asd': 'lisp', + '.el': 'lisp', '.m': 'objc', '.mm': 'objc', // XML: file-level tracking; the MyBatis extractor matches `` @@ -272,6 +279,7 @@ export function isLanguageSupported(language: Language): boolean { if (language === 'svelte') return true; // custom extractor (script block delegation) if (language === 'vue') return true; // custom extractor (script block delegation) if (language === 'liquid') return true; // custom regex extractor + if (language === 'lisp') return true; // custom hand-rolled s-expression extractor if (language === 'yaml') return true; // file-level tracking only; Drupal routing extraction via framework resolver if (language === 'twig') return true; // file-level tracking only if (language === 'xml') return true; // MyBatis mapper extractor @@ -284,7 +292,7 @@ export function isLanguageSupported(language: Language): boolean { * Check if a grammar has been loaded and is ready for parsing. */ export function isGrammarLoaded(language: Language): boolean { - if (language === 'svelte' || language === 'vue' || language === 'liquid') return true; + if (language === 'svelte' || language === 'vue' || language === 'liquid' || language === 'lisp') return true; if (language === 'yaml' || language === 'twig') return true; // no WASM grammar needed if (language === 'xml' || language === 'properties') return true; // no WASM grammar needed return languageCache.has(language); @@ -307,7 +315,7 @@ export function isFileLevelOnlyLanguage(language: Language): boolean { * Get all supported languages (those with grammar definitions). 
*/ export function getSupportedLanguages(): Language[] { - return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'liquid']; + return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'liquid', 'lisp']; } /** @@ -377,6 +385,7 @@ export function getLanguageDisplayName(language: Language): string { scala: 'Scala', lua: 'Lua', luau: 'Luau', + lisp: 'Common Lisp', objc: 'Objective-C', yaml: 'YAML', twig: 'Twig', diff --git a/src/extraction/lisp-extractor.ts b/src/extraction/lisp-extractor.ts new file mode 100644 index 000000000..80802d333 --- /dev/null +++ b/src/extraction/lisp-extractor.ts @@ -0,0 +1,1079 @@ +import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference, NodeKind } from '../types'; +import { generateNodeId } from './tree-sitter-helpers'; + +/** + * LispExtractor — a hand-rolled s-expression extractor for Common Lisp (and + * Emacs Lisp / Scheme-ish dialects, since they share the reader syntax). + * + * WHY NOT TREE-SITTER: the `tree-sitter-commonlisp` grammar is incomplete for + * real-world CL — it mis-parses `^` as a reader macro, chokes on `lambda` as a + * parameter name, reader conditionals inside `loop`, parameterised format + * directives inside strings, `{}[]`/backslash escapes in symbol names, and + * block comments containing parens. Reaching 100% parse coverage on the + * Clozure ANSI suite required ~12 source-preprocessing patches, each a workaround + * for the grammar's incompleteness. S-expressions are trivially tokenisable, so + * a hand-rolled tokenizer + recursive-descent parser (modelled on the CCL ARM64 + * port's own `tools/extract-ppc2-section.py`) parses ALL of it with zero + * preprocessing: the atom reader simply consumes everything up to whitespace or + * a structural delimiter, which handles `^ { } [ ] \+ pkg:sym ccl.bug#252a` for + * free. 
+ * + * The extraction logic (which forms produce which node kinds, how call edges + * are emitted and which positions are suppressed) is a direct port of the + * earlier tree-sitter-based extractor; see the per-handler comments. + */ + +// ============================================================================= +// Form-classification constants (lower-cased, package-prefix stripped) +// ============================================================================= + +const VAR_HEADS = new Set(['defvar', 'defparameter', 'defglobal']); +const CONST_HEADS = new Set(['defconstant', 'defconst', 'define-constant']); +const CLASS_HEADS = new Set(['defclass', 'define-condition']); +const STRUCT_HEADS = new Set(['defstruct']); +const TYPE_ALIAS_HEADS = new Set(['deftype']); +const PACKAGE_HEADS = new Set(['defpackage', 'define-package']); +const IMPORT_HEADS = new Set(['require', 'use-package', 'import-from', 'load']); +const ASDF_IMPORT_HEADS = new Set(['asdf:load-system', 'load-system']); + +// (defun|defmacro|defmethod|defgeneric|lambda …) — function-defining forms. +const DEFUN_HEADS = new Set(['defun', 'defmacro', 'defmethod', 'defgeneric', 'lambda']); + +// (let ((var val) …) body…) / let* / symbol-macrolet / prog / prog* — same +// leading shape (binding list + body). +const LET_HEADS = new Set(['let', 'let*', 'symbol-macrolet', 'prog', 'prog*']); +// (flet ((name (params) body…) …) outer-body…) / labels / macrolet. +const FLET_HEADS = new Set(['flet', 'labels', 'macrolet']); +// (do ((var init [step]) …) (end-test result…) body…) / do*. +const DO_HEADS = new Set(['do', 'do*']); +// (dolist (var list [result]) body…) / dotimes / with-open-file / … — first +// arg is a binding list whose FIRST element is the bound name. 
+const SINGLE_BINDING_HEADS = new Set([ + 'dolist', 'dotimes', + 'do-symbols', 'do-external-symbols', 'do-all-symbols', + 'with-open-file', 'with-output-to-string', 'with-input-from-string', + 'with-open-stream', 'with-input-from-pipe', 'with-output-to-pipe', + 'with-package-iterator', 'with-hash-table-iterator', +]); +// (multiple-value-bind (vars…) form body…) / (destructuring-bind pattern form body…). +const MV_BIND_HEADS = new Set(['multiple-value-bind', 'destructuring-bind']); +// Macros whose first arg is a parenthesised data list (vars + keyword opts), +// carrying no callable code. Skip arg 2; walk the body (arg 3+). +const SKIP_ARG2_HEADS = new Set(['with-slots', 'with-accessors', 'print-unreadable-object']); + +const COND_HEADS = new Set(['cond']); +const CASE_HEADS = new Set(['case', 'ecase', 'ccase', 'typecase', 'etypecase']); +const DECLARE_HEADS = new Set(['declare', 'declaim', 'proclaim']); +const FUNCALL_HEADS = new Set(['funcall', 'apply']); +// CCL vinsn-emission macros: (! VINSN …) / (!! VINSN …) — target is arg 2. +const VINSN_EMIT_HEADS = new Set(['!', '!!']); +const ASSERT_HEADS = new Set(['check-type', 'assert']); + +// Scope/control forms — present so their head doesn't become a spurious call +// edge. Binding forms and the specialised forms above are handled separately. +const CONTROL_HEADS = new Set([ + 'progn', 'prog1', 'prog2', 'block', 'tagbody', 'go', 'return', 'return-from', + 'if', 'when', 'unless', + 'and', 'or', 'not', + 'loop', + 'function', 'setf', 'setq', 'psetf', 'psetq', 'incf', 'decf', + 'quote', 'unquote', 'list', 'cons', 'car', 'cdr', + 'eval-when', 'the', + 'handler-case', 'handler-bind', 'restart-case', 'restart-bind', + 'with-condition-restarts', + 'unwind-protect', 'ignore-errors', + 'catch', 'throw', 'progv', + 'in-package', 'eval', +]); + +// Standard def-forms with dedicated handlers (defun-family handled even +// earlier). 
defsetf/define-modify-macro/define-symbol-macro/etc are +// intentionally OMITTED so the user-defmacro fallback surfaces them. +const KNOWN_DEF_HEADS = new Set([ + ...VAR_HEADS, ...CONST_HEADS, ...CLASS_HEADS, ...STRUCT_HEADS, + ...TYPE_ALIAS_HEADS, ...PACKAGE_HEADS, + 'defun', 'defmacro', 'defmethod', 'defgeneric', +]); + +// Heuristic for user-defined defining macros (CCL: def-x86-opcode, +// define-arm-vinsn, defcommand, deftest, …). Scoped to top-level positions. +const DEF_FALLBACK_RE = /^def/i; + +// ============================================================================= +// Pure helpers (symbol-name handling) +// ============================================================================= + +// Strip the package prefix at the last UNescaped colon (`\:` is a literal +// colon in a name, not a separator — e.g. the ANSI test `format.\:{.6`). +function baseSymbol(text: string): string { + for (let i = text.length - 1; i >= 0; i--) { + if (text[i] !== ':') continue; + let bs = 0; + let k = i - 1; + while (k >= 0 && text[k] === '\\') { bs++; k--; } + if (bs % 2 === 0) return text.slice(i + 1); + } + return text; +} + +// `#:my-app` / `:my-app` / `"my-app"` → `my-app`. +function cleanName(text: string): string { + return text.trim().replace(/^#?:/, '').replace(/^["']|["']$/g, ''); +} + +// ============================================================================= +// S-expression model +// ============================================================================= + +type SKind = 'list' | 'sym' | 'keyword' | 'number' | 'string' | 'quote'; + +interface Sexp { + kind: SKind; + /** Original source slice (verbatim — what getNodeText returned in the TS port). */ + text: string; + /** For lists: child forms (comments already dropped). For quote: [innerForm]. */ + children: Sexp[]; + /** For quote nodes: the prefix marker (`'`, `` ` ``, `,`, `,@`, `#'`, `#+x`, `#-x`). 
*/ + prefix?: string; + startLine: number; // 1-based + startCol: number; // 0-based + endLine: number; // 1-based + endCol: number; // 0-based +} + +interface CommentTok { + text: string; + startLine: number; + endLine: number; +} + +// Classify a bare atom's text (strings are tagged during tokenisation). +function classifyAtom(text: string): SKind { + if (/^:/.test(text)) return 'keyword'; + // Numbers: integer / float / ratio / scientific. Names like `*x*`, `1+`, + // `+max+`, `foo.1`, `.SPx` must stay symbols. + if ( + /^[+-]?\d+$/.test(text) || + /^[+-]?\d+\/\d+$/.test(text) || + /^[+-]?\d*\.\d+([eEdDsSfFlL][+-]?\d+)?$/.test(text) || + /^[+-]?\d+\.?\d*[eEdDsSfFlL][+-]?\d+$/.test(text) || + /^[+-]?\d+\.$/.test(text) + ) { + return 'number'; + } + return 'sym'; +} + +// ============================================================================= +// Tokenizer + parser +// ============================================================================= + +type TokType = 'lparen' | 'rparen' | 'string' | 'quote' | 'atom' | 'comment'; + +interface Tok { + type: TokType; + text: string; + startLine: number; startCol: number; + endLine: number; endCol: number; +} + +const DELIM = new Set(['(', ')', '"', "'", '`', ',', ';', ' ', '\t', '\n', '\r', '\f']); // chars that terminate an atom + +function tokenize(src: string): { tokens: Tok[]; comments: CommentTok[] } { // splits src into structural tokens; comments are collected separately + const tokens: Tok[] = []; + const comments: CommentTok[] = []; + const n = src.length; + let i = 0; + let row = 0; // 0-based + let col = 0; // 0-based + + // Advance the cursor to absolute index `to` (clamped to EOF), updating row/col over consumed chars. 
+ const advanceTo = (to: number): void => { + while (i < to && i < n) { // clamp: a dangling backslash escape at EOF makes callers pass n + 1 + if (src[i] === '\n') { row++; col = 0; } + else { col++; } + i++; + } + }; + const push = (type: TokType, start: number, end: number, sRow: number, sCol: number): void => { + const eRow = row, eCol = col; + const t: Tok = { + type, text: src.slice(start, end), + startLine: sRow + 1, startCol: sCol, endLine: eRow + 1, endCol: eCol, + }; + if (type === 'comment') comments.push({ text: t.text, startLine: t.startLine, endLine: t.endLine }); + else tokens.push(t); + }; + + while (i < n) { + const c = src[i]!; + const sRow = row, sCol = col, start = i; + + if (c === ' ' || c === '\t' || c === '\n' || c === '\r' || c === '\f') { + advanceTo(i + 1); + continue; + } + if (c === ';') { + let j = i + 1; + while (j < n && src[j] !== '\n') j++; + advanceTo(j); + push('comment', start, j, sRow, sCol); + continue; + } + if (c === '"') { + let j = i + 1; + while (j < n) { + if (src[j] === '\\') { j += 2; continue; } + if (src[j] === '"') { j++; break; } + j++; + } + advanceTo(j); + push('string', start, j, sRow, sCol); + continue; + } + if (c === '(') { advanceTo(i + 1); push('lparen', start, i, sRow, sCol); continue; } + if (c === ')') { advanceTo(i + 1); push('rparen', start, i, sRow, sCol); continue; } + if (c === "'" || c === '`') { advanceTo(i + 1); push('quote', start, i, sRow, sCol); continue; } + if (c === ',') { + const j = (src[i + 1] === '@') ? 
i + 2 : i + 1; + advanceTo(j); + push('quote', start, j, sRow, sCol); + continue; + } + if (c === '#') { + const c1 = src[i + 1]; + // #| … |# nested block comment + if (c1 === '|') { + let j = i + 2; + let depth = 1; + while (j < n && depth > 0) { + if (src[j] === '#' && src[j + 1] === '|') { depth++; j += 2; } + else if (src[j] === '|' && src[j + 1] === '#') { depth--; j += 2; } + else j++; + } + advanceTo(j); + push('comment', start, j, sRow, sCol); + continue; + } + // #+ / #- reader conditional → quote-like prefix (transparent in walk) + if (c1 === '+' || c1 === '-') { + advanceTo(i + 2); + push('quote', start, i, sRow, sCol); + continue; + } + // #' function quote → quote-like prefix + if (c1 === "'") { + advanceTo(i + 2); + push('quote', start, i, sRow, sCol); + continue; + } + // #\char, #xNN, #(…) vector, #:sym, #.expr, etc. — read as an atom. + // #\X reads the char (incl. structural ones like `#\(`); the rest read + // up to whitespace/delimiter. + let j = i + 1; + if (src[j] === '\\') { + j += 2; // consume the escaped char itself (may be `(`/`)`/space) + while (j < n && !DELIM.has(src[j]!)) j++; + } else { + while (j < n && !DELIM.has(src[j]!)) j++; + } + advanceTo(j); + push('atom', start, j, sRow, sCol); + continue; + } + // Plain atom: read up to whitespace or a structural delimiter. A backslash + // escapes the next char (so `\(`/`\ `/`\:` stay inside the atom). + let j = i; + while (j < n) { + if (src[j] === '\\') { j += 2; continue; } + if (DELIM.has(src[j]!)) break; + j++; + } + if (j === i) j = i + 1; // never stall on a stray char + advanceTo(j); + push('atom', start, j, sRow, sCol); + } + + return { tokens, comments }; +} + +// Recursive-descent parse into a forest of top-level Sexp forms. Leading quote +// tokens attach as a `quote` wrapper around the following form. Unbalanced +// parens are tolerated (auto-closed at EOF) so a malformed file still yields +// whatever parsed cleanly. 
/**
 * Build a forest of top-level forms from a token stream.
 *
 * - Leading quote-like tokens wrap the following form in `quote` nodes.
 * - A list left open at EOF is closed at the last consumed token.
 * - Stray `)` at top level are skipped.
 * - A quote-like prefix with nothing to attach to (followed by `)` or EOF)
 *   becomes a bare `sym` carrying the prefix text, and the `)` is left for the
 *   enclosing list. Treating that `)` as stray would swallow the list's closer
 *   and make the list absorb every following form.
 *
 * @param tokens - Tokens in source order (comments already removed).
 * @returns Top-level forms in source order.
 */
function parse(tokens: Tok[]): Sexp[] {
  let pos = 0;
  const atEnd = (): boolean => pos >= tokens.length;
  const peek = (): Tok | undefined => tokens[pos];

  // Childless node spanning exactly one token.
  const leaf = (t: Tok, kind: SKind): Sexp => ({
    kind, text: t.text, children: [],
    startLine: t.startLine, startCol: t.startCol, endLine: t.endLine, endCol: t.endCol,
  });

  function atomSexp(t: Tok): Sexp {
    return leaf(t, t.type === 'string' ? 'string' : classifyAtom(t.text));
  }

  function parseOne(): Sexp | null {
    // Skip stray close parens iteratively (a recursive skip costs one stack
    // frame per paren). Inside a list this loop never runs: the list loop
    // handles `)` before calling parseOne.
    while (!atEnd() && peek()!.type === 'rparen') pos++;
    if (atEnd()) return null;

    // Gather leading quote prefixes.
    const prefixes: Tok[] = [];
    while (!atEnd() && peek()!.type === 'quote') prefixes.push(tokens[pos++]!);

    const t = peek();
    if (!t || t.type === 'rparen') {
      // Dangling prefix (EOF or `)` follows) — represent the last prefix as a
      // bare sym so positions survive, and do NOT consume the `)`.
      const q = prefixes[prefixes.length - 1];
      return q ? leaf(q, 'sym') : null;
    }

    let form: Sexp;
    if (t.type === 'lparen') {
      pos++; // consume (
      const children: Sexp[] = [];
      let close: Tok | null = null;
      while (!atEnd()) {
        if (peek()!.type === 'rparen') { close = tokens[pos++]!; break; }
        const child = parseOne();
        if (!child) break;
        children.push(child);
      }
      // Unclosed list: end at the last token consumed (or the `(` itself).
      const endTok = close ?? tokens[pos - 1] ?? t;
      form = {
        kind: 'list', text: '', children,
        startLine: t.startLine, startCol: t.startCol,
        endLine: endTok.endLine, endCol: endTok.endCol,
      };
    } else {
      pos++;
      form = atomSexp(t);
    }

    // Wrap with quote prefixes (innermost first). The wrapper is transparent
    // to the walker except where `#'name` / `'name` is explicitly unwrapped.
    for (let k = prefixes.length - 1; k >= 0; k--) {
      const q = prefixes[k]!;
      form = {
        kind: 'quote', text: q.text, prefix: q.text, children: [form],
        startLine: q.startLine, startCol: q.startCol, endLine: form.endLine, endCol: form.endCol,
      };
    }
    return form;
  }

  const forms: Sexp[] = [];
  while (!atEnd()) {
    const before = pos;
    const f = parseOne();
    if (f) forms.push(f);
    if (pos === before) pos++; // guard against non-advancing loop
  }
  return forms;
}

// =============================================================================
// Sexp accessors mirroring the tree-sitter helpers
// =============================================================================

/** True when the node is a symbol (not a keyword, number, string, list or quote). */
function isSymbolish(s: Sexp): boolean {
  return s.kind === 'sym';
}

/**
 * Name of a `sym` node after passing its trimmed text through `baseSymbol`.
 *
 * @returns The name, or `undefined` for non-symbols and empty results.
 */
function symbolName(s: Sexp): string | undefined {
  if (s.kind !== 'sym') return undefined;
  const t = baseSymbol(s.text.trim());
  return t || undefined;
}

/** n-th child (1-indexed), or `null` when out of range. Children already exclude comments. */
function nthArg(s: Sexp, n: number): Sexp | null {
  return s.children[n - 1] ?? null;
}

/** Yield the children of `s` from the 1-indexed position `start` to the end. */
function* argsFrom(s: Sexp, start: number): Generator<Sexp> {
  for (let i = start - 1; i < s.children.length; i++) yield s.children[i]!;
}

/**
 * Resolve `#'name` / `'name` (incl. `#'pkg:name`) to its symbol name.
 *
 * @returns The name, or `null` when `s` is not a `'`/`#'` quote of a symbol.
 */
function unwrapQuotedSymbol(s: Sexp): string | null {
  if (s.kind !== 'quote') return null;
  if (s.prefix !== "'" && s.prefix !== "#'") return null;
  const inner = s.children[0];
  if (!inner) return null;
  return symbolName(inner) ?? null;
}
// =============================================================================
// Extractor
// =============================================================================

/**
 * Walks the parsed s-expression forest of one Lisp file and produces graph
 * nodes, `contains` edges and unresolved references (calls / extends / imports).
 *
 * An instance is built for a single file; `extract()` accumulates into the
 * instance's arrays and returns them.
 */
export class LispExtractor {
  private filePath: string;
  private source: string;
  private nodes: Node[] = [];
  private edges: Edge[] = [];
  private unresolvedReferences: UnresolvedReference[] = [];
  private errors: ExtractionError[] = [];
  // Ids of the enclosing nodes, outermost first; the top is the current parent.
  private nodeStack: string[] = [];
  private nodeById = new Map<string, Node>();
  // First comment (in source order) ending on each 1-based line, for docBefore.
  private commentByEndLine = new Map<number, CommentTok>();
  // Source split on '\n', computed once on first use.
  private sourceLines: string[] | null = null;

  constructor(filePath: string, source: string) {
    this.filePath = filePath;
    this.source = source;
  }

  /**
   * Tokenize, parse and walk the source.
   *
   * Any exception thrown while doing so is recorded as a `parse_error` entry
   * in `errors`; whatever was collected up to that point is still returned.
   */
  extract(): ExtractionResult {
    const startTime = Date.now();
    try {
      const { tokens, comments } = tokenize(this.source);
      this.commentByEndLine = new Map<number, CommentTok>();
      for (const c of comments) {
        // Keep the first comment per end line (matches a first-match lookup).
        if (!this.commentByEndLine.has(c.endLine)) this.commentByEndLine.set(c.endLine, c);
      }
      const forms = parse(tokens);

      const fileNode: Node = {
        id: `file:${this.filePath}`,
        kind: 'file',
        name: this.filePath.split(/[\\/]/).pop() || this.filePath,
        qualifiedName: this.filePath,
        filePath: this.filePath,
        language: 'lisp',
        startLine: 1,
        endLine: Math.max(1, this.getSourceLines().length),
        startColumn: 0,
        endColumn: 0,
        isExported: false,
        updatedAt: Date.now(),
      };
      this.nodes.push(fileNode);
      this.nodeById.set(fileNode.id, fileNode);
      this.nodeStack.push(fileNode.id);

      for (const form of forms) this.processForm(form);

      this.nodeStack.pop();
    } catch (error) {
      this.errors.push({
        message: `Lisp extraction error: ${error instanceof Error ? error.message : String(error)}`,
        filePath: this.filePath,
        severity: 'error',
        code: 'parse_error',
      });
    }

    return {
      nodes: this.nodes,
      edges: this.edges,
      unresolvedReferences: this.unresolvedReferences,
      errors: this.errors,
      durationMs: Date.now() - startTime,
    };
  }

  // --- node/scope plumbing (port of TreeSitterExtractor.createNode) ---------

  /** Join the names of all non-file nodes on the scope stack plus `name` with `::`. */
  private buildQualifiedName(name: string): string {
    const parts: string[] = [];
    for (const id of this.nodeStack) {
      const n = this.nodeById.get(id);
      if (n && n.kind !== 'file') parts.push(n.name);
    }
    parts.push(name);
    return parts.join('::');
  }

  /**
   * Create and register a node spanning `s`, plus a `contains` edge from the
   * current scope (top of `nodeStack`), if any.
   *
   * @param extra - Fields merged last, so they override the defaults
   *   (e.g. an explicit `qualifiedName`).
   * @returns The node, or `null` when `name` is empty.
   */
  private createNode(
    kind: NodeKind,
    name: string,
    s: Sexp,
    extra: Partial<Node> = {}
  ): Node | null {
    if (!name) return null;
    const id = generateNodeId(this.filePath, kind, name, s.startLine);
    const node: Node = {
      id,
      kind,
      name,
      qualifiedName: this.buildQualifiedName(name),
      filePath: this.filePath,
      language: 'lisp',
      startLine: s.startLine,
      endLine: s.endLine,
      startColumn: s.startCol,
      endColumn: s.endCol,
      updatedAt: Date.now(),
      ...extra,
    };
    this.nodes.push(node);
    this.nodeById.set(id, node);
    const parentId = this.nodeStack[this.nodeStack.length - 1];
    if (parentId) this.edges.push({ source: parentId, target: id, kind: 'contains' });
    return node;
  }

  private addRef(ref: UnresolvedReference): void {
    this.unresolvedReferences.push(ref);
  }

  /**
   * Best-effort docstring: the contiguous run of comments ending on the line
   * directly above `startLine`, with `;` / `#|` / `|#` markers stripped.
   *
   * @returns The cleaned text, or `undefined` when there is no such comment
   *   or it is empty after cleaning.
   */
  private docBefore(startLine: number): string | undefined {
    const blocks: string[] = [];
    let target = startLine - 1;
    // Walk upwards: each hit moves the target to the line above that comment.
    // A comment never starts after it ends, so `target` strictly decreases.
    for (let c = this.commentByEndLine.get(target); c; c = this.commentByEndLine.get(target)) {
      blocks.unshift(c.text);
      target = c.startLine - 1;
    }
    if (!blocks.length) return undefined;
    const cleaned = blocks
      .map((c) => c.replace(/^#\|/, '').replace(/\|#$/, '').replace(/^;+/gm, '').trim())
      .join('\n')
      .trim();
    return cleaned || undefined;
  }

  // --- walker ---------------------------------------------------------------

  /**
   * Dispatch one form on its head symbol. The order of the checks is
   * significant: defining forms first, then binding forms, specialised
   * control forms, plain control forms, and finally "ordinary call".
   */
  private processForm(s: Sexp): void {
    if (s.kind !== 'list') {
      // quote wrappers and any composite get descended transparently.
      for (const c of s.children) this.processForm(c);
      return;
    }
    const first = s.children[0];
    if (!first) return;

    // Head text only when the head is a bare/package-qualified symbol.
    if (first.kind !== 'sym') {
      for (const c of s.children) this.processForm(c);
      return;
    }
    const head = first.text.trim();
    const headBase = baseSymbol(head).toLowerCase();

    // Function-defining forms first (incl. lambda).
    if (DEFUN_HEADS.has(headBase)) { this.extractDefun(s, headBase); return; }

    if (VAR_HEADS.has(headBase)) { this.extractVarOrConst(s, 'variable'); return; }
    if (CONST_HEADS.has(headBase)) { this.extractVarOrConst(s, 'constant'); return; }
    if (CLASS_HEADS.has(headBase)) { this.extractDefclass(s); return; }
    if (STRUCT_HEADS.has(headBase)) { this.extractDefstruct(s); return; }
    if (TYPE_ALIAS_HEADS.has(headBase)) {
      const nameNode = nthArg(s, 2);
      if (nameNode) { const name = symbolName(nameNode); if (name) this.createNode('type_alias', name, s); }
      return;
    }
    if (PACKAGE_HEADS.has(headBase)) { this.extractDefpackage(s); return; }
    if (IMPORT_HEADS.has(headBase) || ASDF_IMPORT_HEADS.has(head.toLowerCase())) { this.extractImport(s); return; }
    if (KNOWN_DEF_HEADS.has(headBase)) return;

    // User-defined `def*` DSL macros (top-level only).
    if (DEF_FALLBACK_RE.test(headBase) && this.shouldApplyDefFallback()) {
      this.extractUserDefMacro(s, head);
      return;
    }

    // Binding forms.
    if (LET_HEADS.has(headBase)) { this.handleLet(s); return; }
    if (FLET_HEADS.has(headBase)) { this.handleFlet(s); return; }
    if (DO_HEADS.has(headBase)) { this.handleDo(s); return; }
    if (SINGLE_BINDING_HEADS.has(headBase)) { this.handleSingleBinding(s); return; }
    if (MV_BIND_HEADS.has(headBase) || SKIP_ARG2_HEADS.has(headBase)) {
      // Arg 2 holds names, not code; walk everything after it.
      for (const a of argsFrom(s, 3)) this.processForm(a);
      return;
    }

    // Specialised control forms.
    if (DECLARE_HEADS.has(headBase)) return;
    if (COND_HEADS.has(headBase)) {
      for (const clause of argsFrom(s, 2)) {
        if (clause.kind !== 'list') continue;
        for (const c of clause.children) this.processForm(c);
      }
      return;
    }
    if (CASE_HEADS.has(headBase)) {
      const keyform = nthArg(s, 2);
      if (keyform) this.processForm(keyform);
      for (const clause of argsFrom(s, 3)) {
        if (clause.kind !== 'list') continue;
        // Skip the clause's key(s); only its body forms are code.
        for (const c of argsFrom(clause, 2)) this.processForm(c);
      }
      return;
    }
    if (FUNCALL_HEADS.has(headBase)) { this.handleFuncall(s); return; }
    if (VINSN_EMIT_HEADS.has(headBase)) { this.handleVinsnEmit(s); return; }
    if (ASSERT_HEADS.has(headBase)) { this.handleAssertOrCheckType(s, headBase); return; }

    // Plain control forms.
    if (CONTROL_HEADS.has(headBase)) {
      for (const a of argsFrom(s, 2)) this.processForm(a);
      return;
    }

    // Default: a function call.
    this.emitCall(s, head);
    for (const a of argsFrom(s, 2)) this.processForm(a);
  }

  // --- binding-form handlers ------------------------------------------------

  /** `(let ((var init…) …) body…)`: walk each list binding's forms after the name, then the body. */
  private handleLet(s: Sexp): void {
    const bindings = nthArg(s, 2);
    if (bindings?.kind === 'list') {
      for (const binding of bindings.children) {
        if (binding.kind !== 'list') continue;
        for (const c of argsFrom(binding, 2)) this.processForm(c);
      }
    }
    for (const a of argsFrom(s, 3)) this.processForm(a);
  }

  /**
   * `(flet ((name params body…) …) body…)`: each binding whose name is a
   * symbol becomes a `function` node; its body is walked in that node's scope.
   */
  private handleFlet(s: Sexp): void {
    const bindings = nthArg(s, 2);
    if (bindings?.kind === 'list') {
      for (const binding of bindings.children) {
        if (binding.kind !== 'list') continue;
        const nameNode = nthArg(binding, 1);
        if (!nameNode || nameNode.kind !== 'sym') continue;
        const fnName = nameNode.text.trim();
        const paramsNode = nthArg(binding, 2);
        // List nodes carry no text of their own, so take the parameter list
        // from the source span (same as extractDefun does for lambda-lists).
        const sig = paramsNode ? this.slice(paramsNode) : undefined;
        const innerFn = this.createNode('function', fnName, binding, { signature: sig });
        if (!innerFn) continue;
        this.nodeStack.push(innerFn.id);
        for (const c of argsFrom(binding, 3)) this.processForm(c);
        this.nodeStack.pop();
      }
    }
    for (const a of argsFrom(s, 3)) this.processForm(a);
  }

  /** `(do ((var init step) …) (end-test result…) body…)`. */
  private handleDo(s: Sexp): void {
    const bindings = nthArg(s, 2);
    if (bindings?.kind === 'list') {
      for (const binding of bindings.children) {
        if (binding.kind !== 'list') continue;
        for (const c of argsFrom(binding, 2)) this.processForm(c);
      }
    }
    const term = nthArg(s, 3);
    if (term?.kind === 'list') {
      for (const c of term.children) this.processForm(c);
    }
    for (const a of argsFrom(s, 4)) this.processForm(a);
  }

  /** `(head (var form…) body…)`: skip the bound name, walk the rest. */
  private handleSingleBinding(s: Sexp): void {
    const binding = nthArg(s, 2);
    if (binding?.kind === 'list') {
      for (const c of argsFrom(binding, 2)) this.processForm(c);
    }
    for (const a of argsFrom(s, 3)) this.processForm(a);
  }

  /**
   * `check-type`: walk the place (arg 2), skip arg 3, walk the rest.
   * Any other head routed here: walk the test (arg 2), the members of the
   * list at arg 3, and everything from arg 4 on.
   */
  private handleAssertOrCheckType(s: Sexp, headBase: string): void {
    if (headBase === 'check-type') {
      const place = nthArg(s, 2);
      if (place) this.processForm(place);
      for (const a of argsFrom(s, 4)) this.processForm(a);
      return;
    }
    const test = nthArg(s, 2);
    if (test) this.processForm(test);
    const placesList = nthArg(s, 3);
    if (placesList?.kind === 'list') {
      for (const p of placesList.children) this.processForm(p);
    }
    for (const a of argsFrom(s, 4)) this.processForm(a);
  }

  /** Indirection form whose arg 2 names the real target: emit the call to that name. */
  private handleVinsnEmit(s: Sexp): void {
    const target = nthArg(s, 2);
    if (target) {
      if (isSymbolish(target)) {
        const name = symbolName(target);
        if (name) this.emitCall(s, name);
      } else {
        this.processForm(target);
      }
    }
    for (const a of argsFrom(s, 3)) this.processForm(a);
  }

  /** `(funcall #'f …)` / `(apply 'f …)`: emit a call to `f` when arg 2 is a quoted symbol. */
  private handleFuncall(s: Sexp): void {
    const target = nthArg(s, 2);
    if (target) {
      const quotedName = unwrapQuotedSymbol(target);
      if (quotedName) this.emitCall(s, quotedName);
      else this.processForm(target);
    }
    for (const a of argsFrom(s, 3)) this.processForm(a);
  }

  // --- defining-form handlers ----------------------------------------------

  /** `(defvar NAME [init] …)` and friends: node for NAME, then walk the init form. */
  private extractVarOrConst(s: Sexp, kind: NodeKind): void {
    const nameNode = nthArg(s, 2);
    if (nameNode) {
      const name = symbolName(nameNode);
      if (name) this.createNode(kind, name, s, { signature: this.slice(s) });
    }
    const init = nthArg(s, 3);
    if (init) this.processForm(init);
  }

  /**
   * `(defclass NAME (supers…) (slots…) options…)`: class node, `extends`
   * references, `field` nodes per slot, `function` nodes for
   * `:accessor`/`:reader`/`:writer`, and a walk of initforms and option values.
   */
  private extractDefclass(s: Sexp): void {
    const nameNode = nthArg(s, 2);
    if (!nameNode) return;
    const className = symbolName(nameNode);
    if (!className) return;
    const classNode = this.createNode('class', className, s, { docstring: this.docBefore(s.startLine) });
    if (!classNode) return;

    for (const sup of this.defclassSupers(s)) {
      this.addRef({
        fromNodeId: classNode.id,
        referenceName: baseSymbol(sup),
        referenceKind: 'extends',
        line: s.startLine,
        column: s.startCol,
      });
    }

    const slotList = nthArg(s, 4);
    if (slotList?.kind === 'list') {
      this.nodeStack.push(classNode.id);
      for (const slot of slotList.children) {
        if (isSymbolish(slot)) {
          // Bare slot name.
          const slotName = symbolName(slot);
          if (slotName) this.createNode('field', slotName, slot, {});
          continue;
        }
        if (slot.kind !== 'list') continue;
        const slotNameNode = nthArg(slot, 1);
        if (!slotNameNode) continue;
        const slotName = symbolName(slotNameNode);
        if (!slotName) continue;
        this.createNode('field', slotName, slot, { signature: this.slice(slot) });

        // Walk slot keyword options.
        for (let j = 0; j < slot.children.length; j++) {
          const c = slot.children[j]!;
          if (c.kind !== 'keyword') continue;
          const kwText = c.text.trim().toLowerCase();
          const value = slot.children[j + 1];
          if (!value) continue;
          if (kwText === ':accessor' || kwText === ':reader' || kwText === ':writer') {
            if (isSymbolish(value)) {
              const accName = symbolName(value);
              if (accName) this.createNode('function', accName, value, { signature: kwText.slice(1) });
            }
          } else if (kwText === ':initform' || kwText === ':default') {
            this.processForm(value);
          }
        }
      }
      this.nodeStack.pop();
    }

    // Class options (arg 5+) — walk option value forms for callable code.
    for (const opt of argsFrom(s, 5)) {
      if (opt.kind !== 'list') continue;
      for (const c of argsFrom(opt, 2)) this.processForm(c);
    }
  }

  /** `(defstruct NAME-OR-(NAME opts…) slots…)`: struct node plus a `field` per slot. */
  private extractDefstruct(s: Sexp): void {
    const hdr = this.defstructHeader(s);
    if (!hdr) return;
    const structNode = this.createNode('struct', hdr.name, s, { docstring: this.docBefore(s.startLine) });
    if (!structNode) return;

    this.nodeStack.push(structNode.id);
    for (const slot of argsFrom(s, hdr.slotsStartArg)) {
      if (slot.kind === 'string') continue; // docstring slot
      if (isSymbolish(slot)) {
        const slotName = symbolName(slot);
        if (slotName) this.createNode('field', slotName, slot, {});
        continue;
      }
      if (slot.kind !== 'list') continue;
      const slotNameNode = nthArg(slot, 1);
      if (!slotNameNode) continue;
      const slotName = symbolName(slotNameNode);
      if (!slotName) continue;
      this.createNode('field', slotName, slot, { signature: this.slice(slot) });
      // Only list-shaped defaults can contain calls.
      const def = nthArg(slot, 2);
      if (def?.kind === 'list') this.processForm(def);
    }
    this.nodeStack.pop();
  }

  /**
   * `(defpackage NAME options…)`: namespace node (nicknames in its signature),
   * `import` nodes + `imports` references for `:use` / `:import-from`,
   * `import` nodes for `:shadowing-import-from`, `export` nodes for `:export`.
   */
  private extractDefpackage(s: Sexp): void {
    const second = nthArg(s, 2);
    if (!second) return;
    const name = cleanName(second.text);
    if (!name) return;

    // First pass: nicknames are needed before the namespace node is created.
    const nicknames: string[] = [];
    for (const opt of argsFrom(s, 3)) {
      if (opt.kind !== 'list') continue;
      const optHead = opt.children[0];
      if (!optHead || optHead.kind !== 'keyword') continue;
      if (cleanName(optHead.text).toLowerCase() !== 'nicknames') continue;
      for (const val of argsFrom(opt, 2)) {
        const t = cleanName(val.text);
        if (t) nicknames.push(t);
      }
    }
    const nsSignature = nicknames.length ? `nicknames: ${nicknames.join(', ')}` : undefined;
    const nsNode = this.createNode('namespace', name, s, { signature: nsSignature });
    if (!nsNode) return;

    // Second pass: imports/exports are created inside the namespace scope.
    this.nodeStack.push(nsNode.id);
    for (const opt of argsFrom(s, 3)) {
      if (opt.kind !== 'list') continue;
      const optHead = opt.children[0];
      if (!optHead || optHead.kind !== 'keyword') continue;
      const optName = cleanName(optHead.text).toLowerCase();

      if (optName === 'use') {
        for (const val of argsFrom(opt, 2)) {
          const pkg = cleanName(val.text);
          if (!pkg) continue;
          const imp = this.createNode('import', pkg, val, { signature: `:use ${pkg}` });
          if (imp) this.addRef({ fromNodeId: nsNode.id, referenceName: pkg, referenceKind: 'imports', line: val.startLine, column: val.startCol });
        }
      } else if (optName === 'import-from') {
        const pkgNode = nthArg(opt, 2);
        const pkgName = pkgNode ? cleanName(pkgNode.text) : '';
        if (pkgName && pkgNode) {
          this.createNode('import', pkgName, pkgNode, { signature: `:import-from ${pkgName}` });
          this.addRef({ fromNodeId: nsNode.id, referenceName: pkgName, referenceKind: 'imports', line: pkgNode.startLine, column: pkgNode.startCol });
        }
        for (const val of argsFrom(opt, 3)) {
          const sym = cleanName(val.text);
          if (!sym) continue;
          this.createNode('import', sym, val, { signature: pkgName ? `from ${pkgName}` : `:import-from` });
          this.addRef({ fromNodeId: nsNode.id, referenceName: sym, referenceKind: 'imports', line: val.startLine, column: val.startCol });
        }
      } else if (optName === 'export') {
        for (const val of argsFrom(opt, 2)) {
          const sym = cleanName(val.text);
          if (!sym) continue;
          this.createNode('export', sym, val, { signature: `:export ${sym}` });
        }
      } else if (optName === 'shadowing-import-from') {
        const pkgNode = nthArg(opt, 2);
        const pkgName = pkgNode ? cleanName(pkgNode.text) : '';
        for (const val of argsFrom(opt, 3)) {
          const sym = cleanName(val.text);
          if (!sym) continue;
          this.createNode('import', sym, val, { signature: pkgName ? `shadowing from ${pkgName}` : `:shadowing-import-from` });
        }
      }
    }
    this.nodeStack.pop();
  }

  /**
   * `(require X)`-style forms: `import` node named after arg 2 plus an
   * `imports` reference from the current scope.
   */
  private extractImport(s: Sexp): void {
    let nameNode = nthArg(s, 2);
    // A quote wrapper's own text is just the prefix (`'`), so `(require 'foo)`
    // must be unwrapped to reach the module name.
    while (nameNode && nameNode.kind === 'quote') nameNode = nameNode.children[0] ?? null;
    if (!nameNode) return;
    const mod = cleanName(nameNode.text);
    if (!mod) return;
    const imp = this.createNode('import', mod, s, { signature: this.slice(s) });
    if (!imp) return;
    const parentId = this.nodeStack[this.nodeStack.length - 1];
    if (parentId) this.addRef({ fromNodeId: parentId, referenceName: mod, referenceKind: 'imports', line: s.startLine, column: s.startCol });
  }

  /** The `def*` fallback applies unless the current scope's id starts with `function:` or `method:`. */
  private shouldApplyDefFallback(): boolean {
    const topId = this.nodeStack[this.nodeStack.length - 1];
    if (!topId) return false;
    return !topId.startsWith('function:') && !topId.startsWith('method:');
  }

  /**
   * User-defined `def*` macro call: `function` node named after arg 2 (or the
   * first symbol inside it when arg 2 is a list); the remaining args are
   * walked in its scope, except the first list among them.
   */
  private extractUserDefMacro(s: Sexp, head: string): void {
    const nameNode = nthArg(s, 2);
    if (!nameNode) return;
    let macroName: string | undefined;
    if (isSymbolish(nameNode)) {
      macroName = symbolName(nameNode);
    } else if (nameNode.kind === 'list') {
      for (const c of nameNode.children) {
        if (isSymbolish(c)) { macroName = symbolName(c); break; }
      }
    }
    if (!macroName) return;

    const macroNode = this.createNode('function', macroName, s, {
      signature: `(${head} ...)`,
      docstring: this.docBefore(s.startLine),
    });
    if (!macroNode) return;

    this.nodeStack.push(macroNode.id);
    // NOTE(review): the first list after the name is skipped, presumably
    // because it is a parameter/option list rather than code — confirm.
    let skippedFirstList = false;
    for (const arg of argsFrom(s, 3)) {
      if (!skippedFirstList && arg.kind === 'list') { skippedFirstList = true; continue; }
      this.processForm(arg);
    }
    this.nodeStack.pop();
  }

  /**
   * `defun`-family forms and `lambda`.
   *
   * `lambda` creates no node; its body (after the lambda-list) is walked.
   * Otherwise a `function` (or `method` for `defmethod`) node is created and
   * the body is walked in its scope. A method whose first specialised
   * parameter names a class defined in this file is contained by that class.
   */
  private extractDefun(s: Sexp, headBase: string): void {
    // lambda — anonymous: walk body (everything after the lambda-list).
    if (headBase === 'lambda') {
      for (const a of argsFrom(s, 3)) this.processForm(a);
      return;
    }

    const isMethod = headBase === 'defmethod';
    const nameNode = nthArg(s, 2);
    if (!nameNode) return;

    // Name: a bare sym (verbatim) or a `(setf foo)` list.
    let name: string | undefined;
    if (nameNode.kind === 'sym') name = nameNode.text.trim();
    else if (nameNode.kind === 'list') name = this.slice(nameNode);
    if (!name) return;

    // defmethod qualifiers (`:before`/`:after`/`:around`) sit between the name
    // and the lambda-list; the lambda-list is the first list after the name.
    // An empty lambda-list may also be written `nil`; without recognising it,
    // the first list-shaped BODY form would be taken for the lambda-list and
    // dropped from the walk together with everything before it.
    const qualifiers: string[] = [];
    let lambdaIdx = -1;
    for (let i = 2; i < s.children.length; i++) {
      const c = s.children[i]!;
      const isNil = c.kind === 'sym' && c.text.trim().toLowerCase() === 'nil';
      if (c.kind === 'list' || isNil) { lambdaIdx = i; break; }
      if (isMethod && c.kind === 'keyword') { qualifiers.push(c.text.trim()); }
    }
    const lambdaList = lambdaIdx >= 0 ? s.children[lambdaIdx]! : null;
    const lambdaSig = lambdaList ? this.slice(lambdaList) : undefined;
    const qualStr = qualifiers.join(' ');
    const signature = qualStr ? (lambdaSig ? `${qualStr} ${lambdaSig}` : qualStr) : lambdaSig;

    const receiver = isMethod && lambdaList ? this.defmethodReceiverType(lambdaList) : undefined;
    const qualSuffix = qualStr
      ? '::' + qualifiers.map((q) => q.replace(/^:/, '')).join('::')
      : '';

    const extra: Partial<Node> = { signature, docstring: this.docBefore(s.startLine) };

    let pushedClassScope = false;
    if (receiver) {
      const owner = this.nodes.find((n) => n.name === receiver && n.filePath === this.filePath && n.kind === 'class');
      if (owner) {
        // Create the method inside the class scope so it gets the
        // contains-edge and a class-prefixed qualified name.
        this.nodeStack.push(owner.id);
        pushedClassScope = true;
        if (qualSuffix) extra.qualifiedName = `${receiver}::${name}${qualSuffix}`;
      } else {
        extra.qualifiedName = `${receiver}::${name}${qualSuffix}`;
      }
    } else if (qualSuffix) {
      extra.qualifiedName = `${name}${qualSuffix}`;
    }

    const kind: NodeKind = isMethod ? 'method' : 'function';
    const fnNode = this.createNode(kind, name, s, extra);
    if (pushedClassScope) this.nodeStack.pop();
    if (!fnNode) return;

    // Body = forms after the lambda-list.
    this.nodeStack.push(fnNode.id);
    const bodyStart = lambdaIdx >= 0 ? lambdaIdx + 1 : 2;
    for (let i = bodyStart; i < s.children.length; i++) this.processForm(s.children[i]!);
    this.nodeStack.pop();
  }

  // (defmethod NAME ((arg TYPE) …) …) — receiver = TYPE of first specialised param.
  // Stops at the first `&`-keyword; a non-symbol specialiser yields undefined.
  private defmethodReceiverType(lambdaList: Sexp): string | undefined {
    for (const arg of lambdaList.children) {
      if (arg.kind === 'sym') {
        if (arg.text.trim().startsWith('&')) return undefined;
        continue;
      }
      if (arg.kind !== 'list') continue;
      const typeNode = nthArg(arg, 2);
      if (typeNode && isSymbolish(typeNode)) return symbolName(typeNode);
      return undefined;
    }
    return undefined;
  }

  /** Symbol names in the superclass list (arg 3) of a defclass-like form. */
  private defclassSupers(s: Sexp): string[] {
    const supersList = nthArg(s, 3);
    if (!supersList || supersList.kind !== 'list') return [];
    const out: string[] = [];
    for (const sup of supersList.children) {
      const name = symbolName(sup);
      if (name) out.push(name);
    }
    return out;
  }

  /**
   * Struct name from arg 2: either a bare symbol or the first symbol of a
   * `(NAME options…)` list. Slots start at arg 3 in both cases.
   */
  private defstructHeader(s: Sexp): { name: string; slotsStartArg: number } | null {
    const second = nthArg(s, 2);
    if (!second) return null;
    if (isSymbolish(second)) {
      const name = symbolName(second);
      if (name) return { name, slotsStartArg: 3 };
    }
    if (second.kind === 'list') {
      for (const c of second.children) {
        if (isSymbolish(c)) { const name = symbolName(c); if (name) return { name, slotsStartArg: 3 }; }
      }
    }
    return null;
  }

  /** Record an unresolved `calls` reference from the current scope to `head`. */
  private emitCall(s: Sexp, head: string): void {
    const callerId = this.nodeStack[this.nodeStack.length - 1];
    if (!callerId) return;
    this.addRef({
      fromNodeId: callerId,
      referenceName: baseSymbol(head),
      referenceKind: 'calls',
      line: s.startLine,
      column: s.startCol,
    });
  }

  /** Source split on '\n', computed once (slice is called for every definition and slot). */
  private getSourceLines(): string[] {
    if (this.sourceLines === null) this.sourceLines = this.source.split('\n');
    return this.sourceLines;
  }

  // First 100 chars of a form's source text (for signatures), reconstructed
  // from its line/column span.
  private slice(s: Sexp): string {
    const lines = this.getSourceLines();
    if (s.startLine === s.endLine) {
      return (lines[s.startLine - 1] ?? '').slice(s.startCol, s.endCol).slice(0, 100);
    }
    const buf: string[] = [];
    let joinedLen = -1; // length of buf.join('\n'), tracked incrementally
    for (let ln = s.startLine; ln <= s.endLine && ln <= lines.length; ln++) {
      const line = lines[ln - 1] ?? '';
      let piece: string;
      if (ln === s.startLine) piece = line.slice(s.startCol);
      else if (ln === s.endLine) piece = line.slice(0, s.endCol);
      else piece = line;
      buf.push(piece);
      joinedLen += piece.length + 1;
      if (joinedLen > 100) break; // already have enough for the 100-char cap
    }
    return buf.join('\n').slice(0, 100);
  }
}
diff --git a/src/types.ts b/src/types.ts index 0cfaf0bba..3d35db701 100644 --- a/src/types.ts +++ b/src/types.ts @@ -87,6 +87,7 @@ export const LANGUAGES = [ 'scala', 'lua', 'luau', + 'lisp', 'objc', 'yaml', 'twig',