From 20c18c611821448d77b11c138c64d8037ff99997 Mon Sep 17 00:00:00 2001
From: Alex Cameron <asc@tetsuo.sh>
Date: Thu, 19 Feb 2026 23:31:39 +0900
Subject: [PATCH 1/2] feat: add shared memory support via \! shared header
 directive

---
 CLAUDE.md                                     |  1 +
 lib/Conversion/ForthToGPU/ForthToGPU.cpp      | 16 +++++++-
 lib/Translation/ForthToMLIR/ForthToMLIR.cpp   | 41 +++++++++++++++++--
 lib/Translation/ForthToMLIR/ForthToMLIR.h     |  2 +
 test/Conversion/ForthToGPU/shared-memory.mlir | 18 ++++++++
 test/Pipeline/shared-memory.forth             | 21 ++++++++++
 .../Forth/header-duplicate-param-error.forth  |  2 +-
 .../Forth/shared-declarations.forth           | 12 ++++++
 .../Forth/shared-ref-in-word-error.forth      |  6 +++
 9 files changed, 114 insertions(+), 5 deletions(-)
 create mode 100644 test/Conversion/ForthToGPU/shared-memory.mlir
 create mode 100644 test/Pipeline/shared-memory.forth
 create mode 100644 test/Translation/Forth/shared-declarations.forth
 create mode 100644 test/Translation/Forth/shared-ref-in-word-error.forth
diff --git a/CLAUDE.md b/CLAUDE.md
index a97eae5..4bd427f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -89,6 +89,7 @@ uv run ruff format gpu_test/
 - **Operations**: All take stack as input and produce stack as output (except `forth.stack`)
 - **Supported Words**: literals, `DUP DROP SWAP OVER ROT NIP TUCK PICK ROLL`, `+ - * / MOD`, `AND OR XOR NOT LSHIFT RSHIFT`, `= < > <> <= >= 0=`, `@ !`, `CELLS`, `IF ELSE THEN`, `BEGIN UNTIL`, `BEGIN WHILE REPEAT`, `DO LOOP +LOOP I J K`, `LEAVE UNLOOP EXIT`, `TID-X/Y/Z BID-X/Y/Z BDIM-X/Y/Z GDIM-X/Y/Z GLOBAL-ID` (GPU indexing).
 - **Kernel Parameters**: Declared in the `\!` header. `\! kernel <name>` is required and must appear first. `\! param <name> i64[<N>]` becomes a `memref<Nxi64>` argument; `\! param <name> i64` becomes an `i64` argument. Using a param name in code emits `forth.param_ref` (arrays push address; scalars push value).
+- **Shared Memory**: `\! shared <name> i64[<N>]` declares GPU shared (workgroup) memory. Emits a tagged `memref.alloca` at kernel entry; ForthToGPU converts it to a `gpu.func` workgroup attribution (`memref<Nxi64, #gpu.address_space<workgroup>>`). Using the shared name in code pushes its base address onto the stack. Cannot be referenced inside word definitions.
 - **Conversion**: `!forth.stack` → `memref<256xi64>` with explicit stack pointer
 - **GPU**: Functions wrapped in `gpu.module`, `main` gets `gpu.kernel` attribute, configured with bare pointers for NVVM conversion
 - **User-defined Words**: Modeled as `func.func` with signature `(!forth.stack) -> !forth.stack`, called via `func.call`
diff --git a/lib/Conversion/ForthToGPU/ForthToGPU.cpp b/lib/Conversion/ForthToGPU/ForthToGPU.cpp
index 7d72597..4324519 100644
--- a/lib/Conversion/ForthToGPU/ForthToGPU.cpp
+++ b/lib/Conversion/ForthToGPU/ForthToGPU.cpp
@@ -158,7 +158,10 @@ struct ConvertForthToGPUPass
     }
 
     // Clone ops from each source block into the corresponding destination
-    // block. Replace func.return with gpu.return.
+    // block, with three transformations:
+    // - func.return → gpu.return
+    // - shared memref.alloca → gpu.func workgroup attribution
+    auto *ctx = funcOp.getContext();
     for (auto [srcBlock, dstBlock] :
          llvm::zip(funcOp.getBody(), gpuFunc.getBody())) {
       rewriter.setInsertionPointToEnd(&dstBlock);
@@ -168,6 +171,17 @@ struct ConvertForthToGPUPass
           for (Value operand : returnOp.getOperands())
             remappedOperands.push_back(mapping.lookup(operand));
           rewriter.create<gpu::ReturnOp>(returnOp.getLoc(), remappedOperands);
+        } else if (auto allocaOp = dyn_cast<memref::AllocaOp>(&op);
+                   allocaOp && allocaOp->hasAttr("forth.shared_name")) {
+          auto origType = cast<MemRefType>(allocaOp.getType());
+          auto addressSpace =
+              gpu::AddressSpaceAttr::get(ctx, gpu::AddressSpace::Workgroup);
+          auto sharedType =
+              MemRefType::get(origType.getShape(), origType.getElementType(),
+                              MemRefLayoutAttrInterface{}, addressSpace);
+          BlockArgument attr =
+              gpuFunc.addWorkgroupAttribution(sharedType, allocaOp.getLoc());
+          mapping.map(allocaOp.getResult(), attr);
         } else {
           rewriter.clone(op, mapping);
         }
diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp
index 4440537..766e21c 100644
--- a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp
+++ b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp
@@ -279,13 +279,14 @@ LogicalResult ForthParser::parseHeader() {
         std::string nameUpper = toUpperCase(tokens[1]);
         for (const auto &param : paramDecls) {
           if (param.name == nameUpper) {
-            return emitErrorAt(lineLoc,
-                               "duplicate parameter name: " + nameUpper);
+            return emitErrorAt(lineLoc, "duplicate name: " + nameUpper +
+                                            " (already declared as param)");
           }
         }
         for (const auto &shared : sharedDecls) {
           if (shared.name == nameUpper) {
-            return emitErrorAt(lineLoc, "duplicate shared name: " + nameUpper);
+            return emitErrorAt(lineLoc, "duplicate name: " + nameUpper +
+                                            " (already declared as shared)");
           }
         }
 
@@ -389,6 +390,30 @@ Value ForthParser::emitOperation(StringRef word, Value inputStack,
     }
   }
 
+  // Check if word is a shared memory name (only valid outside word definitions)
+  if (!inWordDefinition) {
+    auto it = sharedAllocs.find(word);
+    if (it != sharedAllocs.end()) {
+      Value alloca = it->second;
+      Value ptrIndex =
+          builder.create<memref::ExtractAlignedPointerAsIndexOp>(loc, alloca);
+      Value ptrI64 = builder.create<arith::IndexCastOp>(
+          loc, builder.getI64Type(), ptrIndex);
+      return builder
+          .create<forth::PushValueOp>(loc, stackType, inputStack, ptrI64)
+          .getOutputStack();
+    }
+  } else {
+    for (const auto &shared : sharedDecls) {
+      if (word == shared.name) {
+        (void)emitError("shared memory '" + shared.name +
+                        "' cannot be referenced inside a word definition; "
+                        "pass the address from the caller instead");
+        return nullptr;
+      }
+    }
+  }
+
   // Check user-defined words first
   std::string mangledWord = mangleForthName(word);
   if (wordDefs.count(mangledWord)) {
@@ -1019,6 +1044,16 @@ OwningOpRef<ModuleOp> ForthParser::parseModule() {
   Block *entryBlock = funcOp.addEntryBlock();
   builder.setInsertionPointToStart(entryBlock);
 
+  // Emit shared memory allocations at kernel entry
+  for (const auto &shared : sharedDecls) {
+    int64_t size = shared.isArray ? shared.size : 1;
+    auto memrefType = MemRefType::get({size}, builder.getI64Type());
+    Value alloca = builder.create<memref::AllocaOp>(loc, memrefType);
+    alloca.getDefiningOp()->setAttr("forth.shared_name",
+                                    builder.getStringAttr(shared.name));
+    sharedAllocs[shared.name] = alloca;
+  }
+
   // Parse Forth operations
   Value finalStack;
   if (failed(parseOperations(finalStack))) {
diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.h b/lib/Translation/ForthToMLIR/ForthToMLIR.h
index 6c24e6d..09ac2cc 100644
--- a/lib/Translation/ForthToMLIR/ForthToMLIR.h
+++ b/lib/Translation/ForthToMLIR/ForthToMLIR.h
@@ -10,6 +10,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/StringMap.h"
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -92,6 +93,7 @@ class ForthParser {
   std::unordered_set<std::string> wordDefs;
   std::vector<ParamDecl> paramDecls;
   std::vector<SharedDecl> sharedDecls;
+  llvm::StringMap<Value> sharedAllocs;
   std::string kernelName;
   const char *headerEndPtr = nullptr;
   bool inWordDefinition = false;
diff --git a/test/Conversion/ForthToGPU/shared-memory.mlir b/test/Conversion/ForthToGPU/shared-memory.mlir
new file mode 100644
index 0000000..d536dd7
--- /dev/null
+++ b/test/Conversion/ForthToGPU/shared-memory.mlir
@@ -0,0 +1,18 @@
+// RUN: %warpforth-opt --convert-forth-to-gpu %s | %FileCheck %s
+
+// CHECK: gpu.module @warpforth_module
+// CHECK: gpu.func @main(%arg0: memref<256xi64> {forth.param_name = "DATA"})
+// CHECK-SAME: workgroup(%{{.*}}: memref<256xi64, #gpu.address_space<workgroup>>)
+// CHECK-SAME: kernel
+// CHECK-NOT: memref.alloca() {forth.shared_name
+// CHECK: memref.extract_aligned_pointer_as_index %{{.*}} : memref<256xi64, #gpu.address_space<workgroup>>
+// CHECK: gpu.return
+
+module {
+  func.func private @main(%arg0: memref<256xi64> {forth.param_name = "DATA"}) attributes {forth.kernel} {
+    %alloca = memref.alloca() {forth.shared_name = "SCRATCH"} : memref<256xi64>
+    %ptr = memref.extract_aligned_pointer_as_index %alloca : memref<256xi64> -> index
+    %c0 = arith.constant 0 : index
+    return
+  }
+}
diff --git a/test/Pipeline/shared-memory.forth b/test/Pipeline/shared-memory.forth
new file mode 100644
index 0000000..635e8f1
--- /dev/null
+++ b/test/Pipeline/shared-memory.forth
@@ -0,0 +1,21 @@
+\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s
+\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --convert-forth-to-memref --convert-forth-to-gpu | %FileCheck %s --check-prefix=MID
+
+\ Verify that a kernel using shared memory lowers through the full pipeline to a gpu.binary
+\ CHECK: gpu.binary @warpforth_module
+
+\ Verify intermediate MLIR structure: shared alloca becomes workgroup attribution
+\ MID: gpu.module @warpforth_module
+\ MID: gpu.func @main(%arg0: memref<256xi64> {forth.param_name = "DATA"})
+\ MID-SAME: workgroup(%{{.*}}: memref<256xi64, #gpu.address_space<workgroup>>)
+\ MID-SAME: kernel
+\ MID: memref.extract_aligned_pointer_as_index %{{.*}} : memref<256xi64, #gpu.address_space<workgroup>>
+\ MID: llvm.store
+\ MID: gpu.return
+
+\! kernel main
+\! param DATA i64[256]
+\! shared SCRATCH i64[256]
+GLOBAL-ID CELLS SCRATCH + !
+GLOBAL-ID CELLS SCRATCH + @
+GLOBAL-ID CELLS DATA + !
diff --git a/test/Translation/Forth/header-duplicate-param-error.forth b/test/Translation/Forth/header-duplicate-param-error.forth
index 9f6501b..f90bf26 100644
--- a/test/Translation/Forth/header-duplicate-param-error.forth
+++ b/test/Translation/Forth/header-duplicate-param-error.forth
@@ -1,5 +1,5 @@
 \ RUN: %not %warpforth-translate --forth-to-mlir %s 2>&1 | %FileCheck %s
-\ CHECK: duplicate parameter name: A
+\ CHECK: duplicate name: A (already declared as param)
 \! kernel main
 \! param A i64[4]
 \! param A i64[8]
diff --git a/test/Translation/Forth/shared-declarations.forth b/test/Translation/Forth/shared-declarations.forth
new file mode 100644
index 0000000..065b53a
--- /dev/null
+++ b/test/Translation/Forth/shared-declarations.forth
@@ -0,0 +1,12 @@
+\ RUN: %warpforth-translate --forth-to-mlir %s | %FileCheck %s
+
+\ Verify shared memory declarations produce tagged alloca and pointer push sequence
+\ CHECK: func.func private @main(%arg0: memref<256xi64> {forth.param_name = "DATA"})
+\ CHECK: memref.alloca() {forth.shared_name = "SCRATCH"} : memref<256xi64>
+\ CHECK: memref.extract_aligned_pointer_as_index
+\ CHECK: arith.index_cast
+\ CHECK: forth.push_value
+\! kernel main
+\! param DATA i64[256]
+\! shared SCRATCH i64[256]
+SCRATCH
diff --git a/test/Translation/Forth/shared-ref-in-word-error.forth b/test/Translation/Forth/shared-ref-in-word-error.forth
new file mode 100644
index 0000000..5f7b3f0
--- /dev/null
+++ b/test/Translation/Forth/shared-ref-in-word-error.forth
@@ -0,0 +1,6 @@
+\ RUN: %not %warpforth-translate --forth-to-mlir %s 2>&1 | %FileCheck %s
+\ CHECK: shared memory 'SCRATCH' cannot be referenced inside a word definition
+\! kernel main
+\! shared SCRATCH i64[256]
+: BAD-WORD SCRATCH @ ;
+BAD-WORD

From f723a10e40bdb063de51535bf2b2b72002eb3197 Mon Sep 17 00:00:00 2001
From: Alex Cameron <asc@tetsuo.sh>
Date: Thu, 19 Feb 2026 23:39:32 +0900
Subject: [PATCH 2/2] chore: fix stale TODO, comment typos, and add
 param/shared collision test

---
 lib/Conversion/ForthToGPU/ForthToGPU.cpp                    | 6 +++---
 lib/Translation/ForthToMLIR/ForthToMLIR.h                   | 1 -
 .../Forth/header-shared-param-duplicate-error.forth         | 5 +++++
 3 files changed, 8 insertions(+), 4 deletions(-)
 create mode 100644 test/Translation/Forth/header-shared-param-duplicate-error.forth

diff --git a/lib/Conversion/ForthToGPU/ForthToGPU.cpp b/lib/Conversion/ForthToGPU/ForthToGPU.cpp
index 4324519..17d7be4 100644
--- a/lib/Conversion/ForthToGPU/ForthToGPU.cpp
+++ b/lib/Conversion/ForthToGPU/ForthToGPU.cpp
@@ -158,9 +158,9 @@ struct ConvertForthToGPUPass
     }
 
     // Clone ops from each source block into the corresponding destination
-    // block, with three transformations:
-    // - func.return → gpu.return
-    // - shared memref.alloca → gpu.func workgroup attribution
+    // block, with two transformations:
+    // - func.return -> gpu.return
+    // - shared memref.alloca -> gpu.func workgroup attribution
     auto *ctx = funcOp.getContext();
     for (auto [srcBlock, dstBlock] :
          llvm::zip(funcOp.getBody(), gpuFunc.getBody())) {
diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.h b/lib/Translation/ForthToMLIR/ForthToMLIR.h
index 09ac2cc..025c8a4 100644
--- a/lib/Translation/ForthToMLIR/ForthToMLIR.h
+++ b/lib/Translation/ForthToMLIR/ForthToMLIR.h
@@ -26,7 +26,6 @@ struct ParamDecl {
 };
 
 /// A declared shared memory region: `shared <name> <type>`.
-/// TODO: Not yet consumed — scaffolding for shared memory support.
 struct SharedDecl {
   std::string name;
   bool isArray = false;
diff --git a/test/Translation/Forth/header-shared-param-duplicate-error.forth b/test/Translation/Forth/header-shared-param-duplicate-error.forth
new file mode 100644
index 0000000..249206c
--- /dev/null
+++ b/test/Translation/Forth/header-shared-param-duplicate-error.forth
@@ -0,0 +1,5 @@
+\ RUN: %not %warpforth-translate --forth-to-mlir %s 2>&1 | %FileCheck %s
+\ CHECK: duplicate name: A (already declared as param)
+\! kernel main
+\! param A i64[4]
+\! shared A i64[8]