From 20c18c611821448d77b11c138c64d8037ff99997 Mon Sep 17 00:00:00 2001 From: Alex Cameron Date: Thu, 19 Feb 2026 23:31:39 +0900 Subject: [PATCH 1/2] feat: add shared memory support via \! shared header directive --- CLAUDE.md | 1 + lib/Conversion/ForthToGPU/ForthToGPU.cpp | 16 +++++++- lib/Translation/ForthToMLIR/ForthToMLIR.cpp | 41 +++++++++++++++++-- lib/Translation/ForthToMLIR/ForthToMLIR.h | 2 + test/Conversion/ForthToGPU/shared-memory.mlir | 18 ++++++++ test/Pipeline/shared-memory.forth | 21 ++++++++++ .../Forth/header-duplicate-param-error.forth | 2 +- .../Forth/shared-declarations.forth | 12 ++++++ .../Forth/shared-ref-in-word-error.forth | 6 +++ 9 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 test/Conversion/ForthToGPU/shared-memory.mlir create mode 100644 test/Pipeline/shared-memory.forth create mode 100644 test/Translation/Forth/shared-declarations.forth create mode 100644 test/Translation/Forth/shared-ref-in-word-error.forth diff --git a/CLAUDE.md b/CLAUDE.md index a97eae5..4bd427f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -89,6 +89,7 @@ uv run ruff format gpu_test/ - **Operations**: All take stack as input and produce stack as output (except `forth.stack`) - **Supported Words**: literals, `DUP DROP SWAP OVER ROT NIP TUCK PICK ROLL`, `+ - * / MOD`, `AND OR XOR NOT LSHIFT RSHIFT`, `= < > <> <= >= 0=`, `@ !`, `CELLS`, `IF ELSE THEN`, `BEGIN UNTIL`, `BEGIN WHILE REPEAT`, `DO LOOP +LOOP I J K`, `LEAVE UNLOOP EXIT`, `TID-X/Y/Z BID-X/Y/Z BDIM-X/Y/Z GDIM-X/Y/Z GLOBAL-ID` (GPU indexing). - **Kernel Parameters**: Declared in the `\!` header. `\! kernel <name>` is required and must appear first. `\! param <name> i64[<N>]` becomes a `memref<Nxi64>` argument; `\! param <name> i64` becomes an `i64` argument. Using a param name in code emits `forth.param_ref` (arrays push address; scalars push value). +- **Shared Memory**: `\! shared <name> i64[<N>]` declares GPU shared (workgroup) memory. 
Emits a tagged `memref.alloca` at kernel entry; ForthToGPU converts it to a `gpu.func` workgroup attribution (`memref<Nxi64, #gpu.address_space<workgroup>>`). Using the shared name in code pushes its base address onto the stack. Cannot be referenced inside word definitions. - **Conversion**: `!forth.stack` → `memref<256xi64>` with explicit stack pointer - **GPU**: Functions wrapped in `gpu.module`, `main` gets `gpu.kernel` attribute, configured with bare pointers for NVVM conversion - **User-defined Words**: Modeled as `func.func` with signature `(!forth.stack) -> !forth.stack`, called via `func.call` diff --git a/lib/Conversion/ForthToGPU/ForthToGPU.cpp b/lib/Conversion/ForthToGPU/ForthToGPU.cpp index 7d72597..4324519 100644 --- a/lib/Conversion/ForthToGPU/ForthToGPU.cpp +++ b/lib/Conversion/ForthToGPU/ForthToGPU.cpp @@ -158,7 +158,10 @@ struct ConvertForthToGPUPass } // Clone ops from each source block into the corresponding destination - // block. Replace func.return with gpu.return. + // block, with three transformations: + // - func.return → gpu.return + // - shared memref.alloca → gpu.func workgroup attribution + auto *ctx = funcOp.getContext(); for (auto [srcBlock, dstBlock] : llvm::zip(funcOp.getBody(), gpuFunc.getBody())) { rewriter.setInsertionPointToEnd(&dstBlock); @@ -168,6 +171,17 @@ struct ConvertForthToGPUPass for (Value operand : returnOp.getOperands()) remappedOperands.push_back(mapping.lookup(operand)); rewriter.create(returnOp.getLoc(), remappedOperands); + } else if (auto allocaOp = dyn_cast(&op); + allocaOp && allocaOp->hasAttr("forth.shared_name")) { + auto origType = cast(allocaOp.getType()); + auto addressSpace = + gpu::AddressSpaceAttr::get(ctx, gpu::AddressSpace::Workgroup); + auto sharedType = + MemRefType::get(origType.getShape(), origType.getElementType(), + MemRefLayoutAttrInterface{}, addressSpace); + BlockArgument attr = + gpuFunc.addWorkgroupAttribution(sharedType, allocaOp.getLoc()); + mapping.map(allocaOp.getResult(), attr); } else { rewriter.clone(op, mapping); } 
diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp index 4440537..766e21c 100644 --- a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp +++ b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp @@ -279,13 +279,14 @@ LogicalResult ForthParser::parseHeader() { std::string nameUpper = toUpperCase(tokens[1]); for (const auto &param : paramDecls) { if (param.name == nameUpper) { - return emitErrorAt(lineLoc, - "duplicate parameter name: " + nameUpper); + return emitErrorAt(lineLoc, "duplicate name: " + nameUpper + + " (already declared as param)"); } } for (const auto &shared : sharedDecls) { if (shared.name == nameUpper) { - return emitErrorAt(lineLoc, "duplicate shared name: " + nameUpper); + return emitErrorAt(lineLoc, "duplicate name: " + nameUpper + + " (already declared as shared)"); } } @@ -389,6 +390,30 @@ Value ForthParser::emitOperation(StringRef word, Value inputStack, } } + // Check if word is a shared memory name (only valid outside word definitions) + if (!inWordDefinition) { + auto it = sharedAllocs.find(word); + if (it != sharedAllocs.end()) { + Value alloca = it->second; + Value ptrIndex = + builder.create(loc, alloca); + Value ptrI64 = builder.create( + loc, builder.getI64Type(), ptrIndex); + return builder + .create(loc, stackType, inputStack, ptrI64) + .getOutputStack(); + } + } else { + for (const auto &shared : sharedDecls) { + if (word == shared.name) { + (void)emitError("shared memory '" + shared.name + + "' cannot be referenced inside a word definition; " + "pass the address from the caller instead"); + return nullptr; + } + } + } + // Check user-defined words first std::string mangledWord = mangleForthName(word); if (wordDefs.count(mangledWord)) { @@ -1019,6 +1044,16 @@ OwningOpRef ForthParser::parseModule() { Block *entryBlock = funcOp.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); + // Emit shared memory allocations at kernel entry + for (const auto &shared : sharedDecls) { + int64_t size = 
shared.isArray ? shared.size : 1; + auto memrefType = MemRefType::get({size}, builder.getI64Type()); + Value alloca = builder.create(loc, memrefType); + alloca.getDefiningOp()->setAttr("forth.shared_name", + builder.getStringAttr(shared.name)); + sharedAllocs[shared.name] = alloca; + } + // Parse Forth operations Value finalStack; if (failed(parseOperations(finalStack))) { diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.h b/lib/Translation/ForthToMLIR/ForthToMLIR.h index 6c24e6d..09ac2cc 100644 --- a/lib/Translation/ForthToMLIR/ForthToMLIR.h +++ b/lib/Translation/ForthToMLIR/ForthToMLIR.h @@ -10,6 +10,7 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/ADT/StringMap.h" #include #include #include @@ -92,6 +93,7 @@ class ForthParser { std::unordered_set wordDefs; std::vector paramDecls; std::vector sharedDecls; + llvm::StringMap sharedAllocs; std::string kernelName; const char *headerEndPtr = nullptr; bool inWordDefinition = false; diff --git a/test/Conversion/ForthToGPU/shared-memory.mlir b/test/Conversion/ForthToGPU/shared-memory.mlir new file mode 100644 index 0000000..d536dd7 --- /dev/null +++ b/test/Conversion/ForthToGPU/shared-memory.mlir @@ -0,0 +1,18 @@ +// RUN: %warpforth-opt --convert-forth-to-gpu %s | %FileCheck %s + +// CHECK: gpu.module @warpforth_module +// CHECK: gpu.func @main(%arg0: memref<256xi64> {forth.param_name = "DATA"}) +// CHECK-SAME: workgroup(%{{.*}}: memref<256xi64, #gpu.address_space<workgroup>>) +// CHECK-SAME: kernel +// CHECK-NOT: memref.alloca() {forth.shared_name +// CHECK: memref.extract_aligned_pointer_as_index %{{.*}} : memref<256xi64, #gpu.address_space<workgroup>> +// CHECK: gpu.return + +module { + func.func private @main(%arg0: memref<256xi64> {forth.param_name = "DATA"}) attributes {forth.kernel} { + %alloca = memref.alloca() {forth.shared_name = "SCRATCH"} : memref<256xi64> + %ptr = memref.extract_aligned_pointer_as_index %alloca : memref<256xi64> -> index + %c0 = 
arith.constant 0 : index + return + } +} diff --git a/test/Pipeline/shared-memory.forth b/test/Pipeline/shared-memory.forth new file mode 100644 index 0000000..635e8f1 --- /dev/null +++ b/test/Pipeline/shared-memory.forth @@ -0,0 +1,21 @@ +\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s +\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --convert-forth-to-memref --convert-forth-to-gpu | %FileCheck %s --check-prefix=MID + +\ Verify that shared memory through the full pipeline produces a gpu.binary +\ CHECK: gpu.binary @warpforth_module + +\ Verify intermediate MLIR structure: shared alloca becomes workgroup attribution +\ MID: gpu.module @warpforth_module +\ MID: gpu.func @main(%arg0: memref<256xi64> {forth.param_name = "DATA"}) +\ MID-SAME: workgroup(%{{.*}}: memref<256xi64, #gpu.address_space<workgroup>>) +\ MID-SAME: kernel +\ MID: memref.extract_aligned_pointer_as_index %{{.*}} : memref<256xi64, #gpu.address_space<workgroup>> +\ MID: llvm.store +\ MID: gpu.return + +\! kernel main +\! param DATA i64[256] +\! shared SCRATCH i64[256] +GLOBAL-ID CELLS SCRATCH + ! +GLOBAL-ID CELLS SCRATCH + @ +GLOBAL-ID CELLS DATA + ! diff --git a/test/Translation/Forth/header-duplicate-param-error.forth b/test/Translation/Forth/header-duplicate-param-error.forth index 9f6501b..f90bf26 100644 --- a/test/Translation/Forth/header-duplicate-param-error.forth +++ b/test/Translation/Forth/header-duplicate-param-error.forth @@ -1,5 +1,5 @@ \ RUN: %not %warpforth-translate --forth-to-mlir %s 2>&1 | %FileCheck %s -\ CHECK: duplicate parameter name: A +\ CHECK: duplicate name: A (already declared as param) \! kernel main \! param A i64[4] \! 
param A i64[8] diff --git a/test/Translation/Forth/shared-declarations.forth b/test/Translation/Forth/shared-declarations.forth new file mode 100644 index 0000000..065b53a --- /dev/null +++ b/test/Translation/Forth/shared-declarations.forth @@ -0,0 +1,12 @@ +\ RUN: %warpforth-translate --forth-to-mlir %s | %FileCheck %s + +\ Verify shared memory declarations produce tagged alloca and pointer push sequence +\ CHECK: func.func private @main(%arg0: memref<256xi64> {forth.param_name = "DATA"}) +\ CHECK: memref.alloca() {forth.shared_name = "SCRATCH"} : memref<256xi64> +\ CHECK: memref.extract_aligned_pointer_as_index +\ CHECK: arith.index_cast +\ CHECK: forth.push_value +\! kernel main +\! param DATA i64[256] +\! shared SCRATCH i64[256] +SCRATCH diff --git a/test/Translation/Forth/shared-ref-in-word-error.forth b/test/Translation/Forth/shared-ref-in-word-error.forth new file mode 100644 index 0000000..5f7b3f0 --- /dev/null +++ b/test/Translation/Forth/shared-ref-in-word-error.forth @@ -0,0 +1,6 @@ +\ RUN: %not %warpforth-translate --forth-to-mlir %s 2>&1 | %FileCheck %s +\ CHECK: shared memory 'SCRATCH' cannot be referenced inside a word definition +\! kernel main +\! 
shared SCRATCH i64[256] +: BAD-WORD SCRATCH @ ; +BAD-WORD From f723a10e40bdb063de51535bf2b2b72002eb3197 Mon Sep 17 00:00:00 2001 From: Alex Cameron Date: Thu, 19 Feb 2026 23:39:32 +0900 Subject: [PATCH 2/2] chore: fix stale TODO, comment typos, and add param/shared collision test --- lib/Conversion/ForthToGPU/ForthToGPU.cpp | 6 +++--- lib/Translation/ForthToMLIR/ForthToMLIR.h | 1 - .../Forth/header-shared-param-duplicate-error.forth | 5 +++++ 3 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 test/Translation/Forth/header-shared-param-duplicate-error.forth diff --git a/lib/Conversion/ForthToGPU/ForthToGPU.cpp b/lib/Conversion/ForthToGPU/ForthToGPU.cpp index 4324519..17d7be4 100644 --- a/lib/Conversion/ForthToGPU/ForthToGPU.cpp +++ b/lib/Conversion/ForthToGPU/ForthToGPU.cpp @@ -158,9 +158,9 @@ struct ConvertForthToGPUPass } // Clone ops from each source block into the corresponding destination - // block, with three transformations: - // - func.return → gpu.return - // - shared memref.alloca → gpu.func workgroup attribution + // block, with two transformations: + // - func.return -> gpu.return + // - shared memref.alloca -> gpu.func workgroup attribution auto *ctx = funcOp.getContext(); for (auto [srcBlock, dstBlock] : llvm::zip(funcOp.getBody(), gpuFunc.getBody())) { diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.h b/lib/Translation/ForthToMLIR/ForthToMLIR.h index 09ac2cc..025c8a4 100644 --- a/lib/Translation/ForthToMLIR/ForthToMLIR.h +++ b/lib/Translation/ForthToMLIR/ForthToMLIR.h @@ -26,7 +26,6 @@ struct ParamDecl { }; /// A declared shared memory region: `shared `. -/// TODO: Not yet consumed — scaffolding for shared memory support. 
struct SharedDecl { std::string name; bool isArray = false; diff --git a/test/Translation/Forth/header-shared-param-duplicate-error.forth b/test/Translation/Forth/header-shared-param-duplicate-error.forth new file mode 100644 index 0000000..249206c --- /dev/null +++ b/test/Translation/Forth/header-shared-param-duplicate-error.forth @@ -0,0 +1,5 @@ +\ RUN: %not %warpforth-translate --forth-to-mlir %s 2>&1 | %FileCheck %s +\ CHECK: duplicate name: A (already declared as param) +\! kernel main +\! param A i64[4] +\! shared A i64[8]