diff --git a/CMakeLists.txt b/CMakeLists.txt index b48ad6cad751..a6cabb9feecd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,8 +3,15 @@ cmake_minimum_required(VERSION 3.13.4) include(CheckCXXSourceCompiles) set(POLYGEIST_ENABLE_CUDA 0 CACHE BOOL "Enable CUDA frontend and backend") +set(POLYGEIST_ENABLE_CUDA_SYNTAX_ONLY 0 CACHE BOOL "Enable CUDA syntax parsing without requiring CUDA toolkit") set(POLYGEIST_ENABLE_ROCM 0 CACHE BOOL "Enable ROCM backend") +# If CUDA_SYNTAX_ONLY is enabled, set flag for frontend-only mode +if(POLYGEIST_ENABLE_CUDA_SYNTAX_ONLY) + set(POLYGEIST_CUDA_FRONTEND_ONLY 1) + message(STATUS "CUDA syntax-only mode enabled (no CUDA toolkit required)") +endif() + set(POLYGEIST_ENABLE_POLYMER 0 CACHE BOOL "Enable Polymer") set(POLYGEIST_POLYMER_ENABLE_ISL 0 CACHE BOOL "Enable Polymer isl") set(POLYGEIST_POLYMER_ENABLE_PLUTO 0 CACHE BOOL "Enable Polymer pluto") diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h index fd4ab057da3d..214d099a9bd1 100644 --- a/include/polygeist/Passes/Passes.h +++ b/include/polygeist/Passes/Passes.h @@ -79,6 +79,8 @@ createGpuSerializeToHsacoPass(StringRef arch, StringRef features, void registerGpuSerializeToCubinPass(); void registerGpuSerializeToHsacoPass(); +std::unique_ptr createConvertGPUToVortexPass(); + void populateForBreakToWhilePatterns(RewritePatternSet &patterns); } // namespace polygeist } // namespace mlir diff --git a/include/polygeist/Passes/Passes.td b/include/polygeist/Passes/Passes.td index 70081f5502a0..77326c9b0068 100644 --- a/include/polygeist/Passes/Passes.td +++ b/include/polygeist/Passes/Passes.td @@ -300,4 +300,27 @@ def ConvertPolygeistToLLVM : Pass<"convert-polygeist-to-llvm", "mlir::ModuleOp"> ]; } +def ConvertGPUToVortex : Pass<"convert-gpu-to-vortex", "ModuleOp"> { + let summary = "Lower GPU dialect operations to Vortex RISC-V intrinsics"; + let description = [{ + This pass converts GPU dialect operations to LLVM dialect with Vortex-specific + intrinsics. It lowers operations like gpu.thread_id, gpu.block_id to RISC-V + CSR reads via inline assembly, preparing the code for the Vortex GPGPU backend. + + Example: + %tid = gpu.thread_id x + becomes: + %tid = llvm.inline_asm "csrr $0, 0xCC0" : () -> i32 + + The pass automatically emits JSON metadata files describing kernel argument + layouts for runtime argument marshaling. Files are written to the current + working directory as .meta.json. + }]; + let constructor = "mlir::polygeist::createConvertGPUToVortexPass()"; + let dependentDialects = [ + "LLVM::LLVMDialect", + "gpu::GPUDialect", + ]; +} + #endif // POLYGEIST_PASSES diff --git a/lib/polygeist/ExecutionEngine/CMakeLists.txt b/lib/polygeist/ExecutionEngine/CMakeLists.txt index 3049f2fb3e54..b322448f5a2e 100644 --- a/lib/polygeist/ExecutionEngine/CMakeLists.txt +++ b/lib/polygeist/ExecutionEngine/CMakeLists.txt @@ -1,5 +1,11 @@ # TODO we do not support cross compilation currently +# Skip execution engine entirely if syntax-only mode +if(POLYGEIST_CUDA_FRONTEND_ONLY) + message(STATUS "Skipping CUDA execution engine (syntax-only mode)") + return() +endif() + if(POLYGEIST_ENABLE_CUDA) find_package(CUDA) enable_language(CUDA) diff --git a/lib/polygeist/Passes/CMakeLists.txt b/lib/polygeist/Passes/CMakeLists.txt index c385d548a428..5d7f98033389 100644 --- a/lib/polygeist/Passes/CMakeLists.txt +++ b/lib/polygeist/Passes/CMakeLists.txt @@ -18,6 +18,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms InnerSerialization.cpp ForBreakToWhile.cpp ConvertParallelToGPU.cpp + ConvertGPUToVortex.cpp SerializeToCubin.cpp SerializeToHsaco.cpp ParallelLoopUnroll.cpp @@ -80,7 +81,8 @@ target_compile_definitions(obj.MLIRPolygeistTransforms POLYGEIST_PGO_DATA_DIR_ENV_VAR="${POLYGEIST_PGO_DATA_DIR_ENV_VAR}" ) -if(POLYGEIST_ENABLE_CUDA) +# Only require CUDA toolkit if full CUDA support (not syntax-only mode) +if(POLYGEIST_ENABLE_CUDA AND NOT POLYGEIST_CUDA_FRONTEND_ONLY) find_package(CUDA) enable_language(CUDA) diff --git a/lib/polygeist/Passes/ConvertGPUToVortex.cpp b/lib/polygeist/Passes/ConvertGPUToVortex.cpp new file mode 100644 index 000000000000..53538c3d75fd --- /dev/null +++ b/lib/polygeist/Passes/ConvertGPUToVortex.cpp @@ -0,0 +1,762 @@ +//===- ConvertGPUToVortex.cpp - Lower GPU dialect to Vortex intrinsics ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that lowers GPU dialect operations to LLVM +// dialect with Vortex-specific intrinsics (CSR reads, custom instructions). +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/LLVMCommon/ConversionTarget.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "polygeist/Passes/Passes.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using namespace mlir; +using namespace mlir::gpu; + +namespace { + +//===----------------------------------------------------------------------===// +// Vortex CSR Addresses (from vx_intrinsics.h) +//===----------------------------------------------------------------------===// + +constexpr uint32_t VX_CSR_THREAD_ID = 0xCC0; +constexpr uint32_t VX_CSR_WARP_ID = 0xCC1; +constexpr uint32_t VX_CSR_CORE_ID = 0xCC2; +constexpr uint32_t VX_CSR_NUM_THREADS = 0xFC0; +constexpr uint32_t VX_CSR_NUM_WARPS = 0xFC1; +constexpr uint32_t VX_CSR_NUM_CORES = 0xFC2; + +//===----------------------------------------------------------------------===// +// Preprocessing: Consolidate Polygeist Alternatives +//===----------------------------------------------------------------------===// + +/// Extract base kernel name by removing Polygeist variant suffix +/// Example: _Z12launch_basicPiS_ji_kernel94565344022848 -> _Z12launch_basicPiS_ji +static StringRef extractBaseKernelName(StringRef mangledName) { + size_t pos = mangledName.find("_kernel"); + if (pos != StringRef::npos) { + // Find where the numeric suffix starts after "_kernel" + size_t suffixStart = pos + 7; // Length of "_kernel" + if (suffixStart < mangledName.size() && + std::isdigit(mangledName[suffixStart])) { + return mangledName.substr(0, pos); + } + } + return mangledName; +} + +/// Consolidate polygeist.alternatives to first variant only +/// This preprocessing step simplifies downstream processing by: +/// 1. Replacing polygeist.alternatives with content of first alternative +/// 2. Ensuring single canonical launch configuration for Vortex +static void consolidatePolygeistAlternatives(ModuleOp module) { + SmallVector altOps; + + // Collect all polygeist.alternatives operations + module.walk([&](Operation *op) { + if (op->getName().getStringRef() == "polygeist.alternatives") { + altOps.push_back(op); + } + }); + + // Replace each alternatives op with content of its first region + for (Operation *altOp : altOps) { + if (altOp->getNumRegions() == 0 || altOp->getRegion(0).empty()) + continue; + + OpBuilder builder(altOp); + Region &firstRegion = altOp->getRegion(0); + Block &firstBlock = firstRegion.front(); + + // Move all operations from first region to parent block (before alternatives op) + // This inlines the first alternative's content + auto &ops = firstBlock.getOperations(); + for (Operation &innerOp : llvm::make_early_inc_range(ops)) { + // Skip the terminator (polygeist.polygeist_yield) + if (innerOp.getName().getStringRef() == "polygeist.polygeist_yield") + continue; + innerOp.moveBefore(altOp); + } + + // Erase the now-empty alternatives operation + altOp->erase(); + } +} + +/// Remove duplicate GPU kernel functions, keeping only the first variant +/// After Polygeist auto-tuning, multiple kernel variants exist but only +/// the first one is referenced after consolidating alternatives. +static void removeDuplicateKernels(ModuleOp module) { + // Track seen kernel base names + llvm::StringMap seenKernels; + SmallVector toErase; + + // Walk all GPU modules + module.walk([&](gpu::GPUModuleOp gpuModule) { + // Collect all kernel functions + for (auto gpuFunc : gpuModule.getOps()) { + if (!gpuFunc.isKernel()) + continue; + + StringRef funcName = gpuFunc.getName(); + StringRef baseName = extractBaseKernelName(funcName); + + // Check if we've seen this kernel base name before + auto it = seenKernels.find(baseName); + if (it != seenKernels.end()) { + // Duplicate found - mark for deletion + toErase.push_back(gpuFunc); + } else { + // First occurrence - keep it + seenKernels[baseName] = gpuFunc; + } + } + }); + + // Erase duplicate kernels + for (auto func : toErase) { + func.erase(); + } +} + +//===----------------------------------------------------------------------===// +// Helper Functions +//===----------------------------------------------------------------------===// + +/// Declare an external function to access TLS dim3_t variables +/// For thread-local variables like blockIdx/threadIdx, we generate helper +/// functions that return pointers to the TLS variables +/// Returns an LLVM function declaration +/// The function is declared within the gpu.module where it's being used +static LLVM::LLVMFuncOp getOrCreateDim3TLSAccessor(Operation *op, + OpBuilder &builder, + StringRef varName) { + // Find the gpu.module containing this operation + auto gpuModule = op->getParentOfType(); + MLIRContext *context = gpuModule.getContext(); + + // Create function name: e.g., "vx_get_blockIdx" + std::string funcName = ("vx_get_" + varName).str(); + + // Check if function already exists in gpu.module + if (auto existing = gpuModule.lookupSymbol(funcName)) { + return existing; + } + + // Create function type: () -> !llvm.ptr (returns pointer to dim3_t) + auto ptrType = LLVM::LLVMPointerType::get(context); + auto funcType = LLVM::LLVMFunctionType::get(ptrType, {}, /*isVarArg=*/false); + + // Declare external function within gpu.module + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(gpuModule.getBody()); + + return builder.create( + gpuModule.getLoc(), + funcName, + funcType, + LLVM::Linkage::External); +} + +/// Access a field of a TLS dim3_t variable (threadIdx or blockIdx) +/// dimension: gpu::Dimension::x (0), y (1), or z (2) +static Value createDim3TLSAccess(Operation *op, + ConversionPatternRewriter &rewriter, + Location loc, + StringRef varName, + gpu::Dimension dimension) { + auto module = op->getParentOfType(); + MLIRContext *context = module.getContext(); + + // Get or create the TLS accessor function + auto accessorFunc = getOrCreateDim3TLSAccessor(op, rewriter, varName); + + // Call the accessor function to get pointer to TLS variable + auto ptrType = LLVM::LLVMPointerType::get(context); + auto callResult = rewriter.create( + loc, accessorFunc, ValueRange{}); + Value dim3Ptr = callResult.getResult(); + + // Create GEP to access the specific field (x=0, y=1, z=2) + auto i32Type = rewriter.getI32Type(); + auto dim3Type = LLVM::LLVMStructType::getLiteral( + context, {i32Type, i32Type, i32Type}); + + // GEP indices: [0, dimension] + // First 0 is to dereference the pointer + // Second index selects the struct field + SmallVector indices; + indices.push_back(0); // Base index + indices.push_back(static_cast(dimension)); // Field index (0=x, 1=y, 2=z) + + auto gep = rewriter.create( + loc, ptrType, dim3Type, dim3Ptr, indices); + + // Load the value from the computed address + auto result = rewriter.create(loc, i32Type, gep); + + return result.getResult(); +} + +//===----------------------------------------------------------------------===// +// Conversion Patterns +//===----------------------------------------------------------------------===// + +/// Lower gpu.thread_id to TLS variable access +/// Accesses the threadIdx TLS variable set by vx_spawn_threads() +struct ThreadIdOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(ThreadIdOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + // Get the dimension (X, Y, or Z) + auto dimension = op.getDimension(); + + // Access threadIdx.{x,y,z} from TLS + auto result = createDim3TLSAccess(op, rewriter, loc, + "threadIdx", dimension); + + rewriter.replaceOp(op, result); + return success(); + } +}; + +/// Lower gpu.block_id to TLS variable access +/// Accesses the blockIdx TLS variable set by vx_spawn_threads() +struct BlockIdOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(BlockIdOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + // Get the dimension (X, Y, or Z) + auto dimension = op.getDimension(); + + // Access blockIdx.{x,y,z} from TLS + auto result = createDim3TLSAccess(op, rewriter, loc, + "blockIdx", dimension); + + rewriter.replaceOp(op, result); + return success(); + } +}; + +/// Lower gpu.block_dim to TLS variable access +/// Accesses the blockDim global variable set by vx_spawn_threads() +struct BlockDimOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(gpu::BlockDimOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + // Get the dimension (X, Y, or Z) + auto dimension = op.getDimension(); + + // Access blockDim.{x,y,z} from global variable + // Note: blockDim is NOT thread-local, it's a regular global + auto result = createDim3TLSAccess(op, rewriter, loc, + "blockDim", dimension); + + rewriter.replaceOp(op, result); + return success(); + } +}; + +/// Lower gpu.grid_dim to TLS variable access +/// Accesses the gridDim global variable set by vx_spawn_threads() +struct GridDimOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(gpu::GridDimOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + // Get the dimension (X, Y, or Z) + auto dimension = op.getDimension(); + + // Access gridDim.{x,y,z} from global variable + // Note: gridDim is NOT thread-local, it's a regular global + auto result = createDim3TLSAccess(op, rewriter, loc, + "gridDim", dimension); + + rewriter.replaceOp(op, result); + return success(); + } +}; + +/// Lower gpu.barrier to Vortex vx_barrier call +/// Synchronizes all threads in a block using Vortex hardware barriers +struct BarrierOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(gpu::BarrierOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + // Declare functions in gpu.module (not top-level module) so they're visible + auto gpuModule = op->getParentOfType(); + if (!gpuModule) + return failure(); + MLIRContext *context = gpuModule.getContext(); + + // Allocate barrier ID (simple counter for now) + // TODO: Proper barrier ID allocation to avoid conflicts + static int barrierIdCounter = 0; + int barrierId = barrierIdCounter++; + + // Create barrier ID constant + auto i32Type = rewriter.getI32Type(); + auto barIdConstant = rewriter.create( + loc, i32Type, rewriter.getI32IntegerAttr(barrierId)); + + // Declare vx_num_warps function in gpu.module if not already declared + auto vxNumWarpsFunc = gpuModule.lookupSymbol("vx_num_warps"); + if (!vxNumWarpsFunc) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(gpuModule.getBody()); + + auto funcType = LLVM::LLVMFunctionType::get( + i32Type, {}, /*isVarArg=*/false); + + vxNumWarpsFunc = rewriter.create( + gpuModule.getLoc(), "vx_num_warps", funcType); + } + + // Call vx_num_warps() to get number of warps + auto numWarps = rewriter.create( + loc, vxNumWarpsFunc, ValueRange{}); + + // Declare vx_barrier function in gpu.module if not already declared + auto vxBarrierFunc = gpuModule.lookupSymbol("vx_barrier"); + if (!vxBarrierFunc) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(gpuModule.getBody()); + + auto funcType = LLVM::LLVMFunctionType::get( + LLVM::LLVMVoidType::get(context), + {i32Type, i32Type}, + /*isVarArg=*/false); + + vxBarrierFunc = rewriter.create( + gpuModule.getLoc(), "vx_barrier", funcType); + } + + // Call vx_barrier(barrier_id, num_warps) + SmallVector args; + args.push_back(barIdConstant.getResult()); + args.push_back(numWarps.getResult()); + + rewriter.replaceOpWithNewOp( + op, vxBarrierFunc, args); + + return success(); + } +}; + +/// Lower printf calls to vx_printf +/// Matches: llvm.call @printf(format, args...) +/// Replaces with: llvm.call @vx_printf(format, args...) +/// vx_printf has the same signature as standard printf +struct PrintfOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(LLVM::CallOp callOp, + PatternRewriter &rewriter) const override { + // Only match calls to 'printf' + auto callee = callOp.getCalleeAttr(); + if (!callee) + return failure(); + + auto flatSymbolRef = callee.dyn_cast(); + if (!flatSymbolRef || flatSymbolRef.getValue() != "printf") + return failure(); + + // Only lower printf calls inside GPU modules + auto gpuModule = callOp->getParentOfType(); + if (!gpuModule) + return failure(); + + Location loc = callOp.getLoc(); + MLIRContext *context = gpuModule.getContext(); + auto i32Type = rewriter.getI32Type(); + + // Declare vx_printf function in gpu.module if not already declared + auto vxPrintfFunc = gpuModule.lookupSymbol("vx_printf"); + if (!vxPrintfFunc) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(gpuModule.getBody()); + + auto ptrType = LLVM::LLVMPointerType::get(context); + auto funcType = LLVM::LLVMFunctionType::get(i32Type, {ptrType}, /*isVarArg=*/true); + vxPrintfFunc = rewriter.create( + gpuModule.getLoc(), "vx_printf", funcType); + } + + // Build argument list: pass all original arguments unchanged + SmallVector newArgs; + for (unsigned i = 0; i < callOp.getNumOperands(); ++i) { + newArgs.push_back(callOp.getOperand(i)); + } + + // Replace with call to vx_printf + rewriter.replaceOpWithNewOp( + callOp, vxPrintfFunc, newArgs); + + return success(); + } +}; + +/// Extract metadata from gpu.launch_func for Vortex kernel argument struct +/// For RV32, all arguments are 4 bytes (scalars and pointers) +struct LaunchFuncMetadataExtraction : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(gpu::LaunchFuncOp launchOp, + PatternRewriter &rewriter) const override { + // Skip if metadata already exists (avoid infinite loop in greedy rewriter) + if (launchOp->hasAttr("vortex.kernel_metadata")) + return failure(); + + Location loc = launchOp.getLoc(); + + // Get kernel name + StringRef kernelName = launchOp.getKernelName().getValue(); + + // Get kernel arguments + auto kernelOperands = launchOp.getKernelOperands(); + unsigned numArgs = kernelOperands.size(); + + // For RV32: all arguments are 4 bytes (scalars and pointers) + // Calculate total struct size: numArgs * 4 + unsigned totalSize = numArgs * 4; + + // Build metadata string for debugging/documentation + std::string metadataStr = "Kernel: " + kernelName.str() + + "\nNum args: " + std::to_string(numArgs) + + "\nTotal size (RV32): " + std::to_string(totalSize) + " bytes\nArguments:\n"; + + unsigned offset = 0; + for (auto [idx, arg] : llvm::enumerate(kernelOperands)) { + Type argType = arg.getType(); + bool isPointer = argType.isa(); + + metadataStr += " [" + std::to_string(idx) + "] offset=" + std::to_string(offset) + + ", size=4, type=" + (isPointer ? "pointer" : "scalar") + "\n"; + offset += 4; + } + + // Emit metadata as a comment for now (can be enhanced to create LLVM metadata) + rewriter.startRootUpdate(launchOp); + launchOp->setAttr("vortex.kernel_metadata", + rewriter.getStringAttr(metadataStr)); + rewriter.finalizeRootUpdate(launchOp); + + // Note: We don't replace the op, just annotate it with metadata + // The actual launch lowering will be handled separately + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Kernel Metadata JSON Emission +//===----------------------------------------------------------------------===// + +/// Structure to hold kernel argument metadata +struct KernelArgInfo { + std::string name; + std::string type; // "ptr", "i32", "u32", "f32", "f64", etc. + unsigned size; // Size in bytes + unsigned offset; // Offset in args struct + bool isPointer; +}; + +/// Structure to hold complete kernel metadata +struct KernelMetadata { + std::string kernelName; + std::vector arguments; + unsigned totalArgsSize; +}; + +/// Convert MLIR type to metadata type string +static std::string getMetadataTypeString(Type type) { + if (type.isa() || type.isa()) + return "ptr"; + if (type.isInteger(32)) + return "i32"; + if (type.isInteger(64)) + return "i64"; + if (type.isF32()) + return "f32"; + if (type.isF64()) + return "f64"; + if (type.isIndex()) + return "i32"; // Index maps to i32 on RV32 + return "unknown"; +} + +/// Get size in bytes for a type on RV32 +static unsigned getTypeSizeRV32(Type type) { + // On RV32 Vortex, pointers are 4 bytes + if (type.isa() || type.isa()) + return 4; + if (type.isInteger(32) || type.isF32() || type.isIndex()) + return 4; + if (type.isInteger(64) || type.isF64()) + return 8; + return 4; // Default +} + +/// Convert metadata type string to C type +static std::string getCTypeString(const std::string &metaType) { + if (metaType == "ptr") return "uint32_t"; // RV32 pointer = 32-bit device address + if (metaType == "i32") return "int32_t"; + if (metaType == "u32") return "uint32_t"; + if (metaType == "i64") return "int64_t"; + if (metaType == "u64") return "uint64_t"; + if (metaType == "f32") return "float"; + if (metaType == "f64") return "double"; + return "uint32_t"; // Default +} + +/// Generate C header string for kernel args struct (Vortex-compatible) +static std::string generateKernelArgsHeader(const KernelMetadata &meta) { + std::ostringstream header; + + // Generate include guard + std::string guardName = meta.kernelName; + std::transform(guardName.begin(), guardName.end(), guardName.begin(), ::toupper); + std::replace(guardName.begin(), guardName.end(), '-', '_'); + + header << "// Auto-generated kernel argument structure for " << meta.kernelName << "\n"; + header << "// Generated by Polygeist ConvertGPUToVortex pass\n"; + header << "#ifndef " << guardName << "_ARGS_H\n"; + header << "#define " << guardName << "_ARGS_H\n\n"; + header << "#include \n\n"; + + header << "typedef struct {\n"; + for (const auto &arg : meta.arguments) { + std::string cType = getCTypeString(arg.type); + header << " " << cType << " " << arg.name << ";"; + header << " // offset=" << arg.offset << ", size=" << arg.size; + if (arg.isPointer) header << ", device pointer"; + header << "\n"; + } + header << "} " << meta.kernelName << "_args_t;\n\n"; + + header << "#define " << guardName << "_ARGS_SIZE " << meta.totalArgsSize << "\n\n"; + header << "#endif // " << guardName << "_ARGS_H\n"; + + return header.str(); +} + +/// Generate JSON string for kernel metadata (for runtime dynamic loading) +static std::string generateMetadataJSON(const KernelMetadata &meta) { + std::ostringstream json; + json << "{\n"; + json << " \"kernel_name\": \"" << meta.kernelName << "\",\n"; + json << " \"arguments\": [\n"; + + for (size_t i = 0; i < meta.arguments.size(); ++i) { + const auto &arg = meta.arguments[i]; + json << " {\n"; + json << " \"name\": \"" << arg.name << "\",\n"; + json << " \"type\": \"" << arg.type << "\",\n"; + json << " \"size\": " << arg.size << ",\n"; + json << " \"offset\": " << arg.offset << ",\n"; + json << " \"is_pointer\": " << (arg.isPointer ? "true" : "false") << "\n"; + json << " }"; + if (i < meta.arguments.size() - 1) + json << ","; + json << "\n"; + } + + json << " ],\n"; + json << " \"total_args_size\": " << meta.totalArgsSize << ",\n"; + json << " \"architecture\": \"rv32\"\n"; + json << "}\n"; + + return json.str(); +} + +/// Extract metadata from a GPU function and write metadata files +/// Generates both .meta.json (for runtime) and _args.h (for compile-time) +/// If outputDir is empty, uses current working directory +static void emitKernelMetadata(gpu::GPUFuncOp funcOp, + StringRef outputDir) { + if (!funcOp.isKernel()) + return; + + KernelMetadata meta; + meta.kernelName = funcOp.getName().str(); + + // Extract base kernel name (remove Polygeist suffix if present) + StringRef baseName = extractBaseKernelName(funcOp.getName()); + meta.kernelName = baseName.str(); + + unsigned offset = 0; + unsigned argIndex = 0; + + for (auto argType : funcOp.getArgumentTypes()) { + KernelArgInfo argInfo; + argInfo.name = "arg" + std::to_string(argIndex); + argInfo.type = getMetadataTypeString(argType); + argInfo.size = getTypeSizeRV32(argType); + argInfo.offset = offset; + argInfo.isPointer = argType.isa() || + argType.isa(); + + meta.arguments.push_back(argInfo); + offset += argInfo.size; + argIndex++; + } + + meta.totalArgsSize = offset; + + // Determine output directory + SmallString<256> outDir; + if (outputDir.empty()) { + llvm::sys::fs::current_path(outDir); + } else { + outDir = outputDir; + } + + // Write JSON metadata file + { + SmallString<256> jsonPath(outDir); + llvm::sys::path::append(jsonPath, meta.kernelName + ".meta.json"); + + std::error_code ec; + llvm::raw_fd_ostream outFile(jsonPath, ec); + if (ec) { + llvm::errs() << "Error writing metadata file " << jsonPath << ": " + << ec.message() << "\n"; + } else { + outFile << generateMetadataJSON(meta); + outFile.close(); + llvm::outs() << "Wrote kernel metadata: " << jsonPath << "\n"; + } + } + + // Write C header file + { + SmallString<256> headerPath(outDir); + llvm::sys::path::append(headerPath, meta.kernelName + "_args.h"); + + std::error_code ec; + llvm::raw_fd_ostream outFile(headerPath, ec); + if (ec) { + llvm::errs() << "Error writing header file " << headerPath << ": " + << ec.message() << "\n"; + } else { + outFile << generateKernelArgsHeader(meta); + outFile.close(); + llvm::outs() << "Wrote kernel args header: " << headerPath << "\n"; + } + } +} + +//===----------------------------------------------------------------------===// +// Pass Definition +//===----------------------------------------------------------------------===// + +// Use the tablegen-generated base class which handles the pass options correctly +#define GEN_PASS_DECL_CONVERTGPUTOVORTEX +#define GEN_PASS_DEF_CONVERTGPUTOVORTEX +#include "polygeist/Passes/Passes.h.inc" + +struct ConvertGPUToVortexPass + : public impl::ConvertGPUToVortexBase { + + ConvertGPUToVortexPass() = default; + + void runOnOperation() override { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + + // PREPROCESSING: Consolidate Polygeist auto-tuning artifacts + // This must happen before any conversion patterns are applied + consolidatePolygeistAlternatives(module); + removeDuplicateKernels(module); + + // Always emit kernel metadata for each kernel + // Files are written to current working directory: + // - .meta.json (for runtime dynamic loading) + // - _args.h (for compile-time type-safe usage) + module.walk([&](gpu::GPUModuleOp gpuModule) { + for (auto gpuFunc : gpuModule.getOps()) { + if (gpuFunc.isKernel()) { + emitKernelMetadata(gpuFunc, "" /* use current directory */); + } + } + }); + + // Set up type converter for GPU to LLVM types + LLVMTypeConverter typeConverter(context); + + // Set up conversion target + // Mark only the Vortex-specific GPU operations as illegal + // All other operations (including GPU structural ops) remain legal + // A subsequent --gpu-to-llvm pass will handle gpu.module/gpu.func conversion + ConversionTarget target(*context); + target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); + target.addIllegalOp(); + + // Set up rewrite patterns + RewritePatternSet patterns(context); + patterns.add(typeConverter); + + // Apply the conversion + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); + } + + // Apply metadata extraction and printf lowering as separate greedy rewrites + RewritePatternSet metadataPatterns(context); + metadataPatterns.add(context); + if (failed(applyPatternsAndFoldGreedily(module, std::move(metadataPatterns)))) { + signalPassFailure(); + } + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// Pass Registration +//===----------------------------------------------------------------------===// + +namespace mlir { +namespace polygeist { + +std::unique_ptr createConvertGPUToVortexPass() { + return std::make_unique(); +} + +} // namespace polygeist +} // namespace mlir diff --git a/lib/polygeist/Passes/SerializeToCubin.cpp b/lib/polygeist/Passes/SerializeToCubin.cpp index abb6dbee1c1e..278a83f2a0f8 100644 --- a/lib/polygeist/Passes/SerializeToCubin.cpp +++ b/lib/polygeist/Passes/SerializeToCubin.cpp @@ -412,5 +412,11 @@ std::unique_ptr createGpuSerializeToCubinPass( #else namespace mlir::polygeist { void registerGpuSerializeToCubinPass() {} +std::unique_ptr createGpuSerializeToCubinPass( + StringRef arch, StringRef features, int llvmOptLevel, int ptxasOptLevel, + std::string ptxasPath, std::string libDevicePath, bool outputIntermediate) { + llvm::errs() << "error: CUDA toolkit support not enabled in this build\n"; + return nullptr; +} } // namespace mlir::polygeist #endif diff --git a/tools/cgeist/CMakeLists.txt b/tools/cgeist/CMakeLists.txt index 84b4a739e9df..feca781d530a 100644 --- a/tools/cgeist/CMakeLists.txt +++ b/tools/cgeist/CMakeLists.txt @@ -31,11 +31,19 @@ add_clang_executable(cgeist Lib/TypeUtils.cc Lib/CGCall.cc ) -if(POLYGEIST_ENABLE_CUDA) +if(POLYGEIST_ENABLE_CUDA OR POLYGEIST_CUDA_FRONTEND_ONLY) target_compile_definitions(cgeist PRIVATE POLYGEIST_ENABLE_CUDA=1 ) +endif() + +# Define POLYGEIST_CUDA_FULL only if full CUDA support (execution engine available) +if(POLYGEIST_ENABLE_CUDA AND NOT POLYGEIST_CUDA_FRONTEND_ONLY) + target_compile_definitions(cgeist + PRIVATE + POLYGEIST_CUDA_FULL=1 + ) add_dependencies(cgeist execution_engine_cuda_wrapper_binary_include) endif() if(POLYGEIST_ENABLE_ROCM) diff --git a/tools/cgeist/Test/Verification/basic_hip_kernel.hip b/tools/cgeist/Test/Verification/basic_hip_kernel.hip new file mode 100644 index 000000000000..5945c9060a9d --- /dev/null +++ b/tools/cgeist/Test/Verification/basic_hip_kernel.hip @@ -0,0 +1,15 @@ +// Basic HIP kernel test +#include "Inputs/cuda.h" +#include "__clang_cuda_builtin_vars.h" + +__global__ void basic_kernel(int32_t* src, int32_t* dst, uint32_t count) { + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < count) { + dst[tid] = src[tid]; + } +} + +void launch_basic(int32_t* d_src, int32_t* d_dst, uint32_t count, int threads_per_block) { + int num_blocks = (count + threads_per_block - 1) / threads_per_block; + basic_kernel<<>>(d_src, d_dst, count); +} diff --git a/tools/cgeist/Test/Verification/gpu_to_vortex_basic.mlir b/tools/cgeist/Test/Verification/gpu_to_vortex_basic.mlir new file mode 100644 index 000000000000..e566b8c74c5e --- /dev/null +++ b/tools/cgeist/Test/Verification/gpu_to_vortex_basic.mlir @@ -0,0 +1,81 @@ +// RUN: mlir-opt %s -convert-gpu-to-vortex | FileCheck %s + +// Test basic gpu.thread_id and gpu.block_id lowering to Vortex TLS access + +module { + // CHECK-LABEL: func @test_thread_id_x + func.func @test_thread_id_x() -> index { + // CHECK: llvm.mlir.global external thread_local @threadIdx + // CHECK: llvm.mlir.addressof @threadIdx + // CHECK: llvm.getelementptr + // CHECK: llvm.load + %tid = gpu.thread_id x + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %tid : index + } + + // CHECK-LABEL: func @test_thread_id_y + func.func @test_thread_id_y() -> index { + // CHECK: llvm.mlir.addressof @threadIdx + // CHECK: llvm.getelementptr + // CHECK-SAME: 1 + // CHECK: llvm.load + %tid = gpu.thread_id y + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %tid : index + } + + // CHECK-LABEL: func @test_thread_id_z + func.func @test_thread_id_z() -> index { + // CHECK: llvm.mlir.addressof @threadIdx + // CHECK: llvm.getelementptr + // CHECK-SAME: 2 + // CHECK: llvm.load + %tid = gpu.thread_id z + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %tid : index + } + + // CHECK-LABEL: func @test_block_id_x + func.func @test_block_id_x() -> index { + // CHECK: llvm.mlir.global external thread_local @blockIdx + // CHECK: llvm.mlir.addressof @blockIdx + // CHECK: llvm.getelementptr + // CHECK: llvm.load + %bid = gpu.block_id x + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %bid : index + } + + // CHECK-LABEL: func @test_block_id_y + func.func @test_block_id_y() -> index { + // CHECK: llvm.mlir.addressof @blockIdx + // CHECK: llvm.getelementptr + // CHECK-SAME: 1 + // CHECK: llvm.load + %bid = gpu.block_id y + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %bid : index + } + + // CHECK-LABEL: func @test_combined + func.func @test_combined() -> index { + // CHECK: llvm.mlir.addressof @threadIdx + // CHECK: llvm.getelementptr + // CHECK: llvm.load + %tid = gpu.thread_id x + // CHECK: llvm.mlir.addressof @blockIdx + // CHECK: llvm.getelementptr + // CHECK: llvm.load + %bid = gpu.block_id x + // CHECK: arith.addi + %sum = arith.addi %tid, %bid : index + // CHECK: return + return %sum : index + } +} diff --git a/tools/cgeist/Test/Verification/gpu_to_vortex_thread_model.mlir b/tools/cgeist/Test/Verification/gpu_to_vortex_thread_model.mlir new file mode 100644 index 000000000000..2b3864c0355b --- /dev/null +++ b/tools/cgeist/Test/Verification/gpu_to_vortex_thread_model.mlir @@ -0,0 +1,185 @@ +// RUN: polygeist-opt %s -convert-gpu-to-vortex | FileCheck %s + +// Test Developer A: Thread Model & Synchronization operations +// Tests for blockDim, gridDim, and gpu.barrier operations + +module { + //===----------------------------------------------------------------------===// + // Block Dimension Tests (blockDim.x, blockDim.y, blockDim.z) + //===----------------------------------------------------------------------===// + + // CHECK-LABEL: func @test_block_dim_x + func.func @test_block_dim_x() -> index { + // CHECK: llvm.mlir.addressof @blockDim + // CHECK: llvm.getelementptr {{.*}}[0, 0] + // CHECK: llvm.load + %bdim = gpu.block_dim x + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %bdim : index + } + + // CHECK-LABEL: func @test_block_dim_y + func.func @test_block_dim_y() -> index { + // CHECK: llvm.mlir.addressof @blockDim + // CHECK: llvm.getelementptr {{.*}}[0, 1] + // CHECK: llvm.load + %bdim = gpu.block_dim y + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %bdim : index + } + + // CHECK-LABEL: func @test_block_dim_z + func.func @test_block_dim_z() -> index { + // CHECK: llvm.mlir.addressof @blockDim + // CHECK: llvm.getelementptr {{.*}}[0, 2] + // CHECK: llvm.load + %bdim = gpu.block_dim z + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %bdim : index + } + + //===----------------------------------------------------------------------===// + // Grid Dimension Tests (gridDim.x, gridDim.y, gridDim.z) + //===----------------------------------------------------------------------===// + + // CHECK-LABEL: func @test_grid_dim_x + func.func @test_grid_dim_x() -> index { + // CHECK: llvm.mlir.addressof @gridDim + // CHECK: llvm.getelementptr {{.*}}[0, 0] + // CHECK: llvm.load + %gdim = gpu.grid_dim x + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %gdim : index + } + + // CHECK-LABEL: func @test_grid_dim_y + func.func @test_grid_dim_y() -> index { + // CHECK: llvm.mlir.addressof @gridDim + // CHECK: llvm.getelementptr {{.*}}[0, 1] + // CHECK: llvm.load + %gdim = gpu.grid_dim y + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %gdim : index + } + + // CHECK-LABEL: func @test_grid_dim_z + func.func @test_grid_dim_z() -> index { + // CHECK: llvm.mlir.addressof @gridDim + // CHECK: llvm.getelementptr {{.*}}[0, 2] + // CHECK: llvm.load + %gdim = gpu.grid_dim z + // CHECK: builtin.unrealized_conversion_cast + // CHECK: return + return %gdim : index + } + + //===----------------------------------------------------------------------===// + // Barrier Synchronization Tests + //===----------------------------------------------------------------------===// + + // CHECK-LABEL: func @test_simple_barrier + func.func @test_simple_barrier() { + // CHECK: %[[BAR_ID:.*]] = llvm.mlir.constant({{[0-9]+}} : i32) + // CHECK: llvm.mlir.addressof @blockDim + // CHECK: llvm.getelementptr {{.*}}[0, 0] + // CHECK: llvm.load + // CHECK: llvm.mlir.addressof @blockDim + // CHECK: llvm.getelementptr {{.*}}[0, 1] + // CHECK: llvm.load + // CHECK: llvm.mlir.addressof @blockDim + // CHECK: llvm.getelementptr {{.*}}[0, 2] + // CHECK: llvm.load + // CHECK: llvm.mul + // CHECK: %[[NUM_THREADS:.*]] = llvm.mul + // CHECK: llvm.call @vx_barrier(%[[BAR_ID]], %[[NUM_THREADS]]) + gpu.barrier + // CHECK: return + return + } + + // CHECK-LABEL: func @test_multiple_barriers + func.func @test_multiple_barriers() { + // First barrier + // CHECK: %[[BAR_ID_0:.*]] = llvm.mlir.constant({{[0-9]+}} : i32) + // CHECK: llvm.call @vx_barrier(%[[BAR_ID_0]] + gpu.barrier + + // Second barrier - ID should be different from first + // CHECK: %[[BAR_ID_1:.*]] = llvm.mlir.constant({{[0-9]+}} : i32) + // CHECK: llvm.call @vx_barrier(%[[BAR_ID_1]] + gpu.barrier + + // CHECK: return + return + } + + //===----------------------------------------------------------------------===// + // Combined Test: Global ID Computation Pattern + //===----------------------------------------------------------------------===// + + // CHECK-LABEL: func @test_global_id_pattern + func.func @test_global_id_pattern() -> index { + // Get threadIdx.x + // CHECK: llvm.mlir.addressof @threadIdx + // CHECK: llvm.getelementptr + // CHECK: llvm.load + %tid = gpu.thread_id x + + // Get blockIdx.x + // CHECK: llvm.mlir.addressof @blockIdx + // CHECK: llvm.getelementptr + // CHECK: llvm.load + %bid = gpu.block_id x + + // Get blockDim.x + // CHECK: llvm.mlir.addressof @blockDim + // CHECK: llvm.getelementptr + // CHECK: llvm.load + %bdim = gpu.block_dim x + + // Compute: blockIdx.x * blockDim.x + threadIdx.x + // CHECK: arith.muli + %temp = arith.muli %bid, %bdim : index + // CHECK: arith.addi + %gid = arith.addi %temp, %tid : index + + // CHECK: return + return %gid : index + } + + //===----------------------------------------------------------------------===// + // Realistic Kernel Pattern with Barrier + //===----------------------------------------------------------------------===// + + // CHECK-LABEL: func @test_kernel_with_barrier + func.func @test_kernel_with_barrier() -> index { + // Compute global ID + // CHECK: llvm.mlir.addressof @threadIdx + %tid = gpu.thread_id x + + // CHECK: llvm.mlir.addressof @blockIdx + %bid = gpu.block_id x + + // CHECK: llvm.mlir.addressof @blockDim + %bdim = gpu.block_dim x + + // CHECK: arith.muli + %temp = arith.muli %bid, %bdim : index + // CHECK: arith.addi + %gid = arith.addi %temp, %tid : index + + // Synchronize threads + // CHECK: llvm.mlir.constant({{[0-9]+}} : i32) + // CHECK: llvm.call @vx_barrier + gpu.barrier + + // CHECK: return + return %gid : index + } + +} diff --git a/tools/cgeist/driver.cc b/tools/cgeist/driver.cc index b49eecdcfec5..ff451007a79a 100644 --- a/tools/cgeist/driver.cc +++ b/tools/cgeist/driver.cc @@ -1025,7 +1025,7 @@ int main(int argc, char **argv) { CudaInstallationDetector detector(*driver, triple, argList); if (EmitCUDA) { -#if POLYGEIST_ENABLE_CUDA +#if POLYGEIST_CUDA_FULL std::string arch = CUDAGPUArch; if (arch == "") arch = "sm_60"; @@ -1104,7 +1104,7 @@ int main(int argc, char **argv) { llvm::errs() << "Failed to emit LLVM IR\n"; return -1; } -#if POLYGEIST_ENABLE_CUDA +#if POLYGEIST_CUDA_FULL if (EmitCUDA) { // This header defines: // unsigned char CudaRuntimeWrappers_cpp_bc[]