From b3f2a4ab3d57ff906e03dd03b6365ba99d2169bf Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 22 Nov 2025 00:26:44 +0000 Subject: [PATCH 01/28] adding anchor layout for load/store/prefetch_nd and dpas --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 29 ++++++++++++------- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 ++ mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 28 ++++++++++-------- .../Transforms/XeGPUSubgroupDistribute.cpp | 4 +-- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 +-- .../Transforms/XeGPUWgToSgDistribute.cpp | 20 +++++++++---- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 9 +++--- mlir/test/Dialect/XeGPU/invalid.mlir | 6 ++-- .../Dialect/XeGPU/subgroup-distribute.mlir | 12 ++++---- mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 4 +-- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 6 ++-- 11 files changed, 72 insertions(+), 52 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 4c67856b559b1..9ddc408a17f7f 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -268,7 +268,8 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { OptionalAttr: $const_offsets, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + OptionalAttr: $l3_hint, + OptionalAttr:$anchor_layout); let extraClassDeclaration = extraBaseClassDeclaration # [{ xegpu::TensorDescType getTensorDescType() { @@ -360,7 +361,8 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ OptionalAttr: $transpose, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + OptionalAttr: $l3_hint, + OptionalAttr:$anchor_layout); let results = (outs XeGPU_ValueType: $value); @@ -454,7 +456,8 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ OptionalAttr: $const_offsets, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + OptionalAttr: $l3_hint, + OptionalAttr:$anchor_layout); let extraClassDeclaration = extraBaseClassDeclaration # [{ VectorType getValueType() { @@ -1046,7 +1049,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::DistributeLayoutAttr": $layout)> + "xegpu::DistributeLayoutAttr": $anchor_layout)> ]; let hasVerifier = 1; @@ -1133,7 +1136,11 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] let arguments = (ins XeGPU_DpasOprType : $lhs, XeGPU_DpasOprType : $rhs, - Optional: $acc); + Optional: $acc, + OptionalAttr:$anchor_layout_a, + OptionalAttr:$anchor_layout_b, + OptionalAttr:$anchor_layout_cd + ); let results = (outs XeGPU_DpasResType: $result); let extraClassDeclaration = [{ @@ -1319,7 +1326,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, Variadic: $offsets, DenseI64ArrayAttr: $const_offsets, OptionalAttr:$subgroup_block_io, - OptionalAttr:$layout + OptionalAttr:$anchor_layout ); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res); let assemblyFormat = [{ @@ -1338,7 +1345,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform across all lanes. 
- - `layout`: [optional] An attribute for guiding distributions among + - `anchor_layout`: [optional] An attribute for guiding distributions among subgroups and/or work-items. It currently can accept either LayoutAttr or SliceAttr. Results: @@ -1347,7 +1354,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, let builders = [ OpBuilder<(ins "Type":$res, "TypedValue": $mem_desc, - "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $layout)>, + "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $anchor_layout)>, ]; let extraClassDeclaration = [{ SmallVector getMixedOffsets() { @@ -1373,7 +1380,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, Variadic: $offsets, DenseI64ArrayAttr: $const_offsets, OptionalAttr:$subgroup_block_io, - OptionalAttr:$layout + OptionalAttr:$anchor_layout ); let assemblyFormat = [{ $data `,` $mem_desc `` custom($offsets, $const_offsets) prop-dict attr-dict `` `:` type(operands)}]; @@ -1389,13 +1396,13 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered to a subgroup block store. When this attribute is present, the offsets are subgroup-uniform across all lanes. - - `layout`: [optional] An attribute for guiding distributions among + - `anchor_layout`: [optional] An attribute for guiding distributions among subgroups and/or work-items. It currently can accept either LayoutAttr or SliceAttr. }]; let builders = [ OpBuilder<(ins "Value" : $data, "TypedValue": $mem_desc, - "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $layout)>, + "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $anchor_layout)>, ]; let extraClassDeclaration = [{ SmallVector getMixedOffsets() { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index fb5d1e758dbd1..b3d2c40712c96 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -22,6 +22,8 @@ using std::optional; namespace mlir { namespace xegpu { +//#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc" + void XeGPUDialect::initialize() { addTypes< #define GET_TYPEDEF_LIST diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 85c9a966f0fe8..3240c0f40ce58 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -465,7 +465,7 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l3_hint) { return build(builder, state, tensorDesc, ValueRange(), DenseI64ArrayAttr(), - l1_hint, l2_hint, l3_hint); + l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr); } void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, @@ -480,7 +480,7 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); build(builder, state, tensorDesc, dynamicOffsets, staticOffsetsAttr, l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*anchor_layout=*/nullptr); } LogicalResult PrefetchNdOp::verify() { @@ -519,7 +519,7 @@ void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType, return build(builder, state, retType, tensorDesc, ValueRange(), DenseI64ArrayAttr(), packed, transpose, l1_hint, l2_hint, - l3_hint); + l3_hint, /*anchor_layout=*/nullptr); } void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType, @@ -535,7 +535,8 @@ void 
LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType, auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); build(builder, state, retType, tensorDesc, dynamicOffsets, staticOffsetsAttr, - packed, transpose, l1_hint, l2_hint, l3_hint); + packed, transpose, l1_hint, l2_hint, l3_hint, + /*anchor_layout=*/nullptr); } LogicalResult LoadNdOp::verify() { @@ -638,7 +639,8 @@ void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value, xegpu::CachePolicyAttr l3_hint) { return build(builder, state, value, tensorDesc, ValueRange(), - DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint); + DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint, + /*anchor_layout=*/nullptr); } void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value, @@ -653,7 +655,7 @@ void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value, auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); build(builder, state, value, tensorDesc, dynamicOffsets, staticOffsetsAttr, - l1_hint, l2_hint, l3_hint); + l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr); } LogicalResult StoreNdOp::verify() { @@ -876,7 +878,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, valueType, source, Value(), mask, IntegerAttr(), - l1_hint, l2_hint, l3_hint, /*layout=*/nullptr); + l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, @@ -892,7 +894,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, auto offset = vector::FromElementsOp::create(builder, loc, type, values); build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, - l2_hint, l3_hint, /*layout=*/nullptr); + l2_hint, l3_hint, /*anchor_layout=*/nullptr); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, @@ -960,7 +962,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint, - l2_hint, l3_hint, /*layout=*/nullptr); + l2_hint, l3_hint, /*anchor_layout=*/nullptr); } void StoreScatterOp::build(OpBuilder &builder, OperationState &state, @@ -978,7 +980,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, // Call the correct builder overload that does not expect result types. 
build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, - l3_hint, /*layout=*/nullptr); + l3_hint, /*anchor_layout=*/nullptr); } void StoreScatterOp::build( @@ -1155,7 +1157,8 @@ LogicalResult LoadMatrixOp::verify() { MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io, - getLayoutAttr(), [&]() { return emitError(); }); + getAnchorLayoutAttr(), + [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1179,7 +1182,8 @@ LogicalResult StoreMatrixOp::verify() { UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, - getLayoutAttr(), [&]() { return emitError(); }); + getAnchorLayoutAttr(), + [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 4455811a2e681..ac65babfcb4cb 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -965,7 +965,7 @@ struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector offsetsAsValues = vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); - auto layout = matrixOp.getLayoutAttr(); + auto layout = matrixOp.getAnchorLayoutAttr(); if (!layout) return rewriter.notifyMatchFailure( matrixOp, "the matrix operation lacks layout attribute"); @@ -1041,7 +1041,7 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector offsetsAsValues = vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); - auto layout = matrixOp.getLayoutAttr(); + auto layout = matrixOp.getAnchorLayoutAttr(); if (!layout) return rewriter.notifyMatchFailure( matrixOp, "the matrix operation lacks layout attribute"); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 330553564f81a..b0b748c3409c3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -954,7 +954,7 @@ struct UnrollLoadMatrixOp : public UnrollPattern { Type elemTy = valueTy.getElementType(); ArrayRef shape = valueTy.getShape(); - auto layout = dyn_cast(op.getLayoutAttr()); + auto layout = dyn_cast(op.getAnchorLayoutAttr()); VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); @@ -993,7 +993,7 @@ struct UnrollStoreMatrixOp : public UnrollPattern { VectorType valueTy = llvm::dyn_cast(op.getData().getType()); assert(valueTy && "the value type must be vector type!"); ArrayRef shape = valueTy.getShape(); - auto layout = dyn_cast(op.getLayoutAttr()); + auto layout = dyn_cast(op.getAnchorLayoutAttr()); SmallVector convertedValTypes = getUnrolledTypes(valueTy, *targetShape); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 33d4b0457e5d3..2562c46adfa8d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -86,8 +86,16 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, if (origOffsets.empty()) return failure(); + // if op is xegpu::CreateNdDescOp, call op.getLayoutAttr() + xegpu::DistributeLayoutAttr layout; + if constexpr (std::is_same_v || + std::is_same_v) { + 
layout = op.getAnchorLayoutAttr(); + } else { + layout = op.getLayoutAttr(); + } + // not applicable to ops without workgroup layout attributes - xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -190,7 +198,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern { xegpu::TensorDescType tdescTy = op.getType(); ArrayRef wgShape = tdescTy.getShape(); Type elemTy = tdescTy.getElementType(); - xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); + xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr(); SmallVector sgShape = getSgShapeAndCount(wgShape, layout).first; auto newTdescTy = xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(), @@ -999,7 +1007,7 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern { assert(valueTy && "the value type must be vector type!"); Type elemTy = valueTy.getElementType(); - xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); + xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr(); SmallVector sgShape = getSgShapeAndCount(wgShape, layout).first; VectorType newResTy = VectorType::get(sgShape, elemTy); SmallVector newOps; @@ -1025,7 +1033,7 @@ struct WgToSgStoreMatrixOp : public OpConversionPattern { if (failed(genOffsetsList(rewriter, op, offsetsList))) return failure(); - xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); + xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr(); for (auto [v, offsets] : llvm::zip(adaptor.getData(), offsetsList)) xegpu::StoreMatrixOp::create(rewriter, op.getLoc(), v, op.getMemDesc(), offsets, layout.dropSgLayoutAndData()); @@ -1409,12 +1417,12 @@ void XeGPUWgToSgDistributePass::runOnOperation() { target.addDynamicallyLegalOp( [=](xegpu::LoadMatrixOp op) -> bool { - return isLegal(op.getLayoutAttr()); + return isLegal(op.getAnchorLayoutAttr()); }); target.addDynamicallyLegalOp( [=](xegpu::StoreMatrixOp op) -> bool { - return isLegal(op.getLayoutAttr()); + return isLegal(op.getAnchorLayoutAttr()); }); target.addDynamicallyLegalOp( diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index b0905c4e9203b..4fe35a16b3994 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -135,12 +135,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { // for LoadMatrixOp, the layout is attached to the property of the op if (auto loadOp = dyn_cast(defOp)) - return loadOp.getLayoutAttr(); + return loadOp.getAnchorLayoutAttr(); // for StoreMatrixOp, the layout is attached to the property of the op if (auto storeOp = dyn_cast(defOp)) - return storeOp.getLayoutAttr(); - + return storeOp.getAnchorLayoutAttr(); std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType(layoutName); @@ -168,10 +167,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); if (auto loadOp = dyn_cast(op)) - return loadOp.getLayoutAttr(); + return loadOp.getAnchorLayoutAttr(); if (auto storeOp = dyn_cast(op)) - return storeOp.getLayoutAttr(); + return storeOp.getAnchorLayoutAttr(); std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 92f353717ac59..62ac880030cda 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -894,7 +894,7 @@ func.func 
@store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve // ----- func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, %arg1: vector<2x16xf32>) { // expected-error@+1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}} - xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout} : + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout} : vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout> return } @@ -902,7 +902,7 @@ func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, // ----- func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, %arg1: vector<16x2xf32>) { // expected-error@+1 {{With subgroup_block_io, the distributed dimensions must be contiguous}} - xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout} : + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout} : vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout> return } @@ -910,7 +910,7 @@ func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf3 // ----- func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, %arg1: vector<16x2xf32>) { // expected-error@+1 {{With subgroup_block_io, the block shape must match the lane layout}} - xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout} : + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout} : vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout> return } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 8fd3cca5594cb..a7ce2c05b9d44 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -281,8 +281,8 @@ gpu.module @xevm_module{ gpu.module @xevm_module{ gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index - %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> - xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index + %1 = xegpu.load_matrix %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> + xegpu.store_matrix %1, %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.return } } @@ -307,8 +307,8 @@ gpu.module @xevm_module{ gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> - xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index + %1 = xegpu.load_matrix %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.return } } @@ -323,9 +323,9 @@ gpu.module @xevm_module{ gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>) { 
%c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout} : + %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout} : !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, index, index -> vector<16x2xf32> - xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout} : + xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout} : vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, index, index gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index d61908b422194..456d8e8a03cfc 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -569,7 +569,7 @@ gpu.module @test_kernel { %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32> //CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32> //CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32> - %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32> + %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32> gpu.return %1: vector<32x32xf32> } } @@ -580,7 +580,7 @@ gpu.module @test_kernel { gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) { %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> // CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index - xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32> + xegpu.store_matrix %value, %mdesc[0, 0] {anchor_layout = #xegpu.layout} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32> gpu.return } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 5dde84e8e0bc2..3760737cf51f5 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -333,9 +333,9 @@ gpu.module @test_distribution { //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]] - //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> + //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{anchor_layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> + %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> gpu.return } @@ -361,7 +361,7 @@ gpu.module @test_distribution { //CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<64x128xf32> %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - xegpu.store_matrix %cst, 
%mdesc[0, 0] {layout = #xegpu.layout} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32> + xegpu.store_matrix %cst, %mdesc[0, 0] {anchor_layout = #xegpu.layout} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32> gpu.return } From bfae01fa3f6453ee1d0f67e98c3d6c2b1fcee8f2 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 22 Nov 2025 07:46:04 +0000 Subject: [PATCH 02/28] propagation honors pre-defined layout at anchor op --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 +- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 448 +++++++++++------- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 +- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 8 +- .../XeGPU/propagate-layout-inst-data.mlir | 16 +- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 79 +-- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 4 +- 7 files changed, 328 insertions(+), 237 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 9ddc408a17f7f..70c61a445e8ae 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -847,7 +847,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$layout); + OptionalAttr:$anchor_layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -906,7 +906,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::DistributeLayoutAttr": $layout)> + "xegpu::DistributeLayoutAttr": $anchor_layout)> ]; let hasVerifier = 1; @@ -991,7 +991,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$layout); + OptionalAttr:$anchor_layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index b3a780abd3f12..6d45a51ab0267 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -387,6 +387,8 @@ class LayoutInfoPropagation ArrayRef operands, ArrayRef results); + bool hasAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout); + public: LayoutInfoPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable, @@ -475,49 +477,72 @@ LogicalResult LayoutInfoPropagation::visitOperation( return success(); } +bool LayoutInfoPropagation::hasAnchorLayout( + xegpu::DistributeLayoutAttr anchorLayout) { + if (anchorLayout == nullptr) { + return false; + } + if (layoutKind == LayoutKind::InstData) { + return !(anchorLayout.getEffectiveInstDataAsInt().empty()); + } else if (layoutKind == LayoutKind::Lane) { + return !(anchorLayout.getEffectiveLaneLayoutAsInt().empty() || + anchorLayout.getEffectiveLaneDataAsInt().empty()); + } + return false; +} + void LayoutInfoPropagation::visitPrefetchNdOp( xegpu::PrefetchNdOp prefetch, ArrayRef operands, ArrayRef results) { - // Here we assign the default layout to the tensor descriptor operand of - // prefetch. 
- auto tdescTy = prefetch.getTensorDescType(); - - auto uArch = getUArch(getChipStr(prefetch).value_or("")); - const auto *uArchInstruction = - dyn_cast( - uArch->getInstruction( - xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch)); - - auto blockWHC = - uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType()); - if (!blockWHC) - prefetch.emitWarning("No known block params found for the element type."); - auto [bWidth, bHeight, bCount] = blockWHC.value(); - SmallVector instData; - int instWidth = xegpu::getLargestDivisor( - static_cast(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, - bCount); - if (instWidth == -1) - prefetch.emitWarning( - "No suitable instruction multiple found for the given shape."); - if (tdescTy.getRank() == 1) - instData = {instWidth}; - else { - int instHeight = xegpu::getLargestDivisor( - static_cast(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); - if (instHeight == -1) + + LayoutInfo prefetchLayout; + xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + prefetchLayout = LayoutInfo(anchorLayout); + } else { + // Here we assign the default layout to the tensor descriptor operand of + // prefetch. + auto tdescTy = prefetch.getTensorDescType(); + + auto uArch = getUArch(getChipStr(prefetch).value_or("")); + const auto *uArchInstruction = + dyn_cast( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch)); + + auto blockWHC = + uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType()); + if (!blockWHC) + prefetch.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector instData; + int instWidth = xegpu::getLargestDivisor( + static_cast(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) prefetch.emitWarning( "No suitable instruction multiple found for the given shape."); - instData = {instHeight, instWidth}; - } - LayoutInfo prefetchLayout; - if (layoutKind == LayoutKind::InstData) - prefetchLayout = - LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData)); - else - prefetchLayout = getDefaultSIMTLayoutInfo( - tdescTy, uArch, uArchInstruction->getPackedFormatBitSize()); + if (tdescTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = xegpu::getLargestDivisor( + static_cast(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); + if (instHeight == -1) + prefetch.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } + + if (layoutKind == LayoutKind::InstData) + prefetchLayout = + LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData)); + else + prefetchLayout = getDefaultSIMTLayoutInfo( + tdescTy, uArch, uArchInstruction->getPackedFormatBitSize()); + prefetch.setAnchorLayoutAttr( + dyn_cast(prefetchLayout.get())); + } // Propagate the layout to the source tensor descriptor. 
propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); } @@ -617,69 +642,96 @@ void LayoutInfoPropagation::visitUpdateNdOffsetOp( void LayoutInfoPropagation::visitDpasOp( xegpu::DpasOp dpas, ArrayRef operands, ArrayRef results) { - VectorType aTy = dpas.getLhsType(); - VectorType bTy = dpas.getRhsType(); - - auto uArch = getUArch(getChipStr(dpas).value_or("")); - const int subgroupSize = uArch->getSubgroupSize(); - const auto *uArchInstruction = - dyn_cast(uArch->getInstruction( - xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc)); - - const unsigned dataALen = aTy.getShape().front(); - auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); - const int maxALen = - xegpu::getLargestDivisor(dataALen, ArrayRef(supportedALen)); - if (maxALen == -1) - dpas.emitWarning( - "No suitable instruction multiple found for the given shape."); - - const unsigned dataBLen = bTy.getShape().back(); - auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); - const int maxBLen = - xegpu::getLargestDivisor(dataBLen, ArrayRef(supportedBLen)); - if (maxBLen == -1) - dpas.emitWarning( - "No suitable instruction multiple found for the given shape."); - SmallVector instDataA = {maxALen, subgroupSize}; - SmallVector instDataB = {subgroupSize, maxBLen}; LayoutInfo dpasALayout; LayoutInfo dpasBLayout; LayoutInfo dpasCLayout; - if (layoutKind == LayoutKind::InstData) { - dpasALayout = - LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA)); - dpasBLayout = - LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB)); + xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr(); + if (hasAnchorLayout(anchorLayoutC)) { + xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr(); + xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr(); + assert(hasAnchorLayout(anchorLayoutA) && + "Expected anchor layout for DPAS A operand."); + assert(hasAnchorLayout(anchorLayoutB) && + "Expected anchor layout for DPAS B operand."); + dpasALayout = LayoutInfo(anchorLayoutA); + dpasBLayout = LayoutInfo(anchorLayoutB); + dpasCLayout = LayoutInfo(anchorLayoutC); + } else { - dpasALayout = getSIMTLayoutInfoForDPASOperand( - aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA()); - dpasBLayout = getSIMTLayoutInfoForDPASOperand( - bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB()); - } - propagateIfChanged(operands[0], operands[0]->meet(dpasALayout)); - propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout)); - if (operands.size() > 2) { - VectorType cTy = dpas.getAccType(); - const unsigned dataCLen = bTy.getShape().back(); - auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); - const int maxCLen = - xegpu::getLargestDivisor(dataCLen, ArrayRef(supportedCLen)); - if (maxCLen == -1) + VectorType aTy = dpas.getLhsType(); + VectorType bTy = dpas.getRhsType(); + + auto uArch = getUArch(getChipStr(dpas).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + const auto *uArchInstruction = + dyn_cast(uArch->getInstruction( + xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc)); + + const unsigned dataALen = aTy.getShape().front(); + auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); + const int maxALen = + xegpu::getLargestDivisor(dataALen, ArrayRef(supportedALen)); + if (maxALen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); - SmallVector instDataC = {maxALen, maxCLen}; - if (layoutKind == 
LayoutKind::InstData) - dpasCLayout = - LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC)); - else - dpasCLayout = getSIMTLayoutInfoForDPASOperand( - cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB()); + const unsigned dataBLen = bTy.getShape().back(); + auto supportedBLen = uArchInstruction->getSupportedN(bTy.getElementType()); + + const int maxBLen = + xegpu::getLargestDivisor(dataBLen, ArrayRef(supportedBLen)); + + if (maxBLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector instDataA = {maxALen, subgroupSize}; + SmallVector instDataB = {subgroupSize, maxBLen}; + + if (layoutKind == LayoutKind::InstData) { + dpasALayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA)); + dpasBLayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB)); + } else { + dpasALayout = getSIMTLayoutInfoForDPASOperand( + aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA()); + dpasBLayout = getSIMTLayoutInfoForDPASOperand( + bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB()); + } + if (operands.size() > 2) { + VectorType cTy = dpas.getAccType(); + if (layoutKind == LayoutKind::InstData) { + const unsigned dataCLen = bTy.getShape().back(); + auto supportedCLen = + uArchInstruction->getSupportedN(bTy.getElementType()); + const int maxCLen = xegpu::getLargestDivisor( + dataCLen, ArrayRef(supportedCLen)); + if (maxCLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector instDataC = {maxALen, maxCLen}; + dpasCLayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC)); + } else + dpasCLayout = getSIMTLayoutInfoForDPASOperand( + cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB()); + + dpas.setAnchorLayoutCdAttr( + dyn_cast(dpasCLayout.get())); + } + dpas.setAnchorLayoutAAttr( + dyn_cast(dpasALayout.get())); + dpas.setAnchorLayoutBAttr( + dyn_cast(dpasBLayout.get())); + } + + propagateIfChanged(operands[0], operands[0]->meet(dpasALayout)); + propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout)); + if (operands.size() > 2) { propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout)); } } @@ -689,43 +741,51 @@ void LayoutInfoPropagation::visitStoreNdOp( xegpu::StoreNdOp store, ArrayRef operands, ArrayRef results) { - auto uArch = getUArch(getChipStr(store).value_or("")); - const auto *uArchInstruction = - dyn_cast( - uArch->getInstruction( - xegpu::uArch::InstructionKind::Subgroup2DBlockStore)); - VectorType dataTy = store.getValueType(); - auto blockWHC = uArchInstruction->getBlockWidthHeightCount( - store.getValueType().getElementType()); - if (!blockWHC) - store.emitWarning("No known block params found for the element type."); - auto [bWidth, bHeight, bCount] = blockWHC.value(); - SmallVector instData; - int instWidth = xegpu::getLargestDivisor( - static_cast(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, - bCount); - if (instWidth == -1) - store.emitWarning( - "No suitable instruction multiple found for the given shape."); - if (dataTy.getRank() == 1) - instData = {instWidth}; - else { - int instHeight = xegpu::getLargestDivisor( - static_cast(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); - if (instHeight == -1) + LayoutInfo storeLayout; + xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + storeLayout = LayoutInfo(anchorLayout); + } else { + auto uArch = getUArch(getChipStr(store).value_or("")); + const auto 
*uArchInstruction = + dyn_cast( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockStore)); + VectorType dataTy = store.getValueType(); + auto blockWHC = uArchInstruction->getBlockWidthHeightCount( + store.getValueType().getElementType()); + if (!blockWHC) + store.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector instData; + int instWidth = xegpu::getLargestDivisor( + static_cast(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) store.emitWarning( "No suitable instruction multiple found for the given shape."); - instData = {instHeight, instWidth}; - } + if (dataTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = xegpu::getLargestDivisor( + static_cast(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); + if (instHeight == -1) + store.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } - LayoutInfo storeLayout; - if (layoutKind == LayoutKind::InstData) - storeLayout = - LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData)); - else - storeLayout = - getDefaultSIMTLayoutInfo(store.getValueType(), uArch, - uArchInstruction->getPackedFormatBitSize()); + if (layoutKind == LayoutKind::InstData) + storeLayout = + LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData)); + else + storeLayout = + getDefaultSIMTLayoutInfo(store.getValueType(), uArch, + uArchInstruction->getPackedFormatBitSize()); + store.setAnchorLayoutAttr( + dyn_cast(storeLayout.get())); + } + // Propagate the layout to the value operand. // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); @@ -736,21 +796,31 @@ void LayoutInfoPropagation::visitStoreNdOp( void LayoutInfoPropagation::visitLoadNdOp( xegpu::LoadNdOp load, ArrayRef operands, ArrayRef results) { - LayoutInfo valueLayout = results[0]->getValue(); - // Need the layout of the value to propagate to the tensor descriptor. - if (!valueLayout.isAssigned()) - return; - LayoutInfo tensorDescLayout = valueLayout; - // LoadNdOp has the transpose effect. However, at the stage of this analysis - // this effect is not expected and should be abstracted away. Emit a - // warning. - if (auto transpose = load.getTranspose()) { - load.emitWarning("Transpose effect is not expected for LoadNdOp at " - "LayoutInfoPropagation stage."); - tensorDescLayout = valueLayout.transpose(transpose.value()); + + LayoutInfo loadLayout; + xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + loadLayout = LayoutInfo(anchorLayout); + } else { + + LayoutInfo valueLayout = results[0]->getValue(); + // Need the layout of the value to propagate to the tensor descriptor. + if (!valueLayout.isAssigned()) + return; + loadLayout = valueLayout; + // LoadNdOp has the transpose effect. However, at the stage of this analysis + // this effect is not expected and should be abstracted away. Emit a + // warning. + if (auto transpose = load.getTranspose()) { + load.emitWarning("Transpose effect is not expected for LoadNdOp at " + "LayoutInfoPropagation stage."); + loadLayout = valueLayout.transpose(transpose.value()); + } + load.setAnchorLayoutAttr( + dyn_cast(loadLayout.get())); } // Propagate the new layout to the tensor descriptor operand. 
- propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); + propagateIfChanged(operands[0], operands[0]->meet(loadLayout)); } /// For vector::TransposeOp, the layout of the result is transposed and @@ -840,37 +910,49 @@ void LayoutInfoPropagation::visitVectorBitcastOp( void LayoutInfoPropagation::visitLoadGatherOp( xegpu::LoadGatherOp load, ArrayRef operands, ArrayRef results) { - // The layout is strictly determined by the payload type. - auto payloadTy = dyn_cast(load.getValueType()); - if (!payloadTy) { - load.emitWarning("Not propagating, non-vector payload supplied."); - return; - } - auto uArch = getUArch(getChipStr(load).value_or("")); - const int subgroupSize = uArch->getSubgroupSize(); - SmallVector instData{subgroupSize}; - if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) - instData.push_back(chunkSize); - else if (auto srcTdescTy = - dyn_cast(load.getSourceType())) { - if (srcTdescTy.getChunkSizeAsInt() > 1) + + LayoutInfo loadLayout; + LayoutInfo maskLayout; + xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + loadLayout = LayoutInfo(anchorLayout); + maskLayout = loadLayout; + } else { + + // The layout is strictly determined by the payload type. + auto payloadTy = dyn_cast(load.getValueType()); + if (!payloadTy) { + load.emitWarning("Not propagating, non-vector payload supplied."); + return; + } + auto uArch = getUArch(getChipStr(load).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + SmallVector instData{subgroupSize}; + if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) instData.push_back(chunkSize); - } - LayoutInfo layout; - if (layoutKind == LayoutKind::InstData) - layout = LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData)); - else - layout = getDefaultSIMTLayoutInfo(payloadTy, uArch, - uArch->getGeneralPackedFormatBitSize(), - /*scattered*/ true); - - // Mask operand should have 1D default layout. - LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); + else if (auto srcTdescTy = + dyn_cast(load.getSourceType())) { + if (srcTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + + if (layoutKind == LayoutKind::InstData) + loadLayout = + LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData)); + else + loadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(), + /*scattered*/ true); + + // Mask operand should have 1D default layout. + maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); + load.setAnchorLayoutAttr( + dyn_cast(loadLayout.get())); + } // Propagate the new layout to the tensor descriptor operand. if (isa(load.getSourceType())) - propagateIfChanged(operands[0], operands[0]->meet(layout)); + propagateIfChanged(operands[0], operands[0]->meet(loadLayout)); // Propagate the new layout to the mask and optional offset operand. propagateIfChanged(operands[1], operands[1]->meet(maskLayout)); if (load.getOffsets()) @@ -898,21 +980,26 @@ void LayoutInfoPropagation::visitCreateDescOp( void LayoutInfoPropagation::visitStoreScatterOp( xegpu::StoreScatterOp storeScatter, ArrayRef operands, ArrayRef results) { - // Currently, for 2D StoreScatterOp we expect that the height dimension of - // the tensor descriptor is equal to the subgroup size. This is ensured by - // the op verifier. 
- auto payloadTy = dyn_cast(storeScatter.getValueType()); - if (!payloadTy) { - storeScatter.emitWarning("Not propagating, non-vector payload supplied."); - return; - } - LayoutInfo payloadLayout; - auto uArch = getUArch(getChipStr(storeScatter).value_or("")); - const int subgroupSize = uArch->getSubgroupSize(); - if (auto layout = storeScatter.getLayoutAttr()) { - payloadLayout = LayoutInfo(layout); + LayoutInfo payloadLayout; + LayoutInfo maskLayout; + xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + payloadLayout = LayoutInfo(anchorLayout); + maskLayout = payloadLayout; } else { + // Currently, for 2D StoreScatterOp we expect that the height dimension of + // the tensor descriptor is equal to the subgroup size. This is ensured by + // the op verifier. + auto payloadTy = dyn_cast(storeScatter.getValueType()); + if (!payloadTy) { + storeScatter.emitWarning("Not propagating, non-vector payload supplied."); + return; + } + + auto uArch = getUArch(getChipStr(storeScatter).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + if (layoutKind == LayoutKind::InstData) { SmallVector instData{subgroupSize}; if (auto chunkSize = storeScatter.getChunkSize().value_or(0); @@ -936,10 +1023,13 @@ void LayoutInfoPropagation::visitStoreScatterOp( payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(), /*scattered=*/true); } - } - LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); + maskLayout = + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); + + storeScatter.setAnchorLayoutAttr( + dyn_cast(payloadLayout.get())); + } // Propagate the payload operand layout propagateIfChanged(operands[0], operands[0]->meet(payloadLayout)); // Propagate the destination (if tdesc) operand layout diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index b0b748c3409c3..c644f784606e9 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } - auto layout = op.getLayoutAttr(); + auto layout = op.getAnchorLayoutAttr(); if (layout) layout = layout.dropInstData(); @@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets SmallVector convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); - auto layout = op.getLayoutAttr(); + auto layout = op.getAnchorLayoutAttr(); if (layout) layout = layout.dropInstData(); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 4fe35a16b3994..572e5442760bc 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -147,7 +147,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { // check for "permament" layout only after "temporary" layout name lookup // for backward compatibility if (auto loadGatherOp = dyn_cast(defOp)) - return loadGatherOp.getLayoutAttr(); + return loadGatherOp.getAnchorLayoutAttr(); } if (auto arg = dyn_cast(value)) { @@ -178,7 +178,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { // check for "permament" layout only after "temporary" layout name lookup if (auto storeScatterOp = dyn_cast(op)) - if (auto layout = storeScatterOp.getLayoutAttr()) + if (auto layout = storeScatterOp.getAnchorLayoutAttr()) 
return layout; return getDistributeLayoutAttr(opr.get()); @@ -193,7 +193,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, xegpu::DistributeLayoutAttr candidate = layout; if (auto loadOp = dyn_cast(owner)) { - if (auto perm = loadOp.getLayoutAttr()) + if (auto perm = loadOp.getAnchorLayoutAttr()) candidate = perm; } @@ -211,7 +211,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, if (auto storeOp = dyn_cast(owner)) { if (idx == 0) { - if (auto perm = storeOp.getLayoutAttr()) + if (auto perm = storeOp.getAnchorLayoutAttr()) candidate = perm; } } diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index c31ef323a94d2..62a33a4797d2b 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -5,14 +5,14 @@ // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, anchor_layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.module @test { func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { @@ -46,7 +46,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> @@ -85,7 +85,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<12x32xf16>, 
!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> @@ -113,9 +113,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index eb004932af4be..d1bee47dd6d37 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -6,14 +6,14 @@ gpu.module @test { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, anchor_layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func 
@dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -32,7 +32,8 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { -// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout, anchor_layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} + func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> @@ -46,8 +47,8 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array}> {layout_result_0 = #xegpu.layout} : -// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -108,7 +109,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] {layout_result_0 = #xegpu.layout} +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -135,7 +136,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> @@ -183,9 +184,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: 
%[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -204,7 +205,7 @@ gpu.module @test { // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -220,10 +221,10 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-SAME <{anchor_layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -239,11 +240,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout}> +// CHECK: %[[LOAD_VEC:.*]] = 
xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-SAME <{anchor_layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -256,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( -// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16> -// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout> -> vector<16x16xi16> // CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x16xi16> to vector<8x16xf16> @@ -281,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<16x8xi32> to vector<16x16xf16> @@ -302,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> @@ -339,9 +340,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd 
%[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { @@ -362,9 +363,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> -// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{anchor_layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -385,11 +386,11 @@ gpu.module @test { // CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> // CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, anchor_layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> 
// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -397,7 +398,7 @@ gpu.module @test { // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> // CHECK-NEXT: } {layout_result_2 = #xegpu.layout} // CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index @@ -425,11 +426,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -455,11 +456,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -539,7 +540,7 @@ gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> 
: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @prefetch_2d(%arg0: memref<256x256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> @@ -552,7 +553,7 @@ gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> func.func @prefetch_1d(%arg0: memref<256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> @@ -599,7 +600,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> @@ -621,7 +622,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 3760737cf51f5..171cadeeaeaf9 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -286,7 +286,7 @@ gpu.module @test_distribution { // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<2.550000e+01> : vector<8xf16> // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint, layout = #xegpu.layout}> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{anchor_layout = #xegpu.layout, chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> // CHECK-SAME: {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout, // CHECK-SAME: layout_operand_3 = #xegpu.layout} // 
CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> @@ -554,7 +554,7 @@ gpu.module @test_distribution { %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> - // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> + // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{anchor_layout = #xegpu.slice<#xegpu.layout, dims = [0]>, chunk_size = 1 : i64}> // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> From 0482234e56256ac0824a4fb85bac492b50080fdc Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Mon, 24 Nov 2025 01:58:41 +0000 Subject: [PATCH 03/28] adding documentation --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 192 ++++++++++++++---- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 20 +- 3 files changed, 165 insertions(+), 49 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 70c61a445e8ae..344fb23ba7b8d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -253,6 +253,22 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { It issues an instruction to prefetch a block of data from continuous memory regions to each level of the cache based on their cache policy. + Arguments: + - `TensorDesc`: A tensor descriptor specifying the base nd-region of + memory and tensor tile to be prefetched. + + - `offsets`: index values representing per-dimension offsets from the + base position encoded in `TensorDesc`. It is encoded via "offsets" + and "const_offsets". + + - `l1_hint`, `l2_hint`, `l3_hint`: [optional] An cache-hint attribute + indicating the desired behavior at the L1, L2, and L3 cache levels. + + - `anchor_layout`: [optional] An attribute that identifies the operation + as an anchor, enabling users to assign a layout that governs distribution + at the subgroup and/or work-item level. Only valid at workgroup and subgroup + level. + Example: ```mlir xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint, @@ -326,16 +342,37 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ a block of data from memory to register. It takes a set of optional cache hints for each level of cache, L1, L2 and L3. If hardware does not have a correspoding cache, Corresponding cache hint attribute will be masked. - VNNI transformation is an hardware feature for Intel GPU, which is used to - do data packing during the load for B operand of matrix operation, if - the bit width of the data type is less then 32 bits, e.g., fp16. And - transpose is another Intel hardware feature, which will do transpose - operation when loading the data if the bit width of the data type is - fp32 or fp64. It implies that vnni and transpose cannot exit at the - same time. It is only available to 1D or 2D blocked tensor_desc. + + On Intel GPUs, hardware-supported packing rearranges data elements during + the load of the B operand when the element bit-width is less than 32 bits + (for example, fp16). 
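For instance, a packed load of a 16-bit B tile could look like the sketch below (illustrative only, not taken from this patch; `%tdesc` is a placeholder, and the result shape assumes the usual VNNI factor of 32 / bit-width, i.e. 2 for fp16):

```mlir
// Hypothetical 16x16xf16 block descriptor %tdesc. Packing folds pairs of
// rows into the innermost dimension (VNNI factor 2), so the 16x16 tile is
// returned as an 8x16x2 vector.
%b = xegpu.load_nd %tdesc {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
```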
The transpose feature reorders data during the load + when the element type is fp32 or fp64. These two features are mutually + exclusive and shall not be enabled simultaneously. Both features support only + 2D blocked tensor_desc. In SIMT mode, result vector represents the data to be loaded by each work-item. + Arguments: + + - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory + and the tensor tile to be loaded. + + - `offsets`: Index values representing per-dimension offsets from the base position + encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`. + + - `packed`: [optional] A unit attribute indicating that packing is applied + during the load when supported by the hardware. Only valid at lane level. + + - `transpose`: [optional] An attribute describing a hardware-supported transpose + to be applied during the load. Only valid at Lane level. + + - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the + desired behavior at the L1, L2, and L3 cache levels. + + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, + enabling users to assign a layout that governs distribution at the subgroup and/or + work-item level. Only valid at workgroup and subgroup levels. + Example 1: ```mlir xegpu.load_nd %1 {transpose = [1, 0], @@ -391,7 +428,6 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ return getTensorDescType().getShape(); } - }]; let assemblyFormat = [{ @@ -432,6 +468,23 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ In SIMT mode, the input vector represents the data to be stored by each work-item. + Arguments: + + - `value`: A vector value representing the tensor tile to be stored. + + - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory and + the tensor tile to be stored. + + - `offsets`: Index values representing per-dimension offsets from the base position + encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`. + + - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the + desired behavior at the L1, L2, and L3 cache levels. + + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, + enabling users to assign a layout that governs distribution at the subgroup and/or + work-item level. Only valid at workgroup and subgroup levels. + Example 1: ```mlir xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, @@ -568,8 +621,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { It accepts the following parameters: Arguments: + - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened memory object. + - `offsets`: a vector containing offsets of each access point. Its size is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, implying each element in the vector corresponds to a work-item (SIMT lane) @@ -668,17 +723,25 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { it works on scattered TensorDesc instead. Arguments: + - `source`: represents the memory region to be loaded from, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. tensor_desc cannot be used in SIMT mode. + - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size or 1 in SIMT mode. scalar offset is also valid for SIMT mode. 
- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. - - `offset_align_byte`: required if `source` is a pointer. If `source` is not a pointer, + + - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. + + - `offset_align_byte`: [optional] required if `source` is a pointer. If `source` is not a pointer, it is not allowed. Represents the alignment in bytes of each offset in offsets. + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, + enabling users to assign a layout that governs distribution at the subgroup and/or + work-item level. Only valid at workgroup and subgroup levels. + Example 1: ```mlir xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint, @@ -727,7 +790,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$offset_align_byte); + OptionalAttr:$offset_align_byte, + OptionalAttr:$anchor_layout); let extraClassDeclaration = extraBaseClassDeclaration # [{ Type getSourceType() { @@ -779,18 +843,27 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { each work-item. If size is not 1, size should be equal to the chunk size, Arguments: + - `source`: represents the memory region to be loaded from, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. tensor_desc cannot be used in SIMT mode. + - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + - `mask`: is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. scalar mask is also valid for SIMT mode. - - `chunk_size`: (optional) represents contiguous number of elements to load from per work item. - - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. + + - `chunk_size`: [optional] represents contiguous number of elements to load from per work item. + + - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. + + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, + enabling users to assign a layout that governs distribution at the subgroup and/or + work-item level. Only valid at workgroup and subgroup levels. Results: - `res`: represents loaded data @@ -926,19 +999,30 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { each work-item. If size is not 1, size should be equal to the chunk size. Arguments: + - `value`: represents the data to be stored. + - `dest`: represents the memory region to be stored to, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. tensor_desc cannot be used in SIMT mode. + - `offsets`: represents offsets from dest. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + - `mask`: is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. scalar mask is also valid for SIMT mode. 
- - `chunk_size`: (optional) represents contiguous number of elements to store to per work item. - - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. + + - `chunk_size`: [optional] represents contiguous number of elements to store to per work item. + + - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. + + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, + enabling users to assign a layout that governs distribution at the subgroup and/or + work-item level. Only valid at workgroup and subgroup levels. + Example 1: ```mlir @@ -1115,22 +1199,28 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] size, B of `kxn` size, and accumulate on matrix C of `mxn` to the same size matrix , `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16 data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`, - and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS - also requires A and B to be loaded with the required data layout. Specially, - VNNI layout is required for B operand. It is achieved via adding `packed` - attribute to the `load_nd` operator. Due to the VNNI transformation, B operands - can be represented as a 3D vector, with the last dimension representing the VNNI - factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>` - can be represented as `B: vector<8x16x2xf16>`. + and `C/D: vector<8x16xf32>`. In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result, which are represented as 1D vectors. Please refer to [OpenCL Intel extentions] (https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html) for more details about the fragment distribution. - Note: on PVC, the hardware can perform load with VNNI transformation when data - element type is 16-bit or lower precision, taking 2 or 4 elements from - the first dimension and inserted into the newly added innermost dimension. + Arguments: + + - `lhs`: A vector value representing the left-hand-side matrix tile (A) participating in the + matrix multiply. + + - `rhs`: A vector value representing the right-hand-side matrix tile (B). + + - `acc`: [optional] A vector value representing the accumulator matrix tile (C). When present, the + result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero. + + - `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this + operation as anchors for operands A, B, and the accumulator/result, enabling users to assign layouts + that govern distribution at the subgroup and/or work-item level. Only valid at workgroup and subgroup + level. + }]; let arguments = (ins @@ -1187,13 +1277,31 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, has the same shape with `TensorDesc`, and is used to enable or disable specific data points of the `TensorDesc`. The `value` operand represents the new value to be applied during the modification. + Arguments: + - `kind`: An attribute that specifies the atomic operation to be performed + (e.g., add, min, max, exchange, etc.). + + - `tensorDesc`: A `TensorDesc` describing the memory region on which the atomic + read-modify-write is performed. + + - `mask`: A predicate mask with the same shape as `tensorDesc`. 
Only elements + with a true (non-zero) mask value participate in the atomic operation; + masked-out elements are not modified. + + - `value`: The input values used by the atomic operation. It must have the same + shape and element type as `tensorDesc` and `result`. + + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, + enabling users to assign a layout that governs distribution at the subgroup + and/or work-item level. Only valid at workgroup and subgroup levels. }]; let arguments = (ins AtomicRMWKindAttr:$kind, XeGPU_TensorDesc:$tensorDesc, XeGPU_MaskType:$mask, - XeGPU_ValueType:$value); + XeGPU_ValueType:$value, + OptionalAttr:$anchor_layout); let results = (outs XeGPU_ValueType:$result); @@ -1275,6 +1383,13 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is lowered to WI level because that is the end result of all distributions. + Arguments: + - `source`: The input vector whose data is to be redistributed. The source and + result types must match. + - `input_layout`: The layout attribute describing the current distribution of `source` + across subgroups and/or work-items. + - `target_layout`: The layout attribute describing the desired distribution of the result + across subgroups and/or work-items. }]; let arguments = (ins XeGPU_VectorType: $source, DistributeLayoutAttr: $input_layout, @@ -1342,12 +1457,13 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, Arguments: - `mem_desc`: the memory descriptor identifying the SLM region. - `offsets`: the coordinates within the matrix to read from. - - `subgroup_block_io`: [optional] An attribute indicating that the operation can be - lowered to a subgroup block load. When this attribute is present, - the offsets are subgroup-uniform across all lanes. - - `anchor_layout`: [optional] An attribute for guiding distributions among - subgroups and/or work-items. It currently can accept either - LayoutAttr or SliceAttr. + - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered + to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform + across all lanes. Only used on subgroup and lane level. + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling + users to assign a layout that governs distribution at the subgroup and/or work-item level. + Only valid at workgroup and subgroup levels. + Results: - `res`: the matrix elements loaded from SLM. }]; @@ -1393,12 +1509,12 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, - `mem_desc`: the memory descriptor specifying the SLM region. - `offsets`: the coordinates within the matrix where the data will be written. - `data`: the values to be stored in the matrix. - - `subgroup_block_io`: [optional] An attribute indicating that the operation can be - lowered to a subgroup block store. When this attribute is present, - the offsets are subgroup-uniform across all lanes. - - `anchor_layout`: [optional] An attribute for guiding distributions among - subgroups and/or work-items. It currently can accept either - LayoutAttr or SliceAttr. + - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered + to a subgroup block load. 
When this attribute is present, the offsets are subgroup-uniform + across all lanes. Only used on subgroup and lane level. + - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling + users to assign a layout that governs distribution at the subgroup and/or work-item level. + Only valid at workgroup and subgroup levels. }]; let builders = [ OpBuilder<(ins "Value" : $data, "TypedValue": $mem_desc, diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 3240c0f40ce58..29daab384bf7f 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -828,7 +828,7 @@ void PrefetchOp::build(OpBuilder &builder, OperationState &state, Value source, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, source, Value(), l1_hint, l2_hint, l3_hint, - IntegerAttr{}); + IntegerAttr{}, /*anchor_layout=*/nullptr); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 6d45a51ab0267..3b5207dd92285 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -387,7 +387,7 @@ class LayoutInfoPropagation ArrayRef operands, ArrayRef results); - bool hasAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout); + bool hasParamsOfLayoutKind(xegpu::DistributeLayoutAttr anchorLayout); public: LayoutInfoPropagation(DataFlowSolver &solver, @@ -477,7 +477,7 @@ LogicalResult LayoutInfoPropagation::visitOperation( return success(); } -bool LayoutInfoPropagation::hasAnchorLayout( +bool LayoutInfoPropagation::hasParamsOfLayoutKind( xegpu::DistributeLayoutAttr anchorLayout) { if (anchorLayout == nullptr) { return false; @@ -497,7 +497,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( LayoutInfo prefetchLayout; xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr(); - if (hasAnchorLayout(anchorLayout)) { + if (hasParamsOfLayoutKind(anchorLayout)) { prefetchLayout = LayoutInfo(anchorLayout); } else { // Here we assign the default layout to the tensor descriptor operand of @@ -648,12 +648,12 @@ void LayoutInfoPropagation::visitDpasOp( LayoutInfo dpasCLayout; xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr(); - if (hasAnchorLayout(anchorLayoutC)) { + if (hasParamsOfLayoutKind(anchorLayoutC)) { xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr(); xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr(); - assert(hasAnchorLayout(anchorLayoutA) && + assert(hasParamsOfLayoutKind(anchorLayoutA) && "Expected anchor layout for DPAS A operand."); - assert(hasAnchorLayout(anchorLayoutB) && + assert(hasParamsOfLayoutKind(anchorLayoutB) && "Expected anchor layout for DPAS B operand."); dpasALayout = LayoutInfo(anchorLayoutA); dpasBLayout = LayoutInfo(anchorLayoutB); @@ -743,7 +743,7 @@ void LayoutInfoPropagation::visitStoreNdOp( LayoutInfo storeLayout; xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr(); - if (hasAnchorLayout(anchorLayout)) { + if (hasParamsOfLayoutKind(anchorLayout)) { storeLayout = LayoutInfo(anchorLayout); } else { auto uArch = getUArch(getChipStr(store).value_or("")); @@ -799,7 +799,7 @@ void LayoutInfoPropagation::visitLoadNdOp( LayoutInfo loadLayout; xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); - if 
(hasAnchorLayout(anchorLayout)) { + if (hasParamsOfLayoutKind(anchorLayout)) { loadLayout = LayoutInfo(anchorLayout); } else { @@ -914,7 +914,7 @@ void LayoutInfoPropagation::visitLoadGatherOp( LayoutInfo loadLayout; LayoutInfo maskLayout; xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); - if (hasAnchorLayout(anchorLayout)) { + if (hasParamsOfLayoutKind(anchorLayout)) { loadLayout = LayoutInfo(anchorLayout); maskLayout = loadLayout; } else { @@ -984,7 +984,7 @@ void LayoutInfoPropagation::visitStoreScatterOp( LayoutInfo payloadLayout; LayoutInfo maskLayout; xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr(); - if (hasAnchorLayout(anchorLayout)) { + if (hasParamsOfLayoutKind(anchorLayout)) { payloadLayout = LayoutInfo(anchorLayout); maskLayout = payloadLayout; } else { From d1652af58eb344251976bc7a7379dff7937495a3 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Tue, 25 Nov 2025 23:44:57 +0000 Subject: [PATCH 04/28] address feedback and add more documentation --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 296 ++++++++++++------ mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 - .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 16 +- 3 files changed, 200 insertions(+), 114 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 344fb23ba7b8d..f6b7dc0384e52 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -76,10 +76,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface For the case of dynamic memrefs or pointer, the shape and layout information of the memory region should be explicitly passed via `shape` and `strides` parameters. - - `offsets`: index values represents offsets from the "source" at the each dimension + - `offsets`: [optional] index values represents offsets from the "source" at the each dimension at which the subview of the target memory will be created. It is encoded via "offsets" and "const_offsets", such that it can accept various forms, such as, - operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]). + operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]). Offsets is optional and may be set at load_nd, store_nd, and prefetch_nd. - `shape`: the shape information of the memory region pointed by the "source". It is typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. @@ -253,28 +253,32 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { It issues an instruction to prefetch a block of data from continuous memory regions to each level of the cache based on their cache policy. + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. + Arguments: - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory and tensor tile to be prefetched. - - `offsets`: index values representing per-dimension offsets from the + - `offsets`: [optional] index values representing per-dimension offsets from the base position encoded in `TensorDesc`. It is encoded via "offsets" and "const_offsets". - `l1_hint`, `l2_hint`, `l3_hint`: [optional] An cache-hint attribute indicating the desired behavior at the L1, L2, and L3 cache levels. - - `anchor_layout`: [optional] An attribute that identifies the operation - as an anchor, enabling users to assign a layout that governs distribution - at the subgroup and/or work-item level. 
Only valid at workgroup and subgroup - level. + - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand. + Only valid at the workgroup and subgroup levels. - Example: + Example (Workgroup level): ```mlir - xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint, + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + xegpu.prefetch_nd %tdesc[%c0, %c1] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<8x16xf16> + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout } + : !xegpu.tensor_desc<32x256xf16> ``` }]; @@ -350,7 +354,10 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ exclusive and shall not be enabled simultaneously. Both features support only 2D blocked tensor_desc. - In SIMT mode, result vector represents the data to be loaded by each work-item. + At lane level, result vector represents the data to be loaded by each lane. + + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. Arguments: @@ -369,19 +376,18 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the desired behavior at the L1, L2, and L3 cache levels. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, - enabling users to assign a layout that governs distribution at the subgroup and/or - work-item level. Only valid at workgroup and subgroup levels. + - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the result of the load (they are identical). Only valid at workgroup and subgroup levels. - Example 1: + Example 1 (Workgroup level): ```mlir xegpu.load_nd %1 {transpose = [1, 0], l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout} + : !xegpu.tensor_desc<32x256xf32> -> vector<32x256xf32> ``` - Example 2 (SIMT mode): + Example 2 (lane level): ```mlir xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> @@ -466,7 +472,10 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ Corresponding cache hint attribute will be masked. It is only available to 1D or 2D blocked tensor_desc. - In SIMT mode, the input vector represents the data to be stored by each work-item. + At lane level, the input vector represents the data to be stored by each lane. + + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. Arguments: @@ -481,18 +490,18 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the desired behavior at the L1, L2, and L3 cache levels. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, - enabling users to assign a layout that governs distribution at the subgroup and/or - work-item level. Only valid at workgroup and subgroup levels. + - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as + the value to be stored (they are identical). Only valid at workgroup and subgroup levels. 
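For comparison, a subgroup-level form of the anchored store might look like the sketch below (illustrative, not part of this patch; it mirrors the lane_layout/lane_data form used by the in-tree propagate-layout tests, and `%val`/`%tdesc` are placeholder names):

```mlir
// At subgroup level the anchor layout describes the per-lane distribution
// of both the stored value and the tensor descriptor.
xegpu.store_nd %val, %tdesc <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
```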
- Example 1: + Example 1 (Workgroup level): ```mlir xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout} + : vector<32x256xf16>, !xegpu.tensor_desc<32x256xf16> ``` - Example 2 (SIMT mode): + Example 2 (lane level): ```mlir xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, @@ -617,7 +626,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc" is for creating continuous subviews, "create_tdesc" is for creating non-continuous - (scattered) subviews, allowing each work-item in a subgroup specifying their own offset. + (scattered) subviews, allowing each lane in a subgroup specifying their own offset. It accepts the following parameters: Arguments: @@ -627,13 +636,12 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { - `offsets`: a vector containing offsets of each access point. Its size is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, - implying each element in the vector corresponds to a work-item (SIMT lane) - in the subgroup. + implying each element in the vector corresponds to a SIMT lane in the subgroup. Results: - `res`: scattered tensor descriptor - The first dimension of the result TensorDesc corresponds to work-items, so it should + The first dimension of the result TensorDesc corresponds to lanes, so it should match the dimension of offsets. It may also has a second dimension corresponding to the chunk_size if the chunk size is larger than 1. @@ -722,35 +730,39 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { As compared to prefetch_nd, which works on non-scattered TensorDesc, it works on scattered TensorDesc instead. + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. + Arguments: - `source`: represents the memory region to be loaded from, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. - tensor_desc cannot be used in SIMT mode. + tensor_desc cannot be used at lane level. - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size - or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + or 1 at lane level. scalar offset is also valid for lane level. - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. - `offset_align_byte`: [optional] required if `source` is a pointer. If `source` is not a pointer, it is not allowed. Represents the alignment in bytes of each offset in offsets. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, - enabling users to assign a layout that governs distribution at the subgroup and/or - work-item level. Only valid at workgroup and subgroup levels. + - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` or `offsets` + operand. Only valid at workgroup and subgroup levels. 
- Example 1: + Example 1 (Workgroup level): ```mlir xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf16> + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout + } + : !xegpu.tensor_desc<256xf16> ``` - Example 2: + Example 2 (lane level): A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc". The source operand could be a raw pointer (ui64, ui32, i64, i32). @@ -764,8 +776,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { : memref<1024xf32>, vector<4xindex> ``` - Example 3 (SIMT mode): - SIMT mode only accepts the offsets variant. + Example 3 (lane level): + lane level only accepts the offsets variant. ```mlir xegpu.prefetch %0[%1] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, @@ -773,8 +785,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { : memref<256xf32>, vector<1xindex> ``` - Example 4 (SIMT mode): - SIMT mode only accepts the offsets variant. + Example 4 (lane level): + lane level only accepts the offsets variant. ```mlir xegpu.prefetch %0[%1] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, @@ -831,63 +843,67 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { let summary = "load a set of scattered data points from memory."; - let description = [{ It (aka. load) load data per each work-item. The output + let description = [{ It (aka. load) load data per each lane. The output describes the data being loaded at the subgroup level, so its size is - consistent with the number of work-items in a subgroup. When the chunk size + consistent with the number of lanes in a subgroup. When the chunk size is larger than 2, the output vector is a 2D vector, with dim-0 correspoding - to work-items, and dim-1 corresponding to the chunk size loaded by each work-item. + to lanes, and dim-1 corresponding to the chunk size loaded by each lane. The mask operand masks out memory access so that it is safe to pass out-of-boundary - addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. + addresses/offsets as long as they are masked. Each mask element applies to one lane. + + In lane level, the result is a 1D vector that represents the data to be loaded by + each lane. If size is not 1, size should be equal to the chunk size. - In SIMT mode, the result is a 1D vector that represents the data to be loaded by - each work-item. If size is not 1, size should be equal to the chunk size, + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. Arguments: - `source`: represents the memory region to be loaded from, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. - tensor_desc cannot be used in SIMT mode. + tensor_desc cannot be used at lane level. - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size - or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + or 1 at lane level. scalar offset is also valid for lane level. - `mask`: is a vector of `i1` type, which is used to mask out the memory access. - mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. 
- scalar mask is also valid for SIMT mode. + mask is a vector of size equal to the subgroup size, or 1 at lane level. + scalar mask is also valid for lane level. - `chunk_size`: [optional] represents contiguous number of elements to load from per work item. - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, - enabling users to assign a layout that governs distribution at the subgroup and/or - work-item level. Only valid at workgroup and subgroup levels. + - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the result + of load. Only valid at workgroup and subgroup levels. Results: - `res`: represents loaded data - Example 1: + Example 1 (Workgroup level): ```mlir %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, - vector<16xi1> -> vector<16xf32> + l3_hint = #xegpu.cache_hint}, + anchor_layout = #xegpu.layout> + : !xegpu.tensor_desc<256xf32, #xegpu.scatter_tdesc_attr>, + vector<256xi1> -> vector<256xf32> ``` - Example 2: + Example 2 (Subgroup level): ```mlir %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}> + l3_hint = #xegpu.cache_hint}, + anchor_layout = #xegpu.layout> : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x8xf32> ``` - Example 3: + Example 3 (Subgroup level): A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc". The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc @@ -898,12 +914,13 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { %mask = vector.constant_mask [16]: vector<16xi1> %val = xegpu.load %a[%offsets], %mask {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout} : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> ``` - Example 4 (SIMT mode): - SIMT mode only accepts the offsets variant. chunk_size can be inferred from result + Example 4 (lane level): + lane level only accepts the offsets variant. chunk_size can be inferred from result type. In this example, chunk_size is 8. ```mlir %2 = xegpu.load %1[%2], %0 <{l1_hint = #xegpu.cache_hint, @@ -979,7 +996,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::DistributeLayoutAttr": $anchor_layout)> + "xegpu::DistributeLayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -995,8 +1012,11 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is introduced on purpose, making sure users are aware of this implicit transformation. - In SIMT mode, the result is a 1D vector that represents the data to be stored by - each work-item. If size is not 1, size should be equal to the chunk size. + In lane level, the result is a 1D vector that represents the data to be stored by + each lane. If size is not 1, size should be equal to the chunk size. 
+ + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. Arguments: @@ -1005,42 +1025,43 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { - `dest`: represents the memory region to be stored to, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. - tensor_desc cannot be used in SIMT mode. + tensor_desc cannot be used at lane level. - `offsets`: represents offsets from dest. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size - or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + or 1 at lane level. scalar offset is also valid for lane level. - `mask`: is a vector of `i1` type, which is used to mask out the memory access. - mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. - scalar mask is also valid for SIMT mode. + mask is a vector of size equal to the subgroup size, or 1 at lane level. + scalar mask is also valid for lane level. - `chunk_size`: [optional] represents contiguous number of elements to store to per work item. - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, - enabling users to assign a layout that governs distribution at the subgroup and/or - work-item level. Only valid at workgroup and subgroup levels. + - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the value + to be stored. Only valid at workgroup and subgroup levels. - Example 1: + Example 1 (Workgroup level): ```mlir xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}> - : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1> + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout}> + : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.scattered_tdesc_attr<>>, vector<256xi1> ``` - Example 2: + Example 2 (Subgroup level): ```mlir xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}> + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout}> : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> ``` - Example 3: + Example 3 (Subgroup level): A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc". The dest operand could be a raw pointer (uint64_t). @@ -1052,12 +1073,13 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { %mask = vector.constant_mask [16]: vector<16xi1> xegpu.store %val, %a[%offsets], %mask {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} + l3_hint = #xegpu.cache_hint, + anchor_layout = #xegpu.layout} : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> ``` - Example 4 (SIMT mode): - SIMT mode only accepts the offsets variant. chunk_size can be inferred from value + Example 4 (Lane level): + Lane level IR only accepts the offsets variant. chunk_size can be inferred from value type. In this example, chunk_size is 8. 
```mlir xegpu.store %0, %1[%2], %3 <{l1_hint = #xegpu.cache_hint, @@ -1133,7 +1155,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::DistributeLayoutAttr": $anchor_layout)> + "xegpu::DistributeLayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -1148,8 +1170,8 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", the current position in the number of elements. However, `update_nd_offset` is to update the start point of a 2D block, so its offset constains two elements representing the shift in each dimension. `update_offset` is to - update the offset per work-item, so its offsets contains values representing - shifts for each work-item. + update the offset per lane, so its offsets contains values representing + shifts for each lane. Example: ```mlir @@ -1201,11 +1223,14 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`, and `C/D: vector<8x16xf32>`. - In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result, + In lane level code, each lane from a subgroup holds a data fragment for A, B, C and the result, which are represented as 1D vectors. Please refer to [OpenCL Intel extentions] (https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html) for more details about the fragment distribution. + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. + Arguments: - `lhs`: A vector value representing the left-hand-side matrix tile (A) participating in the @@ -1217,10 +1242,26 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero. - `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this - operation as anchors for operands A, B, and the accumulator/result, enabling users to assign layouts - that govern distribution at the subgroup and/or work-item level. Only valid at workgroup and subgroup + operation as anchor for operands A, B, and the accumulator/result, enabling users to assign layouts + that govern distribution at the subgroup and/or lane level. Only valid at workgroup and subgroup level. + Example 1 (Workgroup level): + + ```mlir + %d = xegpu.dpas %a, %b, %c <{ + anchor_layout_a = #xegpu.layout, + anchor_layout_b = #xegpu.layout, + anchor_layout_cd = #xegpu.layout} + : vector<64x128xf16>, vector<128x128xf16>, vector<64x128xf32> -> vector<64x128xf32> + ``` + + Example 2 (Lane level): + + ```mlir + %d = xegpu.dpas %a, %b, %c + : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> + ``` }]; let arguments = (ins @@ -1277,6 +1318,10 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, has the same shape with `TensorDesc`, and is used to enable or disable specific data points of the `TensorDesc`. The `value` operand represents the new value to be applied during the modification. + + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. + Arguments: - `kind`: An attribute that specifies the atomic operation to be performed (e.g., add, min, max, exchange, etc.). 
@@ -1293,7 +1338,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling users to assign a layout that governs distribution at the subgroup - and/or work-item level. Only valid at workgroup and subgroup levels. + and/or lane level. Only valid at workgroup and subgroup levels. }]; let arguments = (ins @@ -1379,17 +1424,29 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> { let summary = "Convert the layout of the input operand"; let description = [{ - `convert_layout` redistribute data across subgroups and/or work-items from the `input_layout` to + `convert_layout` redistribute data across subgroups and/or lanes from the `input_layout` to the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming - scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once + scope, such as workgroup level (wg) or subgroup level (sg) code. This operation is not valid once the IR is lowered to WI level because that is the end result of all distributions. + + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. + Arguments: - `source`: The input vector whose data is to be redistributed. The source and result types must match. - `input_layout`: The layout attribute describing the current distribution of `source` - across subgroups and/or work-items. + across subgroups and/or lanes. - `target_layout`: The layout attribute describing the desired distribution of the result - across subgroups and/or work-items. + across subgroups and/or lanes. + + Example (Subgroup level): + ```mlir + %coop_a = xegpu.convert_layout %a <{ + input_layout = #xegpu.layout, + target_layout = #xegpu.layout}> + : vector<128x128xf16> + ``` }]; let arguments = (ins XeGPU_VectorType: $source, DistributeLayoutAttr: $input_layout, @@ -1427,8 +1484,17 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure, Arguments: - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer. + Results: - `mem_desc` : the memory descriptor. + + Example: + ```mlir + %mdesc = xegpu.create_mem_desc %mref + : memref<4096xi8, 3> + -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout> + ``` + }]; let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source); let results = (outs XeGPU_MemDesc:$mem_desc); @@ -1454,23 +1520,35 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, by the provided 2D `mem_desc`. Only 2D memory descriptors are supported; use the subview operation to obtain a compatible 2D `mem_desc` from a higher-rank descriptor if needed. + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. + Arguments: - `mem_desc`: the memory descriptor identifying the SLM region. - `offsets`: the coordinates within the matrix to read from. - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform across all lanes. Only used on subgroup and lane level. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling - users to assign a layout that governs distribution at the subgroup and/or work-item level. 
+ - `anchor_layout`: [optional] Describes the expected layout of the `mem_desc` operand as well as + the result of load (they are identical). Only valid at workgroup and subgroup levels. Results: - `res`: the matrix elements loaded from SLM. + + Example (Workgroup level): + ```mlir + %c0 = arith.constant 0 : index + %1 = xegpu.load_matrix %0[%c0, %c0] <{ + anchor_layout = #xegpu.layout }> + : !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout> + , index, index -> vector<128x128xf16> + ``` }]; let builders = [ OpBuilder<(ins "Type":$res, "TypedValue": $mem_desc, - "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $anchor_layout)>, + "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $layout)>, ]; let extraClassDeclaration = [{ SmallVector getMixedOffsets() { @@ -1505,6 +1583,9 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, specified by a 2D `mem_desc`. Only 2D memory descriptors are supported; use the subview operation to obtain a 2D `mem_desc` from a higher-rank descriptor if needed. + This operation serves as an anchor through which users assign a layout attribute + to govern computation distribution. + Arguments: - `mem_desc`: the memory descriptor specifying the SLM region. - `offsets`: the coordinates within the matrix where the data will be written. @@ -1512,13 +1593,20 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform across all lanes. Only used on subgroup and lane level. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling - users to assign a layout that governs distribution at the subgroup and/or work-item level. - Only valid at workgroup and subgroup levels. + - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as + the value to be stored (they are identical). Only valid at workgroup and subgroup levels. 
+ + Example (Workgroup level): + ```mlir + %c0 = arith.constant 0 : index + xegpu.store_matrix %1, %0[%c0, %c0] <{ + anchor_layout = #xegpu.layout }> + : vector<128x128xf16>, !xegpu.mem_desc<128x128xf16>>, index, index + ``` }]; let builders = [ OpBuilder<(ins "Value" : $data, "TypedValue": $mem_desc, - "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $anchor_layout)>, + "llvm::ArrayRef": $offsets, "DistributeLayoutAttr": $layout)>, ]; let extraClassDeclaration = [{ SmallVector getMixedOffsets() { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index b3d2c40712c96..fb5d1e758dbd1 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -22,8 +22,6 @@ using std::optional; namespace mlir { namespace xegpu { -//#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc" - void XeGPUDialect::initialize() { addTypes< #define GET_TYPEDEF_LIST diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 3b5207dd92285..8fb63da8cb0a0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -645,10 +645,10 @@ void LayoutInfoPropagation::visitDpasOp( LayoutInfo dpasALayout; LayoutInfo dpasBLayout; - LayoutInfo dpasCLayout; + LayoutInfo dpasCDLayout; - xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr(); - if (hasParamsOfLayoutKind(anchorLayoutC)) { + xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getAnchorLayoutCdAttr(); + if (hasParamsOfLayoutKind(anchorLayoutCD)) { xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr(); xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr(); assert(hasParamsOfLayoutKind(anchorLayoutA) && @@ -657,7 +657,7 @@ void LayoutInfoPropagation::visitDpasOp( "Expected anchor layout for DPAS B operand."); dpasALayout = LayoutInfo(anchorLayoutA); dpasBLayout = LayoutInfo(anchorLayoutB); - dpasCLayout = LayoutInfo(anchorLayoutC); + dpasCDLayout = LayoutInfo(anchorLayoutCD); } else { @@ -714,14 +714,14 @@ void LayoutInfoPropagation::visitDpasOp( dpas.emitWarning( "No suitable instruction multiple found for the given shape."); SmallVector instDataC = {maxALen, maxCLen}; - dpasCLayout = + dpasCDLayout = LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC)); } else - dpasCLayout = getSIMTLayoutInfoForDPASOperand( + dpasCDLayout = getSIMTLayoutInfoForDPASOperand( cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB()); dpas.setAnchorLayoutCdAttr( - dyn_cast(dpasCLayout.get())); + dyn_cast(dpasCDLayout.get())); } dpas.setAnchorLayoutAAttr( dyn_cast(dpasALayout.get())); @@ -732,7 +732,7 @@ void LayoutInfoPropagation::visitDpasOp( propagateIfChanged(operands[0], operands[0]->meet(dpasALayout)); propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout)); if (operands.size() > 2) { - propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout)); + propagateIfChanged(operands[2], operands[2]->meet(dpasCDLayout)); } } From b186bc2c20b0f1703170491f35466fd48950dabb Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Wed, 26 Nov 2025 01:41:52 +0000 Subject: [PATCH 05/28] rename anchor_layout to layout --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 82 +++++++++---------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 6 +- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 34 ++++---- .../Transforms/XeGPUSubgroupDistribute.cpp | 4 +- 
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 8 +- .../Transforms/XeGPUWgToSgDistribute.cpp | 14 ++-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 ++-- mlir/test/Dialect/XeGPU/invalid.mlir | 6 +- .../XeGPU/propagate-layout-inst-data.mlir | 16 ++-- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 80 +++++++++--------- .../Dialect/XeGPU/subgroup-distribute.mlir | 12 +-- mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 4 +- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 10 +-- 13 files changed, 144 insertions(+), 148 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index f6b7dc0384e52..abcaa1da82e67 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -236,7 +236,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface return static_cast(MemorySpace::Global); } - xegpu::DistributeLayoutAttr getLayoutAttr() { + xegpu::DistributeLayoutAttr getDescLayoutAttr() { return dyn_cast_if_present(getType().getLayout()); } @@ -267,7 +267,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { - `l1_hint`, `l2_hint`, `l3_hint`: [optional] An cache-hint attribute indicating the desired behavior at the L1, L2, and L3 cache levels. - - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand. + - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand. Only valid at the workgroup and subgroup levels. Example (Workgroup level): @@ -277,7 +277,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { xegpu.prefetch_nd %tdesc[%c0, %c1] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout } + layout = #xegpu.layout } : !xegpu.tensor_desc<32x256xf16> ``` @@ -289,7 +289,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint, - OptionalAttr:$anchor_layout); + OptionalAttr:$layout); let extraClassDeclaration = extraBaseClassDeclaration # [{ xegpu::TensorDescType getTensorDescType() { @@ -304,7 +304,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { return getMixedValues(statics, dynamics, getContext()); } - xegpu::DistributeLayoutAttr getLayoutAttr() { + xegpu::DistributeLayoutAttr getDescLayoutAttr() { return dyn_cast_if_present(getTensorDescType().getLayout()); } @@ -376,7 +376,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the desired behavior at the L1, L2, and L3 cache levels. - - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the result of the load (they are identical). Only valid at workgroup and subgroup levels. + - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the result of the load (they are identical). Only valid at workgroup and subgroup levels. 
Example 1 (Workgroup level): ```mlir @@ -384,7 +384,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout} + layout = #xegpu.layout} : !xegpu.tensor_desc<32x256xf32> -> vector<32x256xf32> ``` Example 2 (lane level): @@ -405,7 +405,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint, - OptionalAttr:$anchor_layout); + OptionalAttr:$layout); let results = (outs XeGPU_ValueType: $value); @@ -426,7 +426,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ return getMixedValues(statics, dynamics, getContext()); } - xegpu::DistributeLayoutAttr getLayoutAttr() { + xegpu::DistributeLayoutAttr getDescLayoutAttr() { return dyn_cast_if_present(getTensorDescType().getLayout()); } @@ -490,7 +490,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the desired behavior at the L1, L2, and L3 cache levels. - - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as + - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the value to be stored (they are identical). Only valid at workgroup and subgroup levels. Example 1 (Workgroup level): @@ -498,7 +498,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout} + layout = #xegpu.layout} : vector<32x256xf16>, !xegpu.tensor_desc<32x256xf16> ``` Example 2 (lane level): @@ -519,7 +519,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint, - OptionalAttr:$anchor_layout); + OptionalAttr:$layout); let extraClassDeclaration = extraBaseClassDeclaration # [{ VectorType getValueType() { @@ -538,7 +538,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ return getMixedValues(statics, dynamics, getContext()); } - xegpu::DistributeLayoutAttr getLayoutAttr() { + xegpu::DistributeLayoutAttr getDescLayoutAttr() { return dyn_cast_if_present(getTensorDescType().getLayout()); } @@ -749,7 +749,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { - `offset_align_byte`: [optional] required if `source` is a pointer. If `source` is not a pointer, it is not allowed. Represents the alignment in bytes of each offset in offsets. - - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` or `offsets` + - `layout`: [optional] Describes the expected layout of the `tensor_desc` or `offsets` operand. Only valid at workgroup and subgroup levels. Example 1 (Workgroup level): @@ -757,7 +757,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout + layout = #xegpu.layout } : !xegpu.tensor_desc<256xf16> ``` @@ -803,7 +803,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, OptionalAttr:$offset_align_byte, - OptionalAttr:$anchor_layout); + OptionalAttr:$layout); let extraClassDeclaration = extraBaseClassDeclaration # [{ Type getSourceType() { @@ -876,7 +876,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. 
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the result + - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the result of load. Only valid at workgroup and subgroup levels. Results: @@ -888,7 +888,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}, - anchor_layout = #xegpu.layout> + layout = #xegpu.layout> : !xegpu.tensor_desc<256xf32, #xegpu.scatter_tdesc_attr>, vector<256xi1> -> vector<256xf32> ``` @@ -898,7 +898,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}, - anchor_layout = #xegpu.layout> + layout = #xegpu.layout> : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x8xf32> ``` @@ -915,7 +915,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { %val = xegpu.load %a[%offsets], %mask {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout} + layout = #xegpu.layout} : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> ``` @@ -937,7 +937,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$anchor_layout); + OptionalAttr:$layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -1039,7 +1039,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache. - - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the value + - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the value to be stored. Only valid at workgroup and subgroup levels. 
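[Editor's note, not part of the patch] At workgroup level the `layout` attribute is what the wg-to-sg distribution pass consumes. A sketch with illustrative parameters (the patch's own examples elide the parameter lists): with `sg_layout = [8]` and `sg_data = [32]`, a 256-element scatter store is split so that each of the 8 subgroups writes a contiguous 32-element slice.

```mlir
// Workgroup-level store; the layout parameters below are illustrative only.
xegpu.store %wg_val, %dst[%wg_offsets], %wg_mask <{
    layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>}>
  : vector<256xf32>, memref<1024xf32>, vector<256xindex>, vector<256xi1>
// After wg-to-sg distribution, each subgroup roughly ends up with:
//   xegpu.store %sg_val, %dst[%sg_offsets], %sg_mask
//     : vector<32xf32>, memref<1024xf32>, vector<32xindex>, vector<32xi1>
```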
@@ -1048,7 +1048,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout}> + layout = #xegpu.layout}> : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.scattered_tdesc_attr<>>, vector<256xi1> ``` @@ -1057,7 +1057,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout}> + layout = #xegpu.layout}> : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> ``` @@ -1074,7 +1074,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { xegpu.store %val, %a[%offsets], %mask {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, - anchor_layout = #xegpu.layout} + layout = #xegpu.layout} : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> ``` @@ -1097,7 +1097,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$anchor_layout); + OptionalAttr:$layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { @@ -1241,7 +1241,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] - `acc`: [optional] A vector value representing the accumulator matrix tile (C). When present, the result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero. - - `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this + - `layout_a`, `layout_b`, `layout_cd`: [optional] Attributes that identify this operation as anchor for operands A, B, and the accumulator/result, enabling users to assign layouts that govern distribution at the subgroup and/or lane level. Only valid at workgroup and subgroup level. @@ -1250,9 +1250,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] ```mlir %d = xegpu.dpas %a, %b, %c <{ - anchor_layout_a = #xegpu.layout, - anchor_layout_b = #xegpu.layout, - anchor_layout_cd = #xegpu.layout} + layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<64x128xf16>, vector<128x128xf16>, vector<64x128xf32> -> vector<64x128xf32> ``` @@ -1268,9 +1268,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] XeGPU_DpasOprType : $lhs, XeGPU_DpasOprType : $rhs, Optional: $acc, - OptionalAttr:$anchor_layout_a, - OptionalAttr:$anchor_layout_b, - OptionalAttr:$anchor_layout_cd + OptionalAttr:$layout_a, + OptionalAttr:$layout_b, + OptionalAttr:$layout_cd ); let results = (outs XeGPU_DpasResType: $result); @@ -1336,7 +1336,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, - `value`: The input values used by the atomic operation. It must have the same shape and element type as `tensorDesc` and `result`. - - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, + - `layout`: [optional] An attribute that identifies the operation as an anchor, enabling users to assign a layout that governs distribution at the subgroup and/or lane level. Only valid at workgroup and subgroup levels. 
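[Editor's note, not part of the patch] Unlike the other anchor ops, `atomic_rmw` carries no example in this patch. A minimal subgroup-level sketch follows, assuming the op prints its kind before the operands and accepts the layout in its attribute dictionary; the SSA names and layout parameters are illustrative, not taken from the patch:

```mlir
// Sketch only: printing and attribute placement are assumed;
// lane_layout/lane_data values are illustrative.
%old = xegpu.atomic_rmw addf %tdesc, %mask, %value
    {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
  : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>,
    vector<16xf32> -> vector<16xf32>
```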
}]; @@ -1346,7 +1346,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, XeGPU_TensorDesc:$tensorDesc, XeGPU_MaskType:$mask, XeGPU_ValueType:$value, - OptionalAttr:$anchor_layout); + OptionalAttr:$layout); let results = (outs XeGPU_ValueType:$result); @@ -1507,7 +1507,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, Variadic: $offsets, DenseI64ArrayAttr: $const_offsets, OptionalAttr:$subgroup_block_io, - OptionalAttr:$anchor_layout + OptionalAttr:$layout ); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res); let assemblyFormat = [{ @@ -1529,7 +1529,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform across all lanes. Only used on subgroup and lane level. - - `anchor_layout`: [optional] Describes the expected layout of the `mem_desc` operand as well as + - `layout`: [optional] Describes the expected layout of the `mem_desc` operand as well as the result of load (they are identical). Only valid at workgroup and subgroup levels. @@ -1540,7 +1540,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, ```mlir %c0 = arith.constant 0 : index %1 = xegpu.load_matrix %0[%c0, %c0] <{ - anchor_layout = #xegpu.layout }> + layout = #xegpu.layout }> : !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout> , index, index -> vector<128x128xf16> ``` @@ -1574,7 +1574,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, Variadic: $offsets, DenseI64ArrayAttr: $const_offsets, OptionalAttr:$subgroup_block_io, - OptionalAttr:$anchor_layout + OptionalAttr:$layout ); let assemblyFormat = [{ $data `,` $mem_desc `` custom($offsets, $const_offsets) prop-dict attr-dict `` `:` type(operands)}]; @@ -1593,14 +1593,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform across all lanes. Only used on subgroup and lane level. - - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as + - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the value to be stored (they are identical). Only valid at workgroup and subgroup levels. 
Example (Workgroup level): ```mlir %c0 = arith.constant 0 : index xegpu.store_matrix %1, %0[%c0, %c0] <{ - anchor_layout = #xegpu.layout }> + layout = #xegpu.layout }> : vector<128x128xf16>, !xegpu.mem_desc<128x128xf16>>, index, index ``` }]; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 29daab384bf7f..8cb666298c959 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -1157,8 +1157,7 @@ LogicalResult LoadMatrixOp::verify() { MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io, - getAnchorLayoutAttr(), - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1182,8 +1181,7 @@ LogicalResult StoreMatrixOp::verify() { UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, - getAnchorLayoutAttr(), - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8fb63da8cb0a0..f2b0e71c9397f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -496,7 +496,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( ArrayRef results) { LayoutInfo prefetchLayout; - xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr(); + xegpu::DistributeLayoutAttr anchorLayout = prefetch.getLayoutAttr(); if (hasParamsOfLayoutKind(anchorLayout)) { prefetchLayout = LayoutInfo(anchorLayout); } else { @@ -540,7 +540,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( prefetchLayout = getDefaultSIMTLayoutInfo( tdescTy, uArch, uArchInstruction->getPackedFormatBitSize()); - prefetch.setAnchorLayoutAttr( + prefetch.setLayoutAttr( dyn_cast(prefetchLayout.get())); } // Propagate the layout to the source tensor descriptor. 
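[Editor's note, not part of the patch] In IR terms, the anchors that these propagation hunks read back and write are ordinary optional attributes on the ops; the test updates later in this patch show them with the layout parameter lists elided. A sketch with illustrative parameter values, matching the renamed `layout_a`/`layout_b`/`layout_cd` attributes used below:

```mlir
// User-provided anchors (parameter values illustrative). When layout_cd is
// present, the pass reuses all three anchors instead of computing defaults,
// and expects layout_a and layout_b to be set as well.
%d = xegpu.dpas %a, %b, %c {
    layout_a  = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
    layout_b  = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
    layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
  : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
```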
@@ -647,10 +647,10 @@ void LayoutInfoPropagation::visitDpasOp( LayoutInfo dpasBLayout; LayoutInfo dpasCDLayout; - xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getAnchorLayoutCdAttr(); + xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getLayoutCdAttr(); if (hasParamsOfLayoutKind(anchorLayoutCD)) { - xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr(); - xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr(); + xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getLayoutAAttr(); + xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getLayoutBAttr(); assert(hasParamsOfLayoutKind(anchorLayoutA) && "Expected anchor layout for DPAS A operand."); assert(hasParamsOfLayoutKind(anchorLayoutB) && @@ -720,12 +720,12 @@ void LayoutInfoPropagation::visitDpasOp( dpasCDLayout = getSIMTLayoutInfoForDPASOperand( cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB()); - dpas.setAnchorLayoutCdAttr( + dpas.setLayoutCdAttr( dyn_cast(dpasCDLayout.get())); } - dpas.setAnchorLayoutAAttr( + dpas.setLayoutAAttr( dyn_cast(dpasALayout.get())); - dpas.setAnchorLayoutBAttr( + dpas.setLayoutBAttr( dyn_cast(dpasBLayout.get())); } @@ -742,7 +742,7 @@ void LayoutInfoPropagation::visitStoreNdOp( ArrayRef results) { LayoutInfo storeLayout; - xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr(); + xegpu::DistributeLayoutAttr anchorLayout = store.getLayoutAttr(); if (hasParamsOfLayoutKind(anchorLayout)) { storeLayout = LayoutInfo(anchorLayout); } else { @@ -782,7 +782,7 @@ void LayoutInfoPropagation::visitStoreNdOp( storeLayout = getDefaultSIMTLayoutInfo(store.getValueType(), uArch, uArchInstruction->getPackedFormatBitSize()); - store.setAnchorLayoutAttr( + store.setLayoutAttr( dyn_cast(storeLayout.get())); } // Propagate the layout to the value operand. @@ -798,7 +798,7 @@ void LayoutInfoPropagation::visitLoadNdOp( ArrayRef results) { LayoutInfo loadLayout; - xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); + xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr(); if (hasParamsOfLayoutKind(anchorLayout)) { loadLayout = LayoutInfo(anchorLayout); } else { @@ -816,8 +816,7 @@ void LayoutInfoPropagation::visitLoadNdOp( "LayoutInfoPropagation stage."); loadLayout = valueLayout.transpose(transpose.value()); } - load.setAnchorLayoutAttr( - dyn_cast(loadLayout.get())); + load.setLayoutAttr(dyn_cast(loadLayout.get())); } // Propagate the new layout to the tensor descriptor operand. propagateIfChanged(operands[0], operands[0]->meet(loadLayout)); @@ -913,7 +912,7 @@ void LayoutInfoPropagation::visitLoadGatherOp( LayoutInfo loadLayout; LayoutInfo maskLayout; - xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); + xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr(); if (hasParamsOfLayoutKind(anchorLayout)) { loadLayout = LayoutInfo(anchorLayout); maskLayout = loadLayout; @@ -947,8 +946,7 @@ void LayoutInfoPropagation::visitLoadGatherOp( // Mask operand should have 1D default layout. maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); - load.setAnchorLayoutAttr( - dyn_cast(loadLayout.get())); + load.setLayoutAttr(dyn_cast(loadLayout.get())); } // Propagate the new layout to the tensor descriptor operand. 
if (isa(load.getSourceType())) @@ -983,7 +981,7 @@ void LayoutInfoPropagation::visitStoreScatterOp( LayoutInfo payloadLayout; LayoutInfo maskLayout; - xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr(); + xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getLayoutAttr(); if (hasParamsOfLayoutKind(anchorLayout)) { payloadLayout = LayoutInfo(anchorLayout); maskLayout = payloadLayout; @@ -1027,7 +1025,7 @@ void LayoutInfoPropagation::visitStoreScatterOp( maskLayout = getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); - storeScatter.setAnchorLayoutAttr( + storeScatter.setLayoutAttr( dyn_cast(payloadLayout.get())); } // Propagate the payload operand layout diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index ac65babfcb4cb..4455811a2e681 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -965,7 +965,7 @@ struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector offsetsAsValues = vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); - auto layout = matrixOp.getAnchorLayoutAttr(); + auto layout = matrixOp.getLayoutAttr(); if (!layout) return rewriter.notifyMatchFailure( matrixOp, "the matrix operation lacks layout attribute"); @@ -1041,7 +1041,7 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector offsetsAsValues = vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); - auto layout = matrixOp.getAnchorLayoutAttr(); + auto layout = matrixOp.getLayoutAttr(); if (!layout) return rewriter.notifyMatchFailure( matrixOp, "the matrix operation lacks layout attribute"); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index c644f784606e9..330553564f81a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } - auto layout = op.getAnchorLayoutAttr(); + auto layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); @@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets SmallVector convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); - auto layout = op.getAnchorLayoutAttr(); + auto layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); @@ -954,7 +954,7 @@ struct UnrollLoadMatrixOp : public UnrollPattern { Type elemTy = valueTy.getElementType(); ArrayRef shape = valueTy.getShape(); - auto layout = dyn_cast(op.getAnchorLayoutAttr()); + auto layout = dyn_cast(op.getLayoutAttr()); VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); @@ -993,7 +993,7 @@ struct UnrollStoreMatrixOp : public UnrollPattern { VectorType valueTy = llvm::dyn_cast(op.getData().getType()); assert(valueTy && "the value type must be vector type!"); ArrayRef shape = valueTy.getShape(); - auto layout = dyn_cast(op.getAnchorLayoutAttr()); + auto layout = dyn_cast(op.getLayoutAttr()); SmallVector convertedValTypes = getUnrolledTypes(valueTy, *targetShape); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 2562c46adfa8d..73876ce3b1639 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ 
b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -86,13 +86,13 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, if (origOffsets.empty()) return failure(); - // if op is xegpu::CreateNdDescOp, call op.getLayoutAttr() + // if op is xegpu::CreateNdDescOp, call op.getDescLayoutAttr() xegpu::DistributeLayoutAttr layout; if constexpr (std::is_same_v || std::is_same_v) { - layout = op.getAnchorLayoutAttr(); - } else { layout = op.getLayoutAttr(); + } else { + layout = op.getDescLayoutAttr(); } // not applicable to ops without workgroup layout attributes @@ -1007,7 +1007,7 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern { assert(valueTy && "the value type must be vector type!"); Type elemTy = valueTy.getElementType(); - xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr(); + xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); SmallVector sgShape = getSgShapeAndCount(wgShape, layout).first; VectorType newResTy = VectorType::get(sgShape, elemTy); SmallVector newOps; @@ -1033,7 +1033,7 @@ struct WgToSgStoreMatrixOp : public OpConversionPattern { if (failed(genOffsetsList(rewriter, op, offsetsList))) return failure(); - xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr(); + xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); for (auto [v, offsets] : llvm::zip(adaptor.getData(), offsetsList)) xegpu::StoreMatrixOp::create(rewriter, op.getLoc(), v, op.getMemDesc(), offsets, layout.dropSgLayoutAndData()); @@ -1417,12 +1417,12 @@ void XeGPUWgToSgDistributePass::runOnOperation() { target.addDynamicallyLegalOp( [=](xegpu::LoadMatrixOp op) -> bool { - return isLegal(op.getAnchorLayoutAttr()); + return isLegal(op.getLayoutAttr()); }); target.addDynamicallyLegalOp( [=](xegpu::StoreMatrixOp op) -> bool { - return isLegal(op.getAnchorLayoutAttr()); + return isLegal(op.getLayoutAttr()); }); target.addDynamicallyLegalOp( diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 572e5442760bc..91432b1c11304 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -135,11 +135,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { // for LoadMatrixOp, the layout is attached to the property of the op if (auto loadOp = dyn_cast(defOp)) - return loadOp.getAnchorLayoutAttr(); + return loadOp.getLayoutAttr(); // for StoreMatrixOp, the layout is attached to the property of the op if (auto storeOp = dyn_cast(defOp)) - return storeOp.getAnchorLayoutAttr(); + return storeOp.getLayoutAttr(); std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType(layoutName); @@ -147,7 +147,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { // check for "permament" layout only after "temporary" layout name lookup // for backward compatibility if (auto loadGatherOp = dyn_cast(defOp)) - return loadGatherOp.getAnchorLayoutAttr(); + return loadGatherOp.getLayoutAttr(); } if (auto arg = dyn_cast(value)) { @@ -167,10 +167,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); if (auto loadOp = dyn_cast(op)) - return loadOp.getAnchorLayoutAttr(); + return loadOp.getLayoutAttr(); if (auto storeOp = dyn_cast(op)) - return storeOp.getAnchorLayoutAttr(); + return storeOp.getLayoutAttr(); std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) @@ -178,7 +178,7 @@ xegpu::getDistributeLayoutAttr(const 
OpOperand &opr) { // check for "permament" layout only after "temporary" layout name lookup if (auto storeScatterOp = dyn_cast(op)) - if (auto layout = storeScatterOp.getAnchorLayoutAttr()) + if (auto layout = storeScatterOp.getLayoutAttr()) return layout; return getDistributeLayoutAttr(opr.get()); @@ -193,7 +193,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, xegpu::DistributeLayoutAttr candidate = layout; if (auto loadOp = dyn_cast(owner)) { - if (auto perm = loadOp.getAnchorLayoutAttr()) + if (auto perm = loadOp.getLayoutAttr()) candidate = perm; } @@ -211,7 +211,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, if (auto storeOp = dyn_cast(owner)) { if (idx == 0) { - if (auto perm = storeOp.getAnchorLayoutAttr()) + if (auto perm = storeOp.getLayoutAttr()) candidate = perm; } } diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 62ac880030cda..92f353717ac59 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -894,7 +894,7 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve // ----- func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, %arg1: vector<2x16xf32>) { // expected-error@+1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}} - xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout} : + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout} : vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout> return } @@ -902,7 +902,7 @@ func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, // ----- func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, %arg1: vector<16x2xf32>) { // expected-error@+1 {{With subgroup_block_io, the distributed dimensions must be contiguous}} - xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout} : + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout} : vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout> return } @@ -910,7 +910,7 @@ func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf3 // ----- func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, %arg1: vector<16x2xf32>) { // expected-error@+1 {{With subgroup_block_io, the block shape must match the lane layout}} - xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout} : + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout} : vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout> return } diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index 62a33a4797d2b..1d86a2a4939e5 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -5,14 +5,14 @@ // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = 
#xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, anchor_layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.module @test { func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { @@ -46,7 +46,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> @@ -85,7 +85,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> @@ -113,9 +113,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout}> // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// CHECK: 
xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index d1bee47dd6d37..f8b59b87a122b 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -6,14 +6,14 @@ gpu.module @test { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, anchor_layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -32,7 +32,7 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { -// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} +// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index @@ 
-47,7 +47,7 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -109,7 +109,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -136,7 +136,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> @@ -184,9 +184,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout}> // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout}> : vector<16x8xf16>, memref<256xf16>, 
vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -205,7 +205,7 @@ gpu.module @test { // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -218,13 +218,13 @@ func.func @scatter_ops(%src: memref<256xf16>) { gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_custom_perm_layout( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> -// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME <{anchor_layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-SAME <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -238,13 +238,13 @@ func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_preserve_load_perm_layout( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> -// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] 
{layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME <{anchor_layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-SAME <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -257,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( -// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16> -// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout> -> vector<16x16xi16> // CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x16xi16> to vector<8x16xf16> @@ -282,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<16x8xi32> to vector<16x16xf16> @@ -303,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> @@ -340,9 +340,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: 
!xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { @@ -363,9 +363,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> -// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{anchor_layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -386,11 +386,11 @@ gpu.module @test { // CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> // CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {anchor_layout_a = #xegpu.layout, anchor_layout_b = #xegpu.layout, anchor_layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], 
[{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -398,7 +398,7 @@ gpu.module @test { // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> // CHECK-NEXT: } {layout_result_2 = #xegpu.layout} // CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{anchor_layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index @@ -426,11 +426,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -456,11 +456,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -540,7 +540,7 @@ gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout, l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @prefetch_2d(%arg0: memref<256x256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> @@ -553,7 +553,7 @@ gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> func.func @prefetch_1d(%arg0: memref<256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> @@ -600,7 +600,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> @@ -622,7 +622,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{anchor_layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index a7ce2c05b9d44..8fd3cca5594cb 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -281,8 +281,8 @@ gpu.module @xevm_module{ gpu.module @xevm_module{ gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index - %1 = xegpu.load_matrix %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> - xegpu.store_matrix %1, %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index + %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout}> : 
!xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> + xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.return } } @@ -307,8 +307,8 @@ gpu.module @xevm_module{ gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %1 = xegpu.load_matrix %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> - xegpu.store_matrix %1, %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index + %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.return } } @@ -323,9 +323,9 @@ gpu.module @xevm_module{ gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout} : + %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout} : !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, index, index -> vector<16x2xf32> - xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout} : + xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout} : vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout>, index, index gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 456d8e8a03cfc..d61908b422194 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -569,7 +569,7 @@ gpu.module @test_kernel { %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32> //CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32> //CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32> - %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32> + %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32> gpu.return %1: vector<32x32xf32> } } @@ -580,7 +580,7 @@ gpu.module @test_kernel { gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) { %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> // CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index - xegpu.store_matrix %value, %mdesc[0, 0] {anchor_layout = #xegpu.layout} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32> + xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32> gpu.return } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 171cadeeaeaf9..5dde84e8e0bc2 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -286,7 +286,7 @@ gpu.module @test_distribution { // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<2.550000e+01> : vector<8xf16> // CHECK: %[[CST:.*]] = arith.constant 
{layout_result_0 = #xegpu.layout} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{anchor_layout = #xegpu.layout, chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint, layout = #xegpu.layout}> // CHECK-SAME: {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout, // CHECK-SAME: layout_operand_3 = #xegpu.layout} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> @@ -333,9 +333,9 @@ gpu.module @test_distribution { //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]] - //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{anchor_layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> + //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> + %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> gpu.return } @@ -361,7 +361,7 @@ gpu.module @test_distribution { //CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<64x128xf32> %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - xegpu.store_matrix %cst, %mdesc[0, 0] {anchor_layout = #xegpu.layout} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32> + xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32> gpu.return } @@ -554,7 +554,7 @@ gpu.module @test_distribution { %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> - // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{anchor_layout = #xegpu.slice<#xegpu.layout, dims = [0]>, chunk_size = 1 : i64}> + // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> From 60f53969439d312413bea128b3fadcd7560a7285 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Wed, 26 Nov 2025 02:21:08 +0000 Subject: [PATCH 06/28] fix test --- .../XeGPU/propagate-layout-inst-data.mlir | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index 1d86a2a4939e5..d911baa49acbb 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -1,5 +1,29 @@ // RUN: mlir-opt 
-xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s + +// CHECK-LABEL: func.func @load_store_no_array_len( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<8x32xf32>) { +// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> +// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> +// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -> vector<8x32xf32> +// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout> +gpu.module @test { +// Although the uArch allows 8x32 inst data using block count (or array_len), +// it is up to optimization passes to decide on the block count usage. +func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf32>) { + %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32> + %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32> + xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32> + return +} +} + +// ----- + // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> From 72fa240c0faa8b01e9e5f8f0554f6e6712c423d2 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Wed, 26 Nov 2025 18:21:45 +0000 Subject: [PATCH 07/28] fix clang-format --- clang-tools-extra/clang-tidy/.clang-format | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/.clang-format b/clang-tools-extra/clang-tidy/.clang-format index e97ba0573dd1e..b32e785264e6a 100644 --- a/clang-tools-extra/clang-tidy/.clang-format +++ b/clang-tools-extra/clang-tidy/.clang-format @@ -1,4 +1,8 @@ BasedOnStyle: LLVM -QualifierAlignment: Left -LineEnding: LF InsertNewlineAtEOF: true +KeepEmptyLines: + AtEndOfFile: false + AtStartOfBlock: false + AtStartOfFile: false +LineEnding: LF +QualifierAlignment: Left \ No newline at end of file From 5f25c89f7bc359bd9c9fe848ce2bd0e521c49af7 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Wed, 26 Nov 2025 18:24:46 +0000 Subject: [PATCH 08/28] fix missing space in .clang-format --- clang-tools-extra/clang-tidy/.clang-format | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/.clang-format b/clang-tools-extra/clang-tidy/.clang-format index b32e785264e6a..fe94ed1fa4e81 100644 --- a/clang-tools-extra/clang-tidy/.clang-format +++ b/clang-tools-extra/clang-tidy/.clang-format @@ -5,4 +5,4 @@ KeepEmptyLines: AtStartOfBlock: false AtStartOfFile: false LineEnding: LF -QualifierAlignment: Left \ No newline at end of file +QualifierAlignment: Left From ae01e29d685852f29384ae72ac9a7744689c9cfc Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Wed, 26 Nov 2025 23:47:32 +0000 Subject: [PATCH 09/28] simplify: 
load/store/prefetch/loadmatrix/storematrix use anchor layout for blocking --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 89 +++++++++++++------ mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 74 +++++++-------- 2 files changed, 95 insertions(+), 68 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index ec5feb8bc8c4a..36fc653cbabff 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -175,33 +175,68 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { std::optional> XeGPUBlockingPass::getTileShape(Operation *op) const { + + auto getShapeSkipLeadingUnitDim = [](xegpu::DistributeLayoutAttr layout) + -> std::optional> { + SmallVector instData = layout.getEffectiveInstDataAsInt(); + if (!instData.empty()) { + // Remove leading unit dimensions from inst_data + // For example, if the inst_data is [1, 1, 32] + // it will pass [32] as the unroll/blocking size. + auto it = llvm::find_if(instData, [](auto val) { return val != 1; }); + instData.erase(instData.begin(), it); + return instData; + } + return std::nullopt; + }; + if (isa(op)) + xegpu::UpdateOffsetOp>(op)) return getTileShape(op->getOpResult(0)); - if (isa(op)) + + xegpu::DistributeLayoutAttr layout = nullptr; + + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); + + if (layout != nullptr) { + assert(layout.isForSubgroup() && + "Matrix load/store should have subgroup level layout"); + return layout.getEffectiveInstDataAsInt(); + } + + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); + if (layout != nullptr) { + assert( + layout.isForSubgroup() && + "LoadGather/StoreScatter/Prefetch should have subgroup level layout"); + return getShapeSkipLeadingUnitDim(layout); + } + + if (isa(op)) return getTileShape(op->getOpOperand(0)); if (isa(op)) return getTileShape(op->getOpOperand(1)); - // Handle LoadGatherOp and StoreScatterOp (with and without offset) - if (auto loadGatherOp = dyn_cast(op)) { - if (loadGatherOp.getOffsets()) - return getTileShape(loadGatherOp->getOpResult(0)); - else - return getTileShape(loadGatherOp->getOpOperand(0)); - } + if (isa(op)) { - if (auto storeScatterOp = dyn_cast(op)) - return getTileShape(storeScatterOp.getOffsets() - ? 
storeScatterOp->getOpOperand(0) - : storeScatterOp->getOpOperand(1)); + auto layoutA = dyn_cast(op).getLayoutAAttr(); + auto layoutB = dyn_cast(op).getLayoutBAttr(); + auto layoutCD = dyn_cast(op).getLayoutCdAttr(); - if (isa(op)) { std::optional> aTile = - getTileShape(op->getOpOperand(0)); + layoutA.getEffectiveInstDataAsInt(); std::optional> bTile = - getTileShape(op->getOpOperand(1)); + layoutB.getEffectiveInstDataAsInt(); + std::optional> cdTile = + layoutCD.getEffectiveInstDataAsInt(); if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2) return std::nullopt; @@ -210,14 +245,9 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { if ((*aTile)[1] != (*bTile)[0]) return std::nullopt; - // semantic check for C - if (op->getNumOperands() == 3) { - std::optional> cTile = - getTileShape(op->getOpOperand(2)); - int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]}; - if (!cTile || !llvm::equal(*cTile, expectedCTile)) - return std::nullopt; - } + int64_t expectedCDTile[2] = {(*aTile)[0], (*bTile)[1]}; + if (!cdTile || !llvm::equal(*cdTile, expectedCDTile)) + return std::nullopt; return SmallVector({(*aTile)[0], (*aTile)[1], (*bTile)[1]}); } @@ -383,9 +413,12 @@ void XeGPUBlockingPass::runOnOperation() { } } - newTy = - xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding, - tdescTy.getLayoutAttr().dropInstData()); + xegpu::LayoutAttr newLayout = nullptr; + if (tdescTy.getLayoutAttr()) + newLayout = tdescTy.getLayoutAttr().dropInstData(); + + newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding, + newLayout); } else { newTy = VectorType::get(tileShape, elemTy); } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index d61908b422194..c2ca63f6b289d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -26,8 +26,8 @@ gpu.module @test_kernel { %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> + //CHECK-COUNT-8: xegpu.dpas {{.*}} + %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -67,8 +67,8 @@ gpu.module @test_kernel { %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> + //CHECK-COUNT-8: xegpu.dpas {{.*}} + %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = 
#l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16> @@ -111,7 +111,7 @@ gpu.module @test_kernel { %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16> - %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32> + %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32> //CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1> //CHECK-COUNT-2: xegpu.update_nd_offset {{.*}} [%c32, %c0] : !xegpu.tensor_desc<16x16xf16> @@ -154,8 +154,8 @@ gpu.module @test_kernel { %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16> %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %c = xegpu.dpas %e, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> + //CHECK-COUNT-8: xegpu.dpas {{.*}} + %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -353,14 +353,15 @@ gpu.module @test_kernel { } // ----- + gpu.module @test_kernel { // CHECK-LABEL: test_prefetch_load_store_update // CHECK-SAME: [[arg0:%.+]]: ui64 // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK-COUNT-2: xegpu.prefetch {{.*}} // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> - // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> - // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> + // CHECK-COUNT-2: xegpu.load {{.*}} + // CHECK-COUNT-2: xegpu.store {{.*}} gpu.func @test_prefetch_load_store_update(%src: ui64) { @@ -372,7 +373,7 @@ gpu.module @test_kernel { ]> : vector<32xindex> %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + xegpu.prefetch %tdesc {layout = 
#xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> %delta = arith.constant dense<[ 32, 32, 32, 32, 32, 32, 32, 32, @@ -386,10 +387,10 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> + %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32> - xegpu.store %st_vec, %tdesc, %mask: + xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> @@ -400,15 +401,14 @@ gpu.module @test_kernel { } // ----- - gpu.module @test_kernel { // CHECK-LABEL: test_prefetch_load_store_update_chunk // CHECK-SAME: [[arg0:%.+]]: ui64 // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK-COUNT-4: xegpu.prefetch {{.*}} <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> - // CHECK-COUNT-4: xegpu.load {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> - // CHECK-COUNT-4: xegpu.store {{.*}} : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> + // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) { @@ -420,7 +420,7 @@ gpu.module @test_kernel { ]> : vector<32xindex> %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + xegpu.prefetch %tdesc {layout = #xegpu.layout}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> %delta = arith.constant dense<[ 32, 32, 32, 32, 32, 32, 32, 32, @@ -434,10 +434,10 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<32x4xf32> + %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<32x4xf32> %st_vec = arith.addf %ld_vec, %ld_vec : vector<32x4xf32> - xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: + xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: 
vector<32x4xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> @@ -476,7 +476,7 @@ gpu.module @test_kernel { ]> : vector<4x8xindex> %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> - xegpu.prefetch %tdesc: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> + xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> %delta = arith.constant dense<[ [32, 32, 32, 32, 32, 32, 32, 32], @@ -490,10 +490,10 @@ gpu.module @test_kernel { %c4 = arith.constant 4: index %mask = vector.create_mask %c4, %c4: vector<4x8xi1> - %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> -> vector<4x8x4xf32> + %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> -> vector<4x8x4xf32> %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #l} : vector<4x8x4xf32> - xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: + xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #l}>: vector<4x8x4xf32>, !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> @@ -518,8 +518,8 @@ gpu.module @test_kernel { //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf16> //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> - //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] + //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -532,7 +532,7 @@ gpu.module @test_kernel { %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16> - %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + %c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c> xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> gpu.return @@ 
-599,7 +599,7 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> gpu.return %ld : vector<32xf32> } @@ -621,10 +621,7 @@ gpu.module @test_kernel { %mask = vector.create_mask %c17: vector<32xi1> %st_vec = arith.constant dense<1023.0>: vector<32xf32> - xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_operand_3 = #xegpu.layout, - l1_hint = #xegpu.cache_hint} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> gpu.return } @@ -649,7 +646,7 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> gpu.return %ld : vector<32x4xf32> } } @@ -675,10 +672,7 @@ gpu.module @test_kernel { %mask = vector.create_mask %c17: vector<32xi1> %st_vec = arith.constant dense<1023.>: vector<32x4xf32> - xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout_operand_0 = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_operand_3 = #xegpu.layout, - l1_hint = #xegpu.cache_hint} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1> gpu.return } } @@ -704,7 +698,7 @@ gpu.module @test_kernel { ]]> : vector<1x1x32xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<1x1x32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> gpu.return %ld : vector<1x1x32xf32> } @@ -778,10 +772,10 @@ gpu.module @test_kernel { 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]] ]> : vector<1x1x32xindex> %mask = arith.constant {layout_result_0 = #inst_data} dense : vector<1x1x32xi1> - %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> - %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout = 
#inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> %addf = arith.addf %a, %b {layout_result_0 = #inst_data} : vector<1x1x32xf32> - xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout_operand_0 = #inst_data, layout_operand_2 = #inst_data, layout_operand_3 = #inst_data, l1_hint = #xegpu.cache_hint} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1> + xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1> gpu.return } } From 4795f24ef9533984af8d4c797647cfc9fcaf6d46 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Thu, 27 Nov 2025 00:20:25 +0000 Subject: [PATCH 10/28] simplify: load_nd/store_nd/prefetch_nd use anchor layout for blocking --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 11 +-- mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 78 +++++++++---------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 36fc653cbabff..ce53e8fe970b1 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -200,6 +200,12 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { layout = dyn_cast(op).getLayoutAttr(); if (isa(op)) layout = dyn_cast(op).getLayoutAttr(); + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); + if (isa(op)) + layout = dyn_cast(op).getLayoutAttr(); if (layout != nullptr) { assert(layout.isForSubgroup() && @@ -220,11 +226,6 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { return getShapeSkipLeadingUnitDim(layout); } - if (isa(op)) - return getTileShape(op->getOpOperand(0)); - if (isa(op)) - return getTileShape(op->getOpOperand(1)); - if (isa(op)) { auto layoutA = dyn_cast(op).getLayoutAAttr(); diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index c2ca63f6b289d..b3032bac351ac 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -15,7 +15,7 @@ gpu.module @test_kernel { %n = arith.muli %block_id_y, %c32 : index %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> @@ -23,9 +23,9 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> //CHECK-COUNT-8: xegpu.dpas {{.*}} %c = 
xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> @@ -36,7 +36,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> } //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> + xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return } } @@ -56,7 +56,7 @@ gpu.module @test_kernel { %n = arith.muli %block_id_y, %c32 : index %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2> @@ -64,9 +64,9 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16> + %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16> //CHECK-COUNT-8: xegpu.dpas {{.*}} %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16> @@ -77,7 +77,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32> } //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1> + xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1> gpu.return } } @@ -100,7 +100,7 @@ gpu.module @test_kernel { %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2> @@ -108,9 +108,9 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, 
#l2>, vector<8x32xf32>) { //CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> + %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16> + %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16> %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32> //CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1> @@ -120,7 +120,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32> } //CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %out#2, %c_tdesc: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1> + xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1> gpu.return } } @@ -141,7 +141,7 @@ gpu.module @test_kernel { %n = arith.muli %block_id_y, %c32 : index %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> @@ -149,9 +149,9 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16> %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16> //CHECK-COUNT-8: xegpu.dpas {{.*}} @@ -164,7 +164,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> } //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> + xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return } } @@ -188,14 +188,14 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) { //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %a = xegpu.load_nd 
%arg0 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> + %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> //CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16> %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16> //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> - xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l> + xegpu.store_nd %c, %arg2 {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l> //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l> @@ -227,14 +227,14 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) { //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> + %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> + %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> //CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16> %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16> //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16> - xegpu.store_nd %c, %arg2: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l> + xegpu.store_nd %c, %arg2 {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l> //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l> @@ -257,12 +257,12 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c64 : index %0 = xegpu.create_nd_tdesc %a[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32> + %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32> // CHECK: vector.multi_reduction , {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32> // CHECK-COUNT-3: vector.multi_reduction , {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32> %2 = vector.multi_reduction , %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32> %3 = xegpu.create_nd_tdesc %b[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> - xegpu.store_nd %2, %3: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r> + xegpu.store_nd %2, %3 {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r> gpu.return } } @@ -282,14 +282,14 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c32 : index %n = arith.muli %block_id_y, %c32 : index %0 = xegpu.create_nd_tdesc %a[%m, %n] : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32> + %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32> // CHECK: vector.multi_reduction , {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32> // CHECK-COUNT-1: 
vector.multi_reduction , {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32> %2 = vector.multi_reduction , %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32> %3 = xegpu.create_nd_tdesc %b[%n] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> - xegpu.store_nd %2, %3: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r> + xegpu.store_nd %2, %3 {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r> gpu.return } } @@ -304,11 +304,11 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c64 : index %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32> + %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32> // CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32> %2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32> %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> - xegpu.store_nd %2, %3: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l> + xegpu.store_nd %2, %3 {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l> gpu.return } } @@ -323,7 +323,7 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c32 : index %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32> + %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32> %11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32> // CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32> %2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32> @@ -343,11 +343,11 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c32 : index %0 = xegpu.create_nd_tdesc %a[%m, 0] : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32> + %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32> // CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32> %2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32> %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t> - xegpu.store_nd %2, %3: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t> + xegpu.store_nd %2, %3 {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t> gpu.return } } @@ -513,8 +513,8 @@ gpu.module @test_kernel { //CHECK: [[c0:%.+]] = arith.constant 0 : index //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: 
[[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf16> //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> @@ -529,12 +529,12 @@ gpu.module @test_kernel { %c0 = arith.constant 0 : index %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> - %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> - %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16> %c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c> - xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> + xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> gpu.return } } @@ -746,11 +746,11 @@ gpu.module @test_kernel { %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l> %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l> - %a = xegpu.load_nd %a_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> - %b = xegpu.load_nd %b_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> + %a = xegpu.load_nd %a_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> + %b = xegpu.load_nd %b_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> %result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32> - xegpu.store_nd %result, %c_tdesc[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l> + xegpu.store_nd %result, %c_tdesc[%c0, %c0] {layout = #l}: vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l> gpu.return } } From 227c15d19e7f311d3c7ff58b8f63624c0954671b Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Thu, 27 Nov 2025 00:42:55 +0000 Subject: [PATCH 11/28] simplify: Avoid calling getTileShape(Result&Operand) from getTileShape(op) --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index ce53e8fe970b1..6edbca6734905 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -166,8 +166,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { return instData; } - if (auto type = dyn_cast(value.getType())) - return 
llvm::to_vector(type.getShape()); + // if (auto type = dyn_cast(value.getType())) + // return llvm::to_vector(type.getShape()); } LDBG() << "failed to getTileShape for: " << value; return std::nullopt; @@ -190,12 +190,10 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { return std::nullopt; }; - if (isa(op)) - return getTileShape(op->getOpResult(0)); - xegpu::DistributeLayoutAttr layout = nullptr; + if (isa(op)) + layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); if (isa(op)) layout = dyn_cast(op).getLayoutAttr(); if (isa(op)) @@ -212,7 +210,8 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { "Matrix load/store should have subgroup level layout"); return layout.getEffectiveInstDataAsInt(); } - + if (isa(op)) + layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); if (isa(op)) layout = dyn_cast(op).getLayoutAttr(); if (isa(op)) @@ -254,14 +253,16 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { } if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) - return getTileShape(op->getOpResult(0)); - + layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); if (isa(op)) - return getTileShape(op->getOpOperand(0)); - + layout = xegpu::getDistributeLayoutAttr(op->getOpOperand(0)); if (isa(op)) - return getTileShape(op->getOpResult(0)); - + layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); + if (layout != nullptr) { + assert(layout.isForSubgroup() && + "Other ops (Vector/Math/Arith) should have subgroup level layout"); + return getShapeSkipLeadingUnitDim(layout); + } return std::nullopt; } From 0e44d076aa47727144c86b0ece160505893bb970 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 29 Nov 2025 23:43:20 +0000 Subject: [PATCH 12/28] simplify getDistributeLayoutAttr --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 6 +- .../Transforms/XeGPUSubgroupDistribute.cpp | 11 +++- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 61 ++++++++++++++----- .../Dialect/XeGPU/subgroup-distribute.mlir | 4 +- .../Dialect/XeGPU/xegpu-unroll-patterns.mlir | 4 +- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 8 +-- 6 files changed, 66 insertions(+), 28 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 6edbca6734905..2f1524a3659c2 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -268,19 +268,19 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { bool XeGPUBlockingPass::needsUnroll(Operation *op) const { // skip the op if any of its operands or results has workgroup level layouts - bool hasWgLayoutOperands = + /* bool hasWgLayoutOperands = llvm::any_of(op->getOpOperands(), [](OpOperand &opr) { xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(opr); return layout && layout.isForWorkgroup(); - }); + }); */ bool hasWgLayoutResults = llvm::any_of(op->getOpResults(), [](OpResult result) { xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(result); return layout && layout.isForWorkgroup(); }); - if (hasWgLayoutOperands || hasWgLayoutResults) { + if (hasWgLayoutResults) { LDBG() << "skip unrolling for op with workgroup level layout: " << *op; return false; } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index b64eb5b29ccb0..991c29c8de4ed 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ 
b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1647,8 +1647,11 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // Layouts are needed for vector type only. if (!isa(operand.get().getType())) continue; - if (isa(op)) - continue; + // if (isa(op)) + // xegpu::DpasOp, + // xegpu::LoadGatherOp, xegpu::StoreScatterOp, + // xegpu::LoadNdOp, xegpu::StoreNdOp>(op)) + // continue; auto layout = xegpu::getDistributeLayoutAttr(operand.get()); if (!layout) { @@ -1660,6 +1663,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() { xegpu::setDistributeLayoutAttr(operand, layout); } }); + + // dump out the op here + // getOperation()->dump(); + // Step 2: Move all operations of a GPU function inside // gpu.warp_execute_on_lane_0 operation. { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 91432b1c11304..dff1474e586d8 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -126,8 +126,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { assert(defOp && "result must have a defining op"); // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr - if (auto convertOp = dyn_cast(defOp)) - return convertOp.getTargetLayoutAttr(); + // if (auto convertOp = dyn_cast(defOp)) + // return convertOp.getTargetLayoutAttr(); // for LoadNdOp, the layout is stored in the tensor descriptor if (auto loadNd = dyn_cast(defOp)) @@ -137,17 +137,17 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { if (auto loadOp = dyn_cast(defOp)) return loadOp.getLayoutAttr(); - // for StoreMatrixOp, the layout is attached to the property of the op - if (auto storeOp = dyn_cast(defOp)) - return storeOp.getLayoutAttr(); - std::string layoutName = getLayoutName(result); - if (defOp->hasAttr(layoutName)) - return defOp->getAttrOfType(layoutName); + // // for StoreMatrixOp, the layout is attached to the property of the op + // if (auto storeOp = dyn_cast(defOp)) + // return storeOp.getLayoutAttr(); // check for "permament" layout only after "temporary" layout name lookup // for backward compatibility if (auto loadGatherOp = dyn_cast(defOp)) return loadGatherOp.getLayoutAttr(); + std::string layoutName = getLayoutName(result); + if (defOp->hasAttr(layoutName)) + return defOp->getAttrOfType(layoutName); } if (auto arg = dyn_cast(value)) { @@ -162,26 +162,49 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return nullptr; } +// xegpu::DistributeLayoutAttr +// xegpu::getDistributeLayoutAttr(const OpOperand &opr) { +// Operation *op = opr.getOwner(); + +// if (auto loadOp = dyn_cast(op)) +// return loadOp.getLayoutAttr(); + +// if (auto storeOp = dyn_cast(op)) +// return storeOp.getLayoutAttr(); + +// std::string layoutName = xegpu::getLayoutName(opr); +// if (op->hasAttr(layoutName)) +// return op->getAttrOfType(layoutName); + +// // check for "permament" layout only after "temporary" layout name lookup +// if (auto storeScatterOp = dyn_cast(op)) +// if (auto layout = storeScatterOp.getLayoutAttr()) +// return layout; + +// return getDistributeLayoutAttr(opr.get()); +// } + xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); - if (auto loadOp = dyn_cast(op)) - return loadOp.getLayoutAttr(); + // if (auto loadOp = dyn_cast(op)) + // return loadOp.getLayoutAttr(); if (auto storeOp = dyn_cast(op)) return storeOp.getLayoutAttr(); - 
std::string layoutName = xegpu::getLayoutName(opr); - if (op->hasAttr(layoutName)) - return op->getAttrOfType(layoutName); - // check for "permament" layout only after "temporary" layout name lookup if (auto storeScatterOp = dyn_cast(op)) if (auto layout = storeScatterOp.getLayoutAttr()) return layout; - return getDistributeLayoutAttr(opr.get()); + std::string layoutName = xegpu::getLayoutName(opr); + if (op->hasAttr(layoutName)) + return op->getAttrOfType(layoutName); + + return nullptr; + // return getDistributeLayoutAttr(opr.get()); } // Returns the permanent layout attribute for the given result if it's @@ -286,6 +309,14 @@ void xegpu::removeLayoutAttrs(Operation *op) { removeLayoutAttr(opr); for (OpResult result : nestOp->getOpResults()) removeLayoutAttr(result); + if (op->hasAttrOfType("layout")) + op->removeAttr("layout"); + if (op->hasAttrOfType("layout_a")) + op->removeAttr("layout_a"); + if (op->hasAttrOfType("layout_b")) + op->removeAttr("layout_b"); + if (op->hasAttrOfType("layout_cd")) + op->removeAttr("layout_cd"); }); } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 8fd3cca5594cb..076c1d6ca51a6 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -137,7 +137,7 @@ gpu.module @xevm_module{ %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> %loaded = scf.if %pred -> (vector<16x8xf16>) { %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout + layout = #xegpu.layout } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> scf.yield %3 : vector<16x8xf16> } else { @@ -169,7 +169,7 @@ gpu.module @xevm_module{ %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> scf.if %pred { %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout + layout = #xegpu.layout } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> } diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir index dbc52b8a98894..c3be138fef38a 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir @@ -226,7 +226,7 @@ gpu.module @test { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> gpu.return %ld : vector<32xf32> } @@ -381,7 +381,7 @@ gpu.module @test { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> gpu.return %ld : vector<32x4xf32> } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir 
b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 574b365443a0a..582dbe783e5e3 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -275,7 +275,7 @@ gpu.module @test_distribution { // CHECK-SAME: : memref, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256x16xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256x16xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} + %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : memref, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16> gpu.return } @@ -310,7 +310,7 @@ gpu.module @test_distribution { // CHECK-SAME: : memref, vector<8xindex>, vector<8xi1> -> vector<8x4xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} + %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : memref, vector<256xindex>, vector<256xi1> -> vector<256x4xf16> gpu.return } @@ -398,7 +398,7 @@ gpu.module @test_distribution { %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} dense<0.0> : vector<4x2x6xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<4x2x6x32xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<4x2x6x32xi1> - %load = xegpu.load %src[%offset], %mask {layout_result_0 = #xegpu.layout} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16> + %load = xegpu.load %src[%offset], %mask {layout = #xegpu.layout} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16> // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16> %reduce = vector.multi_reduction , %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} [3] : vector<4x2x6x32xf16> to vector<4x2x6xf16> @@ -592,7 +592,7 @@ gpu.module @test_distribution { // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> - %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> + %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x32xf32> %4 = vector.broadcast %3 {layout_result_0 = From c3bc0850d6002208a5ce8d1cc410ca8fc85cd366 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sun, 30 Nov 2025 05:25:52 +0000 Subject: [PATCH 13/28] add interface getAnchorLayout --- .../mlir/Dialect/XeGPU/IR/CMakeLists.txt | 5 ++ mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 1 + .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 20 +++++ 
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 73 ++++++++++++++++--- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 + mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 45 +++++++----- 6 files changed, 115 insertions(+), 31 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt index efca3cfa0dab7..b10219f71b531 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt @@ -15,3 +15,8 @@ set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td) mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls) mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs) add_mlir_dialect_tablegen_target(MLIRXeGPUAttrInterfaceIncGen) + +set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td) +mlir_tablegen(XeGPUOpInterface.h.inc -gen-op-interface-decls) +mlir_tablegen(XeGPUOpInterface.cpp.inc -gen-op-interface-defs) +add_mlir_dialect_tablegen_target(MLIRXeGPUOpInterfaceIncGen) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 0c059967bb898..7badfaf4a8216 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -34,6 +34,7 @@ class SliceAttr; #include #include #include +#include // clang-format on #define GET_ATTRDEF_CLASSES diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 93c5187b00756..d5b2db75bcad9 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -740,4 +740,24 @@ def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> { } +def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> { + let cppNamespace = "::mlir::xegpu"; + + let description = [{ + An attribute interface for accessing anchor layout information. + This interface provides a method to retrieve the anchor layout + from attributes that implement it. + }]; + + let methods = [ + InterfaceMethod< + /*desc=*/"Get the anchor layout attribute.", + /*retTy=*/"xegpu::DistributeLayoutAttr", + /*methodName=*/"getAnchorLayout", + /*args=*/(ins) + >, + ]; + +} + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index d93ffb70881bd..7a9202a45ae40 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -17,6 +17,7 @@ include "mlir/Interfaces/ShapedOpInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" + // Base class for dialect operations. This operation inherits from the base // `Op` class in OpBase.td, and provides: // * The parent dialect of the operation. 
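// --- Editorial sketch (not part of the patch): with the AnchorLayoutInterface
// declared in the XeGPUAttrs.td hunk above, a transform can query the anchor
// layout of any implementing op without casting to each concrete op kind,
// which is what the getDistributeLayoutAttr() refactoring later in this series
// does. The helper name below is illustrative only, the result may be null when
// no anchor layout was set, and mlir/Dialect/XeGPU/IR/XeGPU.h is assumed to be
// included.
static mlir::xegpu::DistributeLayoutAttr queryAnchorLayout(mlir::Operation *op) {
  if (auto anchorOp = llvm::dyn_cast<mlir::xegpu::AnchorLayoutInterface>(op))
    return anchorOp.getAnchorLayout(); // null if the op carries no anchor layout
  return nullptr;
}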
@@ -247,7 +248,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface }]; } -def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { +def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { let summary = "prefetches a n-D block to cache"; let description = [{ It issues an instruction to prefetch a block of data from continuous @@ -296,6 +297,10 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { return getTensorDesc().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -337,7 +342,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]> + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>, AnchorLayoutInterface ]> { let summary = "loads a n-D block from memory (represented by TensorDesc)" "to registers (represented by vector)"; @@ -418,6 +423,10 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ return getTensorDesc().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -460,7 +469,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ } def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]> + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>, AnchorLayoutInterface ]> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; @@ -530,6 +539,10 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ return getTensorDesc().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -721,7 +734,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { let hasVerifier = 1; } -def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { +def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { let summary = "prefetches a set of scattered data points to cache"; let description = [{ @@ -810,6 +823,10 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { return getSource().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getSource()); @@ -840,7 +857,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { let hasVerifier = 1; } -def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { +def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayoutInterface]> { let summary = "load a set of scattered data points from memory."; let description = [{ It (aka. load) load data per each lane. 
The output @@ -946,6 +963,10 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { return getSource().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getSource()); @@ -1002,7 +1023,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { let hasVerifier = 1; } -def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { +def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorLayoutInterface]> { let summary = "store data to scattered memory locations."; let description = [{ It (aka. store) stores data to scattered memory locations. The value is @@ -1104,6 +1125,10 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { return getDest().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getDest()); @@ -1214,7 +1239,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", let hasVerifier = 1; } -def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> { +def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, AnchorLayoutInterface]> { let summary = "It performs mma computation"; let description = [{DPAS performs matrix multiplication on matrix A of `mxk` @@ -1275,6 +1300,11 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] let results = (outs XeGPU_DpasResType: $result); let extraClassDeclaration = [{ + + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayoutCd().value_or(nullptr); + } + VectorType getLhsType() { return getLhs().getType(); } @@ -1308,7 +1338,8 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, MemoryEffects<[MemRead, MemWrite]>, AllElementTypesMatch<["tensorDesc", "value", "result"]>, - AllShapesMatch<["tensorDesc", "value", "result"]>]> { + AllShapesMatch<["tensorDesc", "value", "result"]>, + AnchorLayoutInterface]> { let summary = "Atomic read-modify-write operation on the TensorDesc. 
"; let description = [{ @@ -1348,6 +1379,12 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, XeGPU_ValueType:$value, OptionalAttr:$layout); + let extraClassDeclaration = [{ + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + }]; + let results = (outs XeGPU_ValueType:$result); let assemblyFormat = [{ @@ -1421,7 +1458,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { let extraClassDeclaration = extraBaseClassDeclaration; } -def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> { +def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>, AnchorLayoutInterface]> { let summary = "Convert the layout of the input operand"; let description = [{ `convert_layout` redistribute data across subgroups and/or lanes from the `input_layout` to @@ -1455,6 +1492,11 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou let assemblyFormat = [{ $source prop-dict attr-dict `:` type($source) }]; + let extraClassDeclaration = [{ + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getTargetLayout(); + } + }]; let hasFolder = 1; let hasVerifier = 1; @@ -1496,7 +1538,7 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure, } def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, - AllElementTypesMatch<["mem_desc", "res"]>]> { + AllElementTypesMatch<["mem_desc", "res"]>, AnchorLayoutInterface]> { let arguments = (ins XeGPU_MemDesc:$mem_desc, Variadic: $offsets, DenseI64ArrayAttr: $const_offsets, @@ -1555,13 +1597,18 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, return vecTy.getShape(); return {}; } + + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + }]; let hasVerifier = 1; } def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, - AllElementTypesMatch<["mem_desc", "data"]>]> { + AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> { let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data, XeGPU_MemDesc:$mem_desc, @@ -1614,6 +1661,10 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, return {}; } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + }]; let hasVerifier = 1; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index fb5d1e758dbd1..bd3bcf4698c5a 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -36,6 +36,8 @@ void XeGPUDialect::initialize() { #include >(); } +#define GET_OP_INTERFACE_CLASSES +#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc" // A `srcShape` consists of N distribution units, each being `subShapesLayout` x // `subShape`. 
A `delinearizedId` is used to identify a particular `subShape` diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index dff1474e586d8..3e77c5b2ca43f 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -125,26 +125,31 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); - // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr - // if (auto convertOp = dyn_cast(defOp)) - // return convertOp.getTargetLayoutAttr(); - - // for LoadNdOp, the layout is stored in the tensor descriptor - if (auto loadNd = dyn_cast(defOp)) - return getDistributeLayoutAttr(loadNd.getTensorDesc()); - - // for LoadMatrixOp, the layout is attached to the property of the op - if (auto loadOp = dyn_cast(defOp)) - return loadOp.getLayoutAttr(); - - // // for StoreMatrixOp, the layout is attached to the property of the op - // if (auto storeOp = dyn_cast(defOp)) - // return storeOp.getLayoutAttr(); - - // check for "permament" layout only after "temporary" layout name lookup - // for backward compatibility - if (auto loadGatherOp = dyn_cast(defOp)) - return loadGatherOp.getLayoutAttr(); + // // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr + // // if (auto convertOp = dyn_cast(defOp)) + // // return convertOp.getTargetLayoutAttr(); + + // // for LoadNdOp, the layout is stored in the tensor descriptor + // if (auto loadNd = dyn_cast(defOp)) + // return getDistributeLayoutAttr(loadNd.getTensorDesc()); + + // // for LoadMatrixOp, the layout is attached to the property of the op + // if (auto loadOp = dyn_cast(defOp)) + // return loadOp.getLayoutAttr(); + + // // // for StoreMatrixOp, the layout is attached to the property of the op + // // if (auto storeOp = dyn_cast(defOp)) + // // return storeOp.getLayoutAttr(); + + // // check for "permament" layout only after "temporary" layout name lookup + // // for backward compatibility + // if (auto loadGatherOp = dyn_cast(defOp)) + // return loadGatherOp.getLayoutAttr(); + + if (auto anchorOp = dyn_cast(defOp)) { + return anchorOp.getAnchorLayout(); + } + std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType(layoutName); From 5d3e20e25ff8bbbcceafd4585cc890b68119b81f Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Wed, 10 Dec 2025 19:14:06 +0000 Subject: [PATCH 14/28] remove permantLayout parameter in setDistribueLayout function --- mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +----- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 2 +- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 12 ++++++------ 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 58092c3bb9ed2..f23174cb0f697 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -104,15 +104,11 @@ void removeLayoutAttrs(Operation *op); /// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes -/// If `respectPermLayout` is true the existing permament layout -/// attribute will be kept and assigned to the attribute dict instead -/// of the provided layout. 
template || std::is_same_v>> void setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout, - bool respectPermLayout = false); + const DistributeLayoutAttr layout); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. If the operation contains regions, it is also applied recursively diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index dc9eb96c169b4..89b6695bd921e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -1173,7 +1173,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. - xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout*/ true); + xegpu::setDistributeLayoutAttr(result, layout); } return success(); } diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 94d95958c758b..e56d0a9cf0613 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -248,8 +248,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, template void xegpu::setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout, - bool respectPermLayout) { + const DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); @@ -257,8 +256,9 @@ void xegpu::setDistributeLayoutAttr(const T &operandOrResult, return; DistributeLayoutAttr candidate = layout; - if (respectPermLayout) - candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name); + // if (respectPermLayout) + // candidate = maybePickPermanentLayout(layout, operandOrResult, owner, + // name); if (candidate) owner->setAttr(name, candidate); @@ -267,12 +267,12 @@ void xegpu::setDistributeLayoutAttr(const T &operandOrResult, // Explicit instantiation for OpResult template void xegpu::setDistributeLayoutAttr( const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); + const mlir::xegpu::DistributeLayoutAttr layout); // Explicit instantiation for OpOperand template void xegpu::setDistributeLayoutAttr( const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); + const mlir::xegpu::DistributeLayoutAttr layout); void xegpu::setDistributeLayoutAttrs( Operation *op, function_ref getLayoutImpl) { From 1025b571c57d81a4be6d3aeb8844c8d1eca1dd72 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Wed, 10 Dec 2025 22:48:11 +0000 Subject: [PATCH 15/28] add setAnchorLayout interface --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 8 +- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 46 +++++++ .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 4 +- .../XeGPU/Transforms/XeGPUBlocking.cpp | 125 +++++++----------- .../Transforms/XeGPUSubgroupDistribute.cpp | 16 +-- .../Transforms/XeGPUWgToSgDistribute.cpp | 2 +- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 27 ++-- .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 4 +- 8 files changed, 121 insertions(+), 111 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index c2cca3e19fd6a..a5064d6145341 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ 
b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -775,7 +775,7 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> { let description = [{ An attribute interface for accessing anchor layout information. - This interface provides a method to retrieve the anchor layout + This interface provides a method to set and retrieve the anchor layout from attributes that implement it. }]; @@ -786,6 +786,12 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> { /*methodName=*/"getAnchorLayout", /*args=*/(ins) >, + InterfaceMethod< + /*desc=*/"Set the anchor layout attribute.", + /*retTy=*/"void", + /*methodName=*/"setAnchorLayout", + /*args=*/(ins "xegpu::DistributeLayoutAttr":$layout) + >, ]; } diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 4583c6400058f..d0206ebea9359 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -301,6 +301,10 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -428,6 +432,10 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -545,6 +553,10 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -830,6 +842,10 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getSource()); @@ -970,6 +986,10 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getSource()); @@ -1132,6 +1152,10 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getDest()); @@ -1308,6 +1332,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, return getLayoutCd().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout_cd = anchorLayout; + } + VectorType getLhsType() { return getLhs().getType(); } @@ -1386,6 +1414,11 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, xegpu::DistributeLayoutAttr getAnchorLayout() { 
return getLayout().value_or(nullptr); } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + }]; let results = (outs XeGPU_ValueType:$result); @@ -1499,6 +1532,11 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou xegpu::DistributeLayoutAttr getAnchorLayout() { return getTargetLayout(); } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().target_layout = anchorLayout; + } + }]; let hasFolder = 1; @@ -1605,6 +1643,10 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + }]; let hasVerifier = 1; @@ -1668,6 +1710,10 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, return getLayout().value_or(nullptr); } + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + getProperties().layout = anchorLayout; + } + }]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index f23174cb0f697..5b05a2192f5a6 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -64,10 +64,10 @@ FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); /// Return the attribute name for the OpOperand to attach DistributeLayoutAttr -std::string getLayoutName(const OpOperand &operand); +std::string getTempLayoutName(const OpOperand &operand); /// Return the attribute name for the OpResult to attach DistributeLayoutAttr -std::string getLayoutName(const OpResult result); +std::string getTempLayoutName(const OpResult result); /// Retrieves the DistributeLayoutAttr associated with a given Value. For /// TensorDescType values, the DistributeLayoutAttr is extracted from the diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 2f1524a3659c2..b584479db7a7c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -166,8 +166,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { return instData; } - // if (auto type = dyn_cast(value.getType())) - // return llvm::to_vector(type.getShape()); + if (auto type = dyn_cast(value.getType())) + return llvm::to_vector(type.getShape()); } LDBG() << "failed to getTileShape for: " << value; return std::nullopt; @@ -175,68 +175,33 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { std::optional> XeGPUBlockingPass::getTileShape(Operation *op) const { - - auto getShapeSkipLeadingUnitDim = [](xegpu::DistributeLayoutAttr layout) - -> std::optional> { - SmallVector instData = layout.getEffectiveInstDataAsInt(); - if (!instData.empty()) { - // Remove leading unit dimensions from inst_data - // For example, if the inst_data is [1, 1, 32] - // it will pass [32] as the unroll/blocking size. 
- auto it = llvm::find_if(instData, [](auto val) { return val != 1; }); - instData.erase(instData.begin(), it); - return instData; - } - return std::nullopt; - }; - - xegpu::DistributeLayoutAttr layout = nullptr; - - if (isa(op)) - layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); - if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); - if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); - if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); + if (isa(op)) + return getTileShape(op->getOpResult(0)); + if (isa(op)) + return getTileShape(op->getOpOperand(0)); if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); - if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); - - if (layout != nullptr) { - assert(layout.isForSubgroup() && - "Matrix load/store should have subgroup level layout"); - return layout.getEffectiveInstDataAsInt(); - } - if (isa(op)) - layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); - if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); - if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); - if (isa(op)) - layout = dyn_cast(op).getLayoutAttr(); - if (layout != nullptr) { - assert( - layout.isForSubgroup() && - "LoadGather/StoreScatter/Prefetch should have subgroup level layout"); - return getShapeSkipLeadingUnitDim(layout); + return getTileShape(op->getOpOperand(1)); + + // Handle LoadGatherOp and StoreScatterOp (with and without offset) + if (auto loadGatherOp = dyn_cast(op)) { + if (loadGatherOp.getOffsets()) + return getTileShape(loadGatherOp->getOpResult(0)); + else + return getTileShape(loadGatherOp->getOpOperand(0)); } - if (isa(op)) { - - auto layoutA = dyn_cast(op).getLayoutAAttr(); - auto layoutB = dyn_cast(op).getLayoutBAttr(); - auto layoutCD = dyn_cast(op).getLayoutCdAttr(); + if (auto storeScatterOp = dyn_cast(op)) + return getTileShape(storeScatterOp.getOffsets() + ? 
storeScatterOp->getOpOperand(0) + : storeScatterOp->getOpOperand(1)); + if (isa(op)) { std::optional> aTile = - layoutA.getEffectiveInstDataAsInt(); + getTileShape(op->getOpOperand(0)); std::optional> bTile = - layoutB.getEffectiveInstDataAsInt(); - std::optional> cdTile = - layoutCD.getEffectiveInstDataAsInt(); + getTileShape(op->getOpOperand(1)); if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2) return std::nullopt; @@ -245,42 +210,45 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { if ((*aTile)[1] != (*bTile)[0]) return std::nullopt; - int64_t expectedCDTile[2] = {(*aTile)[0], (*bTile)[1]}; - if (!cdTile || !llvm::equal(*cdTile, expectedCDTile)) - return std::nullopt; + // semantic check for C + if (op->getNumOperands() == 3) { + std::optional> cTile = + getTileShape(op->getOpOperand(2)); + int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]}; + if (!cTile || !llvm::equal(*cTile, expectedCTile)) + return std::nullopt; + } return SmallVector({(*aTile)[0], (*aTile)[1], (*bTile)[1]}); } if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) - layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); + return getTileShape(op->getOpResult(0)); + if (isa(op)) - layout = xegpu::getDistributeLayoutAttr(op->getOpOperand(0)); + return getTileShape(op->getOpOperand(0)); + if (isa(op)) - layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0)); - if (layout != nullptr) { - assert(layout.isForSubgroup() && - "Other ops (Vector/Math/Arith) should have subgroup level layout"); - return getShapeSkipLeadingUnitDim(layout); - } + return getTileShape(op->getOpResult(0)); + return std::nullopt; } bool XeGPUBlockingPass::needsUnroll(Operation *op) const { // skip the op if any of its operands or results has workgroup level layouts - /* bool hasWgLayoutOperands = + bool hasWgLayoutOperands = llvm::any_of(op->getOpOperands(), [](OpOperand &opr) { xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(opr); return layout && layout.isForWorkgroup(); - }); */ + }); bool hasWgLayoutResults = llvm::any_of(op->getOpResults(), [](OpResult result) { xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(result); return layout && layout.isForWorkgroup(); }); - if (hasWgLayoutResults) { + if (hasWgLayoutOperands || hasWgLayoutResults) { LDBG() << "skip unrolling for op with workgroup level layout: " << *op; return false; } @@ -415,12 +383,9 @@ void XeGPUBlockingPass::runOnOperation() { } } - xegpu::LayoutAttr newLayout = nullptr; - if (tdescTy.getLayoutAttr()) - newLayout = tdescTy.getLayoutAttr().dropInstData(); - - newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding, - newLayout); + newTy = + xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding, + tdescTy.getLayoutAttr().dropInstData()); } else { newTy = VectorType::get(tileShape, elemTy); } @@ -447,14 +412,14 @@ void XeGPUBlockingPass::runOnOperation() { op->walk([](Operation *op) { // Remove the layout attributes cached per operands. for (OpOperand &opr : op->getOpOperands()) { - std::string name = xegpu::getLayoutName(opr); + std::string name = xegpu::getTempLayoutName(opr); if (op->hasAttrOfType(name)) op->removeAttr(name); } // Update the layout attributes per result. 
for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getLayoutName(result); + std::string name = xegpu::getTempLayoutName(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index fbe511af39f43..4be6b44ff3082 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -636,9 +636,9 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { auto dpasOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); - std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0)); - std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1)); - std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0)); + std::string layoutAName = xegpu::getTempLayoutName(dpasOp->getOpOperand(0)); + std::string layoutBName = xegpu::getTempLayoutName(dpasOp->getOpOperand(1)); + std::string layoutCName = xegpu::getTempLayoutName(dpasOp->getOpResult(0)); xegpu::LayoutAttr layoutA = dpasOp->getAttrOfType(layoutAName); @@ -869,11 +869,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { storeScatterOp, "Expected at most 2D result at SG level"); std::string layoutPayloadName = - xegpu::getLayoutName(storeScatterOp->getOpOperand(0)); + xegpu::getTempLayoutName(storeScatterOp->getOpOperand(0)); std::string layoutOffsetsName = - xegpu::getLayoutName(storeScatterOp->getOpOperand(2)); + xegpu::getTempLayoutName(storeScatterOp->getOpOperand(2)); std::string layoutMaskName = - xegpu::getLayoutName(storeScatterOp->getOpOperand(3)); + xegpu::getTempLayoutName(storeScatterOp->getOpOperand(3)); xegpu::LayoutAttr layoutPayload = storeScatterOp->getAttrOfType(layoutPayloadName); @@ -1152,9 +1152,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { "Expected 1D offsets and mask vector"); // Assume offset and mask producers will be distributed as well. std::string layoutOffsetsName = - xegpu::getLayoutName(loadGatherOp->getOpOperand(1)); + xegpu::getTempLayoutName(loadGatherOp->getOpOperand(1)); std::string layoutMaskName = - xegpu::getLayoutName(loadGatherOp->getOpOperand(2)); + xegpu::getTempLayoutName(loadGatherOp->getOpOperand(2)); xegpu::LayoutAttr layoutOffsets = loadGatherOp->getAttrOfType(layoutOffsetsName); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index be82cda574f1e..a42b4c394a476 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -1575,7 +1575,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { // Layout propagation pass will activated. 
getOperation()->walk([](Operation *op) { for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getLayoutName(result); + std::string name = xegpu::getTempLayoutName(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index e56d0a9cf0613..19998b619c291 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -101,13 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } -std::string xegpu::getLayoutName(const OpOperand &operand) { +std::string xegpu::getTempLayoutName(const OpOperand &operand) { const StringRef prefix("layout_operand_"); unsigned idx = const_cast(operand).getOperandNumber(); return llvm::formatv("{0}{1}", prefix, idx).str(); } -std::string xegpu::getLayoutName(const OpResult result) { +std::string xegpu::getTempLayoutName(const OpResult result) { const StringRef prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str(); } @@ -149,7 +149,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return anchorOp.getAnchorLayout(); } - std::string layoutName = getLayoutName(result); + std::string layoutName = getTempLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType(layoutName); } @@ -176,7 +176,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { // if (auto storeOp = dyn_cast(op)) // return storeOp.getLayoutAttr(); -// std::string layoutName = xegpu::getLayoutName(opr); +// std::string layoutName = xegpu::getTempLayoutName(opr); // if (op->hasAttr(layoutName)) // return op->getAttrOfType(layoutName); @@ -192,18 +192,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); - // if (auto loadOp = dyn_cast(op)) - // return loadOp.getLayoutAttr(); - - if (auto storeOp = dyn_cast(op)) - return storeOp.getLayoutAttr(); - - // check for "permament" layout only after "temporary" layout name lookup - if (auto storeScatterOp = dyn_cast(op)) - if (auto layout = storeScatterOp.getLayoutAttr()) - return layout; + if (auto anchorOp = dyn_cast(op)) { + return anchorOp.getAnchorLayout(); + } - std::string layoutName = xegpu::getLayoutName(opr); + std::string layoutName = xegpu::getTempLayoutName(opr); if (op->hasAttr(layoutName)) return op->getAttrOfType(layoutName); @@ -250,7 +243,7 @@ template void xegpu::setDistributeLayoutAttr(const T &operandOrResult, const DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getLayoutName(operandOrResult); + std::string name = xegpu::getTempLayoutName(operandOrResult); if (owner->hasAttrOfType(name)) return; @@ -294,7 +287,7 @@ void xegpu::setDistributeLayoutAttrs( template void xegpu::removeLayoutAttr(const T &operandOrResult) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getLayoutName(operandOrResult); + std::string name = xegpu::getTempLayoutName(operandOrResult); if (owner->hasAttrOfType(name)) owner->removeAttr(name); } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 93d51441f5b81..de0efdc1ccc34 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp 
@@ -184,7 +184,7 @@ class TestStepOpPattern : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto layoutName = xegpu::getLayoutName(op->getResult(0)); + auto layoutName = xegpu::getTempLayoutName(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); if (!sliceAttr || sliceAttr.getRank() != 1) return failure(); @@ -324,7 +324,7 @@ struct TestXeGPULayoutInterface target.addDynamicallyLegalOp( [&](vector::StepOp op) -> bool { - auto layoutName = xegpu::getLayoutName(op->getResult(0)); + auto layoutName = xegpu::getTempLayoutName(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); return isLegal(sliceAttr); }); From 3221cc84c9b7fd587492c27fe4f32d25a521f5fc Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Thu, 11 Dec 2025 06:28:17 +0000 Subject: [PATCH 16/28] refactor setDistributeLayoutAttr --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 12 +-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 89 +++++-------------- .../Dialect/XeGPU/optimize-transpose.mlir | 30 +++---- 3 files changed, 44 insertions(+), 87 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 5b05a2192f5a6..88ff28e7cc3c7 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -102,12 +102,12 @@ void removeLayoutAttr(const T &operandOrResult); /// applied recursively to the contained operations void removeLayoutAttrs(Operation *op); -/// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching -/// it to the owner's dictionary attributes -template || - std::is_same_v>> -void setDistributeLayoutAttr(const T &operandOrResult, +/// Sets the DistributeLayoutAttr for a given OpResult +void setDistributeLayoutAttr(const OpResult &Result, + const DistributeLayoutAttr layout); + +/// Sets the DistributeLayoutAttr for a given OpOperand +void setDistributeLayoutAttr(const OpOperand &opr, const DistributeLayoutAttr layout); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 19998b619c291..755a686baaa06 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -124,27 +124,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); - // // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr - // // if (auto convertOp = dyn_cast(defOp)) - // // return convertOp.getTargetLayoutAttr(); - - // // for LoadNdOp, the layout is stored in the tensor descriptor - // if (auto loadNd = dyn_cast(defOp)) - // return getDistributeLayoutAttr(loadNd.getTensorDesc()); - - // // for LoadMatrixOp, the layout is attached to the property of the op - // if (auto loadOp = dyn_cast(defOp)) - // return loadOp.getLayoutAttr(); - - // // // for StoreMatrixOp, the layout is attached to the property of the op - // // if (auto storeOp = dyn_cast(defOp)) - // // return storeOp.getLayoutAttr(); - - // // check for "permament" layout only after "temporary" layout name lookup - // // for backward compatibility - // if (auto loadGatherOp = dyn_cast(defOp)) - // return loadGatherOp.getLayoutAttr(); - if (auto anchorOp = dyn_cast(defOp)) { return anchorOp.getAnchorLayout(); } 
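To make the flattened hunk above easier to follow: after this refactor, getDistributeLayoutAttr(Value) resolves a layout in a fixed order. The sketch below only restates that order; the interface name AnchorLayoutOpInterface is a stand-in (the diff elides template arguments), so treat the exact spelling as an assumption rather than the real API.

// Sketch of the lookup order; AnchorLayoutOpInterface is a placeholder name.
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"

xegpu::DistributeLayoutAttr resolveLayout(mlir::Value value) {
  // 1. A tensor descriptor type carries its own layout.
  if (auto tdescTy =
          llvm::dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
    return tdescTy.getLayoutAttr();
  if (auto result = llvm::dyn_cast<mlir::OpResult>(value)) {
    mlir::Operation *defOp = result.getDefiningOp();
    // 2. A result of an anchor op uses the op's anchor layout.
    if (auto anchorOp = llvm::dyn_cast<xegpu::AnchorLayoutOpInterface>(defOp))
      return anchorOp.getAnchorLayout();
    // 3. Otherwise fall back to the temporary "layout_result_<idx>" attribute.
    std::string name = xegpu::getTempLayoutName(result);
    return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(name);
  }
  // 4. A loop block argument follows its tied loop init value (omitted here).
  return nullptr;
}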
@@ -166,28 +145,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return nullptr; } -// xegpu::DistributeLayoutAttr -// xegpu::getDistributeLayoutAttr(const OpOperand &opr) { -// Operation *op = opr.getOwner(); - -// if (auto loadOp = dyn_cast(op)) -// return loadOp.getLayoutAttr(); - -// if (auto storeOp = dyn_cast(op)) -// return storeOp.getLayoutAttr(); - -// std::string layoutName = xegpu::getTempLayoutName(opr); -// if (op->hasAttr(layoutName)) -// return op->getAttrOfType(layoutName); - -// // check for "permament" layout only after "temporary" layout name lookup -// if (auto storeScatterOp = dyn_cast(op)) -// if (auto layout = storeScatterOp.getLayoutAttr()) -// return layout; - -// return getDistributeLayoutAttr(opr.get()); -// } - xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); @@ -200,8 +157,8 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { if (op->hasAttr(layoutName)) return op->getAttrOfType(layoutName); - return nullptr; - // return getDistributeLayoutAttr(opr.get()); + // return nullptr; + return getDistributeLayoutAttr(opr.get()); } // Returns the permanent layout attribute for the given result if it's @@ -239,33 +196,33 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, return candidate; } -template -void xegpu::setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout) { - Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getTempLayoutName(operandOrResult); +void xegpu::setDistributeLayoutAttr( + const mlir::OpResult &result, + const mlir::xegpu::DistributeLayoutAttr layout) { + Operation *owner = result.getOwner(); + + if (auto anchorOp = dyn_cast(owner)) { + if (anchorOp.getAnchorLayout() == layout) + return; + return anchorOp.setAnchorLayout(layout); + } + std::string name = xegpu::getTempLayoutName(result); if (owner->hasAttrOfType(name)) return; - - DistributeLayoutAttr candidate = layout; - // if (respectPermLayout) - // candidate = maybePickPermanentLayout(layout, operandOrResult, owner, - // name); - - if (candidate) - owner->setAttr(name, candidate); + owner->setAttr(name, layout); } -// Explicit instantiation for OpResult -template void xegpu::setDistributeLayoutAttr( - const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout); +void xegpu::setDistributeLayoutAttr(const OpOperand &operand, + const DistributeLayoutAttr layout) { + Operation *owner = operand.getOwner(); -// Explicit instantiation for OpOperand -template void xegpu::setDistributeLayoutAttr( - const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout); + std::string name = xegpu::getTempLayoutName(operand); + + if (owner->hasAttrOfType(name)) + return; + owner->setAttr(name, layout); +} void xegpu::setDistributeLayoutAttrs( Operation *op, function_ref getLayoutImpl) { diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index 24a0de6ed48a5..c748c1ca5ef88 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -10,7 +10,7 @@ // CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64 // CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} +// CHECK-SAME: {layout = #xegpu.layout} // 
CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> @@ -38,7 +38,7 @@ gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64 // CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} +// CHECK-SAME: {layout = #xegpu.layout} // CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK: %[[T3:.*]] = vector.bitcast %[[T2]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x32xi8> @@ -73,7 +73,7 @@ gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8 // CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { // CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index // CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> @@ -115,8 +115,8 @@ gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16 // CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { // CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index -// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< -// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] <{layout = #xegpu.layout< +// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>}> : // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> @@ -159,13 +159,13 @@ gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %ar // CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout> // CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { // CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index -// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] // CHECK-SAME: {layout_result_0 = #xegpu.layout, offsets = [0, 0], strides = [1, 1]} // CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> // CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index -// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: 
%[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]] // CHECK-SAME: {layout_result_0 = #xegpu.layout, offsets = [0, 8], strides = [1, 1]} @@ -225,12 +225,12 @@ gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2 // CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout> // CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { // CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index -// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> // CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index -// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> @@ -244,12 +244,12 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg %c32 = arith.constant 32 : index %c256 = arith.constant 256 : index %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> - %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %1 = xegpu.load_nd %0[%c0, %c0] { layout = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr> %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { - %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } + %6 = xegpu.load_nd %3[%c0, %arg3] { layout = #b } : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> %20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> @@ -265,10 +265,10 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> - %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout = #a} 
: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> From 5448d6b498bfa6619bff7991583a4c5a7db0da7d Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Fri, 12 Dec 2025 05:03:50 +0000 Subject: [PATCH 17/28] adjusting tests to use anchor layout --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 3 +- .../XeGPU/Transforms/XeGPUBlocking.cpp | 6 +- .../Transforms/XeGPUSubgroupDistribute.cpp | 12 +- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 149 +++++++++++++++--- .../XeGPU/propagate-layout-inst-data.mlir | 14 +- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 56 +++---- mlir/test/Dialect/XeGPU/transform-ops.mlir | 2 +- mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 6 +- .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 32 ++-- .../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 22 +-- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 62 ++++---- mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 45 +++--- 13 files changed, 252 insertions(+), 159 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index d0206ebea9359..97c7f69cc5d5f 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -1119,7 +1119,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, layout = #xegpu.layout} - : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> + : vector<16xf32>, memref<1024xf32>, vector<16xi1>, vector<16xindex> ``` Example 4 (Lane level): diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 88ff28e7cc3c7..d851ed9e3ccf1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -113,8 +113,7 @@ void setDistributeLayoutAttr(const OpOperand &opr, /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. 
If the operation contains regions, it is also applied recursively /// to the contained operations -void setDistributeLayoutAttrs( - Operation *op, function_ref getLayoutImpl); +void retrieveDistributeLayoutAttrsRecursive(Operation *op); /// Extract a set of small vectors from a value with a given shape using /// vector.extract_stride_slice diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index b584479db7a7c..bddc8f8c8de68 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -158,7 +158,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { bool skipLeadingUnitDimRemoval = definingOp && (isa(definingOp)); + xegpu::StoreNdOp, xegpu::PrefetchNdOp, vector::BroadcastOp>( + definingOp)); if (!skipLeadingUnitDimRemoval) { auto it = llvm::find_if(instData, [](auto val) { return val != 1; }); instData.erase(instData.begin(), it); @@ -283,8 +284,7 @@ void XeGPUBlockingPass::runOnOperation() { // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. // This ensures that the LayoutAttr remains accessible even if the defining // operation is replaced. - xegpu::setDistributeLayoutAttrs( - op, [](Value v) { return xegpu::getDistributeLayoutAttr(v); }); + xegpu::retrieveDistributeLayoutAttrsRecursive(op); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 4be6b44ff3082..1b851d964c1a9 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1240,20 +1240,24 @@ static Value lowerToVectorReductions(TypedValue src, vector::ExtractStridedSliceOp extractOp = vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets, sliceSizes, {1, 1}); + int64_t nSliceElements = extractOp.getResult().getType().getNumElements(); + vector::ShapeCastOp slice = vector::ShapeCastOp::create( rewriter, loc, VectorType::get({nSliceElements}, sourceType.getElementType()), extractOp.getResult()); + // Shape cast is currently handled in xegpu side. So layouts must be // retained during lowering. Shape cast output has the same layout as the // accumulator. Shape cast source has the same layout as the original // reduction source. // TODO: other ops generated here may also need layout attributes. - xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), - xegpu::getDistributeLayoutAttr(src)); - xegpu::setDistributeLayoutAttr(slice->getOpResult(0), - xegpu::getDistributeLayoutAttr(acc)); + auto srcLayout = xegpu::getDistributeLayoutAttr(src); + auto accLayout = xegpu::getDistributeLayoutAttr(acc); + + xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), srcLayout); + xegpu::setDistributeLayoutAttr(slice->getOpResult(0), accLayout); // Extract and reduction results in scalars, so no result layout is needed. 
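  // Recap of the values captured above (editorial comment, not in the patch):
  // `srcLayout` is the distribute layout of the original 2-D reduction source
  // and is attached to the shape_cast input operand, while `accLayout` is the
  // layout of the 1-D accumulator and is attached to the shape_cast result,
  // so the layouts remain recoverable during later distribution.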
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); Value reduction = vector::ReductionOp::create( diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 755a686baaa06..2c3589bbe6e4f 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -113,52 +113,115 @@ std::string xegpu::getTempLayoutName(const OpResult result) { } xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { - if (!value) + if (!value) { + llvm::errs() << "getDistributeLayoutAttr: value is null\n"; return nullptr; + } + + llvm::errs() << "Getting layout for value: " << value << "\n"; if (auto tdescTy = - dyn_cast_if_present(value.getType())) - return tdescTy.getLayoutAttr(); + dyn_cast_if_present(value.getType())) { + auto layoutAttr = tdescTy.getLayoutAttr(); + llvm::errs() << " Found TensorDescType with layout\n"; + return layoutAttr; + } if (auto result = dyn_cast(value)) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); + llvm::errs() << " Value is OpResult from: " << *defOp << "\n"; if (auto anchorOp = dyn_cast(defOp)) { - return anchorOp.getAnchorLayout(); + auto layout = anchorOp.getAnchorLayout(); + llvm::errs() << " Returning anchor layout from defining op\n"; + return layout; } std::string layoutName = getTempLayoutName(result); - if (defOp->hasAttr(layoutName)) - return defOp->getAttrOfType(layoutName); + if (defOp->hasAttr(layoutName)) { + auto layout = + defOp->getAttrOfType(layoutName); + llvm::errs() << " Returning temp layout from attribute: " << layoutName + << "\n"; + return layout; + } + llvm::errs() << " No layout found for OpResult\n"; } if (auto arg = dyn_cast(value)) { auto *parentOp = arg.getOwner()->getParentOp(); + llvm::errs() << " Value is BlockArgument, parent op: " << *parentOp + << "\n"; if (auto loop = dyn_cast(parentOp)) { OpOperand *tiedInit = loop.getTiedLoopInit(arg); - if (tiedInit) - return getDistributeLayoutAttr(tiedInit->get()); + if (tiedInit) { + llvm::errs() << " Following tied loop init\n"; + auto layout = getDistributeLayoutAttr(tiedInit->get()); + return layout; + } } + llvm::errs() << " No tied loop init found\n"; } + llvm::errs() << " Returning nullptr\n"; return nullptr; } xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); + unsigned idx = const_cast(opr).getOperandNumber(); + + llvm::errs() << "Getting layout for operand " << idx << " of op: " << *op + << "\n"; if (auto anchorOp = dyn_cast(op)) { - return anchorOp.getAnchorLayout(); + if (auto dpasOp = dyn_cast(op)) { + if (idx == 0) { + llvm::errs() << " Returning DpasOp layout A\n"; + return dpasOp.getLayoutAAttr(); + } else if (idx == 1) { + llvm::errs() << " Returning DpasOp layout B\n"; + return dpasOp.getLayoutBAttr(); + } else if (idx == 2) { + llvm::errs() << " Returning DpasOp layout CD\n"; + return dpasOp.getLayoutCdAttr(); + } + } + if (auto convertOp = dyn_cast(op)) { + llvm::errs() << " Returning ConvertLayoutOp input layout\n"; + return convertOp.getInputLayoutAttr(); + } + auto layout = anchorOp.getAnchorLayout(); + // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp), + // the layout is valid for the first two operands: value and memref/tdesc. + // For other operations, the layout applies to the first operand only. 
+ if (isa( + op)) { + if (idx < 2) { + llvm::errs() << " Returning anchor layout for store op\n"; + return layout; + } + } else { + if (idx == 0) { + llvm::errs() << " Returning anchor layout\n"; + return layout; + } + } } std::string layoutName = xegpu::getTempLayoutName(opr); - if (op->hasAttr(layoutName)) - return op->getAttrOfType(layoutName); + if (op->hasAttr(layoutName)) { + auto layout = op->getAttrOfType(layoutName); + llvm::errs() << " Returning temp layout from attribute: " << layoutName + << "\n"; + return layout; + } - // return nullptr; - return getDistributeLayoutAttr(opr.get()); + auto layout = getDistributeLayoutAttr(opr.get()); + llvm::errs() << " Returning layout from operand value\n"; + return layout; } // Returns the permanent layout attribute for the given result if it's @@ -210,32 +273,70 @@ void xegpu::setDistributeLayoutAttr( std::string name = xegpu::getTempLayoutName(result); if (owner->hasAttrOfType(name)) return; - owner->setAttr(name, layout); + if (layout) + owner->setAttr(name, layout); } - void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); + unsigned idx = const_cast(operand).getOperandNumber(); + if (layout) + return; + if (auto anchorOp = dyn_cast(owner)) { + if (auto dpasOp = dyn_cast(owner)) { + if (idx == 0) { + llvm::errs() << " set DpasOp layout A\n"; + return dpasOp.setLayoutAAttr(layout); + } else if (idx == 1) { + llvm::errs() << " set DpasOp layout B\n"; + return dpasOp.setLayoutBAttr(layout); + } else if (idx == 2) { + llvm::errs() << " set DpasOp layout CD\n"; + return dpasOp.setLayoutCdAttr(layout); + } + } + if (auto convertOp = dyn_cast(owner)) { + llvm::errs() << " set ConvertLayoutOp input layout\n"; + return convertOp.setInputLayoutAttr(layout); + } - std::string name = xegpu::getTempLayoutName(operand); + // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp), + // the layout is valid for the first two operands: value and memref/tdesc. + // For other operations, the layout applies to the first operand only. 
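  // Operand-index convention used by the anchor-layout branches below and by
  // the matching getter above (editorial summary, not part of the patch):
  //   xegpu.dpas:           operand 0 -> layout_a, 1 -> layout_b, 2 -> layout_cd
  //   xegpu.convert_layout: operand 0 -> input_layout
  //   store_nd / store / store_matrix: operands 0 (value) and 1 (destination)
  //                         share the op's single anchor layout
  //   other anchor ops:     only operand 0 carries the anchor layout
  // Remaining operands are keyed by the temporary "layout_operand_<idx>" names.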
+ if (isa( + owner)) { + if (idx < 2) { + llvm::errs() << " set anchor layout for store op\n"; + anchorOp.setAnchorLayout(layout); + } + } else { + if (idx == 0) { + llvm::errs() << " set anchor layout\n"; + anchorOp.setAnchorLayout(layout); + } + } + } + std::string name = xegpu::getTempLayoutName(operand); if (owner->hasAttrOfType(name)) return; - owner->setAttr(name, layout); + if (layout) + owner->setAttr(name, layout); } -void xegpu::setDistributeLayoutAttrs( - Operation *op, function_ref getLayoutImpl) { +void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) { op->walk([&](Operation *nestOp) { - if (isa(nestOp)) - return; - for (OpOperand &opr : nestOp->getOpOperands()) { - auto layout = getLayoutImpl(opr.get()); + auto layout = getDistributeLayoutAttr(opr.get()); + llvm::errs() << "Setting layout for operand " << opr.getOperandNumber() + << " of op: " << *nestOp << "\n"; setDistributeLayoutAttr(opr, layout); } + for (OpResult result : nestOp->getOpResults()) { - auto layout = getLayoutImpl(result); + auto layout = getDistributeLayoutAttr(result); + llvm::errs() << "Setting layout for result " << result.getResultNumber() + << " of op: " << *nestOp << "\n"; setDistributeLayoutAttr(result, layout); } }); diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index 32fb3178a8af2..5f70831f45e97 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -8,7 +8,7 @@ // CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> // CHECK: xegpu.prefetch_nd %[[TDESC_SRC]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -> vector<8x32xf32> // CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout> gpu.module @test { @@ -32,11 +32,11 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, 
layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout // CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -73,7 +73,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> : //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> @@ -112,7 +112,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> : //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> @@ -141,7 +141,7 @@ gpu.module @test { // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout}> -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> // CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 48e77d867508b..b88d8e1a78a26 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -6,11 +6,11 @@ gpu.module @test { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// 
CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -32,7 +32,7 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { -// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} +// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index @@ -47,7 +47,7 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -109,7 +109,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -136,7 +136,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] 
<{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> @@ -185,7 +185,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout}> -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> // CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> @@ -204,7 +204,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> // CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> @@ -221,7 +221,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> // CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> @@ -241,7 +241,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> // CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME <{layout = 
#xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> @@ -257,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( -// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16> -// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout> -> vector<16x16xi16> // CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x16xi16> to vector<8x16xf16> @@ -282,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<16x8xi32> to vector<16x16xf16> @@ -303,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> @@ -340,9 +340,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { @@ -363,7 +363,7 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> -// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] 
{layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { @@ -386,11 +386,11 @@ gpu.module @test { // CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> // CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -426,11 +426,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -456,11 +456,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: 
i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -600,7 +600,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> @@ -622,7 +622,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] @@ -645,7 +645,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir index 561034fb5880b..13ed24ebf0a3a 100644 --- a/mlir/test/Dialect/XeGPU/transform-ops.mlir +++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir @@ -149,7 +149,7 @@ func.func @set_op_layout_attr_result_default_index(%arg0: memref<4096x4096xf16>, %4 = xegpu.create_nd_tdesc 
%arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16> %5 = xegpu.load_nd %4[0, 0] : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16> // CHECK: = xegpu.dpas - // CHECK-SAME: {layout_result_0 = #xegpu.layout} + // CHECK-SAME: {layout_cd = #xegpu.layout} %6 = xegpu.dpas %1, %3, %5 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16> return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index b3032bac351ac..9f9edcd416ddf 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -27,7 +27,7 @@ gpu.module @test_kernel { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> //CHECK-COUNT-8: xegpu.dpas {{.*}} - %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> + %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -513,8 +513,8 @@ gpu.module @test_kernel { //CHECK: [[c0:%.+]] = arith.constant 0 : index //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf16> //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index 4829af3612de3..9b00adaaa56da 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -41,7 +41,7 @@ gpu.module @test_round_robin_assignment { -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, 
#xegpu.layout> -> vector<256x128xf32> gpu.return @@ -54,8 +54,7 @@ gpu.module @test_round_robin_assignment { -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> // CHECK-NOT: xegpu.store_nd - %load = xegpu.load_nd %tdesc - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> xegpu.store_nd %load, %tdesc : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> @@ -79,20 +78,17 @@ gpu.module @test_round_robin_assignment { gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a - : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> -> vector<256x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x256xf16> -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b - : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> -> vector<128x256xf16> - %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + %dpas = xegpu.dpas %load_a, %load_b {layout_cd = #xegpu.layout} : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } @@ -114,7 +110,7 @@ gpu.module @test_round_robin_assignment { gpu.func @broadcast(%src: memref<128x1xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<128x1xf32> -> !xegpu.tensor_desc<128x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<128x1xf32, #xegpu.layout> -> vector<128x1xf32> // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} : vector<16x1xf32> to vector<16x32xf32> @@ -137,7 +133,7 @@ gpu.module @test_round_robin_assignment { // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) %2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args(%arg3 = %0, %arg4 = %1) -> (!xegpu.tensor_desc<256xf32, #xegpu.layout>, !xegpu.tensor_desc<256xf32, #xegpu.layout>) { - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %3 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> xegpu.store_nd %3, %arg3 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> %5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> @@ -153,7 +149,7 @@ gpu.module 
@test_round_robin_assignment { %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { @@ -166,7 +162,7 @@ gpu.module @test_round_robin_assignment { xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = arith.addi %arg3, %c1_i32 : i32 %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %6 = xegpu.load_nd %5 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> scf.yield %6, %4 : vector<256xf32>, i32 } gpu.return @@ -181,12 +177,12 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: scf.if // CHECK-SAME: (vector<16xf32>, vector<16xf32>) %4 = scf.if %3 -> (vector<256xf32>) { - %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %5 = xegpu.load_nd %1 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } else { - %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %5 = xegpu.load_nd %2 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> @@ -200,7 +196,7 @@ gpu.module @test_round_robin_assignment { %id = gpu.subgroup_id : index %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %d = xegpu.load_nd %t {layout = #xegpu.layout}: !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if @@ -224,7 +220,7 @@ gpu.module @test_round_robin_assignment { %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout> // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf32> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32> + %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32> %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x64xf32> gpu.return diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index c95c64084f3f8..b3ba99887d763 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -14,11 +14,11 @@ gpu.module 
@test_distribution { // CHECK-LABEL: load_nd_tdesc_with_offset gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> gpu.return @@ -30,7 +30,7 @@ gpu.module @test_distribution { // CHECK-NOT: xegpu.store_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> xegpu.store_nd %load, %tdesc[0, 0] @@ -53,23 +53,23 @@ gpu.module @test_distribution { // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> -> vector<256x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16> -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> -> vector<128x256xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_cd = #xegpu.layout} : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } @@ -80,7 +80,7 @@ gpu.module @test_distribution { %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = 
#xegpu.layout} : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> -> vector<256x64xf32> // CHECK-COUNT-2: vector.multi_reduction , {{.*}}, %[[CST]] [1] : vector<16x64xf32> to vector<16xf32> @@ -119,7 +119,7 @@ gpu.module @test_distribution { gpu.func @vector_transpose(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<32x16xf32> to vector<16x32xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index c557a2f140527..9179344b8fc2b 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -38,10 +38,10 @@ gpu.module @test_distribution { //CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[L_OFF_Y]], %[[C256]] : index //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index //CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index - //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> + //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> gpu.return @@ -53,7 +53,7 @@ gpu.module @test_distribution { //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> xegpu.store_nd %load, %tdesc[0, 0] @@ -75,42 +75,43 @@ gpu.module @test_distribution { // CHECK-LABEL: dpas gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } // CHECK-LABEL: dpas_no_sg_data 
gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout } : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -119,7 +120,7 @@ gpu.module @test_distribution { gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { // CHECK-NOT: vector<32x32xf32> %dpas = xegpu.dpas %a, %b - {layout = #xegpu.layout} + {layout_cd = #xegpu.layout} : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> gpu.return } @@ -129,7 +130,7 @@ gpu.module @test_distribution { gpu.func @broadcast_dim1(%src: memref<256x1xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32> -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x1xf32, #xegpu.layout> -> vector<256x1xf32> // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} @@ -145,7 +146,7 @@ gpu.module @test_distribution { gpu.func @broadcast_dim0(%src: memref<1x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32> -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<1x128xf32, #xegpu.layout> -> vector<1x128xf32> // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} @@ -175,9 +176,9 @@ gpu.module @test_distribution { %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> // load_nd with offset - %5 = xegpu.load_nd %2[%0, %1] : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> - %6 = xegpu.load_nd %3[%0, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %7 = xegpu.load_nd %4[%c0, %1] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> + %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> // scf.for loop // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> @@ -189,10 +190,10 @@ 
gpu.module @test_distribution { %8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5) -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { // load_nd with offset inside loop - %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_result_0 = #xegpu.layout} + %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> - %10 = xegpu.load_nd %3[%arg3, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %11 = xegpu.load_nd %4[%c0, %arg3] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> } // store_nd with offset @@ -215,7 +216,7 @@ gpu.module @test_distribution { // CHECK-NOT: index.sub %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> } {sg_id_range = #xegpu.range<[0, 32]>} @@ -228,7 +229,7 @@ gpu.module @test_distribution { // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] %tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> -> vector<128x64xf32> %exp = math.exp %load {layout_result_0 = #xegpu.layout} : vector<128x64xf32> @@ -244,7 +245,7 @@ gpu.module @test_distribution { %c32 = arith.constant 32 : index %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> %cond1 = arith.cmpi sge, %sg_id, %c3 : index @@ -257,7 +258,7 @@ gpu.module @test_distribution { // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]] %td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %ld = xegpu.load_nd %td[0, 0] + %ld = xegpu.load_nd %td[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> -> vector<128x64xf32> %exp = math.exp %ld {layout_result_0 = #xegpu.layout} : vector<128x64xf32> @@ -287,13 +288,11 @@ gpu.module @test_distribution { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8xi1> // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint, layout = #xegpu.layout}> - // CHECK-SAME: {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout, - // CHECK-SAME: layout_operand_3 = #xegpu.layout} - // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> + // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> %val = arith.constant {layout_result_0 = #xegpu.layout} dense<25.5> : vector<256xf16> %offset = arith.constant 
{layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> - xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout, + xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, layout_operand_2 = #xegpu.layout, layout_operand_3 = #xegpu.layout, l1_hint = #xegpu.cache_hint} @@ -370,7 +369,7 @@ gpu.module @test_distribution { %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<1.0> : vector<128xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32> -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<4x128xf32, #xegpu.layout> -> vector<4x128xf32> // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32> @@ -384,7 +383,7 @@ gpu.module @test_distribution { %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> -> vector<256x64xf32> // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32> @@ -468,7 +467,7 @@ gpu.module @test_distribution { gpu.func @vector_transpose(%src: memref<256x32xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x32xf32, #xegpu.layout> -> vector<256x32xf32> //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<64x32xf32> to vector<32x64xf32> @@ -624,7 +623,6 @@ gpu.module @test_distribution { %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> - // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index a8015cced7eb4..fb77cb93b60fc 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -89,17 +89,15 @@ gpu.module @test_1_1_assignment { gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, 
#xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -109,20 +107,18 @@ gpu.module @test_1_1_assignment { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -194,25 +190,24 @@ gpu.module @test_1_1_assignment { %0 = arith.muli %block_id_x, %c128 : index %1 = arith.muli %block_id_y, %c128 : index %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> + %3 = xegpu.load_nd %2 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) -> // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>) - // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> - // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16> // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16> // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> %6:3 = scf.for %arg3 
= %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) - -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout>, - !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32>) { - %8 = xegpu.load_nd %arg4 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %9 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %10 = xegpu.dpas %8, %9, %arg6 {layout_result_0 = #xegpu.layout} + -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32>) { + %8 = xegpu.load_nd %arg4 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %9 = xegpu.load_nd %arg5 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %10 = xegpu.dpas %8, %9, %arg6 {layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> %11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> @@ -221,7 +216,7 @@ gpu.module @test_1_1_assignment { } %7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - xegpu.store_nd %6#2, %7 : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + xegpu.store_nd %6#2, %7 {layout = #xegpu.layout } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> gpu.return } @@ -230,7 +225,7 @@ gpu.module @test_1_1_assignment { %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32) @@ -244,7 +239,7 @@ gpu.module @test_1_1_assignment { xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = arith.addi %arg3, %c1_i32 : i32 %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %6 = xegpu.load_nd %5 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> scf.yield %6, %4 : vector<256xf32>, i32 } gpu.return @@ -263,14 +258,14 @@ gpu.module @test_1_1_assignment { %5 = scf.if %4 -> (vector<256xf32>) { // CHECK-LABEL: xegpu.load_nd // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %2 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32> scf.yield %2 : vector<256xf32> } else { // CHECK-LABEL: xegpu.load_nd // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %3 = xegpu.load_nd %1 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32> scf.yield %3 : 
vector<256xf32> From efcc1b72b9de1e38aa5883eba7beec80ff912aae Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Fri, 12 Dec 2025 19:44:36 +0000 Subject: [PATCH 18/28] fixing XeGPU tests --- .../Transforms/XeGPUSubgroupDistribute.cpp | 63 ++++++++++++++++-- .../Transforms/XeGPUWgToSgDistribute.cpp | 16 +++-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 65 +++++++++++++++---- .../XeGPU/subgroup-distribute-unit.mlir | 7 +- .../Dialect/XeGPU/subgroup-distribute.mlir | 31 +++++---- .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 7 +- .../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 6 +- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 25 +++++-- mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 23 +++++-- .../Dialect/XeGPU/WG/simple_gemm.mlir | 26 ++++---- 10 files changed, 196 insertions(+), 73 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 1b851d964c1a9..4e663e1dd6694 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -630,22 +630,26 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred); - if (!operand) + if (!operand) { + DBGS() << "No dpas op result found in warp op\n"; return rewriter.notifyMatchFailure(warpOp, "warp result is not a xegpu::Dpas op"); + } auto dpasOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); - std::string layoutAName = xegpu::getTempLayoutName(dpasOp->getOpOperand(0)); - std::string layoutBName = xegpu::getTempLayoutName(dpasOp->getOpOperand(1)); - std::string layoutCName = xegpu::getTempLayoutName(dpasOp->getOpResult(0)); + DBGS() << "Found dpas op: " << *dpasOp << "\n"; xegpu::LayoutAttr layoutA = - dpasOp->getAttrOfType(layoutAName); + dyn_cast(dpasOp.getLayoutAAttr()); xegpu::LayoutAttr layoutB = - dpasOp->getAttrOfType(layoutBName); + dyn_cast(dpasOp.getLayoutBAttr()); xegpu::LayoutAttr layoutOut = - dpasOp->getAttrOfType(layoutCName); + dyn_cast(dpasOp.getLayoutCdAttr()); + + DBGS() << "Layout A: " << layoutA << ", Layout B: " << layoutB + << ", Layout Out: " << layoutOut << "\n"; + if (!layoutA || !layoutB || !layoutOut) return rewriter.notifyMatchFailure( dpasOp, @@ -657,6 +661,24 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType()); FailureOr distResultTypeByWarpOpOrFailure = getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType()); + + DBGS() << "Original LHS type: " << dpasOp.getLhsType() << ", distributed: " + << (succeeded(distLhsTypeByWarpOpOrFailure) + ? distLhsTypeByWarpOpOrFailure.value() + : Type()) + << "\n"; + DBGS() << "Original RHS type: " << dpasOp.getRhsType() << ", distributed: " + << (succeeded(distRhsTypeByWarpOpOrFailure) + ? distRhsTypeByWarpOpOrFailure.value() + : Type()) + << "\n"; + DBGS() << "Original Result type: " << dpasOp.getResultType() + << ", distributed: " + << (succeeded(distResultTypeByWarpOpOrFailure) + ? 
distResultTypeByWarpOpOrFailure.value() + : Type()) + << "\n"; + if (failed(distLhsTypeByWarpOpOrFailure) || failed(distRhsTypeByWarpOpOrFailure) || failed(distResultTypeByWarpOpOrFailure)) @@ -671,11 +693,14 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { distRhsTypeByWarpOpOrFailure.value()}; // Dpas acc operand is optional. if (dpasOp.getAcc()) { + DBGS() << "Dpas has accumulator operand\n"; newYieldValues.push_back(dpasOp.getAcc()); newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value()); } // Create a new warp op without the dpas. SmallVector newRetIndices; + DBGS() << "Creating new warp op with " << newYieldValues.size() + << " yielded values\n"; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); @@ -685,6 +710,23 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB); FailureOr expectedDistResultTyOrFailure = xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut); + + DBGS() << "Expected dist LHS type: " + << (succeeded(expectedDistLhsTyOrFailure) + ? expectedDistLhsTyOrFailure.value() + : Type()) + << "\n"; + DBGS() << "Expected dist RHS type: " + << (succeeded(expectedDistRhsTyOrFailure) + ? expectedDistRhsTyOrFailure.value() + : Type()) + << "\n"; + DBGS() << "Expected dist Result type: " + << (succeeded(expectedDistResultTyOrFailure) + ? expectedDistResultTyOrFailure.value() + : Type()) + << "\n"; + if (failed(expectedDistLhsTyOrFailure) || failed(expectedDistRhsTyOrFailure) || failed(expectedDistResultTyOrFailure)) @@ -704,6 +746,9 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { newDpasOperandExpectedTypes.push_back(distributedResultTy); for (unsigned i = 0; i < newRetIndices.size(); i++) { + DBGS() << "Resolving operand " << i << " with type " + << newWarpOp.getResult(newRetIndices[i]).getType() << " to " + << newDpasOperandExpectedTypes[i] << "\n"; newDpasOperands.push_back( resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]), newDpasOperandExpectedTypes[i], rewriter)); @@ -711,13 +756,17 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(), distributedResultTy, newDpasOperands, dpasOp->getAttrs()); + DBGS() << "Created new dpas op: " << *newDpasOp << "\n"; xegpu::removeLayoutAttrs(newDpasOp); Value distributedVal = newWarpOp.getResult(operandIdx); // Resolve the output type. 
+ DBGS() << "Resolving output from " << newDpasOp.getResult().getType() + << " to " << distResultTypeByWarpOpOrFailure.value() << "\n"; Value typeResolved = resolveDistributedTy(newDpasOp.getResult(), distResultTypeByWarpOpOrFailure.value(), rewriter); rewriter.replaceAllUsesWith(distributedVal, typeResolved); + DBGS() << "Successfully distributed dpas op\n"; return success(); } }; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index a42b4c394a476..af90674d4ce0d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -425,10 +425,11 @@ struct WgToSgDpasOp : public OpConversionPattern { if (resultTy.getRank() != 2) return failure(); - auto originalLayout = xegpu::getDistributeLayoutAttr(op.getResult()); - if (!originalLayout) + auto layoutCd = op.getLayoutCdAttr(); + auto layoutA = op.getLayoutAAttr(); + auto layoutB = op.getLayoutBAttr(); + if (!layoutCd || !layoutA || !layoutB) return failure(); - size_t i = 0; SmallVector newDpasOps; for (auto aVec : adaptor.getLhs()) { @@ -447,11 +448,12 @@ struct WgToSgDpasOp : public OpConversionPattern { llvm::cast(bVec.getType()).getShape(); VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]}, resultTy.getElementType()); - tmpC = xegpu::DpasOp::create(rewriter, loc, resTy, operands); - xegpu::setDistributeLayoutAttr(cast(tmpC), - originalLayout.dropSgLayoutAndData()); + auto newDpasOp = xegpu::DpasOp::create(rewriter, loc, resTy, operands); + newDpasOp.setLayoutCdAttr(layoutCd.dropSgLayoutAndData()); + newDpasOp.setLayoutAAttr(layoutA.dropSgLayoutAndData()); + newDpasOp.setLayoutBAttr(layoutB.dropSgLayoutAndData()); - newDpasOps.push_back(tmpC); + newDpasOps.push_back(newDpasOp); } } rewriter.replaceOpWithMultiple(op, {newDpasOps}); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 2c3589bbe6e4f..f4da557a8fa32 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -264,39 +264,53 @@ void xegpu::setDistributeLayoutAttr( const mlir::xegpu::DistributeLayoutAttr layout) { Operation *owner = result.getOwner(); + llvm::errs() << "Setting layout for result of op: " << *owner << "\n"; + if (auto anchorOp = dyn_cast(owner)) { if (anchorOp.getAnchorLayout() == layout) return; - return anchorOp.setAnchorLayout(layout); + llvm::errs() << " Setting anchor layout\n"; + anchorOp.setAnchorLayout(layout); + llvm::errs() << " After setting anchor layout, op: " << *owner << "\n"; + return; } std::string name = xegpu::getTempLayoutName(result); - if (owner->hasAttrOfType(name)) + if (owner->hasAttrOfType(name)) { return; - if (layout) + } + if (layout) { + llvm::errs() << " Setting temp layout attribute: " << name << "\n"; owner->setAttr(name, layout); + llvm::errs() << " After setting temp layout, op: " << *owner << "\n"; + } } void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); unsigned idx = const_cast(operand).getOperandNumber(); - if (layout) + + llvm::errs() << "Setting layout for operand 1" << idx << " of op: " << *owner + << " with layout: " << layout << "\n"; + + if (!layout) { return; + } if (auto anchorOp = dyn_cast(owner)) { if (auto dpasOp = dyn_cast(owner)) { if (idx == 0) { - llvm::errs() << " set DpasOp layout A\n"; + llvm::errs() << " Setting DpasOp layout A\n"; return 
dpasOp.setLayoutAAttr(layout); } else if (idx == 1) { - llvm::errs() << " set DpasOp layout B\n"; + llvm::errs() << " Setting DpasOp layout B\n"; return dpasOp.setLayoutBAttr(layout); } else if (idx == 2) { - llvm::errs() << " set DpasOp layout CD\n"; + llvm::errs() << " Setting DpasOp layout CD\n"; return dpasOp.setLayoutCdAttr(layout); } } if (auto convertOp = dyn_cast(owner)) { - llvm::errs() << " set ConvertLayoutOp input layout\n"; + llvm::errs() << " Setting ConvertLayoutOp input layout\n"; return convertOp.setInputLayoutAttr(layout); } @@ -306,22 +320,28 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, if (isa( owner)) { if (idx < 2) { - llvm::errs() << " set anchor layout for store op\n"; + llvm::errs() << " Setting anchor layout for store op\n"; anchorOp.setAnchorLayout(layout); + llvm::errs() << " After setting anchor layout, op: " << *owner << "\n"; } } else { if (idx == 0) { - llvm::errs() << " set anchor layout\n"; + llvm::errs() << " Setting anchor layout\n"; anchorOp.setAnchorLayout(layout); + llvm::errs() << " After setting anchor layout, op: " << *owner << "\n"; } } } std::string name = xegpu::getTempLayoutName(operand); - if (owner->hasAttrOfType(name)) + if (owner->hasAttrOfType(name)) { return; - if (layout) + } + if (layout) { + llvm::errs() << " Setting temp layout attribute: " << name << "\n"; owner->setAttr(name, layout); + llvm::errs() << " After setting temp layout, op: " << *owner << "\n"; + } } void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) { @@ -448,6 +468,11 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( .getResult(0); }; + llvm::errs() + << "\n=== Stage 1: Converting VectorType to RankedTensorType ===\n"; + llvm::errs() << "Before conversion:\n"; + op->dump(); + { // convert VectorType to RankedTensorType for SCF Structural ops TypeConverter converter; converter.addConversion([](Type type) -> Type { return type; }); @@ -466,6 +491,12 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( (void)mlir::applyPartialConversion(op, target, std::move(patterns)); } + llvm::errs() << "\nAfter Stage 1:\n"; + op->dump(); + + llvm::errs() << "\n=== Stage 2: Propagating layout attributes to " + "RankedTensorType ===\n"; + { // propagate the layout attribute to RankedTensorType by checking // BuiltInUnrealizedCastOps // for VectorType to RankedTensorType cast. 
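  // The helper runs in three stages: (1) convert VectorType values to
  // RankedTensorType so the SCF structural ops can carry them, (2) propagate
  // the DistributeLayoutAttr onto those tensor types through the builtin
  // unrealized cast ops, and (3) convert back to VectorType based on the
  // propagated layout.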
@@ -520,6 +551,12 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( }); } + llvm::errs() << "\nAfter Stage 2:\n"; + op->dump(); + + llvm::errs() + << "\n=== Stage 3: Converting RankedTensorType back to VectorType ===\n"; + { // perform the conversion from RankedTensorType to VectorType based on the // DistributeLayoutAttr @@ -583,6 +620,10 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( target); (void)mlir::applyPartialConversion(op, target, std::move(patterns)); } + + llvm::errs() << "\nAfter Stage 3 (final):\n"; + op->dump(); + llvm::errs() << "\n=== Transformation complete ===\n\n"; } std::optional xegpu::getChipStr(Operation *op) { diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index 216f3d19cff94..6f4678400239e 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -140,10 +140,9 @@ gpu.func @dpas(%laneid: index) { %2 = "some_op"() : () -> vector<8x16xf32> %3 = xegpu.dpas %0, %1, %2 { - layout_operand_0 = #xegpu.layout, - layout_operand_1 = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_result_0 = #xegpu.layout + layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout } : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> gpu.yield %3 : vector<8x16xf32> diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index ee07b92f6e795..87c67ba6bf324 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -20,18 +20,20 @@ gpu.module @xevm_module{ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> %1 = xegpu.load_nd %0[%c0, %c0] - {layout_result_0 = #xegpu.layout} : + {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %3 = xegpu.load_nd %2[%c0, %c0] - {layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %4 = xegpu.dpas %1, %3 - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %5 = math.exp %4 @@ -84,7 +86,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> %3 = xegpu.load_nd %2[%0, %1] - {layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { @@ -95,14 +97,16 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> %7 = xegpu.load_nd %5[%0, %arg3] - {layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> %8 = xegpu.load_nd %6[%arg3, %1] - {layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> %9 = xegpu.dpas %7, %8, %arg4 - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : 
vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> scf.yield %9 : vector<8x16xf32> @@ -194,17 +198,20 @@ gpu.module @xevm_module{ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} + %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} + %3 = xegpu.load_nd %2[%c0, %c0] {layout = #xegpu.layout} : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> - %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} + %6 = xegpu.dpas %1, %5 + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -341,7 +348,7 @@ gpu.module @xevm_module{ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %1 = vector.multi_reduction , %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16> %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x16xf16> @@ -356,7 +363,7 @@ gpu.module @xevm_module{ gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: vector<16xi1> - %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16> + %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16> %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x1xf16> %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout} : vector<16x1xf16> to vector<16x16xf16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index 9b00adaaa56da..6b8b4f282b744 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -78,7 +78,7 @@ gpu.module @test_round_robin_assignment { gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> 
!xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> @@ -88,7 +88,10 @@ gpu.module @test_round_robin_assignment { -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> -> vector<128x256xf16> - %dpas = xegpu.dpas %load_a, %load_b {layout_cd = #xegpu.layout} + %dpas = xegpu.dpas %load_a, %load_b + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index b3ba99887d763..f9a79b36c5790 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -56,7 +56,7 @@ gpu.module @test_distribution { // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> @@ -69,7 +69,9 @@ gpu.module @test_distribution { : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> -> vector<128x256xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 9179344b8fc2b..da6ad976d3730 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -75,7 +75,7 @@ gpu.module @test_distribution { // CHECK-LABEL: dpas gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} @@ -87,14 
+87,16 @@ gpu.module @test_distribution { : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } // CHECK-LABEL: dpas_no_sg_data gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> @@ -111,7 +113,11 @@ gpu.module @test_distribution { order = [1, 0]>> -> vector<128x128xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -120,7 +126,9 @@ gpu.module @test_distribution { gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { // CHECK-NOT: vector<32x32xf32> %dpas = xegpu.dpas %a, %b - {layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> gpu.return } @@ -190,8 +198,11 @@ gpu.module @test_distribution { %8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5) -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { // load_nd with offset inside loop - %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_cd = #xegpu.layout} - : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> + %9 = xegpu.dpas %arg4, %arg5, %arg6 + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index fb77cb93b60fc..50081ed34fe78 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -95,9 +95,11 @@ gpu.module @test_1_1_assignment { -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = 
#xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -116,9 +118,11 @@ gpu.module @test_1_1_assignment { %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -141,7 +145,9 @@ gpu.module @test_1_1_assignment { gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { // CHECK-NOT: vector<32x32xf32> %dpas = xegpu.dpas %a, %b - {layout = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> gpu.return } @@ -207,8 +213,11 @@ gpu.module @test_1_1_assignment { -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32>) { %8 = xegpu.load_nd %arg4 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %9 = xegpu.load_nd %arg5 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %10 = xegpu.dpas %8, %9, %arg6 {layout_cd = #xegpu.layout} - : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> + %10 = xegpu.dpas %8, %9, %arg6 + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> %11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir index 3f2fff9ab51e9..37f6d33e8ac30 100644 --- a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir +++ b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir @@ -53,34 +53,34 @@ module @gemm attributes {gpu.container_module} { %m = arith.muli %block_id_x, %c256 : index %n = arith.muli %block_id_y, %c256 : index %c_tdesc = xegpu.create_nd_tdesc %C : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #c> - %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32> + %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] {layout = #c}: !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32> %a_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a> %b_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b> // Prefetch A 3 times. 
%a_prefetch_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> // Prefetch B 3 times. %b_prefetch_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> %out = scf.for %k = %c0 to %c256 step %c32 iter_args(%c_value = %c_init_value) -> (vector<256x256xf32>) { - %a_value = xegpu.load_nd %a_tdesc[%m, %k] : !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16> - %b_value = xegpu.load_nd %b_tdesc[%k, %n] : !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16> + %a_value = xegpu.load_nd %a_tdesc[%m, %k] {layout = #a}: !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16> + %b_value = xegpu.load_nd %b_tdesc[%k, %n] {layout = #b}: !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16> // Prefetch next tiles. 
%prefetch_offset = arith.addi %k, %c96 : index - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> - %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_result_0 = #c} + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> + %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_a = #a, layout_b = #b, layout_cd = #c} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32> scf.yield %c_new_value : vector<256x256xf32> } - xegpu.store_nd %out, %c_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c> + xegpu.store_nd %out, %c_tdesc[%m, %n] {layout = #c}: vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c> gpu.return } } From ab0ccf0136ba4b28cba1391cca40d75f0c4026c2 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 13 Dec 2025 02:58:27 +0000 Subject: [PATCH 19/28] add get/setTempDistributeLayoutAttr --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 28 ++-- .../XeGPU/Transforms/XeGPUBlocking.cpp | 5 +- .../Transforms/XeGPUSubgroupDistribute.cpp | 63 ++++---- .../Transforms/XeGPUWgToSgDistribute.cpp | 53 ++++--- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 134 ++++++++---------- .../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 30 ++-- 6 files changed, 155 insertions(+), 158 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index d851ed9e3ccf1..7886c0269a15d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -76,21 +76,11 @@ std::string getTempLayoutName(const OpResult result); /// found. DistributeLayoutAttr getDistributeLayoutAttr(const Value value); -template -AttrTy getDistributeLayoutAttrOfType(const Value value) { - return dyn_cast_if_present(getDistributeLayoutAttr(value)); -} - /// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It /// will first check the operand_layout_{id} of the owner operation. If not /// found, it will check the operand itself and its defining op. DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); -template -AttrTy getDistributeLayoutAttrOfType(const OpOperand &opr) { - return dyn_cast_if_present(getDistributeLayoutAttr(opr)); -} - /// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. template || @@ -110,11 +100,29 @@ void setDistributeLayoutAttr(const OpResult &Result, void setDistributeLayoutAttr(const OpOperand &opr, const DistributeLayoutAttr layout); +/// get and set distribute layout attribute for non-anchor operations +/// (and offsets/masks of load/store ops before we get rid of their temp attrs) +template || + std::is_same_v>> +DistributeLayoutAttr getTempDistributeLayoutAttr(const T &operandOrResult); + +template || + std::is_same_v>> +void setTempDistributeLayoutAttr(const T &operandOrResult, + const DistributeLayoutAttr layout); + /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. 
If the operation contains regions, it is also applied recursively /// to the contained operations void retrieveDistributeLayoutAttrsRecursive(Operation *op); +/// Attach layout attributes to all vector-type operands of operations within +/// the given operation's region. Reports an error if any vector operand lacks +/// a layout attribute. +bool localPropagateLayoutsFromAnchor(Operation *rootOp); + /// Extract a set of small vectors from a value with a given shape using /// vector.extract_stride_slice SmallVector extractVectorsWithShapeFromValue(OpBuilder &builder, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index bddc8f8c8de68..9d74433e1d673 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -158,8 +158,7 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { bool skipLeadingUnitDimRemoval = definingOp && (isa( - definingOp)); + xegpu::StoreNdOp, xegpu::PrefetchNdOp>(definingOp)); if (!skipLeadingUnitDimRemoval) { auto it = llvm::find_if(instData, [](auto val) { return val != 1; }); instData.erase(instData.begin(), it); @@ -284,6 +283,8 @@ void XeGPUBlockingPass::runOnOperation() { // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. // This ensures that the LayoutAttr remains accessible even if the defining // operation is replaced. + // TODO-LayoutRefactor: unify the local propagation for layout preprocessing + // replace the function with localPropagateLayoutsFromAnchor xegpu::retrieveDistributeLayoutAttrsRecursive(op); auto getTileShapeAndCount = [](llvm::ArrayRef shape, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 4e663e1dd6694..bf8764e1f4541 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1273,8 +1273,9 @@ static Value lowerToVectorReductions(TypedValue src, rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. - xegpu::setDistributeLayoutAttr(cast(reductionResult), - xegpu::getDistributeLayoutAttr(acc)); + xegpu::setDistributeLayoutAttr( + cast(reductionResult), + xegpu::getTempDistributeLayoutAttr(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1302,8 +1303,10 @@ static Value lowerToVectorReductions(TypedValue src, // accumulator. Shape cast source has the same layout as the original // reduction source. // TODO: other ops generated here may also need layout attributes. 
- auto srcLayout = xegpu::getDistributeLayoutAttr(src); - auto accLayout = xegpu::getDistributeLayoutAttr(acc); + auto srcLayout = + xegpu::getTempDistributeLayoutAttr(dyn_cast(src)); + auto accLayout = + xegpu::getTempDistributeLayoutAttr(dyn_cast(acc)); xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), srcLayout); xegpu::setDistributeLayoutAttr(slice->getOpResult(0), accLayout); @@ -1399,7 +1402,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { cast(warpOp.getResult(operandIdx).getType()); VectorType resultType = cast(reductionOp.getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(reductionOp.getSource()); + xegpu::getTempDistributeLayoutAttr(reductionOp->getOpOperand(0)); FailureOr sourceDistTypeOrFailure = getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); @@ -1561,9 +1564,10 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { dyn_cast(broadcastOp.getResult().getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(broadcastOp->getOpOperand(0)); + xegpu::getTempDistributeLayoutAttr(broadcastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getDistributeLayoutAttr(broadcastOp.getResult()); + xegpu::getTempDistributeLayoutAttr( + dyn_cast(broadcastOp.getResult())); FailureOr sourceDistType; Type sourceElemOrDistType; @@ -1652,9 +1656,10 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { auto resultDistTy = cast(warpOp.getResult(operandNumber).getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0)); + xegpu::getTempDistributeLayoutAttr(shapeCastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getDistributeLayoutAttr(shapeCastOp.getResult()); + xegpu::getTempDistributeLayoutAttr( + dyn_cast(shapeCastOp.getResult())); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( warpOp, @@ -1737,7 +1742,7 @@ struct VectorExtractStridedSliceDistribution int sourceDistrDimSize = extractOp.getSourceVectorType().getShape()[distributedDim]; auto sourceLayout = - xegpu::getDistributeLayoutAttr(extractOp->getOpOperand(0)); + xegpu::getTempDistributeLayoutAttr(extractOp->getOpOperand(0)); if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) return rewriter.notifyMatchFailure( warpOp, "the source of extract_strided_slice op lacks distribution " @@ -1848,9 +1853,9 @@ struct VectorInsertStridedSliceDistribution int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim); // Obtain the source and dest layouts. 
auto destLayout = - xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(1)); + xegpu::getTempDistributeLayoutAttr(insertOp->getOpOperand(1)); auto sourceLayout = - xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(0)); + xegpu::getTempDistributeLayoutAttr(insertOp->getOpOperand(0)); if (!destLayout || !sourceLayout || destLayout.getEffectiveLaneLayoutAsInt().empty() || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) @@ -1965,7 +1970,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedSourceType = getDistVecTypeBasedOnLaneLayout( - xegpu::getDistributeLayoutAttr(bitcastOp.getSource()), + xegpu::getTempDistributeLayoutAttr(bitcastOp->getOpOperand(0)), bitcastOp.getSourceVectorType()) .value_or(VectorType()); if (!distributedSourceType) @@ -2008,9 +2013,9 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { auto transposeOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(transposeOp.getVector()); + xegpu::getTempDistributeLayoutAttr(transposeOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getDistributeLayoutAttr(transposeOp.getResult()); + xegpu::getTempDistributeLayoutAttr(transposeOp->getOpResult(0)); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( transposeOp, @@ -2093,30 +2098,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // 1) It is assumed that there are no layout conflicts. // 2) Any existing layout attributes attached to the operands are ignored. Operation *op = getOperation(); - op->walk([&](Operation *op) { - for (OpOperand &operand : op->getOpOperands()) { - // Layouts are needed for vector type only. - if (!isa(operand.get().getType())) - continue; - // if (isa(op)) - // xegpu::DpasOp, - // xegpu::LoadGatherOp, xegpu::StoreScatterOp, - // xegpu::LoadNdOp, xegpu::StoreNdOp>(op)) - // continue; - - auto layout = xegpu::getDistributeLayoutAttr(operand.get()); - if (!layout) { - op->emitError("Could not find layout attribute for operand ") - << operand.getOperandNumber() << " of operation " << op->getName(); - signalPassFailure(); - return; - } - xegpu::setDistributeLayoutAttr(operand, layout); - } - }); - - // dump out the op here - // getOperation()->dump(); + if (!xegpu::localPropagateLayoutsFromAnchor(op)) { + signalPassFailure(); + return; + } // Step 2: Move all operations of a GPU function inside // gpu.warp_execute_on_lane_0 operation. 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index af90674d4ce0d..f9f5a1f6b285e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -492,8 +492,8 @@ struct WgToSgVectorBroadcastOp VectorType resultType = op.getResult().getType(); ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::DistributeLayoutAttr layout = xegpu::getTempDistributeLayoutAttr( + llvm::cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -535,8 +535,8 @@ struct WgToSgElementwiseOp : public ConversionPattern { ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op->getResult(0)); + xegpu::DistributeLayoutAttr layout = xegpu::getTempDistributeLayoutAttr( + llvm::cast(op->getResult(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -742,7 +742,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { return failure(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -903,8 +903,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); + if (!layout || !layout.isForWorkgroup()) return failure(); @@ -957,8 +957,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getOperand(0)); + xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); + if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1052,7 +1052,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1109,7 +1109,7 @@ struct WgToSgVectorShapeCastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1142,6 +1142,7 @@ struct WgToSgVectorShapeCastOp // must be a slice of higher rank layout. 
int64_t sourceRank = srcType.getRank(); int64_t resultRank = sgShape.size(); + // TODO-LayoutRefactor: handle the case using getTempDistributeLayoutAttr xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getSource()); if (sourceRank < resultRank && !sourceLayout.isSliceOf(layout)) @@ -1182,7 +1183,7 @@ struct WgToSgMultiDimReductionOp auto srcShape = srcType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1237,10 +1238,10 @@ struct WgToSgVectorTransposeOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); - + // TODO-LayoutRefactor: handle the case using getTempDistributeLayoutAttr xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getVector()); if (!sourceLayout || !sourceLayout.isForWorkgroup()) @@ -1299,7 +1300,7 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { typename OpConversionPattern::OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1390,6 +1391,14 @@ struct XeGPUWgToSgDistributePass } // namespace void XeGPUWgToSgDistributePass::runOnOperation() { + + // TODO-LayoutRefactor: unify the local propagation for layout preprocessing + // Operation *op = getOperation(); + // if (!xegpu::localPropagateLayoutsFromAnchor(op)) { + // signalPassFailure(); + // return; + // } + // Track existing UnrealizedConversionCastOps SmallVector existingCastOps; getOperation()->walk([&](UnrealizedConversionCastOp castOp) { @@ -1480,7 +1489,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { }); target.addDynamicallyLegalOp([=](xegpu::DpasOp op) -> bool { - auto layout = xegpu::getDistributeLayoutAttr(op.getResult()); + auto layout = op.getLayoutCdAttr(); return isLegal(layout); }); @@ -1500,7 +1509,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() { if (!vecType) return true; - auto layout = xegpu::getDistributeLayoutAttr(op.getResult()); + auto layout = xegpu::getTempDistributeLayoutAttr( + dyn_cast(op.getResult())); return isLegal(layout); }); @@ -1510,19 +1520,20 @@ void XeGPUWgToSgDistributePass::runOnOperation() { vector::ConstantMaskOp, vector::CreateMaskOp>( [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. 
- auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); + auto layout = xegpu::getTempDistributeLayoutAttr( + dyn_cast(op->getResult(0))); return isLegal(layout); }); target.addDynamicallyLegalOp( [=](xegpu::LoadGatherOp op) -> bool { - auto layout = xegpu::getDistributeLayoutAttr(op.getResult()); + auto layout = op.getLayoutAttr(); return isLegal(layout); }); target.addDynamicallyLegalOp( [=](xegpu::StoreScatterOp op) -> bool { - auto layout = xegpu::getDistributeLayoutAttr(op.getOperand(0)); + auto layout = op.getLayoutAttr(); return isLegal(layout); }); @@ -1552,7 +1563,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { } xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op->getResult(0)); + xegpu::getTempDistributeLayoutAttr(op->getResult(0)); return isLegal(layout); }); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index f4da557a8fa32..49d6b45c2791c 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -114,27 +114,21 @@ std::string xegpu::getTempLayoutName(const OpResult result) { xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { if (!value) { - llvm::errs() << "getDistributeLayoutAttr: value is null\n"; return nullptr; } - llvm::errs() << "Getting layout for value: " << value << "\n"; - if (auto tdescTy = dyn_cast_if_present(value.getType())) { auto layoutAttr = tdescTy.getLayoutAttr(); - llvm::errs() << " Found TensorDescType with layout\n"; return layoutAttr; } if (auto result = dyn_cast(value)) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); - llvm::errs() << " Value is OpResult from: " << *defOp << "\n"; if (auto anchorOp = dyn_cast(defOp)) { auto layout = anchorOp.getAnchorLayout(); - llvm::errs() << " Returning anchor layout from defining op\n"; return layout; } @@ -142,55 +136,39 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { if (defOp->hasAttr(layoutName)) { auto layout = defOp->getAttrOfType(layoutName); - llvm::errs() << " Returning temp layout from attribute: " << layoutName - << "\n"; return layout; } - llvm::errs() << " No layout found for OpResult\n"; } if (auto arg = dyn_cast(value)) { auto *parentOp = arg.getOwner()->getParentOp(); - llvm::errs() << " Value is BlockArgument, parent op: " << *parentOp - << "\n"; if (auto loop = dyn_cast(parentOp)) { OpOperand *tiedInit = loop.getTiedLoopInit(arg); if (tiedInit) { - llvm::errs() << " Following tied loop init\n"; auto layout = getDistributeLayoutAttr(tiedInit->get()); return layout; } } - llvm::errs() << " No tied loop init found\n"; } - llvm::errs() << " Returning nullptr\n"; return nullptr; } - xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); unsigned idx = const_cast(opr).getOperandNumber(); - llvm::errs() << "Getting layout for operand " << idx << " of op: " << *op - << "\n"; - if (auto anchorOp = dyn_cast(op)) { if (auto dpasOp = dyn_cast(op)) { if (idx == 0) { - llvm::errs() << " Returning DpasOp layout A\n"; return dpasOp.getLayoutAAttr(); } else if (idx == 1) { - llvm::errs() << " Returning DpasOp layout B\n"; return dpasOp.getLayoutBAttr(); } else if (idx == 2) { - llvm::errs() << " Returning DpasOp layout CD\n"; return dpasOp.getLayoutCdAttr(); } } if (auto convertOp = dyn_cast(op)) { - llvm::errs() << " Returning ConvertLayoutOp input layout\n"; return convertOp.getInputLayoutAttr(); } 
auto layout = anchorOp.getAnchorLayout(); @@ -200,12 +178,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { if (isa( op)) { if (idx < 2) { - llvm::errs() << " Returning anchor layout for store op\n"; return layout; } } else { if (idx == 0) { - llvm::errs() << " Returning anchor layout\n"; return layout; } } @@ -214,13 +190,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { std::string layoutName = xegpu::getTempLayoutName(opr); if (op->hasAttr(layoutName)) { auto layout = op->getAttrOfType(layoutName); - llvm::errs() << " Returning temp layout from attribute: " << layoutName - << "\n"; return layout; } auto layout = getDistributeLayoutAttr(opr.get()); - llvm::errs() << " Returning layout from operand value\n"; return layout; } @@ -264,14 +237,10 @@ void xegpu::setDistributeLayoutAttr( const mlir::xegpu::DistributeLayoutAttr layout) { Operation *owner = result.getOwner(); - llvm::errs() << "Setting layout for result of op: " << *owner << "\n"; - if (auto anchorOp = dyn_cast(owner)) { if (anchorOp.getAnchorLayout() == layout) return; - llvm::errs() << " Setting anchor layout\n"; anchorOp.setAnchorLayout(layout); - llvm::errs() << " After setting anchor layout, op: " << *owner << "\n"; return; } @@ -280,37 +249,29 @@ void xegpu::setDistributeLayoutAttr( return; } if (layout) { - llvm::errs() << " Setting temp layout attribute: " << name << "\n"; owner->setAttr(name, layout); - llvm::errs() << " After setting temp layout, op: " << *owner << "\n"; } } + void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); unsigned idx = const_cast(operand).getOperandNumber(); - llvm::errs() << "Setting layout for operand 1" << idx << " of op: " << *owner - << " with layout: " << layout << "\n"; - if (!layout) { return; } if (auto anchorOp = dyn_cast(owner)) { if (auto dpasOp = dyn_cast(owner)) { if (idx == 0) { - llvm::errs() << " Setting DpasOp layout A\n"; return dpasOp.setLayoutAAttr(layout); } else if (idx == 1) { - llvm::errs() << " Setting DpasOp layout B\n"; return dpasOp.setLayoutBAttr(layout); } else if (idx == 2) { - llvm::errs() << " Setting DpasOp layout CD\n"; return dpasOp.setLayoutCdAttr(layout); } } if (auto convertOp = dyn_cast(owner)) { - llvm::errs() << " Setting ConvertLayoutOp input layout\n"; return convertOp.setInputLayoutAttr(layout); } @@ -320,15 +281,11 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, if (isa( owner)) { if (idx < 2) { - llvm::errs() << " Setting anchor layout for store op\n"; anchorOp.setAnchorLayout(layout); - llvm::errs() << " After setting anchor layout, op: " << *owner << "\n"; } } else { if (idx == 0) { - llvm::errs() << " Setting anchor layout\n"; anchorOp.setAnchorLayout(layout); - llvm::errs() << " After setting anchor layout, op: " << *owner << "\n"; } } } @@ -338,30 +295,86 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, return; } if (layout) { - llvm::errs() << " Setting temp layout attribute: " << name << "\n"; owner->setAttr(name, layout); - llvm::errs() << " After setting temp layout, op: " << *owner << "\n"; } } +template +xegpu::DistributeLayoutAttr +xegpu::getTempDistributeLayoutAttr(const T &operandOrResult) { + Operation *op = operandOrResult.getOwner(); + + std::string layoutName = xegpu::getTempLayoutName(operandOrResult); + if (op->hasAttr(layoutName)) { + auto layout = op->getAttrOfType(layoutName); + return layout; + } + + return nullptr; +} + +template xegpu::DistributeLayoutAttr 
+xegpu::getTempDistributeLayoutAttr(const OpResult &result); +template xegpu::DistributeLayoutAttr +xegpu::getTempDistributeLayoutAttr(const OpOperand &operand); + +template +void xegpu::setTempDistributeLayoutAttr( + const T &operandOrResult, const xegpu::DistributeLayoutAttr layout) { + Operation *owner = operandOrResult.getOwner(); + std::string name = xegpu::getTempLayoutName(operandOrResult); + if (owner->hasAttrOfType(name)) { + return; + } + if (layout) { + owner->setAttr(name, layout); + } +} + +template void xegpu::setTempDistributeLayoutAttr( + const mlir::OpResult &result, + const mlir::xegpu::DistributeLayoutAttr layout); + +template void xegpu::setTempDistributeLayoutAttr( + const mlir::OpOperand &operand, + const mlir::xegpu::DistributeLayoutAttr layout); + void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) { op->walk([&](Operation *nestOp) { for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getDistributeLayoutAttr(opr.get()); - llvm::errs() << "Setting layout for operand " << opr.getOperandNumber() - << " of op: " << *nestOp << "\n"; setDistributeLayoutAttr(opr, layout); } for (OpResult result : nestOp->getOpResults()) { auto layout = getDistributeLayoutAttr(result); - llvm::errs() << "Setting layout for result " << result.getResultNumber() - << " of op: " << *nestOp << "\n"; setDistributeLayoutAttr(result, layout); } }); } +/// Attach layout attributes to all vector-type operands of operations within +/// the given operation's region. Reports an error if any vector operand lacks +/// a layout attribute. +bool xegpu::localPropagateLayoutsFromAnchor(Operation *rootOp) { + auto result = rootOp->walk([&](Operation *op) { + for (OpOperand &operand : op->getOpOperands()) { + // Layouts are needed for vector type only. + if (!isa(operand.get().getType())) + continue; + auto layout = xegpu::getDistributeLayoutAttr(operand.get()); + if (!layout) { + op->emitError("Could not find layout attribute for operand ") + << operand.getOperandNumber() << " of operation " << op->getName(); + return WalkResult::interrupt(); + } + xegpu::setDistributeLayoutAttr(operand, layout); + } + return WalkResult::advance(); + }); + return !result.wasInterrupted(); +} + template void xegpu::removeLayoutAttr(const T &operandOrResult) { Operation *owner = operandOrResult.getOwner(); @@ -468,11 +481,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( .getResult(0); }; - llvm::errs() - << "\n=== Stage 1: Converting VectorType to RankedTensorType ===\n"; - llvm::errs() << "Before conversion:\n"; - op->dump(); - { // convert VectorType to RankedTensorType for SCF Structural ops TypeConverter converter; converter.addConversion([](Type type) -> Type { return type; }); @@ -491,12 +499,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( (void)mlir::applyPartialConversion(op, target, std::move(patterns)); } - llvm::errs() << "\nAfter Stage 1:\n"; - op->dump(); - - llvm::errs() << "\n=== Stage 2: Propagating layout attributes to " - "RankedTensorType ===\n"; - { // propagate the layout attribute to RankedTensorType by checking // BuiltInUnrealizedCastOps // for VectorType to RankedTensorType cast. 
@@ -551,12 +553,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( }); } - llvm::errs() << "\nAfter Stage 2:\n"; - op->dump(); - - llvm::errs() - << "\n=== Stage 3: Converting RankedTensorType back to VectorType ===\n"; - { // perform the conversion from RankedTensorType to VectorType based on the // DistributeLayoutAttr @@ -620,10 +616,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( target); (void)mlir::applyPartialConversion(op, target, std::move(patterns)); } - - llvm::errs() << "\nAfter Stage 3 (final):\n"; - op->dump(); - llvm::errs() << "\n=== Transformation complete ===\n\n"; } std::optional xegpu::getChipStr(Operation *op) { diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 9580769d37313..6e9711442b92d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -6,7 +6,7 @@ gpu.module @test_elementwise_ops { gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: math.exp {{.*}} : vector<12x8xf32> @@ -24,7 +24,7 @@ gpu.module @test_elementwise_ops { gpu.func @unary_ops(%a: memref<24x32xf32>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x8xf32> @@ -44,10 +44,10 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} @@ -71,13 +71,13 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1> -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_c = xegpu.load_nd %tdesc_c + %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi1, #xegpu.layout> -> vector<24x32xi1> // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} @@ -99,10 +99,10 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, 
#xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> // CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout} @@ -128,16 +128,16 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> %tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_c = xegpu.load_nd %tdesc_c + %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> - %load_d = xegpu.load_nd %tdesc_d + %load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} @@ -160,10 +160,10 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout} : vector<2x2xf32> From 2c80b0ee5c79c4d19220336bec0fae5344d1e13c Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 13 Dec 2025 04:24:45 +0000 Subject: [PATCH 20/28] remove debug print --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 7 +-- .../Transforms/XeGPUSubgroupDistribute.cpp | 51 +------------------ mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 13 ++--- 3 files changed, 12 insertions(+), 59 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 9d74433e1d673..0c0a922c56305 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -280,11 +280,12 @@ void XeGPUBlockingPass::runOnOperation() { MLIRContext *ctx = &getContext(); Operation *op = getOperation(); - // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. - // This ensures that the LayoutAttr remains accessible even if the defining - // operation is replaced. 
// TODO-LayoutRefactor: unify the local propagation for layout preprocessing // replace the function with localPropagateLayoutsFromAnchor + // if (!xegpu::localPropagateLayoutsFromAnchor(op)) { + // signalPassFailure(); + // return; + // } xegpu::retrieveDistributeLayoutAttrsRecursive(op); auto getTileShapeAndCount = [](llvm::ArrayRef shape, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index bf8764e1f4541..94da382e3b04c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -630,15 +630,12 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred); - if (!operand) { - DBGS() << "No dpas op result found in warp op\n"; + if (!operand) return rewriter.notifyMatchFailure(warpOp, "warp result is not a xegpu::Dpas op"); - } auto dpasOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); - DBGS() << "Found dpas op: " << *dpasOp << "\n"; xegpu::LayoutAttr layoutA = dyn_cast(dpasOp.getLayoutAAttr()); @@ -647,9 +644,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { xegpu::LayoutAttr layoutOut = dyn_cast(dpasOp.getLayoutCdAttr()); - DBGS() << "Layout A: " << layoutA << ", Layout B: " << layoutB - << ", Layout Out: " << layoutOut << "\n"; - if (!layoutA || !layoutB || !layoutOut) return rewriter.notifyMatchFailure( dpasOp, @@ -662,23 +656,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { FailureOr distResultTypeByWarpOpOrFailure = getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType()); - DBGS() << "Original LHS type: " << dpasOp.getLhsType() << ", distributed: " - << (succeeded(distLhsTypeByWarpOpOrFailure) - ? distLhsTypeByWarpOpOrFailure.value() - : Type()) - << "\n"; - DBGS() << "Original RHS type: " << dpasOp.getRhsType() << ", distributed: " - << (succeeded(distRhsTypeByWarpOpOrFailure) - ? distRhsTypeByWarpOpOrFailure.value() - : Type()) - << "\n"; - DBGS() << "Original Result type: " << dpasOp.getResultType() - << ", distributed: " - << (succeeded(distResultTypeByWarpOpOrFailure) - ? distResultTypeByWarpOpOrFailure.value() - : Type()) - << "\n"; - if (failed(distLhsTypeByWarpOpOrFailure) || failed(distRhsTypeByWarpOpOrFailure) || failed(distResultTypeByWarpOpOrFailure)) @@ -693,14 +670,11 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { distRhsTypeByWarpOpOrFailure.value()}; // Dpas acc operand is optional. if (dpasOp.getAcc()) { - DBGS() << "Dpas has accumulator operand\n"; newYieldValues.push_back(dpasOp.getAcc()); newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value()); } // Create a new warp op without the dpas. SmallVector newRetIndices; - DBGS() << "Creating new warp op with " << newYieldValues.size() - << " yielded values\n"; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); @@ -711,22 +685,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { FailureOr expectedDistResultTyOrFailure = xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut); - DBGS() << "Expected dist LHS type: " - << (succeeded(expectedDistLhsTyOrFailure) - ? 
expectedDistLhsTyOrFailure.value() - : Type()) - << "\n"; - DBGS() << "Expected dist RHS type: " - << (succeeded(expectedDistRhsTyOrFailure) - ? expectedDistRhsTyOrFailure.value() - : Type()) - << "\n"; - DBGS() << "Expected dist Result type: " - << (succeeded(expectedDistResultTyOrFailure) - ? expectedDistResultTyOrFailure.value() - : Type()) - << "\n"; - if (failed(expectedDistLhsTyOrFailure) || failed(expectedDistRhsTyOrFailure) || failed(expectedDistResultTyOrFailure)) @@ -746,9 +704,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { newDpasOperandExpectedTypes.push_back(distributedResultTy); for (unsigned i = 0; i < newRetIndices.size(); i++) { - DBGS() << "Resolving operand " << i << " with type " - << newWarpOp.getResult(newRetIndices[i]).getType() << " to " - << newDpasOperandExpectedTypes[i] << "\n"; newDpasOperands.push_back( resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]), newDpasOperandExpectedTypes[i], rewriter)); @@ -756,17 +711,13 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(), distributedResultTy, newDpasOperands, dpasOp->getAttrs()); - DBGS() << "Created new dpas op: " << *newDpasOp << "\n"; xegpu::removeLayoutAttrs(newDpasOp); Value distributedVal = newWarpOp.getResult(operandIdx); // Resolve the output type. - DBGS() << "Resolving output from " << newDpasOp.getResult().getType() - << " to " << distResultTypeByWarpOpOrFailure.value() << "\n"; Value typeResolved = resolveDistributedTy(newDpasOp.getResult(), distResultTypeByWarpOpOrFailure.value(), rewriter); rewriter.replaceAllUsesWith(distributedVal, typeResolved); - DBGS() << "Successfully distributed dpas op\n"; return success(); } }; diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 49d6b45c2791c..6d6ca47dc10d5 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -113,15 +113,12 @@ std::string xegpu::getTempLayoutName(const OpResult result) { } xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { - if (!value) { + if (!value) return nullptr; - } if (auto tdescTy = - dyn_cast_if_present(value.getType())) { - auto layoutAttr = tdescTy.getLayoutAttr(); - return layoutAttr; - } + dyn_cast_if_present(value.getType())) + return tdescTy.getLayoutAttr(); if (auto result = dyn_cast(value)) { Operation *defOp = result.getDefiningOp(); @@ -232,6 +229,8 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, return candidate; } +// TODO-LayoutRefactor: Remove this function after replacing use +// with setTempDistributeLayoutAttr or setAnchorLayout void xegpu::setDistributeLayoutAttr( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout) { @@ -253,6 +252,8 @@ void xegpu::setDistributeLayoutAttr( } } +// TODO-LayoutRefactor: Remove this function after replacing use +// with setTempDistributeLayoutAttr or setAnchorLayout void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); From 2a57b036e710a0f0fc1818a0d874c5fd55c43f85 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 13 Dec 2025 18:08:13 +0000 Subject: [PATCH 21/28] clean up setDistributeLayoutAttr in sg distribution --- .../Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 94da382e3b04c..6df7205438b03 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1224,7 +1224,7 @@ static Value lowerToVectorReductions(TypedValue src, rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. - xegpu::setDistributeLayoutAttr( + xegpu::setTempDistributeLayoutAttr( cast(reductionResult), xegpu::getTempDistributeLayoutAttr(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction @@ -1259,8 +1259,8 @@ static Value lowerToVectorReductions(TypedValue src, auto accLayout = xegpu::getTempDistributeLayoutAttr(dyn_cast(acc)); - xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), srcLayout); - xegpu::setDistributeLayoutAttr(slice->getOpResult(0), accLayout); + xegpu::setTempDistributeLayoutAttr(slice->getOpOperand(0), srcLayout); + xegpu::setTempDistributeLayoutAttr(slice->getOpResult(0), accLayout); // Extract and reduction results in scalars, so no result layout is needed. Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); Value reduction = vector::ReductionOp::create( From 1dbd1e87ed4dc5f2ed01b1cc220ea77484cffd3e Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 13 Dec 2025 19:12:00 +0000 Subject: [PATCH 22/28] clean up setDistributeLayoutAttr in load optimization pass --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index ab41fe4298d99..1b59f4da2ecfb 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -465,7 +465,8 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. target.addDynamicallyLegalOp( [&](vector::ExtractOp extractOp) { - auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); + auto layout = xegpu::getTempDistributeLayoutAttr( + dyn_cast(extractOp.getResult())); if (!layout) return true; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); From 52d560e2a109317199124d260db5744c8f147452 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 13 Dec 2025 20:03:18 +0000 Subject: [PATCH 23/28] clean up setDistributeLayoutAttr in sg distribution pass --- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 12 +++--- .../Transforms/XeGPUWgToSgDistribute.cpp | 41 ++++++++++--------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 1b59f4da2ecfb..bcafd2b53799f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -217,14 +217,14 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr()); // Set the layout for the loadOp. auto layoutAttr = newTensorDesc.getType().getLayoutAttr(); - xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr); + loadOp.setAnchorLayout(layoutAttr); // Insert the loaded block into the right position in data. 
auto insertOp = vector::InsertStridedSliceOp::create( rewriter, loc, loadOp.getResult(), data, ArrayRef{localOffsetDim0, localOffsetDim1}, ArrayRef{1, 1}); // InsertOp must have the same layout as newTensorDesc. - xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + xegpu::setTempDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); data = insertOp.getResult(); } } @@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), bitcastType, slice); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTempDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); arraySlices.push_back(bitCastOp.getResult()); } rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); @@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), loadNdOp.getType(), data); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTempDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); rewriter.replaceOp(loadNdOp, bitCastOp); return success(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index f9f5a1f6b285e..eb2f389fd13b8 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -508,8 +508,8 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setDistributeLayoutAttr(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(newBroadcast->getResult(0), + layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -756,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto eltType = vecType.getElementType(); auto setLayout = [&](Value val) { - xegpu::setDistributeLayoutAttr(llvm::dyn_cast(val), - layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(llvm::dyn_cast(val), + layout.dropSgLayoutAndData()); }; if (vecAttr.isSplat()) { @@ -932,7 +932,7 @@ struct WgToSgLoadGatherOpWithOffset rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), newLayout); - xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), newLayout); + newLoadOp.setAnchorLayout(newLayout); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -987,7 +987,8 @@ struct WgToSgStoreScatterOpWithOffset // Skip for operand one (memref) if (operand.getOperandNumber() == 1) continue; - xegpu::setDistributeLayoutAttr(operand, layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(operand, + layout.dropSgLayoutAndData()); } } rewriter.eraseOp(op); @@ -1080,12 +1081,12 @@ struct WgToSgVectorStepOp : public OpConversionPattern { vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); auto finalSteps = arith::AddIOp::create(rewriter, loc, steps, bcastOffset); - xegpu::setDistributeLayoutAttr(steps->getResult(0), - layout.dropSgLayoutAndData()); - 
xegpu::setDistributeLayoutAttr(bcastOffset->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setDistributeLayoutAttr(finalSteps->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(steps->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(bcastOffset->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(finalSteps->getResult(0), + layout.dropSgLayoutAndData()); newOps.push_back(finalSteps); } @@ -1154,8 +1155,8 @@ struct WgToSgVectorShapeCastOp for (auto src : adaptor.getSource()) { auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), newResultType, src); - xegpu::setDistributeLayoutAttr(newShapeCast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(newShapeCast->getResult(0), + layout.dropSgLayoutAndData()); newShapeCastOps.push_back(newShapeCast.getResult()); } @@ -1216,8 +1217,8 @@ struct WgToSgMultiDimReductionOp auto newOp = vector::MultiDimReductionOp::create( rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], op.getReductionDims()); - xegpu::setDistributeLayoutAttr(newOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(newOp->getResult(0), + layout.dropSgLayoutAndData()); newReductions.push_back(newOp.getResult()); } @@ -1280,8 +1281,8 @@ struct WgToSgVectorTransposeOp for (auto src : adaptor.getVector()) { auto newTranspose = vector::TransposeOp::create( rewriter, op.getLoc(), newResultType, src, permutation); - xegpu::setDistributeLayoutAttr(newTranspose->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); newTransposeOps.push_back(newTranspose.getResult()); } @@ -1350,8 +1351,8 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { auto newCreateMaskOp = vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands); - xegpu::setDistributeLayoutAttr(newCreateMaskOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempDistributeLayoutAttr(newCreateMaskOp->getResult(0), + layout.dropSgLayoutAndData()); newCreateMaskOps.push_back(newCreateMaskOp.getResult()); } From cc60c1b561fc05871549732fe55527d664df5498 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 13 Dec 2025 20:07:39 +0000 Subject: [PATCH 24/28] replace get/setTempDistributeLayoutAttr to get/setLayoutAttr --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 12 +-- .../Transforms/XeGPUSubgroupDistribute.cpp | 42 +++++------ .../Transforms/XeGPUWgToSgDistribute.cpp | 73 +++++++++---------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 19 +++-- 5 files changed, 71 insertions(+), 81 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 7886c0269a15d..0a26ecaca01fd 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -105,13 +105,13 @@ void setDistributeLayoutAttr(const OpOperand &opr, template || std::is_same_v>> -DistributeLayoutAttr getTempDistributeLayoutAttr(const T &operandOrResult); +DistributeLayoutAttr getTempLayoutAttr(const T &operandOrResult); template || std::is_same_v>> -void setTempDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout); +void setTempLayoutAttr(const T &operandOrResult, + const DistributeLayoutAttr layout); /// Set the DistributeLayoutAttr for 
each OpOperand and OpResult of the given /// operation. If the operation contains regions, it is also applied recursively diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index bcafd2b53799f..437a7e336683d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -224,7 +224,7 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, ArrayRef{localOffsetDim0, localOffsetDim1}, ArrayRef{1, 1}); // InsertOp must have the same layout as newTensorDesc. - xegpu::setTempDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + xegpu::setTempLayoutAttr(insertOp->getOpResult(0), layoutAttr); data = insertOp.getResult(); } } @@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), bitcastType, slice); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setTempDistributeLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTempLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); arraySlices.push_back(bitCastOp.getResult()); } rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); @@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), loadNdOp.getType(), data); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setTempDistributeLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTempLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); rewriter.replaceOp(loadNdOp, bitCastOp); return success(); } @@ -465,7 +465,7 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. target.addDynamicallyLegalOp( [&](vector::ExtractOp extractOp) { - auto layout = xegpu::getTempDistributeLayoutAttr( + auto layout = xegpu::getTempLayoutAttr( dyn_cast(extractOp.getResult())); if (!layout) return true; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 6df7205438b03..e010d1b9c5cf7 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1224,9 +1224,8 @@ static Value lowerToVectorReductions(TypedValue src, rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. - xegpu::setTempDistributeLayoutAttr( - cast(reductionResult), - xegpu::getTempDistributeLayoutAttr(dyn_cast(acc))); + xegpu::setTempLayoutAttr(cast(reductionResult), + xegpu::getTempLayoutAttr(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1254,13 +1253,11 @@ static Value lowerToVectorReductions(TypedValue src, // accumulator. Shape cast source has the same layout as the original // reduction source. // TODO: other ops generated here may also need layout attributes. 
- auto srcLayout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(src)); - auto accLayout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(acc)); + auto srcLayout = xegpu::getTempLayoutAttr(dyn_cast(src)); + auto accLayout = xegpu::getTempLayoutAttr(dyn_cast(acc)); - xegpu::setTempDistributeLayoutAttr(slice->getOpOperand(0), srcLayout); - xegpu::setTempDistributeLayoutAttr(slice->getOpResult(0), accLayout); + xegpu::setTempLayoutAttr(slice->getOpOperand(0), srcLayout); + xegpu::setTempLayoutAttr(slice->getOpResult(0), accLayout); // Extract and reduction results in scalars, so no result layout is needed. Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); Value reduction = vector::ReductionOp::create( @@ -1353,7 +1350,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { cast(warpOp.getResult(operandIdx).getType()); VectorType resultType = cast(reductionOp.getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempDistributeLayoutAttr(reductionOp->getOpOperand(0)); + xegpu::getTempLayoutAttr(reductionOp->getOpOperand(0)); FailureOr sourceDistTypeOrFailure = getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); @@ -1515,10 +1512,9 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { dyn_cast(broadcastOp.getResult().getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempDistributeLayoutAttr(broadcastOp->getOpOperand(0)); + xegpu::getTempLayoutAttr(broadcastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempDistributeLayoutAttr( - dyn_cast(broadcastOp.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(broadcastOp.getResult())); FailureOr sourceDistType; Type sourceElemOrDistType; @@ -1607,10 +1603,9 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { auto resultDistTy = cast(warpOp.getResult(operandNumber).getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempDistributeLayoutAttr(shapeCastOp->getOpOperand(0)); + xegpu::getTempLayoutAttr(shapeCastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempDistributeLayoutAttr( - dyn_cast(shapeCastOp.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(shapeCastOp.getResult())); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( warpOp, @@ -1692,8 +1687,7 @@ struct VectorExtractStridedSliceDistribution int64_t distributedDim = distributedDims[0]; int sourceDistrDimSize = extractOp.getSourceVectorType().getShape()[distributedDim]; - auto sourceLayout = - xegpu::getTempDistributeLayoutAttr(extractOp->getOpOperand(0)); + auto sourceLayout = xegpu::getTempLayoutAttr(extractOp->getOpOperand(0)); if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) return rewriter.notifyMatchFailure( warpOp, "the source of extract_strided_slice op lacks distribution " @@ -1803,10 +1797,8 @@ struct VectorInsertStridedSliceDistribution "rank) dims of dest vector"); int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim); // Obtain the source and dest layouts. 
- auto destLayout = - xegpu::getTempDistributeLayoutAttr(insertOp->getOpOperand(1)); - auto sourceLayout = - xegpu::getTempDistributeLayoutAttr(insertOp->getOpOperand(0)); + auto destLayout = xegpu::getTempLayoutAttr(insertOp->getOpOperand(1)); + auto sourceLayout = xegpu::getTempLayoutAttr(insertOp->getOpOperand(0)); if (!destLayout || !sourceLayout || destLayout.getEffectiveLaneLayoutAsInt().empty() || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) @@ -1921,7 +1913,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedSourceType = getDistVecTypeBasedOnLaneLayout( - xegpu::getTempDistributeLayoutAttr(bitcastOp->getOpOperand(0)), + xegpu::getTempLayoutAttr(bitcastOp->getOpOperand(0)), bitcastOp.getSourceVectorType()) .value_or(VectorType()); if (!distributedSourceType) @@ -1964,9 +1956,9 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { auto transposeOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempDistributeLayoutAttr(transposeOp->getOpOperand(0)); + xegpu::getTempLayoutAttr(transposeOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempDistributeLayoutAttr(transposeOp->getOpResult(0)); + xegpu::getTempLayoutAttr(transposeOp->getOpResult(0)); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( transposeOp, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index eb2f389fd13b8..f54be3ce928b7 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -492,8 +492,8 @@ struct WgToSgVectorBroadcastOp VectorType resultType = op.getResult().getType(); ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = xegpu::getTempDistributeLayoutAttr( - llvm::cast(op.getResult())); + xegpu::DistributeLayoutAttr layout = + xegpu::getTempLayoutAttr(llvm::cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -508,8 +508,8 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setTempDistributeLayoutAttr(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(newBroadcast->getResult(0), + layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -535,8 +535,8 @@ struct WgToSgElementwiseOp : public ConversionPattern { ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = xegpu::getTempDistributeLayoutAttr( - llvm::cast(op->getResult(0))); + xegpu::DistributeLayoutAttr layout = + xegpu::getTempLayoutAttr(llvm::cast(op->getResult(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -742,7 +742,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { return failure(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -756,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto eltType = vecType.getElementType(); auto setLayout = [&](Value val) { - 
xegpu::setTempDistributeLayoutAttr(llvm::dyn_cast(val), - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(llvm::dyn_cast(val), + layout.dropSgLayoutAndData()); }; if (vecAttr.isSplat()) { @@ -987,8 +987,7 @@ struct WgToSgStoreScatterOpWithOffset // Skip for operand one (memref) if (operand.getOperandNumber() == 1) continue; - xegpu::setTempDistributeLayoutAttr(operand, - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(operand, layout.dropSgLayoutAndData()); } } rewriter.eraseOp(op); @@ -1053,7 +1052,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1081,12 +1080,12 @@ struct WgToSgVectorStepOp : public OpConversionPattern { vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); auto finalSteps = arith::AddIOp::create(rewriter, loc, steps, bcastOffset); - xegpu::setTempDistributeLayoutAttr(steps->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setTempDistributeLayoutAttr(bcastOffset->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setTempDistributeLayoutAttr(finalSteps->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(steps->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(bcastOffset->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(finalSteps->getResult(0), + layout.dropSgLayoutAndData()); newOps.push_back(finalSteps); } @@ -1110,7 +1109,7 @@ struct WgToSgVectorShapeCastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1143,7 +1142,7 @@ struct WgToSgVectorShapeCastOp // must be a slice of higher rank layout. 
int64_t sourceRank = srcType.getRank(); int64_t resultRank = sgShape.size(); - // TODO-LayoutRefactor: handle the case using getTempDistributeLayoutAttr + // TODO-LayoutRefactor: handle the case using getTempLayoutAttr xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getSource()); if (sourceRank < resultRank && !sourceLayout.isSliceOf(layout)) @@ -1155,8 +1154,8 @@ struct WgToSgVectorShapeCastOp for (auto src : adaptor.getSource()) { auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), newResultType, src); - xegpu::setTempDistributeLayoutAttr(newShapeCast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(newShapeCast->getResult(0), + layout.dropSgLayoutAndData()); newShapeCastOps.push_back(newShapeCast.getResult()); } @@ -1184,7 +1183,7 @@ struct WgToSgMultiDimReductionOp auto srcShape = srcType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1217,8 +1216,8 @@ struct WgToSgMultiDimReductionOp auto newOp = vector::MultiDimReductionOp::create( rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], op.getReductionDims()); - xegpu::setTempDistributeLayoutAttr(newOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(newOp->getResult(0), + layout.dropSgLayoutAndData()); newReductions.push_back(newOp.getResult()); } @@ -1239,10 +1238,10 @@ struct WgToSgVectorTransposeOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); - // TODO-LayoutRefactor: handle the case using getTempDistributeLayoutAttr + // TODO-LayoutRefactor: handle the case using getTempLayoutAttr xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getVector()); if (!sourceLayout || !sourceLayout.isForWorkgroup()) @@ -1281,8 +1280,8 @@ struct WgToSgVectorTransposeOp for (auto src : adaptor.getVector()) { auto newTranspose = vector::TransposeOp::create( rewriter, op.getLoc(), newResultType, src, permutation); - xegpu::setTempDistributeLayoutAttr(newTranspose->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); newTransposeOps.push_back(newTranspose.getResult()); } @@ -1301,7 +1300,7 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { typename OpConversionPattern::OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getTempDistributeLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1351,8 +1350,8 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { auto newCreateMaskOp = vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands); - xegpu::setTempDistributeLayoutAttr(newCreateMaskOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayoutAttr(newCreateMaskOp->getResult(0), + layout.dropSgLayoutAndData()); newCreateMaskOps.push_back(newCreateMaskOp.getResult()); } @@ -1510,8 +1509,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() { if (!vecType) return true; - auto layout = xegpu::getTempDistributeLayoutAttr( - 
dyn_cast(op.getResult())); + auto layout = + xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); return isLegal(layout); }); @@ -1521,8 +1520,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() { vector::ConstantMaskOp, vector::CreateMaskOp>( [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. - auto layout = xegpu::getTempDistributeLayoutAttr( - dyn_cast(op->getResult(0))); + auto layout = + xegpu::getTempLayoutAttr(dyn_cast(op->getResult(0))); return isLegal(layout); }); @@ -1564,7 +1563,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { } xegpu::DistributeLayoutAttr layout = - xegpu::getTempDistributeLayoutAttr(op->getResult(0)); + xegpu::getTempLayoutAttr(op->getResult(0)); return isLegal(layout); }); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6d6ca47dc10d5..238f6dd6511e4 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -230,7 +230,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, } // TODO-LayoutRefactor: Remove this function after replacing use -// with setTempDistributeLayoutAttr or setAnchorLayout +// with setTempLayoutAttr or setAnchorLayout void xegpu::setDistributeLayoutAttr( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout) { @@ -253,7 +253,7 @@ void xegpu::setDistributeLayoutAttr( } // TODO-LayoutRefactor: Remove this function after replacing use -// with setTempDistributeLayoutAttr or setAnchorLayout +// with setTempLayoutAttr or setAnchorLayout void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); @@ -301,8 +301,7 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, } template -xegpu::DistributeLayoutAttr -xegpu::getTempDistributeLayoutAttr(const T &operandOrResult) { +xegpu::DistributeLayoutAttr xegpu::getTempLayoutAttr(const T &operandOrResult) { Operation *op = operandOrResult.getOwner(); std::string layoutName = xegpu::getTempLayoutName(operandOrResult); @@ -315,13 +314,13 @@ xegpu::getTempDistributeLayoutAttr(const T &operandOrResult) { } template xegpu::DistributeLayoutAttr -xegpu::getTempDistributeLayoutAttr(const OpResult &result); +xegpu::getTempLayoutAttr(const OpResult &result); template xegpu::DistributeLayoutAttr -xegpu::getTempDistributeLayoutAttr(const OpOperand &operand); +xegpu::getTempLayoutAttr(const OpOperand &operand); template -void xegpu::setTempDistributeLayoutAttr( - const T &operandOrResult, const xegpu::DistributeLayoutAttr layout) { +void xegpu::setTempLayoutAttr(const T &operandOrResult, + const xegpu::DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getTempLayoutName(operandOrResult); if (owner->hasAttrOfType(name)) { @@ -332,11 +331,11 @@ void xegpu::setTempDistributeLayoutAttr( } } -template void xegpu::setTempDistributeLayoutAttr( +template void xegpu::setTempLayoutAttr( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout); -template void xegpu::setTempDistributeLayoutAttr( +template void xegpu::setTempLayoutAttr( const mlir::OpOperand &operand, const mlir::xegpu::DistributeLayoutAttr layout); From 9436b7b5097a693d38248949df72f132dd60b79e Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Sat, 13 Dec 2025 20:15:57 +0000 Subject: [PATCH 25/28] shorten get/setTempLayoutAttr to get/setTempLayout --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 5 +- 
.../Transforms/XeGPUOptimizeBlockLoads.cpp | 14 ++--- .../Transforms/XeGPUSubgroupDistribute.cpp | 34 +++++----- .../Transforms/XeGPUWgToSgDistribute.cpp | 63 +++++++++---------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 18 +++--- 5 files changed, 65 insertions(+), 69 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 0a26ecaca01fd..5e0a5f7872ed5 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -105,13 +105,12 @@ void setDistributeLayoutAttr(const OpOperand &opr, template || std::is_same_v>> -DistributeLayoutAttr getTempLayoutAttr(const T &operandOrResult); +DistributeLayoutAttr getTempLayout(const T &operandOrResult); template || std::is_same_v>> -void setTempLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout); +void setTempLayout(const T &operandOrResult, const DistributeLayoutAttr layout); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. If the operation contains regions, it is also applied recursively diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 437a7e336683d..27eeddb388194 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -224,7 +224,7 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, ArrayRef{localOffsetDim0, localOffsetDim1}, ArrayRef{1, 1}); // InsertOp must have the same layout as newTensorDesc. - xegpu::setTempLayoutAttr(insertOp->getOpResult(0), layoutAttr); + xegpu::setTempLayout(insertOp->getOpResult(0), layoutAttr); data = insertOp.getResult(); } } @@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), bitcastType, slice); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setTempLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTempLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); arraySlices.push_back(bitCastOp.getResult()); } rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); @@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), loadNdOp.getType(), data); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setTempLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTempLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); rewriter.replaceOp(loadNdOp, bitCastOp); return success(); } @@ -465,8 +465,8 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. 
target.addDynamicallyLegalOp( [&](vector::ExtractOp extractOp) { - auto layout = xegpu::getTempLayoutAttr( - dyn_cast(extractOp.getResult())); + auto layout = + xegpu::getTempLayout(dyn_cast(extractOp.getResult())); if (!layout) return true; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index e010d1b9c5cf7..bf914acf22707 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1224,8 +1224,8 @@ static Value lowerToVectorReductions(TypedValue src, rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. - xegpu::setTempLayoutAttr(cast(reductionResult), - xegpu::getTempLayoutAttr(dyn_cast(acc))); + xegpu::setTempLayout(cast(reductionResult), + xegpu::getTempLayout(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1253,11 +1253,11 @@ static Value lowerToVectorReductions(TypedValue src, // accumulator. Shape cast source has the same layout as the original // reduction source. // TODO: other ops generated here may also need layout attributes. - auto srcLayout = xegpu::getTempLayoutAttr(dyn_cast(src)); - auto accLayout = xegpu::getTempLayoutAttr(dyn_cast(acc)); + auto srcLayout = xegpu::getTempLayout(dyn_cast(src)); + auto accLayout = xegpu::getTempLayout(dyn_cast(acc)); - xegpu::setTempLayoutAttr(slice->getOpOperand(0), srcLayout); - xegpu::setTempLayoutAttr(slice->getOpResult(0), accLayout); + xegpu::setTempLayout(slice->getOpOperand(0), srcLayout); + xegpu::setTempLayout(slice->getOpResult(0), accLayout); // Extract and reduction results in scalars, so no result layout is needed. 
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); Value reduction = vector::ReductionOp::create( @@ -1350,7 +1350,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { cast(warpOp.getResult(operandIdx).getType()); VectorType resultType = cast(reductionOp.getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayoutAttr(reductionOp->getOpOperand(0)); + xegpu::getTempLayout(reductionOp->getOpOperand(0)); FailureOr sourceDistTypeOrFailure = getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); @@ -1512,9 +1512,9 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { dyn_cast(broadcastOp.getResult().getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayoutAttr(broadcastOp->getOpOperand(0)); + xegpu::getTempLayout(broadcastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempLayoutAttr(dyn_cast(broadcastOp.getResult())); + xegpu::getTempLayout(dyn_cast(broadcastOp.getResult())); FailureOr sourceDistType; Type sourceElemOrDistType; @@ -1603,9 +1603,9 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { auto resultDistTy = cast(warpOp.getResult(operandNumber).getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayoutAttr(shapeCastOp->getOpOperand(0)); + xegpu::getTempLayout(shapeCastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempLayoutAttr(dyn_cast(shapeCastOp.getResult())); + xegpu::getTempLayout(dyn_cast(shapeCastOp.getResult())); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( warpOp, @@ -1687,7 +1687,7 @@ struct VectorExtractStridedSliceDistribution int64_t distributedDim = distributedDims[0]; int sourceDistrDimSize = extractOp.getSourceVectorType().getShape()[distributedDim]; - auto sourceLayout = xegpu::getTempLayoutAttr(extractOp->getOpOperand(0)); + auto sourceLayout = xegpu::getTempLayout(extractOp->getOpOperand(0)); if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) return rewriter.notifyMatchFailure( warpOp, "the source of extract_strided_slice op lacks distribution " @@ -1797,8 +1797,8 @@ struct VectorInsertStridedSliceDistribution "rank) dims of dest vector"); int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim); // Obtain the source and dest layouts. 
- auto destLayout = xegpu::getTempLayoutAttr(insertOp->getOpOperand(1)); - auto sourceLayout = xegpu::getTempLayoutAttr(insertOp->getOpOperand(0)); + auto destLayout = xegpu::getTempLayout(insertOp->getOpOperand(1)); + auto sourceLayout = xegpu::getTempLayout(insertOp->getOpOperand(0)); if (!destLayout || !sourceLayout || destLayout.getEffectiveLaneLayoutAsInt().empty() || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) @@ -1913,7 +1913,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedSourceType = getDistVecTypeBasedOnLaneLayout( - xegpu::getTempLayoutAttr(bitcastOp->getOpOperand(0)), + xegpu::getTempLayout(bitcastOp->getOpOperand(0)), bitcastOp.getSourceVectorType()) .value_or(VectorType()); if (!distributedSourceType) @@ -1956,9 +1956,9 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { auto transposeOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayoutAttr(transposeOp->getOpOperand(0)); + xegpu::getTempLayout(transposeOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempLayoutAttr(transposeOp->getOpResult(0)); + xegpu::getTempLayout(transposeOp->getOpResult(0)); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( transposeOp, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index f54be3ce928b7..ea2285a97789a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -493,7 +493,7 @@ struct WgToSgVectorBroadcastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(llvm::cast(op.getResult())); + xegpu::getTempLayout(llvm::cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -508,8 +508,8 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setTempLayoutAttr(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayout(newBroadcast->getResult(0), + layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -536,7 +536,7 @@ struct WgToSgElementwiseOp : public ConversionPattern { ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(llvm::cast(op->getResult(0))); + xegpu::getTempLayout(llvm::cast(op->getResult(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -742,7 +742,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { return failure(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -756,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto eltType = vecType.getElementType(); auto setLayout = [&](Value val) { - xegpu::setTempLayoutAttr(llvm::dyn_cast(val), - layout.dropSgLayoutAndData()); + xegpu::setTempLayout(llvm::dyn_cast(val), + layout.dropSgLayoutAndData()); }; if (vecAttr.isSplat()) { @@ -987,7 +987,7 @@ struct WgToSgStoreScatterOpWithOffset // Skip for operand one (memref) if 
(operand.getOperandNumber() == 1) continue; - xegpu::setTempLayoutAttr(operand, layout.dropSgLayoutAndData()); + xegpu::setTempLayout(operand, layout.dropSgLayoutAndData()); } } rewriter.eraseOp(op); @@ -1052,7 +1052,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1080,12 +1080,11 @@ struct WgToSgVectorStepOp : public OpConversionPattern { vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); auto finalSteps = arith::AddIOp::create(rewriter, loc, steps, bcastOffset); - xegpu::setTempLayoutAttr(steps->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setTempLayoutAttr(bcastOffset->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setTempLayoutAttr(finalSteps->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayout(steps->getResult(0), layout.dropSgLayoutAndData()); + xegpu::setTempLayout(bcastOffset->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTempLayout(finalSteps->getResult(0), + layout.dropSgLayoutAndData()); newOps.push_back(finalSteps); } @@ -1109,7 +1108,7 @@ struct WgToSgVectorShapeCastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1142,7 +1141,7 @@ struct WgToSgVectorShapeCastOp // must be a slice of higher rank layout. int64_t sourceRank = srcType.getRank(); int64_t resultRank = sgShape.size(); - // TODO-LayoutRefactor: handle the case using getTempLayoutAttr + // TODO-LayoutRefactor: handle the case using getTempLayout xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getSource()); if (sourceRank < resultRank && !sourceLayout.isSliceOf(layout)) @@ -1154,8 +1153,8 @@ struct WgToSgVectorShapeCastOp for (auto src : adaptor.getSource()) { auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), newResultType, src); - xegpu::setTempLayoutAttr(newShapeCast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayout(newShapeCast->getResult(0), + layout.dropSgLayoutAndData()); newShapeCastOps.push_back(newShapeCast.getResult()); } @@ -1183,7 +1182,7 @@ struct WgToSgMultiDimReductionOp auto srcShape = srcType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1216,8 +1215,7 @@ struct WgToSgMultiDimReductionOp auto newOp = vector::MultiDimReductionOp::create( rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], op.getReductionDims()); - xegpu::setTempLayoutAttr(newOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayout(newOp->getResult(0), layout.dropSgLayoutAndData()); newReductions.push_back(newOp.getResult()); } @@ -1238,10 +1236,10 @@ struct WgToSgVectorTransposeOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); - // TODO-LayoutRefactor: handle the case using 
getTempLayoutAttr + // TODO-LayoutRefactor: handle the case using getTempLayout xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getVector()); if (!sourceLayout || !sourceLayout.isForWorkgroup()) @@ -1280,8 +1278,8 @@ struct WgToSgVectorTransposeOp for (auto src : adaptor.getVector()) { auto newTranspose = vector::TransposeOp::create( rewriter, op.getLoc(), newResultType, src, permutation); - xegpu::setTempLayoutAttr(newTranspose->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayout(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); newTransposeOps.push_back(newTranspose.getResult()); } @@ -1300,7 +1298,7 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { typename OpConversionPattern::OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); + xegpu::getTempLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1350,8 +1348,8 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { auto newCreateMaskOp = vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands); - xegpu::setTempLayoutAttr(newCreateMaskOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTempLayout(newCreateMaskOp->getResult(0), + layout.dropSgLayoutAndData()); newCreateMaskOps.push_back(newCreateMaskOp.getResult()); } @@ -1509,8 +1507,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { if (!vecType) return true; - auto layout = - xegpu::getTempLayoutAttr(dyn_cast(op.getResult())); + auto layout = xegpu::getTempLayout(dyn_cast(op.getResult())); return isLegal(layout); }); @@ -1521,7 +1518,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. 
auto layout = - xegpu::getTempLayoutAttr(dyn_cast(op->getResult(0))); + xegpu::getTempLayout(dyn_cast(op->getResult(0))); return isLegal(layout); }); @@ -1563,7 +1560,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { } xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayoutAttr(op->getResult(0)); + xegpu::getTempLayout(op->getResult(0)); return isLegal(layout); }); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 238f6dd6511e4..24f732f492ec9 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -230,7 +230,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, } // TODO-LayoutRefactor: Remove this function after replacing use -// with setTempLayoutAttr or setAnchorLayout +// with setTempLayout or setAnchorLayout void xegpu::setDistributeLayoutAttr( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout) { @@ -253,7 +253,7 @@ void xegpu::setDistributeLayoutAttr( } // TODO-LayoutRefactor: Remove this function after replacing use -// with setTempLayoutAttr or setAnchorLayout +// with setTempLayout or setAnchorLayout void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); @@ -301,7 +301,7 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, } template -xegpu::DistributeLayoutAttr xegpu::getTempLayoutAttr(const T &operandOrResult) { +xegpu::DistributeLayoutAttr xegpu::getTempLayout(const T &operandOrResult) { Operation *op = operandOrResult.getOwner(); std::string layoutName = xegpu::getTempLayoutName(operandOrResult); @@ -314,13 +314,13 @@ xegpu::DistributeLayoutAttr xegpu::getTempLayoutAttr(const T &operandOrResult) { } template xegpu::DistributeLayoutAttr -xegpu::getTempLayoutAttr(const OpResult &result); +xegpu::getTempLayout(const OpResult &result); template xegpu::DistributeLayoutAttr -xegpu::getTempLayoutAttr(const OpOperand &operand); +xegpu::getTempLayout(const OpOperand &operand); template -void xegpu::setTempLayoutAttr(const T &operandOrResult, - const xegpu::DistributeLayoutAttr layout) { +void xegpu::setTempLayout(const T &operandOrResult, + const xegpu::DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getTempLayoutName(operandOrResult); if (owner->hasAttrOfType(name)) { @@ -331,11 +331,11 @@ void xegpu::setTempLayoutAttr(const T &operandOrResult, } } -template void xegpu::setTempLayoutAttr( +template void xegpu::setTempLayout( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout); -template void xegpu::setTempLayoutAttr( +template void xegpu::setTempLayout( const mlir::OpOperand &operand, const mlir::xegpu::DistributeLayoutAttr layout); From 5beaa244e81a067f044e67ba001562a573d0f4f4 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Tue, 16 Dec 2025 19:53:12 +0000 Subject: [PATCH 26/28] address feedbacks --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 +- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 122 +++++++++--------- .../XeGPU/Transforms/XeGPUBlocking.cpp | 10 +- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 10 +- .../Transforms/XeGPUSubgroupDistribute.cpp | 20 +-- .../Transforms/XeGPUWgToSgDistribute.cpp | 38 +++--- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 56 ++++---- .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 4 +- 8 files changed, 131 insertions(+), 131 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td 
b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index a5064d6145341..446f64fffa468 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -776,7 +776,7 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> { let description = [{ An attribute interface for accessing anchor layout information. This interface provides a method to set and retrieve the anchor layout - from attributes that implement it. + from operations that implement it. }]; let methods = [ diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 5e0a5f7872ed5..dbaec73ccfdec 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -63,65 +63,6 @@ FailureOr getDistributedVectorType(xegpu::TensorDescType tdescTy); FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); -/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr -std::string getTempLayoutName(const OpOperand &operand); - -/// Return the attribute name for the OpResult to attach DistributeLayoutAttr -std::string getTempLayoutName(const OpResult result); - -/// Retrieves the DistributeLayoutAttr associated with a given Value. For -/// TensorDescType values, the DistributeLayoutAttr is extracted from the -/// TensorDescType itself. For other values, it is obtained from the attributes -/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is -/// found. -DistributeLayoutAttr getDistributeLayoutAttr(const Value value); - -/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It -/// will first check the operand_layout_{id} of the owner operation. If not -/// found, it will check the operand itself and its defining op. -DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); - -/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. -template || - std::is_same_v>> -void removeLayoutAttr(const T &operandOrResult); - -/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the -/// given operation if they exist. If the operation contains regions, it is also -/// applied recursively to the contained operations -void removeLayoutAttrs(Operation *op); - -/// Sets the DistributeLayoutAttr for a given OpResult -void setDistributeLayoutAttr(const OpResult &Result, - const DistributeLayoutAttr layout); - -/// Sets the DistributeLayoutAttr for a given OpOperand -void setDistributeLayoutAttr(const OpOperand &opr, - const DistributeLayoutAttr layout); - -/// get and set distribute layout attribute for non-anchor operations -/// (and offsets/masks of load/store ops before we get rid of their temp attrs) -template || - std::is_same_v>> -DistributeLayoutAttr getTempLayout(const T &operandOrResult); - -template || - std::is_same_v>> -void setTempLayout(const T &operandOrResult, const DistributeLayoutAttr layout); - -/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given -/// operation. If the operation contains regions, it is also applied recursively -/// to the contained operations -void retrieveDistributeLayoutAttrsRecursive(Operation *op); - -/// Attach layout attributes to all vector-type operands of operations within -/// the given operation's region. Reports an error if any vector operand lacks -/// a layout attribute. 
-bool localPropagateLayoutsFromAnchor(Operation *rootOp); - /// Extract a set of small vectors from a value with a given shape using /// vector.extract_stride_slice SmallVector extractVectorsWithShapeFromValue(OpBuilder &builder, @@ -177,6 +118,69 @@ template int getLargestDivisor(T dim, ArrayRef candidates, ArrayRef candidateMultiples = {}); +/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr +std::string getLocalLayout(const OpOperand &operand); + +/// Return the attribute name for the OpResult to attach DistributeLayoutAttr +std::string getLocalLayout(const OpResult result); + +/// Retrieves the DistributeLayoutAttr associated with a given Value. For +/// TensorDescType values, the DistributeLayoutAttr is extracted from the +/// TensorDescType itself. For other values, it is obtained from the attributes +/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is +/// found. +DistributeLayoutAttr getDistributeLayoutAttr(const Value value); + +/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It +/// will first check the operand_layout_{id} of the owner operation. If not +/// found, it will check the operand itself and its defining op. +DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); + +/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. +template || + std::is_same_v>> +void removeLayoutAttr(const T &operandOrResult); + +/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the +/// given operation if they exist. If the operation contains regions, it is also +/// applied recursively to the contained operations +void removeLayoutAttrs(Operation *op); + +/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult +/// user should use setAnchorLayout instead +void setDistributeLayoutAttr(const OpResult &Result, + const DistributeLayoutAttr layout); + +/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpOperand +/// user should use setAnchorLayout instead +void setDistributeLayoutAttr(const OpOperand &opr, + const DistributeLayoutAttr layout); + +/// get and set distribute layout attribute for non-anchor operations +/// (and offsets/masks of load/store ops before we get rid of their temp attrs) +template || + std::is_same_v>> +DistributeLayoutAttr getTempLayout(const T &operandOrResult); + +template || + std::is_same_v>> +void setLocalLayout(const T &operandOrResult, + const DistributeLayoutAttr layout); + +/// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and +/// OpResult of of the given operation. If the operation contains regions, it is +/// also applied recursively to the contained operations operation. +/// TODO: To be replaced by recoverLocalLayouts() +void recoverLocalLayoutsDeprecated(Operation *op); + +/// Attach layout attributes to all vector-type operands of operations within +/// the given operation's region. Reports an error if any vector operand lacks +/// a layout attribute. 
+bool recoverLocalLayouts(Operation *rootOp); + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 0c0a922c56305..d96c608133d1a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -281,12 +281,12 @@ void XeGPUBlockingPass::runOnOperation() { Operation *op = getOperation(); // TODO-LayoutRefactor: unify the local propagation for layout preprocessing - // replace the function with localPropagateLayoutsFromAnchor - // if (!xegpu::localPropagateLayoutsFromAnchor(op)) { + // replace the function with recoverLocalLayouts + // if (!xegpu::recoverLocalLayouts(op)) { // signalPassFailure(); // return; // } - xegpu::retrieveDistributeLayoutAttrsRecursive(op); + xegpu::recoverLocalLayoutsDeprecated(op); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { @@ -414,14 +414,14 @@ void XeGPUBlockingPass::runOnOperation() { op->walk([](Operation *op) { // Remove the layout attributes cached per operands. for (OpOperand &opr : op->getOpOperands()) { - std::string name = xegpu::getTempLayoutName(opr); + std::string name = xegpu::getLocalLayout(opr); if (op->hasAttrOfType(name)) op->removeAttr(name); } // Update the layout attributes per result. for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getTempLayoutName(result); + std::string name = xegpu::getLocalLayout(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 27eeddb388194..32cc38415c003 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -224,7 +224,7 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, ArrayRef{localOffsetDim0, localOffsetDim1}, ArrayRef{1, 1}); // InsertOp must have the same layout as newTensorDesc. - xegpu::setTempLayout(insertOp->getOpResult(0), layoutAttr); + xegpu::setLocalLayout(insertOp->getOpResult(0), layoutAttr); data = insertOp.getResult(); } } @@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), bitcastType, slice); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setTempLayout(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setLocalLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); arraySlices.push_back(bitCastOp.getResult()); } rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); @@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), loadNdOp.getType(), data); // BitCastOp must have the same layout as the original loadNdOp. 
- xegpu::setTempLayout(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setLocalLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); rewriter.replaceOp(loadNdOp, bitCastOp); return success(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index bf914acf22707..b5566c081e2fe 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -869,11 +869,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { storeScatterOp, "Expected at most 2D result at SG level"); std::string layoutPayloadName = - xegpu::getTempLayoutName(storeScatterOp->getOpOperand(0)); + xegpu::getLocalLayout(storeScatterOp->getOpOperand(0)); std::string layoutOffsetsName = - xegpu::getTempLayoutName(storeScatterOp->getOpOperand(2)); + xegpu::getLocalLayout(storeScatterOp->getOpOperand(2)); std::string layoutMaskName = - xegpu::getTempLayoutName(storeScatterOp->getOpOperand(3)); + xegpu::getLocalLayout(storeScatterOp->getOpOperand(3)); xegpu::LayoutAttr layoutPayload = storeScatterOp->getAttrOfType(layoutPayloadName); @@ -1152,9 +1152,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { "Expected 1D offsets and mask vector"); // Assume offset and mask producers will be distributed as well. std::string layoutOffsetsName = - xegpu::getTempLayoutName(loadGatherOp->getOpOperand(1)); + xegpu::getLocalLayout(loadGatherOp->getOpOperand(1)); std::string layoutMaskName = - xegpu::getTempLayoutName(loadGatherOp->getOpOperand(2)); + xegpu::getLocalLayout(loadGatherOp->getOpOperand(2)); xegpu::LayoutAttr layoutOffsets = loadGatherOp->getAttrOfType(layoutOffsetsName); @@ -1224,8 +1224,8 @@ static Value lowerToVectorReductions(TypedValue src, rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. - xegpu::setTempLayout(cast(reductionResult), - xegpu::getTempLayout(dyn_cast(acc))); + xegpu::setLocalLayout(cast(reductionResult), + xegpu::getTempLayout(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1256,8 +1256,8 @@ static Value lowerToVectorReductions(TypedValue src, auto srcLayout = xegpu::getTempLayout(dyn_cast(src)); auto accLayout = xegpu::getTempLayout(dyn_cast(acc)); - xegpu::setTempLayout(slice->getOpOperand(0), srcLayout); - xegpu::setTempLayout(slice->getOpResult(0), accLayout); + xegpu::setLocalLayout(slice->getOpOperand(0), srcLayout); + xegpu::setLocalLayout(slice->getOpResult(0), accLayout); // Extract and reduction results in scalars, so no result layout is needed. Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); Value reduction = vector::ReductionOp::create( @@ -2041,7 +2041,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // 1) It is assumed that there are no layout conflicts. // 2) Any existing layout attributes attached to the operands are ignored. 
Operation *op = getOperation(); - if (!xegpu::localPropagateLayoutsFromAnchor(op)) { + if (!xegpu::recoverLocalLayouts(op)) { signalPassFailure(); return; } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index ea2285a97789a..860f0b2c6198d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -508,8 +508,8 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setTempLayout(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(newBroadcast->getResult(0), + layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -756,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto eltType = vecType.getElementType(); auto setLayout = [&](Value val) { - xegpu::setTempLayout(llvm::dyn_cast(val), - layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(llvm::dyn_cast(val), + layout.dropSgLayoutAndData()); }; if (vecAttr.isSplat()) { @@ -987,7 +987,7 @@ struct WgToSgStoreScatterOpWithOffset // Skip for operand one (memref) if (operand.getOperandNumber() == 1) continue; - xegpu::setTempLayout(operand, layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(operand, layout.dropSgLayoutAndData()); } } rewriter.eraseOp(op); @@ -1080,11 +1080,11 @@ struct WgToSgVectorStepOp : public OpConversionPattern { vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); auto finalSteps = arith::AddIOp::create(rewriter, loc, steps, bcastOffset); - xegpu::setTempLayout(steps->getResult(0), layout.dropSgLayoutAndData()); - xegpu::setTempLayout(bcastOffset->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setTempLayout(finalSteps->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(steps->getResult(0), layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(bcastOffset->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(finalSteps->getResult(0), + layout.dropSgLayoutAndData()); newOps.push_back(finalSteps); } @@ -1153,8 +1153,8 @@ struct WgToSgVectorShapeCastOp for (auto src : adaptor.getSource()) { auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), newResultType, src); - xegpu::setTempLayout(newShapeCast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(newShapeCast->getResult(0), + layout.dropSgLayoutAndData()); newShapeCastOps.push_back(newShapeCast.getResult()); } @@ -1215,7 +1215,7 @@ struct WgToSgMultiDimReductionOp auto newOp = vector::MultiDimReductionOp::create( rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], op.getReductionDims()); - xegpu::setTempLayout(newOp->getResult(0), layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(newOp->getResult(0), layout.dropSgLayoutAndData()); newReductions.push_back(newOp.getResult()); } @@ -1278,8 +1278,8 @@ struct WgToSgVectorTransposeOp for (auto src : adaptor.getVector()) { auto newTranspose = vector::TransposeOp::create( rewriter, op.getLoc(), newResultType, src, permutation); - xegpu::setTempLayout(newTranspose->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); newTransposeOps.push_back(newTranspose.getResult()); } @@ -1348,8 +1348,8 @@ struct WgToSgVectorMaskOp : public 
OpConversionPattern { auto newCreateMaskOp = vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands); - xegpu::setTempLayout(newCreateMaskOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setLocalLayout(newCreateMaskOp->getResult(0), + layout.dropSgLayoutAndData()); newCreateMaskOps.push_back(newCreateMaskOp.getResult()); } @@ -1392,7 +1392,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { // TODO-LayoutRefactor: unify the local propagation for layout preprocessing // Operation *op = getOperation(); - // if (!xegpu::localPropagateLayoutsFromAnchor(op)) { + // if (!xegpu::recoverLocalLayouts(op)) { // signalPassFailure(); // return; // } @@ -1585,7 +1585,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { // Layout propagation pass will activated. getOperation()->walk([](Operation *op) { for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getTempLayoutName(result); + std::string name = xegpu::getLocalLayout(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 24f732f492ec9..6388333bd7450 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -101,13 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } -std::string xegpu::getTempLayoutName(const OpOperand &operand) { +std::string xegpu::getLocalLayout(const OpOperand &operand) { const StringRef prefix("layout_operand_"); unsigned idx = const_cast(operand).getOperandNumber(); return llvm::formatv("{0}{1}", prefix, idx).str(); } -std::string xegpu::getTempLayoutName(const OpResult result) { +std::string xegpu::getLocalLayout(const OpResult result) { const StringRef prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str(); } @@ -129,7 +129,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return layout; } - std::string layoutName = getTempLayoutName(result); + std::string layoutName = getLocalLayout(result); if (defOp->hasAttr(layoutName)) { auto layout = defOp->getAttrOfType(layoutName); @@ -141,10 +141,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { auto *parentOp = arg.getOwner()->getParentOp(); if (auto loop = dyn_cast(parentOp)) { OpOperand *tiedInit = loop.getTiedLoopInit(arg); - if (tiedInit) { - auto layout = getDistributeLayoutAttr(tiedInit->get()); - return layout; - } + if (tiedInit) + return getDistributeLayoutAttr(tiedInit->get()); } } @@ -169,22 +167,20 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { return convertOp.getInputLayoutAttr(); } auto layout = anchorOp.getAnchorLayout(); + + if (idx == 0) + return layout; + // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp), // the layout is valid for the first two operands: value and memref/tdesc. // For other operations, the layout applies to the first operand only. 
if (isa( - op)) { - if (idx < 2) { - return layout; - } - } else { - if (idx == 0) { - return layout; - } - } + op) && + (idx < 2)) + return layout; } - std::string layoutName = xegpu::getTempLayoutName(opr); + std::string layoutName = xegpu::getLocalLayout(opr); if (op->hasAttr(layoutName)) { auto layout = op->getAttrOfType(layoutName); return layout; @@ -230,7 +226,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, } // TODO-LayoutRefactor: Remove this function after replacing use -// with setTempLayout or setAnchorLayout +// with setLocalLayout or setAnchorLayout void xegpu::setDistributeLayoutAttr( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout) { @@ -243,7 +239,7 @@ void xegpu::setDistributeLayoutAttr( return; } - std::string name = xegpu::getTempLayoutName(result); + std::string name = xegpu::getLocalLayout(result); if (owner->hasAttrOfType(name)) { return; } @@ -253,7 +249,7 @@ void xegpu::setDistributeLayoutAttr( } // TODO-LayoutRefactor: Remove this function after replacing use -// with setTempLayout or setAnchorLayout +// with setLocalLayout or setAnchorLayout void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); @@ -291,7 +287,7 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, } } - std::string name = xegpu::getTempLayoutName(operand); + std::string name = xegpu::getLocalLayout(operand); if (owner->hasAttrOfType(name)) { return; } @@ -304,7 +300,7 @@ template xegpu::DistributeLayoutAttr xegpu::getTempLayout(const T &operandOrResult) { Operation *op = operandOrResult.getOwner(); - std::string layoutName = xegpu::getTempLayoutName(operandOrResult); + std::string layoutName = xegpu::getLocalLayout(operandOrResult); if (op->hasAttr(layoutName)) { auto layout = op->getAttrOfType(layoutName); return layout; @@ -319,10 +315,10 @@ template xegpu::DistributeLayoutAttr xegpu::getTempLayout(const OpOperand &operand); template -void xegpu::setTempLayout(const T &operandOrResult, - const xegpu::DistributeLayoutAttr layout) { +void xegpu::setLocalLayout(const T &operandOrResult, + const xegpu::DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getTempLayoutName(operandOrResult); + std::string name = xegpu::getLocalLayout(operandOrResult); if (owner->hasAttrOfType(name)) { return; } @@ -331,15 +327,15 @@ void xegpu::setTempLayout(const T &operandOrResult, } } -template void xegpu::setTempLayout( +template void xegpu::setLocalLayout( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout); -template void xegpu::setTempLayout( +template void xegpu::setLocalLayout( const mlir::OpOperand &operand, const mlir::xegpu::DistributeLayoutAttr layout); -void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) { +void xegpu::recoverLocalLayoutsDeprecated(Operation *op) { op->walk([&](Operation *nestOp) { for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getDistributeLayoutAttr(opr.get()); @@ -356,7 +352,7 @@ void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) { /// Attach layout attributes to all vector-type operands of operations within /// the given operation's region. Reports an error if any vector operand lacks /// a layout attribute. 
-bool xegpu::localPropagateLayoutsFromAnchor(Operation *rootOp) { +bool xegpu::recoverLocalLayouts(Operation *rootOp) { auto result = rootOp->walk([&](Operation *op) { for (OpOperand &operand : op->getOpOperands()) { // Layouts are needed for vector type only. @@ -378,7 +374,7 @@ bool xegpu::localPropagateLayoutsFromAnchor(Operation *rootOp) { template void xegpu::removeLayoutAttr(const T &operandOrResult) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getTempLayoutName(operandOrResult); + std::string name = xegpu::getLocalLayout(operandOrResult); if (owner->hasAttrOfType(name)) owner->removeAttr(name); } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index de0efdc1ccc34..26c9c6ea53b28 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -184,7 +184,7 @@ class TestStepOpPattern : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto layoutName = xegpu::getTempLayoutName(op->getResult(0)); + auto layoutName = xegpu::getLocalLayout(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); if (!sliceAttr || sliceAttr.getRank() != 1) return failure(); @@ -324,7 +324,7 @@ struct TestXeGPULayoutInterface target.addDynamicallyLegalOp( [&](vector::StepOp op) -> bool { - auto layoutName = xegpu::getTempLayoutName(op->getResult(0)); + auto layoutName = xegpu::getLocalLayout(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); return isLegal(sliceAttr); }); From 86b3340cb4d44b8ae180fa28b00b854e4ff4cf78 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Tue, 16 Dec 2025 22:34:29 +0000 Subject: [PATCH 27/28] changing func names --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 22 +++++----- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 14 +++---- .../XeGPU/Transforms/XeGPUBlocking.cpp | 10 ++--- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 10 ++--- .../Transforms/XeGPUSubgroupDistribute.cpp | 20 +++++----- .../Transforms/XeGPUWgToSgDistribute.cpp | 40 ++++++++++--------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 34 ++++++++-------- .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 4 +- 8 files changed, 78 insertions(+), 76 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 97c7f69cc5d5f..92ac8870b7068 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -302,7 +302,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } SmallVector getMixedOffsets() { @@ -433,7 +433,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } SmallVector getMixedOffsets() { @@ -554,7 +554,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } SmallVector getMixedOffsets() { @@ -843,7 +843,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + 
setLayoutAttr(anchorLayout); } TypedValue getTensorDesc() { @@ -987,7 +987,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } TypedValue getTensorDesc() { @@ -1153,7 +1153,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } TypedValue getTensorDesc() { @@ -1333,7 +1333,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout_cd = anchorLayout; + setLayoutCdAttr(anchorLayout); } VectorType getLhsType() { @@ -1416,7 +1416,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } }]; @@ -1534,7 +1534,7 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().target_layout = anchorLayout; + setTargetLayoutAttr(anchorLayout); } }]; @@ -1644,7 +1644,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } }]; @@ -1711,7 +1711,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, } void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { - getProperties().layout = anchorLayout; + setLayoutAttr(anchorLayout); } }]; diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index dbaec73ccfdec..098a308f1ace4 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -119,10 +119,10 @@ int getLargestDivisor(T dim, ArrayRef candidates, ArrayRef candidateMultiples = {}); /// Return the attribute name for the OpOperand to attach DistributeLayoutAttr -std::string getLocalLayout(const OpOperand &operand); +std::string getTemporaryLayout(const OpOperand &operand); /// Return the attribute name for the OpResult to attach DistributeLayoutAttr -std::string getLocalLayout(const OpResult result); +std::string getTemporaryLayout(const OpResult result); /// Retrieves the DistributeLayoutAttr associated with a given Value. For /// TensorDescType values, the DistributeLayoutAttr is extracted from the @@ -167,19 +167,19 @@ DistributeLayoutAttr getTempLayout(const T &operandOrResult); template || std::is_same_v>> -void setLocalLayout(const T &operandOrResult, - const DistributeLayoutAttr layout); +void setTemporaryLayout(const T &operandOrResult, + const DistributeLayoutAttr layout); /// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and /// OpResult of of the given operation. If the operation contains regions, it is /// also applied recursively to the contained operations operation. 
-/// TODO: To be replaced by recoverLocalLayouts() -void recoverLocalLayoutsDeprecated(Operation *op); +/// TODO: To be replaced by recoverTemporaryLayouts() +void recoverTemporaryLayoutsDeprecated(Operation *op); /// Attach layout attributes to all vector-type operands of operations within /// the given operation's region. Reports an error if any vector operand lacks /// a layout attribute. -bool recoverLocalLayouts(Operation *rootOp); +bool recoverTemporaryLayouts(Operation *rootOp); } // namespace xegpu diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index d96c608133d1a..b9915846adf58 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -281,12 +281,12 @@ void XeGPUBlockingPass::runOnOperation() { Operation *op = getOperation(); // TODO-LayoutRefactor: unify the local propagation for layout preprocessing - // replace the function with recoverLocalLayouts - // if (!xegpu::recoverLocalLayouts(op)) { + // replace the function with recoverTemporaryLayouts + // if (!xegpu::recoverTemporaryLayouts(op)) { // signalPassFailure(); // return; // } - xegpu::recoverLocalLayoutsDeprecated(op); + xegpu::recoverTemporaryLayoutsDeprecated(op); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { @@ -414,14 +414,14 @@ void XeGPUBlockingPass::runOnOperation() { op->walk([](Operation *op) { // Remove the layout attributes cached per operands. for (OpOperand &opr : op->getOpOperands()) { - std::string name = xegpu::getLocalLayout(opr); + std::string name = xegpu::getTemporaryLayout(opr); if (op->hasAttrOfType(name)) op->removeAttr(name); } // Update the layout attributes per result. for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getLocalLayout(result); + std::string name = xegpu::getTemporaryLayout(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 32cc38415c003..96e85c653e39a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -224,7 +224,7 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, ArrayRef{localOffsetDim0, localOffsetDim1}, ArrayRef{1, 1}); // InsertOp must have the same layout as newTensorDesc. - xegpu::setLocalLayout(insertOp->getOpResult(0), layoutAttr); + xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr); data = insertOp.getResult(); } } @@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), bitcastType, slice); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setLocalLayout(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTemporaryLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); arraySlices.push_back(bitCastOp.getResult()); } rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); @@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), loadNdOp.getType(), data); // BitCastOp must have the same layout as the original loadNdOp. 
- xegpu::setLocalLayout(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTemporaryLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); rewriter.replaceOp(loadNdOp, bitCastOp); return success(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index b5566c081e2fe..01ea8bfe37899 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -869,11 +869,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { storeScatterOp, "Expected at most 2D result at SG level"); std::string layoutPayloadName = - xegpu::getLocalLayout(storeScatterOp->getOpOperand(0)); + xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0)); std::string layoutOffsetsName = - xegpu::getLocalLayout(storeScatterOp->getOpOperand(2)); + xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2)); std::string layoutMaskName = - xegpu::getLocalLayout(storeScatterOp->getOpOperand(3)); + xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3)); xegpu::LayoutAttr layoutPayload = storeScatterOp->getAttrOfType(layoutPayloadName); @@ -1152,9 +1152,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { "Expected 1D offsets and mask vector"); // Assume offset and mask producers will be distributed as well. std::string layoutOffsetsName = - xegpu::getLocalLayout(loadGatherOp->getOpOperand(1)); + xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1)); std::string layoutMaskName = - xegpu::getLocalLayout(loadGatherOp->getOpOperand(2)); + xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2)); xegpu::LayoutAttr layoutOffsets = loadGatherOp->getAttrOfType(layoutOffsetsName); @@ -1224,8 +1224,8 @@ static Value lowerToVectorReductions(TypedValue src, rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. - xegpu::setLocalLayout(cast(reductionResult), - xegpu::getTempLayout(dyn_cast(acc))); + xegpu::setTemporaryLayout(cast(reductionResult), + xegpu::getTempLayout(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1256,8 +1256,8 @@ static Value lowerToVectorReductions(TypedValue src, auto srcLayout = xegpu::getTempLayout(dyn_cast(src)); auto accLayout = xegpu::getTempLayout(dyn_cast(acc)); - xegpu::setLocalLayout(slice->getOpOperand(0), srcLayout); - xegpu::setLocalLayout(slice->getOpResult(0), accLayout); + xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout); + xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout); // Extract and reduction results in scalars, so no result layout is needed. Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); Value reduction = vector::ReductionOp::create( @@ -2041,7 +2041,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // 1) It is assumed that there are no layout conflicts. // 2) Any existing layout attributes attached to the operands are ignored. 
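// Example (illustrative, not part of this patch): after recovery, a vector
// operand carries a discardable attribute named "layout_operand_<idx>" and a
// vector result one named "layout_result_<idx>" (the prefixes defined in
// XeGPUUtils.cpp), e.g.
//   {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// where the attribute value shown is only a placeholder.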
Operation *op = getOperation(); - if (!xegpu::recoverLocalLayouts(op)) { + if (!xegpu::recoverTemporaryLayouts(op)) { signalPassFailure(); return; } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 860f0b2c6198d..e3879de2161bf 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -508,8 +508,8 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setLocalLayout(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newBroadcast->getResult(0), + layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -756,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto eltType = vecType.getElementType(); auto setLayout = [&](Value val) { - xegpu::setLocalLayout(llvm::dyn_cast(val), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(llvm::dyn_cast(val), + layout.dropSgLayoutAndData()); }; if (vecAttr.isSplat()) { @@ -987,7 +987,7 @@ struct WgToSgStoreScatterOpWithOffset // Skip for operand one (memref) if (operand.getOperandNumber() == 1) continue; - xegpu::setLocalLayout(operand, layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(operand, layout.dropSgLayoutAndData()); } } rewriter.eraseOp(op); @@ -1080,11 +1080,12 @@ struct WgToSgVectorStepOp : public OpConversionPattern { vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); auto finalSteps = arith::AddIOp::create(rewriter, loc, steps, bcastOffset); - xegpu::setLocalLayout(steps->getResult(0), layout.dropSgLayoutAndData()); - xegpu::setLocalLayout(bcastOffset->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setLocalLayout(finalSteps->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(steps->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(bcastOffset->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(finalSteps->getResult(0), + layout.dropSgLayoutAndData()); newOps.push_back(finalSteps); } @@ -1153,8 +1154,8 @@ struct WgToSgVectorShapeCastOp for (auto src : adaptor.getSource()) { auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), newResultType, src); - xegpu::setLocalLayout(newShapeCast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newShapeCast->getResult(0), + layout.dropSgLayoutAndData()); newShapeCastOps.push_back(newShapeCast.getResult()); } @@ -1215,7 +1216,8 @@ struct WgToSgMultiDimReductionOp auto newOp = vector::MultiDimReductionOp::create( rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], op.getReductionDims()); - xegpu::setLocalLayout(newOp->getResult(0), layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newOp->getResult(0), + layout.dropSgLayoutAndData()); newReductions.push_back(newOp.getResult()); } @@ -1278,8 +1280,8 @@ struct WgToSgVectorTransposeOp for (auto src : adaptor.getVector()) { auto newTranspose = vector::TransposeOp::create( rewriter, op.getLoc(), newResultType, src, permutation); - xegpu::setLocalLayout(newTranspose->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); newTransposeOps.push_back(newTranspose.getResult()); } @@ -1348,8 +1350,8 @@ 
struct WgToSgVectorMaskOp : public OpConversionPattern { auto newCreateMaskOp = vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands); - xegpu::setLocalLayout(newCreateMaskOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newCreateMaskOp->getResult(0), + layout.dropSgLayoutAndData()); newCreateMaskOps.push_back(newCreateMaskOp.getResult()); } @@ -1392,7 +1394,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { // TODO-LayoutRefactor: unify the local propagation for layout preprocessing // Operation *op = getOperation(); - // if (!xegpu::recoverLocalLayouts(op)) { + // if (!xegpu::recoverTemporaryLayouts(op)) { // signalPassFailure(); // return; // } @@ -1585,7 +1587,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { // Layout propagation pass will activated. getOperation()->walk([](Operation *op) { for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getLocalLayout(result); + std::string name = xegpu::getTemporaryLayout(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6388333bd7450..e8a5f43e96a1f 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -101,13 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } -std::string xegpu::getLocalLayout(const OpOperand &operand) { +std::string xegpu::getTemporaryLayout(const OpOperand &operand) { const StringRef prefix("layout_operand_"); unsigned idx = const_cast(operand).getOperandNumber(); return llvm::formatv("{0}{1}", prefix, idx).str(); } -std::string xegpu::getLocalLayout(const OpResult result) { +std::string xegpu::getTemporaryLayout(const OpResult result) { const StringRef prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str(); } @@ -129,7 +129,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return layout; } - std::string layoutName = getLocalLayout(result); + std::string layoutName = getTemporaryLayout(result); if (defOp->hasAttr(layoutName)) { auto layout = defOp->getAttrOfType(layoutName); @@ -180,7 +180,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { return layout; } - std::string layoutName = xegpu::getLocalLayout(opr); + std::string layoutName = xegpu::getTemporaryLayout(opr); if (op->hasAttr(layoutName)) { auto layout = op->getAttrOfType(layoutName); return layout; @@ -226,7 +226,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, } // TODO-LayoutRefactor: Remove this function after replacing use -// with setLocalLayout or setAnchorLayout +// with setTemporaryLayout or setAnchorLayout void xegpu::setDistributeLayoutAttr( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout) { @@ -239,7 +239,7 @@ void xegpu::setDistributeLayoutAttr( return; } - std::string name = xegpu::getLocalLayout(result); + std::string name = xegpu::getTemporaryLayout(result); if (owner->hasAttrOfType(name)) { return; } @@ -249,7 +249,7 @@ void xegpu::setDistributeLayoutAttr( } // TODO-LayoutRefactor: Remove this function after replacing use -// with setLocalLayout or setAnchorLayout +// with setTemporaryLayout or setAnchorLayout void xegpu::setDistributeLayoutAttr(const OpOperand &operand, const DistributeLayoutAttr layout) { Operation *owner = operand.getOwner(); @@ -287,7 +287,7 
@@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, } } - std::string name = xegpu::getLocalLayout(operand); + std::string name = xegpu::getTemporaryLayout(operand); if (owner->hasAttrOfType(name)) { return; } @@ -300,7 +300,7 @@ template xegpu::DistributeLayoutAttr xegpu::getTempLayout(const T &operandOrResult) { Operation *op = operandOrResult.getOwner(); - std::string layoutName = xegpu::getLocalLayout(operandOrResult); + std::string layoutName = xegpu::getTemporaryLayout(operandOrResult); if (op->hasAttr(layoutName)) { auto layout = op->getAttrOfType(layoutName); return layout; @@ -315,10 +315,10 @@ template xegpu::DistributeLayoutAttr xegpu::getTempLayout(const OpOperand &operand); template -void xegpu::setLocalLayout(const T &operandOrResult, - const xegpu::DistributeLayoutAttr layout) { +void xegpu::setTemporaryLayout(const T &operandOrResult, + const xegpu::DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getLocalLayout(operandOrResult); + std::string name = xegpu::getTemporaryLayout(operandOrResult); if (owner->hasAttrOfType(name)) { return; } @@ -327,15 +327,15 @@ void xegpu::setLocalLayout(const T &operandOrResult, } } -template void xegpu::setLocalLayout( +template void xegpu::setTemporaryLayout( const mlir::OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout); -template void xegpu::setLocalLayout( +template void xegpu::setTemporaryLayout( const mlir::OpOperand &operand, const mlir::xegpu::DistributeLayoutAttr layout); -void xegpu::recoverLocalLayoutsDeprecated(Operation *op) { +void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) { op->walk([&](Operation *nestOp) { for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getDistributeLayoutAttr(opr.get()); @@ -352,7 +352,7 @@ void xegpu::recoverLocalLayoutsDeprecated(Operation *op) { /// Attach layout attributes to all vector-type operands of operations within /// the given operation's region. Reports an error if any vector operand lacks /// a layout attribute. -bool xegpu::recoverLocalLayouts(Operation *rootOp) { +bool xegpu::recoverTemporaryLayouts(Operation *rootOp) { auto result = rootOp->walk([&](Operation *op) { for (OpOperand &operand : op->getOpOperands()) { // Layouts are needed for vector type only. 
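As a minimal sketch of how the helpers renamed in this commit compose (using the names as of this commit; the follow-up commit below renames the string helper to getTemporaryLayoutName and the attribute getter to getTemporaryLayout), the hypothetical function below tags a result and reads the layout back. It is illustrative only and not part of the patch.

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"

using namespace mlir;

// Illustrative only: attach a temporary layout to a result and read it back.
static xegpu::DistributeLayoutAttr
tagResult(OpResult result, xegpu::DistributeLayoutAttr layout) {
  // Sets the discardable "layout_result_<N>" attribute; this is a no-op if
  // the result already carries one.
  xegpu::setTemporaryLayout(result, layout);
  // Reads the temporary layout back by the same attribute name.
  return xegpu::getTempLayout(result);
}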
@@ -374,7 +374,7 @@ bool xegpu::recoverLocalLayouts(Operation *rootOp) { template void xegpu::removeLayoutAttr(const T &operandOrResult) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getLocalLayout(operandOrResult); + std::string name = xegpu::getTemporaryLayout(operandOrResult); if (owner->hasAttrOfType(name)) owner->removeAttr(name); } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 26c9c6ea53b28..369c0493d69f0 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -184,7 +184,7 @@ class TestStepOpPattern : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto layoutName = xegpu::getLocalLayout(op->getResult(0)); + auto layoutName = xegpu::getTemporaryLayout(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); if (!sliceAttr || sliceAttr.getRank() != 1) return failure(); @@ -324,7 +324,7 @@ struct TestXeGPULayoutInterface target.addDynamicallyLegalOp( [&](vector::StepOp op) -> bool { - auto layoutName = xegpu::getLocalLayout(op->getResult(0)); + auto layoutName = xegpu::getTemporaryLayout(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); return isLegal(sliceAttr); }); From 8cb195de12c365b8199cb18d94426270f04b8493 Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Tue, 16 Dec 2025 22:59:17 +0000 Subject: [PATCH 28/28] fixing names and use setLayoutAttr APIs --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +-- .../XeGPU/Transforms/XeGPUBlocking.cpp | 4 +- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 4 +- .../Transforms/XeGPUSubgroupDistribute.cpp | 38 +++++++++---------- .../Transforms/XeGPUWgToSgDistribute.cpp | 29 +++++++------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 25 ++++++------ .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 4 +- 7 files changed, 56 insertions(+), 54 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 098a308f1ace4..46d52516cbc15 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -119,10 +119,10 @@ int getLargestDivisor(T dim, ArrayRef candidates, ArrayRef candidateMultiples = {}); /// Return the attribute name for the OpOperand to attach DistributeLayoutAttr -std::string getTemporaryLayout(const OpOperand &operand); +std::string getTemporaryLayoutName(const OpOperand &operand); /// Return the attribute name for the OpResult to attach DistributeLayoutAttr -std::string getTemporaryLayout(const OpResult result); +std::string getTemporaryLayoutName(const OpResult result); /// Retrieves the DistributeLayoutAttr associated with a given Value. 
For /// TensorDescType values, the DistributeLayoutAttr is extracted from the @@ -162,7 +162,7 @@ void setDistributeLayoutAttr(const OpOperand &opr, template || std::is_same_v>> -DistributeLayoutAttr getTempLayout(const T &operandOrResult); +DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult); template || diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index b9915846adf58..ba2753f517ce6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -414,14 +414,14 @@ void XeGPUBlockingPass::runOnOperation() { op->walk([](Operation *op) { // Remove the layout attributes cached per operands. for (OpOperand &opr : op->getOpOperands()) { - std::string name = xegpu::getTemporaryLayout(opr); + std::string name = xegpu::getTemporaryLayoutName(opr); if (op->hasAttrOfType(name)) op->removeAttr(name); } // Update the layout attributes per result. for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getTemporaryLayout(result); + std::string name = xegpu::getTemporaryLayoutName(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 96e85c653e39a..bb80df197d45b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -465,8 +465,8 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. target.addDynamicallyLegalOp( [&](vector::ExtractOp extractOp) { - auto layout = - xegpu::getTempLayout(dyn_cast(extractOp.getResult())); + auto layout = xegpu::getTemporaryLayout( + dyn_cast(extractOp.getResult())); if (!layout) return true; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 01ea8bfe37899..70fb7a5cc5cd9 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -869,11 +869,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { storeScatterOp, "Expected at most 2D result at SG level"); std::string layoutPayloadName = - xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0)); + xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(0)); std::string layoutOffsetsName = - xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2)); + xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(2)); std::string layoutMaskName = - xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3)); + xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3)); xegpu::LayoutAttr layoutPayload = storeScatterOp->getAttrOfType(layoutPayloadName); @@ -1152,9 +1152,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { "Expected 1D offsets and mask vector"); // Assume offset and mask producers will be distributed as well. 
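// Example (illustrative, not part of this patch): getTemporaryLayoutName
// resolves operand 1 of the gather to "layout_operand_1" and operand 2 to
// "layout_operand_2".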
std::string layoutOffsetsName = - xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1)); + xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(1)); std::string layoutMaskName = - xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2)); + xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(2)); xegpu::LayoutAttr layoutOffsets = loadGatherOp->getAttrOfType(layoutOffsetsName); @@ -1225,7 +1225,7 @@ static Value lowerToVectorReductions(TypedValue src, DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. xegpu::setTemporaryLayout(cast(reductionResult), - xegpu::getTempLayout(dyn_cast(acc))); + xegpu::getTemporaryLayout(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1253,8 +1253,8 @@ static Value lowerToVectorReductions(TypedValue src, // accumulator. Shape cast source has the same layout as the original // reduction source. // TODO: other ops generated here may also need layout attributes. - auto srcLayout = xegpu::getTempLayout(dyn_cast(src)); - auto accLayout = xegpu::getTempLayout(dyn_cast(acc)); + auto srcLayout = xegpu::getTemporaryLayout(dyn_cast(src)); + auto accLayout = xegpu::getTemporaryLayout(dyn_cast(acc)); xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout); xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout); @@ -1350,7 +1350,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { cast(warpOp.getResult(operandIdx).getType()); VectorType resultType = cast(reductionOp.getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayout(reductionOp->getOpOperand(0)); + xegpu::getTemporaryLayout(reductionOp->getOpOperand(0)); FailureOr sourceDistTypeOrFailure = getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); @@ -1512,9 +1512,9 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { dyn_cast(broadcastOp.getResult().getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayout(broadcastOp->getOpOperand(0)); + xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempLayout(dyn_cast(broadcastOp.getResult())); + xegpu::getTemporaryLayout(dyn_cast(broadcastOp.getResult())); FailureOr sourceDistType; Type sourceElemOrDistType; @@ -1603,9 +1603,9 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { auto resultDistTy = cast(warpOp.getResult(operandNumber).getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayout(shapeCastOp->getOpOperand(0)); + xegpu::getTemporaryLayout(shapeCastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempLayout(dyn_cast(shapeCastOp.getResult())); + xegpu::getTemporaryLayout(dyn_cast(shapeCastOp.getResult())); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( warpOp, @@ -1687,7 +1687,7 @@ struct VectorExtractStridedSliceDistribution int64_t distributedDim = distributedDims[0]; int sourceDistrDimSize = extractOp.getSourceVectorType().getShape()[distributedDim]; - auto sourceLayout = xegpu::getTempLayout(extractOp->getOpOperand(0)); + auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0)); if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) return rewriter.notifyMatchFailure( warpOp, "the source of extract_strided_slice op lacks distribution " @@ -1797,8 
+1797,8 @@ struct VectorInsertStridedSliceDistribution "rank) dims of dest vector"); int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim); // Obtain the source and dest layouts. - auto destLayout = xegpu::getTempLayout(insertOp->getOpOperand(1)); - auto sourceLayout = xegpu::getTempLayout(insertOp->getOpOperand(0)); + auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1)); + auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0)); if (!destLayout || !sourceLayout || destLayout.getEffectiveLaneLayoutAsInt().empty() || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) @@ -1913,7 +1913,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedSourceType = getDistVecTypeBasedOnLaneLayout( - xegpu::getTempLayout(bitcastOp->getOpOperand(0)), + xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)), bitcastOp.getSourceVectorType()) .value_or(VectorType()); if (!distributedSourceType) @@ -1956,9 +1956,9 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { auto transposeOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getTempLayout(transposeOp->getOpOperand(0)); + xegpu::getTemporaryLayout(transposeOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getTempLayout(transposeOp->getOpResult(0)); + xegpu::getTemporaryLayout(transposeOp->getOpResult(0)); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( transposeOp, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index e3879de2161bf..a5e90120c8756 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -493,7 +493,7 @@ struct WgToSgVectorBroadcastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(llvm::cast(op.getResult())); + xegpu::getTemporaryLayout(llvm::cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -536,7 +536,7 @@ struct WgToSgElementwiseOp : public ConversionPattern { ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(llvm::cast(op->getResult(0))); + xegpu::getTemporaryLayout(llvm::cast(op->getResult(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -742,7 +742,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { return failure(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(dyn_cast(op.getResult())); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1052,7 +1052,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(dyn_cast(op.getResult())); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1109,7 +1109,7 @@ struct WgToSgVectorShapeCastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(dyn_cast(op.getResult())); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return 
failure(); @@ -1142,7 +1142,7 @@ struct WgToSgVectorShapeCastOp // must be a slice of higher rank layout. int64_t sourceRank = srcType.getRank(); int64_t resultRank = sgShape.size(); - // TODO-LayoutRefactor: handle the case using getTempLayout + // TODO-LayoutRefactor: handle the case using getTemporaryLayout xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getSource()); if (sourceRank < resultRank && !sourceLayout.isSliceOf(layout)) @@ -1183,7 +1183,7 @@ struct WgToSgMultiDimReductionOp auto srcShape = srcType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(dyn_cast(op.getResult())); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1238,10 +1238,10 @@ struct WgToSgVectorTransposeOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(dyn_cast(op.getResult())); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); - // TODO-LayoutRefactor: handle the case using getTempLayout + // TODO-LayoutRefactor: handle the case using getTemporaryLayout xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getVector()); if (!sourceLayout || !sourceLayout.isForWorkgroup()) @@ -1300,7 +1300,7 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { typename OpConversionPattern::OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(dyn_cast(op.getResult())); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1509,7 +1509,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() { if (!vecType) return true; - auto layout = xegpu::getTempLayout(dyn_cast(op.getResult())); + auto layout = + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); return isLegal(layout); }); @@ -1520,7 +1521,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. auto layout = - xegpu::getTempLayout(dyn_cast(op->getResult(0))); + xegpu::getTemporaryLayout(dyn_cast(op->getResult(0))); return isLegal(layout); }); @@ -1562,7 +1563,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { } xegpu::DistributeLayoutAttr layout = - xegpu::getTempLayout(op->getResult(0)); + xegpu::getTemporaryLayout(op->getResult(0)); return isLegal(layout); }); @@ -1587,7 +1588,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { // Layout propagation pass will activated. 
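// Example (illustrative, not part of this patch): the per-result temporary
// attributes looked up in the walk below use names such as "layout_result_0".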
getOperation()->walk([](Operation *op) { for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getTemporaryLayout(result); + std::string name = xegpu::getTemporaryLayoutName(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index e8a5f43e96a1f..d3906e37ffbf1 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -101,13 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } -std::string xegpu::getTemporaryLayout(const OpOperand &operand) { +std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) { const StringRef prefix("layout_operand_"); unsigned idx = const_cast(operand).getOperandNumber(); return llvm::formatv("{0}{1}", prefix, idx).str(); } -std::string xegpu::getTemporaryLayout(const OpResult result) { +std::string xegpu::getTemporaryLayoutName(const OpResult result) { const StringRef prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str(); } @@ -129,7 +129,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return layout; } - std::string layoutName = getTemporaryLayout(result); + std::string layoutName = getTemporaryLayoutName(result); if (defOp->hasAttr(layoutName)) { auto layout = defOp->getAttrOfType(layoutName); @@ -180,7 +180,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { return layout; } - std::string layoutName = xegpu::getTemporaryLayout(opr); + std::string layoutName = xegpu::getTemporaryLayoutName(opr); if (op->hasAttr(layoutName)) { auto layout = op->getAttrOfType(layoutName); return layout; @@ -239,7 +239,7 @@ void xegpu::setDistributeLayoutAttr( return; } - std::string name = xegpu::getTemporaryLayout(result); + std::string name = xegpu::getTemporaryLayoutName(result); if (owner->hasAttrOfType(name)) { return; } @@ -287,7 +287,7 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, } } - std::string name = xegpu::getTemporaryLayout(operand); + std::string name = xegpu::getTemporaryLayoutName(operand); if (owner->hasAttrOfType(name)) { return; } @@ -297,10 +297,11 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand, } template -xegpu::DistributeLayoutAttr xegpu::getTempLayout(const T &operandOrResult) { +xegpu::DistributeLayoutAttr +xegpu::getTemporaryLayout(const T &operandOrResult) { Operation *op = operandOrResult.getOwner(); - std::string layoutName = xegpu::getTemporaryLayout(operandOrResult); + std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult); if (op->hasAttr(layoutName)) { auto layout = op->getAttrOfType(layoutName); return layout; @@ -310,15 +311,15 @@ xegpu::DistributeLayoutAttr xegpu::getTempLayout(const T &operandOrResult) { } template xegpu::DistributeLayoutAttr -xegpu::getTempLayout(const OpResult &result); +xegpu::getTemporaryLayout(const OpResult &result); template xegpu::DistributeLayoutAttr -xegpu::getTempLayout(const OpOperand &operand); +xegpu::getTemporaryLayout(const OpOperand &operand); template void xegpu::setTemporaryLayout(const T &operandOrResult, const xegpu::DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getTemporaryLayout(operandOrResult); + std::string name = xegpu::getTemporaryLayoutName(operandOrResult); if (owner->hasAttrOfType(name)) { return; } 
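A short illustrative sketch of the final helper split in this series: getTemporaryLayoutName returns the attribute name, getTemporaryLayout returns the attached DistributeLayoutAttr, and setTemporaryLayout attaches one. The hypothetical function below only exercises the first two and is not part of the patch.

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;

// Illustrative only: print, for each operand of `op`, the temporary-layout
// attribute name and whether getTemporaryLayout finds a layout for it.
static void reportTemporaryOperandLayouts(Operation *op) {
  for (OpOperand &opr : op->getOpOperands()) {
    // e.g. "layout_operand_0", "layout_operand_1", ...
    std::string name = xegpu::getTemporaryLayoutName(opr);
    xegpu::DistributeLayoutAttr layout = xegpu::getTemporaryLayout(opr);
    llvm::outs() << name << (layout ? " is set\n" : " is absent\n");
  }
}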
@@ -374,7 +375,7 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) { template void xegpu::removeLayoutAttr(const T &operandOrResult) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getTemporaryLayout(operandOrResult); + std::string name = xegpu::getTemporaryLayoutName(operandOrResult); if (owner->hasAttrOfType(name)) owner->removeAttr(name); } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 369c0493d69f0..c97346ed6f8b5 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -184,7 +184,7 @@ class TestStepOpPattern : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto layoutName = xegpu::getTemporaryLayout(op->getResult(0)); + auto layoutName = xegpu::getTemporaryLayoutName(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); if (!sliceAttr || sliceAttr.getRank() != 1) return failure(); @@ -324,7 +324,7 @@ struct TestXeGPULayoutInterface target.addDynamicallyLegalOp( [&](vector::StepOp op) -> bool { - auto layoutName = xegpu::getTemporaryLayout(op->getResult(0)); + auto layoutName = xegpu::getTemporaryLayoutName(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); return isLegal(sliceAttr); });
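A final hedged sketch of how the two layout mechanisms combine: anchor ops store the layout through their own accessors (setAnchorLayout, backed by the setLayoutAttr bodies above), while other ops fall back to the discardable "layout_result_<N>" attribute. The function name assignResultLayout and the choice of LoadNdOp are illustrative; nothing below is part of the patch.

#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"

using namespace mlir;

// Illustrative only: route a layout either to an anchor op's own accessor or
// to the temporary per-result attribute.
static void assignResultLayout(Operation *op,
                               xegpu::DistributeLayoutAttr layout) {
  if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(op)) {
    loadNd.setAnchorLayout(layout);
    return;
  }
  if (op->getNumResults() == 1)
    xegpu::setTemporaryLayout(op->getOpResult(0), layout);
}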