diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt index efca3cfa0dab7..b10219f71b531 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt @@ -15,3 +15,8 @@ set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td) mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls) mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs) add_mlir_dialect_tablegen_target(MLIRXeGPUAttrInterfaceIncGen) + +set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td) +mlir_tablegen(XeGPUOpInterface.h.inc -gen-op-interface-decls) +mlir_tablegen(XeGPUOpInterface.cpp.inc -gen-op-interface-defs) +add_mlir_dialect_tablegen_target(MLIRXeGPUOpInterfaceIncGen) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 0c059967bb898..7badfaf4a8216 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -34,6 +34,7 @@ class SliceAttr; #include #include #include +#include // clang-format on #define GET_ATTRDEF_CLASSES diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index eae0bd4e68a84..446f64fffa468 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -770,4 +770,30 @@ def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> { } +def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> { + let cppNamespace = "::mlir::xegpu"; + + let description = [{ + An op interface for accessing anchor layout information. + This interface provides methods to set and retrieve the anchor layout + on operations that implement it. + }]; + + let methods = [ + InterfaceMethod< + /*desc=*/"Get the anchor layout attribute.", + /*retTy=*/"xegpu::DistributeLayoutAttr", + /*methodName=*/"getAnchorLayout", + /*args=*/(ins) + >, + InterfaceMethod< + /*desc=*/"Set the anchor layout attribute.", + /*retTy=*/"void", + /*methodName=*/"setAnchorLayout", + /*args=*/(ins "xegpu::DistributeLayoutAttr":$layout) + >, + ]; + +} + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index b54d620c3c0c3..92ac8870b7068 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -17,6 +17,7 @@ include "mlir/Interfaces/ShapedOpInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" + // Base class for dialect operations. This operation inherits from the base // `Op` class in OpBase.td, and provides: // * The parent dialect of the operation. 
@@ -247,7 +248,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface }]; } -def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { +def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { let summary = "prefetches a n-D block to cache"; let description = [{ It issues an instruction to prefetch a block of data from continuous @@ -296,6 +297,14 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { return getTensorDesc().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -338,7 +347,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]> + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>, AnchorLayoutInterface ]> { let summary = "loads a n-D block from memory (represented by TensorDesc)" "to registers (represented by vector)"; @@ -419,6 +428,14 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ return getTensorDesc().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -462,7 +479,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ } def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]> + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>, AnchorLayoutInterface ]> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; @@ -532,6 +549,14 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ return getTensorDesc().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + SmallVector getMixedOffsets() { auto statics = getConstOffsets().value_or(SmallVector()); auto dynamics = getOffsets(); @@ -724,7 +749,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { let hasVerifier = 1; } -def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { +def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { let summary = "prefetches a set of scattered data points to cache"; let description = [{ @@ -813,6 +838,14 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { return getSource().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getSource()); @@ -843,7 +876,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { let hasVerifier = 1; } -def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { +def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayoutInterface]> { let summary = "load a set of scattered data points from memory."; let description = [{ It (aka. 
load) load data per each lane. The output @@ -949,6 +982,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { return getSource().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getSource()); @@ -1005,7 +1046,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { let hasVerifier = 1; } -def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { +def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorLayoutInterface]> { let summary = "store data to scattered memory locations."; let description = [{ It (aka. store) stores data to scattered memory locations. The value is @@ -1078,7 +1119,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, layout = #xegpu.layout} - : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> + : vector<16xf32>, memref<1024xf32>, vector<16xi1>, vector<16xindex> ``` Example 4 (Lane level): @@ -1107,6 +1148,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { return getDest().getType(); } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + TypedValue getTensorDesc() { if (auto tdescType = getTensorDescType()) { return llvm::cast>(getDest()); @@ -1217,7 +1266,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", let hasVerifier = 1; } -def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> { +def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, AnchorLayoutInterface]> { let summary = "It performs mma computation"; let description = [{DPAS performs matrix multiplication on matrix A of `mxk` @@ -1278,6 +1327,15 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] let results = (outs XeGPU_DpasResType: $result); let extraClassDeclaration = [{ + + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayoutCd().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutCdAttr(anchorLayout); + } + VectorType getLhsType() { return getLhs().getType(); } @@ -1311,7 +1369,8 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, MemoryEffects<[MemRead, MemWrite]>, AllElementTypesMatch<["tensorDesc", "value", "result"]>, - AllShapesMatch<["tensorDesc", "value", "result"]>]> { + AllShapesMatch<["tensorDesc", "value", "result"]>, + AnchorLayoutInterface]> { let summary = "Atomic read-modify-write operation on the TensorDesc. 
"; let description = [{ @@ -1351,6 +1410,17 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, XeGPU_ValueType:$value, OptionalAttr:$layout); + let extraClassDeclaration = [{ + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + + }]; + let results = (outs XeGPU_ValueType:$result); let assemblyFormat = [{ @@ -1424,7 +1494,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { let extraClassDeclaration = extraBaseClassDeclaration; } -def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> { +def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>, AnchorLayoutInterface]> { let summary = "Convert the layout of the input operand"; let description = [{ `convert_layout` redistribute data across subgroups and/or lanes from the `input_layout` to @@ -1458,6 +1528,16 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou let assemblyFormat = [{ $source prop-dict attr-dict `:` type($source) }]; + let extraClassDeclaration = [{ + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getTargetLayout(); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setTargetLayoutAttr(anchorLayout); + } + + }]; let hasFolder = 1; let hasVerifier = 1; @@ -1499,7 +1579,7 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure, } def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, - AllElementTypesMatch<["mem_desc", "res"]>]> { + AllElementTypesMatch<["mem_desc", "res"]>, AnchorLayoutInterface]> { let arguments = (ins XeGPU_MemDesc:$mem_desc, Variadic: $offsets, DenseI64ArrayAttr: $const_offsets, @@ -1558,13 +1638,22 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, return vecTy.getShape(); return {}; } + + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + }]; let hasVerifier = 1; } def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, - AllElementTypesMatch<["mem_desc", "data"]>]> { + AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> { let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data, XeGPU_MemDesc:$mem_desc, @@ -1617,6 +1706,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, return {}; } + xegpu::DistributeLayoutAttr getAnchorLayout() { + return getLayout().value_or(nullptr); + } + + void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) { + setLayoutAttr(anchorLayout); + } + }]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 58092c3bb9ed2..46d52516cbc15 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -63,63 +63,6 @@ FailureOr getDistributedVectorType(xegpu::TensorDescType tdescTy); FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); -/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr -std::string getLayoutName(const OpOperand &operand); - -/// Return the attribute name for the OpResult to attach DistributeLayoutAttr -std::string getLayoutName(const OpResult result); - -/// Retrieves the 
DistributeLayoutAttr associated with a given Value. For -/// TensorDescType values, the DistributeLayoutAttr is extracted from the -/// TensorDescType itself. For other values, it is obtained from the attributes -/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is -/// found. -DistributeLayoutAttr getDistributeLayoutAttr(const Value value); - -template -AttrTy getDistributeLayoutAttrOfType(const Value value) { - return dyn_cast_if_present(getDistributeLayoutAttr(value)); -} - -/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It -/// will first check the operand_layout_{id} of the owner operation. If not -/// found, it will check the operand itself and its defining op. -DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); - -template -AttrTy getDistributeLayoutAttrOfType(const OpOperand &opr) { - return dyn_cast_if_present(getDistributeLayoutAttr(opr)); -} - -/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. -template || - std::is_same_v>> -void removeLayoutAttr(const T &operandOrResult); - -/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the -/// given operation if they exist. If the operation contains regions, it is also -/// applied recursively to the contained operations -void removeLayoutAttrs(Operation *op); - -/// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching -/// it to the owner's dictionary attributes -/// If `respectPermLayout` is true the existing permament layout -/// attribute will be kept and assigned to the attribute dict instead -/// of the provided layout. -template || - std::is_same_v>> -void setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout, - bool respectPermLayout = false); - -/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given -/// operation. If the operation contains regions, it is also applied recursively -/// to the contained operations -void setDistributeLayoutAttrs( - Operation *op, function_ref getLayoutImpl); - /// Extract a set of small vectors from a value with a given shape using /// vector.extract_stride_slice SmallVector extractVectorsWithShapeFromValue(OpBuilder &builder, @@ -175,6 +118,69 @@ template int getLargestDivisor(T dim, ArrayRef candidates, ArrayRef candidateMultiples = {}); +/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr +std::string getTemporaryLayoutName(const OpOperand &operand); + +/// Return the attribute name for the OpResult to attach DistributeLayoutAttr +std::string getTemporaryLayoutName(const OpResult result); + +/// Retrieves the DistributeLayoutAttr associated with a given Value. For +/// TensorDescType values, the DistributeLayoutAttr is extracted from the +/// TensorDescType itself. For other values, it is obtained from the attributes +/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is +/// found. +DistributeLayoutAttr getDistributeLayoutAttr(const Value value); + +/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It +/// will first check the operand_layout_{id} of the owner operation. If not +/// found, it will check the operand itself and its defining op. +DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); + +/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. 
+template || + std::is_same_v>> +void removeLayoutAttr(const T &operandOrResult); + +/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the +/// given operation if they exist. If the operation contains regions, it is also +/// applied recursively to the contained operations +void removeLayoutAttrs(Operation *op); + +/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult. +/// Users should use setAnchorLayout instead. +void setDistributeLayoutAttr(const OpResult &Result, + const DistributeLayoutAttr layout); + +/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpOperand. +/// Users should use setAnchorLayout instead. +void setDistributeLayoutAttr(const OpOperand &opr, + const DistributeLayoutAttr layout); + +/// Get and set the distribute layout attribute for non-anchor operations +/// (and for offsets/masks of load/store ops until their temporary attributes are removed) +template || + std::is_same_v>> +DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult); + +template || + std::is_same_v>> +void setTemporaryLayout(const T &operandOrResult, + const DistributeLayoutAttr layout); + +/// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and +/// OpResult of the given operation. If the operation contains regions, it is +/// also applied recursively to the contained operations. +/// TODO: To be replaced by recoverTemporaryLayouts() +void recoverTemporaryLayoutsDeprecated(Operation *op); + +/// Attach layout attributes to all vector-type operands of operations within +/// the given operation's region. Reports an error if any vector operand lacks +/// a layout attribute. +bool recoverTemporaryLayouts(Operation *rootOp); + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 1a19ab5fd970b..ccf17da26c942 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -35,6 +35,8 @@ void XeGPUDialect::initialize() { #include >(); } +#define GET_OP_INTERFACE_CLASSES +#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc" // A `srcShape` consists of N distribution units, each being `subShapesLayout` x // `subShape`. A `delinearizedId` is used to identify a particular `subShape` diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index ec5feb8bc8c4a..ba2753f517ce6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -280,11 +280,13 @@ void XeGPUBlockingPass::runOnOperation() { MLIRContext *ctx = &getContext(); Operation *op = getOperation(); - // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. - // This ensures that the LayoutAttr remains accessible even if the defining - // operation is replaced. - xegpu::setDistributeLayoutAttrs( - op, [](Value v) { return xegpu::getDistributeLayoutAttr(v); }); + // TODO-LayoutRefactor: unify the local propagation for layout preprocessing and + // replace this call with recoverTemporaryLayouts + // if (!xegpu::recoverTemporaryLayouts(op)) { + // signalPassFailure(); + // return; + // } + xegpu::recoverTemporaryLayoutsDeprecated(op); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { @@ -412,14 +414,14 @@ void XeGPUBlockingPass::runOnOperation() { op->walk([](Operation *op) { // Remove the layout attributes cached per operands. 
for (OpOperand &opr : op->getOpOperands()) { - std::string name = xegpu::getLayoutName(opr); + std::string name = xegpu::getTemporaryLayoutName(opr); if (op->hasAttrOfType(name)) op->removeAttr(name); } // Update the layout attributes per result. for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getLayoutName(result); + std::string name = xegpu::getTemporaryLayoutName(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index ab41fe4298d99..bb80df197d45b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -217,14 +217,14 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr()); // Set the layout for the loadOp. auto layoutAttr = newTensorDesc.getType().getLayoutAttr(); - xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr); + loadOp.setAnchorLayout(layoutAttr); // Insert the loaded block into the right position in data. auto insertOp = vector::InsertStridedSliceOp::create( rewriter, loc, loadOp.getResult(), data, ArrayRef{localOffsetDim0, localOffsetDim1}, ArrayRef{1, 1}); // InsertOp must have the same layout as newTensorDesc. - xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr); data = insertOp.getResult(); } } @@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), bitcastType, slice); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTemporaryLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); arraySlices.push_back(bitCastOp.getResult()); } rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); @@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), loadNdOp.getType(), data); // BitCastOp must have the same layout as the original loadNdOp. - xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), - origTensorDescType.getLayoutAttr()); + xegpu::setTemporaryLayout(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); rewriter.replaceOp(loadNdOp, bitCastOp); return success(); } @@ -465,7 +465,8 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. target.addDynamicallyLegalOp( [&](vector::ExtractOp extractOp) { - auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); + auto layout = xegpu::getTemporaryLayout( + dyn_cast(extractOp.getResult())); if (!layout) return true; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 12d1c494a0b72..c9b3a153624bd 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -1169,7 +1169,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. 
- xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout*/ true); + xegpu::setDistributeLayoutAttr(result, layout); } return success(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 54a306d63cdcd..a1c0656d0bdb5 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -636,16 +636,14 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { auto dpasOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); - std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0)); - std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1)); - std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0)); xegpu::LayoutAttr layoutA = - dpasOp->getAttrOfType(layoutAName); + dyn_cast(dpasOp.getLayoutAAttr()); xegpu::LayoutAttr layoutB = - dpasOp->getAttrOfType(layoutBName); + dyn_cast(dpasOp.getLayoutBAttr()); xegpu::LayoutAttr layoutOut = - dpasOp->getAttrOfType(layoutCName); + dyn_cast(dpasOp.getLayoutCdAttr()); + if (!layoutA || !layoutB || !layoutOut) return rewriter.notifyMatchFailure( dpasOp, @@ -657,6 +655,7 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType()); FailureOr distResultTypeByWarpOpOrFailure = getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType()); + if (failed(distLhsTypeByWarpOpOrFailure) || failed(distRhsTypeByWarpOpOrFailure) || failed(distResultTypeByWarpOpOrFailure)) @@ -685,6 +684,7 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB); FailureOr expectedDistResultTyOrFailure = xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut); + if (failed(expectedDistLhsTyOrFailure) || failed(expectedDistRhsTyOrFailure) || failed(expectedDistResultTyOrFailure)) @@ -869,11 +869,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { storeScatterOp, "Expected at most 2D result at SG level"); std::string layoutPayloadName = - xegpu::getLayoutName(storeScatterOp->getOpOperand(0)); + xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(0)); std::string layoutOffsetsName = - xegpu::getLayoutName(storeScatterOp->getOpOperand(2)); + xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(2)); std::string layoutMaskName = - xegpu::getLayoutName(storeScatterOp->getOpOperand(3)); + xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3)); xegpu::LayoutAttr layoutPayload = storeScatterOp->getAttrOfType(layoutPayloadName); @@ -1152,9 +1152,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { "Expected 1D offsets and mask vector"); // Assume offset and mask producers will be distributed as well. std::string layoutOffsetsName = - xegpu::getLayoutName(loadGatherOp->getOpOperand(1)); + xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(1)); std::string layoutMaskName = - xegpu::getLayoutName(loadGatherOp->getOpOperand(2)); + xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(2)); xegpu::LayoutAttr layoutOffsets = loadGatherOp->getAttrOfType(layoutOffsetsName); @@ -1224,8 +1224,8 @@ static Value lowerToVectorReductions(TypedValue src, rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); // Reduction result should have the same layout as the accumulator. 
- xegpu::setDistributeLayoutAttr(cast(reductionResult), - xegpu::getDistributeLayoutAttr(acc)); + xegpu::setTemporaryLayout(cast(reductionResult), + xegpu::getTemporaryLayout(dyn_cast(acc))); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1240,20 +1240,24 @@ static Value lowerToVectorReductions(TypedValue src, vector::ExtractStridedSliceOp extractOp = vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets, sliceSizes, {1, 1}); + int64_t nSliceElements = extractOp.getResult().getType().getNumElements(); + vector::ShapeCastOp slice = vector::ShapeCastOp::create( rewriter, loc, VectorType::get({nSliceElements}, sourceType.getElementType()), extractOp.getResult()); + // Shape cast is currently handled in xegpu side. So layouts must be // retained during lowering. Shape cast output has the same layout as the // accumulator. Shape cast source has the same layout as the original // reduction source. // TODO: other ops generated here may also need layout attributes. - xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), - xegpu::getDistributeLayoutAttr(src)); - xegpu::setDistributeLayoutAttr(slice->getOpResult(0), - xegpu::getDistributeLayoutAttr(acc)); + auto srcLayout = xegpu::getTemporaryLayout(dyn_cast(src)); + auto accLayout = xegpu::getTemporaryLayout(dyn_cast(acc)); + + xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout); + xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout); // Extract and reduction results in scalars, so no result layout is needed. Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); Value reduction = vector::ReductionOp::create( @@ -1346,7 +1350,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { cast(warpOp.getResult(operandIdx).getType()); VectorType resultType = cast(reductionOp.getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(reductionOp.getSource()); + xegpu::getTemporaryLayout(reductionOp->getOpOperand(0)); FailureOr sourceDistTypeOrFailure = getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); @@ -1508,9 +1512,9 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { dyn_cast(broadcastOp.getResult().getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(broadcastOp->getOpOperand(0)); + xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getDistributeLayoutAttr(broadcastOp.getResult()); + xegpu::getTemporaryLayout(dyn_cast(broadcastOp.getResult())); FailureOr sourceDistType; Type sourceElemOrDistType; @@ -1599,9 +1603,9 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { auto resultDistTy = cast(warpOp.getResult(operandNumber).getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0)); + xegpu::getTemporaryLayout(shapeCastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getDistributeLayoutAttr(shapeCastOp.getResult()); + xegpu::getTemporaryLayout(dyn_cast(shapeCastOp.getResult())); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( warpOp, @@ -1696,8 +1700,7 @@ struct VectorExtractStridedSliceDistribution int64_t distributedDim = distributedDims[0]; int sourceDistrDimSize = extractOp.getSourceVectorType().getShape()[distributedDim]; - auto sourceLayout = - 
xegpu::getDistributeLayoutAttr(extractOp->getOpOperand(0)); + auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0)); if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) return rewriter.notifyMatchFailure( warpOp, "the source of extract_strided_slice op lacks distribution " @@ -1807,10 +1810,8 @@ struct VectorInsertStridedSliceDistribution "rank) dims of dest vector"); int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim); // Obtain the source and dest layouts. - auto destLayout = - xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(1)); - auto sourceLayout = - xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(0)); + auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1)); + auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0)); if (!destLayout || !sourceLayout || destLayout.getEffectiveLaneLayoutAsInt().empty() || sourceLayout.getEffectiveLaneLayoutAsInt().empty()) @@ -1925,7 +1926,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedSourceType = getDistVecTypeBasedOnLaneLayout( - xegpu::getDistributeLayoutAttr(bitcastOp.getSource()), + xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)), bitcastOp.getSourceVectorType()) .value_or(VectorType()); if (!distributedSourceType) @@ -1968,9 +1969,9 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { auto transposeOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(transposeOp.getVector()); + xegpu::getTemporaryLayout(transposeOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = - xegpu::getDistributeLayoutAttr(transposeOp.getResult()); + xegpu::getTemporaryLayout(transposeOp->getOpResult(0)); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( transposeOp, @@ -2053,24 +2054,11 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // 1) It is assumed that there are no layout conflicts. // 2) Any existing layout attributes attached to the operands are ignored. Operation *op = getOperation(); - op->walk([&](Operation *op) { - for (OpOperand &operand : op->getOpOperands()) { - // Layouts are needed for vector type only. - if (!isa(operand.get().getType())) - continue; - if (isa(op)) - continue; - - auto layout = xegpu::getDistributeLayoutAttr(operand.get()); - if (!layout) { - op->emitError("Could not find layout attribute for operand ") - << operand.getOperandNumber() << " of operation " << op->getName(); - signalPassFailure(); - return; - } - xegpu::setDistributeLayoutAttr(operand, layout); - } - }); + if (!xegpu::recoverTemporaryLayouts(op)) { + signalPassFailure(); + return; + } + // Step 2: Move all operations of a GPU function inside // gpu.warp_execute_on_lane_0 operation. 
{ diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 95e27e46d90ab..07572a4950760 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -425,10 +425,11 @@ struct WgToSgDpasOp : public OpConversionPattern { if (resultTy.getRank() != 2) return failure(); - auto originalLayout = xegpu::getDistributeLayoutAttr(op.getResult()); - if (!originalLayout) + auto layoutCd = op.getLayoutCdAttr(); + auto layoutA = op.getLayoutAAttr(); + auto layoutB = op.getLayoutBAttr(); + if (!layoutCd || !layoutA || !layoutB) return failure(); - size_t i = 0; SmallVector newDpasOps; for (auto aVec : adaptor.getLhs()) { @@ -447,11 +448,12 @@ struct WgToSgDpasOp : public OpConversionPattern { llvm::cast(bVec.getType()).getShape(); VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]}, resultTy.getElementType()); - tmpC = xegpu::DpasOp::create(rewriter, loc, resTy, operands); - xegpu::setDistributeLayoutAttr(cast(tmpC), - originalLayout.dropSgLayoutAndData()); + auto newDpasOp = xegpu::DpasOp::create(rewriter, loc, resTy, operands); + newDpasOp.setLayoutCdAttr(layoutCd.dropSgLayoutAndData()); + newDpasOp.setLayoutAAttr(layoutA.dropSgLayoutAndData()); + newDpasOp.setLayoutBAttr(layoutB.dropSgLayoutAndData()); - newDpasOps.push_back(tmpC); + newDpasOps.push_back(newDpasOp); } } rewriter.replaceOpWithMultiple(op, {newDpasOps}); @@ -491,7 +493,7 @@ struct WgToSgVectorBroadcastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTemporaryLayout(llvm::cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -506,8 +508,8 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setDistributeLayoutAttr(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newBroadcast->getResult(0), + layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -534,7 +536,7 @@ struct WgToSgElementwiseOp : public ConversionPattern { ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op->getResult(0)); + xegpu::getTemporaryLayout(llvm::cast(op->getResult(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -740,7 +742,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { return failure(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -754,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto eltType = vecType.getElementType(); auto setLayout = [&](Value val) { - xegpu::setDistributeLayoutAttr(llvm::dyn_cast(val), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(llvm::dyn_cast(val), + layout.dropSgLayoutAndData()); }; if (vecAttr.isSplat()) { @@ -901,8 +903,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); + if (!layout || !layout.isForWorkgroup()) return failure(); @@ -930,7 
+932,7 @@ struct WgToSgLoadGatherOpWithOffset rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), newLayout); - xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), newLayout); + newLoadOp.setAnchorLayout(newLayout); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -955,8 +957,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getOperand(0)); + xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); + if (!layout || !layout.isForWorkgroup()) return failure(); @@ -985,7 +987,7 @@ struct WgToSgStoreScatterOpWithOffset // Skip for operand one (memref) if (operand.getOperandNumber() == 1) continue; - xegpu::setDistributeLayoutAttr(operand, layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(operand, layout.dropSgLayoutAndData()); } } rewriter.eraseOp(op); @@ -1050,7 +1052,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1078,12 +1080,12 @@ struct WgToSgVectorStepOp : public OpConversionPattern { vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); auto finalSteps = arith::AddIOp::create(rewriter, loc, steps, bcastOffset); - xegpu::setDistributeLayoutAttr(steps->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setDistributeLayoutAttr(bcastOffset->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setDistributeLayoutAttr(finalSteps->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(steps->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(bcastOffset->getResult(0), + layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(finalSteps->getResult(0), + layout.dropSgLayoutAndData()); newOps.push_back(finalSteps); } @@ -1107,7 +1109,7 @@ struct WgToSgVectorShapeCastOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1172,8 +1174,8 @@ struct WgToSgVectorShapeCastOp for (auto src : adaptor.getSource()) { auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), newResultType, src); - xegpu::setDistributeLayoutAttr(newShapeCast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newShapeCast->getResult(0), + layout.dropSgLayoutAndData()); newShapeCastOps.push_back(newShapeCast.getResult()); } @@ -1201,7 +1203,7 @@ struct WgToSgMultiDimReductionOp auto srcShape = srcType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1234,8 +1236,8 @@ struct WgToSgMultiDimReductionOp auto newOp = vector::MultiDimReductionOp::create( rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], op.getReductionDims()); - xegpu::setDistributeLayoutAttr(newOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newOp->getResult(0), + layout.dropSgLayoutAndData()); 
newReductions.push_back(newOp.getResult()); } @@ -1256,10 +1258,10 @@ struct WgToSgVectorTransposeOp ArrayRef wgShape = resultType.getShape(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); - + // TODO-LayoutRefactor: handle the case using getTemporaryLayout xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(op.getVector()); if (!sourceLayout || !sourceLayout.isForWorkgroup()) @@ -1298,8 +1300,8 @@ struct WgToSgVectorTransposeOp for (auto src : adaptor.getVector()) { auto newTranspose = vector::TransposeOp::create( rewriter, op.getLoc(), newResultType, src, permutation); - xegpu::setDistributeLayoutAttr(newTranspose->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); newTransposeOps.push_back(newTranspose.getResult()); } @@ -1318,7 +1320,7 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { typename OpConversionPattern::OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -1368,8 +1370,8 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { auto newCreateMaskOp = vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands); - xegpu::setDistributeLayoutAttr(newCreateMaskOp->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setTemporaryLayout(newCreateMaskOp->getResult(0), + layout.dropSgLayoutAndData()); newCreateMaskOps.push_back(newCreateMaskOp.getResult()); } @@ -1409,6 +1411,14 @@ struct XeGPUWgToSgDistributePass } // namespace void XeGPUWgToSgDistributePass::runOnOperation() { + + // TODO-LayoutRefactor: unify the local propagation for layout preprocessing + // Operation *op = getOperation(); + // if (!xegpu::recoverTemporaryLayouts(op)) { + // signalPassFailure(); + // return; + // } + // Track existing UnrealizedConversionCastOps SmallVector existingCastOps; getOperation()->walk([&](UnrealizedConversionCastOp castOp) { @@ -1499,7 +1509,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { }); target.addDynamicallyLegalOp([=](xegpu::DpasOp op) -> bool { - auto layout = xegpu::getDistributeLayoutAttr(op.getResult()); + auto layout = op.getLayoutCdAttr(); return isLegal(layout); }); @@ -1519,7 +1529,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() { if (!vecType) return true; - auto layout = xegpu::getDistributeLayoutAttr(op.getResult()); + auto layout = + xegpu::getTemporaryLayout(dyn_cast(op.getResult())); return isLegal(layout); }); @@ -1529,19 +1540,20 @@ void XeGPUWgToSgDistributePass::runOnOperation() { vector::ConstantMaskOp, vector::CreateMaskOp>( [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. 
- auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); + auto layout = + xegpu::getTemporaryLayout(dyn_cast(op->getResult(0))); return isLegal(layout); }); target.addDynamicallyLegalOp( [=](xegpu::LoadGatherOp op) -> bool { - auto layout = xegpu::getDistributeLayoutAttr(op.getResult()); + auto layout = op.getLayoutAttr(); return isLegal(layout); }); target.addDynamicallyLegalOp( [=](xegpu::StoreScatterOp op) -> bool { - auto layout = xegpu::getDistributeLayoutAttr(op.getOperand(0)); + auto layout = op.getLayoutAttr(); return isLegal(layout); }); @@ -1571,7 +1583,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { } xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op->getResult(0)); + xegpu::getTemporaryLayout(op->getResult(0)); return isLegal(layout); }); @@ -1596,7 +1608,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { // Layout propagation pass will activated. getOperation()->walk([](Operation *op) { for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getLayoutName(result); + std::string name = xegpu::getTemporaryLayoutName(result); if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 9f126fe8c2415..d3906e37ffbf1 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -101,13 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } -std::string xegpu::getLayoutName(const OpOperand &operand) { +std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) { const StringRef prefix("layout_operand_"); unsigned idx = const_cast(operand).getOperandNumber(); return llvm::formatv("{0}{1}", prefix, idx).str(); } -std::string xegpu::getLayoutName(const OpResult result) { +std::string xegpu::getTemporaryLayoutName(const OpResult result) { const StringRef prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str(); } @@ -124,29 +124,17 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); - // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr - if (auto convertOp = dyn_cast(defOp)) - return convertOp.getTargetLayoutAttr(); - - // for LoadNdOp, the layout is stored in the tensor descriptor - if (auto loadNd = dyn_cast(defOp)) - return getDistributeLayoutAttr(loadNd.getTensorDesc()); - - // for LoadMatrixOp, the layout is attached to the property of the op - if (auto loadOp = dyn_cast(defOp)) - return loadOp.getLayoutAttr(); - - // for StoreMatrixOp, the layout is attached to the property of the op - if (auto storeOp = dyn_cast(defOp)) - return storeOp.getLayoutAttr(); - std::string layoutName = getLayoutName(result); - if (defOp->hasAttr(layoutName)) - return defOp->getAttrOfType(layoutName); - - // check for "permament" layout only after "temporary" layout name lookup - // for backward compatibility - if (auto loadGatherOp = dyn_cast(defOp)) - return loadGatherOp.getLayoutAttr(); + if (auto anchorOp = dyn_cast(defOp)) { + auto layout = anchorOp.getAnchorLayout(); + return layout; + } + + std::string layoutName = getTemporaryLayoutName(result); + if (defOp->hasAttr(layoutName)) { + auto layout = + defOp->getAttrOfType(layoutName); + return layout; + } } if (auto arg = dyn_cast(value)) { @@ 
-160,27 +148,46 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return nullptr; } - xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); + unsigned idx = const_cast(opr).getOperandNumber(); + + if (auto anchorOp = dyn_cast(op)) { + if (auto dpasOp = dyn_cast(op)) { + if (idx == 0) { + return dpasOp.getLayoutAAttr(); + } else if (idx == 1) { + return dpasOp.getLayoutBAttr(); + } else if (idx == 2) { + return dpasOp.getLayoutCdAttr(); + } + } + if (auto convertOp = dyn_cast(op)) { + return convertOp.getInputLayoutAttr(); + } + auto layout = anchorOp.getAnchorLayout(); - if (auto loadOp = dyn_cast(op)) - return loadOp.getLayoutAttr(); - - if (auto storeOp = dyn_cast(op)) - return storeOp.getLayoutAttr(); - - std::string layoutName = xegpu::getLayoutName(opr); - if (op->hasAttr(layoutName)) - return op->getAttrOfType(layoutName); + if (idx == 0) + return layout; - // check for "permament" layout only after "temporary" layout name lookup - if (auto storeScatterOp = dyn_cast(op)) - if (auto layout = storeScatterOp.getLayoutAttr()) + // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp), + // the layout is valid for the first two operands: value and memref/tdesc. + // For other operations, the layout applies to the first operand only. + if (isa( + op) && + (idx < 2)) return layout; + } - return getDistributeLayoutAttr(opr.get()); + std::string layoutName = xegpu::getTemporaryLayoutName(opr); + if (op->hasAttr(layoutName)) { + auto layout = op->getAttrOfType(layoutName); + return layout; + } + + auto layout = getDistributeLayoutAttr(opr.get()); + return layout; } // Returns the permanent layout attribute for the given result if it's @@ -218,55 +225,157 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, return candidate; } -template -void xegpu::setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout, - bool respectPermLayout) { - Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getLayoutName(operandOrResult); +// TODO-LayoutRefactor: Remove this function after replacing use +// with setTemporaryLayout or setAnchorLayout +void xegpu::setDistributeLayoutAttr( + const mlir::OpResult &result, + const mlir::xegpu::DistributeLayoutAttr layout) { + Operation *owner = result.getOwner(); - if (owner->hasAttrOfType(name)) + if (auto anchorOp = dyn_cast(owner)) { + if (anchorOp.getAnchorLayout() == layout) + return; + anchorOp.setAnchorLayout(layout); return; + } - DistributeLayoutAttr candidate = layout; - if (respectPermLayout) - candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name); + std::string name = xegpu::getTemporaryLayoutName(result); + if (owner->hasAttrOfType(name)) { + return; + } + if (layout) { + owner->setAttr(name, layout); + } +} - if (candidate) - owner->setAttr(name, candidate); +// TODO-LayoutRefactor: Remove this function after replacing use +// with setTemporaryLayout or setAnchorLayout +void xegpu::setDistributeLayoutAttr(const OpOperand &operand, + const DistributeLayoutAttr layout) { + Operation *owner = operand.getOwner(); + unsigned idx = const_cast(operand).getOperandNumber(); + + if (!layout) { + return; + } + if (auto anchorOp = dyn_cast(owner)) { + if (auto dpasOp = dyn_cast(owner)) { + if (idx == 0) { + return dpasOp.setLayoutAAttr(layout); + } else if (idx == 1) { + return dpasOp.setLayoutBAttr(layout); + } else if (idx == 2) { + return dpasOp.setLayoutCdAttr(layout); + } + } + 
if (auto convertOp = dyn_cast(owner)) { + return convertOp.setInputLayoutAttr(layout); + } + + // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp), + // the layout is valid for the first two operands: value and memref/tdesc. + // For other operations, the layout applies to the first operand only. + if (isa( + owner)) { + if (idx < 2) { + anchorOp.setAnchorLayout(layout); + } + } else { + if (idx == 0) { + anchorOp.setAnchorLayout(layout); + } + } + } + + std::string name = xegpu::getTemporaryLayoutName(operand); + if (owner->hasAttrOfType(name)) { + return; + } + if (layout) { + owner->setAttr(name, layout); + } } -// Explicit instantiation for OpResult -template void xegpu::setDistributeLayoutAttr( +template +xegpu::DistributeLayoutAttr +xegpu::getTemporaryLayout(const T &operandOrResult) { + Operation *op = operandOrResult.getOwner(); + + std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult); + if (op->hasAttr(layoutName)) { + auto layout = op->getAttrOfType(layoutName); + return layout; + } + + return nullptr; +} + +template xegpu::DistributeLayoutAttr +xegpu::getTemporaryLayout(const OpResult &result); +template xegpu::DistributeLayoutAttr +xegpu::getTemporaryLayout(const OpOperand &operand); + +template +void xegpu::setTemporaryLayout(const T &operandOrResult, + const xegpu::DistributeLayoutAttr layout) { + Operation *owner = operandOrResult.getOwner(); + std::string name = xegpu::getTemporaryLayoutName(operandOrResult); + if (owner->hasAttrOfType(name)) { + return; + } + if (layout) { + owner->setAttr(name, layout); + } +} + +template void xegpu::setTemporaryLayout( const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); + const mlir::xegpu::DistributeLayoutAttr layout); -// Explicit instantiation for OpOperand -template void xegpu::setDistributeLayoutAttr( +template void xegpu::setTemporaryLayout( const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); + const mlir::xegpu::DistributeLayoutAttr layout); -void xegpu::setDistributeLayoutAttrs( - Operation *op, function_ref getLayoutImpl) { +void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) { op->walk([&](Operation *nestOp) { - if (isa(nestOp)) - return; - for (OpOperand &opr : nestOp->getOpOperands()) { - auto layout = getLayoutImpl(opr.get()); + auto layout = getDistributeLayoutAttr(opr.get()); setDistributeLayoutAttr(opr, layout); } + for (OpResult result : nestOp->getOpResults()) { - auto layout = getLayoutImpl(result); + auto layout = getDistributeLayoutAttr(result); setDistributeLayoutAttr(result, layout); } }); } +/// Attach layout attributes to all vector-type operands of operations within +/// the given operation's region. Reports an error if any vector operand lacks +/// a layout attribute. +bool xegpu::recoverTemporaryLayouts(Operation *rootOp) { + auto result = rootOp->walk([&](Operation *op) { + for (OpOperand &operand : op->getOpOperands()) { + // Layouts are needed for vector type only. 
+ if (!isa(operand.get().getType())) + continue; + auto layout = xegpu::getDistributeLayoutAttr(operand.get()); + if (!layout) { + op->emitError("Could not find layout attribute for operand ") + << operand.getOperandNumber() << " of operation " << op->getName(); + return WalkResult::interrupt(); + } + xegpu::setDistributeLayoutAttr(operand, layout); + } + return WalkResult::advance(); + }); + return !result.wasInterrupted(); +} + template void xegpu::removeLayoutAttr(const T &operandOrResult) { Operation *owner = operandOrResult.getOwner(); - std::string name = xegpu::getLayoutName(operandOrResult); + std::string name = xegpu::getTemporaryLayoutName(operandOrResult); if (owner->hasAttrOfType(name)) owner->removeAttr(name); } @@ -285,6 +394,14 @@ void xegpu::removeLayoutAttrs(Operation *op) { removeLayoutAttr(opr); for (OpResult result : nestOp->getOpResults()) removeLayoutAttr(result); + if (op->hasAttrOfType("layout")) + op->removeAttr("layout"); + if (op->hasAttrOfType("layout_a")) + op->removeAttr("layout_a"); + if (op->hasAttrOfType("layout_b")) + op->removeAttr("layout_b"); + if (op->hasAttrOfType("layout_cd")) + op->removeAttr("layout_cd"); }); } diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index 24a0de6ed48a5..c748c1ca5ef88 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -10,7 +10,7 @@ // CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64 // CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} +// CHECK-SAME: {layout = #xegpu.layout} // CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> @@ -38,7 +38,7 @@ gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64 // CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} +// CHECK-SAME: {layout = #xegpu.layout} // CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK: %[[T3:.*]] = vector.bitcast %[[T2]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x32xi8> @@ -73,7 +73,7 @@ gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8 // CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { // CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index // CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> @@ -115,8 +115,8 @@ gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16 // CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { // 
CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index -// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< -// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] <{layout = #xegpu.layout< +// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>}> : // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> @@ -159,13 +159,13 @@ gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %ar // CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout> // CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { // CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index -// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] // CHECK-SAME: {layout_result_0 = #xegpu.layout, offsets = [0, 0], strides = [1, 1]} // CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> // CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index -// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]] // CHECK-SAME: {layout_result_0 = #xegpu.layout, offsets = [0, 8], strides = [1, 1]} @@ -225,12 +225,12 @@ gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2 // CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout> // CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { // CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index -// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> // CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index -// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout} +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> @@ -244,12 +244,12 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg %c32 = arith.constant 32 : index %c256 = arith.constant 256 : index %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> - %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %1 = 
xegpu.load_nd %0[%c0, %c0] { layout = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr> %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { - %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } + %6 = xegpu.load_nd %3[%c0, %arg3] { layout = #b } : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> %20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> @@ -265,10 +265,10 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> - %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index 32fb3178a8af2..5f70831f45e97 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -8,7 +8,7 @@ // CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> // CHECK: xegpu.prefetch_nd %[[TDESC_SRC]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -> vector<8x32xf32> // CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, 
#xegpu.layout> gpu.module @test { @@ -32,11 +32,11 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout // CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -73,7 +73,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> : //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> @@ -112,7 +112,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : + //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> : //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> @@ -141,7 +141,7 @@ gpu.module @test { // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout}> -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> // CHECK: 
xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 48e77d867508b..b88d8e1a78a26 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -6,11 +6,11 @@ gpu.module @test { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -32,7 +32,7 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { -// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} +// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index @@ -47,7 +47,7 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ 
-109,7 +109,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -136,7 +136,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> @@ -185,7 +185,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout}> -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> // CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> @@ -204,7 +204,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> // CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> @@ -221,7 +221,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: 
%[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> // CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> @@ -241,7 +241,7 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> // CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME <{layout = #xegpu.layout}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> @@ -257,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( -// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16> -// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout> -> vector<16x16xi16> // CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x16xi16> to vector<8x16xf16> @@ -282,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<16x8xi32> to vector<16x16xf16> @@ -303,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> @@ -340,9 +340,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: 
!xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { @@ -363,7 +363,7 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> -// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { @@ -386,11 +386,11 @@ gpu.module @test { // CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> // CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout, layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset 
%[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -426,11 +426,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -456,11 +456,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} @@ -600,7 +600,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> @@ -622,7 +622,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> 
vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] @@ -645,7 +645,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index 7819a438057c4..b136c89925682 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -140,10 +140,9 @@ gpu.func @dpas(%laneid: index) { %2 = "some_op"() : () -> vector<8x16xf32> %3 = xegpu.dpas %0, %1, %2 { - layout_operand_0 = #xegpu.layout, - layout_operand_1 = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_result_0 = #xegpu.layout + layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout } : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> gpu.yield %3 : vector<8x16xf32> diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index e5e3d2a1c1ad5..87c67ba6bf324 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -20,18 +20,20 @@ gpu.module @xevm_module{ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> %1 = xegpu.load_nd %0[%c0, %c0] - {layout_result_0 = #xegpu.layout} : + {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %3 = xegpu.load_nd %2[%c0, %c0] - {layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %4 = xegpu.dpas %1, %3 - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %5 = math.exp %4 @@ -84,7 +86,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> %3 = xegpu.load_nd %2[%0, %1] - {layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { @@ -95,14 +97,16 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> %7 = xegpu.load_nd %5[%0, %arg3] - {layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> %8 = xegpu.load_nd %6[%arg3, %1] - 
{layout_result_0 = #xegpu.layout} + {layout = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> %9 = xegpu.dpas %7, %8, %arg4 - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> scf.yield %9 : vector<8x16xf32> @@ -137,7 +141,7 @@ gpu.module @xevm_module{ %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> %loaded = scf.if %pred -> (vector<16x8xf16>) { %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout + layout = #xegpu.layout } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> scf.yield %3 : vector<16x8xf16> } else { @@ -169,7 +173,7 @@ gpu.module @xevm_module{ %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> scf.if %pred { %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout + layout = #xegpu.layout } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> } @@ -194,17 +198,20 @@ gpu.module @xevm_module{ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} + %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} + %3 = xegpu.load_nd %2[%c0, %c0] {layout = #xegpu.layout} : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> - %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} + %6 = xegpu.dpas %1, %5 + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -341,7 +348,7 @@ gpu.module @xevm_module{ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %1 = vector.multi_reduction , %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16> %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x16xf16> @@ -356,7 +363,7 @@ gpu.module @xevm_module{ gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: 
vector<16xi1> - %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16> + %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16> %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x1xf16> %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout} : vector<16x1xf16> to vector<16x16xf16> diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir index 561034fb5880b..13ed24ebf0a3a 100644 --- a/mlir/test/Dialect/XeGPU/transform-ops.mlir +++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir @@ -149,7 +149,7 @@ func.func @set_op_layout_attr_result_default_index(%arg0: memref<4096x4096xf16>, %4 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16> %5 = xegpu.load_nd %4[0, 0] : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16> // CHECK: = xegpu.dpas - // CHECK-SAME: {layout_result_0 = #xegpu.layout} + // CHECK-SAME: {layout_cd = #xegpu.layout} %6 = xegpu.dpas %1, %3, %5 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16> return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index d61908b422194..9f9edcd416ddf 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -15,7 +15,7 @@ gpu.module @test_kernel { %n = arith.muli %block_id_y, %c32 : index %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> @@ -23,11 +23,11 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> + %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + //CHECK-COUNT-8: xegpu.dpas {{.*}} + %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ 
-36,7 +36,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> } //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> + xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return } } @@ -56,7 +56,7 @@ gpu.module @test_kernel { %n = arith.muli %block_id_y, %c32 : index %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2> @@ -64,11 +64,11 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> + %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16> + //CHECK-COUNT-8: xegpu.dpas {{.*}} + %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16> @@ -77,7 +77,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32> } //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1> + xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1> gpu.return } } @@ -100,7 +100,7 @@ gpu.module @test_kernel { %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2> @@ -108,10 
+108,10 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) { //CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> + %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16> - %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32> + %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16> + %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32> //CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1> //CHECK-COUNT-2: xegpu.update_nd_offset {{.*}} [%c32, %c0] : !xegpu.tensor_desc<16x16xf16> @@ -120,7 +120,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32> } //CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %out#2, %c_tdesc: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1> + xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1> gpu.return } } @@ -141,7 +141,7 @@ gpu.module @test_kernel { %n = arith.muli %block_id_y, %c32 : index %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> - %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> + %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> @@ -149,13 +149,13 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16> %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %c = xegpu.dpas %e, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> + //CHECK-COUNT-8: xegpu.dpas {{.*}} + %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: 
vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a> //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -164,7 +164,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> } //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> + xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return } } @@ -188,14 +188,14 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) { //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> + %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> //CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16> %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16> //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> - xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l> + xegpu.store_nd %c, %arg2 {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l> //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l> @@ -227,14 +227,14 @@ gpu.module @test_kernel { iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) { //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> + %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> + %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> //CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16> %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16> //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16> - xegpu.store_nd %c, %arg2: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l> + xegpu.store_nd %c, %arg2 {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l> //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l> @@ -257,12 +257,12 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c64 : index %0 = xegpu.create_nd_tdesc %a[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32> + %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<16x64xf32, 
#l> -> vector<16x64xf32> // CHECK: vector.multi_reduction , {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32> // CHECK-COUNT-3: vector.multi_reduction , {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32> %2 = vector.multi_reduction , %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32> %3 = xegpu.create_nd_tdesc %b[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> - xegpu.store_nd %2, %3: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r> + xegpu.store_nd %2, %3 {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r> gpu.return } } @@ -282,14 +282,14 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c32 : index %n = arith.muli %block_id_y, %c32 : index %0 = xegpu.create_nd_tdesc %a[%m, %n] : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32> + %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32> // CHECK: vector.multi_reduction , {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32> // CHECK-COUNT-1: vector.multi_reduction , {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32> %2 = vector.multi_reduction , %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32> %3 = xegpu.create_nd_tdesc %b[%n] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> - xegpu.store_nd %2, %3: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r> + xegpu.store_nd %2, %3 {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r> gpu.return } } @@ -304,11 +304,11 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c64 : index %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32> + %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32> // CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32> %2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32> %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> - xegpu.store_nd %2, %3: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l> + xegpu.store_nd %2, %3 {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l> gpu.return } } @@ -323,7 +323,7 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c32 : index %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32> + %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32> %11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32> // CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32> %2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32> @@ -343,24 +343,25 @@ gpu.module @test_kernel { %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c32 : index %0 = xegpu.create_nd_tdesc %a[%m, 0] : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l> - %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32> + %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32> // CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32> %2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32> %3 
= xegpu.create_nd_tdesc %b[0, %m] : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t> - xegpu.store_nd %2, %3: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t> + xegpu.store_nd %2, %3 {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t> gpu.return } } // ----- + gpu.module @test_kernel { // CHECK-LABEL: test_prefetch_load_store_update // CHECK-SAME: [[arg0:%.+]]: ui64 // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK-COUNT-2: xegpu.prefetch {{.*}} // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> - // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> - // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> + // CHECK-COUNT-2: xegpu.load {{.*}} + // CHECK-COUNT-2: xegpu.store {{.*}} gpu.func @test_prefetch_load_store_update(%src: ui64) { @@ -372,7 +373,7 @@ gpu.module @test_kernel { ]> : vector<32xindex> %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + xegpu.prefetch %tdesc {layout = #xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> %delta = arith.constant dense<[ 32, 32, 32, 32, 32, 32, 32, 32, @@ -386,10 +387,10 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> + %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32> - xegpu.store %st_vec, %tdesc, %mask: + xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> @@ -400,15 +401,14 @@ gpu.module @test_kernel { } // ----- - gpu.module @test_kernel { // CHECK-LABEL: test_prefetch_load_store_update_chunk // CHECK-SAME: [[arg0:%.+]]: ui64 // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK-COUNT-4: xegpu.prefetch {{.*}} <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> - // CHECK-COUNT-4: xegpu.load {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> - // CHECK-COUNT-4: xegpu.store {{.*}} : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> + // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint, 
l2_hint = #xegpu.cache_hint}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) { @@ -420,7 +420,7 @@ gpu.module @test_kernel { ]> : vector<32xindex> %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + xegpu.prefetch %tdesc {layout = #xegpu.layout}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> %delta = arith.constant dense<[ 32, 32, 32, 32, 32, 32, 32, 32, @@ -434,10 +434,10 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<32x4xf32> + %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<32x4xf32> %st_vec = arith.addf %ld_vec, %ld_vec : vector<32x4xf32> - xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: + xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: vector<32x4xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> @@ -476,7 +476,7 @@ gpu.module @test_kernel { ]> : vector<4x8xindex> %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> - xegpu.prefetch %tdesc: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> + xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> %delta = arith.constant dense<[ [32, 32, 32, 32, 32, 32, 32, 32], @@ -490,10 +490,10 @@ gpu.module @test_kernel { %c4 = arith.constant 4: index %mask = vector.create_mask %c4, %c4: vector<4x8xi1> - %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> -> vector<4x8x4xf32> + %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> -> vector<4x8x4xf32> %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #l} : vector<4x8x4xf32> - xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: + xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #l}>: vector<4x8x4xf32>, !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> @@ -513,13 +513,13 @@ gpu.module @test_kernel { //CHECK: [[c0:%.+]] = arith.constant 0 : index //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - //CHECK: [[load_b:%.+]] = 
xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf16> //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> - //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] + //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -529,12 +529,12 @@ gpu.module @test_kernel { %c0 = arith.constant 0 : index %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> - %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> - %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16> - %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + %c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c> - xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> + xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> gpu.return } } @@ -599,7 +599,7 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> gpu.return %ld : vector<32xf32> } @@ -621,10 +621,7 @@ gpu.module @test_kernel { %mask = vector.create_mask %c17: 
vector<32xi1> %st_vec = arith.constant dense<1023.0>: vector<32xf32> - xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_operand_3 = #xegpu.layout, - l1_hint = #xegpu.cache_hint} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> gpu.return } @@ -649,7 +646,7 @@ gpu.module @test_kernel { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> gpu.return %ld : vector<32x4xf32> } } @@ -675,10 +672,7 @@ gpu.module @test_kernel { %mask = vector.create_mask %c17: vector<32xi1> %st_vec = arith.constant dense<1023.>: vector<32x4xf32> - xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout_operand_0 = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_operand_3 = #xegpu.layout, - l1_hint = #xegpu.cache_hint} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1> gpu.return } } @@ -704,7 +698,7 @@ gpu.module @test_kernel { ]]> : vector<1x1x32xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<1x1x32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> gpu.return %ld : vector<1x1x32xf32> } @@ -752,11 +746,11 @@ gpu.module @test_kernel { %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l> %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l> - %a = xegpu.load_nd %a_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> - %b = xegpu.load_nd %b_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> + %a = xegpu.load_nd %a_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> + %b = xegpu.load_nd %b_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32> %result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32> - xegpu.store_nd %result, %c_tdesc[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l> + xegpu.store_nd %result, %c_tdesc[%c0, %c0] {layout = #l}: vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l> gpu.return } } @@ -778,10 +772,10 @@ gpu.module @test_kernel { 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]] ]> : vector<1x1x32xindex> %mask = arith.constant {layout_result_0 = #inst_data} dense : vector<1x1x32xi1> - %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> - %b = xegpu.load %B[%cst], %mask {chunk_size = 
1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> %addf = arith.addf %a, %b {layout_result_0 = #inst_data} : vector<1x1x32xf32> - xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout_operand_0 = #inst_data, layout_operand_2 = #inst_data, layout_operand_3 = #inst_data, l1_hint = #xegpu.cache_hint} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1> + xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1> gpu.return } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir index dbc52b8a98894..c3be138fef38a 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir @@ -226,7 +226,7 @@ gpu.module @test { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> gpu.return %ld : vector<32xf32> } @@ -381,7 +381,7 @@ gpu.module @test { %c17 = arith.constant 17: index %mask = vector.create_mask %c17: vector<32xi1> - %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> gpu.return %ld : vector<32x4xf32> } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 9580769d37313..6e9711442b92d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -6,7 +6,7 @@ gpu.module @test_elementwise_ops { gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: math.exp {{.*}} : vector<12x8xf32> @@ -24,7 +24,7 @@ gpu.module @test_elementwise_ops { gpu.func @unary_ops(%a: memref<24x32xf32>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x8xf32> @@ -44,10 +44,10 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : 
memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} @@ -71,13 +71,13 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1> -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_c = xegpu.load_nd %tdesc_c + %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi1, #xegpu.layout> -> vector<24x32xi1> // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} @@ -99,10 +99,10 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> // CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout} @@ -128,16 +128,16 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> %tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_c = xegpu.load_nd %tdesc_c + %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> - %load_d = xegpu.load_nd %tdesc_d + %load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} @@ -160,10 +160,10 @@ gpu.module @test_elementwise_ops { -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout} : vector<2x2xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index 
4829af3612de3..6b8b4f282b744 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -41,7 +41,7 @@ gpu.module @test_round_robin_assignment { -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> gpu.return @@ -54,8 +54,7 @@ gpu.module @test_round_robin_assignment { -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> // CHECK-NOT: xegpu.store_nd - %load = xegpu.load_nd %tdesc - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> xegpu.store_nd %load, %tdesc : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> @@ -79,20 +78,20 @@ gpu.module @test_round_robin_assignment { gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a - : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> -> vector<256x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x256xf16> -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b - : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> -> vector<128x256xf16> - %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + %dpas = xegpu.dpas %load_a, %load_b + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } @@ -114,7 +113,7 @@ gpu.module @test_round_robin_assignment { gpu.func @broadcast(%src: memref<128x1xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<128x1xf32> -> !xegpu.tensor_desc<128x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<128x1xf32, #xegpu.layout> -> vector<128x1xf32> // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} : vector<16x1xf32> to vector<16x32xf32> @@ -137,7 +136,7 @@ gpu.module @test_round_robin_assignment { // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) %2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 
iter_args(%arg3 = %0, %arg4 = %1) -> (!xegpu.tensor_desc<256xf32, #xegpu.layout>, !xegpu.tensor_desc<256xf32, #xegpu.layout>) { - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %3 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> xegpu.store_nd %3, %arg3 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> %5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> @@ -153,7 +152,7 @@ gpu.module @test_round_robin_assignment { %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { @@ -166,7 +165,7 @@ gpu.module @test_round_robin_assignment { xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = arith.addi %arg3, %c1_i32 : i32 %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %6 = xegpu.load_nd %5 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> scf.yield %6, %4 : vector<256xf32>, i32 } gpu.return @@ -181,12 +180,12 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: scf.if // CHECK-SAME: (vector<16xf32>, vector<16xf32>) %4 = scf.if %3 -> (vector<256xf32>) { - %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %5 = xegpu.load_nd %1 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } else { - %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %5 = xegpu.load_nd %2 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> @@ -200,7 +199,7 @@ gpu.module @test_round_robin_assignment { %id = gpu.subgroup_id : index %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %d = xegpu.load_nd %t {layout = #xegpu.layout}: !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if @@ -224,7 +223,7 @@ gpu.module @test_round_robin_assignment { %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout> // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf32> - %1 = xegpu.load_nd %0 : 
!xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32> + %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32> %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x64xf32> gpu.return diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 37a76f316e75a..ad346307437e4 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -14,11 +14,11 @@ gpu.module @test_distribution { // CHECK-LABEL: load_nd_tdesc_with_offset gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> gpu.return @@ -30,7 +30,7 @@ gpu.module @test_distribution { // CHECK-NOT: xegpu.store_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> xegpu.store_nd %load, %tdesc[0, 0] @@ -53,23 +53,25 @@ gpu.module @test_distribution { // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> -> vector<256x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16> -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> - %load_b = xegpu.load_nd 
%tdesc_b[0, 0] + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> -> vector<128x256xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } @@ -80,7 +82,7 @@ gpu.module @test_distribution { %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> -> vector<256x64xf32> // CHECK-COUNT-2: vector.multi_reduction , {{.*}}, %[[CST]] [1] : vector<16x64xf32> to vector<16xf32> @@ -119,7 +121,7 @@ gpu.module @test_distribution { gpu.func @vector_transpose(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<32x16xf32> to vector<16x32xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 69eb8ce9dfba5..da6ad976d3730 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -38,10 +38,10 @@ gpu.module @test_distribution { //CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[L_OFF_Y]], %[[C256]] : index //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index //CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index - //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> + //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> gpu.return @@ -53,7 +53,7 @@ gpu.module @test_distribution { //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> xegpu.store_nd %load, %tdesc[0, 0] @@ -75,42 +75,49 @@ gpu.module @test_distribution { // CHECK-LABEL: dpas gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> 
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } // CHECK-LABEL: dpas_no_sg_data gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout } : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -119,7 +126,9 @@ gpu.module @test_distribution { gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { // CHECK-NOT: vector<32x32xf32> %dpas = xegpu.dpas %a, %b - {layout = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> gpu.return } @@ -129,7 +138,7 @@ gpu.module @test_distribution { gpu.func @broadcast_dim1(%src: memref<256x1xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32> -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x1xf32, #xegpu.layout> -> vector<256x1xf32> // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} @@ -145,7 +154,7 @@ gpu.module @test_distribution { gpu.func @broadcast_dim0(%src: memref<1x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32> -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<1x128xf32, #xegpu.layout> -> vector<1x128xf32> // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} @@ -175,9 +184,9 @@ gpu.module @test_distribution { %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %4 = xegpu.create_nd_tdesc %arg1 : 
memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> // load_nd with offset - %5 = xegpu.load_nd %2[%0, %1] : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> - %6 = xegpu.load_nd %3[%0, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %7 = xegpu.load_nd %4[%c0, %1] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> + %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> // scf.for loop // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> @@ -189,10 +198,13 @@ gpu.module @test_distribution { %8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5) -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { // load_nd with offset inside loop - %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_result_0 = #xegpu.layout} - : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> - %10 = xegpu.load_nd %3[%arg3, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %11 = xegpu.load_nd %4[%c0, %arg3] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %9 = xegpu.dpas %arg4, %arg5, %arg6 + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> + %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> } // store_nd with offset @@ -215,7 +227,7 @@ gpu.module @test_distribution { // CHECK-NOT: index.sub %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> } {sg_id_range = #xegpu.range<[0, 32]>} @@ -228,7 +240,7 @@ gpu.module @test_distribution { // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] %tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> -> vector<128x64xf32> %exp = math.exp %load {layout_result_0 = #xegpu.layout} : vector<128x64xf32> @@ -244,7 +256,7 @@ gpu.module @test_distribution { %c32 = arith.constant 32 : index %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> %cond1 = arith.cmpi sge, %sg_id, %c3 : index @@ -257,7 +269,7 @@ gpu.module @test_distribution { // CHECK: %[[SUB:.*]] = 
index.sub %{{.*}}, %[[C3]] %td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %ld = xegpu.load_nd %td[0, 0] + %ld = xegpu.load_nd %td[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> -> vector<128x64xf32> %exp = math.exp %ld {layout_result_0 = #xegpu.layout} : vector<128x64xf32> @@ -275,7 +287,7 @@ gpu.module @test_distribution { // CHECK-SAME: : memref, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256x16xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256x16xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} + %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : memref, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16> gpu.return } @@ -287,13 +299,11 @@ gpu.module @test_distribution { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8xi1> // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint, layout = #xegpu.layout}> - // CHECK-SAME: {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout, - // CHECK-SAME: layout_operand_3 = #xegpu.layout} - // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> + // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> %val = arith.constant {layout_result_0 = #xegpu.layout} dense<25.5> : vector<256xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> - xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout, + xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, layout_operand_2 = #xegpu.layout, layout_operand_3 = #xegpu.layout, l1_hint = #xegpu.cache_hint} @@ -310,7 +320,7 @@ gpu.module @test_distribution { // CHECK-SAME: : memref, vector<8xindex>, vector<8xi1> -> vector<8x4xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} + %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : memref, vector<256xindex>, vector<256xi1> -> vector<256x4xf16> gpu.return } @@ -370,7 +380,7 @@ gpu.module @test_distribution { %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<1.0> : vector<128xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32> -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<4x128xf32, #xegpu.layout> -> vector<4x128xf32> // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32> @@ -384,7 +394,7 @@ gpu.module @test_distribution { %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> %tdesc = xegpu.create_nd_tdesc %src : 
memref<256x64xf32> -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> -> vector<256x64xf32> // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32> @@ -398,7 +408,7 @@ gpu.module @test_distribution { %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} dense<0.0> : vector<4x2x6xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<4x2x6x32xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<4x2x6x32xi1> - %load = xegpu.load %src[%offset], %mask {layout_result_0 = #xegpu.layout} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16> + %load = xegpu.load %src[%offset], %mask {layout = #xegpu.layout} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16> // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16> %reduce = vector.multi_reduction , %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} [3] : vector<4x2x6x32xf16> to vector<4x2x6xf16> @@ -468,7 +478,7 @@ gpu.module @test_distribution { gpu.func @vector_transpose(%src: memref<256x32xf32>) { %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x32xf32, #xegpu.layout> -> vector<256x32xf32> //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<64x32xf32> to vector<32x64xf32> @@ -624,9 +634,8 @@ gpu.module @test_distribution { %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> - // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> - %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> + %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x32xf32> %4 = vector.broadcast %3 {layout_result_0 = diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index a8015cced7eb4..50081ed34fe78 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -89,17 +89,17 @@ gpu.module @test_1_1_assignment { gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b - : 
!xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -109,20 +109,20 @@ gpu.module @test_1_1_assignment { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_result_0 = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } @@ -145,7 +145,9 @@ gpu.module @test_1_1_assignment { gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { // CHECK-NOT: vector<32x32xf32> %dpas = xegpu.dpas %a, %b - {layout = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> gpu.return } @@ -194,26 +196,28 @@ gpu.module @test_1_1_assignment { %0 = arith.muli %block_id_x, %c128 : index %1 = arith.muli %block_id_y, %c128 : index %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> + %3 = xegpu.load_nd %2 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) -> // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>) - // CHECK: %[[A:.*]]
= xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16> // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16> // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) - -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout>, - !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32>) { - %8 = xegpu.load_nd %arg4 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %9 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %10 = xegpu.dpas %8, %9, %arg6 {layout_result_0 = #xegpu.layout} - : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> + -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32>) { + %8 = xegpu.load_nd %arg4 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %9 = xegpu.load_nd %arg5 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %10 = xegpu.dpas %8, %9, %arg6 + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> %11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, @@ -221,7 +225,7 @@ gpu.module @test_1_1_assignment { } %7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - xegpu.store_nd %6#2, %7 : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + xegpu.store_nd %6#2, %7 {layout = #xegpu.layout } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> gpu.return } @@ -230,7 +234,7 @@ gpu.module @test_1_1_assignment { %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32) @@ -244,7 +248,7 @@ gpu.module @test_1_1_assignment { xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = arith.addi %arg3, %c1_i32 : i32 %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %6 = xegpu.load_nd %5 {layout = 
#xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> scf.yield %6, %4 : vector<256xf32>, i32 } gpu.return @@ -263,14 +267,14 @@ gpu.module @test_1_1_assignment { %5 = scf.if %4 -> (vector<256xf32>) { // CHECK-LABEL: xegpu.load_nd // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %2 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32> scf.yield %2 : vector<256xf32> } else { // CHECK-LABEL: xegpu.load_nd // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %3 = xegpu.load_nd %1 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32> scf.yield %3 : vector<256xf32> diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir index 3f2fff9ab51e9..37f6d33e8ac30 100644 --- a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir +++ b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir @@ -53,34 +53,34 @@ module @gemm attributes {gpu.container_module} { %m = arith.muli %block_id_x, %c256 : index %n = arith.muli %block_id_y, %c256 : index %c_tdesc = xegpu.create_nd_tdesc %C : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #c> - %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32> + %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] {layout = #c}: !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32> %a_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a> %b_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b> // Prefetch A 3 times. %a_prefetch_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> // Prefetch B 3 times. 
%b_prefetch_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> %out = scf.for %k = %c0 to %c256 step %c32 iter_args(%c_value = %c_init_value) -> (vector<256x256xf32>) { - %a_value = xegpu.load_nd %a_tdesc[%m, %k] : !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16> - %b_value = xegpu.load_nd %b_tdesc[%k, %n] : !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16> + %a_value = xegpu.load_nd %a_tdesc[%m, %k] {layout = #a}: !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16> + %b_value = xegpu.load_nd %b_tdesc[%k, %n] {layout = #b}: !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16> // Prefetch next tiles. %prefetch_offset = arith.addi %k, %c96 : index - xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] : !xegpu.tensor_desc<256x32xf16, #a_prefetch> - xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch> - %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_result_0 = #c} + xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch> + xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch> + %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_a = #a, layout_b = #b, layout_cd = #c} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32> scf.yield %c_new_value : vector<256x256xf32> } - xegpu.store_nd %out, %c_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c> + xegpu.store_nd %out, %c_tdesc[%m, %n] {layout = #c}: vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c> gpu.return } } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 93d51441f5b81..c97346ed6f8b5 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -184,7 +184,7 @@ class TestStepOpPattern : public OpConversionPattern { matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto layoutName = xegpu::getLayoutName(op->getResult(0)); + auto layoutName = xegpu::getTemporaryLayoutName(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); if (!sliceAttr || sliceAttr.getRank() != 1) return failure(); @@ -324,7 +324,7 @@ struct TestXeGPULayoutInterface target.addDynamicallyLegalOp( [&](vector::StepOp op) -> bool { - auto layoutName = xegpu::getLayoutName(op->getResult(0)); + auto layoutName = xegpu::getTemporaryLayoutName(op->getResult(0)); auto sliceAttr = op->getAttrOfType(layoutName); return isLegal(sliceAttr); });