@@ -2845,9 +2845,11 @@ void EmitPass::EmitInsertValueToLayoutStruct(InsertValueInst *IVI) {
28452845 }
28462846 } else {
28472847 CVariable *SrcV = GetSymbol(src0);
2848- if (DstV != SrcV && DstV->IsUniform() && SrcV->IsUniform()) {
2848+ Value *DstRoot = m_deSSA->getRootValue(IVI);
2849+ Value *SrcRoot = m_deSSA->getRootValue(src0);
2850+ if (DstRoot != SrcRoot && DstV->IsUniform() && SrcV->IsUniform()) {
28492851 emitCopyToOrFromLayoutStruct(IVI, src0);
2850- } else if (DstV != SrcV ) {
2852+ } else if (DstRoot != SrcRoot ) {
28512853 // Most often, SrcV has just one defined value and calling
28522854 // emitCopyToOrFromLayoutStruct() would copy all, thus special
28532855 // handling here to avoid copy undefined values.
@@ -2869,7 +2871,7 @@ void EmitPass::EmitInsertValueToLayoutStruct(InsertValueInst *IVI) {
28692871 n * (SrcV->IsUniform() ? 1 : nLanes));
28702872 if (II.size() == 2) {
28712873 uint32_t AOSStBytes = (uint32_t)m_DL->getTypeStoreSize(ty0);
2872- emitVectorCopyToAOS (AOSStBytes, eltDst, eltSrc, n);
2874+ emitLayoutStructCopyAOSToAOS (AOSStBytes, eltDst, eltSrc, n);
28732875 } else {
28742876 emitVectorCopy(eltDst, eltSrc, n);
28752877 }
@@ -19289,6 +19291,79 @@ void EmitPass::emitVectorCopyToOrFromAOS(uint32_t AOSBytes, CVariable *Dst, CVar
1928919291 }
1929019292}
1929119293
19294+ // This is to copy an AOS field of a struct to an AOS field of another struct.
19295+ // AOSBytes: the size of AOS struct (its members are laid out in AOS format).
19296+ // Dst: the start of Destination
19297+ // Src: the start of Source
19298+ // nElts: the number of elements to copy
19299+ // DstSubRegOffset : offset from Dst as the beginning location to be copied to
19300+ // SrcSubRegOffset : offset from Src as the beginning location to copy from
19301+ //
19302+ // For example, the following packed struct:
19303+ // __StructSOALayout_ {
19304+ // i32 s0
19305+ // __StructAOSLayout_ {
19306+ // <4xi8> s1;
19307+ // }
19308+ // i32 s2;
19309+ // } dst, src;
19310+ // Assume that the number of lanes is 16. dst(src)'s layout in GRFs:
19311+ // Lane 15 14 13 1 0
19312+ // ----------------------------------
19313+ // r10: s0 s0 s0 ...... s0 s0
19314+ // r11: s1 s1 s1 ...... s1 s1 // <4xi8> s1 is in AOS format
19315+ // r12: s2 s2 s2 ...... s2 s2
19316+ //
19317+ // For the following copy:
19318+ // dst.s1 = src.s1;
19319+ // the arguments are (numLanes = 16)
19320+ // AOSBytes = 4; nElts = 4
19321+ // (Dst, DstSubRegOffset) = (dst, 16*12) or (dst + 16*4, 0)
19322+ // (Src, SrcSubRegOffset) = (src, 16*12) or (src + 16*4, 0)
19323+ // the function generates 4 mov instructions:
19324+ // mov dst(1,0)<4>:b src(1,0)<4,1,0>:b
19325+ // mov dst(1,1)<4>:b src(1,1)<4,1,0>:b
19326+ // mov dst(1,2)<4>:b src(1,2)<4,1,0>:b
19327+ // mov dst(1,3)<4>:b src(1,3)<4,1,0>:b
19328+ //
19329+ // Note: for dst.s2 = src.s2, using emitVectorCopy()
19330+ //
19331+ void EmitPass::emitLayoutStructCopyAOSToAOS(uint32_t AOSBytes, CVariable *Dst, CVariable *Src, uint32_t nElts,
19332+ uint32_t DstSubRegOffset, uint32_t SrcSubRegOffset) {
19333+ assert(Dst->GetType() == Src->GetType());
19334+
19335+ bool srcUniform = Src->IsUniform();
19336+ bool dstUniform = Dst->IsUniform();
19337+
19338+ // Uniform vector copy.
19339+ if (srcUniform && dstUniform) {
19340+ emitUniformVectorCopy(Dst, Src, nElts, DstSubRegOffset, SrcSubRegOffset);
19341+ return;
19342+ }
19343+
19344+ const uint32_t nLanes = numLanes(m_currShader->m_SIMDSize);
19345+ unsigned doff = DstSubRegOffset, soff = SrcSubRegOffset;
19346+ uint32_t eltBytes = Dst->GetElemSize();
19347+ uint32_t stride = AOSBytes / eltBytes;
19348+ IGC_ASSERT(stride <= 4 && stride > 0);
19349+ IGC_ASSERT((AOSBytes % eltBytes) == 0);
19350+
19351+ uint DstVStride = dstUniform ? 1 : stride;
19352+ uint SrcVStride = srcUniform ? 0 : stride;
19353+ for (uint32_t i = 0; i < nElts; ++i) {
19354+ // Copy AOS field to AOS field
19355+ uint SrcSubReg = soff + i;
19356+ uint DstSubReg = doff + i;
19357+
19358+ m_encoder->SetDstRegion(DstVStride);
19359+ m_encoder->SetDstSubReg(DstSubReg);
19360+ m_encoder->SetSrcSubReg(0, SrcSubReg);
19361+ m_encoder->SetSrcRegion(0, SrcVStride, 1, 0);
19362+ m_encoder->Copy(Dst, Src);
19363+ m_encoder->Push();
19364+ }
19365+ }
19366+
1929219367// Push a new frame onto the stack by:
1929319368// Update FP to the current SP
1929419369// Increment SP by pushSize
0 commit comments