Skip to content

Commit 96891b7

Browse files
authored
[X86] EltsFromConsecutiveLoads - attempt to match consecutive truncated loads (#172051)
SelectionDAG::areNonVolatileConsecutiveLoads will only match loads that have a MemoryVT the same size as the stride byte size, which will fail for cases where large loads have been split (typically by shift+truncates) and we're trying to stitch them back together. As a fallback, this patch checks for cases where the candidate element lies a whole number of full-MemoryVT byte strides away from the base load, and a whole number of elements fits in each memory load.
1 parent 249acb6 commit 96891b7

File tree

4 files changed

+37
-325
lines changed

4 files changed

+37
-325
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7448,8 +7448,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
74487448
return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
74497449
Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
74507450
}
7451-
return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7452-
EltIdx - FirstLoadedElt);
7451+
int Stride = EltIdx - FirstLoadedElt;
7452+
if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride))
7453+
return true;
7454+
// Try again using the memory load size (we might have broken a large load
7455+
// into smaller elements), ensure the stride is the full memory load size
7456+
// apart and a whole number of elements fit in each memory load.
7457+
unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
7458+
if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
7459+
(BaseMemSizeInBits % BaseSizeInBits) == 0) {
7460+
unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
7461+
return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8,
7462+
Stride / Scale);
7463+
}
7464+
return false;
74537465
};
74547466

74557467
// Consecutive loads can contain UNDEFS but not ZERO elements.

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 13 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -5002,203 +5002,25 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
50025002
;
50035003
; AVX1-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
50045004
; AVX1: # %bb.0:
5005-
; AVX1-NEXT: pushq %rbx
5006-
; AVX1-NEXT: movq 16(%rdi), %rcx
5007-
; AVX1-NEXT: movq %rcx, %rax
5008-
; AVX1-NEXT: movq %rcx, %r8
5009-
; AVX1-NEXT: movq %rcx, %r9
5010-
; AVX1-NEXT: movq %rcx, %r10
5011-
; AVX1-NEXT: movl %ecx, %r11d
5012-
; AVX1-NEXT: movl %ecx, %ebx
5013-
; AVX1-NEXT: vmovd %ecx, %xmm0
5014-
; AVX1-NEXT: shrl $8, %ecx
5015-
; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
5016-
; AVX1-NEXT: shrl $16, %ebx
5017-
; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
5018-
; AVX1-NEXT: shrl $24, %r11d
5019-
; AVX1-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
5020-
; AVX1-NEXT: shrq $32, %r10
5021-
; AVX1-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
5022-
; AVX1-NEXT: shrq $40, %r9
5023-
; AVX1-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
5024-
; AVX1-NEXT: shrq $48, %r8
5025-
; AVX1-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
5026-
; AVX1-NEXT: movq 24(%rdi), %rcx
5027-
; AVX1-NEXT: shrq $56, %rax
5028-
; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
5029-
; AVX1-NEXT: movl %ecx, %eax
5030-
; AVX1-NEXT: shrl $8, %eax
5031-
; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
5032-
; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
5033-
; AVX1-NEXT: movl %ecx, %eax
5034-
; AVX1-NEXT: shrl $16, %eax
5035-
; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
5036-
; AVX1-NEXT: movl %ecx, %eax
5037-
; AVX1-NEXT: shrl $24, %eax
5038-
; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
5039-
; AVX1-NEXT: movq %rcx, %rax
5040-
; AVX1-NEXT: shrq $32, %rax
5041-
; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
5042-
; AVX1-NEXT: movq %rcx, %rax
5043-
; AVX1-NEXT: shrq $40, %rax
5044-
; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
5045-
; AVX1-NEXT: movq %rcx, %rax
5046-
; AVX1-NEXT: shrq $48, %rax
5047-
; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
5048-
; AVX1-NEXT: movq (%rdi), %rax
5049-
; AVX1-NEXT: shrq $56, %rcx
5050-
; AVX1-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
5051-
; AVX1-NEXT: movl %eax, %ecx
5052-
; AVX1-NEXT: shrl $8, %ecx
5053-
; AVX1-NEXT: vmovd %eax, %xmm1
5054-
; AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
5055-
; AVX1-NEXT: movl %eax, %ecx
5056-
; AVX1-NEXT: shrl $16, %ecx
5057-
; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
5058-
; AVX1-NEXT: movl %eax, %ecx
5059-
; AVX1-NEXT: shrl $24, %ecx
5060-
; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
5061-
; AVX1-NEXT: movq %rax, %rcx
5062-
; AVX1-NEXT: shrq $32, %rcx
5063-
; AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
5064-
; AVX1-NEXT: movq %rax, %rcx
5065-
; AVX1-NEXT: shrq $40, %rcx
5066-
; AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
5067-
; AVX1-NEXT: movq %rax, %rcx
5068-
; AVX1-NEXT: shrq $48, %rcx
5069-
; AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
5070-
; AVX1-NEXT: movq 8(%rdi), %rcx
5071-
; AVX1-NEXT: shrq $56, %rax
5072-
; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
5073-
; AVX1-NEXT: movl %ecx, %eax
5074-
; AVX1-NEXT: shrl $8, %eax
5075-
; AVX1-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
5076-
; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
5077-
; AVX1-NEXT: movl %ecx, %eax
5078-
; AVX1-NEXT: shrl $16, %eax
5079-
; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
5080-
; AVX1-NEXT: movl %ecx, %eax
5081-
; AVX1-NEXT: shrl $24, %eax
5082-
; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
5083-
; AVX1-NEXT: movq %rcx, %rax
5084-
; AVX1-NEXT: shrq $32, %rax
5085-
; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
5086-
; AVX1-NEXT: movq %rcx, %rax
5087-
; AVX1-NEXT: shrq $40, %rax
5088-
; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
5089-
; AVX1-NEXT: movq %rcx, %rax
5090-
; AVX1-NEXT: shrq $48, %rax
5091-
; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
5092-
; AVX1-NEXT: shrq $56, %rcx
5093-
; AVX1-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
5094-
; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2
5095-
; AVX1-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3
5096-
; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
5097-
; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
5098-
; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
5099-
; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
5005+
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
5006+
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
5007+
; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
5008+
; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
5009+
; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
5010+
; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5011+
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
5012+
; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
51005013
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
51015014
; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
5102-
; AVX1-NEXT: popq %rbx
51035015
; AVX1-NEXT: retq
51045016
;
51055017
; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
51065018
; AVX2: # %bb.0:
5107-
; AVX2-NEXT: pushq %rbx
5108-
; AVX2-NEXT: movq 16(%rdi), %rcx
5109-
; AVX2-NEXT: movq %rcx, %rax
5110-
; AVX2-NEXT: movq %rcx, %r8
5111-
; AVX2-NEXT: movq %rcx, %r9
5112-
; AVX2-NEXT: movq %rcx, %r10
5113-
; AVX2-NEXT: movl %ecx, %r11d
5114-
; AVX2-NEXT: movl %ecx, %ebx
5115-
; AVX2-NEXT: vmovd %ecx, %xmm0
5116-
; AVX2-NEXT: shrl $8, %ecx
5117-
; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
5118-
; AVX2-NEXT: shrl $16, %ebx
5119-
; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
5120-
; AVX2-NEXT: shrl $24, %r11d
5121-
; AVX2-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
5122-
; AVX2-NEXT: shrq $32, %r10
5123-
; AVX2-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
5124-
; AVX2-NEXT: shrq $40, %r9
5125-
; AVX2-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
5126-
; AVX2-NEXT: shrq $48, %r8
5127-
; AVX2-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
5128-
; AVX2-NEXT: movq 24(%rdi), %rcx
5129-
; AVX2-NEXT: shrq $56, %rax
5130-
; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
5131-
; AVX2-NEXT: movl %ecx, %eax
5132-
; AVX2-NEXT: shrl $8, %eax
5133-
; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
5134-
; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
5135-
; AVX2-NEXT: movl %ecx, %eax
5136-
; AVX2-NEXT: shrl $16, %eax
5137-
; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
5138-
; AVX2-NEXT: movl %ecx, %eax
5139-
; AVX2-NEXT: shrl $24, %eax
5140-
; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
5141-
; AVX2-NEXT: movq %rcx, %rax
5142-
; AVX2-NEXT: shrq $32, %rax
5143-
; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
5144-
; AVX2-NEXT: movq %rcx, %rax
5145-
; AVX2-NEXT: shrq $40, %rax
5146-
; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
5147-
; AVX2-NEXT: movq %rcx, %rax
5148-
; AVX2-NEXT: shrq $48, %rax
5149-
; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
5150-
; AVX2-NEXT: movq (%rdi), %rax
5151-
; AVX2-NEXT: shrq $56, %rcx
5152-
; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
5153-
; AVX2-NEXT: movl %eax, %ecx
5154-
; AVX2-NEXT: shrl $8, %ecx
5155-
; AVX2-NEXT: vmovd %eax, %xmm1
5156-
; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
5157-
; AVX2-NEXT: movl %eax, %ecx
5158-
; AVX2-NEXT: shrl $16, %ecx
5159-
; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
5160-
; AVX2-NEXT: movl %eax, %ecx
5161-
; AVX2-NEXT: shrl $24, %ecx
5162-
; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
5163-
; AVX2-NEXT: movq %rax, %rcx
5164-
; AVX2-NEXT: shrq $32, %rcx
5165-
; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
5166-
; AVX2-NEXT: movq %rax, %rcx
5167-
; AVX2-NEXT: shrq $40, %rcx
5168-
; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
5169-
; AVX2-NEXT: movq %rax, %rcx
5170-
; AVX2-NEXT: shrq $48, %rcx
5171-
; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
5172-
; AVX2-NEXT: movq 8(%rdi), %rcx
5173-
; AVX2-NEXT: shrq $56, %rax
5174-
; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
5175-
; AVX2-NEXT: movl %ecx, %eax
5176-
; AVX2-NEXT: shrl $8, %eax
5177-
; AVX2-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
5178-
; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
5179-
; AVX2-NEXT: movl %ecx, %eax
5180-
; AVX2-NEXT: shrl $16, %eax
5181-
; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
5182-
; AVX2-NEXT: movl %ecx, %eax
5183-
; AVX2-NEXT: shrl $24, %eax
5184-
; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
5185-
; AVX2-NEXT: movq %rcx, %rax
5186-
; AVX2-NEXT: shrq $32, %rax
5187-
; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
5188-
; AVX2-NEXT: movq %rcx, %rax
5189-
; AVX2-NEXT: shrq $40, %rax
5190-
; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
5191-
; AVX2-NEXT: movq %rcx, %rax
5192-
; AVX2-NEXT: shrq $48, %rax
5193-
; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
5194-
; AVX2-NEXT: shrq $56, %rcx
5195-
; AVX2-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
5196-
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
5197-
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
5198-
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5199-
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
5200-
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
5201-
; AVX2-NEXT: popq %rbx
5019+
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5020+
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1
5021+
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
5022+
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
5023+
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
52025024
; AVX2-NEXT: vzeroupper
52035025
; AVX2-NEXT: retq
52045026
;

llvm/test/CodeGen/X86/load-partial.ll

Lines changed: 8 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -208,30 +208,14 @@ define <4 x float> @load_float4_float3_trunc_0122(ptr nocapture readonly derefer
208208
}
209209

210210
define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly dereferenceable(16)) nofree nosync {
211-
; SSE2-LABEL: load_float4_float3_trunc_0123:
212-
; SSE2: # %bb.0:
213-
; SSE2-NEXT: movaps (%rdi), %xmm0
214-
; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
215-
; SSE2-NEXT: retq
216-
;
217-
; SSSE3-LABEL: load_float4_float3_trunc_0123:
218-
; SSSE3: # %bb.0:
219-
; SSSE3-NEXT: movaps (%rdi), %xmm0
220-
; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
221-
; SSSE3-NEXT: retq
222-
;
223-
; SSE41-LABEL: load_float4_float3_trunc_0123:
224-
; SSE41: # %bb.0:
225-
; SSE41-NEXT: movaps (%rdi), %xmm0
226-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
227-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
228-
; SSE41-NEXT: retq
211+
; SSE-LABEL: load_float4_float3_trunc_0123:
212+
; SSE: # %bb.0:
213+
; SSE-NEXT: movaps (%rdi), %xmm0
214+
; SSE-NEXT: retq
229215
;
230216
; AVX-LABEL: load_float4_float3_trunc_0123:
231217
; AVX: # %bb.0:
232218
; AVX-NEXT: vmovaps (%rdi), %xmm0
233-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
234-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
235219
; AVX-NEXT: retq
236220
%2 = load i64, ptr %0, align 16
237221
%3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
@@ -254,30 +238,14 @@ define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly derefer
254238
}
255239

256240
define <4 x float> @load_float4_float3_trunc_0123_unaligned(ptr nocapture readonly dereferenceable(16)) nofree nosync {
257-
; SSE2-LABEL: load_float4_float3_trunc_0123_unaligned:
258-
; SSE2: # %bb.0:
259-
; SSE2-NEXT: movups (%rdi), %xmm0
260-
; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
261-
; SSE2-NEXT: retq
262-
;
263-
; SSSE3-LABEL: load_float4_float3_trunc_0123_unaligned:
264-
; SSSE3: # %bb.0:
265-
; SSSE3-NEXT: movups (%rdi), %xmm0
266-
; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
267-
; SSSE3-NEXT: retq
268-
;
269-
; SSE41-LABEL: load_float4_float3_trunc_0123_unaligned:
270-
; SSE41: # %bb.0:
271-
; SSE41-NEXT: movups (%rdi), %xmm0
272-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
273-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
274-
; SSE41-NEXT: retq
241+
; SSE-LABEL: load_float4_float3_trunc_0123_unaligned:
242+
; SSE: # %bb.0:
243+
; SSE-NEXT: movups (%rdi), %xmm0
244+
; SSE-NEXT: retq
275245
;
276246
; AVX-LABEL: load_float4_float3_trunc_0123_unaligned:
277247
; AVX: # %bb.0:
278248
; AVX-NEXT: vmovups (%rdi), %xmm0
279-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
280-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
281249
; AVX-NEXT: retq
282250
%2 = load i64, ptr %0, align 1
283251
%3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2

0 commit comments

Comments
 (0)