[X86] Don't elide argument copies for scalarized vectors (PR63475)

nikic · nikic · commit 7025ac81f05c · 2023-07-13T14:49:48.000+02:00
When eliding argument copies, the memory layout between a plain store of the type and the layout of the argument lowering on the stack must match. For multi-part argument lowerings, this is not necessarily the case. The code already tried to prevent this optimization for "scalarized and extended" vectors, but the check for "extends" was incomplete. While a scalarized vector of i32s stores i32 values on the stack, these are stored in 8 byte stack slots (on x86_64), so effectively have padding. Rather than trying to add more special cases to handle this (which is not straightforward), I'm going in the other direction and exclude scalarized vectors from this optimization entirely. This seems like a rare case that is not worth the hassle -- the complete lack of test coverage is not reassuring either. Fixes #63475. Differential Revision: https://reviews.llvm.org/D154078
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3851,21 +3851,18 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
 
   EVT ArgVT = Ins[i].ArgVT;
 
-  // If this is a vector that has been split into multiple parts, and the
-  // scalar size of the parts don't match the vector element size, then we can't
-  // elide the copy. The parts will have padding between them instead of being
-  // packed like a vector.
-  bool ScalarizedAndExtendedVector =
-      ArgVT.isVector() && !VA.getLocVT().isVector() &&
-      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
+  // If this is a vector that has been split into multiple parts, don't elide
+  // the copy. The layout on the stack may not match the packed in-memory
+  // layout.
+  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
 
   // This is an argument in memory. We might be able to perform copy elision.
   // If the argument is passed directly in memory without any extension, then we
   // can perform copy elision. Large vector types, for example, may be passed
   // indirectly by pointer.
   if (Flags.isCopyElisionCandidate() &&
       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
-      !ScalarizedAndExtendedVector) {
+      !ScalarizedVector) {
     SDValue PartAddr;
     if (Ins[i].PartOffset == 0) {
       // If this is a one-part value or the first part of a multi-part value,
diff --git a/llvm/test/CodeGen/X86/pr63475.ll b/llvm/test/CodeGen/X86/pr63475.ll
@@ -27,7 +27,8 @@ define void @caller() nounwind {
   ret void
 }
 
-; FIXME: This is a miscompile.
+; Make sure the stack offsets are correct. The distance between them should
+; be 8, not 4.
 define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i32> %arg) nounwind {
 ; CHECK-LABEL: callee:
 ; CHECK:       # %bb.0: # %start
@@ -37,28 +38,41 @@ define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl 112(%rsp), %ebx
-; CHECK-NEXT:    movl 104(%rsp), %ebp
-; CHECK-NEXT:    movl 96(%rsp), %r14d
-; CHECK-NEXT:    movl 76(%rsp), %r15d
-; CHECK-NEXT:    movl 72(%rsp), %r12d
-; CHECK-NEXT:    movl 64(%rsp), %edi
-; CHECK-NEXT:    movl 68(%rsp), %r13d
-; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    movl %r13d, %edi
-; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movl 120(%rsp), %ebx
+; CHECK-NEXT:    movd %ebx, %xmm0
+; CHECK-NEXT:    movl 112(%rsp), %ebp
+; CHECK-NEXT:    movd %ebp, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movl 104(%rsp), %r15d
+; CHECK-NEXT:    movd %r15d, %xmm0
+; CHECK-NEXT:    movl 96(%rsp), %edi
+; CHECK-NEXT:    movd %edi, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; CHECK-NEXT:    movl 136(%rsp), %r14d
+; CHECK-NEXT:    movd %r14d, %xmm0
+; CHECK-NEXT:    movl 128(%rsp), %r12d
+; CHECK-NEXT:    movd %r12d, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movl 144(%rsp), %r13d
+; CHECK-NEXT:    movl %r13d, 36(%rsp)
+; CHECK-NEXT:    movq %xmm1, 28(%rsp)
+; CHECK-NEXT:    movdqu %xmm2, 12(%rsp)
 ; CHECK-NEXT:    callq use@PLT
 ; CHECK-NEXT:    movl %r15d, %edi
 ; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    movl %r14d, %edi
-; CHECK-NEXT:    callq use@PLT
 ; CHECK-NEXT:    movl %ebp, %edi
 ; CHECK-NEXT:    callq use@PLT
 ; CHECK-NEXT:    movl %ebx, %edi
 ; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    addq $40, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r13