Use nbdd0121's suggestion to reduce the perf impact

Urgau · Urgau · commit f1c72be1ebb7 · 2022-07-05T11:44:16.000+02:00
diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
@@ -3375,20 +3375,18 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                         // Pass and return structures up to 2 pointers in size by value,
                         // matching `ScalarPair`. LLVM will usually pass these in 2 registers
                         // which is more efficient than by-ref.
-                        let max_by_val_size = Pointer.size(self) * 2;
+                        let ptr_size = Pointer.size(self);
+                        let max_by_val_size = ptr_size * 2;
                         let size = arg.layout.size;
 
                         if arg.layout.is_unsized() || size > max_by_val_size {
                             arg.make_indirect();
-                        } else if unlikely!(self.has_all_float(&arg.layout)) {
+                        } else if size > ptr_size && unlikely!(self.has_all_float(&arg.layout)) {
                             // We don't want to aggregate floats as an aggregate of Integer
-                            // because this will hurt the generated assembly (#93490)
-                            //
-                            // As an optimization we want to pass homogeneous aggregate of floats
-                            // greater than pointer size as indirect
-                            if size > Pointer.size(self) {
-                                arg.make_indirect();
-                            }
+                            // because this will hurt the generated assembly (#93490), but as an
+                            // optimization we want to pass homogeneous aggregates of floats
+                            // greater than pointer size as indirect.
+                            arg.make_indirect();
                         } else {
                             // We want to pass small aggregates as immediates, but using
                             // a LLVM aggregate type for this leads to bad optimizations,
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -15,12 +15,15 @@ pub fn sum_f32(a: f32, b: f32) -> f32 {
     a + b
 }
 
-// CHECK-LABEL: sum_f32x2:
-// CHECK:      addss xmm{{[0-9]}}, xmm{{[0-9]}}
-// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-LABEL: sum_f64x2:
+// CHECK:      mov     rax, [[PTR_IN:.*]]
+// CHECK-NEXT: movupd  [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
+// CHECK-NEXT: movupd  [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
+// CHECK-NEXT: addpd   [[XMMB]], [[XMMA]]
+// CHECK-NEXT: movupd  xmmword ptr {{\[}}[[PTR_IN]]{{\]}}, [[XMMB]]
 // CHECK-NEXT: ret
 #[no_mangle]
-pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+pub fn sum_f64x2(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
     [
         a[0] + b[0],
         a[1] + b[1],
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
@@ -13,7 +13,7 @@ pub struct Foo {
     bar4: f32,
 }
 
-// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
+// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)
 #[no_mangle]
 pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
     todo!()

Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@ pub struct Foo {`
`13`	`13`	`bar4: f32,`
`14`	`14`	`}`
`15`	`15`
`16`		`-// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)`
	`16`	`+// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)`
`17`	`17`	`#[no_mangle]`
`18`	`18`	`pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {`
`19`	`19`	`todo!()`