Skip to content

Commit a2f4be3

Browse files
AArch64: remove reliance on register allocator for simd/gpreg costing. [PR114741]
In PR114741 we see that we have a regression in codegen when SVE is enabled where the simple testcase: void foo(unsigned v, unsigned *p) { *p = v & 1; } generates foo: fmov s31, w0 and z31.s, z31.s, #1 str s31, [x1] ret instead of: foo: and w0, w0, 1 str w0, [x1] ret This impacts not just code size but also performance. This is caused by the use of the ^ constraint modifier in the pattern <optab><mode>3. The documentation states that this modifier should only have an effect on the alternative costing in that a particular alternative is to be preferred unless a non-pseudo reload is needed. The pattern was trying to convey that whenever both r and w are required, it should prefer r unless a reload is needed. This is because if a reload is needed then we can construct the constants more flexibly on the SIMD side. We were using this to simplify the implementation and to get generic cases such as: double negabs (double x) { unsigned long long y; memcpy (&y, &x, sizeof(double)); y = y | (1UL << 63); memcpy (&x, &y, sizeof(double)); return x; } which don't go through an expander. However the implementation of ^ in the register allocator is not according to the documentation in that it also has an effect during coloring. During initial register class selection it applies a penalty to a class, similar to how ? does. In this example the penalty makes the use of GP regs expensive enough that it no longer considers them: r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS ;; 3--> b 0: i 9 r106=r105&0x1 :cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0) PR_HI_REGS+0(0):model 4 which is not the expected behavior. For GCC 14 this is a conservative fix. 1. We remove the ^ modifier from the logical optabs. 2. In order not to regress copysign we then move the copysign expansion to directly use the SIMD variant. 
Since copysign only supports floating-point modes this is fine and no longer relies on the register allocator to select the right alternative. It once again regresses the general case, but this case wasn't optimized in earlier GCCs either so it's not a regression in GCC 14. This change gives strictly better codegen than earlier GCCs and still optimizes the important cases. gcc/ChangeLog: PR target/114741 * config/aarch64/aarch64.md (<optab><mode>3): Remove ^ from alt 2. (copysign<GPF:mode>3): Use SIMD version of IOR directly. gcc/testsuite/ChangeLog: PR target/114741 * gcc.target/aarch64/fneg-abs_2.c: Update codegen. * gcc.target/aarch64/fneg-abs_4.c: xfail for now. * gcc.target/aarch64/pr114741.c: New test.
1 parent 82d6d38 commit a2f4be3

File tree

4 files changed

+48
-13
lines changed

4 files changed

+48
-13
lines changed

gcc/config/aarch64/aarch64.md

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4811,7 +4811,7 @@
48114811
""
48124812
{@ [ cons: =0 , 1 , 2 ; attrs: type , arch ]
48134813
[ r , %r , r ; logic_reg , * ] <logical>\t%<w>0, %<w>1, %<w>2
4814-
[ rk , ^r , <lconst> ; logic_imm , * ] <logical>\t%<w>0, %<w>1, %2
4814+
[ rk , r , <lconst> ; logic_imm , * ] <logical>\t%<w>0, %<w>1, %2
48154815
[ w , 0 , <lconst> ; * , sve ] <logical>\t%Z0.<s>, %Z0.<s>, #%2
48164816
[ w , w , w ; neon_logic , simd ] <logical>\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
48174817
}
@@ -7192,22 +7192,29 @@
71927192
(match_operand:GPF 2 "nonmemory_operand")]
71937193
"TARGET_SIMD"
71947194
{
7195-
machine_mode int_mode = <V_INT_EQUIV>mode;
7196-
rtx bitmask = gen_reg_rtx (int_mode);
7197-
emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
7198-
<< (GET_MODE_BITSIZE (<MODE>mode) - 1)));
7195+
rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
7196+
<< (GET_MODE_BITSIZE (<MODE>mode) - 1));
71997197
/* copysign (x, -1) should instead be expanded as orr with the sign
72007198
bit. */
72017199
rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
72027200
if (GET_CODE (op2_elt) == CONST_DOUBLE
72037201
&& real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
72047202
{
7205-
emit_insn (gen_ior<v_int_equiv>3 (
7206-
lowpart_subreg (int_mode, operands[0], <MODE>mode),
7207-
lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
7203+
rtx v_bitmask
7204+
= force_reg (V2<V_INT_EQUIV>mode,
7205+
gen_const_vec_duplicate (V2<V_INT_EQUIV>mode,
7206+
signbit_const));
7207+
7208+
emit_insn (gen_iorv2<v_int_equiv>3 (
7209+
lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode),
7210+
lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode),
7211+
v_bitmask));
72087212
DONE;
72097213
}
72107214

7215+
machine_mode int_mode = <V_INT_EQUIV>mode;
7216+
rtx bitmask = gen_reg_rtx (int_mode);
7217+
emit_move_insn (bitmask, signbit_const);
72117218
operands[2] = force_reg (<MODE>mode, operands[2]);
72127219
emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
72137220
bitmask));

gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@
99

1010
/*
1111
** f1:
12-
** movi v[0-9]+.2s, 0x80, lsl 24
13-
** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
12+
** orr v[0-9]+.2s, #?128, lsl #?24
1413
** ret
1514
*/
1615
float32_t f1 (float32_t a)
@@ -22,7 +21,7 @@ float32_t f1 (float32_t a)
2221
** f2:
2322
** movi v[0-9]+.4s, #?0
2423
** fneg v[0-9]+.2d, v[0-9]+.2d
25-
** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
24+
** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
2625
** ret
2726
*/
2827
float64_t f2 (float64_t a)

gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include <string.h>
88

99
/*
10-
** negabs:
10+
** negabs: { xfail *-*-* }
1111
** movi v31.4s, #?0
1212
** fneg v[0-9]+.2d, v[0-9]+.2d
1313
** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
@@ -23,7 +23,7 @@ double negabs (double x)
2323
}
2424

2525
/*
26-
** negabsf:
26+
** negabsf: { xfail *-*-* }
2727
** movi v[0-9]+.2s, 0x80, lsl 24
2828
** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
2929
** ret
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-O2" } */
3+
/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
4+
5+
#pragma GCC target "+nosve"
6+
7+
/*
8+
** foo1:
9+
** and w0, w0, 1
10+
** str w0, \[x1\]
11+
** ret
12+
*/
13+
void foo1(unsigned v, unsigned *p)
14+
{
15+
*p = v & 1;
16+
}
17+
18+
#pragma GCC target "+sve"
19+
20+
/*
21+
** foo2:
22+
** and w0, w0, 1
23+
** str w0, \[x1\]
24+
** ret
25+
*/
26+
void foo2(unsigned v, unsigned *p)
27+
{
28+
*p = v & 1;
29+
}

0 commit comments

Comments
 (0)