
Commit c98aabc

AArch64: Add implementation for pow2 bitmask division.
This adds an AArch64 implementation of the new optab for unsigned pow2 bitmask division.

The implementation rewrites:

   x = y / (2 ^ (sizeof (y) * 8 / 2) - 1)

into e.g. (for bytes)

   (x + ((x + 257) >> 8)) >> 8

where the additions must be done in double the precision of x so that no bits are lost to overflow.

Essentially the sequence decomposes the division into two smaller divisions, one for the top part of the number and one for the bottom part, and adds the results back together. To account for the fact that a shift by 8 would be a division by 256, we add 1 to both parts of x so that an input of 255 still gives 1 as the answer. Because the amount we shift by is half the width of the original datatype, we can use the halving instructions the ISA provides to do the operation instead of actual shifts.

For AArch64 this means that for:

   void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
   {
     for (int i = 0; i < (n & -16); i+=1)
       pixel[i] = (pixel[i] * level) / 0xff;
   }

we now generate:

	movi    v3.16b, 0x1
	umull2  v1.8h, v0.16b, v2.16b
	umull   v0.8h, v0.8b, v2.8b
	addhn   v5.8b, v1.8h, v3.8h
	addhn   v4.8b, v0.8h, v3.8h
	uaddw   v1.8h, v1.8h, v5.8b
	uaddw   v0.8h, v0.8h, v4.8b
	uzp2    v0.16b, v0.16b, v1.16b

instead of:

	umull   v2.8h, v1.8b, v5.8b
	umull2  v1.8h, v1.16b, v5.16b
	umull   v0.4s, v2.4h, v3.4h
	umull2  v2.4s, v2.8h, v3.8h
	umull   v4.4s, v1.4h, v3.4h
	umull2  v1.4s, v1.8h, v3.8h
	uzp2    v0.8h, v0.8h, v2.8h
	uzp2    v1.8h, v4.8h, v1.8h
	shrn    v0.8b, v0.8h, 7
	shrn2   v0.16b, v1.8h, 7

which is significantly faster code.

Thanks to Wilco for the concept.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): New.
	* config/aarch64/aarch64.cc
	(aarch64_vectorize_can_special_div_by_constant): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/div-by-bitmask.c: New test.
1 parent 8beff04 commit c98aabc
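As a sanity check of the identity above, here is a minimal standalone C sketch (illustrative only, not part of the patch; the helper name div255 is made up) that exhaustively verifies, keeping the intermediate sums in 32-bit arithmetic, that (x + ((x + 257) >> 8)) >> 8 equals x / 255 for every 16-bit input:

   #include <stdint.h>
   #include <stdio.h>

   /* Divide a 16-bit value by 255 using only additions and shifts.  The
      intermediate sums are kept in 32-bit ("double precision") variables
      so no carry bits are lost.  */
   static uint32_t div255 (uint32_t x)
   {
     uint32_t hi = (x + 257) >> 8;  /* divide the two byte halves, +1 each   */
     return (x + hi) >> 8;          /* recombine and take the final quotient */
   }

   int main (void)
   {
     for (uint32_t x = 0; x <= UINT16_MAX; x++)
       if (div255 (x) != x / 255)
         {
           printf ("mismatch at %u\n", x);
           return 1;
         }
     printf ("identity holds for all 16-bit inputs\n");
     return 0;
   }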

3 files changed: 156 additions, 0 deletions

gcc/config/aarch64/aarch64-simd.md

Lines changed: 57 additions & 0 deletions
@@ -4867,6 +4867,63 @@
 }
 )

+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result, i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  rtx elt = unwrap_const_vec_duplicate (operands[2]);
+  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.

 (define_insn "aarch64_pmul<mode>"
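For readers more familiar with intrinsics than RTL, the sequence the expander above emits for one vector of unsigned shorts can be sketched roughly as follows (illustrative only; the function name is made up, and the lanes are assumed to hold products of two bytes, i.e. at most 0xff * 0xff, so the 16-bit additions cannot overflow):

   #include <arm_neon.h>

   /* Divide each unsigned 16-bit lane of x by 255, mirroring the expander:
      addhn computes the (x + 257) >> 8 step and narrows to bytes, uaddw adds
      that narrow result back in widened form, and the trailing shift-and-narrow
      is the ushr that the comment above says combine later turns into a uzp2.  */
   static inline uint8x8_t div255_u16x8 (uint16x8_t x)
   {
     uint16x8_t addend = vdupq_n_u16 (0x0101);    /* +1 in each byte half     */
     uint8x8_t  hi     = vaddhn_u16 (x, addend);  /* (x + 257) >> 8, narrowed */
     uint16x8_t sum    = vaddw_u8 (x, hi);        /* x + ((x + 257) >> 8)     */
     return vmovn_u16 (vshrq_n_u16 (sum, 8));     /* final >> 8, narrowed     */
   }

On little-endian the high byte of every 16-bit lane sits at an odd byte index, so extracting all of them is exactly a uzp2 on the byte view of the vector, either against a zero vector or, as in the draw_bitmap1 code in the commit message, against the other half of the computation; that is why no explicit shift appears in the generated assembly.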

gcc/config/aarch64/aarch64.cc

Lines changed: 38 additions & 0 deletions
@@ -24306,6 +24306,40 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }

+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+					       tree vectype, wide_int cst,
+					       rtx *output, rtx in0, rtx in1)
+{
+  if (code != TRUNC_DIV_EXPR
+      || !TYPE_UNSIGNED (vectype))
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      wide_int val = wi::add (cst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (int) (element_precision (vectype) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+    return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */

@@ -27796,6 +27830,10 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

+#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
+  aarch64_vectorize_can_special_div_by_constant
+
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
   aarch64_vectorize_preferred_vector_alignment
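The querying half of the hook (the path taken when in0 and in1 are NULL_RTX) accepts only divisors of the form 2^(element_precision/2) - 1. A small standalone model of that check (illustrative only; the names are made up and are not GCC API) shows why 0xff qualifies for 16-bit elements while 0xfe, used by draw_bitmap2 in the new test below, does not:

   #include <stdbool.h>
   #include <stdint.h>

   /* Return true if CST is 2^(PRECISION/2) - 1, i.e. a divisor the expander
      can handle: 0xff for 16-bit elements, 0xffff for 32-bit elements.  */
   static bool is_bitmask_divisor (uint64_t cst, unsigned precision)
   {
     uint64_t val = cst + 1;
     return val != 0
            && (val & (val - 1)) == 0                       /* power of two */
            && (unsigned) __builtin_ctzll (val) == precision / 2;
   }

   /* is_bitmask_divisor (0xff, 16)   -> true
      is_bitmask_divisor (0xfe, 16)   -> false
      is_bitmask_divisor (0xffff, 32) -> true   */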
gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+**	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+**	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+**	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+**	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+**	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+**	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+**	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+**	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+**	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+**	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+**	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
