Skip to content

Commit a89ac90

Browse files
AArch64: Add SVE2 implementation for pow2 bitmask division
In plenty of image and video processing code it's common to modify pixel values by a widening operation and then scale them back into range by dividing by 255. This patch adds a named function to allow us to emit an optimized sequence when doing an unsigned division that is equivalent to: x = y / (2 ^ (bitsize (y)/2)-1) For SVE2 this means we generate for: void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { for (int i = 0; i < (n & -16); i+=1) pixel[i] = (pixel[i] * level) / 0xff; } the following: mov z3.b, #1 .L3: ld1b z0.h, p0/z, [x0, x3] mul z0.h, p1/m, z0.h, z2.h addhnb z1.b, z0.h, z3.h addhnb z0.b, z0.h, z1.h st1b z0.h, p0, [x0, x3] inch x3 whilelo p0.h, w3, w2 b.any .L3 instead of: .L3: ld1b z0.h, p1/z, [x0, x3] mul z0.h, p0/m, z0.h, z1.h umulh z0.h, p0/m, z0.h, z2.h lsr z0.h, z0.h, #7 st1b z0.h, p1, [x0, x3] inch x3 whilelo p1.h, w3, w2 b.any .L3 Which results in significantly faster code. gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv<mode>3): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.
1 parent c98aabc commit a89ac90

File tree

2 files changed

+94
-0
lines changed

2 files changed

+94
-0
lines changed

gcc/config/aarch64/aarch64-sve2.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
;; ---- [INT] Reciprocal approximation
7272
;; ---- [INT<-FP] Base-2 logarithm
7373
;; ---- [INT] Polynomial multiplication
74+
;; ---- [INT] Misc optab implementations
7475
;;
7576
;; == Permutation
7677
;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,46 @@
23122313
"<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
23132314
)
23142315

2316+
;; -------------------------------------------------------------------------
2317+
;; ---- [INT] Misc optab implementations
2318+
;; -------------------------------------------------------------------------
2319+
;; Includes:
2320+
;; - aarch64_bitmask_udiv
2321+
;; -------------------------------------------------------------------------
2322+
2323+
;; div optimizations using narrowings
2324+
;; we can do the division e.g. shorts by 255 faster by calculating it as
2325+
;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
2326+
;; double the precision of x.
2327+
;;
2328+
;; See aarch64-simd.md for bigger explanation.
2329+
;; Expand an unsigned division of a wide element by the all-ones bitmask of
;; its narrow half (e.g. uhalf / 0xff, ushort / 0xffff) into two ADDHNB
;; narrowing additions, per the (x + ((x + 257) >> 8)) >> 8 identity above.
(define_expand "@aarch64_bitmask_udiv<mode>3"
2330+
[(match_operand:SVE_FULL_HSDI 0 "register_operand")
2331+
(match_operand:SVE_FULL_HSDI 1 "register_operand")
2332+
(match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
2333+
"TARGET_SVE2"
2334+
{
2335+
/* The only divisor this expander handles: 2^(narrow element bits) - 1,
   i.e. an all-ones mask in the low half of the wide element.  */
unsigned HOST_WIDE_INT size
2336+
= (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
2337+
rtx elt = unwrap_const_vec_duplicate (operands[2]);
2338+
/* Any other divisor: give up and let the generic division path run.  */
if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
2339+
FAIL;
2340+
2341+
rtx addend = gen_reg_rtx (<MODE>mode);
2342+
rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
2343+
rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
2344+
/* A narrow vector of 1s viewed in the wide mode reads as 0x01...01 per
   wide lane (257 for a 16-bit element), the addend of the identity.  */
rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
2345+
emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
2346+
/* tmp1 = ADDHNB (x, addend) — computes (x + 257) >> 8, narrowed.  */
emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
2347+
addend));
2348+
/* tmp2 = ADDHNB (x, tmp1) — computes (x + tmp1) >> 8, narrowed; this is
   the final quotient in the narrow mode.  */
emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
2349+
lowpart_subreg (<MODE>mode, tmp1,
2350+
<VNARROW>mode)));
2351+
/* Reinterpret the narrow result back in the wide mode for the caller.  */
emit_move_insn (operands[0],
2352+
lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
2353+
DONE;
2354+
})
2355+
23152356
;; =========================================================================
23162357
;; == Permutation
23172358
;; =========================================================================
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/* { dg-do compile } */
2+
/* { dg-additional-options "-O2 -std=c99" } */
3+
/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
4+
5+
#include <stdint.h>
6+
7+
/*
8+
** draw_bitmap1:
9+
** ...
10+
** mul z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
11+
** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
12+
** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
13+
** ...
14+
*/
15+
/* Scale each 8-bit pixel by LEVEL, renormalising with a division by 255.
   Divisor 0xff is 2^8 - 1, so the optimized ADDHNB sequence applies.  */
void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  int count = n & -16;          /* round the trip count down to a multiple of 16 */
  for (int idx = 0; idx < count; ++idx)
    pixel[idx] = (pixel[idx] * level) / 0xff;
}
20+
21+
/* Same shape as draw_bitmap1 but divides by 0xfe, which is NOT of the
   form 2^k - 1 — presumably present so the testsuite can confirm the
   ADDHNB sequence is not emitted here (no scan block accompanies it).  */
void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
{
  int count = n & -16;
  for (int idx = 0; idx < count; ++idx)
    pixel[idx] = (pixel[idx] * level) / 0xfe;
}
26+
27+
/*
28+
** draw_bitmap3:
29+
** ...
30+
** mul z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
31+
** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
32+
** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
33+
** ...
34+
*/
35+
/* 16-bit variant: divisor 0xffff is 2^16 - 1, so the widening multiply
   plus division should become a pair of .h/.s ADDHNB instructions.  */
void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
{
  int count = n & -16;
  for (int idx = 0; idx < count; ++idx)
    pixel[idx] = (pixel[idx] * level) / 0xffffU;
}
40+
41+
/*
42+
** draw_bitmap4:
43+
** ...
44+
** mul z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
45+
** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
46+
** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
47+
** ...
48+
*/
49+
/* 32-bit variant: the product is formed in uint64_t (the cast on LEVEL
   widens the whole multiplication) and divided by 2^32 - 1, matching the
   .s/.d ADDHNB pattern.  */
void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
{
  int count = n & -16;
  for (int idx = 0; idx < count; ++idx)
    pixel[idx] = (pixel[idx] * (uint64_t)level) / 0xffffffffUL;
}

0 commit comments

Comments
 (0)