Commit fdf8b2a
add optimized aarch64 memcpy and memset
these are based on the ARM optimized-routines repository v20.05 (ef907c7a799a), with macro dependencies flattened out and memmove code removed from memcpy. this change is somewhat unfortunate since having the branch for memmove support in the large n case of memcpy is the performance-optimal and size-optimal way to do both, but it makes memcpy alone (static-linked) about 40% larger and suggests a policy that use of memcpy as memmove is supported.

tabs used for alignment have also been replaced with spaces.
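For context on the memcpy/memmove policy noted above (not part of this commit): with the memmove branch removed, callers cannot rely on this memcpy handling overlapping buffers, so overlapping copies must go through memmove. A minimal C sketch, assuming only the standard string.h interfaces:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char buf[16] = "abcdefgh";
        /* Shift "abcdefgh" right by two bytes within the same buffer.
           The ranges overlap, so only memmove is defined for this;
           memcpy (with or without an incidental memmove branch) is not. */
        memmove(buf + 2, buf, 8);
        printf("%s\n", buf);   /* prints "ababcdefgh" */
        return 0;
    }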
Parent: 9dce93a

3 files changed, 304 insertions(+), 0 deletions(-)

COPYRIGHT (+3 lines)

@@ -131,6 +131,9 @@ The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
 The Android Open Source Project and is licensed under a two-clause BSD
 license. It was taken from Bionic libc, used on Android.
 
+The AArch64 memcpy and memset code (src/string/aarch64/*) are
+Copyright © 1999-2019, Arm Limited.
+
 The implementation of DES for crypt (src/crypt/crypt_des.c) is
 Copyright © 1994 David Burren. It is licensed under a BSD license.
 
src/string/aarch64/memcpy.S (new file, +186 lines)

@@ -0,0 +1,186 @@
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
#define tmp1 x14

/* This implementation of memcpy uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

.global memcpy
.type memcpy,%function
memcpy:
        add srcend, src, count
        add dstend, dstin, count
        cmp count, 128
        b.hi .Lcopy_long
        cmp count, 32
        b.hi .Lcopy32_128

        /* Small copies: 0..32 bytes. */
        cmp count, 16
        b.lo .Lcopy16
        ldp A_l, A_h, [src]
        ldp D_l, D_h, [srcend, -16]
        stp A_l, A_h, [dstin]
        stp D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes. */
.Lcopy16:
        tbz count, 3, .Lcopy8
        ldr A_l, [src]
        ldr A_h, [srcend, -8]
        str A_l, [dstin]
        str A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
.Lcopy8:
        tbz count, 2, .Lcopy4
        ldr A_lw, [src]
        ldr B_lw, [srcend, -4]
        str A_lw, [dstin]
        str B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
.Lcopy4:
        cbz count, .Lcopy0
        lsr tmp1, count, 1
        ldrb A_lw, [src]
        ldrb C_lw, [srcend, -1]
        ldrb B_lw, [src, tmp1]
        strb A_lw, [dstin]
        strb B_lw, [dstin, tmp1]
        strb C_lw, [dstend, -1]
.Lcopy0:
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
.Lcopy32_128:
        ldp A_l, A_h, [src]
        ldp B_l, B_h, [src, 16]
        ldp C_l, C_h, [srcend, -32]
        ldp D_l, D_h, [srcend, -16]
        cmp count, 64
        b.hi .Lcopy128
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstend, -32]
        stp D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
.Lcopy128:
        ldp E_l, E_h, [src, 32]
        ldp F_l, F_h, [src, 48]
        cmp count, 96
        b.ls .Lcopy96
        ldp G_l, G_h, [srcend, -64]
        ldp H_l, H_h, [srcend, -48]
        stp G_l, G_h, [dstend, -64]
        stp H_l, H_h, [dstend, -48]
.Lcopy96:
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp E_l, E_h, [dstin, 32]
        stp F_l, F_h, [dstin, 48]
        stp C_l, C_h, [dstend, -32]
        stp D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes. */
.Lcopy_long:

        /* Copy 16 bytes and then align dst to 16-byte alignment. */

        ldp D_l, D_h, [src]
        and tmp1, dstin, 15
        bic dst, dstin, 15
        sub src, src, tmp1
        add count, count, tmp1      /* Count is now 16 too large. */
        ldp A_l, A_h, [src, 16]
        stp D_l, D_h, [dstin]
        ldp B_l, B_h, [src, 32]
        ldp C_l, C_h, [src, 48]
        ldp D_l, D_h, [src, 64]!
        subs count, count, 128 + 16 /* Test and readjust count. */
        b.ls .Lcopy64_from_end

.Lloop64:
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [src, 16]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [src, 32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [src, 48]
        stp D_l, D_h, [dst, 64]!
        ldp D_l, D_h, [src, 64]!
        subs count, count, 64
        b.hi .Lloop64

        /* Write the last iteration and copy 64 bytes from the end. */
.Lcopy64_from_end:
        ldp E_l, E_h, [srcend, -64]
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [srcend, -48]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [srcend, -32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [srcend, -16]
        stp D_l, D_h, [dst, 64]
        stp E_l, E_h, [dstend, -64]
        stp A_l, A_h, [dstend, -48]
        stp B_l, B_h, [dstend, -32]
        stp C_l, C_h, [dstend, -16]
        ret

.size memcpy,.-memcpy
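To make the comment block above concrete, here is a rough portable-C sketch of the same size-class strategy. It is not part of the commit: the 16-byte destination alignment and software pipelining of the assembly are omitted, and fixed-size memcpy calls stand in for the unaligned 8/16-byte loads and stores.

    #include <stddef.h>
    #include <string.h>

    /* Hypothetical sketch of the strategy above, not the actual implementation. */
    static void *copy_sketch(void *restrict dstin, const void *restrict srcin, size_t n)
    {
        unsigned char *d = dstin, *dend = d + n;
        const unsigned char *s = srcin, *send = s + n;

        if (n <= 32) {
            /* Small: one block from each end; the blocks may overlap in the middle. */
            if (n >= 16) {
                memcpy(d, s, 16);
                memcpy(dend - 16, send - 16, 16);
            } else {
                /* Byte loop stands in for the branchless 0..15 handling. */
                for (size_t i = 0; i < n; i++) d[i] = s[i];
            }
        } else if (n <= 128) {
            /* Medium: 32 bytes from each end, plus the middle when needed. */
            memcpy(d, s, 32);
            memcpy(dend - 32, send - 32, 32);
            if (n > 64) {
                memcpy(d + 32, s + 32, 32);
                if (n > 96)
                    memcpy(dend - 64, send - 64, 32);
            }
        } else {
            /* Large: 64-byte blocks; the tail is always the last 64 bytes,
               copied from the end, so no remainder loop is needed. */
            size_t i = 0;
            for (; i + 64 < n; i += 64)
                memcpy(d + i, s + i, 64);
            memcpy(dend - 64, send - 64, 64);
        }
        return dstin;
    }

The fixed-size memcpy calls are only a stand-in: compilers expand them to wide unaligned loads and stores, which is roughly what the ldp/stp pairs above do.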

src/string/aarch64/memset.S (new file, +115 lines)

@@ -0,0 +1,115 @@
/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend x4
#define zva_val x5

.global memset
.type memset,%function
memset:

        dup v0.16B, valw
        add dstend, dstin, count

        cmp count, 96
        b.hi .Lset_long
        cmp count, 16
        b.hs .Lset_medium
        mov val, v0.D[0]

        /* Set 0..15 bytes. */
        tbz count, 3, 1f
        str val, [dstin]
        str val, [dstend, -8]
        ret
        nop
1:      tbz count, 2, 2f
        str valw, [dstin]
        str valw, [dstend, -4]
        ret
2:      cbz count, 3f
        strb valw, [dstin]
        tbz count, 1, 3f
        strh valw, [dstend, -2]
3:      ret

        /* Set 17..96 bytes. */
.Lset_medium:
        str q0, [dstin]
        tbnz count, 6, .Lset96
        str q0, [dstend, -16]
        tbz count, 5, 1f
        str q0, [dstin, 16]
        str q0, [dstend, -32]
1:      ret

        .p2align 4
        /* Set 64..96 bytes. Write 64 bytes from the start and
           32 bytes from the end. */
.Lset96:
        str q0, [dstin, 16]
        stp q0, q0, [dstin, 32]
        stp q0, q0, [dstend, -32]
        ret

        .p2align 4
.Lset_long:
        and valw, valw, 255
        bic dst, dstin, 15
        str q0, [dstin]
        cmp count, 160
        ccmp valw, 0, 0, hs
        b.ne .Lno_zva

#ifndef SKIP_ZVA_CHECK
        mrs zva_val, dczid_el0
        and zva_val, zva_val, 31
        cmp zva_val, 4              /* ZVA size is 64 bytes. */
        b.ne .Lno_zva
#endif
        str q0, [dst, 16]
        stp q0, q0, [dst, 32]
        bic dst, dst, 63
        sub count, dstend, dst      /* Count is now 64 too large. */
        sub count, count, 128       /* Adjust count and bias for loop. */

        .p2align 4
.Lzva_loop:
        add dst, dst, 64
        dc zva, dst
        subs count, count, 64
        b.hi .Lzva_loop
        stp q0, q0, [dstend, -64]
        stp q0, q0, [dstend, -32]
        ret

.Lno_zva:
        sub count, dstend, dst      /* Count is 16 too large. */
        sub dst, dst, 16            /* Dst is biased by -32. */
        sub count, count, 64 + 16   /* Adjust count and bias for loop. */
.Lno_zva_loop:
        stp q0, q0, [dst, 32]
        stp q0, q0, [dst, 64]!
        subs count, count, 64
        b.hi .Lno_zva_loop
        stp q0, q0, [dstend, -64]
        stp q0, q0, [dstend, -32]
        ret

.size memset,.-memset
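One detail worth calling out (again, not part of the commit): the .Lset_long path uses DC ZVA to zero 64-byte blocks, and unless SKIP_ZVA_CHECK is defined it first reads DCZID_EL0 to confirm that zeroing is permitted and that the block size really is 64 bytes. A hedged C sketch of that same check, for AArch64 with GCC or Clang inline assembly:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical helper mirroring the DCZID_EL0 guard above. */
    static int dc_zva_is_64_bytes(void)
    {
        uint64_t dczid;
        __asm__("mrs %0, dczid_el0" : "=r"(dczid));
        /* Bits [3:0] hold log2 of the block size in 4-byte words, and bit 4
           (DZP) is set when DC ZVA is prohibited. Masking with 31 and
           comparing against 4 accepts only "permitted, 64-byte blocks",
           exactly like the and/cmp pair in the assembly. */
        return (dczid & 31) == 4;
    }

    int main(void)
    {
        printf("DC ZVA usable with 64-byte blocks: %s\n",
               dc_zva_is_64_bytes() ? "yes" : "no");
        return 0;
    }

Building with SKIP_ZVA_CHECK defined, as the #ifndef above allows, would amount to assuming this check always succeeds on the target hardware.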
