Skip to content

Commit 3def6a8

Browse files
committed
loongarch: Add LASX optimization for dot.
1 parent 1310a09 commit 3def6a8

File tree

4 files changed

+332
-0
lines changed

4 files changed

+332
-0
lines changed

common_loongarch64.h

+19
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,17 @@ static inline int WhereAmI(void){
124124
#define CMPLE fcmp.cle.d
125125
#define CMPLT fcmp.clt.d
126126
#define NEG fneg.d
127+
128+
#define XVFSUB xvfsub.d
129+
#define XVFADD xvfadd.d
130+
#define XVFMADD xvfmadd.d
131+
132+
#define VFSUB vfsub.d
133+
#define VFADD vfadd.d
134+
#define VFMADD vfmadd.d
135+
127136
#else
137+
128138
#define LD fld.s
129139
#define ST fst.s
130140
#define MADD fmadd.s
@@ -142,6 +152,15 @@ static inline int WhereAmI(void){
142152
#define CMPLE fcmp.cle.s
143153
#define CMPLT fcmp.clt.s
144154
#define NEG fneg.s
155+
156+
#define XVFSUB xvfsub.s
157+
#define XVFADD xvfadd.s
158+
#define XVFMADD xvfmadd.s
159+
160+
#define VFSUB vfsub.s
161+
#define VFADD vfadd.s
162+
#define VFMADD vfmadd.s
163+
145164
#endif /* defined(DOUBLE) */
146165

147166
#if defined(__64BIT__) && defined(USE64BITINT)

kernel/loongarch64/KERNEL.LOONGSON2K1000

Whitespace-only changes.

kernel/loongarch64/KERNEL.LOONGSON3R5

+4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ifndef NO_LASX
2+
3+
SDOTKERNEL = dot_lasx.S
4+
DDOTKERNEL = dot_lasx.S
5+
26
DGEMMKERNEL = dgemm_kernel_16x4.S
37
DGEMMINCOPY = dgemm_ncopy_16.S
48
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/dot_lasx.S

+309
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
/***************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
29+
30+
#include "common.h"
31+
32+
/* Operand registers as used by this routine: element count, x base pointer,
   x stride, y base pointer, y stride.  INCX/INCY are rescaled to byte strides
   at entry (shifted left by BASE_SHIFT). */
#define N $r4
33+
#define X $r5
34+
#define INCX $r6
35+
#define Y $r7
36+
#define INCY $r8
37+
38+
/* I is the loop counter for every loop in this file. */
#define I $r17
39+
/* TEMP is integer scratch: it holds SIZE for the unit-stride test and the
   pointer adjustment in the F_INTERFACE negative-stride code. */
#define TEMP $r18
40+
41+
/* Don't change the following FP registers unless you know the effects:
   the unit-stride path accumulates in $xr8/$xr9, reduces into $vr8 and then
   continues in s1, so s1/s2 must keep the register numbers 8 and 9.
   s1, s2 are the two scalar accumulators (summed into $f0 at .L999);
   a1, b1 hold the current x and y elements in the scalar loops. */
42+
#define s1 $f8
43+
#define s2 $f9
44+
#define a1 $f10
45+
#define b1 $f11
46+
47+
PROLOGUE

    /*
     * Entry and setup for the dot product of x and y.
     *   N          element count (returns 0.0 when N <= 0)
     *   X, Y       base pointers of the two vectors
     *   INCX, INCY element strides, converted to byte strides below
     * The result is accumulated in s1/s2 ($f8/$f9, the low lanes of
     * $xr8/$xr9) and combined into $f0 at .L999.
     */
#ifdef F_INTERFACE
    /* Fortran-style interface: scalar arguments arrive by reference. */
    LDINT   N,    0(N)
    LDINT   INCX, 0(INCX)
    LDINT   INCY, 0(INCY)
#endif

    /* Zero both accumulators bit-wise.  This clears s1/s2 as well as every
       lane of the vector accumulators $xr8/$xr9.  A floating-point "v - v"
       must not be used for this: it yields NaN whenever v is Inf or NaN,
       which would poison the sum if the register held stale data or if
       x[0] is infinite. */
    xvxor.v $xr8, $xr8, $xr8
    xvxor.v $xr9, $xr9, $xr9

    /* Scale the strides to bytes.  The unit-stride test below relies on
       BASE_SHIFT being log2(SIZE). */
    slli.d  INCX, INCX, BASE_SHIFT
    li.d    TEMP, SIZE
    slli.d  INCY, INCY, BASE_SHIFT

    bge     $r0, N, .L999           /* N <= 0: nothing to accumulate   */
    bne     INCX, TEMP, .L20        /* inc_x != 1: strided scalar path */
    bne     INCY, TEMP, .L20        /* inc_y != 1: strided scalar path */

    /* (inc_x == 1) && (inc_y == 1): LASX path.  One pass of the main loop
       consumes 128 bytes per vector, i.e. 16 doubles or 32 floats. */
#ifdef DOUBLE
    srai.d  I, N, 4
#else
    srai.d  I, N, 5
#endif
    bge     $r0, I, .L12            /* fewer elements than one full pass */
79+
.L11:
    /* Main unit-stride loop.  Each pass loads four 32-byte vectors from x
       and four from y (32 floats or 16 doubles per vector operand) and
       accumulates the lane-wise products alternately into $xr8 and $xr9. */
    xvld    $xr0, X, 0
    xvld    $xr1, X, 32
    xvld    $xr2, X, 64
    xvld    $xr3, X, 96
    xvld    $xr4, Y, 0
    xvld    $xr5, Y, 32
    xvld    $xr6, Y, 64
    xvld    $xr7, Y, 96
    /* I was produced by a 64-bit shift of N, so decrement it with the
       64-bit form, as the scalar loops do; addi.w would truncate it. */
    addi.d  I, I, -1
    addi.d  X, X, 128
    addi.d  Y, Y, 128
    XVFMADD $xr8, $xr0, $xr4, $xr8
    XVFMADD $xr9, $xr1, $xr5, $xr9
    XVFMADD $xr8, $xr2, $xr6, $xr8
    XVFMADD $xr9, $xr3, $xr7, $xr9
    bnez    I, .L11
97+
.L12:
    /* Elements left over from the main loop, counted in whole 32-byte
       vectors: (N mod 16) / 4 for doubles, (N mod 32) / 8 for floats. */
#ifdef DOUBLE
    andi    I, N, 0xf
    srai.d  I, I, 2
#else
    andi    I, N, 0x1f
    srai.d  I, I, 3
#endif
    bge     $r0, I, .L14            /* less than one full vector left */
.L13:
    /* One vector of x times one vector of y per pass, into $xr8. */
    xvld    $xr0, X, 0
    xvld    $xr4, Y, 0
    /* 64-bit decrement, consistent with how I was computed. */
    addi.d  I, I, -1
    addi.d  X, X, 32
    addi.d  Y, Y, 32
    XVFMADD $xr8, $xr0, $xr4, $xr8
    bnez    I, .L13
115+
.L14:
    /* Horizontal reduction: collapse the vector accumulators $xr8/$xr9
       into the scalar accumulator s1 ($f8, lane 0 of $xr8). */
    XVFADD    $xr8, $xr8, $xr9

    /* Reset s2 ($f9, lane 0 of $xr9) to +0.0 bit-wise.  "s2 - s2" would
       give NaN here if that accumulator lane had become Inf or NaN, and
       the NaN would then be added to the result at .L999. */
    xvxor.v   $xr9, $xr9, $xr9

    /* Bring the upper 128-bit half of $xr8 down so the 128-bit add folds
       the upper half onto the lower half. */
    xvpermi.q $xr0, $xr8, 0x1
    VFADD     $vr8, $vr8, $vr0

    /* Fold the odd 64-bit element onto the even one. */
    vpackod.d $vr0, $vr8, $vr8
    VFADD     $vr8, $vr8, $vr0
#ifndef DOUBLE
    /* Single precision: one more fold, odd 32-bit element onto even. */
    vpackod.w $vr0, $vr8, $vr8
    VFADD     $vr8, $vr8, $vr0
#endif

#ifdef DSDOT
    /* The vector part accumulated in single precision, but the DSDOT
       scalar code that follows (fmadd.d / fadd.d) treats s1 as a double.
       Widen the partial sum so it is not reinterpreted bit-wise.
       NOTE(review): the vector part itself still sums in single
       precision; confirm that this is acceptable for DSDOT. */
    fcvt.d.s  s1, s1
#endif
129+
.L15:
    /* Unit-stride scalar tail: the N mod 4 (double) or N mod 8 (single)
       elements that do not fill a whole vector. */
#ifndef DOUBLE
    andi    I, N, 0x7
#else
    andi    I, N, 0x3
#endif
    beqz    I, .L999                /* no tail elements */
    .align 3
.L16:
    /* One element per pass, accumulated into s1. */
    LD      a1, X, 0
    LD      b1, Y, 0
    addi.d  X, X, SIZE
    addi.d  Y, Y, SIZE
    addi.d  I, I, -1
#ifdef DSDOT
    /* Widen both operands and accumulate in double precision. */
    fcvt.d.s a1, a1
    fcvt.d.s b1, b1
    fmadd.d s1, b1, a1, s1
#else
    MADD    s1, b1, a1, s1
#endif
    bnez    I, .L16
    b       .L999
    .align 3
154+
155+
.L20:
    /* Strided path: !((inc_x == 1) && (inc_y == 1)).
       I = number of passes of the 8-way unrolled loop at .L23. */
    srai.d  I, N, 3
#ifdef F_INTERFACE
    /* With a negative stride, move the base pointer by (N - 1) * stride
       bytes so that the walk below, which adds the (negative) stride,
       starts at the other end of the vector.  INCX/INCY are already byte
       strides at this point.  These sequences previously used the MIPS
       mnemonics mult / mflo / dsub, which do not exist on LoongArch. */
    bgez    INCX, .L21
    addi.d  TEMP, N, -1
    mul.d   TEMP, TEMP, INCX
    sub.d   X, X, TEMP
    .align 3

.L21:
    bgez    INCY, .L22
    addi.d  TEMP, N, -1
    mul.d   TEMP, TEMP, INCY
    sub.d   Y, Y, TEMP
    .align 3

.L22:
#endif
176+
/* One strided multiply-accumulate step: load x[] and y[], advance both
   pointers by their byte strides, and add the product to \acc.
   Under DSDOT both operands are widened and the product is accumulated
   in double precision. */
.macro DOT_STRIDED_STEP acc
    LD      a1, X, 0 * SIZE
    add.d   X, X, INCX
    LD      b1, Y, 0 * SIZE
    add.d   Y, Y, INCY
#ifdef DSDOT
    fcvt.d.s a1, a1
    fcvt.d.s b1, b1
    fmadd.d \acc, b1, a1, \acc
#else
    MADD    \acc, b1, a1, \acc
#endif
.endm

    bge     $r0, I, .L25            /* fewer than 8 elements in total */
    .align 3

.L23:
    /* Eight elements per pass, alternating between the two accumulators
       s1 and s2. */
    DOT_STRIDED_STEP s1
    DOT_STRIDED_STEP s2
    DOT_STRIDED_STEP s1
    DOT_STRIDED_STEP s2
    DOT_STRIDED_STEP s1
    DOT_STRIDED_STEP s2
    DOT_STRIDED_STEP s1
    addi.d  I, I, -1
    DOT_STRIDED_STEP s2
    blt     $r0, I, .L23
    .align 3
278+
279+
.L25:
    /* Strided tail: the N mod 8 elements left after the unrolled loop. */
    andi    I, N, 7
    beqz    I, .L999                /* nothing left */
    .align 3

.L26:
    /* One element per pass, accumulated into s1. */
    LD      a1, X, 0 * SIZE
    LD      b1, Y, 0 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY
    addi.d  I, I, -1
#ifdef DSDOT
    /* Widen both operands and accumulate in double precision. */
    fcvt.d.s a1, a1
    fcvt.d.s b1, b1
    fmadd.d s1, b1, a1, s1
#else
    MADD    s1, b1, a1, s1
#endif
    bnez    I, .L26
    .align 3
299+
300+
/* Common exit: combine the two scalar accumulators into $f0 and return.
   Every path in this file (N <= 0, unit-stride, strided) ends up here. */
.L999:
301+
#ifdef DSDOT
302+
/* DSDOT accumulates in double precision, so add as doubles. */
fadd.d $f0, s1, s2
303+
#else
304+
ADD $f0, s1, s2
305+
#endif
306+
/* NOTE(review): copies the loop counter I ($r17) into $r4.  The sum is
   delivered in $f0, so this move looks unnecessary; confirm before removing. */
move $r4, $r17
307+
/* Jump to the address held in $r1 without linking (presumably the return
   address). */
jirl $r0, $r1, 0x0
308+
309+
EPILOGUE

0 commit comments

Comments
 (0)