Skip to content

Commit fe220a0

Browse files
authored
Merge pull request #5291 from guoyuanplct/develop
kernel/riscv64:fixed the performance problem in RISCV64_ZVL256 when OPENBLAS_K is small
2 parents bbdc265 + 83fcab7 commit fe220a0

File tree

2 files changed

+100
-1
lines changed

2 files changed

+100
-1
lines changed

kernel/riscv64/zaxpy_vector.c

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4343
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
4444
#endif
4545

46+
#if !defined(DOUBLE)
47+
inline int small_caxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
48+
#else
49+
inline int small_zaxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
50+
#endif
51+
{
52+
BLASLONG i=0;
53+
BLASLONG ix,iy;
54+
BLASLONG inc_x2;
55+
BLASLONG inc_y2;
56+
57+
if ( n <= 0 ) return(0);
58+
if ( da_r == 0.0 && da_i == 0.0 ) return(0);
59+
60+
ix = 0;
61+
iy = 0;
62+
63+
inc_x2 = 2 * inc_x;
64+
inc_y2 = 2 * inc_y;
65+
66+
while(i < n)
67+
{
68+
#if !defined(CONJ)
69+
y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
70+
y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
71+
#else
72+
y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
73+
y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
74+
#endif
75+
ix += inc_x2 ;
76+
iy += inc_y2 ;
77+
i++ ;
78+
79+
}
80+
return(0);
81+
82+
}
83+
4684
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
4785
{
86+
#if !defined(DOUBLE)
87+
if(n < 16) {
88+
return small_caxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
89+
}
90+
#else
91+
if(n < 8) {
92+
return small_zaxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
93+
}
94+
#endif
4895
BLASLONG i = 0, j = 0;
4996
BLASLONG ix = 0,iy = 0;
5097
if(n <= 0) return(0);

kernel/riscv64/zdot_vector.c

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6868
#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
6969
#endif
7070

71+
#if !defined(DOUBLE)
72+
inline OPENBLAS_COMPLEX_FLOAT small_cdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
73+
#else
74+
inline OPENBLAS_COMPLEX_FLOAT small_zdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
75+
#endif
76+
{
77+
BLASLONG i=0;
78+
BLASLONG ix=0,iy=0;
79+
FLOAT dot[2];
80+
OPENBLAS_COMPLEX_FLOAT result;
81+
BLASLONG inc_x2;
82+
BLASLONG inc_y2;
83+
84+
dot[0]=0.0;
85+
dot[1]=0.0;
86+
87+
CREAL(result) = 0.0 ;
88+
CIMAG(result) = 0.0 ;
89+
90+
if ( n < 1 ) return(result);
91+
92+
inc_x2 = 2 * inc_x ;
93+
inc_y2 = 2 * inc_y ;
94+
95+
while(i < n)
96+
{
97+
#if !defined(CONJ)
98+
dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ;
99+
dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ;
100+
#else
101+
dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ;
102+
dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ;
103+
#endif
104+
ix += inc_x2 ;
105+
iy += inc_y2 ;
106+
i++ ;
107+
108+
}
109+
CREAL(result) = dot[0];
110+
CIMAG(result) = dot[1];
111+
return(result);
112+
113+
}
71114
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
72115
{
116+
#if !defined(DOUBLE)
117+
if(n < 16) {
118+
return small_cdot_kernel(n, x, inc_x, y, inc_y);
119+
}
120+
#else
121+
if(n < 8) {
122+
return small_zdot_kernel(n, x, inc_x, y, inc_y);
123+
}
124+
#endif
73125
BLASLONG i=0, j=0;
74126
BLASLONG ix=0,iy=0;
75127
FLOAT dot[2];
@@ -148,4 +200,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
148200
CREAL(result) = dot[0];
149201
CIMAG(result) = dot[1];
150202
return(result);
151-
}
203+
}

0 commit comments

Comments
 (0)