Skip to content

Commit 8afddc1

Browse files
authored
Merge pull request #5262 from guoyuanplct/develop
kernel/riscv64: Fixed the bug of openblas_utest_ext failing in c/zgemv and some c/zgbmv tests
2 parents 5366902 + 9a7e3f1 commit 8afddc1

File tree

1 file changed

+63
-20
lines changed

1 file changed

+63
-20
lines changed

kernel/riscv64/zgemv_n_vector.c

Lines changed: 63 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
6666
BLASLONG lda2 = lda * 2;
6767
vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl);
6868
vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
69-
for (k = 0, j = 0; k < m / gvl; k++)
69+
for (k = 0, j = 0; k < m / gvl; k ++)
7070
{
7171
a_ptr = a;
7272
ix = 0;
@@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
121121
#endif
122122
a_ptr += lda2;
123123
ix += inc_x2;
124+
124125
}
125126

126-
for (; i < n; i += 4)
127+
for (i = n % 4 ; i < n; i += 4)
127128
{
128129
#if !defined(XCONJ)
129-
130-
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
131-
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
132-
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
133-
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
134-
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
135-
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
136-
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
137-
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
130+
// temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1];
131+
// temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1];
132+
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
133+
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
134+
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
135+
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);
136+
137+
// temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix];
138+
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2];
139+
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
140+
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
141+
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
142+
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);
143+
144+
// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1];
145+
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1];
146+
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
147+
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
148+
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
149+
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);
150+
151+
// temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2];
152+
// temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3];
153+
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
154+
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
155+
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
156+
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);
138157

139158
#else
140-
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
141-
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
142-
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
143-
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
144-
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
145-
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
146-
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
147-
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
159+
// temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1];
160+
// temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1];
161+
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
162+
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
163+
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
164+
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);
165+
166+
167+
// temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix];
168+
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2];
169+
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
170+
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
171+
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
172+
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);
173+
174+
175+
// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1];
176+
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1];
177+
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
178+
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
179+
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
180+
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);
181+
182+
183+
// temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2];
184+
// temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3];
185+
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
186+
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
187+
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
188+
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);
189+
190+
148191

149192
#endif
150193

@@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
257300
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
258301
VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
259302
j += gvl * 2;
260-
iy += inc_yv;
303+
iy += inc_yv ;
261304
}
262305
// tail
263306
if (j / 2 < m)

0 commit comments

Comments
 (0)