@@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
66
66
BLASLONG lda2 = lda * 2 ;
67
67
vy0_new = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
68
68
vy1_new = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
69
- for (k = 0 , j = 0 ; k < m / gvl ; k ++ )
69
+ for (k = 0 , j = 0 ; k < m / gvl ; k ++ )
70
70
{
71
71
a_ptr = a ;
72
72
ix = 0 ;
@@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
121
121
#endif
122
122
a_ptr += lda2 ;
123
123
ix += inc_x2 ;
124
+
124
125
}
125
126
126
- for (; i < n ; i += 4 )
127
+ for (i = n % 4 ; i < n ; i += 4 )
127
128
{
128
129
#if !defined(XCONJ )
129
-
130
- x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
131
- x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
132
- temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
133
- temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
134
- temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
135
- temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
136
- VSEV_FLOAT (& temp_rr [0 ], temp_rv , 4 );
137
- VSEV_FLOAT (& temp_ii [0 ], temp_iv , 4 );
130
+ // temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1];
131
+ // temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1];
132
+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 2 );
133
+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
134
+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
135
+ temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
136
+
137
+ // temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix];
138
+ // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2];
139
+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 2 );
140
+ temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 2 );
141
+ VSEV_FLOAT (& temp_rr [0 ], temp_rv , 2 );
142
+ VSEV_FLOAT (& temp_ii [0 ], temp_iv , 2 );
143
+
144
+ // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1];
145
+ // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1];
146
+ x_v0 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 ], inc_x2 * sizeof (FLOAT ), 2 );
147
+ x_v1 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
148
+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
149
+ temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
150
+
151
+ // temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2];
152
+ // temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3];
153
+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 2 );
154
+ temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 2 );
155
+ VSEV_FLOAT (& temp_rr [2 ], temp_rv , 2 );
156
+ VSEV_FLOAT (& temp_ii [2 ], temp_iv , 2 );
138
157
139
158
#else
140
- x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
141
- x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
142
- temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
143
- temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
144
- temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
145
- temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
146
- VSEV_FLOAT (& temp_rr [0 ], temp_rv , 4 );
147
- VSEV_FLOAT (& temp_ii [0 ], temp_iv , 4 );
159
+ // temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1];
160
+ // temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1];
161
+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 2 );
162
+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
163
+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
164
+ temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
165
+
166
+
167
+ // temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix];
168
+ // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2];
169
+ temp_iv = VFMUL_VF_FLOAT (x_v1 , alpha_r , 2 );
170
+ temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_i , x_v0 , 2 );
171
+ VSEV_FLOAT (& temp_rr [0 ], temp_rv , 2 );
172
+ VSEV_FLOAT (& temp_ii [0 ], temp_iv , 2 );
173
+
174
+
175
+ // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1];
176
+ // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1];
177
+ x_v0 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 ], inc_x2 * sizeof (FLOAT ), 2 );
178
+ x_v1 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
179
+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
180
+ temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
181
+
182
+
183
+ temp_ii [2 ] = alpha_r * x [ix + inc_x2 * 2 + 1 ] - alpha_i * x [ix + inc_x2 * 2 ];
184
+ temp_ii [3 ] = alpha_r * x [ix + inc_x2 * 3 + 1 ] - alpha_i * x [ix + inc_x2 * 3 ];
185
+ temp_iv = VFMUL_VF_FLOAT (x_v1 , alpha_r , 2 );
186
+ temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_i , x_v0 , 2 );
187
+ VSEV_FLOAT (& temp_rr [2 ], temp_rv , 2 );
188
+ VSEV_FLOAT (& temp_ii [2 ], temp_iv , 2 );
189
+
190
+
148
191
149
192
#endif
150
193
@@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
257
300
VSSEV_FLOAT (& y [iy ], stride_y , vy0 , gvl );
258
301
VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
259
302
j += gvl * 2 ;
260
- iy += inc_yv ;
303
+ iy += inc_yv ;
261
304
}
262
305
// tail
263
306
if (j / 2 < m )
0 commit comments