@@ -243,8 +243,8 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
 .loop:
     mulpd   m1, m0, [srcq+lenq]
     mulpd   m2, m0, [srcq+lenq+mmsize]
-    mova    [dstq+lenq], m1
-    mova    [dstq+lenq+mmsize], m2
+    movaps  [dstq+lenq], m1
+    movaps  [dstq+lenq+mmsize], m2
     sub     lenq, 2*mmsize
     jge .loop
     REP_RET
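For context, the loop above is the body of vector_dmul_scalar, which scales a vector of doubles by a scalar. A minimal C sketch of the equivalent scalar behavior, assuming len covers a whole number of the asm's per-iteration blocks (the function name is illustrative only, not FFmpeg's API):

    /* dst[i] = src[i] * mul, the operation the asm loop performs
     * on 2*mmsize bytes of doubles per iteration. */
    static void dmul_scalar_ref(double *dst, const double *src,
                                double mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }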
@@ -363,14 +363,14 @@ VECTOR_FMUL_ADD
 %macro VECTOR_FMUL_REVERSE 0
 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
 %if cpuflag(avx2)
-    mova    m2, [pd_reverse]
+    movaps  m2, [pd_reverse]
 %endif
     lea lenq, [lend*4 - 2*mmsize]
 ALIGN 16
 .loop:
 %if cpuflag(avx2)
-    vpermd  m0, m2, [src1q]
-    vpermd  m1, m2, [src1q + mmsize]
+    vpermps m0, m2, [src1q]
+    vpermps m1, m2, [src1q + mmsize]
 %elif cpuflag(avx)
     vmovaps     xmm0, [src1q + 16]
     vinsertf128 m0, m0, [src1q], 1
@@ -386,8 +386,8 @@ ALIGN 16
 %endif
     mulps   m0, m0, [src0q + lenq + mmsize]
     mulps   m1, m1, [src0q + lenq]
-    mova    [dstq + lenq + mmsize], m0
-    mova    [dstq + lenq], m1
+    movaps  [dstq + lenq + mmsize], m0
+    movaps  [dstq + lenq], m1
     add     src1q, 2*mmsize
     sub     lenq,  2*mmsize
     jge .loop
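For context, vector_fmul_reverse multiplies src0 by src1 read back-to-front; on AVX2 the pd_reverse constant (presumably a 7..0 index vector defined elsewhere in the file) lets vpermps reverse the eight floats of each ymm load in one step. A minimal C sketch of the scalar behavior, with an illustrative function name:

    /* dst[i] = src0[i] * src1[len - 1 - i]: dst and src0 are walked
     * forward while src1 is read in reverse, matching the asm loop. */
    static void fmul_reverse_ref(float *dst, const float *src0,
                                 const float *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[len - 1 - i];
    }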