Skip to content

Commit 858acd8

Browse files
HecaiYuan authored and michaelni committed
loongarch: fixes fate-checkasm-sw_rgb failure
The reason for the failure is that the function yuv2rgb_1_c_template was modified in 095f803. The corresponding functional test was added in c601bb8. The code on loongarch was not updated in a timely manner, resulting in the error.

Signed-off-by: yuanhecai <[email protected]>
Reviewed-by: [email protected]
Signed-off-by: Michael Niedermayer <[email protected]>
1 parent fd1772b commit 858acd8

File tree

2 files changed: 35 additions and 71 deletions

libswscale/loongarch/output_lasx.c

Lines changed: 12 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@ yuv2rgb_1_template_lasx(SwsInternal *c, const int16_t *buf0,
637637
int len_count = (dstW + 1) >> 1;
638638
const void *r, *g, *b;
639639

640-
if (uvalpha < 2048) {
640+
if (uvalpha == 0) {
641641
int count = 0;
642642
int head = YUVRGB_TABLE_HEADROOM;
643643
__m256i headroom = __lasx_xvreplgr2vr_h(head);
@@ -706,27 +706,32 @@ yuv2rgb_1_template_lasx(SwsInternal *c, const int16_t *buf0,
706706
const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
707707
int count = 0;
708708
int HEADROOM = YUVRGB_TABLE_HEADROOM;
709-
__m256i headroom = __lasx_xvreplgr2vr_w(HEADROOM);
709+
int uvalpha1 = 4096 - uvalpha;
710+
__m256i headroom = __lasx_xvreplgr2vr_w(HEADROOM);
711+
__m256i uvalpha_tmp1 = __lasx_xvreplgr2vr_h(uvalpha1);
712+
__m256i uvalpha_tmp = __lasx_xvreplgr2vr_h(uvalpha);
710713

711714
for (i = 0; i < len; i += 16) {
712715
int Y1, Y2, U, V;
713716
int i_dex = i << 1;
714717
int c_dex = count << 1;
715718
__m256i src_y, src_u0, src_v0, src_u1, src_v1;
716-
__m256i y_l, y_h, u, v;
719+
__m256i y_l, y_h, u, v, u_ev, v_od;
717720

718721
DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
719722
ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
720723
src_v1 = __lasx_xvldx(vbuf1, c_dex);
721724
src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
722725
src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
723726
src_y = __lasx_xvsrari_h(src_y, 7);
724-
u = __lasx_xvaddwev_w_h(src_u0, src_u1);
725-
v = __lasx_xvaddwod_w_h(src_u0, src_u1);
727+
u_ev = __lasx_xvmulwev_w_h(src_u0, uvalpha_tmp1);
728+
v_od = __lasx_xvmulwod_w_h(src_u0, uvalpha_tmp1);
729+
u = __lasx_xvmaddwev_w_h(u_ev, src_u1, uvalpha_tmp);
730+
v = __lasx_xvmaddwod_w_h(v_od, src_u1, uvalpha_tmp);
726731
y_l = __lasx_xvsllwil_w_h(src_y, 0);
727732
y_h = __lasx_xvexth_w_h(src_y);
728-
u = __lasx_xvsrari_w(u, 8);
729-
v = __lasx_xvsrari_w(v, 8);
733+
u = __lasx_xvsrari_w(u, 19);
734+
v = __lasx_xvsrari_w(v, 19);
730735
u = __lasx_xvadd_w(u, headroom);
731736
v = __lasx_xvadd_w(v, headroom);
732737
WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
@@ -738,32 +743,6 @@ yuv2rgb_1_template_lasx(SwsInternal *c, const int16_t *buf0,
738743
WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
739744
WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
740745
}
741-
if (dstW - i >= 8) {
742-
int Y1, Y2, U, V;
743-
int i_dex = i << 1;
744-
__m256i src_y, src_u0, src_v0, src_u1, src_v1;
745-
__m256i uv;
746-
747-
src_y = __lasx_xvldx(buf0, i_dex);
748-
src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
749-
src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
750-
src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
751-
src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
752-
753-
src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
754-
src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
755-
src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
756-
src_y = __lasx_xvsrari_h(src_y, 7);
757-
uv = __lasx_xvhaddw_w_h(src_u0, src_u0);
758-
src_y = __lasx_vext2xv_w_h(src_y);
759-
uv = __lasx_xvsrari_w(uv, 8);
760-
uv = __lasx_xvadd_w(uv, headroom);
761-
WRITE_YUV2RGB(src_y, src_y, uv, uv, 0, 1, 0, 4);
762-
WRITE_YUV2RGB(src_y, src_y, uv, uv, 2, 3, 1, 5);
763-
WRITE_YUV2RGB(src_y, src_y, uv, uv, 4, 5, 2, 6);
764-
WRITE_YUV2RGB(src_y, src_y, uv, uv, 6, 7, 3, 7);
765-
i += 8;
766-
}
767746
for (; count < len_count; count++) {
768747
int Y1 = (buf0[count * 2 ] + 64) >> 7;
769748
int Y2 = (buf0[count * 2 + 1] + 64) >> 7;

libswscale/loongarch/output_lsx.c

Lines changed: 23 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -595,7 +595,7 @@ yuv2rgb_1_template_lsx(SwsInternal *c, const int16_t *buf0,
595595
int len_count = (dstW + 1) >> 1;
596596
const void *r, *g, *b;
597597

598-
if (uvalpha < 2048) {
598+
if (uvalpha == 0) {
599599
int count = 0;
600600
int head = YUVRGB_TABLE_HEADROOM;
601601
__m128i headroom = __lsx_vreplgr2vr_h(head);
@@ -659,61 +659,46 @@ yuv2rgb_1_template_lsx(SwsInternal *c, const int16_t *buf0,
659659
const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
660660
int count = 0;
661661
int HEADROOM = YUVRGB_TABLE_HEADROOM;
662+
int uvalpha1 = 4096 - uvalpha;
662663
__m128i headroom = __lsx_vreplgr2vr_w(HEADROOM);
664+
__m128i uvalpha_tmp1 = __lsx_vreplgr2vr_h(uvalpha1);
665+
__m128i uvalpha_tmp = __lsx_vreplgr2vr_h(uvalpha);
663666

664667
for (i = 0; i < len; i += 8) {
665668
int Y1, Y2, U, V;
666669
int i_dex = i << 1;
667670
int c_dex = count << 1;
668671
__m128i src_y, src_u0, src_v0, src_u1, src_v1;
669-
__m128i y_l, y_h, u1, u2, v1, v2;
672+
__m128i y_l, y_h, u1, u2, v1, v2, u_ev, v_od;
670673

671674
DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
672675
ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
673676
src_v1 = __lsx_vldx(vbuf1, c_dex);
674677
src_y = __lsx_vsrari_h(src_y, 7);
675-
u1 = __lsx_vaddwev_w_h(src_u0, src_u1);
676-
v1 = __lsx_vaddwod_w_h(src_u0, src_u1);
677-
u2 = __lsx_vaddwev_w_h(src_v0, src_v1);
678-
v2 = __lsx_vaddwod_w_h(src_v0, src_v1);
678+
679+
u_ev = __lsx_vmulwev_w_h(src_u0, uvalpha_tmp1);
680+
v_od = __lsx_vmulwod_w_h(src_u0, uvalpha_tmp1);
681+
u1 = __lsx_vmaddwev_w_h(u_ev, src_u1, uvalpha_tmp);
682+
v1 = __lsx_vmaddwod_w_h(v_od, src_u1, uvalpha_tmp);
683+
u_ev = __lsx_vmulwev_w_h(src_v0, uvalpha_tmp1);
684+
v_od = __lsx_vmulwod_w_h(src_v0, uvalpha_tmp1);
685+
u2 = __lsx_vmaddwev_w_h(u_ev, src_v1, uvalpha_tmp);
686+
v2 = __lsx_vmaddwod_w_h(v_od, src_v1, uvalpha_tmp);
687+
679688
y_l = __lsx_vsllwil_w_h(src_y, 0);
680689
y_h = __lsx_vexth_w_h(src_y);
681-
u1 = __lsx_vsrari_w(u1, 8);
682-
v1 = __lsx_vsrari_w(v1, 8);
683-
u2 = __lsx_vsrari_w(u2, 8);
684-
v2 = __lsx_vsrari_w(v2, 8);
690+
u1 = __lsx_vsrari_w(u1, 19);
691+
v1 = __lsx_vsrari_w(v1, 19);
692+
u2 = __lsx_vsrari_w(u2, 19);
693+
v2 = __lsx_vsrari_w(v2, 19);
685694
u1 = __lsx_vadd_w(u1, headroom);
686695
v1 = __lsx_vadd_w(v1, headroom);
687696
u2 = __lsx_vadd_w(u2, headroom);
688697
v2 = __lsx_vadd_w(v2, headroom);
689-
WRITE_YUV2RGB_LSX(y_l, y_l, u1, v1, 0, 1, 0, 0);
690-
WRITE_YUV2RGB_LSX(y_l, y_l, u2, v2, 2, 3, 0, 0);
691-
WRITE_YUV2RGB_LSX(y_h, y_h, u1, v1, 0, 1, 1, 1);
692-
WRITE_YUV2RGB_LSX(y_h, y_h, u2, v2, 2, 3, 1, 1);
693-
}
694-
if (dstW - i >= 4) {
695-
int Y1, Y2, U, V;
696-
int i_dex = i << 1;
697-
__m128i src_y, src_u0, src_v0, src_u1, src_v1;
698-
__m128i uv;
699-
700-
src_y = __lsx_vldx(buf0, i_dex);
701-
src_u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
702-
src_v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
703-
src_u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
704-
src_v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
705-
706-
src_u0 = __lsx_vilvl_h(src_u1, src_u0);
707-
src_v0 = __lsx_vilvl_h(src_v1, src_v0);
708-
src_y = __lsx_vsrari_h(src_y, 7);
709-
src_y = __lsx_vsllwil_w_h(src_y, 0);
710-
uv = __lsx_vilvl_h(src_v0, src_u0);
711-
uv = __lsx_vhaddw_w_h(uv, uv);
712-
uv = __lsx_vsrari_w(uv, 8);
713-
uv = __lsx_vadd_w(uv, headroom);
714-
WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 0, 1, 0, 1);
715-
WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 2, 3, 2, 3);
716-
i += 4;
698+
WRITE_YUV2RGB_LSX(y_l, y_l, u1, u2, 0, 1, 0, 0);
699+
WRITE_YUV2RGB_LSX(y_l, y_l, v1, v2, 2, 3, 0, 0);
700+
WRITE_YUV2RGB_LSX(y_h, y_h, u1, u2, 0, 1, 1, 1);
701+
WRITE_YUV2RGB_LSX(y_h, y_h, v1, v2, 2, 3, 1, 1);
717702
}
718703
for (; count < len_count; count++) {
719704
int Y1 = (buf0[count * 2 ] + 64) >> 7;

0 commit comments

Comments
 (0)