@@ -595,7 +595,7 @@ yuv2rgb_1_template_lsx(SwsInternal *c, const int16_t *buf0,
595
595
int len_count = (dstW + 1 ) >> 1 ;
596
596
const void * r , * g , * b ;
597
597
598
- if (uvalpha < 2048 ) {
598
+ if (uvalpha == 0 ) {
599
599
int count = 0 ;
600
600
int head = YUVRGB_TABLE_HEADROOM ;
601
601
__m128i headroom = __lsx_vreplgr2vr_h (head );
@@ -659,61 +659,46 @@ yuv2rgb_1_template_lsx(SwsInternal *c, const int16_t *buf0,
659
659
const int16_t * ubuf1 = ubuf [1 ], * vbuf1 = vbuf [1 ];
660
660
int count = 0 ;
661
661
int HEADROOM = YUVRGB_TABLE_HEADROOM ;
662
+ int uvalpha1 = 4096 - uvalpha ;
662
663
__m128i headroom = __lsx_vreplgr2vr_w (HEADROOM );
664
+ __m128i uvalpha_tmp1 = __lsx_vreplgr2vr_h (uvalpha1 );
665
+ __m128i uvalpha_tmp = __lsx_vreplgr2vr_h (uvalpha );
663
666
664
667
for (i = 0 ; i < len ; i += 8 ) {
665
668
int Y1 , Y2 , U , V ;
666
669
int i_dex = i << 1 ;
667
670
int c_dex = count << 1 ;
668
671
__m128i src_y , src_u0 , src_v0 , src_u1 , src_v1 ;
669
- __m128i y_l , y_h , u1 , u2 , v1 , v2 ;
672
+ __m128i y_l , y_h , u1 , u2 , v1 , v2 , u_ev , v_od ;
670
673
671
674
DUP4_ARG2 (__lsx_vldx , buf0 , i_dex , ubuf0 , c_dex , vbuf0 , c_dex ,
672
675
ubuf1 , c_dex , src_y , src_u0 , src_v0 , src_u1 );
673
676
src_v1 = __lsx_vldx (vbuf1 , c_dex );
674
677
src_y = __lsx_vsrari_h (src_y , 7 );
675
- u1 = __lsx_vaddwev_w_h (src_u0 , src_u1 );
676
- v1 = __lsx_vaddwod_w_h (src_u0 , src_u1 );
677
- u2 = __lsx_vaddwev_w_h (src_v0 , src_v1 );
678
- v2 = __lsx_vaddwod_w_h (src_v0 , src_v1 );
678
+
679
+ u_ev = __lsx_vmulwev_w_h (src_u0 , uvalpha_tmp1 );
680
+ v_od = __lsx_vmulwod_w_h (src_u0 , uvalpha_tmp1 );
681
+ u1 = __lsx_vmaddwev_w_h (u_ev , src_u1 , uvalpha_tmp );
682
+ v1 = __lsx_vmaddwod_w_h (v_od , src_u1 , uvalpha_tmp );
683
+ u_ev = __lsx_vmulwev_w_h (src_v0 , uvalpha_tmp1 );
684
+ v_od = __lsx_vmulwod_w_h (src_v0 , uvalpha_tmp1 );
685
+ u2 = __lsx_vmaddwev_w_h (u_ev , src_v1 , uvalpha_tmp );
686
+ v2 = __lsx_vmaddwod_w_h (v_od , src_v1 , uvalpha_tmp );
687
+
679
688
y_l = __lsx_vsllwil_w_h (src_y , 0 );
680
689
y_h = __lsx_vexth_w_h (src_y );
681
- u1 = __lsx_vsrari_w (u1 , 8 );
682
- v1 = __lsx_vsrari_w (v1 , 8 );
683
- u2 = __lsx_vsrari_w (u2 , 8 );
684
- v2 = __lsx_vsrari_w (v2 , 8 );
690
+ u1 = __lsx_vsrari_w (u1 , 19 );
691
+ v1 = __lsx_vsrari_w (v1 , 19 );
692
+ u2 = __lsx_vsrari_w (u2 , 19 );
693
+ v2 = __lsx_vsrari_w (v2 , 19 );
685
694
u1 = __lsx_vadd_w (u1 , headroom );
686
695
v1 = __lsx_vadd_w (v1 , headroom );
687
696
u2 = __lsx_vadd_w (u2 , headroom );
688
697
v2 = __lsx_vadd_w (v2 , headroom );
689
- WRITE_YUV2RGB_LSX (y_l , y_l , u1 , v1 , 0 , 1 , 0 , 0 );
690
- WRITE_YUV2RGB_LSX (y_l , y_l , u2 , v2 , 2 , 3 , 0 , 0 );
691
- WRITE_YUV2RGB_LSX (y_h , y_h , u1 , v1 , 0 , 1 , 1 , 1 );
692
- WRITE_YUV2RGB_LSX (y_h , y_h , u2 , v2 , 2 , 3 , 1 , 1 );
693
- }
694
- if (dstW - i >= 4 ) {
695
- int Y1 , Y2 , U , V ;
696
- int i_dex = i << 1 ;
697
- __m128i src_y , src_u0 , src_v0 , src_u1 , src_v1 ;
698
- __m128i uv ;
699
-
700
- src_y = __lsx_vldx (buf0 , i_dex );
701
- src_u0 = __lsx_vldrepl_d ((ubuf0 + count ), 0 );
702
- src_v0 = __lsx_vldrepl_d ((vbuf0 + count ), 0 );
703
- src_u1 = __lsx_vldrepl_d ((ubuf1 + count ), 0 );
704
- src_v1 = __lsx_vldrepl_d ((vbuf1 + count ), 0 );
705
-
706
- src_u0 = __lsx_vilvl_h (src_u1 , src_u0 );
707
- src_v0 = __lsx_vilvl_h (src_v1 , src_v0 );
708
- src_y = __lsx_vsrari_h (src_y , 7 );
709
- src_y = __lsx_vsllwil_w_h (src_y , 0 );
710
- uv = __lsx_vilvl_h (src_v0 , src_u0 );
711
- uv = __lsx_vhaddw_w_h (uv , uv );
712
- uv = __lsx_vsrari_w (uv , 8 );
713
- uv = __lsx_vadd_w (uv , headroom );
714
- WRITE_YUV2RGB_LSX (src_y , src_y , uv , uv , 0 , 1 , 0 , 1 );
715
- WRITE_YUV2RGB_LSX (src_y , src_y , uv , uv , 2 , 3 , 2 , 3 );
716
- i += 4 ;
698
+ WRITE_YUV2RGB_LSX (y_l , y_l , u1 , u2 , 0 , 1 , 0 , 0 );
699
+ WRITE_YUV2RGB_LSX (y_l , y_l , v1 , v2 , 2 , 3 , 0 , 0 );
700
+ WRITE_YUV2RGB_LSX (y_h , y_h , u1 , u2 , 0 , 1 , 1 , 1 );
701
+ WRITE_YUV2RGB_LSX (y_h , y_h , v1 , v2 , 2 , 3 , 1 , 1 );
717
702
}
718
703
for (; count < len_count ; count ++ ) {
719
704
int Y1 = (buf0 [count * 2 ] + 64 ) >> 7 ;
0 commit comments