@@ -4055,50 +4055,38 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %ma
4055
4055
; GCN-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
4056
4056
; GCN: ; %bb.0:
4057
4057
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4058
- ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
4059
- ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
4060
4058
; GCN-NEXT: s_brev_b32 s4, -2
4061
- ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
4062
- ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
4063
- ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
4064
- ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
4065
- ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v7
4059
+ ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4
4066
4060
; GCN-NEXT: v_bfi_b32 v3, s4, v3, v5
4067
4061
; GCN-NEXT: s_setpc_b64 s[30:31]
4068
4062
;
4069
4063
; GFX7-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
4070
4064
; GFX7: ; %bb.0:
4071
4065
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4072
- ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
4073
- ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
4074
- ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
4075
- ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
4076
- ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
4077
- ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
4078
4066
; GFX7-NEXT: s_brev_b32 s4, -2
4079
- ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5
4080
- ; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7
4067
+ ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4
4068
+ ; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v5
4081
4069
; GFX7-NEXT: s_setpc_b64 s[30:31]
4082
4070
;
4083
4071
; GFX8-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
4084
4072
; GFX8: ; %bb.0:
4085
4073
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4086
- ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
4087
- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4074
+ ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
4075
+ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4088
4076
; GFX8-NEXT: s_brev_b32 s4, -2
4089
- ; GFX8-NEXT: v_bfi_b32 v1, s4, v1 , v4
4090
- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16 , v5
4077
+ ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16 , v4
4078
+ ; GFX8-NEXT: v_bfi_b32 v1, s4, v1 , v5
4091
4079
; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v4
4092
4080
; GFX8-NEXT: s_setpc_b64 s[30:31]
4093
4081
;
4094
4082
; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
4095
4083
; GFX9: ; %bb.0:
4096
4084
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4097
- ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
4098
- ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4085
+ ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
4086
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4099
4087
; GFX9-NEXT: s_brev_b32 s4, -2
4100
- ; GFX9-NEXT: v_bfi_b32 v1, s4, v1 , v4
4101
- ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16 , v5
4088
+ ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16 , v4
4089
+ ; GFX9-NEXT: v_bfi_b32 v1, s4, v1 , v5
4102
4090
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
4103
4091
; GFX9-NEXT: s_setpc_b64 s[30:31]
4104
4092
;
@@ -4969,71 +4957,63 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x floa
4969
4957
define amdgpu_ps <4 x i32 > @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16 (<2 x double > inreg %mag , <2 x bfloat> inreg %sign ) {
4970
4958
; GCN-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
4971
4959
; GCN: ; %bb.0:
4972
- ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5
4973
- ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4
4974
- ; GCN-NEXT: s_brev_b32 s4, -2
4975
- ; GCN-NEXT: v_mov_b32_e32 v4, s3
4976
- ; GCN-NEXT: v_mov_b32_e32 v5, s1
4977
- ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
4978
- ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
4979
- ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
4980
- ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
4981
- ; GCN-NEXT: v_bfi_b32 v0, s4, v4, v3
4982
- ; GCN-NEXT: v_bfi_b32 v1, s4, v5, v1
4960
+ ; GCN-NEXT: s_brev_b32 s6, -2
4961
+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
4962
+ ; GCN-NEXT: v_mov_b32_e32 v1, s5
4963
+ ; GCN-NEXT: v_mov_b32_e32 v2, s1
4964
+ ; GCN-NEXT: v_mov_b32_e32 v3, s4
4965
+ ; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1
4966
+ ; GCN-NEXT: v_bfi_b32 v1, s6, v2, v3
4983
4967
; GCN-NEXT: v_readfirstlane_b32 s1, v1
4984
4968
; GCN-NEXT: v_readfirstlane_b32 s3, v0
4985
4969
; GCN-NEXT: ; return to shader part epilog
4986
4970
;
4987
4971
; GFX7-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
4988
4972
; GFX7: ; %bb.0:
4989
- ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5
4990
- ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4
4991
- ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
4992
- ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
4993
- ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
4994
- ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
4995
- ; GFX7-NEXT: s_brev_b32 s4, -2
4973
+ ; GFX7-NEXT: s_brev_b32 s6, -2
4996
4974
; GFX7-NEXT: v_mov_b32_e32 v0, s3
4997
- ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
4975
+ ; GFX7-NEXT: v_mov_b32_e32 v1, s5
4976
+ ; GFX7-NEXT: v_bfi_b32 v0, s6, v0, v1
4998
4977
; GFX7-NEXT: v_mov_b32_e32 v1, s1
4999
- ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3
4978
+ ; GFX7-NEXT: v_mov_b32_e32 v2, s4
4979
+ ; GFX7-NEXT: v_bfi_b32 v1, s6, v1, v2
5000
4980
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
5001
4981
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
5002
4982
; GFX7-NEXT: ; return to shader part epilog
5003
4983
;
5004
4984
; GFX8-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
5005
4985
; GFX8: ; %bb.0:
5006
- ; GFX8-NEXT: s_lshr_b32 s5, s4, 16
5007
4986
; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5008
- ; GFX8-NEXT: s_brev_b32 s4 , -2
4987
+ ; GFX8-NEXT: s_brev_b32 s5 , -2
5009
4988
; GFX8-NEXT: v_mov_b32_e32 v1, s1
5010
- ; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0
5011
- ; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s5
4989
+ ; GFX8-NEXT: s_lshr_b32 s1, s4, 16
4990
+ ; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0
4991
+ ; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s1
5012
4992
; GFX8-NEXT: v_mov_b32_e32 v2, s3
5013
- ; GFX8-NEXT: v_bfi_b32 v1, s4 , v2, v1
4993
+ ; GFX8-NEXT: v_bfi_b32 v1, s5 , v2, v1
5014
4994
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
5015
4995
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
5016
4996
; GFX8-NEXT: ; return to shader part epilog
5017
4997
;
5018
4998
; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
5019
4999
; GFX9: ; %bb.0:
5020
- ; GFX9-NEXT: s_lshr_b32 s5, s4, 16
5021
5000
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5022
- ; GFX9-NEXT: s_brev_b32 s4 , -2
5001
+ ; GFX9-NEXT: s_brev_b32 s5 , -2
5023
5002
; GFX9-NEXT: v_mov_b32_e32 v1, s1
5024
- ; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
5025
- ; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s5
5003
+ ; GFX9-NEXT: s_lshr_b32 s1, s4, 16
5004
+ ; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
5005
+ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
5026
5006
; GFX9-NEXT: v_mov_b32_e32 v2, s3
5027
- ; GFX9-NEXT: v_bfi_b32 v1, s4 , v2, v1
5007
+ ; GFX9-NEXT: v_bfi_b32 v1, s5 , v2, v1
5028
5008
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
5029
5009
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
5030
5010
; GFX9-NEXT: ; return to shader part epilog
5031
5011
;
5032
5012
; GFX10-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
5033
5013
; GFX10: ; %bb.0:
5034
- ; GFX10-NEXT: s_lshr_b32 s5, s4, 16
5035
5014
; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5036
- ; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s5
5015
+ ; GFX10-NEXT: s_lshr_b32 s4, s4, 16
5016
+ ; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s4
5037
5017
; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
5038
5018
; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
5039
5019
; GFX10-NEXT: v_readfirstlane_b32 s1, v0
@@ -5042,14 +5022,15 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x doub
5042
5022
;
5043
5023
; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
5044
5024
; GFX11: ; %bb.0:
5045
- ; GFX11-NEXT: s_lshr_b32 s5, s4, 16
5046
5025
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5047
- ; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s5
5048
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5026
+ ; GFX11-NEXT: s_lshr_b32 s4, s4, 16
5027
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
5028
+ ; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4
5049
5029
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
5050
- ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
5051
5030
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5031
+ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
5052
5032
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
5033
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
5053
5034
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
5054
5035
; GFX11-NEXT: ; return to shader part epilog
5055
5036
%sign.ext = fpext <2 x bfloat> %sign to <2 x double >
0 commit comments