AMDGPU: Avoid using kernels in f16 copysign test #142113

arsenm · 2025-05-30T09:33:58Z

Avoid the memory noise in tests that predate function support.

arsenm · 2025-05-30T09:34:11Z

llvmbot · 2025-05-30T09:34:36Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Avoid the memory noise in tests that predate function support.

Patch is 120.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142113.diff

1 Files Affected:

(modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+790-1710)

diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 7ceeda810e5e6..d654537929255 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -13,684 +13,352 @@ declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) #0
 declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
-define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) {
+define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) {
 ; SI-LABEL: s_copysign_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    s_lshr_b32 s0, s0, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_bfi_b32 v0, s0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    ; return to shader part epilog
 ;
 ; VI-LABEL: s_copysign_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_movk_i32 s3, 0x7fff
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s4
-; VI-NEXT:    v_bfi_b32 v2, s3, v0, v1
+; VI-NEXT:    s_movk_i32 s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_endpgm
+; VI-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    s_movk_i32 s3, 0x7fff
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_bfi_b32 v1, s3, v1, v2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s3
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %out = call half @llvm.copysign.f16(half %mag, half %sign)
-  store half %out, ptr addrspace(1) %arg_out
-  ret void
+  %cast = bitcast half %out to i16
+  ret i16 %cast
 }
 
-define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) {
+define amdgpu_ps i16 @s_test_copysign_f16_0(half inreg %mag) {
 ; SI-LABEL: s_test_copysign_f16_0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s6, 0x7fff
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    ; return to shader part epilog
 ;
 ; VI-LABEL: s_test_copysign_f16_0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_endpgm
+; VI-NEXT:    s_and_b32 s0, s0, 0x7fff
+; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_test_copysign_f16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: s_test_copysign_f16_0:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_test_copysign_f16_0:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: s_test_copysign_f16_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call half @llvm.copysign.f16(half %mag, half 0.0)
-  store half %result, ptr addrspace(1) %out, align 4
-  ret void
+  %cast = bitcast half %result to i16
+  ret i16 %cast
 }
 
-define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) {
+define amdgpu_ps i16 @s_test_copysign_f16_1(half inreg %mag) {
 ; SI-LABEL: s_test_copysign_f16_1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s6, 0x7fff
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    ; return to shader part epilog
 ;
 ; VI-LABEL: s_test_copysign_f16_1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_endpgm
+; VI-NEXT:    s_and_b32 s0, s0, 0x7fff
+; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_test_copysign_f16_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-TRUE16-LABEL: s_test_copysign_f16_1:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-FAKE16-LABEL: s_test_copysign_f16_1:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: s_test_copysign_f16_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call half @llvm.copysign.f16(half %mag, half 1.0)
-  store half %result, ptr addrspace(1) %out, align 4
-  ret void
+  %cast = bitcast half %result to i16
+  ret i16 %cast
 }
 
-define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) {
+define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) {
 ; SI-LABEL: s_test_copysign_f16_10.0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s6, 0x7fff
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    ; return to shader part epilog
 ;
 ; VI-LABEL: s_test_copysign_f16_10.0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_endpgm
+; VI-NEXT:    s_and_b32 s0, s0, 0x7fff
+; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_test_copysign_f16_10.0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: s_test_copysign_f16_10.0:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_test_copysign_f16_10.0:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: s_test_copysign_f16_10.0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call half @llvm.copysign.f16(half %mag, half 10.0)
-  store half %result, ptr addrspace(1) %out, align 4
-  ret void
+  %cast = bitcast half %result to i16
+  ret i16 %cast
 }
 
-define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) {
+define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) {
 ; SI-LABEL: s_test_copysign_f16_neg1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_or_b32 s4, s6, 0x8000
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    ; return to shader part epilog
 ;
 ; VI-LABEL: s_test_copysign_f16_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset1_b32 s2, 15
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_endpgm
+; VI-NEXT:    s_bitset1_b32 s0, 15
+; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_test_copysign_f16_neg1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bitset1_b32 s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:    s_bitset1_b32 s0, 15
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: s_test_copysign_f16_neg1:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s2, 15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_test_copysign_f16_neg1:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s2, 15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: s_test_copysign_f16_neg1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_bitset1_b32 s0, 15
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call half @llvm.copysign.f16(half %mag, half -1.0)
-  store half %result, ptr addrspace(1) %out, align 4
-  ret void
+  %cast = bitcast half %result to i16
+  ret i16 %cast
 }
 
-define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) {
+define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) {
 ; SI-LABEL: s_test_copysign_f16_neg10:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_or_b32 s4, s6, 0x8000
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    ; return to shader part epilog
 ;
 ; VI-LABEL: s_test_copysign_f16_neg10:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset1_b32 s2, 15
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_endpgm
+; VI-NEXT:    s_bitset1_b32 s0, 15
+; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_test_copysign_f16_neg10:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bitset1_b32 s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-TRUE16-LABEL: s_test_copysign_f16_neg10:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s2, 15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
+; GFX9-NEXT:    s_bitset1_b32 s0, 15
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-FAKE16-LABEL: s_test_copysign_f16_neg10:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s2, 15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: s_test_copysign_f16_neg10:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_bitset1_b32 s0, 15
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call half @llvm.copysign.f16(half %mag, half -10.0)
-  store half %result, ptr addrspace(1) %out, align 4
-  ret void
+  %cast = bitcast half %result to i16
+  ret i16 %cast
 }
 
-define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) {
+define amdgpu_ps i16 @s_test_copysign_f16_0_mag(half inreg %sign) {
 ; SI-LABEL: s_test_copysign_f16_0_mag:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    v_bfi_b32 v0, s2, 0, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT:    s_en...
[truncated]

shiltian · 2025-05-30T15:46:42Z

Do we have to use entry CC here?

arsenm · 2025-05-30T15:56:31Z

> Do we have to use entry CC here?

No. The only reason to use `amdgpu_ps` is to get the return-value-is-SGPR behavior.

arsenm · 2025-05-30T17:45:25Z

Merge activity

May 30, 5:45 PM UTC: A user started a stack merge that includes this pull request via Graphite.
May 30, 5:47 PM UTC: Graphite rebased this pull request as part of a merge.
May 30, 5:49 PM UTC: @arsenm merged this pull request with Graphite.

Avoid the memory noise in tests that predate function support.

llvm-ci · 2025-05-30T18:06:03Z

LLVM Buildbot has detected a new failure on builder llvm-clang-aarch64-darwin running on doug-worker-4 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/20874

Here is the relevant piece of the build log for the reference

Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'lld :: MachO/arm64-thunk-visibility.s' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
rm -rf /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp; split-file /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/lld/test/MachO/arm64-thunk-visibility.s /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp # RUN: at line 10
+ rm -rf /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp
+ split-file /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/lld/test/MachO/arm64-thunk-visibility.s /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp
/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/llvm-mc -filetype=obj -triple=arm64-apple-darwin /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/foo.s -o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/foo.o # RUN: at line 11
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/llvm-mc -filetype=obj -triple=arm64-apple-darwin /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/foo.s -o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/foo.o
/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/llvm-mc -filetype=obj -triple=arm64-apple-darwin /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/bar.s -o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/bar.o # RUN: at line 12
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/llvm-mc -filetype=obj -triple=arm64-apple-darwin /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/bar.s -o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/bar.o
ld64.lld -arch x86_64 -platform_version macos 11.0 11.0 -syslibroot /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/lld/test/MachO/Inputs/MacOSX.sdk -lSystem -fatal_warnings -arch arm64 -lSystem -o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp.out /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/foo.o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/bar.o # RUN: at line 13
+ ld64.lld -arch x86_64 -platform_version macos 11.0 11.0 -syslibroot /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/lld/test/MachO/Inputs/MacOSX.sdk -lSystem -fatal_warnings -arch arm64 -lSystem -o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp.out /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/foo.o /Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp/bar.o
ld64.lld: error: failed to write output '/Users/buildbot/buildbot-root/aarch64-darwin/build/tools/lld/test/MachO/Output/arm64-thunk-visibility.s.tmp.out': No space left on device

--

********************

This was referenced May 30, 2025

AMDGPU: Move bf16 copysign tests to separate file #142114

Merged

AMDGPU: Add more f16 copysign tests #142115

Merged

arsenm added the backend:AMDGPU label May 30, 2025 — with Graphite App

arsenm requested review from broxigarchen, cdevadas, gandhi56, jayfoad, Pierre-vh, rampitec, shiltian and Sisyph May 30, 2025 13:21

arsenm marked this pull request as ready for review May 30, 2025 13:21

This was referenced May 30, 2025

AMDGPU: Handle vectors in copysign magnitude sign case #142156

Merged

AMDGPU: Handle vectors in copysign sign type combine #142157

Merged

shiltian approved these changes May 30, 2025

View reviewed changes

AMDGPU: Avoid using kernels in f16 copysign test

9538c1f

Avoid the memory noise in tests that predate function support.

arsenm force-pushed the users/arsenm/amdgpu/avoid-kernels-fcopysign-f16-test branch from 8bfa014 to 9538c1f Compare May 30, 2025 17:47

arsenm merged commit d11f9d4 into main May 30, 2025
6 of 10 checks passed

arsenm deleted the users/arsenm/amdgpu/avoid-kernels-fcopysign-f16-test branch May 30, 2025 17:49

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Avoid using kernels in f16 copysign test #142113

AMDGPU: Avoid using kernels in f16 copysign test #142113

Uh oh!

arsenm commented May 30, 2025

Uh oh!

arsenm commented May 30, 2025 •

edited

Loading

Uh oh!

llvmbot commented May 30, 2025

Uh oh!

shiltian commented May 30, 2025

Uh oh!

arsenm commented May 30, 2025

Uh oh!

arsenm commented May 30, 2025 •

edited

Loading

Uh oh!

Uh oh!

llvm-ci commented May 30, 2025

Uh oh!

Uh oh!

AMDGPU: Avoid using kernels in f16 copysign test #142113

AMDGPU: Avoid using kernels in f16 copysign test #142113

Uh oh!

Conversation

arsenm commented May 30, 2025

Uh oh!

arsenm commented May 30, 2025 • edited

Uh oh!

Uh oh!

llvmbot commented May 30, 2025

Uh oh!

shiltian commented May 30, 2025

Uh oh!

arsenm commented May 30, 2025

Uh oh!

arsenm commented May 30, 2025 • edited

Uh oh!

Merge activity

Uh oh!

Uh oh!

llvm-ci commented May 30, 2025

Uh oh!

Uh oh!

arsenm commented May 30, 2025 •

edited

Loading

arsenm commented May 30, 2025 •

edited

Loading