Skip to content

[AMDGPU][True16][GlobalISel] Fix v2*16 build_vector patterns #151496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

Conversation

mbrkusanin
Copy link
Collaborator

@mbrkusanin mbrkusanin commented Jul 31, 2025

  • Pattern with IMPLICIT_DEF failed to generate an entry in MatchTable and
    did not report an error, just silently failed. This is fixed by casting
    IMPLICIT_DEF to appropriate type. This also fixes selecting
    "build_vector s16, undef" for GlobalISel with True16.
  • Add pattern for "build_vector undef, s16" that will work for GlobalISel.
    True16 GlobalISel has a G_TRUNC that it needs to deal with.
  • Use REG_SEQUENCE for Real16 patterns instead of V_LSHLREV_B32_e64 to
    generate more optimal code.

@llvmbot
Copy link
Member

llvmbot commented Jul 31, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Mirko Brkušanin (mbrkusanin)

Changes
  • Pattern with IMPLICIT_DEF failed to generate an entry in MatchTable and
    did not report an error, just silently failed. This is fixed by casting
    IMPLICIT_DEF to appropriate type. This also fixes selecting
    "build_vector s16, undef" for GlobalISel with True16.
  • Add pattern for "build_vector undef, s16" that will work for GlobalISel.
    True16 GlobalISel has a G_TRUNC that it needs to deal with.
  • Use REG_SEQUENCE for Real16 patterns instead of V_LSHLREV_B32_e64 to
    generate more optimal code.

Patch is 49.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151496.diff

10 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+17-3)
  • (modified) llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll (+643-150)
  • (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+1-3)
  • (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+2-4)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+2-3)
  • (modified) llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll (+6-6)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 54fa192aeec92..35605eb5a4bd3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3543,15 +3543,29 @@ def : GCNPat <
   (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i32 16))
 >;
-}
 
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
   (vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
 >;
-} // End foreach Ty = ...
 }
 
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+  (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
+  (REG_SEQUENCE VGPR_32, (Ty (IMPLICIT_DEF)), lo16, (Ty VGPR_32:$src1), hi16)
+>;
+
+} // End foreach Ty = ...
+} // End AddedComplexity = 1
+
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+  (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),
+  (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16,
+                         (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), hi16)
+>;
+
 let SubtargetPredicate = HasVOP3PInsts in {
 foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
 let True16Predicate = p in
@@ -3599,7 +3613,7 @@ def : GCNPat <
 >;
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))),
-  (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (Ty (IMPLICIT_DEF)), hi16)
 >;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index bdb52dbb95fa7..8b82d79a47a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -1,8 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE,GFX11-FAKE16-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE,GFX11-FAKE16-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE,GFX11-TRUE16-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE,GFX11-TRUE16-GISEL %s
 
 define void @undef_lo_v2i16(i16 %arg0) {
+; GFX8-SDAG-LABEL: undef_lo_v2i16:
+; GFX8-SDAG:       ; %bb.0:
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT:    ;;#ASMSTART
+; GFX8-SDAG-NEXT:    ; use v0
+; GFX8-SDAG-NEXT:    ;;#ASMEND
+; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo_v2i16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT:    ;;#ASMSTART
+; GFX8-GISEL-NEXT:    ; use v0
+; GFX8-GISEL-NEXT:    ;;#ASMEND
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: undef_lo_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12,20 +37,48 @@ define void @undef_lo_v2i16(i16 %arg0) {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: undef_lo_v2i16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    ;;#ASMSTART
-; GFX8-NEXT:    ; use v0
-; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo_v2i16:
+; GFX11-FAKE:       ; %bb.0:
+; GFX11-FAKE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT:    ;;#ASMSTART
+; GFX11-FAKE-NEXT:    ; use v0
+; GFX11-FAKE-NEXT:    ;;#ASMEND
+; GFX11-FAKE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo_v2i16:
+; GFX11-TRUE:       ; %bb.0:
+; GFX11-TRUE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT:    ;;#ASMSTART
+; GFX11-TRUE-NEXT:    ; use v0
+; GFX11-TRUE-NEXT:    ;;#ASMEND
+; GFX11-TRUE-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x i16> poison, i16 %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<2 x i16> %undef.lo);
   ret void
 }
 
 define void @undef_lo_v2f16(half %arg0) {
+; GFX8-SDAG-LABEL: undef_lo_v2f16:
+; GFX8-SDAG:       ; %bb.0:
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT:    ;;#ASMSTART
+; GFX8-SDAG-NEXT:    ; use v0
+; GFX8-SDAG-NEXT:    ;;#ASMEND
+; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo_v2f16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT:    ;;#ASMSTART
+; GFX8-GISEL-NEXT:    ; use v0
+; GFX8-GISEL-NEXT:    ;;#ASMEND
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: undef_lo_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35,20 +88,52 @@ define void @undef_lo_v2f16(half %arg0) {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: undef_lo_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    ;;#ASMSTART
-; GFX8-NEXT:    ; use v0
-; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo_v2f16:
+; GFX11-FAKE:       ; %bb.0:
+; GFX11-FAKE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT:    ;;#ASMSTART
+; GFX11-FAKE-NEXT:    ; use v0
+; GFX11-FAKE-NEXT:    ;;#ASMEND
+; GFX11-FAKE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo_v2f16:
+; GFX11-TRUE:       ; %bb.0:
+; GFX11-TRUE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT:    ;;#ASMSTART
+; GFX11-TRUE-NEXT:    ; use v0
+; GFX11-TRUE-NEXT:    ;;#ASMEND
+; GFX11-TRUE-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x half> poison, half %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<2 x half> %undef.lo);
   ret void
 }
 
 define void @undef_lo_op_v2f16(half %arg0) {
+; GFX8-SDAG-LABEL: undef_lo_op_v2f16:
+; GFX8-SDAG:       ; %bb.0:
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX8-SDAG-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, 0x7e00, v0
+; GFX8-SDAG-NEXT:    ;;#ASMSTART
+; GFX8-SDAG-NEXT:    ; use v0
+; GFX8-SDAG-NEXT:    ;;#ASMEND
+; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo_op_v2f16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; GFX8-GISEL-NEXT:    v_add_f16_e64 v1, s4, 1.0
+; GFX8-GISEL-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-GISEL-NEXT:    ;;#ASMSTART
+; GFX8-GISEL-NEXT:    ; use v0
+; GFX8-GISEL-NEXT:    ;;#ASMEND
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: undef_lo_op_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59,16 +144,27 @@ define void @undef_lo_op_v2f16(half %arg0) {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: undef_lo_op_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3c00
-; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, 0x7e00, v0
-; GFX8-NEXT:    ;;#ASMSTART
-; GFX8-NEXT:    ; use v0
-; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo_op_v2f16:
+; GFX11-FAKE:       ; %bb.0:
+; GFX11-FAKE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-FAKE-NEXT:    ;;#ASMSTART
+; GFX11-FAKE-NEXT:    ; use v0
+; GFX11-FAKE-NEXT:    ;;#ASMEND
+; GFX11-FAKE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo_op_v2f16:
+; GFX11-TRUE:       ; %bb.0:
+; GFX11-TRUE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-TRUE-NEXT:    ;;#ASMSTART
+; GFX11-TRUE-NEXT:    ; use v0
+; GFX11-TRUE-NEXT:    ;;#ASMEND
+; GFX11-TRUE-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x half> poison, half %arg0, i32 1
   %op = fadd <2 x half> %undef.lo, <half 1.0, half 1.0>
   call void asm sideeffect "; use $0", "v"(<2 x half> %op);
@@ -76,26 +172,93 @@ define void @undef_lo_op_v2f16(half %arg0) {
 }
 
 define void @undef_lo_op_v2i16(i16 %arg0) {
-; GFX9-LABEL: undef_lo_op_v2i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0x63
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v0
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX8-SDAG:       ; %bb.0:
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, 0x63
+; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT:    ;;#ASMSTART
+; GFX8-SDAG-NEXT:    ; use v0
+; GFX8-SDAG-NEXT:    ;;#ASMEND
+; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: undef_lo_op_v2i16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x63
-; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    ;;#ASMSTART
-; GFX8-NEXT:    ; use v0
-; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0x63
+; GFX8-GISEL-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX8-GISEL-NEXT:    ;;#ASMSTART
+; GFX8-GISEL-NEXT:    ; use v0
+; GFX8-GISEL-NEXT:    ;;#ASMEND
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x63
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    ;;#ASMSTART
+; GFX9-SDAG-NEXT:    ; use v0
+; GFX9-SDAG-NEXT:    ;;#ASMEND
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x630063
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX9-GISEL-NEXT:    ;;#ASMSTART
+; GFX9-GISEL-NEXT:    ; use v0
+; GFX9-GISEL-NEXT:    ;;#ASMEND
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX11-FAKE16-SDAG:       ; %bb.0:
+; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-SDAG-NEXT:    v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-SDAG-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-SDAG-NEXT:    ; use v0
+; GFX11-FAKE16-SDAG-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX11-FAKE16-GISEL:       ; %bb.0:
+; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-GISEL-NEXT:    v_pk_add_u16 v0, 0x630063, v0
+; GFX11-FAKE16-GISEL-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-GISEL-NEXT:    ; use v0
+; GFX11-FAKE16-GISEL-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX11-TRUE16-SDAG:       ; %bb.0:
+; GFX11-TRUE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-SDAG-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-SDAG-NEXT:    v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-SDAG-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-SDAG-NEXT:    ; use v0
+; GFX11-TRUE16-SDAG-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX11-TRUE16-GISEL:       ; %bb.0:
+; GFX11-TRUE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-GISEL-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-GISEL-NEXT:    v_pk_add_u16 v0, 0x630063, v0
+; GFX11-TRUE16-GISEL-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-GISEL-NEXT:    ; use v0
+; GFX11-TRUE16-GISEL-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x i16> poison, i16 %arg0, i32 1
   %op = add <2 x i16> %undef.lo, <i16 99, i16 99>
   call void asm sideeffect "; use $0", "v"(<2 x i16> %op);
@@ -103,6 +266,26 @@ define void @undef_lo_op_v2i16(i16 %arg0) {
 }
 
 define void @undef_lo3_v4i16(i16 %arg0) {
+; GFX8-SDAG-LABEL: undef_lo3_v4i16:
+; GFX8-SDAG:       ; %bb.0:
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT:    ;;#ASMSTART
+; GFX8-SDAG-NEXT:    ; use v[0:1]
+; GFX8-SDAG-NEXT:    ;;#ASMEND
+; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo3_v4i16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-GISEL-NEXT:    ;;#ASMSTART
+; GFX8-GISEL-NEXT:    ; use v[0:1]
+; GFX8-GISEL-NEXT:    ;;#ASMEND
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: undef_lo3_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -112,20 +295,49 @@ define void @undef_lo3_v4i16(i16 %arg0) {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: undef_lo3_v4i16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    ;;#ASMSTART
-; GFX8-NEXT:    ; use v[0:1]
-; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo3_v4i16:
+; GFX11-FAKE:       ; %bb.0:
+; GFX11-FAKE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT:    ;;#ASMSTART
+; GFX11-FAKE-NEXT:    ; use v[0:1]
+; GFX11-FAKE-NEXT:    ;;#ASMEND
+; GFX11-FAKE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo3_v4i16:
+; GFX11-TRUE:       ; %bb.0:
+; GFX11-TRUE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT:    ;;#ASMSTART
+; GFX11-TRUE-NEXT:    ; use v[0:1]
+; GFX11-TRUE-NEXT:    ;;#ASMEND
+; GFX11-TRUE-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <4 x i16> poison, i16 %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo);
   ret void
 }
 
 define void @undef_lo3_v4f16(half %arg0) {
+; GFX8-SDAG-LABEL: undef_lo3_v4f16:
+; GFX8-SDAG:       ; %bb.0:
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT:    ;;#ASMSTART
+; GFX8-SDAG-NEXT:    ; use v[0:1]
+; GFX8-SDAG-NEXT:    ;;#ASMEND
+; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo3_v4f16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-GISEL-NEXT:    ;;#ASMSTART
+; GFX8-GISEL-NEXT:    ; use v[0:1]
+; GFX8-GISEL-NEXT:    ;;#ASMEND
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: undef_lo3_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -135,20 +347,50 @@ define void @undef_lo3_v4f16(half %arg0) {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: undef_lo3_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    ;;#ASMSTART
-; GFX8-NEXT:    ; use v[0:1]
-; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo3_v4f16:
+; GFX11-FAKE:       ; %bb.0:
+; GFX11-FAKE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT:    ;;#ASMSTART
+; GFX11-FAKE-NEXT:    ; use v[0:1]
+; GFX11-FAKE-NEXT:    ;;#ASMEND
+; GFX11-FAKE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo3_v4f16:
+; GFX11-TRUE:       ; %bb.0:
+; GFX11-TRUE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT:    ;;#ASMSTART
+; GFX11-TRUE-NEXT:    ; use v[0:1]
+; GFX11-TRUE-NEXT:    ;;#ASMEND
+; GFX11-TRUE-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <4 x half> poison, half %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<4 x half> %undef.lo);
   ret void
 }
 
 define void @undef_lo2_v4i16(<2 x i16> %arg0) {
+; GFX8-SDAG-LABEL: undef_lo2_v4i16:
+; GFX8-SDAG:       ; %bb.0:
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-SDAG-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-SDAG-NEXT:    ;;#ASMSTART
+; GFX8-SDAG-NEXT:    ; use v[0:1]
+; GFX8-SDAG-NEXT:    ;;#ASMEND
+; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo2_v4i16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-GISEL-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-GISEL-NEXT:    ;;#ASMSTART
+; GFX8-GISEL-NEXT:    ; use v[0:1]
+; GFX8-GISEL-NEXT:    ;;#ASM...
[truncated]

Comment on lines +24 to +25
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The difference here between SDAG and GlobalISel for GFX8 looks like a legalizer issue. GFX8 makes different choices because of the hasVOP3P check.

@mbrkusanin mbrkusanin force-pushed the true16-global-isel-build-packed-vector branch from d8c1544 to 27be0cf Compare July 31, 2025 16:41
- Pattern with IMPLICIT_DEF failed to generate an entry in MatchTable and
did not report an error, just silently failed. This is fixed by casting
IMPLICIT_DEF to appropriate type. This also fixes selecting
"build_vector s16, undef" for GlobalISel with True16.
- Add pattern for "build_vector undef, s16" that will work for GlobalISel.
True16 GlobalISel has a G_TRUNC that it needs to deal with.
- Use REG_SEQUENCE for Real16 patterns instead of V_LSHLREV_B32_e64 to
generate more optimal code.
@mbrkusanin mbrkusanin force-pushed the true16-global-isel-build-packed-vector branch from 27be0cf to 345937e Compare August 1, 2025 17:16
@mbrkusanin mbrkusanin merged commit a24fae3 into llvm:main Aug 4, 2025
9 checks passed
@mbrkusanin mbrkusanin deleted the true16-global-isel-build-packed-vector branch August 4, 2025 11:06
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants