-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[AMDGPU][True16][GlobalISel] Fix v2*16 build_vector patterns #151496
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][GlobalISel] Fix v2*16 build_vector patterns #151496
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Mirko Brkušanin (mbrkusanin) Changes
Patch is 49.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151496.diff 10 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 54fa192aeec92..35605eb5a4bd3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3543,15 +3543,29 @@ def : GCNPat <
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
-}
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
(vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
>;
-} // End foreach Ty = ...
}
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
+ (REG_SEQUENCE VGPR_32, (Ty (IMPLICIT_DEF)), lo16, (Ty VGPR_32:$src1), hi16)
+>;
+
+} // End foreach Ty = ...
+} // End AddedComplexity = 1
+
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+ (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16,
+ (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), hi16)
+>;
+
let SubtargetPredicate = HasVOP3PInsts in {
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in
@@ -3599,7 +3613,7 @@ def : GCNPat <
>;
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))),
- (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (Ty (IMPLICIT_DEF)), hi16)
>;
}
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index bdb52dbb95fa7..8b82d79a47a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -1,8 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE,GFX11-FAKE16-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE,GFX11-FAKE16-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE,GFX11-TRUE16-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE,GFX11-TRUE16-GISEL %s
define void @undef_lo_v2i16(i16 %arg0) {
+; GFX8-SDAG-LABEL: undef_lo_v2i16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT: ;;#ASMSTART
+; GFX8-SDAG-NEXT: ; use v0
+; GFX8-SDAG-NEXT: ;;#ASMEND
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo_v2i16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT: ;;#ASMSTART
+; GFX8-GISEL-NEXT: ; use v0
+; GFX8-GISEL-NEXT: ;;#ASMEND
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: undef_lo_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12,20 +37,48 @@ define void @undef_lo_v2i16(i16 %arg0) {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: undef_lo_v2i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
-; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo_v2i16:
+; GFX11-FAKE: ; %bb.0:
+; GFX11-FAKE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT: ;;#ASMSTART
+; GFX11-FAKE-NEXT: ; use v0
+; GFX11-FAKE-NEXT: ;;#ASMEND
+; GFX11-FAKE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo_v2i16:
+; GFX11-TRUE: ; %bb.0:
+; GFX11-TRUE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT: ;;#ASMSTART
+; GFX11-TRUE-NEXT: ; use v0
+; GFX11-TRUE-NEXT: ;;#ASMEND
+; GFX11-TRUE-NEXT: s_setpc_b64 s[30:31]
%undef.lo = insertelement <2 x i16> poison, i16 %arg0, i32 1
call void asm sideeffect "; use $0", "v"(<2 x i16> %undef.lo);
ret void
}
define void @undef_lo_v2f16(half %arg0) {
+; GFX8-SDAG-LABEL: undef_lo_v2f16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT: ;;#ASMSTART
+; GFX8-SDAG-NEXT: ; use v0
+; GFX8-SDAG-NEXT: ;;#ASMEND
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo_v2f16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT: ;;#ASMSTART
+; GFX8-GISEL-NEXT: ; use v0
+; GFX8-GISEL-NEXT: ;;#ASMEND
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: undef_lo_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35,20 +88,52 @@ define void @undef_lo_v2f16(half %arg0) {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: undef_lo_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
-; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo_v2f16:
+; GFX11-FAKE: ; %bb.0:
+; GFX11-FAKE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT: ;;#ASMSTART
+; GFX11-FAKE-NEXT: ; use v0
+; GFX11-FAKE-NEXT: ;;#ASMEND
+; GFX11-FAKE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo_v2f16:
+; GFX11-TRUE: ; %bb.0:
+; GFX11-TRUE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT: ;;#ASMSTART
+; GFX11-TRUE-NEXT: ; use v0
+; GFX11-TRUE-NEXT: ;;#ASMEND
+; GFX11-TRUE-NEXT: s_setpc_b64 s[30:31]
%undef.lo = insertelement <2 x half> poison, half %arg0, i32 1
call void asm sideeffect "; use $0", "v"(<2 x half> %undef.lo);
ret void
}
define void @undef_lo_op_v2f16(half %arg0) {
+; GFX8-SDAG-LABEL: undef_lo_op_v2f16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX8-SDAG-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, 0x7e00, v0
+; GFX8-SDAG-NEXT: ;;#ASMSTART
+; GFX8-SDAG-NEXT: ; use v0
+; GFX8-SDAG-NEXT: ;;#ASMEND
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo_op_v2f16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00
+; GFX8-GISEL-NEXT: v_add_f16_e64 v1, s4, 1.0
+; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-GISEL-NEXT: ;;#ASMSTART
+; GFX8-GISEL-NEXT: ; use v0
+; GFX8-GISEL-NEXT: ;;#ASMEND
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: undef_lo_op_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59,16 +144,27 @@ define void @undef_lo_op_v2f16(half %arg0) {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: undef_lo_op_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, 0x7e00, v0
-; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
-; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo_op_v2f16:
+; GFX11-FAKE: ; %bb.0:
+; GFX11-FAKE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-FAKE-NEXT: ;;#ASMSTART
+; GFX11-FAKE-NEXT: ; use v0
+; GFX11-FAKE-NEXT: ;;#ASMEND
+; GFX11-FAKE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo_op_v2f16:
+; GFX11-TRUE: ; %bb.0:
+; GFX11-TRUE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-TRUE-NEXT: ;;#ASMSTART
+; GFX11-TRUE-NEXT: ; use v0
+; GFX11-TRUE-NEXT: ;;#ASMEND
+; GFX11-TRUE-NEXT: s_setpc_b64 s[30:31]
%undef.lo = insertelement <2 x half> poison, half %arg0, i32 1
%op = fadd <2 x half> %undef.lo, <half 1.0, half 1.0>
call void asm sideeffect "; use $0", "v"(<2 x half> %op);
@@ -76,26 +172,93 @@ define void @undef_lo_op_v2f16(half %arg0) {
}
define void @undef_lo_op_v2i16(i16 %arg0) {
-; GFX9-LABEL: undef_lo_op_v2i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x63
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use v0
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 0x63
+; GFX8-SDAG-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: ;;#ASMSTART
+; GFX8-SDAG-NEXT: ; use v0
+; GFX8-SDAG-NEXT: ;;#ASMEND
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: undef_lo_op_v2i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x63
-; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
-; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x63
+; GFX8-GISEL-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-GISEL-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s4, v0
+; GFX8-GISEL-NEXT: ;;#ASMSTART
+; GFX8-GISEL-NEXT: ; use v0
+; GFX8-GISEL-NEXT: ;;#ASMEND
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x63
+; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: ;;#ASMSTART
+; GFX9-SDAG-NEXT: ; use v0
+; GFX9-SDAG-NEXT: ;;#ASMEND
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x630063
+; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, v1
+; GFX9-GISEL-NEXT: ;;#ASMSTART
+; GFX9-GISEL-NEXT: ; use v0
+; GFX9-GISEL-NEXT: ;;#ASMEND
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX11-FAKE16-SDAG: ; %bb.0:
+; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-SDAG-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-SDAG-NEXT: ; use v0
+; GFX11-FAKE16-SDAG-NEXT: ;;#ASMEND
+; GFX11-FAKE16-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX11-FAKE16-GISEL: ; %bb.0:
+; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v0
+; GFX11-FAKE16-GISEL-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-GISEL-NEXT: ; use v0
+; GFX11-FAKE16-GISEL-NEXT: ;;#ASMEND
+; GFX11-FAKE16-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-SDAG-LABEL: undef_lo_op_v2i16:
+; GFX11-TRUE16-SDAG: ; %bb.0:
+; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-SDAG-NEXT: ; use v0
+; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND
+; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-GISEL-LABEL: undef_lo_op_v2i16:
+; GFX11-TRUE16-GISEL: ; %bb.0:
+; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v0
+; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-GISEL-NEXT: ; use v0
+; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND
+; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31]
%undef.lo = insertelement <2 x i16> poison, i16 %arg0, i32 1
%op = add <2 x i16> %undef.lo, <i16 99, i16 99>
call void asm sideeffect "; use $0", "v"(<2 x i16> %op);
@@ -103,6 +266,26 @@ define void @undef_lo_op_v2i16(i16 %arg0) {
}
define void @undef_lo3_v4i16(i16 %arg0) {
+; GFX8-SDAG-LABEL: undef_lo3_v4i16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT: ;;#ASMSTART
+; GFX8-SDAG-NEXT: ; use v[0:1]
+; GFX8-SDAG-NEXT: ;;#ASMEND
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo3_v4i16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-GISEL-NEXT: ;;#ASMSTART
+; GFX8-GISEL-NEXT: ; use v[0:1]
+; GFX8-GISEL-NEXT: ;;#ASMEND
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: undef_lo3_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -112,20 +295,49 @@ define void @undef_lo3_v4i16(i16 %arg0) {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: undef_lo3_v4i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:1]
-; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo3_v4i16:
+; GFX11-FAKE: ; %bb.0:
+; GFX11-FAKE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT: ;;#ASMSTART
+; GFX11-FAKE-NEXT: ; use v[0:1]
+; GFX11-FAKE-NEXT: ;;#ASMEND
+; GFX11-FAKE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo3_v4i16:
+; GFX11-TRUE: ; %bb.0:
+; GFX11-TRUE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT: ;;#ASMSTART
+; GFX11-TRUE-NEXT: ; use v[0:1]
+; GFX11-TRUE-NEXT: ;;#ASMEND
+; GFX11-TRUE-NEXT: s_setpc_b64 s[30:31]
%undef.lo = insertelement <4 x i16> poison, i16 %arg0, i32 1
call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo);
ret void
}
define void @undef_lo3_v4f16(half %arg0) {
+; GFX8-SDAG-LABEL: undef_lo3_v4f16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-SDAG-NEXT: ;;#ASMSTART
+; GFX8-SDAG-NEXT: ; use v[0:1]
+; GFX8-SDAG-NEXT: ;;#ASMEND
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo3_v4f16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-GISEL-NEXT: ;;#ASMSTART
+; GFX8-GISEL-NEXT: ; use v[0:1]
+; GFX8-GISEL-NEXT: ;;#ASMEND
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: undef_lo3_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -135,20 +347,50 @@ define void @undef_lo3_v4f16(half %arg0) {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: undef_lo3_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:1]
-; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE-LABEL: undef_lo3_v4f16:
+; GFX11-FAKE: ; %bb.0:
+; GFX11-FAKE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE-NEXT: ;;#ASMSTART
+; GFX11-FAKE-NEXT: ; use v[0:1]
+; GFX11-FAKE-NEXT: ;;#ASMEND
+; GFX11-FAKE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE-LABEL: undef_lo3_v4f16:
+; GFX11-TRUE: ; %bb.0:
+; GFX11-TRUE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE-NEXT: ;;#ASMSTART
+; GFX11-TRUE-NEXT: ; use v[0:1]
+; GFX11-TRUE-NEXT: ;;#ASMEND
+; GFX11-TRUE-NEXT: s_setpc_b64 s[30:31]
%undef.lo = insertelement <4 x half> poison, half %arg0, i32 1
call void asm sideeffect "; use $0", "v"(<4 x half> %undef.lo);
ret void
}
define void @undef_lo2_v4i16(<2 x i16> %arg0) {
+; GFX8-SDAG-LABEL: undef_lo2_v4i16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-SDAG-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-SDAG-NEXT: ;;#ASMSTART
+; GFX8-SDAG-NEXT: ; use v[0:1]
+; GFX8-SDAG-NEXT: ;;#ASMEND
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: undef_lo2_v4i16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-GISEL-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-GISEL-NEXT: ;;#ASMSTART
+; GFX8-GISEL-NEXT: ; use v[0:1]
+; GFX8-GISEL-NEXT: ;;#ASM...
[truncated]
|
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 | ||
; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The difference here between SDAG and GlobalISel for GFX8 looks like a legalizer issue. GFX8 makes different choices because of the hasVOP3P check.
d8c1544
to
27be0cf
Compare
- Pattern with IMPLICIT_DEF failed to generate an entry in MatchTable and did not report an error, just silently failed. This is fixed by casting IMPLICIT_DEF to the appropriate type. This also fixes selecting "build_vector s16, undef" for GlobalISel with True16.
- Add pattern for "build_vector undef, s16" that will work for GlobalISel. True16 GlobalISel has a G_TRUNC that it needs to deal with.
- Use REG_SEQUENCE for Real16 patterns instead of V_LSHLREV_B32_e64 to generate more optimal code.
27be0cf
to
345937e
Compare
did not report an error, just silently failed. This is fixed by casting
IMPLICIT_DEF to appropriate type. This also fixes selecting
"build_vector s16, undef" for GlobalISel with True16.
True16 GlobalISel has a G_TRUNC that it needs to deal with.
generate more optimal code.