Commit 3dfd939

[AMDGPU] gfx1250 V_{MIN|MAX}_{I|U}64 opcodes (#151256)
1 parent 05bfcd8 commit 3dfd939

13 files changed: +1421 −179 lines


llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 24 additions & 7 deletions
@@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0);
 
   if (ST.hasVOP3PInsts()) {
-    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
-        .legalFor({S32, S16, V2S16})
-        .clampMaxNumElements(0, S16, 2)
-        .minScalar(0, S16)
-        .widenScalarToNextPow2(0)
-        .scalarize(0)
-        .lower();
+    getActionDefinitionsBuilder(G_ABS)
+        .legalFor({S32, S16, V2S16})
+        .clampMaxNumElements(0, S16, 2)
+        .minScalar(0, S16)
+        .widenScalarToNextPow2(0)
+        .scalarize(0)
+        .lower();
+    if (ST.hasIntMinMax64()) {
+      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+          .legalFor({S32, S16, S64, V2S16})
+          .clampMaxNumElements(0, S16, 2)
+          .minScalar(0, S16)
+          .widenScalarToNextPow2(0)
+          .scalarize(0)
+          .lower();
+    } else {
+      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+          .legalFor({S32, S16, V2S16})
+          .clampMaxNumElements(0, S16, 2)
+          .minScalar(0, S16)
+          .widenScalarToNextPow2(0)
+          .scalarize(0)
+          .lower();
+    }
   } else {
     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
         .legalFor({S32, S16})
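In effect, G_ABS keeps its previous ruleset (64-bit abs is still expanded), while the four min/max opcodes gain S64 in their legal set when the subtarget reports hasIntMinMax64(). A minimal IR sketch of the resulting behaviour, based on the new test at the end of this commit (the expected ISA in the comments is abbreviated from its CHECK lines):

declare i64 @llvm.umin.i64(i64, i64)
declare i64 @llvm.abs.i64(i64, i1)

; i64 min/max is selected directly on gfx1250:
define i64 @umin64(i64 %a, i64 %b) {
  ; expected: v_min_u64 v[0:1], v[0:1], v[2:3]
  %r = call i64 @llvm.umin.i64(i64 %a, i64 %b)
  ret i64 %r
}

; i64 abs has no gfx1250 instruction and is still expanded
; (ashr + add + xor), matching the unchanged G_ABS rules:
define i64 @abs64(i64 %a) {
  %r = call i64 @llvm.abs.i64(i64 %a, i1 false)
  ret i64 %r
}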

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 12 additions & 4 deletions
@@ -4009,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_SADDE:
   case AMDGPU::G_USUBE:
   case AMDGPU::G_SSUBE:
-  case AMDGPU::G_SMIN:
-  case AMDGPU::G_SMAX:
-  case AMDGPU::G_UMIN:
-  case AMDGPU::G_UMAX:
   case AMDGPU::G_ABS:
   case AMDGPU::G_SHUFFLE_VECTOR:
   case AMDGPU::G_SBFX:
@@ -4022,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     if (isSALUMapping(MI))
       return getDefaultMappingSOP(MI);
     return getDefaultMappingVOP(MI);
+  case AMDGPU::G_SMIN:
+  case AMDGPU::G_SMAX:
+  case AMDGPU::G_UMIN:
+  case AMDGPU::G_UMAX:
+    if (isSALUMapping(MI)) {
+      // There are no scalar 64-bit min and max, use vector instruction instead.
+      if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 &&
+          Subtarget.hasIntMinMax64())
+        return getDefaultMappingVOP(MI);
+      return getDefaultMappingSOP(MI);
+    }
+    return getDefaultMappingVOP(MI);
   case AMDGPU::G_FADD:
   case AMDGPU::G_FSUB:
   case AMDGPU::G_FMUL:
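The register-bank change means a 64-bit min/max whose operands are uniform (SGPR) is still routed to the VALU on gfx1250, since no SALU 64-bit form exists; the result is moved back to SGPRs with v_readfirstlane. The uniform-operand case from the new test illustrates this (expected ISA abbreviated from its CHECK lines):

declare i64 @llvm.umin.i64(i64, i64)

define amdgpu_ps i64 @umin64_uniform(i64 inreg %a, i64 inreg %b) {
  ; expected on gfx1250:
  ;   v_min_u64 v[0:1], s[0:1], s[2:3]
  ;   v_readfirstlane_b32 s0, v0
  ;   v_readfirstlane_b32 s1, v1
  %r = call i64 @llvm.umin.i64(i64 %a, i64 %b)
  ret i64 %r
}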

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
@@ -1532,6 +1532,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // instructions.
   bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
 
+  // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
+  bool hasIntMinMax64() const { return GFX1250Insts; }
+
   // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
   bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
 
llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 0 deletions
@@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        Custom);
   }
 
+  if (Subtarget->hasIntMinMax64())
+    setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64,
+                       Legal);
+
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
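This is the SelectionDAG counterpart of the GlobalISel legalizer change: with hasIntMinMax64(), i64 ISD::SMIN/UMIN/SMAX/UMAX are marked Legal so the patterns attached to the new VOP3 pseudos can match. A hedged sketch follows; the committed test only runs with -global-isel, and the default DAG path is assumed to select the same instructions:

; RUN (hypothetical, not part of this commit):
;   llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s
declare i64 @llvm.smin.i64(i64, i64)

define i64 @smin64_dag(i64 %a, i64 %b) {
  ; assumed to select: v_min_i64 v[0:1], v[0:1], v[2:3]
  %r = call i64 @llvm.smin.i64(i64 %a, i64 %b)
  ret i64 %r
}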

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 11 additions & 0 deletions
@@ -202,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f
 } // End SchedRW = [WriteDoubleAdd]
 } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1
 
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in {
+defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>;
+defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>;
+defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>;
+defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>;
+} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd]
+
 } // End isReMaterializable = 1
 
 let Uses = [MODE, VCC, EXEC] in {
@@ -1810,6 +1817,10 @@ defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>;
 defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>;
 defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>;
 defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>;
+defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>;
+defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>;
+defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>;
+defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>;
 
 defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
 defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
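The first hunk defines the pseudos with the 64-bit VOP3 profile and ties them to the smax/umax/smin/umin selection nodes; the second hunk assigns their gfx1250 VOP3 opcodes (0x318–0x31b). A minimal sketch of the end-to-end mapping, mirroring the new test's CHECK lines:

declare i64 @llvm.smax.i64(i64, i64)

define i64 @smax64(i64 %a, i64 %b) {
  ; smax / G_SMAX -> V_MAX_I64 pseudo -> gfx1250 encoding 0x31b:
  ;   v_max_i64 v[0:1], v[0:1], v[2:3]
  %r = call i64 @llvm.smax.i64(i64 %a, i64 %b)
  ret i64 %r
}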
New GlobalISel codegen test for 64-bit min/max on gfx1250 — Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck %s

declare i64 @llvm.umin.i64(i64, i64)
declare i64 @llvm.umax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)
declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.abs.i64(i64, i1)

declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)
declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)

define i64 @test_umin_i64(i64 %a, i64 %b) {
; CHECK-LABEL: test_umin_i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_min_u64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call i64 @llvm.umin.i64(i64 %a, i64 %b)
  ret i64 %r
}

define i64 @test_umax_i64(i64 %a, i64 %b) {
; CHECK-LABEL: test_umax_i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_max_u64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call i64 @llvm.umax.i64(i64 %a, i64 %b)
  ret i64 %r
}

define i64 @test_smin_i64(i64 %a, i64 %b) {
; CHECK-LABEL: test_smin_i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_min_i64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call i64 @llvm.smin.i64(i64 %a, i64 %b)
  ret i64 %r
}

define i64 @test_smax_i64(i64 %a, i64 %b) {
; CHECK-LABEL: test_smax_i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_max_i64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call i64 @llvm.smax.i64(i64 %a, i64 %b)
  ret i64 %r
}

define <4 x i64> @test_umin_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_umin_v4i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_min_u64 v[0:1], v[0:1], v[8:9]
; CHECK-NEXT:    v_min_u64 v[2:3], v[2:3], v[10:11]
; CHECK-NEXT:    v_min_u64 v[4:5], v[4:5], v[12:13]
; CHECK-NEXT:    v_min_u64 v[6:7], v[6:7], v[14:15]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %a, <4 x i64> %b)
  ret <4 x i64> %r
}

define <4 x i64> @test_umax_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_umax_v4i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_max_u64 v[0:1], v[0:1], v[8:9]
; CHECK-NEXT:    v_max_u64 v[2:3], v[2:3], v[10:11]
; CHECK-NEXT:    v_max_u64 v[4:5], v[4:5], v[12:13]
; CHECK-NEXT:    v_max_u64 v[6:7], v[6:7], v[14:15]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %a, <4 x i64> %b)
  ret <4 x i64> %r
}

define <4 x i64> @test_smin_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_smin_v4i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_min_i64 v[0:1], v[0:1], v[8:9]
; CHECK-NEXT:    v_min_i64 v[2:3], v[2:3], v[10:11]
; CHECK-NEXT:    v_min_i64 v[4:5], v[4:5], v[12:13]
; CHECK-NEXT:    v_min_i64 v[6:7], v[6:7], v[14:15]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %a, <4 x i64> %b)
  ret <4 x i64> %r
}

define <4 x i64> @test_smax_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_smax_v4i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_max_i64 v[0:1], v[0:1], v[8:9]
; CHECK-NEXT:    v_max_i64 v[2:3], v[2:3], v[10:11]
; CHECK-NEXT:    v_max_i64 v[4:5], v[4:5], v[12:13]
; CHECK-NEXT:    v_max_i64 v[6:7], v[6:7], v[14:15]
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %a, <4 x i64> %b)
  ret <4 x i64> %r
}

define i64 @test_abs_i64(i64 %a) {
; CHECK-LABEL: test_abs_i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
; CHECK-NEXT:    s_wait_kmcnt 0x0
; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_mov_b32_e32 v3, v2
; CHECK-NEXT:    v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
; CHECK-NEXT:    s_set_pc_i64 s[30:31]
  %r = call i64 @llvm.abs.i64(i64 %a, i1 0)
  ret i64 %r
}

define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_umin_i64_s:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_min_u64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %r = call i64 @llvm.umin.i64(i64 %a, i64 %b)
  ret i64 %r
}

define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_umax_i64_s:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_max_u64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %r = call i64 @llvm.umax.i64(i64 %a, i64 %b)
  ret i64 %r
}

define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_smin_i64_s:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_min_i64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %r = call i64 @llvm.smin.i64(i64 %a, i64 %b)
  ret i64 %r
}

define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_smax_i64_s:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_max_i64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %r = call i64 @llvm.smax.i64(i64 %a, i64 %b)
  ret i64 %r
}

define amdgpu_ps i64 @test_abs_i64_s(i64 inreg %a) {
; CHECK-LABEL: test_abs_i64_s:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_ashr_i32 s2, s1, 31
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; CHECK-NEXT:    s_mov_b32 s3, s2
; CHECK-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
; CHECK-NEXT:    ; return to shader part epilog
  %r = call i64 @llvm.abs.i64(i64 %a, i1 0)
  ret i64 %r
}
