Skip to content

Commit 2b19266

Browse files
committed
[AMDGPU][GlobalISel] Add combines with or/and that only use half of 64-bit values
1 parent dfe3805 commit 2b19266

15 files changed

+688
-704
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,21 @@ def zext_of_shift_amount_combines : GICombineGroup<[
151151
canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
152152
]>;
153153

154+
// (or i64:x, (zext i32:y)) -> i64:(merge (or i32:y, lo_32(x)), hi_32(x))
//
// The upper 32 bits of a zero-extended s32 are all zero, so OR-ing it into a
// 64-bit value can only change the low half. The rule matches a G_OR (the
// root, $dst) whose second operand is defined by a G_ZEXT from an i32, and
// hands the root instruction plus the 64-bit and 32-bit source registers to
// AMDGPUCombinerHelper::applyCombineOrS64S32 to build the replacement.
155+
def combine_or_s64_s32 : GICombineRule<
156+
(defs root:$dst),
157+
(match (G_ZEXT $zext_val, i32:$src_s32):$zext_inst,
158+
(G_OR $dst, i64:$src_s64, $zext_val):$dst),
159+
(apply [{ Helper.applyCombineOrS64S32(*${dst}, ${src_s64}.getReg(), ${src_s32}.getReg()); }])>;
160+
161+
// (and i64:x, 0xFFFFFFFF00000000) -> i64:(merge i32:0, hi_32(x))
//
// The mask keeps the upper 32 bits and clears the lower 32 bits, so the
// result is the high half of x paired with a zero low half. The rule matches
// a G_AND (the root, $dst) whose second operand is exactly that G_CONSTANT,
// and hands the root instruction plus the 64-bit source register to
// AMDGPUCombinerHelper::applyCombineAndS64HiMask to build the replacement.
162+
def combine_and_s64_himask : GICombineRule<
163+
(defs root:$dst),
164+
(match (G_CONSTANT $const, 0xFFFFFFFF00000000),
165+
(G_AND $dst, i64:$src_s64, $const):$dst),
166+
(apply [{ Helper.applyCombineAndS64HiMask(*${dst}, ${src_s64}.getReg()); }])>;
167+
168+
154169
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
155170
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
156171
// saves one instruction compared to the promotion.
@@ -180,15 +195,17 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
180195
def AMDGPUPreLegalizerCombiner: GICombiner<
181196
"AMDGPUPreLegalizerCombinerImpl",
182197
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
183-
foldable_fneg, combine_shuffle_vector_to_build_vector]> {
198+
foldable_fneg, combine_shuffle_vector_to_build_vector,
199+
combine_or_s64_s32, combine_and_s64_himask]> {
184200
let CombineAllMethodName = "tryCombineAllImpl";
185201
}
186202

187203
def AMDGPUPostLegalizerCombiner: GICombiner<
188204
"AMDGPUPostLegalizerCombinerImpl",
189205
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
190206
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
191-
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
207+
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
208+
combine_or_s64_s32, combine_and_s64_himask]> {
192209
let CombineAllMethodName = "tryCombineAllImpl";
193210
}
194211

llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,3 +516,29 @@ bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
516516

517517
return true;
518518
}
519+
520+
/// Apply (or i64:x, (zext i32:y)) -> (merge (or i32:y, lo_32(x)), hi_32(x)).
///
/// \p MI is the instruction being replaced; the register defined by its
/// operand 0 becomes the destination of the new merge. \p SrcS64 is the
/// 64-bit operand and \p SrcS32 is the 32-bit value that was zero-extended.
void AMDGPUCombinerHelper::applyCombineOrS64S32(MachineInstr &MI,
                                                Register SrcS64,
                                                Register SrcS32) const {
  const LLT S32 = LLT::scalar(32);

  // Split the 64-bit operand; result 0 is used as the low half and result 1
  // as the high half.
  auto Halves = Builder.buildUnmerge(S32, SrcS64);
  Register Lo = Halves.getReg(0);
  Register Hi = Halves.getReg(1);

  // Only the low half is combined with the 32-bit value; the high half is
  // forwarded untouched into the merge.
  Register OrLo = Builder.buildOr(S32, SrcS32, Lo).getReg(0);
  Builder.buildMergeValues(MI.getOperand(0).getReg(), {OrLo, Hi});

  MI.eraseFromParent();
}
533+
534+
/// Apply (and i64:x, 0xFFFFFFFF00000000) -> (merge i32:0, hi_32(x)).
///
/// \p MI is the instruction being replaced; the register defined by its
/// operand 0 becomes the destination of the new merge. \p SrcS64 is the
/// 64-bit operand whose high half is preserved.
void AMDGPUCombinerHelper::applyCombineAndS64HiMask(MachineInstr &MI,
                                                    Register SrcS64) const {
  const LLT S32 = LLT::scalar(32);

  // Only result 1 (the high half) of the unmerge is needed.
  Register Hi = Builder.buildUnmerge(S32, SrcS64).getReg(1);

  // The low half of the result is a constant zero.
  Register ZeroLo = Builder.buildConstant(S32, 0).getReg(0);
  Builder.buildMergeValues(MI.getOperand(0).getReg(), {ZeroLo, Hi});

  MI.eraseFromParent();
}

llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ class AMDGPUCombinerHelper : public CombinerHelper {
4343
bool matchCombineFmulWithSelectToFldexp(
4444
MachineInstr &MI, MachineInstr &Sel,
4545
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
46+
47+
/// Replace \p MI, a 64-bit OR of \p S64 with a zero-extended \p S32, by a
/// 32-bit OR of \p S32 with the low half of \p S64, merged back together with
/// the unchanged high half of \p S64. \p MI is erased.
void applyCombineOrS64S32(MachineInstr &MI, Register S64, Register S32) const;
48+
49+
/// Replace \p MI, a 64-bit AND of \p S64 with 0xFFFFFFFF00000000, by a merge
/// of a 32-bit zero (low half) and the high half of \p S64. \p MI is erased.
void applyCombineAndS64HiMask(MachineInstr &MI, Register S64) const;
4650
};
4751

4852
} // namespace llvm

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-and-or-s64-s32.mir

Lines changed: 22 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,9 @@ body: |
1313
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
1414
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
1515
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
16-
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
17-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY2]](s32)
18-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = disjoint G_OR [[MV]], [[ZEXT]]
19-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[OR]](s64)
20-
; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
21-
; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
16+
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[COPY]]
17+
; CHECK-NEXT: $sgpr0 = COPY [[OR]](s32)
18+
; CHECK-NEXT: $sgpr1 = COPY [[COPY1]](s32)
2219
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
2320
%0:_(s32) = COPY $sgpr0
2421
%1:_(s32) = COPY $sgpr1
@@ -40,14 +37,10 @@ body: |
4037
; CHECK-LABEL: name: test_combine_and_s64_himask
4138
; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
4239
; CHECK-NEXT: {{ $}}
43-
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
44-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
45-
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
46-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
47-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV]], [[C]]
48-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
49-
; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
50-
; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
40+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr1
41+
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
42+
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
43+
; CHECK-NEXT: $sgpr1 = COPY [[COPY]](s32)
5144
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
5245
%0:_(s32) = COPY $sgpr0
5346
%1:_(s32) = COPY $sgpr1
@@ -68,17 +61,10 @@ body: |
6861
; CHECK-LABEL: name: test_combined
6962
; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
7063
; CHECK-NEXT: {{ $}}
71-
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
72-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
73-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
74-
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
75-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
76-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV]], [[C]]
77-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY2]](s32)
78-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = disjoint G_OR [[AND]], [[ZEXT]]
79-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[OR]](s64)
80-
; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
81-
; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
64+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr1
65+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
66+
; CHECK-NEXT: $sgpr0 = COPY [[COPY1]](s32)
67+
; CHECK-NEXT: $sgpr1 = COPY [[COPY]](s32)
8268
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
8369
%0:_(s32) = COPY $sgpr0
8470
%1:_(s32) = COPY $sgpr1
@@ -105,9 +91,10 @@ body: |
10591
; CHECK-NEXT: {{ $}}
10692
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
10793
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
108-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY1]](s32)
109-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = disjoint G_OR [[COPY]], [[ZEXT]]
110-
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[OR]](s64)
94+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
95+
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[UV]]
96+
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
97+
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
11198
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
11299
%0:_(s64) = COPY $sgpr0_sgpr1
113100
%1:_(s32) = COPY $sgpr2
@@ -126,9 +113,10 @@ body: |
126113
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
127114
; CHECK-NEXT: {{ $}}
128115
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
129-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
130-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
131-
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[AND]](s64)
116+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
117+
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
118+
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
119+
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
132120
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
133121
%0:_(s64) = COPY $sgpr0_sgpr1
134122
%1:_(s64) = G_CONSTANT i64 -4294967296
@@ -148,11 +136,9 @@ body: |
148136
; CHECK-NEXT: {{ $}}
149137
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
150138
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
151-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
152-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
153-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY1]](s32)
154-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[ZEXT]]
155-
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[OR]](s64)
139+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
140+
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[UV1]](s32)
141+
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
156142
%1:_(s64) = COPY $vgpr0_vgpr1
157143
%0:_(s32) = COPY $vgpr2
158144
%2:_(s64) = G_CONSTANT i64 -4294967296

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -227,33 +227,30 @@ exit:
227227
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
228228
; GFX10-LABEL: single_lane_execution_attribute:
229229
; GFX10: ; %bb.0: ; %.entry
230-
; GFX10-NEXT: s_mov_b32 s6, 0
231230
; GFX10-NEXT: s_getpc_b64 s[4:5]
232-
; GFX10-NEXT: s_mov_b32 s7, -1
233-
; GFX10-NEXT: s_mov_b32 s2, s1
234-
; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
235-
; GFX10-NEXT: s_mov_b32 s1, 0
231+
; GFX10-NEXT: s_mov_b32 s2, s0
232+
; GFX10-NEXT: s_mov_b32 s3, s5
236233
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
237-
; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1]
238-
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
234+
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
239235
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
240236
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
241237
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
242238
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
243-
; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo
239+
; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
244240
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
245241
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
246-
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
242+
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
243+
; GFX10-NEXT: s_mov_b32 s2, 0
247244
; GFX10-NEXT: s_waitcnt vmcnt(0)
248245
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
249246
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
250247
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
251248
; GFX10-NEXT: s_mov_b32 s3, 0
252249
; GFX10-NEXT: .LBB4_2: ; %.preheader
253250
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
254-
; GFX10-NEXT: v_mov_b32_e32 v3, s1
251+
; GFX10-NEXT: v_mov_b32_e32 v3, s2
255252
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
256-
; GFX10-NEXT: s_add_i32 s1, s1, 4
253+
; GFX10-NEXT: s_add_i32 s2, s2, 4
257254
; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
258255
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
259256
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -262,19 +259,19 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
262259
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
263260
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
264261
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
265-
; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo
266-
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
262+
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
263+
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
267264
; GFX10-NEXT: s_branch .LBB4_6
268265
; GFX10-NEXT: .LBB4_4:
269-
; GFX10-NEXT: s_mov_b32 s1, exec_lo
266+
; GFX10-NEXT: s_mov_b32 s2, exec_lo
270267
; GFX10-NEXT: ; implicit-def: $vgpr1
271-
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s1
268+
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
272269
; GFX10-NEXT: s_cbranch_vccz .LBB4_6
273270
; GFX10-NEXT: ; %bb.5: ; %.19
274271
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
275272
; GFX10-NEXT: v_or_b32_e32 v1, 2, v1
276273
; GFX10-NEXT: .LBB4_6: ; %.22
277-
; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2
274+
; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2
278275
; GFX10-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
279276
; GFX10-NEXT: s_endpgm
280277
.entry:

0 commit comments

Comments
 (0)