Skip to content

Commit 62fdb1f

Browse files
committed
[DAGCombine] Skip PostInc combine with later users
When deciding whether to generate a post-inc load/store, look at the other memory nodes that use the same base address and, if any of them come after the current node, don't do the combine. The change only seems to affect the Arm backend, which I was surprised at, but it appears to fix a lot of our issues around MVE masked load/stores having to store a temporary address after an early post-increment on a shared base address. Differential Revision: https://reviews.llvm.org/D75847
1 parent 8e45eaf commit 62fdb1f

File tree

5 files changed

+48
-43
lines changed

5 files changed

+48
-43
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14248,10 +14248,25 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
1424814248
if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
1424914249
return false;
1425014250

14251+
SmallPtrSet<const SDNode *, 32> Visited;
1425114252
for (SDNode *Use : BasePtr.getNode()->uses()) {
1425214253
if (Use == Ptr.getNode())
1425314254
continue;
1425414255

14256+
// No if there's a later user which could perform the index instead.
14257+
if (isa<MemSDNode>(Use)) {
14258+
bool IsLoad = true;
14259+
bool IsMasked = false;
14260+
SDValue OtherPtr;
14261+
if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
14262+
IsMasked, OtherPtr, TLI)) {
14263+
SmallVector<const SDNode *, 2> Worklist;
14264+
Worklist.push_back(Use);
14265+
if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
14266+
return false;
14267+
}
14268+
}
14269+
1425514270
// If all the uses are load / store addresses, then don't do the
1425614271
// transformation.
1425714272
if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {

llvm/test/CodeGen/Thumb/frame-access.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -404,8 +404,8 @@ entry:
404404
; CHECK-NEXT: sub sp, #508
405405
; CHECK-NEXT: sub sp, #8
406406
; Argument addresses computed relative to BP
407-
; CHECK: adds r0, r6, #7
408-
; CHECK-NEXT: adds r0, #13
407+
; CHECK: adds r4, r6, #7
408+
; CHECK-NEXT: adds r4, #13
409409
; CHECK: adds r1, r6, #7
410410
; CHECK-NEXT: adds r1, #9
411411
; CHECK: adds r5, r6, #7

llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -400,18 +400,16 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
400400
; CHECK-NEXT: cmp r2, #0
401401
; CHECK-NEXT: it eq
402402
; CHECK-NEXT: popeq {r7, pc}
403-
; CHECK-NEXT: mov r3, r0
404403
; CHECK-NEXT: dlstp.32 lr, r2
405404
; CHECK-NEXT: .LBB4_1: @ %bb9
406405
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
407406
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
408407
; CHECK-NEXT: vcmp.i32 ne, q0, zr
409408
; CHECK-NEXT: vpst
410-
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
409+
; CHECK-NEXT: vldrwt.u32 q1, [r0]
411410
; CHECK-NEXT: vmul.i32 q0, q1, q0
412411
; CHECK-NEXT: vpst
413-
; CHECK-NEXT: vstrwt.32 q0, [r0]
414-
; CHECK-NEXT: mov r0, r3
412+
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
415413
; CHECK-NEXT: letp lr, .LBB4_1
416414
; CHECK-NEXT: @ %bb.2: @ %bb27
417415
; CHECK-NEXT: pop {r7, pc}
@@ -464,22 +462,20 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
464462
; CHECK-NEXT: bic r12, r12, #3
465463
; CHECK-NEXT: sub.w r12, r12, #4
466464
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
467-
; CHECK-NEXT: mov r12, r0
468465
; CHECK-NEXT: dls lr, lr
469466
; CHECK-NEXT: .LBB5_1: @ %bb12
470467
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
471468
; CHECK-NEXT: vctp.32 r3
472469
; CHECK-NEXT: vpst
473-
; CHECK-NEXT: vldrwt.u32 q0, [r12], #16
470+
; CHECK-NEXT: vldrwt.u32 q0, [r0]
474471
; CHECK-NEXT: vpttt.i32 ne, q0, zr
475472
; CHECK-NEXT: vcmpt.s32 le, q0, r2
476473
; CHECK-NEXT: vctpt.32 r3
477474
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
478475
; CHECK-NEXT: subs r3, #4
479476
; CHECK-NEXT: vmul.i32 q0, q1, q0
480477
; CHECK-NEXT: vpst
481-
; CHECK-NEXT: vstrwt.32 q0, [r0]
482-
; CHECK-NEXT: mov r0, r12
478+
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
483479
; CHECK-NEXT: le lr, .LBB5_1
484480
; CHECK-NEXT: @ %bb.2: @ %bb32
485481
; CHECK-NEXT: pop {r7, pc}

llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,13 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
88
; CHECK-NEXT: cmp r2, #0
99
; CHECK-NEXT: it eq
1010
; CHECK-NEXT: popeq {r7, pc}
11-
; CHECK-NEXT: mov r3, r0
1211
; CHECK-NEXT: dlstp.16 lr, r2
13-
; CHECK: .LBB0_1: @ %vector.body
14-
; CHECK: vldrb.s16 q0, [r1], #8
15-
; CHECK-NEXT: vldrh.u16 q1, [r3], #16
12+
; CHECK-NEXT: .LBB0_1: @ %vector.body
13+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
14+
; CHECK-NEXT: vldrb.s16 q0, [r1], #8
15+
; CHECK-NEXT: vldrh.u16 q1, [r0]
1616
; CHECK-NEXT: vadd.i16 q0, q1, q0
17-
; CHECK-NEXT: vstrh.16 q0, [r0]
18-
; CHECK-NEXT: mov r0, r3
17+
; CHECK-NEXT: vstrh.16 q0, [r0], #16
1918
; CHECK-NEXT: letp lr, .LBB0_1
2019
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
2120
; CHECK-NEXT: pop {r7, pc}
@@ -63,14 +62,13 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
6362
; CHECK-NEXT: cmp r2, #0
6463
; CHECK-NEXT: it eq
6564
; CHECK-NEXT: popeq {r7, pc}
66-
; CHECK-NEXT: mov r3, r0
6765
; CHECK-NEXT: dlstp.16 lr, r2
68-
; CHECK: .LBB1_1: @ %vector.body
69-
; CHECK: vldrb.u16 q0, [r1], #8
70-
; CHECK-NEXT: vldrh.u16 q1, [r3], #16
66+
; CHECK-NEXT: .LBB1_1: @ %vector.body
67+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
68+
; CHECK-NEXT: vldrb.u16 q0, [r1], #8
69+
; CHECK-NEXT: vldrh.u16 q1, [r0]
7170
; CHECK-NEXT: vadd.i16 q0, q1, q0
72-
; CHECK-NEXT: vstrh.16 q0, [r0]
73-
; CHECK-NEXT: mov r0, r3
71+
; CHECK-NEXT: vstrh.16 q0, [r0], #16
7472
; CHECK-NEXT: letp lr, .LBB1_1
7573
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
7674
; CHECK-NEXT: pop {r7, pc}
@@ -118,14 +116,13 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
118116
; CHECK-NEXT: cmp r2, #0
119117
; CHECK-NEXT: it eq
120118
; CHECK-NEXT: popeq {r7, pc}
121-
; CHECK-NEXT: mov r3, r0
122119
; CHECK-NEXT: dlstp.32 lr, r2
123-
; CHECK: .LBB2_1: @ %vector.body
124-
; CHECK: vldrh.s32 q0, [r1], #8
125-
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
120+
; CHECK-NEXT: .LBB2_1: @ %vector.body
121+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
122+
; CHECK-NEXT: vldrh.s32 q0, [r1], #8
123+
; CHECK-NEXT: vldrw.u32 q1, [r0]
126124
; CHECK-NEXT: vadd.i32 q0, q1, q0
127-
; CHECK-NEXT: vstrw.32 q0, [r0]
128-
; CHECK-NEXT: mov r0, r3
125+
; CHECK-NEXT: vstrw.32 q0, [r0], #16
129126
; CHECK-NEXT: letp lr, .LBB2_1
130127
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
131128
; CHECK-NEXT: pop {r7, pc}
@@ -173,14 +170,13 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
173170
; CHECK-NEXT: cmp r2, #0
174171
; CHECK-NEXT: it eq
175172
; CHECK-NEXT: popeq {r7, pc}
176-
; CHECK-NEXT: mov r3, r0
177173
; CHECK-NEXT: dlstp.32 lr, r2
178-
; CHECK: .LBB3_1: @ %vector.body
179-
; CHECK: vldrh.u32 q0, [r1], #8
180-
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
174+
; CHECK-NEXT: .LBB3_1: @ %vector.body
175+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
176+
; CHECK-NEXT: vldrh.u32 q0, [r1], #8
177+
; CHECK-NEXT: vldrw.u32 q1, [r0]
181178
; CHECK-NEXT: vadd.i32 q0, q1, q0
182-
; CHECK-NEXT: vstrw.32 q0, [r0]
183-
; CHECK-NEXT: mov r0, r3
179+
; CHECK-NEXT: vstrw.32 q0, [r0], #16
184180
; CHECK-NEXT: letp lr, .LBB3_1
185181
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
186182
; CHECK-NEXT: pop {r7, pc}

llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -133,26 +133,24 @@ define void @fma_tailpred(float* noalias nocapture readonly %A, float* noalias n
133133
; CHECK-NEXT: bic r12, r12, #3
134134
; CHECK-NEXT: mov.w lr, #1
135135
; CHECK-NEXT: sub.w r12, r12, #4
136-
; CHECK-NEXT: subs r3, #1
137136
; CHECK-NEXT: vldrw.u32 q0, [r4]
138-
; CHECK-NEXT: vdup.32 q1, r3
139137
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
140-
; CHECK-NEXT: mov.w r12, #0
141-
; CHECK-NEXT: mov r3, r2
138+
; CHECK-NEXT: sub.w r12, r3, #1
139+
; CHECK-NEXT: movs r3, #0
140+
; CHECK-NEXT: vdup.32 q1, r12
142141
; CHECK-NEXT: dls lr, lr
143142
; CHECK-NEXT: .LBB1_2: @ %vector.body
144143
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
145-
; CHECK-NEXT: vdup.32 q2, r12
146-
; CHECK-NEXT: add.w r12, r12, #4
144+
; CHECK-NEXT: vdup.32 q2, r3
145+
; CHECK-NEXT: adds r3, #4
147146
; CHECK-NEXT: vorr q2, q2, q0
148147
; CHECK-NEXT: vpttt.u32 cs, q1, q2
149148
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
150149
; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
151-
; CHECK-NEXT: vldrwt.u32 q4, [r3], #16
150+
; CHECK-NEXT: vldrwt.u32 q4, [r2]
152151
; CHECK-NEXT: vfma.f32 q4, q3, q2
153152
; CHECK-NEXT: vpst
154-
; CHECK-NEXT: vstrwt.32 q4, [r2]
155-
; CHECK-NEXT: mov r2, r3
153+
; CHECK-NEXT: vstrwt.32 q4, [r2], #16
156154
; CHECK-NEXT: le lr, .LBB1_2
157155
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup
158156
; CHECK-NEXT: vpop {d8, d9}

0 commit comments

Comments
 (0)