
Commit b7c0518

[ARM] and, or, xor and add with shl combine
The generic dag combiner will fold:

  (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)

This can create constants which are too large to use as an immediate.
Many ALU operations are also capable of performing the shl, so we can
unfold the transformation to prevent a mov imm instruction from being
generated.

Other patterns, such as b + ((a << 1) | 510), can also be simplified
in the same manner.

Differential Revision: https://reviews.llvm.org/D38084

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317197 91177308-0d34-0410-b5e6-96231b3b80d8
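
As a concrete illustration, using the constants from the unfold1 test added
below: 510 == 255 << 1, and shl distributes over or, so
(x << 1) | 510 == (x | 255) << 1. The right-hand side needs only the 8-bit
constant 255, and the final shl can be folded into the user's
shifted-register operand. A standalone C++ sketch of the equivalence (not
LLVM code, purely illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Tests[] = {0u, 1u, 255u, 0xDEADBEEFu};
      for (uint32_t X : Tests) {
        uint32_t Folded   = (X << 1) | 510u; // 510 is not an ARM-mode immediate
        uint32_t Unfolded = (X | 255u) << 1; // 255 fits an 8-bit immediate
        assert(Folded == Unfolded);          // shl distributes over or
      }
      return 0;
    }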
1 parent 4746ebd commit b7c0518

2 files changed: +293 −7 lines changed

lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 120 additions & 7 deletions
@@ -9955,6 +9955,102 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
   return SDValue();
 }
 
+static SDValue PerformSHLSimplify(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *ST) {
+  // Allow the generic combiner to identify potential bswaps.
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  // DAG combiner will fold:
+  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
+  // Other code patterns that can also be modified have the following form:
+  // b + ((a << 1) | 510)
+  // b + ((a << 1) & 510)
+  // b + ((a << 1) ^ 510)
+  // b + ((a << 1) + 510)
+
+  // Many instructions can perform the shift for free, but it requires both
+  // the operands to be registers. If c1 << c2 is too large, a mov immediate
+  // instruction will be needed. So, unfold back to the original pattern if:
+  // - c1 and c2 are small enough that they don't require mov imms.
+  // - the user(s) of the node can perform an shl
+
+  // No shifted operands for 16-bit instructions.
+  if (ST->isThumb() && ST->isThumb1Only())
+    return SDValue();
+
+  // Check that all the users could perform the shl themselves.
+  for (auto U : N->uses()) {
+    switch (U->getOpcode()) {
+    default:
+      return SDValue();
+    case ISD::SUB:
+    case ISD::ADD:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::SETCC:
+    case ARMISD::CMP:
+      // Check that it's not already using a shl.
+      if (U->getOperand(0).getOpcode() == ISD::SHL ||
+          U->getOperand(1).getOpcode() == ISD::SHL)
+        return SDValue();
+      break;
+    }
+  }
+
+  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
+      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::SHL)
+    return SDValue();
+
+  SDValue SHL = N->getOperand(0);
+
+  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!C1ShlC2 || !C2)
+    return SDValue();
+
+  DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());
+
+  APInt C2Int = C2->getAPIntValue();
+  APInt C1Int = C1ShlC2->getAPIntValue();
+
+  // Check that performing a lshr will not lose any information.
+  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
+                                     C2Int.getBitWidth() - C2->getZExtValue());
+  if ((C1Int & Mask) != C1Int)
+    return SDValue();
+
+  // Shift the first constant.
+  C1Int.lshrInPlace(C2Int);
+
+  // The immediates are encoded as an 8-bit value that can be rotated.
+  unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
+  if (C1Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
+  if (C2Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue X = SHL.getOperand(0);
+  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
+                              DAG.getConstant(C1Int, dl, MVT::i32));
+  // Shift left to compensate for the lshr of C1Int.
+  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+
+  DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
+  return SDValue(N, 0);
+}
+
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
@@ -9963,6 +10059,10 @@ static SDValue PerformADDCombine(SDNode *N,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Only works one way, because it needs an immediate operand.
+  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+    return Result;
+
   // First try with the default operand order.
   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
     return Result;
@@ -10151,6 +10251,9 @@ static SDValue PerformANDCombine(SDNode *N,
     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
       return Result;
+
+    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+      return Result;
   }
 
   return SDValue();
@@ -10384,17 +10487,19 @@ static SDValue PerformORCombine(SDNode *N,
       return Result;
   }
 
-  // The code below optimizes (or (and X, Y), Z).
-  // The AND operand needs to have a single user to make these optimizations
-  // profitable.
   SDValue N0 = N->getOperand(0);
-  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
-    return SDValue();
   SDValue N1 = N->getOperand(1);
 
   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+
+    // The code below optimizes (or (and X, Y), Z).
+    // The AND operand needs to have a single user to make these optimizations
+    // profitable.
+    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+      return SDValue();
+
     APInt SplatUndef;
     unsigned SplatBitSize;
     bool HasAnyUndefs;
@@ -10427,8 +10532,13 @@ static SDValue PerformORCombine(SDNode *N,
 
   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
   // reasonable.
-  if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
-    return Res;
+  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
+      return Res;
+  }
+
+  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+    return Result;
 
   return SDValue();
 }
@@ -10446,6 +10556,9 @@ static SDValue PerformXORCombine(SDNode *N,
     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
       return Result;
+
+    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+      return Result;
   }
 
   return SDValue();
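
A note on the two constant checks in PerformSHLSimplify: the high-bits mask
test guarantees that the lshr loses nothing (no set bits of c1 << c2 fall off
when shifting back right), and the leading/trailing-zero test is a rough
stand-in for "fits an ARM modified immediate" (an 8-bit value rotated by an
even amount): any value whose significant bits span more than 8 positions
certainly cannot be encoded. A minimal host-side C++ sketch of the same
logic, assuming i32 and GCC/Clang builtins (not LLVM code, names
hypothetical):

    #include <cstdint>
    #include <cstdio>

    // Conservative immediate check mirroring the APInt leading/trailing
    // zero counts in the patch: at most 8 significant bit positions.
    static bool roughlyEncodable(uint32_t V) {
      if (V == 0)
        return true;
      unsigned Span = 32 - __builtin_clz(V) - __builtin_ctz(V);
      return Span <= 8;
    }

    // Returns true and sets C1 = C1ShlC2 >> C2 when the unfold
    // (op (shl x, C2), C1ShlC2) -> (shl (op x, C1), C2) is deemed safe.
    static bool canUnfold(uint32_t C1ShlC2, uint32_t C2, uint32_t &C1) {
      // The lshr must not drop set bits: the low C2 bits must be zero.
      uint32_t Mask = ~0u << C2; // the high (32 - C2) bits, as in the patch
      if ((C1ShlC2 & Mask) != C1ShlC2)
        return false;
      C1 = C1ShlC2 >> C2;
      return roughlyEncodable(C1) && roughlyEncodable(C2);
    }

    int main() {
      uint32_t C1;
      if (canUnfold(510, 1, C1)) // unfold1 below: 510 >> 1 == 255
        printf("unfold with C1 = %u\n", C1);
      return 0;
    }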

test/CodeGen/ARM/unfold-shifts.ll

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
+; RUN: llc -mtriple armv6t2 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv6t2 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv7 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-T2
+
+; CHECK-LABEL: unfold1
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 1
+  %shl = or i32 %or, 510
+  %add = add nsw i32 %shl, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold2
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #4080
+; CHECK: sub r0, r1, r0, lsl #2
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #4080
+; CHECK-T2: sub.w r0, r1, r0, lsl #2
+define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 2
+  %shl = or i32 %or, 16320
+  %sub = sub nsw i32 %b, %shl
+  ret i32 %sub
+}
+
+; CHECK-LABEL: unfold3
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #65280
+; CHECK: and r0, r1, r0, lsl #4
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #65280
+; CHECK-T2: and.w r0, r1, r0, lsl #4
+define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 4
+  %shl = or i32 %or, 1044480
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold4
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #1044480
+; CHECK: eor r0, r1, r0, lsl #5
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #1044480
+; CHECK-T2: eor.w r0, r1, r0, lsl #5
+define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 5
+  %shl = or i32 %or, 33423360
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold5
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #496
+; CHECK: orr r0, r1, r0, lsl #6
+; CHECK-T2: add.w r0, r0, #496
+; CHECK-T2: orr.w r0, r1, r0, lsl #6
+define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 6
+  %shl = add i32 %add, 31744
+  %or = or i32 %shl, %b
+  ret i32 %or
+}
+
+; CHECK-LABEL: unfold6
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #7936
+; CHECK: and r0, r1, r0, lsl #8
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #7936
+; CHECK-T2: and.w r0, r1, r0, lsl #8
+define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 8
+  %shl = add i32 %add, 2031616
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold7
+; CHECK-NOT: mov
+; CHECK: and r0, r0, #256
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: and r0, r0, #256
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
+entry:
+  %shl = shl i32 %a, 1
+  %and = and i32 %shl, 512
+  %add = add nsw i32 %and, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold8
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #126976
+; CHECK: eor r0, r1, r0, lsl #9
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #126976
+; CHECK-T2: eor.w r0, r1, r0, lsl #9
+define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 9
+  %shl = add i32 %add, 65011712
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold9
+; CHECK-NOT: mov
+; CHECK: eor r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: eor r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
+entry:
+  %shl = shl i32 %a, 1
+  %xor = xor i32 %shl, 510
+  %add = add nsw i32 %xor, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold10
+; CHECK-NOT: mov r2
+; CHECK: orr r2, r0, #4080
+; CHECK: cmp r1, r2, lsl #10
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: orr r2, r0, #4080
+; CHECK-T2: cmp.w r1, r2, lsl #10
+define arm_aapcscc i32 @unfold10(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 10
+  %shl = or i32 %or, 4177920
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: unfold11
+; CHECK-NOT: mov r2
+; CHECK: add r2, r0, #7936
+; CHECK: cmp r1, r2, lsl #11
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: add.w r2, r0, #7936
+; CHECK-T2: cmp.w r1, r2, lsl #11
+define arm_aapcscc i32 @unfold11(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 11
+  %shl = add i32 %add, 16252928
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
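
To reproduce a single configuration by hand: lit expands %s to the test file
itself, so the first RUN line above is equivalent to running, from an LLVM
build tree (path as in the commit):

    llc -mtriple armv6t2 test/CodeGen/ARM/unfold-shifts.ll -o - \
      | FileCheck test/CodeGen/ARM/unfold-shifts.ll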
