Skip to content

Commit e4c2199

Browse files
committed
[ARM][ParallelDSP] SExt mul for accumulation
For any unpaired muls, we accumulate them as an input to the reduction. Check the type of the mul and perform a sext if the existing accumulator input type is not the same. Differential Revision: https://reviews.llvm.org/D66993 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@370851 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent da176cb commit e4c2199

File tree

5 files changed

+389
-5
lines changed

5 files changed

+389
-5
lines changed

lib/Target/ARM/ARMParallelDSP.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -649,18 +649,27 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
649649
if (MulCand->Paired)
650650
continue;
651651

652-
LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *MulCand->Root
653-
<< "\n");
652+
Value *Mul = MulCand->Root;
653+
LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
654+
655+
if (R.getRoot()->getType() != Mul->getType()) {
656+
assert(R.is64Bit() && "expected 64-bit result");
657+
Mul = Builder.CreateSExt(Mul, R.getRoot()->getType());
658+
}
659+
654660
if (!Acc) {
655-
Acc = MulCand->Root;
661+
Acc = Mul;
656662
continue;
657663
}
658-
Acc = Builder.CreateAdd(MulCand->Root, Acc);
664+
665+
Acc = Builder.CreateAdd(Mul, Acc);
659666
InsertAfter = cast<Instruction>(Acc);
660667
}
661668

662669
if (!Acc)
663-
Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
670+
Acc = R.is64Bit() ?
671+
ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
672+
ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
664673

665674
IntegerType *Ty = IntegerType::get(M->getContext(), 32);
666675
for (auto &Pair : R.getMulPairs()) {

test/CodeGen/ARM/ParallelDSP/blocks.ll

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,33 @@ entry:
2525
ret i32 %res
2626
}
2727

28+
; CHECK-LABEL: single_block_64
29+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
30+
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
31+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
32+
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
33+
; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 %acc)
34+
define i64 @single_block_64(i16* %a, i16* %b, i64 %acc) {
35+
entry:
36+
%ld.a.0 = load i16, i16* %a
37+
%sext.a.0 = sext i16 %ld.a.0 to i32
38+
%ld.b.0 = load i16, i16* %b
39+
%sext.b.0 = sext i16 %ld.b.0 to i32
40+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
41+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
42+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
43+
%ld.a.1 = load i16, i16* %addr.a.1
44+
%sext.a.1 = sext i16 %ld.a.1 to i32
45+
%ld.b.1 = load i16, i16* %addr.b.1
46+
%sext.b.1 = sext i16 %ld.b.1 to i32
47+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
48+
%sext.mul.0 = sext i32 %mul.0 to i64
49+
%sext.mul.1 = sext i32 %mul.1 to i64
50+
%add = add i64 %sext.mul.0, %sext.mul.1
51+
%res = add i64 %add, %acc
52+
ret i64 %res
53+
}
54+
2855
; CHECK-LABEL: multi_block
2956
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
3057
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
@@ -53,6 +80,36 @@ bb.1:
5380
ret i32 %res
5481
}
5582

83+
; CHECK-LABEL: multi_block_64
84+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
85+
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
86+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
87+
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
88+
; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 0)
89+
define i64 @multi_block_64(i16* %a, i16* %b, i64 %acc) {
90+
entry:
91+
%ld.a.0 = load i16, i16* %a
92+
%sext.a.0 = sext i16 %ld.a.0 to i32
93+
%ld.b.0 = load i16, i16* %b
94+
%sext.b.0 = sext i16 %ld.b.0 to i32
95+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
96+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
97+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
98+
%ld.a.1 = load i16, i16* %addr.a.1
99+
%sext.a.1 = sext i16 %ld.a.1 to i32
100+
%ld.b.1 = load i16, i16* %addr.b.1
101+
%sext.b.1 = sext i16 %ld.b.1 to i32
102+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
103+
%sext.mul.0 = sext i32 %mul.0 to i64
104+
%sext.mul.1 = sext i32 %mul.1 to i64
105+
%add = add i64 %sext.mul.0, %sext.mul.1
106+
br label %bb.1
107+
108+
bb.1:
109+
%res = add i64 %add, %acc
110+
ret i64 %res
111+
}
112+
56113
; CHECK-LABEL: multi_block_1
57114
; CHECK-NOT: call i32 @llvm.arm.smlad
58115
define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {

test/CodeGen/ARM/ParallelDSP/exchange.ll

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,87 @@ entry:
139139
ret i32 %res
140140
}
141141

142+
; CHECK-LABEL: exchange_multi_use_64_1
143+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
144+
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
145+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
146+
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
147+
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
148+
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
149+
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
150+
; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
151+
; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
152+
define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) {
153+
entry:
154+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
155+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
156+
%ld.a.0 = load i16, i16* %a
157+
%sext.a.0 = sext i16 %ld.a.0 to i32
158+
%ld.b.0 = load i16, i16* %b
159+
%ld.a.1 = load i16, i16* %addr.a.1
160+
%ld.b.1 = load i16, i16* %addr.b.1
161+
%sext.a.1 = sext i16 %ld.a.1 to i32
162+
%sext.b.1 = sext i16 %ld.b.1 to i32
163+
%sext.b.0 = sext i16 %ld.b.0 to i32
164+
%mul.0 = mul i32 %sext.a.0, %sext.b.1
165+
%mul.1 = mul i32 %sext.a.1, %sext.b.0
166+
%add = add i32 %mul.0, %mul.1
167+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
168+
%addr.a.3 = getelementptr i16, i16* %a, i32 3
169+
%ld.a.2 = load i16, i16* %addr.a.2
170+
%ld.a.3 = load i16, i16* %addr.a.3
171+
%sext.a.2 = sext i16 %ld.a.2 to i32
172+
%sext.a.3 = sext i16 %ld.a.3 to i32
173+
%mul.2 = mul i32 %sext.a.3, %sext.b.1
174+
%mul.3 = mul i32 %sext.a.2, %sext.b.0
175+
%add.1 = add i32 %mul.2, %mul.3
176+
%add.2 = add i32 %add, %add.1
177+
%sext.add.2 = sext i32 %add.2 to i64
178+
%res = add i64 %sext.add.2, %acc
179+
ret i64 %res
180+
}
181+
182+
; CHECK-LABEL: exchange_multi_use_64_2
183+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
184+
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
185+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
186+
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
187+
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
188+
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
189+
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
190+
; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
191+
; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
192+
define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) {
193+
entry:
194+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
195+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
196+
%ld.a.0 = load i16, i16* %a
197+
%sext.a.0 = sext i16 %ld.a.0 to i32
198+
%ld.b.0 = load i16, i16* %b
199+
%ld.a.1 = load i16, i16* %addr.a.1
200+
%ld.b.1 = load i16, i16* %addr.b.1
201+
%sext.a.1 = sext i16 %ld.a.1 to i32
202+
%sext.b.1 = sext i16 %ld.b.1 to i32
203+
%sext.b.0 = sext i16 %ld.b.0 to i32
204+
%mul.0 = mul i32 %sext.a.0, %sext.b.1
205+
%mul.1 = mul i32 %sext.a.1, %sext.b.0
206+
%add = add i32 %mul.0, %mul.1
207+
%sext.add = sext i32 %add to i64
208+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
209+
%addr.a.3 = getelementptr i16, i16* %a, i32 3
210+
%ld.a.2 = load i16, i16* %addr.a.2
211+
%ld.a.3 = load i16, i16* %addr.a.3
212+
%sext.a.2 = sext i16 %ld.a.2 to i32
213+
%sext.a.3 = sext i16 %ld.a.3 to i32
214+
%mul.2 = mul i32 %sext.a.3, %sext.b.1
215+
%mul.3 = mul i32 %sext.a.2, %sext.b.0
216+
%add.1 = add i32 %mul.2, %mul.3
217+
%sext.add.1 = sext i32 %add.1 to i64
218+
%add.2 = add i64 %sext.add, %sext.add.1
219+
%res = add i64 %add.2, %acc
220+
ret i64 %res
221+
}
222+
142223
; CHECK-LABEL: exchange_multi_use_2
143224
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
144225
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
@@ -216,6 +297,48 @@ entry:
216297
ret i32 %res
217298
}
218299

300+
; TODO: Would it be better to generate a smlad and then sign extend it?
301+
; CHECK-LABEL: exchange_multi_use_64_3
302+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
303+
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
304+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
305+
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
306+
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
307+
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
308+
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
309+
; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 0)
310+
; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 [[ACC]])
311+
define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) {
312+
entry:
313+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
314+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
315+
%ld.a.0 = load i16, i16* %a
316+
%sext.a.0 = sext i16 %ld.a.0 to i32
317+
%ld.b.0 = load i16, i16* %b
318+
%ld.a.1 = load i16, i16* %addr.a.1
319+
%ld.b.1 = load i16, i16* %addr.b.1
320+
%sext.a.1 = sext i16 %ld.a.1 to i32
321+
%sext.b.1 = sext i16 %ld.b.1 to i32
322+
%sext.b.0 = sext i16 %ld.b.0 to i32
323+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
324+
%addr.a.3 = getelementptr i16, i16* %a, i32 3
325+
%ld.a.2 = load i16, i16* %addr.a.2
326+
%ld.a.3 = load i16, i16* %addr.a.3
327+
%sext.a.2 = sext i16 %ld.a.2 to i32
328+
%sext.a.3 = sext i16 %ld.a.3 to i32
329+
%mul.2 = mul i32 %sext.b.0, %sext.a.3
330+
%mul.3 = mul i32 %sext.b.1, %sext.a.2
331+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
332+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
333+
%add = add i32 %mul.0, %mul.1
334+
%add.1 = add i32 %mul.2, %mul.3
335+
%sext.add = sext i32 %add to i64
336+
%sext.add.1 = sext i32 %add.1 to i64
337+
%add.2 = add i64 %sext.add, %sext.add.1
338+
%res = sub i64 %acc, %add.2
339+
ret i64 %res
340+
}
341+
219342
; TODO: Why isn't smladx generated too?
220343
; CHECK-LABEL: exchange_multi_use_4
221344
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*

test/CodeGen/ARM/ParallelDSP/overlapping.ll

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,52 @@ entry:
4242
ret i32 %res
4343
}
4444

45+
; TODO: Is it really best to generate smlald for the first instruction? Does
46+
; this just increase register pressure unnecessarily?
47+
; CHECK-LABEL: overlap_64_1
48+
; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
49+
; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
50+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
51+
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
52+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
53+
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
54+
; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
55+
; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
56+
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
57+
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
58+
; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 %acc)
59+
; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])
60+
; CHECK: ret i64 [[RES]]
61+
define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) {
62+
entry:
63+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
64+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
65+
%ld.a.0 = load i16, i16* %a
66+
%sext.a.0 = sext i16 %ld.a.0 to i32
67+
%ld.b.0 = load i16, i16* %b
68+
%ld.a.1 = load i16, i16* %addr.a.1
69+
%ld.b.1 = load i16, i16* %addr.b.1
70+
%sext.a.1 = sext i16 %ld.a.1 to i32
71+
%sext.b.1 = sext i16 %ld.b.1 to i32
72+
%sext.b.0 = sext i16 %ld.b.0 to i32
73+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
74+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
75+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
76+
%addr.b.2 = getelementptr i16, i16* %b, i32 2
77+
%ld.a.2 = load i16, i16* %addr.a.2
78+
%ld.b.2 = load i16, i16* %addr.b.2
79+
%sext.a.2 = sext i16 %ld.a.2 to i32
80+
%sext.b.2 = sext i16 %ld.b.2 to i32
81+
%mul.2 = mul i32 %sext.a.2, %sext.b.2
82+
%add = add i32 %mul.0, %mul.1
83+
%add.1 = add i32 %mul.1, %mul.2
84+
%sext.add = sext i32 %add to i64
85+
%sext.add.1 = sext i32 %add.1 to i64
86+
%add.2 = add i64 %sext.add.1, %sext.add
87+
%res = add i64 %add.2, %acc
88+
ret i64 %res
89+
}
90+
4591
; CHECK-LABEL: overlap_2
4692
; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
4793
; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1

0 commit comments

Comments
 (0)