Skip to content

Commit 27d1910

Browse files
committed
[ARM][ParallelDSP] Handle squaring multiplies
The logic in ARMParallelDSP is set up to merge two 16-bit loads into a 32-bit load and feed them into the smlads. This requires that four loads are combined for the four inputs, but there wasn't actually a check for this. Differential Revision: https://reviews.llvm.org/D78492
1 parent 2cf3c03 commit 27d1910

File tree

2 files changed

+279
-0
lines changed

2 files changed

+279
-0
lines changed

llvm/lib/Target/ARM/ARMParallelDSP.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,10 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
570570
auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
571571
auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
572572

573+
// Check that each mul is operating on two different loads.
574+
if (Ld0 == Ld2 || Ld1 == Ld3)
575+
return false;
576+
573577
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
574578
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
575579
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -mtriple=armv8-a-linux-gnueabihf -arm-parallel-dsp -dce --verify %s -S -o - | FileCheck %s
3+
4+
; Squaring-multiply test: %mul and %mul6 each multiply a sign-extended i16
; load by itself, so each of those muls has a single load as both operands.
; The expected output below keeps the scalar i16 loads and muls and contains
; no call to llvm.arm.smlad. The unused %0 bitcast in the input does not
; appear in the expected output.
define dso_local void @a() align 2 {
5+
; CHECK-LABEL: @a(
6+
; CHECK-NEXT: for.end:
7+
; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4
8+
; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* bitcast (void ()* @a to i16*), align 2
9+
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
10+
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV]]
11+
; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 1), align 2
12+
; CHECK-NEXT: [[CONV3:%.*]] = sext i16 [[TMP1]] to i32
13+
; CHECK-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV3]], [[CONV3]]
14+
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL6]], [[MUL]]
15+
; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 2), align 2
16+
; CHECK-NEXT: [[CONV11:%.*]] = sext i16 [[TMP2]] to i32
17+
; CHECK-NEXT: [[MUL12:%.*]] = mul nsw i32 [[CONV11]], [[CONV3]]
18+
; CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[MUL12]], [[ADD]]
19+
; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 3), align 2
20+
; CHECK-NEXT: [[CONV17:%.*]] = sext i16 [[TMP3]] to i32
21+
; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[ADD14]], [[CONV17]]
22+
; CHECK-NEXT: store i32 [[ADD19]], i32* [[B]], align 4
23+
; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 4), align 2
24+
; CHECK-NEXT: [[CONV21:%.*]] = sext i16 [[TMP4]] to i32
25+
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[CONV21]]
26+
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i32, i32* [[ADD_PTR]], i32 9
27+
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX22]], align 4
28+
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP5]], 1
29+
; CHECK-NEXT: store i32 [[SHL]], i32* [[ARRAYIDX22]], align 4
30+
; CHECK-NEXT: br label [[FOR_COND23:%.*]]
31+
; CHECK: for.cond23:
32+
; CHECK-NEXT: br label [[FOR_COND23]]
33+
;
34+
for.end:
35+
%b = alloca i32, align 4
36+
%0 = bitcast i32* %b to i8*
37+
%1 = load i16, i16* bitcast (void ()* @a to i16*), align 2
38+
%conv = sext i16 %1 to i32
39+
%mul = mul nsw i32 %conv, %conv
40+
%2 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 1), align 2
41+
%conv3 = sext i16 %2 to i32
42+
%mul6 = mul nsw i32 %conv3, %conv3
43+
%add = add nuw nsw i32 %mul6, %mul
44+
%3 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 2), align 2
45+
%conv11 = sext i16 %3 to i32
46+
%mul12 = mul nsw i32 %conv11, %conv3
47+
%add14 = add nsw i32 %mul12, %add
48+
%4 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 3), align 2
49+
%conv17 = sext i16 %4 to i32
50+
%add19 = add nsw i32 %add14, %conv17
51+
store i32 %add19, i32* %b, align 4
52+
%5 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 4), align 2
53+
%conv21 = sext i16 %5 to i32
54+
%add.ptr = getelementptr inbounds i32, i32* %b, i32 %conv21
55+
%arrayidx22 = getelementptr inbounds i32, i32* %add.ptr, i32 9
56+
%6 = load i32, i32* %arrayidx22, align 4
57+
%shl = shl i32 %6, 1
58+
store i32 %shl, i32* %arrayidx22, align 4
59+
br label %for.cond23
60+
61+
for.cond23: ; preds = %for.cond23, %for.end
62+
br label %for.cond23
63+
}
64+
65+
; %mul.0 squares a[0] (both operands are %sext.a.0), while %mul.1 and %mul.2
; are the ordinary products a[1]*b[1] and a[2]*b[2].
; The expected output below replaces the a[1]/a[2] and b[1]/b[2] i16 loads
; with one i32 load from each of a+1 and b+1 feeding llvm.arm.smlad, and
; keeps the squaring %mul.0 as a scalar mul that is added into the
; accumulator operand. The unused b[0] load does not appear in the output.
define i32 @accumulate_square_a0(i16* %a, i16* %b, i32 %acc) {
66+
; CHECK-LABEL: @accumulate_square_a0(
67+
; CHECK-NEXT: entry:
68+
; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1
69+
; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1
70+
; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]]
71+
; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
72+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[ADDR_A_1]] to i32*
73+
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
74+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
75+
; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
76+
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ADDR_B_1]] to i32*
77+
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
78+
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
79+
; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i32
80+
; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
81+
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[MUL_0]], [[ACC:%.*]]
82+
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP3]], [[TMP7]]
83+
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[MUL_1]], [[TMP8]]
84+
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP5]], i32 [[TMP9]])
85+
; CHECK-NEXT: ret i32 [[TMP10]]
86+
;
87+
entry:
88+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
89+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
90+
%ld.a.0 = load i16, i16* %a
91+
%sext.a.0 = sext i16 %ld.a.0 to i32
92+
%ld.b.0 = load i16, i16* %b
93+
%ld.a.1 = load i16, i16* %addr.a.1
94+
%ld.b.1 = load i16, i16* %addr.b.1
95+
%sext.a.1 = sext i16 %ld.a.1 to i32
96+
%sext.b.1 = sext i16 %ld.b.1 to i32
97+
%sext.b.0 = sext i16 %ld.b.0 to i32
98+
%mul.0 = mul i32 %sext.a.0, %sext.a.0
99+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
100+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
101+
%addr.b.2 = getelementptr i16, i16* %b, i32 2
102+
%ld.a.2 = load i16, i16* %addr.a.2
103+
%ld.b.2 = load i16, i16* %addr.b.2
104+
%sext.a.2 = sext i16 %ld.a.2 to i32
105+
%sext.b.2 = sext i16 %ld.b.2 to i32
106+
%mul.2 = mul i32 %sext.a.2, %sext.b.2
107+
%add = add i32 %mul.0, %mul.1
108+
%add.1 = add i32 %mul.1, %mul.2
109+
%add.2 = add i32 %add.1, %add
110+
%res = add i32 %add.2, %acc
111+
ret i32 %res
112+
}
113+
114+
; %mul.2 squares a[2] (both operands are %sext.a.2), while %mul.0 and %mul.1
; are the ordinary products a[0]*b[0] and a[1]*b[1].
; The expected output below replaces the a[0]/a[1] and b[0]/b[1] i16 loads
; with one i32 load from each of a and b feeding llvm.arm.smlad, and keeps
; the squaring %mul.2 as a scalar mul that is added into the accumulator
; operand. %sext.b.2 is only used by the final add.
define i32 @accumulate_square_a2(i16* %a, i16* %b, i32 %acc) {
115+
; CHECK-LABEL: @accumulate_square_a2(
116+
; CHECK-NEXT: entry:
117+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32*
118+
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
119+
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
120+
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
121+
; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP3]] to i32
122+
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[B:%.*]] to i32*
123+
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
124+
; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
125+
; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
126+
; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[TMP8]] to i32
127+
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP4]], [[TMP9]]
128+
; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2
129+
; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2
130+
; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]]
131+
; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]]
132+
; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
133+
; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
134+
; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[SEXT_A_2]], [[SEXT_A_2]]
135+
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[MUL_2]], [[ACC:%.*]]
136+
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[MUL_1]], [[TMP10]]
137+
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP6]], i32 [[TMP11]])
138+
; CHECK-NEXT: [[RES:%.*]] = add i32 [[TMP12]], [[SEXT_B_2]]
139+
; CHECK-NEXT: ret i32 [[RES]]
140+
;
141+
entry:
142+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
143+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
144+
%ld.a.0 = load i16, i16* %a
145+
%sext.a.0 = sext i16 %ld.a.0 to i32
146+
%ld.b.0 = load i16, i16* %b
147+
%ld.a.1 = load i16, i16* %addr.a.1
148+
%ld.b.1 = load i16, i16* %addr.b.1
149+
%sext.a.1 = sext i16 %ld.a.1 to i32
150+
%sext.b.1 = sext i16 %ld.b.1 to i32
151+
%sext.b.0 = sext i16 %ld.b.0 to i32
152+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
153+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
154+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
155+
%addr.b.2 = getelementptr i16, i16* %b, i32 2
156+
%ld.a.2 = load i16, i16* %addr.a.2
157+
%ld.b.2 = load i16, i16* %addr.b.2
158+
%sext.a.2 = sext i16 %ld.a.2 to i32
159+
%sext.b.2 = sext i16 %ld.b.2 to i32
160+
%mul.2 = mul i32 %sext.a.2, %sext.a.2
161+
%add = add i32 %mul.0, %mul.1
162+
%add.1 = add i32 %mul.1, %mul.2
163+
%add.2 = add i32 %add.1, %add
164+
%add.3 = add i32 %add.2, %acc
165+
%res = add i32 %add.3, %sext.b.2
166+
ret i32 %res
167+
}
168+
169+
; Two of the three muls are squares: %mul.0 squares a[0] and %mul.2 squares
; b[2]; only %mul.1 (a[1]*b[1]) multiplies two different loads.
; The expected output below contains no call to llvm.arm.smlad; the scalar
; i16 loads and muls are kept.
; NOTE(review): %add.3 is never used (%res adds %acc to %add.2, not %add.3),
; so it and the a[2] load chain feeding it are absent from the expected
; output. Confirm whether %res was meant to use %add.3.
define i32 @accumulate_square_b2(i16* %a, i16* %b, i32 %acc) {
170+
; CHECK-LABEL: @accumulate_square_b2(
171+
; CHECK-NEXT: entry:
172+
; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1
173+
; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1
174+
; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]]
175+
; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
176+
; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]]
177+
; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]]
178+
; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
179+
; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
180+
; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
181+
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_B_1]]
182+
; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2
183+
; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]]
184+
; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
185+
; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[SEXT_B_2]], [[SEXT_B_2]]
186+
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
187+
; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
188+
; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]]
189+
; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_2]], [[ACC:%.*]]
190+
; CHECK-NEXT: ret i32 [[RES]]
191+
;
192+
entry:
193+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
194+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
195+
%ld.a.0 = load i16, i16* %a
196+
%sext.a.0 = sext i16 %ld.a.0 to i32
197+
%ld.b.0 = load i16, i16* %b
198+
%ld.a.1 = load i16, i16* %addr.a.1
199+
%ld.b.1 = load i16, i16* %addr.b.1
200+
%sext.a.1 = sext i16 %ld.a.1 to i32
201+
%sext.b.1 = sext i16 %ld.b.1 to i32
202+
%sext.b.0 = sext i16 %ld.b.0 to i32
203+
%mul.0 = mul i32 %sext.a.0, %sext.a.0
204+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
205+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
206+
%addr.b.2 = getelementptr i16, i16* %b, i32 2
207+
%ld.a.2 = load i16, i16* %addr.a.2
208+
%ld.b.2 = load i16, i16* %addr.b.2
209+
%sext.a.2 = sext i16 %ld.a.2 to i32
210+
%sext.b.2 = sext i16 %ld.b.2 to i32
211+
%mul.2 = mul i32 %sext.b.2, %sext.b.2
212+
%add = add i32 %mul.0, %mul.1
213+
%add.1 = add i32 %mul.1, %mul.2
214+
%add.2 = add i32 %add.1, %add
215+
%add.3 = add i32 %add.2, %sext.a.2
216+
%res = add i32 %add.2, %acc
217+
ret i32 %res
218+
}
219+
220+
; Two of the three muls are squares: %mul.0 squares a[0] and %mul.1 squares
; a[1]; only %mul.2 (a[2]*b[2]) multiplies two different loads.
; The expected output below contains no call to llvm.arm.smlad; the scalar
; i16 loads, muls and adds are kept. The unused b[0] load does not appear in
; the expected output.
define i32 @accumulate_square_a1(i16* %a, i16* %b, i32 %acc) {
221+
; CHECK-LABEL: @accumulate_square_a1(
222+
; CHECK-NEXT: entry:
223+
; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1
224+
; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1
225+
; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]]
226+
; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
227+
; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]]
228+
; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]]
229+
; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
230+
; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
231+
; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
232+
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_A_1]]
233+
; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2
234+
; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2
235+
; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]]
236+
; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]]
237+
; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
238+
; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
239+
; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[SEXT_A_2]], [[SEXT_B_2]]
240+
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_1]], [[SEXT_B_1]]
241+
; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_0]], [[ADD]]
242+
; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
243+
; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[ADD_1]]
244+
; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[SEXT_A_2]]
245+
; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_4]], [[ACC:%.*]]
246+
; CHECK-NEXT: ret i32 [[RES]]
247+
;
248+
entry:
249+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
250+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
251+
%ld.a.0 = load i16, i16* %a
252+
%sext.a.0 = sext i16 %ld.a.0 to i32
253+
%ld.b.0 = load i16, i16* %b
254+
%ld.a.1 = load i16, i16* %addr.a.1
255+
%ld.b.1 = load i16, i16* %addr.b.1
256+
%sext.a.1 = sext i16 %ld.a.1 to i32
257+
%sext.b.1 = sext i16 %ld.b.1 to i32
258+
%sext.b.0 = sext i16 %ld.b.0 to i32
259+
%mul.0 = mul i32 %sext.a.0, %sext.a.0
260+
%mul.1 = mul i32 %sext.a.1, %sext.a.1
261+
%addr.a.2 = getelementptr i16, i16* %a, i32 2
262+
%addr.b.2 = getelementptr i16, i16* %b, i32 2
263+
%ld.a.2 = load i16, i16* %addr.a.2
264+
%ld.b.2 = load i16, i16* %addr.b.2
265+
%sext.a.2 = sext i16 %ld.a.2 to i32
266+
%sext.b.2 = sext i16 %ld.b.2 to i32
267+
%mul.2 = mul i32 %sext.a.2, %sext.b.2
268+
%add = add i32 %mul.1, %sext.b.1
269+
%add.1 = add i32 %mul.0, %add
270+
%add.2 = add i32 %mul.1, %mul.2
271+
%add.3 = add i32 %add.2, %add.1
272+
%add.4 = add i32 %add.3, %sext.a.2
273+
%res = add i32 %add.4, %acc
274+
ret i32 %res
275+
}

0 commit comments

Comments
 (0)