Skip to content

Commit 1232cfa

Browse files
committed
[ARM] Don't split trunc stores that can be better handled as VMOVN
We deliberately split stores of the form store(truncate(larger-than-legal-type)) into two stores, allowing each store to perform part of the truncate for free. There are times, however, when it makes more sense to use VMOVN to de-interlace the results back into a single vector and store that in one go. This adds a check for that situation, not splitting the store if it looks like a VMOVN would be more useful. Differential Revision: https://reviews.llvm.org/D76511
1 parent 94caceb commit 1232cfa

File tree

2 files changed

+35
-88
lines changed

2 files changed

+35
-88
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13873,6 +13873,33 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
1387313873
FromVT.getVectorNumElements() % NumElements != 0)
1387413874
return SDValue();
1387513875

13876+
// Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
13877+
// use the VMOVN over splitting the store. We are looking for patterns of:
13878+
// !rev: 0 N 1 N+1 2 N+2 ...
13879+
// rev: N 0 N+1 1 N+2 2 ...
13880+
// Returns true when shuffle mask M has one entry per element of ToVT, ToVT is
// v8i16 or v16i8, and every non-negative entry matches an interleaving
// pattern:
//   rev == false: position 2k expects k,           position 2k+1 expects NumElts + k
//   rev == true:  position 2k expects NumElts + k, position 2k+1 expects k
// Negative mask entries are never rejected (presumably they mark undef lanes;
// confirm against the ShuffleVectorSDNode mask convention).
auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
13881+
unsigned NumElts = ToVT.getVectorNumElements();
13882+
if (NumElts != M.size() || (ToVT != MVT::v8i16 && ToVT != MVT::v16i8))
13883+
return false;
13884+
13885+
// Off0 is the base index expected at even mask positions, Off1 at odd ones.
unsigned Off0 = rev ? NumElts : 0;
13886+
unsigned Off1 = rev ? 0 : NumElts;
13887+
13888+
// Walk the mask two entries at a time; M[i + 1] stays in bounds because
// M.size() == NumElts and the accepted types have an even element count.
for (unsigned i = 0; i < NumElts; i += 2) {
13889+
if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
13890+
return false;
13891+
if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
13892+
return false;
13893+
}
13894+
13895+
return true;
13896+
};
13897+
13898+
if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
13899+
if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
13900+
isVMOVNOriginalMask(Shuffle->getMask(), true))
13901+
return SDValue();
13902+
1387613903
SDLoc DL(St);
1387713904
// Details about the old store
1387813905
SDValue Ch = St->getChain();

llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll

Lines changed: 8 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,8 @@
44
define arm_aapcs_vfpcc void @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> *%dest) {
55
; CHECK-LABEL: vmovn32_trunc1:
66
; CHECK: @ %bb.0: @ %entry
7-
; CHECK-NEXT: vmov.f32 s8, s2
8-
; CHECK-NEXT: vmov.f32 s9, s6
9-
; CHECK-NEXT: vmov.f32 s10, s3
10-
; CHECK-NEXT: vmov.f32 s11, s7
11-
; CHECK-NEXT: vstrh.32 q2, [r0, #8]
12-
; CHECK-NEXT: vmov.f32 s8, s0
13-
; CHECK-NEXT: vmov.f32 s9, s4
14-
; CHECK-NEXT: vmov.f32 s10, s1
15-
; CHECK-NEXT: vmov.f32 s11, s5
16-
; CHECK-NEXT: vstrh.32 q2, [r0]
7+
; CHECK-NEXT: vmovnt.i32 q0, q1
8+
; CHECK-NEXT: vstrw.32 q0, [r0]
179
; CHECK-NEXT: bx lr
1810
entry:
1911
%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -25,16 +17,8 @@ entry:
2517
define arm_aapcs_vfpcc void @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> *%dest) {
2618
; CHECK-LABEL: vmovn32_trunc2:
2719
; CHECK: @ %bb.0: @ %entry
28-
; CHECK-NEXT: vmov.f32 s8, s6
29-
; CHECK-NEXT: vmov.f32 s9, s2
30-
; CHECK-NEXT: vmov.f32 s10, s7
31-
; CHECK-NEXT: vmov.f32 s11, s3
32-
; CHECK-NEXT: vstrh.32 q2, [r0, #8]
33-
; CHECK-NEXT: vmov.f32 s8, s4
34-
; CHECK-NEXT: vmov.f32 s9, s0
35-
; CHECK-NEXT: vmov.f32 s10, s5
36-
; CHECK-NEXT: vmov.f32 s11, s1
37-
; CHECK-NEXT: vstrh.32 q2, [r0]
20+
; CHECK-NEXT: vmovnt.i32 q1, q0
21+
; CHECK-NEXT: vstrw.32 q1, [r0]
3822
; CHECK-NEXT: bx lr
3923
entry:
4024
%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
@@ -46,40 +30,8 @@ entry:
4630
define arm_aapcs_vfpcc void @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> *%dest) {
4731
; CHECK-LABEL: vmovn16_trunc1:
4832
; CHECK: @ %bb.0: @ %entry
49-
; CHECK-NEXT: vmov.u16 r1, q0[4]
50-
; CHECK-NEXT: vmov.16 q2[0], r1
51-
; CHECK-NEXT: vmov.u16 r1, q1[4]
52-
; CHECK-NEXT: vmov.16 q2[1], r1
53-
; CHECK-NEXT: vmov.u16 r1, q0[5]
54-
; CHECK-NEXT: vmov.16 q2[2], r1
55-
; CHECK-NEXT: vmov.u16 r1, q1[5]
56-
; CHECK-NEXT: vmov.16 q2[3], r1
57-
; CHECK-NEXT: vmov.u16 r1, q0[6]
58-
; CHECK-NEXT: vmov.16 q2[4], r1
59-
; CHECK-NEXT: vmov.u16 r1, q1[6]
60-
; CHECK-NEXT: vmov.16 q2[5], r1
61-
; CHECK-NEXT: vmov.u16 r1, q0[7]
62-
; CHECK-NEXT: vmov.16 q2[6], r1
63-
; CHECK-NEXT: vmov.u16 r1, q1[7]
64-
; CHECK-NEXT: vmov.16 q2[7], r1
65-
; CHECK-NEXT: vmov.u16 r1, q0[0]
66-
; CHECK-NEXT: vstrb.16 q2, [r0, #8]
67-
; CHECK-NEXT: vmov.16 q2[0], r1
68-
; CHECK-NEXT: vmov.u16 r1, q1[0]
69-
; CHECK-NEXT: vmov.16 q2[1], r1
70-
; CHECK-NEXT: vmov.u16 r1, q0[1]
71-
; CHECK-NEXT: vmov.16 q2[2], r1
72-
; CHECK-NEXT: vmov.u16 r1, q1[1]
73-
; CHECK-NEXT: vmov.16 q2[3], r1
74-
; CHECK-NEXT: vmov.u16 r1, q0[2]
75-
; CHECK-NEXT: vmov.16 q2[4], r1
76-
; CHECK-NEXT: vmov.u16 r1, q1[2]
77-
; CHECK-NEXT: vmov.16 q2[5], r1
78-
; CHECK-NEXT: vmov.u16 r1, q0[3]
79-
; CHECK-NEXT: vmov.16 q2[6], r1
80-
; CHECK-NEXT: vmov.u16 r1, q1[3]
81-
; CHECK-NEXT: vmov.16 q2[7], r1
82-
; CHECK-NEXT: vstrb.16 q2, [r0]
33+
; CHECK-NEXT: vmovnt.i16 q0, q1
34+
; CHECK-NEXT: vstrw.32 q0, [r0]
8335
; CHECK-NEXT: bx lr
8436
entry:
8537
%strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -91,40 +43,8 @@ entry:
9143
define arm_aapcs_vfpcc void @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> *%dest) {
9244
; CHECK-LABEL: vmovn16_trunc2:
9345
; CHECK: @ %bb.0: @ %entry
94-
; CHECK-NEXT: vmov.u16 r1, q1[4]
95-
; CHECK-NEXT: vmov.16 q2[0], r1
96-
; CHECK-NEXT: vmov.u16 r1, q0[4]
97-
; CHECK-NEXT: vmov.16 q2[1], r1
98-
; CHECK-NEXT: vmov.u16 r1, q1[5]
99-
; CHECK-NEXT: vmov.16 q2[2], r1
100-
; CHECK-NEXT: vmov.u16 r1, q0[5]
101-
; CHECK-NEXT: vmov.16 q2[3], r1
102-
; CHECK-NEXT: vmov.u16 r1, q1[6]
103-
; CHECK-NEXT: vmov.16 q2[4], r1
104-
; CHECK-NEXT: vmov.u16 r1, q0[6]
105-
; CHECK-NEXT: vmov.16 q2[5], r1
106-
; CHECK-NEXT: vmov.u16 r1, q1[7]
107-
; CHECK-NEXT: vmov.16 q2[6], r1
108-
; CHECK-NEXT: vmov.u16 r1, q0[7]
109-
; CHECK-NEXT: vmov.16 q2[7], r1
110-
; CHECK-NEXT: vmov.u16 r1, q1[0]
111-
; CHECK-NEXT: vstrb.16 q2, [r0, #8]
112-
; CHECK-NEXT: vmov.16 q2[0], r1
113-
; CHECK-NEXT: vmov.u16 r1, q0[0]
114-
; CHECK-NEXT: vmov.16 q2[1], r1
115-
; CHECK-NEXT: vmov.u16 r1, q1[1]
116-
; CHECK-NEXT: vmov.16 q2[2], r1
117-
; CHECK-NEXT: vmov.u16 r1, q0[1]
118-
; CHECK-NEXT: vmov.16 q2[3], r1
119-
; CHECK-NEXT: vmov.u16 r1, q1[2]
120-
; CHECK-NEXT: vmov.16 q2[4], r1
121-
; CHECK-NEXT: vmov.u16 r1, q0[2]
122-
; CHECK-NEXT: vmov.16 q2[5], r1
123-
; CHECK-NEXT: vmov.u16 r1, q1[3]
124-
; CHECK-NEXT: vmov.16 q2[6], r1
125-
; CHECK-NEXT: vmov.u16 r1, q0[3]
126-
; CHECK-NEXT: vmov.16 q2[7], r1
127-
; CHECK-NEXT: vstrb.16 q2, [r0]
46+
; CHECK-NEXT: vmovnt.i16 q1, q0
47+
; CHECK-NEXT: vstrw.32 q1, [r0]
12848
; CHECK-NEXT: bx lr
12949
entry:
13050
%strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>

0 commit comments

Comments
 (0)