Commit f88000a
[ARM][MVE] Add VHADD and VHSUB patterns
Add patterns that match normal, non-wrapping add and sub nodes together with an ARM vshr-by-immediate node.

Differential Revision: https://reviews.llvm.org/D77065
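As a minimal illustration (hypothetical function name, mirroring the new tests below and assuming the same RUN configuration, llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve): an add carrying the nsw flag followed by an arithmetic shift right by one is now selected as a single vhadd instead of a vadd/vshr pair.

    ; 'nsw' promises the add cannot wrap, so shifting the single-width sum
    ; matches the double-width halving add the instruction performs.
    define arm_aapcs_vfpcc <4 x i32> @example_halving_add(<4 x i32> %x, <4 x i32> %y) {
      %add = add nsw <4 x i32> %x, %y                            ; matched by the addnsw PatFrag
      %half = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>  ; the ARMvshrsImm node
      ret <4 x i32> %half                                        ; selects: vhadd.s32 q0, q0, q1
    }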
1 parent e144474 commit f88000a

File tree

3 files changed: +287 −22 lines

llvm/lib/Target/ARM/ARMInstrMVE.td

Lines changed: 53 additions & 18 deletions
@@ -2069,7 +2069,8 @@ class MVE_VHSUB_<string suffix, bit U, bits<2> size,
   : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
 
 multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
-                       SDNode unpred_op, Intrinsic pred_int> {
+                       SDNode unpred_op, Intrinsic pred_int, PatFrag add_op,
+                       SDNode shift_op> {
   def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
   defvar Inst = !cast<Instruction>(NAME);
 
@@ -2078,6 +2079,9 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
   def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
             (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
+  def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))),
+            (Inst MQPR:$Qm, MQPR:$Qn)>;
+
   // Predicated add-and-divide-by-two
   def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
                                (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
@@ -2087,18 +2091,44 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
   }
 }
 
-multiclass MVE_VHADD<MVEVectorVTInfo VTI>
-  : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>;
+multiclass MVE_VHADD<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op>
+  : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op,
+                shift_op>;
+
+def addnuw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (add node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoUnsignedWrap();
+}]>;
 
-defm MVE_VHADDs8  : MVE_VHADD<MVE_v16s8>;
-defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>;
-defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>;
-defm MVE_VHADDu8  : MVE_VHADD<MVE_v16u8>;
-defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>;
-defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>;
+def addnsw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (add node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoSignedWrap();
+}]>;
+
+def subnuw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (sub node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (sub node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoSignedWrap();
+}]>;
+
+// Halving add/sub perform the arithmetic operation with an extra bit of
+// precision, before performing the shift, to avoid clipping errors. We're not
+// modelling that here with these patterns, but we're using no-wrap forms of
+// add/sub to ensure that the extra bit of information is not needed.
+defm MVE_VHADDs8  : MVE_VHADD<MVE_v16s8, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, addnsw, ARMvshrsImm>;
+defm MVE_VHADDu8  : MVE_VHADD<MVE_v16u8, addnuw, ARMvshruImm>;
+defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, addnuw, ARMvshruImm>;
+defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, addnuw, ARMvshruImm>;
 
 multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
-                       SDNode unpred_op, Intrinsic pred_int> {
+                       SDNode unpred_op, Intrinsic pred_int, PatFrag sub_op,
+                       SDNode shift_op> {
   def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
   defvar Inst = !cast<Instruction>(NAME);
 
@@ -2108,6 +2138,10 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
                                 (i32 VTI.Unsigned))),
             (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
+  def : Pat<(VTI.Vec (shift_op (sub_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))),
+            (Inst MQPR:$Qm, MQPR:$Qn)>;
+
+
   // Predicated subtract-and-divide-by-two
   def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                                (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
@@ -2118,15 +2152,16 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
   }
 }
 
-multiclass MVE_VHSUB<MVEVectorVTInfo VTI>
-  : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>;
+multiclass MVE_VHSUB<MVEVectorVTInfo VTI, PatFrag sub_op, SDNode shift_op>
+  : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, sub_op,
+                shift_op>;
 
-defm MVE_VHSUBs8  : MVE_VHSUB<MVE_v16s8>;
-defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>;
-defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>;
-defm MVE_VHSUBu8  : MVE_VHSUB<MVE_v16u8>;
-defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>;
-defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>;
+defm MVE_VHSUBs8  : MVE_VHSUB<MVE_v16s8, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBu8  : MVE_VHSUB<MVE_v16u8, subnuw, ARMvshruImm>;
+defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16, subnuw, ARMvshruImm>;
+defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32, subnuw, ARMvshruImm>;
 
 class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
   : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
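Why the no-wrap PatFrags are needed: the MVE halving instructions compute the sum or difference at full precision before shifting, whereas the IR pair shifts a result already truncated to the element width. A worked example with hypothetical i8 values:

    x = 100, y = 100
    vhadd.s8 (extra internal bit):  (100 + 100) >> 1  =  200 >> 1  =  100
    i8 add, then ashr (wraps):       100 + 100  =  -56 (i8);  -56 >> 1  =  -28

With nsw on the add/sub (nuw for the unsigned forms), such wrapping inputs are undefined behaviour, so the single-width shift is guaranteed to give the halving-add result and the pattern is safe to select. A plain add/sub without the flags keeps the vadd/vshr or vsub/vshr sequence, as the unflagged tests in the new file check.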
Lines changed: 232 additions & 0 deletions
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhadds_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.s8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <16 x i8> %x, %y
+  %half = ashr <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhaddu_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.u8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <16 x i8> %x, %y
+  %half = lshr <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhadds_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.s16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <8 x i16> %x, %y
+  %half = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhaddu_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <8 x i16> %x, %y
+  %half = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhadds_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.s32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <4 x i32> %x, %y
+  %half = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhaddu_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <4 x i32> %x, %y
+  %half = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vhsubs_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhsubs_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vsub.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.s8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %sub = sub <16 x i8> %x, %y
+  %half = ashr <16 x i8> %sub, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vhsubu_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhsubu_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vsub.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.u8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %sub = sub <16 x i8> %x, %y
+  %half = lshr <16 x i8> %sub, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhsubs_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhsubs_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.s16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %sub = sub <8 x i16> %x, %y
+  %half = ashr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhsubu_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhsubu_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %sub = sub <8 x i16> %x, %y
+  %half = lshr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhsubs_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhsubs_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.s32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %sub = sub <4 x i32> %x, %y
+  %half = ashr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhsubu_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhsubu_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %sub = sub <4 x i32> %x, %y
+  %half = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhadds_v16i8_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhadd.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <16 x i8> %x, %y
+  %half = ashr <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhaddu_v16i8_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhadd.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <16 x i8> %x, %y
+  %half = lshr <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhadds_v8i16_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhadd.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <8 x i16> %x, %y
+  %half = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhaddu_v8i16_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhadd.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <8 x i16> %x, %y
+  %half = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhadds_v4i32_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhadd.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <4 x i32> %x, %y
+  %half = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhaddu_v4i32_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhadd.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <4 x i32> %x, %y
+  %half = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vhsubs_v16i8_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhsubs_v16i8_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhsub.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %sub = sub nsw <16 x i8> %x, %y
+  %half = ashr <16 x i8> %sub, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vhsubu_v16i8_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vhsubu_v16i8_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhsub.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %sub = sub nuw <16 x i8> %x, %y
+  %half = lshr <16 x i8> %sub, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhsubs_v8i16_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhsubs_v8i16_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhsub.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %sub = sub nsw <8 x i16> %x, %y
+  %half = ashr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vhsubu_v8i16_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vhsubu_v8i16_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhsub.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %sub = sub nuw <8 x i16> %x, %y
+  %half = lshr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhsubs_v4i32_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhsubs_v4i32_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhsub.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %sub = sub nsw <4 x i32> %x, %y
+  %half = ashr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vhsubu_v4i32_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vhsubu_v4i32_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vhsub.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %sub = sub nuw <4 x i32> %x, %y
+  %half = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}

llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll

Lines changed: 2 additions & 4 deletions
@@ -28,8 +28,7 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @add_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: add_ashr_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.s32 q0, q0, #1
+; CHECK-NEXT:    vhadd.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = add nsw <4 x i32> %src1, %src2
@@ -100,8 +99,7 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @sub_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: sub_ashr_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vsub.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.s32 q0, q0, #1
+; CHECK-NEXT:    vhsub.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sub nsw <4 x i32> %src1, %src2