
Commit 62f9712

[ARM][MVE] Add patterns for VRHADD
Add patterns that match standard add nodes combined with ARM vshr-by-immediate nodes.

Differential Revision: https://reviews.llvm.org/D77069
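With these patterns, IR in which both vector adds carry the matching no-wrap flag is selected to a single VRHADD. As a minimal sketch (the function name @vrhadd_example is illustrative; the committed tests in llvm/test/CodeGen/Thumb2/mve-halving.ll below are the authoritative examples), the following should now compile to vrhadd.s32 rather than a vadd/vadd/vshr sequence:

define arm_aapcs_vfpcc <4 x i32> @vrhadd_example(<4 x i32> %x, <4 x i32> %y) {
  ; Both adds are marked nsw, so the signed VRHADD pattern applies.
  %add = add nsw <4 x i32> %x, %y
  %round = add nsw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %half
}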

2 files changed: 339 additions, 20 deletions

llvm/lib/Target/ARM/ARMInstrMVE.td

Lines changed: 51 additions & 20 deletions
@@ -2015,6 +2015,26 @@ class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
   let validForTailPredication = 1;
 }
 
+def addnuw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (add node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def addnsw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (add node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoSignedWrap();
+}]>;
+
+def subnuw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (sub node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (sub node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoSignedWrap();
+}]>;
+
 multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
                         SDNode unpred_op, Intrinsic pred_int> {
   def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>;
@@ -2046,6 +2066,37 @@ defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>;
 defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>;
 defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>;
 
+// Rounding Halving Add performs the arithmetic operation with an extra bit of
+// precision, before performing the shift, to avoid clipping errors. We're not
+// modelling that here with these patterns, but we're using no wrap forms of
+// add to ensure that the extra bit of information is not needed for the
+// arithmetic or the rounding.
+def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+                                      (v16i8 (ARMvmovImm (i32 3585)))),
+                              (i32 1))),
+          (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+                                      (v8i16 (ARMvmovImm (i32 2049)))),
+                              (i32 1))),
+          (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+                                      (v4i32 (ARMvmovImm (i32 1)))),
+                              (i32 1))),
+          (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+                                      (v16i8 (ARMvmovImm (i32 3585)))),
+                              (i32 1))),
+          (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+                                      (v8i16 (ARMvmovImm (i32 2049)))),
+                              (i32 1))),
+          (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+                                      (v4i32 (ARMvmovImm (i32 1)))),
+                              (i32 1))),
+          (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
+
+
 class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
                    bits<2> size, list<dag> pattern=[]>
   : MVE_int<iname, suffix, size, pattern> {
@@ -2095,26 +2146,6 @@ multiclass MVE_VHADD<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op>
   : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op,
                 shift_op>;
 
-def addnuw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (add node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoUnsignedWrap();
-}]>;
-
-def addnsw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (add node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoSignedWrap();
-}]>;
-
-def subnuw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (sub node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoUnsignedWrap();
-}]>;
-
-def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (sub node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoSignedWrap();
-}]>;
-
 // Halving add/sub perform the arithmetic operation with an extra bit of
 // precision, before performing the shift, to avoid clipping errors. We're not
 // modelling that here with these patterns, but we're using no wrap forms of
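The no-wrap flags carry the soundness argument for the fold above. A worked example under that assumption (signed i8 values chosen purely for illustration): for x = 126 and y = 1, VRHADD.S8 computes (126 + 1 + 1) >> 1 = 64 with a 9-bit intermediate, while the narrow IR sequence without the flags would wrap 126 + 1 + 1 to -128 and the ashr would yield -64. Requiring nsw (or nuw for the unsigned forms) on both adds excludes such wrapping inputs, so the narrow add/add/shift sequence and the widened rounding halving add agree wherever the IR is defined.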

llvm/test/CodeGen/Thumb2/mve-halving.ll

Lines changed: 288 additions & 0 deletions
@@ -230,3 +230,291 @@ define arm_aapcs_vfpcc <4 x i32> @vhsubu_v4i32_nw(<4 x i32> %x, <4 x i32> %y) {
   %half = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
   ret <4 x i32> %half
 }
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vmov.i8 q1, #0x1
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vshr.s8 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vmov.i8 q1, #0x1
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vshr.u8 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vmov.i16 q1, #0x1
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vshr.s16 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vmov.i16 q1, #0x1
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vshr.u16 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vshr.s32 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vshr.u32 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8_nwop(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8_nwop:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vmov.i8 q1, #0x1
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vshr.s8 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add nsw <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8_nwop(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8_nwop:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vmov.i8 q1, #0x1
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vshr.u8 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add nuw <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16_nwop(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16_nwop:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vmov.i16 q1, #0x1
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vshr.s16 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add nsw <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16_nwop(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16_nwop:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vmov.i16 q1, #0x1
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vshr.u16 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add nuw <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32_nwop(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32_nwop:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vshr.s32 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add nsw <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32_nwop(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32_nwop:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vshr.u32 q0, q0, #1
+; CHECK-NEXT: bx lr
+  %add = add nuw <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8_nwrnd(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8_nwrnd:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vmov.i8 q1, #0x1
+; CHECK-NEXT: vhadd.s8 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add nsw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8_nwrnd(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8_nwrnd:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: vmov.i8 q1, #0x1
+; CHECK-NEXT: vhadd.u8 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add nuw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16_nwrnd(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16_nwrnd:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vmov.i16 q1, #0x1
+; CHECK-NEXT: vhadd.s16 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add nsw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16_nwrnd(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16_nwrnd:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vmov.i16 q1, #0x1
+; CHECK-NEXT: vhadd.u16 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add nuw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32_nwrnd(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32_nwrnd:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vhadd.s32 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add nsw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32_nwrnd(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32_nwrnd:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vhadd.u32 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add nuw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8_both_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8_both_nw:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vrhadd.s8 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add nsw <16 x i8> %x, %y
+  %round = add nsw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8_both_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8_both_nw:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vrhadd.u8 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add nuw <16 x i8> %x, %y
+  %round = add nuw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16_both_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16_both_nw:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vrhadd.s16 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add nsw <8 x i16> %x, %y
+  %round = add nsw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16_both_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16_both_nw:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vrhadd.u16 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add nuw <8 x i16> %x, %y
+  %round = add nuw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32_both_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32_both_nw:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vrhadd.s32 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add nsw <4 x i32> %x, %y
+  %round = add nsw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32_both_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32_both_nw:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vrhadd.u32 q0, q0, q1
+; CHECK-NEXT: bx lr
+  %add = add nuw <4 x i32> %x, %y
+  %round = add nuw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
