Skip to content

Commit 68152f1

Browse files
authored
[WebAssembly] v16i8 mul support (#150209)
During target DAG combine, lower a v16i8 mul to an i16x8.extmul_low_i8x16_u, an i16x8.extmul_high_i8x16_u and a shuffle that keeps the low byte of each 16-bit product. On my AArch64 machine, using V8, I observe a 3.14% geomean improvement across 65 benchmarks, including: 9.2% for spec2017.x264, 6% for libyuv and 1.8% for ncnn.
1 parent 2780b8f commit 68152f1

File tree

3 files changed

+71
-166
lines changed

3 files changed

+71
-166
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N,
34363436
return SDValue();
34373437
}
34383438

3439-
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
3440-
assert(N->getOpcode() == ISD::MUL);
3439+
static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) {
34413440
EVT VT = N->getValueType(0);
34423441
if (VT != MVT::v8i32 && VT != MVT::v16i32)
34433442
return SDValue();
@@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
35233522
return SDValue();
35243523
}
35253524

3525+
// Target DAG combine for ISD::MUL nodes.
//
// Returns an empty SDValue when the result type is not a vector. Otherwise it
// first tries TryWideExtMulCombine and returns that result if one was
// produced. Failing that, and only before legalization, a v16i8 multiply is
// rewritten as two v8i16 multiplies of the extended low/high operand halves;
// both products are bitcast back to v16i8 and merged with a byte shuffle.
// Every other case returns an empty SDValue (no combine performed).
static SDValue performMulCombine(SDNode *N,
3526+
TargetLowering::DAGCombinerInfo &DCI) {
3527+
assert(N->getOpcode() == ISD::MUL);
3528+
EVT VT = N->getValueType(0);
3529+
if (!VT.isVector())
3530+
return SDValue();
3531+
3532+
if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
3533+
return Res;
3534+
3535+
// We don't natively support v16i8 mul, but we do support v8i16 so split the
3536+
// inputs and extend them to v8i16. Only do this before legalization in case
3537+
// a narrow vector is widened and may be simplified later.
3538+
if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
3539+
return SDValue();
3540+
3541+
SDLoc DL(N);
3542+
SelectionDAG &DAG = DCI.DAG;
3543+
SDValue LHS = N->getOperand(0);
3544+
SDValue RHS = N->getOperand(1);
3545+
// Widen each half of both operands to 16-bit lanes. NOTE(review): the _U
// suffix presumably means zero-extension of the low / high eight i8 lanes;
// confirm against the WebAssemblyISD node definitions.
SDValue LowLHS =
3546+
DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
3547+
SDValue HighLHS =
3548+
DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
3549+
SDValue LowRHS =
3550+
DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
3551+
SDValue HighRHS =
3552+
DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
3553+
3554+
// Multiply in v8i16, then reinterpret each product vector as v16i8 so the
// shuffle below can pick individual bytes out of the 16-bit results.
SDValue MulLow =
3555+
DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
3556+
SDValue MulHigh = DAG.getBitcast(
3557+
VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
3558+
3559+
// Take the low byte of each lane.
// Mask indices 0-15 select bytes of MulLow and 16-31 select bytes of MulHigh;
// the even indices are the low bytes of the 16-bit products, assuming
// little-endian lane layout.
3560+
return DAG.getVectorShuffle(
3561+
VT, DL, MulLow, MulHigh,
3562+
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
3563+
}
3564+
35263565
SDValue
35273566
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
35283567
DAGCombinerInfo &DCI) const {
@@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
35573596
return performLowerPartialReduction(N, DCI.DAG);
35583597
}
35593598
case ISD::MUL:
3560-
return performMulCombine(N, DCI.DAG);
3599+
return performMulCombine(N, DCI);
35613600
}
35623601
}

llvm/test/CodeGen/WebAssembly/simd-arith.ll

Lines changed: 7 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -199,139 +199,17 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
199199
; SIMD128-LABEL: mul_v16i8:
200200
; SIMD128: .functype mul_v16i8 (v128, v128) -> (v128)
201201
; SIMD128-NEXT: # %bb.0:
202-
; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 0
203-
; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 0
204-
; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
205-
; SIMD128-NEXT: i8x16.splat $push6=, $pop5
206-
; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 1
207-
; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 1
208-
; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
209-
; SIMD128-NEXT: i8x16.replace_lane $push7=, $pop6, 1, $pop2
210-
; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $0, 2
211-
; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $1, 2
212-
; SIMD128-NEXT: i32.mul $push10=, $pop9, $pop8
213-
; SIMD128-NEXT: i8x16.replace_lane $push11=, $pop7, 2, $pop10
214-
; SIMD128-NEXT: i8x16.extract_lane_u $push13=, $0, 3
215-
; SIMD128-NEXT: i8x16.extract_lane_u $push12=, $1, 3
216-
; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
217-
; SIMD128-NEXT: i8x16.replace_lane $push15=, $pop11, 3, $pop14
218-
; SIMD128-NEXT: i8x16.extract_lane_u $push17=, $0, 4
219-
; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $1, 4
220-
; SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
221-
; SIMD128-NEXT: i8x16.replace_lane $push19=, $pop15, 4, $pop18
222-
; SIMD128-NEXT: i8x16.extract_lane_u $push21=, $0, 5
223-
; SIMD128-NEXT: i8x16.extract_lane_u $push20=, $1, 5
224-
; SIMD128-NEXT: i32.mul $push22=, $pop21, $pop20
225-
; SIMD128-NEXT: i8x16.replace_lane $push23=, $pop19, 5, $pop22
226-
; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $0, 6
227-
; SIMD128-NEXT: i8x16.extract_lane_u $push24=, $1, 6
228-
; SIMD128-NEXT: i32.mul $push26=, $pop25, $pop24
229-
; SIMD128-NEXT: i8x16.replace_lane $push27=, $pop23, 6, $pop26
230-
; SIMD128-NEXT: i8x16.extract_lane_u $push29=, $0, 7
231-
; SIMD128-NEXT: i8x16.extract_lane_u $push28=, $1, 7
232-
; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
233-
; SIMD128-NEXT: i8x16.replace_lane $push31=, $pop27, 7, $pop30
234-
; SIMD128-NEXT: i8x16.extract_lane_u $push33=, $0, 8
235-
; SIMD128-NEXT: i8x16.extract_lane_u $push32=, $1, 8
236-
; SIMD128-NEXT: i32.mul $push34=, $pop33, $pop32
237-
; SIMD128-NEXT: i8x16.replace_lane $push35=, $pop31, 8, $pop34
238-
; SIMD128-NEXT: i8x16.extract_lane_u $push37=, $0, 9
239-
; SIMD128-NEXT: i8x16.extract_lane_u $push36=, $1, 9
240-
; SIMD128-NEXT: i32.mul $push38=, $pop37, $pop36
241-
; SIMD128-NEXT: i8x16.replace_lane $push39=, $pop35, 9, $pop38
242-
; SIMD128-NEXT: i8x16.extract_lane_u $push41=, $0, 10
243-
; SIMD128-NEXT: i8x16.extract_lane_u $push40=, $1, 10
244-
; SIMD128-NEXT: i32.mul $push42=, $pop41, $pop40
245-
; SIMD128-NEXT: i8x16.replace_lane $push43=, $pop39, 10, $pop42
246-
; SIMD128-NEXT: i8x16.extract_lane_u $push45=, $0, 11
247-
; SIMD128-NEXT: i8x16.extract_lane_u $push44=, $1, 11
248-
; SIMD128-NEXT: i32.mul $push46=, $pop45, $pop44
249-
; SIMD128-NEXT: i8x16.replace_lane $push47=, $pop43, 11, $pop46
250-
; SIMD128-NEXT: i8x16.extract_lane_u $push49=, $0, 12
251-
; SIMD128-NEXT: i8x16.extract_lane_u $push48=, $1, 12
252-
; SIMD128-NEXT: i32.mul $push50=, $pop49, $pop48
253-
; SIMD128-NEXT: i8x16.replace_lane $push51=, $pop47, 12, $pop50
254-
; SIMD128-NEXT: i8x16.extract_lane_u $push53=, $0, 13
255-
; SIMD128-NEXT: i8x16.extract_lane_u $push52=, $1, 13
256-
; SIMD128-NEXT: i32.mul $push54=, $pop53, $pop52
257-
; SIMD128-NEXT: i8x16.replace_lane $push55=, $pop51, 13, $pop54
258-
; SIMD128-NEXT: i8x16.extract_lane_u $push57=, $0, 14
259-
; SIMD128-NEXT: i8x16.extract_lane_u $push56=, $1, 14
260-
; SIMD128-NEXT: i32.mul $push58=, $pop57, $pop56
261-
; SIMD128-NEXT: i8x16.replace_lane $push59=, $pop55, 14, $pop58
262-
; SIMD128-NEXT: i8x16.extract_lane_u $push61=, $0, 15
263-
; SIMD128-NEXT: i8x16.extract_lane_u $push60=, $1, 15
264-
; SIMD128-NEXT: i32.mul $push62=, $pop61, $pop60
265-
; SIMD128-NEXT: i8x16.replace_lane $push63=, $pop59, 15, $pop62
266-
; SIMD128-NEXT: return $pop63
202+
; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $1
203+
; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
204+
; SIMD128-NEXT: i8x16.shuffle $push2=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
205+
; SIMD128-NEXT: return $pop2
267206
;
268207
; SIMD128-FAST-LABEL: mul_v16i8:
269208
; SIMD128-FAST: .functype mul_v16i8 (v128, v128) -> (v128)
270209
; SIMD128-FAST-NEXT: # %bb.0:
271-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push5=, $0, 0
272-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push4=, $1, 0
273-
; SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
274-
; SIMD128-FAST-NEXT: i8x16.splat $push7=, $pop6
275-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push2=, $0, 1
276-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push1=, $1, 1
277-
; SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
278-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push8=, $pop7, 1, $pop3
279-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push10=, $0, 2
280-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push9=, $1, 2
281-
; SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
282-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push12=, $pop8, 2, $pop11
283-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push14=, $0, 3
284-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push13=, $1, 3
285-
; SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
286-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push16=, $pop12, 3, $pop15
287-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push18=, $0, 4
288-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push17=, $1, 4
289-
; SIMD128-FAST-NEXT: i32.mul $push19=, $pop18, $pop17
290-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push20=, $pop16, 4, $pop19
291-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push22=, $0, 5
292-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push21=, $1, 5
293-
; SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
294-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push24=, $pop20, 5, $pop23
295-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push26=, $0, 6
296-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push25=, $1, 6
297-
; SIMD128-FAST-NEXT: i32.mul $push27=, $pop26, $pop25
298-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push28=, $pop24, 6, $pop27
299-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push30=, $0, 7
300-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push29=, $1, 7
301-
; SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
302-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push32=, $pop28, 7, $pop31
303-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push34=, $0, 8
304-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push33=, $1, 8
305-
; SIMD128-FAST-NEXT: i32.mul $push35=, $pop34, $pop33
306-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push36=, $pop32, 8, $pop35
307-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push38=, $0, 9
308-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push37=, $1, 9
309-
; SIMD128-FAST-NEXT: i32.mul $push39=, $pop38, $pop37
310-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push40=, $pop36, 9, $pop39
311-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push42=, $0, 10
312-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push41=, $1, 10
313-
; SIMD128-FAST-NEXT: i32.mul $push43=, $pop42, $pop41
314-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push44=, $pop40, 10, $pop43
315-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push46=, $0, 11
316-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push45=, $1, 11
317-
; SIMD128-FAST-NEXT: i32.mul $push47=, $pop46, $pop45
318-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push48=, $pop44, 11, $pop47
319-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push50=, $0, 12
320-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push49=, $1, 12
321-
; SIMD128-FAST-NEXT: i32.mul $push51=, $pop50, $pop49
322-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push52=, $pop48, 12, $pop51
323-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push54=, $0, 13
324-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push53=, $1, 13
325-
; SIMD128-FAST-NEXT: i32.mul $push55=, $pop54, $pop53
326-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push56=, $pop52, 13, $pop55
327-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push58=, $0, 14
328-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push57=, $1, 14
329-
; SIMD128-FAST-NEXT: i32.mul $push59=, $pop58, $pop57
330-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push60=, $pop56, 14, $pop59
331-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push62=, $0, 15
332-
; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push61=, $1, 15
333-
; SIMD128-FAST-NEXT: i32.mul $push63=, $pop62, $pop61
334-
; SIMD128-FAST-NEXT: i8x16.replace_lane $push0=, $pop60, 15, $pop63
210+
; SIMD128-FAST-NEXT: i16x8.extmul_low_i8x16_u $push2=, $0, $1
211+
; SIMD128-FAST-NEXT: i16x8.extmul_high_i8x16_u $push1=, $0, $1
212+
; SIMD128-FAST-NEXT: i8x16.shuffle $push0=, $pop2, $pop1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
335213
; SIMD128-FAST-NEXT: return $pop0
336214
;
337215
; NO-SIMD128-LABEL: mul_v16i8:

llvm/test/CodeGen/WebAssembly/vector-reduce.ll

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -116,40 +116,28 @@ define i8 @pairwise_mul_v16i8(<16 x i8> %arg) {
116116
; SIMD128-LABEL: pairwise_mul_v16i8:
117117
; SIMD128: .functype pairwise_mul_v16i8 (v128) -> (i32)
118118
; SIMD128-NEXT: # %bb.0:
119-
; SIMD128-NEXT: i8x16.extract_lane_u $push26=, $0, 0
120-
; SIMD128-NEXT: i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
121-
; SIMD128-NEXT: local.tee $push31=, $1=, $pop32
122-
; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $pop31, 0
123-
; SIMD128-NEXT: i32.mul $push27=, $pop26, $pop25
124-
; SIMD128-NEXT: i8x16.extract_lane_u $push23=, $0, 4
125-
; SIMD128-NEXT: i8x16.extract_lane_u $push22=, $1, 4
126-
; SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
127-
; SIMD128-NEXT: i32.mul $push28=, $pop27, $pop24
128-
; SIMD128-NEXT: i8x16.extract_lane_u $push19=, $0, 2
129-
; SIMD128-NEXT: i8x16.extract_lane_u $push18=, $1, 2
130-
; SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
131-
; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $0, 6
132-
; SIMD128-NEXT: i8x16.extract_lane_u $push15=, $1, 6
133-
; SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
134-
; SIMD128-NEXT: i32.mul $push21=, $pop20, $pop17
135-
; SIMD128-NEXT: i32.mul $push29=, $pop28, $pop21
136-
; SIMD128-NEXT: i8x16.extract_lane_u $push11=, $0, 1
137-
; SIMD128-NEXT: i8x16.extract_lane_u $push10=, $1, 1
138-
; SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
139-
; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $0, 5
140-
; SIMD128-NEXT: i8x16.extract_lane_u $push7=, $1, 5
141-
; SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
142-
; SIMD128-NEXT: i32.mul $push13=, $pop12, $pop9
143-
; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 3
144-
; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 3
145-
; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
146-
; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 7
147-
; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 7
148-
; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
149-
; SIMD128-NEXT: i32.mul $push6=, $pop5, $pop2
150-
; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop6
151-
; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop14
152-
; SIMD128-NEXT: return $pop30
119+
; SIMD128-NEXT: i8x16.shuffle $push20=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
120+
; SIMD128-NEXT: local.tee $push19=, $1=, $pop20
121+
; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $pop19
122+
; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
123+
; SIMD128-NEXT: i8x16.shuffle $push18=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
124+
; SIMD128-NEXT: local.tee $push17=, $0=, $pop18
125+
; SIMD128-NEXT: i8x16.shuffle $push16=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
126+
; SIMD128-NEXT: local.tee $push15=, $1=, $pop16
127+
; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push3=, $pop17, $pop15
128+
; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push2=, $0, $1
129+
; SIMD128-NEXT: i8x16.shuffle $push14=, $pop3, $pop2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
130+
; SIMD128-NEXT: local.tee $push13=, $0=, $pop14
131+
; SIMD128-NEXT: i8x16.shuffle $push12=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
132+
; SIMD128-NEXT: local.tee $push11=, $1=, $pop12
133+
; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push5=, $pop13, $pop11
134+
; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1
135+
; SIMD128-NEXT: i8x16.shuffle $push10=, $pop5, $pop4, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
136+
; SIMD128-NEXT: local.tee $push9=, $0=, $pop10
137+
; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
138+
; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push7=, $pop9, $pop6
139+
; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $pop7, 0
140+
; SIMD128-NEXT: return $pop8
153141
%res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg)
154142
ret i8 %res
155143
}

0 commit comments

Comments
 (0)