-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[WebAssembly] v16i8 mul support #150209
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WebAssembly] v16i8 mul support #150209
Conversation
During target DAG combine, use an i16x8.extmul_low_i8x16_u, an i16x8.extmul_high_i8x16_u and a shuffle for v16i8 mul. On my AArch64 machine, using V8, I observe a 3.14% geomean improvement across 65 benchmarks, including: 9.2% for spec2017.x264, 6% for libyuv and 1.8% for ncnn.
@llvm/pr-subscribers-backend-webassembly Author: Sam Parker (sparker-arm) ChangesDuring target DAG combine, use two i16x8.extmul_low_i8x16 and a shuffle for v16i8 mul. On my AArch64 machine, using V8, I observe a 3.14% geomean improvement across 65 benchmarks, including: 9.2% for spec2017.x264, 6% for libyuv and 1.8% for ncnn. Full diff: https://github.com/llvm/llvm-project/pull/150209.diff 3 Files Affected:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 09b8864dfd7e9..43aa37a25f882 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3435,8 +3435,7 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::MUL);
+static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (VT != MVT::v8i32 && VT != MVT::v16i32)
return SDValue();
@@ -3522,6 +3521,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performMulCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N->getOpcode() == ISD::MUL);
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
+ return Res;
+
+ // We don't natively support v16i8 mul, but we do support v8i16 so split the
+ // inputs and extend them to v8i16. Only do this before legalization in case
+ // a narrow vector is widened and may be simplified later.
+ if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue LowLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
+ SDValue HighLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
+ SDValue LowRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
+ SDValue HighRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
+
+ SDValue MulLow =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
+ SDValue MulHigh = DAG.getBitcast(
+ VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
+
+ // Take the low byte of each lane.
+ return DAG.getVectorShuffle(
+ VT, DL, MulLow, MulHigh,
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3556,6 +3595,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performLowerPartialReduction(N, DCI.DAG);
}
case ISD::MUL:
- return performMulCombine(N, DCI.DAG);
+ return performMulCombine(N, DCI);
}
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index e3607e12bf530..36637e1d555bd 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -199,139 +199,17 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SIMD128-LABEL: mul_v16i8:
; SIMD128: .functype mul_v16i8 (v128, v128) -> (v128)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 0
-; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 0
-; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT: i8x16.splat $push6=, $pop5
-; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 1
-; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 1
-; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT: i8x16.replace_lane $push7=, $pop6, 1, $pop2
-; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $0, 2
-; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $1, 2
-; SIMD128-NEXT: i32.mul $push10=, $pop9, $pop8
-; SIMD128-NEXT: i8x16.replace_lane $push11=, $pop7, 2, $pop10
-; SIMD128-NEXT: i8x16.extract_lane_u $push13=, $0, 3
-; SIMD128-NEXT: i8x16.extract_lane_u $push12=, $1, 3
-; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; SIMD128-NEXT: i8x16.replace_lane $push15=, $pop11, 3, $pop14
-; SIMD128-NEXT: i8x16.extract_lane_u $push17=, $0, 4
-; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $1, 4
-; SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
-; SIMD128-NEXT: i8x16.replace_lane $push19=, $pop15, 4, $pop18
-; SIMD128-NEXT: i8x16.extract_lane_u $push21=, $0, 5
-; SIMD128-NEXT: i8x16.extract_lane_u $push20=, $1, 5
-; SIMD128-NEXT: i32.mul $push22=, $pop21, $pop20
-; SIMD128-NEXT: i8x16.replace_lane $push23=, $pop19, 5, $pop22
-; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $0, 6
-; SIMD128-NEXT: i8x16.extract_lane_u $push24=, $1, 6
-; SIMD128-NEXT: i32.mul $push26=, $pop25, $pop24
-; SIMD128-NEXT: i8x16.replace_lane $push27=, $pop23, 6, $pop26
-; SIMD128-NEXT: i8x16.extract_lane_u $push29=, $0, 7
-; SIMD128-NEXT: i8x16.extract_lane_u $push28=, $1, 7
-; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; SIMD128-NEXT: i8x16.replace_lane $push31=, $pop27, 7, $pop30
-; SIMD128-NEXT: i8x16.extract_lane_u $push33=, $0, 8
-; SIMD128-NEXT: i8x16.extract_lane_u $push32=, $1, 8
-; SIMD128-NEXT: i32.mul $push34=, $pop33, $pop32
-; SIMD128-NEXT: i8x16.replace_lane $push35=, $pop31, 8, $pop34
-; SIMD128-NEXT: i8x16.extract_lane_u $push37=, $0, 9
-; SIMD128-NEXT: i8x16.extract_lane_u $push36=, $1, 9
-; SIMD128-NEXT: i32.mul $push38=, $pop37, $pop36
-; SIMD128-NEXT: i8x16.replace_lane $push39=, $pop35, 9, $pop38
-; SIMD128-NEXT: i8x16.extract_lane_u $push41=, $0, 10
-; SIMD128-NEXT: i8x16.extract_lane_u $push40=, $1, 10
-; SIMD128-NEXT: i32.mul $push42=, $pop41, $pop40
-; SIMD128-NEXT: i8x16.replace_lane $push43=, $pop39, 10, $pop42
-; SIMD128-NEXT: i8x16.extract_lane_u $push45=, $0, 11
-; SIMD128-NEXT: i8x16.extract_lane_u $push44=, $1, 11
-; SIMD128-NEXT: i32.mul $push46=, $pop45, $pop44
-; SIMD128-NEXT: i8x16.replace_lane $push47=, $pop43, 11, $pop46
-; SIMD128-NEXT: i8x16.extract_lane_u $push49=, $0, 12
-; SIMD128-NEXT: i8x16.extract_lane_u $push48=, $1, 12
-; SIMD128-NEXT: i32.mul $push50=, $pop49, $pop48
-; SIMD128-NEXT: i8x16.replace_lane $push51=, $pop47, 12, $pop50
-; SIMD128-NEXT: i8x16.extract_lane_u $push53=, $0, 13
-; SIMD128-NEXT: i8x16.extract_lane_u $push52=, $1, 13
-; SIMD128-NEXT: i32.mul $push54=, $pop53, $pop52
-; SIMD128-NEXT: i8x16.replace_lane $push55=, $pop51, 13, $pop54
-; SIMD128-NEXT: i8x16.extract_lane_u $push57=, $0, 14
-; SIMD128-NEXT: i8x16.extract_lane_u $push56=, $1, 14
-; SIMD128-NEXT: i32.mul $push58=, $pop57, $pop56
-; SIMD128-NEXT: i8x16.replace_lane $push59=, $pop55, 14, $pop58
-; SIMD128-NEXT: i8x16.extract_lane_u $push61=, $0, 15
-; SIMD128-NEXT: i8x16.extract_lane_u $push60=, $1, 15
-; SIMD128-NEXT: i32.mul $push62=, $pop61, $pop60
-; SIMD128-NEXT: i8x16.replace_lane $push63=, $pop59, 15, $pop62
-; SIMD128-NEXT: return $pop63
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $1
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push2=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: return $pop2
;
; SIMD128-FAST-LABEL: mul_v16i8:
; SIMD128-FAST: .functype mul_v16i8 (v128, v128) -> (v128)
; SIMD128-FAST-NEXT: # %bb.0:
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push5=, $0, 0
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push4=, $1, 0
-; SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; SIMD128-FAST-NEXT: i8x16.splat $push7=, $pop6
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push2=, $0, 1
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push1=, $1, 1
-; SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push8=, $pop7, 1, $pop3
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push10=, $0, 2
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push9=, $1, 2
-; SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push12=, $pop8, 2, $pop11
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push14=, $0, 3
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push13=, $1, 3
-; SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push16=, $pop12, 3, $pop15
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push18=, $0, 4
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push17=, $1, 4
-; SIMD128-FAST-NEXT: i32.mul $push19=, $pop18, $pop17
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push20=, $pop16, 4, $pop19
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push22=, $0, 5
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push21=, $1, 5
-; SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push24=, $pop20, 5, $pop23
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push26=, $0, 6
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push25=, $1, 6
-; SIMD128-FAST-NEXT: i32.mul $push27=, $pop26, $pop25
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push28=, $pop24, 6, $pop27
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push30=, $0, 7
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push29=, $1, 7
-; SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push32=, $pop28, 7, $pop31
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push34=, $0, 8
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push33=, $1, 8
-; SIMD128-FAST-NEXT: i32.mul $push35=, $pop34, $pop33
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push36=, $pop32, 8, $pop35
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push38=, $0, 9
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push37=, $1, 9
-; SIMD128-FAST-NEXT: i32.mul $push39=, $pop38, $pop37
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push40=, $pop36, 9, $pop39
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push42=, $0, 10
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push41=, $1, 10
-; SIMD128-FAST-NEXT: i32.mul $push43=, $pop42, $pop41
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push44=, $pop40, 10, $pop43
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push46=, $0, 11
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push45=, $1, 11
-; SIMD128-FAST-NEXT: i32.mul $push47=, $pop46, $pop45
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push48=, $pop44, 11, $pop47
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push50=, $0, 12
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push49=, $1, 12
-; SIMD128-FAST-NEXT: i32.mul $push51=, $pop50, $pop49
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push52=, $pop48, 12, $pop51
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push54=, $0, 13
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push53=, $1, 13
-; SIMD128-FAST-NEXT: i32.mul $push55=, $pop54, $pop53
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push56=, $pop52, 13, $pop55
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push58=, $0, 14
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push57=, $1, 14
-; SIMD128-FAST-NEXT: i32.mul $push59=, $pop58, $pop57
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push60=, $pop56, 14, $pop59
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push62=, $0, 15
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push61=, $1, 15
-; SIMD128-FAST-NEXT: i32.mul $push63=, $pop62, $pop61
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push0=, $pop60, 15, $pop63
+; SIMD128-FAST-NEXT: i16x8.extmul_low_i8x16_u $push2=, $0, $1
+; SIMD128-FAST-NEXT: i16x8.extmul_high_i8x16_u $push1=, $0, $1
+; SIMD128-FAST-NEXT: i8x16.shuffle $push0=, $pop2, $pop1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; SIMD128-FAST-NEXT: return $pop0
;
; NO-SIMD128-LABEL: mul_v16i8:
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
index 1d194b640eab2..4c30a3adf2378 100644
--- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -116,40 +116,28 @@ define i8 @pairwise_mul_v16i8(<16 x i8> %arg) {
; SIMD128-LABEL: pairwise_mul_v16i8:
; SIMD128: .functype pairwise_mul_v16i8 (v128) -> (i32)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.extract_lane_u $push26=, $0, 0
-; SIMD128-NEXT: i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-; SIMD128-NEXT: local.tee $push31=, $1=, $pop32
-; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $pop31, 0
-; SIMD128-NEXT: i32.mul $push27=, $pop26, $pop25
-; SIMD128-NEXT: i8x16.extract_lane_u $push23=, $0, 4
-; SIMD128-NEXT: i8x16.extract_lane_u $push22=, $1, 4
-; SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; SIMD128-NEXT: i32.mul $push28=, $pop27, $pop24
-; SIMD128-NEXT: i8x16.extract_lane_u $push19=, $0, 2
-; SIMD128-NEXT: i8x16.extract_lane_u $push18=, $1, 2
-; SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $0, 6
-; SIMD128-NEXT: i8x16.extract_lane_u $push15=, $1, 6
-; SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
-; SIMD128-NEXT: i32.mul $push21=, $pop20, $pop17
-; SIMD128-NEXT: i32.mul $push29=, $pop28, $pop21
-; SIMD128-NEXT: i8x16.extract_lane_u $push11=, $0, 1
-; SIMD128-NEXT: i8x16.extract_lane_u $push10=, $1, 1
-; SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $0, 5
-; SIMD128-NEXT: i8x16.extract_lane_u $push7=, $1, 5
-; SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; SIMD128-NEXT: i32.mul $push13=, $pop12, $pop9
-; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 3
-; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 3
-; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 7
-; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 7
-; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT: i32.mul $push6=, $pop5, $pop2
-; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop6
-; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop14
-; SIMD128-NEXT: return $pop30
+; SIMD128-NEXT: i8x16.shuffle $push20=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push19=, $1=, $pop20
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $pop19
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push18=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push17=, $0=, $pop18
+; SIMD128-NEXT: i8x16.shuffle $push16=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push15=, $1=, $pop16
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push3=, $pop17, $pop15
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push2=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push14=, $pop3, $pop2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push13=, $0=, $pop14
+; SIMD128-NEXT: i8x16.shuffle $push12=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push11=, $1=, $pop12
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push5=, $pop13, $pop11
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push10=, $pop5, $pop4, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push9=, $0=, $pop10
+; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push7=, $pop9, $pop6
+; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $pop7, 0
+; SIMD128-NEXT: return $pop8
%res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg)
ret i8 %res
}
|
In writing code gen tests for #146864, I realised that we don't have a v16i8 mul instruction and the code gen looked bad. So, I'd like to get this fixed first. |
During target DAG combine, use an i16x8.extmul_low_i8x16_u, an i16x8.extmul_high_i8x16_u and a shuffle for v16i8 mul.
On my AArch64 machine, using V8, I observe a 3.14% geomean improvement across 65 benchmarks, including: 9.2% for spec2017.x264, 6% for libyuv and 1.8% for ncnn.