-
Notifications
You must be signed in to change notification settings - Fork 14.7k
release/21.x: [AArch64] Keep floating-point conversion in SIMD (#147707) #151317
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Stores can be issued faster if the result is kept in the SIMD/FP registers. The `HasOneUse` guards against creating two floating point conversions, if for example there's some arithmetic done on the converted value as well. Another approach would be to inspect the user instructions during lowering, but I don't see that type of check in the lowering too often. (cherry picked from commit 58d70dc)
@paulwalker-arm What do you think about merging this PR to the release branch? |
@llvm/pr-subscribers-backend-aarch64 Author: None (llvmbot) ChangesBackport 58d70dc Requested by: @guy-david Full diff: https://github.com/llvm/llvm-project/pull/151317.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bde4ba993f69e..f5d14e8972d26 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23908,6 +23908,67 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
Store->getMemOperand());
}
+// Combine store (fp_to_int X) to use vector semantics around the conversion
+// when NEON is available. This allows us to store the in-vector result directly
+// without transferring the result into a GPR in the process.
+static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ // Limit to post-legalization in order to avoid peeling truncating stores.
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+ if (!Subtarget->isNeonAvailable())
+ return SDValue();
+ // Source operand is already a vector.
+ SDValue Value = ST->getValue();
+ if (Value.getValueType().isVector())
+ return SDValue();
+
+ // Look through potential assertions.
+ while (Value->isAssert())
+ Value = Value.getOperand(0);
+
+ if (Value.getOpcode() != ISD::FP_TO_SINT &&
+ Value.getOpcode() != ISD::FP_TO_UINT)
+ return SDValue();
+ if (!Value->hasOneUse())
+ return SDValue();
+
+ SDValue FPSrc = Value.getOperand(0);
+ EVT SrcVT = FPSrc.getValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // No support for assignments such as i64 = fp_to_sint i32
+ EVT VT = Value.getSimpleValueType();
+ if (VT != SrcVT.changeTypeToInteger())
+ return SDValue();
+
+ // Create a 128-bit element vector to avoid widening. The floating point
+ // conversion is transformed into a single element conversion via a pattern.
+ unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
+ EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
+ EVT VecDstVT = VecSrcVT.changeTypeToInteger();
+ SDLoc DL(ST);
+ SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
+ SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
+
+ if (ST->isTruncatingStore()) {
+ EVT NewVecDstVT = EVT::getVectorVT(
+ *DAG.getContext(), ST->getMemoryVT(),
+ VecDstVT.getFixedSizeInBits() / ST->getMemoryVT().getFixedSizeInBits());
+ VecConv = DAG.getNode(AArch64ISD::NVCAST, DL, NewVecDstVT, VecConv);
+ }
+
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+ SDValue Extracted =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
+
+ DCI.CombineTo(ST->getValue().getNode(), Extracted);
+ return SDValue(ST, 0);
+}
+
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -23990,6 +24051,9 @@ static SDValue performSTORECombine(SDNode *N,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc DL(ST);
+ if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
+ return Res;
+
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
EVT EltVT = VT.getVectorElementType();
return EltVT == MVT::f32 || EltVT == MVT::f64;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ddc685fae5e9a..13194147da8f6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6632,6 +6632,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
+def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
+def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
+
// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index a44c746e0f281..fe48dbaf1ab76 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-NEXT: csel x10, x9, xzr, lt
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: ldr s3, [x4, x10]
-; CHECK-NEXT: fcvtzs w10, s3
-; CHECK-NEXT: str w10, [x2], #4
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
; CHECK-NEXT: b.ne .LBB0_2
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
; CHECK-NEXT: mov w0, wzr
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
new file mode 100644
index 0000000000000..c46801fc16714
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+
+define void @f32_to_u8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f32_to_s32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f32_to_s64(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: sxtw x8, w8
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %ext = sext i32 %conv to i64
+ store i64 %ext, ptr %dst
+ ret void
+}
+
+define void @f64_to_u64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_s64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_i32_multiple_uses:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: strb w8, [x9]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret i32 %conv
+}
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index aa0a163b96ac8..223698ba225a8 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -63,8 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s2, #0.0
; CHECK-NEXT: fcsel s2, s0, s3, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: fcvtzs w11, s2
-; CHECK-NEXT: strb w11, [x9], #1
+; CHECK-NEXT: fcvtzs s2, s2
+; CHECK-NEXT: st1 { v2.b }[0], [x9], #1
; CHECK-NEXT: b.ne .LBB0_7
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
; CHECK-NEXT: ret
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: fcsel s3, s0, s4, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w11, [x9]
-; CHECK-NEXT: fcvtzs w12, s3
-; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: str b2, [x9]
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: stur b3, [x9, #1]
; CHECK-NEXT: add x9, x9, #2
; CHECK-NEXT: b.ne .LBB1_6
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: ldr s3, [x8, #8]
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: add x8, x8, #12
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: strb w11, [x9]
+; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcsel s5, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w12, s4
+; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s3, s0, s5, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w12, [x9, #1]
-; CHECK-NEXT: fcvtzs w13, s3
-; CHECK-NEXT: strb w13, [x9, #2]
+; CHECK-NEXT: stur b4, [x9, #1]
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: stur b3, [x9, #2]
; CHECK-NEXT: add x9, x9, #3
; CHECK-NEXT: b.ne .LBB2_8
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: ldp s3, s5, [x8, #8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: strb w11, [x9]
-; CHECK-NEXT: fcvtzs w12, s4
+; CHECK-NEXT: str b2, [x9]
+; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s6, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcsel s3, s0, s6, mi
; CHECK-NEXT: fcmp s5, s1
-; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: stur b4, [x9, #1]
; CHECK-NEXT: fcsel s6, s1, s5, gt
; CHECK-NEXT: fcmp s5, #0.0
-; CHECK-NEXT: fcvtzs w13, s3
-; CHECK-NEXT: fcsel s2, s0, s6, mi
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: fcsel s5, s0, s6, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w13, [x9, #2]
-; CHECK-NEXT: fcvtzs w14, s2
-; CHECK-NEXT: strb w14, [x9, #3]
+; CHECK-NEXT: stur b3, [x9, #2]
+; CHECK-NEXT: fcvtzs s5, s5
+; CHECK-NEXT: stur b5, [x9, #3]
; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: b.ne .LBB3_6
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup
|
|
As far as I know the PR is not fixing an existing bug or performance regression so it depends if we've passed the point of accepting new optimisations. @guy-david will need to argue the case for the PRs importance based on their need. The PR itself is specific to AArch64 so there is no danger to other targets. That said, the PR has just landed so it seems prudent to wait for wider buildbot testing to complete before pulling into a release. |
This optimization is important for us to land in this release, since it also affects internal workloads we care about. It's rather conservative and concerns a very specific pattern around floating-point conversions in the AArch64 backend. Safety is above all, so we should definitely wait for the the more expensive bots to complete their verification. |
If this does get picked up, then Paul's fix at 3a4d506 must also be included. Maybe it's best to open a new request, I'm not sure how to update an existing one. |
This doesn't seem to be a regression or bugfix but rather a smaller performance improvement. Not sure that this would qualify for backporting. If you want to update this PR you can run WDYT: @paulwalker-arm @fhahn @nikic |
I feel it might be a bit late for patches like this so close to the release I'm afraid. The implementation might be a bit sub optimal in places too, I will leave a comment on the original patch. |
Will not backport. |
Backport 58d70dc
Requested by: @guy-david