-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[AArch64] Fix post-inc stores of floating-point conversions #151372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
f370a89
to
ba0936c
Compare
@llvm/pr-subscribers-backend-aarch64 Author: Guy David (guy-david) Changes: The commit at #147707 introduced a bug because of missing patterns for post-inc stores where the input is a vector_extract with i64 types. Additionally, remove the pre-legalization early-exit, as it can cause the combine to miss its opportunity to apply the optimization. Full diff: https://github.com/llvm/llvm-project/pull/151372.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4fef93cc5aec5..8eed5458fec23 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24135,9 +24135,6 @@ static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
- // Limit to post-legalization in order to avoid peeling truncating stores.
- if (DCI.isBeforeLegalize())
- return SDValue();
if (!Subtarget->isNeonAvailable())
return SDValue();
// Source operand is already a vector.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 251fd44b6ea31..a62de87b072e9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9273,8 +9273,12 @@ multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
1>;
+defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i64, ST1i8_POST,
+ 1>;
defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
2>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i64, ST1i16_POST,
+ 2>;
defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
index c46801fc16714..1d4073f673edb 100644
--- a/llvm/test/CodeGen/AArch64/store-float-conversion.ll
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -27,6 +27,20 @@ entry:
ret void
}
+define ptr @f32_to_s8_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: st1 { v0.b }[0], [x0], #1
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ %next = getelementptr i8, ptr %dst, i64 1
+ store i8 %trunc, ptr %dst
+ ret ptr %next
+}
+
define void @f32_to_u16(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u16:
; CHECK: // %bb.0: // %entry
@@ -53,6 +67,20 @@ entry:
ret void
}
+define ptr @f32_to_s16_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: st1 { v0.h }[0], [x0], #2
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i16
+ %next = getelementptr i16, ptr %dst, i64 1
+ store i16 %trunc, ptr %dst
+ ret ptr %next
+}
+
define void @f32_to_u32(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u32:
; CHECK: // %bb.0: // %entry
@@ -77,6 +105,19 @@ entry:
ret void
}
+define ptr @f32_to_s32_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: st1 { v0.s }[0], [x0], #4
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %next = getelementptr i32, ptr %dst, i64 1
+ store i32 %conv, ptr %dst
+ ret ptr %next
+}
+
define void @f32_to_s64(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s64:
; CHECK: // %bb.0: // %entry
@@ -115,6 +156,93 @@ entry:
ret void
}
+define ptr @f64_to_s64_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: st1 { v0.d }[0], [x0], #8
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi double %d to i64
+ %next = getelementptr i64, ptr %dst, i64 1
+ store i64 %conv, ptr %dst
+ ret ptr %next
+}
+
+define void @f64_to_u8(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptoui double %d to i64
+ %trunc = trunc i64 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f64_to_s8(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define ptr @f64_to_s8_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s8_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: st1 { v0.b }[0], [x0], #1
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i8
+ store i8 %trunc, ptr %dst
+ %next = getelementptr i8, ptr %dst, i64 1
+ ret ptr %next
+}
+
+define void @f64_to_u16(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptoui double %d to i64
+ %trunc = trunc i64 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f64_to_s16(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define ptr @f64_to_s16_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s16_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: st1 { v0.h }[0], [x0], #2
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i16
+ %next = getelementptr i16, ptr %dst, i64 1
+ store i16 %trunc, ptr %dst
+ ret ptr %next
+}
+
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_i32_multiple_uses:
; CHECK: // %bb.0: // %entry
|
ba0936c
to
89a50df
Compare
The commit at #147707 introduced a bug because of missing patterns for post-inc stores where the input is a vector_extract with i64 types. Additionally, remove the pre-legalization early-exit, as it can cause the combine to miss its opportunity to apply the optimization.
89a50df
to
0c0505d
Compare
You cannot fix it like this because the patterns are not really missing. It is a design decision for element types smaller than i32 to use i32 as their container type. This is common across element extracts, inserts, splats, and likely elsewhere. It might be worth going the isel route you tried before, but either restricting the pattern to the zero-index case or implementing a way to ensure the index is scaled appropriately for the target instruction.
The original issue was fixed on top of main via 3a4d506.
The commit at #147707 introduced a bug because of missing patterns for post-inc truncating stores where the input is a vector_extract with i64 types.
Additionally, remove the pre-legalization early-exit, as it can cause the combine to miss its opportunity to apply the optimization.