Skip to content

Commit 5625e6a

Browse files
committed
[X86] Improve min/max reduction costs.
This is similar to what I recently did for getArithmeticReductionCost. I'm trying to account for the narrowing from 512->256->128 as we go. I've also added a new helper method getMinMaxCost that tries to handle the cases where we have native min/max instructions and fall back to cmp+select when we don't. Differential Revision: https://reviews.llvm.org/D76634
1 parent 5fe2809 commit 5625e6a

File tree

10 files changed

+1119
-1028
lines changed

10 files changed

+1119
-1028
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 250 additions & 186 deletions
Large diffs are not rendered by default.

llvm/lib/Target/X86/X86TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
156156
int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
157157
bool IsPairwiseForm);
158158

159+
int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
160+
159161
int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
160162
bool IsUnsigned);
161163

llvm/test/Analysis/CostModel/X86/reduce-fmax.ll

Lines changed: 30 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,26 @@
1212
define i32 @reduce_f64(i32 %arg) {
1313
; SSE-LABEL: 'reduce_f64'
1414
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
15-
; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
16-
; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
17-
; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
18-
; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
15+
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
16+
; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
17+
; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
18+
; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
1919
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
2020
;
2121
; AVX-LABEL: 'reduce_f64'
2222
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
23-
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
24-
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
25-
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
26-
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
23+
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
24+
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
25+
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
26+
; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
2727
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
2828
;
2929
; AVX512-LABEL: 'reduce_f64'
3030
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
31-
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
32-
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
33-
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
34-
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
31+
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
32+
; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
33+
; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
34+
; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
3535
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
3636
;
3737
%V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
@@ -43,49 +43,31 @@ define i32 @reduce_f64(i32 %arg) {
4343
}
4444

4545
define i32 @reduce_f32(i32 %arg) {
46-
; SSE2-LABEL: 'reduce_f32'
47-
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
48-
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
49-
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
50-
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
51-
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
52-
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
53-
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
54-
;
55-
; SSSE3-LABEL: 'reduce_f32'
56-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
57-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
58-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
59-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
60-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
61-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
62-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
63-
;
64-
; SSE4-LABEL: 'reduce_f32'
65-
; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
66-
; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
67-
; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
68-
; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
69-
; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
70-
; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
71-
; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
46+
; SSE-LABEL: 'reduce_f32'
47+
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
48+
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
49+
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
50+
; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
51+
; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
52+
; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
53+
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
7254
;
7355
; AVX-LABEL: 'reduce_f32'
7456
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
75-
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
76-
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
77-
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
78-
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
79-
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
57+
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
58+
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
59+
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
60+
; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
61+
; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
8062
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
8163
;
8264
; AVX512-LABEL: 'reduce_f32'
8365
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
84-
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
85-
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
86-
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
87-
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
88-
; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
66+
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
67+
; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
68+
; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
69+
; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
70+
; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
8971
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
9072
;
9173
%V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)

0 commit comments

Comments
 (0)