Skip to content

Commit dad0929

Browse files
committed
[AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions
1 parent 792b728 commit dad0929

File tree

10 files changed

+415
-0
lines changed

10 files changed

+415
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
746746
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts")
747747
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts")
748748
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts")
749+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
750+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
751+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
752+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
753+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
754+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
749755
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
750756
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
751757
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
929929
// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]])
930930
// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
931931
// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
932+
// CHECK-NEXT: [[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
933+
// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
934+
// CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
935+
// CHECK-NEXT: [[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]])
936+
// CHECK-NEXT: [[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
937+
// CHECK-NEXT: store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16
938+
// CHECK-NEXT: [[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
939+
// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
940+
// CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
941+
// CHECK-NEXT: [[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]])
942+
// CHECK-NEXT: [[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
943+
// CHECK-NEXT: store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16
944+
// CHECK-NEXT: [[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
945+
// CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
946+
// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
947+
// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]])
948+
// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
949+
// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
950+
// CHECK-NEXT: [[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
951+
// CHECK-NEXT: [[TMP61:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
952+
// CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
953+
// CHECK-NEXT: [[TMP63:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> [[TMP60]], i32 [[TMP61]], float [[TMP62]])
954+
// CHECK-NEXT: [[TMP64:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
955+
// CHECK-NEXT: store <3 x i32> [[TMP63]], ptr addrspace(1) [[TMP64]], align 16
956+
// CHECK-NEXT: [[TMP65:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
957+
// CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
958+
// CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
959+
// CHECK-NEXT: [[TMP68:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> [[TMP65]], i32 [[TMP66]], float [[TMP67]])
960+
// CHECK-NEXT: [[TMP69:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
961+
// CHECK-NEXT: store <3 x i32> [[TMP68]], ptr addrspace(1) [[TMP69]], align 16
962+
// CHECK-NEXT: [[TMP70:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
963+
// CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
964+
// CHECK-NEXT: [[TMP72:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
965+
// CHECK-NEXT: [[TMP73:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> [[TMP70]], i32 [[TMP71]], float [[TMP72]])
966+
// CHECK-NEXT: [[TMP74:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
967+
// CHECK-NEXT: store <3 x i32> [[TMP73]], ptr addrspace(1) [[TMP74]], align 16
932968
// CHECK-NEXT: ret void
933969
//
934970
void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
@@ -944,6 +980,12 @@ void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, fl
944980
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale);
945981
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale);
946982
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale);
983+
*out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16(srcbf16, sr, scale);
984+
*out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16(srch16, sr, scale);
985+
*out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16(srcbf16, sr, scale);
986+
*out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16(srch16, sr, scale);
987+
*out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32(srcf16, sr, scale);
988+
*out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32(srcf16, sr, scale);
947989
}
948990

949991
// CHECK-LABEL: @test_sat_pk4_i4_i8(

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,12 @@ def int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm
701701
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_fp4_f32">;
702702
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_fp4_f16">;
703703
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_fp4_bf16">;
704+
def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_fp6_f32">;
705+
def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_bf6_f32">;
706+
def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_fp6_f16">;
707+
def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_bf6_f16">;
708+
def int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_fp6_bf16">;
709+
def int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_bf6_bf16">;
704710

705711
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
706712
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4633,6 +4633,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
46334633
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
46344634
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
46354635
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
4636+
case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
4637+
case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
4638+
case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
4639+
case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
4640+
case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
4641+
case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
46364642
case Intrinsic::amdgcn_sat_pk4_i4_i8:
46374643
case Intrinsic::amdgcn_sat_pk4_u4_u8:
46384644
case Intrinsic::amdgcn_fmed3:

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2973,6 +2973,9 @@ def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
29732973
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
29742974
def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
29752975
def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
2976+
def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>;
2977+
def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>;
2978+
def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>;
29762979
def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>;
29772980
def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>;
29782981
def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1820,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in {
18201820
defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>;
18211821
defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>;
18221822
} // End WaveSizePredicate = isWave32
1823+
defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>;
1824+
defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>;
1825+
defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>;
1826+
defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>;
1827+
defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>;
1828+
defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>;
18231829
} // End Constraints = "@earlyclobber $vdst"
18241830

18251831
let True16Predicate = UseRealTrue16Insts in {
@@ -2272,6 +2278,12 @@ defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
22722278
defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
22732279
defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
22742280
defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
2281+
defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>;
2282+
defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>;
2283+
defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>;
2284+
defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>;
2285+
defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>;
2286+
defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>;
22752287
defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
22762288
defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
22772289
defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;

0 commit comments

Comments
 (0)