Skip to content

Commit bc463c0

Browse files
authored
[AMDGPU] v_cvt_scalef32_pk16_* gfx1250 instructions (llvm#151807)
1 parent c330585 commit bc463c0

File tree

10 files changed

+480
-0
lines changed

10 files changed

+480
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -731,6 +731,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", "gfx
731731
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", "gfx1250-insts")
732732
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", "gfx1250-insts")
733733
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", "gfx1250-insts")
// pk16 scaled conversions to fp6/bf6. Each builtin takes a 16-element source
// vector (float "V16f", half "V16h" or bfloat "V16y") followed by a float
// scale, and returns a vector of three unsigned ints ("V3Ui"). They are gated
// on "gfx1250-insts", like the pk8 builtins above.
// NOTE(review): 16 x 6 bits = 96 bits, presumably why the result is three
// 32-bit words -- confirm against the ISA description.
734+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_fp6_f32, "V3UiV16ff", "nc", "gfx1250-insts")
735+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_bf6_f32, "V3UiV16ff", "nc", "gfx1250-insts")
736+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_fp6_f16, "V3UiV16hf", "nc", "gfx1250-insts")
737+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_bf6_f16, "V3UiV16hf", "nc", "gfx1250-insts")
738+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_fp6_bf16, "V3UiV16yf", "nc", "gfx1250-insts")
739+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_bf6_bf16, "V3UiV16yf", "nc", "gfx1250-insts")
734740
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", "nc", "gfx1250-insts")
735741
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", "nc", "gfx1250-insts")
736742
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", "nc", "gfx1250-insts")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -787,6 +787,36 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
787787
// CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]])
788788
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
789789
// CHECK-NEXT: store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
790+
// CHECK-NEXT: [[TMP36:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
791+
// CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
792+
// CHECK-NEXT: [[TMP38:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> [[TMP36]], float [[TMP37]])
793+
// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
794+
// CHECK-NEXT: store <3 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 16
795+
// CHECK-NEXT: [[TMP40:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
796+
// CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
797+
// CHECK-NEXT: [[TMP42:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> [[TMP40]], float [[TMP41]])
798+
// CHECK-NEXT: [[TMP43:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
799+
// CHECK-NEXT: store <3 x i32> [[TMP42]], ptr addrspace(1) [[TMP43]], align 16
800+
// CHECK-NEXT: [[TMP44:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
801+
// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
802+
// CHECK-NEXT: [[TMP46:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> [[TMP44]], float [[TMP45]])
803+
// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
804+
// CHECK-NEXT: store <3 x i32> [[TMP46]], ptr addrspace(1) [[TMP47]], align 16
805+
// CHECK-NEXT: [[TMP48:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
806+
// CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
807+
// CHECK-NEXT: [[TMP50:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> [[TMP48]], float [[TMP49]])
808+
// CHECK-NEXT: [[TMP51:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
809+
// CHECK-NEXT: store <3 x i32> [[TMP50]], ptr addrspace(1) [[TMP51]], align 16
810+
// CHECK-NEXT: [[TMP52:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
811+
// CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
812+
// CHECK-NEXT: [[TMP54:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> [[TMP52]], float [[TMP53]])
813+
// CHECK-NEXT: [[TMP55:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
814+
// CHECK-NEXT: store <3 x i32> [[TMP54]], ptr addrspace(1) [[TMP55]], align 16
815+
// CHECK-NEXT: [[TMP56:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
816+
// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
817+
// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> [[TMP56]], float [[TMP57]])
818+
// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
819+
// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
790820
// CHECK-NEXT: ret void
791821
//
792822
void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
@@ -802,6 +832,12 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
802832
*out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32(srcf8, scale);
803833
*out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16(srch8, scale);
804834
*out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
835+
*out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_bf16(srcbf16, scale);
836+
*out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_f16(srch16, scale);
837+
*out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_bf16(srcbf16, scale);
838+
*out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_f16(srch16, scale);
839+
*out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_f32(srcf16, scale);
840+
*out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_f32(srcf16, scale);
805841
}
806842

807843
// CHECK-LABEL: @test_cvt_scalef32_sr_pk(

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -679,6 +679,12 @@ def int_amdgcn_cvt_scalef32_pk8_bf8_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i3
679679
def int_amdgcn_cvt_scalef32_pk8_fp4_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_fp4_f32">;
680680
def int_amdgcn_cvt_scalef32_pk8_fp4_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_fp4_f16">;
681681
def int_amdgcn_cvt_scalef32_pk8_fp4_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_fp4_bf16">;
// pk16 fp6/bf6 conversions: a v16f32 / v16f16 / v16bf16 source and a v3i32
// result, using the same AMDGPUCvtScaleF32Intrinsic class as the pk8
// definitions above. The string argument repeats the intrinsic's name suffix.
682+
def int_amdgcn_cvt_scalef32_pk16_fp6_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_pk16_fp6_f32">;
683+
def int_amdgcn_cvt_scalef32_pk16_bf6_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_pk16_bf6_f32">;
684+
def int_amdgcn_cvt_scalef32_pk16_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_pk16_fp6_f16">;
685+
def int_amdgcn_cvt_scalef32_pk16_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_pk16_bf6_f16">;
686+
def int_amdgcn_cvt_scalef32_pk16_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_pk16_fp6_bf16">;
687+
def int_amdgcn_cvt_scalef32_pk16_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_pk16_bf6_bf16">;
682688

683689
def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">;
684690
def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4618,6 +4618,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
46184618
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
46194619
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
46204620
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
4621+
case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32:
4622+
case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32:
4623+
case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16:
4624+
case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16:
4625+
case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16:
4626+
case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16:
46214627
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16:
46224628
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16:
46234629
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16:

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2937,6 +2937,9 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>;
29372937
def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>;
29382938
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
29392939
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
// Profiles used by the pk16 scalef32 conversions. Each list is
// [v3i32, v16<f16|bf16|f32>, f32, untyped], mirroring the V6I32_V32* profiles
// just above.
// NOTE(review): presumably the entries are the result type followed by the
// source operand types, with `untyped` marking an unused slot -- confirm
// against the VOPProfile class.
2940+
def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>;
2941+
def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>;
2942+
def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>;
29402943
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
29412944
def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>;
29422945
def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1802,6 +1802,12 @@ let SubtargetPredicate = isGFX1250Plus in {
18021802
defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>;
18031803
defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>;
18041804
} // End WaveSizePredicate = isWave32
// pk16 fp6/bf6 scalef32 conversions, each bound to its matching
// int_amdgcn_cvt_scalef32_pk16_* intrinsic. These definitions sit after the
// closing brace of the preceding isWave32 block and before the next one
// opens, so unlike the neighbouring pk8 definitions they are not wrapped in a
// WaveSizePredicate in this region.
1805+
defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f32>;
1806+
defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f32>;
1807+
defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f16>;
1808+
defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f16>;
1809+
defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_bf16>;
1810+
defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_bf16>;
18051811

18061812
let WaveSizePredicate = isWave32 in {
18071813
defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>;
@@ -2260,6 +2266,12 @@ defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>;
22602266
defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>;
22612267
defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>;
22622268
defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>;
// GFX1250 real definitions for the V_CVT_SCALEF32_PK16_* defms. The values
// 0x2cd-0x2d2 continue directly from V_CVT_SCALE_PK16_F32_BF6 (0x2cc) above.
// NOTE(review): the template argument is presumably the VOP3 opcode --
// confirm against VOP3Only_Real_Base_gfx1250.
2269+
defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2cd>;
2270+
defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2ce>;
2271+
defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
2272+
defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
2273+
defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
2274+
defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
22632275
defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
22642276
defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
22652277
defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;

0 commit comments

Comments
 (0)