Skip to content

Commit 9ad7129

Browse files
committed
[AMDGPU] Add v_cvt_sr|pk_bf8|fp8_f16 gfx1250 instructions
1 parent b0f03f1 commit 9ad7129

28 files changed

+1710
-4
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,10 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
702702
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
703703
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
704704
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
705+
// f16 -> fp8/bf8 conversion builtins, all listed under the "gfx1250-insts"
// feature string. The cvt_pk_* forms take a half2 and return a short; the
// cvt_sr_* forms take (half src, seed, old, byte_sel) and return an int, with
// the final byte_sel argument required to be a constant.
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts")
706+
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts")
707+
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts")
708+
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts")
705709
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
706710
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
707711

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,144 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
398398
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
399399
}
400400

401+
// CHECK-LABEL: @test_cvt_pk_bf8_f16(
402+
// CHECK-NEXT: entry:
403+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
404+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
405+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
406+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
407+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
408+
// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
409+
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
410+
// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> [[TMP0]])
411+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
412+
// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
413+
// CHECK-NEXT: ret void
414+
//
415+
// Stores the result of __builtin_amdgcn_cvt_pk_bf8_f16(a) through out. The
// expected IR above is a single call to @llvm.amdgcn.cvt.pk.bf8.f16 taking
// <2 x half> and producing an i16 that is stored with align 2.
void test_cvt_pk_bf8_f16(global short* out, half2 a)
416+
{
417+
*out = __builtin_amdgcn_cvt_pk_bf8_f16(a);
418+
}
419+
420+
// CHECK-LABEL: @test_cvt_pk_fp8_f16(
421+
// CHECK-NEXT: entry:
422+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
423+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
424+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
425+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
426+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
427+
// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
428+
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
429+
// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> [[TMP0]])
430+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
431+
// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
432+
// CHECK-NEXT: ret void
433+
//
434+
// Stores the result of __builtin_amdgcn_cvt_pk_fp8_f16(a) through out. The
// expected IR above is a single call to @llvm.amdgcn.cvt.pk.fp8.f16 taking
// <2 x half> and producing an i16 that is stored with align 2.
void test_cvt_pk_fp8_f16(global short* out, half2 a)
435+
{
436+
*out = __builtin_amdgcn_cvt_pk_fp8_f16(a);
437+
}
438+
439+
// CHECK-LABEL: @test_cvt_sr_bf8_f16(
440+
// CHECK-NEXT: entry:
441+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
442+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
443+
// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
444+
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
445+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
446+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
447+
// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
448+
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
449+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
450+
// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
451+
// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
452+
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
453+
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
454+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
455+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
456+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0)
457+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
458+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
459+
// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
460+
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
461+
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
462+
// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i32 1)
463+
// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
464+
// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
465+
// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
466+
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
467+
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
468+
// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i32 2)
469+
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
470+
// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
471+
// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
472+
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
473+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
474+
// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i32 3)
475+
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
476+
// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
477+
// CHECK-NEXT: ret void
478+
//
479+
// Invokes __builtin_amdgcn_cvt_sr_bf8_f16 once for each constant last argument
// 0, 1, 2 and 3, overwriting *out every time. Each call is expected to lower
// to @llvm.amdgcn.cvt.sr.bf8.f16 carrying the same constant as its final i32
// operand.
void test_cvt_sr_bf8_f16(global int* out, half a, uint sr, int old)
480+
{
481+
*out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 0);
482+
*out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 1);
483+
*out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 2);
484+
*out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 3);
485+
}
486+
487+
// CHECK-LABEL: @test_cvt_sr_fp8_f16(
488+
// CHECK-NEXT: entry:
489+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
490+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
491+
// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
492+
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
493+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
494+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
495+
// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
496+
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
497+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
498+
// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
499+
// CHECK-NEXT: store i16 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 2
500+
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
501+
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
502+
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
503+
// CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
504+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
505+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP0]], i32 [[CONV]], i32 [[TMP2]], i32 0)
506+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
507+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
508+
// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
509+
// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
510+
// CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP6]] to i32
511+
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
512+
// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP5]], i32 [[CONV1]], i32 [[TMP7]], i32 1)
513+
// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
514+
// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
515+
// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
516+
// CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
517+
// CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32
518+
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
519+
// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP10]], i32 [[CONV2]], i32 [[TMP12]], i32 2)
520+
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
521+
// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
522+
// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
523+
// CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
524+
// CHECK-NEXT: [[CONV3:%.*]] = sext i16 [[TMP16]] to i32
525+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
526+
// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP15]], i32 [[CONV3]], i32 [[TMP17]], i32 3)
527+
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
528+
// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
529+
// CHECK-NEXT: ret void
530+
//
531+
// Invokes __builtin_amdgcn_cvt_sr_fp8_f16 once for each constant last argument
// 0, 1, 2 and 3, overwriting *out every time. Each call is expected to lower
// to @llvm.amdgcn.cvt.sr.fp8.f16 carrying the same constant as its final i32
// operand.
// sr is declared `short` here (it is `uint` in test_cvt_sr_bf8_f16), which is
// why the expected IR sign-extends an i16 to i32 before every call.
// NOTE(review): confirm the narrower type is intentional.
void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
532+
{
533+
*out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 0);
534+
*out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 1);
535+
*out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 2);
536+
*out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3);
537+
}
538+
401539
// CHECK-LABEL: @test_sat_pk4_i4_i8(
402540
// CHECK-NEXT: entry:
403541
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,30 @@ def int_amdgcn_cvt_pk_f16_bf8 : DefaultAttrsIntrinsic<
605605
[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
606606
>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_bf8">;
607607

608+
// llvm.amdgcn.cvt.pk.fp8.f16 i16 vdst, <2 x half> src
// Accesses no memory and is speculatable; exposed to Clang as
// __builtin_amdgcn_cvt_pk_fp8_f16.
// NOTE(review): presumably vdst holds the two converted fp8 values packed
// into 16 bits - confirm against the ISA documentation.
def int_amdgcn_cvt_pk_fp8_f16
609+
: DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty],
610+
[IntrNoMem, IntrSpeculatable]>,
611+
ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f16">;
612+
613+
// llvm.amdgcn.cvt.pk.bf8.f16 i16 vdst, <2 x half> src
// Accesses no memory and is speculatable; exposed to Clang as
// __builtin_amdgcn_cvt_pk_bf8_f16.
// NOTE(review): presumably vdst holds the two converted bf8 values packed
// into 16 bits - confirm against the ISA documentation.
def int_amdgcn_cvt_pk_bf8_f16
614+
: DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty],
615+
[IntrNoMem, IntrSpeculatable]>,
616+
ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f16">;
617+
618+
// llvm.amdgcn.cvt.sr.fp8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3]
619+
// byte_sel selects byte to write in vdst.
// byte_sel (operand index 3) must be an immediate (ImmArg). The intrinsic
// accesses no memory, is speculatable, and is exposed to Clang as
// __builtin_amdgcn_cvt_sr_fp8_f16.
620+
def int_amdgcn_cvt_sr_fp8_f16 : DefaultAttrsIntrinsic<
621+
[llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
622+
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
623+
>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f16">;
624+
625+
// llvm.amdgcn.cvt.sr.bf8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3]
626+
// byte_sel selects byte to write in vdst.
// byte_sel (operand index 3) must be an immediate (ImmArg). The intrinsic
// accesses no memory, is speculatable, and is exposed to Clang as
// __builtin_amdgcn_cvt_sr_bf8_f16.
627+
def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic<
628+
[llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
629+
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
630+
>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">;
631+
608632
class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
609633
[DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
610634
>, ClangBuiltin<"__builtin_amdgcn_"#name>;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4577,6 +4577,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45774577
case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
45784578
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
45794579
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
4580+
case Intrinsic::amdgcn_cvt_pk_fp8_f16:
4581+
case Intrinsic::amdgcn_cvt_pk_bf8_f16:
4582+
case Intrinsic::amdgcn_cvt_sr_fp8_f16:
4583+
case Intrinsic::amdgcn_cvt_sr_bf8_f16:
45804584
case Intrinsic::amdgcn_sat_pk4_i4_i8:
45814585
case Intrinsic::amdgcn_sat_pk4_u4_u8:
45824586
case Intrinsic::amdgcn_fmed3:

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
689689

690690
bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); }
691691

692+
// Parser predicate for the "no inline" v2f16 source operand. It currently
// accepts exactly what isVSrc_v2f16() accepts; nothing extra is rejected at
// this point.
bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); }
693+
692694
bool isVISrcB32() const {
693695
return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32);
694696
}
@@ -2036,6 +2038,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
20362038
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
20372039
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
20382040
case AMDGPU::OPERAND_REG_IMM_V2FP16:
2041+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
20392042
case AMDGPU::OPERAND_KIMM16:
20402043
return &APFloat::IEEEhalf();
20412044
case AMDGPU::OPERAND_REG_IMM_BF16:
@@ -2405,6 +2408,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
24052408
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
24062409
case AMDGPU::OPERAND_REG_IMM_V2INT16:
24072410
case AMDGPU::OPERAND_REG_IMM_V2FP16:
2411+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
24082412
case AMDGPU::OPERAND_REG_IMM_V2FP32:
24092413
case AMDGPU::OPERAND_REG_IMM_V2INT32:
24102414
case AMDGPU::OPERAND_KIMM32:
@@ -2456,6 +2460,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
24562460
setImmKindConst();
24572461
return;
24582462
}
2463+
[[fallthrough]];
2464+
2465+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
24592466

24602467
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
24612468
setImmKindLiteral();
@@ -3761,6 +3768,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
37613768
OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16)
37623769
return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm());
37633770

3771+
if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16)
3772+
return false;
3773+
37643774
llvm_unreachable("invalid operand type");
37653775
}
37663776
default:
@@ -9421,7 +9431,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
94219431
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
94229432
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
94239433
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
9424-
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
9434+
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
9435+
Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 ||
9436+
Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 ||
9437+
Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 ||
9438+
Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 ||
9439+
Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 ||
9440+
Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 ||
9441+
Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 ||
9442+
Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 ||
9443+
Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 ||
9444+
Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 ||
9445+
Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 ||
9446+
Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) {
94259447
Inst.addOperand(Inst.getOperand(0));
94269448
}
94279449

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
540540
printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O))
541541
return;
542542
break;
543+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
544+
break;
543545
default:
544546
llvm_unreachable("bad operand type");
545547
}
@@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
770772
case AMDGPU::OPERAND_REG_IMM_V2INT16:
771773
case AMDGPU::OPERAND_REG_IMM_V2BF16:
772774
case AMDGPU::OPERAND_REG_IMM_V2FP16:
775+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
773776
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
774777
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
775778
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
341341
return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm))
342342
.value_or(255);
343343

344+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
345+
return 255;
346+
344347
case AMDGPU::OPERAND_KIMM32:
345348
case AMDGPU::OPERAND_KIMM16:
346349
case AMDGPU::OPERAND_KIMM64:

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ enum OperandType : unsigned {
208208
OPERAND_REG_IMM_V2BF16,
209209
OPERAND_REG_IMM_V2FP16,
210210
OPERAND_REG_IMM_V2INT16,
211+
OPERAND_REG_IMM_NOINLINE_V2FP16,
211212
OPERAND_REG_IMM_V2INT32,
212213
OPERAND_REG_IMM_V2FP32,
213214

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
468468
case AMDGPU::OPERAND_REG_IMM_V2FP16:
469469
case AMDGPU::OPERAND_REG_IMM_V2BF16:
470470
case AMDGPU::OPERAND_REG_IMM_V2INT16:
471+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
471472
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
472473
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
473474
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4438,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
44384438
case AMDGPU::OPERAND_REG_IMM_V2BF16:
44394439
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
44404440
return AMDGPU::isInlinableLiteralV2BF16(Imm);
4441+
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
4442+
return false;
44414443
case AMDGPU::OPERAND_REG_IMM_FP16:
44424444
case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
44434445
if (isInt<16>(Imm) || isUInt<16>(Imm)) {

0 commit comments

Comments
 (0)