diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a83caa0db8a69..d33765db9cc7d 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -178,6 +178,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyBitOp3, ImmTyMatrixAFMT, ImmTyMatrixBFMT, + ImmTyMatrixAScale, + ImmTyMatrixBScale, + ImmTyMatrixAScaleFmt, + ImmTyMatrixBScaleFmt, ImmTyMatrixAReuse, ImmTyMatrixBReuse, ImmTyScaleSel, @@ -428,6 +432,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); } bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); } bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); } + bool isMatrixAScale() const { return isImmTy(ImmTyMatrixAScale); } + bool isMatrixBScale() const { return isImmTy(ImmTyMatrixBScale); } + bool isMatrixAScaleFmt() const { return isImmTy(ImmTyMatrixAScaleFmt); } + bool isMatrixBScaleFmt() const { return isImmTy(ImmTyMatrixBScaleFmt); } bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); } bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); } bool isTFE() const { return isImmTy(ImmTyTFE); } @@ -1183,6 +1191,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyBitOp3: OS << "BitOp3"; break; case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break; case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break; + case ImmTyMatrixAScale: OS << "ImmTyMatrixAScale"; break; + case ImmTyMatrixBScale: OS << "ImmTyMatrixBScale"; break; + case ImmTyMatrixAScaleFmt: OS << "ImmTyMatrixAScaleFmt"; break; + case ImmTyMatrixBScaleFmt: OS << "ImmTyMatrixBScaleFmt"; break; case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; case ImmTyScaleSel: OS << "ScaleSel" ; break; @@ -1728,6 +1740,14 @@ class AMDGPUAsmParser : public MCTargetAsmParser { 
AMDGPUOperand::ImmTy Type); ParseStatus parseMatrixAFMT(OperandVector &Operands); ParseStatus parseMatrixBFMT(OperandVector &Operands); + ParseStatus tryParseMatrixScale(OperandVector &Operands, StringRef Name, + AMDGPUOperand::ImmTy Type); + ParseStatus parseMatrixAScale(OperandVector &Operands); + ParseStatus parseMatrixBScale(OperandVector &Operands); + ParseStatus tryParseMatrixScaleFmt(OperandVector &Operands, StringRef Name, + AMDGPUOperand::ImmTy Type); + ParseStatus parseMatrixAScaleFmt(OperandVector &Operands); + ParseStatus parseMatrixBScaleFmt(OperandVector &Operands); ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); @@ -7356,6 +7376,42 @@ ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) { AMDGPUOperand::ImmTyMatrixBFMT); } +ParseStatus AMDGPUAsmParser::tryParseMatrixScale(OperandVector &Operands, + StringRef Name, + AMDGPUOperand::ImmTy Type) { + return parseStringOrIntWithPrefix( + Operands, Name, {"MATRIX_SCALE_ROW0", "MATRIX_SCALE_ROW1"}, Type); +} + +ParseStatus AMDGPUAsmParser::parseMatrixAScale(OperandVector &Operands) { + return tryParseMatrixScale(Operands, "matrix_a_scale", + AMDGPUOperand::ImmTyMatrixAScale); +} + +ParseStatus AMDGPUAsmParser::parseMatrixBScale(OperandVector &Operands) { + return tryParseMatrixScale(Operands, "matrix_b_scale", + AMDGPUOperand::ImmTyMatrixBScale); +} + +ParseStatus AMDGPUAsmParser::tryParseMatrixScaleFmt(OperandVector &Operands, + StringRef Name, + AMDGPUOperand::ImmTy Type) { + return parseStringOrIntWithPrefix( + Operands, Name, + {"MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"}, + Type); +} + +ParseStatus AMDGPUAsmParser::parseMatrixAScaleFmt(OperandVector &Operands) { + return tryParseMatrixScaleFmt(Operands, "matrix_a_scale_fmt", + AMDGPUOperand::ImmTyMatrixAScaleFmt); +} + +ParseStatus AMDGPUAsmParser::parseMatrixBScaleFmt(OperandVector &Operands) { + return tryParseMatrixScaleFmt(Operands, "matrix_b_scale_fmt", + 
AMDGPUOperand::ImmTyMatrixBScaleFmt); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -9489,6 +9545,34 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, AMDGPUOperand::ImmTyMatrixBFMT, 0); } + int MatrixAScaleIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale); + if (MatrixAScaleIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAScale, 0); + } + + int MatrixBScaleIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale); + if (MatrixBScaleIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBScale, 0); + } + + int MatrixAScaleFmtIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale_fmt); + if (MatrixAScaleFmtIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAScaleFmt, 0); + } + + int MatrixBScaleFmtIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale_fmt); + if (MatrixBScaleFmtIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBScaleFmt, 0); + } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse)) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyMatrixAReuse, 0); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 42c4d8b8a9717..ee8683a549a80 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1393,6 +1393,75 @@ void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo, printMatrixFMT(MI, OpNo, STI, O, 'b'); } +void AMDGPUInstPrinter::printMatrixScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, char AorB) 
{ + auto Imm = MI->getOperand(OpNo).getImm() & 1; + if (Imm == 0) + return; + + O << " matrix_" << AorB << "_scale:"; + switch (Imm) { + default: + O << Imm; + break; + case WMMA::MatrixScale::MATRIX_SCALE_ROW0: + O << "MATRIX_SCALE_ROW0"; + break; + case WMMA::MatrixScale::MATRIX_SCALE_ROW1: + O << "MATRIX_SCALE_ROW1"; + break; + } +} + +void AMDGPUInstPrinter::printMatrixAScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScale(MI, OpNo, STI, O, 'a'); +} + +void AMDGPUInstPrinter::printMatrixBScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScale(MI, OpNo, STI, O, 'b'); +} + +void AMDGPUInstPrinter::printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, char AorB) { + auto Imm = MI->getOperand(OpNo).getImm() & 3; + if (Imm == 0) + return; + + O << " matrix_" << AorB << "_scale_fmt:"; + switch (Imm) { + default: + O << Imm; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E8: + O << "MATRIX_SCALE_FMT_E8"; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E5M3: + O << "MATRIX_SCALE_FMT_E5M3"; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E4M3: + O << "MATRIX_SCALE_FMT_E4M3"; + break; + } +} + +void AMDGPUInstPrinter::printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScaleFmt(MI, OpNo, STI, O, 'a'); +} + +void AMDGPUInstPrinter::printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScaleFmt(MI, OpNo, STI, O, 'b'); +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index f6739b14926e1..be32061c64537 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -140,6 +140,19 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printMatrixBFMT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, char AorB); + void printMatrixAScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixBScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, + char AorB); + void printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index c56414519a6fe..deadb7aed0f69 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -1018,6 +1018,17 @@ enum MatrixFMT : unsigned { MATRIX_FMT_BF6 = 3, MATRIX_FMT_FP4 = 4 }; + +enum MatrixScale : unsigned { + MATRIX_SCALE_ROW0 = 0, + MATRIX_SCALE_ROW1 = 1, +}; + +enum MatrixScaleFmt : unsigned { + MATRIX_SCALE_FMT_E8 = 0, + MATRIX_SCALE_FMT_E5M3 = 1, + MATRIX_SCALE_FMT_E4M3 = 2 +}; } // namespace WMMA namespace VOP3PEncoding { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 4698a5805ee0c..50914a5ef231f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1310,6 +1310,12 @@ def bitop3_0 : DefaultOperand; def MatrixAFMT : CustomOperand; def MatrixBFMT : CustomOperand; +def 
MatrixAScale : CustomOperand; +def MatrixBScale : CustomOperand; + +def MatrixAScaleFmt : CustomOperand; +def MatrixBScaleFmt : CustomOperand; + def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; @@ -2680,6 +2686,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit HasNeg = HasModifiers; field bit HasMatrixReuse = 0; field bit HasMatrixFMT = 0; + field bit HasMatrixScale = 0; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 95fcd4ac1c101..457c0eed4f047 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1407,9 +1407,9 @@ let WaveSizePredicate = isWave64 in { } class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, - bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0, - bit _IsF4 = 0> + bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0, + bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile> { bit IsIU = _IsIU; bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B @@ -1417,6 +1417,8 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, int IndexType = _IndexType; let HasMatrixFMT = _HasMatrixFMT; + let HasMatrixScale = _HasMatrixScale; + bit Scale16 = _Scale16; let HasMatrixReuse = _HasMatrixReuse; bit HasIModOp = _Has_ImodOp; @@ -1455,6 +1457,7 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, IsC_F16: "_f16", IsC_BF16: "_bf16", 1: "_b32"))); + ValueType ScaleTy = !if(Scale16, i64, i32); // For f16 and bf16 matrices A and B, each element can be modified by // fneg(neg_lo,neg_hi = 1). 
For f32 and f64, neg_lo[0:1] is allowed, but @@ -1516,6 +1519,13 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit)); dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt), (ins)); + dag MatrixScaleSrc = !if(HasMatrixScale, + !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1), + (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)), + (ins)); + dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, + MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt), + (ins)); dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), @@ -1529,7 +1539,7 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, (ins VRegSrc_64:$src2), (ins VRegSrc_32:$src2)), IndexKey)), - MatrixFMT, MatrixReuse, Clamp, Neg); + MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, Clamp, Neg); // asm @@ -1538,13 +1548,15 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 16) : "$index_key_16bit", !eq(IndexType, 32) : "$index_key_32bit"); string MatrxFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", ""); + string MatrixScaleSrcAsm = !if(HasMatrixScale, ", $scale_src0, $scale_src1", ""); + string MatrixScaleAsm = !if(HasMatrixScale, "$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt", ""); string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", ""); string ClampAsm = !if(HasClamp, "$clamp", ""); string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", !and(!not(NegLoAny), !not(NegHiAny)) : ""); - let AsmVOP3P = "$vdst, $src0, $src1, 
$src2"#IndexKeyAsm#MatrxFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm; + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixScaleSrcAsm#MatrxFMTAsm#MatrixScaleAsm#MatrixReuseAsm#NegAsm#ClampAsm; // isel patterns bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp)); @@ -1606,20 +1618,27 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins)); dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); + dag MatrixScaleInPat = !if(HasMatrixScale, (ins timm:$matrix_a_scale, timm:$matrix_a_scale_fmt, ScaleTy:$scale_src0, + timm:$matrix_b_scale, timm:$matrix_b_scale_fmt, ScaleTy:$scale_src1), + (ins)); dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins)); + dag MatrixScaleOutSrcPat = !if(HasMatrixScale, (ins ScaleTy:$scale_src0, ScaleTy:$scale_src1), (ins)); + dag MatrixScaleOutModPat = !if(HasMatrixScale, (ins i32:$matrix_a_scale, i32:$matrix_b_scale, i32:$matrix_a_scale_fmt, i32:$matrix_b_scale_fmt), (ins)); dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); - dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, MatrixReuseInPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, MatrixFMTOutPat, + MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); dag SwmmacOutPat = !con(Src0OutPat, 
Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. - dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixScaleInPat, MatrixReuseInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, + MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); } def WMMAInstInfoTable : GenericTable { @@ -1728,39 +1747,51 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes -def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>; -def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>; -def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; -def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; -def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 1>; -def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 
1, 1, 0, 1>; -def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>; -def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>; -def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>; -def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>; -def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>; -def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>; -def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>; -def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>; -def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>; - -multiclass WMMA_F8F6F4_Profiles { - def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; -} - -defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>; +def F32_F32_WMMA_w32 : 
VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 0, 0, 1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, 
v8i32], 1, 32, 1, 0, 1, 0, 0, 0, 1>; + +multiclass WMMA_F8F6F4_Profiles { + def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; +} + +defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>; + +class VOP_WMMA_LD_SCALE : VOP3P_Profile> { + let HasMatrixScale = 1; + let HasMatrixReuse = 1; + let HasNeg = 0; + let Src0RC64 = RC; + let Src1RC64 = RC; + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, + MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt, + MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse); + let AsmVOP3P = " $src0, $src1$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt$matrix_a_reuse$matrix_b_reuse"; +} multiclass WMMAInst_SrcFormats_mc { foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { @@ -1816,6 +1847,8 @@ defm 
V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16 } // End is_wmma_xdl = 1. +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE>; } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 @@ -2283,6 +2316,9 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple; defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple; defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple; +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; + let AssemblerPredicate = isGFX1250Plus in def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index f027ab05c546c..3cad5a1c2c377 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -475,17 +475,24 @@ class VOP3Pe_Base { bits<1> index_key_32bit; bits<3> matrix_a_fmt; bits<3> matrix_b_fmt; + bits<1> matrix_a_scale; + bits<1> matrix_b_scale; + bits<2> matrix_a_scale_fmt; + bits<2> matrix_b_scale_fmt; bits<1> matrix_a_reuse; bits<1> matrix_b_reuse; } class VOP3Pe : Enc64, VOP3Pe_Base { let Inst{7-0} = !if(P.HasDst, vdst, 0); - let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 - let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, + !if(P.HasMatrixScale, matrix_b_scale_fmt{0}, 0)); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, + !if(P.HasMatrixScale, matrix_b_scale_fmt{1}, 0)); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 - let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, + !if(P.HasMatrixScale, matrix_a_scale{0}, 
0)); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2) @@ -500,10 +507,17 @@ class VOP3Pe : Enc64, VOP3Pe_Base { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0) - let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1) - let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) - let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) + let Inst{59} = !cond(!and(P.HasSrc0, P.HasOpSel) : src0_modifiers{3}, + P.IsDOT : 1, + P.HasMatrixScale : matrix_b_scale{0}, + 1: ?); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, + !if(P.HasMatrixScale, 0, + !if(P.IsDOT, 1, ?))); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, + !if(P.HasMatrixScale, matrix_a_scale_fmt{0}, 0)); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, + !if(P.HasMatrixScale, matrix_a_scale_fmt{1}, 0)); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s index d8dfd1e349145..309c74ae7ff7a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s @@ -983,6 +983,176 @@ v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] // WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 // GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +v_wmma_ld_scale_paired_b32 v1, v2 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00] +// 
WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 s1, s2 +// GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 2, -4 +// GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse +// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse +// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse +// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse +// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 
matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] +// GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 2, -4 +// GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0 +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse +// 
GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse +// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse +// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 +// GFX1250: 
v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] 
matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04] // WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s index 421d96b5e9da6..1eae8f6ba451c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s @@ -384,6 +384,16 @@ v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:-1 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:xxx // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value +v_wmma_ld_scale_paired_b32 v1, 100 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT: {{^}}v_wmma_ld_scale_paired_b32 v1, 100 
+// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_ld_scale_paired_b32 100, v1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT: {{^}}v_wmma_ld_scale_paired_b32 100, v1 +// GFX1250-ERR-NEXT: {{^}} ^ + v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8 // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt index e20f020cf878e..755a2a33cdcc7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt @@ -586,6 +586,96 @@ 0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c # GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c] +0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00] + +0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; 
encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08] + +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08] + +0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00] + +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] + +0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28] + +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08] + +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48] + +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28] + +0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00] + +0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00] + +0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 
2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00] + +0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00] + +0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08 +# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08] + +0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08 +# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08] + +0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00] + +0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00] + +0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28 +# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28] + +0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08 +# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08] + +0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48 +# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 
matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48] + +0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28 +# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28] + +0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00] + +0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00 +# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00] + 0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b # GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b]