Skip to content

Commit 3bf2cbc

Browse files
committed
[AMDGPU] gfx1250 v_permlane_* instructions
1 parent 12eab1a commit 3bf2cbc

File tree

13 files changed

+1064
-8
lines changed

13 files changed

+1064
-8
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-in
721721
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
722722
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
723723

724+
TARGET_BUILTIN(__builtin_amdgcn_permlane_bcast, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
725+
TARGET_BUILTIN(__builtin_amdgcn_permlane_up, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
726+
TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
727+
TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
728+
TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", "gfx1250-insts,wavefrontsize32")
729+
724730
// GFX1250 WMMA builtins
725731
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
726732
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,132 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
744744
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
745745
}
746746

747+
// CHECK-LABEL: @test_permlane_bcast(
748+
// CHECK-NEXT: entry:
749+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
750+
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
751+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
752+
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
753+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
754+
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
755+
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
756+
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
757+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
758+
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
759+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
760+
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
761+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
762+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
763+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
764+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
765+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
766+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
767+
// CHECK-NEXT: ret void
768+
//
769+
void test_permlane_bcast(global uint* out, uint src0, uint src1, uint src2) {
770+
*out = __builtin_amdgcn_permlane_bcast(src0, src1, src2);
771+
}
772+
773+
// CHECK-LABEL: @test_permlane_down(
774+
// CHECK-NEXT: entry:
775+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
776+
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
777+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
778+
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
779+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
780+
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
781+
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
782+
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
783+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
784+
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
785+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
786+
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
787+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
788+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
789+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
790+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
791+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
792+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
793+
// CHECK-NEXT: ret void
794+
//
795+
void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) {
796+
*out = __builtin_amdgcn_permlane_down(src0, src1, src2);
797+
}
798+
799+
// CHECK-LABEL: @test_permlane_up(
800+
// CHECK-NEXT: entry:
801+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
802+
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
803+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
804+
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
805+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
806+
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
807+
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
808+
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
809+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
810+
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
811+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
812+
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
813+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
814+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
815+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
816+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
817+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
818+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
819+
// CHECK-NEXT: ret void
820+
//
821+
void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) {
822+
*out = __builtin_amdgcn_permlane_up(src0, src1, src2);
823+
}
824+
825+
// CHECK-LABEL: @test_permlane_xor(
826+
// CHECK-NEXT: entry:
827+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
828+
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
829+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
830+
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
831+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
832+
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
833+
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
834+
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
835+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
836+
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
837+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
838+
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
839+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
840+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
841+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
842+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
843+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
844+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
845+
// CHECK-NEXT: ret void
846+
//
847+
void test_permlane_xor(global uint* out, uint src0, uint src1, uint src2) {
848+
*out = __builtin_amdgcn_permlane_xor(src0, src1, src2);
849+
}
850+
851+
// CHECK-LABEL: @test_permlane_idx_gen(
852+
// CHECK-NEXT: entry:
853+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
854+
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
855+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
856+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
857+
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
858+
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
859+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
860+
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
861+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
862+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
863+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
864+
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.permlane.idx.gen(i32 [[TMP0]], i32 [[TMP1]])
865+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
866+
// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
867+
// CHECK-NEXT: ret void
868+
//
869+
void test_permlane_idx_gen(global uint* out, uint src0, uint src1) {
870+
*out = __builtin_amdgcn_permlane_idx_gen(src0, src1);
871+
}
872+
747873
// CHECK-LABEL: @test_prefetch(
748874
// CHECK-NEXT: entry:
749875
// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3656,6 +3656,36 @@ def int_amdgcn_sat_pk4_i4_i8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_i4_i8">,
36563656
def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">,
36573657
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
36583658

3659+
// llvm.amdgcn.permlane.bcast <src0> <src1> <src2>
3660+
def int_amdgcn_permlane_bcast : ClangBuiltin<"__builtin_amdgcn_permlane_bcast">,
3661+
Intrinsic<[llvm_i32_ty],
3662+
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
3663+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
3664+
3665+
// llvm.amdgcn.permlane.up <src0> <src1> <src2>
3666+
def int_amdgcn_permlane_up : ClangBuiltin<"__builtin_amdgcn_permlane_up">,
3667+
Intrinsic<[llvm_i32_ty],
3668+
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
3669+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
3670+
3671+
// llvm.amdgcn.permlane.down <src0> <src1> <src2>
3672+
def int_amdgcn_permlane_down : ClangBuiltin<"__builtin_amdgcn_permlane_down">,
3673+
Intrinsic<[llvm_i32_ty],
3674+
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
3675+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
3676+
3677+
// llvm.amdgcn.permlane.xor <src0> <src1> <src2>
3678+
def int_amdgcn_permlane_xor : ClangBuiltin<"__builtin_amdgcn_permlane_xor">,
3679+
Intrinsic<[llvm_i32_ty],
3680+
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
3681+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
3682+
3683+
// llvm.amdgcn.permlane.idx.gen <src0> <src1>
3684+
def int_amdgcn_permlane_idx_gen : ClangBuiltin<"__builtin_amdgcn_permlane_idx_gen">,
3685+
Intrinsic<[llvm_i32_ty],
3686+
[llvm_i32_ty, llvm_i32_ty],
3687+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
3688+
36593689
//===----------------------------------------------------------------------===//
36603690
// Special Intrinsics for backend internal use only. No frontend
36613691
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3204,6 +3204,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
32043204
constrainOpWithReadfirstlane(B, MI, 5);
32053205
return;
32063206
}
3207+
case Intrinsic::amdgcn_permlane_bcast:
3208+
case Intrinsic::amdgcn_permlane_up:
3209+
case Intrinsic::amdgcn_permlane_down:
3210+
case Intrinsic::amdgcn_permlane_xor:
3211+
// Doing a waterfall loop over these wouldn't make any sense.
3212+
constrainOpWithReadfirstlane(B, MI, 3);
3213+
constrainOpWithReadfirstlane(B, MI, 4);
3214+
return;
3215+
case Intrinsic::amdgcn_permlane_idx_gen: {
3216+
constrainOpWithReadfirstlane(B, MI, 3);
3217+
return;
3218+
}
32073219
case Intrinsic::amdgcn_sbfe:
32083220
applyMappingBFE(B, OpdMapper, true);
32093221
return;
@@ -4902,6 +4914,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49024914
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
49034915
break;
49044916
}
4917+
case Intrinsic::amdgcn_permlane_bcast:
4918+
case Intrinsic::amdgcn_permlane_up:
4919+
case Intrinsic::amdgcn_permlane_down:
4920+
case Intrinsic::amdgcn_permlane_xor: {
4921+
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4922+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4923+
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4924+
OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4925+
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4926+
break;
4927+
}
4928+
case Intrinsic::amdgcn_permlane_idx_gen: {
4929+
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4930+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4931+
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4932+
OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4933+
break;
4934+
}
49054935
case Intrinsic::amdgcn_permlane16_var:
49064936
case Intrinsic::amdgcn_permlanex16_var: {
49074937
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,11 @@ def : SourceOfDivergence<int_amdgcn_permlane16>;
321321
def : SourceOfDivergence<int_amdgcn_permlanex16>;
322322
def : SourceOfDivergence<int_amdgcn_permlane16_var>;
323323
def : SourceOfDivergence<int_amdgcn_permlanex16_var>;
324+
def : SourceOfDivergence<int_amdgcn_permlane_bcast>;
325+
def : SourceOfDivergence<int_amdgcn_permlane_up>;
326+
def : SourceOfDivergence<int_amdgcn_permlane_down>;
327+
def : SourceOfDivergence<int_amdgcn_permlane_xor>;
328+
def : SourceOfDivergence<int_amdgcn_permlane_idx_gen>;
324329
def : SourceOfDivergence<int_amdgcn_mov_dpp>;
325330
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
326331
def : SourceOfDivergence<int_amdgcn_update_dpp>;

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,12 @@ static bool isPermlane(const MachineInstr &MI) {
152152
Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153153
Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154154
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155-
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
155+
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156+
Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157+
Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158+
Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159+
Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160+
Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
156161
}
157162

158163
static bool isLdsDma(const MachineInstr &MI) {

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6304,22 +6304,29 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
63046304
};
63056305

63066306
if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6307-
Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6307+
Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6308+
Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6309+
Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6310+
Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6311+
Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6312+
Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
63086313
// src1 and src2 must be scalar
63096314
MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6310-
MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
63116315
const DebugLoc &DL = MI.getDebugLoc();
63126316
if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
63136317
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
63146318
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
63156319
.add(Src1);
63166320
Src1.ChangeToRegister(Reg, false);
63176321
}
6318-
if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6319-
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6320-
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6321-
.add(Src2);
6322-
Src2.ChangeToRegister(Reg, false);
6322+
if (VOP3Idx[2] != -1) {
6323+
MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6324+
if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6325+
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6326+
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6327+
.add(Src2);
6328+
Src2.ChangeToRegister(Reg, false);
6329+
}
63236330
}
63246331
}
63256332

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,14 @@ def VOP3_PERMLANE_VAR_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, untyped
10531053
let HasExtDPP = 0;
10541054
}
10551055

1056+
class VOP3_PERMLANE_NOOPSEL_Profile<VOPProfile P> : VOP3_Profile<P> {
1057+
let Ins64 = !con((ins VRegSrc_32:$src0, SSrc_b32:$src1),
1058+
!if(P.HasSrc2, (ins SSrc_b32:$src2), (ins)));
1059+
let HasClamp = 0;
1060+
let HasExtVOP3DPP = 0;
1061+
let HasExtDPP = 0;
1062+
}
1063+
10561064
def opsel_i1timm : SDNodeXForm<timm, [{
10571065
return CurDAG->getTargetConstant(
10581066
N->getZExtValue() ? SISrcMods::OP_SEL_0 : SISrcMods::NONE,
@@ -1136,6 +1144,18 @@ class PermlaneVarPat<SDPatternOperator permlane,
11361144
VGPR_32:$src1, VGPR_32:$vdst_in)
11371145
>;
11381146

1147+
class PermlaneNoDppPat3Src<SDPatternOperator permlane,
1148+
Instruction inst> : GCNPat<
1149+
(permlane i32:$src0, i32:$src1, i32:$src2),
1150+
(inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2)
1151+
>;
1152+
1153+
class PermlaneNoDppPat2Src<SDPatternOperator permlane,
1154+
Instruction inst> : GCNPat<
1155+
(permlane i32:$src0, i32:$src1),
1156+
(inst VGPR_32:$src0, SCSrc_b32:$src1)
1157+
>;
1158+
11391159
class VOP3_BITOP3_Profile<VOPProfile pfl, VOP3Features f> : VOP3_Profile<pfl, f> {
11401160
let HasClamp = 0;
11411161
let HasOMod = 0;
@@ -1522,6 +1542,20 @@ let SubtargetPredicate = isGFX12Plus in {
15221542

15231543
} // End SubtargetPredicate = isGFX12Plus
15241544

1545+
let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
1546+
defm V_PERMLANE_BCAST_B32 : VOP3Inst<"v_permlane_bcast_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
1547+
defm V_PERMLANE_UP_B32 : VOP3Inst<"v_permlane_up_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
1548+
defm V_PERMLANE_DOWN_B32 : VOP3Inst<"v_permlane_down_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
1549+
defm V_PERMLANE_XOR_B32 : VOP3Inst<"v_permlane_xor_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
1550+
defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32>>;
1551+
1552+
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_bcast, V_PERMLANE_BCAST_B32_e64>;
1553+
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_up, V_PERMLANE_UP_B32_e64>;
1554+
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_down, V_PERMLANE_DOWN_B32_e64>;
1555+
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_xor, V_PERMLANE_XOR_B32_e64>;
1556+
def : PermlaneNoDppPat2Src<int_amdgcn_permlane_idx_gen, V_PERMLANE_IDX_GEN_B32_e64>;
1557+
} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32
1558+
15251559
let HasClamp = 0, HasModifiers = 1 in {
15261560
def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>;
15271561
def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>;
@@ -1973,6 +2007,11 @@ defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>;
19732007
defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>;
19742008
defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>;
19752009
defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>;
2010+
defm V_PERMLANE_BCAST_B32 : VOP3Only_Real_Base_gfx12<0x270>;
2011+
defm V_PERMLANE_UP_B32 : VOP3Only_Real_Base_gfx12<0x271>;
2012+
defm V_PERMLANE_DOWN_B32 : VOP3Only_Real_Base_gfx12<0x272>;
2013+
defm V_PERMLANE_XOR_B32 : VOP3Only_Real_Base_gfx12<0x273>;
2014+
defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12<0x314>;
19762015

19772016
//===----------------------------------------------------------------------===//
19782017
// GFX11, GFX12

0 commit comments

Comments
 (0)