diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index bb3953ea1253d..0c4a485d60936 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -721,6 +721,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-in TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlane_bcast, "iiii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_permlane_up, "iiii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "iiii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "iiii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", "gfx1250-insts,wavefrontsize32") + // GFX1250 WMMA builtins TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 51ab970655b4a..2fd816cebd365 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -744,6 +744,132 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) { *out = __builtin_amdgcn_permlane16_swap(old, src, false, true); } +// CHECK-LABEL: @test_permlane_bcast( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_permlane_bcast(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_permlane_bcast(src0, src1, src2); +} + +// CHECK-LABEL: @test_permlane_down( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_permlane_down(src0, src1, src2); +} + +// CHECK-LABEL: @test_permlane_up( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_permlane_up(src0, src1, src2); +} + +// CHECK-LABEL: @test_permlane_xor( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_permlane_xor(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_permlane_xor(src0, src1, src2); +} + +// CHECK-LABEL: @test_permlane_idx_gen( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.permlane.idx.gen(i32 [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +// CHECK-NEXT: ret void +// +void test_permlane_idx_gen(global uint* out, uint src0, uint src1) { + *out = __builtin_amdgcn_permlane_idx_gen(src0, src1); +} + // CHECK-LABEL: @test_prefetch( // CHECK-NEXT: entry: // CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 7265a76294c4c..eabdf521bb6e8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3656,6 +3656,36 @@ def int_amdgcn_sat_pk4_i4_i8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_i4_i8">, def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">, DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; +// llvm.amdgcn.permlane.bcast +def int_amdgcn_permlane_bcast : ClangBuiltin<"__builtin_amdgcn_permlane_bcast">, + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +// llvm.amdgcn.permlane.up +def int_amdgcn_permlane_up : ClangBuiltin<"__builtin_amdgcn_permlane_up">, + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +// llvm.amdgcn.permlane.down +def int_amdgcn_permlane_down : ClangBuiltin<"__builtin_amdgcn_permlane_down">, + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +// llvm.amdgcn.permlane.xor +def int_amdgcn_permlane_xor : ClangBuiltin<"__builtin_amdgcn_permlane_xor">, + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +// llvm.amdgcn.permlane.idx.gen +def int_amdgcn_permlane_idx_gen : ClangBuiltin<"__builtin_amdgcn_permlane_idx_gen">, + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c8e45d47c3660..5aa0ebfcce0e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3204,6 +3204,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 5); return; } + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: + // Doing a waterfall loop over these wouldn't make any sense. + constrainOpWithReadfirstlane(B, MI, 3); + constrainOpWithReadfirstlane(B, MI, 4); + return; + case Intrinsic::amdgcn_permlane_idx_gen: { + constrainOpWithReadfirstlane(B, MI, 3); + return; + } case Intrinsic::amdgcn_sbfe: applyMappingBFE(B, OpdMapper, true); return; @@ -4902,6 +4914,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_permlane_idx_gen: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_permlane16_var: case Intrinsic::amdgcn_permlanex16_var: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index dfe0cbf18c476..10b8606816243 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -321,6 +321,11 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 94886b04202b9..96cb5ae79534c 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -152,7 +152,12 @@ static bool isPermlane(const MachineInstr &MI) { Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 || Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 || Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 || - Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64; + Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64; } static bool isLdsDma(const MachineInstr &MI) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 044a681bfed3d..3f61bbd1d6e85 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6304,10 +6304,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, }; if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || - Opc == AMDGPU::V_PERMLANEX16_B32_e64) { + Opc == AMDGPU::V_PERMLANEX16_B32_e64 || + Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 || + Opc == AMDGPU::V_PERMLANE_UP_B32_e64 || + Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 || + Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 || + Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) { // src1 and src2 must be scalar MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); - MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); const DebugLoc &DL = MI.getDebugLoc(); if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -6315,11 +6319,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, .add(Src1); Src1.ChangeToRegister(Reg, false); } - if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { - Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) - .add(Src2); - Src2.ChangeToRegister(Reg, false); + if (VOP3Idx[2] != -1) { + MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); + if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src2); + Src2.ChangeToRegister(Reg, false); + } } } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 1ffe39dc5cba5..19ce7f58f312c 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1053,6 +1053,14 @@ def VOP3_PERMLANE_VAR_Profile : VOP3_Profile : VOP3_Profile

{ + let Ins64 = !con((ins VRegSrc_32:$src0, SSrc_b32:$src1), + !if(P.HasSrc2, (ins SSrc_b32:$src2), (ins))); + let HasClamp = 0; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + def opsel_i1timm : SDNodeXFormgetTargetConstant( N->getZExtValue() ? SISrcMods::OP_SEL_0 : SISrcMods::NONE, @@ -1136,6 +1144,18 @@ class PermlaneVarPat; +class PermlaneNoDppPat3Src : GCNPat< + (permlane i32:$src0, i32:$src1, i32:$src2), + (inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2) +>; + +class PermlaneNoDppPat2Src : GCNPat< + (permlane i32:$src0, i32:$src1), + (inst VGPR_32:$src0, SCSrc_b32:$src1) +>; + class VOP3_BITOP3_Profile : VOP3_Profile { let HasClamp = 0; let HasOMod = 0; @@ -1522,6 +1542,20 @@ let SubtargetPredicate = isGFX12Plus in { } // End SubtargetPredicate = isGFX12Plus +let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in { + defm V_PERMLANE_BCAST_B32 : VOP3Inst<"v_permlane_bcast_b32", VOP3_PERMLANE_NOOPSEL_Profile>; + defm V_PERMLANE_UP_B32 : VOP3Inst<"v_permlane_up_b32", VOP3_PERMLANE_NOOPSEL_Profile>; + defm V_PERMLANE_DOWN_B32 : VOP3Inst<"v_permlane_down_b32", VOP3_PERMLANE_NOOPSEL_Profile>; + defm V_PERMLANE_XOR_B32 : VOP3Inst<"v_permlane_xor_b32", VOP3_PERMLANE_NOOPSEL_Profile>; + defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile>; + + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat2Src; +} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 + let HasClamp = 0, HasModifiers = 1 in { def BitOp3_B16_Profile : VOP3_BITOP3_Profile, VOP3_OPSEL>; def BitOp3_B16_t16_Profile : VOP3_Profile_True16; @@ -1973,6 +2007,11 @@ defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>; defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>; defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>; defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>; +defm V_PERMLANE_BCAST_B32 : VOP3Only_Real_Base_gfx12<0x270>; +defm V_PERMLANE_UP_B32 : VOP3Only_Real_Base_gfx12<0x271>; +defm V_PERMLANE_DOWN_B32 : VOP3Only_Real_Base_gfx12<0x272>; +defm V_PERMLANE_XOR_B32 : VOP3Only_Real_Base_gfx12<0x273>; +defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12<0x314>; //===----------------------------------------------------------------------===// // GFX11, GFX12 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 10c656ac027af..30b74cb4804d5 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -796,6 +796,41 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i ret void } +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) +define amdgpu_kernel void @v_permlane_bcast_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { + %result= call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) +define amdgpu_kernel void @v_permlane_up_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { + %result= call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) +define amdgpu_kernel void @v_permlane_down_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { + %result= call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) +define amdgpu_kernel void @v_permlane_xor_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { + %result= call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1) +define amdgpu_kernel void @v_permlane_idx_gen_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { + %result= call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1) + store i32 %result, ptr addrspace(1) %out + ret void +} + ; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.dead.i32() define amdgpu_cs_chain void @dead(ptr addrspace(1) %out) { %v = call i32 @llvm.amdgcn.dead.i32() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll new file mode 100644 index 0000000000000..4f7bbf8f3746f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll @@ -0,0 +1,416 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 1, i32 2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_movk_i32 s2, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s2, 0x66 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 100, i32 102) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_b32_vvv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v1, v1, s3, s2 +; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_b32_vvv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4 +; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %tidx, i32 %tidy) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s4 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 1, i32 2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_movk_i32 s2, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s2, 0x66 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 100, i32 102) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-SDAG-LABEL: v_permlane_down_b32_vvv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v1, v1, s3, s2 +; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: v_permlane_down_b32_vvv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v0, v0, s3, s4 +; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %tidx, i32 %tidy) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s4 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 1, i32 2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_movk_i32 s2, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s2, 0x66 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 100, i32 102) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-SDAG-LABEL: v_permlane_up_b32_vvv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v1, v1, s3, s2 +; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: v_permlane_up_b32_vvv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v0, v0, s3, s4 +; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %tidx, i32 %tidy) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s4 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 1, i32 2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_movk_i32 s2, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s2, 0x66 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 100, i32 102) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-SDAG-LABEL: v_permlane_xor_b32_vvv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v1, v1, s3, s2 +; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: v_permlane_xor_b32_vvv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v0, v0, s3, s4 +; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %tidx, i32 %tidy) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s3 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 1 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 1) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32 %src0) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 100) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane_idx_gen_b32_vv(ptr addrspace(1) %out) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %tidx, i32 %tidy) + store i32 %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index 1f40a3249ff94..887d4847c29a3 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -766,3 +766,123 @@ v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 // GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00] + +v_permlane_bcast_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_bcast_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_bcast_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_bcast_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_bcast_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_down_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_down_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_down_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_down_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_down_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_down_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_down_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_down_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_up_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_up_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_up_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_up_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_up_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_up_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_up_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_up_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_up_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_xor_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_xor_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_xor_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_xor_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_xor_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_idx_gen_b32 v5, v1, s2 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, s105 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, ttmp15 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, vcc_hi +// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, vcc_lo +// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, m0 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, exec_hi +// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, exec_lo +// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index 03f642d8ef33b..c1d23beba3f7f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -766,3 +766,123 @@ v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 // GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00] + +v_permlane_bcast_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_bcast_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_bcast_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_bcast_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_bcast_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_down_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_down_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_down_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_down_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_down_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_down_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_down_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_down_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_up_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_up_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_up_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_up_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_up_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_up_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_up_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_up_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_up_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_xor_b32 v5, v1, s2, s3 +// GFX1250: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00] + +v_permlane_xor_b32 v5, v1, s105, s105 +// GFX1250: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01] + +v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 +// GFX1250: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01] + +v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo +// GFX1250: v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01] + +v_permlane_xor_b32 v5, v1, vcc_lo, m0 +// GFX1250: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01] + +v_permlane_xor_b32 v5, v1, m0, vcc_hi +// GFX1250: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01] + +v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo +// GFX1250: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01] + +v_permlane_xor_b32 v5, v1, exec_lo, src_scc +// GFX1250: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03] + +v_permlane_idx_gen_b32 v5, v1, s2 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, s105 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, ttmp15 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, vcc_hi +// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, vcc_lo +// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, m0 +// GFX1250: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, exec_hi +// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00] + +v_permlane_idx_gen_b32 v5, v1, exec_lo +// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index ce8cfcbc1e987..e4968fef71935 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -820,3 +820,120 @@ 0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00 # GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00] + +0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01 +# GFX1250: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01] + +0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03 +# GFX1250: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03] + +0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01 +# GFX1250: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01] + +0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01 +# GFX1250: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01] + +0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00 +# GFX1250: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00] + +0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01 +# GFX1250: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01] + +0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01 +# GFX1250: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01] + +0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01 +# GFX1250: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01] + +0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01 +# GFX1250: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01] + +0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03 +# GFX1250: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03] + +0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01 +# GFX1250: v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01] + +0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01 +# GFX1250: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01] + +0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00 +# GFX1250: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00] + +0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01 +# GFX1250: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01] + +0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01 +# GFX1250: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01] + +0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01 +# GFX1250: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01] + +0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03 +# GFX1250: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03] + +0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01 +# GFX1250: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01] + +0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01 +# GFX1250: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01] + +0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00 +# GFX1250: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00] + +0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01 +# GFX1250: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01] + +0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01 +# GFX1250: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01] + +0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01 +# GFX1250: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01] + +0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01 +# GFX1250: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01] + +0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03 +# GFX1250: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03] + +0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01 +# GFX1250: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01] + +0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01 +# GFX1250: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01] + +0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00 +# GFX1250: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00] + +0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01 +# GFX1250: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01] + +0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01 +# GFX1250: v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01] + +0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01 +# GFX1250: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01] + +0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00] + +0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00] + +0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00] + +0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00] + +0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00] + +0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00] + +0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00] + +0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00 +# GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00]