Skip to content

Commit b8eb61a

Browse files
authored
[AMDGPU] Implement addrspacecast from flat <-> private on gfx1250 (#152218)
1 parent 4c2d563 commit b8eb61a

File tree

7 files changed

+1374
-658
lines changed

7 files changed

+1374
-658
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 77 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2271,6 +2271,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
22712271
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
22722272
? AMDGPU::SRC_SHARED_BASE
22732273
: AMDGPU::SRC_PRIVATE_BASE;
2274+
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2275+
!ST.hasGloballyAddressableScratch()) &&
2276+
"Cannot use src_private_base with globally addressable scratch!");
22742277
// FIXME: It would be more natural to emit a COPY here, but then copy
22752278
// coalescing would kick in and it would think it's okay to use the "HI"
22762279
// subregister (instead of extracting the HI 32 bits) which is an artificial
@@ -2396,11 +2399,30 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
23962399
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
23972400
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
23982401
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2402+
auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2403+
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2404+
ST.hasGloballyAddressableScratch()) {
2405+
// flat -> private with globally addressable scratch: subtract
2406+
// src_flat_scratch_base_lo.
2407+
const LLT S32 = LLT::scalar(32);
2408+
Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2409+
Register FlatScratchBaseLo =
2410+
B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2411+
{Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2412+
.getReg(0);
2413+
MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2414+
Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2415+
return B.buildIntToPtr(Dst, Sub).getReg(0);
2416+
}
2417+
2418+
// Extract low 32-bits of the pointer.
2419+
return B.buildExtract(Dst, Src, 0).getReg(0);
2420+
};
2421+
23992422
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
24002423
// G_ADDRSPACE_CAST we need to guess.
24012424
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2402-
// Extract low 32-bits of the pointer.
2403-
B.buildExtract(Dst, Src, 0);
2425+
castFlatToLocalOrPrivate(Dst);
24042426
MI.eraseFromParent();
24052427
return true;
24062428
}
@@ -2411,7 +2433,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
24112433
auto FlatNull = B.buildConstant(SrcTy, 0);
24122434

24132435
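// Convert to the 32-bit segment pointer: the low 32 bits of the flat pointer, minus src_flat_scratch_base_lo for private with globally addressable scratch.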
2414-
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2436+
auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
24152437

24162438
auto CmpRes =
24172439
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
@@ -2425,14 +2447,45 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
24252447
(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
24262448
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
24272449
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2428-
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2429-
if (!ApertureReg.isValid())
2430-
return false;
2431-
24322450
// Coerce the type of the low half of the result so we can use
24332451
// merge_values.
24342452
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
24352453

2454+
if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2455+
ST.hasGloballyAddressableScratch()) {
2456+
// For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2457+
// For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2458+
Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2459+
Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2460+
ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2461+
.addUse(AllOnes)
2462+
.addUse(ThreadID)
2463+
.getReg(0);
2464+
if (ST.isWave64()) {
2465+
ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2466+
.addUse(AllOnes)
2467+
.addUse(ThreadID)
2468+
.getReg(0);
2469+
}
2470+
Register ShAmt =
2471+
B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2472+
Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2473+
Register CvtPtr =
2474+
B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2475+
// Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2476+
// 64-bit hi:lo value.
2477+
Register FlatScratchBase =
2478+
B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2479+
{Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2480+
.getReg(0);
2481+
MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2482+
return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2483+
}
2484+
2485+
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2486+
if (!ApertureReg.isValid())
2487+
return false;
2488+
24362489
// TODO: Should we allow mismatched types but matching sizes in merges to
24372490
// avoid the ptrtoint?
24382491
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -5788,11 +5841,25 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
57885841
MachineRegisterInfo &MRI,
57895842
MachineIRBuilder &B,
57905843
unsigned AddrSpace) const {
5791-
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5792-
auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5844+
const LLT S32 = LLT::scalar(32);
5845+
auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
57935846
Register Hi32 = Unmerge.getReg(1);
57945847

5795-
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5848+
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
5849+
ST.hasGloballyAddressableScratch()) {
5850+
Register FlatScratchBaseHi =
5851+
B.buildInstr(AMDGPU::S_MOV_B32, {S32},
5852+
{Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
5853+
.getReg(0);
5854+
MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
5855+
// Test bits 63..58 against the aperture address.
5856+
Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
5857+
B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
5858+
B.buildConstant(S32, 1u << 26));
5859+
} else {
5860+
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5861+
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5862+
}
57965863
MI.eraseFromParent();
57975864
return true;
57985865
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 78 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
20982098

20992099
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
21002100
unsigned DestAS) const {
2101-
// Flat -> private/local is a simple truncate.
2102-
// Flat -> global is no-op
2103-
if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2101+
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2102+
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2103+
Subtarget->hasGloballyAddressableScratch()) {
2104+
// Flat -> private requires subtracting src_flat_scratch_base_lo.
2105+
return false;
2106+
}
2107+
2108+
// Flat -> private/local is a simple truncate.
2109+
// Flat -> global is no-op
21042110
return true;
2111+
}
21052112

21062113
const GCNTargetMachine &TM =
21072114
static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
76507657
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
76517658
? AMDGPU::SRC_SHARED_BASE
76527659
: AMDGPU::SRC_PRIVATE_BASE;
7660+
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7661+
!Subtarget->hasGloballyAddressableScratch()) &&
7662+
"Cannot use src_private_base with globally addressable scratch!");
76537663
// Note: this feature (register) is broken. When used as a 32-bit operand,
76547664
// it returns a wrong value (all zeroes?). The real value is in the upper 32
76557665
// bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
77607770
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
77617771
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
77627772

7773+
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
7774+
Subtarget->hasGloballyAddressableScratch()) {
7775+
// flat -> private with globally addressable scratch: subtract
7776+
// src_flat_scratch_base_lo.
7777+
SDValue FlatScratchBaseLo(
7778+
DAG.getMachineNode(
7779+
AMDGPU::S_MOV_B32, SL, MVT::i32,
7780+
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
7781+
0);
7782+
Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
7783+
}
7784+
77637785
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
77647786
return Ptr;
77657787

@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
77767798
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
77777799
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
77787800
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7779-
7780-
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7781-
SDValue CvtPtr =
7782-
DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7783-
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7801+
SDValue CvtPtr;
7802+
if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
7803+
Subtarget->hasGloballyAddressableScratch()) {
7804+
// For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
7805+
// For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
7806+
SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
7807+
SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
7808+
ThreadID = DAG.getNode(
7809+
ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7810+
DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
7811+
AllOnes, ThreadID);
7812+
if (Subtarget->isWave64())
7813+
ThreadID = DAG.getNode(
7814+
ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7815+
DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
7816+
AllOnes, ThreadID);
7817+
SDValue ShAmt = DAG.getShiftAmountConstant(
7818+
57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
7819+
SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
7820+
CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
7821+
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7822+
// Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
7823+
// 64-bit hi:lo value.
7824+
SDValue FlatScratchBase = {
7825+
DAG.getMachineNode(
7826+
AMDGPU::S_MOV_B64, SL, MVT::i64,
7827+
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
7828+
0};
7829+
CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
7830+
} else {
7831+
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7832+
CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7833+
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7834+
}
77847835

77857836
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
77867837
return CvtPtr;
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
94249475
case Intrinsic::amdgcn_is_shared:
94259476
case Intrinsic::amdgcn_is_private: {
94269477
SDLoc SL(Op);
9427-
unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9428-
? AMDGPUAS::LOCAL_ADDRESS
9429-
: AMDGPUAS::PRIVATE_ADDRESS;
9430-
SDValue Aperture = getSegmentAperture(AS, SL, DAG);
94319478
SDValue SrcVec =
94329479
DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9433-
94349480
SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
94359481
DAG.getConstant(1, SL, MVT::i32));
9482+
9483+
unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9484+
? AMDGPUAS::LOCAL_ADDRESS
9485+
: AMDGPUAS::PRIVATE_ADDRESS;
9486+
if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
9487+
Subtarget->hasGloballyAddressableScratch()) {
9488+
SDValue FlatScratchBaseHi(
9489+
DAG.getMachineNode(
9490+
AMDGPU::S_MOV_B32, DL, MVT::i32,
9491+
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9492+
0);
9493+
// Test bits 63..58 against the aperture address.
9494+
return DAG.getSetCC(
9495+
SL, MVT::i1,
9496+
DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
9497+
DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
9498+
}
9499+
9500+
SDValue Aperture = getSegmentAperture(AS, SL, DAG);
94369501
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
94379502
}
94389503
case Intrinsic::amdgcn_perm:

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -866,7 +866,8 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16],
866866

867867
def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
868868
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE,
869-
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
869+
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA,
870+
SRC_FLAT_SCRATCH_BASE)> {
870871
let CopyCost = 1;
871872
let AllocationPriority = 1;
872873
let HasSGPR = 1;

0 commit comments

Comments
 (0)