@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
 
 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
-  // Flat -> private/local is a simple truncate.
-  // Flat -> global is no-op
-  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+    if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->hasGloballyAddressableScratch()) {
+      // Flat -> private requires subtracting src_flat_scratch_base_lo.
+      return false;
+    }
+
+    // Flat -> private/local is a simple truncate.
+    // Flat -> global is no-op
     return true;
+  }
 
   const GCNTargetMachine &TM =
       static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
   const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                      ? AMDGPU::SRC_SHARED_BASE
                                      : AMDGPU::SRC_PRIVATE_BASE;
+  assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+          !Subtarget->hasGloballyAddressableScratch()) &&
+         "Cannot use src_private_base with globally addressable scratch!");
   // Note: this feature (register) is broken. When used as a 32-bit operand,
   // it returns a wrong value (all zeroes?). The real value is in the upper 32
   // bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
 
+      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+          Subtarget->hasGloballyAddressableScratch()) {
+        // flat -> private with globally addressable scratch: subtract
+        // src_flat_scratch_base_lo.
+        SDValue FlatScratchBaseLo(
+            DAG.getMachineNode(
+                AMDGPU::S_MOV_B32, SL, MVT::i32,
+                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
+            0);
+        Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+      }
+
       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
         return Ptr;
 
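
For reference, here is a minimal scalar sketch of the flat -> private conversion that the block above selects when globally addressable scratch is enabled: truncate the 64-bit flat pointer to 32 bits, then subtract src_flat_scratch_base_lo. This is an illustration only, not code from the patch, and the helper name is made up.

```cpp
#include <cstdint>

// Illustrative model only: flat -> private with globally addressable scratch,
// mirroring the TRUNCATE + SUB nodes built in the hunk above.
uint32_t flatToPrivate(uint64_t FlatAddr, uint32_t FlatScratchBaseLo) {
  return static_cast<uint32_t>(FlatAddr) - FlatScratchBaseLo;
}
```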
@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
-
-      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
-      SDValue CvtPtr =
-          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
-      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+      SDValue CvtPtr;
+      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+          Subtarget->hasGloballyAddressableScratch()) {
+        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+        SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
+        SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
+        ThreadID = DAG.getNode(
+            ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+            DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
+            AllOnes, ThreadID);
+        if (Subtarget->isWave64())
+          ThreadID = DAG.getNode(
+              ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+              DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
+              AllOnes, ThreadID);
+        SDValue ShAmt = DAG.getShiftAmountConstant(
+            57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
+        SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
+        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
+        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+        // 64-bit hi:lo value.
+        SDValue FlatScratchBase = {
+            DAG.getMachineNode(
+                AMDGPU::S_MOV_B64, SL, MVT::i64,
+                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
+            0};
+        CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
+      } else {
+        SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
+        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+      }
 
       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
         return CvtPtr;
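
As a worked illustration of the address formula in the comments above, the private -> flat path packs the lane id into the high dword, shifted by 57 - 32 - log2(wavesize), bitcasts the pair to i64, and adds the 64-bit FLAT_SCRATCH_BASE. The sketch below is illustrative only (the function and parameter names are not from the patch):

```cpp
#include <cstdint>

// Illustrative model only: private -> flat with globally addressable scratch.
// Wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
// Wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
uint64_t privateToFlat(uint32_t PrivateAddr, uint32_t ThreadID,
                       uint64_t FlatScratchBase, bool IsWave64) {
  unsigned WavefrontSizeLog2 = IsWave64 ? 6 : 5;
  // The shift is applied to the high dword, so bit 52 (or 51) of the full
  // 64-bit address corresponds to bit 20 (or 19) of SrcHi.
  uint32_t SrcHi = ThreadID << (57 - 32 - WavefrontSizeLog2);
  uint64_t CvtPtr = (static_cast<uint64_t>(SrcHi) << 32) | PrivateAddr;
  return CvtPtr + FlatScratchBase;
}
```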
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private: {
     SDLoc SL(Op);
-    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
-                      ? AMDGPUAS::LOCAL_ADDRESS
-                      : AMDGPUAS::PRIVATE_ADDRESS;
-    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
     SDValue SrcVec =
         DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
-
     SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
                                 DAG.getConstant(1, SL, MVT::i32));
+
+    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
+                      ? AMDGPUAS::LOCAL_ADDRESS
+                      : AMDGPUAS::PRIVATE_ADDRESS;
+    if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->hasGloballyAddressableScratch()) {
+      SDValue FlatScratchBaseHi(
+          DAG.getMachineNode(
+              AMDGPU::S_MOV_B32, DL, MVT::i32,
+              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
+          0);
+      // Test bits 63..58 against the aperture address.
+      return DAG.getSetCC(
+          SL, MVT::i1,
+          DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
+          DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
+    }
+
+    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
     return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
   }
   case Intrinsic::amdgcn_perm:
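
For the is_private check under globally addressable scratch, the following sketch (illustrative only, not code from the patch) spells out the "test bits 63..58" comparison: XOR-ing the pointer's high dword with src_flat_scratch_base_hi leaves a value below 1 << 26 exactly when the top six address bits match.

```cpp
#include <cstdint>

// Illustrative model only: the SETULT predicate built above for
// amdgcn_is_private when globally addressable scratch is enabled.
bool isPrivate(uint64_t Addr, uint32_t FlatScratchBaseHi) {
  uint32_t SrcHi = static_cast<uint32_t>(Addr >> 32);
  // Bits 31..26 of SrcHi are address bits 63..58.
  return (SrcHi ^ FlatScratchBaseHi) < (1u << 26);
}
```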