Skip to content

Commit 94d374a

Browse files
[LLVM][CGP] Allow finer control for sinking compares. (#151366)
Compare sinking is currently gated on the result of hasMultipleConditionRegisters. This function is too coarse-grained because it does not distinguish between scalar and vector compares. This PR extends the interface to take an EVT, allowing finer control. The new interface is used by AArch64 to disable sinking of scalable vector compares, with isProfitableToSinkOperands updated so the specifically tested cases are maintained.
1 parent 11eeb4d commit 94d374a

File tree

10 files changed

+84
-37
lines changed

10 files changed

+84
-37
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -518,10 +518,12 @@ class LLVM_ABI TargetLoweringBase {
518518
return true;
519519
}
520520

521-
/// Return true if multiple condition registers are available.
522-
bool hasMultipleConditionRegisters() const {
523-
return HasMultipleConditionRegisters;
524-
}
521+
/// Does the target have multiple (allocatable) condition registers that
522+
/// can be used to store the results of comparisons for use by selects
523+
/// and conditional branches. With multiple condition registers, the code
524+
/// generator will not aggressively sink comparisons into the blocks of their
525+
/// users.
526+
virtual bool hasMultipleConditionRegisters(EVT VT) const { return false; }
525527

526528
/// Return true if the target has BitExtract instructions.
527529
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2453,7 +2455,7 @@ class LLVM_ABI TargetLoweringBase {
24532455
EVT VT) const {
24542456
// If a target has multiple condition registers, then it likely has logical
24552457
// operations on those registers.
2456-
if (hasMultipleConditionRegisters())
2458+
if (hasMultipleConditionRegisters(VT))
24572459
return false;
24582460
// Only do the transform if the value won't be split into multiple
24592461
// registers.
@@ -2560,15 +2562,6 @@ class LLVM_ABI TargetLoweringBase {
25602562
StackPointerRegisterToSaveRestore = R;
25612563
}
25622564

2563-
/// Tells the code generator that the target has multiple (allocatable)
2564-
/// condition registers that can be used to store the results of comparisons
2565-
/// for use by selects and conditional branches. With multiple condition
2566-
/// registers, the code generator will not aggressively sink comparisons into
2567-
/// the blocks of their users.
2568-
void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
2569-
HasMultipleConditionRegisters = hasManyRegs;
2570-
}
2571-
25722565
/// Tells the code generator that the target has BitExtract instructions.
25732566
/// The code generator will aggressively sink "shift"s into the blocks of
25742567
/// their users if the users will generate "and" instructions which can be
@@ -3604,13 +3597,6 @@ class LLVM_ABI TargetLoweringBase {
36043597
private:
36053598
const TargetMachine &TM;
36063599

3607-
/// Tells the code generator that the target has multiple (allocatable)
3608-
/// condition registers that can be used to store the results of comparisons
3609-
/// for use by selects and conditional branches. With multiple condition
3610-
/// registers, the code generator will not aggressively sink comparisons into
3611-
/// the blocks of their users.
3612-
bool HasMultipleConditionRegisters;
3613-
36143600
/// Tells the code generator that the target has BitExtract instructions.
36153601
/// The code generator will aggressively sink "shift"s into the blocks of
36163602
/// their users if the users will generate "and" instructions which can be

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1834,7 +1834,7 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
18341834
///
18351835
/// Return true if any changes are made.
18361836
static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
1837-
if (TLI.hasMultipleConditionRegisters())
1837+
if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType())))
18381838
return false;
18391839

18401840
// Avoid sinking soft-FP comparisons, since this can move them into a loop.

llvm/lib/CodeGen/TargetLoweringBase.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
697697
MaxGluedStoresPerMemcpy = 0;
698698
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
699699
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
700-
HasMultipleConditionRegisters = false;
701700
HasExtractBitsInsn = false;
702701
JumpIsExpensive = JumpIsExpensiveOverride;
703702
PredictableSelectIsExpensive = false;

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,10 @@ class AArch64TargetLowering : public TargetLowering {
887887
bool shouldScalarizeBinop(SDValue VecOp) const override {
888888
return VecOp.getOpcode() == ISD::SETCC;
889889
}
890+
891+
bool hasMultipleConditionRegisters(EVT VT) const override {
892+
return VT.isScalableVector();
893+
}
890894
};
891895

892896
namespace AArch64 {

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6284,10 +6284,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
62846284
}
62856285
}
62866286

6287-
auto ShouldSinkCondition = [](Value *Cond) -> bool {
6287+
auto ShouldSinkCondition = [](Value *Cond,
6288+
SmallVectorImpl<Use *> &Ops) -> bool {
6289+
if (!isa<IntrinsicInst>(Cond))
6290+
return false;
62886291
auto *II = dyn_cast<IntrinsicInst>(Cond);
6289-
return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
6290-
isa<ScalableVectorType>(II->getOperand(0)->getType());
6292+
if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6293+
!isa<ScalableVectorType>(II->getOperand(0)->getType()))
6294+
return false;
6295+
if (isa<CmpInst>(II->getOperand(0)))
6296+
Ops.push_back(&II->getOperandUse(0));
6297+
return true;
62916298
};
62926299

62936300
switch (I->getOpcode()) {
@@ -6303,7 +6310,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
63036310
}
63046311
break;
63056312
case Instruction::Select: {
6306-
if (!ShouldSinkCondition(I->getOperand(0)))
6313+
if (!ShouldSinkCondition(I->getOperand(0), Ops))
63076314
return false;
63086315

63096316
Ops.push_back(&I->getOperandUse(0));
@@ -6313,7 +6320,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
63136320
if (cast<BranchInst>(I)->isUnconditional())
63146321
return false;
63156322

6316-
if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
6323+
if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
63176324
return false;
63186325

63196326
Ops.push_back(&I->getOperandUse(0));

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -601,14 +601,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
601601
setSchedulingPreference(Sched::RegPressure);
602602
setJumpIsExpensive(true);
603603

604-
// FIXME: This is only partially true. If we have to do vector compares, any
605-
// SGPR pair can be a condition register. If we have a uniform condition, we
606-
// are better off doing SALU operations, where there is only one SCC. For now,
607-
// we don't have a way of knowing during instruction selection if a condition
608-
// will be uniform and we always use vector compares. Assume we are using
609-
// vector compares until that is fixed.
610-
setHasMultipleConditionRegisters(true);
611-
612604
setMinCmpXchgSizeInBits(32);
613605
setSupportsUnalignedAtomics(false);
614606

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,16 @@ class AMDGPUTargetLowering : public TargetLowering {
388388
MVT getFenceOperandTy(const DataLayout &DL) const override {
389389
return MVT::i32;
390390
}
391+
392+
bool hasMultipleConditionRegisters(EVT VT) const override {
393+
// FIXME: This is only partially true. If we have to do vector compares, any
394+
// SGPR pair can be a condition register. If we have a uniform condition, we
395+
// are better off doing SALU operations, where there is only one SCC. For
396+
// now, we don't have a way of knowing during instruction selection if a
397+
// condition will be uniform and we always use vector compares. Assume we
398+
// are using vector compares until that is fixed.
399+
return true;
400+
}
391401
};
392402

393403
namespace AMDGPUISD {

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1433,7 +1433,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
14331433
// With 32 condition bits, we don't need to sink (and duplicate) compares
14341434
// aggressively in CodeGenPrep.
14351435
if (Subtarget.useCRBits()) {
1436-
setHasMultipleConditionRegisters();
14371436
setJumpIsExpensive();
14381437
}
14391438

@@ -19856,3 +19855,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
1985619855
return Builder.CreateOr(
1985719856
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
1985819857
}
19858+
19859+
bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
19860+
return Subtarget.useCRBits();
19861+
}

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,8 @@ namespace llvm {
12071207
bool IsVarArg) const;
12081208
bool supportsTailCallFor(const CallBase *CB) const;
12091209

1210+
bool hasMultipleConditionRegisters(EVT VT) const override;
1211+
12101212
private:
12111213
struct ReuseLoadInfo {
12121214
SDValue Ptr;
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S < %s | FileCheck %s
3+
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
define void @do_not_sink_scalable_vector_compare(ptr %a, ptr %b) #0 {
7+
; CHECK-LABEL: define void @do_not_sink_scalable_vector_compare(
8+
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
9+
; CHECK-NEXT: [[ENTRY:.*]]:
10+
; CHECK-NEXT: [[STEP_VECTOR:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
11+
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <vscale x 4 x i32> [[STEP_VECTOR]], splat (i32 16)
12+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
13+
; CHECK: [[VECTOR_BODY]]:
14+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
15+
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]]
16+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[SRC]], i32 4, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> poison)
17+
; CHECK-NEXT: [[DST:%.*]] = getelementptr inbounds ptr, ptr [[B]], i64 [[INDEX]]
18+
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_LOAD]], ptr [[DST]], i32 4, <vscale x 4 x i1> [[TMP0]])
19+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
20+
; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
21+
; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[VECTOR_END:.*]], label %[[VECTOR_BODY]]
22+
; CHECK: [[VECTOR_END]]:
23+
; CHECK-NEXT: ret void
24+
;
25+
entry:
26+
%step.vector = call <vscale x 4 x i32> @llvm.stepvector()
27+
%mask = icmp ult <vscale x 4 x i32> %step.vector, splat (i32 16)
28+
br label %vector.body
29+
30+
vector.body:
31+
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
32+
%src = getelementptr inbounds ptr, ptr %a, i64 %index
33+
%wide.load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %src, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
34+
%dst = getelementptr inbounds ptr, ptr %b, i64 %index
35+
call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %wide.load, ptr %dst, i32 4, <vscale x 4 x i1> %mask)
36+
%index.next = add nuw i64 %index, 16
37+
%exit.cond = icmp eq i64 %index.next, 1024
38+
br i1 %exit.cond, label %vector.end, label %vector.body
39+
40+
vector.end:
41+
ret void
42+
}
43+
44+
attributes #0 = { "target-features"="+sve" }

0 commit comments

Comments
 (0)