From 97c173d9ba9e7a030141782fd6ada21e2ed1613a Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Thu, 24 Jul 2025 19:58:51 +0100
Subject: [PATCH] [TailDup] Delay aggressive computed-goto taildup to after
 RegAlloc.

https://github.com/llvm/llvm-project/pull/114990 allowed more aggressive
tail duplication of blocks ending in computed gotos, both before and after
register allocation.

In some cases, performing tail duplication too early can lead to worse
results, especially when we duplicate blocks that contain a number of phi
nodes. This is causing a ~3% performance regression in some workloads
using Python 3.12.

This patch updates TailDup to delay aggressive tail duplication for
computed gotos until after register allocation. This means we can keep the
non-duplicated version for longer throughout the backend, which should
reduce compile time and allow a number of optimizations and
simplifications to trigger before the CFG is drastically expanded.

For the case in https://github.com/llvm/llvm-project/issues/106846, I get
the same performance with and without this patch on Skylake.
---
 llvm/lib/CodeGen/TailDuplicator.cpp           | 16 ++--
 .../AArch64/late-taildup-computed-goto.ll     | 77 +++++++------
 ...o.mir => early-tail-dup-computed-goto.mir} | 44 ++++++-----
 3 files changed, 62 insertions(+), 75 deletions(-)
 rename llvm/test/CodeGen/X86/{tail-dup-computed-goto.mir => early-tail-dup-computed-goto.mir} (93%)

diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index d319a979285f7..5d720fbbf1c61 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -610,6 +610,15 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   if (HasIndirectbr && PreRegAlloc)
     MaxDuplicateCount = TailDupIndirectBranchSize;
 
+  // Allow higher limits when the block has computed gotos and we are running
+  // after register allocation. NB. This basically unfactors computed gotos
+  // that were factored early on in the compilation process to speed up
+  // edge-based data flow. If we do not unfactor them again, it can seriously
+  // pessimize code with many computed jumps in the source code, such as
+  // interpreters. Therefore we do not restrict the computed gotos.
+  if (HasComputedGoto && !PreRegAlloc)
+    MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
+
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
@@ -663,12 +672,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Duplicating a BB which has both multiple predecessors and successors will
   // may cause huge amount of PHI nodes. If we want to remove this limitation,
   // we have to address https://github.com/llvm/llvm-project/issues/78578.
-  // NB. This basically unfactors computed gotos that were factored early on in
-  // the compilation process to speed up edge based data flow. If we do not
-  // unfactor them again, it can seriously pessimize code with many computed
-  // jumps in the source code, such as interpreters. Therefore we do not
-  // restrict the computed gotos.
-  if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
+  if (PreRegAlloc && TailBB.pred_size() > TailDupPredSize &&
       TailBB.succ_size() > TailDupSuccSize) {
     // If TailBB or any of its successors contains a phi, we may have to add a
     // large number of additional phis with additional incoming values.
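To make the "unfactoring" comment above concrete, here is a minimal sketch
(illustrative only, not part of this patch; the opcode set and handler
bodies are invented) of the kind of computed-goto interpreter loop this
heuristic targets, written with the GNU C labels-as-values extension that
Clang and GCC support:

    /* Minimal sketch of a computed-goto interpreter loop (GNU C extension,
       accepted by Clang and GCC). Opcodes and handlers are invented for
       illustration. */
    long run(const unsigned char *code) {
      /* Dispatch table of label addresses, one entry per opcode. */
      static void *targets[] = { &&op_halt, &&op_inc, &&op_dec };
      long acc = 0;
      const unsigned char *pc = code;

      /* Factored form: one shared dispatch block; every handler branches
         back here, so the whole interpreter has a single indirect jump. */
    dispatch:
      goto *targets[*pc++];

    op_inc:
      acc += 1;
      goto dispatch;

    op_dec:
      acc -= 1;
      goto dispatch;

    op_halt:
      return acc;
    }

In the factored form there is a single indirect branch, which keeps the CFG
small for early, edge-based passes but is hard on indirect-branch
predictors. Tail-duplicating the dispatch block gives each handler its own
copy of `goto *targets[*pc++];` and therefore its own predictable branch
site; this patch keeps the factored form until after register allocation,
so the duplication no longer creates phi nodes or inflates the CFG for
earlier passes.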
diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
index c4a027c6d8070..381904f776604 100644
--- a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
+++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
@@ -25,77 +25,58 @@ define void @test_interp(ptr %frame, ptr %dst) {
 ; CHECK-NEXT:    adrp x21, _opcode.targets@PAGE
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    add x21, x21, _opcode.targets@PAGEOFF
-; CHECK-NEXT:    mov x22, xzr
+; CHECK-NEXT:    mov x24, xzr
 ; CHECK-NEXT:    add x8, x21, xzr, lsl #3
 ; CHECK-NEXT:    mov x19, x1
 ; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    add x23, x22, #1
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    mov w22, #1 ; =0x1
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
 ; CHECK-NEXT:  Ltmp0: ; Block address taken
 ; CHECK-NEXT:  LBB0_1: ; %loop.header
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
 ; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    mov x22, xzr
-; CHECK-NEXT:    add x23, x23, #1
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
 ; CHECK-NEXT:  Ltmp1: ; Block address taken
 ; CHECK-NEXT:  LBB0_2: ; %op1.bb
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    str xzr, [x19]
-; CHECK-NEXT:    mov w8, #1 ; =0x1
+; CHECK-NEXT:  Ltmp2: ; Block address taken
+; CHECK-NEXT:  LBB0_3: ; %op6.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr x0, [x20, #-8]!
-; CHECK-NEXT:    ldr x9, [x0, #8]
-; CHECK-NEXT:    str x8, [x0]
-; CHECK-NEXT:    ldr x8, [x9, #48]
+; CHECK-NEXT:    ldr x8, [x0, #8]
+; CHECK-NEXT:    str x22, [x0]
+; CHECK-NEXT:    ldr x8, [x8, #48]
 ; CHECK-NEXT:    blr x8
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
-; CHECK-NEXT:  Ltmp2: ; Block address taken
-; CHECK-NEXT:  LBB0_3: ; %op2.bb
+; CHECK-NEXT:  Ltmp3: ; Block address taken
+; CHECK-NEXT:  LBB0_4: ; %op2.bb
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
 ; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    add x23, x23, #1
-; CHECK-NEXT:    str x22, [x19]
-; CHECK-NEXT:    mov x22, xzr
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
-; CHECK-NEXT:  Ltmp3: ; Block address taken
-; CHECK-NEXT:  LBB0_4: ; %op4.bb
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str x22, [x19]
-; CHECK-NEXT:    add x10, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
-; CHECK-NEXT:    ldur x8, [x22, #12]
-; CHECK-NEXT:    ldur x9, [x20, #-8]
-; CHECK-NEXT:    add x22, x22, #20
-; CHECK-NEXT:    stp x8, x9, [x20, #-8]
-; CHECK-NEXT:    add x20, x20, #8
-; CHECK-NEXT:    br x10
 ; CHECK-NEXT:  Ltmp4: ; Block address taken
-; CHECK-NEXT:  LBB0_5: ; %op5.bb
+; CHECK-NEXT:  LBB0_5: ; %op4.bb
+; CHECK-NEXT:  Ltmp5: ; Block address taken
+; CHECK-NEXT:  LBB0_6: ; %op5.bb
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str x22, [x19]
-; CHECK-NEXT:    add x10, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
-; CHECK-NEXT:    ldur x8, [x22, #12]
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    ldur x8, [x23, #12]
 ; CHECK-NEXT:    ldur x9, [x20, #-8]
-; CHECK-NEXT:    add x22, x22, #20
+; CHECK-NEXT:    add x23, x23, #20
 ; CHECK-NEXT:    stp x8, x9, [x20, #-8]
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
 ; CHECK-NEXT:    add x20, x20, #8
-; CHECK-NEXT:    br x10
-; CHECK-NEXT:  Ltmp5: ; Block address taken
-; CHECK-NEXT:  LBB0_6: ; %op6.bb
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr x0, [x20, #-8]!
-; CHECK-NEXT:    mov w8, #1 ; =0x1
-; CHECK-NEXT:    ldr x9, [x0, #8]
-; CHECK-NEXT:    str x8, [x0]
-; CHECK-NEXT:    ldr x8, [x9, #48]
-; CHECK-NEXT:    blr x8
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
 ; CHECK-NEXT:    .loh AdrpAdd Lloh0, Lloh1
 entry:
diff --git a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
similarity index 93%
rename from llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
rename to llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
index 17de405928d37..0f2896463a8af 100644
--- a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
+++ b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
-# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size.
+#
+# Check that computed gotos, like all other blocks, are restricted by tail-dup-pred-size and tail-dup-succ-size during early (pre-RA) tail duplication.
+#
 --- |
   @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
   declare i64 @f0()
@@ -30,54 +32,54 @@ tracksRegLiveness: true
 body:             |
   ; CHECK-LABEL: name: computed_goto
   ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[COPY2]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY4]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY6]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gr64_nosp = COPY [[COPY8]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY8]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   bb.5:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gr64_nosp = PHI [[COPY]], %bb.0, [[COPY4]], %bb.4, [[COPY3]], %bb.3, [[COPY2]], %bb.2, [[COPY1]], %bb.1
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[PHI]], @computed_goto.dispatch, $noreg
   bb.0:
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
     CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax