From 97c173d9ba9e7a030141782fd6ada21e2ed1613a Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Thu, 24 Jul 2025 19:58:51 +0100
Subject: [PATCH] [TailDup] Delay aggressive computed-goto taildup to after
 RegAlloc.

https://github.com/llvm/llvm-project/pull/114990 allowed more aggressive
tail duplication of blocks ending in computed gotos, both before and after
register allocation.

In some cases, performing tail duplication too early can lead to worse
results, especially when we duplicate blocks that contain a number of phi
nodes. This is causing a ~3% performance regression in some workloads
using Python 3.12.

This patch updates TailDup to delay aggressive tail duplication for
computed gotos until after register allocation. This means we can keep the
non-duplicated version for longer throughout the backend, which should
reduce compile time and allow a number of optimizations and
simplifications to trigger before the CFG is drastically expanded.

For the case in https://github.com/llvm/llvm-project/issues/106846, I get
the same performance with and without this patch on Skylake.
---
 llvm/lib/CodeGen/TailDuplicator.cpp           | 16 ++--
 .../AArch64/late-taildup-computed-goto.ll     | 77 +++++++------
 ...o.mir => early-tail-dup-computed-goto.mir} | 44 ++++++-----
 3 files changed, 62 insertions(+), 75 deletions(-)
 rename llvm/test/CodeGen/X86/{tail-dup-computed-goto.mir => early-tail-dup-computed-goto.mir} (93%)

diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index d319a979285f7..5d720fbbf1c61 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -610,6 +610,15 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   if (HasIndirectbr && PreRegAlloc)
     MaxDuplicateCount = TailDupIndirectBranchSize;
 
+  // Allow higher limits when the block has computed gotos and we are running
+  // after register allocation. NB. This basically unfactors computed gotos
+  // that were factored early on in the compilation process to speed up
+  // edge-based data flow. If we do not unfactor them again, it can seriously
+  // pessimize code with many computed jumps in the source code, such as
+  // interpreters. Therefore we do not restrict the computed gotos.
+  if (HasComputedGoto && !PreRegAlloc)
+    MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
+
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
@@ -663,12 +672,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Duplicating a BB which has both multiple predecessors and successors will
   // may cause huge amount of PHI nodes. If we want to remove this limitation,
   // we have to address https://github.com/llvm/llvm-project/issues/78578.
-  // NB. This basically unfactors computed gotos that were factored early on in
-  // the compilation process to speed up edge based data flow. If we do not
-  // unfactor them again, it can seriously pessimize code with many computed
-  // jumps in the source code, such as interpreters. Therefore we do not
-  // restrict the computed gotos.
-  if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
+  if (PreRegAlloc && TailBB.pred_size() > TailDupPredSize &&
       TailBB.succ_size() > TailDupSuccSize) {
     // If TailBB or any of its successors contains a phi, we may have to add a
     // large number of additional phis with additional incoming values.
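To make the "unfactoring" comment above concrete, here is a minimal sketch
(illustrative only, not part of this patch; the opcode set and handler
bodies are invented) of the kind of computed-goto interpreter loop this
heuristic targets, written with the GNU C labels-as-values extension that
Clang and GCC support:

    /* Minimal sketch of a computed-goto interpreter loop (GNU C extension,
       accepted by Clang and GCC). Opcodes and handlers are invented for
       illustration. */
    long run(const unsigned char *code) {
      /* Dispatch table of label addresses, one entry per opcode. */
      static void *targets[] = { &&op_halt, &&op_inc, &&op_dec };
      long acc = 0;
      const unsigned char *pc = code;

      /* Factored form: one shared dispatch block; every handler branches
         back here, so the whole interpreter has a single indirect jump. */
    dispatch:
      goto *targets[*pc++];

    op_inc:
      acc += 1;
      goto dispatch;

    op_dec:
      acc -= 1;
      goto dispatch;

    op_halt:
      return acc;
    }

In the factored form there is a single indirect branch, which keeps the CFG
small for early, edge-based passes but is hard on indirect-branch
predictors. Tail-duplicating the dispatch block gives each handler its own
copy of `goto *targets[*pc++];` and therefore its own predictable branch
site; this patch keeps the factored form until after register allocation,
so the duplication no longer creates phi nodes or inflates the CFG for
earlier passes.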
diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
index c4a027c6d8070..381904f776604 100644
--- a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
+++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
@@ -25,77 +25,58 @@ define void @test_interp(ptr %frame, ptr %dst) {
 ; CHECK-NEXT:    adrp x21, _opcode.targets@PAGE
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    add x21, x21, _opcode.targets@PAGEOFF
-; CHECK-NEXT:    mov x22, xzr
+; CHECK-NEXT:    mov x24, xzr
 ; CHECK-NEXT:    add x8, x21, xzr, lsl #3
 ; CHECK-NEXT:    mov x19, x1
 ; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    add x23, x22, #1
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    mov w22, #1 ; =0x1
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
 ; CHECK-NEXT:  Ltmp0: ; Block address taken
 ; CHECK-NEXT:  LBB0_1: ; %loop.header
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
 ; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    mov x22, xzr
-; CHECK-NEXT:    add x23, x23, #1
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
 ; CHECK-NEXT:  Ltmp1: ; Block address taken
 ; CHECK-NEXT:  LBB0_2: ; %op1.bb
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    str xzr, [x19]
-; CHECK-NEXT:    mov w8, #1 ; =0x1
+; CHECK-NEXT:  Ltmp2: ; Block address taken
+; CHECK-NEXT:  LBB0_3: ; %op6.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr x0, [x20, #-8]!
-; CHECK-NEXT:    ldr x9, [x0, #8]
-; CHECK-NEXT:    str x8, [x0]
-; CHECK-NEXT:    ldr x8, [x9, #48]
+; CHECK-NEXT:    ldr x8, [x0, #8]
+; CHECK-NEXT:    str x22, [x0]
+; CHECK-NEXT:    ldr x8, [x8, #48]
 ; CHECK-NEXT:    blr x8
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
-; CHECK-NEXT:  Ltmp2: ; Block address taken
-; CHECK-NEXT:  LBB0_3: ; %op2.bb
+; CHECK-NEXT:  Ltmp3: ; Block address taken
+; CHECK-NEXT:  LBB0_4: ; %op2.bb
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
 ; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    add x23, x23, #1
-; CHECK-NEXT:    str x22, [x19]
-; CHECK-NEXT:    mov x22, xzr
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
-; CHECK-NEXT:  Ltmp3: ; Block address taken
-; CHECK-NEXT:  LBB0_4: ; %op4.bb
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str x22, [x19]
-; CHECK-NEXT:    add x10, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
-; CHECK-NEXT:    ldur x8, [x22, #12]
-; CHECK-NEXT:    ldur x9, [x20, #-8]
-; CHECK-NEXT:    add x22, x22, #20
-; CHECK-NEXT:    stp x8, x9, [x20, #-8]
-; CHECK-NEXT:    add x20, x20, #8
-; CHECK-NEXT:    br x10
 ; CHECK-NEXT:  Ltmp4: ; Block address taken
-; CHECK-NEXT:  LBB0_5: ; %op5.bb
+; CHECK-NEXT:  LBB0_5: ; %op4.bb
+; CHECK-NEXT:  Ltmp5: ; Block address taken
+; CHECK-NEXT:  LBB0_6: ; %op5.bb
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str x22, [x19]
-; CHECK-NEXT:    add x10, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
-; CHECK-NEXT:    ldur x8, [x22, #12]
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    ldur x8, [x23, #12]
 ; CHECK-NEXT:    ldur x9, [x20, #-8]
-; CHECK-NEXT:    add x22, x22, #20
+; CHECK-NEXT:    add x23, x23, #20
 ; CHECK-NEXT:    stp x8, x9, [x20, #-8]
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
 ; CHECK-NEXT:    add x20, x20, #8
-; CHECK-NEXT:    br x10
-; CHECK-NEXT:  Ltmp5: ; Block address taken
-; CHECK-NEXT:  LBB0_6: ; %op6.bb
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr x0, [x20, #-8]!
-; CHECK-NEXT:    mov w8, #1 ; =0x1
-; CHECK-NEXT:    ldr x9, [x0, #8]
-; CHECK-NEXT:    str x8, [x0]
-; CHECK-NEXT:    ldr x8, [x9, #48]
-; CHECK-NEXT:    blr x8
-; CHECK-NEXT:    add x8, x21, x23, lsl #3
-; CHECK-NEXT:    add x23, x23, #1
+; CHECK-NEXT:    add x24, x24, #1
 ; CHECK-NEXT:    br x8
 ; CHECK-NEXT:    .loh AdrpAdd Lloh0, Lloh1
 entry:
diff --git a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
similarity index 93%
rename from llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
rename to llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
index 17de405928d37..0f2896463a8af 100644
--- a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
+++ b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
-# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size.
+#
+# Check that computed gotos, like all other blocks, are restricted by tail-dup-pred-size and tail-dup-succ-size during early (pre-RA) tail duplication.
+#
 --- |
   @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
   declare i64 @f0()
@@ -30,54 +32,54 @@ tracksRegLiveness: true
 body:             |
   ; CHECK-LABEL: name: computed_goto
   ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[COPY2]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY4]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY6]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   JMP_1 %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
-  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gr64_nosp = COPY $rax
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gr64_nosp = COPY [[COPY8]]
-  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY8]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   bb.5:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gr64_nosp = PHI [[COPY]], %bb.0, [[COPY4]], %bb.4, [[COPY3]], %bb.3, [[COPY2]], %bb.2, [[COPY1]], %bb.1
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[PHI]], @computed_goto.dispatch, $noreg
   bb.0:
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
     CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax