diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index a14553018fc36..8802c8c2e7f01 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo { private: void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20, + const Relocation &rLo12, uint64_t secAddr) const; }; } // end anonymous namespace @@ -1155,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel, } } +// Try GOT indirection to PC relative optimization. +// From: +// * pcalau12i $a0, %got_pc_hi20(sym_got) +// * ld.w/d $a0, $a0, %got_pc_lo12(sym_got) +// To: +// * pcalau12i $a0, %pc_hi20(sym) +// * addi.w/d $a0, $a0, %pc_lo12(sym) +// +// Note: Although the optimization has been performed, the GOT entries still +// exist, similarly to AArch64. Eliminating the entries will increase code +// complexity. Returns true iff the instruction pair was rewritten. +bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20, + const Relocation &rLo12, uint64_t secAddr) const { + // Check if the relocations apply to consecutive instructions. + if (rHi20.offset + 4 != rLo12.offset) + return false; + + // Check if the relocations reference the same symbol and skip undefined, + // preemptible and STT_GNU_IFUNC symbols. + if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() || + rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc()) + return false; + + // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI + // in position-independent code because these instructions produce a relative + // address. + if ((ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section)) + return false; + + // Check if the addends of both relocations are zero. 
+ if (rHi20.addend != 0 || rLo12.addend != 0) + return false; + + const uint32_t currInsn = read32le(loc); + const uint32_t nextInsn = read32le(loc + 4); + const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W; + // Check if the first instruction is PCALAU12I and the second instruction is + // LD. + if ((currInsn & 0xfe000000) != PCALAU12I || + (nextInsn & 0xffc00000) != ldOpcode) + return false; + + // Check if both instructions use the same register. + if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn)) + return false; + + Symbol &sym = *rHi20.sym; + uint64_t symLocal = sym.getVA(ctx); + const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset); + // Check if the symbol address is in + // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800). + const int64_t underflow = -0x80000000LL - 0x800; + const int64_t overflow = 0x80000000LL - 0x800; + if (!(displace >= underflow && displace < overflow)) + return false; + + Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, rHi20.offset, + rHi20.addend, &sym}; + Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend, + &sym}; + uint64_t pageDelta = + getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type); + // pcalau12i $a0, %pc_hi20 + write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0)); + relocate(loc, newRHi20, pageDelta); + // addi.w/d $a0, $a0, %pc_lo12 + write32le(loc + 4, insn(ctx.arg.is64 ? ADDI_D : ADDI_W, getD5(nextInsn), + getJ5(nextInsn), 0)); + relocate(loc + 4, newRLo12, SignExtend64(symLocal, 64)); + return true; +} + // During TLSDESC GD_TO_IE, the converted code sequence always includes an // instruction related to the Lo12 relocation (ld.[wd]). 
To obtain correct val // in `getRelocTargetVA`, expr of this instruction should be adjusted to @@ -1172,6 +1246,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const { return expr; } +static bool pairForGotRels(ArrayRef<Relocation> relocs) { + // Return true iff each R_LARCH_GOT_PC_HI20 is followed by a GOT_PC_LO12 (one + // entry may sit between them if relaxable()) and no GOT_PC_LO12 is unpaired. + size_t i = 0; + const size_t size = relocs.size(); + for (; i != size; ++i) { + if (relocs[i].type == R_LARCH_GOT_PC_HI20) { + if (i + 1 < size && relocs[i + 1].type == R_LARCH_GOT_PC_LO12) { + ++i; + continue; + } + if (relaxable(relocs, i) && i + 2 < size && + relocs[i + 2].type == R_LARCH_GOT_PC_LO12) { + i += 2; + continue; + } + break; + } else if (relocs[i].type == R_LARCH_GOT_PC_LO12) { + break; + } + } + return i == size; +} + void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { const unsigned bits = ctx.arg.is64 ? 64 : 32; uint64_t secAddr = sec.getOutputSection()->addr; @@ -1181,6 +1279,7 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { secAddr += ehIn->getParent()->outSecOff; bool isExtreme = false, isRelax = false; const MutableArrayRef<Relocation> relocs = sec.relocs(); + const bool isPairForGotRels = pairForGotRels(relocs); for (size_t i = 0, size = relocs.size(); i != size; ++i) { Relocation &rel = relocs[i]; uint8_t *loc = buf + rel.offset; @@ -1264,6 +1363,24 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { tlsdescToLe(loc, rel, val); } continue; + case RE_LOONGARCH_GOT_PAGE_PC: + // In LoongArch, we try GOT indirection to PC relative optimization in + // normal or medium code model, whether or not with R_LARCH_RELAX + // relocation. Moreover, if the original code sequence can be relaxed to a + // single instruction `pcaddi`, the first instruction will be removed and + // it will not reach here. + if (isPairForGotRels && rel.type == R_LARCH_GOT_PC_HI20) { + bool isRelax = relaxable(relocs, i); + const Relocation lo12Rel = isRelax ? 
relocs[i + 2] : relocs[i + 1]; + if (lo12Rel.type == R_LARCH_GOT_PC_LO12 && + tryGotToPCRel(loc, rel, lo12Rel, secAddr)) { + // isRelax: skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12 + // !isRelax: skip relocation R_LARCH_GOT_PC_LO12 + i += isRelax ? 2 : 1; + continue; + } + } + break; default: break; } diff --git a/lld/test/ELF/loongarch-pc-hi20-lo12-got.s b/lld/test/ELF/loongarch-pc-hi20-lo12-got.s new file mode 100644 index 0000000000000..acd94007d0ffc --- /dev/null +++ b/lld/test/ELF/loongarch-pc-hi20-lo12-got.s @@ -0,0 +1,145 @@ +# REQUIRES: loongarch +# RUN: rm -rf %t && split-file %s %t && cd %t + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 a.s -o a.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 unpaired.s -o unpaired.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 lone-ldr.s -o lone-ldr.o + +# RUN: ld.lld a.o -T within-range.t -o a +# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s + +## This test verifies the encoding when the register $a0 is used. +# CHECK: pcalau12i $a0, 0 +# CHECK-NEXT: addi.d $a0, $a0, -2048 + +## PCALAU12I contains a nonzero addend, no relaxations should be applied. +# CHECK-NEXT: pcalau12i $a1, 2 +# CHECK-NEXT: ld.d $a1, $a1, -2048 + +## LD contains a nonzero addend, no relaxations should be applied. +# CHECK-NEXT: pcalau12i $a2, 2 +# CHECK-NEXT: ld.d $a2, $a2, -2040 + +## PCALAU12I and LD use different registers, no relaxations should be applied. +# CHECK-NEXT: pcalau12i $a3, 2 +# CHECK-NEXT: ld.d $a4, $a3, -2048 + +## PCALAU12I and LD use different registers, no relaxations should be applied. 
+# CHECK-NEXT: pcalau12i $a5, 2 +# CHECK-NEXT: ld.d $a5, $a6, -2048 + +# RUN: ld.lld a.o -T underflow-range.t -o a-underflow +# RUN: llvm-objdump -d --no-show-raw-insn a-underflow | FileCheck --check-prefix=OUTRANGE %s + +# RUN: ld.lld a.o -T overflow-range.t -o a-overflow +# RUN: llvm-objdump -d --no-show-raw-insn a-overflow | FileCheck --check-prefix=OUTRANGE %s + +# OUTRANGE: pcalau12i $a0, 1 +# OUTRANGE-NEXT: ld.d $a0, $a0, 0 + +## Relocations do not appear in pairs, no relaxations should be applied. +# RUN: ld.lld unpaired.o -T within-range.t -o unpaired +# RUN: llvm-objdump --no-show-raw-insn -d unpaired | FileCheck --check-prefix=UNPAIRED %s + +# UNPAIRED: pcalau12i $a0, 2 +# UNPAIRED-NEXT: b 8 +# UNPAIRED-NEXT: pcalau12i $a0, 2 +# UNPAIRED: ld.d $a0, $a0, -2048 + +## Relocations do not appear in pairs, no relaxations should be applied. +# RUN: ld.lld lone-ldr.o -T within-range.t -o lone-ldr +# RUN: llvm-objdump --no-show-raw-insn -d lone-ldr | FileCheck --check-prefix=LONE-LDR %s + +# LONE-LDR: ld.d $a0, $a0, -2048 + +## 32-bit code is mostly the same. We only test a few variants. +# RUN: llvm-mc --filetype=obj --triple=loongarch32 a.32.s -o a.32.o +# RUN: ld.lld a.32.o -T within-range.t -o a32 +# RUN: llvm-objdump -d --no-show-raw-insn a32 | FileCheck --check-prefix=CHECK32 %s + +## This test verifies the encoding when the register $a0 is used. +# CHECK32: pcalau12i $a0, 0 +# CHECK32-NEXT: addi.w $a0, $a0, -2048 + + +## This linker script ensures that .rodata and .text are sufficiently close to +## each other so that the pcalau12i + ld pair can be relaxed to pcalau12i + addi. +#--- within-range.t +SECTIONS { + .rodata 0x1800: { *(.rodata) } + .text 0x2800: { *(.text) } + .got 0x3800: { *(.got) } +} + +## This linker script ensures that .rodata and .text are sufficiently far apart +## so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + addi. 
+#--- underflow-range.t +SECTIONS { + .rodata 0x800-4: { *(.rodata) } + .got 0x80002000: { *(.got) } + .text 0x80001000: { *(.text) } /* (0x800-4)+2GB+0x800+4 */ +} + +#--- overflow-range.t +SECTIONS { + .text 0x1000: { *(.text) } + .got 0x2000: { *(.got) } + .rodata 0x80000800 : { *(.rodata) } /* 0x1000+2GB-0x800 */ +} + +#--- a.s +## Symbol 'x' is nonpreemptible, the optimization should be applied. +.rodata +.hidden x +x: +.word 10 + +.text +.global _start +_start: + pcalau12i $a0, %got_pc_hi20(x) + ld.d $a0, $a0, %got_pc_lo12(x) + pcalau12i $a1, %got_pc_hi20(x+1) + ld.d $a1, $a1, %got_pc_lo12(x) + pcalau12i $a2, %got_pc_hi20(x) + ld.d $a2, $a2, %got_pc_lo12(x+8) + pcalau12i $a3, %got_pc_hi20(x) + ld.d $a4, $a3, %got_pc_lo12(x) + pcalau12i $a5, %got_pc_hi20(x) + ld.d $a5, $a6, %got_pc_lo12(x) + +#--- unpaired.s +.text +.hidden x +x: + nop +.global _start +_start: + pcalau12i $a0, %got_pc_hi20(x) + b L + pcalau12i $a0, %got_pc_hi20(x) +L: + ld.d $a0, $a0, %got_pc_lo12(x) + +#--- lone-ldr.s +.text +.hidden x +x: + nop +.global _start +_start: + ld.d $a0, $a0, %got_pc_lo12(x) + + +#--- a.32.s +## Symbol 'x' is nonpreemptible, the optimization should be applied. +.rodata +.hidden x +x: +.word 10 + +.text +.global _start +_start: + pcalau12i $a0, %got_pc_hi20(x) + ld.w $a0, $a0, %got_pc_lo12(x) diff --git a/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s b/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s index a33f866506e13..08d5d3e950d84 100644 --- a/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s +++ b/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s @@ -31,24 +31,26 @@ ## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0 # NORELAX32-NEXT: 10000: pcalau12i $a0, 1024 # NORELAX32-NEXT: addi.w $a0, $a0, 0 +## Not relaxation, conversion to PCRel. 
# NORELAX32-NEXT: pcalau12i $a0, 1024 -# NORELAX32-NEXT: ld.w $a0, $a0, 4 +# NORELAX32-NEXT: addi.w $a0, $a0, 0 # NORELAX32-NEXT: pcalau12i $a0, 1024 # NORELAX32-NEXT: addi.w $a0, $a0, 0 # NORELAX32-NEXT: pcalau12i $a0, 1024 -# NORELAX32-NEXT: ld.w $a0, $a0, 4 +# NORELAX32-NEXT: addi.w $a0, $a0, 0 # NORELAX64-LABEL: <_start>: ## offset exceed range of pcaddi ## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0 # NORELAX64-NEXT: 10000: pcalau12i $a0, 1024 # NORELAX64-NEXT: addi.d $a0, $a0, 0 +## Not relaxation, conversion to PCRel. # NORELAX64-NEXT: pcalau12i $a0, 1024 -# NORELAX64-NEXT: ld.d $a0, $a0, 8 +# NORELAX64-NEXT: addi.d $a0, $a0, 0 # NORELAX64-NEXT: pcalau12i $a0, 1024 # NORELAX64-NEXT: addi.d $a0, $a0, 0 # NORELAX64-NEXT: pcalau12i $a0, 1024 -# NORELAX64-NEXT: ld.d $a0, $a0, 8 +# NORELAX64-NEXT: addi.d $a0, $a0, 0 ## GOT references with non-zero addends. No relaxation.