Skip to content

Commit 283c47b

Browse files
authored
[lld][LoongArch] GOT indirection to PC relative optimization (#123743)
In LoongArch, we try the GOT-indirection-to-PC-relative optimization in the normal or medium code model, with or without the R_LARCH_RELAX relocation. From: * pcalau12i $a0, %got_pc_hi20(sym_got) * ld.w/d $a0, $a0, %got_pc_lo12(sym_got) To: * pcalau12i $a0, %pc_hi20(sym) * addi.w/d $a0, $a0, %pc_lo12(sym) If the original code sequence can be relaxed into a single `pcaddi` instruction, this optimization is not applied (see #123566). The optimization related to GOT is split into two locations because the `relax()` function is part of an iterative fixed-point algorithm. We should minimize the work done there to achieve better linker performance. Note: Although the optimization has been performed, the GOT entries still exist, similarly to AArch64. Eliminating the entries would increase code complexity.
1 parent 86916ff commit 283c47b

File tree

3 files changed

+268
-4
lines changed

3 files changed

+268
-4
lines changed

lld/ELF/Arch/LoongArch.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
4646
private:
4747
void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
4848
void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
49+
bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
50+
const Relocation &rLo12, uint64_t secAddr) const;
4951
};
5052
} // end anonymous namespace
5153

@@ -1155,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
11551157
}
11561158
}
11571159

1160+
// Try GOT indirection to PC relative optimization.
1161+
// From:
1162+
// * pcalau12i $a0, %got_pc_hi20(sym_got)
1163+
// * ld.w/d $a0, $a0, %got_pc_lo12(sym_got)
1164+
// To:
1165+
// * pcalau12i $a0, %pc_hi20(sym)
1166+
// * addi.w/d $a0, $a0, %pc_lo12(sym)
1167+
//
1168+
// Note: Althouth the optimization has been performed, the GOT entries still
1169+
// exists, similarly to AArch64. Eliminating the entries will increase code
1170+
// complexity.
1171+
bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
1172+
const Relocation &rLo12, uint64_t secAddr) const {
1173+
// Check if the relocations apply to consecutive instructions.
1174+
if (rHi20.offset + 4 != rLo12.offset)
1175+
return false;
1176+
1177+
// Check if the relocations reference the same symbol and skip undefined,
1178+
// preemptible and STT_GNU_IFUNC symbols.
1179+
if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
1180+
rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
1181+
return false;
1182+
1183+
// GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
1184+
// in position-independent code because these instructions produce a relative
1185+
// address.
1186+
if ((ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
1187+
return false;
1188+
1189+
// Check if the addends of the both relocations are zero.
1190+
if (rHi20.addend != 0 || rLo12.addend != 0)
1191+
return false;
1192+
1193+
const uint32_t currInsn = read32le(loc);
1194+
const uint32_t nextInsn = read32le(loc + 4);
1195+
const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
1196+
// Check if the first instruction is PCALAU12I and the second instruction is
1197+
// LD.
1198+
if ((currInsn & 0xfe000000) != PCALAU12I ||
1199+
(nextInsn & 0xffc00000) != ldOpcode)
1200+
return false;
1201+
1202+
// Check if use the same register.
1203+
if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
1204+
return false;
1205+
1206+
Symbol &sym = *rHi20.sym;
1207+
uint64_t symLocal = sym.getVA(ctx);
1208+
const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
1209+
// Check if the symbol address is in
1210+
// [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
1211+
const int64_t underflow = -0x80000000LL - 0x800;
1212+
const int64_t overflow = 0x80000000LL - 0x800;
1213+
if (!(displace >= underflow && displace < overflow))
1214+
return false;
1215+
1216+
Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, rHi20.offset,
1217+
rHi20.addend, &sym};
1218+
Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
1219+
&sym};
1220+
uint64_t pageDelta =
1221+
getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
1222+
// pcalau12i $a0, %pc_hi20
1223+
write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
1224+
relocate(loc, newRHi20, pageDelta);
1225+
// addi.w/d $a0, $a0, %pc_lo12
1226+
write32le(loc + 4, insn(ctx.arg.is64 ? ADDI_D : ADDI_W, getD5(nextInsn),
1227+
getJ5(nextInsn), 0));
1228+
relocate(loc + 4, newRLo12, SignExtend64(symLocal, 64));
1229+
return true;
1230+
}
1231+
11581232
// During TLSDESC GD_TO_IE, the converted code sequence always includes an
11591233
// instruction related to the Lo12 relocation (ld.[wd]). To obtain correct val
11601234
// in `getRelocTargetVA`, expr of this instruction should be adjusted to
@@ -1172,6 +1246,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const {
11721246
return expr;
11731247
}
11741248

1249+
// Reports whether R_LARCH_GOT_PC_HI20 and R_LARCH_GOT_PC_LO12 always appear
// in pairs within `relocs`. A pair is an R_LARCH_GOT_PC_HI20 whose
// R_LARCH_GOT_PC_LO12 partner is either the very next relocation or, when
// relaxable(relocs, i) holds for the HI20, the one after the next. Any
// R_LARCH_GOT_PC_HI20 without such a partner, or any R_LARCH_GOT_PC_LO12 that
// is not consumed as a partner, makes the result false.
static bool pairForGotRels(ArrayRef<Relocation> relocs) {
  const size_t count = relocs.size();
  size_t idx = 0;
  while (idx != count) {
    const RelType kind = relocs[idx].type;

    // A LO12 reached here was not claimed by a preceding HI20.
    if (kind == R_LARCH_GOT_PC_LO12)
      return false;

    // Unrelated relocation: nothing to verify.
    if (kind != R_LARCH_GOT_PC_HI20) {
      ++idx;
      continue;
    }

    // HI20 immediately followed by its LO12.
    if (idx + 1 < count && relocs[idx + 1].type == R_LARCH_GOT_PC_LO12) {
      idx += 2;
      continue;
    }

    // HI20, one intervening relocation (relaxable case), then LO12.
    if (relaxable(relocs, idx) && idx + 2 < count &&
        relocs[idx + 2].type == R_LARCH_GOT_PC_LO12) {
      idx += 3;
      continue;
    }

    // HI20 with no matching LO12.
    return false;
  }
  return true;
}
1272+
11751273
void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
11761274
const unsigned bits = ctx.arg.is64 ? 64 : 32;
11771275
uint64_t secAddr = sec.getOutputSection()->addr;
@@ -1181,6 +1279,7 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
11811279
secAddr += ehIn->getParent()->outSecOff;
11821280
bool isExtreme = false, isRelax = false;
11831281
const MutableArrayRef<Relocation> relocs = sec.relocs();
1282+
const bool isPairForGotRels = pairForGotRels(relocs);
11841283
for (size_t i = 0, size = relocs.size(); i != size; ++i) {
11851284
Relocation &rel = relocs[i];
11861285
uint8_t *loc = buf + rel.offset;
@@ -1264,6 +1363,24 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
12641363
tlsdescToLe(loc, rel, val);
12651364
}
12661365
continue;
1366+
case RE_LOONGARCH_GOT_PAGE_PC:
1367+
// In LoongArch, we try GOT indirection to PC relative optimization in
1368+
// normal or medium code model, whether or not with R_LARCH_RELAX
1369+
// relocation. Moreover, if the original code sequence can be relaxed to a
1370+
// single instruction `pcaddi`, the first instruction will be removed and
1371+
// it will not reach here.
1372+
if (isPairForGotRels && rel.type == R_LARCH_GOT_PC_HI20) {
1373+
bool isRelax = relaxable(relocs, i);
1374+
const Relocation lo12Rel = isRelax ? relocs[i + 2] : relocs[i + 1];
1375+
if (lo12Rel.type == R_LARCH_GOT_PC_LO12 &&
1376+
tryGotToPCRel(loc, rel, lo12Rel, secAddr)) {
1377+
// isRelax: skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12
1378+
// !isRelax: skip relocation R_LARCH_GOT_PC_LO12
1379+
i += isRelax ? 2 : 1;
1380+
continue;
1381+
}
1382+
}
1383+
break;
12671384
default:
12681385
break;
12691386
}
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# REQUIRES: loongarch
2+
# RUN: rm -rf %t && split-file %s %t && cd %t
3+
4+
# RUN: llvm-mc --filetype=obj --triple=loongarch64 a.s -o a.o
5+
# RUN: llvm-mc --filetype=obj --triple=loongarch64 unpaired.s -o unpaired.o
6+
# RUN: llvm-mc --filetype=obj --triple=loongarch64 lone-ldr.s -o lone-ldr.o
7+
8+
# RUN: ld.lld a.o -T within-range.t -o a
9+
# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s
10+
11+
## This test verifies the encoding when the register $a0 is used.
12+
# CHECK: pcalau12i $a0, 0
13+
# CHECK-NEXT: addi.d $a0, $a0, -2048
14+
15+
## PCALAU12I contains a nonzero addend, no relaxations should be applied.
16+
# CHECK-NEXT: pcalau12i $a1, 2
17+
# CHECK-NEXT: ld.d $a1, $a1, -2048
18+
19+
## LD contains a nonzero addend, no relaxations should be applied.
20+
# CHECK-NEXT: pcalau12i $a2, 2
21+
# CHECK-NEXT: ld.d $a2, $a2, -2040
22+
23+
## PCALAU12I and LD use different registers, no relaxations should be applied.
24+
# CHECK-NEXT: pcalau12i $a3, 2
25+
# CHECK-NEXT: ld.d $a4, $a3, -2048
26+
27+
## PCALAU12I and LD use different registers, no relaxations should be applied.
28+
# CHECK-NEXT: pcalau12i $a5, 2
29+
# CHECK-NEXT: ld.d $a5, $a6, -2048
30+
31+
# RUN: ld.lld a.o -T underflow-range.t -o a-underflow
32+
# RUN: llvm-objdump -d --no-show-raw-insn a-underflow | FileCheck --check-prefix=OUTRANGE %s
33+
34+
# RUN: ld.lld a.o -T overflow-range.t -o a-overflow
35+
# RUN: llvm-objdump -d --no-show-raw-insn a-overflow | FileCheck --check-prefix=OUTRANGE %s
36+
37+
# OUTRANGE: pcalau12i $a0, 1
38+
# OUTRANGE-NEXT: ld.d $a0, $a0, 0
39+
40+
## Relocations do not appear in pairs, no relaxations should be applied.
41+
# RUN: ld.lld unpaired.o -T within-range.t -o unpaired
42+
# RUN: llvm-objdump --no-show-raw-insn -d unpaired | FileCheck --check-prefix=UNPAIRED %s
43+
44+
# UNPAIRED: pcalau12i $a0, 2
45+
# UNPAIRED-NEXT: b 8
46+
# UNPAIRED-NEXT: pcalau12i $a0, 2
47+
# UNPAIRED: ld.d $a0, $a0, -2048
48+
49+
## Relocations do not appear in pairs, no relaxations should be applied.
50+
# RUN: ld.lld lone-ldr.o -T within-range.t -o lone-ldr
51+
# RUN: llvm-objdump --no-show-raw-insn -d lone-ldr | FileCheck --check-prefix=LONE-LDR %s
52+
53+
# LONE-LDR: ld.d $a0, $a0, -2048
54+
55+
## 32-bit code is mostly the same. We only test a few variants.
56+
# RUN: llvm-mc --filetype=obj --triple=loongarch32 a.32.s -o a.32.o
57+
# RUN: ld.lld a.32.o -T within-range.t -o a32
58+
# RUN: llvm-objdump -d --no-show-raw-insn a32 | FileCheck --check-prefix=CHECK32 %s
59+
60+
## This test verifies the encoding when the register $a0 is used.
61+
# CHECK32: pcalau12i $a0, 0
62+
# CHECK32-NEXT: addi.w $a0, $a0, -2048
63+
64+
65+
## This linker script ensures that .rodata and .text are sufficiently close to
66+
## each other so that the pcalau12i + ld pair can be relaxed to pcalau12i + addi.
67+
#--- within-range.t
68+
SECTIONS {
69+
.rodata 0x1800: { *(.rodata) }
70+
.text 0x2800: { *(.text) }
71+
.got 0x3800: { *(.got) }
72+
}
73+
74+
## This linker script ensures that .rodata and .text are sufficiently far apart
75+
## so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + addi.
76+
#--- underflow-range.t
77+
SECTIONS {
78+
.rodata 0x800-4: { *(.rodata) }
79+
.got 0x80002000: { *(.got) }
80+
.text 0x80001000: { *(.text) } /* (0x800-4)+2GB+0x800+4 */
81+
}
82+
83+
#--- overflow-range.t
84+
SECTIONS {
85+
.text 0x1000: { *(.text) }
86+
.got 0x2000: { *(.got) }
87+
.rodata 0x80000800 : { *(.rodata) } /* 0x1000+2GB-0x800 */
88+
}
89+
90+
#--- a.s
91+
## Symbol 'x' is nonpreemptible, the optimization should be applied.
92+
.rodata
93+
.hidden x
94+
x:
95+
.word 10
96+
97+
.text
98+
.global _start
99+
_start:
100+
pcalau12i $a0, %got_pc_hi20(x)
101+
ld.d $a0, $a0, %got_pc_lo12(x)
102+
pcalau12i $a1, %got_pc_hi20(x+1)
103+
ld.d $a1, $a1, %got_pc_lo12(x)
104+
pcalau12i $a2, %got_pc_hi20(x)
105+
ld.d $a2, $a2, %got_pc_lo12(x+8)
106+
pcalau12i $a3, %got_pc_hi20(x)
107+
ld.d $a4, $a3, %got_pc_lo12(x)
108+
pcalau12i $a5, %got_pc_hi20(x)
109+
ld.d $a5, $a6, %got_pc_lo12(x)
110+
111+
#--- unpaired.s
112+
.text
113+
.hidden x
114+
x:
115+
nop
116+
.global _start
117+
_start:
118+
pcalau12i $a0, %got_pc_hi20(x)
119+
b L
120+
pcalau12i $a0, %got_pc_hi20(x)
121+
L:
122+
ld.d $a0, $a0, %got_pc_lo12(x)
123+
124+
#--- lone-ldr.s
125+
.text
126+
.hidden x
127+
x:
128+
nop
129+
.global _start
130+
_start:
131+
ld.d $a0, $a0, %got_pc_lo12(x)
132+
133+
134+
#--- a.32.s
135+
## Symbol 'x' is nonpreemptible, the optimization should be applied.
136+
.rodata
137+
.hidden x
138+
x:
139+
.word 10
140+
141+
.text
142+
.global _start
143+
_start:
144+
pcalau12i $a0, %got_pc_hi20(x)
145+
ld.w $a0, $a0, %got_pc_lo12(x)

lld/test/ELF/loongarch-relax-pc-hi20-lo12.s

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,24 +31,26 @@
3131
## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0
3232
# NORELAX32-NEXT: 10000: pcalau12i $a0, 1024
3333
# NORELAX32-NEXT: addi.w $a0, $a0, 0
34+
## Not relaxation, conversion to PCRel.
3435
# NORELAX32-NEXT: pcalau12i $a0, 1024
35-
# NORELAX32-NEXT: ld.w $a0, $a0, 4
36+
# NORELAX32-NEXT: addi.w $a0, $a0, 0
3637
# NORELAX32-NEXT: pcalau12i $a0, 1024
3738
# NORELAX32-NEXT: addi.w $a0, $a0, 0
3839
# NORELAX32-NEXT: pcalau12i $a0, 1024
39-
# NORELAX32-NEXT: ld.w $a0, $a0, 4
40+
# NORELAX32-NEXT: addi.w $a0, $a0, 0
4041

4142
# NORELAX64-LABEL: <_start>:
4243
## offset exceed range of pcaddi
4344
## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0
4445
# NORELAX64-NEXT: 10000: pcalau12i $a0, 1024
4546
# NORELAX64-NEXT: addi.d $a0, $a0, 0
47+
## Not relaxation, conversion to PCRel.
4648
# NORELAX64-NEXT: pcalau12i $a0, 1024
47-
# NORELAX64-NEXT: ld.d $a0, $a0, 8
49+
# NORELAX64-NEXT: addi.d $a0, $a0, 0
4850
# NORELAX64-NEXT: pcalau12i $a0, 1024
4951
# NORELAX64-NEXT: addi.d $a0, $a0, 0
5052
# NORELAX64-NEXT: pcalau12i $a0, 1024
51-
# NORELAX64-NEXT: ld.d $a0, $a0, 8
53+
# NORELAX64-NEXT: addi.d $a0, $a0, 0
5254

5355

5456
## GOT references with non-zero addends. No relaxation.

0 commit comments

Comments
 (0)