Message ID | 20250409171526.862481-3-samuel.holland@sifive.com (mailing list archive) |
---|---|
State | New |
Series | [v2,1/3] riscv: module: Fix out-of-bounds relocation access |
Context | Check | Description |
---|---|---|
bjorn/pre-ci_am | success | Success |
bjorn/build-rv32-defconfig | success | build-rv32-defconfig |
bjorn/build-rv64-clang-allmodconfig | success | build-rv64-clang-allmodconfig |
bjorn/build-rv64-gcc-allmodconfig | success | build-rv64-gcc-allmodconfig |
bjorn/build-rv64-nommu-k210-defconfig | success | build-rv64-nommu-k210-defconfig |
bjorn/build-rv64-nommu-k210-virt | success | build-rv64-nommu-k210-virt |
bjorn/checkpatch | success | checkpatch |
bjorn/dtb-warn-rv64 | success | dtb-warn-rv64 |
bjorn/header-inline | success | header-inline |
bjorn/kdoc | success | kdoc |
bjorn/module-param | success | module-param |
bjorn/verify-fixes | success | verify-fixes |
bjorn/verify-signedoff | success | verify-signedoff |
On Wed, Apr 09, 2025 at 10:14:51AM -0700, Samuel Holland wrote:
> perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
> inside module_frob_arch_sections(). This is because amdgpu.ko contains
> about 300000 relocations in its .rela.text section, and the algorithm in
> count_max_entries() takes quadratic time.
>
> Apply two optimizations from the arm64 code, which together reduce the
> total execution time by 99.58%. First, sort the relocations so duplicate
> entries are adjacent. Second, reduce the number of relocations that must
> be sorted by filtering to only relocations that need PLT/GOT entries, as
> done in commit d4e0340919fb ("arm64/module: Optimize module load time by
> optimizing PLT counting").
>
> Unlike the arm64 code, here the filtering and sorting is done in a
> scratch buffer, because the HI20 relocation search optimization in
> apply_relocate_add() depends on the original order of the relocations.
> This allows accumulating PLT/GOT relocations across sections so sorting
> and counting is only done once per module.
>
> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> ---
>
> Changes in v2:
>  - Include R_RISCV_PLT32 relocations when computing the PLT size
>  - Accumulate PLT/GOT relocations across relocation sections
>
>  arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++------
>  1 file changed, 65 insertions(+), 16 deletions(-)
>
> diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
> index 91d0b355ceef..75551ac6504c 100644
> --- a/arch/riscv/kernel/module-sections.c
> +++ b/arch/riscv/kernel/module-sections.c
> @@ -9,6 +9,7 @@
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/moduleloader.h>
> +#include <linux/sort.h>
>
>  unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
>  {
> @@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
>  	return (unsigned long)&plt[i];
>  }
>
> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
> +#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
> +
> +static int cmp_rela(const void *a, const void *b)
>  {
> -	return x->r_info == y->r_info && x->r_addend == y->r_addend;
> +	const Elf_Rela *x = a, *y = b;
> +	int i;
> +
> +	/* sort by type, symbol index and addend */
> +	i = cmp_3way(x->r_info, y->r_info);
> +	if (i == 0)
> +		i = cmp_3way(x->r_addend, y->r_addend);
> +	return i;
>  }
>
>  static bool duplicate_rela(const Elf_Rela *rela, int idx)
>  {
> -	int i;
> -	for (i = 0; i < idx; i++) {
> -		if (is_rela_equal(&rela[i], &rela[idx]))
> -			return true;
> -	}
> -	return false;
> +	/*
> +	 * Entries are sorted by type, symbol index and addend. That means
> +	 * that, if a duplicate entry exists, it must be in the preceding slot.
> +	 */
> +	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
>  }
>
> -static void count_max_entries(Elf_Rela *relas, int num,
> +static void count_max_entries(const Elf_Rela *relas, size_t num,
>  			      unsigned int *plts, unsigned int *gots)
>  {
> -	for (int i = 0; i < num; i++) {
> +	for (size_t i = 0; i < num; i++) {
> +		if (duplicate_rela(relas, i))
> +			continue;
> +
>  		switch (ELF_R_TYPE(relas[i].r_info)) {
>  		case R_RISCV_CALL_PLT:
>  		case R_RISCV_PLT32:
> -			if (!duplicate_rela(relas, i))
> -				(*plts)++;
> +			(*plts)++;
>  			break;
>  		case R_RISCV_GOT_HI20:
> -			if (!duplicate_rela(relas, i))
> -				(*gots)++;
> +			(*gots)++;
>  			break;
> +		default:
> +			unreachable();
>  		}
>  	}
>  }
>
> +static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
> +{
> +	switch (ELF_R_TYPE(rela->r_info)) {
> +	case R_RISCV_CALL_PLT:
> +	case R_RISCV_GOT_HI20:
> +	case R_RISCV_PLT32:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
>  int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>  			      char *secstrings, struct module *mod)
>  {
> +	size_t num_scratch_relas = 0;
>  	unsigned int num_plts = 0;
>  	unsigned int num_gots = 0;
> +	Elf_Rela *scratch = NULL;
> +	size_t scratch_size = 0;
>  	int i;
>
>  	/*
> @@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>
>  	/* Calculate the maxinum number of entries */
>  	for (i = 0; i < ehdr->e_shnum; i++) {
> +		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
>  		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
> -		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
>  		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
> +		size_t scratch_size_needed;
>
>  		if (sechdrs[i].sh_type != SHT_RELA)
>  			continue;
> @@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>  		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
>  			continue;
>
> -		count_max_entries(relas, num_rela, &num_plts, &num_gots);
> +		/*
> +		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
> +		 * close together, so sort a copy of the section to avoid interfering.
> +		 */
> +		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
> +		if (scratch_size_needed > scratch_size) {
> +			scratch_size = scratch_size_needed;

Maybe not worth it, but when i is less than ehdr->e_shnum - 1 (we still
have more sections to look at) we could use scratch_size_needed * 2, or
something, in order to hopefully reduce the number of times kvrealloc()
is called.

> +			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
> +			if (!scratch)
> +				return -ENOMEM;
> +		}
> +
> +		for (size_t j = 0; j < num_relas; j++)
> +			if (rela_needs_plt_got_entry(&relas[j]))
> +				scratch[num_scratch_relas++] = relas[j];
> +	}
> +
> +	if (scratch) {
> +		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
> +		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
> +		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
> +		kvfree(scratch);
>  	}
>
>  	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
> --
> 2.47.0
>

Reviewed-by: Andrew Jones <ajones@ventanamicro.com>

Thanks,
drew
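A side note on the cmp_3way() macro introduced above: r_info and r_addend are 64-bit quantities, so a comparator that returned a truncated difference could report the wrong sign. The relational-operator style sidesteps that standard pitfall, which is plausibly why it is used here. A minimal userspace sketch (illustrative only, not part of the patch):

```c
/* Illustrative only: why a comparator over 64-bit keys such as r_info
 * must not simply return (int)(a - b). */
#include <stdint.h>
#include <stdio.h>

#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))

int main(void)
{
	uint64_t x = 1, y = UINT64_MAX;

	/* 1 - UINT64_MAX wraps to 2, so the truncated difference is
	 * positive even though x < y. */
	printf("naive subtraction: %d\n", (int)(x - y));   /* prints 2 (wrong sign) */
	printf("cmp_3way:          %d\n", cmp_3way(x, y)); /* prints -1 (correct) */
	return 0;
}
```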
Hi Drew,

Thanks for the review again!

On 2025-04-09 2:07 PM, Andrew Jones wrote:
> On Wed, Apr 09, 2025 at 10:14:51AM -0700, Samuel Holland wrote:
>> perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
>> inside module_frob_arch_sections(). This is because amdgpu.ko contains
>> about 300000 relocations in its .rela.text section, and the algorithm in
>> count_max_entries() takes quadratic time.
>>
>> Apply two optimizations from the arm64 code, which together reduce the
>> total execution time by 99.58%. First, sort the relocations so duplicate
>> entries are adjacent. Second, reduce the number of relocations that must
>> be sorted by filtering to only relocations that need PLT/GOT entries, as
>> done in commit d4e0340919fb ("arm64/module: Optimize module load time by
>> optimizing PLT counting").
>>
>> Unlike the arm64 code, here the filtering and sorting is done in a
>> scratch buffer, because the HI20 relocation search optimization in
>> apply_relocate_add() depends on the original order of the relocations.
>> This allows accumulating PLT/GOT relocations across sections so sorting
>> and counting is only done once per module.
>>
>> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
>> ---
>>
>> Changes in v2:
>>  - Include R_RISCV_PLT32 relocations when computing the PLT size
>>  - Accumulate PLT/GOT relocations across relocation sections
>>
>>  arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++------
>>  1 file changed, 65 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
>> index 91d0b355ceef..75551ac6504c 100644
>> --- a/arch/riscv/kernel/module-sections.c
>> +++ b/arch/riscv/kernel/module-sections.c
>> @@ -9,6 +9,7 @@
>>  #include <linux/kernel.h>
>>  #include <linux/module.h>
>>  #include <linux/moduleloader.h>
>> +#include <linux/sort.h>
>>
>>  unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
>>  {
>> @@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
>>  	return (unsigned long)&plt[i];
>>  }
>>
>> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
>> +#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
>> +
>> +static int cmp_rela(const void *a, const void *b)
>>  {
>> -	return x->r_info == y->r_info && x->r_addend == y->r_addend;
>> +	const Elf_Rela *x = a, *y = b;
>> +	int i;
>> +
>> +	/* sort by type, symbol index and addend */
>> +	i = cmp_3way(x->r_info, y->r_info);
>> +	if (i == 0)
>> +		i = cmp_3way(x->r_addend, y->r_addend);
>> +	return i;
>>  }
>>
>>  static bool duplicate_rela(const Elf_Rela *rela, int idx)
>>  {
>> -	int i;
>> -	for (i = 0; i < idx; i++) {
>> -		if (is_rela_equal(&rela[i], &rela[idx]))
>> -			return true;
>> -	}
>> -	return false;
>> +	/*
>> +	 * Entries are sorted by type, symbol index and addend. That means
>> +	 * that, if a duplicate entry exists, it must be in the preceding slot.
>> +	 */
>> +	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
>>  }
>>
>> -static void count_max_entries(Elf_Rela *relas, int num,
>> +static void count_max_entries(const Elf_Rela *relas, size_t num,
>>  			      unsigned int *plts, unsigned int *gots)
>>  {
>> -	for (int i = 0; i < num; i++) {
>> +	for (size_t i = 0; i < num; i++) {
>> +		if (duplicate_rela(relas, i))
>> +			continue;
>> +
>>  		switch (ELF_R_TYPE(relas[i].r_info)) {
>>  		case R_RISCV_CALL_PLT:
>>  		case R_RISCV_PLT32:
>> -			if (!duplicate_rela(relas, i))
>> -				(*plts)++;
>> +			(*plts)++;
>>  			break;
>>  		case R_RISCV_GOT_HI20:
>> -			if (!duplicate_rela(relas, i))
>> -				(*gots)++;
>> +			(*gots)++;
>>  			break;
>> +		default:
>> +			unreachable();
>>  		}
>>  	}
>>  }
>>
>> +static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
>> +{
>> +	switch (ELF_R_TYPE(rela->r_info)) {
>> +	case R_RISCV_CALL_PLT:
>> +	case R_RISCV_GOT_HI20:
>> +	case R_RISCV_PLT32:
>> +		return true;
>> +	default:
>> +		return false;
>> +	}
>> +}
>> +
>>  int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>>  			      char *secstrings, struct module *mod)
>>  {
>> +	size_t num_scratch_relas = 0;
>>  	unsigned int num_plts = 0;
>>  	unsigned int num_gots = 0;
>> +	Elf_Rela *scratch = NULL;
>> +	size_t scratch_size = 0;
>>  	int i;
>>
>>  	/*
>> @@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>>
>>  	/* Calculate the maxinum number of entries */
>>  	for (i = 0; i < ehdr->e_shnum; i++) {
>> +		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
>>  		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
>> -		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
>>  		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
>> +		size_t scratch_size_needed;
>>
>>  		if (sechdrs[i].sh_type != SHT_RELA)
>>  			continue;
>> @@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>>  		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
>>  			continue;
>>
>> -		count_max_entries(relas, num_rela, &num_plts, &num_gots);
>> +		/*
>> +		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
>> +		 * close together, so sort a copy of the section to avoid interfering.
>> +		 */
>> +		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
>> +		if (scratch_size_needed > scratch_size) {
>> +			scratch_size = scratch_size_needed;
>
> Maybe not worth it, but when i is less than ehdr->e_shnum - 1 (we still
> have more sections to look at) we could use scratch_size_needed * 2, or
> something, in order to hopefully reduce the number of times kvrealloc()
> is called.

I tried to keep the code as simple as possible, so I suppose it's not obvious
what's going on here. The current code takes advantage of the fact that one of
the relocation sections (.rela.text) is generally an order of magnitude or more
larger than the others, is usually the first section processed, and much fewer
than half of the relocations require a PLT/GOT entry. So in the common case:

  num_relas (.rela.text) > num_scratch_relas (all sections) + num_relas (any other section)

and there will only be one allocation, for .rela.text.

Regards,
Samuel

>> +			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
>> +			if (!scratch)
>> +				return -ENOMEM;
>> +		}
>> +
>> +		for (size_t j = 0; j < num_relas; j++)
>> +			if (rela_needs_plt_got_entry(&relas[j]))
>> +				scratch[num_scratch_relas++] = relas[j];
>> +	}
>> +
>> +	if (scratch) {
>> +		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
>> +		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
>> +		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
>> +		kvfree(scratch);
>>  	}
>>
>>  	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
>> --
>> 2.47.0
>>
>
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
>
> Thanks,
> drew
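Samuel's common-case argument is easy to sanity-check with a small userspace simulation. The section sizes and the 25% filter rate below are made-up numbers for illustration, not measurements from amdgpu.ko:

```c
/* Sketch of the scratch-buffer growth across relocation sections.
 * Section sizes and the filter ratio are hypothetical. */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	/* a dominant .rela.text followed by a few small sections */
	size_t section_relas[] = { 300000, 9000, 2000, 500 };
	size_t num_scratch_relas = 0, capacity = 0, reallocs = 0;

	for (size_t i = 0; i < sizeof(section_relas) / sizeof(section_relas[0]); i++) {
		size_t needed = num_scratch_relas + section_relas[i];

		if (needed > capacity) {	/* mirrors the kvrealloc() condition */
			capacity = needed;
			reallocs++;
		}
		/* assume ~25% of entries pass rela_needs_plt_got_entry() */
		num_scratch_relas += section_relas[i] / 4;
	}
	printf("reallocations: %zu\n", reallocs);	/* prints 1 */
	return 0;
}
```

Because the first section's worst-case requirement already exceeds the filtered total accumulated later, the single up-front allocation covers every subsequent section, which is the behavior Samuel describes.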
On Wed, Apr 09, 2025 at 02:18:29PM -0500, Samuel Holland wrote:
> Hi Drew,
>
> Thanks for the review again!
>
> On 2025-04-09 2:07 PM, Andrew Jones wrote:
> > On Wed, Apr 09, 2025 at 10:14:51AM -0700, Samuel Holland wrote:
> >> perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
> >> inside module_frob_arch_sections(). This is because amdgpu.ko contains
> >> about 300000 relocations in its .rela.text section, and the algorithm in
> >> count_max_entries() takes quadratic time.
> >>
> >> Apply two optimizations from the arm64 code, which together reduce the
> >> total execution time by 99.58%. First, sort the relocations so duplicate
> >> entries are adjacent. Second, reduce the number of relocations that must
> >> be sorted by filtering to only relocations that need PLT/GOT entries, as
> >> done in commit d4e0340919fb ("arm64/module: Optimize module load time by
> >> optimizing PLT counting").
> >>
> >> Unlike the arm64 code, here the filtering and sorting is done in a
> >> scratch buffer, because the HI20 relocation search optimization in
> >> apply_relocate_add() depends on the original order of the relocations.
> >> This allows accumulating PLT/GOT relocations across sections so sorting
> >> and counting is only done once per module.
> >>
> >> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> >> ---
> >>
> >> Changes in v2:
> >>  - Include R_RISCV_PLT32 relocations when computing the PLT size
> >>  - Accumulate PLT/GOT relocations across relocation sections
> >>
> >>  arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++------
> >>  1 file changed, 65 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
> >> index 91d0b355ceef..75551ac6504c 100644
> >> --- a/arch/riscv/kernel/module-sections.c
> >> +++ b/arch/riscv/kernel/module-sections.c
> >> @@ -9,6 +9,7 @@
> >>  #include <linux/kernel.h>
> >>  #include <linux/module.h>
> >>  #include <linux/moduleloader.h>
> >> +#include <linux/sort.h>
> >>
> >>  unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
> >>  {
> >> @@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
> >>  	return (unsigned long)&plt[i];
> >>  }
> >>
> >> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
> >> +#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
> >> +
> >> +static int cmp_rela(const void *a, const void *b)
> >>  {
> >> -	return x->r_info == y->r_info && x->r_addend == y->r_addend;
> >> +	const Elf_Rela *x = a, *y = b;
> >> +	int i;
> >> +
> >> +	/* sort by type, symbol index and addend */
> >> +	i = cmp_3way(x->r_info, y->r_info);
> >> +	if (i == 0)
> >> +		i = cmp_3way(x->r_addend, y->r_addend);
> >> +	return i;
> >>  }
> >>
> >>  static bool duplicate_rela(const Elf_Rela *rela, int idx)
> >>  {
> >> -	int i;
> >> -	for (i = 0; i < idx; i++) {
> >> -		if (is_rela_equal(&rela[i], &rela[idx]))
> >> -			return true;
> >> -	}
> >> -	return false;
> >> +	/*
> >> +	 * Entries are sorted by type, symbol index and addend. That means
> >> +	 * that, if a duplicate entry exists, it must be in the preceding slot.
> >> +	 */
> >> +	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
> >> }
> >>
> >> -static void count_max_entries(Elf_Rela *relas, int num,
> >> +static void count_max_entries(const Elf_Rela *relas, size_t num,
> >>  			      unsigned int *plts, unsigned int *gots)
> >>  {
> >> -	for (int i = 0; i < num; i++) {
> >> +	for (size_t i = 0; i < num; i++) {
> >> +		if (duplicate_rela(relas, i))
> >> +			continue;
> >> +
> >>  		switch (ELF_R_TYPE(relas[i].r_info)) {
> >>  		case R_RISCV_CALL_PLT:
> >>  		case R_RISCV_PLT32:
> >> -			if (!duplicate_rela(relas, i))
> >> -				(*plts)++;
> >> +			(*plts)++;
> >>  			break;
> >>  		case R_RISCV_GOT_HI20:
> >> -			if (!duplicate_rela(relas, i))
> >> -				(*gots)++;
> >> +			(*gots)++;
> >>  			break;
> >> +		default:
> >> +			unreachable();
> >>  		}
> >>  	}
> >>  }
> >>
> >> +static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
> >> +{
> >> +	switch (ELF_R_TYPE(rela->r_info)) {
> >> +	case R_RISCV_CALL_PLT:
> >> +	case R_RISCV_GOT_HI20:
> >> +	case R_RISCV_PLT32:
> >> +		return true;
> >> +	default:
> >> +		return false;
> >> +	}
> >> +}
> >> +
> >>  int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
> >>  			      char *secstrings, struct module *mod)
> >>  {
> >> +	size_t num_scratch_relas = 0;
> >>  	unsigned int num_plts = 0;
> >>  	unsigned int num_gots = 0;
> >> +	Elf_Rela *scratch = NULL;
> >> +	size_t scratch_size = 0;
> >>  	int i;
> >>
> >>  	/*
> >> @@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
> >>
> >>  	/* Calculate the maxinum number of entries */
> >>  	for (i = 0; i < ehdr->e_shnum; i++) {
> >> +		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
> >>  		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
> >> -		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
> >>  		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
> >> +		size_t scratch_size_needed;
> >>
> >>  		if (sechdrs[i].sh_type != SHT_RELA)
> >>  			continue;
> >> @@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
> >>  		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
> >>  			continue;
> >>
> >> -		count_max_entries(relas, num_rela, &num_plts, &num_gots);
> >> +		/*
> >> +		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
> >> +		 * close together, so sort a copy of the section to avoid interfering.
> >> +		 */
> >> +		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
> >> +		if (scratch_size_needed > scratch_size) {
> >> +			scratch_size = scratch_size_needed;
> >
> > Maybe not worth it, but when i is less than ehdr->e_shnum - 1 (we still
> > have more sections to look at) we could use scratch_size_needed * 2, or
> > something, in order to hopefully reduce the number of times kvrealloc()
> > is called.
>
> I tried to keep the code as simple as possible, so I suppose it's not obvious
> what's going on here. The current code takes advantage of the fact that one of
> the relocation sections (.rela.text) is generally an order of magnitude or more
> larger than the others, is usually the first section processed, and much fewer
> than half of the relocations require a PLT/GOT entry. So in the common case:
>
>   num_relas (.rela.text) > num_scratch_relas (all sections) + num_relas (any other section)
>
> and there will only be one allocation, for .rela.text.

Sounds good.

Thanks,
drew

>
> Regards,
> Samuel
>
> >> +			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
> >> +			if (!scratch)
> >> +				return -ENOMEM;
> >> +		}
> >> +
> >> +		for (size_t j = 0; j < num_relas; j++)
> >> +			if (rela_needs_plt_got_entry(&relas[j]))
> >> +				scratch[num_scratch_relas++] = relas[j];
> >> +	}
> >> +
> >> +	if (scratch) {
> >> +		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
> >> +		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
> >> +		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
> >> +		kvfree(scratch);
> >>  	}
> >>
> >>  	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
> >> --
> >> 2.47.0
> >>
> >
> > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> >
> > Thanks,
> > drew
>
```diff
diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
index 91d0b355ceef..75551ac6504c 100644
--- a/arch/riscv/kernel/module-sections.c
+++ b/arch/riscv/kernel/module-sections.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/sort.h>
 
 unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
 {
@@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
 	return (unsigned long)&plt[i];
 }
 
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
+#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
 {
-	return x->r_info == y->r_info && x->r_addend == y->r_addend;
+	const Elf_Rela *x = a, *y = b;
+	int i;
+
+	/* sort by type, symbol index and addend */
+	i = cmp_3way(x->r_info, y->r_info);
+	if (i == 0)
+		i = cmp_3way(x->r_addend, y->r_addend);
+	return i;
 }
 
 static bool duplicate_rela(const Elf_Rela *rela, int idx)
 {
-	int i;
-	for (i = 0; i < idx; i++) {
-		if (is_rela_equal(&rela[i], &rela[idx]))
-			return true;
-	}
-	return false;
+	/*
+	 * Entries are sorted by type, symbol index and addend. That means
+	 * that, if a duplicate entry exists, it must be in the preceding slot.
+	 */
+	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
 }
 
-static void count_max_entries(Elf_Rela *relas, int num,
+static void count_max_entries(const Elf_Rela *relas, size_t num,
 			      unsigned int *plts, unsigned int *gots)
 {
-	for (int i = 0; i < num; i++) {
+	for (size_t i = 0; i < num; i++) {
+		if (duplicate_rela(relas, i))
+			continue;
+
 		switch (ELF_R_TYPE(relas[i].r_info)) {
 		case R_RISCV_CALL_PLT:
 		case R_RISCV_PLT32:
-			if (!duplicate_rela(relas, i))
-				(*plts)++;
+			(*plts)++;
 			break;
 		case R_RISCV_GOT_HI20:
-			if (!duplicate_rela(relas, i))
-				(*gots)++;
+			(*gots)++;
 			break;
+		default:
+			unreachable();
 		}
 	}
 }
 
+static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
+{
+	switch (ELF_R_TYPE(rela->r_info)) {
+	case R_RISCV_CALL_PLT:
+	case R_RISCV_GOT_HI20:
+	case R_RISCV_PLT32:
+		return true;
+	default:
+		return false;
+	}
+}
+
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
+	size_t num_scratch_relas = 0;
 	unsigned int num_plts = 0;
 	unsigned int num_gots = 0;
+	Elf_Rela *scratch = NULL;
+	size_t scratch_size = 0;
 	int i;
 
 	/*
@@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 
 	/* Calculate the maxinum number of entries */
 	for (i = 0; i < ehdr->e_shnum; i++) {
+		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
-		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
+		size_t scratch_size_needed;
 
 		if (sechdrs[i].sh_type != SHT_RELA)
 			continue;
@@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
 			continue;
 
-		count_max_entries(relas, num_rela, &num_plts, &num_gots);
+		/*
+		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
+		 * close together, so sort a copy of the section to avoid interfering.
+		 */
+		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
+		if (scratch_size_needed > scratch_size) {
+			scratch_size = scratch_size_needed;
+			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
+			if (!scratch)
+				return -ENOMEM;
+		}
+
+		for (size_t j = 0; j < num_relas; j++)
+			if (rela_needs_plt_got_entry(&relas[j]))
+				scratch[num_scratch_relas++] = relas[j];
+	}
+
+	if (scratch) {
+		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
+		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
+		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
+		kvfree(scratch);
 	}
 
 	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
```
perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.58%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e0340919fb ("arm64/module: Optimize module load time by
optimizing PLT counting").

Unlike the arm64 code, here the filtering and sorting is done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.
This allows accumulating PLT/GOT relocations across sections so sorting
and counting is only done once per module.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
---

Changes in v2:
 - Include R_RISCV_PLT32 relocations when computing the PLT size
 - Accumulate PLT/GOT relocations across relocation sections

 arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++------
 1 file changed, 65 insertions(+), 16 deletions(-)
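To make the first optimization concrete, here is a standalone userspace sketch of the sort-then-count-adjacent strategy the commit message describes, with qsort() standing in for the kernel's sort() and made-up relocation tuples:

```c
/* Minimal sketch: sort (info, addend) tuples so duplicates become
 * adjacent, then count unique entries in a single O(n) pass, replacing
 * the O(n^2) scan-all-predecessors approach. Data is illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct rela { uint64_t info; int64_t addend; };

static int cmp(const void *a, const void *b)
{
	const struct rela *x = a, *y = b;

	if (x->info != y->info)
		return x->info < y->info ? -1 : 1;
	if (x->addend != y->addend)
		return x->addend < y->addend ? -1 : 1;
	return 0;
}

int main(void)
{
	struct rela r[] = { {5, 0}, {3, 4}, {5, 0}, {3, 4}, {7, 8} };
	size_t n = sizeof(r) / sizeof(r[0]), unique = 0;

	qsort(r, n, sizeof(r[0]), cmp); /* duplicates are now adjacent */
	for (size_t i = 0; i < n; i++)
		if (i == 0 || cmp(&r[i], &r[i - 1]) != 0)
			unique++; /* only compare against the predecessor */
	printf("unique entries: %zu\n", unique); /* prints 3 */
	return 0;
}
```

The O(n log n) sort dominates, which is why sorting a filtered copy of ~300000 relocations beats the original quadratic duplicate scan by the margin reported above.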