[v2,3/3] riscv: module: Optimize PLT/GOT entry counting

Message ID 20250409171526.862481-3-samuel.holland@sifive.com (mailing list archive)
State New
Series [v2,1/3] riscv: module: Fix out-of-bounds relocation access

Checks

Context Check Description
bjorn/pre-ci_am success Success
bjorn/build-rv32-defconfig success build-rv32-defconfig
bjorn/build-rv64-clang-allmodconfig success build-rv64-clang-allmodconfig
bjorn/build-rv64-gcc-allmodconfig success build-rv64-gcc-allmodconfig
bjorn/build-rv64-nommu-k210-defconfig success build-rv64-nommu-k210-defconfig
bjorn/build-rv64-nommu-k210-virt success build-rv64-nommu-k210-virt
bjorn/checkpatch success checkpatch
bjorn/dtb-warn-rv64 success dtb-warn-rv64
bjorn/header-inline success header-inline
bjorn/kdoc success kdoc
bjorn/module-param success module-param
bjorn/verify-fixes success verify-fixes
bjorn/verify-signedoff success verify-signedoff

Commit Message

Samuel Holland April 9, 2025, 5:14 p.m. UTC
perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.
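
(For a sense of scale: the old duplicate check scans every preceding
entry for each candidate relocation, so n ≈ 300000 entries means on the
order of n²/2 ≈ 4.5e10 comparisons in the worst case.)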

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.58%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e0340919fb ("arm64/module: Optimize module load time by
optimizing PLT counting").
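
With the entries sorted, duplicate detection only has to compare each
entry against its immediate predecessor, so the total cost drops from
O(n^2) to O(n log n) for the sort plus a linear scan, and the filtering
shrinks n to just the PLT/GOT candidates before the sort runs.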

Unlike the arm64 code, here the filtering and sorting are done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.
This allows accumulating PLT/GOT relocations across sections, so sorting
and counting are only done once per module.
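
As a rough illustration of why the original order matters (hypothetical
example, not from this patch): each PCREL_LO12 relocation refers back to
the instruction carrying the matching HI20 relocation, which
apply_relocate_add() has to find in the relocation list:

.L0:	auipc	a0, %pcrel_hi(sym)	# R_RISCV_PCREL_HI20 (or GOT_HI20 via %got_pcrel_hi)
	addi	a0, a0, %pcrel_lo(.L0)	# R_RISCV_PCREL_LO12_I, refers back to .L0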

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
---

Changes in v2:
 - Include R_RISCV_PLT32 relocations when computing the PLT size
 - Accumulate PLT/GOT relocations across relocation sections

 arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++------
 1 file changed, 65 insertions(+), 16 deletions(-)

Comments

Andrew Jones April 9, 2025, 7:07 p.m. UTC | #1
On Wed, Apr 09, 2025 at 10:14:51AM -0700, Samuel Holland wrote:
> [...]
> -		count_max_entries(relas, num_rela, &num_plts, &num_gots);
> +		/*
> +		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
> +		 * close together, so sort a copy of the section to avoid interfering.
> +		 */
> +		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
> +		if (scratch_size_needed > scratch_size) {
> +			scratch_size = scratch_size_needed;

Maybe not worth it, but when i is less than ehdr->e_shnum - 1 (we still
have more sections to look at) we could use scratch_size_needed * 2, or
something, in order to hopefully reduce the number of times kvrealloc()
is called.
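
A minimal sketch of that idea (hypothetical, not part of the patch):

	/* grow geometrically to amortize kvrealloc() calls */
	scratch_size = max_t(size_t, scratch_size_needed, scratch_size * 2);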

> +			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
> +			if (!scratch)
> +				return -ENOMEM;
> +		}
> [...]

Reviewed-by: Andrew Jones <ajones@ventanamicro.com>

Thanks,
drew
Samuel Holland April 9, 2025, 7:18 p.m. UTC | #2
Hi Drew,

Thanks for the review again!

On 2025-04-09 2:07 PM, Andrew Jones wrote:
> On Wed, Apr 09, 2025 at 10:14:51AM -0700, Samuel Holland wrote:
>> [...]
>> -		count_max_entries(relas, num_rela, &num_plts, &num_gots);
>> +		/*
>> +		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
>> +		 * close together, so sort a copy of the section to avoid interfering.
>> +		 */
>> +		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
>> +		if (scratch_size_needed > scratch_size) {
>> +			scratch_size = scratch_size_needed;
> 
> Maybe not worth it, but when i is less than ehdr->e_shnum - 1 (we still
> have more sections to look at) we could use scratch_size_needed * 2, or
> something, in order to hopefully reduce the number of times kvrealloc()
> is called.

I tried to keep the code as simple as possible, so I suppose it's not obvious
what's going on here. The current code takes advantage of the fact that one of
the relocation sections (.rela.text) is generally an order of magnitude or more
larger than the others, is usually the first section processed, and much fewer
than half of the relocations require a PLT/GOT entry. So in the common case:

  num_relas (.rela.text) > num_scratch_relas (all sections) + num_relas (any other section)

and there will only be one allocation, for .rela.text.
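
Concretely, with the amdgpu numbers above: the first kvrealloc() sizes
the scratch buffer for all ~300000 .rela.text relocations, while
num_scratch_relas only grows by the filtered PLT/GOT subset, so the
later, much smaller sections fit in the capacity already allocated.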

Regards,
Samuel

Andrew Jones April 9, 2025, 7:24 p.m. UTC | #3
On Wed, Apr 09, 2025 at 02:18:29PM -0500, Samuel Holland wrote:
> [...]
> 
> I tried to keep the code as simple as possible, so I suppose it's not obvious
> what's going on here. The current code takes advantage of the fact that one of
> the relocation sections (.rela.text) is generally an order of magnitude or more
> larger than the others, is usually the first section processed, and much fewer
> than half of the relocations require a PLT/GOT entry. So in the common case:
> 
>   num_relas (.rela.text) > num_scratch_relas (all sections) + num_relas (any other section)
> 
> and there will only be one allocation, for .rela.text.

Sounds good.

Thanks,
drew


Patch

diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
index 91d0b355ceef..75551ac6504c 100644
--- a/arch/riscv/kernel/module-sections.c
+++ b/arch/riscv/kernel/module-sections.c
@@ -9,6 +9,7 @@ 
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/sort.h>
 
 unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
 {
@@ -55,44 +56,70 @@  unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
 	return (unsigned long)&plt[i];
 }
 
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
+#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
 {
-	return x->r_info == y->r_info && x->r_addend == y->r_addend;
+	const Elf_Rela *x = a, *y = b;
+	int i;
+
+	/* sort by type, symbol index and addend */
+	i = cmp_3way(x->r_info, y->r_info);
+	if (i == 0)
+		i = cmp_3way(x->r_addend, y->r_addend);
+	return i;
 }
 
 static bool duplicate_rela(const Elf_Rela *rela, int idx)
 {
-	int i;
-	for (i = 0; i < idx; i++) {
-		if (is_rela_equal(&rela[i], &rela[idx]))
-			return true;
-	}
-	return false;
+	/*
+	 * Entries are sorted by type, symbol index and addend. That means
+	 * that, if a duplicate entry exists, it must be in the preceding slot.
+	 */
+	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
 }
 
-static void count_max_entries(Elf_Rela *relas, int num,
+static void count_max_entries(const Elf_Rela *relas, size_t num,
 			      unsigned int *plts, unsigned int *gots)
 {
-	for (int i = 0; i < num; i++) {
+	for (size_t i = 0; i < num; i++) {
+		if (duplicate_rela(relas, i))
+			continue;
+
 		switch (ELF_R_TYPE(relas[i].r_info)) {
 		case R_RISCV_CALL_PLT:
 		case R_RISCV_PLT32:
-			if (!duplicate_rela(relas, i))
-				(*plts)++;
+			(*plts)++;
 			break;
 		case R_RISCV_GOT_HI20:
-			if (!duplicate_rela(relas, i))
-				(*gots)++;
+			(*gots)++;
 			break;
+		default:
+			unreachable();
 		}
 	}
 }
 
+static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
+{
+	switch (ELF_R_TYPE(rela->r_info)) {
+	case R_RISCV_CALL_PLT:
+	case R_RISCV_GOT_HI20:
+	case R_RISCV_PLT32:
+		return true;
+	default:
+		return false;
+	}
+}
+
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
+	size_t num_scratch_relas = 0;
 	unsigned int num_plts = 0;
 	unsigned int num_gots = 0;
+	Elf_Rela *scratch = NULL;
+	size_t scratch_size = 0;
 	int i;
 
 	/*
@@ -122,9 +149,10 @@  int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 
 	/* Calculate the maxinum number of entries */
 	for (i = 0; i < ehdr->e_shnum; i++) {
+		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
-		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
+		size_t scratch_size_needed;
 
 		if (sechdrs[i].sh_type != SHT_RELA)
 			continue;
@@ -133,7 +161,28 @@  int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
 			continue;
 
-		count_max_entries(relas, num_rela, &num_plts, &num_gots);
+		/*
+		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
+		 * close together, so sort a copy of the section to avoid interfering.
+		 */
+		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
+		if (scratch_size_needed > scratch_size) {
+			scratch_size = scratch_size_needed;
+			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
+			if (!scratch)
+				return -ENOMEM;
+		}
+
+		for (size_t j = 0; j < num_relas; j++)
+			if (rela_needs_plt_got_entry(&relas[j]))
+				scratch[num_scratch_relas++] = relas[j];
+	}
+
+	if (scratch) {
+		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
+		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
+		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
+		kvfree(scratch);
 	}
 
 	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
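
For reference, a standalone userspace sketch of the sort-then-scan
counting idea (hypothetical demo, not kernel code; qsort() stands in
for the kernel's sort()):

#include <stdio.h>
#include <stdlib.h>

/* simplified stand-in for Elf_Rela: just the fields the sort compares */
struct rela {
	unsigned long r_info;	/* packed relocation type + symbol index */
	long r_addend;
};

static int cmp_rela(const void *a, const void *b)
{
	const struct rela *x = a, *y = b;

	if (x->r_info != y->r_info)
		return x->r_info < y->r_info ? -1 : 1;
	if (x->r_addend != y->r_addend)
		return x->r_addend < y->r_addend ? -1 : 1;
	return 0;
}

int main(void)
{
	/* {1, 0} appears twice, so three unique entries */
	struct rela relas[] = { { 1, 0 }, { 2, 8 }, { 1, 0 }, { 1, 16 } };
	size_t n = sizeof(relas) / sizeof(relas[0]), unique = 0;

	qsort(relas, n, sizeof(relas[0]), cmp_rela);
	for (size_t i = 0; i < n; i++)
		if (i == 0 || cmp_rela(&relas[i], &relas[i - 1]) != 0)
			unique++;	/* duplicates are now adjacent */

	printf("%zu unique of %zu\n", unique, n);	/* prints "3 unique of 4" */
	return 0;
}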