diff mbox series

[bpf-next] uprobes: Fix the xol slots reserved for uretprobe trampoline

Message ID 20240619013411.756995-1-liaochang1@huawei.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series [bpf-next] uprobes: Fix the xol slots reserved for uretprobe trampoline | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/apply fail Patch does not apply to bpf-next-0
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat

Commit Message

Liao, Chang June 19, 2024, 1:34 a.m. UTC
When the new uretprobe system call was added [1], the xol slots reserved
for the uretprobe trampoline might be insufficient on some architectures.
For example, on arm64, the trampoline consists of at least three
instructions. So it should mark enough bits in area->bitmap and
area->slot_count for the reserved slots.

[1] https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/

Signed-off-by: Liao Chang <liaochang1@huawei.com>
---
 kernel/events/uprobes.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

Comments

Oleg Nesterov June 19, 2024, 2:38 p.m. UTC | #1
On 06/19, Liao Chang wrote:
>
> When the new uretprobe system call was added [1], the xol slots reserved
> for the uretprobe trampoline might be insufficient on some architecture.

Confused... this doesn't depend on the change above?

> For example, on arm64, the trampoline is consist of three instructions
> at least. So it should mark enough bits in area->bitmaps and
> and area->slot_count for the reserved slots.

Do you mean that on arm64 UPROBE_SWBP_INSN_SIZE > UPROBE_XOL_SLOT_BYTES ?

From arch/arm64/include/asm/uprobes.h

	#define MAX_UINSN_BYTES		AARCH64_INSN_SIZE

	#define UPROBE_SWBP_INSN	cpu_to_le32(BRK64_OPCODE_UPROBES)
	#define UPROBE_SWBP_INSN_SIZE	AARCH64_INSN_SIZE
	#define UPROBE_XOL_SLOT_BYTES	MAX_UINSN_BYTES

	typedef __le32 uprobe_opcode_t;

	struct arch_uprobe_task {
	};

	struct arch_uprobe {
		union {
			u8 insn[MAX_UINSN_BYTES];
			u8 ixol[MAX_UINSN_BYTES];

So it seems that UPROBE_SWBP_INSN_SIZE == MAX_UINSN_BYTES and it must
be less than UPROBE_XOL_SLOT_BYTES, otherwise

arch_uprobe_copy_ixol(..., uprobe->arch.ixol, sizeof(uprobe->arch.ixol))
in xol_get_insn_slot() won't fit the slot as well?

OTOH, it look as if UPROBE_SWBP_INSN_SIZE == UPROBE_XOL_SLOT_BYTES, so
I don't understand the problem...

Oleg.

> [1] https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/
> 
> Signed-off-by: Liao Chang <liaochang1@huawei.com>
> ---
>  kernel/events/uprobes.c | 11 +++++++----
>  1 file changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2816e65729ac..efd2d7f56622 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1485,7 +1485,7 @@ void * __weak arch_uprobe_trampoline(unsigned long *psize)
>  static struct xol_area *__create_xol_area(unsigned long vaddr)
>  {
>  	struct mm_struct *mm = current->mm;
> -	unsigned long insns_size;
> +	unsigned long insns_size, slot_nr;
>  	struct xol_area *area;
>  	void *insns;
>  
> @@ -1508,10 +1508,13 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>  
>  	area->vaddr = vaddr;
>  	init_waitqueue_head(&area->wq);
> -	/* Reserve the 1st slot for get_trampoline_vaddr() */
> -	set_bit(0, area->bitmap);
> -	atomic_set(&area->slot_count, 1);
>  	insns = arch_uprobe_trampoline(&insns_size);
> +	/* Reserve enough slots for the uretprobe trampoline */
> +	for (slot_nr = 0;
> +	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
> +	     slot_nr++)
> +		set_bit(slot_nr, area->bitmap);
> +	atomic_set(&area->slot_count, slot_nr);
>  	arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
>  
>  	if (!xol_add_vma(mm, area))
> -- 
> 2.34.1
>
Jiri Olsa June 19, 2024, 4:22 p.m. UTC | #2
On Wed, Jun 19, 2024 at 01:34:11AM +0000, Liao Chang wrote:
> When the new uretprobe system call was added [1], the xol slots reserved
> for the uretprobe trampoline might be insufficient on some architecture.

hum, uretprobe syscall is x86_64 specific, nothing was changed wrt slots
or other architectures.. could you be more specific in what's changed?

thanks,
jirka

> For example, on arm64, the trampoline is consist of three instructions
> at least. So it should mark enough bits in area->bitmaps and
> and area->slot_count for the reserved slots.
> 
> [1] https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/
> 
> Signed-off-by: Liao Chang <liaochang1@huawei.com>
> ---
>  kernel/events/uprobes.c | 11 +++++++----
>  1 file changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2816e65729ac..efd2d7f56622 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1485,7 +1485,7 @@ void * __weak arch_uprobe_trampoline(unsigned long *psize)
>  static struct xol_area *__create_xol_area(unsigned long vaddr)
>  {
>  	struct mm_struct *mm = current->mm;
> -	unsigned long insns_size;
> +	unsigned long insns_size, slot_nr;
>  	struct xol_area *area;
>  	void *insns;
>  
> @@ -1508,10 +1508,13 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>  
>  	area->vaddr = vaddr;
>  	init_waitqueue_head(&area->wq);
> -	/* Reserve the 1st slot for get_trampoline_vaddr() */
> -	set_bit(0, area->bitmap);
> -	atomic_set(&area->slot_count, 1);
>  	insns = arch_uprobe_trampoline(&insns_size);
> +	/* Reserve enough slots for the uretprobe trampoline */
> +	for (slot_nr = 0;
> +	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
> +	     slot_nr++)
> +		set_bit(slot_nr, area->bitmap);
> +	atomic_set(&area->slot_count, slot_nr);
>  	arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
>  
>  	if (!xol_add_vma(mm, area))
> -- 
> 2.34.1
>
Liao, Chang June 20, 2024, 2:39 a.m. UTC | #3
Hi, Oleg

在 2024/6/19 22:38, Oleg Nesterov 写道:
> On 06/19, Liao Chang wrote:
>>
>> When the new uretprobe system call was added [1], the xol slots reserved
>> for the uretprobe trampoline might be insufficient on some architecture.
> 
> Confused... this doesn't depend on the change above?

You are right, the uretprobe syscall is specific to x86_64. This patch wouldn't
address that issue. However, while I was porting the uretprobe trampoline to arm64
to explore its benefits on that architecture, I discovered the problem that a
single slot is not large enough for the trampoline code.

> 
>> For example, on arm64, the trampoline is consist of three instructions
>> at least. So it should mark enough bits in area->bitmaps and
>> and area->slot_count for the reserved slots.
> 
> Do you mean that on arm64 UPROBE_SWBP_INSN_SIZE > UPROBE_XOL_SLOT_BYTES ?
> 
>>From arch/arm64/include/asm/uprobes.h
> 
> 	#define MAX_UINSN_BYTES		AARCH64_INSN_SIZE
> 
> 	#define UPROBE_SWBP_INSN	cpu_to_le32(BRK64_OPCODE_UPROBES)
> 	#define UPROBE_SWBP_INSN_SIZE	AARCH64_INSN_SIZE
> 	#define UPROBE_XOL_SLOT_BYTES	MAX_UINSN_BYTES
> 
> 	typedef __le32 uprobe_opcode_t;
> 
> 	struct arch_uprobe_task {
> 	};
> 
> 	struct arch_uprobe {
> 		union {
> 			u8 insn[MAX_UINSN_BYTES];
> 			u8 ixol[MAX_UINSN_BYTES];
> 
> So it seems that UPROBE_SWBP_INSN_SIZE == MAX_UINSN_BYTES and it must
> be less than UPROBE_XOL_SLOT_BYTES, otherwise
> 
> arch_uprobe_copy_ixol(..., uprobe->arch.ixol, sizeof(uprobe->arch.ixol))
> in xol_get_insn_slot() won't fit the slot as well?
The real reason is that the current implementation seems to assume the
ixol slot has enough space for the uretprobe trampoline. This assumption
is true for x86_64, since the ixol slot is 128 bytes in size.

From arch/x86/include/asm/uprobes.h
	#define UPROBE_XOL_SLOT_BYTES	128

However, it is not large enough for arm64, the size of which is MAX_UINSN_BYTES
(4bytes).

From arch/arm64/include/asm/uprobes.h

 	#define MAX_UINSN_BYTES		AARCH64_INSN_SIZE // 4bytes
 	...
 	#define UPROBE_XOL_SLOT_BYTES	MAX_UINSN_BYTES

Here is an example of the uretprobe trampoline code used for arm64:

uretprobe_trampoline_for_arm64:
	str x8, [sp, #-8]!
	mov x8, __NR_uretprobe
	svc #0

The above code is 12 bytes in size, exceeding the capacity of a single ixol slot
on arm64, so three slots need to be reserved for the entire trampoline.

Thanks.

> 
> OTOH, it look as if UPROBE_SWBP_INSN_SIZE == UPROBE_XOL_SLOT_BYTES, so
> I don't understand the problem...
> 
> Oleg.
> 
>> [1] https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/
>>
>> Signed-off-by: Liao Chang <liaochang1@huawei.com>
>> ---
>>  kernel/events/uprobes.c | 11 +++++++----
>>  1 file changed, 7 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
>> index 2816e65729ac..efd2d7f56622 100644
>> --- a/kernel/events/uprobes.c
>> +++ b/kernel/events/uprobes.c
>> @@ -1485,7 +1485,7 @@ void * __weak arch_uprobe_trampoline(unsigned long *psize)
>>  static struct xol_area *__create_xol_area(unsigned long vaddr)
>>  {
>>  	struct mm_struct *mm = current->mm;
>> -	unsigned long insns_size;
>> +	unsigned long insns_size, slot_nr;
>>  	struct xol_area *area;
>>  	void *insns;
>>  
>> @@ -1508,10 +1508,13 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>>  
>>  	area->vaddr = vaddr;
>>  	init_waitqueue_head(&area->wq);
>> -	/* Reserve the 1st slot for get_trampoline_vaddr() */
>> -	set_bit(0, area->bitmap);
>> -	atomic_set(&area->slot_count, 1);
>>  	insns = arch_uprobe_trampoline(&insns_size);
>> +	/* Reserve enough slots for the uretprobe trampoline */
>> +	for (slot_nr = 0;
>> +	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
>> +	     slot_nr++)
>> +		set_bit(slot_nr, area->bitmap);
>> +	atomic_set(&area->slot_count, slot_nr);
>>  	arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
>>  
>>  	if (!xol_add_vma(mm, area))
>> -- 
>> 2.34.1
>>
> 
>
Liao, Chang June 20, 2024, 2:58 a.m. UTC | #4
Hi, Jiri

在 2024/6/20 0:22, Jiri Olsa 写道:
> On Wed, Jun 19, 2024 at 01:34:11AM +0000, Liao Chang wrote:
>> When the new uretprobe system call was added [1], the xol slots reserved
>> for the uretprobe trampoline might be insufficient on some architecture.
> 
> hum, uretprobe syscall is x86_64 specific, nothing was changed wrt slots
> or other architectures.. could you be more specific in what's changed?

I observed a significant performance degradation when using uprobes to trace Redis
on an arm64 machine. redis-benchmark showed a decrease of around 7% with uprobes
attached to two hot functions, and a much worse result with uprobes on more hot
functions. Here is a small snapshot of the benchmark results.

No uprobe
---------
SET: 73686.54 rps
GET: 73702.83 rps

Uprobes on two hot functions
----------------------------
SET: 68441.59 rps, -7.1%
GET: 68951.25 rps, -6.4%

Uprobes at three hot functions
------------------------------
SET: 40953.39 rps,-44.4%
GET: 41609.45 rps,-43.5%

To investigate the potential improvements, I ported the uretprobe syscall and
trampoline feature to arm64. The trampoline code used on arm64 looks like this:

uretprobe_trampoline_for_arm64:
	str x8, [sp, #-8]!
	mov x8, __NR_uretprobe
	svc #0

Since arm64 uses fixed-length instructions of 4 bytes, the total size of the
trampoline code is 12 bytes. As the ixol slot size is typically 4 bytes, the
mismatch between the slot size and the trampoline size requires more than one
slot to be reserved.

Thanks.

> 
> thanks,
> jirka
> 
>> For example, on arm64, the trampoline is consist of three instructions
>> at least. So it should mark enough bits in area->bitmaps and
>> and area->slot_count for the reserved slots.
>>
>> [1] https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/
>>
>> Signed-off-by: Liao Chang <liaochang1@huawei.com>
>> ---
>>  kernel/events/uprobes.c | 11 +++++++----
>>  1 file changed, 7 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
>> index 2816e65729ac..efd2d7f56622 100644
>> --- a/kernel/events/uprobes.c
>> +++ b/kernel/events/uprobes.c
>> @@ -1485,7 +1485,7 @@ void * __weak arch_uprobe_trampoline(unsigned long *psize)
>>  static struct xol_area *__create_xol_area(unsigned long vaddr)
>>  {
>>  	struct mm_struct *mm = current->mm;
>> -	unsigned long insns_size;
>> +	unsigned long insns_size, slot_nr;
>>  	struct xol_area *area;
>>  	void *insns;
>>  
>> @@ -1508,10 +1508,13 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>>  
>>  	area->vaddr = vaddr;
>>  	init_waitqueue_head(&area->wq);
>> -	/* Reserve the 1st slot for get_trampoline_vaddr() */
>> -	set_bit(0, area->bitmap);
>> -	atomic_set(&area->slot_count, 1);
>>  	insns = arch_uprobe_trampoline(&insns_size);
>> +	/* Reserve enough slots for the uretprobe trampoline */
>> +	for (slot_nr = 0;
>> +	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
>> +	     slot_nr++)
>> +		set_bit(slot_nr, area->bitmap);
>> +	atomic_set(&area->slot_count, slot_nr);
>>  	arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
>>  
>>  	if (!xol_add_vma(mm, area))
>> -- 
>> 2.34.1
>>
Oleg Nesterov June 20, 2024, 8:36 a.m. UTC | #5
On 06/20, Liao, Chang wrote:
>
> However, when i asm porting uretprobe trampoline to arm64
> to explore its benefits on that architecture, i discovered the problem that
> single slot is not large enought for trampoline code.

Ah, but then I'd suggest to make the changelog more clear. It looks as
if the problem was introduced by the patch from Jiri. Note that we was
confused as well ;)

And,

	+	/* Reserve enough slots for the uretprobe trampoline */
	+	for (slot_nr = 0;
	+	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
	+	     slot_nr++)

this doesn't look right. Just suppose that insns_size = UPROBE_XOL_SLOT_BYTES + 1.
I'd suggest DIV_ROUND_UP(insns_size, UPROBE_XOL_SLOT_BYTES).

And perhaps it would be better to send this change along with
uretprobe_trampoline_for_arm64 ?

Oleg.
Jiri Olsa June 20, 2024, 9:06 a.m. UTC | #6
On Thu, Jun 20, 2024 at 10:36:02AM +0200, Oleg Nesterov wrote:
> On 06/20, Liao, Chang wrote:
> >
> > However, when i asm porting uretprobe trampoline to arm64
> > to explore its benefits on that architecture, i discovered the problem that
> > single slot is not large enought for trampoline code.

ah ok, makes sense now.. x86_64 has the slot big enough for the trampoline,
but arm64 does not

> 
> Ah, but then I'd suggest to make the changelog more clear. It looks as
> if the problem was introduced by the patch from Jiri. Note that we was
> confused as well ;)
> 
> And,
> 
> 	+	/* Reserve enough slots for the uretprobe trampoline */
> 	+	for (slot_nr = 0;
> 	+	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
> 	+	     slot_nr++)
> 
> this doesn't look right. Just suppose that insns_size = UPROBE_XOL_SLOT_BYTES + 1.
> I'd suggest DIV_ROUND_UP(insns_size, UPROBE_XOL_SLOT_BYTES).
> 
> And perhaps it would be better to send this change along with
> uretprobe_trampoline_for_arm64 ?

+1, also I'm curious what's the gain on arm64?

thanks,
jirka
Liao, Chang June 20, 2024, 9:58 a.m. UTC | #7
在 2024/6/20 16:36, Oleg Nesterov 写道:
> On 06/20, Liao, Chang wrote:
>>
>> However, when i asm porting uretprobe trampoline to arm64
>> to explore its benefits on that architecture, i discovered the problem that
>> single slot is not large enought for trampoline code.
> 
> Ah, but then I'd suggest to make the changelog more clear. It looks as
> if the problem was introduced by the patch from Jiri. Note that we was
> confused as well ;)

Perhaps I should use 'RFC' in the subject line, instead of 'PATCH'?

> 
> And,
> 
> 	+	/* Reserve enough slots for the uretprobe trampoline */
> 	+	for (slot_nr = 0;
> 	+	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
> 	+	     slot_nr++)
> 
> this doesn't look right. Just suppose that insns_size = UPROBE_XOL_SLOT_BYTES + 1.
> I'd suggest DIV_ROUND_UP(insns_size, UPROBE_XOL_SLOT_BYTES).

Oh, what a stupid mistake on my part. Thanks for pointing that out.

> 
> And perhaps it would be better to send this change along with
> uretprobe_trampoline_for_arm64 ?

Sure, I will send them out in the next revision.


> 
> Oleg.
>
Oleg Nesterov June 20, 2024, 10:52 a.m. UTC | #8
On 06/20, Liao, Chang wrote:
>
> 在 2024/6/20 16:36, Oleg Nesterov 写道:
> > On 06/20, Liao, Chang wrote:
> >>
> >> However, when i asm porting uretprobe trampoline to arm64
> >> to explore its benefits on that architecture, i discovered the problem that
> >> single slot is not large enought for trampoline code.
> >
> > Ah, but then I'd suggest to make the changelog more clear. It looks as
> > if the problem was introduced by the patch from Jiri. Note that we was
> > confused as well ;)
>
> Perhaps i should use 'RFC' in the subject line, instead of 'PATCH'?

Well. IMO the changelog should explain that the current code is correct,
but you are going to change arm64 and arch_uprobe_trampoline(psize) on
arm64 can return with *psize > UPROBE_XOL_SLOT_BYTES.

Oleg.
Liao, Chang June 20, 2024, 11:27 a.m. UTC | #9
在 2024/6/20 17:06, Jiri Olsa 写道:
> On Thu, Jun 20, 2024 at 10:36:02AM +0200, Oleg Nesterov wrote:
>> On 06/20, Liao, Chang wrote:
>>>
>>> However, when i asm porting uretprobe trampoline to arm64
>>> to explore its benefits on that architecture, i discovered the problem that
>>> single slot is not large enought for trampoline code.
> 
> ah ok, makes sense now.. x86_64 has the slot big enough for the trampoline,
> but arm64 does not
> 
>>
>> Ah, but then I'd suggest to make the changelog more clear. It looks as
>> if the problem was introduced by the patch from Jiri. Note that we was
>> confused as well ;)
>>
>> And,
>>
>> 	+	/* Reserve enough slots for the uretprobe trampoline */
>> 	+	for (slot_nr = 0;
>> 	+	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
>> 	+	     slot_nr++)
>>
>> this doesn't look right. Just suppose that insns_size = UPROBE_XOL_SLOT_BYTES + 1.
>> I'd suggest DIV_ROUND_UP(insns_size, UPROBE_XOL_SLOT_BYTES).
>>
>> And perhaps it would be better to send this change along with
>> uretprobe_trampoline_for_arm64 ?
> 
> +1, also I'm curious what's the gain on arm64?

I am currently finalizing the uretprobe trampoline and syscall implementation on arm64.
While I have addressed most of the issues, there are still a few bugs that require more
effort. Once these are fixed, I will use Redis to evaluate the performance gains on arm64.
In the next revision, I will submit a patchset that includes all relevant code changes,
testcases and benchmark data, which will allow a comprehensive review and discussion.

> 
> thanks,
> jirka
Liao, Chang June 20, 2024, 11:58 a.m. UTC | #10
在 2024/6/20 18:52, Oleg Nesterov 写道:
> On 06/20, Liao, Chang wrote:
>>
>> 在 2024/6/20 16:36, Oleg Nesterov 写道:
>>> On 06/20, Liao, Chang wrote:
>>>>
>>>> However, when i asm porting uretprobe trampoline to arm64
>>>> to explore its benefits on that architecture, i discovered the problem that
>>>> single slot is not large enought for trampoline code.
>>>
>>> Ah, but then I'd suggest to make the changelog more clear. It looks as
>>> if the problem was introduced by the patch from Jiri. Note that we was
>>> confused as well ;)
>>
>> Perhaps i should use 'RFC' in the subject line, instead of 'PATCH'?
> 
> Well. IMO the changelog should explain that the current code is correct,
> but you are going to change arm64 and arch_uprobe_trampoline(psize) on
> arm64 can return with *psize > UPROBE_XOL_SLOT_BYTES.

You are absolutely right, Here is a draft of the revised changelog for your
suggestion:

    uprobes: Adjust the xol slots reserved for the uretprobe trampoline on arm64

    Adjust the ixol slot reservation logic to accommodate the uretprobe
    trampoline on arm64, which is potentially larger than a single xol slot.
    This ensures the trampoline code fits within the reserved memory and
    avoids potential errors.

> 
> Oleg.
> 
>
Liao, Chang July 3, 2024, 8:54 a.m. UTC | #11
Hi Jiri and Oleg,

在 2024/6/20 19:27, Liao, Chang 写道:
> 
> 
> 在 2024/6/20 17:06, Jiri Olsa 写道:
>> On Thu, Jun 20, 2024 at 10:36:02AM +0200, Oleg Nesterov wrote:
>>> On 06/20, Liao, Chang wrote:
>>>>
>>>> However, when i asm porting uretprobe trampoline to arm64
>>>> to explore its benefits on that architecture, i discovered the problem that
>>>> single slot is not large enought for trampoline code.
>>
>> ah ok, makes sense now.. x86_64 has the slot big enough for the trampoline,
>> but arm64 does not
>>
>>>
>>> Ah, but then I'd suggest to make the changelog more clear. It looks as
>>> if the problem was introduced by the patch from Jiri. Note that we was
>>> confused as well ;)
>>>
>>> And,
>>>
>>> 	+	/* Reserve enough slots for the uretprobe trampoline */
>>> 	+	for (slot_nr = 0;
>>> 	+	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
>>> 	+	     slot_nr++)
>>>
>>> this doesn't look right. Just suppose that insns_size = UPROBE_XOL_SLOT_BYTES + 1.
>>> I'd suggest DIV_ROUND_UP(insns_size, UPROBE_XOL_SLOT_BYTES).
>>>
>>> And perhaps it would be better to send this change along with
>>> uretprobe_trampoline_for_arm64 ?
>>
>> +1, also I'm curious what's the gain on arm64?
> 
> I am currently finalizing the uretprobe trampoline and syscall implementation on arm64.
> While i have addressed most of issues, there are stiil a few bugs that reguire more effort.
> Once these are fixed, i will use Redis to evaluate the performance gains on arm64. In the
> next revision, i will submit a patchset that includes all relevant code changs, testcases
> and benchmark data, which will allows a comprehensive review and dicussion.

This is an update on the development of the uretprobe syscall for the ARM64 architecture.

I've recently completed a basic implementation of the uretprobe syscall and trampoline
code for ARM64, with the uprobe syscall selftest passing. This allowed me to revisit the
performance benchmark using uretprobes. Running the Redis benchmark on a Kunpeng server,
I observed a slight performance gain with the uretprobe syscall on ARM64. The performance
test spawned a Redis-server and Redis-benchmark on separate cores within the same NUMA
node. The Redis-server handled each SET and GET request from Redis-benchmark, which triggered
three uretprobe events via an attached bpftrace program that increments a counter.
Here is the benchmark result:

On Kunpeng916 (Hi1616), 4 NUMA nodes, 64 Cores @ 2.4GHz :

-------------------------------------------------------------------------------
Test case       |  No uretprobe |  uretprobe(breakpoint) |  uretprobe (syscall)
===============================================================================
Redis SET (RPS) |  47025        |  40619   -13.6%        |  40670   -13.5%
-------------------------------------------------------------------------------
Redis GET (RPS) |  46715        |  41426   -11.3%        |  41274   -11.6%
-------------------------------------------------------------------------------

The detailed test scripts and bpf program are available upon any request.

Additionally, I've attempted to optimize the implementation of the uretprobe syscall and
trampoline, but the cause of the lower-than-expected performance gain compared to x86
remains unclear. Further investigation is necessary to identify potential bottlenecks or
inefficiencies specific to ARM64. I would be grateful for any insights or suggestions the
community might have on the potential reasons for the performance difference between
ARM64 and x86. The patch for the uretprobe syscall is attached below for reference.

---------------------------%<----------------------------
diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile
index 8e4be92e25b1..059f38c0857f 100644
--- a/arch/arm64/kernel/probes/Makefile
+++ b/arch/arm64/kernel/probes/Makefile
@@ -3,4 +3,5 @@ obj-$(CONFIG_KPROBES)		+= kprobes.o decode-insn.o	\
 				   kprobes_trampoline.o		\
 				   simulate-insn.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o decode-insn.o	\
+				   uprobes_trampoline.o		\
 				   simulate-insn.o
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index d49aef2657cd..632f97afd50f 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -5,12 +5,69 @@
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
 #include <linux/uprobes.h>
+#include <linux/syscalls.h>
 #include <asm/cacheflush.h>

 #include "decode-insn.h"

 #define UPROBE_INV_FAULT_CODE	UINT_MAX

+extern char uretprobe_trampoline[] __read_mostly;
+extern char uretprobe_trampoline_end[] __read_mostly;
+extern char uretprobe_trampoline_svc[] __read_mostly;
+
+void *arch_uprobe_trampoline(unsigned long *psize)
+{
+	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+	struct pt_regs *regs = task_pt_regs(current);
+
+	if (!compat_user_mode(regs)) {
+		*psize = uretprobe_trampoline_end - uretprobe_trampoline;
+		return uretprobe_trampoline;
+	}
+
+	*psize = UPROBE_SWBP_INSN_SIZE;
+	return &insn;
+}
+
+static unsigned long syscall_at_uprobe_trampoline(void)
+{
+	unsigned long tramp = uprobe_get_trampoline_vaddr();
+
+	return tramp + (uretprobe_trampoline_svc - uretprobe_trampoline);
+}
+
+SYSCALL_DEFINE0(uretprobe)
+{
+	int err;
+	struct pt_regs *regs = task_pt_regs(current);
+
+	if (compat_user_mode(regs))
+		goto sigill;
+
+	/* ensure uretprobe syscall invoked from uretprobe trampoline */
+	if ((regs->pc - AARCH64_INSN_SIZE) != syscall_at_uprobe_trampoline())
+		goto sigill;
+
+	/* restore the clobbered context used to invoke uretprobe syscall */
+	err = copy_from_user(&regs->regs[8], (void __user *)(regs->sp - 8),
+			     sizeof(regs->regs[8]));
+	if (err)
+		goto sigill;
+
+	uprobe_handle_trampoline(regs);
+
+	/* restore the real LR before return to the caller. */
+	regs->regs[30] = regs->pc;
+
+	/* use the real return value */
+	return regs->regs[0];
+
+sigill:
+	force_sig(SIGILL);
+	return -1;
+}
+
 void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 		void *src, unsigned long len)
 {
diff --git a/arch/arm64/kernel/probes/uprobes_trampoline.S b/arch/arm64/kernel/probes/uprobes_trampoline.S
new file mode 100644
index 000000000000..670d4d9e97ec
--- /dev/null
+++ b/arch/arm64/kernel/probes/uprobes_trampoline.S
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * trampoline entry and return code for uretprobes.
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-bug.h>
+#include <asm/assembler.h>
+#include <asm/unistd.h>
+
+	.text
+
+SYM_CODE_START(uretprobe_trampoline)
+	str x8, [sp, #-8]
+	mov x8, #__NR_uretprobe
+
+SYM_CODE_START(uretprobe_trampoline_svc)
+	svc #0x000
+
+SYM_CODE_START(uretprobe_trampoline_end)
+	nop
--------------------------->%----------------------------

Thanks.

> 
>>
>> thanks,
>> jirka
>
diff mbox series

Patch

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2816e65729ac..efd2d7f56622 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1485,7 +1485,7 @@  void * __weak arch_uprobe_trampoline(unsigned long *psize)
 static struct xol_area *__create_xol_area(unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long insns_size;
+	unsigned long insns_size, slot_nr;
 	struct xol_area *area;
 	void *insns;
 
@@ -1508,10 +1508,13 @@  static struct xol_area *__create_xol_area(unsigned long vaddr)
 
 	area->vaddr = vaddr;
 	init_waitqueue_head(&area->wq);
-	/* Reserve the 1st slot for get_trampoline_vaddr() */
-	set_bit(0, area->bitmap);
-	atomic_set(&area->slot_count, 1);
 	insns = arch_uprobe_trampoline(&insns_size);
+	/* Reserve enough slots for the uretprobe trampoline */
+	for (slot_nr = 0;
+	     slot_nr < max((insns_size / UPROBE_XOL_SLOT_BYTES), 1);
+	     slot_nr++)
+		set_bit(slot_nr, area->bitmap);
+	atomic_set(&area->slot_count, slot_nr);
 	arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
 
 	if (!xol_add_vma(mm, area))