diff mbox series

[bpf-next,v3,4/7] bpf, arm64: Impelment bpf_arch_text_poke() for arm64

Message ID 20220424154028.1698685-5-xukuohai@huawei.com (mailing list archive)
State New, archived
Headers show
Series bpf trampoline for arm64 | expand

Commit Message

Xu Kuohai April 24, 2022, 3:40 p.m. UTC
Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
it to replace nop with jump, or replace jump with nop.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Acked-by: Song Liu <songliubraving@fb.com>
---
 arch/arm64/net/bpf_jit_comp.c | 63 +++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

Comments

Jakub Sitnicki May 10, 2022, 11:45 a.m. UTC | #1
On Sun, Apr 24, 2022 at 11:40 AM -04, Xu Kuohai wrote:
> Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
> it to replace nop with jump, or replace jump with nop.
>
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> Acked-by: Song Liu <songliubraving@fb.com>
> ---
>  arch/arm64/net/bpf_jit_comp.c | 63 +++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 8ab4035dea27..3f9bdfec54c4 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -9,6 +9,7 @@
>  
>  #include <linux/bitfield.h>
>  #include <linux/bpf.h>
> +#include <linux/memory.h>
>  #include <linux/filter.h>
>  #include <linux/printk.h>
>  #include <linux/slab.h>
> @@ -18,6 +19,7 @@
>  #include <asm/cacheflush.h>
>  #include <asm/debug-monitors.h>
>  #include <asm/insn.h>
> +#include <asm/patching.h>
>  #include <asm/set_memory.h>
>  
>  #include "bpf_jit.h"
> @@ -1529,3 +1531,64 @@ void bpf_jit_free_exec(void *addr)
>  {
>  	return vfree(addr);
>  }
> +
> +static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
> +			     void *addr, u32 *insn)
> +{
> +	if (!addr)
> +		*insn = aarch64_insn_gen_nop();
> +	else
> +		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
> +						    (unsigned long)addr,
> +						    type);
> +
> +	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
> +}
> +
> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
> +		       void *old_addr, void *new_addr)
> +{
> +	int ret;
> +	u32 old_insn;
> +	u32 new_insn;
> +	u32 replaced;
> +	enum aarch64_insn_branch_type branch_type;
> +
> +	if (!is_bpf_text_address((long)ip))
> +		/* Only poking bpf text is supported. Since kernel function
> +		 * entry is set up by ftrace, we reply on ftrace to poke kernel
> +		 * functions. For kernel funcitons, bpf_arch_text_poke() is only

Nit: s/funcitons/functions/

> +		 * called after a failed poke with ftrace. In this case, there
> +		 * is probably something wrong with fentry, so there is nothing
> +		 * we can do here. See register_fentry, unregister_fentry and
> +		 * modify_fentry for details.
> +		 */
> +		return -EINVAL;
> +
> +	if (poke_type == BPF_MOD_CALL)
> +		branch_type = AARCH64_INSN_BRANCH_LINK;
> +	else
> +		branch_type = AARCH64_INSN_BRANCH_NOLINK;
> +
> +	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
> +		return -EFAULT;
> +
> +	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
> +		return -EFAULT;
> +
> +	mutex_lock(&text_mutex);
> +	if (aarch64_insn_read(ip, &replaced)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (replaced != old_insn) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	ret = aarch64_insn_patch_text_nosync((void *)ip, new_insn);

Nit: No need for the explicit cast to void *. Type already matches.

> +out:
> +	mutex_unlock(&text_mutex);
> +	return ret;
> +}
Xu Kuohai May 11, 2022, 3:18 a.m. UTC | #2
On 5/10/2022 7:45 PM, Jakub Sitnicki wrote:
> On Sun, Apr 24, 2022 at 11:40 AM -04, Xu Kuohai wrote:
>> Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
>> it to replace nop with jump, or replace jump with nop.
>>
>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>> Acked-by: Song Liu <songliubraving@fb.com>
>> ---
>>  arch/arm64/net/bpf_jit_comp.c | 63 +++++++++++++++++++++++++++++++++++
>>  1 file changed, 63 insertions(+)
>>
>> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
>> index 8ab4035dea27..3f9bdfec54c4 100644
>> --- a/arch/arm64/net/bpf_jit_comp.c
>> +++ b/arch/arm64/net/bpf_jit_comp.c
>> @@ -9,6 +9,7 @@
>>  
>>  #include <linux/bitfield.h>
>>  #include <linux/bpf.h>
>> +#include <linux/memory.h>
>>  #include <linux/filter.h>
>>  #include <linux/printk.h>
>>  #include <linux/slab.h>
>> @@ -18,6 +19,7 @@
>>  #include <asm/cacheflush.h>
>>  #include <asm/debug-monitors.h>
>>  #include <asm/insn.h>
>> +#include <asm/patching.h>
>>  #include <asm/set_memory.h>
>>  
>>  #include "bpf_jit.h"
>> @@ -1529,3 +1531,64 @@ void bpf_jit_free_exec(void *addr)
>>  {
>>  	return vfree(addr);
>>  }
>> +
>> +static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
>> +			     void *addr, u32 *insn)
>> +{
>> +	if (!addr)
>> +		*insn = aarch64_insn_gen_nop();
>> +	else
>> +		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
>> +						    (unsigned long)addr,
>> +						    type);
>> +
>> +	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
>> +}
>> +
>> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
>> +		       void *old_addr, void *new_addr)
>> +{
>> +	int ret;
>> +	u32 old_insn;
>> +	u32 new_insn;
>> +	u32 replaced;
>> +	enum aarch64_insn_branch_type branch_type;
>> +
>> +	if (!is_bpf_text_address((long)ip))
>> +		/* Only poking bpf text is supported. Since kernel function
>> +		 * entry is set up by ftrace, we reply on ftrace to poke kernel
>> +		 * functions. For kernel funcitons, bpf_arch_text_poke() is only
> 
> Nit: s/funcitons/functions/
> 
>> +		 * called after a failed poke with ftrace. In this case, there
>> +		 * is probably something wrong with fentry, so there is nothing
>> +		 * we can do here. See register_fentry, unregister_fentry and
>> +		 * modify_fentry for details.
>> +		 */
>> +		return -EINVAL;
>> +
>> +	if (poke_type == BPF_MOD_CALL)
>> +		branch_type = AARCH64_INSN_BRANCH_LINK;
>> +	else
>> +		branch_type = AARCH64_INSN_BRANCH_NOLINK;
>> +
>> +	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
>> +		return -EFAULT;
>> +
>> +	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
>> +		return -EFAULT;
>> +
>> +	mutex_lock(&text_mutex);
>> +	if (aarch64_insn_read(ip, &replaced)) {
>> +		ret = -EFAULT;
>> +		goto out;
>> +	}
>> +
>> +	if (replaced != old_insn) {
>> +		ret = -EFAULT;
>> +		goto out;
>> +	}
>> +
>> +	ret = aarch64_insn_patch_text_nosync((void *)ip, new_insn);
> 
> Nit: No need for the explicit cast to void *. Type already matches.
> 
>> +out:
>> +	mutex_unlock(&text_mutex);
>> +	return ret;
>> +}
> 
> .
will fix in v4, thanks!
Mark Rutland May 13, 2022, 2:59 p.m. UTC | #3
On Sun, Apr 24, 2022 at 11:40:25AM -0400, Xu Kuohai wrote:
> Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
> it to replace nop with jump, or replace jump with nop.
> 
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> Acked-by: Song Liu <songliubraving@fb.com>
> ---
>  arch/arm64/net/bpf_jit_comp.c | 63 +++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
> 
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 8ab4035dea27..3f9bdfec54c4 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -9,6 +9,7 @@
>  
>  #include <linux/bitfield.h>
>  #include <linux/bpf.h>
> +#include <linux/memory.h>
>  #include <linux/filter.h>
>  #include <linux/printk.h>
>  #include <linux/slab.h>
> @@ -18,6 +19,7 @@
>  #include <asm/cacheflush.h>
>  #include <asm/debug-monitors.h>
>  #include <asm/insn.h>
> +#include <asm/patching.h>
>  #include <asm/set_memory.h>
>  
>  #include "bpf_jit.h"
> @@ -1529,3 +1531,64 @@ void bpf_jit_free_exec(void *addr)
>  {
>  	return vfree(addr);
>  }
> +
> +static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
> +			     void *addr, u32 *insn)
> +{
> +	if (!addr)
> +		*insn = aarch64_insn_gen_nop();
> +	else
> +		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
> +						    (unsigned long)addr,
> +						    type);
> +
> +	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
> +}
> +
> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
> +		       void *old_addr, void *new_addr)
> +{
> +	int ret;
> +	u32 old_insn;
> +	u32 new_insn;
> +	u32 replaced;
> +	enum aarch64_insn_branch_type branch_type;
> +
> +	if (!is_bpf_text_address((long)ip))
> +		/* Only poking bpf text is supported. Since kernel function
> +		 * entry is set up by ftrace, we reply on ftrace to poke kernel
> +		 * functions. For kernel funcitons, bpf_arch_text_poke() is only
> +		 * called after a failed poke with ftrace. In this case, there
> +		 * is probably something wrong with fentry, so there is nothing
> +		 * we can do here. See register_fentry, unregister_fentry and
> +		 * modify_fentry for details.
> +		 */
> +		return -EINVAL;

If you rely on ftrace to poke functions, why do you need to patch text
at all? Why does the rest of this function exist?

I really don't like having another piece of code outside of ftrace
patching the ftrace patch-site; this needs a much better explanation.

> +
> +	if (poke_type == BPF_MOD_CALL)
> +		branch_type = AARCH64_INSN_BRANCH_LINK;
> +	else
> +		branch_type = AARCH64_INSN_BRANCH_NOLINK;
> +
> +	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
> +		return -EFAULT;
> +
> +	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
> +		return -EFAULT;
> +
> +	mutex_lock(&text_mutex);
> +	if (aarch64_insn_read(ip, &replaced)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (replaced != old_insn) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	ret = aarch64_insn_patch_text_nosync((void *)ip, new_insn);

... and where does the actual synchronization come from in this case?

Thanks,
Mark.

> +out:
> +	mutex_unlock(&text_mutex);
> +	return ret;
> +}
> -- 
> 2.30.2
>
Xu Kuohai May 16, 2022, 6:55 a.m. UTC | #4
On 5/13/2022 10:59 PM, Mark Rutland wrote:
> On Sun, Apr 24, 2022 at 11:40:25AM -0400, Xu Kuohai wrote:
>> Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
>> it to replace nop with jump, or replace jump with nop.
>>
>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>> Acked-by: Song Liu <songliubraving@fb.com>
>> ---
>>  arch/arm64/net/bpf_jit_comp.c | 63 +++++++++++++++++++++++++++++++++++
>>  1 file changed, 63 insertions(+)
>>
>> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
>> index 8ab4035dea27..3f9bdfec54c4 100644
>> --- a/arch/arm64/net/bpf_jit_comp.c
>> +++ b/arch/arm64/net/bpf_jit_comp.c
>> @@ -9,6 +9,7 @@
>>  
>>  #include <linux/bitfield.h>
>>  #include <linux/bpf.h>
>> +#include <linux/memory.h>
>>  #include <linux/filter.h>
>>  #include <linux/printk.h>
>>  #include <linux/slab.h>
>> @@ -18,6 +19,7 @@
>>  #include <asm/cacheflush.h>
>>  #include <asm/debug-monitors.h>
>>  #include <asm/insn.h>
>> +#include <asm/patching.h>
>>  #include <asm/set_memory.h>
>>  
>>  #include "bpf_jit.h"
>> @@ -1529,3 +1531,64 @@ void bpf_jit_free_exec(void *addr)
>>  {
>>  	return vfree(addr);
>>  }
>> +
>> +static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
>> +			     void *addr, u32 *insn)
>> +{
>> +	if (!addr)
>> +		*insn = aarch64_insn_gen_nop();
>> +	else
>> +		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
>> +						    (unsigned long)addr,
>> +						    type);
>> +
>> +	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
>> +}
>> +
>> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
>> +		       void *old_addr, void *new_addr)
>> +{
>> +	int ret;
>> +	u32 old_insn;
>> +	u32 new_insn;
>> +	u32 replaced;
>> +	enum aarch64_insn_branch_type branch_type;
>> +
>> +	if (!is_bpf_text_address((long)ip))
>> +		/* Only poking bpf text is supported. Since kernel function
>> +		 * entry is set up by ftrace, we reply on ftrace to poke kernel
>> +		 * functions. For kernel funcitons, bpf_arch_text_poke() is only
>> +		 * called after a failed poke with ftrace. In this case, there
>> +		 * is probably something wrong with fentry, so there is nothing
>> +		 * we can do here. See register_fentry, unregister_fentry and
>> +		 * modify_fentry for details.
>> +		 */
>> +		return -EINVAL;
> 
> If you rely on ftrace to poke functions, why do you need to patch text
> at all? Why does the rest of this function exist?
> 
> I really don't like having another piece of code outside of ftrace
> patching the ftrace patch-site; this needs a much better explanation.
> 

Sorry for the incorrect explaination in the comment. I don't think it's
reasonable to patch ftrace patch-site without ftrace code either.

The patching logic in register_fentry, unregister_fentry and
modify_fentry is as follows:

if (tr->func.ftrace_managed)
        ret = register_ftrace_direct((long)ip, (long)new_addr);
else
        ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr,
                                 true);

ftrace patch-site is patched by ftrace code. bpf_arch_text_poke() is
only used to patch bpf prog and bpf trampoline, which are not managed by
ftrace.

>> +
>> +	if (poke_type == BPF_MOD_CALL)
>> +		branch_type = AARCH64_INSN_BRANCH_LINK;
>> +	else
>> +		branch_type = AARCH64_INSN_BRANCH_NOLINK;
>> +
>> +	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
>> +		return -EFAULT;
>> +
>> +	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
>> +		return -EFAULT;
>> +
>> +	mutex_lock(&text_mutex);
>> +	if (aarch64_insn_read(ip, &replaced)) {
>> +		ret = -EFAULT;
>> +		goto out;
>> +	}
>> +
>> +	if (replaced != old_insn) {
>> +		ret = -EFAULT;
>> +		goto out;
>> +	}
>> +
>> +	ret = aarch64_insn_patch_text_nosync((void *)ip, new_insn);
> 
> ... and where does the actual synchronization come from in this case?
> 

aarch64_insn_patch_text_nosync() replaces an instruction atomically, so
no other CPUs will fetch a half-new and half-old instruction.

The scenario here is that there is a chance that another CPU fetches the
old instruction after bpf_arch_text_poke() finishes, that is, different
CPUs may execute different versions of instructions at the same time.

1. When a new trampoline is attached, it doesn't seem to be an issue for
different CPUs to jump to different trampolines temporarily.

2. When an old trampoline is freed, we should wait for all other CPUs to
exit the trampoline and make sure the trampoline is no longer reachable,
IIUC, bpf_tramp_image_put() function already uses percpu_ref and rcu
tasks to do this.

> Thanks,
> Mark.
> 
>> +out:
>> +	mutex_unlock(&text_mutex);
>> +	return ret;
>> +}
>> -- 
>> 2.30.2
>>
> .
Mark Rutland May 16, 2022, 7:18 a.m. UTC | #5
On Mon, May 16, 2022 at 02:55:46PM +0800, Xu Kuohai wrote:
> On 5/13/2022 10:59 PM, Mark Rutland wrote:
> > On Sun, Apr 24, 2022 at 11:40:25AM -0400, Xu Kuohai wrote:
> >> Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
> >> it to replace nop with jump, or replace jump with nop.
> >>
> >> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> >> Acked-by: Song Liu <songliubraving@fb.com>
> >> ---
> >>  arch/arm64/net/bpf_jit_comp.c | 63 +++++++++++++++++++++++++++++++++++
> >>  1 file changed, 63 insertions(+)
> >>
> >> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> >> index 8ab4035dea27..3f9bdfec54c4 100644
> >> --- a/arch/arm64/net/bpf_jit_comp.c
> >> +++ b/arch/arm64/net/bpf_jit_comp.c
> >> @@ -9,6 +9,7 @@
> >>  
> >>  #include <linux/bitfield.h>
> >>  #include <linux/bpf.h>
> >> +#include <linux/memory.h>
> >>  #include <linux/filter.h>
> >>  #include <linux/printk.h>
> >>  #include <linux/slab.h>
> >> @@ -18,6 +19,7 @@
> >>  #include <asm/cacheflush.h>
> >>  #include <asm/debug-monitors.h>
> >>  #include <asm/insn.h>
> >> +#include <asm/patching.h>
> >>  #include <asm/set_memory.h>
> >>  
> >>  #include "bpf_jit.h"
> >> @@ -1529,3 +1531,64 @@ void bpf_jit_free_exec(void *addr)
> >>  {
> >>  	return vfree(addr);
> >>  }
> >> +
> >> +static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
> >> +			     void *addr, u32 *insn)
> >> +{
> >> +	if (!addr)
> >> +		*insn = aarch64_insn_gen_nop();
> >> +	else
> >> +		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
> >> +						    (unsigned long)addr,
> >> +						    type);
> >> +
> >> +	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
> >> +}
> >> +
> >> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
> >> +		       void *old_addr, void *new_addr)
> >> +{
> >> +	int ret;
> >> +	u32 old_insn;
> >> +	u32 new_insn;
> >> +	u32 replaced;
> >> +	enum aarch64_insn_branch_type branch_type;
> >> +
> >> +	if (!is_bpf_text_address((long)ip))
> >> +		/* Only poking bpf text is supported. Since kernel function
> >> +		 * entry is set up by ftrace, we reply on ftrace to poke kernel
> >> +		 * functions. For kernel funcitons, bpf_arch_text_poke() is only
> >> +		 * called after a failed poke with ftrace. In this case, there
> >> +		 * is probably something wrong with fentry, so there is nothing
> >> +		 * we can do here. See register_fentry, unregister_fentry and
> >> +		 * modify_fentry for details.
> >> +		 */
> >> +		return -EINVAL;
> > 
> > If you rely on ftrace to poke functions, why do you need to patch text
> > at all? Why does the rest of this function exist?
> > 
> > I really don't like having another piece of code outside of ftrace
> > patching the ftrace patch-site; this needs a much better explanation.
> > 
> 
> Sorry for the incorrect explaination in the comment. I don't think it's
> reasonable to patch ftrace patch-site without ftrace code either.
> 
> The patching logic in register_fentry, unregister_fentry and
> modify_fentry is as follows:
> 
> if (tr->func.ftrace_managed)
>         ret = register_ftrace_direct((long)ip, (long)new_addr);
> else
>         ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr,
>                                  true);
> 
> ftrace patch-site is patched by ftrace code. bpf_arch_text_poke() is
> only used to patch bpf prog and bpf trampoline, which are not managed by
> ftrace.

Sorry, I had misunderstood. Thanks for the correction!

I'll have another look with that in mind.

> >> +
> >> +	if (poke_type == BPF_MOD_CALL)
> >> +		branch_type = AARCH64_INSN_BRANCH_LINK;
> >> +	else
> >> +		branch_type = AARCH64_INSN_BRANCH_NOLINK;
> >> +
> >> +	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
> >> +		return -EFAULT;
> >> +
> >> +	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
> >> +		return -EFAULT;
> >> +
> >> +	mutex_lock(&text_mutex);
> >> +	if (aarch64_insn_read(ip, &replaced)) {
> >> +		ret = -EFAULT;
> >> +		goto out;
> >> +	}
> >> +
> >> +	if (replaced != old_insn) {
> >> +		ret = -EFAULT;
> >> +		goto out;
> >> +	}
> >> +
> >> +	ret = aarch64_insn_patch_text_nosync((void *)ip, new_insn);
> > 
> > ... and where does the actual synchronization come from in this case?
> 
> aarch64_insn_patch_text_nosync() replaces an instruction atomically, so
> no other CPUs will fetch a half-new and half-old instruction.
> 
> The scenario here is that there is a chance that another CPU fetches the
> old instruction after bpf_arch_text_poke() finishes, that is, different
> CPUs may execute different versions of instructions at the same time.
> 
> 1. When a new trampoline is attached, it doesn't seem to be an issue for
> different CPUs to jump to different trampolines temporarily.
>
> 2. When an old trampoline is freed, we should wait for all other CPUs to
> exit the trampoline and make sure the trampoline is no longer reachable,
> IIUC, bpf_tramp_image_put() function already uses percpu_ref and rcu
> tasks to do this.

It would be good to have a comment for these points.

Thanks,
Mark.
Xu Kuohai May 16, 2022, 7:58 a.m. UTC | #6
On 5/16/2022 3:18 PM, Mark Rutland wrote:
> On Mon, May 16, 2022 at 02:55:46PM +0800, Xu Kuohai wrote:
>> On 5/13/2022 10:59 PM, Mark Rutland wrote:
>>> On Sun, Apr 24, 2022 at 11:40:25AM -0400, Xu Kuohai wrote:
>>>> Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
>>>> it to replace nop with jump, or replace jump with nop.
>>>>
>>>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>>>> Acked-by: Song Liu <songliubraving@fb.com>
>>>> ---
>>>>  arch/arm64/net/bpf_jit_comp.c | 63 +++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 63 insertions(+)
>>>>
>>>> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
>>>> index 8ab4035dea27..3f9bdfec54c4 100644
>>>> --- a/arch/arm64/net/bpf_jit_comp.c
>>>> +++ b/arch/arm64/net/bpf_jit_comp.c
>>>> @@ -9,6 +9,7 @@
>>>>  
>>>>  #include <linux/bitfield.h>
>>>>  #include <linux/bpf.h>
>>>> +#include <linux/memory.h>
>>>>  #include <linux/filter.h>
>>>>  #include <linux/printk.h>
>>>>  #include <linux/slab.h>
>>>> @@ -18,6 +19,7 @@
>>>>  #include <asm/cacheflush.h>
>>>>  #include <asm/debug-monitors.h>
>>>>  #include <asm/insn.h>
>>>> +#include <asm/patching.h>
>>>>  #include <asm/set_memory.h>
>>>>  
>>>>  #include "bpf_jit.h"
>>>> @@ -1529,3 +1531,64 @@ void bpf_jit_free_exec(void *addr)
>>>>  {
>>>>  	return vfree(addr);
>>>>  }
>>>> +
>>>> +static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
>>>> +			     void *addr, u32 *insn)
>>>> +{
>>>> +	if (!addr)
>>>> +		*insn = aarch64_insn_gen_nop();
>>>> +	else
>>>> +		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
>>>> +						    (unsigned long)addr,
>>>> +						    type);
>>>> +
>>>> +	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
>>>> +}
>>>> +
>>>> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
>>>> +		       void *old_addr, void *new_addr)
>>>> +{
>>>> +	int ret;
>>>> +	u32 old_insn;
>>>> +	u32 new_insn;
>>>> +	u32 replaced;
>>>> +	enum aarch64_insn_branch_type branch_type;
>>>> +
>>>> +	if (!is_bpf_text_address((long)ip))
>>>> +		/* Only poking bpf text is supported. Since kernel function
>>>> +		 * entry is set up by ftrace, we reply on ftrace to poke kernel
>>>> +		 * functions. For kernel funcitons, bpf_arch_text_poke() is only
>>>> +		 * called after a failed poke with ftrace. In this case, there
>>>> +		 * is probably something wrong with fentry, so there is nothing
>>>> +		 * we can do here. See register_fentry, unregister_fentry and
>>>> +		 * modify_fentry for details.
>>>> +		 */
>>>> +		return -EINVAL;
>>>
>>> If you rely on ftrace to poke functions, why do you need to patch text
>>> at all? Why does the rest of this function exist?
>>>
>>> I really don't like having another piece of code outside of ftrace
>>> patching the ftrace patch-site; this needs a much better explanation.
>>>
>>
>> Sorry for the incorrect explaination in the comment. I don't think it's
>> reasonable to patch ftrace patch-site without ftrace code either.
>>
>> The patching logic in register_fentry, unregister_fentry and
>> modify_fentry is as follows:
>>
>> if (tr->func.ftrace_managed)
>>         ret = register_ftrace_direct((long)ip, (long)new_addr);
>> else
>>         ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr,
>>                                  true);
>>
>> ftrace patch-site is patched by ftrace code. bpf_arch_text_poke() is
>> only used to patch bpf prog and bpf trampoline, which are not managed by
>> ftrace.
> 
> Sorry, I had misunderstood. Thanks for the correction!
> 
> I'll have another look with that in mind.
>>>>> +
>>>> +	if (poke_type == BPF_MOD_CALL)
>>>> +		branch_type = AARCH64_INSN_BRANCH_LINK;
>>>> +	else
>>>> +		branch_type = AARCH64_INSN_BRANCH_NOLINK;
>>>> +
>>>> +	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
>>>> +		return -EFAULT;
>>>> +
>>>> +	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
>>>> +		return -EFAULT;
>>>> +
>>>> +	mutex_lock(&text_mutex);
>>>> +	if (aarch64_insn_read(ip, &replaced)) {
>>>> +		ret = -EFAULT;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	if (replaced != old_insn) {
>>>> +		ret = -EFAULT;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	ret = aarch64_insn_patch_text_nosync((void *)ip, new_insn);
>>>
>>> ... and where does the actual synchronization come from in this case?
>>
>> aarch64_insn_patch_text_nosync() replaces an instruction atomically, so
>> no other CPUs will fetch a half-new and half-old instruction.
>>
>> The scenario here is that there is a chance that another CPU fetches the
>> old instruction after bpf_arch_text_poke() finishes, that is, different
>> CPUs may execute different versions of instructions at the same time.
>>
>> 1. When a new trampoline is attached, it doesn't seem to be an issue for
>> different CPUs to jump to different trampolines temporarily.
>>
>> 2. When an old trampoline is freed, we should wait for all other CPUs to
>> exit the trampoline and make sure the trampoline is no longer reachable,
>> IIUC, bpf_tramp_image_put() function already uses percpu_ref and rcu
>> tasks to do this.
> 
> It would be good to have a comment for these points>

will add a comment for this in v4, thanks!

> Thanks,
> Mark.
> .
diff mbox series

Patch

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 8ab4035dea27..3f9bdfec54c4 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -9,6 +9,7 @@ 
 
 #include <linux/bitfield.h>
 #include <linux/bpf.h>
+#include <linux/memory.h>
 #include <linux/filter.h>
 #include <linux/printk.h>
 #include <linux/slab.h>
@@ -18,6 +19,7 @@ 
 #include <asm/cacheflush.h>
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
+#include <asm/patching.h>
 #include <asm/set_memory.h>
 
 #include "bpf_jit.h"
@@ -1529,3 +1531,64 @@  void bpf_jit_free_exec(void *addr)
 {
 	return vfree(addr);
 }
+
+static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
+			     void *addr, u32 *insn)
+{
+	if (!addr)
+		*insn = aarch64_insn_gen_nop();
+	else
+		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
+						    (unsigned long)addr,
+						    type);
+
+	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
+}
+
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
+		       void *old_addr, void *new_addr)
+{
+	int ret;
+	u32 old_insn;
+	u32 new_insn;
+	u32 replaced;
+	enum aarch64_insn_branch_type branch_type;
+
+	if (!is_bpf_text_address((long)ip))
+		/* Only poking bpf text is supported. Since kernel function
+		 * entry is set up by ftrace, we reply on ftrace to poke kernel
+		 * functions. For kernel funcitons, bpf_arch_text_poke() is only
+		 * called after a failed poke with ftrace. In this case, there
+		 * is probably something wrong with fentry, so there is nothing
+		 * we can do here. See register_fentry, unregister_fentry and
+		 * modify_fentry for details.
+		 */
+		return -EINVAL;
+
+	if (poke_type == BPF_MOD_CALL)
+		branch_type = AARCH64_INSN_BRANCH_LINK;
+	else
+		branch_type = AARCH64_INSN_BRANCH_NOLINK;
+
+	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
+		return -EFAULT;
+
+	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
+		return -EFAULT;
+
+	mutex_lock(&text_mutex);
+	if (aarch64_insn_read(ip, &replaced)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (replaced != old_insn) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = aarch64_insn_patch_text_nosync((void *)ip, new_insn);
+out:
+	mutex_unlock(&text_mutex);
+	return ret;
+}