
[bpf-next] bpf, arm64: Support up to 12 function arguments

Message ID 20230917150752.69612-1-xukuohai@huaweicloud.com (mailing list archive)
State New, archived
Series [bpf-next] bpf, arm64: Support up to 12 function arguments

Commit Message

Xu Kuohai Sept. 17, 2023, 3:07 p.m. UTC
From: Xu Kuohai <xukuohai@huawei.com>

Currently the arm64 bpf trampoline supports up to 8 function arguments.
According to the statistics from commit
473e3150e30a ("bpf, x86: allow function arguments up to 12 for TRACING"),
there are about 200 functions that accept 9 to 12 arguments, so add
support for up to 12 function arguments.

Since bpf only supports function arguments up to 16 bytes, according to
AAPCS64, starting from the first argument, each argument is first
attempted to be passed in the 1 or 2 lowest available registers from
x0-x7; if there are not enough registers left to hold the entire
argument, then that argument and all remaining arguments are passed on
the stack.
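
As an illustration (a sketch, not part of the patch; f is a hypothetical
all-scalar function):

/* a..h are passed in registers x0-x7, i..l are passed on the stack */
void f(u64 a, u64 b, u64 c, u64 d, u64 e, u64 f, u64 g, u64 h,
       u64 i, u64 j, u64 k, u64 l);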

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
---
 arch/arm64/net/bpf_jit_comp.c                | 171 ++++++++++++++-----
 tools/testing/selftests/bpf/DENYLIST.aarch64 |   2 -
 2 files changed, 131 insertions(+), 42 deletions(-)

Comments

Florent Revest Sept. 20, 2023, 10:17 p.m. UTC | #1
On Sun, Sep 17, 2023 at 5:09 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>
> From: Xu Kuohai <xukuohai@huawei.com>
>
> Currently arm64 bpf trampoline supports up to 8 function arguments.
> According to the statistics from commit
> 473e3150e30a ("bpf, x86: allow function arguments up to 12 for TRACING"),
> there are about 200 functions accept 9 to 12 arguments, so adding support
> for up to 12 function arguments.

Thank you Xu, this will be a nice addition! :)

> Due to bpf only supports function arguments up to 16 bytes, according to
> AAPCS64, starting from the first argument, each argument is first
> attempted to be loaded to 1 or 2 smallest registers from x0-x7, if there
> are no enough registers to hold the entire argument, then all remaining
> arguments starting from this one are pushed to the stack for passing.

If I read the section 6.8.2 of the AAPCS64 correctly, there is a
corner case which I believe isn't covered by this logic.

void f(u128 a, u128 b, u128 c, u64 d, u128 e, u64 f) {}
- a goes on x0 and x1
- b goes on x2 and x3
- c goes on x4 and x5
- d goes on x6
- e spills on the stack because it doesn't fit in the remaining regs
- f goes on x7

Maybe it would be good to add something pathological like this to the
selftests ?

Otherwise I only have minor nitpicks.

> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> ---
>  arch/arm64/net/bpf_jit_comp.c                | 171 ++++++++++++++-----
>  tools/testing/selftests/bpf/DENYLIST.aarch64 |   2 -
>  2 files changed, 131 insertions(+), 42 deletions(-)
>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 7d4af64e3982..a0cf526b07ea 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1705,7 +1705,7 @@ bool bpf_jit_supports_subprog_tailcalls(void)
>  }
>
>  static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
> -                           int args_off, int retval_off, int run_ctx_off,
> +                           int bargs_off, int retval_off, int run_ctx_off,
>                             bool save_ret)
>  {
>         __le32 *branch;
> @@ -1747,7 +1747,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
>         /* save return value to callee saved register x20 */
>         emit(A64_MOV(1, A64_R(20), A64_R(0)), ctx);
>
> -       emit(A64_ADD_I(1, A64_R(0), A64_SP, args_off), ctx);
> +       emit(A64_ADD_I(1, A64_R(0), A64_SP, bargs_off), ctx);
>         if (!p->jited)
>                 emit_addr_mov_i64(A64_R(1), (const u64)p->insnsi, ctx);
>
> @@ -1772,7 +1772,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
>  }
>
>  static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
> -                              int args_off, int retval_off, int run_ctx_off,
> +                              int bargs_off, int retval_off, int run_ctx_off,
>                                __le32 **branches)
>  {
>         int i;
> @@ -1782,7 +1782,7 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
>          */
>         emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx);
>         for (i = 0; i < tl->nr_links; i++) {
> -               invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off,
> +               invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off,
>                                 run_ctx_off, true);
>                 /* if (*(u64 *)(sp + retval_off) !=  0)
>                  *      goto do_fexit;
> @@ -1796,23 +1796,111 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
>         }
>  }
>
> -static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
> +struct arg_aux {
> +       /* how many args are passed through registers, the rest args are

the rest of the* args

> +        * passed through stack
> +        */
> +       int args_in_reg;

Maybe args_in_regs ? since args can go in multiple regs

> +       /* how many registers used for passing arguments */

are* used

> +       int regs_for_arg;

And here regs_for_args ? Since it's the number of registers used for all args

> +       /* how many stack slots used for arguments, each slot is 8 bytes */

are* used

> +       int stack_slots_for_arg;

And here stack_slots_for_args, for the same reason as above?

> +};
> +
> +static void calc_arg_aux(const struct btf_func_model *m,
> +                        struct arg_aux *a)
>  {
>         int i;
> +       int nregs;
> +       int slots;
> +       int stack_slots;
> +
> +       /* verifier ensures m->nr_args <= MAX_BPF_FUNC_ARGS */
> +       for (i = 0, nregs = 0; i < m->nr_args; i++) {
> +               slots = (m->arg_size[i] + 7) / 8;
> +               if (nregs + slots <= 8) /* passed through register ? */
> +                       nregs += slots;
> +               else
> +                       break;
> +       }
> +
> +       a->args_in_reg = i;
> +       a->regs_for_arg = nregs;
>
> -       for (i = 0; i < nregs; i++) {
> -               emit(A64_STR64I(i, A64_SP, args_off), ctx);
> -               args_off += 8;
> +       /* the rest arguments are passed through stack */
> +       for (stack_slots = 0; i < m->nr_args; i++)
> +               stack_slots += (m->arg_size[i] + 7) / 8;
> +
> +       a->stack_slots_for_arg = stack_slots;
> +}
> +
> +static void clear_garbage(struct jit_ctx *ctx, int reg, int effective_bytes)
> +{
> +       if (effective_bytes) {
> +               int garbage_bits = 64 - 8 * effective_bytes;
> +#ifdef CONFIG_CPU_BIG_ENDIAN
> +               /* garbage bits are at the right end */
> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
> +#else
> +               /* garbage bits are at the left end */
> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
> +#endif
>         }
>  }
>
> -static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
> +static void save_args(struct jit_ctx *ctx, int bargs_off, int oargs_off,
> +                     const struct btf_func_model *m,
> +                     const struct arg_aux *a,
> +                     bool for_call_origin)
>  {
>         int i;
> +       int reg;
> +       int doff;
> +       int soff;
> +       int slots;
> +       u8 tmp = bpf2a64[TMP_REG_1];
> +
> +       /* store argument registers to stack for call bpf, or restore argument

to* call bpf or "for the bpf program"

> +        * registers from stack for the original function
> +        */
> +       for (reg = 0; reg < a->regs_for_arg; reg++) {
> +               emit(for_call_origin ?
> +                    A64_LDR64I(reg, A64_SP, bargs_off) :
> +                    A64_STR64I(reg, A64_SP, bargs_off),
> +                    ctx);
> +               bargs_off += 8;
> +       }
>
> -       for (i = 0; i < nregs; i++) {
> -               emit(A64_LDR64I(i, A64_SP, args_off), ctx);
> -               args_off += 8;
> +       soff = 32; /* on stack arguments start from FP + 32 */
> +       doff = (for_call_origin ? oargs_off : bargs_off);
> +
> +       /* save on stack arguments */
> +       for (i = a->args_in_reg; i < m->nr_args; i++) {
> +               slots = (m->arg_size[i] + 7) / 8;
> +               /* verifier ensures arg_size <= 16, so slots equals 1 or 2 */
> +               while (slots-- > 0) {
> +                       emit(A64_LDR64I(tmp, A64_FP, soff), ctx);
> +                       /* if there is unused space in the last slot, clear
> +                        * the garbage contained in the space.
> +                        */
> +                       if (slots == 0 && !for_call_origin)
> +                               clear_garbage(ctx, tmp, m->arg_size[i] % 8);
> +                       emit(A64_STR64I(tmp, A64_SP, doff), ctx);
> +                       soff += 8;
> +                       doff += 8;
> +               }
> +       }
> +}
> +
> +static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs)
> +{
> +       int reg;
> +
> +       for (reg = 0; reg < nregs; reg++) {
> +               emit(A64_LDR64I(reg, A64_SP, bargs_off), ctx);
> +               bargs_off += 8;
>         }
>  }
>
> @@ -1829,17 +1917,21 @@ static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
>   */
>  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>                               struct bpf_tramp_links *tlinks, void *orig_call,
> -                             int nregs, u32 flags)
> +                             const struct btf_func_model *m,
> +                             const struct arg_aux *a,
> +                             u32 flags)
>  {
>         int i;
>         int stack_size;
>         int retaddr_off;
>         int regs_off;
>         int retval_off;
> -       int args_off;
> +       int bargs_off;
>         int nregs_off;
>         int ip_off;
>         int run_ctx_off;
> +       int oargs_off;
> +       int nregs;
>         struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
>         struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
>         struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
> @@ -1859,19 +1951,26 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>          *
>          * SP + retval_off  [ return value      ] BPF_TRAMP_F_CALL_ORIG or
>          *                                        BPF_TRAMP_F_RET_FENTRY_RET
> -        *
>          *                  [ arg reg N         ]
>          *                  [ ...               ]
> -        * SP + args_off    [ arg reg 1         ]
> +        * SP + bargs_off   [ arg reg 1         ] for bpf
>          *
>          * SP + nregs_off   [ arg regs count    ]
>          *
>          * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
>          *
>          * SP + run_ctx_off [ bpf_tramp_run_ctx ]
> +        *
> +        *                  [ stack arg N       ]
> +        *                  [ ...               ]
> +        * SP + oargs_off   [ stack arg 1       ] for original func
>          */
>
>         stack_size = 0;
> +       oargs_off = stack_size;
> +       if (flags & BPF_TRAMP_F_CALL_ORIG)
> +               stack_size += 8 * a->stack_slots_for_arg;
> +
>         run_ctx_off = stack_size;
>         /* room for bpf_tramp_run_ctx */
>         stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
> @@ -1885,9 +1984,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>         /* room for args count */
>         stack_size += 8;
>
> -       args_off = stack_size;
> +       bargs_off = stack_size;
>         /* room for args */
> -       stack_size += nregs * 8;
> +       nregs = a->regs_for_arg + a->stack_slots_for_arg;

Maybe this name no longer makes sense ?
Xu Kuohai Sept. 21, 2023, 5:21 a.m. UTC | #2
Hi Florent,

On 9/21/2023 6:17 AM, Florent Revest wrote:
> On Sun, Sep 17, 2023 at 5:09 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>>
>> From: Xu Kuohai <xukuohai@huawei.com>
>>
>> Currently arm64 bpf trampoline supports up to 8 function arguments.
>> According to the statistics from commit
>> 473e3150e30a ("bpf, x86: allow function arguments up to 12 for TRACING"),
>> there are about 200 functions accept 9 to 12 arguments, so adding support
>> for up to 12 function arguments.
> 
> Thank you Xu, this will be a nice addition! :)
>

Thanks a lot for your review and for your time.

>> Due to bpf only supports function arguments up to 16 bytes, according to
>> AAPCS64, starting from the first argument, each argument is first
>> attempted to be loaded to 1 or 2 smallest registers from x0-x7, if there
>> are no enough registers to hold the entire argument, then all remaining
>> arguments starting from this one are pushed to the stack for passing.
> 
> If I read the section 6.8.2 of the AAPCS64 correctly, there is a
> corner case which I believe isn't covered by this logic.
> 
> void f(u128 a, u128 b, u128, c, u64 d, u128 e, u64 f) {}
> - a goes on x0 and x1
> - b goes on x2 and x3
> - c goes on x4 and x5
> - d goes on x6
> - e spills on the stack because it doesn't fit in the remaining regs
> - f goes on x7
>

I guess you might have overlooked rule C.13 in AAPCS64. Non-floating-point
arguments are copied to the stack under rule C.15/C.17. However, C.13 is
applied before C.15/C.17, which means that the NGRN is set to 8 before the
stack is used. That is, all 8 parameter registers are used up, and any
remaining arguments can only be passed on the stack.

C.13    The NGRN is set to 8.

C.14    The NSAA is rounded up to the larger of 8 or the Natural Alignment of the
         argument’s type.

C.15    If the argument is a composite type then the argument is copied to memory
         at the adjusted NSAA. The NSAA is incremented by the size of the argument.
         The argument has now been allocated.

C.16    If the size of the argument is less than 8 bytes then the size of the argument
         is set to 8 bytes. The effect is as if the argument was copied to the least
         significant bits of a 64-bit register and the remaining bits filled with
         unspecified values.

C.17    The argument is copied to memory at the adjusted NSAA. The NSAA is incremented
         by the size of the argument. The argument has now been allocated.


And the following assembly code also shows that 'e' and 'f' are passed on
the stack.

int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f)
{
         return e == 5 || f == 7;
}

assembly:

func:
         sub     sp, sp, #64
         stp     x0, x1, [sp, 48]
         stp     x2, x3, [sp, 32]
         stp     x4, x5, [sp, 16]
         str     x6, [sp, 8]
         ldr     x0, [sp, 64] // ** load the low 8 bytes of e from SP + 64 **
         cmp     x0, 5
         bne     .L27
         ldr     x0, [sp, 72] // ** load the high 8 bytes of e from SP + 72 **
         cmp     x0, 0
         beq     .L22
.L27:
         ldr     x0, [sp, 80] // ** load f from SP + 80 **
         cmp     x0, 7
         bne     .L24
.L22:
         mov     w0, 1
         b       .L26
.L24:
         mov     w0, 0
.L26:
         add     sp, sp, 64
         ret

Although the above case is fine, the current patch does not handle rule C.14
correctly. For example, for the following func, 8 bytes of padding are
inserted between f and g by C.14 to align g to 16 bytes, but this patch
mistakenly treats the padding as part of g.

int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f, __int128 g)
{
}
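
To make the C.14 effect concrete, the caller's outgoing argument area for
this func would look like this (a sketch, assuming the stack argument area
starts 16-byte aligned at offset 0):

        [SP +  0]  e (low 8 bytes)
        [SP +  8]  e (high 8 bytes)
        [SP + 16]  f
        [SP + 24]  8-byte padding inserted by C.14
        [SP + 32]  g (low 8 bytes, now 16-byte aligned)
        [SP + 40]  g (high 8 bytes)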

Maybe we could fix it by adding argument alignment to struct btf_func_model;
I'll give it a try.

> Maybe it would be good to add something pathological like this to the
> selftests ?
>

OK, will do

> Otherwise I only have minor nitpicks.
> 
>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>> ---
>>   arch/arm64/net/bpf_jit_comp.c                | 171 ++++++++++++++-----
>>   tools/testing/selftests/bpf/DENYLIST.aarch64 |   2 -
>>   2 files changed, 131 insertions(+), 42 deletions(-)
>>
>> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
>> index 7d4af64e3982..a0cf526b07ea 100644
>> --- a/arch/arm64/net/bpf_jit_comp.c
>> +++ b/arch/arm64/net/bpf_jit_comp.c
>> @@ -1705,7 +1705,7 @@ bool bpf_jit_supports_subprog_tailcalls(void)
>>   }
>>
>>   static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
>> -                           int args_off, int retval_off, int run_ctx_off,
>> +                           int bargs_off, int retval_off, int run_ctx_off,
>>                              bool save_ret)
>>   {
>>          __le32 *branch;
>> @@ -1747,7 +1747,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
>>          /* save return value to callee saved register x20 */
>>          emit(A64_MOV(1, A64_R(20), A64_R(0)), ctx);
>>
>> -       emit(A64_ADD_I(1, A64_R(0), A64_SP, args_off), ctx);
>> +       emit(A64_ADD_I(1, A64_R(0), A64_SP, bargs_off), ctx);
>>          if (!p->jited)
>>                  emit_addr_mov_i64(A64_R(1), (const u64)p->insnsi, ctx);
>>
>> @@ -1772,7 +1772,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
>>   }
>>
>>   static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
>> -                              int args_off, int retval_off, int run_ctx_off,
>> +                              int bargs_off, int retval_off, int run_ctx_off,
>>                                 __le32 **branches)
>>   {
>>          int i;
>> @@ -1782,7 +1782,7 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
>>           */
>>          emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx);
>>          for (i = 0; i < tl->nr_links; i++) {
>> -               invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off,
>> +               invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off,
>>                                  run_ctx_off, true);
>>                  /* if (*(u64 *)(sp + retval_off) !=  0)
>>                   *      goto do_fexit;
>> @@ -1796,23 +1796,111 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
>>          }
>>   }
>>
>> -static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
>> +struct arg_aux {
>> +       /* how many args are passed through registers, the rest args are
> 
> the rest of the* args
> 
>> +        * passed through stack
>> +        */
>> +       int args_in_reg;
> 
> Maybe args_in_regs ? since args can go in multiple regs
> 
>> +       /* how many registers used for passing arguments */
> 
> are* used
> 
>> +       int regs_for_arg;
> 
> And here regs_for_args ? Since It's the number of registers used for all args
> 
>> +       /* how many stack slots used for arguments, each slot is 8 bytes */
> 
> are* used
> 
>> +       int stack_slots_for_arg;
> 
> And here stack_slots_for_args, for the same reason as above?
> 
>> +};
>> +
>> +static void calc_arg_aux(const struct btf_func_model *m,
>> +                        struct arg_aux *a)
>>   {
>>          int i;
>> +       int nregs;
>> +       int slots;
>> +       int stack_slots;
>> +
>> +       /* verifier ensures m->nr_args <= MAX_BPF_FUNC_ARGS */
>> +       for (i = 0, nregs = 0; i < m->nr_args; i++) {
>> +               slots = (m->arg_size[i] + 7) / 8;
>> +               if (nregs + slots <= 8) /* passed through register ? */
>> +                       nregs += slots;
>> +               else
>> +                       break;
>> +       }
>> +
>> +       a->args_in_reg = i;
>> +       a->regs_for_arg = nregs;
>>
>> -       for (i = 0; i < nregs; i++) {
>> -               emit(A64_STR64I(i, A64_SP, args_off), ctx);
>> -               args_off += 8;
>> +       /* the rest arguments are passed through stack */
>> +       for (stack_slots = 0; i < m->nr_args; i++)
>> +               stack_slots += (m->arg_size[i] + 7) / 8;
>> +
>> +       a->stack_slots_for_arg = stack_slots;
>> +}
>> +
>> +static void clear_garbage(struct jit_ctx *ctx, int reg, int effective_bytes)
>> +{
>> +       if (effective_bytes) {
>> +               int garbage_bits = 64 - 8 * effective_bytes;
>> +#ifdef CONFIG_CPU_BIG_ENDIAN
>> +               /* garbage bits are at the right end */
>> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
>> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
>> +#else
>> +               /* garbage bits are at the left end */
>> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
>> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
>> +#endif
>>          }
>>   }
>>
>> -static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
>> +static void save_args(struct jit_ctx *ctx, int bargs_off, int oargs_off,
>> +                     const struct btf_func_model *m,
>> +                     const struct arg_aux *a,
>> +                     bool for_call_origin)
>>   {
>>          int i;
>> +       int reg;
>> +       int doff;
>> +       int soff;
>> +       int slots;
>> +       u8 tmp = bpf2a64[TMP_REG_1];
>> +
>> +       /* store argument registers to stack for call bpf, or restore argument
> 
> to* call bpf or "for the bpf program"
> 

Sorry for these incorrect words :( All will be fixed in the next version, thanks!

>> +        * registers from stack for the original function
>> +        */
>> +       for (reg = 0; reg < a->regs_for_arg; reg++) {
>> +               emit(for_call_origin ?
>> +                    A64_LDR64I(reg, A64_SP, bargs_off) :
>> +                    A64_STR64I(reg, A64_SP, bargs_off),
>> +                    ctx);
>> +               bargs_off += 8;
>> +       }
>>
>> -       for (i = 0; i < nregs; i++) {
>> -               emit(A64_LDR64I(i, A64_SP, args_off), ctx);
>> -               args_off += 8;
>> +       soff = 32; /* on stack arguments start from FP + 32 */
>> +       doff = (for_call_origin ? oargs_off : bargs_off);
>> +
>> +       /* save on stack arguments */
>> +       for (i = a->args_in_reg; i < m->nr_args; i++) {
>> +               slots = (m->arg_size[i] + 7) / 8;
>> +               /* verifier ensures arg_size <= 16, so slots equals 1 or 2 */
>> +               while (slots-- > 0) {
>> +                       emit(A64_LDR64I(tmp, A64_FP, soff), ctx);
>> +                       /* if there is unused space in the last slot, clear
>> +                        * the garbage contained in the space.
>> +                        */
>> +                       if (slots == 0 && !for_call_origin)
>> +                               clear_garbage(ctx, tmp, m->arg_size[i] % 8);
>> +                       emit(A64_STR64I(tmp, A64_SP, doff), ctx);
>> +                       soff += 8;
>> +                       doff += 8;
>> +               }
>> +       }
>> +}
>> +
>> +static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs)
>> +{
>> +       int reg;
>> +
>> +       for (reg = 0; reg < nregs; reg++) {
>> +               emit(A64_LDR64I(reg, A64_SP, bargs_off), ctx);
>> +               bargs_off += 8;
>>          }
>>   }
>>
>> @@ -1829,17 +1917,21 @@ static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
>>    */
>>   static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>>                                struct bpf_tramp_links *tlinks, void *orig_call,
>> -                             int nregs, u32 flags)
>> +                             const struct btf_func_model *m,
>> +                             const struct arg_aux *a,
>> +                             u32 flags)
>>   {
>>          int i;
>>          int stack_size;
>>          int retaddr_off;
>>          int regs_off;
>>          int retval_off;
>> -       int args_off;
>> +       int bargs_off;
>>          int nregs_off;
>>          int ip_off;
>>          int run_ctx_off;
>> +       int oargs_off;
>> +       int nregs;
>>          struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
>>          struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
>>          struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
>> @@ -1859,19 +1951,26 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>>           *
>>           * SP + retval_off  [ return value      ] BPF_TRAMP_F_CALL_ORIG or
>>           *                                        BPF_TRAMP_F_RET_FENTRY_RET
>> -        *
>>           *                  [ arg reg N         ]
>>           *                  [ ...               ]
>> -        * SP + args_off    [ arg reg 1         ]
>> +        * SP + bargs_off   [ arg reg 1         ] for bpf
>>           *
>>           * SP + nregs_off   [ arg regs count    ]
>>           *
>>           * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
>>           *
>>           * SP + run_ctx_off [ bpf_tramp_run_ctx ]
>> +        *
>> +        *                  [ stack arg N       ]
>> +        *                  [ ...               ]
>> +        * SP + oargs_off   [ stack arg 1       ] for original func
>>           */
>>
>>          stack_size = 0;
>> +       oargs_off = stack_size;
>> +       if (flags & BPF_TRAMP_F_CALL_ORIG)
>> +               stack_size += 8 * a->stack_slots_for_arg;
>> +
>>          run_ctx_off = stack_size;
>>          /* room for bpf_tramp_run_ctx */
>>          stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
>> @@ -1885,9 +1984,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>>          /* room for args count */
>>          stack_size += 8;
>>
>> -       args_off = stack_size;
>> +       bargs_off = stack_size;
>>          /* room for args */
>> -       stack_size += nregs * 8;
>> +       nregs = a->regs_for_arg + a->stack_slots_for_arg;
> 
> Maybe this name no longer makes sense ?

OK, I'll remove it
Florent Revest Sept. 21, 2023, 10:09 a.m. UTC | #3
On Thu, Sep 21, 2023 at 7:21 AM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
> On 9/21/2023 6:17 AM, Florent Revest wrote:
> > On Sun, Sep 17, 2023 at 5:09 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
> >> Due to bpf only supports function arguments up to 16 bytes, according to
> >> AAPCS64, starting from the first argument, each argument is first
> >> attempted to be loaded to 1 or 2 smallest registers from x0-x7, if there
> >> are no enough registers to hold the entire argument, then all remaining
> >> arguments starting from this one are pushed to the stack for passing.
> >
> > If I read the section 6.8.2 of the AAPCS64 correctly, there is a
> > corner case which I believe isn't covered by this logic.
> >
> > void f(u128 a, u128 b, u128, c, u64 d, u128 e, u64 f) {}
> > - a goes on x0 and x1
> > - b goes on x2 and x3
> > - c goes on x4 and x5
> > - d goes on x6
> > - e spills on the stack because it doesn't fit in the remaining regs
> > - f goes on x7
> >
>
> I guess you might have overlooked rule C.13 in AAPCS64. Non-floating type arguments
> are copied to stack under rule C.15/C.17. However, C.13 is applied before C.15/C.17,
> which means that NGRN is set to 8 before the stack is used. That is, all 8 parameter
> arguments are used up and any remaining arguments can only be passed by the stack.
>
> C.13    The NGRN is set to 8.
>
> C.14    The NSAA is rounded up to the larger of 8 or the Natural Alignment of the
>          argument’s type.
>
> C.15    If the argument is a composite type then the argument is copied to memory
>          at the adjusted NSAA. The NSAA is incremented by the size of the argument.
>          The argument has now been allocated.
>
> C.16    If the size of the argument is less than 8 bytes then the size of the argument
>          is set to 8 bytes. The effect is as if the argument was copied to the least
>          significant bits of a 64-bit register and the remaining bits filled with
>          unspecified values.
>
> C.17    The argument is copied to memory at the adjusted NSAA. The NSAA is incremented
>          by the size of the argument. The argument has now been allocated.
>
>
> And the following assembly code also shows 'e' and 'f' are passed by stack.
>
> int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f)
> {
>          return e == 5 || f == 7;
> }
>
> asseembly:
>
> func:
>          sub     sp, sp, #64
>          stp     x0, x1, [sp, 48]
>          stp     x2, x3, [sp, 32]
>          stp     x4, x5, [sp, 16]
>          str     x6, [sp, 8]
>          ldr     x0, [sp, 64] // ** load the low 8 bytes of e from SP + 64 **
>          cmp     x0, 5
>          bne     .L27
>          ldr     x0, [sp, 72] // ** load the high 8 bytes of e from SP + 72 **
>          cmp     x0, 0
>          beq     .L22
> .L27:
>          ldr     x0, [sp, 80] // ** load f from SP + 80 **
>          cmp     x0, 7
>          bne     .L24
> .L22:
>          mov     w0, 1
>          b       .L26
> .L24:
>          mov     w0, 0
> .L26:
>          add     sp, sp, 64
>          ret

Ah, that's great! :) It keeps things easy then. Thank you for the explanation!

> Although the above case is fine, the current patch does not handle rule C.14 correctly.
> For example, for the following func, an 8-byte padding is inserted between f and g by
> C.14 to align g to 16 bytes, but this patch mistakenly treats it as part of g.
>
> int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f, __int128 g)
> {
> }
>
> Maybe we could fix it by adding argument alignment to struct btf_func_model, I'll
> give it a try.

Good catch!

> > Maybe it would be good to add something pathological like this to the
> > selftests ?
> >
>
> OK, will do

Just a thought on that topic: maybe it would be preferable to have a
new, separate test for this? Currently we have a test for 128-bit
arguments and a test for many arguments. It's good to have these
separate because they are two dimensions of bpf architectural support:
if someone adds support for one xor the other feature to an arch, it's
good to see a test go green. Since that new test would cover both long
and many arguments, it should probably be a new test.
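
Something like this, perhaps (a hypothetical sketch, not actual selftest
code; the function name and the argument mix are made up):

noinline int bpf_testmod_fentry_test_mixed(u64 a, u64 b, u64 c, u64 d,
                                           u64 e, u64 f, u64 g,
                                           __int128 h, u64 i)
{
        /* a..g go in x0-x6; h needs two registers but only x7 is
         * left, so h and i both go on the stack per rule C.13
         */
        return (int)(a + b + c + d + e + f + g + (u64)h + i);
}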

> >> -static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
> >> +struct arg_aux {
> >> +       /* how many args are passed through registers, the rest args are
> >
> > the rest of the* args
> >
> >> +        * passed through stack
> >> +        */
> >> +       int args_in_reg;
> >
> > Maybe args_in_regs ? since args can go in multiple regs
> >
> >> +       /* how many registers used for passing arguments */
> >
> > are* used
> >
> >> +       int regs_for_arg;
> >
> > And here regs_for_args ? Since It's the number of registers used for all args
> >
> >> +       /* how many stack slots used for arguments, each slot is 8 bytes */
> >
> > are* used
> >
> >> +       int stack_slots_for_arg;
> >
> > And here stack_slots_for_args, for the same reason as above?
> >
> >> +};
> >> +
> >> +static void calc_arg_aux(const struct btf_func_model *m,
> >> +                        struct arg_aux *a)
> >>   {
> >>          int i;
> >> +       int nregs;
> >> +       int slots;
> >> +       int stack_slots;
> >> +
> >> +       /* verifier ensures m->nr_args <= MAX_BPF_FUNC_ARGS */
> >> +       for (i = 0, nregs = 0; i < m->nr_args; i++) {
> >> +               slots = (m->arg_size[i] + 7) / 8;
> >> +               if (nregs + slots <= 8) /* passed through register ? */
> >> +                       nregs += slots;
> >> +               else
> >> +                       break;
> >> +       }
> >> +
> >> +       a->args_in_reg = i;
> >> +       a->regs_for_arg = nregs;
> >>
> >> -       for (i = 0; i < nregs; i++) {
> >> -               emit(A64_STR64I(i, A64_SP, args_off), ctx);
> >> -               args_off += 8;
> >> +       /* the rest arguments are passed through stack */
> >> +       for (stack_slots = 0; i < m->nr_args; i++)
> >> +               stack_slots += (m->arg_size[i] + 7) / 8;
> >> +
> >> +       a->stack_slots_for_arg = stack_slots;
> >> +}
> >> +
> >> +static void clear_garbage(struct jit_ctx *ctx, int reg, int effective_bytes)
> >> +{
> >> +       if (effective_bytes) {
> >> +               int garbage_bits = 64 - 8 * effective_bytes;
> >> +#ifdef CONFIG_CPU_BIG_ENDIAN
> >> +               /* garbage bits are at the right end */
> >> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
> >> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
> >> +#else
> >> +               /* garbage bits are at the left end */
> >> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
> >> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
> >> +#endif
> >>          }
> >>   }
> >>
> >> -static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
> >> +static void save_args(struct jit_ctx *ctx, int bargs_off, int oargs_off,
> >> +                     const struct btf_func_model *m,
> >> +                     const struct arg_aux *a,
> >> +                     bool for_call_origin)
> >>   {
> >>          int i;
> >> +       int reg;
> >> +       int doff;
> >> +       int soff;
> >> +       int slots;
> >> +       u8 tmp = bpf2a64[TMP_REG_1];
> >> +
> >> +       /* store argument registers to stack for call bpf, or restore argument
> >
> > to* call bpf or "for the bpf program"
> >
>
> Sorry for these incorrect words :(, all will be fixed in the next version, thanks!

No problem!

> >>   static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
> >>                                struct bpf_tramp_links *tlinks, void *orig_call,
> >> -                             int nregs, u32 flags)
> >> +                             const struct btf_func_model *m,
> >> +                             const struct arg_aux *a,
> >> +                             u32 flags)
> >>   {
> >>          int i;
> >>          int stack_size;
> >>          int retaddr_off;
> >>          int regs_off;
> >>          int retval_off;
> >> -       int args_off;
> >> +       int bargs_off;
> >>          int nregs_off;
> >>          int ip_off;
> >>          int run_ctx_off;
> >> +       int oargs_off;
> >> +       int nregs;
> >>          struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
> >>          struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
> >>          struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
> >> @@ -1859,19 +1951,26 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
> >>           *
> >>           * SP + retval_off  [ return value      ] BPF_TRAMP_F_CALL_ORIG or
> >>           *                                        BPF_TRAMP_F_RET_FENTRY_RET
> >> -        *
> >>           *                  [ arg reg N         ]
> >>           *                  [ ...               ]
> >> -        * SP + args_off    [ arg reg 1         ]
> >> +        * SP + bargs_off   [ arg reg 1         ] for bpf
> >>           *
> >>           * SP + nregs_off   [ arg regs count    ]
> >>           *
> >>           * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
> >>           *
> >>           * SP + run_ctx_off [ bpf_tramp_run_ctx ]
> >> +        *
> >> +        *                  [ stack arg N       ]
> >> +        *                  [ ...               ]
> >> +        * SP + oargs_off   [ stack arg 1       ] for original func
> >>           */
> >>
> >>          stack_size = 0;
> >> +       oargs_off = stack_size;
> >> +       if (flags & BPF_TRAMP_F_CALL_ORIG)
> >> +               stack_size += 8 * a->stack_slots_for_arg;
> >> +
> >>          run_ctx_off = stack_size;
> >>          /* room for bpf_tramp_run_ctx */
> >>          stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
> >> @@ -1885,9 +1984,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
> >>          /* room for args count */
> >>          stack_size += 8;
> >>
> >> -       args_off = stack_size;
> >> +       bargs_off = stack_size;
> >>          /* room for args */
> >> -       stack_size += nregs * 8;
> >> +       nregs = a->regs_for_arg + a->stack_slots_for_arg;
> >
> > Maybe this name no longer makes sense ?
>
> OK, I'll remove it

Ack
Xu Kuohai Sept. 21, 2023, 11:19 a.m. UTC | #4
On 9/21/2023 6:09 PM, Florent Revest wrote:
> On Thu, Sep 21, 2023 at 7:21 AM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>> On 9/21/2023 6:17 AM, Florent Revest wrote:
>>> On Sun, Sep 17, 2023 at 5:09 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>>>> Due to bpf only supports function arguments up to 16 bytes, according to
>>>> AAPCS64, starting from the first argument, each argument is first
>>>> attempted to be loaded to 1 or 2 smallest registers from x0-x7, if there
>>>> are no enough registers to hold the entire argument, then all remaining
>>>> arguments starting from this one are pushed to the stack for passing.
>>>
>>> If I read the section 6.8.2 of the AAPCS64 correctly, there is a
>>> corner case which I believe isn't covered by this logic.
>>>
>>> void f(u128 a, u128 b, u128, c, u64 d, u128 e, u64 f) {}
>>> - a goes on x0 and x1
>>> - b goes on x2 and x3
>>> - c goes on x4 and x5
>>> - d goes on x6
>>> - e spills on the stack because it doesn't fit in the remaining regs
>>> - f goes on x7
>>>
>>
>> I guess you might have overlooked rule C.13 in AAPCS64. Non-floating type arguments
>> are copied to stack under rule C.15/C.17. However, C.13 is applied before C.15/C.17,
>> which means that NGRN is set to 8 before the stack is used. That is, all 8 parameter
>> arguments are used up and any remaining arguments can only be passed by the stack.
>>
>> C.13    The NGRN is set to 8.
>>
>> C.14    The NSAA is rounded up to the larger of 8 or the Natural Alignment of the
>>           argument’s type.
>>
>> C.15    If the argument is a composite type then the argument is copied to memory
>>           at the adjusted NSAA. The NSAA is incremented by the size of the argument.
>>           The argument has now been allocated.
>>
>> C.16    If the size of the argument is less than 8 bytes then the size of the argument
>>           is set to 8 bytes. The effect is as if the argument was copied to the least
>>           significant bits of a 64-bit register and the remaining bits filled with
>>           unspecified values.
>>
>> C.17    The argument is copied to memory at the adjusted NSAA. The NSAA is incremented
>>           by the size of the argument. The argument has now been allocated.
>>
>>
>> And the following assembly code also shows 'e' and 'f' are passed by stack.
>>
>> int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f)
>> {
>>           return e == 5 || f == 7;
>> }
>>
>> asseembly:
>>
>> func:
>>           sub     sp, sp, #64
>>           stp     x0, x1, [sp, 48]
>>           stp     x2, x3, [sp, 32]
>>           stp     x4, x5, [sp, 16]
>>           str     x6, [sp, 8]
>>           ldr     x0, [sp, 64] // ** load the low 8 bytes of e from SP + 64 **
>>           cmp     x0, 5
>>           bne     .L27
>>           ldr     x0, [sp, 72] // ** load the high 8 bytes of e from SP + 72 **
>>           cmp     x0, 0
>>           beq     .L22
>> .L27:
>>           ldr     x0, [sp, 80] // ** load f from SP + 80 **
>>           cmp     x0, 7
>>           bne     .L24
>> .L22:
>>           mov     w0, 1
>>           b       .L26
>> .L24:
>>           mov     w0, 0
>> .L26:
>>           add     sp, sp, 64
>>           ret
> 
> Ah, that's great! :) It keeps things easy then. Thank you for the explanation!
> 
>> Although the above case is fine, the current patch does not handle rule C.14 correctly.
>> For example, for the following func, an 8-byte padding is inserted between f and g by
>> C.14 to align g to 16 bytes, but this patch mistakenly treats it as part of g.
>>
>> int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f, __int128 g)
>> {
>> }
>>
>> Maybe we could fix it by adding argument alignment to struct btf_func_model, I'll
>> give it a try.
> 
> Good catch!
> 
>>> Maybe it would be good to add something pathological like this to the
>>> selftests ?
>>>
>>
>> OK, will do
> 
> Just a thought on that topic, maybe it would be preferable to have a
> new separate test for this ? Currently we have a test for 128b long
> arguments and a test for many arguments, it's good to have these
> separate because they are two dimensions of bpf architectural support:
> if someone adds support for one xor the other feature to an arch, it's
> good to see a test go green. Since that new test would cover both long
> and many arguments, it should probably be a new test.
>

Agreed, I'll add a separate test for unaligned stack arguments, thanks.

>>>> -static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
>>>> +struct arg_aux {
>>>> +       /* how many args are passed through registers, the rest args are
>>>
>>> the rest of the* args
>>>
>>>> +        * passed through stack
>>>> +        */
>>>> +       int args_in_reg;
>>>
>>> Maybe args_in_regs ? since args can go in multiple regs
>>>
>>>> +       /* how many registers used for passing arguments */
>>>
>>> are* used
>>>
>>>> +       int regs_for_arg;
>>>
>>> And here regs_for_args ? Since It's the number of registers used for all args
>>>
>>>> +       /* how many stack slots used for arguments, each slot is 8 bytes */
>>>
>>> are* used
>>>
>>>> +       int stack_slots_for_arg;
>>>
>>> And here stack_slots_for_args, for the same reason as above?
>>>
>>>> +};
>>>> +
>>>> +static void calc_arg_aux(const struct btf_func_model *m,
>>>> +                        struct arg_aux *a)
>>>>    {
>>>>           int i;
>>>> +       int nregs;
>>>> +       int slots;
>>>> +       int stack_slots;
>>>> +
>>>> +       /* verifier ensures m->nr_args <= MAX_BPF_FUNC_ARGS */
>>>> +       for (i = 0, nregs = 0; i < m->nr_args; i++) {
>>>> +               slots = (m->arg_size[i] + 7) / 8;
>>>> +               if (nregs + slots <= 8) /* passed through register ? */
>>>> +                       nregs += slots;
>>>> +               else
>>>> +                       break;
>>>> +       }
>>>> +
>>>> +       a->args_in_reg = i;
>>>> +       a->regs_for_arg = nregs;
>>>>
>>>> -       for (i = 0; i < nregs; i++) {
>>>> -               emit(A64_STR64I(i, A64_SP, args_off), ctx);
>>>> -               args_off += 8;
>>>> +       /* the rest arguments are passed through stack */
>>>> +       for (stack_slots = 0; i < m->nr_args; i++)
>>>> +               stack_slots += (m->arg_size[i] + 7) / 8;
>>>> +
>>>> +       a->stack_slots_for_arg = stack_slots;
>>>> +}
>>>> +
>>>> +static void clear_garbage(struct jit_ctx *ctx, int reg, int effective_bytes)
>>>> +{
>>>> +       if (effective_bytes) {
>>>> +               int garbage_bits = 64 - 8 * effective_bytes;
>>>> +#ifdef CONFIG_CPU_BIG_ENDIAN
>>>> +               /* garbage bits are at the right end */
>>>> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
>>>> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
>>>> +#else
>>>> +               /* garbage bits are at the left end */
>>>> +               emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
>>>> +               emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
>>>> +#endif
>>>>           }
>>>>    }
>>>>
>>>> -static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
>>>> +static void save_args(struct jit_ctx *ctx, int bargs_off, int oargs_off,
>>>> +                     const struct btf_func_model *m,
>>>> +                     const struct arg_aux *a,
>>>> +                     bool for_call_origin)
>>>>    {
>>>>           int i;
>>>> +       int reg;
>>>> +       int doff;
>>>> +       int soff;
>>>> +       int slots;
>>>> +       u8 tmp = bpf2a64[TMP_REG_1];
>>>> +
>>>> +       /* store argument registers to stack for call bpf, or restore argument
>>>
>>> to* call bpf or "for the bpf program"
>>>
>>
>> Sorry for these incorrect words :(, all will be fixed in the next version, thanks!
> 
> No problem!
> 
>>>>    static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>>>>                                 struct bpf_tramp_links *tlinks, void *orig_call,
>>>> -                             int nregs, u32 flags)
>>>> +                             const struct btf_func_model *m,
>>>> +                             const struct arg_aux *a,
>>>> +                             u32 flags)
>>>>    {
>>>>           int i;
>>>>           int stack_size;
>>>>           int retaddr_off;
>>>>           int regs_off;
>>>>           int retval_off;
>>>> -       int args_off;
>>>> +       int bargs_off;
>>>>           int nregs_off;
>>>>           int ip_off;
>>>>           int run_ctx_off;
>>>> +       int oargs_off;
>>>> +       int nregs;
>>>>           struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
>>>>           struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
>>>>           struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
>>>> @@ -1859,19 +1951,26 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>>>>            *
>>>>            * SP + retval_off  [ return value      ] BPF_TRAMP_F_CALL_ORIG or
>>>>            *                                        BPF_TRAMP_F_RET_FENTRY_RET
>>>> -        *
>>>>            *                  [ arg reg N         ]
>>>>            *                  [ ...               ]
>>>> -        * SP + args_off    [ arg reg 1         ]
>>>> +        * SP + bargs_off   [ arg reg 1         ] for bpf
>>>>            *
>>>>            * SP + nregs_off   [ arg regs count    ]
>>>>            *
>>>>            * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
>>>>            *
>>>>            * SP + run_ctx_off [ bpf_tramp_run_ctx ]
>>>> +        *
>>>> +        *                  [ stack arg N       ]
>>>> +        *                  [ ...               ]
>>>> +        * SP + oargs_off   [ stack arg 1       ] for original func
>>>>            */
>>>>
>>>>           stack_size = 0;
>>>> +       oargs_off = stack_size;
>>>> +       if (flags & BPF_TRAMP_F_CALL_ORIG)
>>>> +               stack_size += 8 * a->stack_slots_for_arg;
>>>> +
>>>>           run_ctx_off = stack_size;
>>>>           /* room for bpf_tramp_run_ctx */
>>>>           stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
>>>> @@ -1885,9 +1984,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>>>>           /* room for args count */
>>>>           stack_size += 8;
>>>>
>>>> -       args_off = stack_size;
>>>> +       bargs_off = stack_size;
>>>>           /* room for args */
>>>> -       stack_size += nregs * 8;
>>>> +       nregs = a->regs_for_arg + a->stack_slots_for_arg;
>>>
>>> Maybe this name no longer makes sense ?
>>
>> OK, I'll remove it
> 
> Ack

Patch

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 7d4af64e3982..a0cf526b07ea 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1705,7 +1705,7 @@  bool bpf_jit_supports_subprog_tailcalls(void)
 }
 
 static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
-			    int args_off, int retval_off, int run_ctx_off,
+			    int bargs_off, int retval_off, int run_ctx_off,
 			    bool save_ret)
 {
 	__le32 *branch;
@@ -1747,7 +1747,7 @@  static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
 	/* save return value to callee saved register x20 */
 	emit(A64_MOV(1, A64_R(20), A64_R(0)), ctx);
 
-	emit(A64_ADD_I(1, A64_R(0), A64_SP, args_off), ctx);
+	emit(A64_ADD_I(1, A64_R(0), A64_SP, bargs_off), ctx);
 	if (!p->jited)
 		emit_addr_mov_i64(A64_R(1), (const u64)p->insnsi, ctx);
 
@@ -1772,7 +1772,7 @@  static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
 }
 
 static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
-			       int args_off, int retval_off, int run_ctx_off,
+			       int bargs_off, int retval_off, int run_ctx_off,
 			       __le32 **branches)
 {
 	int i;
@@ -1782,7 +1782,7 @@  static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
 	 */
 	emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx);
 	for (i = 0; i < tl->nr_links; i++) {
-		invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off,
+		invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off,
 				run_ctx_off, true);
 		/* if (*(u64 *)(sp + retval_off) !=  0)
 		 *	goto do_fexit;
@@ -1796,23 +1796,111 @@  static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
 	}
 }
 
-static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
+struct arg_aux {
+	/* how many args are passed through registers, the rest args are
+	 * passed through stack
+	 */
+	int args_in_reg;
+	/* how many registers used for passing arguments */
+	int regs_for_arg;
+	/* how many stack slots used for arguments, each slot is 8 bytes */
+	int stack_slots_for_arg;
+};
+
+static void calc_arg_aux(const struct btf_func_model *m,
+			 struct arg_aux *a)
 {
 	int i;
+	int nregs;
+	int slots;
+	int stack_slots;
+
+	/* verifier ensures m->nr_args <= MAX_BPF_FUNC_ARGS */
+	for (i = 0, nregs = 0; i < m->nr_args; i++) {
+		slots = (m->arg_size[i] + 7) / 8;
+		if (nregs + slots <= 8) /* passed through register ? */
+			nregs += slots;
+		else
+			break;
+	}
+
+	a->args_in_reg = i;
+	a->regs_for_arg = nregs;
 
-	for (i = 0; i < nregs; i++) {
-		emit(A64_STR64I(i, A64_SP, args_off), ctx);
-		args_off += 8;
+	/* the rest arguments are passed through stack */
+	for (stack_slots = 0; i < m->nr_args; i++)
+		stack_slots += (m->arg_size[i] + 7) / 8;
+
+	a->stack_slots_for_arg = stack_slots;
+}
+
+static void clear_garbage(struct jit_ctx *ctx, int reg, int effective_bytes)
+{
+	if (effective_bytes) {
+		int garbage_bits = 64 - 8 * effective_bytes;
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		/* garbage bits are at the right end */
+		emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
+		emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
+#else
+		/* garbage bits are at the left end */
+		emit(A64_LSL(1, reg, reg, garbage_bits), ctx);
+		emit(A64_LSR(1, reg, reg, garbage_bits), ctx);
+#endif
 	}
 }
 
-static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
+static void save_args(struct jit_ctx *ctx, int bargs_off, int oargs_off,
+		      const struct btf_func_model *m,
+		      const struct arg_aux *a,
+		      bool for_call_origin)
 {
 	int i;
+	int reg;
+	int doff;
+	int soff;
+	int slots;
+	u8 tmp = bpf2a64[TMP_REG_1];
+
+	/* store argument registers to stack for call bpf, or restore argument
+	 * registers from stack for the original function
+	 */
+	for (reg = 0; reg < a->regs_for_arg; reg++) {
+		emit(for_call_origin ?
+		     A64_LDR64I(reg, A64_SP, bargs_off) :
+		     A64_STR64I(reg, A64_SP, bargs_off),
+		     ctx);
+		bargs_off += 8;
+	}
 
-	for (i = 0; i < nregs; i++) {
-		emit(A64_LDR64I(i, A64_SP, args_off), ctx);
-		args_off += 8;
+	soff = 32; /* on stack arguments start from FP + 32 */
+	doff = (for_call_origin ? oargs_off : bargs_off);
+
+	/* save on stack arguments */
+	for (i = a->args_in_reg; i < m->nr_args; i++) {
+		slots = (m->arg_size[i] + 7) / 8;
+		/* verifier ensures arg_size <= 16, so slots equals 1 or 2 */
+		while (slots-- > 0) {
+			emit(A64_LDR64I(tmp, A64_FP, soff), ctx);
+			/* if there is unused space in the last slot, clear
+			 * the garbage contained in the space.
+			 */
+			if (slots == 0 && !for_call_origin)
+				clear_garbage(ctx, tmp, m->arg_size[i] % 8);
+			emit(A64_STR64I(tmp, A64_SP, doff), ctx);
+			soff += 8;
+			doff += 8;
+		}
+	}
+}
+
+static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs)
+{
+	int reg;
+
+	for (reg = 0; reg < nregs; reg++) {
+		emit(A64_LDR64I(reg, A64_SP, bargs_off), ctx);
+		bargs_off += 8;
 	}
 }
 
@@ -1829,17 +1917,21 @@  static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
  */
 static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 			      struct bpf_tramp_links *tlinks, void *orig_call,
-			      int nregs, u32 flags)
+			      const struct btf_func_model *m,
+			      const struct arg_aux *a,
+			      u32 flags)
 {
 	int i;
 	int stack_size;
 	int retaddr_off;
 	int regs_off;
 	int retval_off;
-	int args_off;
+	int bargs_off;
 	int nregs_off;
 	int ip_off;
 	int run_ctx_off;
+	int oargs_off;
+	int nregs;
 	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
 	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
 	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
@@ -1859,19 +1951,26 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	 *
 	 * SP + retval_off  [ return value      ] BPF_TRAMP_F_CALL_ORIG or
 	 *                                        BPF_TRAMP_F_RET_FENTRY_RET
-	 *
 	 *                  [ arg reg N         ]
 	 *                  [ ...               ]
-	 * SP + args_off    [ arg reg 1         ]
+	 * SP + bargs_off   [ arg reg 1         ] for bpf
 	 *
 	 * SP + nregs_off   [ arg regs count    ]
 	 *
 	 * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
 	 *
 	 * SP + run_ctx_off [ bpf_tramp_run_ctx ]
+	 *
+	 *                  [ stack arg N       ]
+	 *                  [ ...               ]
+	 * SP + oargs_off   [ stack arg 1       ] for original func
 	 */
 
 	stack_size = 0;
+	oargs_off = stack_size;
+	if (flags & BPF_TRAMP_F_CALL_ORIG)
+		stack_size += 8 * a->stack_slots_for_arg;
+
 	run_ctx_off = stack_size;
 	/* room for bpf_tramp_run_ctx */
 	stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
@@ -1885,9 +1984,10 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	/* room for args count */
 	stack_size += 8;
 
-	args_off = stack_size;
+	bargs_off = stack_size;
 	/* room for args */
-	stack_size += nregs * 8;
+	nregs = a->regs_for_arg + a->stack_slots_for_arg;
+	stack_size += 8 * nregs;
 
 	/* room for return value */
 	retval_off = stack_size;
@@ -1934,8 +2034,8 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	emit(A64_MOVZ(1, A64_R(10), nregs, 0), ctx);
 	emit(A64_STR64I(A64_R(10), A64_SP, nregs_off), ctx);
 
-	/* save arg regs */
-	save_args(ctx, args_off, nregs);
+	/* save args for bpf */
+	save_args(ctx, bargs_off, oargs_off, m, a, false);
 
 	/* save callee saved registers */
 	emit(A64_STR64I(A64_R(19), A64_SP, regs_off), ctx);
@@ -1947,7 +2047,7 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	for (i = 0; i < fentry->nr_links; i++)
-		invoke_bpf_prog(ctx, fentry->links[i], args_off,
+		invoke_bpf_prog(ctx, fentry->links[i], bargs_off,
 				retval_off, run_ctx_off,
 				flags & BPF_TRAMP_F_RET_FENTRY_RET);
 
@@ -1957,12 +2057,13 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 		if (!branches)
 			return -ENOMEM;
 
-		invoke_bpf_mod_ret(ctx, fmod_ret, args_off, retval_off,
+		invoke_bpf_mod_ret(ctx, fmod_ret, bargs_off, retval_off,
 				   run_ctx_off, branches);
 	}
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
-		restore_args(ctx, args_off, nregs);
+		/* save args for original func */
+		save_args(ctx, bargs_off, oargs_off, m, a, true);
 		/* call original func */
 		emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx);
 		emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx);
@@ -1981,7 +2082,7 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	for (i = 0; i < fexit->nr_links; i++)
-		invoke_bpf_prog(ctx, fexit->links[i], args_off, retval_off,
+		invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off,
 				run_ctx_off, false);
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
@@ -1991,7 +2092,7 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	if (flags & BPF_TRAMP_F_RESTORE_REGS)
-		restore_args(ctx, args_off, nregs);
+		restore_args(ctx, bargs_off, a->regs_for_arg);
 
 	/* restore callee saved register x19 and x20 */
 	emit(A64_LDR64I(A64_R(19), A64_SP, regs_off), ctx);
@@ -2031,26 +2132,16 @@  int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				u32 flags, struct bpf_tramp_links *tlinks,
 				void *orig_call)
 {
-	int i, ret;
-	int nregs = m->nr_args;
+	int ret;
 	int max_insns = ((long)image_end - (long)image) / AARCH64_INSN_SIZE;
 	struct jit_ctx ctx = {
 		.image = NULL,
 		.idx = 0,
 	};
+	struct arg_aux aaux;
 
-	/* extra registers needed for struct argument */
-	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
-		/* The arg_size is at most 16 bytes, enforced by the verifier. */
-		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
-			nregs += (m->arg_size[i] + 7) / 8 - 1;
-	}
-
-	/* the first 8 registers are used for arguments */
-	if (nregs > 8)
-		return -ENOTSUPP;
-
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
+	calc_arg_aux(m, &aaux);
+	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, m, &aaux, flags);
 	if (ret < 0)
 		return ret;
 
@@ -2061,7 +2152,7 @@  int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	ctx.idx = 0;
 
 	jit_fill_hole(image, (unsigned int)(image_end - image));
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
+	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, m, &aaux, flags);
 
 	if (ret > 0 && validate_code(&ctx) < 0)
 		ret = -EINVAL;
diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
index f5065576cae9..35a297a3c92f 100644
--- a/tools/testing/selftests/bpf/DENYLIST.aarch64
+++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
@@ -11,8 +11,6 @@  kprobe_multi_test/link_api_addrs                 # link_fd unexpected link_fd: a
 kprobe_multi_test/link_api_syms                  # link_fd unexpected link_fd: actual -95 < expected 0
 kprobe_multi_test/skel_api                       # libbpf: failed to load BPF skeleton 'kprobe_multi': -3
 module_attach                                    # prog 'kprobe_multi': failed to auto-attach: -95
-fentry_test/fentry_many_args                     # fentry_many_args:FAIL:fentry_many_args_attach unexpected error: -524
-fexit_test/fexit_many_args                       # fexit_many_args:FAIL:fexit_many_args_attach unexpected error: -524
 fill_link_info/kprobe_multi_link_info            # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 fill_link_info/kretprobe_multi_link_info         # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 fill_link_info/kprobe_multi_invalid_ubuff        # bpf_program__attach_kprobe_multi_opts unexpected error: -95