diff mbox series

[v6,bpf-next,6/7] bpf: introduce bpf_prog_pack allocator

Message ID 20220121194926.1970172-7-song@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf_prog_pack allocator | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1398 this patch: 1398
netdev/cc_maintainers warning 4 maintainers not CCed: kpsingh@kernel.org john.fastabend@gmail.com kafai@fb.com yhs@fb.com
netdev/build_clang success Errors and warnings before: 187 this patch: 187
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1415 this patch: 1415
netdev/checkpatch warning CHECK: Prefer using the BIT macro WARNING: line length of 82 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next fail VM_Test

Commit Message

Song Liu Jan. 21, 2022, 7:49 p.m. UTC
From: Song Liu <songliubraving@fb.com>

Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this could add significant
pressure to instruction TLB.

Introduce bpf_prog_pack allocator to pack multiple BPF programs in a huge
page. The memory is then allocated in 64 byte chunks.

Memory allocated by bpf_prog_pack allocator is RO protected after initial
allocation. To write to it, the user (jit engine) needs to use the text poke
API.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/filter.h |   7 ++
 kernel/bpf/core.c      | 184 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 187 insertions(+), 4 deletions(-)

Comments

Alexei Starovoitov Jan. 21, 2022, 11:55 p.m. UTC | #1
On Fri, Jan 21, 2022 at 11:49 AM Song Liu <song@kernel.org> wrote:
>
> +static struct bpf_binary_header *
> +__bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
> +                      unsigned int alignment,
> +                      bpf_jit_fill_hole_t bpf_fill_ill_insns,
> +                      u32 round_up_to)
> +{
> +       struct bpf_binary_header *hdr;
> +       u32 size, hole, start;
> +
> +       WARN_ON_ONCE(!is_power_of_2(alignment) ||
> +                    alignment > BPF_IMAGE_ALIGNMENT);
> +
> +       /* Most of BPF filters are really small, but if some of them
> +        * fill a page, allow at least 128 extra bytes to insert a
> +        * random section of illegal instructions.
> +        */
> +       size = round_up(proglen + sizeof(*hdr) + 128, round_up_to);
> +
> +       if (bpf_jit_charge_modmem(size))
> +               return NULL;
> +       hdr = bpf_jit_alloc_exec(size);
> +       if (!hdr) {
> +               bpf_jit_uncharge_modmem(size);
> +               return NULL;
> +       }
> +
> +       /* Fill space with illegal/arch-dep instructions. */
> +       bpf_fill_ill_insns(hdr, size);
> +
> +       hdr->size = size;
> +       hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
> +                    PAGE_SIZE - sizeof(*hdr));

It probably should be 'round_up_to' instead of PAGE_SIZE ?

> +       start = (get_random_int() % hole) & ~(alignment - 1);
> +
> +       /* Leave a random number of instructions before BPF code. */
> +       *image_ptr = &hdr->image[start];
> +
> +       return hdr;
> +}
> +
>  struct bpf_binary_header *
>  bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>                      unsigned int alignment,
>                      bpf_jit_fill_hole_t bpf_fill_ill_insns)
> +{
> +       return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
> +                                     bpf_fill_ill_insns, PAGE_SIZE);
> +}
> +
> +struct bpf_binary_header *
> +bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_ptr,
> +                         unsigned int alignment,
> +                         bpf_jit_fill_hole_t bpf_fill_ill_insns)
>  {
>         struct bpf_binary_header *hdr;
>         u32 size, hole, start;
> @@ -875,11 +1034,16 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>          * fill a page, allow at least 128 extra bytes to insert a
>          * random section of illegal instructions.
>          */
> -       size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
> +       size = round_up(proglen + sizeof(*hdr) + 128, BPF_PROG_CHUNK_SIZE);
> +
> +       /* for too big program, use __bpf_jit_binary_alloc. */
> +       if (size > BPF_PROG_MAX_PACK_PROG_SIZE)
> +               return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
> +                                             bpf_fill_ill_insns, PAGE_SIZE);
>
>         if (bpf_jit_charge_modmem(size))
>                 return NULL;
> -       hdr = bpf_jit_alloc_exec(size);
> +       hdr = bpf_prog_pack_alloc(size);
>         if (!hdr) {
>                 bpf_jit_uncharge_modmem(size);
>                 return NULL;
> @@ -888,9 +1052,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>         /* Fill space with illegal/arch-dep instructions. */
>         bpf_fill_ill_insns(hdr, size);
>
> -       hdr->size = size;

I'm missing where it's assigned.
Looks like hdr->size stays zero, so free is never performed?

>         hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
> -                    PAGE_SIZE - sizeof(*hdr));
> +                    BPF_PROG_CHUNK_SIZE - sizeof(*hdr));

Before this change size - (proglen + sizeof(*hdr)) would
be at least 128 and potentially bigger than PAGE_SIZE
when extra 128 crossed page boundary.
Hence min() was necessary with the 2nd arg being PAGE_SIZE - sizeof(*hdr).

With new code size - (proglen + sizeof(*hdr)) would
be between 128 and 128+64
while BPF_PROG_CHUNK_SIZE - sizeof(*hdr) is a constant less than 64.
What is the point of min() ?
Song Liu Jan. 22, 2022, 12:23 a.m. UTC | #2
> On Jan 21, 2022, at 3:55 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Fri, Jan 21, 2022 at 11:49 AM Song Liu <song@kernel.org> wrote:
>> 
>> +static struct bpf_binary_header *
>> +__bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>> +                      unsigned int alignment,
>> +                      bpf_jit_fill_hole_t bpf_fill_ill_insns,
>> +                      u32 round_up_to)
>> +{
>> +       struct bpf_binary_header *hdr;
>> +       u32 size, hole, start;
>> +
>> +       WARN_ON_ONCE(!is_power_of_2(alignment) ||
>> +                    alignment > BPF_IMAGE_ALIGNMENT);
>> +
>> +       /* Most of BPF filters are really small, but if some of them
>> +        * fill a page, allow at least 128 extra bytes to insert a
>> +        * random section of illegal instructions.
>> +        */
>> +       size = round_up(proglen + sizeof(*hdr) + 128, round_up_to);
>> +
>> +       if (bpf_jit_charge_modmem(size))
>> +               return NULL;
>> +       hdr = bpf_jit_alloc_exec(size);
>> +       if (!hdr) {
>> +               bpf_jit_uncharge_modmem(size);
>> +               return NULL;
>> +       }
>> +
>> +       /* Fill space with illegal/arch-dep instructions. */
>> +       bpf_fill_ill_insns(hdr, size);
>> +
>> +       hdr->size = size;
>> +       hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
>> +                    PAGE_SIZE - sizeof(*hdr));
> 
> It probably should be 'round_up_to' instead of PAGE_SIZE ?

Actually, some of these changes are no longer needed after the following
change in v6:

  4. Change fall back round_up_to in bpf_jit_binary_alloc_pack() from
     BPF_PROG_MAX_PACK_PROG_SIZE to PAGE_SIZE.

My initial thought (last year) was if we allocate more than 2MB (either 
2.1MB or 3.9MB), we round up to 4MB to save page table entries. 
However, when I revisited this earlier today, I thought we should still
round up to PAGE_SIZE to save memory.

Right now, I am not sure which way is better. What do you think? If we
round up to PAGE_SIZE, we don't need to split out __bpf_jit_binary_alloc().

> 
>> +       start = (get_random_int() % hole) & ~(alignment - 1);
>> +
>> +       /* Leave a random number of instructions before BPF code. */
>> +       *image_ptr = &hdr->image[start];
>> +
>> +       return hdr;
>> +}
>> +
>> struct bpf_binary_header *
>> bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>>                     unsigned int alignment,
>>                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
>> +{
>> +       return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
>> +                                     bpf_fill_ill_insns, PAGE_SIZE);
>> +}
>> +
>> +struct bpf_binary_header *
>> +bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_ptr,
>> +                         unsigned int alignment,
>> +                         bpf_jit_fill_hole_t bpf_fill_ill_insns)
>> {
>>        struct bpf_binary_header *hdr;
>>        u32 size, hole, start;
>> @@ -875,11 +1034,16 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>>         * fill a page, allow at least 128 extra bytes to insert a
>>         * random section of illegal instructions.
>>         */
>> -       size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
>> +       size = round_up(proglen + sizeof(*hdr) + 128, BPF_PROG_CHUNK_SIZE);
>> +
>> +       /* for too big program, use __bpf_jit_binary_alloc. */
>> +       if (size > BPF_PROG_MAX_PACK_PROG_SIZE)
>> +               return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
>> +                                             bpf_fill_ill_insns, PAGE_SIZE);
>> 
>>        if (bpf_jit_charge_modmem(size))
>>                return NULL;
>> -       hdr = bpf_jit_alloc_exec(size);
>> +       hdr = bpf_prog_pack_alloc(size);
>>        if (!hdr) {
>>                bpf_jit_uncharge_modmem(size);
>>                return NULL;
>> @@ -888,9 +1052,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>>        /* Fill space with illegal/arch-dep instructions. */
>>        bpf_fill_ill_insns(hdr, size);
>> 
>> -       hdr->size = size;
> 
> I'm missing where it's assigned.
> Looks like hdr->size stays zero, so free is never performed?

This is read only memory, so we set it in bpf_fill_ill_insns(). There was a 
comment in x86/bpf_jit_comp.c. I guess we also need a comment here. 

> 
>>        hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
>> -                    PAGE_SIZE - sizeof(*hdr));
>> +                    BPF_PROG_CHUNK_SIZE - sizeof(*hdr));
> 
> Before this change size - (proglen + sizeof(*hdr)) would
> be at least 128 and potentially bigger than PAGE_SIZE
> when extra 128 crossed page boundary.
> Hence min() was necessary with the 2nd arg being PAGE_SIZE - sizeof(*hdr).
> 
> With new code size - (proglen + sizeof(*hdr)) would
> be between 128 and 128+64
> while BPF_PROG_CHUNK_SIZE - sizeof(*hdr) is a constant less than 64.
> What is the point of min() ?

Yeah, I guess I didn't finish my math homework here. Will fix it in the
next version.
Alexei Starovoitov Jan. 22, 2022, 12:41 a.m. UTC | #3
On Fri, Jan 21, 2022 at 4:23 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Jan 21, 2022, at 3:55 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Jan 21, 2022 at 11:49 AM Song Liu <song@kernel.org> wrote:
> >>
> >> +static struct bpf_binary_header *
> >> +__bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
> >> +                      unsigned int alignment,
> >> +                      bpf_jit_fill_hole_t bpf_fill_ill_insns,
> >> +                      u32 round_up_to)
> >> +{
> >> +       struct bpf_binary_header *hdr;
> >> +       u32 size, hole, start;
> >> +
> >> +       WARN_ON_ONCE(!is_power_of_2(alignment) ||
> >> +                    alignment > BPF_IMAGE_ALIGNMENT);
> >> +
> >> +       /* Most of BPF filters are really small, but if some of them
> >> +        * fill a page, allow at least 128 extra bytes to insert a
> >> +        * random section of illegal instructions.
> >> +        */
> >> +       size = round_up(proglen + sizeof(*hdr) + 128, round_up_to);
> >> +
> >> +       if (bpf_jit_charge_modmem(size))
> >> +               return NULL;
> >> +       hdr = bpf_jit_alloc_exec(size);
> >> +       if (!hdr) {
> >> +               bpf_jit_uncharge_modmem(size);
> >> +               return NULL;
> >> +       }
> >> +
> >> +       /* Fill space with illegal/arch-dep instructions. */
> >> +       bpf_fill_ill_insns(hdr, size);
> >> +
> >> +       hdr->size = size;
> >> +       hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
> >> +                    PAGE_SIZE - sizeof(*hdr));
> >
> > It probably should be 'round_up_to' instead of PAGE_SIZE ?
>
> Actually, some of these change is not longer needed after the following
> change in v6:
>
>   4. Change fall back round_up_to in bpf_jit_binary_alloc_pack() from
>      BPF_PROG_MAX_PACK_PROG_SIZE to PAGE_SIZE.
>
> My initial thought (last year) was if we allocate more than 2MB (either
> 2.1MB or 3.9MB), we round up to 4MB to save page table entries.
> However, when I revisited this earlier today, I thought we should still
> round up to PAGE_SIZE to save memory
>
> Right now, I am not sure which way is better. What do you think? If we
> round up to PAGE_SIZE, we don't need split out __bpf_jit_binary_alloc().

The less code duplication the better.

> >
> >> +       start = (get_random_int() % hole) & ~(alignment - 1);
> >> +
> >> +       /* Leave a random number of instructions before BPF code. */
> >> +       *image_ptr = &hdr->image[start];
> >> +
> >> +       return hdr;
> >> +}
> >> +
> >> struct bpf_binary_header *
> >> bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
> >>                     unsigned int alignment,
> >>                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
> >> +{
> >> +       return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
> >> +                                     bpf_fill_ill_insns, PAGE_SIZE);
> >> +}
> >> +
> >> +struct bpf_binary_header *
> >> +bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_ptr,
> >> +                         unsigned int alignment,
> >> +                         bpf_jit_fill_hole_t bpf_fill_ill_insns)
> >> {
> >>        struct bpf_binary_header *hdr;
> >>        u32 size, hole, start;
> >> @@ -875,11 +1034,16 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
> >>         * fill a page, allow at least 128 extra bytes to insert a
> >>         * random section of illegal instructions.
> >>         */
> >> -       size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
> >> +       size = round_up(proglen + sizeof(*hdr) + 128, BPF_PROG_CHUNK_SIZE);
> >> +
> >> +       /* for too big program, use __bpf_jit_binary_alloc. */
> >> +       if (size > BPF_PROG_MAX_PACK_PROG_SIZE)
> >> +               return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
> >> +                                             bpf_fill_ill_insns, PAGE_SIZE);
> >>
> >>        if (bpf_jit_charge_modmem(size))
> >>                return NULL;
> >> -       hdr = bpf_jit_alloc_exec(size);
> >> +       hdr = bpf_prog_pack_alloc(size);
> >>        if (!hdr) {
> >>                bpf_jit_uncharge_modmem(size);
> >>                return NULL;
> >> @@ -888,9 +1052,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
> >>        /* Fill space with illegal/arch-dep instructions. */
> >>        bpf_fill_ill_insns(hdr, size);
> >>
> >> -       hdr->size = size;
> >
> > I'm missing where it's assigned.
> > Looks like hdr->size stays zero, so free is never performed?
>
> This is read only memory, so we set it in bpf_fill_ill_insns(). There was a
> comment in x86/bpf_jit_comp.c. I guess we also need a comment here.

Ahh. Found it. Pls don't do it in fill_insn.
It's the wrong layering.
It feels that callbacks need to be redesigned.
I would operate on rw_header here and use
existing arch specific callback fill_insn to write into rw_image.
Both insns during JITing and 0xcc on both sides of the prog.
Populate rw_header->size (either before or after JITing)
and then do single text_poke_copy to write the whole thing
into the correct spot.
Song Liu Jan. 22, 2022, 1:01 a.m. UTC | #4
> On Jan 21, 2022, at 4:41 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Fri, Jan 21, 2022 at 4:23 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Jan 21, 2022, at 3:55 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Fri, Jan 21, 2022 at 11:49 AM Song Liu <song@kernel.org> wrote:
>>>> 
>>>> +static struct bpf_binary_header *
>>>> +__bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>>>> +                      unsigned int alignment,
>>>> +                      bpf_jit_fill_hole_t bpf_fill_ill_insns,
>>>> +                      u32 round_up_to)
>>>> +{
>>>> +       struct bpf_binary_header *hdr;
>>>> +       u32 size, hole, start;
>>>> +
>>>> +       WARN_ON_ONCE(!is_power_of_2(alignment) ||
>>>> +                    alignment > BPF_IMAGE_ALIGNMENT);
>>>> +
>>>> +       /* Most of BPF filters are really small, but if some of them
>>>> +        * fill a page, allow at least 128 extra bytes to insert a
>>>> +        * random section of illegal instructions.
>>>> +        */
>>>> +       size = round_up(proglen + sizeof(*hdr) + 128, round_up_to);
>>>> +
>>>> +       if (bpf_jit_charge_modmem(size))
>>>> +               return NULL;
>>>> +       hdr = bpf_jit_alloc_exec(size);
>>>> +       if (!hdr) {
>>>> +               bpf_jit_uncharge_modmem(size);
>>>> +               return NULL;
>>>> +       }
>>>> +
>>>> +       /* Fill space with illegal/arch-dep instructions. */
>>>> +       bpf_fill_ill_insns(hdr, size);
>>>> +
>>>> +       hdr->size = size;
>>>> +       hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
>>>> +                    PAGE_SIZE - sizeof(*hdr));
>>> 
>>> It probably should be 'round_up_to' instead of PAGE_SIZE ?
>> 
>> Actually, some of these change is not longer needed after the following
>> change in v6:
>> 
>>  4. Change fall back round_up_to in bpf_jit_binary_alloc_pack() from
>>     BPF_PROG_MAX_PACK_PROG_SIZE to PAGE_SIZE.
>> 
>> My initial thought (last year) was if we allocate more than 2MB (either
>> 2.1MB or 3.9MB), we round up to 4MB to save page table entries.
>> However, when I revisited this earlier today, I thought we should still
>> round up to PAGE_SIZE to save memory
>> 
>> Right now, I am not sure which way is better. What do you think? If we
>> round up to PAGE_SIZE, we don't need split out __bpf_jit_binary_alloc().
> 
> The less code duplication the better.

Got it. Will go with PAGE_SIZE. 

[...]

>>>> +
>>>>       if (bpf_jit_charge_modmem(size))
>>>>               return NULL;
>>>> -       hdr = bpf_jit_alloc_exec(size);
>>>> +       hdr = bpf_prog_pack_alloc(size);
>>>>       if (!hdr) {
>>>>               bpf_jit_uncharge_modmem(size);
>>>>               return NULL;
>>>> @@ -888,9 +1052,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
>>>>       /* Fill space with illegal/arch-dep instructions. */
>>>>       bpf_fill_ill_insns(hdr, size);
>>>> 
>>>> -       hdr->size = size;
>>> 
>>> I'm missing where it's assigned.
>>> Looks like hdr->size stays zero, so free is never performed?
>> 
>> This is read only memory, so we set it in bpf_fill_ill_insns(). There was a
>> comment in x86/bpf_jit_comp.c. I guess we also need a comment here.
> 
> Ahh. Found it. Pls don't do it in fill_insn.
> It's the wrong layering.
> It feels that callbacks need to be redesigned.
> I would operate on rw_header here and use
> existing arch specific callback fill_insn to write into rw_image.
> Both insns during JITing and 0xcc on both sides of the prog.
> Populate rw_header->size (either before or after JITing)
> and then do single text_poke_copy to write the whole thing
> into the correct spot.

In this way, we need to allocate rw_image here, and free it in 
bpf_jit_comp.c. This feels a little weird to me, but I guess that
is still the cleanest solution for now. 

Thanks,
Song
Alexei Starovoitov Jan. 22, 2022, 1:12 a.m. UTC | #5
On Fri, Jan 21, 2022 at 5:01 PM Song Liu <songliubraving@fb.com> wrote:
>
> In this way, we need to allocate rw_image here, and free it in
> bpf_jit_comp.c. This feels a little weird to me, but I guess that
> is still the cleanest solution for now.

You mean inside bpf_jit_binary_alloc?
That won't be arch independent.
It needs to be split into generic piece that stays in core.c
and callbacks like bpf_jit_fill_hole_t
or into multiple helpers with prep in-between.
Don't worry if all archs need to be touched.
Song Liu Jan. 22, 2022, 1:30 a.m. UTC | #6
> On Jan 21, 2022, at 5:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Fri, Jan 21, 2022 at 5:01 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> In this way, we need to allocate rw_image here, and free it in
>> bpf_jit_comp.c. This feels a little weird to me, but I guess that
>> is still the cleanest solution for now.
> 
> You mean inside bpf_jit_binary_alloc?
> That won't be arch independent.
> It needs to be split into generic piece that stays in core.c
> and callbacks like bpf_jit_fill_hole_t
> or into multiple helpers with prep in-between.
> Don't worry if all archs need to be touched.

How about we introduce callback bpf_jit_set_header_size_t? Then we
can split x86's jit_fill_hole() into two functions, one to fill the
hole, the other to set size. The rest of the logic is going to stay the same.

Archs that do not use bpf_prog_pack won't need bpf_jit_set_header_size_t. 

Thanks,
Song
Alexei Starovoitov Jan. 22, 2022, 2:12 a.m. UTC | #7
On Fri, Jan 21, 2022 at 5:30 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Jan 21, 2022, at 5:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Jan 21, 2022 at 5:01 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >> In this way, we need to allocate rw_image here, and free it in
> >> bpf_jit_comp.c. This feels a little weird to me, but I guess that
> >> is still the cleanest solution for now.
> >
> > You mean inside bpf_jit_binary_alloc?
> > That won't be arch independent.
> > It needs to be split into generic piece that stays in core.c
> > and callbacks like bpf_jit_fill_hole_t
> > or into multiple helpers with prep in-between.
> > Don't worry if all archs need to be touched.
>
> How about we introduce callback bpf_jit_set_header_size_t? Then we
> can split x86's jit_fill_hole() into two functions, one to fill the
> hole, the other to set size. The rest of the logic gonna stay the same.
>
> Archs that do not use bpf_prog_pack won't need bpf_jit_set_header_size_t.

That's not any better.

Currently the choice of bpf_jit_binary_alloc_pack vs bpf_jit_binary_alloc
leaks into arch bits and bpf_prog_pack_max_size() doesn't
really make it generic.

Ideally all archs continue to use bpf_jit_binary_alloc()
and magic happens in a generic code.
If not then please remove bpf_prog_pack_max_size(),
since it doesn't provide much value and pick
bpf_jit_binary_alloc_pack() signature to fit x86 jit better.
It wouldn't need bpf_jit_fill_hole_t callback at all.
Please think it through so we don't need to redesign it
when another arch will decide to use huge pages for bpf progs.

cc-ing Ilya for ideas on how that would fit s390.
Song Liu Jan. 23, 2022, 1:03 a.m. UTC | #8
> On Jan 21, 2022, at 6:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Fri, Jan 21, 2022 at 5:30 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Jan 21, 2022, at 5:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Fri, Jan 21, 2022 at 5:01 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> In this way, we need to allocate rw_image here, and free it in
>>>> bpf_jit_comp.c. This feels a little weird to me, but I guess that
>>>> is still the cleanest solution for now.
>>> 
>>> You mean inside bpf_jit_binary_alloc?
>>> That won't be arch independent.
>>> It needs to be split into generic piece that stays in core.c
>>> and callbacks like bpf_jit_fill_hole_t
>>> or into multiple helpers with prep in-between.
>>> Don't worry if all archs need to be touched.
>> 
>> How about we introduce callback bpf_jit_set_header_size_t? Then we
>> can split x86's jit_fill_hole() into two functions, one to fill the
>> hole, the other to set size. The rest of the logic gonna stay the same.
>> 
>> Archs that do not use bpf_prog_pack won't need bpf_jit_set_header_size_t.
> 
> That's not any better.
> 
> Currently the choice of bpf_jit_binary_alloc_pack vs bpf_jit_binary_alloc
> leaks into arch bits and bpf_prog_pack_max_size() doesn't
> really make it generic.
> 
> Ideally all archs continue to use bpf_jit_binary_alloc()
> and magic happens in a generic code.
> If not then please remove bpf_prog_pack_max_size(),
> since it doesn't provide much value and pick
> bpf_jit_binary_alloc_pack() signature to fit x86 jit better.
> It wouldn't need bpf_jit_fill_hole_t callback at all.
> Please think it through so we don't need to redesign it
> when another arch will decide to use huge pages for bpf progs.
> 
> cc-ing Ilya for ideas on how that would fit s390.

I guess we have a few different questions here:

1. Can we use bpf_jit_binary_alloc() for both regular page and shared 
huge page? 

I think the answer is no, as bpf_jit_binary_alloc() allocates a rw 
buffer, and arch calls bpf_jit_binary_lock_ro after JITing. The new 
allocator will return a slice of a shared huge page, which is locked
RO before JITing. 

2. The problem with bpf_prog_pack_max_size() limitation. 

I think this is the worst part of current version of bpf_prog_pack, 
but it shouldn't be too hard to fix. I will remove this limitation 
in the next version. 

3. How to set proper header->size? 

I guess we can introduce something similar to bpf_arch_text_poke() 
for this? 


My proposal for the next version is:
1. No changes to archs that do not use huge page, just keep using 
   bpf_jit_binary_alloc.

2. For x86_64 (and other arch that would support bpf program on huge
   pages):
   2.1 arch/bpf_jit_comp calls bpf_jit_binary_alloc_pack() to allocate
       an RO bpf_binary_header;
   2.2 arch allocates a temporary buffer for JIT. Once JIT is done, 
       use text_poke_copy to copy the code to the RO bpf_binary_header. 

3. Remove bpf_prog_pack_max_size limitation. 


Does this sound reasonable?

Thanks,
Song
Ilya Leoshkevich Jan. 24, 2022, 12:29 p.m. UTC | #9
On 1/23/22 02:03, Song Liu wrote:
> 
> 
>> On Jan 21, 2022, at 6:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>
>> On Fri, Jan 21, 2022 at 5:30 PM Song Liu <songliubraving@fb.com> wrote:
>>>
>>>
>>>
>>>> On Jan 21, 2022, at 5:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>>>
>>>> On Fri, Jan 21, 2022 at 5:01 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>
>>>>> In this way, we need to allocate rw_image here, and free it in
>>>>> bpf_jit_comp.c. This feels a little weird to me, but I guess that
>>>>> is still the cleanest solution for now.
>>>>
>>>> You mean inside bpf_jit_binary_alloc?
>>>> That won't be arch independent.
>>>> It needs to be split into generic piece that stays in core.c
>>>> and callbacks like bpf_jit_fill_hole_t
>>>> or into multiple helpers with prep in-between.
>>>> Don't worry if all archs need to be touched.
>>>
>>> How about we introduce callback bpf_jit_set_header_size_t? Then we
>>> can split x86's jit_fill_hole() into two functions, one to fill the
>>> hole, the other to set size. The rest of the logic gonna stay the same.
>>>
>>> Archs that do not use bpf_prog_pack won't need bpf_jit_set_header_size_t.
>>
>> That's not any better.
>>
>> Currently the choice of bpf_jit_binary_alloc_pack vs bpf_jit_binary_alloc
>> leaks into arch bits and bpf_prog_pack_max_size() doesn't
>> really make it generic.
>>
>> Ideally all archs continue to use bpf_jit_binary_alloc()
>> and magic happens in a generic code.
>> If not then please remove bpf_prog_pack_max_size(),
>> since it doesn't provide much value and pick
>> bpf_jit_binary_alloc_pack() signature to fit x86 jit better.
>> It wouldn't need bpf_jit_fill_hole_t callback at all.
>> Please think it through so we don't need to redesign it
>> when another arch will decide to use huge pages for bpf progs.
>>
>> cc-ing Ilya for ideas on how that would fit s390.
> 
> I guess we have a few different questions here:
> 
> 1. Can we use bpf_jit_binary_alloc() for both regular page and shared
> huge page?
> 
> I think the answer is no, as bpf_jit_binary_alloc() allocates a rw
> buffer, and arch calls bpf_jit_binary_lock_ro after JITing. The new
> allocator will return a slice of a shared huge page, which is locked
> RO before JITing.
> 
> 2. The problem with bpf_prog_pack_max_size() limitation.
> 
> I think this is the worst part of current version of bpf_prog_pack,
> but it shouldn't be too hard to fix. I will remove this limitation
> in the next version.
> 
> 3. How to set proper header->size?
> 
> I guess we can introduce something similar to bpf_arch_text_poke()
> for this?
> 
> 
> My proposal for the next version is:
> 1. No changes to archs that do not use huge page, just keep using
>     bpf_jit_binary_alloc.
> 
> 2. For x86_64 (and other arch that would support bpf program on huge
>     pages):
>     2.1 arch/bpf_jit_comp calls bpf_jit_binary_alloc_pack() to allocate
>         an RO bpf_binary_header;
>     2.2 arch allocates a temporary buffer for JIT. Once JIT is done,
>         use text_poke_copy to copy the code to the RO bpf_binary_header.

Are arches expected to allocate rw buffers in different ways? If not,
I would consider putting this into the common code as well. Then
arch-specific code would do something like

   header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
   ...
   /*
    * Generate code into prg_buf, the code should assume that its first
    * byte is located at prg_addr.
    */
   ...
   bpf_jit_binary_finalize_pack(header, prg_buf);

where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
free it.

If this won't work, I also don't see any big problems in the scheme
that you propose (especially if bpf_prog_pack_max_size() limitation is
gone).

[...]

Btw, are there any existing benchmarks that I can use to check whether
this is worth enabling on s390?

Best regards,
Ilya
Peter Zijlstra Jan. 24, 2022, 12:45 p.m. UTC | #10
On Sun, Jan 23, 2022 at 01:03:25AM +0000, Song Liu wrote:

> I guess we can introduce something similar to bpf_arch_text_poke() 
> for this? 

IIRC the s390 version of the text_poke_copy() function is called
s390_kernel_write().
Song Liu Jan. 24, 2022, 6:27 p.m. UTC | #11
> On Jan 24, 2022, at 4:29 AM, Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> 
> 
> 
> On 1/23/22 02:03, Song Liu wrote:
>>> On Jan 21, 2022, at 6:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Fri, Jan 21, 2022 at 5:30 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Jan 21, 2022, at 5:12 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>>>> 
>>>>> On Fri, Jan 21, 2022 at 5:01 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> In this way, we need to allocate rw_image here, and free it in
>>>>>> bpf_jit_comp.c. This feels a little weird to me, but I guess that
>>>>>> is still the cleanest solution for now.
>>>>> 
>>>>> You mean inside bpf_jit_binary_alloc?
>>>>> That won't be arch independent.
>>>>> It needs to be split into generic piece that stays in core.c
>>>>> and callbacks like bpf_jit_fill_hole_t
>>>>> or into multiple helpers with prep in-between.
>>>>> Don't worry if all archs need to be touched.
>>>> 
>>>> How about we introduce callback bpf_jit_set_header_size_t? Then we
>>>> can split x86's jit_fill_hole() into two functions, one to fill the
>>>> hole, the other to set size. The rest of the logic gonna stay the same.
>>>> 
>>>> Archs that do not use bpf_prog_pack won't need bpf_jit_set_header_size_t.
>>> 
>>> That's not any better.
>>> 
>>> Currently the choice of bpf_jit_binary_alloc_pack vs bpf_jit_binary_alloc
>>> leaks into arch bits and bpf_prog_pack_max_size() doesn't
>>> really make it generic.
>>> 
>>> Ideally all archs continue to use bpf_jit_binary_alloc()
>>> and magic happens in a generic code.
>>> If not then please remove bpf_prog_pack_max_size(),
>>> since it doesn't provide much value and pick
>>> bpf_jit_binary_alloc_pack() signature to fit x86 jit better.
>>> It wouldn't need bpf_jit_fill_hole_t callback at all.
>>> Please think it through so we don't need to redesign it
>>> when another arch will decide to use huge pages for bpf progs.
>>> 
>>> cc-ing Ilya for ideas on how that would fit s390.
>> I guess we have a few different questions here:
>> 1. Can we use bpf_jit_binary_alloc() for both regular page and shared
>> huge page?
>> I think the answer is no, as bpf_jit_binary_alloc() allocates a rw
>> buffer, and arch calls bpf_jit_binary_lock_ro after JITing. The new
>> allocator will return a slice of a shared huge page, which is locked
>> RO before JITing.
>> 2. The problem with bpf_prog_pack_max_size() limitation.
>> I think this is the worst part of current version of bpf_prog_pack,
>> but it shouldn't be too hard to fix. I will remove this limitation
>> in the next version.
>> 3. How to set proper header->size?
>> I guess we can introduce something similar to bpf_arch_text_poke()
>> for this?
>> My proposal for the next version is:
>> 1. No changes to archs that do not use huge page, just keep using
>>    bpf_jit_binary_alloc.
>> 2. For x86_64 (and other arch that would support bpf program on huge
>>    pages):
>>    2.1 arch/bpf_jit_comp calls bpf_jit_binary_alloc_pack() to allocate
>>        an RO bpf_binary_header;
>>    2.2 arch allocates a temporary buffer for JIT. Once JIT is done,
>>        use text_poke_copy to copy the code to the RO bpf_binary_header.
> 
> Are arches expected to allocate rw buffers in different ways? If not,
> I would consider putting this into the common code as well. Then
> arch-specific code would do something like
> 
>  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
>  ...
>  /*
>   * Generate code into prg_buf, the code should assume that its first
>   * byte is located at prg_addr.
>   */
>  ...
>  bpf_jit_binary_finalize_pack(header, prg_buf);
> 
> where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> free it.

I think this should work. 

We will need an API like bpf_arch_text_copy(), which uses text_poke_copy()
for x86_64 and s390_kernel_write() for s390. We will use bpf_arch_text_copy()
to
  1) write header->size;
  2) do the final copy in bpf_jit_binary_finalize_pack().

The syntax of bpf_arch_text_copy is quite different from the existing
bpf_arch_text_poke, so I guess a new API is better.

> 
> If this won't work, I also don't see any big problems in the scheme
> that you propose (especially if bpf_prog_pack_max_size() limitation is
> gone).
> 
> [...]
> 
> Btw, are there any existing benchmarks that I can use to check whether
> this is worth enabling on s390?

Unfortunately, we don't have a benchmark to share. Most of our benchmarks
are shadow tests that cannot run outside of the production environment. We
have issues with iTLB misses for most of our big services. A typical system
may see hundreds of iTLB misses per million instructions. Some sched_cls
programs are often the top triggers of these iTLB misses.

Thanks,
Song
Alexei Starovoitov Jan. 25, 2022, 5:21 a.m. UTC | #12
On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> >
> > Are arches expected to allocate rw buffers in different ways? If not,
> > I would consider putting this into the common code as well. Then
> > arch-specific code would do something like
> >
> >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> >  ...
> >  /*
> >   * Generate code into prg_buf, the code should assume that its first
> >   * byte is located at prg_addr.
> >   */
> >  ...
> >  bpf_jit_binary_finalize_pack(header, prg_buf);
> >
> > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > free it.

It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
dependent. The only thing it will do is perform a copy via text_poke.
What else?

> I think this should work.
>
> We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> to
>   1) write header->size;
>   2) do finally copy in bpf_jit_binary_finalize_pack().

we can combine all text_poke operations into one.

Can we add an 'image' pointer into struct bpf_binary_header ?
Then do:
int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);

ro_hdr->image would be the address used to compute offsets by JIT.
rw_hdr->image would point to kvmalloc-ed area for emitting insns.
rw_hdr->size would already be populated.

The JITs would write insns into rw_hdr->image including 'int 3' insns.
At the end the JIT will do text_poke_copy(ro_hdr, rw_hdr, rw_hdr->size);
That would be the only copy that will transfer everything into final
location.
Then kvfree(rw_hdr)

wdyt?
Song Liu Jan. 25, 2022, 7:21 a.m. UTC | #13
On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > >
> > > Are arches expected to allocate rw buffers in different ways? If not,
> > > I would consider putting this into the common code as well. Then
> > > arch-specific code would do something like
> > >
> > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > >  ...
> > >  /*
> > >   * Generate code into prg_buf, the code should assume that its first
> > >   * byte is located at prg_addr.
> > >   */
> > >  ...
> > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > >
> > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > free it.
>
> It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> dependent. The only thing it will do is perform a copy via text_poke.
> What else?
>
> > I think this should work.
> >
> > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> > to
> >   1) write header->size;
> >   2) do finally copy in bpf_jit_binary_finalize_pack().
>
> we can combine all text_poke operations into one.
>
> Can we add an 'image' pointer into struct bpf_binary_header ?

There is a 4-byte hole in bpf_binary_header. How about we put
image_offset there? Actually we only need 2 bytes for offset.

> Then do:
> int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
>
> ro_hdr->image would be the address used to compute offsets by JIT.

If we only do one text_poke(), we cannot write ro_hdr->image yet. We
can use ro_hdr + rw_hdr->image_offset instead.

> rw_hdr->image would point to kvmalloc-ed area for emitting insns.
> rw_hdr->size would already be populated.
>
> The JITs would write insns into rw_hdr->image including 'int 3' insns.
> At the end the JIT will do text_poke_copy(ro_hdr, rw_hdr, rw_hdr->size);
> That would be the only copy that will transfer everything into final
> location.
> Then kvfree(rw_hdr)

The only problem is the asymmetry of allocating rw_hdr from bpf/core.c,
and freeing it from arch/bpf_jit_comp.c. But it doesn't bother me too much.

Thanks,
Song
Alexei Starovoitov Jan. 25, 2022, 7:59 p.m. UTC | #14
On Mon, Jan 24, 2022 at 11:21 PM Song Liu <song@kernel.org> wrote:
>
> On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > > >
> > > > Are arches expected to allocate rw buffers in different ways? If not,
> > > > I would consider putting this into the common code as well. Then
> > > > arch-specific code would do something like
> > > >
> > > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > > >  ...
> > > >  /*
> > > >   * Generate code into prg_buf, the code should assume that its first
> > > >   * byte is located at prg_addr.
> > > >   */
> > > >  ...
> > > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > > >
> > > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > > free it.
> >
> > It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> > dependent. The only thing it will do is perform a copy via text_poke.
> > What else?
> >
> > > I think this should work.
> > >
> > > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > > for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> > > to
> > >   1) write header->size;
> > >   2) do finally copy in bpf_jit_binary_finalize_pack().
> >
> > we can combine all text_poke operations into one.
> >
> > Can we add an 'image' pointer into struct bpf_binary_header ?
>
> There is a 4-byte hole in bpf_binary_header. How about we put
> image_offset there? Actually we only need 2 bytes for offset.
>
> > Then do:
> > int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
> >
> > ro_hdr->image would be the address used to compute offsets by JIT.
>
> If we only do one text_poke(), we cannot write ro_hdr->image yet. We
> can use ro_hdr + rw_hdr->image_offset instead.

Good points.
Maybe let's go back to Ilya's suggestion and return 4 pointers
from bpf_jit_binary_alloc_pack ?

> > rw_hdr->image would point to kvmalloc-ed area for emitting insns.
> > rw_hdr->size would already be populated.
> >
> > The JITs would write insns into rw_hdr->image including 'int 3' insns.
> > At the end the JIT will do text_poke_copy(ro_hdr, rw_hdr, rw_hdr->size);
> > That would be the only copy that will transfer everything into final
> > location.
> > Then kvfree(rw_hdr)
>
> The only problem is the asymmetry of allocating rw_hdr from bpf/core.c,
> and freeing it from arch/bpf_jit_comp.c. But it doesn't bother me too much.

Indeed. Asymmetry needs to be fixed.
Let's then pass 4 pointers back into
bpf_jit_binary_finalize_pack(),
which will call an arch-dependent __weak function to do text_poke_copy
(or use the default __weak function that returns -EOPNOTSUPP)
and then kvfree the rw_hdr?
I'd like to avoid callbacks. IMO __weak is easier to follow.
Song Liu Jan. 25, 2022, 10:25 p.m. UTC | #15
On Tue, Jan 25, 2022 at 12:00 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jan 24, 2022 at 11:21 PM Song Liu <song@kernel.org> wrote:
> >
> > On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > > > >
> > > > > Are arches expected to allocate rw buffers in different ways? If not,
> > > > > I would consider putting this into the common code as well. Then
> > > > > arch-specific code would do something like
> > > > >
> > > > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > > > >  ...
> > > > >  /*
> > > > >   * Generate code into prg_buf, the code should assume that its first
> > > > >   * byte is located at prg_addr.
> > > > >   */
> > > > >  ...
> > > > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > > > >
> > > > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > > > free it.
> > >
> > > It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> > > dependent. The only thing it will do is perform a copy via text_poke.
> > > What else?
> > >
> > > > I think this should work.
> > > >
> > > > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > > > for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> > > > to
> > > >   1) write header->size;
> > > >   2) do finally copy in bpf_jit_binary_finalize_pack().
> > >
> > > we can combine all text_poke operations into one.
> > >
> > > Can we add an 'image' pointer into struct bpf_binary_header ?
> >
> > There is a 4-byte hole in bpf_binary_header. How about we put
> > image_offset there? Actually we only need 2 bytes for offset.
> >
> > > Then do:
> > > int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
> > >
> > > ro_hdr->image would be the address used to compute offsets by JIT.
> >
> > If we only do one text_poke(), we cannot write ro_hdr->image yet. We
> > can use ro_hdr + rw_hdr->image_offset instead.
>
> Good points.
> Maybe let's go back to Ilya's suggestion and return 4 pointers
> from bpf_jit_binary_alloc_pack ?

How about we use image_offset, like:

struct bpf_binary_header {
        u32 size;
        u32 image_offset;
        u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};

Then we can use

image = (void *)header + header->image_offset;

In this way, we will only have two output pointers.

>
> > > rw_hdr->image would point to kvmalloc-ed area for emitting insns.
> > > rw_hdr->size would already be populated.
> > >
> > > The JITs would write insns into rw_hdr->image including 'int 3' insns.
> > > At the end the JIT will do text_poke_copy(ro_hdr, rw_hdr, rw_hdr->size);
> > > That would be the only copy that will transfer everything into final
> > > location.
> > > Then kvfree(rw_hdr)
> >
> > The only problem is the asymmetry of allocating rw_hdr from bpf/core.c,
> > and freeing it from arch/bpf_jit_comp.c. But it doesn't bother me too much.
>
> Indeed. Asymmetry needs to be fixed.
> Let's then pass 4 pointers back into
> bpf_jit_binary_finalize_pack()
> which will call arch dependent weak function to do text_poke_copy
> or use default __weak function that returns eopnotsupp
> and then kvfree the rw_hdr ?
> I'd like to avoid callbacks. imo __weak is easier to follow.

Yeah, I also like __weak function better.

Thanks,
Song
Alexei Starovoitov Jan. 25, 2022, 10:48 p.m. UTC | #16
On Tue, Jan 25, 2022 at 2:25 PM Song Liu <song@kernel.org> wrote:
>
> On Tue, Jan 25, 2022 at 12:00 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Jan 24, 2022 at 11:21 PM Song Liu <song@kernel.org> wrote:
> > >
> > > On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > > > > >
> > > > > > Are arches expected to allocate rw buffers in different ways? If not,
> > > > > > I would consider putting this into the common code as well. Then
> > > > > > arch-specific code would do something like
> > > > > >
> > > > > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > > > > >  ...
> > > > > >  /*
> > > > > >   * Generate code into prg_buf, the code should assume that its first
> > > > > >   * byte is located at prg_addr.
> > > > > >   */
> > > > > >  ...
> > > > > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > > > > >
> > > > > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > > > > free it.
> > > >
> > > > It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> > > > dependent. The only thing it will do is perform a copy via text_poke.
> > > > What else?
> > > >
> > > > > I think this should work.
> > > > >
> > > > > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > > > > for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> > > > > to
> > > > >   1) write header->size;
> > > > >   2) do finally copy in bpf_jit_binary_finalize_pack().
> > > >
> > > > we can combine all text_poke operations into one.
> > > >
> > > > Can we add an 'image' pointer into struct bpf_binary_header ?
> > >
> > > There is a 4-byte hole in bpf_binary_header. How about we put
> > > image_offset there? Actually we only need 2 bytes for offset.
> > >
> > > > Then do:
> > > > int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
> > > >
> > > > ro_hdr->image would be the address used to compute offsets by JIT.
> > >
> > > If we only do one text_poke(), we cannot write ro_hdr->image yet. We
> > > can use ro_hdr + rw_hdr->image_offset instead.
> >
> > Good points.
> > Maybe let's go back to Ilya's suggestion and return 4 pointers
> > from bpf_jit_binary_alloc_pack ?
>
> How about we use image_offset, like:
>
> struct bpf_binary_header {
>         u32 size;
>         u32 image_offset;
>         u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
> };
>
> Then we can use
>
> image = (void *)header + header->image_offset;

I'm not excited about it, since it leaks header details into JITs.
Looks like we don't need JIT to be aware of it.
How about we do random() % roundup(sizeof(struct bpf_binary_header), 64)
to pick the image start and populate
image-sizeof(struct bpf_binary_header) range
with 'int 3'.
This way we can completely hide binary_header inside generic code.
The bpf_jit_binary_alloc_pack() would return ro_image and rw_image only.
And JIT would pass them back into bpf_jit_binary_finalize_pack().
From the image pointer it would be trivial to get to binary_header with &63.
The 128 byte offset that we use today was chosen arbitrarily.
We were burning the whole page for a single program, so a 128-byte zone
at the front was ok.
Now we will be packing progs rounded up to 64 bytes, so it's better
to avoid wasting those 128 bytes regardless.
Song Liu Jan. 25, 2022, 11:09 p.m. UTC | #17
On Tue, Jan 25, 2022 at 2:48 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jan 25, 2022 at 2:25 PM Song Liu <song@kernel.org> wrote:
> >
> > On Tue, Jan 25, 2022 at 12:00 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Jan 24, 2022 at 11:21 PM Song Liu <song@kernel.org> wrote:
> > > >
> > > > On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > > > > > >
> > > > > > > Are arches expected to allocate rw buffers in different ways? If not,
> > > > > > > I would consider putting this into the common code as well. Then
> > > > > > > arch-specific code would do something like
> > > > > > >
> > > > > > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > > > > > >  ...
> > > > > > >  /*
> > > > > > >   * Generate code into prg_buf, the code should assume that its first
> > > > > > >   * byte is located at prg_addr.
> > > > > > >   */
> > > > > > >  ...
> > > > > > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > > > > > >
> > > > > > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > > > > > free it.
> > > > >
> > > > > It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> > > > > dependent. The only thing it will do is perform a copy via text_poke.
> > > > > What else?
> > > > >
> > > > > > I think this should work.
> > > > > >
> > > > > > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > > > > > for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> > > > > > to
> > > > > >   1) write header->size;
> > > > > >   2) do finally copy in bpf_jit_binary_finalize_pack().
> > > > >
> > > > > we can combine all text_poke operations into one.
> > > > >
> > > > > Can we add an 'image' pointer into struct bpf_binary_header ?
> > > >
> > > > There is a 4-byte hole in bpf_binary_header. How about we put
> > > > image_offset there? Actually we only need 2 bytes for offset.
> > > >
> > > > > Then do:
> > > > > int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
> > > > >
> > > > > ro_hdr->image would be the address used to compute offsets by JIT.
> > > >
> > > > If we only do one text_poke(), we cannot write ro_hdr->image yet. We
> > > > can use ro_hdr + rw_hdr->image_offset instead.
> > >
> > > Good points.
> > > Maybe let's go back to Ilya's suggestion and return 4 pointers
> > > from bpf_jit_binary_alloc_pack ?
> >
> > How about we use image_offset, like:
> >
> > struct bpf_binary_header {
> >         u32 size;
> >         u32 image_offset;
> >         u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
> > };
> >
> > Then we can use
> >
> > image = (void *)header + header->image_offset;
>
> I'm not excited about it, since it leaks header details into JITs.
> Looks like we don't need JIT to be aware of it.
> How about we do random() % roundup(sizeof(struct bpf_binary_header), 64)
> to pick the image start and populate
> image-sizeof(struct bpf_binary_header) range
> with 'int 3'.
> This way we can completely hide binary_header inside generic code.
> The bpf_jit_binary_alloc_pack() would return ro_image and rw_image only.
> And JIT would pass them back into bpf_jit_binary_finalize_pack().
> From the image pointer it would be trivial to get to binary_header with &63.
> The 128 byte offset that we use today was chosen arbitrarily.
> We were burning the whole page for a single program, so 128 bytes zone
> at the front was ok.
> Now we will be packing progs rounded up to 64 bytes, so it's better
> to avoid wasting those 128 bytes regardless.

In bpf_jit_binary_hdr(), we calculate header as image & PAGE_MASK.
If we want s/PAGE_MASK/63 for x86_64, we will have different versions
of bpf_jit_binary_hdr(). It is not on any hot path, so we can use __weak for
it. Other than this, I think the solution works fine.

Thanks,
Song
Alexei Starovoitov Jan. 26, 2022, 12:38 a.m. UTC | #18
On Tue, Jan 25, 2022 at 3:09 PM Song Liu <song@kernel.org> wrote:
>
> On Tue, Jan 25, 2022 at 2:48 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Jan 25, 2022 at 2:25 PM Song Liu <song@kernel.org> wrote:
> > >
> > > On Tue, Jan 25, 2022 at 12:00 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Mon, Jan 24, 2022 at 11:21 PM Song Liu <song@kernel.org> wrote:
> > > > >
> > > > > On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
> > > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > > > > > > >
> > > > > > > > Are arches expected to allocate rw buffers in different ways? If not,
> > > > > > > > I would consider putting this into the common code as well. Then
> > > > > > > > arch-specific code would do something like
> > > > > > > >
> > > > > > > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > > > > > > >  ...
> > > > > > > >  /*
> > > > > > > >   * Generate code into prg_buf, the code should assume that its first
> > > > > > > >   * byte is located at prg_addr.
> > > > > > > >   */
> > > > > > > >  ...
> > > > > > > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > > > > > > >
> > > > > > > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > > > > > > free it.
> > > > > >
> > > > > > It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> > > > > > dependent. The only thing it will do is perform a copy via text_poke.
> > > > > > What else?
> > > > > >
> > > > > > > I think this should work.
> > > > > > >
> > > > > > > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > > > > > > for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> > > > > > > to
> > > > > > >   1) write header->size;
> > > > > > >   2) do finally copy in bpf_jit_binary_finalize_pack().
> > > > > >
> > > > > > we can combine all text_poke operations into one.
> > > > > >
> > > > > > Can we add an 'image' pointer into struct bpf_binary_header ?
> > > > >
> > > > > There is a 4-byte hole in bpf_binary_header. How about we put
> > > > > image_offset there? Actually we only need 2 bytes for offset.
> > > > >
> > > > > > Then do:
> > > > > > int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
> > > > > >
> > > > > > ro_hdr->image would be the address used to compute offsets by JIT.
> > > > >
> > > > > If we only do one text_poke(), we cannot write ro_hdr->image yet. We
> > > > > can use ro_hdr + rw_hdr->image_offset instead.
> > > >
> > > > Good points.
> > > > Maybe let's go back to Ilya's suggestion and return 4 pointers
> > > > from bpf_jit_binary_alloc_pack ?
> > >
> > > How about we use image_offset, like:
> > >
> > > struct bpf_binary_header {
> > >         u32 size;
> > >         u32 image_offset;
> > >         u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
> > > };
> > >
> > > Then we can use
> > >
> > > image = (void *)header + header->image_offset;
> >
> > I'm not excited about it, since it leaks header details into JITs.
> > Looks like we don't need JIT to be aware of it.
> > How about we do random() % roundup(sizeof(struct bpf_binary_header), 64)
> > to pick the image start and populate
> > image-sizeof(struct bpf_binary_header) range
> > with 'int 3'.
> > This way we can completely hide binary_header inside generic code.
> > The bpf_jit_binary_alloc_pack() would return ro_image and rw_image only.
> > And JIT would pass them back into bpf_jit_binary_finalize_pack().
> > From the image pointer it would be trivial to get to binary_header with &63.
> > The 128 byte offset that we use today was chosen arbitrarily.
> > We were burning the whole page for a single program, so 128 bytes zone
> > at the front was ok.
> > Now we will be packing progs rounded up to 64 bytes, so it's better
> > to avoid wasting those 128 bytes regardless.
>
> In bpf_jit_binary_hdr(), we calculate header as image & PAGE_MASK.
> If we want s/PAGE_MASK/63 for x86_64, we will have different versions
> of bpf_jit_binary_hdr(). It is not on any hot path, so we can use __weak for
> it. Other than this, I think the solution works fine.

I think it can stay generic.

The existing bpf_jit_binary_hdr() will do & PAGE_MASK
while bpf_jit_binary_hdr_pack() will do & 63.



> Thanks,
> Song
Song Liu Jan. 26, 2022, 12:50 a.m. UTC | #19
On Tue, Jan 25, 2022 at 4:38 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jan 25, 2022 at 3:09 PM Song Liu <song@kernel.org> wrote:
> >
> > On Tue, Jan 25, 2022 at 2:48 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Jan 25, 2022 at 2:25 PM Song Liu <song@kernel.org> wrote:
> > > >
> > > > On Tue, Jan 25, 2022 at 12:00 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jan 24, 2022 at 11:21 PM Song Liu <song@kernel.org> wrote:
> > > > > >
> > > > > > On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
> > > > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > > > > > > > >
> > > > > > > > > Are arches expected to allocate rw buffers in different ways? If not,
> > > > > > > > > I would consider putting this into the common code as well. Then
> > > > > > > > > arch-specific code would do something like
> > > > > > > > >
> > > > > > > > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > > > > > > > >  ...
> > > > > > > > >  /*
> > > > > > > > >   * Generate code into prg_buf, the code should assume that its first
> > > > > > > > >   * byte is located at prg_addr.
> > > > > > > > >   */
> > > > > > > > >  ...
> > > > > > > > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > > > > > > > >
> > > > > > > > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > > > > > > > free it.
> > > > > > >
> > > > > > > It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> > > > > > > dependent. The only thing it will do is perform a copy via text_poke.
> > > > > > > What else?
> > > > > > >
> > > > > > > > I think this should work.
> > > > > > > >
> > > > > > > > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > > > > > > > for x86_64 and s390_kernel_write() for x390. We will use bpf_arch_text_copy
> > > > > > > > to
> > > > > > > >   1) write header->size;
> > > > > > > >   2) do finally copy in bpf_jit_binary_finalize_pack().
> > > > > > >
> > > > > > > we can combine all text_poke operations into one.
> > > > > > >
> > > > > > > Can we add an 'image' pointer into struct bpf_binary_header ?
> > > > > >
> > > > > > There is a 4-byte hole in bpf_binary_header. How about we put
> > > > > > image_offset there? Actually we only need 2 bytes for offset.
> > > > > >
> > > > > > > Then do:
> > > > > > > int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
> > > > > > >
> > > > > > > ro_hdr->image would be the address used to compute offsets by JIT.
> > > > > >
> > > > > > If we only do one text_poke(), we cannot write ro_hdr->image yet. We
> > > > > > can use ro_hdr + rw_hdr->image_offset instead.
> > > > >
> > > > > Good points.
> > > > > Maybe let's go back to Ilya's suggestion and return 4 pointers
> > > > > from bpf_jit_binary_alloc_pack ?
> > > >
> > > > How about we use image_offset, like:
> > > >
> > > > struct bpf_binary_header {
> > > >         u32 size;
> > > >         u32 image_offset;
> > > >         u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
> > > > };
> > > >
> > > > Then we can use
> > > >
> > > > image = (void *)header + header->image_offset;
> > >
> > > I'm not excited about it, since it leaks header details into JITs.
> > > Looks like we don't need JIT to be aware of it.
> > > How about we do random() % roundup(sizeof(struct bpf_binary_header), 64)
> > > to pick the image start and populate
> > > image-sizeof(struct bpf_binary_header) range
> > > with 'int 3'.
> > > This way we can completely hide binary_header inside generic code.
> > > The bpf_jit_binary_alloc_pack() would return ro_image and rw_image only.
> > > And JIT would pass them back into bpf_jit_binary_finalize_pack().
> > > From the image pointer it would be trivial to get to binary_header with &63.
> > > The 128 byte offset that we use today was chosen arbitrarily.
> > > We were burning the whole page for a single program, so 128 bytes zone
> > > at the front was ok.
> > > Now we will be packing progs rounded up to 64 bytes, so it's better
> > > to avoid wasting those 128 bytes regardless.
> >
> > In bpf_jit_binary_hdr(), we calculate header as image & PAGE_MASK.
> > If we want s/PAGE_MASK/63 for x86_64, we will have different versions
> > of bpf_jit_binary_hdr(). It is not on any hot path, so we can use __weak for
> > it. Other than this, I think the solution works fine.
>
> I think it can stay generic.
>
> The existing bpf_jit_binary_hdr() will do & PAGE_MASK
> while bpf_jit_binary_hdr_pack() will do & 63.

The problem with this approach is that we need bpf_prog_ksym_set_addr
to be smart enough to pick bpf_jit_binary_hdr() or bpf_jit_binary_hdr_pack().

Song
Alexei Starovoitov Jan. 26, 2022, 1:20 a.m. UTC | #20
On Tue, Jan 25, 2022 at 4:50 PM Song Liu <song@kernel.org> wrote:
>
> On Tue, Jan 25, 2022 at 4:38 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Jan 25, 2022 at 3:09 PM Song Liu <song@kernel.org> wrote:
> > >
> > > On Tue, Jan 25, 2022 at 2:48 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Tue, Jan 25, 2022 at 2:25 PM Song Liu <song@kernel.org> wrote:
> > > > >
> > > > > On Tue, Jan 25, 2022 at 12:00 PM Alexei Starovoitov
> > > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jan 24, 2022 at 11:21 PM Song Liu <song@kernel.org> wrote:
> > > > > > >
> > > > > > > On Mon, Jan 24, 2022 at 9:21 PM Alexei Starovoitov
> > > > > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Jan 24, 2022 at 10:27 AM Song Liu <songliubraving@fb.com> wrote:
> > > > > > > > > >
> > > > > > > > > > Are arches expected to allocate rw buffers in different ways? If not,
> > > > > > > > > > I would consider putting this into the common code as well. Then
> > > > > > > > > > arch-specific code would do something like
> > > > > > > > > >
> > > > > > > > > >  header = bpf_jit_binary_alloc_pack(size, &prg_buf, &prg_addr, ...);
> > > > > > > > > >  ...
> > > > > > > > > >  /*
> > > > > > > > > >   * Generate code into prg_buf, the code should assume that its first
> > > > > > > > > >   * byte is located at prg_addr.
> > > > > > > > > >   */
> > > > > > > > > >  ...
> > > > > > > > > >  bpf_jit_binary_finalize_pack(header, prg_buf);
> > > > > > > > > >
> > > > > > > > > > where bpf_jit_binary_finalize_pack() would copy prg_buf to header and
> > > > > > > > > > free it.
> > > > > > > >
> > > > > > > > It feels right, but bpf_jit_binary_finalize_pack() sounds 100% arch
> > > > > > > > dependent. The only thing it will do is perform a copy via text_poke.
> > > > > > > > What else?
> > > > > > > >
> > > > > > > > > I think this should work.
> > > > > > > > >
> > > > > > > > > > We will need an API like: bpf_arch_text_copy, which uses text_poke_copy()
> > > > > > > > > > for x86_64 and s390_kernel_write() for s390. We will use bpf_arch_text_copy
> > > > > > > > > > to
> > > > > > > > > >   1) write header->size;
> > > > > > > > > >   2) do the final copy in bpf_jit_binary_finalize_pack().
> > > > > > > >
> > > > > > > > we can combine all text_poke operations into one.
> > > > > > > >
> > > > > > > > Can we add an 'image' pointer into struct bpf_binary_header ?
> > > > > > >
> > > > > > > There is a 4-byte hole in bpf_binary_header. How about we put
> > > > > > > image_offset there? Actually we only need 2 bytes for offset.
> > > > > > >
> > > > > > > > Then do:
> > > > > > > > int bpf_jit_binary_alloc_pack(size, &ro_hdr, &rw_hdr);
> > > > > > > >
> > > > > > > > ro_hdr->image would be the address used to compute offsets by JIT.
> > > > > > >
> > > > > > > If we only do one text_poke(), we cannot write ro_hdr->image yet. We
> > > > > > > can use ro_hdr + rw_hdr->image_offset instead.
> > > > > >
> > > > > > Good points.
> > > > > > Maybe let's go back to Ilya's suggestion and return 4 pointers
> > > > > > from bpf_jit_binary_alloc_pack ?
> > > > >
> > > > > How about we use image_offset, like:
> > > > >
> > > > > struct bpf_binary_header {
> > > > >         u32 size;
> > > > >         u32 image_offset;
> > > > >         u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
> > > > > };
> > > > >
> > > > > Then we can use
> > > > >
> > > > > image = (void *)header + header->image_offset;
> > > >
> > > > I'm not excited about it, since it leaks header details into JITs.
> > > > Looks like we don't need JIT to be aware of it.
> > > > How about we do random() % roundup(sizeof(struct bpf_binary_header), 64)
> > > > to pick the image start and populate
> > > > image-sizeof(struct bpf_binary_header) range
> > > > with 'int 3'.
> > > > This way we can completely hide binary_header inside generic code.
> > > > The bpf_jit_binary_alloc_pack() would return ro_image and rw_image only.
> > > > And JIT would pass them back into bpf_jit_binary_finalize_pack().
> > > > From the image pointer it would be trivial to get to binary_header with &63.
> > > > The 128 byte offset that we use today was chosen arbitrarily.
> > > > We were burning the whole page for a single program, so 128 bytes zone
> > > > at the front was ok.
> > > > Now we will be packing progs rounded up to 64 bytes, so it's better
> > > > to avoid wasting those 128 bytes regardless.
> > >
> > > In bpf_jit_binary_hdr(), we calculate header as image & PAGE_MASK.
> > > If we want s/PAGE_MASK/63 for x86_64, we will have different versions
> > > of bpf_jit_binary_hdr(). It is not on any hot path, so we can use __weak for
> > > it. Other than this, I think the solution works fine.
> >
> > I think it can stay generic.
> >
> > The existing bpf_jit_binary_hdr() will do & PAGE_MASK
> > while bpf_jit_binary_hdr_pack() will do & 63.
>
> The problem with this approach is that we need bpf_prog_ksym_set_addr
> to be smart to pick bpf_jit_binary_hdr() or bpf_jit_binary_hdr_pack().

We can probably add a true JIT image size to bpf_prog_aux.
bpf_prog_ksym_set_addr() is approximating the end:
prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE
which doesn't have to include all the 'int 3' padding after the end.

Or add a flag to bpf_prog_aux.
Ideally bpf_jit_free() would stay generic too.
Song Liu Jan. 26, 2022, 1:28 a.m. UTC | #21
On Tue, Jan 25, 2022 at 5:20 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jan 25, 2022 at 4:50 PM Song Liu <song@kernel.org> wrote:
> >
> > On Tue, Jan 25, 2022 at 4:38 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
[...]
> > > >
> > > > In bpf_jit_binary_hdr(), we calculate header as image & PAGE_MASK.
> > > > If we want s/PAGE_MASK/63 for x86_64, we will have different versions
> > > > of bpf_jit_binary_hdr(). It is not on any hot path, so we can use __weak for
> > > > it. Other than this, I think the solution works fine.
> > >
> > > I think it can stay generic.
> > >
> > > The existing bpf_jit_binary_hdr() will do & PAGE_MASK
> > > while bpf_jit_binary_hdr_pack() will do & 63.
> >
> > The problem with this approach is that we need bpf_prog_ksym_set_addr
> > to be smart to pick bpf_jit_binary_hdr() or bpf_jit_binary_hdr_pack().
>
> We can probably add a true JIT image size to bpf_prog_aux.
> bpf_prog_ksym_set_addr() is approximating the end:
> prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE
> which doesn't have to include all the 'int 3' padding after the end.
>
> Or add a flag to bpf_prog_aux.
> Ideally bpf_jit_free() would stay generic too.

Both ideas sound promising. Let me try to implement them and see
which is better (or maybe we get both).

Thanks for the suggestions!
Song
Song Liu Jan. 26, 2022, 1:31 a.m. UTC | #22
On Tue, Jan 25, 2022 at 5:28 PM Song Liu <song@kernel.org> wrote:
>
> On Tue, Jan 25, 2022 at 5:20 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Jan 25, 2022 at 4:50 PM Song Liu <song@kernel.org> wrote:
> > >
> > > On Tue, Jan 25, 2022 at 4:38 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> [...]
> > > > >
> > > > > In bpf_jit_binary_hdr(), we calculate header as image & PAGE_MASK.
> > > > > If we want s/PAGE_MASK/63 for x86_64, we will have different versions
> > > > > of bpf_jit_binary_hdr(). It is not on any hot path, so we can use __weak for
> > > > > it. Other than this, I think the solution works fine.
> > > >
> > > > I think it can stay generic.
> > > >
> > > > The existing bpf_jit_binary_hdr() will do & PAGE_MASK
> > > > while bpf_jit_binary_hdr_pack() will do & 63.
> > >
> > > The problem with this approach is that we need bpf_prog_ksym_set_addr
> > > to be smart to pick bpf_jit_binary_hdr() or bpf_jit_binary_hdr_pack().
> >
> > We can probably add a true JIT image size to bpf_prog_aux.
> > bpf_prog_ksym_set_addr() is approximating the end:
> > prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE
> > which doesn't have to include all the 'int 3' padding after the end.

Actually, we can use prog->jited_len in bpf_prog_ksym_set_addr(), right?

Song
Alexei Starovoitov Jan. 26, 2022, 1:34 a.m. UTC | #23
On 1/25/22 5:31 PM, Song Liu wrote:
> On Tue, Jan 25, 2022 at 5:28 PM Song Liu <song@kernel.org> wrote:
>>
>> On Tue, Jan 25, 2022 at 5:20 PM Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>>>
>>> On Tue, Jan 25, 2022 at 4:50 PM Song Liu <song@kernel.org> wrote:
>>>>
>>>> On Tue, Jan 25, 2022 at 4:38 PM Alexei Starovoitov
>>>> <alexei.starovoitov@gmail.com> wrote:
>>>>>
>> [...]
>>>>>>
>>>>>> In bpf_jit_binary_hdr(), we calculate header as image & PAGE_MASK.
>>>>>> If we want s/PAGE_MASK/63 for x86_64, we will have different versions
>>>>>> of bpf_jit_binary_hdr(). It is not on any hot path, so we can use __weak for
>>>>>> it. Other than this, I think the solution works fine.
>>>>>
>>>>> I think it can stay generic.
>>>>>
>>>>> The existing bpf_jit_binary_hdr() will do & PAGE_MASK
>>>>> while bpf_jit_binary_hdr_pack() will do & 63.
>>>>
>>>> The problem with this approach is that we need bpf_prog_ksym_set_addr
>>>> to be smart to pick bpf_jit_binary_hdr() or bpf_jit_binary_hdr_pack().
>>>
>>> We can probably add a true JIT image size to bpf_prog_aux.
>>> bpf_prog_ksym_set_addr() is approximating the end:
>>> prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE
>>> which doesn't have to include all the 'int 3' padding after the end.
> 
> Actually, we can use prog->jited_len in bpf_prog_ksym_set_addr(), right?

Lol. Yeah. We should. Looks like somebody remembers their own
code in perf_event_bpf_emit_ksymbols() ;)
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 27ea68604c22..a58658442d2e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1074,6 +1074,13 @@  void *bpf_jit_alloc_exec(unsigned long size);
 void bpf_jit_free_exec(void *addr);
 void bpf_jit_free(struct bpf_prog *fp);
 
+struct bpf_binary_header *
+bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_r_ptr,
+			  unsigned int alignment,
+			  bpf_jit_fill_hole_t bpf_fill_ill_insns);
+void bpf_jit_binary_free_pack(struct bpf_binary_header *hdr);
+int bpf_prog_pack_max_size(void);
+
 int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 				struct bpf_jit_poke_descriptor *poke);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f252d8529b0b..a912818a5205 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -808,6 +808,116 @@  int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 	return slot;
 }
 
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a whole page for each
+ * program is sometimes a waste. Many small bpf programs also add pressure
+ * to the instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_PACK_SIZE	HPAGE_PMD_SIZE	/* bytes of program memory per pack */
+#define BPF_PROG_CHUNK_SHIFT	6	/* log2 of the chunk size */
+#define BPF_PROG_CHUNK_SIZE	(1 << BPF_PROG_CHUNK_SHIFT)	/* allocation unit: 64 bytes */
+#define BPF_PROG_CHUNK_COUNT	(BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)	/* chunks (bitmap bits) per pack */
+
+struct bpf_prog_pack {
+	struct list_head list;	/* entry in pack_list */
+	void *ptr;	/* pack memory obtained from module_alloc() */
+	unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];	/* one bit per chunk; set means in use */
+};
+
+#define BPF_PROG_MAX_PACK_PROG_SIZE	HPAGE_PMD_SIZE	/* larger rounded sizes are not served from a pack */
+#define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)	/* chunks needed for size bytes */
+
+static DEFINE_MUTEX(pack_mutex);	/* held while walking pack_list or touching a pack bitmap */
+static LIST_HEAD(pack_list);	/* all packs currently allocated */
+
+/*
+ * Allocate a new, empty pack and append it to pack_list.
+ *
+ * The pack memory comes from module_alloc() and is switched to read-only
+ * and executable before the pack is returned. The only caller,
+ * bpf_prog_pack_alloc(), invokes this with pack_mutex held.
+ *
+ * Returns the new pack, or NULL if either allocation fails.
+ */
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+	struct bpf_prog_pack *pack;
+
+	pack = kzalloc(sizeof(*pack), GFP_KERNEL);
+	if (!pack)
+		return NULL;
+	pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+	if (!pack->ptr) {
+		kfree(pack);
+		return NULL;
+	}
+	bitmap_zero(pack->bitmap, BPF_PROG_CHUNK_COUNT);
+	list_add_tail(&pack->list, &pack_list);
+
+	/*
+	 * The permission-reset flag must be set on the module_alloc()'ed
+	 * region (pack->ptr), not on the kzalloc()'ed bookkeeping struct.
+	 */
+	set_vm_flush_reset_perms(pack->ptr);
+	set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	return pack;
+}
+
+/*
+ * Carve @size bytes, rounded up to whole chunks, out of a pack.
+ *
+ * Existing packs are searched in list order for a large enough run of free
+ * chunks. If none has room, a new pack is allocated and its first chunks
+ * are used.
+ *
+ * Returns the address of the first chunk, or NULL when a new pack could not
+ * be allocated.
+ */
+static void *bpf_prog_pack_alloc(u32 size)
+{
+	unsigned int nchunks = BPF_PROG_SIZE_TO_NBITS(size);
+	struct bpf_prog_pack *pack;
+	unsigned long first;
+	void *addr = NULL;
+
+	mutex_lock(&pack_mutex);
+
+	list_for_each_entry(pack, &pack_list, list) {
+		first = bitmap_find_next_zero_area(pack->bitmap,
+						   BPF_PROG_CHUNK_COUNT, 0,
+						   nchunks, 0);
+		if (first < BPF_PROG_CHUNK_COUNT)
+			goto claim_chunks;
+	}
+
+	/* No existing pack has room: start a fresh one at chunk 0. */
+	pack = alloc_new_pack();
+	if (!pack)
+		goto unlock;
+	first = 0;
+
+claim_chunks:
+	bitmap_set(pack->bitmap, first, nchunks);
+	addr = pack->ptr + (first << BPF_PROG_CHUNK_SHIFT);
+
+unlock:
+	mutex_unlock(&pack_mutex);
+	return addr;
+}
+
+/*
+ * Give the chunks backing @hdr back to their pack, and free the whole pack
+ * once none of its chunks is in use any more.
+ *
+ * @hdr: start address of a region handed out by bpf_prog_pack_alloc().
+ *       hdr->size is read to work out how many chunks to release.
+ *
+ * Emits a one-time warning and does nothing else if @hdr does not lie
+ * inside any known pack.
+ */
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+	unsigned long addr = (unsigned long)hdr, base;
+	struct bpf_prog_pack *pack = NULL, *tmp;
+	unsigned int nbits;
+	unsigned long pos;
+
+	mutex_lock(&pack_mutex);
+
+	/*
+	 * Locate the owning pack by address range instead of masking @hdr
+	 * down to a BPF_PROG_PACK_SIZE boundary, so the lookup does not
+	 * depend on the pack memory being BPF_PROG_PACK_SIZE-aligned.
+	 */
+	list_for_each_entry(tmp, &pack_list, list) {
+		base = (unsigned long)tmp->ptr;
+		if (addr >= base && addr < base + BPF_PROG_PACK_SIZE) {
+			pack = tmp;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+		goto out;
+
+	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+	pos = (addr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+	bitmap_clear(pack->bitmap, pos, nbits);
+	/* Last user gone: release the pack memory and its bookkeeping. */
+	if (bitmap_empty(pack->bitmap, BPF_PROG_CHUNK_COUNT)) {
+		list_del(&pack->list);
+		module_memfree(pack->ptr);
+		kfree(pack);
+	}
+out:
+	mutex_unlock(&pack_mutex);
+}
+
 static atomic_long_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -860,10 +970,59 @@  void __weak bpf_jit_free_exec(void *addr)
 	module_memfree(addr);
 }
 
+/*
+ * Shared body of the JIT image allocators.
+ *
+ * Charges and allocates room for the header, @proglen bytes of code and at
+ * least 128 spare bytes, rounded up to @round_up_to. The region is filled
+ * via @bpf_fill_ill_insns, the total size is recorded in the header, and
+ * *@image_ptr is set to a randomly chosen, @alignment-aligned offset inside
+ * the image area.
+ *
+ * Returns the header, or NULL if charging or allocation fails.
+ */
+static struct bpf_binary_header *
+__bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+		       unsigned int alignment,
+		       bpf_jit_fill_hole_t bpf_fill_ill_insns,
+		       u32 round_up_to)
+{
+	struct bpf_binary_header *header;
+	u32 total, slack, offset;
+
+	WARN_ON_ONCE(alignment > BPF_IMAGE_ALIGNMENT ||
+		     !is_power_of_2(alignment));
+
+	/* Small programs are the norm, but even one that fills a page
+	 * keeps at least 128 spare bytes for the random run of illegal
+	 * instructions placed in front of it.
+	 */
+	total = round_up(proglen + sizeof(*header) + 128, round_up_to);
+
+	if (bpf_jit_charge_modmem(total))
+		return NULL;
+
+	header = bpf_jit_alloc_exec(total);
+	if (!header) {
+		bpf_jit_uncharge_modmem(total);
+		return NULL;
+	}
+
+	/* Pre-fill everything with illegal/arch-dependent instructions. */
+	bpf_fill_ill_insns(header, total);
+	header->size = total;
+
+	slack = min_t(unsigned int, total - (proglen + sizeof(*header)),
+		      PAGE_SIZE - sizeof(*header));
+	offset = (get_random_int() % slack) & ~(alignment - 1);
+
+	/* The program starts a random distance into the image area. */
+	*image_ptr = header->image + offset;
+	return header;
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
 		     bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+	struct bpf_binary_header *hdr;
+
+	/* Page-granular variant: the image size is rounded up to PAGE_SIZE. */
+	hdr = __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
+				     bpf_fill_ill_insns, PAGE_SIZE);
+	return hdr;
+}
+
+struct bpf_binary_header *
+bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_ptr,
+			  unsigned int alignment,
+			  bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_binary_header *hdr;
 	u32 size, hole, start;
@@ -875,11 +1034,16 @@  bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	 * fill a page, allow at least 128 extra bytes to insert a
 	 * random section of illegal instructions.
 	 */
-	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+	size = round_up(proglen + sizeof(*hdr) + 128, BPF_PROG_CHUNK_SIZE);
+
+	/* for too big program, use __bpf_jit_binary_alloc. */
+	if (size > BPF_PROG_MAX_PACK_PROG_SIZE)
+		return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
+					      bpf_fill_ill_insns, PAGE_SIZE);
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
-	hdr = bpf_jit_alloc_exec(size);
+	hdr = bpf_prog_pack_alloc(size);
 	if (!hdr) {
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
@@ -888,9 +1052,8 @@  bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	/* Fill space with illegal/arch-dep instructions. */
 	bpf_fill_ill_insns(hdr, size);
 
-	hdr->size = size;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
-		     PAGE_SIZE - sizeof(*hdr));
+		     BPF_PROG_CHUNK_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
 
 	/* Leave a random number of instructions before BPF code. */
@@ -907,6 +1070,19 @@  void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	bpf_jit_uncharge_modmem(size);
 }
 
+/*
+ * Free an image obtained from bpf_jit_binary_alloc_pack() and drop its
+ * module-memory charge.
+ *
+ * bpf_jit_binary_alloc_pack() hands requests whose rounded size exceeds
+ * BPF_PROG_MAX_PACK_PROG_SIZE to __bpf_jit_binary_alloc(), so such an image
+ * does not live in any pack. It is recognised by its recorded size and
+ * released through bpf_jit_binary_free() instead of the pack allocator.
+ */
+void bpf_jit_binary_free_pack(struct bpf_binary_header *hdr)
+{
+	u32 size = hdr->size;
+
+	if (size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+		bpf_jit_binary_free(hdr);
+		return;
+	}
+
+	bpf_prog_pack_free(hdr);
+	bpf_jit_uncharge_modmem(size);
+}
+
+/* Return BPF_PROG_MAX_PACK_PROG_SIZE, the size limit used by bpf_jit_binary_alloc_pack(). */
+int bpf_prog_pack_max_size(void)
+{
+	return BPF_PROG_MAX_PACK_PROG_SIZE;
+}
+
 /* This symbol is only overridden by archs that have different
  * requirements than the usual eBPF JITs, f.e. when they only
  * implement cBPF JIT, do not set images read-only, etc.