diff mbox series

[bpf-next,v4,1/5] bpf: Add bpf_link support for sk_msg and sk_skb progs

Message ID 20240404025310.2211688-1-yonghong.song@linux.dev (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: Add bpf_link support for sk_msg and sk_skb progs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-43 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7516 this patch: 7516
netdev/build_tools success Errors and warnings before: 1 this patch: 1
netdev/cc_maintainers warning 11 maintainers not CCed: sdf@google.com kpsingh@kernel.org netdev@vger.kernel.org song@kernel.org martin.lau@linux.dev kuba@kernel.org haoluo@google.com jolsa@kernel.org edumazet@google.com eddyz87@gmail.com pabeni@redhat.com
netdev/build_clang success Errors and warnings before: 1228 this patch: 1228
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7898 this patch: 7898
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Yonghong Song April 4, 2024, 2:53 a.m. UTC
Add bpf_link support for sk_msg and sk_skb programs. We have an
internal request to support bpf_link for sk_msg programs so user
space can have a uniform handling with bpf_link based libbpf
APIs. Using bpf_link based libbpf API also has a benefit which
makes system robust by decoupling prog life cycle and
attachment life cycle.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h            |   6 +
 include/linux/skmsg.h          |   4 +
 include/uapi/linux/bpf.h       |   5 +
 kernel/bpf/syscall.c           |   4 +
 net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h |   5 +
 6 files changed, 284 insertions(+), 8 deletions(-)

Comments

John Fastabend April 5, 2024, 3:19 p.m. UTC | #1
Yonghong Song wrote:
> Add bpf_link support for sk_msg and sk_skb programs. We have an
> internal request to support bpf_link for sk_msg programs so user
> space can have a uniform handling with bpf_link based libbpf
> APIs. Using bpf_link based libbpf API also has a benefit which
> makes system robust by decoupling prog life cycle and
> attachment life cycle.
> 
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h            |   6 +
>  include/linux/skmsg.h          |   4 +
>  include/uapi/linux/bpf.h       |   5 +
>  kernel/bpf/syscall.c           |   4 +
>  net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
>  tools/include/uapi/linux/bpf.h |   5 +
>  6 files changed, 284 insertions(+), 8 deletions(-)
> 

LGTM one question below.

> +/* Handle the following two cases:
> + * case 1: link != NULL, prog != NULL, old != NULL
> + * case 2: link != NULL, prog != NULL, old == NULL
> + */
> +static int sock_map_link_update_prog(struct bpf_link *link,
> +				     struct bpf_prog *prog,
> +				     struct bpf_prog *old)
> +{
> +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
> +	struct bpf_prog **pprog;
> +	struct bpf_link **plink;
> +	int ret = 0;
> +
> +	mutex_lock(&sockmap_mutex);
> +
> +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
> +	if (old && link->prog != old) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
> +	if (link->prog->type != prog->type ||
> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
> +				   sockmap_link->attach_type);
> +	if (ret)
> +		goto out;
> +
> +	/* Ensure the same link between the one in map and the passed-in. */
> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
> +				   sockmap_link->attach_type);
> +	if (ret)
> +		goto out;
> +
> +	if (old) {
> +		ret = psock_replace_prog(pprog, prog, old);
> +		goto out;
> +	}
> +
> +	psock_set_prog(pprog, prog);
> +
> +out:
> +	if (!ret) {
> +		bpf_prog_inc(prog);
> +		old = xchg(&link->prog, prog);
> +		bpf_prog_put(old);

Need to check old? I don't think we can clal bpf_prog_put on null?

  if (old)
     bpf_prog_put(old)

> +	}
> +	mutex_unlock(&sockmap_mutex);
> +	return ret;
> +}
> +
Yonghong Song April 5, 2024, 3:53 p.m. UTC | #2
On 4/5/24 8:19 AM, John Fastabend wrote:
> Yonghong Song wrote:
>> Add bpf_link support for sk_msg and sk_skb programs. We have an
>> internal request to support bpf_link for sk_msg programs so user
>> space can have a uniform handling with bpf_link based libbpf
>> APIs. Using bpf_link based libbpf API also has a benefit which
>> makes system robust by decoupling prog life cycle and
>> attachment life cycle.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h            |   6 +
>>   include/linux/skmsg.h          |   4 +
>>   include/uapi/linux/bpf.h       |   5 +
>>   kernel/bpf/syscall.c           |   4 +
>>   net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
>>   tools/include/uapi/linux/bpf.h |   5 +
>>   6 files changed, 284 insertions(+), 8 deletions(-)
>>
> LGTM one question below.
>
>> +/* Handle the following two cases:
>> + * case 1: link != NULL, prog != NULL, old != NULL
>> + * case 2: link != NULL, prog != NULL, old == NULL
>> + */
>> +static int sock_map_link_update_prog(struct bpf_link *link,
>> +				     struct bpf_prog *prog,
>> +				     struct bpf_prog *old)
>> +{
>> +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
>> +	struct bpf_prog **pprog;
>> +	struct bpf_link **plink;
>> +	int ret = 0;
>> +
>> +	mutex_lock(&sockmap_mutex);
>> +
>> +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
>> +	if (old && link->prog != old) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
>> +	if (link->prog->type != prog->type ||
>> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +
>> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
>> +				   sockmap_link->attach_type);
>> +	if (ret)
>> +		goto out;
>> +
>> +	/* Ensure the same link between the one in map and the passed-in. */
>> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
>> +				   sockmap_link->attach_type);
>> +	if (ret)
>> +		goto out;
>> +
>> +	if (old) {
>> +		ret = psock_replace_prog(pprog, prog, old);
>> +		goto out;
>> +	}
>> +
>> +	psock_set_prog(pprog, prog);
>> +
>> +out:
>> +	if (!ret) {
>> +		bpf_prog_inc(prog);
>> +		old = xchg(&link->prog, prog);
>> +		bpf_prog_put(old);
> Need to check old? I don't think we can clal bpf_prog_put on null?
>
>    if (old)
>       bpf_prog_put(old)

The 'old' here represents the *old* link->prog program and
link->prog should not be NULL at this point.

If sock_map_link_update_prog() is called here, that means,
the kernel already holds a reference for the bpf link.
See kernel/bpf/syscall.c, link_update().

So the bpf_link is valid and not in released stage, and
link->prog should not be NULL.

>
>> +	}
>> +	mutex_unlock(&sockmap_mutex);
>> +	return ret;
>> +}
>> +
John Fastabend April 5, 2024, 4:23 p.m. UTC | #3
Yonghong Song wrote:
> 
> On 4/5/24 8:19 AM, John Fastabend wrote:
> > Yonghong Song wrote:
> >> Add bpf_link support for sk_msg and sk_skb programs. We have an
> >> internal request to support bpf_link for sk_msg programs so user
> >> space can have a uniform handling with bpf_link based libbpf
> >> APIs. Using bpf_link based libbpf API also has a benefit which
> >> makes system robust by decoupling prog life cycle and
> >> attachment life cycle.
> >>
> >> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> >> ---
> >>   include/linux/bpf.h            |   6 +
> >>   include/linux/skmsg.h          |   4 +
> >>   include/uapi/linux/bpf.h       |   5 +
> >>   kernel/bpf/syscall.c           |   4 +
> >>   net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
> >>   tools/include/uapi/linux/bpf.h |   5 +
> >>   6 files changed, 284 insertions(+), 8 deletions(-)
> >>
> > LGTM one question below.
> >
> >> +/* Handle the following two cases:
> >> + * case 1: link != NULL, prog != NULL, old != NULL
> >> + * case 2: link != NULL, prog != NULL, old == NULL
> >> + */
> >> +static int sock_map_link_update_prog(struct bpf_link *link,
> >> +				     struct bpf_prog *prog,
> >> +				     struct bpf_prog *old)
> >> +{
> >> +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
> >> +	struct bpf_prog **pprog;
> >> +	struct bpf_link **plink;
> >> +	int ret = 0;
> >> +
> >> +	mutex_lock(&sockmap_mutex);
> >> +
> >> +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
> >> +	if (old && link->prog != old) {
> >> +		ret = -EINVAL;
> >> +		goto out;
> >> +	}
> >> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
> >> +	if (link->prog->type != prog->type ||
> >> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
> >> +		ret = -EINVAL;
> >> +		goto out;
> >> +	}
> >> +
> >> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
> >> +				   sockmap_link->attach_type);
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	/* Ensure the same link between the one in map and the passed-in. */
> >> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
> >> +				   sockmap_link->attach_type);
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	if (old) {
> >> +		ret = psock_replace_prog(pprog, prog, old);
> >> +		goto out;
> >> +	}
> >> +
> >> +	psock_set_prog(pprog, prog);
> >> +
> >> +out:
> >> +	if (!ret) {
> >> +		bpf_prog_inc(prog);
> >> +		old = xchg(&link->prog, prog);
> >> +		bpf_prog_put(old);
> > Need to check old? I don't think we can clal bpf_prog_put on null?
> >
> >    if (old)
> >       bpf_prog_put(old)
> 
> The 'old' here represents the *old* link->prog program and
> link->prog should not be NULL at this point.

Ah ok. Maybe instead of using the input old var make it
explicit?

    if (!ret) {
       struct bpf_prog *old_link;

       bpf_prog_inc(prog);
       old_link = xchg(&link->prog, prog);
       bpf_prog_put(old)
    }

Is a bit more obious to me at least. Up to you I have a slight preference
for the explicit more verbose above.

Otherwise for the series.

Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Yonghong Song April 5, 2024, 4:51 p.m. UTC | #4
On 4/5/24 9:23 AM, John Fastabend wrote:
> Yonghong Song wrote:
>> On 4/5/24 8:19 AM, John Fastabend wrote:
>>> Yonghong Song wrote:
>>>> Add bpf_link support for sk_msg and sk_skb programs. We have an
>>>> internal request to support bpf_link for sk_msg programs so user
>>>> space can have a uniform handling with bpf_link based libbpf
>>>> APIs. Using bpf_link based libbpf API also has a benefit which
>>>> makes system robust by decoupling prog life cycle and
>>>> attachment life cycle.
>>>>
>>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>>> ---
>>>>    include/linux/bpf.h            |   6 +
>>>>    include/linux/skmsg.h          |   4 +
>>>>    include/uapi/linux/bpf.h       |   5 +
>>>>    kernel/bpf/syscall.c           |   4 +
>>>>    net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
>>>>    tools/include/uapi/linux/bpf.h |   5 +
>>>>    6 files changed, 284 insertions(+), 8 deletions(-)
>>>>
>>> LGTM one question below.
>>>
>>>> +/* Handle the following two cases:
>>>> + * case 1: link != NULL, prog != NULL, old != NULL
>>>> + * case 2: link != NULL, prog != NULL, old == NULL
>>>> + */
>>>> +static int sock_map_link_update_prog(struct bpf_link *link,
>>>> +				     struct bpf_prog *prog,
>>>> +				     struct bpf_prog *old)
>>>> +{
>>>> +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
>>>> +	struct bpf_prog **pprog;
>>>> +	struct bpf_link **plink;
>>>> +	int ret = 0;
>>>> +
>>>> +	mutex_lock(&sockmap_mutex);
>>>> +
>>>> +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
>>>> +	if (old && link->prog != old) {
>>>> +		ret = -EINVAL;
>>>> +		goto out;
>>>> +	}
>>>> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
>>>> +	if (link->prog->type != prog->type ||
>>>> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
>>>> +		ret = -EINVAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
>>>> +				   sockmap_link->attach_type);
>>>> +	if (ret)
>>>> +		goto out;
>>>> +
>>>> +	/* Ensure the same link between the one in map and the passed-in. */
>>>> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
>>>> +				   sockmap_link->attach_type);
>>>> +	if (ret)
>>>> +		goto out;
>>>> +
>>>> +	if (old) {
>>>> +		ret = psock_replace_prog(pprog, prog, old);
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	psock_set_prog(pprog, prog);
>>>> +
>>>> +out:
>>>> +	if (!ret) {
>>>> +		bpf_prog_inc(prog);
>>>> +		old = xchg(&link->prog, prog);
>>>> +		bpf_prog_put(old);
>>> Need to check old? I don't think we can clal bpf_prog_put on null?
>>>
>>>     if (old)
>>>        bpf_prog_put(old)
>> The 'old' here represents the *old* link->prog program and
>> link->prog should not be NULL at this point.
> Ah ok. Maybe instead of using the input old var make it
> explicit?
>
>      if (!ret) {
>         struct bpf_prog *old_link;
>
>         bpf_prog_inc(prog);
>         old_link = xchg(&link->prog, prog);
>         bpf_prog_put(old)
>      }
>
> Is a bit more obious to me at least. Up to you I have a slight preference
> for the explicit more verbose above.

Regarding naming convention, yes, it is hard. My above code similar to
kernel/bpf/net_namespace.c:

static int bpf_netns_link_update_prog(struct bpf_link *link,
                                       struct bpf_prog *new_prog,
                                       struct bpf_prog *old_prog)
{
         struct bpf_netns_link *net_link =
                 container_of(link, struct bpf_netns_link, link);
         enum netns_bpf_attach_type type = net_link->netns_type;
         struct bpf_prog_array *run_array;
         struct net *net;
         int idx, ret;
         
         if (old_prog && old_prog != link->prog)
                 return -EPERM;
	...
	old_prog = xchg(&link->prog, new_prog);
         bpf_prog_put(old_prog);
	...
}

The 'old_prog' is reused in the above.

I am okay to change
	old = xchg(&link->prog, prog);
to
	old_link_prog = xchg(&link->prog, prog);

in next revision (if requested or additional changes needed)
or as a followup.

>
> Otherwise for the series.
>
> Reviewed-by: John Fastabend <john.fastabend@gmail.com>
John Fastabend April 5, 2024, 7:43 p.m. UTC | #5
Yonghong Song wrote:
> 
> On 4/5/24 9:23 AM, John Fastabend wrote:
> > Yonghong Song wrote:
> >> On 4/5/24 8:19 AM, John Fastabend wrote:
> >>> Yonghong Song wrote:
> >>>> Add bpf_link support for sk_msg and sk_skb programs. We have an
> >>>> internal request to support bpf_link for sk_msg programs so user
> >>>> space can have a uniform handling with bpf_link based libbpf
> >>>> APIs. Using bpf_link based libbpf API also has a benefit which
> >>>> makes system robust by decoupling prog life cycle and
> >>>> attachment life cycle.
> >>>>
> >>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> >>>> ---
> >>>>    include/linux/bpf.h            |   6 +
> >>>>    include/linux/skmsg.h          |   4 +
> >>>>    include/uapi/linux/bpf.h       |   5 +
> >>>>    kernel/bpf/syscall.c           |   4 +
> >>>>    net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
> >>>>    tools/include/uapi/linux/bpf.h |   5 +
> >>>>    6 files changed, 284 insertions(+), 8 deletions(-)
> >>>>
> >>> LGTM one question below.
> >>>
> >>>> +/* Handle the following two cases:
> >>>> + * case 1: link != NULL, prog != NULL, old != NULL
> >>>> + * case 2: link != NULL, prog != NULL, old == NULL
> >>>> + */
> >>>> +static int sock_map_link_update_prog(struct bpf_link *link,
> >>>> +				     struct bpf_prog *prog,
> >>>> +				     struct bpf_prog *old)
> >>>> +{
> >>>> +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
> >>>> +	struct bpf_prog **pprog;
> >>>> +	struct bpf_link **plink;
> >>>> +	int ret = 0;
> >>>> +
> >>>> +	mutex_lock(&sockmap_mutex);
> >>>> +
> >>>> +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
> >>>> +	if (old && link->prog != old) {
> >>>> +		ret = -EINVAL;
> >>>> +		goto out;
> >>>> +	}
> >>>> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
> >>>> +	if (link->prog->type != prog->type ||
> >>>> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
> >>>> +		ret = -EINVAL;
> >>>> +		goto out;
> >>>> +	}
> >>>> +
> >>>> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
> >>>> +				   sockmap_link->attach_type);
> >>>> +	if (ret)
> >>>> +		goto out;
> >>>> +
> >>>> +	/* Ensure the same link between the one in map and the passed-in. */
> >>>> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
> >>>> +				   sockmap_link->attach_type);
> >>>> +	if (ret)
> >>>> +		goto out;
> >>>> +
> >>>> +	if (old) {
> >>>> +		ret = psock_replace_prog(pprog, prog, old);
> >>>> +		goto out;
> >>>> +	}
> >>>> +
> >>>> +	psock_set_prog(pprog, prog);
> >>>> +
> >>>> +out:
> >>>> +	if (!ret) {
> >>>> +		bpf_prog_inc(prog);
> >>>> +		old = xchg(&link->prog, prog);
> >>>> +		bpf_prog_put(old);
> >>> Need to check old? I don't think we can clal bpf_prog_put on null?
> >>>
> >>>     if (old)
> >>>        bpf_prog_put(old)
> >> The 'old' here represents the *old* link->prog program and
> >> link->prog should not be NULL at this point.
> > Ah ok. Maybe instead of using the input old var make it
> > explicit?
> >
> >      if (!ret) {
> >         struct bpf_prog *old_link;
> >
> >         bpf_prog_inc(prog);
> >         old_link = xchg(&link->prog, prog);
> >         bpf_prog_put(old)
> >      }
> >
> > Is a bit more obious to me at least. Up to you I have a slight preference
> > for the explicit more verbose above.
> 
> Regarding naming convention, yes, it is hard. My above code similar to
> kernel/bpf/net_namespace.c:
> 
> static int bpf_netns_link_update_prog(struct bpf_link *link,
>                                        struct bpf_prog *new_prog,
>                                        struct bpf_prog *old_prog)
> {
>          struct bpf_netns_link *net_link =
>                  container_of(link, struct bpf_netns_link, link);
>          enum netns_bpf_attach_type type = net_link->netns_type;
>          struct bpf_prog_array *run_array;
>          struct net *net;
>          int idx, ret;
>          
>          if (old_prog && old_prog != link->prog)
>                  return -EPERM;
> 	...
> 	old_prog = xchg(&link->prog, new_prog);
>          bpf_prog_put(old_prog);
> 	...
> }
> 
> The 'old_prog' is reused in the above.
> 
> I am okay to change
> 	old = xchg(&link->prog, prog);
> to
> 	old_link_prog = xchg(&link->prog, prog);
> 
> in next revision (if requested or additional changes needed)
> or as a followup.

I'm good with this series as is LGTM. We can do a follow up if we want.
Although the xchg is exactly one line above so I'm not sure its even necessary.

> 
> >
> > Otherwise for the series.
> >
> > Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Yonghong Song April 5, 2024, 8:05 p.m. UTC | #6
On 4/5/24 12:43 PM, John Fastabend wrote:
> Yonghong Song wrote:
>> On 4/5/24 9:23 AM, John Fastabend wrote:
>>> Yonghong Song wrote:
>>>> On 4/5/24 8:19 AM, John Fastabend wrote:
>>>>> Yonghong Song wrote:
>>>>>> Add bpf_link support for sk_msg and sk_skb programs. We have an
>>>>>> internal request to support bpf_link for sk_msg programs so user
>>>>>> space can have a uniform handling with bpf_link based libbpf
>>>>>> APIs. Using bpf_link based libbpf API also has a benefit which
>>>>>> makes system robust by decoupling prog life cycle and
>>>>>> attachment life cycle.
>>>>>>
>>>>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>>>>> ---
>>>>>>     include/linux/bpf.h            |   6 +
>>>>>>     include/linux/skmsg.h          |   4 +
>>>>>>     include/uapi/linux/bpf.h       |   5 +
>>>>>>     kernel/bpf/syscall.c           |   4 +
>>>>>>     net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
>>>>>>     tools/include/uapi/linux/bpf.h |   5 +
>>>>>>     6 files changed, 284 insertions(+), 8 deletions(-)
>>>>>>
>>>>> LGTM one question below.
>>>>>
>>>>>> +/* Handle the following two cases:
>>>>>> + * case 1: link != NULL, prog != NULL, old != NULL
>>>>>> + * case 2: link != NULL, prog != NULL, old == NULL
>>>>>> + */
>>>>>> +static int sock_map_link_update_prog(struct bpf_link *link,
>>>>>> +				     struct bpf_prog *prog,
>>>>>> +				     struct bpf_prog *old)
>>>>>> +{
>>>>>> +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
>>>>>> +	struct bpf_prog **pprog;
>>>>>> +	struct bpf_link **plink;
>>>>>> +	int ret = 0;
>>>>>> +
>>>>>> +	mutex_lock(&sockmap_mutex);
>>>>>> +
>>>>>> +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
>>>>>> +	if (old && link->prog != old) {
>>>>>> +		ret = -EINVAL;
>>>>>> +		goto out;
>>>>>> +	}
>>>>>> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
>>>>>> +	if (link->prog->type != prog->type ||
>>>>>> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
>>>>>> +		ret = -EINVAL;
>>>>>> +		goto out;
>>>>>> +	}
>>>>>> +
>>>>>> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
>>>>>> +				   sockmap_link->attach_type);
>>>>>> +	if (ret)
>>>>>> +		goto out;
>>>>>> +
>>>>>> +	/* Ensure the same link between the one in map and the passed-in. */
>>>>>> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
>>>>>> +				   sockmap_link->attach_type);
>>>>>> +	if (ret)
>>>>>> +		goto out;
>>>>>> +
>>>>>> +	if (old) {
>>>>>> +		ret = psock_replace_prog(pprog, prog, old);
>>>>>> +		goto out;
>>>>>> +	}
>>>>>> +
>>>>>> +	psock_set_prog(pprog, prog);
>>>>>> +
>>>>>> +out:
>>>>>> +	if (!ret) {
>>>>>> +		bpf_prog_inc(prog);
>>>>>> +		old = xchg(&link->prog, prog);
>>>>>> +		bpf_prog_put(old);
>>>>> Need to check old? I don't think we can clal bpf_prog_put on null?
>>>>>
>>>>>      if (old)
>>>>>         bpf_prog_put(old)
>>>> The 'old' here represents the *old* link->prog program and
>>>> link->prog should not be NULL at this point.
>>> Ah ok. Maybe instead of using the input old var make it
>>> explicit?
>>>
>>>       if (!ret) {
>>>          struct bpf_prog *old_link;
>>>
>>>          bpf_prog_inc(prog);
>>>          old_link = xchg(&link->prog, prog);
>>>          bpf_prog_put(old)
>>>       }
>>>
>>> Is a bit more obious to me at least. Up to you I have a slight preference
>>> for the explicit more verbose above.
>> Regarding naming convention, yes, it is hard. My above code similar to
>> kernel/bpf/net_namespace.c:
>>
>> static int bpf_netns_link_update_prog(struct bpf_link *link,
>>                                         struct bpf_prog *new_prog,
>>                                         struct bpf_prog *old_prog)
>> {
>>           struct bpf_netns_link *net_link =
>>                   container_of(link, struct bpf_netns_link, link);
>>           enum netns_bpf_attach_type type = net_link->netns_type;
>>           struct bpf_prog_array *run_array;
>>           struct net *net;
>>           int idx, ret;
>>           
>>           if (old_prog && old_prog != link->prog)
>>                   return -EPERM;
>> 	...
>> 	old_prog = xchg(&link->prog, new_prog);
>>           bpf_prog_put(old_prog);
>> 	...
>> }
>>
>> The 'old_prog' is reused in the above.
>>
>> I am okay to change
>> 	old = xchg(&link->prog, prog);
>> to
>> 	old_link_prog = xchg(&link->prog, prog);
>>
>> in next revision (if requested or additional changes needed)
>> or as a followup.
> I'm good with this series as is LGTM. We can do a follow up if we want.

Thanks! Sounds good to me.

> Although the xchg is exactly one line above so I'm not sure its even necessary.

the xchg inpsock_set_prog() is used to set prog in pprog. The xchg I added is for 
&link->prog. I didn't make a change to psock_set_prog() since it is used 
in different places and I do not want to polute it with extra link 
parameter.

>
>>> Otherwise for the series.
>>>
>>> Reviewed-by: John Fastabend <john.fastabend@gmail.com>
>
Andrii Nakryiko April 5, 2024, 8:12 p.m. UTC | #7
On Wed, Apr 3, 2024 at 7:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> Add bpf_link support for sk_msg and sk_skb programs. We have an
> internal request to support bpf_link for sk_msg programs so user
> space can have a uniform handling with bpf_link based libbpf
> APIs. Using bpf_link based libbpf API also has a benefit which
> makes system robust by decoupling prog life cycle and
> attachment life cycle.
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h            |   6 +
>  include/linux/skmsg.h          |   4 +
>  include/uapi/linux/bpf.h       |   5 +
>  kernel/bpf/syscall.c           |   4 +
>  net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
>  tools/include/uapi/linux/bpf.h |   5 +
>  6 files changed, 284 insertions(+), 8 deletions(-)
>

[...]

> @@ -103,7 +111,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
>                 goto put_prog;
>         }
>
> -       ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
> +       ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
>  put_prog:
>         bpf_prog_put(prog);
>  put_map:
> @@ -1488,21 +1496,79 @@ static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
>         return 0;
>  }
>
> +static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
> +                               struct bpf_link *link, bool skip_check, u32 which)

why not combine prog + link into a single lookup? also it seems like
sock_map_prog_lookup has some additional EBUSY conditions, do we need
to replicate them here?

> +{
> +       struct sk_psock_progs *progs = sock_map_progs(map);
> +       struct bpf_link **cur_plink;
> +
> +       switch (which) {
> +       case BPF_SK_MSG_VERDICT:
> +               cur_plink = &progs->msg_parser_link;
> +               break;
> +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
> +       case BPF_SK_SKB_STREAM_PARSER:
> +               cur_plink = &progs->stream_parser_link;
> +               break;
> +#endif
> +       case BPF_SK_SKB_STREAM_VERDICT:
> +               cur_plink = &progs->stream_verdict_link;
> +               break;
> +       case BPF_SK_SKB_VERDICT:
> +               cur_plink = &progs->skb_verdict_link;
> +               break;
> +       default:
> +               return -EOPNOTSUPP;
> +       }
> +
> +       if (!skip_check && ((!link && *cur_plink) || (link && link != *cur_plink)))
> +               return -EBUSY;
> +
> +       *plink = cur_plink;
> +       return 0;
> +}
> +
> +/* Handle the following four cases:
> + * prog_attach: prog != NULL, old == NULL, link == NULL
> + * prog_detach: prog == NULL, old != NULL, link == NULL
> + * link_attach: prog != NULL, old == NULL, link != NULL
> + * link_detach: prog == NULL, old != NULL, link != NULL
> + */
>  static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
> -                               struct bpf_prog *old, u32 which)
> +                               struct bpf_prog *old, struct bpf_link *link,
> +                               u32 which)
>  {
>         struct bpf_prog **pprog;
> +       struct bpf_link **plink;
>         int ret;
>
> +       mutex_lock(&sockmap_mutex);
> +
>         ret = sock_map_prog_lookup(map, &pprog, which);
>         if (ret)
> -               return ret;
> +               goto out;
>
> -       if (old)
> -               return psock_replace_prog(pprog, prog, old);
> +       if (!link || prog)
> +               ret = sock_map_link_lookup(map, &plink, NULL, false, which);
> +       else
> +               ret = sock_map_link_lookup(map, &plink, NULL, true, which);
> +       if (ret)
> +               goto out;
> +
> +       if (old) {
> +               ret = psock_replace_prog(pprog, prog, old);
> +               if (!ret)
> +                       *plink = NULL;
> +               goto out;
> +       }
>
>         psock_set_prog(pprog, prog);
> -       return 0;
> +       if (link)
> +               *plink = link;

nit: feels more natural to do

if (old) {
    psock_replace_prog(...)
} else {
    psock_set_prog(...)
}

it's two alternatives, not one unlikely vs one main use case (but it's minor)

> +
> +out:
> +       mutex_unlock(&sockmap_mutex);
> +       return ret;
>  }
>
>  int sock_map_bpf_prog_query(const union bpf_attr *attr,
> @@ -1657,6 +1723,192 @@ void sock_map_close(struct sock *sk, long timeout)
>  }
>  EXPORT_SYMBOL_GPL(sock_map_close);
>
> +struct sockmap_link {
> +       struct bpf_link link;
> +       struct bpf_map *map;
> +       enum bpf_attach_type attach_type;
> +};
> +
> +static void sock_map_link_release(struct bpf_link *link)
> +{
> +       struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
> +
> +       if (sockmap_link->map) {

nit: if (!sockmap_link->map) return;

and reduce nesting of everything else

> +               WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
> +                                                 sockmap_link->attach_type));

I think sockmap_link->map access in general has to be always protected
my sockmap_mutex (I'd do that even for the if above), because it can
race with force-detach logic at least

> +
> +               mutex_lock(&sockmap_mutex);
> +               bpf_map_put_with_uref(sockmap_link->map);
> +               sockmap_link->map = NULL;
> +               mutex_unlock(&sockmap_mutex);
> +       }
> +}
> +

[...]

> +       if (old) {
> +               ret = psock_replace_prog(pprog, prog, old);
> +               goto out;
> +       }
> +
> +       psock_set_prog(pprog, prog);
> +
> +out:

same nit, feels like

if (old) /* replace */ else /* set */ is more natural, and then you
can move xchg logic before out: knowing that it's the only success
case

> +       if (!ret) {
> +               bpf_prog_inc(prog);
> +               old = xchg(&link->prog, prog);
> +               bpf_prog_put(old);
> +       }
> +       mutex_unlock(&sockmap_mutex);
> +       return ret;
> +}
> +

[...]
Yonghong Song April 6, 2024, 5:21 a.m. UTC | #8
On 4/5/24 1:12 PM, Andrii Nakryiko wrote:
> On Wed, Apr 3, 2024 at 7:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> Add bpf_link support for sk_msg and sk_skb programs. We have an
>> internal request to support bpf_link for sk_msg programs so user
>> space can have a uniform handling with bpf_link based libbpf
>> APIs. Using bpf_link based libbpf API also has a benefit which
>> makes system robust by decoupling prog life cycle and
>> attachment life cycle.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h            |   6 +
>>   include/linux/skmsg.h          |   4 +
>>   include/uapi/linux/bpf.h       |   5 +
>>   kernel/bpf/syscall.c           |   4 +
>>   net/core/sock_map.c            | 268 ++++++++++++++++++++++++++++++++-
>>   tools/include/uapi/linux/bpf.h |   5 +
>>   6 files changed, 284 insertions(+), 8 deletions(-)
>>
> [...]
>
>> @@ -103,7 +111,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
>>                  goto put_prog;
>>          }
>>
>> -       ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
>> +       ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
>>   put_prog:
>>          bpf_prog_put(prog);
>>   put_map:
>> @@ -1488,21 +1496,79 @@ static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
>>          return 0;
>>   }
>>
>> +static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
>> +                               struct bpf_link *link, bool skip_check, u32 which)
> why not combine prog + link into a single lookup? also it seems like
> sock_map_prog_lookup has some additional EBUSY conditions, do we need
> to replicate them here?

I can combine them together.

>
>> +{
>> +       struct sk_psock_progs *progs = sock_map_progs(map);
>> +       struct bpf_link **cur_plink;
>> +
>> +       switch (which) {
>> +       case BPF_SK_MSG_VERDICT:
>> +               cur_plink = &progs->msg_parser_link;
>> +               break;
>> +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
>> +       case BPF_SK_SKB_STREAM_PARSER:
>> +               cur_plink = &progs->stream_parser_link;
>> +               break;
>> +#endif
>> +       case BPF_SK_SKB_STREAM_VERDICT:
>> +               cur_plink = &progs->stream_verdict_link;
>> +               break;
>> +       case BPF_SK_SKB_VERDICT:
>> +               cur_plink = &progs->skb_verdict_link;
>> +               break;
>> +       default:
>> +               return -EOPNOTSUPP;
>> +       }
>> +
>> +       if (!skip_check && ((!link && *cur_plink) || (link && link != *cur_plink)))
>> +               return -EBUSY;
>> +
>> +       *plink = cur_plink;
>> +       return 0;
>> +}
>> +
>> +/* Handle the following four cases:
>> + * prog_attach: prog != NULL, old == NULL, link == NULL
>> + * prog_detach: prog == NULL, old != NULL, link == NULL
>> + * link_attach: prog != NULL, old == NULL, link != NULL
>> + * link_detach: prog == NULL, old != NULL, link != NULL
>> + */
>>   static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
>> -                               struct bpf_prog *old, u32 which)
>> +                               struct bpf_prog *old, struct bpf_link *link,
>> +                               u32 which)
>>   {
>>          struct bpf_prog **pprog;
>> +       struct bpf_link **plink;
>>          int ret;
>>
>> +       mutex_lock(&sockmap_mutex);
>> +
>>          ret = sock_map_prog_lookup(map, &pprog, which);
>>          if (ret)
>> -               return ret;
>> +               goto out;
>>
>> -       if (old)
>> -               return psock_replace_prog(pprog, prog, old);
>> +       if (!link || prog)
>> +               ret = sock_map_link_lookup(map, &plink, NULL, false, which);
>> +       else
>> +               ret = sock_map_link_lookup(map, &plink, NULL, true, which);
>> +       if (ret)
>> +               goto out;
>> +
>> +       if (old) {
>> +               ret = psock_replace_prog(pprog, prog, old);
>> +               if (!ret)
>> +                       *plink = NULL;
>> +               goto out;
>> +       }
>>
>>          psock_set_prog(pprog, prog);
>> -       return 0;
>> +       if (link)
>> +               *plink = link;
> nit: feels more natural to do
>
> if (old) {
>      psock_replace_prog(...)
> } else {
>      psock_set_prog(...)
> }
>
> it's two alternatives, not one unlikely vs one main use case (but it's minor)

Ack indeed better.

>
>> +
>> +out:
>> +       mutex_unlock(&sockmap_mutex);
>> +       return ret;
>>   }
>>
>>   int sock_map_bpf_prog_query(const union bpf_attr *attr,
>> @@ -1657,6 +1723,192 @@ void sock_map_close(struct sock *sk, long timeout)
>>   }
>>   EXPORT_SYMBOL_GPL(sock_map_close);
>>
>> +struct sockmap_link {
>> +       struct bpf_link link;
>> +       struct bpf_map *map;
>> +       enum bpf_attach_type attach_type;
>> +};
>> +
>> +static void sock_map_link_release(struct bpf_link *link)
>> +{
>> +       struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
>> +
>> +       if (sockmap_link->map) {
> nit: if (!sockmap_link->map) return;
>
> and reduce nesting of everything else
>
>> +               WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
>> +                                                 sockmap_link->attach_type));
> I think sockmap_link->map access in general has to be always protected
> my sockmap_mutex (I'd do that even for the if above), because it can
> race with force-detach logic at least

Ack. will fix this.

>
>> +
>> +               mutex_lock(&sockmap_mutex);
>> +               bpf_map_put_with_uref(sockmap_link->map);
>> +               sockmap_link->map = NULL;
>> +               mutex_unlock(&sockmap_mutex);
>> +       }
>> +}
>> +
> [...]
>
>> +       if (old) {
>> +               ret = psock_replace_prog(pprog, prog, old);
>> +               goto out;
>> +       }
>> +
>> +       psock_set_prog(pprog, prog);
>> +
>> +out:
> same nit, feels like
>
> if (old) /* replace */ else /* set */ is more natural, and then you
> can move xchg logic before out: knowing that it's the only success
> case

Ack. will do.

>
>> +       if (!ret) {
>> +               bpf_prog_inc(prog);
>> +               old = xchg(&link->prog, prog);
>> +               bpf_prog_put(old);
>> +       }
>> +       mutex_unlock(&sockmap_mutex);
>> +       return ret;
>> +}
>> +
> [...]
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 62762390c93d..5034c1b4ded7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2996,6 +2996,7 @@  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
 			    union bpf_attr __user *uattr);
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
 
 void sock_map_unhash(struct sock *sk);
 void sock_map_destroy(struct sock *sk);
@@ -3094,6 +3095,11 @@  static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
 {
 	return -EINVAL;
 }
+
+static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e65ec3fd2799..9c8dd4c01412 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,10 @@  struct sk_psock_progs {
 	struct bpf_prog			*stream_parser;
 	struct bpf_prog			*stream_verdict;
 	struct bpf_prog			*skb_verdict;
+	struct bpf_link			*msg_parser_link;
+	struct bpf_link			*stream_parser_link;
+	struct bpf_link			*stream_verdict_link;
+	struct bpf_link			*skb_verdict_link;
 };
 
 enum sk_psock_state_bits {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 79c548276b6b..4b24b7c2b612 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1135,6 +1135,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
+	BPF_LINK_TYPE_SOCKMAP = 14,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -6724,6 +6725,10 @@  struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} netkit;
+		struct {
+			__u32 map_id;
+			__u32 attach_type;
+		} sockmap;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e44c276e8617..7d392ec83655 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5213,6 +5213,10 @@  static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		ret = netns_bpf_link_create(attr, prog);
 		break;
+	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_SK_SKB:
+		ret = sock_map_link_create(attr, prog);
+		break;
 #ifdef CONFIG_NET
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 27d733c0f65e..1e1d24692706 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,8 +24,16 @@  struct bpf_stab {
 #define SOCK_CREATE_FLAG_MASK				\
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+/* This mutex is used to
+ *  - protect race between prog/link attach/detach and link prog update, and
+ *  - protect race between releasing and accessing map in bpf_link.
+ * A single global mutex lock is used since it is expected contention is low.
+ */
+static DEFINE_MUTEX(sockmap_mutex);
+
 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-				struct bpf_prog *old, u32 which);
+				struct bpf_prog *old, struct bpf_link *link,
+				u32 which);
 static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
@@ -71,7 +79,7 @@  int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
+	ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
 	fdput(f);
 	return ret;
 }
@@ -103,7 +111,7 @@  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
 		goto put_prog;
 	}
 
-	ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
+	ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
 put_prog:
 	bpf_prog_put(prog);
 put_map:
@@ -1488,21 +1496,79 @@  static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
 	return 0;
 }
 
+static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
+				struct bpf_link *link, bool skip_check, u32 which)
+{
+	struct sk_psock_progs *progs = sock_map_progs(map);
+	struct bpf_link **cur_plink;
+
+	switch (which) {
+	case BPF_SK_MSG_VERDICT:
+		cur_plink = &progs->msg_parser_link;
+		break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+	case BPF_SK_SKB_STREAM_PARSER:
+		cur_plink = &progs->stream_parser_link;
+		break;
+#endif
+	case BPF_SK_SKB_STREAM_VERDICT:
+		cur_plink = &progs->stream_verdict_link;
+		break;
+	case BPF_SK_SKB_VERDICT:
+		cur_plink = &progs->skb_verdict_link;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (!skip_check && ((!link && *cur_plink) || (link && link != *cur_plink)))
+		return -EBUSY;
+
+	*plink = cur_plink;
+	return 0;
+}
+
+/* Handle the following four cases:
+ * prog_attach: prog != NULL, old == NULL, link == NULL
+ * prog_detach: prog == NULL, old != NULL, link == NULL
+ * link_attach: prog != NULL, old == NULL, link != NULL
+ * link_detach: prog == NULL, old != NULL, link != NULL
+ */
 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-				struct bpf_prog *old, u32 which)
+				struct bpf_prog *old, struct bpf_link *link,
+				u32 which)
 {
 	struct bpf_prog **pprog;
+	struct bpf_link **plink;
 	int ret;
 
+	mutex_lock(&sockmap_mutex);
+
 	ret = sock_map_prog_lookup(map, &pprog, which);
 	if (ret)
-		return ret;
+		goto out;
 
-	if (old)
-		return psock_replace_prog(pprog, prog, old);
+	if (!link || prog)
+		ret = sock_map_link_lookup(map, &plink, NULL, false, which);
+	else
+		ret = sock_map_link_lookup(map, &plink, NULL, true, which);
+	if (ret)
+		goto out;
+
+	if (old) {
+		ret = psock_replace_prog(pprog, prog, old);
+		if (!ret)
+			*plink = NULL;
+		goto out;
+	}
 
 	psock_set_prog(pprog, prog);
-	return 0;
+	if (link)
+		*plink = link;
+
+out:
+	mutex_unlock(&sockmap_mutex);
+	return ret;
 }
 
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
@@ -1657,6 +1723,192 @@  void sock_map_close(struct sock *sk, long timeout)
 }
 EXPORT_SYMBOL_GPL(sock_map_close);
 
+struct sockmap_link {
+	struct bpf_link link;
+	struct bpf_map *map;
+	enum bpf_attach_type attach_type;
+};
+
+static void sock_map_link_release(struct bpf_link *link)
+{
+	struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+
+	if (sockmap_link->map) {
+		WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
+						  sockmap_link->attach_type));
+
+		mutex_lock(&sockmap_mutex);
+		bpf_map_put_with_uref(sockmap_link->map);
+		sockmap_link->map = NULL;
+		mutex_unlock(&sockmap_mutex);
+	}
+}
+
+static int sock_map_link_detach(struct bpf_link *link)
+{
+	sock_map_link_release(link);
+	return 0;
+}
+
+static void sock_map_link_dealloc(struct bpf_link *link)
+{
+	kfree(link);
+}
+
+/* Handle the following two cases:
+ * case 1: link != NULL, prog != NULL, old != NULL
+ * case 2: link != NULL, prog != NULL, old == NULL
+ */
+static int sock_map_link_update_prog(struct bpf_link *link,
+				     struct bpf_prog *prog,
+				     struct bpf_prog *old)
+{
+	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+	struct bpf_prog **pprog;
+	struct bpf_link **plink;
+	int ret = 0;
+
+	mutex_lock(&sockmap_mutex);
+
+	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
+	if (old && link->prog != old) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* Ensure link->prog has the same type/attach_type as the new prog. */
+	if (link->prog->type != prog->type ||
+	    link->prog->expected_attach_type != prog->expected_attach_type) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
+				   sockmap_link->attach_type);
+	if (ret)
+		goto out;
+
+	/* Ensure the same link between the one in map and the passed-in. */
+	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
+				   sockmap_link->attach_type);
+	if (ret)
+		goto out;
+
+	if (old) {
+		ret = psock_replace_prog(pprog, prog, old);
+		goto out;
+	}
+
+	psock_set_prog(pprog, prog);
+
+out:
+	if (!ret) {
+		bpf_prog_inc(prog);
+		old = xchg(&link->prog, prog);
+		bpf_prog_put(old);
+	}
+	mutex_unlock(&sockmap_mutex);
+	return ret;
+}
+
+static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
+{
+	u32 map_id = 0;
+
+	mutex_lock(&sockmap_mutex);
+	if (sockmap_link->map)
+		map_id = sockmap_link->map->id;
+	mutex_unlock(&sockmap_mutex);
+	return map_id;
+}
+
+static int sock_map_link_fill_info(const struct bpf_link *link,
+				   struct bpf_link_info *info)
+{
+	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+	u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+	info->sockmap.map_id = map_id;
+	info->sockmap.attach_type = sockmap_link->attach_type;
+	return 0;
+}
+
+static void sock_map_link_show_fdinfo(const struct bpf_link *link,
+				      struct seq_file *seq)
+{
+	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+	u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+	seq_printf(seq, "map_id:\t%u\n", map_id);
+	seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type);
+}
+
+static const struct bpf_link_ops sock_map_link_ops = {
+	.release = sock_map_link_release,
+	.dealloc = sock_map_link_dealloc,
+	.detach = sock_map_link_detach,
+	.update_prog = sock_map_link_update_prog,
+	.fill_link_info = sock_map_link_fill_info,
+	.show_fdinfo = sock_map_link_show_fdinfo,
+};
+
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct sockmap_link *sockmap_link;
+	enum bpf_attach_type attach_type;
+	struct bpf_map *map;
+	int ret;
+
+	if (attr->link_create.flags)
+		return -EINVAL;
+
+	map = bpf_map_get_with_uref(attr->link_create.target_fd);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
+	if (!sockmap_link) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	attach_type = attr->link_create.attach_type;
+	bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
+	sockmap_link->map = map;
+	sockmap_link->attach_type = attach_type;
+
+	ret = bpf_link_prime(&sockmap_link->link, &link_primer);
+	if (ret) {
+		kfree(sockmap_link);
+		goto out;
+	}
+
+	ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
+	if (ret) {
+		bpf_link_cleanup(&link_primer);
+		goto out;
+	}
+
+	/* Increase refcnt for the prog since when old prog is replaced with
+	 * psock_replace_prog() and psock_set_prog() its refcnt will be decreased.
+	 *
+	 * Actually, we do not need to increase refcnt for the prog since bpf_link
+	 * will hold a reference. But in order to have less complexity w.r.t.
+	 * replacing/setting prog, let us increase the refcnt to make things simpler.
+	 */
+	bpf_prog_inc(prog);
+
+	return bpf_link_settle(&link_primer);
+
+out:
+	bpf_map_put_with_uref(map);
+	return ret;
+}
+
 static int sock_map_iter_attach_target(struct bpf_prog *prog,
 				       union bpf_iter_link_info *linfo,
 				       struct bpf_iter_aux_info *aux)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 79c548276b6b..4b24b7c2b612 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1135,6 +1135,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
+	BPF_LINK_TYPE_SOCKMAP = 14,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -6724,6 +6725,10 @@  struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} netkit;
+		struct {
+			__u32 map_id;
+			__u32 attach_type;
+		} sockmap;
 	};
 } __attribute__((aligned(8)));