diff mbox series

[bpf-next,v3,1/5] bpf: Add bpf_link support for sk_msg and sk_skb progs

Message ID 20240326022158.656285-1-yonghong.song@linux.dev (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: Add bpf_link support for sk_msg and sk_skb progs | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7482 this patch: 7482
netdev/build_tools success Errors and warnings before: 1 this patch: 1
netdev/cc_maintainers warning 11 maintainers not CCed: song@kernel.org kpsingh@kernel.org eddyz87@gmail.com edumazet@google.com jolsa@kernel.org martin.lau@linux.dev netdev@vger.kernel.org kuba@kernel.org sdf@google.com haoluo@google.com pabeni@redhat.com
netdev/build_clang success Errors and warnings before: 2244 this patch: 2244
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7870 this patch: 7870
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-13 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Yonghong Song March 26, 2024, 2:21 a.m. UTC
Add bpf_link support for sk_msg and sk_skb programs. We have an
internal request to support bpf_link for sk_msg programs so user
space can have uniform handling with bpf_link based libbpf
APIs. Using the bpf_link based libbpf API also has the benefit of
making the system more robust by decoupling the prog life cycle
from the attachment life cycle.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h            |   6 +
 include/linux/skmsg.h          |   4 +
 include/uapi/linux/bpf.h       |   5 +
 kernel/bpf/syscall.c           |   4 +
 net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h |   5 +
 6 files changed, 279 insertions(+), 8 deletions(-)

Comments

Eduard Zingerman April 2, 2024, 5:39 p.m. UTC | #1
On Mon, 2024-03-25 at 19:21 -0700, Yonghong Song wrote:
[...]

> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 27d733c0f65e..dafc9aa6e192 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c

[...]

> @@ -1488,21 +1492,90 @@ static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
>  	return 0;
>  }
>  
> +static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
> +				struct bpf_link *link, bool skip_check, u32 which)
> +{
> +	struct sk_psock_progs *progs = sock_map_progs(map);
> +
> +	switch (which) {
> +	case BPF_SK_MSG_VERDICT:
> +		if (!skip_check &&
> +		    ((!link && progs->msg_parser_link) ||
> +		     (link && link != progs->msg_parser_link)))
> +			return -EBUSY;

These checks seem a bit repetitive; maybe factor them out as a single
check at the end of the function? E.g.:

	if (!skip_check &&
	    ((!link && **plink) || (link && link != **plink)))
		return -EBUSY;

Or inline these checks at the call sites of sock_map_link_lookup()?
I tried this change on top of the patch in [1] and all tests seem to pass.

[1] https://gist.github.com/eddyz87/38d832b3f1fc74120598d3480bc16ae1

> +		*plink = &progs->msg_parser_link;
> +		break;
> +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
> +	case BPF_SK_SKB_STREAM_PARSER:
> +		if (!skip_check &&
> +		    ((!link && progs->stream_parser_link) ||
> +		     (link && link != progs->stream_parser_link)))
> +			return -EBUSY;
> +		*plink = &progs->stream_parser_link;
> +		break;
> +#endif
> +	case BPF_SK_SKB_STREAM_VERDICT:
> +		if (!skip_check &&
> +		    ((!link && progs->stream_verdict_link) ||
> +		     (link && link != progs->stream_verdict_link)))
> +			return -EBUSY;
> +		*plink = &progs->stream_verdict_link;
> +		break;
> +	case BPF_SK_SKB_VERDICT:
> +		if (!skip_check &&
> +		    ((!link && progs->skb_verdict_link) ||
> +		     (link && link != progs->skb_verdict_link)))
> +			return -EBUSY;
> +		*plink = &progs->skb_verdict_link;
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	return 0;
> +}

[...]

> +/* Handle the following two cases:
> + * case 1: link != NULL, prog != NULL, old != NULL
> + * case 2: link != NULL, prog != NULL, old == NULL
> + */
> +static int sock_map_link_update_prog(struct bpf_link *link,
> +				     struct bpf_prog *prog,
> +				     struct bpf_prog *old)
> +{
> +	const struct sockmap_link *sockmap_link = get_sockmap_link(link);
> +	struct bpf_prog **pprog;
> +	struct bpf_link **plink;
> +	int ret = 0;
> +
> +	mutex_lock(&sockmap_prog_update_mutex);
> +
> +	/* If old prog not NULL, ensure old prog the same as link->prog. */
> +	if (old && link->prog != old) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
> +	if (link->prog->type != prog->type ||
> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
> +				   sockmap_link->attach_type);
> +	if (ret)
> +		goto out;
> +
> +	/* Ensure the same link between the one in map and the passed-in. */
> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
> +				   sockmap_link->attach_type);
> +	if (ret)
> +		goto out;
> +
> +	if (old)
> +		return psock_replace_prog(pprog, prog, old);

should this be 'goto out' in order to unlock the mutex?

> +
> +	psock_set_prog(pprog, prog);
> +
> +out:
> +	if (!ret)
> +		bpf_prog_inc(prog);
> +	mutex_unlock(&sockmap_prog_update_mutex);
> +	return ret;
> +}

[...]
Andrii Nakryiko April 2, 2024, 5:45 p.m. UTC | #2
On Mon, Mar 25, 2024 at 7:22 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> Add bpf_link support for sk_msg and sk_skb programs. We have an
> internal request to support bpf_link for sk_msg programs so user
> space can have a uniform handling with bpf_link based libbpf
> APIs. Using bpf_link based libbpf API also has a benefit which
> makes system robust by decoupling prog life cycle and
> attachment life cycle.
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h            |   6 +
>  include/linux/skmsg.h          |   4 +
>  include/uapi/linux/bpf.h       |   5 +
>  kernel/bpf/syscall.c           |   4 +
>  net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++-
>  tools/include/uapi/linux/bpf.h |   5 +
>  6 files changed, 279 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 62762390c93d..5034c1b4ded7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2996,6 +2996,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
>  int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
>  int sock_map_bpf_prog_query(const union bpf_attr *attr,
>                             union bpf_attr __user *uattr);
> +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
>
>  void sock_map_unhash(struct sock *sk);
>  void sock_map_destroy(struct sock *sk);
> @@ -3094,6 +3095,11 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
>  {
>         return -EINVAL;
>  }
> +
> +static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
> +{
> +       return -EOPNOTSUPP;
> +}
>  #endif /* CONFIG_BPF_SYSCALL */
>  #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
>
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index e65ec3fd2799..9c8dd4c01412 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -58,6 +58,10 @@ struct sk_psock_progs {
>         struct bpf_prog                 *stream_parser;
>         struct bpf_prog                 *stream_verdict;
>         struct bpf_prog                 *skb_verdict;
> +       struct bpf_link                 *msg_parser_link;
> +       struct bpf_link                 *stream_parser_link;
> +       struct bpf_link                 *stream_verdict_link;
> +       struct bpf_link                 *skb_verdict_link;
>  };
>
>  enum sk_psock_state_bits {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 9585f5345353..31660c3ffc01 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1135,6 +1135,7 @@ enum bpf_link_type {
>         BPF_LINK_TYPE_TCX = 11,
>         BPF_LINK_TYPE_UPROBE_MULTI = 12,
>         BPF_LINK_TYPE_NETKIT = 13,
> +       BPF_LINK_TYPE_SOCKMAP = 14,
>         __MAX_BPF_LINK_TYPE,
>  };
>
> @@ -6720,6 +6721,10 @@ struct bpf_link_info {
>                         __u32 ifindex;
>                         __u32 attach_type;
>                 } netkit;
> +               struct {
> +                       __u32 map_id;
> +                       __u32 attach_type;
> +               } sockmap;
>         };
>  } __attribute__((aligned(8)));
>
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e44c276e8617..7d392ec83655 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -5213,6 +5213,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
>         case BPF_PROG_TYPE_SK_LOOKUP:
>                 ret = netns_bpf_link_create(attr, prog);
>                 break;
> +       case BPF_PROG_TYPE_SK_MSG:
> +       case BPF_PROG_TYPE_SK_SKB:
> +               ret = sock_map_link_create(attr, prog);
> +               break;
>  #ifdef CONFIG_NET
>         case BPF_PROG_TYPE_XDP:
>                 ret = bpf_xdp_link_attach(attr, prog);
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 27d733c0f65e..dafc9aa6e192 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -24,8 +24,12 @@ struct bpf_stab {
>  #define SOCK_CREATE_FLAG_MASK                          \
>         (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>
> +static DEFINE_MUTEX(sockmap_prog_update_mutex);
> +static DEFINE_MUTEX(sockmap_link_mutex);
> +
>  static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
> -                               struct bpf_prog *old, u32 which);
> +                               struct bpf_prog *old, struct bpf_link *link,
> +                               u32 which);
>  static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
>
>  static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
> @@ -71,7 +75,7 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
>         map = __bpf_map_get(f);
>         if (IS_ERR(map))
>                 return PTR_ERR(map);
> -       ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
> +       ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
>         fdput(f);
>         return ret;
>  }
> @@ -103,7 +107,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
>                 goto put_prog;
>         }
>
> -       ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
> +       ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
>  put_prog:
>         bpf_prog_put(prog);
>  put_map:
> @@ -1488,21 +1492,90 @@ static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
>         return 0;
>  }
>
> +static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
> +                               struct bpf_link *link, bool skip_check, u32 which)
> +{
> +       struct sk_psock_progs *progs = sock_map_progs(map);
> +
> +       switch (which) {
> +       case BPF_SK_MSG_VERDICT:
> +               if (!skip_check &&
> +                   ((!link && progs->msg_parser_link) ||
> +                    (link && link != progs->msg_parser_link)))
> +                       return -EBUSY;
> +               *plink = &progs->msg_parser_link;
> +               break;
> +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
> +       case BPF_SK_SKB_STREAM_PARSER:
> +               if (!skip_check &&
> +                   ((!link && progs->stream_parser_link) ||
> +                    (link && link != progs->stream_parser_link)))
> +                       return -EBUSY;
> +               *plink = &progs->stream_parser_link;
> +               break;
> +#endif
> +       case BPF_SK_SKB_STREAM_VERDICT:
> +               if (!skip_check &&
> +                   ((!link && progs->stream_verdict_link) ||
> +                    (link && link != progs->stream_verdict_link)))
> +                       return -EBUSY;
> +               *plink = &progs->stream_verdict_link;
> +               break;
> +       case BPF_SK_SKB_VERDICT:
> +               if (!skip_check &&
> +                   ((!link && progs->skb_verdict_link) ||
> +                    (link && link != progs->skb_verdict_link)))
> +                       return -EBUSY;
> +               *plink = &progs->skb_verdict_link;
> +               break;
> +       default:
> +               return -EOPNOTSUPP;
> +       }
> +

you can simplify this by

struct bpf_link *cur_link;

switch (which) {
case BPF_SK_MSG_VERDICT:
    cur_link = progs->msg_parser_link;
    break;
case ...

}

and then perform the check that validates link and cur_link just once.


> +       return 0;
> +}
> +
> +/* Handle the following four cases:
> + * prog_attach: prog != NULL, old == NULL, link == NULL
> + * prog_detach: prog == NULL, old != NULL, link == NULL
> + * link_attach: prog != NULL, old == NULL, link != NULL
> + * link_detach: prog == NULL, old != NULL, link != NULL
> + */
>  static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
> -                               struct bpf_prog *old, u32 which)
> +                               struct bpf_prog *old, struct bpf_link *link,
> +                               u32 which)
>  {
>         struct bpf_prog **pprog;
> +       struct bpf_link **plink;
>         int ret;
>
> +       mutex_lock(&sockmap_prog_update_mutex);
> +
>         ret = sock_map_prog_lookup(map, &pprog, which);
>         if (ret)
> -               return ret;
> +               goto out;
>
> -       if (old)
> -               return psock_replace_prog(pprog, prog, old);
> +       if (!link || prog)
> +               ret = sock_map_link_lookup(map, &plink, NULL, false, which);
> +       else
> +               ret = sock_map_link_lookup(map, &plink, NULL, true, which);

it feels like it would be cleaner if sock_map_link_lookup() just
returned the already set link (based on which), and then you'd apply
the update logic with prog vs link right here

e.g., it's quite obscure that we are validating that no link is
currently set (this code doesn't do anything with plink).

anyway, I find it a bit hard to follow as it's written, but I'll
defer to John to make the final call on the logic

> +       if (ret)
> +               goto out;
> +
> +       if (old) {
> +               ret = psock_replace_prog(pprog, prog, old);
> +               if (!ret)
> +                       *plink = NULL;
> +               goto out;
> +       }
>
>         psock_set_prog(pprog, prog);
> -       return 0;
> +       if (link)
> +               *plink = link;
> +
> +out:
> +       mutex_unlock(&sockmap_prog_update_mutex);

why this mutex is not per-sockmap?

> +       return ret;
>  }
>
>  int sock_map_bpf_prog_query(const union bpf_attr *attr,
> @@ -1657,6 +1730,180 @@ void sock_map_close(struct sock *sk, long timeout)
>  }
>  EXPORT_SYMBOL_GPL(sock_map_close);
>
> +struct sockmap_link {
> +       struct bpf_link link;
> +       struct bpf_map *map;
> +       enum bpf_attach_type attach_type;
> +};
> +
> +static struct sockmap_link *get_sockmap_link(const struct bpf_link *link)
> +{
> +       return container_of(link, struct sockmap_link, link);
> +}

nit: do you really need this helper? container_of() is pretty
straightforward by itself, imo

> +
> +static void sock_map_link_release(struct bpf_link *link)
> +{
> +       struct sockmap_link *sockmap_link = get_sockmap_link(link);
> +
> +       mutex_lock(&sockmap_link_mutex);

similar to the above, why is this mutex not sockmap-specific? And I'd
just combine sockmap_link_mutex and sockmap_prog_update_mutex in this
case to keep it simple.

> +       if (sockmap_link->map) {
> +               (void)sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
> +                                          sockmap_link->attach_type);

WARN() if it's not meant to ever fail?

> +               bpf_map_put_with_uref(sockmap_link->map);
> +               sockmap_link->map = NULL;
> +       }
> +       mutex_unlock(&sockmap_link_mutex);
> +}
> +
> +static int sock_map_link_detach(struct bpf_link *link)
> +{
> +       sock_map_link_release(link);
> +       return 0;
> +}
> +
> +static void sock_map_link_dealloc(struct bpf_link *link)
> +{
> +       kfree(get_sockmap_link(link));
> +}
> +
> +/* Handle the following two cases:
> + * case 1: link != NULL, prog != NULL, old != NULL
> + * case 2: link != NULL, prog != NULL, old == NULL
> + */
> +static int sock_map_link_update_prog(struct bpf_link *link,
> +                                    struct bpf_prog *prog,
> +                                    struct bpf_prog *old)
> +{
> +       const struct sockmap_link *sockmap_link = get_sockmap_link(link);
> +       struct bpf_prog **pprog;
> +       struct bpf_link **plink;
> +       int ret = 0;
> +
> +       mutex_lock(&sockmap_prog_update_mutex);
> +
> +       /* If old prog not NULL, ensure old prog the same as link->prog. */

typo: "is not NULL", "is the same"

> +       if (old && link->prog != old) {

hm.. even if old matches link->prog, we should unset old and set new
link (link overrides prog attachment, basically), it shouldn't matter
if old == link->prog, unless I'm missing something?

> +               ret = -EINVAL;
> +               goto out;
> +       }
> +       /* Ensure link->prog has the same type/attach_type as the new prog. */
> +       if (link->prog->type != prog->type ||
> +           link->prog->expected_attach_type != prog->expected_attach_type) {
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +
> +       ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
> +                                  sockmap_link->attach_type);
> +       if (ret)
> +               goto out;
> +
> +       /* Ensure the same link between the one in map and the passed-in. */
> +       ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
> +                                  sockmap_link->attach_type);
> +       if (ret)
> +               goto out;
> +
> +       if (old)
> +               return psock_replace_prog(pprog, prog, old);
> +
> +       psock_set_prog(pprog, prog);
> +
> +out:
> +       if (!ret)
> +               bpf_prog_inc(prog);
> +       mutex_unlock(&sockmap_prog_update_mutex);
> +       return ret;
> +}
> +
> +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
> +{
> +       u32 map_id = 0;
> +
> +       mutex_lock(&sockmap_link_mutex);
> +       if (sockmap_link->map)
> +               map_id = sockmap_link->map->id;
> +       mutex_unlock(&sockmap_link_mutex);
> +       return map_id;
> +}
> +
> +static int sock_map_link_fill_info(const struct bpf_link *link,
> +                                  struct bpf_link_info *info)
> +{
> +       const struct sockmap_link *sockmap_link = get_sockmap_link(link);
> +       u32 map_id = sock_map_link_get_map_id(sockmap_link);
> +
> +       info->sockmap.map_id = map_id;
> +       info->sockmap.attach_type = sockmap_link->attach_type;
> +       return 0;
> +}
> +
> +static void sock_map_link_show_fdinfo(const struct bpf_link *link,
> +                                     struct seq_file *seq)
> +{
> +       const struct sockmap_link *sockmap_link = get_sockmap_link(link);
> +       u32 map_id = sock_map_link_get_map_id(sockmap_link);
> +
> +       seq_printf(seq, "map_id:\t%u\n", map_id);
> +       seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type);
> +}
> +
> +static const struct bpf_link_ops sock_map_link_ops = {
> +       .release = sock_map_link_release,
> +       .dealloc = sock_map_link_dealloc,
> +       .detach = sock_map_link_detach,
> +       .update_prog = sock_map_link_update_prog,
> +       .fill_link_info = sock_map_link_fill_info,
> +       .show_fdinfo = sock_map_link_show_fdinfo,
> +};
> +
> +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
> +{
> +       struct bpf_link_primer link_primer;
> +       struct sockmap_link *sockmap_link;
> +       enum bpf_attach_type attach_type;
> +       struct bpf_map *map;
> +       int ret;
> +
> +       if (attr->link_create.flags)
> +               return -EINVAL;
> +
> +       map = bpf_map_get_with_uref(attr->link_create.target_fd);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);

check that map is SOCKMAP?

> +
> +       sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
> +       if (!sockmap_link) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       attach_type = attr->link_create.attach_type;
> +       bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
> +       sockmap_link->map = map;
> +       sockmap_link->attach_type = attach_type;
> +
> +       ret = bpf_link_prime(&sockmap_link->link, &link_primer);
> +       if (ret) {
> +               kfree(sockmap_link);
> +               goto out;
> +       }
> +
> +       ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
> +       if (ret) {
> +               bpf_link_cleanup(&link_primer);
> +               goto out;
> +       }
> +
> +       bpf_prog_inc(prog);

if the link was created successfully, it "inherits" the prog's refcnt, so you
shouldn't do another bpf_prog_inc()? the generic link_create() logic puts
the prog only if this function returns an error

> +
> +       return bpf_link_settle(&link_primer);
> +
> +out:
> +       bpf_map_put_with_uref(map);
> +       return ret;
> +}
> +
>  static int sock_map_iter_attach_target(struct bpf_prog *prog,
>                                        union bpf_iter_link_info *linfo,
>                                        struct bpf_iter_aux_info *aux)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 9585f5345353..31660c3ffc01 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1135,6 +1135,7 @@ enum bpf_link_type {
>         BPF_LINK_TYPE_TCX = 11,
>         BPF_LINK_TYPE_UPROBE_MULTI = 12,
>         BPF_LINK_TYPE_NETKIT = 13,
> +       BPF_LINK_TYPE_SOCKMAP = 14,
>         __MAX_BPF_LINK_TYPE,
>  };
>
> @@ -6720,6 +6721,10 @@ struct bpf_link_info {
>                         __u32 ifindex;
>                         __u32 attach_type;
>                 } netkit;
> +               struct {
> +                       __u32 map_id;
> +                       __u32 attach_type;
> +               } sockmap;
>         };
>  } __attribute__((aligned(8)));
>
> --
> 2.43.0
>
Yonghong Song April 3, 2024, 12:06 a.m. UTC | #3
On 4/2/24 10:39 AM, Eduard Zingerman wrote:
> On Mon, 2024-03-25 at 19:21 -0700, Yonghong Song wrote:
> [...]
>
>> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
>> index 27d733c0f65e..dafc9aa6e192 100644
>> --- a/net/core/sock_map.c
>> +++ b/net/core/sock_map.c
> [...]
>
>> @@ -1488,21 +1492,90 @@ static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
>>   	return 0;
>>   }
>>   
>> +static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
>> +				struct bpf_link *link, bool skip_check, u32 which)
>> +{
>> +	struct sk_psock_progs *progs = sock_map_progs(map);
>> +
>> +	switch (which) {
>> +	case BPF_SK_MSG_VERDICT:
>> +		if (!skip_check &&
>> +		    ((!link && progs->msg_parser_link) ||
>> +		     (link && link != progs->msg_parser_link)))
>> +			return -EBUSY;
> These checks seem a bit repetitive, maybe factor it out as a single
> check at the end of the function? E.g.:
>
> 	if (!skip_check &&
> 	    ((!link && **plink) || (link && link != **plink)))
> 		return -EBUSY;
>
> Or inline these checks at call sites for sock_map_link_lookup()?
> I tried this on top of this in [1] and all tests seem to pass.

Andrii has a suggestion to do
	plink = progs->msg_parser_link;

and later plink can be used for checking. This indeed makes things easier.

>
> [1] https://gist.github.com/eddyz87/38d832b3f1fc74120598d3480bc16ae1
>
>> +		*plink = &progs->msg_parser_link;
>> +		break;
>> +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
>> +	case BPF_SK_SKB_STREAM_PARSER:
>> +		if (!skip_check &&
>> +		    ((!link && progs->stream_parser_link) ||
>> +		     (link && link != progs->stream_parser_link)))
>> +			return -EBUSY;
>> +		*plink = &progs->stream_parser_link;
>> +		break;
>> +#endif
>> +	case BPF_SK_SKB_STREAM_VERDICT:
>> +		if (!skip_check &&
>> +		    ((!link && progs->stream_verdict_link) ||
>> +		     (link && link != progs->stream_verdict_link)))
>> +			return -EBUSY;
>> +		*plink = &progs->stream_verdict_link;
>> +		break;
>> +	case BPF_SK_SKB_VERDICT:
>> +		if (!skip_check &&
>> +		    ((!link && progs->skb_verdict_link) ||
>> +		     (link && link != progs->skb_verdict_link)))
>> +			return -EBUSY;
>> +		*plink = &progs->skb_verdict_link;
>> +		break;
>> +	default:
>> +		return -EOPNOTSUPP;
>> +	}
>> +
>> +	return 0;
>> +}
> [...]
>
>> +/* Handle the following two cases:
>> + * case 1: link != NULL, prog != NULL, old != NULL
>> + * case 2: link != NULL, prog != NULL, old == NULL
>> + */
>> +static int sock_map_link_update_prog(struct bpf_link *link,
>> +				     struct bpf_prog *prog,
>> +				     struct bpf_prog *old)
>> +{
>> +	const struct sockmap_link *sockmap_link = get_sockmap_link(link);
>> +	struct bpf_prog **pprog;
>> +	struct bpf_link **plink;
>> +	int ret = 0;
>> +
>> +	mutex_lock(&sockmap_prog_update_mutex);
>> +
>> +	/* If old prog not NULL, ensure old prog the same as link->prog. */
>> +	if (old && link->prog != old) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +	/* Ensure link->prog has the same type/attach_type as the new prog. */
>> +	if (link->prog->type != prog->type ||
>> +	    link->prog->expected_attach_type != prog->expected_attach_type) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +
>> +	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
>> +				   sockmap_link->attach_type);
>> +	if (ret)
>> +		goto out;
>> +
>> +	/* Ensure the same link between the one in map and the passed-in. */
>> +	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
>> +				   sockmap_link->attach_type);
>> +	if (ret)
>> +		goto out;
>> +
>> +	if (old)
>> +		return psock_replace_prog(pprog, prog, old);
> should this be 'goto out' in order to unlock the mutex?

Good point. I missed a test case with non-NULL old. Will add in the next revision.

>
>> +
>> +	psock_set_prog(pprog, prog);
>> +
>> +out:
>> +	if (!ret)
>> +		bpf_prog_inc(prog);
>> +	mutex_unlock(&sockmap_prog_update_mutex);
>> +	return ret;
>> +}
> [...]
Yonghong Song April 3, 2024, 1:08 a.m. UTC | #4
On 4/2/24 10:45 AM, Andrii Nakryiko wrote:
> On Mon, Mar 25, 2024 at 7:22 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> Add bpf_link support for sk_msg and sk_skb programs. We have an
>> internal request to support bpf_link for sk_msg programs so user
>> space can have a uniform handling with bpf_link based libbpf
>> APIs. Using bpf_link based libbpf API also has a benefit which
>> makes system robust by decoupling prog life cycle and
>> attachment life cycle.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h            |   6 +
>>   include/linux/skmsg.h          |   4 +
>>   include/uapi/linux/bpf.h       |   5 +
>>   kernel/bpf/syscall.c           |   4 +
>>   net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++-
>>   tools/include/uapi/linux/bpf.h |   5 +
>>   6 files changed, 279 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 62762390c93d..5034c1b4ded7 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -2996,6 +2996,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
>>   int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
>>   int sock_map_bpf_prog_query(const union bpf_attr *attr,
>>                              union bpf_attr __user *uattr);
>> +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
>>
>>   void sock_map_unhash(struct sock *sk);
>>   void sock_map_destroy(struct sock *sk);
>> @@ -3094,6 +3095,11 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
>>   {
>>          return -EINVAL;
>>   }
>> +
>> +static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
>> +{
>> +       return -EOPNOTSUPP;
>> +}
>>   #endif /* CONFIG_BPF_SYSCALL */
>>   #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
>>
>> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
>> index e65ec3fd2799..9c8dd4c01412 100644
>> --- a/include/linux/skmsg.h
>> +++ b/include/linux/skmsg.h
>> @@ -58,6 +58,10 @@ struct sk_psock_progs {
>>          struct bpf_prog                 *stream_parser;
>>          struct bpf_prog                 *stream_verdict;
>>          struct bpf_prog                 *skb_verdict;
>> +       struct bpf_link                 *msg_parser_link;
>> +       struct bpf_link                 *stream_parser_link;
>> +       struct bpf_link                 *stream_verdict_link;
>> +       struct bpf_link                 *skb_verdict_link;
>>   };
>>
>>   enum sk_psock_state_bits {
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 9585f5345353..31660c3ffc01 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1135,6 +1135,7 @@ enum bpf_link_type {
>>          BPF_LINK_TYPE_TCX = 11,
>>          BPF_LINK_TYPE_UPROBE_MULTI = 12,
>>          BPF_LINK_TYPE_NETKIT = 13,
>> +       BPF_LINK_TYPE_SOCKMAP = 14,
>>          __MAX_BPF_LINK_TYPE,
>>   };
>>
>> @@ -6720,6 +6721,10 @@ struct bpf_link_info {
>>                          __u32 ifindex;
>>                          __u32 attach_type;
>>                  } netkit;
>> +               struct {
>> +                       __u32 map_id;
>> +                       __u32 attach_type;
>> +               } sockmap;
>>          };
>>   } __attribute__((aligned(8)));
>>
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index e44c276e8617..7d392ec83655 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -5213,6 +5213,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
>>          case BPF_PROG_TYPE_SK_LOOKUP:
>>                  ret = netns_bpf_link_create(attr, prog);
>>                  break;
>> +       case BPF_PROG_TYPE_SK_MSG:
>> +       case BPF_PROG_TYPE_SK_SKB:
>> +               ret = sock_map_link_create(attr, prog);
>> +               break;
>>   #ifdef CONFIG_NET
>>          case BPF_PROG_TYPE_XDP:
>>                  ret = bpf_xdp_link_attach(attr, prog);
>> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
>> index 27d733c0f65e..dafc9aa6e192 100644
>> --- a/net/core/sock_map.c
>> +++ b/net/core/sock_map.c
>> @@ -24,8 +24,12 @@ struct bpf_stab {
>>   #define SOCK_CREATE_FLAG_MASK                          \
>>          (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>>
>> +static DEFINE_MUTEX(sockmap_prog_update_mutex);
>> +static DEFINE_MUTEX(sockmap_link_mutex);
>> +
>>   static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
>> -                               struct bpf_prog *old, u32 which);
>> +                               struct bpf_prog *old, struct bpf_link *link,
>> +                               u32 which);
>>   static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
>>
>>   static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
>> @@ -71,7 +75,7 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
>>          map = __bpf_map_get(f);
>>          if (IS_ERR(map))
>>                  return PTR_ERR(map);
>> -       ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
>> +       ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
>>          fdput(f);
>>          return ret;
>>   }
>> @@ -103,7 +107,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
>>                  goto put_prog;
>>          }
>>
>> -       ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
>> +       ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
>>   put_prog:
>>          bpf_prog_put(prog);
>>   put_map:
>> @@ -1488,21 +1492,90 @@ static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
>>          return 0;
>>   }
>>
>> +static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
>> +                               struct bpf_link *link, bool skip_check, u32 which)
>> +{
>> +       struct sk_psock_progs *progs = sock_map_progs(map);
>> +
>> +       switch (which) {
>> +       case BPF_SK_MSG_VERDICT:
>> +               if (!skip_check &&
>> +                   ((!link && progs->msg_parser_link) ||
>> +                    (link && link != progs->msg_parser_link)))
>> +                       return -EBUSY;
>> +               *plink = &progs->msg_parser_link;
>> +               break;
>> +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
>> +       case BPF_SK_SKB_STREAM_PARSER:
>> +               if (!skip_check &&
>> +                   ((!link && progs->stream_parser_link) ||
>> +                    (link && link != progs->stream_parser_link)))
>> +                       return -EBUSY;
>> +               *plink = &progs->stream_parser_link;
>> +               break;
>> +#endif
>> +       case BPF_SK_SKB_STREAM_VERDICT:
>> +               if (!skip_check &&
>> +                   ((!link && progs->stream_verdict_link) ||
>> +                    (link && link != progs->stream_verdict_link)))
>> +                       return -EBUSY;
>> +               *plink = &progs->stream_verdict_link;
>> +               break;
>> +       case BPF_SK_SKB_VERDICT:
>> +               if (!skip_check &&
>> +                   ((!link && progs->skb_verdict_link) ||
>> +                    (link && link != progs->skb_verdict_link)))
>> +                       return -EBUSY;
>> +               *plink = &progs->skb_verdict_link;
>> +               break;
>> +       default:
>> +               return -EOPNOTSUPP;
>> +       }
>> +
> you can simplify this by
>
> struct bpf_link *cur_link;
>
> switch (which) {
> case BPF_SK_MSG_VERDICT:
>      cur_link = progs->msg_parser_link;
>      break;
> case ...
>
> }
>
> and then perform that condition validating link and cur_link just once.

Indeed, this sounds simpler.

>
>
>> +       return 0;
>> +}
>> +
>> +/* Handle the following four cases:
>> + * prog_attach: prog != NULL, old == NULL, link == NULL
>> + * prog_detach: prog == NULL, old != NULL, link == NULL
>> + * link_attach: prog != NULL, old == NULL, link != NULL
>> + * link_detach: prog == NULL, old != NULL, link != NULL
>> + */
>>   static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
>> -                               struct bpf_prog *old, u32 which)
>> +                               struct bpf_prog *old, struct bpf_link *link,
>> +                               u32 which)
>>   {
>>          struct bpf_prog **pprog;
>> +       struct bpf_link **plink;
>>          int ret;
>>
>> +       mutex_lock(&sockmap_prog_update_mutex);
>> +
>>          ret = sock_map_prog_lookup(map, &pprog, which);
>>          if (ret)
>> -               return ret;
>> +               goto out;
>>
>> -       if (old)
>> -               return psock_replace_prog(pprog, prog, old);
>> +       if (!link || prog)
>> +               ret = sock_map_link_lookup(map, &plink, NULL, false, which);
>> +       else
>> +               ret = sock_map_link_lookup(map, &plink, NULL, true, which);
> it feels like it would be cleaner if sock_map_link_lookup() would just
> return already set link (based on which), and then you'd check update
> logic with prog vs link right here
>
> e.g., it's quite obfuscated that we validate that there is no link set
> currently (this code doesn't do anything with plink).
>
> anyways, I find it a bit hard to follow as it's written, but I'll
> defer to John to make a final call on logic
>
>> +       if (ret)
>> +               goto out;
>> +
>> +       if (old) {
>> +               ret = psock_replace_prog(pprog, prog, old);
>> +               if (!ret)
>> +                       *plink = NULL;
>> +               goto out;
>> +       }
>>
>>          psock_set_prog(pprog, prog);
>> -       return 0;
>> +       if (link)
>> +               *plink = link;
>> +
>> +out:
>> +       mutex_unlock(&sockmap_prog_update_mutex);
> why this mutex is not per-sockmap?

My thinking is that the system probably won't have lots of sockmaps, and
sockmap attach/detach/update_prog operations should not be that frequent.
But I could be wrong.

>
>> +       return ret;
>>   }
>>
>>   int sock_map_bpf_prog_query(const union bpf_attr *attr,
>> @@ -1657,6 +1730,180 @@ void sock_map_close(struct sock *sk, long timeout)
>>   }
>>   EXPORT_SYMBOL_GPL(sock_map_close);
>>
>> +struct sockmap_link {
>> +       struct bpf_link link;
>> +       struct bpf_map *map;
>> +       enum bpf_attach_type attach_type;
>> +};
>> +
>> +static struct sockmap_link *get_sockmap_link(const struct bpf_link *link)
>> +{
>> +       return container_of(link, struct sockmap_link, link);
>> +}
> nit: do you really need this helper? container_of() is a pretty
> straightforward by itself, imo

I can do this.

>
>> +
>> +static void sock_map_link_release(struct bpf_link *link)
>> +{
>> +       struct sockmap_link *sockmap_link = get_sockmap_link(link);
>> +
>> +       mutex_lock(&sockmap_link_mutex);
> similar to the above, why is this mutex not sockmap-specific? And I'd
> just combine sockmap_link_mutex and sockmap_prog_update_mutex in this
> case to keep it simple.

This is to protect sockmap_link->map. They could share the same lock.
Let me double check...

>
>> +       if (sockmap_link->map) {
>> +               (void)sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
>> +                                          sockmap_link->attach_type);
> WARN() if it's not meant to ever fail?

Yes, this is in link_release function and should not fail.
I can add a WARN here.

>
>> +               bpf_map_put_with_uref(sockmap_link->map);
>> +               sockmap_link->map = NULL;
>> +       }
>> +       mutex_unlock(&sockmap_link_mutex);
>> +}
>> +
>> +static int sock_map_link_detach(struct bpf_link *link)
>> +{
>> +       sock_map_link_release(link);
>> +       return 0;
>> +}
>> +
>> +static void sock_map_link_dealloc(struct bpf_link *link)
>> +{
>> +       kfree(get_sockmap_link(link));
>> +}
>> +
>> +/* Handle the following two cases:
>> + * case 1: link != NULL, prog != NULL, old != NULL
>> + * case 2: link != NULL, prog != NULL, old == NULL
>> + */
>> +static int sock_map_link_update_prog(struct bpf_link *link,
>> +                                    struct bpf_prog *prog,
>> +                                    struct bpf_prog *old)
>> +{
>> +       const struct sockmap_link *sockmap_link = get_sockmap_link(link);
>> +       struct bpf_prog **pprog;
>> +       struct bpf_link **plink;
>> +       int ret = 0;
>> +
>> +       mutex_lock(&sockmap_prog_update_mutex);
>> +
>> +       /* If old prog not NULL, ensure old prog the same as link->prog. */
> typo: "is not NULL", "is the same"
>
>> +       if (old && link->prog != old) {
> hm.. even if old matches link->prog, we should unset old and set new
> link (link overrides prog attachment, basically), it shouldn't matter
> if old == link->prog, unless I'm missing something?

In xdp link (net/core/dev.c), we have

         cur_prog = dev_xdp_prog(dev, mode);
         /* can't replace attached prog with link */
         if (link && cur_prog) {
                 NL_SET_ERR_MSG(extack, "Can't replace active XDP 
program with BPF link");
                 return -EBUSY;
         }
         if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
                 NL_SET_ERR_MSG(extack, "Active program does not match 
expected");
                 return -EEXIST;
         }

If flags has XDP_FLAGS_REPLACE, the link's saved prog must be equal to old_prog
in order to do the prog update.
For sockmap prog update, in link_update (syscall.c), the only way
we can get a non-NULL old_prog is with the following:

         if (flags & BPF_F_REPLACE) {
                 old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
                 if (IS_ERR(old_prog)) {
                         ret = PTR_ERR(old_prog);
                         old_prog = NULL;
                         goto out_put_progs;
                 }
         } else if (attr->link_update.old_prog_fd) {
                 ret = -EINVAL;
                 goto out_put_progs;
         }
Basically, we have BPF_F_REPLACE here.
So similar to xdp link, I think we should check old_prog to
be equal to link->prog in order to do link update_prog.

>
>> +               ret = -EINVAL;
>> +               goto out;
>> +       }
>> +       /* Ensure link->prog has the same type/attach_type as the new prog. */
>> +       if (link->prog->type != prog->type ||
>> +           link->prog->expected_attach_type != prog->expected_attach_type) {
>> +               ret = -EINVAL;
>> +               goto out;
>> +       }
>> +
>> +       ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
>> +                                  sockmap_link->attach_type);
>> +       if (ret)
>> +               goto out;
>> +
>> +       /* Ensure the same link between the one in map and the passed-in. */
>> +       ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
>> +                                  sockmap_link->attach_type);
>> +       if (ret)
>> +               goto out;
>> +
>> +       if (old)
>> +               return psock_replace_prog(pprog, prog, old);
>> +
>> +       psock_set_prog(pprog, prog);
>> +
>> +out:
>> +       if (!ret)
>> +               bpf_prog_inc(prog);
>> +       mutex_unlock(&sockmap_prog_update_mutex);
>> +       return ret;
>> +}
>> +
>> +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
>> +{
>> +       u32 map_id = 0;
>> +
>> +       mutex_lock(&sockmap_link_mutex);
>> +       if (sockmap_link->map)
>> +               map_id = sockmap_link->map->id;
>> +       mutex_unlock(&sockmap_link_mutex);
>> +       return map_id;
>> +}
>> +
>> +static int sock_map_link_fill_info(const struct bpf_link *link,
>> +                                  struct bpf_link_info *info)
>> +{
>> +       const struct sockmap_link *sockmap_link = get_sockmap_link(link);
>> +       u32 map_id = sock_map_link_get_map_id(sockmap_link);
>> +
>> +       info->sockmap.map_id = map_id;
>> +       info->sockmap.attach_type = sockmap_link->attach_type;
>> +       return 0;
>> +}
>> +
>> +static void sock_map_link_show_fdinfo(const struct bpf_link *link,
>> +                                     struct seq_file *seq)
>> +{
>> +       const struct sockmap_link *sockmap_link = get_sockmap_link(link);
>> +       u32 map_id = sock_map_link_get_map_id(sockmap_link);
>> +
>> +       seq_printf(seq, "map_id:\t%u\n", map_id);
>> +       seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type);
>> +}
>> +
>> +static const struct bpf_link_ops sock_map_link_ops = {
>> +       .release = sock_map_link_release,
>> +       .dealloc = sock_map_link_dealloc,
>> +       .detach = sock_map_link_detach,
>> +       .update_prog = sock_map_link_update_prog,
>> +       .fill_link_info = sock_map_link_fill_info,
>> +       .show_fdinfo = sock_map_link_show_fdinfo,
>> +};
>> +
>> +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
>> +{
>> +       struct bpf_link_primer link_primer;
>> +       struct sockmap_link *sockmap_link;
>> +       enum bpf_attach_type attach_type;
>> +       struct bpf_map *map;
>> +       int ret;
>> +
>> +       if (attr->link_create.flags)
>> +               return -EINVAL;
>> +
>> +       map = bpf_map_get_with_uref(attr->link_create.target_fd);
>> +       if (IS_ERR(map))
>> +               return PTR_ERR(map);
> check that map is SOCKMAP?
Good point. Will do.
>
>> +
>> +       sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
>> +       if (!sockmap_link) {
>> +               ret = -ENOMEM;
>> +               goto out;
>> +       }
>> +
>> +       attach_type = attr->link_create.attach_type;
>> +       bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
>> +       sockmap_link->map = map;
>> +       sockmap_link->attach_type = attach_type;
>> +
>> +       ret = bpf_link_prime(&sockmap_link->link, &link_primer);
>> +       if (ret) {
>> +               kfree(sockmap_link);
>> +               goto out;
>> +       }
>> +
>> +       ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
>> +       if (ret) {
>> +               bpf_link_cleanup(&link_primer);
>> +               goto out;
>> +       }
>> +
>> +       bpf_prog_inc(prog);
> if link was created successfully, it "inherits" prog's refcnt, so you
> shouldn't do another bpf_prog_inc()? generic link_create() logic puts
> prog only if this function returns error

The reason I did this is the following:

static inline void psock_set_prog(struct bpf_prog **pprog,
                                   struct bpf_prog *prog)
{
         prog = xchg(pprog, prog);
         if (prog)
                 bpf_prog_put(prog);
}

You can see that when the prog is swapped due to link_update or prog_attach,
its reference count is decremented by 1. This is necessary for prog_attach,
but as you mentioned, it is indeed not necessary for the link-based approach.
Let me see whether I can refactor the code so that it is easy to avoid
increasing the reference count of the prog here.


>
>> +
>> +       return bpf_link_settle(&link_primer);
>> +
>> +out:
>> +       bpf_map_put_with_uref(map);
>> +       return ret;
>> +}
>> +
>>   static int sock_map_iter_attach_target(struct bpf_prog *prog,
>>                                         union bpf_iter_link_info *linfo,
>>                                         struct bpf_iter_aux_info *aux)
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 9585f5345353..31660c3ffc01 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -1135,6 +1135,7 @@ enum bpf_link_type {
>>          BPF_LINK_TYPE_TCX = 11,
>>          BPF_LINK_TYPE_UPROBE_MULTI = 12,
>>          BPF_LINK_TYPE_NETKIT = 13,
>> +       BPF_LINK_TYPE_SOCKMAP = 14,
>>          __MAX_BPF_LINK_TYPE,
>>   };
>>
>> @@ -6720,6 +6721,10 @@ struct bpf_link_info {
>>                          __u32 ifindex;
>>                          __u32 attach_type;
>>                  } netkit;
>> +               struct {
>> +                       __u32 map_id;
>> +                       __u32 attach_type;
>> +               } sockmap;
>>          };
>>   } __attribute__((aligned(8)));
>>
>> --
>> 2.43.0
>>
Andrii Nakryiko April 3, 2024, 4:43 p.m. UTC | #5
On Tue, Apr 2, 2024 at 6:08 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 4/2/24 10:45 AM, Andrii Nakryiko wrote:
> > On Mon, Mar 25, 2024 at 7:22 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >> Add bpf_link support for sk_msg and sk_skb programs. We have an
> >> internal request to support bpf_link for sk_msg programs so user
> >> space can have a uniform handling with bpf_link based libbpf
> >> APIs. Using bpf_link based libbpf API also has a benefit which
> >> makes system robust by decoupling prog life cycle and
> >> attachment life cycle.
> >>
> >> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> >> ---
> >>   include/linux/bpf.h            |   6 +
> >>   include/linux/skmsg.h          |   4 +
> >>   include/uapi/linux/bpf.h       |   5 +
> >>   kernel/bpf/syscall.c           |   4 +
> >>   net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++-
> >>   tools/include/uapi/linux/bpf.h |   5 +
> >>   6 files changed, 279 insertions(+), 8 deletions(-)
> >>

[...]

> >>          psock_set_prog(pprog, prog);
> >> -       return 0;
> >> +       if (link)
> >> +               *plink = link;
> >> +
> >> +out:
> >> +       mutex_unlock(&sockmap_prog_update_mutex);
> > why this mutex is not per-sockmap?
>
> My thinking is the system probably won't have lots of sockmaps and
> sockmap attach/detach/update_prog should not be that frequent. But
> I could be wrong.
>

That seems like an even more of an argument to keep mutex per sockmap.
It won't add a lot of memory, but it is conceptually cleaner, as each
sockmap instance (and corresponding links) are completely independent,
even from a locking perspective.

But I can't say I feel very strongly about this.

> >
> >> +       return ret;
> >>   }
> >>

[...]

> >
> >> +
> >> +static void sock_map_link_release(struct bpf_link *link)
> >> +{
> >> +       struct sockmap_link *sockmap_link = get_sockmap_link(link);
> >> +
> >> +       mutex_lock(&sockmap_link_mutex);
> > similar to the above, why is this mutex not sockmap-specific? And I'd
> > just combine sockmap_link_mutex and sockmap_prog_update_mutex in this
> > case to keep it simple.
>
> This is to protect sockmap_link->map. They could share the same lock.
> Let me double check...

If you keep that global sockmap_prog_update_mutex then I'd probably
reuse that one here for simplicity (and named it a bit more
generically, "sockmap_mutex" or something like that, just like we have
global "cgroup_mutex").

[...]

> >> +       if (old && link->prog != old) {
> > hm.. even if old matches link->prog, we should unset old and set new
> > link (link overrides prog attachment, basically), it shouldn't matter
> > if old == link->prog, unless I'm missing something?
>
> In xdp link (net/core/dev.c), we have
>
>          cur_prog = dev_xdp_prog(dev, mode);
>          /* can't replace attached prog with link */
>          if (link && cur_prog) {
>                  NL_SET_ERR_MSG(extack, "Can't replace active XDP
> program with BPF link");
>                  return -EBUSY;
>          }
>          if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
>                  NL_SET_ERR_MSG(extack, "Active program does not match
> expected");
>                  return -EEXIST;
>          }
>
> if flags has XDP_FLAGS_REPLACE, link saved prog must be equal to old_prog
> in order to do prog update.
> for sockmap prog update, in link_update (syscall.c), the only way
> we can get a non-NULL old_prog is with the following:
>
>          if (flags & BPF_F_REPLACE) {
>                  old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
>                  if (IS_ERR(old_prog)) {
>                          ret = PTR_ERR(old_prog);
>                          old_prog = NULL;
>                          goto out_put_progs;
>                  }
>          } else if (attr->link_update.old_prog_fd) {
>                  ret = -EINVAL;
>                  goto out_put_progs;
>          }
> Basically, we have BPF_F_REPLACE here.
> So similar to xdp link, I think we should check old_prog to
> be equal to link->prog in order to do link update_prog.

ah, ok, that's BPF_F_REPLACE case. See, it's confusing that we have
this logic split between multiple places, in dev_xdp_attach() it's a
bit more centralized.

>
> >
> >> +               ret = -EINVAL;
> >> +               goto out;
> >> +       }

[...]

> >> +
> >> +       ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
> >> +       if (ret) {
> >> +               bpf_link_cleanup(&link_primer);
> >> +               goto out;
> >> +       }
> >> +
> >> +       bpf_prog_inc(prog);
> > if link was created successfully, it "inherits" prog's refcnt, so you
> > shouldn't do another bpf_prog_inc()? generic link_create() logic puts
> > prog only if this function returns error
>
> The reason I did this is due to
>
> static inline void psock_set_prog(struct bpf_prog **pprog,
>                                    struct bpf_prog *prog)
> {
>          prog = xchg(pprog, prog);
>          if (prog)
>                  bpf_prog_put(prog);
> }
>
> You can see when the prog is swapped due to link_update or prog_attach,
> its reference count is decremented by 1. This is necessary for prog_attach,
> but as you mentioned, indeed, it is not necessary for link-based approach.
> Let me see whether I can refactor code to make it easy not to increase
> reference count of prog here.
>

ah, ok, its another sockmap-specific convention, np

>
> >
> >> +
> >> +       return bpf_link_settle(&link_primer);
> >> +
> >> +out:
> >> +       bpf_map_put_with_uref(map);
> >> +       return ret;
> >> +}
> >> +
> >>   static int sock_map_iter_attach_target(struct bpf_prog *prog,
> >>                                         union bpf_iter_link_info *linfo,
> >>                                         struct bpf_iter_aux_info *aux)
> >> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> >> index 9585f5345353..31660c3ffc01 100644
> >> --- a/tools/include/uapi/linux/bpf.h
> >> +++ b/tools/include/uapi/linux/bpf.h
> >> @@ -1135,6 +1135,7 @@ enum bpf_link_type {
> >>          BPF_LINK_TYPE_TCX = 11,
> >>          BPF_LINK_TYPE_UPROBE_MULTI = 12,
> >>          BPF_LINK_TYPE_NETKIT = 13,
> >> +       BPF_LINK_TYPE_SOCKMAP = 14,
> >>          __MAX_BPF_LINK_TYPE,
> >>   };
> >>
> >> @@ -6720,6 +6721,10 @@ struct bpf_link_info {
> >>                          __u32 ifindex;
> >>                          __u32 attach_type;
> >>                  } netkit;
> >> +               struct {
> >> +                       __u32 map_id;
> >> +                       __u32 attach_type;
> >> +               } sockmap;
> >>          };
> >>   } __attribute__((aligned(8)));
> >>
> >> --
> >> 2.43.0
> >>
John Fastabend April 3, 2024, 5:47 p.m. UTC | #6
Andrii Nakryiko wrote:
> On Tue, Apr 2, 2024 at 6:08 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >
> >
> > On 4/2/24 10:45 AM, Andrii Nakryiko wrote:
> > > On Mon, Mar 25, 2024 at 7:22 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> > >> Add bpf_link support for sk_msg and sk_skb programs. We have an
> > >> internal request to support bpf_link for sk_msg programs so user
> > >> space can have a uniform handling with bpf_link based libbpf
> > >> APIs. Using bpf_link based libbpf API also has a benefit which
> > >> makes system robust by decoupling prog life cycle and
> > >> attachment life cycle.
> > >>

Thanks again for working on it.

> > >> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> > >> ---
> > >>   include/linux/bpf.h            |   6 +
> > >>   include/linux/skmsg.h          |   4 +
> > >>   include/uapi/linux/bpf.h       |   5 +
> > >>   kernel/bpf/syscall.c           |   4 +
> > >>   net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++-
> > >>   tools/include/uapi/linux/bpf.h |   5 +
> > >>   6 files changed, 279 insertions(+), 8 deletions(-)
> > >>
> 
> [...]
> 
> > >>          psock_set_prog(pprog, prog);
> > >> -       return 0;
> > >> +       if (link)
> > >> +               *plink = link;
> > >> +
> > >> +out:
> > >> +       mutex_unlock(&sockmap_prog_update_mutex);
> > > why this mutex is not per-sockmap?
> >
> > My thinking is the system probably won't have lots of sockmaps and
> > sockmap attach/detach/update_prog should not be that frequent. But
> > I could be wrong.
> >

For my use case at least we have a map per protocol we want to inspect.
So it's a rather small set, <10 I would say. Also, they are created once
when the agent starts and when the config changes from the operator (the user
decides to remove/add a parser). Config changes are rather rare. I don't think
it would be particularly painful in practice to have a global
lock.

> 
> That seems like an even more of an argument to keep mutex per sockmap.
> It won't add a lot of memory, but it is conceptually cleaner, as each
> sockmap instance (and corresponding links) are completely independent,
> even from a locking perspective.
> 
> But I can't say I feel very strongly about this.
> 
> > >
> > >> +       return ret;
> > >>   }
> > >>
> 
> [...]
> 
> > >
> > >> +
> > >> +static void sock_map_link_release(struct bpf_link *link)
> > >> +{
> > >> +       struct sockmap_link *sockmap_link = get_sockmap_link(link);
> > >> +
> > >> +       mutex_lock(&sockmap_link_mutex);
> > > similar to the above, why is this mutex not sockmap-specific? And I'd
> > > just combine sockmap_link_mutex and sockmap_prog_update_mutex in this
> > > case to keep it simple.
> >
> > This is to protect sockmap_link->map. They could share the same lock.
> > Let me double check...
> 
> If you keep that global sockmap_prog_update_mutex then I'd probably
> reuse that one here for simplicity (and named it a bit more
> generically, "sockmap_mutex" or something like that, just like we have
> global "cgroup_mutex").

I was leaning toward a per-map lock, but because a global lock simplifies this
part a bunch, I would agree to just use a single sockmap_mutex throughout.

If someone has a use case where they want to add/remove maps dynamically
maybe they can let us know what that is. For us, on my todo list, I want
to just remove the map notion and bind progs to socks directly. The
original map idea was for a L7 load balancer, but other than quick hacks
I've never built such a thing nor ran it in production. Maybe someday
I'll find the time.

> 
> [...]
> 
> > >> +       if (old && link->prog != old) {
> > > hm.. even if old matches link->prog, we should unset old and set new
> > > link (link overrides prog attachment, basically), it shouldn't matter
> > > if old == link->prog, unless I'm missing something?
> >
> > In xdp link (net/core/dev.c), we have
> >
> >          cur_prog = dev_xdp_prog(dev, mode);
> >          /* can't replace attached prog with link */
> >          if (link && cur_prog) {
> >                  NL_SET_ERR_MSG(extack, "Can't replace active XDP
> > program with BPF link");
> >                  return -EBUSY;
> >          }
> >          if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
> >                  NL_SET_ERR_MSG(extack, "Active program does not match
> > expected");
> >                  return -EEXIST;
> >          }
> >
> > if flags has XDP_FLAGS_REPLACE, link saved prog must be equal to old_prog
> > in order to do prog update.
> > for sockmap prog update, in link_update (syscall.c), the only way
> > we can get a non-NULL old_prog is with the following:
> >
> >          if (flags & BPF_F_REPLACE) {
> >                  old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
> >                  if (IS_ERR(old_prog)) {
> >                          ret = PTR_ERR(old_prog);
> >                          old_prog = NULL;
> >                          goto out_put_progs;
> >                  }
> >          } else if (attr->link_update.old_prog_fd) {
> >                  ret = -EINVAL;
> >                  goto out_put_progs;
> >          }
> > Basically, we have BPF_F_REPLACE here.
> > So similar to xdp link, I think we should check old_prog to
> > be equal to link->prog in order to do link update_prog.
> 
> ah, ok, that's BPF_F_REPLACE case. See, it's confusing that we have
> this logic split between multiple places, in dev_xdp_attach() it's a
> bit more centralized.
> 
> >
> > >
> > >> +               ret = -EINVAL;
> > >> +               goto out;
> > >> +       }
> 
> [...]
> 
> > >> +
> > >> +       ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
> > >> +       if (ret) {
> > >> +               bpf_link_cleanup(&link_primer);
> > >> +               goto out;
> > >> +       }
> > >> +
> > >> +       bpf_prog_inc(prog);
> > > if link was created successfully, it "inherits" prog's refcnt, so you
> > > shouldn't do another bpf_prog_inc()? generic link_create() logic puts
> > > prog only if this function returns error
> >
> > The reason I did this is due to
> >
> > static inline void psock_set_prog(struct bpf_prog **pprog,
> >                                    struct bpf_prog *prog)
> > {
> >          prog = xchg(pprog, prog);
> >          if (prog)
> >                  bpf_prog_put(prog);
> > }
> >
> > You can see when the prog is swapped due to link_update or prog_attach,
> > its reference count is decremented by 1. This is necessary for prog_attach,
> > but as you mentioned, indeed, it is not necessary for link-based approach.
> > Let me see whether I can refactor code to make it easy not to increase
> > reference count of prog here.
> >
> 
> ah, ok, its another sockmap-specific convention, np
> 
> >
> > >
> > >> +
> > >> +       return bpf_link_settle(&link_primer);
> > >> +
> > >> +out:
> > >> +       bpf_map_put_with_uref(map);
> > >> +       return ret;
> > >> +}
> > >> +
> > >>   static int sock_map_iter_attach_target(struct bpf_prog *prog,
> > >>                                         union bpf_iter_link_info *linfo,
> > >>                                         struct bpf_iter_aux_info *aux)
> > >> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > >> index 9585f5345353..31660c3ffc01 100644
> > >> --- a/tools/include/uapi/linux/bpf.h
> > >> +++ b/tools/include/uapi/linux/bpf.h
> > >> @@ -1135,6 +1135,7 @@ enum bpf_link_type {
> > >>          BPF_LINK_TYPE_TCX = 11,
> > >>          BPF_LINK_TYPE_UPROBE_MULTI = 12,
> > >>          BPF_LINK_TYPE_NETKIT = 13,
> > >> +       BPF_LINK_TYPE_SOCKMAP = 14,
> > >>          __MAX_BPF_LINK_TYPE,
> > >>   };
> > >>
> > >> @@ -6720,6 +6721,10 @@ struct bpf_link_info {
> > >>                          __u32 ifindex;
> > >>                          __u32 attach_type;
> > >>                  } netkit;
> > >> +               struct {
> > >> +                       __u32 map_id;
> > >> +                       __u32 attach_type;
> > >> +               } sockmap;
> > >>          };
> > >>   } __attribute__((aligned(8)));
> > >>
> > >> --
> > >> 2.43.0
> > >>
Martin KaFai Lau April 3, 2024, 10:09 p.m. UTC | #7
On 4/3/24 10:47 AM, John Fastabend wrote:
> on my todo list, I want
> to just remove the map notion and bind progs to socks directly.

Run the bpf prog without the sockmap? +1, it would be nice.

> but other than quick hacks I've never built such a thing nor ran it 
> in production. 

How do you see the interface will look like (e.g. attaching the bpf prog to a sk) ?

It will be nice if the whole function (e.g. sk->sk_data_ready or maybe some of 
the sk->sk_prot) can be implemented completely in bpf. I don't have a concrete 
use case for now but I think it will be powerful.

[ It is orthogonal to what Yonghong is doing, so the title changed ]
John Fastabend April 4, 2024, 1:11 a.m. UTC | #8
Martin KaFai Lau wrote:
> On 4/3/24 10:47 AM, John Fastabend wrote:
> > on my todo list, I want
> > to just remove the map notion and bind progs to socks directly.
> 
> Run the bpf prog without the sockmap? +1, it would be nice.

Part of my motivation for doing this is almost all the bugs syzbot and
others find are related to removing sockets from the map. We never
do this in any of our code. Once a socket is in the map (added at
accept time) it stays there until TCP stack closes it.

Also we have to make up some size for the map that somehow looks like
max number of concurrent sessions for the application. For many
server applications (nginx, httpd, ...) we know this, but it is a bit
artificially derived.

> 
> > but other than quick hacks I've never built such a thing nor ran it 
> > in production. 
> 
> How do you see the interface will look like (e.g. attaching the bpf prog to a sk) ?

I would propose doing it directly with a helper/kfunc from the sockops
programs.

  attach_sk_msg_prog(sk, sk_msg_prog)
  attach_sk_skb_prog(sk, sk_skb_prog)

> 
> It will be nice if the whole function (e.g. sk->sk_data_ready or may be some of 
> the sk->sk_prot) can be implemented completely in bpf. I don't have a concrete 
> use case for now but I think it will be powerful.

Perhaps a data_ready prog could also replace the ops?

  attach_sk_data_ready(sk, sk_msg_data_ready)

The attach_sk_data_ready could use pretty much the logic we have for
creating psocks but only replace the sk_data_ready callback.

> 
> [ It is orthogonal to what Yonghong is doing, so the title changed ]
>
Yonghong Song April 4, 2024, 3:18 a.m. UTC | #9
On 4/3/24 10:47 AM, John Fastabend wrote:
> Andrii Nakryiko wrote:
>> On Tue, Apr 2, 2024 at 6:08 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>
>>> On 4/2/24 10:45 AM, Andrii Nakryiko wrote:
>>>> On Mon, Mar 25, 2024 at 7:22 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>> Add bpf_link support for sk_msg and sk_skb programs. We have an
>>>>> internal request to support bpf_link for sk_msg programs so user
>>>>> space can have a uniform handling with bpf_link based libbpf
>>>>> APIs. Using bpf_link based libbpf API also has a benefit which
>>>>> makes system robust by decoupling prog life cycle and
>>>>> attachment life cycle.
>>>>>
> Thanks again for working on it.
>
>>>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>>>> ---
>>>>>    include/linux/bpf.h            |   6 +
>>>>>    include/linux/skmsg.h          |   4 +
>>>>>    include/uapi/linux/bpf.h       |   5 +
>>>>>    kernel/bpf/syscall.c           |   4 +
>>>>>    net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++-
>>>>>    tools/include/uapi/linux/bpf.h |   5 +
>>>>>    6 files changed, 279 insertions(+), 8 deletions(-)
>>>>>
>> [...]
>>
>>>>>           psock_set_prog(pprog, prog);
>>>>> -       return 0;
>>>>> +       if (link)
>>>>> +               *plink = link;
>>>>> +
>>>>> +out:
>>>>> +       mutex_unlock(&sockmap_prog_update_mutex);
>>>> why this mutex is not per-sockmap?
>>> My thinking is the system probably won't have lots of sockmaps and
>>> sockmap attach/detach/update_prog should not be that frequent. But
>>> I could be wrong.
>>>
> For my use case at least we have a map per protocol we want to inspect.
> So its rather small set <10 I would say. Also they are created once
> when the agent starts and when config changes from operator (user decides
> to remove/add a parser). Config changing is rather rare. I don't think
> this would be particularly painful in practice now to have a global
> lock.
>
>> That seems like an even more of an argument to keep mutex per sockmap.
>> It won't add a lot of memory, but it is conceptually cleaner, as each
>> sockmap instance (and corresponding links) are completely independent,
>> even from a locking perspective.
>>
>> But I can't say I feel very strongly about this.
>>
>>>>> +       return ret;
>>>>>    }
>>>>>
>> [...]
>>
>>>>> +
>>>>> +static void sock_map_link_release(struct bpf_link *link)
>>>>> +{
>>>>> +       struct sockmap_link *sockmap_link = get_sockmap_link(link);
>>>>> +
>>>>> +       mutex_lock(&sockmap_link_mutex);
>>>> similar to the above, why is this mutex not sockmap-specific? And I'd
>>>> just combine sockmap_link_mutex and sockmap_prog_update_mutex in this
>>>> case to keep it simple.
>>> This is to protect sockmap_link->map. They could share the same lock.
>>> Let me double check...
>> If you keep that global sockmap_prog_update_mutex then I'd probably
>> reuse that one here for simplicity (and named it a bit more
>> generically, "sockmap_mutex" or something like that, just like we have
>> global "cgroup_mutex").
> I was leaning to a per map lock, but because a global lock simplifies this
> part a bunch I would agree just use a single sockmap_mutex throughout.
>
> If someone has a use case where they want to add/remove maps dynamically
> maybe they can let us know what that is. For us, on my todo list, I want
> to just remove the map notion and bind progs to socks directly. The
> original map idea was for a L7 load balancer, but other than quick hacks
> I've never built such a thing nor ran it in production. Maybe someday
> I'll find the time.

I am using a single global lock.
   https://lore.kernel.org/bpf/20240404025305.2210999-1-yonghong.song@linux.dev/
Let us see whether it makes sense or not with the code.

John, it would be great if you can review the patch set. I am afraid
that I could miss something...

>
>> [...]
>>
>>>>> +       if (old && link->prog != old) {
>>>> hm.. even if old matches link->prog, we should unset old and set new
>>>> link (link overrides prog attachment, basically), it shouldn't matter
>>>> if old == link->prog, unless I'm missing something?
>>>
[...]
Yonghong Song April 4, 2024, 3:31 a.m. UTC | #10
On 4/3/24 6:11 PM, John Fastabend wrote:
> Martin KaFai Lau wrote:
>> On 4/3/24 10:47 AM, John Fastabend wrote:
>>> on my todo list, I want
>>> to just remove the map notion and bind progs to socks directly.
>> Run the bpf prog without the sockmap? +1, it would be nice.
> Part of my motivation for doing this is almost all the bugs syzbot and
> others find are related to removing sockets from the map. We never
> do this in any of our code. Once a socket is in the map (added at
> accept time) it stays there until TCP stack closes it.
>
> Also we have to make up some size for the map that somehow looks like
> max number of concurrent sessions for the application. For many
> server applicatoins (nginx, httpd, ...) we know this, but is a bit
> artifically derived.
>
>>> but other than quick hacks I've never built such a thing nor ran it
>>> in production.
>> How do you see the interface will look like (e.g. attaching the bpf prog to a sk) ?
> I would propse doing it directly with a helper/kfunc from the sockops
> programs.
>
>    attach_sk_msg_prog(sk, sk_msg_prog)
>    attach_sk_skb_prog(sk, sk_skb_prog)
>
>> It will be nice if the whole function (e.g. sk->sk_data_ready or may be some of
>> the sk->sk_prot) can be implemented completely in bpf. I don't have a concrete
>> use case for now but I think it will be powerful.
> Perhaps a data_ready prog could also replace the ops?
>
>    attach_sk_data_ready(sk, sk_msg_data_ready)
>
> The attach_sk_data_ready could use pretty much the logic we have for
> creating psocks but only replace the sk_data_ready callback.

Sounds like a good idea. Do we need to support a detach function or atomic
update function as well? Can each sk have multiple sk_msg_prog programs?

>
>> [ It is orthogonal to what Yonghong is doing, so the title changed ]
>>
John Fastabend April 5, 2024, 4:41 a.m. UTC | #11
Yonghong Song wrote:
> 
> On 4/3/24 6:11 PM, John Fastabend wrote:
> > Martin KaFai Lau wrote:
> >> On 4/3/24 10:47 AM, John Fastabend wrote:
> >>> on my todo list, I want
> >>> to just remove the map notion and bind progs to socks directly.
> >> Run the bpf prog without the sockmap? +1, it would be nice.
> > Part of my motivation for doing this is almost all the bugs syzbot and
> > others find are related to removing sockets from the map. We never
> > do this in any of our code. Once a socket is in the map (added at
> > accept time) it stays there until TCP stack closes it.
> >
> > Also we have to make up some size for the map that somehow looks like
> > max number of concurrent sessions for the application. For many
> > server applicatoins (nginx, httpd, ...) we know this, but is a bit
> > artifically derived.
> >
> >>> but other than quick hacks I've never built such a thing nor ran it
> >>> in production.
> >> How do you see the interface will look like (e.g. attaching the bpf prog to a sk) ?
> > I would propse doing it directly with a helper/kfunc from the sockops
> > programs.
> >
> >    attach_sk_msg_prog(sk, sk_msg_prog)
> >    attach_sk_skb_prog(sk, sk_skb_prog)
> >
> >> It will be nice if the whole function (e.g. sk->sk_data_ready or may be some of
> >> the sk->sk_prot) can be implemented completely in bpf. I don't have a concrete
> >> use case for now but I think it will be powerful.
> > Perhaps a data_ready prog could also replace the ops?
> >
> >    attach_sk_data_ready(sk, sk_msg_data_ready)
> >
> > The attach_sk_data_ready could use pretty much the logic we have for
> > creating psocks but only replace the sk_data_ready callback.
> 
> sounds a good idea. Do we need to support detach function or atomic
> update function as well? Can each sk has multiple sk_msg_prog programs?

I've not found any use for multiple programs, detach functions, or updating
the psock once it's created, to be honest. That is also why syzbot finds all
the bugs in this space: we unfortunately don't stress this area much. In the
original design I had fresh in my head building hardware load balancers and the
XDP redirect bits so a map seemed natural. Also we didn't have a lot of the
machinery we have now so went with the map. As I noted above the L7 LB
hasn't really got much traction on my side at least not yet.

In reality we've been using sk_msg and sk_skb progs attaching 1:1
with protocols and observing, auditing, adding/removing fields from
data streams.

I would probably suggest for first implementation of a sk msg attach without
maps I would just make it one prog no need for multiple programs and even
skip a detach function. Maybe there is some use for multiple programs but
we just have a single agent so it hasn't come up yet. Maybe similar to cgroups
though because we only have single prog in those at the moment.

Thanks.
John Fastabend April 5, 2024, 4:42 a.m. UTC | #12
Yonghong Song wrote:
> 
> On 4/3/24 10:47 AM, John Fastabend wrote:
> > Andrii Nakryiko wrote:
> >> On Tue, Apr 2, 2024 at 6:08 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>>
> >>> On 4/2/24 10:45 AM, Andrii Nakryiko wrote:
> >>>> On Mon, Mar 25, 2024 at 7:22 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>>>> Add bpf_link support for sk_msg and sk_skb programs. We have an
> >>>>> internal request to support bpf_link for sk_msg programs so user
> >>>>> space can have a uniform handling with bpf_link based libbpf
> >>>>> APIs. Using bpf_link based libbpf API also has a benefit which
> >>>>> makes system robust by decoupling prog life cycle and
> >>>>> attachment life cycle.
> >>>>>
> > Thanks again for working on it.
> >
> >>>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> >>>>> ---
> >>>>>    include/linux/bpf.h            |   6 +
> >>>>>    include/linux/skmsg.h          |   4 +
> >>>>>    include/uapi/linux/bpf.h       |   5 +
> >>>>>    kernel/bpf/syscall.c           |   4 +
> >>>>>    net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++-
> >>>>>    tools/include/uapi/linux/bpf.h |   5 +
> >>>>>    6 files changed, 279 insertions(+), 8 deletions(-)
> >>>>>
> >> [...]
> >>
> >>>>>           psock_set_prog(pprog, prog);
> >>>>> -       return 0;
> >>>>> +       if (link)
> >>>>> +               *plink = link;
> >>>>> +
> >>>>> +out:
> >>>>> +       mutex_unlock(&sockmap_prog_update_mutex);
> >>>> why this mutex is not per-sockmap?
> >>> My thinking is the system probably won't have lots of sockmaps and
> >>> sockmap attach/detach/update_prog should not be that frequent. But
> >>> I could be wrong.
> >>>
> > For my use case at least we have a map per protocol we want to inspect.
> > So its rather small set <10 I would say. Also they are created once
> > when the agent starts and when config changes from operator (user decides
> > to remove/add a parser). Config changing is rather rare. I don't think
> > this would be paticularly painful in practice now to have a global
> > lock.
> >
> >> That seems like an even more of an argument to keep mutex per sockmap.
> >> It won't add a lot of memory, but it is conceptually cleaner, as each
> >> sockmap instance (and corresponding links) are completely independent,
> >> even from a locking perspective.
> >>
> >> But I can't say I feel very strongly about this.
> >>
> >>>>> +       return ret;
> >>>>>    }
> >>>>>
> >> [...]
> >>
> >>>>> +
> >>>>> +static void sock_map_link_release(struct bpf_link *link)
> >>>>> +{
> >>>>> +       struct sockmap_link *sockmap_link = get_sockmap_link(link);
> >>>>> +
> >>>>> +       mutex_lock(&sockmap_link_mutex);
> >>>> similar to the above, why is this mutex not sockmap-specific? And I'd
> >>>> just combine sockmap_link_mutex and sockmap_prog_update_mutex in this
> >>>> case to keep it simple.
> >>> This is to protect sockmap_link->map. They could share the same lock.
> >>> Let me double check...
> >> If you keep that global sockmap_prog_update_mutex then I'd probably
> >> reuse that one here for simplicity (and named it a bit more
> >> generically, "sockmap_mutex" or something like that, just like we have
> >> global "cgroup_mutex").
> > I was leaning to a per map lock, but because a global lock simplifies this
> > part a bunch I would agree just use a single sockmap_mutex throughout.
> >
> > If someone has a use case where they want to add/remove maps dynamically
> > maybe they can let us know what that is. For us, on my todo list, I want
> > to just remove the map notion and bind progs to socks directly. The
> > original map idea was for a L7 load balancer, but other than quick hacks
> > I've never built such a thing nor ran it in production. Maybe someday
> > I'll find the time.
> 
> I am using a single global lock.
>    https://lore.kernel.org/bpf/20240404025305.2210999-1-yonghong.song@linux.dev/
> Let us whether it makes sense or not with code.
> 
> John, it would be great if you can review the patch set. I am afraid
> that I could miss something...

Yep I will. Hopefully tonight because I intended to do it today but worse
case top of list tomorrow. I can also drop it into our test harness which
runs some longer running stress stuff. Thanks!
Martin KaFai Lau April 6, 2024, 1:10 a.m. UTC | #13
On 4/4/24 9:41 PM, John Fastabend wrote:

>>>> How do you see the interface will look like (e.g. attaching the bpf prog to a sk) ?
>>> I would propse doing it directly with a helper/kfunc from the sockops
>>> programs.
>>>
>>>     attach_sk_msg_prog(sk, sk_msg_prog)
>>>     attach_sk_skb_prog(sk, sk_skb_prog)

or the whole 'struct sk_psock_progs'

attach_sk_parser(struct sock *sk, struct sk_psock_progs *progs).

>>>
>>>> It will be nice if the whole function (e.g. sk->sk_data_ready or may be some of
>>>> the sk->sk_prot) can be implemented completely in bpf. I don't have a concrete
>>>> use case for now but I think it will be powerful.
>>> Perhaps a data_ready prog could also replace the ops?
>>>
>>>     attach_sk_data_ready(sk, sk_msg_data_ready)

Other than sk_data_ready, I am also wondering how much of the 'struct proto' can 
be written in bpf. For example, the {tcp,udp}_bpf_prots.
Maybe, with some help from new kfuncs, some of the functions can just use the 
kernel default ones.

>>> The attach_sk_data_ready could use pretty much the logic we have for
>>> creating psocks but only replace the sk_data_ready callback.
>>
>> sounds a good idea. Do we need to support detach function or atomic
>> update function as well? Can each sk has multiple sk_msg_prog programs?
> 
> I've not found any use for multiple programs, detach functions, or updating
> the psock once its created to be honest. Also why syzbot finds all the bugs
> in this space because we unfortunately don't stress this area much. In the
> original design I had fresh in my head building hardware load balancers and the
> XDP redirect bits so a map seemed natural. Also we didn't have a lot of the
> machinery we have now so went with the map. As I noted above the L7 LB
> hasn't really got much traction on my side at least not yet.
> 
> In reality we've been using sk_msg and sk_skb progs attaching 1:1
> with protocols and observing, auditing, adding/removing fields from
> data streams.
> 
> I would probably suggest for first implementation of a sk msg attach without
> maps I would just make it one prog no need for multiple programs and even
> skip a detach function. Maybe there is some use for multiple programs but

I would at least keep the detach (and update) program possibility open. Is it 
still too hard to support them without a map getting in the way?
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 62762390c93d..5034c1b4ded7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2996,6 +2996,7 @@  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
 			    union bpf_attr __user *uattr);
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
 
 void sock_map_unhash(struct sock *sk);
 void sock_map_destroy(struct sock *sk);
@@ -3094,6 +3095,11 @@  static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
 {
 	return -EINVAL;
 }
+
+static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e65ec3fd2799..9c8dd4c01412 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,10 @@  struct sk_psock_progs {
 	struct bpf_prog			*stream_parser;
 	struct bpf_prog			*stream_verdict;
 	struct bpf_prog			*skb_verdict;
+	struct bpf_link			*msg_parser_link;
+	struct bpf_link			*stream_parser_link;
+	struct bpf_link			*stream_verdict_link;
+	struct bpf_link			*skb_verdict_link;
 };
 
 enum sk_psock_state_bits {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9585f5345353..31660c3ffc01 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1135,6 +1135,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
+	BPF_LINK_TYPE_SOCKMAP = 14,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -6720,6 +6721,10 @@  struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} netkit;
+		struct {
+			__u32 map_id;
+			__u32 attach_type;
+		} sockmap;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e44c276e8617..7d392ec83655 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5213,6 +5213,10 @@  static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		ret = netns_bpf_link_create(attr, prog);
 		break;
+	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_SK_SKB:
+		ret = sock_map_link_create(attr, prog);
+		break;
 #ifdef CONFIG_NET
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 27d733c0f65e..dafc9aa6e192 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,8 +24,12 @@  struct bpf_stab {
 #define SOCK_CREATE_FLAG_MASK				\
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+static DEFINE_MUTEX(sockmap_prog_update_mutex);
+static DEFINE_MUTEX(sockmap_link_mutex);
+
 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-				struct bpf_prog *old, u32 which);
+				struct bpf_prog *old, struct bpf_link *link,
+				u32 which);
 static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
@@ -71,7 +75,7 @@  int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
+	ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
 	fdput(f);
 	return ret;
 }
@@ -103,7 +107,7 @@  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
 		goto put_prog;
 	}
 
-	ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
+	ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
 put_prog:
 	bpf_prog_put(prog);
 put_map:
@@ -1488,21 +1492,90 @@  static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
 	return 0;
 }
 
+static int sock_map_link_lookup(struct bpf_map *map, struct bpf_link ***plink,
+				struct bpf_link *link, bool skip_check, u32 which)
+{
+	struct sk_psock_progs *progs = sock_map_progs(map);
+
+	switch (which) {
+	case BPF_SK_MSG_VERDICT:
+		if (!skip_check &&
+		    ((!link && progs->msg_parser_link) ||
+		     (link && link != progs->msg_parser_link)))
+			return -EBUSY;
+		*plink = &progs->msg_parser_link;
+		break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+	case BPF_SK_SKB_STREAM_PARSER:
+		if (!skip_check &&
+		    ((!link && progs->stream_parser_link) ||
+		     (link && link != progs->stream_parser_link)))
+			return -EBUSY;
+		*plink = &progs->stream_parser_link;
+		break;
+#endif
+	case BPF_SK_SKB_STREAM_VERDICT:
+		if (!skip_check &&
+		    ((!link && progs->stream_verdict_link) ||
+		     (link && link != progs->stream_verdict_link)))
+			return -EBUSY;
+		*plink = &progs->stream_verdict_link;
+		break;
+	case BPF_SK_SKB_VERDICT:
+		if (!skip_check &&
+		    ((!link && progs->skb_verdict_link) ||
+		     (link && link != progs->skb_verdict_link)))
+			return -EBUSY;
+		*plink = &progs->skb_verdict_link;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/* Handle the following four cases:
+ * prog_attach: prog != NULL, old == NULL, link == NULL
+ * prog_detach: prog == NULL, old != NULL, link == NULL
+ * link_attach: prog != NULL, old == NULL, link != NULL
+ * link_detach: prog == NULL, old != NULL, link != NULL
+ */
 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-				struct bpf_prog *old, u32 which)
+				struct bpf_prog *old, struct bpf_link *link,
+				u32 which)
 {
 	struct bpf_prog **pprog;
+	struct bpf_link **plink;
 	int ret;
 
+	mutex_lock(&sockmap_prog_update_mutex);
+
 	ret = sock_map_prog_lookup(map, &pprog, which);
 	if (ret)
-		return ret;
+		goto out;
 
-	if (old)
-		return psock_replace_prog(pprog, prog, old);
+	if (!link || prog)
+		ret = sock_map_link_lookup(map, &plink, NULL, false, which);
+	else
+		ret = sock_map_link_lookup(map, &plink, NULL, true, which);
+	if (ret)
+		goto out;
+
+	if (old) {
+		ret = psock_replace_prog(pprog, prog, old);
+		if (!ret)
+			*plink = NULL;
+		goto out;
+	}
 
 	psock_set_prog(pprog, prog);
-	return 0;
+	if (link)
+		*plink = link;
+
+out:
+	mutex_unlock(&sockmap_prog_update_mutex);
+	return ret;
 }
 
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
@@ -1657,6 +1730,180 @@  void sock_map_close(struct sock *sk, long timeout)
 }
 EXPORT_SYMBOL_GPL(sock_map_close);
 
+struct sockmap_link {
+	struct bpf_link link;
+	struct bpf_map *map;
+	enum bpf_attach_type attach_type;
+};
+
+static struct sockmap_link *get_sockmap_link(const struct bpf_link *link)
+{
+	return container_of(link, struct sockmap_link, link);
+}
+
+static void sock_map_link_release(struct bpf_link *link)
+{
+	struct sockmap_link *sockmap_link = get_sockmap_link(link);
+
+	mutex_lock(&sockmap_link_mutex);
+	if (sockmap_link->map) {
+		(void)sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
+					   sockmap_link->attach_type);
+		bpf_map_put_with_uref(sockmap_link->map);
+		sockmap_link->map = NULL;
+	}
+	mutex_unlock(&sockmap_link_mutex);
+}
+
+static int sock_map_link_detach(struct bpf_link *link)
+{
+	sock_map_link_release(link);
+	return 0;
+}
+
+static void sock_map_link_dealloc(struct bpf_link *link)
+{
+	kfree(get_sockmap_link(link));
+}
+
+/* Handle the following two cases:
+ * case 1: link != NULL, prog != NULL, old != NULL
+ * case 2: link != NULL, prog != NULL, old == NULL
+ */
+static int sock_map_link_update_prog(struct bpf_link *link,
+				     struct bpf_prog *prog,
+				     struct bpf_prog *old)
+{
+	const struct sockmap_link *sockmap_link = get_sockmap_link(link);
+	struct bpf_prog **pprog;
+	struct bpf_link **plink;
+	int ret = 0;
+
+	mutex_lock(&sockmap_prog_update_mutex);
+
+	/* If old prog not NULL, ensure old prog the same as link->prog. */
+	if (old && link->prog != old) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* Ensure link->prog has the same type/attach_type as the new prog. */
+	if (link->prog->type != prog->type ||
+	    link->prog->expected_attach_type != prog->expected_attach_type) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = sock_map_prog_lookup(sockmap_link->map, &pprog,
+				   sockmap_link->attach_type);
+	if (ret)
+		goto out;
+
+	/* Ensure the same link between the one in map and the passed-in. */
+	ret = sock_map_link_lookup(sockmap_link->map, &plink, link, false,
+				   sockmap_link->attach_type);
+	if (ret)
+		goto out;
+
+	if (old)
+		return psock_replace_prog(pprog, prog, old);
+
+	psock_set_prog(pprog, prog);
+
+out:
+	if (!ret)
+		bpf_prog_inc(prog);
+	mutex_unlock(&sockmap_prog_update_mutex);
+	return ret;
+}
+
+static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
+{
+	u32 map_id = 0;
+
+	mutex_lock(&sockmap_link_mutex);
+	if (sockmap_link->map)
+		map_id = sockmap_link->map->id;
+	mutex_unlock(&sockmap_link_mutex);
+	return map_id;
+}
+
+static int sock_map_link_fill_info(const struct bpf_link *link,
+				   struct bpf_link_info *info)
+{
+	const struct sockmap_link *sockmap_link = get_sockmap_link(link);
+	u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+	info->sockmap.map_id = map_id;
+	info->sockmap.attach_type = sockmap_link->attach_type;
+	return 0;
+}
+
+static void sock_map_link_show_fdinfo(const struct bpf_link *link,
+				      struct seq_file *seq)
+{
+	const struct sockmap_link *sockmap_link = get_sockmap_link(link);
+	u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+	seq_printf(seq, "map_id:\t%u\n", map_id);
+	seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type);
+}
+
+static const struct bpf_link_ops sock_map_link_ops = {
+	.release = sock_map_link_release,
+	.dealloc = sock_map_link_dealloc,
+	.detach = sock_map_link_detach,
+	.update_prog = sock_map_link_update_prog,
+	.fill_link_info = sock_map_link_fill_info,
+	.show_fdinfo = sock_map_link_show_fdinfo,
+};
+
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct sockmap_link *sockmap_link;
+	enum bpf_attach_type attach_type;
+	struct bpf_map *map;
+	int ret;
+
+	if (attr->link_create.flags)
+		return -EINVAL;
+
+	map = bpf_map_get_with_uref(attr->link_create.target_fd);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
+	if (!sockmap_link) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	attach_type = attr->link_create.attach_type;
+	bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
+	sockmap_link->map = map;
+	sockmap_link->attach_type = attach_type;
+
+	ret = bpf_link_prime(&sockmap_link->link, &link_primer);
+	if (ret) {
+		kfree(sockmap_link);
+		goto out;
+	}
+
+	ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
+	if (ret) {
+		bpf_link_cleanup(&link_primer);
+		goto out;
+	}
+
+	bpf_prog_inc(prog);
+
+	return bpf_link_settle(&link_primer);
+
+out:
+	bpf_map_put_with_uref(map);
+	return ret;
+}
+
 static int sock_map_iter_attach_target(struct bpf_prog *prog,
 				       union bpf_iter_link_info *linfo,
 				       struct bpf_iter_aux_info *aux)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9585f5345353..31660c3ffc01 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1135,6 +1135,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
+	BPF_LINK_TYPE_SOCKMAP = 14,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -6720,6 +6721,10 @@  struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} netkit;
+		struct {
+			__u32 map_id;
+			__u32 attach_type;
+		} sockmap;
 	};
 } __attribute__((aligned(8)));