diff mbox series

[net-next,v3] sock_map: dump socket map id via diag

Message ID 20230319191913.61236-1-xiyou.wangcong@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series [net-next,v3] sock_map: dump socket map id via diag | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1461 this patch: 1461
netdev/cc_maintainers warning 17 maintainers not CCed: davem@davemloft.net andrii@kernel.org dan.j.williams@intel.com dsahern@kernel.org pabeni@redhat.com song@kernel.org gustavoars@kernel.org haoluo@google.com yhs@fb.com kuba@kernel.org edumazet@google.com daniel@iogearbox.net kpsingh@kernel.org kuniyu@amazon.com jolsa@kernel.org martin.lau@linux.dev ast@kernel.org
netdev/build_clang success Errors and warnings before: 168 this patch: 168
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1455 this patch: 1455
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 114 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc

Commit Message

Cong Wang March 19, 2023, 7:19 p.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

Currently there is no way to know from the outside which sockmap a socket
has been added to, especially since a socket can be added to multiple
sockmaps. We could dump this via socket diag, as shown below.

Sample output:

  # ./iproute2/misc/ss -tnaie --bpf-map
  ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1

  # bpftool map
  1: sockmap  flags 0x0
  	key 4B  value 4B  max_entries 2  memlock 4096B
	pids echo-sockmap(549)
  4: array  name pid_iter.rodata  flags 0x480
	key 4B  value 4B  max_entries 1  memlock 4096B
	btf_id 10  frozen
	pids bpftool(624)

In the future, we could dump other sockmap related stats too, hence I
make it a nested attribute.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
v3: remove redundant rcu read lock
    use likely() for psock check

v2: rename enum's with more generic names
    sock_map_idiag_dump -> sock_map_diag_dump()
    make sock_map_diag_dump() return number of maps

 include/linux/bpf.h            |  1 +
 include/uapi/linux/inet_diag.h |  1 +
 include/uapi/linux/sock_diag.h |  8 ++++++
 include/uapi/linux/unix_diag.h |  1 +
 net/core/sock_map.c            | 46 ++++++++++++++++++++++++++++++++++
 net/ipv4/inet_diag.c           |  5 ++++
 net/unix/diag.c                |  6 +++++
 7 files changed, 68 insertions(+)

Comments

Stanislav Fomichev March 20, 2023, 6:13 p.m. UTC | #1
On Sun, Mar 19, 2023 at 12:19 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> From: Cong Wang <cong.wang@bytedance.com>
>
> Currently there is no way to know which sockmap a socket has been added
> to from outside, especially for that a socket can be added to multiple
> sockmap's. We could dump this via socket diag, as shown below.
>
> Sample output:
>
>   # ./iproute2/misc/ss -tnaie --bpf-map
>   ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1
>
>   # bpftool map
>   1: sockmap  flags 0x0
>         key 4B  value 4B  max_entries 2  memlock 4096B
>         pids echo-sockmap(549)
>   4: array  name pid_iter.rodata  flags 0x480
>         key 4B  value 4B  max_entries 1  memlock 4096B
>         btf_id 10  frozen
>         pids bpftool(624)
>
> In the future, we could dump other sockmap related stats too, hence I
> make it a nested attribute.
>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>

Looks fine from my POV, will let others comment.

One thing I still don't understand here: what is missing from the
socket iterators to implement this? Is it all the sk_psock_get magic?
I remember you dismissed Yonghong's suggestion on v1, but have you
actually tried it?

Also: a test would be nice to have. I know you've tested it with the
iproute2, but having something regularly exercised by the ci seems
good to have (and not a ton of work).

> ---
> v3: remove redundant rcu read lock
>     use likely() for psock check
>
> v2: rename enum's with more generic names
>     sock_map_idiag_dump -> sock_map_diag_dump()
>     make sock_map_diag_dump() return number of maps
>
>  include/linux/bpf.h            |  1 +
>  include/uapi/linux/inet_diag.h |  1 +
>  include/uapi/linux/sock_diag.h |  8 ++++++
>  include/uapi/linux/unix_diag.h |  1 +
>  net/core/sock_map.c            | 46 ++++++++++++++++++++++++++++++++++
>  net/ipv4/inet_diag.c           |  5 ++++
>  net/unix/diag.c                |  6 +++++
>  7 files changed, 68 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 6792a7940e1e..4cc315ce26a9 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2638,6 +2638,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr,
>  void sock_map_unhash(struct sock *sk);
>  void sock_map_destroy(struct sock *sk);
>  void sock_map_close(struct sock *sk, long timeout);
> +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attr);
>  #else
>  static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
>                                             struct bpf_prog_aux *prog_aux)
> diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
> index 50655de04c9b..d1f1e4522633 100644
> --- a/include/uapi/linux/inet_diag.h
> +++ b/include/uapi/linux/inet_diag.h
> @@ -161,6 +161,7 @@ enum {
>         INET_DIAG_SK_BPF_STORAGES,
>         INET_DIAG_CGROUP_ID,
>         INET_DIAG_SOCKOPT,
> +       INET_DIAG_BPF_MAP,
>         __INET_DIAG_MAX,
>  };
>
> diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
> index 5f74a5f6091d..7c961940b408 100644
> --- a/include/uapi/linux/sock_diag.h
> +++ b/include/uapi/linux/sock_diag.h
> @@ -62,4 +62,12 @@ enum {
>
>  #define SK_DIAG_BPF_STORAGE_MAX        (__SK_DIAG_BPF_STORAGE_MAX - 1)
>
> +enum {
> +       SK_DIAG_BPF_MAP_NONE,
> +       SK_DIAG_BPF_MAP_IDS,
> +       __SK_DIAG_BPF_MAP_MAX,
> +};
> +
> +#define SK_DIAG_BPF_MAP_MAX        (__SK_DIAG_BPF_MAP_MAX - 1)
> +
>  #endif /* _UAPI__SOCK_DIAG_H__ */
> diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h
> index a1988576fa8a..b95a2b33521d 100644
> --- a/include/uapi/linux/unix_diag.h
> +++ b/include/uapi/linux/unix_diag.h
> @@ -42,6 +42,7 @@ enum {
>         UNIX_DIAG_MEMINFO,
>         UNIX_DIAG_SHUTDOWN,
>         UNIX_DIAG_UID,
> +       UNIX_DIAG_BPF_MAP,
>
>         __UNIX_DIAG_MAX,
>  };
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 9b854e236d23..c4049095f64e 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -1656,6 +1656,52 @@ void sock_map_close(struct sock *sk, long timeout)
>  }
>  EXPORT_SYMBOL_GPL(sock_map_close);
>
> +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attrtype)
> +{
> +       struct sk_psock_link *link;
> +       struct nlattr *nla, *attr;
> +       int nr_links = 0, ret = 0;
> +       struct sk_psock *psock;
> +       u32 *ids;
> +
> +       psock = sk_psock_get(sk);
> +       if (likely(!psock))
> +               return 0;
> +
> +       nla = nla_nest_start_noflag(skb, attrtype);
> +       if (!nla) {
> +               sk_psock_put(sk, psock);
> +               return -EMSGSIZE;
> +       }
> +       spin_lock_bh(&psock->link_lock);
> +       list_for_each_entry(link, &psock->link, list)
> +               nr_links++;
> +
> +       attr = nla_reserve(skb, SK_DIAG_BPF_MAP_IDS,
> +                          sizeof(link->map->id) * nr_links);
> +       if (!attr) {
> +               ret = -EMSGSIZE;
> +               goto unlock;
> +       }
> +
> +       ids = nla_data(attr);
> +       list_for_each_entry(link, &psock->link, list) {
> +               *ids = link->map->id;
> +               ids++;
> +       }
> +unlock:
> +       spin_unlock_bh(&psock->link_lock);
> +       sk_psock_put(sk, psock);
> +       if (ret) {
> +               nla_nest_cancel(skb, nla);
> +       } else {
> +               ret = nr_links;
> +               nla_nest_end(skb, nla);
> +       }
> +       return ret;
> +}
> +EXPORT_SYMBOL_GPL(sock_map_diag_dump);
> +
>  static int sock_map_iter_attach_target(struct bpf_prog *prog,
>                                        union bpf_iter_link_info *linfo,
>                                        struct bpf_iter_aux_info *aux)
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index b812eb36f0e3..0949909d5b46 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -197,6 +197,11 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
>                     &inet_sockopt))
>                 goto errout;
>
> +#ifdef CONFIG_BPF_SYSCALL
> +       if (sock_map_diag_dump(sk, skb, INET_DIAG_BPF_MAP) < 0)
> +               goto errout;
> +#endif
> +
>         return 0;
>  errout:
>         return 1;
> diff --git a/net/unix/diag.c b/net/unix/diag.c
> index 616b55c5b890..54aa8da2831e 100644
> --- a/net/unix/diag.c
> +++ b/net/unix/diag.c
> @@ -6,6 +6,7 @@
>  #include <linux/skbuff.h>
>  #include <linux/module.h>
>  #include <linux/uidgid.h>
> +#include <linux/bpf.h>
>  #include <net/netlink.h>
>  #include <net/af_unix.h>
>  #include <net/tcp_states.h>
> @@ -172,6 +173,11 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
>             sk_diag_dump_uid(sk, skb, user_ns))
>                 goto out_nlmsg_trim;
>
> +#ifdef CONFIG_BPF_SYSCALL
> +       if (sock_map_diag_dump(sk, skb, UNIX_DIAG_BPF_MAP) < 0)
> +               goto out_nlmsg_trim;
> +#endif
> +
>         nlmsg_end(skb, nlh);
>         return 0;
>
> --
> 2.34.1
>
Martin KaFai Lau March 20, 2023, 8:29 p.m. UTC | #2
On 3/20/23 11:13 AM, Stanislav Fomichev wrote:
> One thing I still don't understand here: what is missing from the
> socket iterators to implement this? Is it all the sk_psock_get magic?
> I remember you dismissed Yonghong's suggestion on v1, but have you
> actually tried it?
It would be useful to know what is missing to print the bpf map id without adding 
kernel code. There is a new bpf_rdonly_cast() which will be useful here also.
Cong Wang March 25, 2023, 8:07 p.m. UTC | #3
On Mon, Mar 20, 2023 at 11:13:03AM -0700, Stanislav Fomichev wrote:
> On Sun, Mar 19, 2023 at 12:19 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > Currently there is no way to know which sockmap a socket has been added
> > to from outside, especially for that a socket can be added to multiple
> > sockmap's. We could dump this via socket diag, as shown below.
> >
> > Sample output:
> >
> >   # ./iproute2/misc/ss -tnaie --bpf-map
> >   ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1
> >
> >   # bpftool map
> >   1: sockmap  flags 0x0
> >         key 4B  value 4B  max_entries 2  memlock 4096B
> >         pids echo-sockmap(549)
> >   4: array  name pid_iter.rodata  flags 0x480
> >         key 4B  value 4B  max_entries 1  memlock 4096B
> >         btf_id 10  frozen
> >         pids bpftool(624)
> >
> > In the future, we could dump other sockmap related stats too, hence I
> > make it a nested attribute.
> >
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> 
> Looks fine from my POW, will let others comment.
> 
> One thing I still don't understand here: what is missing from the
> socket iterators to implement this? Is it all the sk_psock_get magic?
> I remember you dismissed Yonghong's suggestion on v1, but have you
> actually tried it?

I am very confused. So in order to figure out which sockmap a socket has
been added to, I have to dump *all* sockmap's??? It seems you are
suggesting to solve this with a more complex and unnecessary approach?
Please tell me why, I am really lost, I don't even see there is a point
to make here.

> 
> Also: a test would be nice to have. I know you've tested it with the
> iproute2, but having something regularly exercised by the ci seems
> good to have (and not a ton of work).

Sure, so where are the tests for socket diag? I don't see any within the
tree:

$ git grep INET_DIAG_SOCKOPT -- tools/
$

Note, this is not suitable for bpf selftests, because it is less relevant
to bpf, much more relevant to socket diag. I thought this is obvious.

Thanks.
Cong Wang March 25, 2023, 8:27 p.m. UTC | #4
On Mon, Mar 20, 2023 at 01:29:39PM -0700, Martin KaFai Lau wrote:
> On 3/20/23 11:13 AM, Stanislav Fomichev wrote:
> > One thing I still don't understand here: what is missing from the
> > socket iterators to implement this? Is it all the sk_psock_get magic?
> > I remember you dismissed Yonghong's suggestion on v1, but have you
> > actually tried it?
> would be useful to know what is missing to print the bpf map id without
> adding kernel code. There is new bpf_rdonly_cast() which will be useful here
> also.

So you don't consider eBPF code as kernel code, right? Interestingly
enough, eBPF code runs in kernel... and you still need to write eBPF
code. So what is the point of "without adding kernel code" here?

What is even more interesting is that even your own code does not agree
with you here, for example, you introduced INET_DIAG_SK_BPF_STORAGES, so
what was missing to print sk bpf storage without adding kernel code?

Thanks.
Martin KaFai Lau March 25, 2023, 10:10 p.m. UTC | #5
On 3/25/23 1:27 PM, Cong Wang wrote:
> On Mon, Mar 20, 2023 at 01:29:39PM -0700, Martin KaFai Lau wrote:
>> On 3/20/23 11:13 AM, Stanislav Fomichev wrote:
>>> One thing I still don't understand here: what is missing from the
>>> socket iterators to implement this? Is it all the sk_psock_get magic?
>>> I remember you dismissed Yonghong's suggestion on v1, but have you
>>> actually tried it?
>> would be useful to know what is missing to print the bpf map id without
>> adding kernel code. There is new bpf_rdonly_cast() which will be useful here
>> also.
> 
> So you don't consider eBPF code as kernel code, right? Interestingly
> enough, eBPF code runs in kernel... and you still need to write eBPF
> code. So what is the point of "without adding kernel code" here?

Interesting, how is it the same? if it needs to print something other than the 
map id in the future, even putting aside further kernel maintenance and 
additional review, does a new bpf prog need to upgrade and reboot the kernel?

Based on your idea, all possible sk_filter automatically qualify to be added to 
the kernel source tree now because they also run in the kernel?

> 
> What is even more interesting is that even your own code does not agree
> with you here, for example, you introduced INET_DIAG_SK_BPF_STORAGES, so
> what was missing to print sk bpf storage without adding kernel code?

Yep, you are absolutely correct. Only if bpf-iter was available earlier. If 
bpf-iter was available before INET_DIAG_SK_BPF_STORAGES was added, there was no 
need to add INET_DIAG_SK_BPF_STORAGES and it would be no brainer to explore the 
bpf-iter approach first. Things have been improving.

The question (from a few people) was to figure out what was missing in the 
bpf-iter approach to print this bpf bits and trying to help. It is the only 
reason I replied. Apparently, you have not even tried and not interested.
Stanislav Fomichev March 27, 2023, 4:33 p.m. UTC | #6
On 03/25, Cong Wang wrote:
> On Mon, Mar 20, 2023 at 11:13:03AM -0700, Stanislav Fomichev wrote:
> > On Sun, Mar 19, 2023 at 12:19 PM Cong Wang <xiyou.wangcong@gmail.com>  
> wrote:
> > >
> > > From: Cong Wang <cong.wang@bytedance.com>
> > >
> > > Currently there is no way to know which sockmap a socket has been  
> added
> > > to from outside, especially for that a socket can be added to multiple
> > > sockmap's. We could dump this via socket diag, as shown below.
> > >
> > > Sample output:
> > >
> > >   # ./iproute2/misc/ss -tnaie --bpf-map
> > >   ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912  
> ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <->  
> sockmap: 1
> > >
> > >   # bpftool map
> > >   1: sockmap  flags 0x0
> > >         key 4B  value 4B  max_entries 2  memlock 4096B
> > >         pids echo-sockmap(549)
> > >   4: array  name pid_iter.rodata  flags 0x480
> > >         key 4B  value 4B  max_entries 1  memlock 4096B
> > >         btf_id 10  frozen
> > >         pids bpftool(624)
> > >
> > > In the future, we could dump other sockmap related stats too, hence I
> > > make it a nested attribute.
> > >
> > > Cc: John Fastabend <john.fastabend@gmail.com>
> > > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> >
> > Looks fine from my POW, will let others comment.
> >
> > One thing I still don't understand here: what is missing from the
> > socket iterators to implement this? Is it all the sk_psock_get magic?
> > I remember you dismissed Yonghong's suggestion on v1, but have you
> > actually tried it?

> I am very confused. So in order to figure out which sockmap a socket has
> been added to, I have to dump *all* sockmap's??? It seems you are
> suggesting to solve this with a more complex and unnecessary approach?
> Please tell me why, I am really lost, I don't even see there is a point
> to make here.

With a socket iter, you can iterate over all sockets and run some bpf
program on each one to dump some state. So you'd iterate over the sockets,
not sockmaps. For each socket you get a pointer to sock and you do the same
sk_psock_get+list_for_each_entry(psock->link).

(in theory; would be interesting to see whether it works in practice)

> >
> > Also: a test would be nice to have. I know you've tested it with the
> > iproute2, but having something regularly exercised by the ci seems
> > good to have (and not a ton of work).

> Sure, so where are the tests for socket diag? I don't see any within the
> tree:

> $ git grep INET_DIAG_SOCKOPT -- tools/
> $

> Note, this is not suitable for bpf selftests, because it is less relevant
> to bpf, much more relevant to socket diag. I thought this is obvious.

Never too late to start testing those sock diag paths :-)
Put them in the net selftests if you don't think bpf selftests is the right
fit?

> Thanks.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..4cc315ce26a9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2638,6 +2638,7 @@  int sock_map_bpf_prog_query(const union bpf_attr *attr,
 void sock_map_unhash(struct sock *sk);
 void sock_map_destroy(struct sock *sk);
 void sock_map_close(struct sock *sk, long timeout);
+int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attr);
 #else
 static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
 					    struct bpf_prog_aux *prog_aux)
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..d1f1e4522633 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -161,6 +161,7 @@  enum {
 	INET_DIAG_SK_BPF_STORAGES,
 	INET_DIAG_CGROUP_ID,
 	INET_DIAG_SOCKOPT,
+	INET_DIAG_BPF_MAP,
 	__INET_DIAG_MAX,
 };
 
diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index 5f74a5f6091d..7c961940b408 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -62,4 +62,12 @@  enum {
 
 #define SK_DIAG_BPF_STORAGE_MAX        (__SK_DIAG_BPF_STORAGE_MAX - 1)
 
+enum {
+	SK_DIAG_BPF_MAP_NONE,
+	SK_DIAG_BPF_MAP_IDS,
+	__SK_DIAG_BPF_MAP_MAX,
+};
+
+#define SK_DIAG_BPF_MAP_MAX        (__SK_DIAG_BPF_MAP_MAX - 1)
+
 #endif /* _UAPI__SOCK_DIAG_H__ */
diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h
index a1988576fa8a..b95a2b33521d 100644
--- a/include/uapi/linux/unix_diag.h
+++ b/include/uapi/linux/unix_diag.h
@@ -42,6 +42,7 @@  enum {
 	UNIX_DIAG_MEMINFO,
 	UNIX_DIAG_SHUTDOWN,
 	UNIX_DIAG_UID,
+	UNIX_DIAG_BPF_MAP,
 
 	__UNIX_DIAG_MAX,
 };
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 9b854e236d23..c4049095f64e 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1656,6 +1656,52 @@  void sock_map_close(struct sock *sk, long timeout)
 }
 EXPORT_SYMBOL_GPL(sock_map_close);
 
+int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+	struct sk_psock_link *link;
+	struct nlattr *nla, *attr;
+	int nr_links = 0, ret = 0;
+	struct sk_psock *psock;
+	u32 *ids;
+
+	psock = sk_psock_get(sk);
+	if (likely(!psock))
+		return 0;
+
+	nla = nla_nest_start_noflag(skb, attrtype);
+	if (!nla) {
+		sk_psock_put(sk, psock);
+		return -EMSGSIZE;
+	}
+	spin_lock_bh(&psock->link_lock);
+	list_for_each_entry(link, &psock->link, list)
+		nr_links++;
+
+	attr = nla_reserve(skb, SK_DIAG_BPF_MAP_IDS,
+			   sizeof(link->map->id) * nr_links);
+	if (!attr) {
+		ret = -EMSGSIZE;
+		goto unlock;
+	}
+
+	ids = nla_data(attr);
+	list_for_each_entry(link, &psock->link, list) {
+		*ids = link->map->id;
+		ids++;
+	}
+unlock:
+	spin_unlock_bh(&psock->link_lock);
+	sk_psock_put(sk, psock);
+	if (ret) {
+		nla_nest_cancel(skb, nla);
+	} else {
+		ret = nr_links;
+		nla_nest_end(skb, nla);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sock_map_diag_dump);
+
 static int sock_map_iter_attach_target(struct bpf_prog *prog,
 				       union bpf_iter_link_info *linfo,
 				       struct bpf_iter_aux_info *aux)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index b812eb36f0e3..0949909d5b46 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -197,6 +197,11 @@  int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 		    &inet_sockopt))
 		goto errout;
 
+#ifdef CONFIG_BPF_SYSCALL
+	if (sock_map_diag_dump(sk, skb, INET_DIAG_BPF_MAP) < 0)
+		goto errout;
+#endif
+
 	return 0;
 errout:
 	return 1;
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 616b55c5b890..54aa8da2831e 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -6,6 +6,7 @@ 
 #include <linux/skbuff.h>
 #include <linux/module.h>
 #include <linux/uidgid.h>
+#include <linux/bpf.h>
 #include <net/netlink.h>
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
@@ -172,6 +173,11 @@  static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	    sk_diag_dump_uid(sk, skb, user_ns))
 		goto out_nlmsg_trim;
 
+#ifdef CONFIG_BPF_SYSCALL
+	if (sock_map_diag_dump(sk, skb, UNIX_DIAG_BPF_MAP) < 0)
+		goto out_nlmsg_trim;
+#endif
+
 	nlmsg_end(skb, nlh);
 	return 0;