diff mbox series

[net-next] sock_map: dump socket map id via diag

Message ID 20230211201954.256230-1-xiyou.wangcong@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next] sock_map: dump socket map id via diag | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1423 this patch: 1423
netdev/cc_maintainers warning 18 maintainers not CCed: dsahern@kernel.org pabeni@redhat.com daniel@iogearbox.net sdf@google.com dan.j.williams@intel.com andrii@kernel.org kuba@kernel.org jolsa@kernel.org edumazet@google.com gustavoars@kernel.org song@kernel.org martin.lau@linux.dev haoluo@google.com yhs@fb.com kpsingh@kernel.org ast@kernel.org kuniyu@amazon.com davem@davemloft.net
netdev/build_clang success Errors and warnings before: 152 this patch: 152
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1418 this patch: 1418
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 117 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Cong Wang Feb. 11, 2023, 8:19 p.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

Currently there is no way to know which sockmap a socket has been added
to from outside, especially for that a socket can be added to multiple
sockmap's. We could dump this via socket diag, as shown below.

Sample output:

  # ./iproute2/misc/ss -tnaie --sockmap
  ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1

  # bpftool map
  1: sockmap  flags 0x0
  	key 4B  value 4B  max_entries 2  memlock 4096B
	pids echo-sockmap(549)
  4: array  name pid_iter.rodata  flags 0x480
	key 4B  value 4B  max_entries 1  memlock 4096B
	btf_id 10  frozen
	pids bpftool(624)

In the future, we could dump other sockmap related stats too, hence I
make it a nested attribute.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/inet_diag.h |  1 +
 include/uapi/linux/sock_diag.h |  8 ++++++
 include/uapi/linux/unix_diag.h |  1 +
 net/core/sock_map.c            | 49 ++++++++++++++++++++++++++++++++++
 net/ipv4/inet_diag.c           |  5 ++++
 net/unix/diag.c                |  6 +++++
 7 files changed, 71 insertions(+)

Comments

Ido Schimmel Feb. 12, 2023, 11:35 a.m. UTC | #1
On Sat, Feb 11, 2023 at 12:19:54PM -0800, Cong Wang wrote:
> +int sock_map_idiag_dump(struct sock *sk, struct sk_buff *skb, int attrtype)
> +{
> +	struct sk_psock_link *link;
> +	struct nlattr *nla, *attr;
> +	int nr_links = 0, ret = 0;
> +	struct sk_psock *psock;
> +	u32 *ids;
> +
> +	rcu_read_lock();
> +	psock = sk_psock_get(sk);
> +	if (unlikely(!psock)) {
> +		rcu_read_unlock();
> +		return 0;
> +	}
> +
> +	nla = nla_nest_start_noflag(skb, attrtype);

Since 'INET_DIAG_SOCKMAP' is a new attribute, did you consider using
nla_nest_start() instead?

> +	if (!nla) {
> +		sk_psock_put(sk, psock);
> +		rcu_read_unlock();
> +		return -EMSGSIZE;
> +	}
> +	spin_lock_bh(&psock->link_lock);
> +	list_for_each_entry(link, &psock->link, list)
> +		nr_links++;
> +
> +	attr = nla_reserve(skb, SK_DIAG_BPF_SOCKMAP_MAP_ID,
> +			   sizeof(link->map->id) * nr_links);
> +	if (!attr) {
> +		ret = -EMSGSIZE;
> +		goto unlock;
> +	}
> +
> +	ids = nla_data(attr);
> +	list_for_each_entry(link, &psock->link, list) {
> +		*ids = link->map->id;
> +		ids++;
> +	}

No strong preferences, but I think a more "modern" netlink usage would
be to encode each ID in a separate u32 attribute rather than encoding an
array of u32 in a single attribute. Example:

[ INET_DIAG_SOCKMAP ]	// nested
	[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
	[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
	...

Or:

[ INET_DIAG_SOCKMAP ]	// nested
	[ SK_DIAG_BPF_SOCKMAP_MAP_IDS ] // nested
		[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
		[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
		...

> +unlock:
> +	spin_unlock_bh(&psock->link_lock);
> +	sk_psock_put(sk, psock);
> +	rcu_read_unlock();
> +	if (ret)
> +		nla_nest_cancel(skb, nla);
> +	else
> +		nla_nest_end(skb, nla);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(sock_map_idiag_dump);
Yonghong Song Feb. 13, 2023, 6:29 a.m. UTC | #2
On 2/11/23 12:19 PM, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
> 
> Currently there is no way to know which sockmap a socket has been added
> to from outside, especially for that a socket can be added to multiple
> sockmap's. We could dump this via socket diag, as shown below.
> 
> Sample output:
> 
>    # ./iproute2/misc/ss -tnaie --sockmap
>    ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1
> 
>    # bpftool map
>    1: sockmap  flags 0x0
>    	key 4B  value 4B  max_entries 2  memlock 4096B
> 	pids echo-sockmap(549)
>    4: array  name pid_iter.rodata  flags 0x480
> 	key 4B  value 4B  max_entries 1  memlock 4096B
> 	btf_id 10  frozen
> 	pids bpftool(624)
> 
> In the future, we could dump other sockmap related stats too, hence I
> make it a nested attribute.

Have you considered to implement a sockmap iterator? This will be
similar to existing hash/array/sk_local_storage map iterators. The
link below is the kernel implementation for sk_local_storage map
iterator which iterates through all sockets.
    https://lore.kernel.org/bpf/20200723184116.590602-1-yhs@fb.com

This way, in the future, if you want to print out more information
from the socket, no kernel change is needed and you can just adjust
your bpf program.

> 
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
>   include/linux/bpf.h            |  1 +
>   include/uapi/linux/inet_diag.h |  1 +
>   include/uapi/linux/sock_diag.h |  8 ++++++
>   include/uapi/linux/unix_diag.h |  1 +
>   net/core/sock_map.c            | 49 ++++++++++++++++++++++++++++++++++
>   net/ipv4/inet_diag.c           |  5 ++++
>   net/unix/diag.c                |  6 +++++
>   7 files changed, 71 insertions(+)
> 
[...]
Cong Wang Feb. 19, 2023, 8:21 p.m. UTC | #3
On Sun, Feb 12, 2023 at 01:35:18PM +0200, Ido Schimmel wrote:
> On Sat, Feb 11, 2023 at 12:19:54PM -0800, Cong Wang wrote:
> > +int sock_map_idiag_dump(struct sock *sk, struct sk_buff *skb, int attrtype)
> > +{
> > +	struct sk_psock_link *link;
> > +	struct nlattr *nla, *attr;
> > +	int nr_links = 0, ret = 0;
> > +	struct sk_psock *psock;
> > +	u32 *ids;
> > +
> > +	rcu_read_lock();
> > +	psock = sk_psock_get(sk);
> > +	if (unlikely(!psock)) {
> > +		rcu_read_unlock();
> > +		return 0;
> > +	}
> > +
> > +	nla = nla_nest_start_noflag(skb, attrtype);
> 
> Since 'INET_DIAG_SOCKMAP' is a new attribute, did you consider using
> nla_nest_start() instead?

Yes, but other INET_DIAG_* attributes are not new, hence why ss.c still
uses parse_rtattr(), I am not sure whether it is okay to change it to
parse_rtattr_nested() just because of this.

> 
> > +	if (!nla) {
> > +		sk_psock_put(sk, psock);
> > +		rcu_read_unlock();
> > +		return -EMSGSIZE;
> > +	}
> > +	spin_lock_bh(&psock->link_lock);
> > +	list_for_each_entry(link, &psock->link, list)
> > +		nr_links++;
> > +
> > +	attr = nla_reserve(skb, SK_DIAG_BPF_SOCKMAP_MAP_ID,
> > +			   sizeof(link->map->id) * nr_links);
> > +	if (!attr) {
> > +		ret = -EMSGSIZE;
> > +		goto unlock;
> > +	}
> > +
> > +	ids = nla_data(attr);
> > +	list_for_each_entry(link, &psock->link, list) {
> > +		*ids = link->map->id;
> > +		ids++;
> > +	}
> 
> No strong preferences, but I think a more "modern" netlink usage would
> be to encode each ID in a separate u32 attribute rather than encoding an
> array of u32 in a single attribute. Example:
> 
> [ INET_DIAG_SOCKMAP ]	// nested
> 	[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
> 	[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
> 	...

How do we know how many ID's we have here? Note, INET_DIAG_SOCKMAP in
the future could have other attributes, so can't simply mark the end of
ID's.

> 
> Or:
> 
> [ INET_DIAG_SOCKMAP ]	// nested
> 	[ SK_DIAG_BPF_SOCKMAP_MAP_IDS ] // nested
> 		[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
> 		[ SK_DIAG_BPF_SOCKMAP_MAP_ID ] // u32
> 		...

Yet one more nested level...

I prefer the current layout.

Thanks.
Cong Wang Feb. 19, 2023, 8:22 p.m. UTC | #4
On Sun, Feb 12, 2023 at 10:29:11PM -0800, Yonghong Song wrote:
> 
> 
> On 2/11/23 12:19 PM, Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> > 
> > Currently there is no way to know which sockmap a socket has been added
> > to from outside, especially for that a socket can be added to multiple
> > sockmap's. We could dump this via socket diag, as shown below.
> > 
> > Sample output:
> > 
> >    # ./iproute2/misc/ss -tnaie --sockmap
> >    ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1
> > 
> >    # bpftool map
> >    1: sockmap  flags 0x0
> >    	key 4B  value 4B  max_entries 2  memlock 4096B
> > 	pids echo-sockmap(549)
> >    4: array  name pid_iter.rodata  flags 0x480
> > 	key 4B  value 4B  max_entries 1  memlock 4096B
> > 	btf_id 10  frozen
> > 	pids bpftool(624)
> > 
> > In the future, we could dump other sockmap related stats too, hence I
> > make it a nested attribute.
> 
> Have you considered to implement a sockmap iterator? This will be

I think you understand it wrong, I don't want to dump the map, please
double check the above requirement.

Thanks.
Jakub Sitnicki Feb. 21, 2023, 9:11 a.m. UTC | #5
On Sat, Feb 11, 2023 at 12:19 PM -08, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> Currently there is no way to know which sockmap a socket has been added
> to from outside, especially for that a socket can be added to multiple
> sockmap's. We could dump this via socket diag, as shown below.
>
> Sample output:
>
>   # ./iproute2/misc/ss -tnaie --sockmap
>   ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1
>
>   # bpftool map
>   1: sockmap  flags 0x0
>   	key 4B  value 4B  max_entries 2  memlock 4096B
> 	pids echo-sockmap(549)
>   4: array  name pid_iter.rodata  flags 0x480
> 	key 4B  value 4B  max_entries 1  memlock 4096B
> 	btf_id 10  frozen
> 	pids bpftool(624)
>
> In the future, we could dump other sockmap related stats too, hence I
> make it a nested attribute.
>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---

Sorry for not replying sooner. This sounds useful. Another use case I
can see here is inspecting process' sockets:

1. get a dup FD with pidfd_getfd()
2. query sock_diag by socket cookie
3. find out which maps socket is in.


I don't know if it makes sense to tie the naming to sockmap. We also
have also map type that can hold socket references -
REUSEPORT_SOCKARRAY.

We might want to add sock_diag support for REUSEPORT_SOCKARRAY in the
future as well. So a map-type-agnostic name for the new inet_diag ext
might be more future proof. Like INET_DIAG_BPF_MAP.


Also, can you please add a simple selftest? They often serve as the only
documentation for the features. Perhaps in
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c.

Thanks,
Jakub
Cong Wang March 12, 2023, 7:44 p.m. UTC | #6
On Tue, Feb 21, 2023 at 10:11:12AM +0100, Jakub Sitnicki wrote:
> On Sat, Feb 11, 2023 at 12:19 PM -08, Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > Currently there is no way to know which sockmap a socket has been added
> > to from outside, especially for that a socket can be added to multiple
> > sockmap's. We could dump this via socket diag, as shown below.
> >
> > Sample output:
> >
> >   # ./iproute2/misc/ss -tnaie --sockmap
> >   ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1
> >
> >   # bpftool map
> >   1: sockmap  flags 0x0
> >   	key 4B  value 4B  max_entries 2  memlock 4096B
> > 	pids echo-sockmap(549)
> >   4: array  name pid_iter.rodata  flags 0x480
> > 	key 4B  value 4B  max_entries 1  memlock 4096B
> > 	btf_id 10  frozen
> > 	pids bpftool(624)
> >
> > In the future, we could dump other sockmap related stats too, hence I
> > make it a nested attribute.
> >
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
> 
> Sorry for not replying sooner. This sounds useful. Another use case I
> can see here is inspecting process' sockets:
> 
> 1. get a dup FD with pidfd_getfd()
> 2. query sock_diag by socket cookie
> 3. find out which maps socket is in.
> 
> 
> I don't know if it makes sense to tie the naming to sockmap. We also
> have also map type that can hold socket references -
> REUSEPORT_SOCKARRAY.
> 
> We might want to add sock_diag support for REUSEPORT_SOCKARRAY in the
> future as well. So a map-type-agnostic name for the new inet_diag ext
> might be more future proof. Like INET_DIAG_BPF_MAP.

Sounds reasonable. I didn't realize REUSEPORT_SOCKARRAY also needs this.

> 
> 
> Also, can you please add a simple selftest? They often serve as the only
> documentation for the features. Perhaps in
> tools/testing/selftests/bpf/prog_tests/sockmap_basic.c.
> 

Not sure if this makes sense, because this is mostly for socket diag, it
does not seem fitting well in bpf selftests.

In case you wonder how I tested it, I have an iproute2 patch which is
not posted here yet.

Thanks.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 35c18a98c21a..f23b97e0fe71 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2539,6 +2539,7 @@  int sock_map_bpf_prog_query(const union bpf_attr *attr,
 void sock_map_unhash(struct sock *sk);
 void sock_map_destroy(struct sock *sk);
 void sock_map_close(struct sock *sk, long timeout);
+int sock_map_idiag_dump(struct sock *sk, struct sk_buff *skb, int attr);
 #else
 static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
 					    struct bpf_prog_aux *prog_aux)
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..6c6952c2eb70 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -161,6 +161,7 @@  enum {
 	INET_DIAG_SK_BPF_STORAGES,
 	INET_DIAG_CGROUP_ID,
 	INET_DIAG_SOCKOPT,
+	INET_DIAG_SOCKMAP,
 	__INET_DIAG_MAX,
 };
 
diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index 5f74a5f6091d..5deb22057d14 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -62,4 +62,12 @@  enum {
 
 #define SK_DIAG_BPF_STORAGE_MAX        (__SK_DIAG_BPF_STORAGE_MAX - 1)
 
+enum {
+	SK_DIAG_BPF_SOCKMAP_NONE,
+	SK_DIAG_BPF_SOCKMAP_MAP_ID,
+	__SK_DIAG_BPF_SOCKMAP_MAX,
+};
+
+#define SK_DIAG_BPF_SOCKMAP_MAX        (__SK_DIAG_BPF_SOCKMAP_MAX - 1)
+
 #endif /* _UAPI__SOCK_DIAG_H__ */
diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h
index a1988576fa8a..a21a25030b13 100644
--- a/include/uapi/linux/unix_diag.h
+++ b/include/uapi/linux/unix_diag.h
@@ -42,6 +42,7 @@  enum {
 	UNIX_DIAG_MEMINFO,
 	UNIX_DIAG_SHUTDOWN,
 	UNIX_DIAG_UID,
+	UNIX_DIAG_SOCKMAP,
 
 	__UNIX_DIAG_MAX,
 };
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index a68a7290a3b2..fa3068857a8a 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1636,6 +1636,55 @@  void sock_map_close(struct sock *sk, long timeout)
 }
 EXPORT_SYMBOL_GPL(sock_map_close);
 
+int sock_map_idiag_dump(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+	struct sk_psock_link *link;
+	struct nlattr *nla, *attr;
+	int nr_links = 0, ret = 0;
+	struct sk_psock *psock;
+	u32 *ids;
+
+	rcu_read_lock();
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	nla = nla_nest_start_noflag(skb, attrtype);
+	if (!nla) {
+		sk_psock_put(sk, psock);
+		rcu_read_unlock();
+		return -EMSGSIZE;
+	}
+	spin_lock_bh(&psock->link_lock);
+	list_for_each_entry(link, &psock->link, list)
+		nr_links++;
+
+	attr = nla_reserve(skb, SK_DIAG_BPF_SOCKMAP_MAP_ID,
+			   sizeof(link->map->id) * nr_links);
+	if (!attr) {
+		ret = -EMSGSIZE;
+		goto unlock;
+	}
+
+	ids = nla_data(attr);
+	list_for_each_entry(link, &psock->link, list) {
+		*ids = link->map->id;
+		ids++;
+	}
+unlock:
+	spin_unlock_bh(&psock->link_lock);
+	sk_psock_put(sk, psock);
+	rcu_read_unlock();
+	if (ret)
+		nla_nest_cancel(skb, nla);
+	else
+		nla_nest_end(skb, nla);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sock_map_idiag_dump);
+
 static int sock_map_iter_attach_target(struct bpf_prog *prog,
 				       union bpf_iter_link_info *linfo,
 				       struct bpf_iter_aux_info *aux)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index b812eb36f0e3..8fe43b6324c2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -197,6 +197,11 @@  int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 		    &inet_sockopt))
 		goto errout;
 
+#ifdef CONFIG_BPF_SYSCALL
+	if (sock_map_idiag_dump(sk, skb, INET_DIAG_SOCKMAP))
+		goto errout;
+#endif
+
 	return 0;
 errout:
 	return 1;
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 616b55c5b890..0c5d39889045 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -6,6 +6,7 @@ 
 #include <linux/skbuff.h>
 #include <linux/module.h>
 #include <linux/uidgid.h>
+#include <linux/bpf.h>
 #include <net/netlink.h>
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
@@ -172,6 +173,11 @@  static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	    sk_diag_dump_uid(sk, skb, user_ns))
 		goto out_nlmsg_trim;
 
+#ifdef CONFIG_BPF_SYSCALL
+	if (sock_map_idiag_dump(sk, skb, UNIX_DIAG_SOCKMAP))
+		goto out_nlmsg_trim;
+#endif
+
 	nlmsg_end(skb, nlh);
 	return 0;