Message ID | 20230319191913.61236-1-xiyou.wangcong@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | BPF |
Headers | show |
Series | [net-next,v3] sock_map: dump socket map id via diag | expand |
On Sun, Mar 19, 2023 at 12:19 PM Cong Wang <xiyou.wangcong@gmail.com> wrote: > > From: Cong Wang <cong.wang@bytedance.com> > > Currently there is no way to know which sockmap a socket has been added > to from outside, especially for that a socket can be added to multiple > sockmap's. We could dump this via socket diag, as shown below. > > Sample output: > > # ./iproute2/misc/ss -tnaie --bpf-map > ESTAB 0 344329 127.0.0.1:1234 127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1 > > # bpftool map > 1: sockmap flags 0x0 > key 4B value 4B max_entries 2 memlock 4096B > pids echo-sockmap(549) > 4: array name pid_iter.rodata flags 0x480 > key 4B value 4B max_entries 1 memlock 4096B > btf_id 10 frozen > pids bpftool(624) > > In the future, we could dump other sockmap related stats too, hence I > make it a nested attribute. > > Cc: John Fastabend <john.fastabend@gmail.com> > Cc: Jakub Sitnicki <jakub@cloudflare.com> > Signed-off-by: Cong Wang <cong.wang@bytedance.com> Looks fine from my POW, will let others comment. One thing I still don't understand here: what is missing from the socket iterators to implement this? Is it all the sk_psock_get magic? I remember you dismissed Yonghong's suggestion on v1, but have you actually tried it? Also: a test would be nice to have. I know you've tested it with the iproute2, but having something regularly exercised by the ci seems good to have (and not a ton of work). 
> --- > v3: remove redundant rcu read lock > use likely() for psock check > > v2: rename enum's with more generic names > sock_map_idiag_dump -> sock_map_diag_dump() > make sock_map_diag_dump() return number of maps > > include/linux/bpf.h | 1 + > include/uapi/linux/inet_diag.h | 1 + > include/uapi/linux/sock_diag.h | 8 ++++++ > include/uapi/linux/unix_diag.h | 1 + > net/core/sock_map.c | 46 ++++++++++++++++++++++++++++++++++ > net/ipv4/inet_diag.c | 5 ++++ > net/unix/diag.c | 6 +++++ > 7 files changed, 68 insertions(+) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 6792a7940e1e..4cc315ce26a9 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -2638,6 +2638,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, > void sock_map_unhash(struct sock *sk); > void sock_map_destroy(struct sock *sk); > void sock_map_close(struct sock *sk, long timeout); > +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attr); > #else > static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, > struct bpf_prog_aux *prog_aux) > diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h > index 50655de04c9b..d1f1e4522633 100644 > --- a/include/uapi/linux/inet_diag.h > +++ b/include/uapi/linux/inet_diag.h > @@ -161,6 +161,7 @@ enum { > INET_DIAG_SK_BPF_STORAGES, > INET_DIAG_CGROUP_ID, > INET_DIAG_SOCKOPT, > + INET_DIAG_BPF_MAP, > __INET_DIAG_MAX, > }; > > diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h > index 5f74a5f6091d..7c961940b408 100644 > --- a/include/uapi/linux/sock_diag.h > +++ b/include/uapi/linux/sock_diag.h > @@ -62,4 +62,12 @@ enum { > > #define SK_DIAG_BPF_STORAGE_MAX (__SK_DIAG_BPF_STORAGE_MAX - 1) > > +enum { > + SK_DIAG_BPF_MAP_NONE, > + SK_DIAG_BPF_MAP_IDS, > + __SK_DIAG_BPF_MAP_MAX, > +}; > + > +#define SK_DIAG_BPF_MAP_MAX (__SK_DIAG_BPF_MAP_MAX - 1) > + > #endif /* _UAPI__SOCK_DIAG_H__ */ > diff --git a/include/uapi/linux/unix_diag.h 
b/include/uapi/linux/unix_diag.h > index a1988576fa8a..b95a2b33521d 100644 > --- a/include/uapi/linux/unix_diag.h > +++ b/include/uapi/linux/unix_diag.h > @@ -42,6 +42,7 @@ enum { > UNIX_DIAG_MEMINFO, > UNIX_DIAG_SHUTDOWN, > UNIX_DIAG_UID, > + UNIX_DIAG_BPF_MAP, > > __UNIX_DIAG_MAX, > }; > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > index 9b854e236d23..c4049095f64e 100644 > --- a/net/core/sock_map.c > +++ b/net/core/sock_map.c > @@ -1656,6 +1656,52 @@ void sock_map_close(struct sock *sk, long timeout) > } > EXPORT_SYMBOL_GPL(sock_map_close); > > +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attrtype) > +{ > + struct sk_psock_link *link; > + struct nlattr *nla, *attr; > + int nr_links = 0, ret = 0; > + struct sk_psock *psock; > + u32 *ids; > + > + psock = sk_psock_get(sk); > + if (likely(!psock)) > + return 0; > + > + nla = nla_nest_start_noflag(skb, attrtype); > + if (!nla) { > + sk_psock_put(sk, psock); > + return -EMSGSIZE; > + } > + spin_lock_bh(&psock->link_lock); > + list_for_each_entry(link, &psock->link, list) > + nr_links++; > + > + attr = nla_reserve(skb, SK_DIAG_BPF_MAP_IDS, > + sizeof(link->map->id) * nr_links); > + if (!attr) { > + ret = -EMSGSIZE; > + goto unlock; > + } > + > + ids = nla_data(attr); > + list_for_each_entry(link, &psock->link, list) { > + *ids = link->map->id; > + ids++; > + } > +unlock: > + spin_unlock_bh(&psock->link_lock); > + sk_psock_put(sk, psock); > + if (ret) { > + nla_nest_cancel(skb, nla); > + } else { > + ret = nr_links; > + nla_nest_end(skb, nla); > + } > + return ret; > +} > +EXPORT_SYMBOL_GPL(sock_map_diag_dump); > + > static int sock_map_iter_attach_target(struct bpf_prog *prog, > union bpf_iter_link_info *linfo, > struct bpf_iter_aux_info *aux) > diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c > index b812eb36f0e3..0949909d5b46 100644 > --- a/net/ipv4/inet_diag.c > +++ b/net/ipv4/inet_diag.c > @@ -197,6 +197,11 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, 
> &inet_sockopt)) > goto errout; > > +#ifdef CONFIG_BPF_SYSCALL > + if (sock_map_diag_dump(sk, skb, INET_DIAG_BPF_MAP) < 0) > + goto errout; > +#endif > + > return 0; > errout: > return 1; > diff --git a/net/unix/diag.c b/net/unix/diag.c > index 616b55c5b890..54aa8da2831e 100644 > --- a/net/unix/diag.c > +++ b/net/unix/diag.c > @@ -6,6 +6,7 @@ > #include <linux/skbuff.h> > #include <linux/module.h> > #include <linux/uidgid.h> > +#include <linux/bpf.h> > #include <net/netlink.h> > #include <net/af_unix.h> > #include <net/tcp_states.h> > @@ -172,6 +173,11 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r > sk_diag_dump_uid(sk, skb, user_ns)) > goto out_nlmsg_trim; > > +#ifdef CONFIG_BPF_SYSCALL > + if (sock_map_diag_dump(sk, skb, UNIX_DIAG_BPF_MAP) < 0) > + goto out_nlmsg_trim; > +#endif > + > nlmsg_end(skb, nlh); > return 0; > > -- > 2.34.1 >
On 3/20/23 11:13 AM, Stanislav Fomichev wrote: > One thing I still don't understand here: what is missing from the > socket iterators to implement this? Is it all the sk_psock_get magic? > I remember you dismissed Yonghong's suggestion on v1, but have you > actually tried it? It would be useful to know what is missing to print the bpf map id without adding kernel code. There is a new bpf_rdonly_cast() which will be useful here also.
On Mon, Mar 20, 2023 at 11:13:03AM -0700, Stanislav Fomichev wrote: > On Sun, Mar 19, 2023 at 12:19 PM Cong Wang <xiyou.wangcong@gmail.com> wrote: > > > > From: Cong Wang <cong.wang@bytedance.com> > > > > Currently there is no way to know which sockmap a socket has been added > > to from outside, especially for that a socket can be added to multiple > > sockmap's. We could dump this via socket diag, as shown below. > > > > Sample output: > > > > # ./iproute2/misc/ss -tnaie --bpf-map > > ESTAB 0 344329 127.0.0.1:1234 127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1 > > > > # bpftool map > > 1: sockmap flags 0x0 > > key 4B value 4B max_entries 2 memlock 4096B > > pids echo-sockmap(549) > > 4: array name pid_iter.rodata flags 0x480 > > key 4B value 4B max_entries 1 memlock 4096B > > btf_id 10 frozen > > pids bpftool(624) > > > > In the future, we could dump other sockmap related stats too, hence I > > make it a nested attribute. > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > Looks fine from my POW, will let others comment. > > One thing I still don't understand here: what is missing from the > socket iterators to implement this? Is it all the sk_psock_get magic? > I remember you dismissed Yonghong's suggestion on v1, but have you > actually tried it? I am very confused. So in order to figure out which sockmap a socket has been added to, I have to dump *all* sockmap's??? It seems you are suggesting to solve this with a more complex and unnecessary approach? Please tell me why, I am really lost, I don't even see there is a point to make here. > > Also: a test would be nice to have. I know you've tested it with the > iproute2, but having something regularly exercised by the ci seems > good to have (and not a ton of work). Sure, so where are the tests for socket diag? 
I don't see any within the tree: $ git grep INET_DIAG_SOCKOPT -- tools/ $ Note, this is not suitable for bpf selftests, because it is less relevant to bpf, much more relevant to socket diag. I thought this was obvious. Thanks.
On Mon, Mar 20, 2023 at 01:29:39PM -0700, Martin KaFai Lau wrote: > On 3/20/23 11:13 AM, Stanislav Fomichev wrote: > > One thing I still don't understand here: what is missing from the > > socket iterators to implement this? Is it all the sk_psock_get magic? > > I remember you dismissed Yonghong's suggestion on v1, but have you > > actually tried it? > would be useful to know what is missing to print the bpf map id without > adding kernel code. There is new bpf_rdonly_cast() which will be useful here > also. So you don't consider eBPF code as kernel code, right? Interestingly enough, eBPF code runs in kernel... and you still need to write eBPF code. So what is the point of "without adding kernel code" here? What is even more interesting is that even your own code does not agree with you here, for example, you introduced INET_DIAG_SK_BPF_STORAGES, so what was missing to print sk bpf storage without adding kernel code? Thanks.
On 3/25/23 1:27 PM, Cong Wang wrote: > On Mon, Mar 20, 2023 at 01:29:39PM -0700, Martin KaFai Lau wrote: >> On 3/20/23 11:13 AM, Stanislav Fomichev wrote: >>> One thing I still don't understand here: what is missing from the >>> socket iterators to implement this? Is it all the sk_psock_get magic? >>> I remember you dismissed Yonghong's suggestion on v1, but have you >>> actually tried it? >> would be useful to know what is missing to print the bpf map id without >> adding kernel code. There is new bpf_rdonly_cast() which will be useful here >> also. > > So you don't consider eBPF code as kernel code, right? Interestingly > enough, eBPF code runs in kernel... and you still need to write eBPF > code. So what is the point of "without adding kernel code" here? Interesting, how is it the same? if it needs to print something other than the map id in the future, even putting aside further kernel maintenance and additional review, does a new bpf prog need to upgrade and reboot the kernel? Based on your idea, all possible sk_filter automatically qualify to be added to the kernel source tree now because they also run in the kernel? > > What is even more interesting is that even your own code does not agree > with you here, for example, you introduced INET_DIAG_SK_BPF_STORAGES, so > what was missing to print sk bpf storage without adding kernel code? Yep, you are absolutely correct. Only if bpf-iter was available earlier. If bpf-iter was available before INET_DIAG_SK_BPF_STORAGES was added, there was no need to add INET_DIAG_SK_BPF_STORAGES and it would be no brainer to explore the bpf-iter approach first. Things have been improving. The question (from a few people) was to figure out what was missing in the bpf-iter approach to print this bpf bits and trying to help. It is the only reason I replied. Apparently, you have not even tried and not interested.
On 03/25, Cong Wang wrote: > On Mon, Mar 20, 2023 at 11:13:03AM -0700, Stanislav Fomichev wrote: > > On Sun, Mar 19, 2023 at 12:19 PM Cong Wang <xiyou.wangcong@gmail.com> > wrote: > > > > > > From: Cong Wang <cong.wang@bytedance.com> > > > > > > Currently there is no way to know which sockmap a socket has been > added > > > to from outside, especially for that a socket can be added to multiple > > > sockmap's. We could dump this via socket diag, as shown below. > > > > > > Sample output: > > > > > > # ./iproute2/misc/ss -tnaie --bpf-map > > > ESTAB 0 344329 127.0.0.1:1234 127.0.0.1:40912 > ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> > sockmap: 1 > > > > > > # bpftool map > > > 1: sockmap flags 0x0 > > > key 4B value 4B max_entries 2 memlock 4096B > > > pids echo-sockmap(549) > > > 4: array name pid_iter.rodata flags 0x480 > > > key 4B value 4B max_entries 1 memlock 4096B > > > btf_id 10 frozen > > > pids bpftool(624) > > > > > > In the future, we could dump other sockmap related stats too, hence I > > > make it a nested attribute. > > > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > > > Looks fine from my POW, will let others comment. > > > > One thing I still don't understand here: what is missing from the > > socket iterators to implement this? Is it all the sk_psock_get magic? > > I remember you dismissed Yonghong's suggestion on v1, but have you > > actually tried it? > I am very confused. So in order to figure out which sockmap a socket has > been added to, I have to dump *all* sockmap's??? It seems you are > suggesting to solve this with a more complex and unnecessary approach? > Please tell me why, I am really lost, I don't even see there is a point > to make here. With a socket iter, you can iterate over all sockets and run some bpf program on it to dump some state. So you'd iterate over the sockets, not sockmaps. 
For each socket you get a pointer to sock and you do the same sk_psock_get+list_for_each_entry(psock->link). (in theory; would be interesting to see whether it works in practice) > > > > Also: a test would be nice to have. I know you've tested it with the > > iproute2, but having something regularly exercised by the ci seems > > good to have (and not a ton of work). > Sure, so where are the tests for socket diag? I don't see any within the > tree: > $ git grep INET_DIAG_SOCKOPT -- tools/ > $ > Note, this is not suitable for bpf selftests, because it is less relevant > to bpf, much more relevant to socket diag. I thought this is obvious. Never too late to start testing those sock diag paths :-) Put them in the net selftests if you don't think bpf selftests is the right fit? > Thanks.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6792a7940e1e..4cc315ce26a9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2638,6 +2638,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attr); #else static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 50655de04c9b..d1f1e4522633 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -161,6 +161,7 @@ enum { INET_DIAG_SK_BPF_STORAGES, INET_DIAG_CGROUP_ID, INET_DIAG_SOCKOPT, + INET_DIAG_BPF_MAP, __INET_DIAG_MAX, }; diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index 5f74a5f6091d..7c961940b408 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -62,4 +62,12 @@ enum { #define SK_DIAG_BPF_STORAGE_MAX (__SK_DIAG_BPF_STORAGE_MAX - 1) +enum { + SK_DIAG_BPF_MAP_NONE, + SK_DIAG_BPF_MAP_IDS, + __SK_DIAG_BPF_MAP_MAX, +}; + +#define SK_DIAG_BPF_MAP_MAX (__SK_DIAG_BPF_MAP_MAX - 1) + #endif /* _UAPI__SOCK_DIAG_H__ */ diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h index a1988576fa8a..b95a2b33521d 100644 --- a/include/uapi/linux/unix_diag.h +++ b/include/uapi/linux/unix_diag.h @@ -42,6 +42,7 @@ enum { UNIX_DIAG_MEMINFO, UNIX_DIAG_SHUTDOWN, UNIX_DIAG_UID, + UNIX_DIAG_BPF_MAP, __UNIX_DIAG_MAX, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9b854e236d23..c4049095f64e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1656,6 +1656,52 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attrtype) +{ + struct sk_psock_link 
*link; + struct nlattr *nla, *attr; + int nr_links = 0, ret = 0; + struct sk_psock *psock; + u32 *ids; + + psock = sk_psock_get(sk); + if (likely(!psock)) + return 0; + + nla = nla_nest_start_noflag(skb, attrtype); + if (!nla) { + sk_psock_put(sk, psock); + return -EMSGSIZE; + } + spin_lock_bh(&psock->link_lock); + list_for_each_entry(link, &psock->link, list) + nr_links++; + + attr = nla_reserve(skb, SK_DIAG_BPF_MAP_IDS, + sizeof(link->map->id) * nr_links); + if (!attr) { + ret = -EMSGSIZE; + goto unlock; + } + + ids = nla_data(attr); + list_for_each_entry(link, &psock->link, list) { + *ids = link->map->id; + ids++; + } +unlock: + spin_unlock_bh(&psock->link_lock); + sk_psock_put(sk, psock); + if (ret) { + nla_nest_cancel(skb, nla); + } else { + ret = nr_links; + nla_nest_end(skb, nla); + } + return ret; +} +EXPORT_SYMBOL_GPL(sock_map_diag_dump); + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index b812eb36f0e3..0949909d5b46 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -197,6 +197,11 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, &inet_sockopt)) goto errout; +#ifdef CONFIG_BPF_SYSCALL + if (sock_map_diag_dump(sk, skb, INET_DIAG_BPF_MAP) < 0) + goto errout; +#endif + return 0; errout: return 1; diff --git a/net/unix/diag.c b/net/unix/diag.c index 616b55c5b890..54aa8da2831e 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -6,6 +6,7 @@ #include <linux/skbuff.h> #include <linux/module.h> #include <linux/uidgid.h> +#include <linux/bpf.h> #include <net/netlink.h> #include <net/af_unix.h> #include <net/tcp_states.h> @@ -172,6 +173,11 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r sk_diag_dump_uid(sk, skb, user_ns)) goto out_nlmsg_trim; +#ifdef CONFIG_BPF_SYSCALL + if (sock_map_diag_dump(sk, skb, UNIX_DIAG_BPF_MAP) < 0) + goto out_nlmsg_trim; +#endif + 
nlmsg_end(skb, nlh); return 0;