diff mbox series

[net-next,v2] tcp: Dump bound-only sockets in inet_diag.

Message ID bfb52b5103de808cda022e2d16bac6cf3ef747d6.1700780828.git.gnault@redhat.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v2] tcp: Dump bound-only sockets in inet_diag. | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/codegen success Generated files up to date
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1115 this patch: 1115
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 1142 this patch: 1142
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1142 this patch: 1142
netdev/checkpatch warning CHECK: multiple assignments should be avoided WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Guillaume Nault Nov. 23, 2023, 11:11 p.m. UTC
Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
that are bound but haven't yet called connect() or listen().

This allows ss to dump bound-only TCP sockets, together with listening
sockets (as there's no specific state for bound-only sockets). This is
similar to the UDP behaviour for which bound-only sockets are already
dumped by ss -lu.

The code is inspired by the ->lhash2 loop. However there's no manual
test of the source port, since this kind of filtering is already
handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
at a time, to avoid running with bh disabled for too long.

No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
socket, bound respectively to 40000, 64000, 60000, the result is:

  $ ss -lt
  State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
  UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
  UNCONN 0      0               [::]:60000         [::]:*
  UNCONN 0      0                  *:64000            *:*

Signed-off-by: Guillaume Nault <gnault@redhat.com>
---

v2:
  * Use ->bhash2 instead of ->bhash (Kuniyuki Iwashima).
  * Process no more than 16 sockets at a time (Kuniyuki Iwashima).

 net/ipv4/inet_diag.c | 88 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

Comments

Kuniyuki Iwashima Nov. 25, 2023, 1:39 a.m. UTC | #1
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 24 Nov 2023 00:11:42 +0100
> Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> that are bound but haven't yet called connect() or listen().
> 
> This allows ss to dump bound-only TCP sockets, together with listening
> sockets (as there's no specific state for bound-only sockets). This is
> similar to the UDP behaviour for which bound-only sockets are already
> dumped by ss -lu.
> 
> The code is inspired by the ->lhash2 loop. However there's no manual
> test of the source port, since this kind of filtering is already
> handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> at a time, to avoid running with bh disabled for too long.
> 
> No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> socket, bound respectively to 40000, 64000, 60000, the result is:
> 
>   $ ss -lt
>   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
>   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
>   UNCONN 0      0               [::]:60000         [::]:*
>   UNCONN 0      0                  *:64000            *:*
> 
> Signed-off-by: Guillaume Nault <gnault@redhat.com>
> ---
> 
> v2:
>   * Use ->bhash2 instead of ->bhash (Kuniyuki Iwashima).
>   * Process no more than 16 sockets at a time (Kuniyuki Iwashima).
> 
>  net/ipv4/inet_diag.c | 88 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index 7d0e7aaa71e0..d7fb6a625cb7 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -1077,10 +1077,96 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
>  		s_i = num = s_num = 0;
>  	}
>  
> +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
> + * with bh disabled.
> + */
> +#define SKARR_SZ 16
> +
> +	/* Dump bound-only sockets */
> +	if (cb->args[0] == 1) {
> +		if (!(idiag_states & TCPF_CLOSE))
> +			goto skip_bind_ht;
> +
> +		for (i = s_i; i < hashinfo->bhash_size; i++) {
> +			struct inet_bind_hashbucket *ibb;
> +			struct inet_bind2_bucket *tb2;
> +			struct sock *sk_arr[SKARR_SZ];
> +			int num_arr[SKARR_SZ];
> +			int idx, accum, res;
> +
> +resume_bind_walk:
> +			num = 0;
> +			accum = 0;
> +			ibb = &hashinfo->bhash2[i];
> +
> +			spin_lock_bh(&ibb->lock);
> +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> +				if (!net_eq(ib2_net(tb2), net))
> +					continue;
> +
> +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> +					struct inet_sock *inet = inet_sk(sk);
> +
> +					if (num < s_num)
> +						goto next_bind;
> +
> +					if (sk->sk_state != TCP_CLOSE ||
> +					    !inet->inet_num)
> +						goto next_bind;
> +
> +					if (r->sdiag_family != AF_UNSPEC &&
> +					    r->sdiag_family != sk->sk_family)
> +						goto next_bind;
> +
> +					if (!inet_diag_bc_sk(bc, sk))
> +						goto next_bind;
> +
> +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> +						goto next_bind;

I guess this is copied from the ehash code below, but could
refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?


> +
> +					num_arr[accum] = num;
> +					sk_arr[accum] = sk;
> +					if (++accum == SKARR_SZ)
> +						goto pause_bind_walk;
> +next_bind:
> +					num++;
> +				}
> +			}
> +pause_bind_walk:
> +			spin_unlock_bh(&ibb->lock);
> +
> +			res = 0;
> +			for (idx = 0; idx < accum; idx++) {
> +				if (res >= 0) {
> +					res = inet_sk_diag_fill(sk_arr[idx],
> +								NULL, skb, cb,
> +								r, NLM_F_MULTI,
> +								net_admin);
> +					if (res < 0)
> +						num = num_arr[idx];
> +				}
> +				sock_gen_put(sk_arr[idx]);
> +			}
> +			if (res < 0)
> +				goto done;
> +
> +			cond_resched();
> +
> +			if (accum == SKARR_SZ) {
> +				s_num = num + 1;
> +				goto resume_bind_walk;
> +			}
> +
> +			s_num = 0;
> +		}
> +skip_bind_ht:
> +		cb->args[0] = 2;
> +		s_i = num = s_num = 0;
> +	}
> +
>  	if (!(idiag_states & ~TCPF_LISTEN))
>  		goto out;
>  
> -#define SKARR_SZ 16
>  	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
>  		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
>  		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
> -- 
> 2.39.2
Guillaume Nault Nov. 27, 2023, 5:26 p.m. UTC | #2
On Fri, Nov 24, 2023 at 05:39:42PM -0800, Kuniyuki Iwashima wrote:
> > +			spin_lock_bh(&ibb->lock);
> > +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> > +				if (!net_eq(ib2_net(tb2), net))
> > +					continue;
> > +
> > +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> > +					struct inet_sock *inet = inet_sk(sk);
> > +
> > +					if (num < s_num)
> > +						goto next_bind;
> > +
> > +					if (sk->sk_state != TCP_CLOSE ||
> > +					    !inet->inet_num)
> > +						goto next_bind;
> > +
> > +					if (r->sdiag_family != AF_UNSPEC &&
> > +					    r->sdiag_family != sk->sk_family)
> > +						goto next_bind;
> > +
> > +					if (!inet_diag_bc_sk(bc, sk))
> > +						goto next_bind;
> > +
> > +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > +						goto next_bind;
> 
> I guess this is copied from the ehash code below, but could
> refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?

My understanding is that it can't fail, but I preferred to keep the test
to be on the safe side.

I can post a v3 using a plain sock_hold(), if you prefer.

> > +
> > +					num_arr[accum] = num;
> > +					sk_arr[accum] = sk;
> > +					if (++accum == SKARR_SZ)
> > +						goto pause_bind_walk;
> > +next_bind:
> > +					num++;
> > +				}
> > +			}
Kuniyuki Iwashima Nov. 27, 2023, 5:56 p.m. UTC | #3
From: Guillaume Nault <gnault@redhat.com>
Date: Mon, 27 Nov 2023 18:26:05 +0100
> On Fri, Nov 24, 2023 at 05:39:42PM -0800, Kuniyuki Iwashima wrote:
> > > +			spin_lock_bh(&ibb->lock);
> > > +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> > > +				if (!net_eq(ib2_net(tb2), net))
> > > +					continue;
> > > +
> > > +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> > > +					struct inet_sock *inet = inet_sk(sk);
> > > +
> > > +					if (num < s_num)
> > > +						goto next_bind;
> > > +
> > > +					if (sk->sk_state != TCP_CLOSE ||
> > > +					    !inet->inet_num)
> > > +						goto next_bind;
> > > +
> > > +					if (r->sdiag_family != AF_UNSPEC &&
> > > +					    r->sdiag_family != sk->sk_family)
> > > +						goto next_bind;
> > > +
> > > +					if (!inet_diag_bc_sk(bc, sk))
> > > +						goto next_bind;
> > > +
> > > +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > > +						goto next_bind;
> > 
> > I guess this is copied from the ehash code below, but could
> > refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?
> 
> My understanding is that it can't fail, but I prefered to keep the test
> to be on the safe side.
> 
> I can post a v3 using a plain sock_hold(), if you prefer.

I prefer sock_hold() because refcount_inc_not_zero() implies that it could
fail and is confusing if it never fails.
Guillaume Nault Nov. 28, 2023, 1:13 a.m. UTC | #4
On Mon, Nov 27, 2023 at 09:56:43AM -0800, Kuniyuki Iwashima wrote:
> From: Guillaume Nault <gnault@redhat.com>
> Date: Mon, 27 Nov 2023 18:26:05 +0100
> > On Fri, Nov 24, 2023 at 05:39:42PM -0800, Kuniyuki Iwashima wrote:
> > > > +			spin_lock_bh(&ibb->lock);
> > > > +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> > > > +				if (!net_eq(ib2_net(tb2), net))
> > > > +					continue;
> > > > +
> > > > +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> > > > +					struct inet_sock *inet = inet_sk(sk);
> > > > +
> > > > +					if (num < s_num)
> > > > +						goto next_bind;
> > > > +
> > > > +					if (sk->sk_state != TCP_CLOSE ||
> > > > +					    !inet->inet_num)
> > > > +						goto next_bind;
> > > > +
> > > > +					if (r->sdiag_family != AF_UNSPEC &&
> > > > +					    r->sdiag_family != sk->sk_family)
> > > > +						goto next_bind;
> > > > +
> > > > +					if (!inet_diag_bc_sk(bc, sk))
> > > > +						goto next_bind;
> > > > +
> > > > +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > > > +						goto next_bind;
> > > 
> > > I guess this is copied from the ehash code below, but could
> > > refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?
> > 
> > My understanding is that it can't fail, but I prefered to keep the test
> > to be on the safe side.
> > 
> > I can post a v3 using a plain sock_hold(), if you prefer.
> 
> I prefer sock_hold() because refcount_inc_not_zero() implies that it could
> fail and is confusing if it never fails.

Ok, I'll do that for v3.
Eric Dumazet Nov. 28, 2023, 10:14 a.m. UTC | #5
On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
>
> Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> that are bound but haven't yet called connect() or listen().
>
> This allows ss to dump bound-only TCP sockets, together with listening
> sockets (as there's no specific state for bound-only sockets). This is
> similar to the UDP behaviour for which bound-only sockets are already
> dumped by ss -lu.
>
> The code is inspired by the ->lhash2 loop. However there's no manual
> test of the source port, since this kind of filtering is already
> handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> at a time, to avoid running with bh disabled for too long.
>
> No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> socket, bound respectively to 40000, 64000, 60000, the result is:
>
>   $ ss -lt
>   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
>   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
>   UNCONN 0      0               [::]:60000         [::]:*
>   UNCONN 0      0                  *:64000            *:*


Hmm...   "ss -l" is supposed to only list listening sockets.

So this change might confuse some users ?
Guillaume Nault Nov. 28, 2023, 10:18 p.m. UTC | #6
On Tue, Nov 28, 2023 at 11:14:28AM +0100, Eric Dumazet wrote:
> On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
> >
> > Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> > that are bound but haven't yet called connect() or listen().
> >
> > This allows ss to dump bound-only TCP sockets, together with listening
> > sockets (as there's no specific state for bound-only sockets). This is
> > similar to the UDP behaviour for which bound-only sockets are already
> > dumped by ss -lu.
> >
> > The code is inspired by the ->lhash2 loop. However there's no manual
> > test of the source port, since this kind of filtering is already
> > handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> > at a time, to avoid running with bh disabled for too long.
> >
> > No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> > socket, bound respectively to 40000, 64000, 60000, the result is:
> >
> >   $ ss -lt
> >   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
> >   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
> >   UNCONN 0      0               [::]:60000         [::]:*
> >   UNCONN 0      0                  *:64000            *:*
> 
> 
> Hmm...   "ss -l" is supposed to only list listening sockets.
> 
> So this change might confuse some users ?
> 

On the other hand I can't find a more sensible solution. The problem is
that "ss -l" sets both the TCPF_LISTEN and the TCPF_CLOSE flags. And
since we don't have a way to express "bound but not yet listening"
sockets, these sockets fall into the CLOSE category. So we're really
just returning what ss asked for.

If we can't rely on TCPF_CLOSE, then I don't see what kind of filter we
could use to request a dump of these TCP sockets. Using "-a" doesn't
help as it just sets all the TCPF_* flags (apart from
TCPF_NEW_SYN_RECV). Adding a new option wouldn't help either as we
couldn't map it to any of the TCPF_* flags. In any case, we still need
to rely on TCPF_CLOSE.

So maybe we can just improve the ss man page for "-l" and explain that
it also lists closed sockets, which includes the bound-only ones
(this is already true for non-TCP sockets anyway). We could also tell
the user to run "ss state listening" for getting listening sockets
exclusively (or we could implement a new option, like "-L", to make
that shorter if necessary).

What do you think?
Eric Dumazet Nov. 29, 2023, 3:39 p.m. UTC | #7
On Tue, Nov 28, 2023 at 11:18 PM Guillaume Nault <gnault@redhat.com> wrote:
>
> On Tue, Nov 28, 2023 at 11:14:28AM +0100, Eric Dumazet wrote:
> > On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
> > >
> > > Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> > > that are bound but haven't yet called connect() or listen().
> > >
> > > This allows ss to dump bound-only TCP sockets, together with listening
> > > sockets (as there's no specific state for bound-only sockets). This is
> > > similar to the UDP behaviour for which bound-only sockets are already
> > > dumped by ss -lu.
> > >
> > > The code is inspired by the ->lhash2 loop. However there's no manual
> > > test of the source port, since this kind of filtering is already
> > > handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> > > at a time, to avoid running with bh disabled for too long.
> > >
> > > No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> > > socket, bound respectively to 40000, 64000, 60000, the result is:
> > >
> > >   $ ss -lt
> > >   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
> > >   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
> > >   UNCONN 0      0               [::]:60000         [::]:*
> > >   UNCONN 0      0                  *:64000            *:*
> >
> >
> > Hmm...   "ss -l" is supposed to only list listening sockets.
> >
> > So this change might confuse some users ?
> >
>
> On the other hand I can't find a more sensible solution. The problem is
> that "ss -l" sets both the TCPF_LISTEN and the TCPF_CLOSE flags. And
> since we don't have a way to express "bound but not yet listening"
> sockets, these sockets fall into the CLOSE category. So we're really
> just returning what ss asked for.
>
> If we can't rely on TCPF_CLOSE, then I don't see what kind of filter we
> could use to request a dump of these TCP sockets. Using "-a" doesn't
> help as it just sets all the TCPF_* flags (appart from
> TCPF_NEW_SYN_RECV). Adding a new option wouldn't help either as we
> couldn't map it to any of the TCPF_* flags. In any case, we still need
> to rely on TCPF_CLOSE.
>
> So maybe we can just improve the ss man page for "-l" and explain that
> it also lists closed sockets, which includes the bound-only ones
> (this is already true for non-TCP sockets anyway). We could also tell
> the user to run "ss state listening" for getting listening sockets
> exclusively (or we could implement a new option, like "-L", to make
> that shorter if necessary).

This exists already : ss -t state LISTENING

>
> What do you think?
>

We might need a new bit in r->idiag_state (we have a lot of free bits
there), different from
the combination used by "ss -l"  which unfortunately used ( (1 <<
SS_LISTEN) | (1 << SS_CLOSE) )

"ss -t state bound" (or ss -tB ???)  would then set this new bit ( 1
<< SS_BOUND) and the kernel would handle this pseudo state ?

(mapped to CLOSED and in bhash2)


diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
index cc00118acca1b695a534bd73984b9d1f1794db25..97238c0f64aa6643cf492a856e8d67ddcca1a729
100644
--- a/include/net/tcp_states.h
+++ b/include/net/tcp_states.h
@@ -22,6 +22,7 @@ enum {
        TCP_LISTEN,
        TCP_CLOSING,    /* Now a valid state */
        TCP_NEW_SYN_RECV,
+       TCP_BOUND,

        TCP_MAX_STATES  /* Leave at the end! */
 };
@@ -43,6 +44,7 @@ enum {
        TCPF_LISTEN      = (1 << TCP_LISTEN),
        TCPF_CLOSING     = (1 << TCP_CLOSING),
        TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
+       TCPF_BOUND       = (1 << TCP_BOUND),
 };

 #endif /* _LINUX_TCP_STATES_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a62423360a5681525496f8840bfe1d37ea3dc504..c3d22edb9e9563672356750153167884c92eae67
100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6570,6 +6570,7 @@ enum {
        BPF_TCP_LISTEN,
        BPF_TCP_CLOSING,        /* Now a valid state */
        BPF_TCP_NEW_SYN_RECV,
+       BPF_TCP_BOUND,

        BPF_TCP_MAX_STATES      /* Leave at the end! */
 };
Guillaume Nault Nov. 29, 2023, 5:52 p.m. UTC | #8
On Wed, Nov 29, 2023 at 04:39:55PM +0100, Eric Dumazet wrote:
> On Tue, Nov 28, 2023 at 11:18 PM Guillaume Nault <gnault@redhat.com> wrote:
> >
> > On Tue, Nov 28, 2023 at 11:14:28AM +0100, Eric Dumazet wrote:
> > > On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
> > > >
> > > > Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> > > > that are bound but haven't yet called connect() or listen().
> > > >
> > > > This allows ss to dump bound-only TCP sockets, together with listening
> > > > sockets (as there's no specific state for bound-only sockets). This is
> > > > similar to the UDP behaviour for which bound-only sockets are already
> > > > dumped by ss -lu.
> > > >
> > > > The code is inspired by the ->lhash2 loop. However there's no manual
> > > > test of the source port, since this kind of filtering is already
> > > > handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> > > > at a time, to avoid running with bh disabled for too long.
> > > >
> > > > No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> > > > socket, bound respectively to 40000, 64000, 60000, the result is:
> > > >
> > > >   $ ss -lt
> > > >   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
> > > >   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
> > > >   UNCONN 0      0               [::]:60000         [::]:*
> > > >   UNCONN 0      0                  *:64000            *:*
> > >
> > >
> > > Hmm...   "ss -l" is supposed to only list listening sockets.
> > >
> > > So this change might confuse some users ?
> > >
> >
> > On the other hand I can't find a more sensible solution. The problem is
> > that "ss -l" sets both the TCPF_LISTEN and the TCPF_CLOSE flags. And
> > since we don't have a way to express "bound but not yet listening"
> > sockets, these sockets fall into the CLOSE category. So we're really
> > just returning what ss asked for.
> >
> > If we can't rely on TCPF_CLOSE, then I don't see what kind of filter we
> > could use to request a dump of these TCP sockets. Using "-a" doesn't
> > help as it just sets all the TCPF_* flags (appart from
> > TCPF_NEW_SYN_RECV). Adding a new option wouldn't help either as we
> > couldn't map it to any of the TCPF_* flags. In any case, we still need
> > to rely on TCPF_CLOSE.
> >
> > So maybe we can just improve the ss man page for "-l" and explain that
> > it also lists closed sockets, which includes the bound-only ones
> > (this is already true for non-TCP sockets anyway). We could also tell
> > the user to run "ss state listening" for getting listening sockets
> > exclusively (or we could implement a new option, like "-L", to make
> > that shorter if necessary).
> 
> This exists already : ss -t state LISTENING
> 
> >
> > What do you think?
> >
> 
> We might need a new bit in r->idiag_state (we have a lot of free bits
> there), different from
> the combination used by "ss -l"  which unfortunately used ( (1 <<
> SS_LISTEN) | (1 << SS_CLOSE) ) )
> 
> "ss -t state bound" (or ss -tB ???)  would then set this new bit ( 1
> << SS_BOUND) and the kernel would handle this pseudo state ?
> 
> (mapped to CLOSED and in bhash2)

I'll do that in v3. Thanks.

> diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
> index cc00118acca1b695a534bd73984b9d1f1794db25..97238c0f64aa6643cf492a856e8d67ddcca1a729
> 100644
> --- a/include/net/tcp_states.h
> +++ b/include/net/tcp_states.h
> @@ -22,6 +22,7 @@ enum {
>         TCP_LISTEN,
>         TCP_CLOSING,    /* Now a valid state */
>         TCP_NEW_SYN_RECV,
> +       TCP_BOUND,
> 
>         TCP_MAX_STATES  /* Leave at the end! */
>  };
> @@ -43,6 +44,7 @@ enum {
>         TCPF_LISTEN      = (1 << TCP_LISTEN),
>         TCPF_CLOSING     = (1 << TCP_CLOSING),
>         TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
> +       TCPF_BOUND       = (1 << TCP_BOUND),
>  };
> 
>  #endif /* _LINUX_TCP_STATES_H */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index a62423360a5681525496f8840bfe1d37ea3dc504..c3d22edb9e9563672356750153167884c92eae67
> 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6570,6 +6570,7 @@ enum {
>         BPF_TCP_LISTEN,
>         BPF_TCP_CLOSING,        /* Now a valid state */
>         BPF_TCP_NEW_SYN_RECV,
> +       BPF_TCP_BOUND,
> 
>         BPF_TCP_MAX_STATES      /* Leave at the end! */
>  };
>
diff mbox series

Patch

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7d0e7aaa71e0..d7fb6a625cb7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1077,10 +1077,96 @@  void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 		s_i = num = s_num = 0;
 	}
 
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+	/* Dump bound-only sockets */
+	if (cb->args[0] == 1) {
+		if (!(idiag_states & TCPF_CLOSE))
+			goto skip_bind_ht;
+
+		for (i = s_i; i < hashinfo->bhash_size; i++) {
+			struct inet_bind_hashbucket *ibb;
+			struct inet_bind2_bucket *tb2;
+			struct sock *sk_arr[SKARR_SZ];
+			int num_arr[SKARR_SZ];
+			int idx, accum, res;
+
+resume_bind_walk:
+			num = 0;
+			accum = 0;
+			ibb = &hashinfo->bhash2[i];
+
+			spin_lock_bh(&ibb->lock);
+			inet_bind_bucket_for_each(tb2, &ibb->chain) {
+				if (!net_eq(ib2_net(tb2), net))
+					continue;
+
+				sk_for_each_bound_bhash2(sk, &tb2->owners) {
+					struct inet_sock *inet = inet_sk(sk);
+
+					if (num < s_num)
+						goto next_bind;
+
+					if (sk->sk_state != TCP_CLOSE ||
+					    !inet->inet_num)
+						goto next_bind;
+
+					if (r->sdiag_family != AF_UNSPEC &&
+					    r->sdiag_family != sk->sk_family)
+						goto next_bind;
+
+					if (!inet_diag_bc_sk(bc, sk))
+						goto next_bind;
+
+					if (!refcount_inc_not_zero(&sk->sk_refcnt))
+						goto next_bind;
+
+					num_arr[accum] = num;
+					sk_arr[accum] = sk;
+					if (++accum == SKARR_SZ)
+						goto pause_bind_walk;
+next_bind:
+					num++;
+				}
+			}
+pause_bind_walk:
+			spin_unlock_bh(&ibb->lock);
+
+			res = 0;
+			for (idx = 0; idx < accum; idx++) {
+				if (res >= 0) {
+					res = inet_sk_diag_fill(sk_arr[idx],
+								NULL, skb, cb,
+								r, NLM_F_MULTI,
+								net_admin);
+					if (res < 0)
+						num = num_arr[idx];
+				}
+				sock_gen_put(sk_arr[idx]);
+			}
+			if (res < 0)
+				goto done;
+
+			cond_resched();
+
+			if (accum == SKARR_SZ) {
+				s_num = num + 1;
+				goto resume_bind_walk;
+			}
+
+			s_num = 0;
+		}
+skip_bind_ht:
+		cb->args[0] = 2;
+		s_i = num = s_num = 0;
+	}
+
 	if (!(idiag_states & ~TCPF_LISTEN))
 		goto out;
 
-#define SKARR_SZ 16
 	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);