| Message ID | 20230719113836.68859-7-hare@suse.de (mailing list archive) |
|---|---|
| State | Superseded |
| Delegated to | Netdev Maintainers |
| Series | net/tls: fixes for NVMe-over-TLS |
> Implement ->read_sock() function for use with nvme-tcp.

This implementation looks better.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
On Wed, Jul 19, 2023 at 01:38:36PM +0200, Hannes Reinecke wrote:

...

Hi Hannes,

> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> index d0636ea13009..4829d2cb9a7c 100644
> --- a/net/tls/tls_sw.c
> +++ b/net/tls/tls_sw.c
> @@ -2202,6 +2202,102 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
>                  goto splice_read_end;
>  }
>
> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
> +                     sk_read_actor_t read_actor)
> +{
> +        struct tls_context *tls_ctx = tls_get_ctx(sk);
> +        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
> +        struct strp_msg *rxm = NULL;
> +        struct tls_msg *tlm;
> +        struct sk_buff *skb;
> +        struct sk_psock *psock;
> +        ssize_t copied = 0;
> +        bool bpf_strp_enabled;
> +        int err, used;
> +
> +        psock = sk_psock_get(sk);
> +        err = tls_rx_reader_acquire(sk, ctx, true);
> +        if (err < 0)
> +                goto psock_put;

skb is uninitialised here,
however, it is used in the psock_put unwind path.

Flagged by gcc-12 [-Wmaybe-uninitialized] and Smatch.

> +        bpf_strp_enabled = sk_psock_strp_enabled(psock);
> +
> +        /* If crypto failed the connection is broken */
> +        err = ctx->async_wait.err;
> +        if (err)
> +                goto read_sock_end;

Likewise, here.

> +
> +        do {
> +                if (!skb_queue_empty(&ctx->rx_list)) {
> +                        skb = __skb_dequeue(&ctx->rx_list);
> +                        rxm = strp_msg(skb);
> +                } else {
> +                        struct tls_decrypt_arg darg;
> +
> +                        err = tls_rx_rec_wait(sk, psock, true, true);
> +                        if (err <= 0)
> +                                goto read_sock_end;
> +
> +                        memset(&darg.inargs, 0, sizeof(darg.inargs));
> +                        darg.zc = !bpf_strp_enabled && ctx->zc_capable;
> +
> +                        rxm = strp_msg(tls_strp_msg(ctx));
> +                        tlm = tls_msg(tls_strp_msg(ctx));
> +
> +                        /* read_sock does not support reading control messages */
> +                        if (tlm->control != TLS_RECORD_TYPE_DATA) {
> +                                err = -EINVAL;
> +                                goto read_sock_requeue;
> +                        }
> +
> +                        if (!bpf_strp_enabled)
> +                                darg.async = ctx->async_capable;
> +                        else
> +                                darg.async = false;
> +
> +                        err = tls_rx_one_record(sk, NULL, &darg);
> +                        if (err < 0) {
> +                                tls_err_abort(sk, -EBADMSG);
> +                                goto read_sock_end;
> +                        }
> +
> +                        sk_flush_backlog(sk);
> +                        skb = darg.skb;
> +                        rxm = strp_msg(skb);
> +
> +                        tls_rx_rec_done(ctx);
> +                }
> +
> +                used = read_actor(desc, skb, rxm->offset, rxm->full_len);
> +                if (used <= 0) {
> +                        if (!copied)
> +                                err = used;
> +                        goto read_sock_end;
> +                }
> +                copied += used;
> +                if (used < rxm->full_len) {
> +                        rxm->offset += used;
> +                        rxm->full_len -= used;
> +                        if (!desc->count)
> +                                goto read_sock_requeue;
> +                } else {
> +                        consume_skb(skb);
> +                        if (!desc->count)
> +                                skb = NULL;
> +                }
> +        } while (skb);
> +
> +read_sock_end:
> +        tls_rx_reader_release(sk, ctx);
> +psock_put:
> +        if (psock)
> +                sk_psock_put(sk, psock);
> +        return copied ? : err;
> +
> +read_sock_requeue:
> +        __skb_queue_head(&ctx->rx_list, skb);
> +        goto read_sock_end;
> +}
> +
>  bool tls_sw_sock_is_readable(struct sock *sk)
>  {
>          struct tls_context *tls_ctx = tls_get_ctx(sk);
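A minimal sketch of the kind of fix Simon is asking for (not the posted follow-up itself): initialise skb in the declaration block so that none of the early "goto" exits can reach the "} while (skb)" test or the requeue label with an indeterminate pointer. Only the one declaration changes:

        struct strp_msg *rxm = NULL;
        struct tls_msg *tlm;
        struct sk_buff *skb = NULL;     /* was uninitialised; NULL keeps the early-exit paths well-defined */
        struct sk_psock *psock;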
On Wed, 19 Jul 2023 13:38:36 +0200 Hannes Reinecke wrote:
> Implement ->read_sock() function for use with nvme-tcp.

> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
> +                     sk_read_actor_t read_actor)
> +{
> +        struct tls_context *tls_ctx = tls_get_ctx(sk);
> +        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
> +        struct strp_msg *rxm = NULL;
> +        struct tls_msg *tlm;
> +        struct sk_buff *skb;
> +        struct sk_psock *psock;
> +        ssize_t copied = 0;
> +        bool bpf_strp_enabled;

bubble up the longer lines, like this:

+        struct tls_context *tls_ctx = tls_get_ctx(sk);
+        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+        struct strp_msg *rxm = NULL;
+        struct sk_psock *psock;
+        bool bpf_strp_enabled;
+        struct tls_msg *tlm;
+        struct sk_buff *skb;
+        ssize_t copied = 0;
+        int err, used;

> +        int err, used;
> +
> +        psock = sk_psock_get(sk);
> +        err = tls_rx_reader_acquire(sk, ctx, true);
> +        if (err < 0)
> +                goto psock_put;
> +        bpf_strp_enabled = sk_psock_strp_enabled(psock);

You're not servicing the BPF out of band queue, just error out if
the BPF psock is enabled. It's barely used and endlessly buggy anyway.

> +        /* If crypto failed the connection is broken */
> +        err = ctx->async_wait.err;
> +        if (err)
> +                goto read_sock_end;
> +
> +        do {
> +                if (!skb_queue_empty(&ctx->rx_list)) {
> +                        skb = __skb_dequeue(&ctx->rx_list);
> +                        rxm = strp_msg(skb);
> +                } else {
> +                        struct tls_decrypt_arg darg;
> +
> +                        err = tls_rx_rec_wait(sk, psock, true, true);
> +                        if (err <= 0)
> +                                goto read_sock_end;
> +
> +                        memset(&darg.inargs, 0, sizeof(darg.inargs));
> +                        darg.zc = !bpf_strp_enabled && ctx->zc_capable;

And what are you zero-copying into my friend? zc == zero copy.
Leave the zc be 0, like splice does, otherwise passing msg=NULL
to tls_rx_one_record() may explode. Testing with TLS 1.2 should
show that.

> +                        rxm = strp_msg(tls_strp_msg(ctx));
> +                        tlm = tls_msg(tls_strp_msg(ctx));
> +
> +                        /* read_sock does not support reading control messages */
> +                        if (tlm->control != TLS_RECORD_TYPE_DATA) {
> +                                err = -EINVAL;
> +                                goto read_sock_requeue;
> +                        }
> +
> +                        if (!bpf_strp_enabled)
> +                                darg.async = ctx->async_capable;
> +                        else
> +                                darg.async = false;

Also don't bother with async. TLS 1.3 can't do async, anyway,
and I don't think you wait for the completion :S

> +                        err = tls_rx_one_record(sk, NULL, &darg);
> +                        if (err < 0) {
> +                                tls_err_abort(sk, -EBADMSG);
> +                                goto read_sock_end;
> +                        }
> +
> +                        sk_flush_backlog(sk);

Hm, could be a bit often but okay.

> +                        skb = darg.skb;
> +                        rxm = strp_msg(skb);
> +
> +                        tls_rx_rec_done(ctx);
> +                }
> +
> +                used = read_actor(desc, skb, rxm->offset, rxm->full_len);
> +                if (used <= 0) {
> +                        if (!copied)
> +                                err = used;
> +                        goto read_sock_end;

You have to requeue on error.

> +                }
> +                copied += used;
> +                if (used < rxm->full_len) {
> +                        rxm->offset += used;
> +                        rxm->full_len -= used;
> +                        if (!desc->count)
> +                                goto read_sock_requeue;

And here. Like splice_read does. Otherwise you leak the skb.

> +                } else {
> +                        consume_skb(skb);
> +                        if (!desc->count)
> +                                skb = NULL;
> +                }
> +        } while (skb);
> +
> +read_sock_end:
> +        tls_rx_reader_release(sk, ctx);
> +psock_put:
> +        if (psock)
> +                sk_psock_put(sk, psock);
> +        return copied ? : err;
> +
> +read_sock_requeue:
> +        __skb_queue_head(&ctx->rx_list, skb);
> +        goto read_sock_end;
> +}
> +
>  bool tls_sw_sock_is_readable(struct sock *sk)
>  {
>          struct tls_context *tls_ctx = tls_get_ctx(sk);
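Taken together, Jakub's points suggest a prologue and decrypt setup roughly along these lines. This is only a paraphrase of the review comments, not the next revision of the patch; in particular the -EINVAL for the psock case is an assumption:

        psock = sk_psock_get(sk);
        if (psock && sk_psock_strp_enabled(psock)) {
                /* don't try to service the BPF out-of-band queue here */
                err = -EINVAL;
                goto psock_put;
        }

        ...

        memset(&darg.inargs, 0, sizeof(darg.inargs));
        /* like tls_sw_splice_read(): leave darg.zc == 0 (there is no msg to
         * zero-copy into) and darg.async == 0 (no completion to wait for) */

        err = tls_rx_one_record(sk, NULL, &darg);
        if (err < 0) {
                tls_err_abort(sk, -EBADMSG);
                goto read_sock_end;
        }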
On 7/20/23 20:32, Simon Horman wrote:
> On Wed, Jul 19, 2023 at 01:38:36PM +0200, Hannes Reinecke wrote:
>
> ...
>
> Hi Hannes,
>
>> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
>> index d0636ea13009..4829d2cb9a7c 100644
>> --- a/net/tls/tls_sw.c
>> +++ b/net/tls/tls_sw.c
>> @@ -2202,6 +2202,102 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
>>                  goto splice_read_end;
>>  }
>>
>> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
>> +                     sk_read_actor_t read_actor)
>> +{
>> +        struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
>> +        struct strp_msg *rxm = NULL;
>> +        struct tls_msg *tlm;
>> +        struct sk_buff *skb;
>> +        struct sk_psock *psock;
>> +        ssize_t copied = 0;
>> +        bool bpf_strp_enabled;
>> +        int err, used;
>> +
>> +        psock = sk_psock_get(sk);
>> +        err = tls_rx_reader_acquire(sk, ctx, true);
>> +        if (err < 0)
>> +                goto psock_put;
>
> skb is uninitialised here,
> however, it is used in the psock_put unwind path.
>
> Flagged by gcc-12 [-Wmaybe-uninitialized] and Smatch.
>
Will fix it up in the next round.

Cheers,

Hannes
On 7/21/23 05:02, Jakub Kicinski wrote:
> On Wed, 19 Jul 2023 13:38:36 +0200 Hannes Reinecke wrote:
>> Implement ->read_sock() function for use with nvme-tcp.
>
>> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
>> +                     sk_read_actor_t read_actor)
>> +{
>> +        struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
>> +        struct strp_msg *rxm = NULL;
>> +        struct tls_msg *tlm;
>> +        struct sk_buff *skb;
>> +        struct sk_psock *psock;
>> +        ssize_t copied = 0;
>> +        bool bpf_strp_enabled;
>
> bubble up the longer lines, like this:
>
> +        struct tls_context *tls_ctx = tls_get_ctx(sk);
> +        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
> +        struct strp_msg *rxm = NULL;
> +        struct sk_psock *psock;
> +        bool bpf_strp_enabled;
> +        struct tls_msg *tlm;
> +        struct sk_buff *skb;
> +        ssize_t copied = 0;
> +        int err, used;
>
Ok.

>> +        int err, used;
>> +
>> +        psock = sk_psock_get(sk);
>> +        err = tls_rx_reader_acquire(sk, ctx, true);
>> +        if (err < 0)
>> +                goto psock_put;
>> +        bpf_strp_enabled = sk_psock_strp_enabled(psock);
>
> You're not servicing the BPF out of band queue, just error out if
> the BPF psock is enabled. It's barely used and endlessly buggy anyway.
>
Have been wondering about that; will do.

>> +        /* If crypto failed the connection is broken */
>> +        err = ctx->async_wait.err;
>> +        if (err)
>> +                goto read_sock_end;
>> +
>> +        do {
>> +                if (!skb_queue_empty(&ctx->rx_list)) {
>> +                        skb = __skb_dequeue(&ctx->rx_list);
>> +                        rxm = strp_msg(skb);
>> +                } else {
>> +                        struct tls_decrypt_arg darg;
>> +
>> +                        err = tls_rx_rec_wait(sk, psock, true, true);
>> +                        if (err <= 0)
>> +                                goto read_sock_end;
>> +
>> +                        memset(&darg.inargs, 0, sizeof(darg.inargs));
>> +                        darg.zc = !bpf_strp_enabled && ctx->zc_capable;
>
> And what are you zero-copying into my friend? zc == zero copy.
> Leave the zc be 0, like splice does, otherwise passing msg=NULL
> to tls_rx_one_record() may explode. Testing with TLS 1.2 should
> show that.
>
Still beginning to learn how the stream parser works.
(And have been wondering about the 'msg == NULL' case, too).
Will fix it.

>> +                        rxm = strp_msg(tls_strp_msg(ctx));
>> +                        tlm = tls_msg(tls_strp_msg(ctx));
>> +
>> +                        /* read_sock does not support reading control messages */
>> +                        if (tlm->control != TLS_RECORD_TYPE_DATA) {
>> +                                err = -EINVAL;
>> +                                goto read_sock_requeue;
>> +                        }
>> +
>> +                        if (!bpf_strp_enabled)
>> +                                darg.async = ctx->async_capable;
>> +                        else
>> +                                darg.async = false;
>
> Also don't bother with async. TLS 1.3 can't do async, anyway,
> and I don't think you wait for the completion :S
>
Ok.

>> +                        err = tls_rx_one_record(sk, NULL, &darg);
>> +                        if (err < 0) {
>> +                                tls_err_abort(sk, -EBADMSG);
>> +                                goto read_sock_end;
>> +                        }
>> +
>> +                        sk_flush_backlog(sk);
>
> Hm, could be a bit often but okay.
>
When would you suggest to do it?
(Do I need to do it at all?)

>> +                        skb = darg.skb;
>> +                        rxm = strp_msg(skb);
>> +
>> +                        tls_rx_rec_done(ctx);
>> +                }
>> +
>> +                used = read_actor(desc, skb, rxm->offset, rxm->full_len);
>> +                if (used <= 0) {
>> +                        if (!copied)
>> +                                err = used;
>> +                        goto read_sock_end;
>
> You have to requeue on error.
>
Ah, right. Did it in the previous version, but somehow got lost here.
Will be fixing it up.

>> +                }
>> +                copied += used;
>> +                if (used < rxm->full_len) {
>> +                        rxm->offset += used;
>> +                        rxm->full_len -= used;
>> +                        if (!desc->count)
>> +                                goto read_sock_requeue;
>
> And here. Like splice_read does. Otherwise you leak the skb.
>
Will do.

Thanks for the review!

Cheers,

Hannes
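For the error path Jakub flagged, requeueing instead of dropping the record would look something like the sketch below (mirroring what tls_sw_splice_read() does; an illustration of the review comment, not the next revision of the patch):

        used = read_actor(desc, skb, rxm->offset, rxm->full_len);
        if (used <= 0) {
                if (!copied)
                        err = used;
                /* keep the undelivered record on rx_list for the next call
                 * instead of falling through to read_sock_end and leaking it */
                goto read_sock_requeue;
        }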
On Fri, 21 Jul 2023 15:53:05 +0200 Hannes Reinecke wrote:
> >> +                        err = tls_rx_one_record(sk, NULL, &darg);
> >> +                        if (err < 0) {
> >> +                                tls_err_abort(sk, -EBADMSG);
> >> +                                goto read_sock_end;
> >> +                        }
> >> +
> >> +                        sk_flush_backlog(sk);
> >
> > Hm, could be a bit often but okay.
> >
> When would you suggest to do it?
> (Do I need to do it at all?)

I picked every 128kB for the normal Rx path. I looked thru my old notes
and it seems I didn't measure smaller batches :( Only 128kB - 4MB.
Flush every 128kB was showing a 4% throughput hit, but much better TCP
behavior.

Not sure how the perf hit would scale below 128kB, maybe the lower the
threshold the lower the overhead because statistically only one of every
4 calls will have something to do? (GRO coalesces to 64kB == 4 TLS
records). Dunno. You'd have to measure.

But it's not a blocker for me, FWIW, you can keep the flushing on every
record.
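For reference, batching the flush the way Jakub describes could look roughly like this. The 128kB threshold is just the number quoted above, and the flushed_at bookkeeping is invented for the illustration; it is not part of the posted patch:

        ssize_t flushed_at = 0;

        ...

        /* flush the TCP backlog roughly every 128kB of delivered data
         * rather than after every decrypted record */
        if (copied - flushed_at >= 128 * 1024) {
                sk_flush_backlog(sk);
                flushed_at = copied;
        }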
diff --git a/net/tls/tls.h b/net/tls/tls.h
index 86cef1c68e03..7e4d45537deb 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -110,6 +110,8 @@ bool tls_sw_sock_is_readable(struct sock *sk);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags);
+int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
+                     sk_read_actor_t read_actor);
 
 int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 void tls_device_splice_eof(struct socket *sock);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index b6896126bb92..7dbb8cd8f809 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -962,10 +962,12 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG]
         ops[TLS_BASE][TLS_SW  ] = ops[TLS_BASE][TLS_BASE];
         ops[TLS_BASE][TLS_SW  ].splice_read = tls_sw_splice_read;
         ops[TLS_BASE][TLS_SW  ].poll = tls_sk_poll;
+        ops[TLS_BASE][TLS_SW  ].read_sock = tls_sw_read_sock;
 
         ops[TLS_SW  ][TLS_SW  ] = ops[TLS_SW  ][TLS_BASE];
         ops[TLS_SW  ][TLS_SW  ].splice_read = tls_sw_splice_read;
         ops[TLS_SW  ][TLS_SW  ].poll = tls_sk_poll;
+        ops[TLS_SW  ][TLS_SW  ].read_sock = tls_sw_read_sock;
 
 #ifdef CONFIG_TLS_DEVICE
         ops[TLS_HW  ][TLS_BASE] = ops[TLS_BASE][TLS_BASE];
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d0636ea13009..4829d2cb9a7c 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2202,6 +2202,102 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
                 goto splice_read_end;
 }
 
+int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
+                     sk_read_actor_t read_actor)
+{
+        struct tls_context *tls_ctx = tls_get_ctx(sk);
+        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+        struct strp_msg *rxm = NULL;
+        struct tls_msg *tlm;
+        struct sk_buff *skb;
+        struct sk_psock *psock;
+        ssize_t copied = 0;
+        bool bpf_strp_enabled;
+        int err, used;
+
+        psock = sk_psock_get(sk);
+        err = tls_rx_reader_acquire(sk, ctx, true);
+        if (err < 0)
+                goto psock_put;
+        bpf_strp_enabled = sk_psock_strp_enabled(psock);
+
+        /* If crypto failed the connection is broken */
+        err = ctx->async_wait.err;
+        if (err)
+                goto read_sock_end;
+
+        do {
+                if (!skb_queue_empty(&ctx->rx_list)) {
+                        skb = __skb_dequeue(&ctx->rx_list);
+                        rxm = strp_msg(skb);
+                } else {
+                        struct tls_decrypt_arg darg;
+
+                        err = tls_rx_rec_wait(sk, psock, true, true);
+                        if (err <= 0)
+                                goto read_sock_end;
+
+                        memset(&darg.inargs, 0, sizeof(darg.inargs));
+                        darg.zc = !bpf_strp_enabled && ctx->zc_capable;
+
+                        rxm = strp_msg(tls_strp_msg(ctx));
+                        tlm = tls_msg(tls_strp_msg(ctx));
+
+                        /* read_sock does not support reading control messages */
+                        if (tlm->control != TLS_RECORD_TYPE_DATA) {
+                                err = -EINVAL;
+                                goto read_sock_requeue;
+                        }
+
+                        if (!bpf_strp_enabled)
+                                darg.async = ctx->async_capable;
+                        else
+                                darg.async = false;
+
+                        err = tls_rx_one_record(sk, NULL, &darg);
+                        if (err < 0) {
+                                tls_err_abort(sk, -EBADMSG);
+                                goto read_sock_end;
+                        }
+
+                        sk_flush_backlog(sk);
+                        skb = darg.skb;
+                        rxm = strp_msg(skb);
+
+                        tls_rx_rec_done(ctx);
+                }
+
+                used = read_actor(desc, skb, rxm->offset, rxm->full_len);
+                if (used <= 0) {
+                        if (!copied)
+                                err = used;
+                        goto read_sock_end;
+                }
+                copied += used;
+                if (used < rxm->full_len) {
+                        rxm->offset += used;
+                        rxm->full_len -= used;
+                        if (!desc->count)
+                                goto read_sock_requeue;
+                } else {
+                        consume_skb(skb);
+                        if (!desc->count)
+                                skb = NULL;
+                }
+        } while (skb);
+
+read_sock_end:
+        tls_rx_reader_release(sk, ctx);
+psock_put:
+        if (psock)
+                sk_psock_put(sk, psock);
+        return copied ? : err;
+
+read_sock_requeue:
+        __skb_queue_head(&ctx->rx_list, skb);
+        goto read_sock_end;
+}
+
 bool tls_sw_sock_is_readable(struct sock *sk)
 {
         struct tls_context *tls_ctx = tls_get_ctx(sk);
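For context on how the new entry point is meant to be driven: a ->read_sock() consumer such as nvme-tcp supplies a read_descriptor_t plus an actor callback and calls through sock->ops. The sketch below is purely illustrative; the my_* names and the per-connection struct are made up, and the caller-side locking simply follows what existing tcp_read_sock() users do:

        #include <linux/net.h>
        #include <linux/skbuff.h>
        #include <net/sock.h>

        struct my_queue {
                size_t bytes_received;  /* per-connection receive accounting */
        };

        /* Actor: invoked for each chunk of decrypted TLS record payload. */
        static int my_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
                                 unsigned int offset, size_t len)
        {
                struct my_queue *queue = desc->arg.data;

                /* Consume up to len bytes starting at offset. Returning less
                 * than len (or clearing desc->count) makes tls_sw_read_sock()
                 * keep the remainder of the record queued for a later call. */
                queue->bytes_received += len;
                return len;
        }

        static int my_try_recv(struct socket *sock, struct my_queue *queue)
        {
                read_descriptor_t rd_desc = {
                        .arg.data = queue,
                        .count    = 1,  /* keep reading while the actor makes progress */
                };
                int consumed;

                lock_sock(sock->sk);
                consumed = sock->ops->read_sock(sock->sk, &rd_desc, my_recv_actor);
                release_sock(sock->sk);
                return consumed;
        }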