[6/6] net/tls: implement ->read_sock()

Message ID: 20230719113836.68859-7-hare@suse.de
State: Superseded
Delegated to: Netdev Maintainers
Series: net/tls: fixes for NVMe-over-TLS

Checks

Context Check Description
netdev/series_format warning Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1342 this patch: 1342
netdev/cc_maintainers warning 4 maintainers not CCed: borisp@nvidia.com bpf@vger.kernel.org john.fastabend@gmail.com davem@davemloft.net
netdev/build_clang success Errors and warnings before: 1365 this patch: 1365
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1365 this patch: 1365
netdev/checkpatch fail ERROR: space prohibited before that close square bracket ']' WARNING: Duplicate signature WARNING: line length of 81 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Hannes Reinecke July 19, 2023, 11:38 a.m. UTC
Implement ->read_sock() function for use with nvme-tcp.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Cc: Boris Pismenny <boris.pismenny@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
---
 net/tls/tls.h      |  2 +
 net/tls/tls_main.c |  2 +
 net/tls/tls_sw.c   | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+)
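
For context, ->read_sock() is the proto_ops hook that hands received data to a caller-supplied actor callback instead of copying it into a user buffer; tcp_read_sock() is the existing in-kernel example. A hypothetical caller sketch (the function names are made up for illustration and not taken from nvme-tcp or this series):

	static int my_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
				 unsigned int offset, size_t len)
	{
		/* parse or copy up to len bytes of skb starting at offset;
		 * return the number of bytes consumed, 0 or negative stops
		 * the read loop
		 */
		return len;
	}

	static int my_try_recv(struct socket *sock)
	{
		read_descriptor_t rd_desc = { .arg.data = NULL, .count = 1 };

		/* with this series, the same call works on a SW kTLS rx socket */
		return sock->ops->read_sock(sock->sk, &rd_desc, my_recv_actor);
	}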

Comments

Sagi Grimberg July 19, 2023, 11:55 a.m. UTC | #1
> Implement ->read_sock() function for use with nvme-tcp.

This implementation looks better.
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

Simon Horman July 20, 2023, 6:32 p.m. UTC | #2
On Wed, Jul 19, 2023 at 01:38:36PM +0200, Hannes Reinecke wrote:

...

Hi Hannes,

> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> index d0636ea13009..4829d2cb9a7c 100644
> --- a/net/tls/tls_sw.c
> +++ b/net/tls/tls_sw.c
> @@ -2202,6 +2202,102 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
>  	goto splice_read_end;
>  }
>  
> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
> +		     sk_read_actor_t read_actor)
> +{
> +	struct tls_context *tls_ctx = tls_get_ctx(sk);
> +	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
> +	struct strp_msg *rxm = NULL;
> +	struct tls_msg *tlm;
> +	struct sk_buff *skb;
> +	struct sk_psock *psock;
> +	ssize_t copied = 0;
> +	bool bpf_strp_enabled;
> +	int err, used;
> +
> +	psock = sk_psock_get(sk);
> +	err = tls_rx_reader_acquire(sk, ctx, true);
> +	if (err < 0)
> +		goto psock_put;

skb is uninitialised here; however, it is used in the psock_put unwind path.

Flagged by gcc-12 [-Wmaybe-uninitialized] and Smatch.
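
The usual way to silence this class of warning, and to make every unwind path safe regardless of which goto is taken first, is to give skb an initial value at its declaration; a one-line sketch (whether the next revision fixes it exactly this way is not shown in this thread):

	struct sk_buff *skb = NULL;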

> +	bpf_strp_enabled = sk_psock_strp_enabled(psock);
> +
> +	/* If crypto failed the connection is broken */
> +	err = ctx->async_wait.err;
> +	if (err)
> +		goto read_sock_end;

Likewise, here.

> +
> +	do {
> +		if (!skb_queue_empty(&ctx->rx_list)) {
> +			skb = __skb_dequeue(&ctx->rx_list);
> +			rxm = strp_msg(skb);
> +		} else {
> +			struct tls_decrypt_arg darg;
> +
> +			err = tls_rx_rec_wait(sk, psock, true, true);
> +			if (err <= 0)
> +				goto read_sock_end;
> +
> +			memset(&darg.inargs, 0, sizeof(darg.inargs));
> +			darg.zc = !bpf_strp_enabled && ctx->zc_capable;
> +
> +			rxm = strp_msg(tls_strp_msg(ctx));
> +			tlm = tls_msg(tls_strp_msg(ctx));
> +
> +			/* read_sock does not support reading control messages */
> +			if (tlm->control != TLS_RECORD_TYPE_DATA) {
> +				err = -EINVAL;
> +				goto read_sock_requeue;
> +			}
> +
> +			if (!bpf_strp_enabled)
> +				darg.async = ctx->async_capable;
> +			else
> +				darg.async = false;
> +
> +			err = tls_rx_one_record(sk, NULL, &darg);
> +			if (err < 0) {
> +				tls_err_abort(sk, -EBADMSG);
> +				goto read_sock_end;
> +			}
> +
> +			sk_flush_backlog(sk);
> +			skb = darg.skb;
> +			rxm = strp_msg(skb);
> +
> +			tls_rx_rec_done(ctx);
> +		}
> +
> +		used = read_actor(desc, skb, rxm->offset, rxm->full_len);
> +		if (used <= 0) {
> +			if (!copied)
> +				err = used;
> +			goto read_sock_end;
> +		}
> +		copied += used;
> +		if (used < rxm->full_len) {
> +			rxm->offset += used;
> +			rxm->full_len -= used;
> +			if (!desc->count)
> +				goto read_sock_requeue;
> +		} else {
> +			consume_skb(skb);
> +			if (!desc->count)
> +				skb = NULL;
> +		}
> +	} while (skb);
> +
> +read_sock_end:
> +	tls_rx_reader_release(sk, ctx);
> +psock_put:
> +	if (psock)
> +		sk_psock_put(sk, psock);
> +	return copied ? : err;
> +
> +read_sock_requeue:
> +	__skb_queue_head(&ctx->rx_list, skb);
> +	goto read_sock_end;
> +}
> +
>  bool tls_sw_sock_is_readable(struct sock *sk)
>  {
>  	struct tls_context *tls_ctx = tls_get_ctx(sk);

Jakub Kicinski July 21, 2023, 3:02 a.m. UTC | #3
On Wed, 19 Jul 2023 13:38:36 +0200 Hannes Reinecke wrote:
> Implement ->read_sock() function for use with nvme-tcp.

> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
> +		     sk_read_actor_t read_actor)
> +{
> +	struct tls_context *tls_ctx = tls_get_ctx(sk);
> +	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
> +	struct strp_msg *rxm = NULL;
> +	struct tls_msg *tlm;
> +	struct sk_buff *skb;
> +	struct sk_psock *psock;
> +	ssize_t copied = 0;
> +	bool bpf_strp_enabled;

bubble up the longer lines, like this:

+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	struct strp_msg *rxm = NULL;
+	struct sk_psock *psock;
+	bool bpf_strp_enabled;
+	struct tls_msg *tlm;
+	struct sk_buff *skb;
+	ssize_t copied = 0;
+	int err, used;

> +	int err, used;
> +
> +	psock = sk_psock_get(sk);
> +	err = tls_rx_reader_acquire(sk, ctx, true);
> +	if (err < 0)
> +		goto psock_put;
> +	bpf_strp_enabled = sk_psock_strp_enabled(psock);

You're not servicing the BPF out of band queue, just error out if
the BPF psock is enabled. It's barely used and endlessly buggy anyway.
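
A sketch of that "just error out" approach (the exact error code and the placement of the check are assumptions, not something this thread settles):

	psock = sk_psock_get(sk);
	if (psock && sk_psock_strp_enabled(psock)) {
		/* read_sock() does not service the BPF strparser queue */
		err = -EOPNOTSUPP;
		goto psock_put;
	}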

> +	/* If crypto failed the connection is broken */
> +	err = ctx->async_wait.err;
> +	if (err)
> +		goto read_sock_end;
> +
> +	do {
> +		if (!skb_queue_empty(&ctx->rx_list)) {
> +			skb = __skb_dequeue(&ctx->rx_list);
> +			rxm = strp_msg(skb);
> +		} else {
> +			struct tls_decrypt_arg darg;
> +
> +			err = tls_rx_rec_wait(sk, psock, true, true);
> +			if (err <= 0)
> +				goto read_sock_end;
> +
> +			memset(&darg.inargs, 0, sizeof(darg.inargs));
> +			darg.zc = !bpf_strp_enabled && ctx->zc_capable;

And what are you zero-copying into my friend? zc == zero copy.
Leave the zc be 0, like splice does, otherwise passing msg=NULL
to tls_rx_one_record() may explode. Testing with TLS 1.2 should
show that.

> +			rxm = strp_msg(tls_strp_msg(ctx));
> +			tlm = tls_msg(tls_strp_msg(ctx));
> +
> +			/* read_sock does not support reading control messages */
> +			if (tlm->control != TLS_RECORD_TYPE_DATA) {
> +				err = -EINVAL;
> +				goto read_sock_requeue;
> +			}
> +
> +			if (!bpf_strp_enabled)
> +				darg.async = ctx->async_capable;
> +			else
> +				darg.async = false;

Also don't bother with async. TLS 1.3 can't do async, anyway,
and I don't think you wait for the completion :S
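
Taken together with the zc comment above, the decrypt argument setup can simply mirror tls_sw_splice_read() and leave darg all-zero; a sketch:

	struct tls_decrypt_arg darg;

	memset(&darg.inargs, 0, sizeof(darg.inargs));
	/* zc and async stay 0: there is no user buffer to zero-copy into,
	 * and no async completion is awaited, so decryption is synchronous
	 */
	err = tls_rx_one_record(sk, NULL, &darg);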

> +			err = tls_rx_one_record(sk, NULL, &darg);
> +			if (err < 0) {
> +				tls_err_abort(sk, -EBADMSG);
> +				goto read_sock_end;
> +			}
> +
> +			sk_flush_backlog(sk);

Hm, could be a bit often but okay.

> +			skb = darg.skb;
> +			rxm = strp_msg(skb);
> +
> +			tls_rx_rec_done(ctx);
> +		}
> +
> +		used = read_actor(desc, skb, rxm->offset, rxm->full_len);
> +		if (used <= 0) {
> +			if (!copied)
> +				err = used;
> +			goto read_sock_end;

You have to requeue on error.
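
That is, the error exit should go through the requeue label so the record is not dropped; a sketch:

	used = read_actor(desc, skb, rxm->offset, rxm->full_len);
	if (used <= 0) {
		if (!copied)
			err = used;
		goto read_sock_requeue;	/* put the skb back on rx_list */
	}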

> +		}
> +		copied += used;
> +		if (used < rxm->full_len) {
> +			rxm->offset += used;
> +			rxm->full_len -= used;
> +			if (!desc->count)
> +				goto read_sock_requeue;

And here. Like splice_read does. Otherwise you leak the skb.

> +		} else {
> +			consume_skb(skb);
> +			if (!desc->count)
> +				skb = NULL;
> +		}
> +	} while (skb);
> +
> +read_sock_end:
> +	tls_rx_reader_release(sk, ctx);
> +psock_put:
> +	if (psock)
> +		sk_psock_put(sk, psock);
> +	return copied ? : err;
> +
> +read_sock_requeue:
> +	__skb_queue_head(&ctx->rx_list, skb);
> +	goto read_sock_end;
> +}
> +
>  bool tls_sw_sock_is_readable(struct sock *sk)
>  {
>  	struct tls_context *tls_ctx = tls_get_ctx(sk);

Hannes Reinecke July 21, 2023, 6:03 a.m. UTC | #4
On 7/20/23 20:32, Simon Horman wrote:
> On Wed, Jul 19, 2023 at 01:38:36PM +0200, Hannes Reinecke wrote:
> 
> ...
> 
> Hi Hannes,
> 
>> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
>> index d0636ea13009..4829d2cb9a7c 100644
>> --- a/net/tls/tls_sw.c
>> +++ b/net/tls/tls_sw.c
>> @@ -2202,6 +2202,102 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
>>   	goto splice_read_end;
>>   }
>>   
>> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
>> +		     sk_read_actor_t read_actor)
>> +{
>> +	struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
>> +	struct strp_msg *rxm = NULL;
>> +	struct tls_msg *tlm;
>> +	struct sk_buff *skb;
>> +	struct sk_psock *psock;
>> +	ssize_t copied = 0;
>> +	bool bpf_strp_enabled;
>> +	int err, used;
>> +
>> +	psock = sk_psock_get(sk);
>> +	err = tls_rx_reader_acquire(sk, ctx, true);
>> +	if (err < 0)
>> +		goto psock_put;
> 
> skb is uninitialised here,
> however, it is used in the psock_put unwind path.
> 
> Flagged by gcc-12 [-Wmaybe-uninitialized] and Smatch.
> 
Will fix it up in the next round.

Cheers,

Hannes

Hannes Reinecke July 21, 2023, 1:53 p.m. UTC | #5
On 7/21/23 05:02, Jakub Kicinski wrote:
> On Wed, 19 Jul 2023 13:38:36 +0200 Hannes Reinecke wrote:
>> Implement ->read_sock() function for use with nvme-tcp.
> 
>> +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
>> +		     sk_read_actor_t read_actor)
>> +{
>> +	struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
>> +	struct strp_msg *rxm = NULL;
>> +	struct tls_msg *tlm;
>> +	struct sk_buff *skb;
>> +	struct sk_psock *psock;
>> +	ssize_t copied = 0;
>> +	bool bpf_strp_enabled;
> 
> bubble up the longer lines, like this:
> 
> +	struct tls_context *tls_ctx = tls_get_ctx(sk);
> +	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
> +	struct strp_msg *rxm = NULL;
> +	struct sk_psock *psock;
> +	bool bpf_strp_enabled;
> +	struct tls_msg *tlm;
> +	struct sk_buff *skb;
> +	ssize_t copied = 0;
> +	int err, used;
> 
Ok.

>> +	int err, used;
>> +
>> +	psock = sk_psock_get(sk);
>> +	err = tls_rx_reader_acquire(sk, ctx, true);
>> +	if (err < 0)
>> +		goto psock_put;
>> +	bpf_strp_enabled = sk_psock_strp_enabled(psock);
> 
> You're not servicing the BPF out of band queue, just error out if
> the BPF psock is enabled. It's barely used and endlessly buggy anyway.
> 
Have been wondering about that; will do.

>> +	/* If crypto failed the connection is broken */
>> +	err = ctx->async_wait.err;
>> +	if (err)
>> +		goto read_sock_end;
>> +
>> +	do {
>> +		if (!skb_queue_empty(&ctx->rx_list)) {
>> +			skb = __skb_dequeue(&ctx->rx_list);
>> +			rxm = strp_msg(skb);
>> +		} else {
>> +			struct tls_decrypt_arg darg;
>> +
>> +			err = tls_rx_rec_wait(sk, psock, true, true);
>> +			if (err <= 0)
>> +				goto read_sock_end;
>> +
>> +			memset(&darg.inargs, 0, sizeof(darg.inargs));
>> +			darg.zc = !bpf_strp_enabled && ctx->zc_capable;
> 
> And what are you zero-copying into my friend? zc == zero copy.
> Leave the zc be 0, like splice does, otherwise passing msg=NULL
> to tls_rx_one_record() may explode. Testing with TLS 1.2 should
> show that.
> 
Still beginning to learn how the stream parser works.
(And have been wondering about the 'msg == NULL' case, too).
Will fix it.

>> +			rxm = strp_msg(tls_strp_msg(ctx));
>> +			tlm = tls_msg(tls_strp_msg(ctx));
>> +
>> +			/* read_sock does not support reading control messages */
>> +			if (tlm->control != TLS_RECORD_TYPE_DATA) {
>> +				err = -EINVAL;
>> +				goto read_sock_requeue;
>> +			}
>> +
>> +			if (!bpf_strp_enabled)
>> +				darg.async = ctx->async_capable;
>> +			else
>> +				darg.async = false;
> 
> Also don't bother with async. TLS 1.3 can't do async, anyway,
> and I don't think you wait for the completion :S
> 
Ok.

>> +			err = tls_rx_one_record(sk, NULL, &darg);
>> +			if (err < 0) {
>> +				tls_err_abort(sk, -EBADMSG);
>> +				goto read_sock_end;
>> +			}
>> +
>> +			sk_flush_backlog(sk);
> 
> Hm, could be a bit often but okay.
> 
When would you suggest to do it?
(Do I need to do it at all?)

>> +			skb = darg.skb;
>> +			rxm = strp_msg(skb);
>> +
>> +			tls_rx_rec_done(ctx);
>> +		}
>> +
>> +		used = read_actor(desc, skb, rxm->offset, rxm->full_len);
>> +		if (used <= 0) {
>> +			if (!copied)
>> +				err = used;
>> +			goto read_sock_end;
> 
> You have to requeue on error.
> 
Ah, right. I did it in the previous version, but it somehow got
lost here.

Will be fixing it up.

>> +		}
>> +		copied += used;
>> +		if (used < rxm->full_len) {
>> +			rxm->offset += used;
>> +			rxm->full_len -= used;
>> +			if (!desc->count)
>> +				goto read_sock_requeue;
> 
> And here. Like splice_read does. Otherwise you leak the skb.
> 
Will do.

Thanks for the review!

Cheers,

Hannes

Jakub Kicinski July 21, 2023, 2:50 p.m. UTC | #6
On Fri, 21 Jul 2023 15:53:05 +0200 Hannes Reinecke wrote:
> >> +			err = tls_rx_one_record(sk, NULL, &darg);
> >> +			if (err < 0) {
> >> +				tls_err_abort(sk, -EBADMSG);
> >> +				goto read_sock_end;
> >> +			}
> >> +
> >> +			sk_flush_backlog(sk);  
> > 
> > Hm, could be a bit often but okay.
> >   
> When would you suggest to do it?
> (Do I need to do it at all?)

I picked every 128kB for the normal Rx path. I looked through my old notes
and it seems I didn't measure smaller batches :( Only 128kB - 4MB.
Flushing every 128kB showed a 4% throughput hit, but much better TCP
behavior. Not sure how the perf hit would scale below 128kB; maybe
the lower the threshold, the lower the overhead, because statistically
only one in every 4 calls will have something to do (GRO coalesces
to 64kB == 4 TLS records). Dunno. You'd have to measure.

But it's not a blocker for me; FWIW, you can keep the flushing on every
record.
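
If one did want to batch the flushes, a sketch of a byte-count throttle (the local counter and the 128kB constant are illustrative, not part of this patch):

	size_t decrypted = 0;	/* declared with the other locals */

	/* inside the per-record loop, after a record has been decrypted */
	decrypted += rxm->full_len;
	if (decrypted >= 128 * 1024) {
		decrypted = 0;
		sk_flush_backlog(sk);
	}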

Patch

diff --git a/net/tls/tls.h b/net/tls/tls.h
index 86cef1c68e03..7e4d45537deb 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -110,6 +110,8 @@  bool tls_sw_sock_is_readable(struct sock *sk);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
+int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
+		     sk_read_actor_t read_actor);
 
 int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 void tls_device_splice_eof(struct socket *sock);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index b6896126bb92..7dbb8cd8f809 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -962,10 +962,12 @@  static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG]
 	ops[TLS_BASE][TLS_SW  ] = ops[TLS_BASE][TLS_BASE];
 	ops[TLS_BASE][TLS_SW  ].splice_read	= tls_sw_splice_read;
 	ops[TLS_BASE][TLS_SW  ].poll		= tls_sk_poll;
+	ops[TLS_BASE][TLS_SW  ].read_sock	= tls_sw_read_sock;
 
 	ops[TLS_SW  ][TLS_SW  ] = ops[TLS_SW  ][TLS_BASE];
 	ops[TLS_SW  ][TLS_SW  ].splice_read	= tls_sw_splice_read;
 	ops[TLS_SW  ][TLS_SW  ].poll		= tls_sk_poll;
+	ops[TLS_SW  ][TLS_SW  ].read_sock	= tls_sw_read_sock;
 
 #ifdef CONFIG_TLS_DEVICE
 	ops[TLS_HW  ][TLS_BASE] = ops[TLS_BASE][TLS_BASE];
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d0636ea13009..4829d2cb9a7c 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2202,6 +2202,102 @@  ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 	goto splice_read_end;
 }
 
+int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
+		     sk_read_actor_t read_actor)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	struct strp_msg *rxm = NULL;
+	struct tls_msg *tlm;
+	struct sk_buff *skb;
+	struct sk_psock *psock;
+	ssize_t copied = 0;
+	bool bpf_strp_enabled;
+	int err, used;
+
+	psock = sk_psock_get(sk);
+	err = tls_rx_reader_acquire(sk, ctx, true);
+	if (err < 0)
+		goto psock_put;
+	bpf_strp_enabled = sk_psock_strp_enabled(psock);
+
+	/* If crypto failed the connection is broken */
+	err = ctx->async_wait.err;
+	if (err)
+		goto read_sock_end;
+
+	do {
+		if (!skb_queue_empty(&ctx->rx_list)) {
+			skb = __skb_dequeue(&ctx->rx_list);
+			rxm = strp_msg(skb);
+		} else {
+			struct tls_decrypt_arg darg;
+
+			err = tls_rx_rec_wait(sk, psock, true, true);
+			if (err <= 0)
+				goto read_sock_end;
+
+			memset(&darg.inargs, 0, sizeof(darg.inargs));
+			darg.zc = !bpf_strp_enabled && ctx->zc_capable;
+
+			rxm = strp_msg(tls_strp_msg(ctx));
+			tlm = tls_msg(tls_strp_msg(ctx));
+
+			/* read_sock does not support reading control messages */
+			if (tlm->control != TLS_RECORD_TYPE_DATA) {
+				err = -EINVAL;
+				goto read_sock_requeue;
+			}
+
+			if (!bpf_strp_enabled)
+				darg.async = ctx->async_capable;
+			else
+				darg.async = false;
+
+			err = tls_rx_one_record(sk, NULL, &darg);
+			if (err < 0) {
+				tls_err_abort(sk, -EBADMSG);
+				goto read_sock_end;
+			}
+
+			sk_flush_backlog(sk);
+			skb = darg.skb;
+			rxm = strp_msg(skb);
+
+			tls_rx_rec_done(ctx);
+		}
+
+		used = read_actor(desc, skb, rxm->offset, rxm->full_len);
+		if (used <= 0) {
+			if (!copied)
+				err = used;
+			goto read_sock_end;
+		}
+		copied += used;
+		if (used < rxm->full_len) {
+			rxm->offset += used;
+			rxm->full_len -= used;
+			if (!desc->count)
+				goto read_sock_requeue;
+		} else {
+			consume_skb(skb);
+			if (!desc->count)
+				skb = NULL;
+		}
+	} while (skb);
+
+read_sock_end:
+	tls_rx_reader_release(sk, ctx);
+psock_put:
+	if (psock)
+		sk_psock_put(sk, psock);
+	return copied ? : err;
+
+read_sock_requeue:
+	__skb_queue_head(&ctx->rx_list, skb);
+	goto read_sock_end;
+}
+
 bool tls_sw_sock_is_readable(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);