
[14/17] nvmet-tcp: reference counting for queues

Message ID 20230811121755.24715-15-hare@suse.de (mailing list archive)
State Superseded
Series nvme: In-kernel TLS support for TCP

Checks

netdev/tree_selection: success (Not a local patch, async)

Commit Message

Hannes Reinecke Aug. 11, 2023, 12:17 p.m. UTC
The 'queue' structure is referenced from various places and
used as an argument of asynchronous functions, so it's really
hard to figure out if the queue is still valid when the
asynchronous function triggers.
So add reference counting to validate the queue structure.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/nvme/target/tcp.c | 74 ++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 17 deletions(-)

Comments

Sagi Grimberg Aug. 13, 2023, 2:01 p.m. UTC | #1
On 8/11/23 15:17, Hannes Reinecke wrote:
> The 'queue' structure is referenced from various places and
> used as an argument of asynchronous functions, so it's really
> hard to figure out if the queue is still valid when the
> asynchronous function triggers.
> So add reference counting to validate the queue structure.
> 
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> ---
>   drivers/nvme/target/tcp.c | 74 ++++++++++++++++++++++++++++++---------
>   1 file changed, 57 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index f19ea9d923fd..84b726dfc1c4 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -127,6 +127,7 @@ enum nvmet_tcp_queue_state {
>   };
>   
>   struct nvmet_tcp_queue {
> +	struct kref		kref;
>   	struct socket		*sock;
>   	struct nvmet_tcp_port	*port;
>   	struct work_struct	io_work;
> @@ -192,6 +193,9 @@ static struct workqueue_struct *nvmet_tcp_wq;
>   static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>   static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>   static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue);
> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue);
> +static void nvmet_tcp_data_ready(struct sock *sk);
>   
>   static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>   		struct nvmet_tcp_cmd *cmd)
> @@ -1437,11 +1441,21 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
>   	struct socket *sock = queue->sock;
>   
>   	write_lock_bh(&sock->sk->sk_callback_lock);
> +	/*
> +	 * Check if nvmet_tcp_set_queue_sock() has been called;
> +	 * if not, the queue reference has not been increased
> +	 * and we're getting a refcount error on exit.
> +	 */
> +	if (sock->sk->sk_data_ready != nvmet_tcp_data_ready) {
> +		write_unlock_bh(&sock->sk->sk_callback_lock);
> +		return;
> +	}

This is really ugly I think.

>   	sock->sk->sk_data_ready =  queue->data_ready;
>   	sock->sk->sk_state_change = queue->state_change;
>   	sock->sk->sk_write_space = queue->write_space;
>   	sock->sk->sk_user_data = NULL;
>   	write_unlock_bh(&sock->sk->sk_callback_lock);
> +	nvmet_tcp_put_queue(queue);
>   }
>   
>   static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
> @@ -1474,6 +1488,30 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
>   		nvmet_tcp_free_cmd_buffers(&queue->connect);
>   }
>   
> +static void nvmet_tcp_release_queue_final(struct kref *kref)
> +{
> +	struct nvmet_tcp_queue *queue = container_of(kref, struct nvmet_tcp_queue, kref);
> +
> +	WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
> +	nvmet_tcp_free_cmds(queue);
> +	ida_free(&nvmet_tcp_queue_ida, queue->idx);
> +	/* ->sock will be released by fput() */
> +	fput(queue->sock->file);
> +	kfree(queue);
> +}
> +
> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue)
> +{
> +	if (!queue)
> +		return 0;
> +	return kref_get_unless_zero(&queue->kref);
> +}
> +
> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue)
> +{
> +	kref_put(&queue->kref, nvmet_tcp_release_queue_final);
> +}
> +
>   static void nvmet_tcp_release_queue_work(struct work_struct *w)
>   {
>   	struct page *page;
> @@ -1493,15 +1531,11 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>   	nvmet_sq_destroy(&queue->nvme_sq);
>   	cancel_work_sync(&queue->io_work);
>   	nvmet_tcp_free_cmd_data_in_buffers(queue);
> -	/* ->sock will be released by fput() */
> -	fput(queue->sock->file);
> -	nvmet_tcp_free_cmds(queue);
>   	if (queue->hdr_digest || queue->data_digest)
>   		nvmet_tcp_free_crypto(queue);
> -	ida_free(&nvmet_tcp_queue_ida, queue->idx);
>   	page = virt_to_head_page(queue->pf_cache.va);
>   	__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
> -	kfree(queue);
> +	nvmet_tcp_put_queue(queue);

What made you pick these vs. the others for before/after the
final reference?

>   }
>   
>   static void nvmet_tcp_data_ready(struct sock *sk)
> @@ -1512,8 +1546,10 @@ static void nvmet_tcp_data_ready(struct sock *sk)
>   
>   	read_lock_bh(&sk->sk_callback_lock);
>   	queue = sk->sk_user_data;
> -	if (likely(queue))
> +	if (likely(nvmet_tcp_get_queue(queue))) {
>   		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
> +		nvmet_tcp_put_queue(queue);
> +	}

No... Why?

The shutdown code should serialize perfectly without this. Why add
a kref to the normal I/O path?

I thought we'd simply move release_work to do a kref_put and take
an extra reference when we fire the tls handshake...
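
Roughly like this (a sketch only: nvmet_tcp_tls_handshake() and
nvmet_tcp_tls_handshake_done() stand in for whatever a later patch in
this series actually adds, tls_server_hello_psk() is the net/handshake
API, and keyring/timeout setup is omitted):

static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue)
{
	struct tls_handshake_args args = {
		.ta_sock	= queue->sock,
		.ta_done	= nvmet_tcp_tls_handshake_done,
		.ta_data	= queue,
	};
	int ret;

	/* extra reference for the duration of the handshake */
	kref_get(&queue->kref);
	ret = tls_server_hello_psk(&args, GFP_KERNEL);
	if (ret) {
		/* handshake never started, drop the reference again */
		kref_put(&queue->kref, nvmet_tcp_release_queue_final);
	}
	return ret;
}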

>   	read_unlock_bh(&sk->sk_callback_lock);
>   }
>   
> @@ -1523,18 +1559,16 @@ static void nvmet_tcp_write_space(struct sock *sk)
>   
>   	read_lock_bh(&sk->sk_callback_lock);
>   	queue = sk->sk_user_data;
> -	if (unlikely(!queue))
> +	if (unlikely(!nvmet_tcp_get_queue(queue)))
>   		goto out;
>   
>   	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
>   		queue->write_space(sk);
> -		goto out;
> -	}
> -
> -	if (sk_stream_is_writeable(sk)) {
> +	} else if (sk_stream_is_writeable(sk)) {
>   		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
>   		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
>   	}
> +	nvmet_tcp_put_queue(queue);

Same here...

Why not simply exclude the kref get/put to the tls handshake where
there is actually a race?

>   out:
>   	read_unlock_bh(&sk->sk_callback_lock);
>   }
> @@ -1545,7 +1579,7 @@ static void nvmet_tcp_state_change(struct sock *sk)
>   
>   	read_lock_bh(&sk->sk_callback_lock);
>   	queue = sk->sk_user_data;
> -	if (!queue)
> +	if (!nvmet_tcp_get_queue(queue))
>   		goto done;
>   
>   	switch (sk->sk_state) {
> @@ -1562,6 +1596,7 @@ static void nvmet_tcp_state_change(struct sock *sk)
>   		pr_warn("queue %d unhandled state %d\n",
>   			queue->idx, sk->sk_state);
>   	}
> +	nvmet_tcp_put_queue(queue);
>   done:
>   	read_unlock_bh(&sk->sk_callback_lock);
>   }
> @@ -1582,6 +1617,9 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   	if (ret < 0)
>   		return ret;
>   
> +	if (unlikely(!nvmet_tcp_get_queue(queue)))
> +		return -ECONNRESET;
> +

Can this actually fail?

>   	/*
>   	 * Cleanup whatever is sitting in the TCP transmit queue on socket
>   	 * close. This is done to prevent stale data from being sent should
> @@ -1603,6 +1641,7 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   		 * If the socket is already closing, don't even start
>   		 * consuming it
>   		 */
> +		nvmet_tcp_put_queue(queue);
>   		ret = -ENOTCONN;
>   	} else {
>   		sock->sk->sk_user_data = queue;
> @@ -1636,6 +1675,7 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>   
>   	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
>   	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
> +	kref_init(&queue->kref);
>   	queue->sock = newsock;
>   	queue->port = port;
>   	queue->nr_cmds = 0;
> @@ -1672,15 +1712,15 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>   	mutex_unlock(&nvmet_tcp_queue_mutex);
>   
>   	ret = nvmet_tcp_set_queue_sock(queue);
> -	if (ret)
> -		goto out_destroy_sq;
> +	if (!ret)
> +		return;
>   
> -	return;
> -out_destroy_sq:
> +	queue->state = NVMET_TCP_Q_DISCONNECTING;
>   	mutex_lock(&nvmet_tcp_queue_mutex);
>   	list_del_init(&queue->queue_list);
>   	mutex_unlock(&nvmet_tcp_queue_mutex);
> -	nvmet_sq_destroy(&queue->nvme_sq);
> +	nvmet_tcp_put_queue(queue);
> +	return;
>   out_free_connect:
>   	nvmet_tcp_free_cmd(&queue->connect);
>   out_ida_remove:
Hannes Reinecke Aug. 13, 2023, 2:38 p.m. UTC | #2
On 8/13/23 16:01, Sagi Grimberg wrote:
> 
> 
> On 8/11/23 15:17, Hannes Reinecke wrote:
>> The 'queue' structure is referenced from various places and
>> used as an argument of asynchronous functions, so it's really
>> hard to figure out if the queue is still valid when the
>> asynchronous function triggers.
>> So add reference counting to validate the queue structure.
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>> ---
>>   drivers/nvme/target/tcp.c | 74 ++++++++++++++++++++++++++++++---------
>>   1 file changed, 57 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>> index f19ea9d923fd..84b726dfc1c4 100644
>> --- a/drivers/nvme/target/tcp.c
>> +++ b/drivers/nvme/target/tcp.c
>> @@ -127,6 +127,7 @@ enum nvmet_tcp_queue_state {
>>   };
>>   struct nvmet_tcp_queue {
>> +    struct kref        kref;
>>       struct socket        *sock;
>>       struct nvmet_tcp_port    *port;
>>       struct work_struct    io_work;
>> @@ -192,6 +193,9 @@ static struct workqueue_struct *nvmet_tcp_wq;
>>   static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>>   static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>>   static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue);
>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue);
>> +static void nvmet_tcp_data_ready(struct sock *sk);
>>   static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>>           struct nvmet_tcp_cmd *cmd)
>> @@ -1437,11 +1441,21 @@ static void 
>> nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
>>       struct socket *sock = queue->sock;
>>       write_lock_bh(&sock->sk->sk_callback_lock);
>> +    /*
>> +     * Check if nvmet_tcp_set_queue_sock() has been called;
>> +     * if not, the queue reference has not been increased
>> +     * and we're getting a refcount error on exit.
>> +     */
>> +    if (sock->sk->sk_data_ready != nvmet_tcp_data_ready) {
>> +        write_unlock_bh(&sock->sk->sk_callback_lock);
>> +        return;
>> +    }
> 
> This is really ugly I think.
> 
Me too, but what would be the alternative?

>>       sock->sk->sk_data_ready =  queue->data_ready;
>>       sock->sk->sk_state_change = queue->state_change;
>>       sock->sk->sk_write_space = queue->write_space;
>>       sock->sk->sk_user_data = NULL;
>>       write_unlock_bh(&sock->sk->sk_callback_lock);
>> +    nvmet_tcp_put_queue(queue);
>>   }
>>   static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue 
>> *queue)
>> @@ -1474,6 +1488,30 @@ static void 
>> nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
>>           nvmet_tcp_free_cmd_buffers(&queue->connect);
>>   }
>> +static void nvmet_tcp_release_queue_final(struct kref *kref)
>> +{
>> +    struct nvmet_tcp_queue *queue = container_of(kref, struct 
>> nvmet_tcp_queue, kref);
>> +
>> +    WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
>> +    nvmet_tcp_free_cmds(queue);
>> +    ida_free(&nvmet_tcp_queue_ida, queue->idx);
>> +    /* ->sock will be released by fput() */
>> +    fput(queue->sock->file);
>> +    kfree(queue);
>> +}
>> +
>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue)
>> +{
>> +    if (!queue)
>> +        return 0;
>> +    return kref_get_unless_zero(&queue->kref);
>> +}
>> +
>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue)
>> +{
>> +    kref_put(&queue->kref, nvmet_tcp_release_queue_final);
>> +}
>> +
>>   static void nvmet_tcp_release_queue_work(struct work_struct *w)
>>   {
>>       struct page *page;
>> @@ -1493,15 +1531,11 @@ static void 
>> nvmet_tcp_release_queue_work(struct work_struct *w)
>>       nvmet_sq_destroy(&queue->nvme_sq);
>>       cancel_work_sync(&queue->io_work);
>>       nvmet_tcp_free_cmd_data_in_buffers(queue);
>> -    /* ->sock will be released by fput() */
>> -    fput(queue->sock->file);
>> -    nvmet_tcp_free_cmds(queue);
>>       if (queue->hdr_digest || queue->data_digest)
>>           nvmet_tcp_free_crypto(queue);
>> -    ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>       page = virt_to_head_page(queue->pf_cache.va);
>>       __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
>> -    kfree(queue);
>> +    nvmet_tcp_put_queue(queue);
> 
> What made you pick these vs. the others for before/after the
> final reference?
> 
I wanted to call 'nvmet_tcp_put_queue()' for a failed allocation
in nvmet_tcp_alloc_queue(), and at that time the queue itself
is not live.
nvmet_tcp_release_queue() is only called on a live queue, so using
that as the kref ->release() function would either limit its
usefulness or would require me to ensure that all calls in there
can be made with a non-initialized argument.

>>   }
>>   static void nvmet_tcp_data_ready(struct sock *sk)
>> @@ -1512,8 +1546,10 @@ static void nvmet_tcp_data_ready(struct sock *sk)
>>       read_lock_bh(&sk->sk_callback_lock);
>>       queue = sk->sk_user_data;
>> -    if (likely(queue))
>> +    if (likely(nvmet_tcp_get_queue(queue))) {
>>           queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
>> +        nvmet_tcp_put_queue(queue);
>> +    }
> 
> No... Why?
> 
> The shutdown code should serialize perfectly without this. Why add
> a kref to the normal I/O path?
> 
> I thought we'd simply move release_work to do a kref_put and take
> an extra reference when we fire the tls handshake...
> 
Because I feel ever so slightly unsure about using the sk_user_data
argument. This function is completely asynchronous, and I can't really
see how we can ensure that sk_user_data references a valid object.
(If it were valid, why would we need to check for !queue?)

If you have lifetime guarantees that the kref isn't required, by all
means, please tell me, and we can drop the kref thing here.
But I guess that would imply _not_ having to check for (!queue),
which is fine by me, too.
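
For reference, the lifetime guarantee would have to be the usual
sk_callback_lock pattern, sketched below; whether the shutdown path
upholds it on every exit is exactly what I'm unsure about:

	/* teardown side, cf. nvmet_tcp_restore_socket_callbacks() */
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_user_data = NULL;	/* callbacks see NULL from here on */
	write_unlock_bh(&sk->sk_callback_lock);
	/* only after this point may the queue itself be freed */

	/* callback side, cf. nvmet_tcp_data_ready() */
	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;	/* NULL once teardown has run */
	if (queue)
		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
	read_unlock_bh(&sk->sk_callback_lock);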

>>       read_unlock_bh(&sk->sk_callback_lock);
>>   }
>> @@ -1523,18 +1559,16 @@ static void nvmet_tcp_write_space(struct sock 
>> *sk)
>>       read_lock_bh(&sk->sk_callback_lock);
>>       queue = sk->sk_user_data;
>> -    if (unlikely(!queue))
>> +    if (unlikely(!nvmet_tcp_get_queue(queue)))
>>           goto out;
>>       if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
>>           queue->write_space(sk);
>> -        goto out;
>> -    }
>> -
>> -    if (sk_stream_is_writeable(sk)) {
>> +    } else if (sk_stream_is_writeable(sk)) {
>>           clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
>>           queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
>>       }
>> +    nvmet_tcp_put_queue(queue);
> 
> Same here...
> 
> Why not simply exclude the kref get/put to the tls handshake where
> there is actually a race?
> 
It's not about the handshake, it's about socket closing.
See above.

>>   out:
>>       read_unlock_bh(&sk->sk_callback_lock);
>>   }
>> @@ -1545,7 +1579,7 @@ static void nvmet_tcp_state_change(struct sock *sk)
>>       read_lock_bh(&sk->sk_callback_lock);
>>       queue = sk->sk_user_data;
>> -    if (!queue)
>> +    if (!nvmet_tcp_get_queue(queue))
>>           goto done;
>>       switch (sk->sk_state) {
>> @@ -1562,6 +1596,7 @@ static void nvmet_tcp_state_change(struct sock *sk)
>>           pr_warn("queue %d unhandled state %d\n",
>>               queue->idx, sk->sk_state);
>>       }
>> +    nvmet_tcp_put_queue(queue);
>>   done:
>>       read_unlock_bh(&sk->sk_callback_lock);
>>   }
>> @@ -1582,6 +1617,9 @@ static int nvmet_tcp_set_queue_sock(struct 
>> nvmet_tcp_queue *queue)
>>       if (ret < 0)
>>           return ret;
>> +    if (unlikely(!nvmet_tcp_get_queue(queue)))
>> +        return -ECONNRESET;
>> +
> 
> Can this actually fail?
> 
Yes. The socket might have been closed before this call.
Unlikely, I know, but still...

>>       /*
>>        * Cleanup whatever is sitting in the TCP transmit queue on socket
>>        * close. This is done to prevent stale data from being sent should
>> @@ -1603,6 +1641,7 @@ static int nvmet_tcp_set_queue_sock(struct 
>> nvmet_tcp_queue *queue)
>>            * If the socket is already closing, don't even start
>>            * consuming it
>>            */
>> +        nvmet_tcp_put_queue(queue);
>>           ret = -ENOTCONN;
>>       } else {
>>           sock->sk->sk_user_data = queue;
>> @@ -1636,6 +1675,7 @@ static void nvmet_tcp_alloc_queue(struct 
>> nvmet_tcp_port *port,
>>       INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
>>       INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
>> +    kref_init(&queue->kref);
>>       queue->sock = newsock;
>>       queue->port = port;
>>       queue->nr_cmds = 0;
>> @@ -1672,15 +1712,15 @@ static void nvmet_tcp_alloc_queue(struct 
>> nvmet_tcp_port *port,
>>       mutex_unlock(&nvmet_tcp_queue_mutex);
>>       ret = nvmet_tcp_set_queue_sock(queue);
>> -    if (ret)
>> -        goto out_destroy_sq;
>> +    if (!ret)
>> +        return;
>> -    return;
>> -out_destroy_sq:
>> +    queue->state = NVMET_TCP_Q_DISCONNECTING;
>>       mutex_lock(&nvmet_tcp_queue_mutex);
>>       list_del_init(&queue->queue_list);
>>       mutex_unlock(&nvmet_tcp_queue_mutex);
>> -    nvmet_sq_destroy(&queue->nvme_sq);
>> +    nvmet_tcp_put_queue(queue);
>> +    return;
>>   out_free_connect:
>>       nvmet_tcp_free_cmd(&queue->connect);
>>   out_ida_remove:

But thanks for the review!

Cheers,

Hannes
Sagi Grimberg Aug. 14, 2023, 7:32 a.m. UTC | #3
On 8/13/23 17:38, Hannes Reinecke wrote:
> On 8/13/23 16:01, Sagi Grimberg wrote:
>>
>>
>> On 8/11/23 15:17, Hannes Reinecke wrote:
>>> The 'queue' structure is referenced from various places and
>>> used as an argument of asynchronous functions, so it's really
>>> hard to figure out if the queue is still valid when the
>>> asynchronous function triggers.
>>> So add reference counting to validate the queue structure.
>>>
>>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>>> ---
>>>   drivers/nvme/target/tcp.c | 74 ++++++++++++++++++++++++++++++---------
>>>   1 file changed, 57 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>>> index f19ea9d923fd..84b726dfc1c4 100644
>>> --- a/drivers/nvme/target/tcp.c
>>> +++ b/drivers/nvme/target/tcp.c
>>> @@ -127,6 +127,7 @@ enum nvmet_tcp_queue_state {
>>>   };
>>>   struct nvmet_tcp_queue {
>>> +    struct kref        kref;
>>>       struct socket        *sock;
>>>       struct nvmet_tcp_port    *port;
>>>       struct work_struct    io_work;
>>> @@ -192,6 +193,9 @@ static struct workqueue_struct *nvmet_tcp_wq;
>>>   static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>>>   static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>>>   static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
>>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue);
>>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue);
>>> +static void nvmet_tcp_data_ready(struct sock *sk);
>>>   static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>>>           struct nvmet_tcp_cmd *cmd)
>>> @@ -1437,11 +1441,21 @@ static void 
>>> nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
>>>       struct socket *sock = queue->sock;
>>>       write_lock_bh(&sock->sk->sk_callback_lock);
>>> +    /*
>>> +     * Check if nvmet_tcp_set_queue_sock() has been called;
>>> +     * if not, the queue reference has not been increased
>>> +     * and we're getting a refcount error on exit.
>>> +     */
>>> +    if (sock->sk->sk_data_ready != nvmet_tcp_data_ready) {
>>> +        write_unlock_bh(&sock->sk->sk_callback_lock);
>>> +        return;
>>> +    }
>>
>> This is really ugly I think.
>>
> Me too, but what would be the alternative?
> 
>>>       sock->sk->sk_data_ready =  queue->data_ready;
>>>       sock->sk->sk_state_change = queue->state_change;
>>>       sock->sk->sk_write_space = queue->write_space;
>>>       sock->sk->sk_user_data = NULL;
>>>       write_unlock_bh(&sock->sk->sk_callback_lock);
>>> +    nvmet_tcp_put_queue(queue);
>>>   }
>>>   static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue 
>>> *queue)
>>> @@ -1474,6 +1488,30 @@ static void 
>>> nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
>>>           nvmet_tcp_free_cmd_buffers(&queue->connect);
>>>   }
>>> +static void nvmet_tcp_release_queue_final(struct kref *kref)
>>> +{
>>> +    struct nvmet_tcp_queue *queue = container_of(kref, struct 
>>> nvmet_tcp_queue, kref);
>>> +
>>> +    WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
>>> +    nvmet_tcp_free_cmds(queue);
>>> +    ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>> +    /* ->sock will be released by fput() */
>>> +    fput(queue->sock->file);
>>> +    kfree(queue);
>>> +}
>>> +
>>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    if (!queue)
>>> +        return 0;
>>> +    return kref_get_unless_zero(&queue->kref);
>>> +}
>>> +
>>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    kref_put(&queue->kref, nvmet_tcp_release_queue_final);
>>> +}
>>> +
>>>   static void nvmet_tcp_release_queue_work(struct work_struct *w)
>>>   {
>>>       struct page *page;
>>> @@ -1493,15 +1531,11 @@ static void 
>>> nvmet_tcp_release_queue_work(struct work_struct *w)
>>>       nvmet_sq_destroy(&queue->nvme_sq);
>>>       cancel_work_sync(&queue->io_work);
>>>       nvmet_tcp_free_cmd_data_in_buffers(queue);
>>> -    /* ->sock will be released by fput() */
>>> -    fput(queue->sock->file);
>>> -    nvmet_tcp_free_cmds(queue);
>>>       if (queue->hdr_digest || queue->data_digest)
>>>           nvmet_tcp_free_crypto(queue);
>>> -    ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>>       page = virt_to_head_page(queue->pf_cache.va);
>>>       __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
>>> -    kfree(queue);
>>> +    nvmet_tcp_put_queue(queue);
>>
>> What made you pick these vs. the others for before/after the
>> final reference?
>>
> I wanted to call 'nvmet_tcp_put_queue()' for a failed allocation
> in nvmet_tcp_alloc_queue(), and at that time the queue itself
> is not live.
> nvmet_tcp_release_queue() is only called on a live queue, so using
> that as the kref ->release() function would either limit its
> usefulness or would require me to ensure that all calls in there
> can be made with a non-initialized argument.
> 
>>>   }
>>>   static void nvmet_tcp_data_ready(struct sock *sk)
>>> @@ -1512,8 +1546,10 @@ static void nvmet_tcp_data_ready(struct sock *sk)
>>>       read_lock_bh(&sk->sk_callback_lock);
>>>       queue = sk->sk_user_data;
>>> -    if (likely(queue))
>>> +    if (likely(nvmet_tcp_get_queue(queue))) {
>>>           queue_work_on(queue_cpu(queue), nvmet_tcp_wq, 
>>> &queue->io_work);
>>> +        nvmet_tcp_put_queue(queue);
>>> +    }
>>
>> No... Why?
>>
>> The shutdown code should serialize perfectly without this. Why add
>> a kref to the normal I/O path?
>>
>> I thought we'd simply move release_work to do a kref_put and take
>> an extra reference when we fire the tls handshake...
>>
> Because I feel ever so slightly unsure about using the sk_user_data
> argument. This function is completely asynchronous, and I can't really
> see how we can ensure that sk_user_data references a valid object.
> (If it were valid, why would we need to check for !queue?)
> 
> If you have lifetime guarantees that the kref isn't required, by all
> means, please tell me, and we can drop the kref thing here.
> But I guess that would imply _not_ having to check for (!queue),
> which is fine by me, too.

Something doesn't add up here.
What I think you need to do is add another reference just for the
tls handshake.

Then in the timeout handler you call tls_handshake_cancel():
- if you got %true back, you drop the reference and schedule
a release work
- if you got %false back, you simply ignore the timeout because
the .ta_done() was already triggered.
- in .ta_done() you drop the reference, cancel the timeout work
   and then continue or remove based on the status.

btw in the queue release you should call tls_handshake_cancel() as well.
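
A sketch of that scheme (hypothetical names: the delayed work
tls_handshake_tmo_work and both helpers are placeholders, and error
handling is abbreviated; tls_handshake_cancel() is the net/handshake
API):

static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue = container_of(to_delayed_work(w),
			struct nvmet_tcp_queue, tls_handshake_tmo_work);

	/*
	 * %false means ->ta_done() already ran (or is about to run)
	 * and owns the handshake reference, so ignore the timeout.
	 */
	if (!tls_handshake_cancel(queue->sock->sk))
		return;
	nvmet_tcp_schedule_release_queue(queue);
	nvmet_tcp_put_queue(queue);	/* drop the handshake reference */
}

static void nvmet_tcp_tls_handshake_done(void *data, int status,
					 key_serial_t peerid)
{
	struct nvmet_tcp_queue *queue = data;

	cancel_delayed_work(&queue->tls_handshake_tmo_work);
	if (status)
		nvmet_tcp_schedule_release_queue(queue);
	else
		nvmet_tcp_set_queue_sock(queue);
	nvmet_tcp_put_queue(queue);	/* drop the handshake reference */
}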

Patch

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index f19ea9d923fd..84b726dfc1c4 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -127,6 +127,7 @@  enum nvmet_tcp_queue_state {
 };
 
 struct nvmet_tcp_queue {
+	struct kref		kref;
 	struct socket		*sock;
 	struct nvmet_tcp_port	*port;
 	struct work_struct	io_work;
@@ -192,6 +193,9 @@  static struct workqueue_struct *nvmet_tcp_wq;
 static const struct nvmet_fabrics_ops nvmet_tcp_ops;
 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
+static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue);
+static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue);
+static void nvmet_tcp_data_ready(struct sock *sk);
 
 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
 		struct nvmet_tcp_cmd *cmd)
@@ -1437,11 +1441,21 @@  static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
 	struct socket *sock = queue->sock;
 
 	write_lock_bh(&sock->sk->sk_callback_lock);
+	/*
+	 * Check if nvmet_tcp_set_queue_sock() has been called;
+	 * if not, the queue reference has not been increased
+	 * and we're getting a refcount error on exit.
+	 */
+	if (sock->sk->sk_data_ready != nvmet_tcp_data_ready) {
+		write_unlock_bh(&sock->sk->sk_callback_lock);
+		return;
+	}
 	sock->sk->sk_data_ready =  queue->data_ready;
 	sock->sk->sk_state_change = queue->state_change;
 	sock->sk->sk_write_space = queue->write_space;
 	sock->sk->sk_user_data = NULL;
 	write_unlock_bh(&sock->sk->sk_callback_lock);
+	nvmet_tcp_put_queue(queue);
 }
 
 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
@@ -1474,6 +1488,30 @@  static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
 		nvmet_tcp_free_cmd_buffers(&queue->connect);
 }
 
+static void nvmet_tcp_release_queue_final(struct kref *kref)
+{
+	struct nvmet_tcp_queue *queue = container_of(kref, struct nvmet_tcp_queue, kref);
+
+	WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
+	nvmet_tcp_free_cmds(queue);
+	ida_free(&nvmet_tcp_queue_ida, queue->idx);
+	/* ->sock will be released by fput() */
+	fput(queue->sock->file);
+	kfree(queue);
+}
+
+static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue)
+{
+	if (!queue)
+		return 0;
+	return kref_get_unless_zero(&queue->kref);
+}
+
+static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue)
+{
+	kref_put(&queue->kref, nvmet_tcp_release_queue_final);
+}
+
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
 	struct page *page;
@@ -1493,15 +1531,11 @@  static void nvmet_tcp_release_queue_work(struct work_struct *w)
 	nvmet_sq_destroy(&queue->nvme_sq);
 	cancel_work_sync(&queue->io_work);
 	nvmet_tcp_free_cmd_data_in_buffers(queue);
-	/* ->sock will be released by fput() */
-	fput(queue->sock->file);
-	nvmet_tcp_free_cmds(queue);
 	if (queue->hdr_digest || queue->data_digest)
 		nvmet_tcp_free_crypto(queue);
-	ida_free(&nvmet_tcp_queue_ida, queue->idx);
 	page = virt_to_head_page(queue->pf_cache.va);
 	__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-	kfree(queue);
+	nvmet_tcp_put_queue(queue);
 }
 
 static void nvmet_tcp_data_ready(struct sock *sk)
@@ -1512,8 +1546,10 @@  static void nvmet_tcp_data_ready(struct sock *sk)
 
 	read_lock_bh(&sk->sk_callback_lock);
 	queue = sk->sk_user_data;
-	if (likely(queue))
+	if (likely(nvmet_tcp_get_queue(queue))) {
 		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		nvmet_tcp_put_queue(queue);
+	}
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -1523,18 +1559,16 @@  static void nvmet_tcp_write_space(struct sock *sk)
 
 	read_lock_bh(&sk->sk_callback_lock);
 	queue = sk->sk_user_data;
-	if (unlikely(!queue))
+	if (unlikely(!nvmet_tcp_get_queue(queue)))
 		goto out;
 
 	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
 		queue->write_space(sk);
-		goto out;
-	}
-
-	if (sk_stream_is_writeable(sk)) {
+	} else if (sk_stream_is_writeable(sk)) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
 	}
+	nvmet_tcp_put_queue(queue);
 out:
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1545,7 +1579,7 @@  static void nvmet_tcp_state_change(struct sock *sk)
 
 	read_lock_bh(&sk->sk_callback_lock);
 	queue = sk->sk_user_data;
-	if (!queue)
+	if (!nvmet_tcp_get_queue(queue))
 		goto done;
 
 	switch (sk->sk_state) {
@@ -1562,6 +1596,7 @@  static void nvmet_tcp_state_change(struct sock *sk)
 		pr_warn("queue %d unhandled state %d\n",
 			queue->idx, sk->sk_state);
 	}
+	nvmet_tcp_put_queue(queue);
 done:
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1582,6 +1617,9 @@  static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	if (ret < 0)
 		return ret;
 
+	if (unlikely(!nvmet_tcp_get_queue(queue)))
+		return -ECONNRESET;
+
 	/*
 	 * Cleanup whatever is sitting in the TCP transmit queue on socket
 	 * close. This is done to prevent stale data from being sent should
@@ -1603,6 +1641,7 @@  static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 		 * If the socket is already closing, don't even start
 		 * consuming it
 		 */
+		nvmet_tcp_put_queue(queue);
 		ret = -ENOTCONN;
 	} else {
 		sock->sk->sk_user_data = queue;
@@ -1636,6 +1675,7 @@  static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 
 	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
 	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
+	kref_init(&queue->kref);
 	queue->sock = newsock;
 	queue->port = port;
 	queue->nr_cmds = 0;
@@ -1672,15 +1712,15 @@  static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	mutex_unlock(&nvmet_tcp_queue_mutex);
 
 	ret = nvmet_tcp_set_queue_sock(queue);
-	if (ret)
-		goto out_destroy_sq;
+	if (!ret)
+		return;
 
-	return;
-out_destroy_sq:
+	queue->state = NVMET_TCP_Q_DISCONNECTING;
 	mutex_lock(&nvmet_tcp_queue_mutex);
 	list_del_init(&queue->queue_list);
 	mutex_unlock(&nvmet_tcp_queue_mutex);
-	nvmet_sq_destroy(&queue->nvme_sq);
+	nvmet_tcp_put_queue(queue);
+	return;
 out_free_connect:
 	nvmet_tcp_free_cmd(&queue->connect);
 out_ida_remove: