diff mbox series

[v5,2/6] nbd: make sure request completion won't concurrent

Message ID 20210909141256.2606682-3-yukuai3@huawei.com (mailing list archive)
State New, archived
Headers show
Series handle unexpected message from server | expand

Commit Message

Yu Kuai Sept. 9, 2021, 2:12 p.m. UTC
Commit cddce0116058 ("nbd: Aovid double completion of a request")
tried to fix the problem that nbd_clear_que() and recv_work() can
complete a request concurrently. However, the problem still exists:

t1                    t2                     t3

nbd_disconnect_and_put
 flush_workqueue
                      recv_work
                       blk_mq_complete_request
                        blk_mq_complete_request_remote -> this is true
                         WRITE_ONCE(rq->state, MQ_RQ_COMPLETE)
                          blk_mq_raise_softirq
                                             blk_done_softirq
                                              blk_complete_reqs
                                               nbd_complete_rq
                                                blk_mq_end_request
                                                 blk_mq_free_request
                                                  WRITE_ONCE(rq->state, MQ_RQ_IDLE)
  nbd_clear_que
   blk_mq_tagset_busy_iter
    nbd_clear_req
                                                   __blk_mq_free_request
                                                    blk_mq_put_tag
     blk_mq_complete_request -> complete again

There are three places where a request can be completed in nbd:
recv_work(), nbd_clear_que() and nbd_xmit_timeout(). Since they
all hold cmd->lock before completing the request, it's easy to
avoid the problem by setting and checking a cmd flag.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/block/nbd.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

Comments

Ming Lei Sept. 14, 2021, 12:57 a.m. UTC | #1
On Thu, Sep 09, 2021 at 10:12:52PM +0800, Yu Kuai wrote:
> commit cddce0116058 ("nbd: Aovid double completion of a request")
> try to fix that nbd_clear_que() and recv_work() can complete a
> request concurrently. However, the problem still exists:
> 
> t1                    t2                     t3
> 
> nbd_disconnect_and_put
>  flush_workqueue
>                       recv_work
>                        blk_mq_complete_request
>                         blk_mq_complete_request_remote -> this is true
>                          WRITE_ONCE(rq->state, MQ_RQ_COMPLETE)
>                           blk_mq_raise_softirq
>                                              blk_done_softirq
>                                               blk_complete_reqs
>                                                nbd_complete_rq
>                                                 blk_mq_end_request
>                                                  blk_mq_free_request
>                                                   WRITE_ONCE(rq->state, MQ_RQ_IDLE)
>   nbd_clear_que
>    blk_mq_tagset_busy_iter
>     nbd_clear_req
>                                                    __blk_mq_free_request
>                                                     blk_mq_put_tag
>      blk_mq_complete_request -> complete again
> 
> There are three places where request can be completed in nbd:
> recv_work(), nbd_clear_que() and nbd_xmit_timeout(). Since they
> all hold cmd->lock before completing the request, it's easy to
> avoid the problem by setting and checking a cmd flag.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/block/nbd.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index 04861b585b62..550c8dc438ac 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -406,7 +406,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
>  	if (!mutex_trylock(&cmd->lock))
>  		return BLK_EH_RESET_TIMER;
>  
> -	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
> +	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
> +		mutex_unlock(&cmd->lock);
> +		return BLK_EH_DONE;
> +	}
> +
>  	if (!refcount_inc_not_zero(&nbd->config_refs)) {
>  		cmd->status = BLK_STS_TIMEOUT;
>  		mutex_unlock(&cmd->lock);
> @@ -842,7 +846,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved)
>  
>  	mutex_lock(&cmd->lock);
>  	cmd->status = BLK_STS_IOERR;
> -	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
> +	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
> +		mutex_unlock(&cmd->lock);
> +		return true;
> +	}
>  	mutex_unlock(&cmd->lock);

If this request has already been completed by another code path, ->status
shouldn't be updated here; it may have completed successfully.
Yu Kuai Sept. 14, 2021, 3:11 a.m. UTC | #2
On 2021/09/14 8:57, Ming Lei wrote:
> On Thu, Sep 09, 2021 at 10:12:52PM +0800, Yu Kuai wrote:
>> commit cddce0116058 ("nbd: Aovid double completion of a request")
>> try to fix that nbd_clear_que() and recv_work() can complete a
>> request concurrently. However, the problem still exists:
>>
>> t1                    t2                     t3
>>
>> nbd_disconnect_and_put
>>   flush_workqueue
>>                        recv_work
>>                         blk_mq_complete_request
>>                          blk_mq_complete_request_remote -> this is true
>>                           WRITE_ONCE(rq->state, MQ_RQ_COMPLETE)
>>                            blk_mq_raise_softirq
>>                                               blk_done_softirq
>>                                                blk_complete_reqs
>>                                                 nbd_complete_rq
>>                                                  blk_mq_end_request
>>                                                   blk_mq_free_request
>>                                                    WRITE_ONCE(rq->state, MQ_RQ_IDLE)
>>    nbd_clear_que
>>     blk_mq_tagset_busy_iter
>>      nbd_clear_req
>>                                                     __blk_mq_free_request
>>                                                      blk_mq_put_tag
>>       blk_mq_complete_request -> complete again
>>
>> There are three places where request can be completed in nbd:
>> recv_work(), nbd_clear_que() and nbd_xmit_timeout(). Since they
>> all hold cmd->lock before completing the request, it's easy to
>> avoid the problem by setting and checking a cmd flag.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   drivers/block/nbd.c | 11 +++++++++--
>>   1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
>> index 04861b585b62..550c8dc438ac 100644
>> --- a/drivers/block/nbd.c
>> +++ b/drivers/block/nbd.c
>> @@ -406,7 +406,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
>>   	if (!mutex_trylock(&cmd->lock))
>>   		return BLK_EH_RESET_TIMER;
>>   
>> -	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
>> +	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
>> +		mutex_unlock(&cmd->lock);
>> +		return BLK_EH_DONE;
>> +	}
>> +
>>   	if (!refcount_inc_not_zero(&nbd->config_refs)) {
>>   		cmd->status = BLK_STS_TIMEOUT;
>>   		mutex_unlock(&cmd->lock);
>> @@ -842,7 +846,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved)
>>   
>>   	mutex_lock(&cmd->lock);
>>   	cmd->status = BLK_STS_IOERR;
>> -	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
>> +	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
>> +		mutex_unlock(&cmd->lock);
>> +		return true;
>> +	}
>>   	mutex_unlock(&cmd->lock);
> 
> If this request has completed from other code paths, ->status shouldn't be
> updated here, maybe it is done successfully.

Hi, Ming

I will change this in the next iteration.

Thanks,
Kuai
diff mbox series

Patch

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 04861b585b62..550c8dc438ac 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -406,7 +406,11 @@  static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 	if (!mutex_trylock(&cmd->lock))
 		return BLK_EH_RESET_TIMER;
 
-	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+		mutex_unlock(&cmd->lock);
+		return BLK_EH_DONE;
+	}
+
 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
 		cmd->status = BLK_STS_TIMEOUT;
 		mutex_unlock(&cmd->lock);
@@ -842,7 +846,10 @@  static bool nbd_clear_req(struct request *req, void *data, bool reserved)
 
 	mutex_lock(&cmd->lock);
 	cmd->status = BLK_STS_IOERR;
-	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+		mutex_unlock(&cmd->lock);
+		return true;
+	}
 	mutex_unlock(&cmd->lock);
 
 	blk_mq_complete_request(req);