[v2,4/8] blk-mq: fix blk_mq_quiesce_queue

Message ID 20170527142126.26079-5-ming.lei@redhat.com (mailing list archive)
State New, archived

Commit Message

Ming Lei May 27, 2017, 2:21 p.m. UTC
It is required that no dispatch can happen anymore once
blk_mq_quiesce_queue() returns, and we don't have such a
requirement for the queue-stopping APIs.

But blk_mq_quiesce_queue() still may not block/drain dispatch in the
following cases:

- direct issue or BLK_MQ_S_START_ON_RUN
- in theory, new RCU read-side critical sections may begin while
synchronize_rcu() is waiting and end after synchronize_rcu()
returns; dispatch may still happen during that period

So introduce the new flag QUEUE_FLAG_QUIESCED and check it inside the
RCU read-side critical sections to fix the above issues: once
synchronize_rcu() (or synchronize_srcu()) returns, every later
read-side critical section is guaranteed to observe the flag.
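
The ordering on the quiesce side, sketched with the calls this patch
uses (the queue_lock locking around the flag is elided):

	queue_flag_set(QUEUE_FLAG_QUIESCED, q);	/* mark the queue quiesced */
	synchronize_rcu();	/* wait out readers that may have missed the flag */
	/* from here on, every dispatch attempt observes the flag and bails out */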

This patch fixes a request use-after-free[1][2] during the canceling of
NVMe requests in nvme_dev_disable(), which can be triggered easily
during NVMe reset & remove tests.
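
For illustration, the driver-side pattern that this guarantee enables
(a minimal sketch with hypothetical q/set/data/cancel_rq_fn names, not
code from this series):

	blk_mq_quiesce_queue(q);	/* after return, no .queue_rq() is running */
	/* cancel in-flight requests without racing against dispatch */
	blk_mq_tagset_busy_iter(set, cancel_rq_fn, data);
	blk_mq_unquiesce_queue(q);	/* let requeued requests be dispatched again */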

[1] oops kernel log when CONFIG_BLK_DEV_INTEGRITY is on
[  103.412969] BUG: unable to handle kernel NULL pointer dereference at 000000000000000a
[  103.412980] IP: bio_integrity_advance+0x48/0xf0
[  103.412981] PGD 275a88067
[  103.412981] P4D 275a88067
[  103.412982] PUD 276c43067
[  103.412983] PMD 0
[  103.412984]
[  103.412986] Oops: 0000 [#1] SMP
[  103.412989] Modules linked in: vfat fat intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel crypto_simd cryptd ipmi_ssif iTCO_wdt iTCO_vendor_support mxm_wmi glue_helper dcdbas ipmi_si mei_me pcspkr mei sg ipmi_devintf lpc_ich ipmi_msghandler shpchp acpi_power_meter wmi nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel nvme ahci nvme_core libahci libata tg3 i2c_core megaraid_sas ptp pps_core dm_mirror dm_region_hash dm_log dm_mod
[  103.413035] CPU: 0 PID: 102 Comm: kworker/0:2 Not tainted 4.11.0+ #1
[  103.413036] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.2.5 09/06/2016
[  103.413041] Workqueue: events nvme_remove_dead_ctrl_work [nvme]
[  103.413043] task: ffff9cc8775c8000 task.stack: ffffc033c252c000
[  103.413045] RIP: 0010:bio_integrity_advance+0x48/0xf0
[  103.413046] RSP: 0018:ffffc033c252fc10 EFLAGS: 00010202
[  103.413048] RAX: 0000000000000000 RBX: ffff9cc8720a8cc0 RCX: ffff9cca72958240
[  103.413049] RDX: ffff9cca72958000 RSI: 0000000000000008 RDI: ffff9cc872537f00
[  103.413049] RBP: ffffc033c252fc28 R08: 0000000000000000 R09: ffffffffb963a0d5
[  103.413050] R10: 000000000000063e R11: 0000000000000000 R12: ffff9cc8720a8d18
[  103.413051] R13: 0000000000001000 R14: ffff9cc872682e00 R15: 00000000fffffffb
[  103.413053] FS:  0000000000000000(0000) GS:ffff9cc877c00000(0000) knlGS:0000000000000000
[  103.413054] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  103.413055] CR2: 000000000000000a CR3: 0000000276c41000 CR4: 00000000001406f0
[  103.413056] Call Trace:
[  103.413063]  bio_advance+0x2a/0xe0
[  103.413067]  blk_update_request+0x76/0x330
[  103.413072]  blk_mq_end_request+0x1a/0x70
[  103.413074]  blk_mq_dispatch_rq_list+0x370/0x410
[  103.413076]  ? blk_mq_flush_busy_ctxs+0x94/0xe0
[  103.413080]  blk_mq_sched_dispatch_requests+0x173/0x1a0
[  103.413083]  __blk_mq_run_hw_queue+0x8e/0xa0
[  103.413085]  __blk_mq_delay_run_hw_queue+0x9d/0xa0
[  103.413088]  blk_mq_start_hw_queue+0x17/0x20
[  103.413090]  blk_mq_start_hw_queues+0x32/0x50
[  103.413095]  nvme_kill_queues+0x54/0x80 [nvme_core]
[  103.413097]  nvme_remove_dead_ctrl_work+0x1f/0x40 [nvme]
[  103.413103]  process_one_work+0x149/0x360
[  103.413105]  worker_thread+0x4d/0x3c0
[  103.413109]  kthread+0x109/0x140
[  103.413111]  ? rescuer_thread+0x380/0x380
[  103.413113]  ? kthread_park+0x60/0x60
[  103.413120]  ret_from_fork+0x2c/0x40
[  103.413121] Code: 08 4c 8b 63 50 48 8b 80 80 00 00 00 48 8b 90 d0 03 00 00 31 c0 48 83 ba 40 02 00 00 00 48 8d 8a 40 02 00 00 48 0f 45 c1 c1 ee 09 <0f> b6 48 0a 0f b6 40 09 41 89 f5 83 e9 09 41 d3 ed 44 0f af e8
[  103.413145] RIP: bio_integrity_advance+0x48/0xf0 RSP: ffffc033c252fc10
[  103.413146] CR2: 000000000000000a
[  103.413157] ---[ end trace cd6875d16eb5a11e ]---
[  103.455368] Kernel panic - not syncing: Fatal exception
[  103.459826] Kernel Offset: 0x37600000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[  103.850916] ---[ end Kernel panic - not syncing: Fatal exception
[  103.857637] sched: Unexpected reschedule of offline CPU#1!
[  103.863762] ------------[ cut here ]------------

[2] kernel hang in blk_mq_freeze_queue_wait() when CONFIG_BLK_DEV_INTEGRITY is off
[  247.129825] INFO: task nvme-test:1772 blocked for more than 120 seconds.
[  247.137311]       Not tainted 4.12.0-rc2.upstream+ #4
[  247.142954] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  247.151704] Call Trace:
[  247.154445]  __schedule+0x28a/0x880
[  247.158341]  schedule+0x36/0x80
[  247.161850]  blk_mq_freeze_queue_wait+0x4b/0xb0
[  247.166913]  ? remove_wait_queue+0x60/0x60
[  247.171485]  blk_freeze_queue+0x1a/0x20
[  247.175770]  blk_cleanup_queue+0x7f/0x140
[  247.180252]  nvme_ns_remove+0xa3/0xb0 [nvme_core]
[  247.185503]  nvme_remove_namespaces+0x32/0x50 [nvme_core]
[  247.191532]  nvme_uninit_ctrl+0x2d/0xa0 [nvme_core]
[  247.196977]  nvme_remove+0x70/0x110 [nvme]
[  247.201545]  pci_device_remove+0x39/0xc0
[  247.205927]  device_release_driver_internal+0x141/0x200
[  247.211761]  device_release_driver+0x12/0x20
[  247.216531]  pci_stop_bus_device+0x8c/0xa0
[  247.221104]  pci_stop_and_remove_bus_device_locked+0x1a/0x30
[  247.227420]  remove_store+0x7c/0x90
[  247.231320]  dev_attr_store+0x18/0x30
[  247.235409]  sysfs_kf_write+0x3a/0x50
[  247.239497]  kernfs_fop_write+0xff/0x180
[  247.243867]  __vfs_write+0x37/0x160
[  247.247757]  ? selinux_file_permission+0xe5/0x120
[  247.253011]  ? security_file_permission+0x3b/0xc0
[  247.258260]  vfs_write+0xb2/0x1b0
[  247.261964]  ? syscall_trace_enter+0x1d0/0x2b0
[  247.266924]  SyS_write+0x55/0xc0
[  247.270540]  do_syscall_64+0x67/0x150
[  247.274636]  entry_SYSCALL64_slow_path+0x25/0x25
[  247.279794] RIP: 0033:0x7f5c96740840
[  247.283785] RSP: 002b:00007ffd00e87ee8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  247.292238] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f5c96740840
[  247.300194] RDX: 0000000000000002 RSI: 00007f5c97060000 RDI: 0000000000000001
[  247.308159] RBP: 00007f5c97060000 R08: 000000000000000a R09: 00007f5c97059740
[  247.316123] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f5c96a14400
[  247.324087] R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000
[  370.016340] INFO: task nvme-test:1772 blocked for more than 120 seconds.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

Comments

Bart Van Assche May 27, 2017, 9:46 p.m. UTC | #1
On Sat, 2017-05-27 at 22:21 +0800, Ming Lei wrote:
> It is required that no dispatch can happen anymore once
> blk_mq_quiesce_queue() returns, and we don't have such a
> requirement for the queue-stopping APIs.
> 
> But blk_mq_quiesce_queue() still may not block/drain dispatch in the
> following cases:
> 
> - direct issue or BLK_MQ_S_START_ON_RUN
> - in theory, new RCU read-side critical sections may begin while
> synchronize_rcu() is waiting and end after synchronize_rcu()
> returns; dispatch may still happen during that period

Hello Ming,

I think the title and the description of this patch are wrong. Since
the current queue quiescing mechanism works fine for drivers that do
not stop and restart a queue (e.g. SCSI and dm-core), please change the
title and description to reflect that the purpose of this patch is
to allow drivers that use the quiesce mechanism to restart a queue
without unquiescing it.

> @@ -209,6 +217,9 @@ void blk_mq_wake_waiters(struct request_queue *q)
>  	 * the queue are notified as well.
>  	 */
>  	wake_up_all(&q->mq_freeze_wq);
> +
> +	/* Forcibly unquiesce the queue to avoid having stuck requests */
> +	blk_mq_unquiesce_queue(q);
>  }

Should the block layer unquiesce a queue if a block driver hasn't 
done that before queue removal starts or should the block driver
itself do that? The block layer doesn't restart stopped queues from
inside blk_set_queue_dying() so why should it unquiesce a quiesced
queue?
 
>  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
> @@ -1108,13 +1119,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  
>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
>  		rcu_read_lock();
> -		blk_mq_sched_dispatch_requests(hctx);
> +		if (!blk_queue_quiesced(hctx->queue))
> +			blk_mq_sched_dispatch_requests(hctx);
>  		rcu_read_unlock();
>  	} else {
>  		might_sleep();
>  
>  		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
> -		blk_mq_sched_dispatch_requests(hctx);
> +		if (!blk_queue_quiesced(hctx->queue))
> +			blk_mq_sched_dispatch_requests(hctx);
>  		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
>  	}
>  }

Sorry but I don't like these changes. Why have the blk_queue_quiesced()
calls been added at other code locations than the blk_mq_hctx_stopped() calls?
This will make the block layer unnecessarily hard to maintain. Please consider
changing the blk_mq_hctx_stopped(hctx) calls in blk_mq_sched_dispatch_requests()
and *blk_mq_*run_hw_queue*() into blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q).
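
For illustration, with that suggestion the check at the top of
blk_mq_sched_dispatch_requests() would look roughly like this (an
untested sketch against the code in this series, not a patch):

	void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
	{
		struct request_queue *q = hctx->queue;

		/* one combined test instead of separate call sites */
		if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
			return;

		/* ... existing dispatch logic unchanged ... */
	}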

Thanks,

Bart.
Ming Lei May 28, 2017, 10:44 a.m. UTC | #2
On Sat, May 27, 2017 at 09:46:45PM +0000, Bart Van Assche wrote:
> On Sat, 2017-05-27 at 22:21 +0800, Ming Lei wrote:
> > It is required that no dispatch can happen anymore once
> > blk_mq_quiesce_queue() returns, and we don't have such a
> > requirement for the queue-stopping APIs.
> > 
> > But blk_mq_quiesce_queue() still may not block/drain dispatch in the
> > following cases:
> > 
> > - direct issue or BLK_MQ_S_START_ON_RUN
> > - in theory, new RCU read-side critical sections may begin while
> > synchronize_rcu() is waiting and end after synchronize_rcu()
> > returns; dispatch may still happen during that period
> 
> Hello Ming,

Hello Bart,

> 
> I think the title and the description of this patch are wrong. Since
> the current queue quiescing mechanism works fine for drivers that do
> not stop and restart a queue (e.g. SCSI and dm-core), please change the

I have already described the issues with the current quiesce mechanism; let me post that part again:

	But blk_mq_quiesce_queue() still may not block/drain dispatch in the
	following cases:
	
	- direct issue or BLK_MQ_S_START_ON_RUN
	- in theory, new RCU read-side critical sections may begin while
	synchronize_rcu() is waiting and end after synchronize_rcu()
	returns; dispatch may still happen during that period

Unlike stopping a queue, any dispatching has to be drained/blocked by
the time synchronize_rcu() returns; otherwise a double free or
use-after-free can be triggered, which has already been observed
with NVMe.

> title and description to reflect that the purpose of this patch is
> to allow drivers that use the quiesce mechanism to restart a queue
> without unquiescing it.

First of all it really is a fix, and only then an improvement, so could you
tell me what is wrong with the title and the description?

> 
> > @@ -209,6 +217,9 @@ void blk_mq_wake_waiters(struct request_queue *q)
> >  	 * the queue are notified as well.
> >  	 */
> >  	wake_up_all(&q->mq_freeze_wq);
> > +
> > +	/* Forcibly unquiesce the queue to avoid having stuck requests */
> > +	blk_mq_unquiesce_queue(q);
> >  }
> 
> Should the block layer unquiesce a queue if a block driver hasn't 
> done that before queue removal starts or should the block driver
> itself do that?

Some drivers might quiesce a queue and not unquiesce it, such as
NVMe.

OK, I will consider fixing the drivers first.

> The block layer doesn't restart stopped queues from
> inside blk_set_queue_dying() so why should it unquiesce a quiesced
> queue?

If a quiesced queue isn't unquiesced, it may cause an I/O hang, since
any I/O in the sw queue/scheduler queue can't be completed at all.

OK, will fix the drivers in the next post.

Actually the queue has to be restarted after blk_set_queue_dying(),
otherwise it can cause an I/O hang too, since there can be lots of
writeback I/O in the following del_gendisk(). We have done that
in NVMe already, see nvme_kill_queues().

Maybe in the future we should consider doing all of that in the block layer.

>  
> >  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
> > @@ -1108,13 +1119,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> >  
> >  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> >  		rcu_read_lock();
> > -		blk_mq_sched_dispatch_requests(hctx);
> > +		if (!blk_queue_quiesced(hctx->queue))
> > +			blk_mq_sched_dispatch_requests(hctx);
> >  		rcu_read_unlock();
> >  	} else {
> >  		might_sleep();
> >  
> >  		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
> > -		blk_mq_sched_dispatch_requests(hctx);
> > +		if (!blk_queue_quiesced(hctx->queue))
> > +			blk_mq_sched_dispatch_requests(hctx);
> >  		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
> >  	}
> >  }
> 
> Sorry but I don't like these changes. Why have the blk_queue_quiesced()
> calls been added at other code locations than the blk_mq_hctx_stopped() calls?
> This will make the block layer unnecessarily hard to maintain. Please consider
> changing the blk_mq_hctx_stopped(hctx) calls in blk_mq_sched_dispatch_requests()
> and *blk_mq_*run_hw_queue*() into blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q).

One benefit is that we make it explicit that the flag has to be checked
inside the RCU read-side critical sections. If you put it somewhere else,
someone may move it out of the read-side critical sections in the future.


Thanks,
Ming
Bart Van Assche May 28, 2017, 4:10 p.m. UTC | #3
On Sun, 2017-05-28 at 18:44 +0800, Ming Lei wrote:
> On Sat, May 27, 2017 at 09:46:45PM +0000, Bart Van Assche wrote:
> > On Sat, 2017-05-27 at 22:21 +0800, Ming Lei wrote:
> > >  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
> > > @@ -1108,13 +1119,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> > >  
> > >  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> > >  		rcu_read_lock();
> > > -		blk_mq_sched_dispatch_requests(hctx);
> > > +		if (!blk_queue_quiesced(hctx->queue))
> > > +			blk_mq_sched_dispatch_requests(hctx);
> > >  		rcu_read_unlock();
> > >  	} else {
> > >  		might_sleep();
> > >  
> > >  		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
> > > -		blk_mq_sched_dispatch_requests(hctx);
> > > +		if (!blk_queue_quiesced(hctx->queue))
> > > +			blk_mq_sched_dispatch_requests(hctx);
> > >  		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
> > >  	}
> > >  }
> > 
> > Sorry but I don't like these changes. Why have the blk_queue_quiesced()
> > calls been added at other code locations than the blk_mq_hctx_stopped() calls?
> > This will make the block layer unnecessarily hard to maintain. Please consider
> > changing the blk_mq_hctx_stopped(hctx) calls in blk_mq_sched_dispatch_requests()
> > and *blk_mq_*run_hw_queue*() into blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q).
> 
> One benefit is that we make it explicit that the flag has to be checked
> inside the RCU read-side critical sections. If you put it somewhere else,
> someone may move it out of the read-side critical sections in the future.

Hello Ming,

I really would like to see the blk_queue_quiesced() tests as close as possible to
the blk_mq_hctx_stopped() tests. But I agree that we need a way to document and/or
verify that these tests occur with an RCU read-side lock held. Have you considered
using rcu_read_lock_held() to document this?
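
For example, something along these lines (an illustrative sketch; the
exact placement is open):

	/* in the non-BLOCKING dispatch path, document the locking assumption */
	WARN_ON_ONCE(!rcu_read_lock_held());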

Thanks,

Bart.
Ming Lei May 30, 2017, 12:22 a.m. UTC | #4
On Sun, May 28, 2017 at 04:10:09PM +0000, Bart Van Assche wrote:
> On Sun, 2017-05-28 at 18:44 +0800, Ming Lei wrote:
> > On Sat, May 27, 2017 at 09:46:45PM +0000, Bart Van Assche wrote:
> > > On Sat, 2017-05-27 at 22:21 +0800, Ming Lei wrote:
> > > >  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
> > > > @@ -1108,13 +1119,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> > > >  
> > > >  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> > > >  		rcu_read_lock();
> > > > -		blk_mq_sched_dispatch_requests(hctx);
> > > > +		if (!blk_queue_quiesced(hctx->queue))
> > > > +			blk_mq_sched_dispatch_requests(hctx);
> > > >  		rcu_read_unlock();
> > > >  	} else {
> > > >  		might_sleep();
> > > >  
> > > >  		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
> > > > -		blk_mq_sched_dispatch_requests(hctx);
> > > > +		if (!blk_queue_quiesced(hctx->queue))
> > > > +			blk_mq_sched_dispatch_requests(hctx);
> > > >  		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
> > > >  	}
> > > >  }
> > > 
> > > Sorry but I don't like these changes. Why have the blk_queue_quiesced()
> > > calls been added at other code locations than the blk_mq_hctx_stopped() calls?
> > > This will make the block layer unnecessarily hard to maintain. Please consider
> > > changing the blk_mq_hctx_stopped(hctx) calls in blk_mq_sched_dispatch_requests()
> > > and *blk_mq_*run_hw_queue*() into blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q).
> > 
> > One benefit is that we make it explicit that the flag has to be checked
> > inside the RCU read-side critical sections. If you put it somewhere else,
> > someone may move it out of the read-side critical sections in the future.
> 
> Hello Ming,
> 
> I really would like to see the blk_queue_quiesced() tests as close as possible to
> the blk_mq_hctx_stopped() tests. But I agree that we need a way to document and/or

Could you explain why we have to do that? Checking the stopped state
doesn't require holding the RCU/SRCU read lock, and the two states are
really different.

> verify that these tests occur with an RCU read-side lock held. Have you considered
> using rcu_read_lock_held() to document this?

Then we would need to check whether it is RCU or SRCU, which makes the
code as ugly as the current check on BLOCKING.

Thanks,
Ming
Bart Van Assche May 30, 2017, 4:54 p.m. UTC | #5
On Tue, 2017-05-30 at 08:22 +0800, Ming Lei wrote:
> On Sun, May 28, 2017 at 04:10:09PM +0000, Bart Van Assche wrote:
> > I really would like to see the blk_queue_quiesced() tests as close as possible to
> > the blk_mq_hctx_stopped() tests. But I agree that we need a way to document and/or
> 
> Could you explain why we have to do that? Checking the stopped state
> doesn't require holding the RCU/SRCU read lock, and the two states are
> really different.

I'm really surprised that you ask me why ... It's because the purpose of the
"stopped" and "quiesced" flags is similar, namely preventing the dispatching
of requests. It doesn't matter that with your patches applied it is no longer
necessary to hold an RCU / SRCU lock when testing the stopped flag.

> > verify that these tests occur with an RCU read-side lock held. Have you considered
> > using rcu_read_lock_held() to document this?
> 
> Then we would need to check whether it is RCU or SRCU, which makes the
> code as ugly as the current check on BLOCKING.

How about introducing a macro or inline function in the block layer that tests
whether either the RCU read lock or an SRCU read lock is held depending on the
value of the BLK_MQ_F_BLOCKING flag?
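
Such a helper might look like this (a sketch with a hypothetical name;
no such helper exists in the tree at this point):

	/* true if the read lock matching this hctx's dispatch mode is held */
	static inline bool blk_mq_dispatch_lock_held(struct blk_mq_hw_ctx *hctx)
	{
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			return srcu_read_lock_held(&hctx->queue_rq_srcu);
		return rcu_read_lock_held();
	}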

Bart.
Bart Van Assche May 30, 2017, 7:23 p.m. UTC | #6
On Sun, 2017-05-28 at 18:44 +0800, Ming Lei wrote:
> First of all it really is a fix, and only then an improvement, so could you
> tell me what is wrong with the title and the description?

Hello Ming,

Can you explain to me why you want to keep the blk_mq_stop_hw_queues() call in
nvme_suspend_queue()? Since immediately after that call the NVMe driver starts
freeing resources, shouldn't that call be changed into a call of
blk_mq_quiesce_queue()? I think the same comment also applies to the
blk_mq_stop_hw_queues() calls in nvme_rdma_error_recovery_work() and
nvme_rdma_shutdown_ctrl().

Bart.
Ming Lei May 31, 2017, 2:38 a.m. UTC | #7
On Tue, May 30, 2017 at 04:54:47PM +0000, Bart Van Assche wrote:
> On Tue, 2017-05-30 at 08:22 +0800, Ming Lei wrote:
> > On Sun, May 28, 2017 at 04:10:09PM +0000, Bart Van Assche wrote:
> > > I really would like to see the blk_queue_quiesced() tests as close as possible to
> > > the blk_mq_hctx_stopped() tests. But I agree that we need a way to document and/or
> > 
> > Could you explain why we have to do that? Checking the stopped state
> > doesn't require holding the RCU/SRCU read lock, and the two states are
> > really different.
> 
> I'm really surprised that you ask me why ... It's because the purpose of the
> "stopped" and "quiesced" flags is similar, namely preventing the dispatching

Actually they are not, and you can see that in patch 7.

But I will move the check into the dispatch function and add a comment
about the RCU/SRCU read-lock requirement.

> of requests. It doesn't matter that with your patches applied it is no longer
> necessary to hold an RCU / SRCU lock when testing the stopped flag.
> 
> > verify that these tests occur with an RCU read-side lock held. Have you considered
> > using rcu_read_lock_held() to document this?
> > 
> > Then we would need to check whether it is RCU or SRCU, which makes the
> > code as ugly as the current check on BLOCKING.
> 
> How about introducing a macro or inline function in the block layer that tests
> whether either the RCU read lock or an SRCU read lock is held depending on the
> value of the BLK_MQ_F_BLOCKING flag?

That might make the code cleaner, but it would need to check the condition twice.

Thanks,
Ming
Ming Lei May 31, 2017, 2:52 a.m. UTC | #8
On Tue, May 30, 2017 at 07:23:31PM +0000, Bart Van Assche wrote:
> On Sun, 2017-05-28 at 18:44 +0800, Ming Lei wrote:
> > First of all it really is a fix, and only then an improvement, so could you
> > tell me what is wrong with the title and the description?
> 
> Hello Ming,
> 
> Can you explain to me why you want to keep the blk_mq_stop_hw_queues() call in
> nvme_suspend_queue()? Since immediately after that call the NVMe driver starts
> freeing resources, shouldn't that call be changed into a call of
> blk_mq_quiesce_queue()? I think the same comment also applies to the
> blk_mq_stop_hw_queues() calls in nvme_rdma_error_recovery_work() and
> nvme_rdma_shutdown_ctrl().

Not only NVMe, but also other suspend cases in virtio-blk, dm, and ...

But that doesn't belong in this patchset; it is definitely a follow-up.
The main purpose of this patchset is to make blk_mq_quiesce_queue() easy
to use and correct, avoiding potential races.

Then we can fix the cases of canceling requests, and other cases like
the ones you mentioned.


Thanks,
Ming

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8ecbbb718946..470ee5514ea9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -170,6 +170,10 @@  void blk_mq_quiesce_queue(struct request_queue *q)
 
 	__blk_mq_stop_hw_queues(q, true);
 
+	spin_lock_irq(q->queue_lock);
+	queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+	spin_unlock_irq(q->queue_lock);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
 			synchronize_srcu(&hctx->queue_rq_srcu);
@@ -190,6 +194,10 @@  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
  */
 void blk_mq_unquiesce_queue(struct request_queue *q)
 {
+	spin_lock_irq(q->queue_lock);
+	queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+	spin_unlock_irq(q->queue_lock);
+
 	blk_mq_start_stopped_hw_queues(q, true);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
@@ -209,6 +217,9 @@  void blk_mq_wake_waiters(struct request_queue *q)
 	 * the queue are notified as well.
 	 */
 	wake_up_all(&q->mq_freeze_wq);
+
+	/* Forcibly unquiesce the queue to avoid having stuck requests */
+	blk_mq_unquiesce_queue(q);
 }
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -1108,13 +1119,15 @@  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		rcu_read_lock();
-		blk_mq_sched_dispatch_requests(hctx);
+		if (!blk_queue_quiesced(hctx->queue))
+			blk_mq_sched_dispatch_requests(hctx);
 		rcu_read_unlock();
 	} else {
 		might_sleep();
 
 		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-		blk_mq_sched_dispatch_requests(hctx);
+		if (!blk_queue_quiesced(hctx->queue))
+			blk_mq_sched_dispatch_requests(hctx);
 		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
 	}
 }
@@ -1519,9 +1532,14 @@  static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, blk_qc_t *cookie)
 {
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+	bool blocking = hctx->flags & BLK_MQ_F_BLOCKING;
+	bool quiesced;
+
+	if (!blocking) {
 		rcu_read_lock();
-		__blk_mq_try_issue_directly(rq, cookie, false);
+		quiesced = blk_queue_quiesced(rq->q);
+		if (!quiesced)
+			__blk_mq_try_issue_directly(rq, cookie, false);
 		rcu_read_unlock();
 	} else {
 		unsigned int srcu_idx;
@@ -1529,9 +1547,14 @@  static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		might_sleep();
 
 		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-		__blk_mq_try_issue_directly(rq, cookie, true);
+		quiesced = blk_queue_quiesced(rq->q);
+		if (!quiesced)
+			__blk_mq_try_issue_directly(rq, cookie, true);
 		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
 	}
+
+	if (quiesced)
+		blk_mq_sched_insert_request(rq, false, false, false, blocking);
 }
 
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)