[v6,9/9] block, scsi: Make SCSI device suspend and resume work reliably

Message ID 20171005000110.15904-10-bart.vanassche@wdc.com (mailing list archive)
State New, archived

Commit Message

Bart Van Assche Oct. 5, 2017, 12:01 a.m. UTC
It is essential during suspend and resume that neither the filesystem
state nor the filesystem metadata in RAM changes. This is why, while
the hibernation image is being written or restored, SCSI devices
are quiesced. The SCSI core quiesces devices through scsi_device_quiesce()
and scsi_device_resume(). In the SDEV_QUIESCE state execution of
non-preempt requests is deferred. This is realized by returning
BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI
devices. Ensure that a full queue does not prevent power management requests
from being submitted, by deferring allocation of non-preempt requests for
devices in the quiesced state. This patch has been tested by running
the following commands and by verifying that after resume the fio job
is still running:

for d in /sys/class/block/sd*[a-z]; do
  hcil=$(readlink "$d/device")
  hcil=${hcil#../../../}
  echo 4 > "$d/queue/nr_requests"
  echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth"
done
bdev=$(readlink /dev/disk/by-uuid/5217d83f-213e-4b42-b86e-20013325ba6c)
bdev=${bdev#../../}
hcil=$(readlink "/sys/block/$bdev/device")
hcil=${hcil#../../../}
fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 --rw=randread \
  --ioengine=libaio --numjobs=4 --iodepth=16 --iodepth_batch=1 --thread \
  --loops=$((2**31)) &
pid=$!
sleep 1
systemctl hibernate
sleep 10
kill $pid

Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348).
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
---
 block/blk-core.c        | 38 ++++++++++++++++++++++++++++++--------
 block/blk-mq.c          |  4 ++--
 block/blk-timeout.c     |  2 +-
 drivers/scsi/scsi_lib.c | 27 +++++++++++++++++++--------
 fs/block_dev.c          |  4 ++--
 include/linux/blkdev.h  |  2 +-
 6 files changed, 55 insertions(+), 22 deletions(-)

Comments

Ming Lei Oct. 7, 2017, 4:33 a.m. UTC | #1
On Wed, Oct 04, 2017 at 05:01:10PM -0700, Bart Van Assche wrote:
> It is essential during suspend and resume that neither the filesystem
> state nor the filesystem metadata in RAM changes. This is why, while
> the hibernation image is being written or restored, SCSI devices

Quiesce isn't used only for suspend and resume, and the issue isn't
suspend/resume specific either. So please change the title/commit log
to something like 'make SCSI quiesce more reliable/safe'.

> are quiesced. The SCSI core quiesces devices through scsi_device_quiesce()
> and scsi_device_resume(). In the SDEV_QUIESCE state execution of
> non-preempt requests is deferred. This is realized by returning
> BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI
> devices. Ensure that a full queue does not prevent power management requests
> from being submitted, by deferring allocation of non-preempt requests for
> devices in the quiesced state. This patch has been tested by running
> the following commands and by verifying that after resume the fio job
> is still running:
> 
> for d in /sys/class/block/sd*[a-z]; do
>   hcil=$(readlink "$d/device")
>   hcil=${hcil#../../../}
>   echo 4 > "$d/queue/nr_requests"
>   echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth"
> done
> bdev=$(readlink /dev/disk/by-uuid/5217d83f-213e-4b42-b86e-20013325ba6c)
> bdev=${bdev#../../}
> hcil=$(readlink "/sys/block/$bdev/device")
> hcil=${hcil#../../../}
> fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 --rw=randread \
>   --ioengine=libaio --numjobs=4 --iodepth=16 --iodepth_batch=1 --thread \
>   --loops=$((2**31)) &
> pid=$!
> sleep 1
> systemctl hibernate
> sleep 10
> kill $pid
> 
> Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
> References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348).
> Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
> Cc: Martin K. Petersen <martin.petersen@oracle.com>
> Cc: Ming Lei <ming.lei@redhat.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Hannes Reinecke <hare@suse.com>
> Cc: Johannes Thumshirn <jthumshirn@suse.de>
> ---
>  block/blk-core.c        | 38 ++++++++++++++++++++++++++++++--------
>  block/blk-mq.c          |  4 ++--
>  block/blk-timeout.c     |  2 +-
>  drivers/scsi/scsi_lib.c | 27 +++++++++++++++++++--------
>  fs/block_dev.c          |  4 ++--
>  include/linux/blkdev.h  |  2 +-
>  6 files changed, 55 insertions(+), 22 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index b8d90fc29b35..81a4bb119d50 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -371,6 +371,7 @@ void blk_clear_preempt_only(struct request_queue *q)
>  
>  	spin_lock_irqsave(q->queue_lock, flags);
>  	queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
> +	wake_up_all(&q->mq_freeze_wq);
>  	spin_unlock_irqrestore(q->queue_lock, flags);
>  }
>  EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
> @@ -792,15 +793,34 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL(blk_alloc_queue);
>  
> -int blk_queue_enter(struct request_queue *q, bool nowait)
> +/**
> + * blk_queue_enter() - try to increase q->q_usage_counter
> + * @q: request queue pointer
> + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
> + */
> +int blk_queue_enter(struct request_queue *q, unsigned int flags)
>  {
> +	const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
> +
>  	while (true) {
>  		int ret;
>  
> -		if (percpu_ref_tryget_live(&q->q_usage_counter))
> -			return 0;
> +		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
> +			/*
> +			 * The code that sets the PREEMPT_ONLY flag is
> +			 * responsible for ensuring that that flag is globally
> +			 * visible before the queue is unfrozen.
> +			 */
> +			if (preempt || !blk_queue_preempt_only(q)) {

PREEMPT_ONLY flag is checked without RCU read lock held, so the
synchronize_rcu() may just wait for completion of pre-exit
percpu_ref_tryget_live(), which can be reordered with the
reading on blk_queue_preempt_only().
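
For illustration, a sketch of what such an rcu_read_lock_sched()-protected
check inside the blk_queue_enter() loop might look like (an assumption about
one possible follow-up change, not code from the posted patch):

	bool success = false;

	rcu_read_lock_sched();
	if (percpu_ref_tryget_live(&q->q_usage_counter)) {
		/*
		 * The PREEMPT_ONLY flag is read inside the same RCU-sched
		 * read-side critical section as the tryget, so a concurrent
		 * synchronize_rcu() issued after setting the flag cannot end
		 * between the tryget and the flag check.
		 */
		if (preempt || !blk_queue_preempt_only(q))
			success = true;
		else
			percpu_ref_put(&q->q_usage_counter);
	}
	rcu_read_unlock_sched();
	if (success)
		return 0;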

> +				return 0;
> +			} else {
> +				percpu_ref_put(&q->q_usage_counter);
> +				WARN_ONCE("%s: Attempt to allocate non-preempt request in preempt-only mode.\n",
> +					  kobject_name(q->kobj.parent));
> +			}
> +		}
>  
> -		if (nowait)
> +		if (flags & BLK_MQ_REQ_NOWAIT)
>  			return -EBUSY;
>  
>  		/*
> @@ -813,7 +833,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
>  		smp_rmb();
>  
>  		ret = wait_event_interruptible(q->mq_freeze_wq,
> -				!atomic_read(&q->mq_freeze_depth) ||
> +				(atomic_read(&q->mq_freeze_depth) == 0 &&
> +				 (preempt || !blk_queue_preempt_only(q))) ||
>  				blk_queue_dying(q));
>  		if (blk_queue_dying(q))
>  			return -ENODEV;
> @@ -1441,8 +1462,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
>  	/* create ioc upfront */
>  	create_io_context(gfp_mask, q->node);
>  
> -	ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) ||
> -			      (op & REQ_NOWAIT));
> +	ret = blk_queue_enter(q, flags);
>  	if (ret)
>  		return ERR_PTR(ret);
>  	spin_lock_irq(q->queue_lock);
> @@ -2263,8 +2283,10 @@ blk_qc_t generic_make_request(struct bio *bio)
>  	current->bio_list = bio_list_on_stack;
>  	do {
>  		struct request_queue *q = bio->bi_disk->queue;
> +		unsigned int flags = bio->bi_opf & REQ_NOWAIT ?
> +			BLK_MQ_REQ_NOWAIT : 0;
>  
> -		if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
> +		if (likely(blk_queue_enter(q, flags) == 0)) {
>  			struct bio_list lower, same;
>  
>  			/* Create a fresh bio_list for all subordinate requests */
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 271657992d1a..1604bc2d4a57 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -386,7 +386,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
>  	struct request *rq;
>  	int ret;
>  
> -	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
> +	ret = blk_queue_enter(q, flags);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> @@ -425,7 +425,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>  	if (hctx_idx >= q->nr_hw_queues)
>  		return ERR_PTR(-EIO);
>  
> -	ret = blk_queue_enter(q, true);
> +	ret = blk_queue_enter(q, flags);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> diff --git a/block/blk-timeout.c b/block/blk-timeout.c
> index 17ec83bb0900..b75d975cc5a5 100644
> --- a/block/blk-timeout.c
> +++ b/block/blk-timeout.c
> @@ -134,7 +134,7 @@ void blk_timeout_work(struct work_struct *work)
>  	struct request *rq, *tmp;
>  	int next_set = 0;
>  
> -	if (blk_queue_enter(q, true))
> +	if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT))
>  		return;
>  	spin_lock_irqsave(q->queue_lock, flags);
>  
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 1c16a247fae6..0ba7af5debc7 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -2926,21 +2926,30 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
>  int
>  scsi_device_quiesce(struct scsi_device *sdev)
>  {
> +	struct request_queue *q = sdev->request_queue;
>  	int err;
>  
> +	blk_mq_freeze_queue(q);
> +	if (blk_set_preempt_only(q)) {
> +		blk_mq_unfreeze_queue(q);
> +		return -EINVAL;
> +	}

This way is wrong: if blk_set_preempt_only() returns true
it means the queue is already in PREEMPT_ONLY mode,
and failing scsi_device_quiesce() can break suspend/resume or
the sending of SCSI domain validation commands.

The reasonable handling is to just go ahead if the queue
is already in PREEMPT_ONLY mode.
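
For illustration, a sketch of that suggested handling in scsi_device_quiesce()
(assuming an already-set flag is simply tolerated; this is not code from the
posted patch):

	blk_mq_freeze_queue(q);
	/*
	 * Tolerate a queue that is already in PREEMPT_ONLY mode, e.g. because
	 * of a nested quiesce, instead of failing with -EINVAL.
	 */
	blk_set_preempt_only(q);	/* previous flag value deliberately ignored */
	synchronize_rcu();
	blk_mq_unfreeze_queue(q);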

> +	/*
> +	 * Ensure that the effect of blk_set_preempt_only() will be visible
> +	 * for percpu_ref_tryget() callers that occur after the queue
> +	 * unfreeze. See also https://lwn.net/Articles/573497/.
> +	 */
> +	synchronize_rcu();

This synchronize_rcu() may be saved if we set the PREEMPT_ONLY flag
before freezing the queue, since blk_mq_freeze_queue() may imply
one synchronize_rcu().
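
A sketch of that alternative ordering (assuming the freeze itself provides the
required grace period; not code from the posted patch):

	/* Publish the flag first ... */
	blk_set_preempt_only(q);
	/*
	 * ... then freeze and unfreeze. The freeze is assumed here to imply
	 * an RCU grace period, which would make an explicit synchronize_rcu()
	 * unnecessary.
	 */
	blk_mq_freeze_queue(q);
	blk_mq_unfreeze_queue(q);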

> +	blk_mq_unfreeze_queue(q);
> +
>  	mutex_lock(&sdev->state_mutex);
>  	err = scsi_device_set_state(sdev, SDEV_QUIESCE);
>  	mutex_unlock(&sdev->state_mutex);
>  
>  	if (err)
> -		return err;
> +		blk_clear_preempt_only(q);
>  
> -	scsi_run_queue(sdev->request_queue);
> -	while (atomic_read(&sdev->device_busy)) {
> -		msleep_interruptible(200);
> -		scsi_run_queue(sdev->request_queue);
> -	}
> -	return 0;
> +	return err;
>  }
>  EXPORT_SYMBOL(scsi_device_quiesce);
>  
> @@ -2961,8 +2970,10 @@ void scsi_device_resume(struct scsi_device *sdev)
>  	 */
>  	mutex_lock(&sdev->state_mutex);
>  	if (sdev->sdev_state == SDEV_QUIESCE &&
> -	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
> +	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0) {
> +		blk_clear_preempt_only(sdev->request_queue);
>  		scsi_run_queue(sdev->request_queue);
> +	}
>  	mutex_unlock(&sdev->state_mutex);

scsi_run_queue() can be removed, and blk_clear_preempt_only() needn't
be run while holding sdev->state_mutex, just like in the quiesce path.

Bart Van Assche Oct. 9, 2017, 9:16 p.m. UTC | #2
On Sat, 2017-10-07 at 12:33 +0800, Ming Lei wrote:
> On Wed, Oct 04, 2017 at 05:01:10PM -0700, Bart Van Assche wrote:
> > It is essential during suspend and resume that neither the filesystem
> > state nor the filesystem metadata in RAM changes. This is why, while
> > the hibernation image is being written or restored, SCSI devices
> 
> Quiesce isn't used only for suspend and resume, and the issue isn't
> suspend/resume specific either. So please change the title/commit log
> to something like 'make SCSI quiesce more reliable/safe'.

OK, I will modify the patch title.

> > -		if (percpu_ref_tryget_live(&q->q_usage_counter))
> > -			return 0;
> > +		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
> > +			/*
> > +			 * The code that sets the PREEMPT_ONLY flag is
> > +			 * responsible for ensuring that that flag is globally
> > +			 * visible before the queue is unfrozen.
> > +			 */
> > +			if (preempt || !blk_queue_preempt_only(q)) {
> 
> PREEMPT_ONLY flag is checked without RCU read lock held, so the
> synchronize_rcu() may just wait for completion of pre-exit
> percpu_ref_tryget_live(), which can be reordered with the
> reading on blk_queue_preempt_only().

OK, I will add rcu_read_lock_sched/unlock_sched() calls around this code.

> >  int
> >  scsi_device_quiesce(struct scsi_device *sdev)
> >  {
> > +	struct request_queue *q = sdev->request_queue;
> >  	int err;
> >  
> > +	blk_mq_freeze_queue(q);
> > +	if (blk_set_preempt_only(q)) {
> > +		blk_mq_unfreeze_queue(q);
> > +		return -EINVAL;
> > +	}
> 
> This way is wrong: if blk_set_preempt_only() returns true
> it means the queue is already in PREEMPT_ONLY mode,
> and failing scsi_device_quiesce() can break suspend/resume or
> the sending of SCSI domain validation commands.

That's an interesting comment but I propose that what you suggest is implemented
through a separate patch. The above code preserves the existing behavior, namely
that scsi_device_quiesce() returns an error code if called when a SCSI device has
already been quiesced. See also scsi_device_set_state(). BTW, I don't think that
it is likely that this will occur. The only code other than the power management
code I know of that sets the SCSI QUIESCE state is the SCSI sysfs state store
method and the SCSI parallel code. I don't know any software that sets the SCSI
QUIESCE state through sysfs. And I'm not sure the SCSI parallel code has a
significant number of users.

> > +	/*
> > +	 * Ensure that the effect of blk_set_preempt_only() will be visible
> > +	 * for percpu_ref_tryget() callers that occur after the queue
> > +	 * unfreeze. See also https://lwn.net/Articles/573497/.
> > +	 */
> > +	synchronize_rcu();
> 
> This synchronize_rcu() may be saved if we set the PREEMPT_ONLY flag
> before freezing the queue, since blk_mq_freeze_queue() may imply
> one synchronize_rcu().

That's an interesting comment. I will look further into this.

> > @@ -2961,8 +2970,10 @@ void scsi_device_resume(struct scsi_device *sdev)
> >  	 */
> >  	mutex_lock(&sdev->state_mutex);
> >  	if (sdev->sdev_state == SDEV_QUIESCE &&
> > -	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
> > +	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0) {
> > +		blk_clear_preempt_only(sdev->request_queue);
> >  		scsi_run_queue(sdev->request_queue);
> > +	}
> >  	mutex_unlock(&sdev->state_mutex);
> 
> scsi_run_queue() can be removed, and blk_clear_preempt_only() needn't
> be run while holding sdev->state_mutex, just like in the quiesce path.

I will look into removing the scsi_run_queue() call. But I prefer to keep
the blk_clear_preempt_only() inside the critical section because that call
doesn't sleep and because it changes state information that is related to
the SCSI state.

Bart.

Bart Van Assche Oct. 9, 2017, 9:42 p.m. UTC | #3
On Mon, 2017-10-09 at 14:16 -0700, Bart Van Assche wrote:
> On Sat, 2017-10-07 at 12:33 +0800, Ming Lei wrote:
> > On Wed, Oct 04, 2017 at 05:01:10PM -0700, Bart Van Assche wrote:
> > >  int
> > >  scsi_device_quiesce(struct scsi_device *sdev)
> > >  {
> > > +	struct request_queue *q = sdev->request_queue;
> > >  	int err;
> > >  
> > > +	blk_mq_freeze_queue(q);
> > > +	if (blk_set_preempt_only(q)) {
> > > +		blk_mq_unfreeze_queue(q);
> > > +		return -EINVAL;
> > > +	}
> > 
> > This way is wrong: if blk_set_preempt_only() returns true
> > it means the queue is already in PREEMPT_ONLY mode,
> > and failing scsi_device_quiesce() can break suspend/resume or
> > the sending of SCSI domain validation commands.
> 
> That's an interesting comment but I propose that what you suggest is implemented
> through a separate patch. The above code preserves the existing behavior, namely
> that scsi_device_quiesce() returns an error code if called when a SCSI device has
> already been quiesced. See also scsi_device_set_state(). BTW, I don't think that
> it is likely that this will occur. The only code other than the power management
> code I know of that sets the SCSI QUIESCE state is the SCSI sysfs state store
> method and the SCSI parallel code. I don't know any software that sets the SCSI
> QUIESCE state through sysfs. And I'm not sure the SCSI parallel code has a
> significant number of users.

A correction: the current behavior when quiescing an already quiesced device is
that scsi_device_quiesce() returns 0 without changing any state. Anyway, I propose
to keep that behavior and not to add more complexity now.

Bart.

Ming Lei Oct. 10, 2017, 9:50 a.m. UTC | #4
On Mon, Oct 09, 2017 at 09:16:53PM +0000, Bart Van Assche wrote:
> On Sat, 2017-10-07 at 12:33 +0800, Ming Lei wrote:
> > On Wed, Oct 04, 2017 at 05:01:10PM -0700, Bart Van Assche wrote:
> > > It is essential during suspend and resume that neither the filesystem
> > > state nor the filesystem metadata in RAM changes. This is why, while
> > > the hibernation image is being written or restored, SCSI devices
> > 
> > Quiesce isn't used only for suspend and resume, and the issue isn't
> > suspend/resume specific either. So please change the title/commit log
> > to something like 'make SCSI quiesce more reliable/safe'.
> 
> OK, I will modify the patch title.
> 
> > > -		if (percpu_ref_tryget_live(&q->q_usage_counter))
> > > -			return 0;
> > > +		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
> > > +			/*
> > > +			 * The code that sets the PREEMPT_ONLY flag is
> > > +			 * responsible for ensuring that that flag is globally
> > > +			 * visible before the queue is unfrozen.
> > > +			 */
> > > +			if (preempt || !blk_queue_preempt_only(q)) {
> > 
> > PREEMPT_ONLY flag is checked without RCU read lock held, so the
> > synchronize_rcu() may just wait for completion of pre-exit
> > percpu_ref_tryget_live(), which can be reordered with the
> > reading on blk_queue_preempt_only().
> 
> OK, I will add rcu_read_lock_sched/unlock_sched() calls around this code.
> 
> > >  int
> > >  scsi_device_quiesce(struct scsi_device *sdev)
> > >  {
> > > +	struct request_queue *q = sdev->request_queue;
> > >  	int err;
> > >  
> > > +	blk_mq_freeze_queue(q);
> > > +	if (blk_set_preempt_only(q)) {
> > > +		blk_mq_unfreeze_queue(q);
> > > +		return -EINVAL;
> > > +	}
> > 
> > This way is wrong: if blk_set_preempt_only() returns true
> > it means the queue is already in PREEMPT_ONLY mode,
> > and failing scsi_device_quiesce() can break suspend/resume or
> > the sending of SCSI domain validation commands.
> 
> That's an interesting comment but I propose that what you suggest is implemented
> through a separate patch. The above code preserves the existing behavior, namely
> that scsi_device_quiesce() returns an error code if called when a SCSI device has
> already been quiesced. See also scsi_device_set_state(). BTW, I don't think that

Please see scsi_device_set_state():

        if (state == oldstate)
                return 0;

And believe it or not, there is one real nested call of scsi_device_quiesce()
in transport_spi.

> it is likely that this will occur. The only code other than the power management
> code I know of that sets the SCSI QUIESCE state is the SCSI sysfs state store
> method and the SCSI parallel code. I don't know any software that sets the SCSI
> QUIESCE state through sysfs. And I'm not sure the SCSI parallel code has a
> significant number of users.

As far as I know, one famous VM uses that, and the original report from our
customer is that this VM may hang after running I/O for days.
The revalidate path can trigger QUIESCE, and udev should cause that.

> 
> > > +	/*
> > > +	 * Ensure that the effect of blk_set_preempt_only() will be visible
> > > +	 * for percpu_ref_tryget() callers that occur after the queue
> > > +	 * unfreeze. See also https://lwn.net/Articles/573497/.
> > > +	 */
> > > +	synchronize_rcu();
> > 
> > This synchronize_rcu() may be saved if we set the PREEMPT_ONLY flag
> > before freezing the queue, since blk_mq_freeze_queue() may imply
> > one synchronize_rcu().
> 
> That's an interesting comment. I will look further into this.
> 
> > > @@ -2961,8 +2970,10 @@ void scsi_device_resume(struct scsi_device *sdev)
> > >  	 */
> > >  	mutex_lock(&sdev->state_mutex);
> > >  	if (sdev->sdev_state == SDEV_QUIESCE &&
> > > -	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
> > > +	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0) {
> > > +		blk_clear_preempt_only(sdev->request_queue);
> > >  		scsi_run_queue(sdev->request_queue);
> > > +	}
> > >  	mutex_unlock(&sdev->state_mutex);
> > 
> > scsi_run_queue() can be removed, and blk_clear_preempt_only() needn't
> > be run while holding sdev->state_mutex, just like in the quiesce path.
> 
> I will look into removing the scsi_run_queue() call. But I prefer to keep
> the blk_clear_preempt_only() inside the critical section because that call
> doesn't sleep and because it changes state information that is related to
> the SCSI state.

The thing is that the same state change in blk_set_preempt_only() can't
be protected by the lock, and this way may confuse people with respect to
inconsistent lock usage for protecting the same state.
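
For illustration, a sketch of the resume path this suggestion seems to imply
(an assumption, not code from the posted patch): clear PREEMPT_ONLY outside
the state mutex, mirroring how the quiesce path sets it.

void scsi_device_resume(struct scsi_device *sdev)
{
	/* Mirror the quiesce path: handle the flag outside sdev->state_mutex. */
	blk_clear_preempt_only(sdev->request_queue);

	mutex_lock(&sdev->state_mutex);
	if (sdev->sdev_state == SDEV_QUIESCE)
		scsi_device_set_state(sdev, SDEV_RUNNING);
	mutex_unlock(&sdev->state_mutex);
}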

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index b8d90fc29b35..81a4bb119d50 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -371,6 +371,7 @@  void blk_clear_preempt_only(struct request_queue *q)
 
 	spin_lock_irqsave(q->queue_lock, flags);
 	queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
+	wake_up_all(&q->mq_freeze_wq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
@@ -792,15 +793,34 @@  struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
-int blk_queue_enter(struct request_queue *q, bool nowait)
+/**
+ * blk_queue_enter() - try to increase q->q_usage_counter
+ * @q: request queue pointer
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ */
+int blk_queue_enter(struct request_queue *q, unsigned int flags)
 {
+	const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
+
 	while (true) {
 		int ret;
 
-		if (percpu_ref_tryget_live(&q->q_usage_counter))
-			return 0;
+		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
+			/*
+			 * The code that sets the PREEMPT_ONLY flag is
+			 * responsible for ensuring that that flag is globally
+			 * visible before the queue is unfrozen.
+			 */
+			if (preempt || !blk_queue_preempt_only(q)) {
+				return 0;
+			} else {
+				percpu_ref_put(&q->q_usage_counter);
+				WARN_ONCE("%s: Attempt to allocate non-preempt request in preempt-only mode.\n",
+					  kobject_name(q->kobj.parent));
+			}
+		}
 
-		if (nowait)
+		if (flags & BLK_MQ_REQ_NOWAIT)
 			return -EBUSY;
 
 		/*
@@ -813,7 +833,8 @@  int blk_queue_enter(struct request_queue *q, bool nowait)
 		smp_rmb();
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!atomic_read(&q->mq_freeze_depth) ||
+				(atomic_read(&q->mq_freeze_depth) == 0 &&
+				 (preempt || !blk_queue_preempt_only(q))) ||
 				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
@@ -1441,8 +1462,7 @@  static struct request *blk_old_get_request(struct request_queue *q,
 	/* create ioc upfront */
 	create_io_context(gfp_mask, q->node);
 
-	ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) ||
-			      (op & REQ_NOWAIT));
+	ret = blk_queue_enter(q, flags);
 	if (ret)
 		return ERR_PTR(ret);
 	spin_lock_irq(q->queue_lock);
@@ -2263,8 +2283,10 @@  blk_qc_t generic_make_request(struct bio *bio)
 	current->bio_list = bio_list_on_stack;
 	do {
 		struct request_queue *q = bio->bi_disk->queue;
+		unsigned int flags = bio->bi_opf & REQ_NOWAIT ?
+			BLK_MQ_REQ_NOWAIT : 0;
 
-		if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
+		if (likely(blk_queue_enter(q, flags) == 0)) {
 			struct bio_list lower, same;
 
 			/* Create a fresh bio_list for all subordinate requests */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 271657992d1a..1604bc2d4a57 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -386,7 +386,7 @@  struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 	struct request *rq;
 	int ret;
 
-	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
+	ret = blk_queue_enter(q, flags);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -425,7 +425,7 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	if (hctx_idx >= q->nr_hw_queues)
 		return ERR_PTR(-EIO);
 
-	ret = blk_queue_enter(q, true);
+	ret = blk_queue_enter(q, flags);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 17ec83bb0900..b75d975cc5a5 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -134,7 +134,7 @@  void blk_timeout_work(struct work_struct *work)
 	struct request *rq, *tmp;
 	int next_set = 0;
 
-	if (blk_queue_enter(q, true))
+	if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT))
 		return;
 	spin_lock_irqsave(q->queue_lock, flags);
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1c16a247fae6..0ba7af5debc7 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2926,21 +2926,30 @@  static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
 int
 scsi_device_quiesce(struct scsi_device *sdev)
 {
+	struct request_queue *q = sdev->request_queue;
 	int err;
 
+	blk_mq_freeze_queue(q);
+	if (blk_set_preempt_only(q)) {
+		blk_mq_unfreeze_queue(q);
+		return -EINVAL;
+	}
+	/*
+	 * Ensure that the effect of blk_set_preempt_only() will be visible
+	 * for percpu_ref_tryget() callers that occur after the queue
+	 * unfreeze. See also https://lwn.net/Articles/573497/.
+	 */
+	synchronize_rcu();
+	blk_mq_unfreeze_queue(q);
+
 	mutex_lock(&sdev->state_mutex);
 	err = scsi_device_set_state(sdev, SDEV_QUIESCE);
 	mutex_unlock(&sdev->state_mutex);
 
 	if (err)
-		return err;
+		blk_clear_preempt_only(q);
 
-	scsi_run_queue(sdev->request_queue);
-	while (atomic_read(&sdev->device_busy)) {
-		msleep_interruptible(200);
-		scsi_run_queue(sdev->request_queue);
-	}
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL(scsi_device_quiesce);
 
@@ -2961,8 +2970,10 @@  void scsi_device_resume(struct scsi_device *sdev)
 	 */
 	mutex_lock(&sdev->state_mutex);
 	if (sdev->sdev_state == SDEV_QUIESCE &&
-	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
+	    scsi_device_set_state(sdev, SDEV_RUNNING) == 0) {
+		blk_clear_preempt_only(sdev->request_queue);
 		scsi_run_queue(sdev->request_queue);
+	}
 	mutex_unlock(&sdev->state_mutex);
 }
 EXPORT_SYMBOL(scsi_device_resume);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 93d088ffc05c..98cf2d7ee9d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -674,7 +674,7 @@  int bdev_read_page(struct block_device *bdev, sector_t sector,
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return result;
 
-	result = blk_queue_enter(bdev->bd_queue, false);
+	result = blk_queue_enter(bdev->bd_queue, 0);
 	if (result)
 		return result;
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
@@ -710,7 +710,7 @@  int bdev_write_page(struct block_device *bdev, sector_t sector,
 
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return -EOPNOTSUPP;
-	result = blk_queue_enter(bdev->bd_queue, false);
+	result = blk_queue_enter(bdev->bd_queue, 0);
 	if (result)
 		return result;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c4174f78c6eb..9f21a8bbacc6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -972,7 +972,7 @@  extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
-extern int blk_queue_enter(struct request_queue *q, bool nowait);
+extern int blk_queue_enter(struct request_queue *q, unsigned int flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_start_queue(struct request_queue *q);
 extern void blk_start_queue_async(struct request_queue *q);