
[V3,7/8] block: allow to allocate req with RQF_PREEMPT when queue is preempt frozen

Message ID 20170902130840.24609-8-ming.lei@redhat.com (mailing list archive)
State Not Applicable, archived

Commit Message

Ming Lei Sept. 2, 2017, 1:08 p.m. UTC
RQF_PREEMPT is a bit special because such a request is required
to be dispatched to the LLD even when the SCSI device is quiesced.

So this patch introduces __blk_get_request() to allow the block
layer to allocate a request while the queue is preempt frozen, since
the following patch will preempt freeze the queue before quiescing
the SCSI device, in order to support safe SCSI quiescing.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-core.c       | 28 ++++++++++++++++++++--------
 block/blk-mq.c         | 14 ++++++++++++--
 include/linux/blk-mq.h |  7 ++++---
 include/linux/blkdev.h | 17 +++++++++++++++--
 4 files changed, 51 insertions(+), 15 deletions(-)
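
For illustration, a minimal sketch (not part of this patch) of how a caller
that must reach a quiesced device might use the new interface; the op, gfp
mask and error handling below are assumptions based on the 4.13-era API:

	struct request *rq;

	/* BLK_REQ_PREEMPT lets the allocation proceed even while the
	 * queue is preempt frozen; plain blk_get_request() passes 0. */
	rq = __blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN,
			       GFP_NOIO, BLK_REQ_PREEMPT);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	rq->rq_flags |= RQF_PREEMPT;	/* dispatch even when quiesced */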

Comments

Ming Lei Sept. 2, 2017, 1:12 p.m. UTC | #1
On Sat, Sep 02, 2017 at 09:08:39PM +0800, Ming Lei wrote:
> RQF_PREEMPT is a bit special because such a request is required
> to be dispatched to the LLD even when the SCSI device is quiesced.
> 
> So this patch introduces __blk_get_request() to allow the block
> layer to allocate a request while the queue is preempt frozen, since
> the following patch will preempt freeze the queue before quiescing
> the SCSI device, in order to support safe SCSI quiescing.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-core.c       | 28 ++++++++++++++++++++--------
>  block/blk-mq.c         | 14 ++++++++++++--
>  include/linux/blk-mq.h |  7 ++++---
>  include/linux/blkdev.h | 17 +++++++++++++++--
>  4 files changed, 51 insertions(+), 15 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 2549b0a0535d..f7a6fbb87dea 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1404,7 +1404,8 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
>  }
>  
>  static struct request *blk_old_get_request(struct request_queue *q,
> -					   unsigned int op, gfp_t gfp_mask)
> +					   unsigned int op, gfp_t gfp_mask,
> +					   unsigned int flags)
>  {
>  	struct request *rq;
>  	int ret = 0;
> @@ -1414,9 +1415,20 @@ static struct request *blk_old_get_request(struct request_queue *q,
>  	/* create ioc upfront */
>  	create_io_context(gfp_mask, q->node);
>  
> -	ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
> +	/*
> +	 * We need to allocate req of RQF_PREEMPT in preempt freezing.
> +	 * No normal freezing can be started when preempt freezing
> +	 * is in-progress, and queue dying is checked before starting
> +	 * preempt freezing, so it is safe to use blk_queue_enter_live()
> +	 * in case of preempt freezing.
> +	 */
> +	if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
> +		blk_queue_enter_live(q);
> +	else
> +		ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
>  	if (ret)
>  		return ERR_PTR(ret);
> +
>  	spin_lock_irq(q->queue_lock);
>  	rq = get_request(q, op, NULL, gfp_mask);
>  	if (IS_ERR(rq)) {
> @@ -1432,26 +1444,26 @@ static struct request *blk_old_get_request(struct request_queue *q,
>  	return rq;
>  }
>  
> -struct request *blk_get_request(struct request_queue *q, unsigned int op,
> -				gfp_t gfp_mask)
> +struct request *__blk_get_request(struct request_queue *q, unsigned int op,
> +				  gfp_t gfp_mask, unsigned int flags)
>  {
>  	struct request *req;
>  
>  	if (q->mq_ops) {
>  		req = blk_mq_alloc_request(q, op,
> -			(gfp_mask & __GFP_DIRECT_RECLAIM) ?
> -				0 : BLK_MQ_REQ_NOWAIT);
> +			flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ?
> +				0 : BLK_MQ_REQ_NOWAIT));
>  		if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
>  			q->mq_ops->initialize_rq_fn(req);
>  	} else {
> -		req = blk_old_get_request(q, op, gfp_mask);
> +		req = blk_old_get_request(q, op, gfp_mask, flags);
>  		if (!IS_ERR(req) && q->initialize_rq_fn)
>  			q->initialize_rq_fn(req);
>  	}
>  
>  	return req;
>  }
> -EXPORT_SYMBOL(blk_get_request);
> +EXPORT_SYMBOL(__blk_get_request);
>  
>  /**
>   * blk_requeue_request - put a request back on queue
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 54b8d8b9f40e..e81001d1da27 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -496,9 +496,19 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
>  {
>  	struct blk_mq_alloc_data alloc_data = { .flags = flags };
>  	struct request *rq;
> -	int ret;
> +	int ret = 0;
>  
> -	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
> +	/*
> +	 * We need to allocate req of RQF_PREEMPT in preempt freezing.
> +	 * No normal freezing can be started when preempt freezing
> +	 * is in-progress, and queue dying is checked before starting
> +	 * preempt freezing, so it is safe to use blk_queue_enter_live()
> +	 * in case of preempt freezing.
> +	 */
> +	if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
> +		blk_queue_enter_live(q);
> +	else
> +		ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 5ae8c82d6273..596f433eb54c 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -200,9 +200,10 @@ void blk_mq_free_request(struct request *rq);
>  bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
>  
>  enum {
> -	BLK_MQ_REQ_NOWAIT	= (1 << 0), /* return when out of requests */
> -	BLK_MQ_REQ_RESERVED	= (1 << 1), /* allocate from reserved pool */
> -	BLK_MQ_REQ_INTERNAL	= (1 << 2), /* allocate internal/sched tag */
> +	BLK_MQ_REQ_PREEMPT	= BLK_REQ_PREEMPT, /* allocate for RQF_PREEMPT */
> +	BLK_MQ_REQ_NOWAIT	= (1 << 8), /* return when out of requests */
> +	BLK_MQ_REQ_RESERVED	= (1 << 9), /* allocate from reserved pool */
> +	BLK_MQ_REQ_INTERNAL	= (1 << 10), /* allocate internal/sched tag */
>  };
>  
>  struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 5618d174100a..ff371c42eb3f 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -862,6 +862,11 @@ enum {
>  	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
>  };
>  
> +/* passed to __blk_get_request */
> +enum {
> +	BLK_REQ_PREEMPT	= (1 << 0), /* allocate for RQF_PREEMPT */
> +};
> +
>  extern unsigned long blk_max_low_pfn, blk_max_pfn;
>  
>  /*
> @@ -944,8 +949,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
>  extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
>  extern void blk_put_request(struct request *);
>  extern void __blk_put_request(struct request_queue *, struct request *);
> -extern struct request *blk_get_request(struct request_queue *, unsigned int op,
> -				       gfp_t gfp_mask);
> +extern struct request *__blk_get_request(struct request_queue *,
> +					 unsigned int op, gfp_t gfp_mask,
> +					 unsigned int flags);
>  extern void blk_requeue_request(struct request_queue *, struct request *);
>  extern int blk_lld_busy(struct request_queue *q);
>  extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
> @@ -996,6 +1002,13 @@ blk_status_t errno_to_blk_status(int errno);
>  
>  bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
>  
> +static inline struct request *blk_get_request(struct request_queue *q,
> +					      unsigned int op,
> +					      gfp_t gfp_mask)
> +{
> +	return __blk_get_request(q, op, gfp_mask, 0);
> +}
> +
>  static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
>  {
>  	return bdev->bd_disk->queue;	/* this is never NULL */
> -- 
> 2.9.5
> 

Hi Bart,

Please let us know if V3 addresses your previous concern about calling
blk_queue_enter_live() during preempt freezing.


Thanks,
Ming
Bart Van Assche Sept. 4, 2017, 4:13 a.m. UTC | #2
On Sat, 2017-09-02 at 21:12 +0800, Ming Lei wrote:
> Please let us know if V3 addresses your previous concern about calling
> blk_queue_enter_live() during preempt freezing.


Do you understand how request queue cleanup works? The algorithm used for
request queue cleanup is as follows:
* Set the DYING flag. This flag makes all later blk_get_request() calls
  fail.
* Wait until all pending requests have finished.
* Set the DEAD flag. For the traditional block layer, this flag causes
  blk_run_queue() not to call .request_fn() anymore. For blk-mq it is
  guaranteed in another way that .queue_rq() won't be called anymore after
  this flag has been set.

Allowing blk_get_request() to succeed after the DYING flag has been set is
completely wrong because that could result in a request being queued after
the DEAD flag has been set, resulting in either a hanging request or a kernel
crash. This is why it's completely wrong to add a blk_queue_enter_live() call
in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any
patch that adds a blk_queue_enter_live() call to any function called from
blk_get_request(). That includes the patch at the start of this e-mail thread.
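
In code terms, the ordering described above corresponds roughly to the
following simplified sketch (loosely modeled on blk_cleanup_queue(); most
details elided):

	void cleanup_queue_sketch(struct request_queue *q)
	{
		blk_set_queue_dying(q);	/* DYING: later blk_get_request() fails */
		blk_freeze_queue(q);	/* wait for pending requests to finish */
		/* DEAD: no more .request_fn()/.queue_rq() calls after this */
		queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
	}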

Bart.
Ming Lei Sept. 4, 2017, 7:16 a.m. UTC | #3
On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote:
> On Sat, 2017-09-02 at 21:12 +0800, Ming Lei wrote:
> > Please let us know if V3 addresses your previous concern about calling
> > blk_queue_enter_live() during preempt freezing.
> 
> Do you understand how request queue cleanup works? The algorithm used for
> request queue cleanup is as follows:
> * Set the DYING flag. This flag makes all later blk_get_request() calls
>   fail.

If you look at __blk_freeze_queue_start(), you will see that the preempt
freeze fails once the queue is dying, and blk_queue_enter_live() won't
be called at all if the queue is dying, right?

> * Wait until all pending requests have finished.
> * Set the DEAD flag. For the traditional block layer, this flag causes
>   blk_run_queue() not to call .request_fn() anymore. For blk-mq it is
>   guaranteed in another way that .queue_rq() won't be called anymore after
>   this flag has been set.
> 
> Allowing blk_get_request() to succeed after the DYING flag has been set is
> completely wrong because that could result in a request being queued after

See above, this patch changes nothing about this fact, please look at
the patch carefully next time just before posting your long comment.

> the DEAD flag has been set, resulting in either a hanging request or a kernel
> crash. This is why it's completely wrong to add a blk_queue_enter_live() call
> in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any
> patch that adds a blk_queue_enter_live() call to any function called from
> blk_get_request(). That includes the patch at the start of this e-mail thread.
> 
> Bart.
Bart Van Assche Sept. 4, 2017, 3:40 p.m. UTC | #4
On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote:
> On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote:

> > Allowing blk_get_request() to succeed after the DYING flag has been set is

> > completely wrong because that could result in a request being queued after

> > the DEAD flag has been set, resulting in either a hanging request or a kernel

> > crash. This is why it's completely wrong to add a blk_queue_enter_live() call

> > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any

> > patch that adds a blk_queue_enter_live() call to any function called from

> > blk_get_request(). That includes the patch at the start of this e-mail thread.

>

> See above, this patch changes nothing about this fact, please look at

> the patch carefully next time just before posting your long comment.


Are you really sure that your patch does not allow blk_get_request() to
succeed after the DYING flag has been set? blk_mq_alloc_request() calls both
blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding
any lock. A thread that is running concurrently with blk_mq_get_request()
can unfreeze the queue after blk_queue_is_preempt_frozen() returned and
before blk_queue_enter_live() is called. This means that with your patch
series applied blk_get_request() can succeed after the DYING flag has been
set, which is something we don't want. Additionally, I don't think we want
to introduce any kind of locking in blk_mq_get_request() because that would
be a serialization point.
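
As a hypothetical interleaving, the race described above would look like
this (timeline for illustration only; function names from this patch series):

	CPU0 (blk_mq_alloc_request)             CPU1
	-------------------------------------   ---------------------------
	blk_queue_is_preempt_frozen() -> true
	                                        preempt unfreeze finishes
	                                        DYING flag is set
	blk_queue_enter_live()  /* succeeds although the queue is dying */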

Have you considered to use the blk-mq "reserved request" mechanism to avoid
starvation of power management requests instead of making the block layer
even more complicated than it already is?

Note: extending blk_mq_freeze/unfreeze_queue() to the legacy block layer
could be useful to make scsi_wait_for_queuecommand() more elegant. However,
I don't think we should spend our time on legacy block layer / SCSI core
changes. The code I'm referring to is the following:

/**
 * scsi_wait_for_queuecommand() - wait for ongoing queuecommand() calls
 * @sdev: SCSI device pointer.
 *
 * Wait until the ongoing shost->hostt->queuecommand() calls that are
 * invoked from scsi_request_fn() have finished.
 */
static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
{
	WARN_ON_ONCE(sdev->host->use_blk_mq);

	while (scsi_request_fn_active(sdev))
		msleep(20);
}
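
If freezing covered the legacy path, the same guarantee could hypothetically
be expressed without polling, e.g. (sketch only, assuming the blk-mq
freeze/unfreeze pair worked for !mq queues):

	static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
	{
		struct request_queue *q = sdev->request_queue;

		blk_mq_freeze_queue(q);		/* waits for in-flight dispatch */
		blk_mq_unfreeze_queue(q);
	}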

Bart.
Ming Lei Sept. 4, 2017, 4:08 p.m. UTC | #5
On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote:
> On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote:
> > On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote:
> > > Allowing blk_get_request() to succeed after the DYING flag has been set is
> > > completely wrong because that could result in a request being queued after
> > > the DEAD flag has been set, resulting in either a hanging request or a kernel
> > > crash. This is why it's completely wrong to add a blk_queue_enter_live() call
> > > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any
> > > patch that adds a blk_queue_enter_live() call to any function called from
> > > blk_get_request(). That includes the patch at the start of this e-mail thread.
> >
> > See above, this patch changes nothing about this fact, please look at
> > the patch carefully next time just before posting your long comment.
> 
> Are you really sure that your patch does not allow blk_get_request() to
> succeed after the DYING flag has been set? blk_mq_alloc_request() calls both

Yeah, I am pretty sure.

Firstly blk_queue_freeze_preempt() is exclusive, that means it will wait
for completion of all pending freezing(both normal and preempt), and other
freezing can't be started too if there is in-progress preempt
freezing, actually it is a typical read/write lock use case, but
we need to support nested normal freezing, so we can't use rwsem.

Also the DYING flag is checked before starting a preempt freeze: the
API returns, and the preempt_freezing flag isn't set, if DYING is set.

Secondly, after preempt freezing is started:

	- for the block legacy path, dying is always tested at the entry of
	__get_request(), so no new request is allocated after the queue is dying.

	- for blk-mq, it is normal for the DYING flag to be set just
	between blk_queue_enter() and allocating the request, because
	we depend on the LLD to handle that case. We could harden this
	by checking the dying flag in blk_queue_enter(), but that is just
	an improvement; it doesn't mean V3 isn't correct.

> blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding
> any lock. A thread that is running concurrently with blk_mq_get_request()
> can unfreeze the queue after blk_queue_is_preempt_frozen() returned and
> before blk_queue_enter_live() is called. This means that with your patch

Preempt freezing is exclusive, so no other freeze can be started at all,
hence the issue you are worried about cannot happen.

> series applied blk_get_request() can succeed after the DYING flag has been
> set, which is something we don't want. Additionally, I don't think we want
> to introduce any kind of locking in blk_mq_get_request() because that would
> be a serialization point.

That needn't be worried about: as you saw, we can check
percpu_ref_is_dying() first, and only then acquire the lock to check
the flags when the queue is freezing.

> 
> Have you considered to use the blk-mq "reserved request" mechanism to avoid
> starvation of power management requests instead of making the block layer
> even more complicated than it already is?

reserved request is really a bad idea, that means the reserved request
can't be used for normal I/O, we all know the request/tag space is
precious, and some device has a quite small tag space, such as sata.
This way will affect performance definitely.

Also I don't think the approach is complicated; actually the idea
is simple, and the implementation isn't complicated either.

> 
> Note: extending blk_mq_freeze/unfreeze_queue() to the legacy block layer
> could be useful to make scsi_wait_for_queuecommand() more elegant. However,
> I don't think we should spend our time on legacy block layer / SCSI core
> changes. The code I'm referring to is the following:

That can be a side product of this approach, but this patchset just
focuses on fixing the I/O hang.
Bart Van Assche Sept. 4, 2017, 4:18 p.m. UTC | #6
On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote:
> On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote:

> > On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote:

> > > On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote:

> > > > Allowing blk_get_request() to succeed after the DYING flag has been set is

> > > > completely wrong because that could result in a request being queued after

> > > > the DEAD flag has been set, resulting in either a hanging request or a kernel

> > > > crash. This is why it's completely wrong to add a blk_queue_enter_live() call

> > > > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any

> > > > patch that adds a blk_queue_enter_live() call to any function called from

> > > > blk_get_request(). That includes the patch at the start of this e-mail thread.

> > > 

> > > See above, this patch changes nothing about this fact, please look at

> > > the patch carefully next time just before posting your long comment.

> > 

> > Are you really sure that your patch does not allow blk_get_request() to

> > succeed after the DYING flag has been set? blk_mq_alloc_request() calls both

> > blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding

> > any lock. A thread that is running concurrently with blk_mq_get_request()

> > can unfreeze the queue after blk_queue_is_preempt_frozen() returned and

> > before blk_queue_enter_live() is called. This means that with your patch

> > series applied blk_get_request() can succeed after the DYING flag has been

> > set, which is something we don't want. Additionally, I don't think we want

> > to introduce any kind of locking in blk_mq_get_request() because that would

> > be a serialization point.

>

> Yeah, I am pretty sure.

> 

> Firstly blk_queue_freeze_preempt() is exclusive, that means it will wait

> for completion of all pending freezing(both normal and preempt), and other

> freezing can't be started too if there is in-progress preempt

> freezing, actually it is a typical read/write lock use case, but

> we need to support nested normal freezing, so we can't use rwsem. 


You seem to overlook that blk_get_request() can be called from another thread
than the thread that is performing the freezing and unfreezing.

Bart.
Ming Lei Sept. 4, 2017, 4:28 p.m. UTC | #7
On Mon, Sep 04, 2017 at 04:18:32PM +0000, Bart Van Assche wrote:
> On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote:
> > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote:
> > > On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote:
> > > > On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote:
> > > > > Allowing blk_get_request() to succeed after the DYING flag has been set is
> > > > > completely wrong because that could result in a request being queued after
> > > > > the DEAD flag has been set, resulting in either a hanging request or a kernel
> > > > > crash. This is why it's completely wrong to add a blk_queue_enter_live() call
> > > > > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any
> > > > > patch that adds a blk_queue_enter_live() call to any function called from
> > > > > blk_get_request(). That includes the patch at the start of this e-mail thread.
> > > > 
> > > > See above, this patch changes nothing about this fact, please look at
> > > > the patch carefully next time just before posting your long comment.
> > > 
> > > Are you really sure that your patch does not allow blk_get_request() to
> > > succeed after the DYING flag has been set? blk_mq_alloc_request() calls both
> > > blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding
> > > any lock. A thread that is running concurrently with blk_mq_get_request()
> > > can unfreeze the queue after blk_queue_is_preempt_frozen() returned and
> > > before blk_queue_enter_live() is called. This means that with your patch
> > > series applied blk_get_request() can succeed after the DYING flag has been
> > > set, which is something we don't want. Additionally, I don't think we want
> > > to introduce any kind of locking in blk_mq_get_request() because that would
> > > be a serialization point.
> >
> > Yeah, I am pretty sure.
> > 
> > Firstly blk_queue_freeze_preempt() is exclusive, that means it will wait
> > for completion of all pending freezing(both normal and preempt), and other
> > freezing can't be started too if there is in-progress preempt
> > freezing, actually it is a typical read/write lock use case, but
> > we need to support nested normal freezing, so we can't use rwsem. 
> 
> You seem to overlook that blk_get_request() can be called from another thread
> than the thread that is performing the freezing and unfreezing.

The freezing state of the queue can be observed from any context, so this
issue isn't related to which thread does the freezing. Please see
blk_queue_is_preempt_frozen(), which is called from blk_get_request():

        /* fast path: the queue isn't frozen at all */
        if (!percpu_ref_is_dying(&q->q_usage_counter))
                return false;

        /* frozen: check under the lock whether it is a preempt freeze
         * that has not started unfreezing yet */
        spin_lock(&q->freeze_lock);
        preempt_frozen = q->preempt_freezing;
        preempt_unfreezing = q->preempt_unfreezing;
        spin_unlock(&q->freeze_lock);

        return preempt_frozen && !preempt_unfreezing;


If the queue is in preempt freezing, blk_queue_enter_live() is called
before allocating the request; otherwise blk_queue_enter() is called.

BTW, we can add a check for queue dying to this helper as one further
improvement.
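
A sketch of that suggested improvement (the freeze fields are from this
series, not mainline; the dying check is the new part):

        return preempt_frozen && !preempt_unfreezing &&
                !blk_queue_dying(q);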
Bart Van Assche Sept. 5, 2017, 1:40 a.m. UTC | #8
On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote:
> On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote:

> > Have you considered to use the blk-mq "reserved request" mechanism to avoid

> > starvation of power management requests instead of making the block layer

> > even more complicated than it already is?

> 

> reserved request is really a bad idea, that means the reserved request

> can't be used for normal I/O, we all know the request/tag space is

> precious, and some device has a quite small tag space, such as sata.

> This way will affect performance definitely.


Sorry but I'm neither convinced that reserving a request for power management 
would be a bad idea nor that it would have a significant performance impact nor
that it would be complicated to implement. Have you noticed that the Linux ATA
implementation already reserves a request for internal use and thereby reduces
the queue depth from 32 to 31 (see also ATA_TAG_INTERNAL)? What I would like to
know is whether the performance impact of reserving a request is more or less
than 1%.

Bart.
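
For reference, a minimal sketch of the reserved-tag mechanism under
discussion (standard blk-mq API of that era; the values are illustrative):

	struct blk_mq_tag_set set = {
		.queue_depth	= 32,
		.reserved_tags	= 1,	/* held back from normal I/O */
		/* .ops, .nr_hw_queues, ... */
	};

	/* a PM request would then be allocated from the reserved pool: */
	rq = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_RESERVED);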
Ming Lei Sept. 5, 2017, 2:23 a.m. UTC | #9
On Tue, Sep 05, 2017 at 01:40:11AM +0000, Bart Van Assche wrote:
> On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote:
> > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote:
> > > Have you considered to use the blk-mq "reserved request" mechanism to avoid
> > > starvation of power management requests instead of making the block layer
> > > even more complicated than it already is?
> > 
> > reserved request is really a bad idea, that means the reserved request
> > can't be used for normal I/O, we all know the request/tag space is
> > precious, and some device has a quite small tag space, such as sata.
> > This way will affect performance definitely.
> 
> Sorry but I'm neither convinced that reserving a request for power management 
> would be a bad idea nor that it would have a significant performance impact nor
> that it would be complicated to implement. Have you noticed that the Linux ATA
> implementation already reserves a request for internal use and thereby reduces
> the queue depth from 32 to 31 (see also ATA_TAG_INTERNAL)? What I would like to
> know is whether the performance impact of reserving a request is more or less
> than 1%.

Firstly we really can avoid the reservation: why do we have to
waste one precious tag just for PM, which may never even happen
during a machine's uptime? For SATA, the internal tag is for EH,
and I believe that reservation is inevitable.

Secondly reserving one tag may decrease the concurrent I/O by 1,
which definitely hurts performance, especially for random I/O. Think
about why NVMe increases its queue depth so much. Not to mention
there are some devices which have only one tag (.can_queue is 1);
how can you reserve one tag on that kind of device?

Finally a bad result will follow if you reserve one tag for PM only.
Supposing it is doable to reserve one tag, did you consider
how to do that? Tags can be shared host-wide, so do you want to
reserve one tag just for one request_queue?
	- If yes, lots of tags can be reserved/wasted for the unusual PM
	or similar commands; even worse, the whole tag space of the HBA may
	not be enough for the reservation if there are lots of LUNs in
	the HBA.

	- If not, and you just reserve one tag for the whole HBA, then all
	PM commands share the one reservation. During suspend/resume, all
	these PM commands have to run exclusively (serialized) for the disks
	attached to the HBA, and that will slow down suspend/resume very
	much because there may be lots of LUNs in the HBA.

That is why I said reserving one tag is really bad, isn't it?


Thanks,
Ming
Ming Lei Sept. 8, 2017, 3:08 a.m. UTC | #10
On Tue, Sep 05, 2017 at 10:23:25AM +0800, Ming Lei wrote:
> On Tue, Sep 05, 2017 at 01:40:11AM +0000, Bart Van Assche wrote:
> > On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote:
> > > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote:
> > > > Have you considered to use the blk-mq "reserved request" mechanism to avoid
> > > > starvation of power management requests instead of making the block layer
> > > > even more complicated than it already is?
> > > 
> > > reserved request is really a bad idea, that means the reserved request
> > > can't be used for normal I/O, we all know the request/tag space is
> > > precious, and some device has a quite small tag space, such as sata.
> > > This way will affect performance definitely.
> > 
> > Sorry but I'm neither convinced that reserving a request for power management 
> > would be a bad idea nor that it would have a significant performance impact nor
> > that it would be complicated to implement. Have you noticed that the Linux ATA
> > implementation already reserves a request for internal use and thereby reduces
> > the queue depth from 32 to 31 (see also ATA_TAG_INTERNAL)? What I would like to
> > know is whether the performance impact of reserving a request is more or less
> > than 1%.
> 
> Firstly we really can avoid the reservation: why do we have to
> waste one precious tag just for PM, which may never even happen
> during a machine's uptime? For SATA, the internal tag is for EH,
> and I believe that reservation is inevitable.
> 
> Secondly reserving one tag may decrease the concurrent I/O by 1,
> which definitely hurts performance, especially for random I/O. Think
> about why NVMe increases its queue depth so much. Not to mention
> there are some devices which have only one tag (.can_queue is 1);
> how can you reserve one tag on that kind of device?
> 
> Finally a bad result will follow if you reserve one tag for PM only.
> Supposing it is doable to reserve one tag, did you consider
> how to do that? Tags can be shared host-wide, so do you want to
> reserve one tag just for one request_queue?
> 	- If yes, lots of tags can be reserved/wasted for the unusual PM
> 	or similar commands; even worse, the whole tag space of the HBA may
> 	not be enough for the reservation if there are lots of LUNs in
> 	the HBA.
> 
> 	- If not, and you just reserve one tag for the whole HBA, then all
> 	PM commands share the one reservation. During suspend/resume, all
> 	these PM commands have to run exclusively (serialized) for the disks
> 	attached to the HBA, and that will slow down suspend/resume very
> 	much because there may be lots of LUNs in the HBA.
> 
> That is why I said reserving one tag is really bad, isn't it?

Hi Bart,

Looks I replied or clarified all your questions/comments on this
patchset.

So could you let me know if you still object to this approach or
patchset?

If not, I think we need to move on and fix the real issue.
Bart Van Assche Sept. 8, 2017, 5:28 p.m. UTC | #11
On Fri, 2017-09-08 at 11:08 +0800, Ming Lei wrote:
> Looks I replied or clarified all your questions/comments on this
> patchset.

No, you have not addressed all my comments, you know that you have not
addressed all my comments so you should not have written that you have
addressed all my comments. This patch series not only introduces ugly
changes in the request queue freeze mechanism but it also introduces an
unacceptable race condition between blk_get_request() and request queue
cleanup.

BTW, you don't have to spend more time on this patch series. I have
implemented an alternative and much cleaner approach to fix SCSI device
suspend. I'm currently testing that patch series.

Bart.
Ming Lei Sept. 9, 2017, 7:21 a.m. UTC | #12
On Fri, Sep 08, 2017 at 05:28:16PM +0000, Bart Van Assche wrote:
> On Fri, 2017-09-08 at 11:08 +0800, Ming Lei wrote:
> > Looks I replied or clarified all your questions/comments on this
> > patchset.
> 
> No, you have not addressed all my comments, you know that you have not
> addressed all my comments so you should not have written that you have

I do not know.

> addressed all my comments. This patch series not only introduces ugly
> changes in the request queue freeze mechanism but it also introduces an
> unacceptable race condition between blk_get_request() and request queue
> cleanup.

So what is the race? Could you reply in the original thread?

> 
> BTW, you don't have to spend more time on this patch series. I have
> implemented an alternative and much cleaner approach to fix SCSI device
> suspend. I'm currently testing that patch series.

You do not understand the issue at all; it is not only related to
suspend. Please take a look at scsi_execute(): each request issued
via scsi_execute() needs to be dispatched successfully even when the
SCSI device is quiesced.
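
For context, scsi_execute() at that time allocated its request roughly as
follows (simplified from drivers/scsi/scsi_lib.c; error handling elided),
which is why such allocations must keep succeeding while the device is
quiesced:

	req = blk_get_request(sdev->request_queue,
			data_direction == DMA_TO_DEVICE ?
			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
	...
	req->rq_flags |= flags | RQF_QUIET | RQF_PREEMPT;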

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 2549b0a0535d..f7a6fbb87dea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1404,7 +1404,8 @@  static struct request *get_request(struct request_queue *q, unsigned int op,
 }
 
 static struct request *blk_old_get_request(struct request_queue *q,
-					   unsigned int op, gfp_t gfp_mask)
+					   unsigned int op, gfp_t gfp_mask,
+					   unsigned int flags)
 {
 	struct request *rq;
 	int ret = 0;
@@ -1414,9 +1415,20 @@  static struct request *blk_old_get_request(struct request_queue *q,
 	/* create ioc upfront */
 	create_io_context(gfp_mask, q->node);
 
-	ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
+	/*
+	 * We need to allocate req of RQF_PREEMPT in preempt freezing.
+	 * No normal freezing can be started when preempt freezing
+	 * is in-progress, and queue dying is checked before starting
+	 * preempt freezing, so it is safe to use blk_queue_enter_live()
+	 * in case of preempt freezing.
+	 */
+	if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
+		blk_queue_enter_live(q);
+	else
+		ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
 	if (ret)
 		return ERR_PTR(ret);
+
 	spin_lock_irq(q->queue_lock);
 	rq = get_request(q, op, NULL, gfp_mask);
 	if (IS_ERR(rq)) {
@@ -1432,26 +1444,26 @@  static struct request *blk_old_get_request(struct request_queue *q,
 	return rq;
 }
 
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-				gfp_t gfp_mask)
+struct request *__blk_get_request(struct request_queue *q, unsigned int op,
+				  gfp_t gfp_mask, unsigned int flags)
 {
 	struct request *req;
 
 	if (q->mq_ops) {
 		req = blk_mq_alloc_request(q, op,
-			(gfp_mask & __GFP_DIRECT_RECLAIM) ?
-				0 : BLK_MQ_REQ_NOWAIT);
+			flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ?
+				0 : BLK_MQ_REQ_NOWAIT));
 		if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
 			q->mq_ops->initialize_rq_fn(req);
 	} else {
-		req = blk_old_get_request(q, op, gfp_mask);
+		req = blk_old_get_request(q, op, gfp_mask, flags);
 		if (!IS_ERR(req) && q->initialize_rq_fn)
 			q->initialize_rq_fn(req);
 	}
 
 	return req;
 }
-EXPORT_SYMBOL(blk_get_request);
+EXPORT_SYMBOL(__blk_get_request);
 
 /**
  * blk_requeue_request - put a request back on queue
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 54b8d8b9f40e..e81001d1da27 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -496,9 +496,19 @@  struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 {
 	struct blk_mq_alloc_data alloc_data = { .flags = flags };
 	struct request *rq;
-	int ret;
+	int ret = 0;
 
-	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
+	/*
+	 * We need to allocate req of RQF_PREEMPT in preempt freezing.
+	 * No normal freezing can be started when preempt freezing
+	 * is in-progress, and queue dying is checked before starting
+	 * preempt freezing, so it is safe to use blk_queue_enter_live()
+	 * in case of preempt freezing.
+	 */
+	if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
+		blk_queue_enter_live(q);
+	else
+		ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5ae8c82d6273..596f433eb54c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -200,9 +200,10 @@  void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
-	BLK_MQ_REQ_NOWAIT	= (1 << 0), /* return when out of requests */
-	BLK_MQ_REQ_RESERVED	= (1 << 1), /* allocate from reserved pool */
-	BLK_MQ_REQ_INTERNAL	= (1 << 2), /* allocate internal/sched tag */
+	BLK_MQ_REQ_PREEMPT	= BLK_REQ_PREEMPT, /* allocate for RQF_PREEMPT */
+	BLK_MQ_REQ_NOWAIT	= (1 << 8), /* return when out of requests */
+	BLK_MQ_REQ_RESERVED	= (1 << 9), /* allocate from reserved pool */
+	BLK_MQ_REQ_INTERNAL	= (1 << 10), /* allocate internal/sched tag */
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5618d174100a..ff371c42eb3f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -862,6 +862,11 @@  enum {
 	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
 };
 
+/* passed to __blk_get_request */
+enum {
+	BLK_REQ_PREEMPT	= (1 << 0), /* allocate for RQF_PREEMPT */
+};
+
 extern unsigned long blk_max_low_pfn, blk_max_pfn;
 
 /*
@@ -944,8 +949,9 @@  extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
-extern struct request *blk_get_request(struct request_queue *, unsigned int op,
-				       gfp_t gfp_mask);
+extern struct request *__blk_get_request(struct request_queue *,
+					 unsigned int op, gfp_t gfp_mask,
+					 unsigned int flags);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
@@ -996,6 +1002,13 @@  blk_status_t errno_to_blk_status(int errno);
 
 bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
 
+static inline struct request *blk_get_request(struct request_queue *q,
+					      unsigned int op,
+					      gfp_t gfp_mask)
+{
+	return __blk_get_request(q, op, gfp_mask, 0);
+}
+
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
 	return bdev->bd_disk->queue;	/* this is never NULL */