Message ID | 20170902130840.24609-8-ming.lei@redhat.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Headers | show |
On Sat, Sep 02, 2017 at 09:08:39PM +0800, Ming Lei wrote: > REQF_PREEMPT is a bit special because the request is required > to be dispatched to lld even when SCSI device is quiesced. > > So this patch introduces __blk_get_request() to allow block > layer to allocate request when queue is preempt frozen, since we > will preempt freeze queue before quiescing SCSI device in the > following patch for supporting safe SCSI quiescing. > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > --- > block/blk-core.c | 28 ++++++++++++++++++++-------- > block/blk-mq.c | 14 ++++++++++++-- > include/linux/blk-mq.h | 7 ++++--- > include/linux/blkdev.h | 17 +++++++++++++++-- > 4 files changed, 51 insertions(+), 15 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 2549b0a0535d..f7a6fbb87dea 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1404,7 +1404,8 @@ static struct request *get_request(struct request_queue *q, unsigned int op, > } > > static struct request *blk_old_get_request(struct request_queue *q, > - unsigned int op, gfp_t gfp_mask) > + unsigned int op, gfp_t gfp_mask, > + unsigned int flags) > { > struct request *rq; > int ret = 0; > @@ -1414,9 +1415,20 @@ static struct request *blk_old_get_request(struct request_queue *q, > /* create ioc upfront */ > create_io_context(gfp_mask, q->node); > > - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); > + /* > + * We need to allocate req of REQF_PREEMPT in preempt freezing. > + * No normal freezing can be started when preempt freezing > + * is in-progress, and queue dying is checked before starting > + * preempt freezing, so it is safe to use blk_queue_enter_live() > + * in case of preempt freezing. 
> + */ > + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) > + blk_queue_enter_live(q); > + else > + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); > if (ret) > return ERR_PTR(ret); > + > spin_lock_irq(q->queue_lock); > rq = get_request(q, op, NULL, gfp_mask); > if (IS_ERR(rq)) { > @@ -1432,26 +1444,26 @@ static struct request *blk_old_get_request(struct request_queue *q, > return rq; > } > > -struct request *blk_get_request(struct request_queue *q, unsigned int op, > - gfp_t gfp_mask) > +struct request *__blk_get_request(struct request_queue *q, unsigned int op, > + gfp_t gfp_mask, unsigned int flags) > { > struct request *req; > > if (q->mq_ops) { > req = blk_mq_alloc_request(q, op, > - (gfp_mask & __GFP_DIRECT_RECLAIM) ? > - 0 : BLK_MQ_REQ_NOWAIT); > + flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ? > + 0 : BLK_MQ_REQ_NOWAIT)); > if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) > q->mq_ops->initialize_rq_fn(req); > } else { > - req = blk_old_get_request(q, op, gfp_mask); > + req = blk_old_get_request(q, op, gfp_mask, flags); > if (!IS_ERR(req) && q->initialize_rq_fn) > q->initialize_rq_fn(req); > } > > return req; > } > -EXPORT_SYMBOL(blk_get_request); > +EXPORT_SYMBOL(__blk_get_request); > > /** > * blk_requeue_request - put a request back on queue > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 54b8d8b9f40e..e81001d1da27 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -496,9 +496,19 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, > { > struct blk_mq_alloc_data alloc_data = { .flags = flags }; > struct request *rq; > - int ret; > + int ret = 0; > > - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); > + /* > + * We need to allocate req of REQF_PREEMPT in preempt freezing. 
> + * No normal freezing can be started when preempt freezing > + * is in-progress, and queue dying is checked before starting > + * preempt freezing, so it is safe to use blk_queue_enter_live() > + * in case of preempt freezing. > + */ > + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) > + blk_queue_enter_live(q); > + else > + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); > if (ret) > return ERR_PTR(ret); > > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index 5ae8c82d6273..596f433eb54c 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -200,9 +200,10 @@ void blk_mq_free_request(struct request *rq); > bool blk_mq_can_queue(struct blk_mq_hw_ctx *); > > enum { > - BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ > - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ > - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ > + BLK_MQ_REQ_PREEMPT = BLK_REQ_PREEMPT, /* allocate for RQF_PREEMPT */ > + BLK_MQ_REQ_NOWAIT = (1 << 8), /* return when out of requests */ > + BLK_MQ_REQ_RESERVED = (1 << 9), /* allocate from reserved pool */ > + BLK_MQ_REQ_INTERNAL = (1 << 10), /* allocate internal/sched tag */ > }; > > struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index 5618d174100a..ff371c42eb3f 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -862,6 +862,11 @@ enum { > BLKPREP_INVALID, /* invalid command, kill, return -EREMOTEIO */ > }; > > +/* passed to __blk_get_request */ > +enum { > + BLK_REQ_PREEMPT = (1 << 0), /* allocate for RQF_PREEMPT */ > +}; > + > extern unsigned long blk_max_low_pfn, blk_max_pfn; > > /* > @@ -944,8 +949,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); > extern void blk_init_request_from_bio(struct request *req, struct bio *bio); > extern void blk_put_request(struct request *); > extern void 
__blk_put_request(struct request_queue *, struct request *); > -extern struct request *blk_get_request(struct request_queue *, unsigned int op, > - gfp_t gfp_mask); > +extern struct request *__blk_get_request(struct request_queue *, > + unsigned int op, gfp_t gfp_mask, > + unsigned int flags); > extern void blk_requeue_request(struct request_queue *, struct request *); > extern int blk_lld_busy(struct request_queue *q); > extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, > @@ -996,6 +1002,13 @@ blk_status_t errno_to_blk_status(int errno); > > bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); > > +static inline struct request *blk_get_request(struct request_queue *q, > + unsigned int op, > + gfp_t gfp_mask) > +{ > + return __blk_get_request(q, op, gfp_mask, 0); > +} > + > static inline struct request_queue *bdev_get_queue(struct block_device *bdev) > { > return bdev->bd_disk->queue; /* this is never NULL */ > -- > 2.9.5 > Hi Bart, Please let us know if V3 addresses your previous concern about calling blk_queue_enter_live() during preempt freezing. Thanks, Ming
On Sat, 2017-09-02 at 21:12 +0800, Ming Lei wrote: > Please let us know if V3 addresses your previous concern about calling > blk_queue_enter_live() during preempt freezing. Do you understand how request queue cleanup works? The algorithm used for request queue cleanup is as follows: * Set the DYING flag. This flag makes all later blk_get_request() calls fail. * Wait until all pending requests have finished. * Set the DEAD flag. For the traditional block layer, this flag causes blk_run_queue() not to call .request_fn() anymore. For blk-mq it is guaranteed in another way that .queue_rq() won't be called anymore after this flag has been set. Allowing blk_get_request() to succeed after the DYING flag has been set is completely wrong because that could result in a request being queued after the DEAD flag has been set, resulting in either a hanging request or a kernel crash. This is why it's completely wrong to add a blk_queue_enter_live() call in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any patch that adds a blk_queue_enter_live() call to any function called from blk_get_request(). That includes the patch at the start of this e-mail thread. Bart.
On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote: > On Sat, 2017-09-02 at 21:12 +0800, Ming Lei wrote: > > Please let us know if V3 addresses your previous concern about calling > > blk_queue_enter_live() during preempt freezing. > > Do you understand how request queue cleanup works? The algorithm used for > request queue cleanup is as follows: > * Set the DYING flag. This flag makes all later blk_get_request() calls > fail. If you look at the __blk_freeze_queue_start(), you will see that preempt freeze will fail after queue is dying, and blk_queue_enter_live() won't be called at all if queue is dying, right? > * Wait until all pending requests have finished. > * Set the DEAD flag. For the traditional block layer, this flag causes > blk_run_queue() not to call .request_fn() anymore. For blk-mq it is > guaranteed in another way that .queue_rq() won't be called anymore after > this flag has been set. > > Allowing blk_get_request() to succeed after the DYING flag has been set is > completely wrong because that could result in a request being queued after See above, this patch changes nothing about this fact, please look at the patch carefully next time just before posting your long comment. > the DEAD flag has been set, resulting in either a hanging request or a kernel > crash. This is why it's completely wrong to add a blk_queue_enter_live() call > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any > patch that adds a blk_queue_enter_live() call to any function called from > blk_get_request(). That includes the patch at the start of this e-mail thread. > > Bart.
On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote: > On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote: > > Allowing blk_get_request() to succeed after the DYING flag has been set is > > completely wrong because that could result in a request being queued after > > the DEAD flag has been set, resulting in either a hanging request or a kernel > > crash. This is why it's completely wrong to add a blk_queue_enter_live() call > > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any > > patch that adds a blk_queue_enter_live() call to any function called from > > blk_get_request(). That includes the patch at the start of this e-mail thread. > > See above, this patch changes nothing about this fact, please look at > the patch carefully next time just before posting your long comment. Are you really sure that your patch does not allow blk_get_request() to succeed after the DYING flag has been set? blk_mq_alloc_request() calls both blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding any lock. A thread that is running concurrently with blk_mq_get_request() can unfreeze the queue after blk_queue_is_preempt_frozen() returned and before blk_queue_enter_live() is called. This means that with your patch series applied blk_get_request() can succeed after the DYING flag has been set, which is something we don't want. Additionally, I don't think we want to introduce any kind of locking in blk_mq_get_request() because that would be a serialization point. Have you considered to use the blk-mq "reserved request" mechanism to avoid starvation of power management requests instead of making the block layer even more complicated than it already is? Note: extending blk_mq_freeze/unfreeze_queue() to the legacy block layer could be useful to make scsi_wait_for_queuecommand() more elegant. However, I don't think we should spend our time on legacy block layer / SCSI core changes. 
The code I'm referring to is the following: /** * scsi_wait_for_queuecommand() - wait for ongoing queuecommand() calls * @sdev: SCSI device pointer. * * Wait until the ongoing shost->hostt->queuecommand() calls that are * invoked from scsi_request_fn() have finished. */ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) { WARN_ON_ONCE(sdev->host->use_blk_mq); while (scsi_request_fn_active(sdev)) msleep(20); } Bart.
On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote: > On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote: > > On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote: > > > Allowing blk_get_request() to succeed after the DYING flag has been set is > > > completely wrong because that could result in a request being queued after > > > the DEAD flag has been set, resulting in either a hanging request or a kernel > > > crash. This is why it's completely wrong to add a blk_queue_enter_live() call > > > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any > > > patch that adds a blk_queue_enter_live() call to any function called from > > > blk_get_request(). That includes the patch at the start of this e-mail thread. > > > > See above, this patch changes nothing about this fact, please look at > > the patch carefully next time just before posting your long comment. > > Are you really sure that your patch does not allow blk_get_request() to > succeed after the DYING flag has been set? blk_mq_alloc_request() calls both Yeah, I am pretty sure. Firstly blk_queue_freeze_preempt() is exclusive, that means it will wait for completion of all pending freezing(both normal and preempt), and other freezing can't be started too if there is in-progress preempt freezing, actually it is a typical read/write lock use case, but we need to support nested normal freezing, so we can't use rwsem. Also DYING flag is checked first before starting preempt freezing, the API will return and preempt_freezing flag isn't set if DYING is set. Secondly after preempt freezing is started: - for block legacy path, dying is always tested in the entry of __get_request(), so no new request is allocated after queue is dying. - for blk-mq, it is normal for the DYING flag to be set just between blk_queue_enter() and allocating the request, because we depend on lld to handle the case. 
Even we can enhance the point by checking dying flag in blk_queue_enter(), but that is just a improvement, not mean V3 isn't correct. > blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding > any lock. A thread that is running concurrently with blk_mq_get_request() > can unfreeze the queue after blk_queue_is_preempt_frozen() returned and > before blk_queue_enter_live() is called. This means that with your patch preempt freezing is exclusive, so no other freezing can be started at all, then no such issue you worried about. > series applied blk_get_request() can succeed after the DYING flag has been > set, which is something we don't want. Additionally, I don't think we want > to introduce any kind of locking in blk_mq_get_request() because that would > be a serialization point. That needn't to be worried about, as you saw, we can check percpu_ref_is_dying() first, then acquire the lock to check flag if queue is freezing. > > Have you considered to use the blk-mq "reserved request" mechanism to avoid > starvation of power management requests instead of making the block layer > even more complicated than it already is? reserved request is really a bad idea, that means the reserved request can't be used for normal I/O, we all know the request/tag space is precious, and some device has a quite small tag space, such as sata. This way will affect performance definitely. Also I don't think the approach is complicated, and actually the idea is simple, and the implementation isn't complicated too. > > Note: extending blk_mq_freeze/unfreeze_queue() to the legacy block layer > could be useful to make scsi_wait_for_queuecommand() more elegant. However, > I don't think we should spend our time on legacy block layer / SCSI core > changes. The code I'm referring to is the following: That can be side-product of this approach, but this patchset is just focus on fixing I/O hang.
On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote: > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote: > > On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote: > > > On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote: > > > > Allowing blk_get_request() to succeed after the DYING flag has been set is > > > > completely wrong because that could result in a request being queued after > > > > the DEAD flag has been set, resulting in either a hanging request or a kernel > > > > crash. This is why it's completely wrong to add a blk_queue_enter_live() call > > > > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any > > > > patch that adds a blk_queue_enter_live() call to any function called from > > > > blk_get_request(). That includes the patch at the start of this e-mail thread. > > > > > > See above, this patch changes nothing about this fact, please look at > > > the patch carefully next time just before posting your long comment. > > > > Are you really sure that your patch does not allow blk_get_request() to > > succeed after the DYING flag has been set? blk_mq_alloc_request() calls both > > blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding > > any lock. A thread that is running concurrently with blk_mq_get_request() > > can unfreeze the queue after blk_queue_is_preempt_frozen() returned and > > before blk_queue_enter_live() is called. This means that with your patch > > series applied blk_get_request() can succeed after the DYING flag has been > > set, which is something we don't want. Additionally, I don't think we want > > to introduce any kind of locking in blk_mq_get_request() because that would > > be a serialization point. > > Yeah, I am pretty sure. 
> > Firstly blk_queue_freeze_preempt() is exclusive, that means it will wait > for completion of all pending freezing(both normal and preempt), and other > freezing can't be started too if there is in-progress preempt > freezing, actually it is a typical read/write lock use case, but > we need to support nested normal freezing, so we can't use rwsem. You seem to overlook that blk_get_request() can be called from another thread than the thread that is performing the freezing and unfreezing. Bart.
On Mon, Sep 04, 2017 at 04:18:32PM +0000, Bart Van Assche wrote: > On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote: > > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote: > > > On Mon, 2017-09-04 at 15:16 +0800, Ming Lei wrote: > > > > On Mon, Sep 04, 2017 at 04:13:26AM +0000, Bart Van Assche wrote: > > > > > Allowing blk_get_request() to succeed after the DYING flag has been set is > > > > > completely wrong because that could result in a request being queued after > > > > > the DEAD flag has been set, resulting in either a hanging request or a kernel > > > > > crash. This is why it's completely wrong to add a blk_queue_enter_live() call > > > > > in blk_old_get_request() or blk_mq_alloc_request(). Hence my NAK for any > > > > > patch that adds a blk_queue_enter_live() call to any function called from > > > > > blk_get_request(). That includes the patch at the start of this e-mail thread. > > > > > > > > See above, this patch changes nothing about this fact, please look at > > > > the patch carefully next time just before posting your long comment. > > > > > > Are you really sure that your patch does not allow blk_get_request() to > > > succeed after the DYING flag has been set? blk_mq_alloc_request() calls both > > > blk_queue_is_preempt_frozen() and blk_queue_enter_live() without holding > > > any lock. A thread that is running concurrently with blk_mq_get_request() > > > can unfreeze the queue after blk_queue_is_preempt_frozen() returned and > > > before blk_queue_enter_live() is called. This means that with your patch > > > series applied blk_get_request() can succeed after the DYING flag has been > > > set, which is something we don't want. Additionally, I don't think we want > > > to introduce any kind of locking in blk_mq_get_request() because that would > > > be a serialization point. > > > > Yeah, I am pretty sure. 
> > > > Firstly blk_queue_freeze_preempt() is exclusive, that means it will wait > > for completion of all pending freezing(both normal and preempt), and other > > freezing can't be started too if there is in-progress preempt > > freezing, actually it is a typical read/write lock use case, but > > we need to support nested normal freezing, so we can't use rwsem. > > You seem to overlook that blk_get_request() can be called from another thread > than the thread that is performing the freezing and unfreezing. The freezing state of queue can be observed in all context, so this issue isn't related with context, please see blk_queue_is_preempt_frozen() which is called from blk_get_request(): if (!percpu_ref_is_dying(&q->q_usage_counter)) return false; spin_lock(&q->freeze_lock); preempt_frozen = q->preempt_freezing; preempt_unfreezing = q->preempt_unfreezing; spin_unlock(&q->freeze_lock); return preempt_frozen && !preempt_unfreezing; If the queue is in preempt freezing, blk_queue_enter_live() is called before allocating request, otherwise blk_queue_enter() is called. BTW, we can add the check of queue dying in this helper as one improvement.
On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote: > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote: > > Have you considered to use the blk-mq "reserved request" mechanism to avoid > > starvation of power management requests instead of making the block layer > > even more complicated than it already is? > > reserved request is really a bad idea, that means the reserved request > can't be used for normal I/O, we all know the request/tag space is > precious, and some device has a quite small tag space, such as sata. > This way will affect performance definitely. Sorry but I'm neither convinced that reserving a request for power management would be a bad idea nor that it would have a significant performance impact nor that it would be complicated to implement. Have you noticed that the Linux ATA implementation already reserves a request for internal use and thereby reduces the queue depth from 32 to 31 (see also ATA_TAG_INTERNAL)? What I would like to know is whether the performance impact of reserving a request is more or less than 1%. Bart.
On Tue, Sep 05, 2017 at 01:40:11AM +0000, Bart Van Assche wrote: > On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote: > > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote: > > > Have you considered to use the blk-mq "reserved request" mechanism to avoid > > > starvation of power management requests instead of making the block layer > > > even more complicated than it already is? > > > > reserved request is really a bad idea, that means the reserved request > > can't be used for normal I/O, we all know the request/tag space is > > precious, and some device has a quite small tag space, such as sata. > > This way will affect performance definitely. > > Sorry but I'm neither convinced that reserving a request for power management > would be a bad idea nor that it would have a significant performance impact nor > that it would be complicated to implement. Have you noticed that the Linux ATA > implementation already reserves a request for internal use and thereby reduces > the queue depth from 32 to 31 (see also ATA_TAG_INTERNAL)? What I would like to > know if is whether the performance impact of reserving a request is more or less > than 1%. Firstly we really can avoid the reservation, why do we have to wast one precious tag just for PM, which may never happen on one machine from its running. For SATA, the internal tag is for EH, I believe the reservation is inevitable. Secondly reserving one tag may decrease the concurrent I/O by 1, that definitely hurts performance, especially for random I/O. Think about why NVMe increases its queue depth so many. Not mention there are some devices which have only one tag(.can_queue is 1), how can you reserve one tag on this kind of device? Finally bad result will follow if you reserve one tag for PM only. Suppose it is doable to reserve one tag, did you consider how to do that? Tag can be shared in host wide, do you want to reserve one tag just for one request_queue? 
- If yes, lots of tag can be reserved/wasted for the unusual PM > or sort of commands, even worse the whole tag space of HBA may > not be enough for the reservation if there are lots of LUNs in > this HBA. > > - If not, and you just reserve one tag for one HBA, then all > PM commands share the one reservation. During suspend/resume, all > these PM commands have to run exclusively(serialized) for disks > attached to the HBA, that will slow down the suspend/resume very > much because there may be lots of LUNs in this HBA. > > That is why I said reserving one tag is really bad, isn't it? Thanks, Ming
On Tue, Sep 05, 2017 at 10:23:25AM +0800, Ming Lei wrote: > On Tue, Sep 05, 2017 at 01:40:11AM +0000, Bart Van Assche wrote: > > On Tue, 2017-09-05 at 00:08 +0800, Ming Lei wrote: > > > On Mon, Sep 04, 2017 at 03:40:35PM +0000, Bart Van Assche wrote: > > > > Have you considered to use the blk-mq "reserved request" mechanism to avoid > > > > starvation of power management requests instead of making the block layer > > > > even more complicated than it already is? > > > > > > reserved request is really a bad idea, that means the reserved request > > > can't be used for normal I/O, we all know the request/tag space is > > > precious, and some device has a quite small tag space, such as sata. > > > This way will affect performance definitely. > > > > Sorry but I'm neither convinced that reserving a request for power management > > would be a bad idea nor that it would have a significant performance impact nor > > that it would be complicated to implement. Have you noticed that the Linux ATA > > implementation already reserves a request for internal use and thereby reduces > > the queue depth from 32 to 31 (see also ATA_TAG_INTERNAL)? What I would like to > > know if is whether the performance impact of reserving a request is more or less > > than 1%. > > Firstly we really can avoid the reservation, why do we have to > wast one precious tag just for PM, which may never happen on > one machine from its running. For SATA, the internal tag is for EH, > I believe the reservation is inevitable. > > Secondly reserving one tag may decrease the concurrent I/O by 1, > that definitely hurts performance, especially for random I/O. Think > about why NVMe increases its queue depth so many. Not mention > there are some devices which have only one tag(.can_queue is 1), > how can you reserve one tag on this kind of device? > > Finally bad result will follow if you reserve one tag for PM only. > Suppose it is doable to reserve one tag, did you consider > how to do that? 
Tag can be shared in host wide, do you want to > reserve one tag just for one request_queue? > - If yes, lots of tag can be reserved/wasted for the unusual PM > or sort of commands, even worse the whole tag space of HBA may > not be enough for the reservation if there are lots of LUNs in > this HBA. > > - If not, and you just reserve one tag for one HBA, then all > PM commands share the one reservation. During suspend/resume, all > these PM commands have to run exclusively(serialized) for diskes > attached to the HBA, that will slow down the suspend/resume very > much because there may be lots of LUNs in this HBA. > > That is why I said reserving one tag is really bad, isn't it? Hi Bart, Looks I replied or clarified all your questions/comments on this patchset. So could you let me know if you still object this approach or patchset? If not, I think we need to move on and fix the real issue.
On Fri, 2017-09-08 at 11:08 +0800, Ming Lei wrote: > Looks I replied or clarified all your questions/comments on this > patchset. No, you have not addressed all my comments, you know that you have not addressed all my comments so you should not have written that you have addressed all my comments. This patch series not only introduces ugly changes in the request queue freeze mechanism but it also introduces an unacceptable race condition between blk_get_request() and request queue cleanup. BTW, you don't have to spend more time on this patch series. I have implemented an alternative and much cleaner approach to fix SCSI device suspend. I'm currently testing that patch series. Bart.
On Fri, Sep 08, 2017 at 05:28:16PM +0000, Bart Van Assche wrote: > On Fri, 2017-09-08 at 11:08 +0800, Ming Lei wrote: > > Looks I replied or clarified all your questions/comments on this > > patchset. > > No, you have not addressed all my comments, you know that you have not > addressed all my comments so you should not have written that you have I do not know. > addressed all my comments. This patch series not only introduces ugly > changes in the request queue freeze mechanism but it also introduces an > unacceptable race condition between blk_get_request() and request queue > cleanup. So what is the race? Could you reply in original thread? > > BTW, you don't have to spend more time on this patch series. I have > implemented an alternative and much cleaner approach to fix SCSI device > suspend. I'm currently testing that patch series. You do not understand the issue at all, it is not only related with suspend. Please take a look at scsi_execute(), each request run via scsi_execute() need to be dispatched successfully even when SCSI device is quiesced.
diff --git a/block/blk-core.c b/block/blk-core.c index 2549b0a0535d..f7a6fbb87dea 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1404,7 +1404,8 @@ static struct request *get_request(struct request_queue *q, unsigned int op, } static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, gfp_t gfp_mask, + unsigned int flags) { struct request *rq; int ret = 0; @@ -1414,9 +1415,20 @@ static struct request *blk_old_get_request(struct request_queue *q, /* create ioc upfront */ create_io_context(gfp_mask, q->node); - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); + /* + * We need to allocate req of REQF_PREEMPT in preempt freezing. + * No normal freezing can be started when preempt freezing + * is in-progress, and queue dying is checked before starting + * preempt freezing, so it is safe to use blk_queue_enter_live() + * in case of preempt freezing. + */ + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) + blk_queue_enter_live(q); + else + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); if (ret) return ERR_PTR(ret); + spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { @@ -1432,26 +1444,26 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +struct request *__blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask, unsigned int flags) { struct request *req; if (q->mq_ops) { req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ? 
+ 0 : BLK_MQ_REQ_NOWAIT)); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, gfp_mask, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } -EXPORT_SYMBOL(blk_get_request); +EXPORT_SYMBOL(__blk_get_request); /** * blk_requeue_request - put a request back on queue diff --git a/block/blk-mq.c b/block/blk-mq.c index 54b8d8b9f40e..e81001d1da27 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -496,9 +496,19 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - int ret; + int ret = 0; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + /* + * We need to allocate req of REQF_PREEMPT in preempt freezing. + * No normal freezing can be started when preempt freezing + * is in-progress, and queue dying is checked before starting + * preempt freezing, so it is safe to use blk_queue_enter_live() + * in case of preempt freezing. 
+ */ + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) + blk_queue_enter_live(q); + else + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5ae8c82d6273..596f433eb54c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -200,9 +200,10 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ + BLK_MQ_REQ_PREEMPT = BLK_REQ_PREEMPT, /* allocate for RQF_PREEMPT */ + BLK_MQ_REQ_NOWAIT = (1 << 8), /* return when out of requests */ + BLK_MQ_REQ_RESERVED = (1 << 9), /* allocate from reserved pool */ + BLK_MQ_REQ_INTERNAL = (1 << 10), /* allocate internal/sched tag */ }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5618d174100a..ff371c42eb3f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -862,6 +862,11 @@ enum { BLKPREP_INVALID, /* invalid command, kill, return -EREMOTEIO */ }; +/* passed to __blk_get_request */ +enum { + BLK_REQ_PREEMPT = (1 << 0), /* allocate for RQF_PREEMPT */ +}; + extern unsigned long blk_max_low_pfn, blk_max_pfn; /* @@ -944,8 +949,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -extern struct request *blk_get_request(struct request_queue *, unsigned int op, - gfp_t gfp_mask); +extern struct request *__blk_get_request(struct request_queue *, + unsigned int op, gfp_t gfp_mask, + unsigned int flags); extern void blk_requeue_request(struct 
request_queue *, struct request *); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, @@ -996,6 +1002,13 @@ blk_status_t errno_to_blk_status(int errno); bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +static inline struct request *blk_get_request(struct request_queue *q, + unsigned int op, + gfp_t gfp_mask) +{ + return __blk_get_request(q, op, gfp_mask, 0); +} + static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { return bdev->bd_disk->queue; /* this is never NULL */
RQF_PREEMPT is a bit special because the request is required to be dispatched to lld even when SCSI device is quiesced. So this patch introduces __blk_get_request() to allow block layer to allocate request when queue is preempt frozen, since we will preempt freeze queue before quiescing SCSI device in the following patch for supporting safe SCSI quiescing. Signed-off-by: Ming Lei <ming.lei@redhat.com> --- block/blk-core.c | 28 ++++++++++++++++++++-------- block/blk-mq.c | 14 ++++++++++++-- include/linux/blk-mq.h | 7 ++++--- include/linux/blkdev.h | 17 +++++++++++++++-- 4 files changed, 51 insertions(+), 15 deletions(-)