Message ID | d1b5da71a5cba1f17d4eaafb207547a4651f81a9.1491418411.git.osandov@fb.com (mailing list archive)
---|---
State | New, archived
On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
>
> While dispatching requests, if we fail to get a driver tag, we mark the
> hardware queue as waiting for a tag and put the requests on a
> hctx->dispatch list to be run later when a driver tag is freed. However,
> blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
> queues if using a single-queue scheduler with a multiqueue device. If

It can't perform well using a SQ scheduler on a MQ device, so I'm
just curious why someone would want to do it this way. :-)

I guess you mean that ops.mq.dispatch_request() may dispatch requests
from other hardware queues in blk_mq_sched_dispatch_requests() instead
of the current hctx.

If that is true, it looks like an issue in the usage of the I/O
scheduler, since the mq-deadline scheduler just queues requests in one
per-request_queue linked list; for a MQ device, the scheduler queue
should have been per-hctx.

> blk_mq_get_driver_tag() fails, it doesn't update the hardware queue we
> are processing. This means we end up using the hardware queue of the
> previous request, which may or may not be the same as that of the
> current request. If it isn't, the wrong hardware queue will end up
> waiting for a tag, and the requests will be on the wrong dispatch list,
> leading to a hang.
>
> The fix is twofold:
>
> 1. Make sure we save which hardware queue we were trying to get a
>    request for in blk_mq_get_driver_tag() regardless of whether it
>    succeeds or not.

It shouldn't be needed; once rq->mq_ctx is set, the hctx should be
fixed already, which means we can just pass the hctx to
blk_mq_get_driver_tag().

> 2. Make blk_mq_dispatch_rq_list() take a request_queue instead of a
>    blk_mq_hw_queue to make it clear that it must handle multiple
>    hardware queues, since I've already messed this up on a couple of
>    occasions.
>
> This didn't appear in testing with nvme and mq-deadline because nvme has
> more driver tags than the default number of scheduler tags. However,
> with the blk_mq_update_nr_hw_queues() fix, it showed up with nbd.
>
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  block/blk-mq-sched.c |  9 +++++----
>  block/blk-mq.c       | 25 +++++++++++++------------
>  block/blk-mq.h       |  2 +-
>  3 files changed, 19 insertions(+), 17 deletions(-)
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 09af8ff18719..fc00f00898d3 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -171,7 +171,8 @@ void blk_mq_sched_put_request(struct request *rq)
>
>  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  {
> -        struct elevator_queue *e = hctx->queue->elevator;
> +        struct request_queue *q = hctx->queue;
> +        struct elevator_queue *e = q->elevator;
>          const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
>          bool did_work = false;
>          LIST_HEAD(rq_list);
> @@ -203,10 +204,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>           */
>          if (!list_empty(&rq_list)) {
>                  blk_mq_sched_mark_restart_hctx(hctx);
> -                did_work = blk_mq_dispatch_rq_list(hctx, &rq_list);
> +                did_work = blk_mq_dispatch_rq_list(q, &rq_list);
>          } else if (!has_sched_dispatch) {
>                  blk_mq_flush_busy_ctxs(hctx, &rq_list);
> -                blk_mq_dispatch_rq_list(hctx, &rq_list);
> +                blk_mq_dispatch_rq_list(q, &rq_list);
>          }
>
>          /*
> @@ -222,7 +223,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>                          if (!rq)
>                                  break;
>                          list_add(&rq->queuelist, &rq_list);
> -                } while (blk_mq_dispatch_rq_list(hctx, &rq_list));
> +                } while (blk_mq_dispatch_rq_list(q, &rq_list));
>          }
>  }
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index f7cd3208bcdf..09cff6d1ba76 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -863,12 +863,8 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
>                  .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
>          };
>
> -        if (rq->tag != -1) {
> -done:
> -                if (hctx)
> -                        *hctx = data.hctx;
> -                return true;
> -        }
> +        if (rq->tag != -1)
> +                goto done;
>
>          if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
>                  data.flags |= BLK_MQ_REQ_RESERVED;
> @@ -880,10 +876,12 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
>                          atomic_inc(&data.hctx->nr_active);
>                  }
>                  data.hctx->tags->rqs[rq->tag] = rq;
> -                goto done;
>          }
>
> -        return false;
> +done:
> +        if (hctx)
> +                *hctx = data.hctx;
> +        return rq->tag != -1;
>  }
>
>  static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
> @@ -980,17 +978,20 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
>          return true;
>  }
>
> -bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
> +bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
>  {
> -        struct request_queue *q = hctx->queue;
> +        struct blk_mq_hw_ctx *hctx;
>          struct request *rq;
>          int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
>
> +        if (list_empty(list))
> +                return false;
> +
>          /*
>           * Now process all the entries, sending them to the driver.
>           */
>          errors = queued = 0;
> -        while (!list_empty(list)) {
> +        do {
>                  struct blk_mq_queue_data bd;
>
>                  rq = list_first_entry(list, struct request, queuelist);
> @@ -1053,7 +1054,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
>
>                  if (ret == BLK_MQ_RQ_QUEUE_BUSY)
>                          break;
> -        }
> +        } while (!list_empty(list));
>
>          hctx->dispatched[queued_to_index(queued)]++;
>
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 8d49c06fc520..7e6f2e467696 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
>  void blk_mq_free_queue(struct request_queue *q);
>  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
>  void blk_mq_wake_waiters(struct request_queue *q);
> -bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
> +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
>  void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
>  bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
>  bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
> --
> 2.12.2
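[In outline, the loop shape the patch above creates can be sketched as follows. This is illustrative only, not the patch itself: the blk_mq_queue_data setup, error handling, and the busy/requeue path are stripped out, and dispatch_list_sketch() is a made-up name built against the v4.11-era kernel-internal blk-mq API.]

        #include <linux/blkdev.h>
        #include "blk-mq.h"        /* kernel-internal: blk_mq_get_driver_tag() */

        static bool dispatch_list_sketch(struct request_queue *q,
                                         struct list_head *list)
        {
                struct blk_mq_hw_ctx *hctx = NULL;
                unsigned int queued = 0;
                struct request *rq;

                while (!list_empty(list)) {
                        rq = list_first_entry(list, struct request, queuelist);
                        /*
                         * The out parameter refreshes hctx for every request,
                         * so requests from different hardware queues in the
                         * same list no longer inherit the previous request's
                         * hctx; the real code uses hctx for the tag waitqueue
                         * and dispatch accounting.
                         */
                        if (!blk_mq_get_driver_tag(rq, &hctx, false))
                                break;  /* the correct hctx now waits for a tag */
                        list_del_init(&rq->queuelist);
                        /* ... hand rq to the driver via q->mq_ops->queue_rq() ... */
                        queued++;
                }
                return queued != 0;
        }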
On Thu, Apr 06, 2017 at 12:31:18PM +0800, Ming Lei wrote:
> On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> >
> > While dispatching requests, if we fail to get a driver tag, we mark the
> > hardware queue as waiting for a tag and put the requests on a
> > hctx->dispatch list to be run later when a driver tag is freed. However,
> > blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
> > queues if using a single-queue scheduler with a multiqueue device. If
>
> It can't perform well using a SQ scheduler on a MQ device, so I'm
> just curious why someone would want to do it this way. :-)

I don't know why anyone would want to, but it has to work :) The only
reason we noticed this is because when the NBD device is created, it
only has a single queue, so we automatically assign mq-deadline to it.
Later, we update the number of queues, but it's still using mq-deadline.

> I guess you mean that ops.mq.dispatch_request() may dispatch requests
> from other hardware queues in blk_mq_sched_dispatch_requests() instead
> of the current hctx.

Yup, that's right. It's weird, and I talked to Jens about just forcing
the MQ device into an SQ mode when using an SQ scheduler, but this way
works fine more or less.

> If that is true, it looks like an issue in the usage of the I/O
> scheduler, since the mq-deadline scheduler just queues requests in one
> per-request_queue linked list; for a MQ device, the scheduler queue
> should have been per-hctx.

That's an option, but that's a different scheduling policy. Again, I
agree that it's strange, but it's reasonable behavior.

> > blk_mq_get_driver_tag() fails, it doesn't update the hardware queue we
> > are processing. This means we end up using the hardware queue of the
> > previous request, which may or may not be the same as that of the
> > current request. If it isn't, the wrong hardware queue will end up
> > waiting for a tag, and the requests will be on the wrong dispatch list,
> > leading to a hang.
> >
> > The fix is twofold:
> >
> > 1. Make sure we save which hardware queue we were trying to get a
> >    request for in blk_mq_get_driver_tag() regardless of whether it
> >    succeeds or not.
>
> It shouldn't be needed; once rq->mq_ctx is set, the hctx should be
> fixed already, which means we can just pass the hctx to
> blk_mq_get_driver_tag().

We still need to do the blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) step to
get the hctx. We could do that at every callsite of
blk_mq_get_driver_tag(), but this way it's just factored into
blk_mq_get_driver_tag().

Thanks!

> > 2. Make blk_mq_dispatch_rq_list() take a request_queue instead of a
> >    blk_mq_hw_queue to make it clear that it must handle multiple
> >    hardware queues, since I've already messed this up on a couple of
> >    occasions.
> >
> > This didn't appear in testing with nvme and mq-deadline because nvme has
> > more driver tags than the default number of scheduler tags. However,
> > with the blk_mq_update_nr_hw_queues() fix, it showed up with nbd.
> >
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
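[The mapping step Omar refers to is small; as a sketch against the v4.11-era internals (blk_mq_map_queue() lives in the private block/blk-mq.h, and hctx_for_rq() is a made-up wrapper name), it amounts to:]

        #include <linux/blkdev.h>
        #include "blk-mq.h"        /* kernel-internal: blk_mq_map_queue() */

        /*
         * The hardware queue for a request is derived from the CPU of its
         * software queue, which is why blk_mq_get_driver_tag() can recover
         * the hctx itself instead of every caller carrying it along.
         */
        static struct blk_mq_hw_ctx *hctx_for_rq(struct request *rq)
        {
                return blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
        }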
On Thu, Apr 06, 2017 at 12:57:51AM -0700, Omar Sandoval wrote:
> On Thu, Apr 06, 2017 at 12:31:18PM +0800, Ming Lei wrote:
> > On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
> > > From: Omar Sandoval <osandov@fb.com>
> > >
> > > While dispatching requests, if we fail to get a driver tag, we mark the
> > > hardware queue as waiting for a tag and put the requests on a
> > > hctx->dispatch list to be run later when a driver tag is freed. However,
> > > blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
> > > queues if using a single-queue scheduler with a multiqueue device. If
> >
> > It can't perform well using a SQ scheduler on a MQ device, so I'm
> > just curious why someone would want to do it this way. :-)
>
> I don't know why anyone would want to, but it has to work :) The only
> reason we noticed this is because when the NBD device is created, it
> only has a single queue, so we automatically assign mq-deadline to it.
> Later, we update the number of queues, but it's still using mq-deadline.
>
> > I guess you mean that ops.mq.dispatch_request() may dispatch requests
> > from other hardware queues in blk_mq_sched_dispatch_requests() instead
> > of the current hctx.
>
> Yup, that's right. It's weird, and I talked to Jens about just forcing
> the MQ device into an SQ mode when using an SQ scheduler, but this way
> works fine more or less.

Or just switch the elevator to the MQ default one when the device
becomes MQ? Or let mq-deadline's .dispatch_request() just return reqs
in the current hctx?

> > If that is true, it looks like an issue in the usage of the I/O
> > scheduler, since the mq-deadline scheduler just queues requests in one
> > per-request_queue linked list; for a MQ device, the scheduler queue
> > should have been per-hctx.
>
> That's an option, but that's a different scheduling policy. Again, I
> agree that it's strange, but it's reasonable behavior.

IMO, the current mq-deadline isn't good/ready for a MQ device, and it
doesn't make sense to use it for MQ.

> > > blk_mq_get_driver_tag() fails, it doesn't update the hardware queue we
> > > are processing. This means we end up using the hardware queue of the
> > > previous request, which may or may not be the same as that of the
> > > current request. If it isn't, the wrong hardware queue will end up
> > > waiting for a tag, and the requests will be on the wrong dispatch list,
> > > leading to a hang.
> > >
> > > The fix is twofold:
> > >
> > > 1. Make sure we save which hardware queue we were trying to get a
> > >    request for in blk_mq_get_driver_tag() regardless of whether it
> > >    succeeds or not.
> >
> > It shouldn't be needed; once rq->mq_ctx is set, the hctx should be
> > fixed already, which means we can just pass the hctx to
> > blk_mq_get_driver_tag().
>
> We still need to do the blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) step to
> get the hctx. We could do that at every callsite of
> blk_mq_get_driver_tag(), but this way it's just factored into
> blk_mq_get_driver_tag().

We can just use the hctx passed to blk_mq_dispatch_rq_list().

> Thanks!
>
> > > 2. Make blk_mq_dispatch_rq_list() take a request_queue instead of a
> > >    blk_mq_hw_queue to make it clear that it must handle multiple
> > >    hardware queues, since I've already messed this up on a couple of
> > >    occasions.

The 2nd part makes the code less readable too, and it should have been
enough to pass 'hctx' into blk_mq_dispatch_rq_list() if we make sure
that ops.mq.dispatch_request() only returns requests from the current
hw queue, since we have passed 'hctx' to the elevator already.

Thanks,
Ming
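[Ming's alternative would look roughly like the helper below. It is hypothetical, not code from this thread or from mq-deadline; it only illustrates "the dispatch hook returns requests for the hctx being run", using the real blk_mq_map_queue() from the private block/blk-mq.h:]

        #include <linux/blkdev.h>
        #include "blk-mq.h"        /* kernel-internal: blk_mq_map_queue() */

        static struct request *
        sched_dispatch_cur_hctx(struct blk_mq_hw_ctx *hctx,
                                struct list_head *sched_list)
        {
                struct request *rq;

                list_for_each_entry(rq, sched_list, queuelist) {
                        /* skip requests whose sw queue maps to another hctx */
                        if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) != hctx)
                                continue;
                        list_del_init(&rq->queuelist);
                        return rq;
                }
                return NULL;    /* nothing pending for this hardware queue */
        }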
On 04/06/2017 02:23 AM, Ming Lei wrote:
> On Thu, Apr 06, 2017 at 12:57:51AM -0700, Omar Sandoval wrote:
>> On Thu, Apr 06, 2017 at 12:31:18PM +0800, Ming Lei wrote:
>>> On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
>>>> From: Omar Sandoval <osandov@fb.com>
>>>>
>>>> While dispatching requests, if we fail to get a driver tag, we mark the
>>>> hardware queue as waiting for a tag and put the requests on a
>>>> hctx->dispatch list to be run later when a driver tag is freed. However,
>>>> blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
>>>> queues if using a single-queue scheduler with a multiqueue device. If
>>>
>>> It can't perform well using a SQ scheduler on a MQ device, so I'm
>>> just curious why someone would want to do it this way. :-)
>>
>> I don't know why anyone would want to, but it has to work :) The only
>> reason we noticed this is because when the NBD device is created, it
>> only has a single queue, so we automatically assign mq-deadline to it.
>> Later, we update the number of queues, but it's still using mq-deadline.
>>
>>> I guess you mean that ops.mq.dispatch_request() may dispatch requests
>>> from other hardware queues in blk_mq_sched_dispatch_requests() instead
>>> of the current hctx.
>>
>> Yup, that's right. It's weird, and I talked to Jens about just forcing
>> the MQ device into an SQ mode when using an SQ scheduler, but this way
>> works fine more or less.
>
> Or just switch the elevator to the MQ default one when the device
> becomes MQ? Or let mq-deadline's .dispatch_request() just return reqs
> in the current hctx?

No, that would be a really bad idea imho. First of all, I don't want
kernel driven scheduler changes. Secondly, the framework should work
with a non-direct mapping between hardware dispatch queues and
scheduling queues.

While we could force a single queue usage to make that a 1:1 mapping
always, that loses big benefits on eg nbd, which uses multiple hardware
queues to up the bandwidth. Similarly on nvme, for example, we still
scale better with N submission queues and 1 scheduling queue compared to
having just 1 submission queue.

>>> If that is true, it looks like an issue in the usage of the I/O
>>> scheduler, since the mq-deadline scheduler just queues requests in one
>>> per-request_queue linked list; for a MQ device, the scheduler queue
>>> should have been per-hctx.
>>
>> That's an option, but that's a different scheduling policy. Again, I
>> agree that it's strange, but it's reasonable behavior.
>
> IMO, the current mq-deadline isn't good/ready for a MQ device, and it
> doesn't make sense to use it for MQ.

I don't think that's true at all. I do agree that it's somewhat quirky
since it does introduce scheduling dependencies between the hardware
queues, and we have to work at making that well understood and explicit,
so as not to introduce bugs due to that. But in reality, all multiqueue
hardware we are dealing with is mapped to a single resource. As such,
it makes a lot of sense to schedule it as such. Hence I don't think that
a single queue deadline approach is necessarily a bad idea for even fast
storage.
Hi Jens,

Thanks for your comment!

On Thu, Apr 06, 2017 at 01:29:26PM -0600, Jens Axboe wrote:
> On 04/06/2017 02:23 AM, Ming Lei wrote:
> > On Thu, Apr 06, 2017 at 12:57:51AM -0700, Omar Sandoval wrote:
> >> On Thu, Apr 06, 2017 at 12:31:18PM +0800, Ming Lei wrote:
> >>> On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
> >>>> From: Omar Sandoval <osandov@fb.com>
> >>>>
> >>>> While dispatching requests, if we fail to get a driver tag, we mark the
> >>>> hardware queue as waiting for a tag and put the requests on a
> >>>> hctx->dispatch list to be run later when a driver tag is freed. However,
> >>>> blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
> >>>> queues if using a single-queue scheduler with a multiqueue device. If
> >>>
> >>> It can't perform well using a SQ scheduler on a MQ device, so I'm
> >>> just curious why someone would want to do it this way. :-)
> >>
> >> I don't know why anyone would want to, but it has to work :) The only
> >> reason we noticed this is because when the NBD device is created, it
> >> only has a single queue, so we automatically assign mq-deadline to it.
> >> Later, we update the number of queues, but it's still using mq-deadline.
> >>
> >>> I guess you mean that ops.mq.dispatch_request() may dispatch requests
> >>> from other hardware queues in blk_mq_sched_dispatch_requests() instead
> >>> of the current hctx.
> >>
> >> Yup, that's right. It's weird, and I talked to Jens about just forcing
> >> the MQ device into an SQ mode when using an SQ scheduler, but this way
> >> works fine more or less.
> >
> > Or just switch the elevator to the MQ default one when the device
> > becomes MQ? Or let mq-deadline's .dispatch_request() just return reqs
> > in the current hctx?
>
> No, that would be a really bad idea imho. First of all, I don't want
> kernel driven scheduler changes. Secondly, the framework should work
> with a non-direct mapping between hardware dispatch queues and
> scheduling queues.
>
> While we could force a single queue usage to make that a 1:1 mapping
> always, that loses big benefits on eg nbd, which uses multiple hardware
> queues to up the bandwidth. Similarly on nvme, for example, we still
> scale better with N submission queues and 1 scheduling queue compared to
> having just 1 submission queue.

Looks like that isn't what I meant. And my 2nd point is to make
mq-deadline's dispatch_request(hctx) just return requests mapped to
the hw queue of 'hctx'; then we can avoid messing up blk-mq.c and
blk-mq-sched.c.

> >>> If that is true, it looks like an issue in the usage of the I/O
> >>> scheduler, since the mq-deadline scheduler just queues requests in one
> >>> per-request_queue linked list; for a MQ device, the scheduler queue
> >>> should have been per-hctx.
> >>
> >> That's an option, but that's a different scheduling policy. Again, I
> >> agree that it's strange, but it's reasonable behavior.
> >
> > IMO, the current mq-deadline isn't good/ready for a MQ device, and it
> > doesn't make sense to use it for MQ.
>
> I don't think that's true at all. I do agree that it's somewhat quirky
> since it does introduce scheduling dependencies between the hardware
> queues, and we have to work at making that well understood and explicit,
> so as not to introduce bugs due to that. But in reality, all multiqueue
> hardware we are dealing with is mapped to a single resource. As such,
> it makes a lot of sense to schedule it as such. Hence I don't think that
> a single queue deadline approach is necessarily a bad idea for even fast
> storage.

When we map all of MQ into one single deadline queue, it can't scale
well. For example, I just ran a simple write test (libaio, dio, bs:4k,
4 jobs) over one commodity NVMe device in a dual-socket (four cores)
system:

IO scheduler | CPU utilization
-------------------------------
none         | 60%
-------------------------------
mq-deadline  | 100%
-------------------------------

And IO throughput is basically the same in the two cases.

Thanks,
Ming
On 04/06/2017 09:23 PM, Ming Lei wrote:
> Hi Jens,
>
> Thanks for your comment!
>
> On Thu, Apr 06, 2017 at 01:29:26PM -0600, Jens Axboe wrote:
>> On 04/06/2017 02:23 AM, Ming Lei wrote:
>>> On Thu, Apr 06, 2017 at 12:57:51AM -0700, Omar Sandoval wrote:
>>>> On Thu, Apr 06, 2017 at 12:31:18PM +0800, Ming Lei wrote:
>>>>> On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
>>>>>> From: Omar Sandoval <osandov@fb.com>
>>>>>>
>>>>>> While dispatching requests, if we fail to get a driver tag, we mark the
>>>>>> hardware queue as waiting for a tag and put the requests on a
>>>>>> hctx->dispatch list to be run later when a driver tag is freed. However,
>>>>>> blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
>>>>>> queues if using a single-queue scheduler with a multiqueue device. If
>>>>>
>>>>> It can't perform well using a SQ scheduler on a MQ device, so I'm
>>>>> just curious why someone would want to do it this way. :-)
>>>>
>>>> I don't know why anyone would want to, but it has to work :) The only
>>>> reason we noticed this is because when the NBD device is created, it
>>>> only has a single queue, so we automatically assign mq-deadline to it.
>>>> Later, we update the number of queues, but it's still using mq-deadline.
>>>>
>>>>> I guess you mean that ops.mq.dispatch_request() may dispatch requests
>>>>> from other hardware queues in blk_mq_sched_dispatch_requests() instead
>>>>> of the current hctx.
>>>>
>>>> Yup, that's right. It's weird, and I talked to Jens about just forcing
>>>> the MQ device into an SQ mode when using an SQ scheduler, but this way
>>>> works fine more or less.
>>>
>>> Or just switch the elevator to the MQ default one when the device
>>> becomes MQ? Or let mq-deadline's .dispatch_request() just return reqs
>>> in the current hctx?
>>
>> No, that would be a really bad idea imho. First of all, I don't want
>> kernel driven scheduler changes. Secondly, the framework should work
>> with a non-direct mapping between hardware dispatch queues and
>> scheduling queues.
>>
>> While we could force a single queue usage to make that a 1:1 mapping
>> always, that loses big benefits on eg nbd, which uses multiple hardware
>> queues to up the bandwidth. Similarly on nvme, for example, we still
>> scale better with N submission queues and 1 scheduling queue compared to
>> having just 1 submission queue.
>
> Looks like that isn't what I meant. And my 2nd point is to make
> mq-deadline's dispatch_request(hctx) just return requests mapped to
> the hw queue of 'hctx'; then we can avoid messing up blk-mq.c and
> blk-mq-sched.c.

That would indeed be better. But let's assume that we have 4 hardware
queues, and we're asked to run queue X. But the FIFO dictates that the
first request that should run is on queue Y, since it's expired. What
do we do? Then we'd have to abort dispatching on queue X, and run
queue Y appropriately.
This shuffle can happen all the time. I think it'd be worthwhile to
work towards the goal of improving mq-deadline to deal with this, and
subsequently cleaning up the interface. It would be a cleaner
implementation, though efficiency might suffer further.

>>>>> If that is true, it looks like an issue in the usage of the I/O
>>>>> scheduler, since the mq-deadline scheduler just queues requests in one
>>>>> per-request_queue linked list; for a MQ device, the scheduler queue
>>>>> should have been per-hctx.
>>>>
>>>> That's an option, but that's a different scheduling policy. Again, I
>>>> agree that it's strange, but it's reasonable behavior.
>>>
>>> IMO, the current mq-deadline isn't good/ready for a MQ device, and it
>>> doesn't make sense to use it for MQ.
>>
>> I don't think that's true at all. I do agree that it's somewhat quirky
>> since it does introduce scheduling dependencies between the hardware
>> queues, and we have to work at making that well understood and explicit,
>> so as not to introduce bugs due to that. But in reality, all multiqueue
>> hardware we are dealing with is mapped to a single resource. As such,
>> it makes a lot of sense to schedule it as such. Hence I don't think that
>> a single queue deadline approach is necessarily a bad idea for even fast
>> storage.
>
> When we map all of MQ into one single deadline queue, it can't scale
> well. For example, I just ran a simple write test (libaio, dio, bs:4k,
> 4 jobs) over one commodity NVMe device in a dual-socket (four cores)
> system:
>
> IO scheduler | CPU utilization
> -------------------------------
> none         | 60%
> -------------------------------
> mq-deadline  | 100%
> -------------------------------
>
> And IO throughput is basically the same in the two cases.

That's a given; for low blocksize and high iops, we'll be hitting the
deadline lock pretty hard. We dispatch single requests at a time, to
optimize for rotational storage, since it improves merging a lot. If we
modified e->type->ops.mq.dispatch_request() to take a "dump this many
requests" parameter and ramped up the depth quickly, then we could
drastically reduce the overhead for your case. The initial version
dumped the whole thing. It had lower overhead on flash, but didn't reach
the performance of old deadline on !mq since we continually emptied the
queue and eliminated merging opportunities.

blk_mq_sched_dispatch_requests() could add logic that brought back the
old behavior if NONROT is set.
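[A sketch of the ramp-up idea Jens floats. The 'max' batch parameter and the sched_dispatch_batch() name are hypothetical; the v4.11 dispatch_request hook takes only the hctx, so treat this as the shape of the change, not an existing API:]

        #include <linux/blkdev.h>
        #include <linux/elevator.h>

        static unsigned int sched_dispatch_batch(struct blk_mq_hw_ctx *hctx,
                                                 struct list_head *out,
                                                 unsigned int max)
        {
                struct elevator_queue *e = hctx->queue->elevator;
                unsigned int n;

                /*
                 * max = 1 preserves today's single-request behavior, which
                 * is good for merging on rotational storage; a NONROT queue
                 * could ramp max up to amortize the scheduler lock.
                 */
                for (n = 0; n < max; n++) {
                        struct request *rq = e->type->ops.mq.dispatch_request(hctx);

                        if (!rq)
                                break;
                        list_add_tail(&rq->queuelist, out);
                }
                return n;
        }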
On Fri, Apr 07, 2017 at 08:45:41AM -0600, Jens Axboe wrote:
> On 04/06/2017 09:23 PM, Ming Lei wrote:
> > Hi Jens,
> >
> > Thanks for your comment!
> >
> > On Thu, Apr 06, 2017 at 01:29:26PM -0600, Jens Axboe wrote:
> >> On 04/06/2017 02:23 AM, Ming Lei wrote:
> >>> On Thu, Apr 06, 2017 at 12:57:51AM -0700, Omar Sandoval wrote:
> >>>> On Thu, Apr 06, 2017 at 12:31:18PM +0800, Ming Lei wrote:
> >>>>> On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
> >>>>>> From: Omar Sandoval <osandov@fb.com>
> >>>>>>
> >>>>>> While dispatching requests, if we fail to get a driver tag, we mark the
> >>>>>> hardware queue as waiting for a tag and put the requests on a
> >>>>>> hctx->dispatch list to be run later when a driver tag is freed. However,
> >>>>>> blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
> >>>>>> queues if using a single-queue scheduler with a multiqueue device. If
> >>>>>
> >>>>> It can't perform well using a SQ scheduler on a MQ device, so I'm
> >>>>> just curious why someone would want to do it this way. :-)
> >>>>
> >>>> I don't know why anyone would want to, but it has to work :) The only
> >>>> reason we noticed this is because when the NBD device is created, it
> >>>> only has a single queue, so we automatically assign mq-deadline to it.
> >>>> Later, we update the number of queues, but it's still using mq-deadline.
> >>>>
> >>>>> I guess you mean that ops.mq.dispatch_request() may dispatch requests
> >>>>> from other hardware queues in blk_mq_sched_dispatch_requests() instead
> >>>>> of the current hctx.
> >>>>
> >>>> Yup, that's right. It's weird, and I talked to Jens about just forcing
> >>>> the MQ device into an SQ mode when using an SQ scheduler, but this way
> >>>> works fine more or less.
> >>>
> >>> Or just switch the elevator to the MQ default one when the device
> >>> becomes MQ? Or let mq-deadline's .dispatch_request() just return reqs
> >>> in the current hctx?
> >>
> >> No, that would be a really bad idea imho. First of all, I don't want
> >> kernel driven scheduler changes. Secondly, the framework should work
> >> with a non-direct mapping between hardware dispatch queues and
> >> scheduling queues.
> >>
> >> While we could force a single queue usage to make that a 1:1 mapping
> >> always, that loses big benefits on eg nbd, which uses multiple hardware
> >> queues to up the bandwidth. Similarly on nvme, for example, we still
> >> scale better with N submission queues and 1 scheduling queue compared to
> >> having just 1 submission queue.
> >
> > Looks like that isn't what I meant. And my 2nd point is to make
> > mq-deadline's dispatch_request(hctx) just return requests mapped to
> > the hw queue of 'hctx'; then we can avoid messing up blk-mq.c and
> > blk-mq-sched.c.
>
> That would indeed be better. But let's assume that we have 4 hardware
> queues, and we're asked to run queue X. But the FIFO dictates that the
> first request that should run is on queue Y, since it's expired. What
> do we do? Then we'd have to abort dispatching on queue X, and run
> queue Y appropriately.

The previous rule is to run queue Y on the CPUs (sw queues) mapped to
queue Y, and that should still work by following this rule. But this
patch is starting to break that rule. :-(

> This shuffle can happen all the time. I think it'd be worthwhile to
> work towards the goal of improving mq-deadline to deal with this, and
> subsequently cleaning up the interface. It would be a cleaner
> implementation, though efficiency might suffer further.
>
> >>>>> If that is true, it looks like an issue in the usage of the I/O
> >>>>> scheduler, since the mq-deadline scheduler just queues requests in one
> >>>>> per-request_queue linked list; for a MQ device, the scheduler queue
> >>>>> should have been per-hctx.
> >>>>
> >>>> That's an option, but that's a different scheduling policy. Again, I
> >>>> agree that it's strange, but it's reasonable behavior.
> >>>
> >>> IMO, the current mq-deadline isn't good/ready for a MQ device, and it
> >>> doesn't make sense to use it for MQ.
> >>
> >> I don't think that's true at all. I do agree that it's somewhat quirky
> >> since it does introduce scheduling dependencies between the hardware
> >> queues, and we have to work at making that well understood and explicit,
> >> so as not to introduce bugs due to that. But in reality, all multiqueue
> >> hardware we are dealing with is mapped to a single resource. As such,
> >> it makes a lot of sense to schedule it as such. Hence I don't think that
> >> a single queue deadline approach is necessarily a bad idea for even fast
> >> storage.
> >
> > When we map all of MQ into one single deadline queue, it can't scale
> > well. For example, I just ran a simple write test (libaio, dio, bs:4k,
> > 4 jobs) over one commodity NVMe device in a dual-socket (four cores)
> > system:
> >
> > IO scheduler | CPU utilization
> > -------------------------------
> > none         | 60%
> > -------------------------------
> > mq-deadline  | 100%
> > -------------------------------
> >
> > And IO throughput is basically the same in the two cases.
>
> That's a given; for low blocksize and high iops, we'll be hitting the
> deadline lock pretty hard. We dispatch single requests at a time, to
> optimize for rotational storage, since it improves merging a lot. If we
> modified e->type->ops.mq.dispatch_request() to take a "dump this many
> requests" parameter and ramped up the depth quickly, then we could
> drastically reduce the overhead for your case. The initial version
> dumped the whole thing. It had lower overhead on flash, but didn't reach
> the performance of old deadline on !mq since we continually emptied the
> queue and eliminated merging opportunities.
>
> blk_mq_sched_dispatch_requests() could add logic that brought back the
> old behavior if NONROT is set.

From my further testing, it looks like the issue is related to the
single mq-deadline queue (including the single lock) which needs to be
accessed from different NUMA nodes.

The following tree implements a per-hctx mq-deadline queue, and
basically makes q->last_merge, elevator_queue->hash and
'struct deadline_data' per-hctx. With these patches, the issue
disappeared.

Thanks,
Ming
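[The layout Ming describes can be pictured as below. The struct and its field names are illustrative guesses from the prose, not code from his tree (linked in the follow-up): one deadline instance, with its own lock, FIFOs and sort trees, per hardware queue, so dispatch on different NUMA nodes stops contending on one per-request_queue lock.]

        #include <linux/list.h>
        #include <linux/rbtree.h>
        #include <linux/spinlock.h>

        struct request;        /* defined in linux/blkdev.h */

        /* one instance of this would hang off each hctx */
        struct dd_hctx_data {
                spinlock_t lock;                /* was shared queue-wide */
                struct list_head fifo_list[2];  /* deadline FIFOs, READ/WRITE */
                struct rb_root sort_list[2];    /* sector-sorted trees */
                struct request *last_merge;     /* per-hctx q->last_merge analogue */
        };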
On Tue, Apr 11, 2017 at 10:12:49AM +0800, Ming Lei wrote:
> On Fri, Apr 07, 2017 at 08:45:41AM -0600, Jens Axboe wrote:
> > On 04/06/2017 09:23 PM, Ming Lei wrote:
> > > Hi Jens,
> > >
> > > Thanks for your comment!
> > >
> > > On Thu, Apr 06, 2017 at 01:29:26PM -0600, Jens Axboe wrote:
> > >> On 04/06/2017 02:23 AM, Ming Lei wrote:
> > >>> On Thu, Apr 06, 2017 at 12:57:51AM -0700, Omar Sandoval wrote:
> > >>>> On Thu, Apr 06, 2017 at 12:31:18PM +0800, Ming Lei wrote:
> > >>>>> On Wed, Apr 05, 2017 at 12:01:29PM -0700, Omar Sandoval wrote:
> > >>>>>> From: Omar Sandoval <osandov@fb.com>
> > >>>>>>
> > >>>>>> While dispatching requests, if we fail to get a driver tag, we mark the
> > >>>>>> hardware queue as waiting for a tag and put the requests on a
> > >>>>>> hctx->dispatch list to be run later when a driver tag is freed. However,
> > >>>>>> blk_mq_dispatch_rq_list() may dispatch requests from multiple hardware
> > >>>>>> queues if using a single-queue scheduler with a multiqueue device. If
> > >>>>>
> > >>>>> It can't perform well using a SQ scheduler on a MQ device, so I'm
> > >>>>> just curious why someone would want to do it this way. :-)
> > >>>>
> > >>>> I don't know why anyone would want to, but it has to work :) The only
> > >>>> reason we noticed this is because when the NBD device is created, it
> > >>>> only has a single queue, so we automatically assign mq-deadline to it.
> > >>>> Later, we update the number of queues, but it's still using mq-deadline.
> > >>>>
> > >>>>> I guess you mean that ops.mq.dispatch_request() may dispatch requests
> > >>>>> from other hardware queues in blk_mq_sched_dispatch_requests() instead
> > >>>>> of the current hctx.
> > >>>>
> > >>>> Yup, that's right. It's weird, and I talked to Jens about just forcing
> > >>>> the MQ device into an SQ mode when using an SQ scheduler, but this way
> > >>>> works fine more or less.
> > >>>
> > >>> Or just switch the elevator to the MQ default one when the device
> > >>> becomes MQ? Or let mq-deadline's .dispatch_request() just return reqs
> > >>> in the current hctx?
> > >>
> > >> No, that would be a really bad idea imho. First of all, I don't want
> > >> kernel driven scheduler changes. Secondly, the framework should work
> > >> with a non-direct mapping between hardware dispatch queues and
> > >> scheduling queues.
> > >>
> > >> While we could force a single queue usage to make that a 1:1 mapping
> > >> always, that loses big benefits on eg nbd, which uses multiple hardware
> > >> queues to up the bandwidth. Similarly on nvme, for example, we still
> > >> scale better with N submission queues and 1 scheduling queue compared to
> > >> having just 1 submission queue.
> > >
> > > Looks like that isn't what I meant. And my 2nd point is to make
> > > mq-deadline's dispatch_request(hctx) just return requests mapped to
> > > the hw queue of 'hctx'; then we can avoid messing up blk-mq.c and
> > > blk-mq-sched.c.
> >
> > That would indeed be better. But let's assume that we have 4 hardware
> > queues, and we're asked to run queue X. But the FIFO dictates that the
> > first request that should run is on queue Y, since it's expired. What
> > do we do? Then we'd have to abort dispatching on queue X, and run
> > queue Y appropriately.
>
> The previous rule is to run queue Y on the CPUs (sw queues) mapped to
> queue Y, and that should still work by following this rule. But this
> patch is starting to break that rule. :-(
>
> > This shuffle can happen all the time. I think it'd be worthwhile to
> > work towards the goal of improving mq-deadline to deal with this, and
> > subsequently cleaning up the interface. It would be a cleaner
> > implementation, though efficiency might suffer further.
> >
> > >>>>> If that is true, it looks like an issue in the usage of the I/O
> > >>>>> scheduler, since the mq-deadline scheduler just queues requests in one
> > >>>>> per-request_queue linked list; for a MQ device, the scheduler queue
> > >>>>> should have been per-hctx.
> > >>>>
> > >>>> That's an option, but that's a different scheduling policy. Again, I
> > >>>> agree that it's strange, but it's reasonable behavior.
> > >>>
> > >>> IMO, the current mq-deadline isn't good/ready for a MQ device, and it
> > >>> doesn't make sense to use it for MQ.
> > >>
> > >> I don't think that's true at all. I do agree that it's somewhat quirky
> > >> since it does introduce scheduling dependencies between the hardware
> > >> queues, and we have to work at making that well understood and explicit,
> > >> so as not to introduce bugs due to that. But in reality, all multiqueue
> > >> hardware we are dealing with is mapped to a single resource. As such,
> > >> it makes a lot of sense to schedule it as such. Hence I don't think that
> > >> a single queue deadline approach is necessarily a bad idea for even fast
> > >> storage.
> > >
> > > When we map all of MQ into one single deadline queue, it can't scale
> > > well. For example, I just ran a simple write test (libaio, dio, bs:4k,
> > > 4 jobs) over one commodity NVMe device in a dual-socket (four cores)
> > > system:
> > >
> > > IO scheduler | CPU utilization
> > > -------------------------------
> > > none         | 60%
> > > -------------------------------
> > > mq-deadline  | 100%
> > > -------------------------------
> > >
> > > And IO throughput is basically the same in the two cases.
> >
> > That's a given; for low blocksize and high iops, we'll be hitting the
> > deadline lock pretty hard. We dispatch single requests at a time, to
> > optimize for rotational storage, since it improves merging a lot. If we
> > modified e->type->ops.mq.dispatch_request() to take a "dump this many
> > requests" parameter and ramped up the depth quickly, then we could
> > drastically reduce the overhead for your case. The initial version
> > dumped the whole thing. It had lower overhead on flash, but didn't reach
> > the performance of old deadline on !mq since we continually emptied the
> > queue and eliminated merging opportunities.
> >
> > blk_mq_sched_dispatch_requests() could add logic that brought back the
> > old behavior if NONROT is set.
>
> From my further testing, it looks like the issue is related to the
> single mq-deadline queue (including the single lock) which needs to be
> accessed from different NUMA nodes.
>
> The following tree implements a per-hctx mq-deadline queue, and
> basically makes q->last_merge, elevator_queue->hash and
> 'struct deadline_data' per-hctx. With these patches, the issue
> disappeared.

Sorry for missing the link:

https://github.com/ming1/linux/commits/my_v4.11-rcX

Thanks,
Ming
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 09af8ff18719..fc00f00898d3 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -171,7 +171,8 @@ void blk_mq_sched_put_request(struct request *rq)
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
-        struct elevator_queue *e = hctx->queue->elevator;
+        struct request_queue *q = hctx->queue;
+        struct elevator_queue *e = q->elevator;
         const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
         bool did_work = false;
         LIST_HEAD(rq_list);
@@ -203,10 +204,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
          */
         if (!list_empty(&rq_list)) {
                 blk_mq_sched_mark_restart_hctx(hctx);
-                did_work = blk_mq_dispatch_rq_list(hctx, &rq_list);
+                did_work = blk_mq_dispatch_rq_list(q, &rq_list);
         } else if (!has_sched_dispatch) {
                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
-                blk_mq_dispatch_rq_list(hctx, &rq_list);
+                blk_mq_dispatch_rq_list(q, &rq_list);
         }
 
         /*
@@ -222,7 +223,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
                         if (!rq)
                                 break;
                         list_add(&rq->queuelist, &rq_list);
-                } while (blk_mq_dispatch_rq_list(hctx, &rq_list));
+                } while (blk_mq_dispatch_rq_list(q, &rq_list));
         }
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f7cd3208bcdf..09cff6d1ba76 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -863,12 +863,8 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
                 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
         };
 
-        if (rq->tag != -1) {
-done:
-                if (hctx)
-                        *hctx = data.hctx;
-                return true;
-        }
+        if (rq->tag != -1)
+                goto done;
 
         if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
                 data.flags |= BLK_MQ_REQ_RESERVED;
@@ -880,10 +876,12 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
                         atomic_inc(&data.hctx->nr_active);
                 }
                 data.hctx->tags->rqs[rq->tag] = rq;
-                goto done;
         }
 
-        return false;
+done:
+        if (hctx)
+                *hctx = data.hctx;
+        return rq->tag != -1;
 }
 
 static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@@ -980,17 +978,20 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
         return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 {
-        struct request_queue *q = hctx->queue;
+        struct blk_mq_hw_ctx *hctx;
         struct request *rq;
         int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
 
+        if (list_empty(list))
+                return false;
+
         /*
          * Now process all the entries, sending them to the driver.
          */
         errors = queued = 0;
-        while (!list_empty(list)) {
+        do {
                 struct blk_mq_queue_data bd;
 
                 rq = list_first_entry(list, struct request, queuelist);
@@ -1053,7 +1054,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                         break;
-        }
+        } while (!list_empty(list));
 
         hctx->dispatched[queued_to_index(queued)]++;
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 8d49c06fc520..7e6f2e467696 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,