Message ID | 20240119160338.1191281-4-axboe@kernel.dk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mq-deadline scalability improvements | expand |
On 1/19/24 08:02, Jens Axboe wrote: > If we attempt to insert a list of requests, but someone else is already > running an insertion, then fallback to queueing that list internally and > let the existing inserter finish the operation. The current inserter > will either see and flush this list, of if it ends before we're done > doing our bucket insert, then we'll flush it and insert ourselves. > > This reduces contention on the dd->lock, which protects any request > insertion or dispatch, by having a backup point to insert into which > will either be flushed immediately or by an existing inserter. As the > alternative is to just keep spinning on the dd->lock, it's very easy > to get into a situation where multiple processes are trying to do IO > and all sit and spin on this lock. With this alternative patch I achieve 20% higher IOPS than with patch 3/4 of this series for 1..4 CPU cores (null_blk + fio in an x86 VM): --- block/mq-deadline.c | 61 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 83bc21801226..fd9038a84dc5 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -89,11 +89,15 @@ struct deadline_data { */ struct { spinlock_t lock; + spinlock_t insert_lock; spinlock_t zone_lock; } ____cacheline_aligned_in_smp; unsigned long run_state; + struct list_head at_head; + struct list_head at_tail; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ @@ -120,6 +124,9 @@ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; +static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free); + static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { @@ -592,6 +599,30 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, return NULL; } +static void dd_do_insert(struct request_queue *q, struct list_head *free) +{ + struct deadline_data *dd = q->elevator->elevator_data; + struct request *rq; + LIST_HEAD(at_head); + LIST_HEAD(at_tail); + + spin_lock(&dd->insert_lock); + list_splice_init(&dd->at_head, &at_head); + list_splice_init(&dd->at_tail, &at_tail); + spin_unlock(&dd->insert_lock); + + while (!list_empty(&at_head)) { + rq = list_first_entry(&at_head, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(q, rq, BLK_MQ_INSERT_AT_HEAD, free); + } + while (!list_empty(&at_tail)) { + rq = list_first_entry(&at_tail, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(q, rq, 0, free); + } +} + /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * @@ -606,6 +637,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; + LIST_HEAD(free); /* * If someone else is already dispatching, skip this one. This will @@ -620,6 +652,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) return NULL; spin_lock(&dd->lock); + dd_do_insert(hctx->queue, &free); rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; @@ -638,6 +671,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) clear_bit(DD_DISPATCHING, &dd->run_state); spin_unlock(&dd->lock); + blk_mq_free_requests(&free); + return rq; } @@ -727,8 +762,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; spin_lock_init(&dd->lock); + spin_lock_init(&dd->insert_lock); spin_lock_init(&dd->zone_lock); + INIT_LIST_HEAD(&dd->at_head); + INIT_LIST_HEAD(&dd->at_tail); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -899,19 +938,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - LIST_HEAD(free); - - spin_lock(&dd->lock); - while (!list_empty(list)) { - struct request *rq; - rq = list_first_entry(list, struct request, queuelist); - list_del_init(&rq->queuelist); - dd_insert_request(q, rq, flags, &free); - } - spin_unlock(&dd->lock); - - blk_mq_free_requests(&free); + spin_lock(&dd->insert_lock); + if (flags & BLK_MQ_INSERT_AT_HEAD) + list_splice_init(list, &dd->at_head); + else + list_splice_init(list, &dd->at_tail); + spin_unlock(&dd->insert_lock); } /* Callback from inside blk_mq_rq_ctx_init(). */ @@ -990,6 +1023,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; + if (!list_empty_careful(&dd->at_head) || + !list_empty_careful(&dd->at_tail)) + return true; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true;
On 1/19/24 4:16 PM, Bart Van Assche wrote: > On 1/19/24 08:02, Jens Axboe wrote: >> If we attempt to insert a list of requests, but someone else is already >> running an insertion, then fallback to queueing that list internally and >> let the existing inserter finish the operation. The current inserter >> will either see and flush this list, of if it ends before we're done >> doing our bucket insert, then we'll flush it and insert ourselves. >> >> This reduces contention on the dd->lock, which protects any request >> insertion or dispatch, by having a backup point to insert into which >> will either be flushed immediately or by an existing inserter. As the >> alternative is to just keep spinning on the dd->lock, it's very easy >> to get into a situation where multiple processes are trying to do IO >> and all sit and spin on this lock. > > With this alternative patch I achieve 20% higher IOPS than with patch > 3/4 of this series for 1..4 CPU cores (null_blk + fio in an x86 VM): Performance aside, I think this is a much better approach rather than mine. Haven't tested yet, but I think this instead of my patch 3 and the other patches and this should further drastically cut down on the overhead. Can you send a "proper" patch and I'll just replace the one that I have?
On 1/19/24 5:05 PM, Jens Axboe wrote: > On 1/19/24 4:16 PM, Bart Van Assche wrote: >> On 1/19/24 08:02, Jens Axboe wrote: >>> If we attempt to insert a list of requests, but someone else is already >>> running an insertion, then fallback to queueing that list internally and >>> let the existing inserter finish the operation. The current inserter >>> will either see and flush this list, of if it ends before we're done >>> doing our bucket insert, then we'll flush it and insert ourselves. >>> >>> This reduces contention on the dd->lock, which protects any request >>> insertion or dispatch, by having a backup point to insert into which >>> will either be flushed immediately or by an existing inserter. As the >>> alternative is to just keep spinning on the dd->lock, it's very easy >>> to get into a situation where multiple processes are trying to do IO >>> and all sit and spin on this lock. >> >> With this alternative patch I achieve 20% higher IOPS than with patch >> 3/4 of this series for 1..4 CPU cores (null_blk + fio in an x86 VM): > > Performance aside, I think this is a much better approach rather than > mine. Haven't tested yet, but I think this instead of my patch 3 and the > other patches and this should further drastically cut down on the > overhead. Can you send a "proper" patch and I'll just replace the one > that I have? I'd probably just fold in this incremental, as I think it cleans it up. diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 88991a791c56..977c512465ca 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -599,10 +599,21 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, return NULL; } +static void __dd_do_insert(struct request_queue *q, blk_insert_t flags, + struct list_head *list, struct list_head *free) +{ + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(q, rq, flags, free); + } +} + static void dd_do_insert(struct request_queue *q, struct list_head *free) { struct deadline_data *dd = q->elevator->elevator_data; - struct request *rq; LIST_HEAD(at_head); LIST_HEAD(at_tail); @@ -611,16 +622,8 @@ static void dd_do_insert(struct request_queue *q, struct list_head *free) list_splice_init(&dd->at_tail, &at_tail); spin_unlock(&dd->insert_lock); - while (!list_empty(&at_head)) { - rq = list_first_entry(&at_head, struct request, queuelist); - list_del_init(&rq->queuelist); - dd_insert_request(q, rq, BLK_MQ_INSERT_AT_HEAD, free); - } - while (!list_empty(&at_tail)) { - rq = list_first_entry(&at_tail, struct request, queuelist); - list_del_init(&rq->queuelist); - dd_insert_request(q, rq, 0, free); - } + __dd_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free); + __dd_do_insert(q, 0, &at_tail, free); } /*
On 1/19/24 5:05 PM, Jens Axboe wrote: > On 1/19/24 4:16 PM, Bart Van Assche wrote: >> On 1/19/24 08:02, Jens Axboe wrote: >>> If we attempt to insert a list of requests, but someone else is already >>> running an insertion, then fallback to queueing that list internally and >>> let the existing inserter finish the operation. The current inserter >>> will either see and flush this list, of if it ends before we're done >>> doing our bucket insert, then we'll flush it and insert ourselves. >>> >>> This reduces contention on the dd->lock, which protects any request >>> insertion or dispatch, by having a backup point to insert into which >>> will either be flushed immediately or by an existing inserter. As the >>> alternative is to just keep spinning on the dd->lock, it's very easy >>> to get into a situation where multiple processes are trying to do IO >>> and all sit and spin on this lock. >> >> With this alternative patch I achieve 20% higher IOPS than with patch >> 3/4 of this series for 1..4 CPU cores (null_blk + fio in an x86 VM): > > Performance aside, I think this is a much better approach rather than > mine. Haven't tested yet, but I think this instead of my patch 3 and the > other patches and this should further drastically cut down on the > overhead. Can you send a "proper" patch and I'll just replace the one > that I have? Ran with this real quick and the incremental I sent, here's what I see. For reference, this is before the series: Device IOPS sys contention diff ==================================================== null_blk 879K 89% 93.6% nvme0n1 901K 86% 94.5% and now with the series: Device IOPS sys contention diff ==================================================== null_blk 2867K 11.1% ~6.0% +326% nvme0n1 3162K 9.9% ~5.0% +350% which looks really good, it removes the last bit of contention that was still there. And talk about a combined improvement...
On 1/19/24 16:05, Jens Axboe wrote:
> Can you send a "proper" patch and I'll just replace the one that I have?
Please take a look at
https://lore.kernel.org/linux-block/20240122235332.2299150-1-bvanassche@acm.org/T/#u
Thanks,
Bart.
diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b579ce282176..cc3155d50e0d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -81,8 +81,18 @@ struct dd_per_prio { enum { DD_DISPATCHING = 0, + DD_INSERTING = 1, + DD_BUCKETS = 2, }; +#define DD_CPU_BUCKETS 32 +#define DD_CPU_BUCKETS_MASK (DD_CPU_BUCKETS - 1) + +struct dd_bucket_list { + struct list_head list; + spinlock_t lock; +} ____cacheline_aligned_in_smp; + struct deadline_data { /* * run time data @@ -94,6 +104,8 @@ struct deadline_data { unsigned long run_state; + struct dd_bucket_list bucket_lists[DD_CPU_BUCKETS]; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ @@ -714,7 +726,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) struct deadline_data *dd; struct elevator_queue *eq; enum dd_prio prio; - int ret = -ENOMEM; + int i, ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) @@ -729,6 +741,11 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); + for (i = 0; i < DD_CPU_BUCKETS; i++) { + INIT_LIST_HEAD(&dd->bucket_lists[i].list); + spin_lock_init(&dd->bucket_lists[i].lock); + } + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -878,6 +895,94 @@ static void dd_insert_request(struct request_queue *q, struct request *rq, } } +static void dd_dispatch_from_buckets(struct deadline_data *dd, + struct list_head *list) +{ + int i; + + if (!test_bit(DD_BUCKETS, &dd->run_state) || + !test_and_clear_bit(DD_BUCKETS, &dd->run_state)) + return; + + for (i = 0; i < DD_CPU_BUCKETS; i++) { + struct dd_bucket_list *bucket = &dd->bucket_lists[i]; + + if (list_empty_careful(&bucket->list)) + continue; + spin_lock(&bucket->lock); + list_splice_init(&bucket->list, list); + spin_unlock(&bucket->lock); + } +} + +/* + * If we can grab the dd->lock, then just return and do the insertion as per + * usual. If not, add to one of our internal buckets, and afterwards recheck + * if if we should retry. + */ +static bool dd_insert_to_bucket(struct deadline_data *dd, + struct list_head *list) + __acquires(&dd->lock) +{ + struct dd_bucket_list *bucket; + + /* + * If we can grab the lock, proceed as per usual. If not, and insert + * isn't running, force grab the lock and proceed as per usual. + */ + if (spin_trylock(&dd->lock)) + return false; + if (!test_bit(DD_INSERTING, &dd->run_state)) { + spin_lock(&dd->lock); + return false; + } + + if (!test_bit(DD_BUCKETS, &dd->run_state)) + set_bit(DD_BUCKETS, &dd->run_state); + + bucket = &dd->bucket_lists[get_cpu() & DD_CPU_BUCKETS_MASK]; + spin_lock(&bucket->lock); + list_splice_init(list, &bucket->list); + spin_unlock(&bucket->lock); + put_cpu(); + + /* + * Insertion still running, we are done. + */ + if (test_bit(DD_INSERTING, &dd->run_state)) + return true; + + /* + * We may be too late, play it safe and grab the lock. This will + * flush the above bucket insert as well and insert it. + */ + spin_lock(&dd->lock); + return false; +} + +static void __dd_insert_requests(struct request_queue *q, + struct deadline_data *dd, + struct list_head *list, blk_insert_t flags, + struct list_head *free) +{ + set_bit(DD_INSERTING, &dd->run_state); + do { + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(q, rq, flags, free); + } + + dd_dispatch_from_buckets(dd, list); + if (list_empty(list)) + break; + } while (1); + + clear_bit(DD_INSERTING, &dd->run_state); +} + /* * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). */ @@ -889,16 +994,23 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct deadline_data *dd = q->elevator->elevator_data; LIST_HEAD(free); - spin_lock(&dd->lock); - while (!list_empty(list)) { - struct request *rq; + /* + * If dispatch is busy and we ended up adding to our internal bucket, + * then we're done for now. + */ + if (dd_insert_to_bucket(dd, list)) + return; - rq = list_first_entry(list, struct request, queuelist); - list_del_init(&rq->queuelist); - dd_insert_request(q, rq, flags, &free); - } - spin_unlock(&dd->lock); + do { + __dd_insert_requests(q, dd, list, flags, &free); + /* + * If buckets is set after inserting was cleared, be safe and do + * another loop as we could be racing with bucket insertion. + */ + } while (test_bit(DD_BUCKETS, &dd->run_state)); + + spin_unlock(&dd->lock); blk_mq_free_requests(&free); }
If we attempt to insert a list of requests, but someone else is already running an insertion, then fallback to queueing that list internally and let the existing inserter finish the operation. The current inserter will either see and flush this list, of if it ends before we're done doing our bucket insert, then we'll flush it and insert ourselves. This reduces contention on the dd->lock, which protects any request insertion or dispatch, by having a backup point to insert into which will either be flushed immediately or by an existing inserter. As the alternative is to just keep spinning on the dd->lock, it's very easy to get into a situation where multiple processes are trying to do IO and all sit and spin on this lock. With the previous dispatch optimization, this drastically reduces contention for a sample cases of 32 threads doing IO to devices. The test case looks as follows: fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \ --ioengine=io_uring --norandommap --runtime=60 --rw=randread \ --thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \ --iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \ --name=scaletest --filename=/dev/$DEV Before: Device IOPS sys contention diff ==================================================== null_blk 879K 89% 93.6% nvme0n1 901K 86% 94.5% and after this and the previous dispatch patch: Device IOPS sys contention diff ==================================================== null_blk 2311K 10.3% 21.1% +257% nvme0n1 2610K 11.0% 24.6% +289% Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/mq-deadline.c | 130 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 9 deletions(-)