
[PATCHv2] blk-mq: remove hybrid polling

Message ID 20230320194926.3353144-1-kbusch@meta.com (mailing list archive)
State New

Commit Message

Keith Busch March 20, 2023, 7:49 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

io_uring provides the only way user space can poll completions, and that
always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
code, so remove it and everything supporting it.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v1->v2:

  Make queue_poll_delay_store() a no-op so that anyone writing to the
  io_poll_delay attribute won't see errors; the resulting stubs are
  sketched after these notes.

  Updated Documentation with the new description for the io_poll_delay
  queue attribute.
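
  As a rough illustration of the v2 behavior (condensed from the
  blk-sysfs.c hunk below, not a complete listing): reading io_poll_delay
  now always reports -1 (classic polling), and any write is accepted but
  discarded, so existing tooling that pokes the attribute keeps working.

	static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
	{
		/* always report classic polling */
		return sprintf(page, "%d\n", -1);
	}

	static ssize_t queue_poll_delay_store(struct request_queue *q,
					      const char *page, size_t count)
	{
		/* accept and ignore any value so old users don't see errors */
		return count;
	}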

 Documentation/ABI/stable/sysfs-block |  15 +-
 block/blk-core.c                     |   6 -
 block/blk-mq-debugfs.c               |  26 ----
 block/blk-mq.c                       | 205 +--------------------------
 block/blk-stat.c                     |  18 ---
 block/blk-sysfs.c                    |  25 +---
 include/linux/blk-mq.h               |   2 -
 include/linux/blkdev.h               |  12 --
 io_uring/rw.c                        |   2 +-
 9 files changed, 12 insertions(+), 299 deletions(-)
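
For context on the commit message's claim, here is a condensed,
non-authoritative sketch of the pre-patch dispatch (taken from the code
this patch removes in the blk-mq.c hunk below): io_uring's io_do_iopoll()
always passed BLK_POLL_NOSLEEP (see the io_uring/rw.c hunk), so the hybrid
branch could never be taken from the only user space polling interface.

	/* block/blk-mq.c before this patch (trimmed) */
	int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
			struct io_comp_batch *iob, unsigned int flags)
	{
		/* never true for io_uring, which always sets BLK_POLL_NOSLEEP */
		if (!(flags & BLK_POLL_NOSLEEP) &&
		    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
			if (blk_mq_poll_hybrid(q, cookie))
				return 1;
		}
		return blk_mq_poll_classic(q, cookie, iob, flags);
	}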

Comments

Keith Busch March 20, 2023, 8:56 p.m. UTC | #1
On Mon, Mar 20, 2023 at 12:49:26PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> io_uring provides the only way user space can poll completions, and that
> always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
> code, so remove it and everything supporting it.

Hybrid polling was effectively killed off with 9650b453a3d4b1, "block: ignore
RWF_HIPRI hint for sync dio", so we could add a "Fixes: " for that. It was
still potentially reachable through io_uring until d729cf9acb93119, "io_uring:
don't sleep when polling for I/O", but hybrid polling probably should not have
been reachable through that async interface from the beginning.
Jens Axboe March 20, 2023, 8:57 p.m. UTC | #2
On 3/20/23 2:56 PM, Keith Busch wrote:
> On Mon, Mar 20, 2023 at 12:49:26PM -0700, Keith Busch wrote:
>> From: Keith Busch <kbusch@kernel.org>
>>
>> io_uring provides the only way user space can poll completions, and that
>> always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
>> code, so remove it and everything supporting it.
> 
> Hybrid polling was effectively killed off with 9650b453a3d4b1, "block: ignore
> RWF_HIPRI hint for sync dio", so we could add a "Fixes: " for that. It was
> still potentially reachable through io_uring until d729cf9acb93119, "io_uring:
> don't sleep when polling for I/O", but hybrid polling probably should not have
> been reachable through that async interface from the beginning.

Thanks, I'll add it as fixing both of those, more as a reference than
anything else; it's not like we are backporting this change.
Jens Axboe March 20, 2023, 8:59 p.m. UTC | #3
On Mon, 20 Mar 2023 12:49:26 -0700, Keith Busch wrote:
> io_uring provides the only way user space can poll completions, and that
> always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
> code, so remove it and everything supporting it.
> 
> 

Applied, thanks!

[1/1] blk-mq: remove hybrid polling
      commit: aa939e415c6c49cabcab2bc16fda2bc38ca0c235

Best regards,

Patch

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 282de3680367d..c57e5b7cb5326 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -336,18 +336,11 @@  What:		/sys/block/<disk>/queue/io_poll_delay
 Date:		November 2016
 Contact:	linux-block@vger.kernel.org
 Description:
-		[RW] If polling is enabled, this controls what kind of polling
-		will be performed. It defaults to -1, which is classic polling.
+		[RW] This was used to control what kind of polling will be
+		performed.  It is now fixed to -1, which is classic polling.
 		In this mode, the CPU will repeatedly ask for completions
-		without giving up any time.  If set to 0, a hybrid polling mode
-		is used, where the kernel will attempt to make an educated guess
-		at when the IO will complete. Based on this guess, the kernel
-		will put the process issuing IO to sleep for an amount of time,
-		before entering a classic poll loop. This mode might be a little
-		slower than pure classic polling, but it will be more efficient.
-		If set to a value larger than 0, the kernel will put the process
-		issuing IO to sleep for this amount of microseconds before
-		entering classic polling.
+		without giving up any time.
+		<deprecated>
 
 
 What:		/sys/block/<disk>/queue/io_timeout
diff --git a/block/blk-core.c b/block/blk-core.c
index 9e5e0277a4d95..269765d16cfd9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -263,13 +263,7 @@  static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 
 static void blk_free_queue(struct request_queue *q)
 {
-	if (q->poll_stat)
-		blk_stat_remove_callback(q, q->poll_cb);
-	blk_stat_free_callback(q->poll_cb);
-
 	blk_free_queue_stats(q->stats);
-	kfree(q->poll_stat);
-
 	if (queue_is_mq(q))
 		blk_mq_release(q);
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b01818f8e216e..212a7f301e730 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -15,33 +15,8 @@ 
 #include "blk-mq-tag.h"
 #include "blk-rq-qos.h"
 
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
-	if (stat->nr_samples) {
-		seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
-			   stat->nr_samples, stat->mean, stat->min, stat->max);
-	} else {
-		seq_puts(m, "samples=0");
-	}
-}
-
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
-	struct request_queue *q = data;
-	int bucket;
-
-	if (!q->poll_stat)
-		return 0;
-
-	for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
-		seq_printf(m, "read  (%d Bytes): ", 1 << (9 + bucket));
-		print_stat(m, &q->poll_stat[2 * bucket]);
-		seq_puts(m, "\n");
-
-		seq_printf(m, "write (%d Bytes): ",  1 << (9 + bucket));
-		print_stat(m, &q->poll_stat[2 * bucket + 1]);
-		seq_puts(m, "\n");
-	}
 	return 0;
 }
 
@@ -282,7 +257,6 @@  static const char *const rqf_name[] = {
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
-	RQF_NAME(MQ_POLL_SLEPT),
 	RQF_NAME(TIMED_OUT),
 	RQF_NAME(ELV),
 	RQF_NAME(RESV),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a875b1cdff9b5..4e30459df8151 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -46,51 +46,15 @@ 
 
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
-	int ddir, sectors, bucket;
-
-	ddir = rq_data_dir(rq);
-	sectors = blk_rq_stats_sectors(rq);
-
-	bucket = ddir + 2 * ilog2(sectors);
-
-	if (bucket < 0)
-		return -1;
-	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
-		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
-	return bucket;
-}
-
-#define BLK_QC_T_SHIFT		16
-#define BLK_QC_T_INTERNAL	(1U << 31)
-
 static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
 		blk_qc_t qc)
 {
-	return xa_load(&q->hctx_table,
-			(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
-}
-
-static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
-		blk_qc_t qc)
-{
-	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
-
-	if (qc & BLK_QC_T_INTERNAL)
-		return blk_mq_tag_to_rq(hctx->sched_tags, tag);
-	return blk_mq_tag_to_rq(hctx->tags, tag);
+	return xa_load(&q->hctx_table, qc);
 }
 
 static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 {
-	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
-		(rq->tag != -1 ?
-		 rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+	return rq->mq_hctx->queue_num;
 }
 
 /*
@@ -1038,10 +1002,8 @@  static inline void blk_account_io_start(struct request *req)
 
 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 {
-	if (rq->rq_flags & RQF_STATS) {
-		blk_mq_poll_stats_start(rq->q);
+	if (rq->rq_flags & RQF_STATS)
 		blk_stat_add(rq, now);
-	}
 
 	blk_mq_sched_completed_request(rq, now);
 	blk_account_io_done(rq, now);
@@ -4222,14 +4184,8 @@  int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
-	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
-					     blk_mq_poll_stats_bkt,
-					     BLK_MQ_POLL_STATS_BKTS, q);
-	if (!q->poll_cb)
-		goto err_exit;
-
 	if (blk_mq_alloc_ctxs(q))
-		goto err_poll;
+		goto err_exit;
 
 	/* init q->mq_kobj and sw queues' kobjects */
 	blk_mq_sysfs_init(q);
@@ -4257,11 +4213,6 @@  int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	q->nr_requests = set->queue_depth;
 
-	/*
-	 * Default to classic polling
-	 */
-	q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
@@ -4269,9 +4220,6 @@  int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 err_hctxs:
 	blk_mq_release(q);
-err_poll:
-	blk_stat_free_callback(q->poll_cb);
-	q->poll_cb = NULL;
 err_exit:
 	q->mq_ops = NULL;
 	return -ENOMEM;
@@ -4768,138 +4716,8 @@  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
-	if (q->poll_stat)
-		return true;
-
-	return blk_stats_alloc_enable(q);
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
-	/*
-	 * We don't arm the callback if polling stats are not enabled or the
-	 * callback is already active.
-	 */
-	if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
-		return;
-
-	blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
-	struct request_queue *q = cb->data;
-	int bucket;
-
-	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
-		if (cb->stat[bucket].nr_samples)
-			q->poll_stat[bucket] = cb->stat[bucket];
-	}
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
-				       struct request *rq)
-{
-	unsigned long ret = 0;
-	int bucket;
-
-	/*
-	 * If stats collection isn't on, don't sleep but turn it on for
-	 * future users
-	 */
-	if (!blk_poll_stats_enable(q))
-		return 0;
-
-	/*
-	 * As an optimistic guess, use half of the mean service time
-	 * for this type of request. We can (and should) make this smarter.
-	 * For instance, if the completion latencies are tight, we can
-	 * get closer than just half the mean. This is especially
-	 * important on devices where the completion latencies are longer
-	 * than ~10 usec. We do use the stats for the relevant IO size
-	 * if available which does lead to better estimates.
-	 */
-	bucket = blk_mq_poll_stats_bkt(rq);
-	if (bucket < 0)
-		return ret;
-
-	if (q->poll_stat[bucket].nr_samples)
-		ret = (q->poll_stat[bucket].mean + 1) / 2;
-
-	return ret;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
-{
-	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
-	struct request *rq = blk_qc_to_rq(hctx, qc);
-	struct hrtimer_sleeper hs;
-	enum hrtimer_mode mode;
-	unsigned int nsecs;
-	ktime_t kt;
-
-	/*
-	 * If a request has completed on queue that uses an I/O scheduler, we
-	 * won't get back a request from blk_qc_to_rq.
-	 */
-	if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
-		return false;
-
-	/*
-	 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
-	 *
-	 *  0:	use half of prev avg
-	 * >0:	use this specific value
-	 */
-	if (q->poll_nsec > 0)
-		nsecs = q->poll_nsec;
-	else
-		nsecs = blk_mq_poll_nsecs(q, rq);
-
-	if (!nsecs)
-		return false;
-
-	rq->rq_flags |= RQF_MQ_POLL_SLEPT;
-
-	/*
-	 * This will be replaced with the stats tracking code, using
-	 * 'avg_completion_time / 2' as the pre-sleep target.
-	 */
-	kt = nsecs;
-
-	mode = HRTIMER_MODE_REL;
-	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
-	hrtimer_set_expires(&hs.timer, kt);
-
-	do {
-		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
-			break;
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		hrtimer_sleeper_start_expires(&hs, mode);
-		if (hs.task)
-			io_schedule();
-		hrtimer_cancel(&hs.timer);
-		mode = HRTIMER_MODE_ABS;
-	} while (hs.task && !signal_pending(current));
-
-	__set_current_state(TASK_RUNNING);
-	destroy_hrtimer_on_stack(&hs.timer);
-
-	/*
-	 * If we sleep, have the caller restart the poll loop to reset the
-	 * state.  Like for the other success return cases, the caller is
-	 * responsible for checking if the IO completed.  If the IO isn't
-	 * complete, we'll get called again and will go straight to the busy
-	 * poll loop.
-	 */
-	return true;
-}
-
-static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
-			       struct io_comp_batch *iob, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+		unsigned int flags)
 {
 	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
 	long state = get_current_state();
@@ -4926,17 +4744,6 @@  static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
 	return 0;
 }
 
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
-		unsigned int flags)
-{
-	if (!(flags & BLK_POLL_NOSLEEP) &&
-	    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
-		if (blk_mq_poll_hybrid(q, cookie))
-			return 1;
-	}
-	return blk_mq_poll_classic(q, cookie, iob, flags);
-}
-
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
 	return rq->mq_ctx->cpu;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index c6ca16abf911e..74a1a8c32d86f 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -231,21 +231,3 @@  void blk_free_queue_stats(struct blk_queue_stats *stats)
 
 	kfree(stats);
 }
-
-bool blk_stats_alloc_enable(struct request_queue *q)
-{
-	struct blk_rq_stat *poll_stat;
-
-	poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
-				GFP_ATOMIC);
-	if (!poll_stat)
-		return false;
-
-	if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
-		kfree(poll_stat);
-		return true;
-	}
-
-	blk_stat_add_callback(q, q->poll_cb);
-	return false;
-}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f1fce1c7fa44b..1a743b4f29582 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -408,35 +408,12 @@  queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
-	int val;
-
-	if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
-		val = BLK_MQ_POLL_CLASSIC;
-	else
-		val = q->poll_nsec / 1000;
-
-	return sprintf(page, "%d\n", val);
+	return sprintf(page, "%d\n", -1);
 }
 
 static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 				size_t count)
 {
-	int err, val;
-
-	if (!q->mq_ops || !q->mq_ops->poll)
-		return -EINVAL;
-
-	err = kstrtoint(page, 10, &val);
-	if (err < 0)
-		return err;
-
-	if (val == BLK_MQ_POLL_CLASSIC)
-		q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-	else if (val >= 0)
-		q->poll_nsec = val * 1000;
-	else
-		return -EINVAL;
-
 	return count;
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index dd5ce1137f04a..1dacb2c81fdda 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,8 +57,6 @@  typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
 /* queue has elevator attached */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d1aee08f8c181..6ede578dfbc64 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,12 +44,6 @@  extern const struct device_type disk_type;
 extern struct device_type part_type;
 extern struct class block_class;
 
-/* Must be consistent with blk_mq_poll_stats_bkt() */
-#define BLK_MQ_POLL_STATS_BKTS 16
-
-/* Doing classic polling */
-#define BLK_MQ_POLL_CLASSIC -1
-
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -468,10 +462,6 @@  struct request_queue {
 #endif
 
 	unsigned int		rq_timeout;
-	int			poll_nsec;
-
-	struct blk_stat_callback	*poll_cb;
-	struct blk_rq_stat	*poll_stat;
 
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
@@ -870,8 +860,6 @@  blk_status_t errno_to_blk_status(int errno);
 
 /* only poll the hardware once, don't continue until a completion was found */
 #define BLK_POLL_ONESHOT		(1 << 0)
-/* do not sleep to wait for the expected completion time */
-#define BLK_POLL_NOSLEEP		(1 << 1)
 int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
 int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
 			unsigned int flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e2009..a099dc0543d95 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1002,7 +1002,7 @@  void io_rw_fail(struct io_kiocb *req)
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
-	unsigned int poll_flags = BLK_POLL_NOSLEEP;
+	unsigned int poll_flags = 0;
 	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;