[9/9] writeback: throttle buffered writeback

Message ID	1459350477-16404-10-git-send-email-axboe@fb.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-fsdevel-owner@kernel.org> From: Jens Axboe <axboe@fb.com> To: <linux-kernel@vger.kernel.org>, <linux-fsdevel@vger.kernel.org>, <linux-block@vger.kernel.org> CC: Jens Axboe <axboe@fb.com> Subject: [PATCH 9/9] writeback: throttle buffered writeback Date: Wed, 30 Mar 2016 09:07:57 -0600 Message-ID: <1459350477-16404-10-git-send-email-axboe@fb.com> In-Reply-To: <1459350477-16404-1-git-send-email-axboe@fb.com> References: <1459350477-16404-1-git-send-email-axboe@fb.com> MIME-Version: 1.0 Content-Type: text/plain Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk

diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d4..9df911a3b569 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-wb.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index 827f8badd143..85a92cd6047b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -863,6 +864,9 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, */ blk_queue_make_request(q, blk_queue_bio); + if (blk_wb_init(q)) + goto fail; + q->sg_reserved_size = INT_MAX; /* Protect q->elevator from elevator_change */ @@ -880,6 +884,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, fail: blk_free_flush_queue(q->fq); + blk_wb_exit(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1485,6 +1490,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + blk_wb_done(q->rq_wb, req); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools @@ -1714,6 +1721,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; + bool wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1766,6 +1774,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: + wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock); + /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the @@ -1781,11 +1791,16 @@ get_rq: */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + if (wb_acct) + req->cmd_flags |= REQ_BUF_INFLIGHT; + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). diff --git a/block/blk-mq.c b/block/blk-mq.c index 1699baf39b78..437cdc9b429c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -29,6 +29,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-wb.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -274,6 +275,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->cmd_flags & REQ_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + blk_wb_done(q->rq_wb, rq); + rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -1253,6 +1257,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1270,9 +1275,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + wb_acct = blk_wb_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1349,6 +1362,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) struct blk_map_ctx data; struct request *rq; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1363,9 +1377,17 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, NULL)) return BLK_QC_T_NONE; + wb_acct = blk_wb_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -2062,6 +2084,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->nr_requests = set->queue_depth; + if (blk_wb_init(q)) + goto err_hctxs; + if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); @@ -2097,6 +2122,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + blk_wb_exit(q); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); diff --git a/block/blk-settings.c b/block/blk-settings.c index 06e01682f827..bd713a8aa755 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include "blk.h" +#include "blk-wb.h" unsigned long blk_max_low_pfn; EXPORT_SYMBOL(blk_max_low_pfn); @@ -823,6 +824,8 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; + if (q->rq_wb) + blk_wb_update_limits(q->rq_wb, depth); } EXPORT_SYMBOL(blk_set_queue_depth); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 954e510452d7..2afd5cb8f003 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" struct queue_sysfs_entry { struct attribute attr; @@ -347,6 +348,76 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_stats_show(struct request_queue *q, char *page) +{ + struct rq_wb *wb = q->rq_wb; + + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "idle=%d, normal=%d, max=%d, inflight=%d, wait=%d," + " timer=%d, bdp_wait=%d\n", wb->wb_idle, + wb->wb_normal, wb->wb_max, + atomic_read(&wb->inflight), + waitqueue_active(&wb->wait), + timer_pending(&wb->timer), + *wb->bdp_wait); +} + +static ssize_t queue_wb_perc_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return queue_var_show(q->rq_wb->perc, page); +} + +static ssize_t queue_wb_perc_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long perc; + ssize_t ret; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store(&perc, page, count); + if (ret < 0) + return ret; + if (perc > 100) + return -EINVAL; + + q->rq_wb->perc = perc; + blk_wb_update_limits(q->rq_wb, blk_queue_depth(q)); + return ret; +} + +static ssize_t queue_wb_cache_delay_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return queue_var_show(q->rq_wb->cache_delay_usecs, page); +} + +static ssize_t queue_wb_cache_delay_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long var; + ssize_t ret; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store(&var, page, count); + if (ret < 0) + return ret; + + q->rq_wb->cache_delay_usecs = var; + q->rq_wb->cache_delay = usecs_to_jiffies(var); + return ret; +} + static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -516,6 +587,21 @@ static struct queue_sysfs_entry queue_wc_entry = { .store = queue_wc_store, }; +static struct queue_sysfs_entry queue_wb_stats_entry = { + .attr = {.name = "wb_stats", .mode = S_IRUGO }, + .show = queue_wb_stats_show, +}; +static struct queue_sysfs_entry queue_wb_cache_delay_entry = { + .attr = {.name = "wb_cache_usecs", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_cache_delay_show, + .store = queue_wb_cache_delay_store, +}; +static struct queue_sysfs_entry queue_wb_perc_entry = { + .attr = {.name = "wb_percent", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_perc_show, + .store = queue_wb_perc_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -542,6 +628,9 @@ static struct attribute *default_attrs[] = { &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, + &queue_wb_stats_entry.attr, + &queue_wb_cache_delay_entry.attr, + &queue_wb_perc_entry.attr, NULL, }; diff --git a/block/blk-wb.c b/block/blk-wb.c new file mode 100644 index 000000000000..d93dd1ccf16a --- /dev/null +++ b/block/blk-wb.c @@ -0,0 +1,238 @@ +/* + * buffered writeback throttling + * + * Copyright (C) 2016 Jens Axboe + * + * Things that need changing: + * + * - Auto-detection of optimal wb_percent setting. A lower setting + * is appropriate on rotating storage (wb_percent=15 gives good + * separation, while still getting full bandwidth with wb cache). + * + */ +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/blkdev.h> + +#include "blk.h" +#include "blk-wb.h" + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb->wb_normal != 0; +} + +void __blk_wb_done(struct rq_wb *rwb) +{ + int inflight, limit = rwb->wb_normal; + + inflight = atomic_dec_return(&rwb->inflight); + if (inflight >= limit) + return; + + /* + * If the device does caching, we can still flood it with IO + * even at a low depth. If caching is on, delay a bit before + * submitting the next, if we're still purely background + * activity. + */ + if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && !*rwb->bdp_wait && + time_before(jiffies, rwb->last_comp + rwb->cache_delay)) { + if (!timer_pending(&rwb->timer)) + mod_timer(&rwb->timer, jiffies + rwb->cache_delay); + return; + } + + if (waitqueue_active(&rwb->wait)) { + int diff = limit - inflight; + + if (diff >= rwb->wb_idle / 2) + wake_up_nr(&rwb->wait, 1); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void blk_wb_done(struct rq_wb *rwb, struct request *rq) +{ + if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) { + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != rwb->last_comp) + rwb->last_comp = cur; + } + } else + __blk_wb_done(rwb); +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned int rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If REQ_SYNC is + * set, then it's WB_SYNC_ALL writeback. Bump the limit 4x for + * those, since someone is (or will be) waiting on that. + */ + if ((rw & REQ_SYNC) || *rwb->bdp_wait) + limit = rwb->wb_max; + else if (time_before(jiffies, rwb->last_comp + HZ / 10)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_idle; + } else + limit = rwb->wb_normal; + + return limit; +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __blk_wb_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock) +{ + DEFINE_WAIT(wait); + + if (!timer_pending(&rwb->timer) && + atomic_inc_below(&rwb->inflight, get_limit(rwb, rw))) + return; + + do { + prepare_to_wait_exclusive(&rwb->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (!timer_pending(&rwb->timer) && + atomic_inc_below(&rwb->inflight, get_limit(rwb, rw))) + break; + + if (lock) + spin_unlock_irq(lock); + + io_schedule(); + + if (lock) + spin_lock_irq(lock); + } while (1); + + finish_wait(&rwb->wait, &wait); +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. + */ +bool blk_wb_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +{ + /* + * If disabled, or not a WRITE (or a discard), do nothing + */ + if (!rwb_enabled(rwb) || !(bio->bi_rw & REQ_WRITE) || + (bio->bi_rw & REQ_DISCARD)) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) + return false; + + __blk_wb_wait(rwb, bio->bi_rw, lock); + return true; +} + +static void calc_wb_limits(struct rq_wb *rwb, unsigned int depth, + unsigned int perc) +{ + /* + * We'll use depth==64 as a reasonable max limit that should be able + * to achieve full device bandwidth anywhere. + */ + depth = min(64U, depth); + + /* + * Full perf writes are max 'perc' percentage of the depth + */ + rwb->wb_max = (perc * depth + 1) / 100; + if (!rwb->wb_max && perc) + rwb->wb_max = 1; + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_idle = (rwb->wb_max + 3) / 4; +} + +void blk_wb_update_limits(struct rq_wb *rwb, unsigned int depth) +{ + calc_wb_limits(rwb, depth, rwb->perc); + wake_up_all(&rwb->wait); +} + +static void blk_wb_timer(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + + if (waitqueue_active(&rwb->wait)) + wake_up_nr(&rwb->wait, 1); +} + +#define DEF_WB_PERC 50 +#define DEF_WB_CACHE_DELAY 10000 + +int blk_wb_init(struct request_queue *q) +{ + struct rq_wb *rwb; + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return -ENOMEM; + + atomic_set(&rwb->inflight, 0); + init_waitqueue_head(&rwb->wait); + rwb->last_comp = jiffies; + rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping; + setup_timer(&rwb->timer, blk_wb_timer, (unsigned long) rwb); + rwb->perc = DEF_WB_PERC; + rwb->cache_delay_usecs = DEF_WB_CACHE_DELAY; + rwb->cache_delay = usecs_to_jiffies(rwb->cache_delay); + rwb->q = q; + blk_wb_update_limits(rwb, blk_queue_depth(q)); + q->rq_wb = rwb; + return 0; +} + +void blk_wb_exit(struct request_queue *q) +{ + if (q->rq_wb) + del_timer_sync(&q->rq_wb->timer); + + kfree(q->rq_wb); + q->rq_wb = NULL; +} diff --git a/block/blk-wb.h b/block/blk-wb.h new file mode 100644 index 000000000000..201bc00ac7a7 --- /dev/null +++ b/block/blk-wb.h @@ -0,0 +1,33 @@ +#ifndef BLK_WB_H +#define BLK_WB_H + +#include <linux/atomic.h> +#include <linux/wait.h> + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int perc; /* INPUT */ + unsigned int wb_idle; /* idle writeback */ + unsigned int wb_normal; /* normal writeback */ + unsigned int wb_max; /* max throughput writeback */ + + unsigned int cache_delay; + unsigned int cache_delay_usecs; + unsigned long last_comp; + unsigned int *bdp_wait; + struct request_queue *q; + atomic_t inflight; + wait_queue_head_t wait; + struct timer_list timer; +}; + +void __blk_wb_done(struct rq_wb *); +void blk_wb_done(struct rq_wb *, struct request *); +bool blk_wb_wait(struct rq_wb *, struct bio *, spinlock_t *); +int blk_wb_init(struct request_queue *); +void blk_wb_exit(struct request_queue *); +void blk_wb_update_limits(struct rq_wb *, unsigned int); + +#endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 86a38ea1823f..6f2a174b771c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,6 +188,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ + __REQ_BUF_INFLIGHT, /* track inflight for buffered */ __REQ_NR_BITS, /* stops here */ }; @@ -241,6 +242,7 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) +#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 08b897b159d1..ee9b90ff4fde 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -37,6 +37,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -290,6 +291,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg * is used, root blkg allocates from @q->root_rl and all other

[9/9] writeback: throttle buffered writeback

Commit Message

Patch