[6/6] writeback: throttle buffered writeback

Message ID	1458669320-6819-7-git-send-email-axboe@fb.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-fsdevel-owner@kernel.org> From: Jens Axboe <axboe@fb.com> To: <linux-kernel@vger.kernel.org>, <linux-fsdevel@vger.kernel.org>, <linux-block@vger.kernel.org> CC: Jens Axboe <axboe@fb.com> Subject: [PATCH 6/6] writeback: throttle buffered writeback Date: Tue, 22 Mar 2016 11:55:20 -0600 Message-ID: <1458669320-6819-7-git-send-email-axboe@fb.com> In-Reply-To: <1458669320-6819-1-git-send-email-axboe@fb.com> References: <1458669320-6819-1-git-send-email-axboe@fb.com> MIME-Version: 1.0 Content-Type: text/plain Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk

diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d4..9df911a3b569 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-wb.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index a9fe3d88af99..24cc0481e3ca 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -848,6 +849,9 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + if (blk_buffered_writeback_init(q)) + goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; @@ -880,6 +884,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, fail: blk_free_flush_queue(q->fq); + blk_buffered_writeback_exit(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1485,6 +1490,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + blk_buffered_writeback_done(q->rq_wb, req); + /* * Request may not have originated from ll_rw_blk. 
if not, * it didn't come out of our reserved rq pools @@ -1715,6 +1722,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) struct request *req; unsigned int request_count = 0; u64 rw_flags; + bool wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1767,6 +1775,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: + wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, q->queue_lock); + /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the @@ -1775,6 +1785,8 @@ get_rq: rw_flags = bio_data_dir(bio); if (sync) rw_flags |= REQ_SYNC; + if (wb_acct) + rw_flags |= REQ_BUF_INFLIGHT; /* * Grab a free request. This is might sleep but can not fail. @@ -1782,6 +1794,8 @@ get_rq: */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { + if (wb_acct) + __blk_buffered_writeback_done(q->rq_wb); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; diff --git a/block/blk-lib.c b/block/blk-lib.c index 9ebf65379556..b39d92361f30 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -6,6 +6,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/scatterlist.h> +#include <linux/atomic.h> #include "blk.h" diff --git a/block/blk-mq.c b/block/blk-mq.c index 050f7a13021b..55aace97fd35 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -29,6 +29,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-wb.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -274,6 +275,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->cmd_flags & REQ_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + blk_buffered_writeback_done(q->rq_wb, rq); + rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -1253,6 +1257,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request 
*same_queue_rq = NULL; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1270,9 +1275,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_buffered_writeback_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1349,6 +1362,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) struct blk_map_ctx data; struct request *rq; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1363,9 +1377,17 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, NULL)) return BLK_QC_T_NONE; + wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_buffered_writeback_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -2018,6 +2040,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + if (blk_buffered_writeback_init(q)) + return ERR_PTR(-ENOMEM); + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!q->queue_ctx) return ERR_PTR(-ENOMEM); @@ -2084,6 +2109,7 @@ err_map: kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); + blk_buffered_writeback_exit(q); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(blk_mq_init_allocated_queue); @@ -2096,6 +2122,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + blk_buffered_writeback_exit(q); + 
blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index deb2270bf1f3..da5e0517b9af 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" struct queue_sysfs_entry { struct attribute attr; @@ -358,7 +359,6 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - ssize_t ret; int set = -1; if (!strncmp(page, "write back", 10)) @@ -377,6 +377,71 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, queue_flag_clear(QUEUE_FLAG_WC, q); spin_unlock_irq(q->queue_lock); + return count; +} + +static ssize_t queue_wb_stats_show(struct request_queue *q, char *page) +{ + struct rq_wb *wb = q->rq_wb; + + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "limit=%d, batch=%d, inflight=%d, wait=%d, timer=%d\n", + wb->limit, wb->batch, atomic_read(&wb->inflight), + waitqueue_active(&wb->wait), timer_pending(&wb->timer)); +} + +static ssize_t queue_wb_depth_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return queue_var_show(q->rq_wb->limit, page); +} + +static ssize_t queue_wb_depth_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long var; + ssize_t ret; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store(&var, page, count); + if (ret < 0) + return ret; + if (var != (unsigned int) var) + return -EINVAL; + + blk_update_wb_limit(q->rq_wb, var); + return ret; +} + +static ssize_t queue_wb_cache_delay_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return queue_var_show(q->rq_wb->cache_delay_usecs, page); +} + +static ssize_t queue_wb_cache_delay_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long var; + ssize_t ret; + + if (!q->rq_wb) + return -EINVAL; + + 
ret = queue_var_store(&var, page, count); + if (ret < 0) + return ret; + + q->rq_wb->cache_delay_usecs = var; + q->rq_wb->cache_delay = usecs_to_jiffies(var); return ret; } @@ -517,6 +582,21 @@ static struct queue_sysfs_entry queue_wc_entry = { .store = queue_wc_store, }; +static struct queue_sysfs_entry queue_wb_stats_entry = { + .attr = {.name = "wb_stats", .mode = S_IRUGO }, + .show = queue_wb_stats_show, +}; +static struct queue_sysfs_entry queue_wb_cache_delay_entry = { + .attr = {.name = "wb_cache_usecs", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_cache_delay_show, + .store = queue_wb_cache_delay_store, +}; +static struct queue_sysfs_entry queue_wb_depth_entry = { + .attr = {.name = "wb_depth", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_depth_show, + .store = queue_wb_depth_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -543,6 +623,9 @@ static struct attribute *default_attrs[] = { &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, + &queue_wb_stats_entry.attr, + &queue_wb_cache_delay_entry.attr, + &queue_wb_depth_entry.attr, NULL, }; diff --git a/block/blk-wb.c b/block/blk-wb.c new file mode 100644 index 000000000000..2aa3753a8e1e --- /dev/null +++ b/block/blk-wb.c @@ -0,0 +1,219 @@ +/* + * buffered writeback throttling + * + * Copyright (C) 2016 Jens Axboe + * + * Things that need changing: + * + * - Auto-detection of most of this, no tunables. Cache type we can get, + * and most other settings we can tweak/gather based on time. + * - Better solution for rwb->bdp_wait? + * - Higher depth for WB_SYNC_ALL? 
+ * + */ +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/blkdev.h> + +#include "blk.h" +#include "blk-wb.h" + +void __blk_buffered_writeback_done(struct rq_wb *rwb) +{ + int inflight; + + inflight = atomic_dec_return(&rwb->inflight); + if (inflight >= rwb->limit) + return; + + /* + * If the device does caching, we can still flood it with IO + * even at a low depth. If caching is on, delay a bit before + * submitting the next, if we're still purely background + * activity. + */ + if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && !*rwb->bdp_wait && + time_before(jiffies, rwb->last_comp + rwb->cache_delay)) { + if (!timer_pending(&rwb->timer)) + mod_timer(&rwb->timer, jiffies + rwb->cache_delay); + return; + } + + if (waitqueue_active(&rwb->wait)) { + int diff = rwb->limit - inflight; + + if (diff >= rwb->batch) + wake_up_nr(&rwb->wait, 1); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void blk_buffered_writeback_done(struct rq_wb *rwb, struct request *rq) +{ + if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) { + const unsigned long cur = jiffies; + + if (rwb->limit && cur != rwb->last_comp) + rwb->last_comp = cur; + } else + __blk_buffered_writeback_done(rwb); +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. 
+ */ +static void __blk_buffered_writeback_wait(struct rq_wb *rwb, unsigned int limit, + spinlock_t *lock) +{ + DEFINE_WAIT(wait); + + if (!timer_pending(&rwb->timer) && + atomic_inc_below(&rwb->inflight, limit)) + return; + + do { + prepare_to_wait_exclusive(&rwb->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (!timer_pending(&rwb->timer) && + atomic_inc_below(&rwb->inflight, limit)) + break; + + if (lock) + spin_unlock_irq(lock); + + io_schedule(); + + if (lock) + spin_lock_irq(lock); + } while (1); + + finish_wait(&rwb->wait, &wait); +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. + */ +bool blk_buffered_writeback_wait(struct rq_wb *rwb, struct bio *bio, + spinlock_t *lock) +{ + unsigned int limit; + + /* + * If disabled, or not a WRITE (or a discard), do nothing + */ + if (!rwb->limit || !(bio->bi_rw & REQ_WRITE) || + (bio->bi_rw & REQ_DISCARD)) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) + return false; + + /* + * At this point we know it's a buffered write. If REQ_SYNC is + * set, then it's WB_SYNC_ALL writeback. Bump the limit 4x for + * those, since someone is (or will be) waiting on that. + */ + limit = rwb->limit; + if (bio->bi_rw & REQ_SYNC) + limit <<= 2; + else if (limit != 1) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to a depth of 1 for background writeback. 
+ */ + if (time_before(jiffies, rwb->last_comp + HZ / 10)) + limit = 1; + else if (!*rwb->bdp_wait) + limit >>= 1; + } + + __blk_buffered_writeback_wait(rwb, limit, lock); + return true; +} + +void blk_update_wb_limit(struct rq_wb *rwb, unsigned int limit) +{ + rwb->limit = limit; + rwb->batch = rwb->limit / 2; + if (!rwb->batch && rwb->limit) + rwb->batch = 1; + else if (rwb->batch > 4) + rwb->batch = 4; + + wake_up_all(&rwb->wait); +} + +static void blk_buffered_writeback_timer(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + + if (waitqueue_active(&rwb->wait)) + wake_up_nr(&rwb->wait, 1); +} + +#define DEF_WB_LIMIT 4 +#define DEF_WB_CACHE_DELAY 10000 + +int blk_buffered_writeback_init(struct request_queue *q) +{ + struct rq_wb *rwb; + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return -ENOMEM; + + atomic_set(&rwb->inflight, 0); + init_waitqueue_head(&rwb->wait); + rwb->last_comp = jiffies; + rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping; + setup_timer(&rwb->timer, blk_buffered_writeback_timer, + (unsigned long) rwb); + rwb->cache_delay_usecs = DEF_WB_CACHE_DELAY; + rwb->cache_delay = usecs_to_jiffies(rwb->cache_delay_usecs); /* convert the usecs value; cache_delay itself is still 0 from kzalloc */ + rwb->q = q; + blk_update_wb_limit(rwb, DEF_WB_LIMIT); + q->rq_wb = rwb; + return 0; +} + +void blk_buffered_writeback_exit(struct request_queue *q) +{ + if (q->rq_wb) + del_timer_sync(&q->rq_wb->timer); + + kfree(q->rq_wb); + q->rq_wb = NULL; +} diff --git a/block/blk-wb.h b/block/blk-wb.h new file mode 100644 index 000000000000..f9d5dc817c80 --- /dev/null +++ b/block/blk-wb.h @@ -0,0 +1,24 @@ +#ifndef BLK_WB_H +#define BLK_WB_H + +struct rq_wb { + unsigned int limit; + unsigned int batch; + unsigned int cache_delay; + unsigned int cache_delay_usecs; + unsigned long last_comp; + unsigned int *bdp_wait; + struct request_queue *q; + atomic_t inflight; + wait_queue_head_t wait; + struct timer_list timer; +}; + +void __blk_buffered_writeback_done(struct rq_wb *); +void blk_buffered_writeback_done(struct 
rq_wb *, struct request *); +bool blk_buffered_writeback_wait(struct rq_wb *, struct bio *, spinlock_t *); +int blk_buffered_writeback_init(struct request_queue *); +void blk_buffered_writeback_exit(struct request_queue *); +void blk_update_wb_limit(struct rq_wb *, unsigned int); + +#endif diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1b4d69f68c33..f702309216b4 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,8 @@ struct bdi_writeback { struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + int dirty_sleeping; /* waiting on dirty limit exceeded */ + struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 86a38ea1823f..6f2a174b771c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,6 +188,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ + __REQ_BUF_INFLIGHT, /* track inflight for buffered */ __REQ_NR_BITS, /* stops here */ }; @@ -241,6 +242,7 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) +#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index da5e85c35318..b33e16c4f570 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -37,6 +37,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -290,6 +291,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If 
blkcg is not used, @q->root_rl serves all requests. If blkcg * is used, root blkg allocates from @q->root_rl and all other diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 11ff8f758631..15e696bc5d14 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1746,7 +1746,9 @@ pause: pause, start_time); __set_current_state(TASK_KILLABLE); + wb->dirty_sleeping = 1; io_schedule_timeout(pause); + wb->dirty_sleeping = 0; current->dirty_paused_when = now + pause; current->nr_dirtied = 0;

[6/6] writeback: throttle buffered writeback

Commit Message

Comments

Patch