diff mbox

[V4,07/15] blk-throttle: make throtl_slice tunable

Message ID 24137f881a149d30b98ef040367f26378128d4d2.1479161136.git.shli@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Shaohua Li Nov. 14, 2016, 10:22 p.m. UTC
throtl_slice is important for blk-throttling. A lot of stuffes depend on
it, for example, throughput measurement. It has 100ms default value,
which is not appropriate for all disks. For example, for SSD we might
use a smaller value to make the throughput smoother. This patch makes it
tunable.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-sysfs.c    | 11 ++++++++
 block/blk-throttle.c | 77 +++++++++++++++++++++++++++++++++++++---------------
 block/blk.h          |  3 ++
 3 files changed, 69 insertions(+), 22 deletions(-)

Comments

Tejun Heo Nov. 22, 2016, 9:27 p.m. UTC | #1
Hello,

On Mon, Nov 14, 2016 at 02:22:14PM -0800, Shaohua Li wrote:
> throtl_slice is important for blk-throttling. A lot of stuffes depend on
> it, for example, throughput measurement. It has 100ms default value,
> which is not appropriate for all disks. For example, for SSD we might
> use a smaller value to make the throughput smoother. This patch makes it
> tunable.

It bothers me a bit because time slice doesn't mean anything inherent
to throttling.  It really is an implementation detail - throttling can
be implemented at per-operation level without time slice involved at
all.  It's okay to expose the knob if necessary but the meaning of the
knob is almost completely arbitrary to users (as it has no inherent
meaning).

Thanks.
Shaohua Li Nov. 22, 2016, 11:18 p.m. UTC | #2
On Tue, Nov 22, 2016 at 04:27:15PM -0500, Tejun Heo wrote:
> Hello,
> 
> On Mon, Nov 14, 2016 at 02:22:14PM -0800, Shaohua Li wrote:
> > throtl_slice is important for blk-throttling. A lot of stuffes depend on
> > it, for example, throughput measurement. It has 100ms default value,
> > which is not appropriate for all disks. For example, for SSD we might
> > use a smaller value to make the throughput smoother. This patch makes it
> > tunable.
> 
> It bothers me a bit because time slice doesn't mean anything inherent
> to throttling.  It really is an implementation detail - throttling can
> be implemented at per-operation level without time slice involved at
> all.  It's okay to expose the knob if necessary but the meaning of the
> knob is almost completely arbitrary to users (as it has no inherent
> meaning).

Hmm, it's not a real 'time slice'. The name is a bit confusion. Maybe rename it
to 'throtl_interval' or 'throtl_sampling_time'? not sure. bandwidth and iops
are always in terms of a time interval we measure them. We can't say the
iops/bw for a single io. So this is really a tuable knob.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo Nov. 23, 2016, 9:17 p.m. UTC | #3
Hello,

On Tue, Nov 22, 2016 at 03:18:24PM -0800, Shaohua Li wrote:
> Hmm, it's not a real 'time slice'. The name is a bit confusion. Maybe rename it
> to 'throtl_interval' or 'throtl_sampling_time'? not sure. bandwidth and iops
> are always in terms of a time interval we measure them. We can't say the
> iops/bw for a single io. So this is really a tuable knob.

Yeah, maybe using a more indicative name is better.  However, even if
we say that this is the sampling period, it's not clear how adjusting
the knob would affect the behavior as that's not something clearly
defined in blk-throtl's operation model.  For contrast, compare it
with the latency target, the implemented behavior might not succeed to
follow the intended configuration perfectly but what the intention of
the configuration is clear regardless.

That said, if this needs to be a tunable knob, it's fine to have it,
but let's at least try to document what the effects of changing the
variable is.

Thanks.
diff mbox

Patch

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cc8d7c..3e284e4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -526,6 +526,14 @@  static struct queue_sysfs_entry queue_dax_entry = {
 	.show = queue_dax_show,
 };
 
+#ifdef CONFIG_BLK_DEV_THROTTLING
+static struct queue_sysfs_entry throtl_slice_entry = {
+	.attr = {.name = "throttling_slice", .mode = S_IRUGO | S_IWUSR },
+	.show = blk_throtl_slice_show,
+	.store = blk_throtl_slice_store,
+};
+#endif
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -553,6 +561,9 @@  static struct attribute *default_attrs[] = {
 	&queue_poll_entry.attr,
 	&queue_wc_entry.attr,
 	&queue_dax_entry.attr,
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	&throtl_slice_entry.attr,
+#endif
 	NULL,
 };
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index eff3120..e85b2b6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -19,7 +19,8 @@  static int throtl_grp_quantum = 8;
 static int throtl_quantum = 32;
 
 /* Throttling is performed over 100ms slice and after that slice is renewed */
-static unsigned long throtl_slice = HZ/10;	/* 100 ms */
+#define DFL_THROTL_SLICE (HZ / 10)
+#define MAX_THROTL_SLICE (HZ / 5)
 
 static struct blkcg_policy blkcg_policy_throtl;
 
@@ -158,6 +159,8 @@  struct throtl_data
 	/* Total Number of queued bios on READ and WRITE lists */
 	unsigned int nr_queued[2];
 
+	unsigned int throtl_slice;
+
 	/* Work for dispatching throttled bios */
 	struct work_struct dispatch_work;
 	unsigned int limit_index;
@@ -589,7 +592,7 @@  static void throtl_dequeue_tg(struct throtl_grp *tg)
 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
 					  unsigned long expires)
 {
-	unsigned long max_expire = jiffies + 8 * throtl_slice;
+	unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
 
 	if (time_after(expires, max_expire))
 		expires = max_expire;
@@ -650,7 +653,7 @@  static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
 	if (time_after_eq(start, tg->slice_start[rw]))
 		tg->slice_start[rw] = start;
 
-	tg->slice_end[rw] = jiffies + throtl_slice;
+	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
 	throtl_log(&tg->service_queue,
 		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -662,7 +665,7 @@  static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 	tg->bytes_disp[rw] = 0;
 	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
-	tg->slice_end[rw] = jiffies + throtl_slice;
+	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
 	throtl_log(&tg->service_queue,
 		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -672,13 +675,13 @@  static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
 					unsigned long jiffy_end)
 {
-	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
 }
 
 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
 				       unsigned long jiffy_end)
 {
-	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
 	throtl_log(&tg->service_queue,
 		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -718,19 +721,20 @@  static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 	 * is bad because it does not allow new slice to start.
 	 */
 
-	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
+	throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
 
 	time_elapsed = jiffies - tg->slice_start[rw];
 
-	nr_slices = time_elapsed / throtl_slice;
+	nr_slices = time_elapsed / tg->td->throtl_slice;
 
 	if (!nr_slices)
 		return;
-	tmp = tg_bps_limit(tg, rw) * throtl_slice * nr_slices;
+	tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
 	do_div(tmp, HZ);
 	bytes_trim = tmp;
 
-	io_trim = (tg_iops_limit(tg, rw) * throtl_slice * nr_slices) / HZ;
+	io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
+		HZ;
 
 	if (!bytes_trim && !io_trim)
 		return;
@@ -745,7 +749,7 @@  static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 	else
 		tg->io_disp[rw] = 0;
 
-	tg->slice_start[rw] += nr_slices * throtl_slice;
+	tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
 
 	throtl_log(&tg->service_queue,
 		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
@@ -765,9 +769,9 @@  static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 
 	/* Slice has just started. Consider one slice interval */
 	if (!jiffy_elapsed)
-		jiffy_elapsed_rnd = throtl_slice;
+		jiffy_elapsed_rnd = tg->td->throtl_slice;
 
-	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
 
 	/*
 	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -814,9 +818,9 @@  static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 
 	/* Slice has just started. Consider one slice interval */
 	if (!jiffy_elapsed)
-		jiffy_elapsed_rnd = throtl_slice;
+		jiffy_elapsed_rnd = tg->td->throtl_slice;
 
-	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
 
 	tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
 	do_div(tmp, HZ);
@@ -881,8 +885,10 @@  static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
 		throtl_start_new_slice(tg, rw);
 	else {
-		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
+		if (time_before(tg->slice_end[rw],
+		    jiffies + tg->td->throtl_slice))
+			throtl_extend_slice(tg, rw,
+				jiffies + tg->td->throtl_slice);
 	}
 
 	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
@@ -1632,7 +1638,7 @@  static bool throtl_can_upgrade(struct throtl_data *td,
 	if (td->limit_index != LIMIT_HIGH)
 		return false;
 
-	if (time_before(jiffies, td->high_downgrade_time + throtl_slice))
+	if (time_before(jiffies, td->high_downgrade_time + td->throtl_slice))
 		return false;
 
 	rcu_read_lock();
@@ -1689,8 +1695,9 @@  static bool throtl_downgrade_check_one(struct throtl_grp *tg)
 	 * If cgroup is below high limit, consider downgrade and throttle other
 	 * cgroups
 	 */
-	if (time_after_eq(now, td->high_upgrade_time + throtl_slice) &&
-	    time_after_eq(now, tg_last_high_overflow_time(tg) + throtl_slice))
+	if (time_after_eq(now, td->high_upgrade_time + td->throtl_slice) &&
+	    time_after_eq(now, tg_last_high_overflow_time(tg) +
+					td->throtl_slice))
 		return true;
 	return false;
 }
@@ -1723,10 +1730,11 @@  static void throtl_downgrade_check(struct throtl_grp *tg)
 		return;
 	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
 		return;
-	if (time_after(tg->last_check_time + throtl_slice, now))
+	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
 		return;
 
-	if (time_before(now, tg_last_high_overflow_time(tg) + throtl_slice))
+	if (time_before(now, tg_last_high_overflow_time(tg) +
+			tg->td->throtl_slice))
 		return;
 
 	elapsed_time = now - tg->last_check_time;
@@ -1965,6 +1973,7 @@  int blk_throtl_init(struct request_queue *q)
 
 	q->td = td;
 	td->queue = q;
+	td->throtl_slice = DFL_THROTL_SLICE;
 
 	td->limit_valid[LIMIT_MAX] = true;
 	td->limit_index = LIMIT_MAX;
@@ -1985,6 +1994,30 @@  void blk_throtl_exit(struct request_queue *q)
 	kfree(q->td);
 }
 
+ssize_t blk_throtl_slice_show(struct request_queue *q, char *page)
+{
+	if (!q->td)
+		return -EINVAL;
+	return sprintf(page, "%ums\n", jiffies_to_msecs(q->td->throtl_slice));
+}
+
+ssize_t blk_throtl_slice_store(struct request_queue *q,
+	const char *page, size_t count)
+{
+	unsigned long v;
+	unsigned long t;
+
+	if (!q->td)
+		return -EINVAL;
+	if (kstrtoul(page, 10, &v))
+		return -EINVAL;
+	t = msecs_to_jiffies(v);
+	if (t == 0 || t > MAX_THROTL_SLICE)
+		return -EINVAL;
+	q->td->throtl_slice = t;
+	return count;
+}
+
 static int __init throtl_init(void)
 {
 	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk.h b/block/blk.h
index 74444c4..39c14dd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -289,6 +289,9 @@  static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
 extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
+extern ssize_t blk_throtl_slice_show(struct request_queue *q, char *page);
+extern ssize_t blk_throtl_slice_store(struct request_queue *q,
+	const char *page, size_t count);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }