[06/14] blkcg: add generic throttling mechanism

Message ID 20180703151503.2549-7-josef@toxicpanda.com (mailing list archive)
State New, archived

Commit Message

Josef Bacik July 3, 2018, 3:14 p.m. UTC
From: Josef Bacik <jbacik@fb.com>

Since IO can be issued from literally anywhere it's almost impossible to
do throttling without having some sort of adverse effect somewhere else
in the system because of locking or other dependencies.  The best way to
solve this is to do the throttling when we know we aren't holding any
other kernel resources.  Do this by tracking throttling on a per-blkg
basis, and if we require throttling, flag the task so that it checks for
accumulated delay before it returns to user space and possibly sleeps there.

This is to address the case where a process is doing work that is
generating IO that can't be throttled, whether that is directly with a
lot of REQ_META IO, or indirectly by allocating so much memory that it
is swamping the disk with REQ_SWAP.  We can't use task_work_add() as we
don't want to induce a memory allocation in the IO path, so simply
saving the request queue in the task and flagging it via notify_resume
achieves the same result without the overhead of a memory allocation.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c          | 220 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-cgroup.h  |  99 ++++++++++++++++++++
 include/linux/cgroup-defs.h |   3 +
 include/linux/sched.h       |   8 ++
 include/linux/tracehook.h   |   2 +
 5 files changed, 332 insertions(+)
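
To make the call pattern described above concrete, here is a minimal,
hypothetical sketch of how an IO controller sitting in the submission path
might feed delay into this mechanism.  The example_charge_overage() wrapper
and the latency-target policy are invented for illustration; only the
blkcg_add_delay() and blkcg_schedule_throttle() helpers come from this patch.

/* Illustrative only -- not part of this series. */
static void example_charge_overage(struct blkcg_gq *blkg,
				   struct request_queue *q,
				   u64 measured_nsec, u64 target_nsec)
{
	if (measured_nsec <= target_nsec)
		return;

	/* Bank the excess latency; blkcg_scale_delay() decays it over time. */
	blkcg_add_delay(blkg, ktime_get_ns(), measured_nsec - target_nsec);

	/*
	 * Flag the submitting task.  It will sleep in
	 * blkcg_maybe_throttle_current() on its next return to user space,
	 * for at most 250ms per return in this version of the patch.
	 */
	blkcg_schedule_throttle(q, false);
}

Note that blkcg_maybe_throttle_blkg() only charges delay for blkgs whose
use_delay counter is non-zero, so a real controller would pair this with
blkcg_use_delay()/blkcg_unuse_delay() when it decides to start and stop
throttling the group.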

Comments

Konstantin Khlebnikov Aug. 5, 2018, 12:41 p.m. UTC | #1
On 03.07.2018 18:14, Josef Bacik wrote:
> From: Josef Bacik <jbacik@fb.com>
> 
> Since IO can be issued from literally anywhere it's almost impossible to
> do throttling without having some sort of adverse effect somewhere else
> in the system because of locking or other dependencies.  The best way to
> solve this is to do the throttling when we know we aren't holding any
> other kernel resources.  Do this by tracking throttling on a per-blkg
> basis, and if we require throttling, flag the task so that it checks for
> accumulated delay before it returns to user space and possibly sleeps there.
> 
> This is to address the case where a process is doing work that is
> generating IO that can't be throttled, whether that is directly with a
> lot of REQ_META IO, or indirectly by allocating so much memory that it
> is swamping the disk with REQ_SWAP.  We can't use task_work_add() as we
> don't want to induce a memory allocation in the IO path, so simply
> saving the request queue in the task and flagging it via notify_resume
> achieves the same result without the overhead of a memory allocation.

Technically speaking, task_work_add() could be used for this job without a memory allocation.
I did the same thing some years ago by embedding the callback_head into task_struct.
https://lore.kernel.org/patchwork/patch/533621/
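
For reference, the allocation-free task_work variant described above would
look roughly like the sketch below.  The field and function names are
invented, the task_work_add() signature shown is the three-argument
bool-notify one from this era, and a real version would also need a
"pending" flag so the same callback_head is never queued twice.

/*
 * Hypothetical sketch only.  Assumed extra task_struct fields:
 *	struct callback_head	throttle_work;
 *	struct request_queue	*throttle_queue;
 */
static void throttle_work_fn(struct callback_head *cb)
{
	struct task_struct *tsk = container_of(cb, struct task_struct,
					       throttle_work);

	/* Would do what blkcg_maybe_throttle_current() does, using tsk. */
}

static void example_schedule_throttle(struct request_queue *q)
{
	if (!blk_get_queue(q))
		return;

	current->throttle_queue = q;
	init_task_work(&current->throttle_work, throttle_work_fn);
	/* 'true' asks for the work to run on return to user space. */
	task_work_add(current, &current->throttle_work, true);
}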

Patch

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7dc6f05cc44b..d3310ec96c2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@ 
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -999,6 +1000,14 @@  static int blkcg_print_stat(struct seq_file *sf, void *v)
 		if (!blkcg_debug_stats)
 			goto next;
 
+		if (atomic_read(&blkg->use_delay)) {
+			has_stats = true;
+			off += scnprintf(buf+off, size-off,
+					 " use_delay=%d delay_nsec=%llu",
+					 atomic_read(&blkg->use_delay),
+					(unsigned long long)atomic64_read(&blkg->delay_nsec));
+		}
+
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
 			size_t written;
@@ -1326,6 +1335,13 @@  static void blkcg_bind(struct cgroup_subsys_state *root_css)
 	mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+	if (tsk->throttle_queue)
+		blk_put_queue(tsk->throttle_queue);
+	tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
@@ -1335,6 +1351,7 @@  struct cgroup_subsys io_cgrp_subsys = {
 	.dfl_cftypes = blkcg_files,
 	.legacy_cftypes = blkcg_legacy_files,
 	.legacy_name = "blkio",
+	.exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
 	/*
 	 * This ensures that, if available, memcg is automatically enabled
@@ -1586,5 +1603,208 @@  void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay.  We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+	u64 old = atomic64_read(&blkg->delay_start);
+
+	/*
+	 * We only want to scale down every second.  The idea here is that we
+	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+	 * time window.  We only want to throttle tasks for recent delay that
+	 * has occurred, in 1 second time windows since that's the maximum
+	 * things can be throttled.  We save the current delay window in
+	 * blkg->last_delay so we know what amount is still left to be charged
+	 * to the blkg from this point onward.  blkg->last_use keeps track of
+	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
+	 * are ok with whatever is happening now, and we can take away more of
+	 * the accumulated delay as we've already throttled enough that
+	 * everybody is happy with their IO latencies.
+	 */
+	if (time_before64(old + NSEC_PER_SEC, now) &&
+	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+		u64 cur = atomic64_read(&blkg->delay_nsec);
+		u64 sub = min_t(u64, blkg->last_delay, now - old);
+		int cur_use = atomic_read(&blkg->use_delay);
+
+		/*
+		 * We've been unthrottled, subtract a larger chunk of our
+		 * accumulated delay.
+		 */
+		if (cur_use < blkg->last_use)
+			sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+		/*
+		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
+		 * should only ever be growing except here where we subtract out
+		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
+		 * rather not end up with negative numbers.
+		 */
+		if (unlikely(cur < sub)) {
+			atomic64_set(&blkg->delay_nsec, 0);
+			blkg->last_delay = 0;
+		} else {
+			atomic64_sub(sub, &blkg->delay_nsec);
+			blkg->last_delay = cur - sub;
+		}
+		blkg->last_use = cur_use;
+	}
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay.  This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+	u64 now = ktime_to_ns(ktime_get());
+	u64 exp;
+	u64 delay_nsec = 0;
+	int tok;
+
+	while (blkg->parent) {
+		if (atomic_read(&blkg->use_delay)) {
+			blkcg_scale_delay(blkg, now);
+			delay_nsec = max_t(u64, delay_nsec,
+					   atomic64_read(&blkg->delay_nsec));
+		}
+		blkg = blkg->parent;
+	}
+
+	if (!delay_nsec)
+		return;
+
+	/*
+	 * Let's not sleep for all eternity if we've amassed a huge delay.
+	 * Swapping or metadata IO can accumulate 10's of seconds worth of
+	 * delay, and we want userspace to be able to do _something_ so cap the
+	 * delays at 0.25s.  If there's 10's of seconds worth of delay then
+	 * the tasks will be delayed for 0.25 seconds for every syscall.
+	 */
+	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+	/*
+	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+	 * that hasn't landed upstream yet.  Once that stuff is in place we need
+	 * to do a psi_memstall_enter/leave if memdelay is set.
+	 */
+
+	exp = ktime_add_ns(now, delay_nsec);
+	tok = io_schedule_prepare();
+	do {
+		__set_current_state(TASK_KILLABLE);
+		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+			break;
+	} while (!fatal_signal_pending(current));
+	io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume().  Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything.  This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+	struct request_queue *q = current->throttle_queue;
+	struct cgroup_subsys_state *css;
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+	bool use_memdelay = current->use_memdelay;
+
+	if (!q)
+		return;
+
+	current->throttle_queue = NULL;
+	current->use_memdelay = false;
+
+	rcu_read_lock();
+	css = kthread_blkcg();
+	if (css)
+		blkcg = css_to_blkcg(css);
+	else
+		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+	if (!blkcg)
+		goto out;
+	blkg = blkg_lookup(blkcg, q);
+	if (!blkg)
+		goto out;
+	blkg = blkg_try_get(blkg);
+	if (!blkg)
+		goto out;
+	rcu_read_unlock();
+	blk_put_queue(q);
+
+	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+	blkg_put(blkg);
+	return;
+out:
+	rcu_read_unlock();
+	blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task.  We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point.  This sets the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall.  You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once.  If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+	if (unlikely(current->flags & PF_KTHREAD))
+		return;
+
+	if (!blk_get_queue(q))
+		return;
+
+	if (current->throttle_queue)
+		blk_put_queue(current->throttle_queue);
+	current->throttle_queue = q;
+	if (use_memdelay)
+		current->use_memdelay = use_memdelay;
+	set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation.  This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+	blkcg_scale_delay(blkg, now);
+	atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a8f9ba8f33a4..de57de4831d5 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -136,6 +136,12 @@  struct blkcg_gq {
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head			rcu_head;
+
+	atomic_t			use_delay;
+	atomic64_t			delay_nsec;
+	atomic64_t			delay_start;
+	u64				last_delay;
+	int				last_use;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -241,6 +247,26 @@  static inline struct blkcg *bio_blkcg(struct bio *bio)
 	return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
+static inline bool blk_cgroup_congested(void)
+{
+	struct cgroup_subsys_state *css;
+	bool ret = false;
+
+	rcu_read_lock();
+	css = kthread_blkcg();
+	if (!css)
+		css = task_css(current, io_cgrp_id);
+	while (css) {
+		if (atomic_read(&css->cgroup->congestion_count)) {
+			ret = true;
+			break;
+		}
+		css = css->parent;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
 /**
  * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
  * @return: true if this bio needs to be submitted with the root blkg context.
@@ -374,6 +400,21 @@  static inline void blkg_get(struct blkcg_gq *blkg)
 	atomic_inc(&blkg->refcnt);
 }
 
+/**
+ * blkg_try_get - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+{
+	if (atomic_inc_not_zero(&blkg->refcnt))
+		return blkg;
+	return NULL;
+}
+
+
 void __blkg_release_rcu(struct rcu_head *rcu);
 
 /**
@@ -734,6 +775,59 @@  static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	return !throtl;
 }
 
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+	if (atomic_add_return(1, &blkg->use_delay) == 1)
+		atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+	int old = atomic_read(&blkg->use_delay);
+
+	if (old == 0)
+		return 0;
+
+	/*
+	 * We do this song and dance because we can race with somebody else
+	 * adding or removing delay.  If we just did an atomic_dec we'd end up
+	 * negative and we'd already be in trouble.  We need to subtract 1 and
+	 * then check to see if we were the last delay so we can drop the
+	 * congestion count on the cgroup.
+	 */
+	while (old) {
+		int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+		if (cur == old)
+			break;
+		old = cur;
+	}
+
+	if (old == 0)
+		return 0;
+	if (old == 1)
+		atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+	return 1;
+}
+
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+	int old = atomic_read(&blkg->use_delay);
+	if (!old)
+		return;
+	/* We only want 1 person clearing the congestion count for this blkg. */
+	while (old) {
+		int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
+		if (cur == old) {
+			atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+			break;
+		}
+		old = cur;
+	}
+}
+
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
+void blkcg_maybe_throttle_current(void);
 #else	/* CONFIG_BLK_CGROUP */
 
 struct blkcg {
@@ -753,8 +847,13 @@  struct blkcg_policy {
 
 #define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
+static inline void blkcg_maybe_throttle_current(void) { }
+static inline bool blk_cgroup_congested(void) { return false; }
+
 #ifdef CONFIG_BLOCK
 
+static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
+
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c0e68f903011..ff20b677fb9f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -438,6 +438,9 @@  struct cgroup {
 	/* used to store eBPF programs */
 	struct cgroup_bpf bpf;
 
+	/* If there is block congestion on this cgroup. */
+	atomic_t congestion_count;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3aa4fcb74e76..1c59d0573e4d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,6 +734,10 @@  struct task_struct {
 	/* disallow userland-initiated cgroup migration */
 	unsigned			no_cgroup_migration:1;
 #endif
+#ifdef CONFIG_BLK_CGROUP
+	/* to be used once the psi infrastructure lands upstream. */
+	unsigned			use_memdelay:1;
+#endif
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
@@ -1151,6 +1155,10 @@  struct task_struct {
 	unsigned int			memcg_nr_pages_over_high;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+	struct request_queue		*throttle_queue;
+#endif
+
 #ifdef CONFIG_UPROBES
 	struct uprobe_task		*utask;
 #endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 4a8841963c2e..05589a3e37f4 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,6 +51,7 @@ 
 #include <linux/security.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
 struct linux_binprm;
 
 /*
@@ -192,6 +193,7 @@  static inline void tracehook_notify_resume(struct pt_regs *regs)
 		task_work_run();
 
 	mem_cgroup_handle_over_high();
+	blkcg_maybe_throttle_current();
 }
 
 #endif	/* <linux/tracehook.h> */
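
For completeness, a hedged sketch of the congestion-signalling side of the
new interface: how a policy might mark and unmark a blkg, and how an
unrelated caller could consult the result before issuing optional IO.  The
example_* wrappers are invented; only blkcg_use_delay(), blkcg_unuse_delay()
and blk_cgroup_congested() come from this patch.

/* Illustrative only -- not part of this series. */
static void example_scale_down(struct blkcg_gq *blkg)
{
	/* The 0 -> 1 transition bumps the owning cgroup's congestion_count. */
	blkcg_use_delay(blkg);
}

static void example_scale_up(struct blkcg_gq *blkg)
{
	/* Drops one level; the 1 -> 0 transition drops the congestion_count. */
	blkcg_unuse_delay(blkg);
}

static bool example_may_issue_speculative_io(void)
{
	/*
	 * blk_cgroup_congested() walks up from the current task's blkcg and
	 * returns true if any ancestor cgroup has a non-zero
	 * congestion_count, so optional IO such as readahead can back off.
	 */
	return !blk_cgroup_congested();
}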