diff mbox series

[v2,08/14] mm: memcg: move cgroup v1 oom handling code into memcontrol-v1.c

Message ID 20240625005906.106920-9-roman.gushchin@linux.dev (mailing list archive)
State New
Headers show
Series mm: memcg: separate legacy cgroup v1 code and put under config option | expand

Commit Message

Roman Gushchin June 25, 2024, 12:59 a.m. UTC
Cgroup v1 supports a complicated OOM handling in userspace mechanism,
which is not supported by cgroup v2. Let's move the corresponding code
into memcontrol-v1.c.

Aside from mechanical code movement this patch introduces two new
functions: memcg1_oom_prepare() and memcg1_oom_finish().
Those are implementing cgroup v1-specific parts of the common memcg
OOM handling path.

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 mm/memcontrol-v1.c | 229 ++++++++++++++++++++++++++++++++++++++++++++-
 mm/memcontrol-v1.h |   3 +-
 mm/memcontrol.c    | 216 +-----------------------------------------
 3 files changed, 231 insertions(+), 217 deletions(-)

Comments

Michal Hocko June 25, 2024, 7:08 a.m. UTC | #1
On Mon 24-06-24 17:59:00, Roman Gushchin wrote:
> Cgroup v1 supports a complicated OOM handling in userspace mechanism,
> which is not supported by cgroup v2. Let's move the corresponding code
> into memcontrol-v1.c.
> 
> Aside from mechanical code movement this patch introduces two new
> functions: memcg1_oom_prepare() and memcg1_oom_finish().
> Those are implementing cgroup v1-specific parts of the common memcg
> OOM handling path.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  mm/memcontrol-v1.c | 229 ++++++++++++++++++++++++++++++++++++++++++++-
>  mm/memcontrol-v1.h |   3 +-
>  mm/memcontrol.c    | 216 +-----------------------------------------
>  3 files changed, 231 insertions(+), 217 deletions(-)
> 
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index d7b5c4c14732..253d49d5fb12 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -110,7 +110,13 @@ struct mem_cgroup_event {
>  	struct work_struct remove;
>  };
>  
> -extern spinlock_t memcg_oom_lock;
> +#ifdef CONFIG_LOCKDEP
> +static struct lockdep_map memcg_oom_lock_dep_map = {
> +	.name = "memcg_oom_lock",
> +};
> +#endif
> +
> +DEFINE_SPINLOCK(memcg_oom_lock);
>  
>  static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
>  					 struct mem_cgroup_tree_per_node *mctz,
> @@ -1469,7 +1475,7 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
>  	return 0;
>  }
>  
> -void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
> +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
>  {
>  	struct mem_cgroup *iter;
>  
> @@ -1959,6 +1965,225 @@ void memcg1_css_offline(struct mem_cgroup *memcg)
>  	spin_unlock_irq(&memcg->event_list_lock);
>  }
>  
> +/*
> + * Check OOM-Killer is already running under our hierarchy.
> + * If someone is running, return false.
> + */
> +static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *iter, *failed = NULL;
> +
> +	spin_lock(&memcg_oom_lock);
> +
> +	for_each_mem_cgroup_tree(iter, memcg) {
> +		if (iter->oom_lock) {
> +			/*
> +			 * this subtree of our hierarchy is already locked
> +			 * so we cannot give a lock.
> +			 */
> +			failed = iter;
> +			mem_cgroup_iter_break(memcg, iter);
> +			break;
> +		} else
> +			iter->oom_lock = true;
> +	}
> +
> +	if (failed) {
> +		/*
> +		 * OK, we failed to lock the whole subtree so we have
> +		 * to clean up what we set up to the failing subtree
> +		 */
> +		for_each_mem_cgroup_tree(iter, memcg) {
> +			if (iter == failed) {
> +				mem_cgroup_iter_break(memcg, iter);
> +				break;
> +			}
> +			iter->oom_lock = false;
> +		}
> +	} else
> +		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
> +
> +	spin_unlock(&memcg_oom_lock);
> +
> +	return !failed;
> +}
> +
> +static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *iter;
> +
> +	spin_lock(&memcg_oom_lock);
> +	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
> +	for_each_mem_cgroup_tree(iter, memcg)
> +		iter->oom_lock = false;
> +	spin_unlock(&memcg_oom_lock);
> +}
> +
> +static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *iter;
> +
> +	spin_lock(&memcg_oom_lock);
> +	for_each_mem_cgroup_tree(iter, memcg)
> +		iter->under_oom++;
> +	spin_unlock(&memcg_oom_lock);
> +}
> +
> +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *iter;
> +
> +	/*
> +	 * Be careful about under_oom underflows because a child memcg
> +	 * could have been added after mem_cgroup_mark_under_oom.
> +	 */
> +	spin_lock(&memcg_oom_lock);
> +	for_each_mem_cgroup_tree(iter, memcg)
> +		if (iter->under_oom > 0)
> +			iter->under_oom--;
> +	spin_unlock(&memcg_oom_lock);
> +}
> +
> +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
> +
> +struct oom_wait_info {
> +	struct mem_cgroup *memcg;
> +	wait_queue_entry_t	wait;
> +};
> +
> +static int memcg_oom_wake_function(wait_queue_entry_t *wait,
> +	unsigned mode, int sync, void *arg)
> +{
> +	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
> +	struct mem_cgroup *oom_wait_memcg;
> +	struct oom_wait_info *oom_wait_info;
> +
> +	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
> +	oom_wait_memcg = oom_wait_info->memcg;
> +
> +	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
> +	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
> +		return 0;
> +	return autoremove_wake_function(wait, mode, sync, arg);
> +}
> +
> +void memcg_oom_recover(struct mem_cgroup *memcg)
> +{
> +	/*
> +	 * For the following lockless ->under_oom test, the only required
> +	 * guarantee is that it must see the state asserted by an OOM when
> +	 * this function is called as a result of userland actions
> +	 * triggered by the notification of the OOM.  This is trivially
> +	 * achieved by invoking mem_cgroup_mark_under_oom() before
> +	 * triggering notification.
> +	 */
> +	if (memcg && memcg->under_oom)
> +		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
> +}
> +
> +/**
> + * mem_cgroup_oom_synchronize - complete memcg OOM handling
> + * @handle: actually kill/wait or just clean up the OOM state
> + *
> + * This has to be called at the end of a page fault if the memcg OOM
> + * handler was enabled.
> + *
> + * Memcg supports userspace OOM handling where failed allocations must
> + * sleep on a waitqueue until the userspace task resolves the
> + * situation.  Sleeping directly in the charge context with all kinds
> + * of locks held is not a good idea, instead we remember an OOM state
> + * in the task and mem_cgroup_oom_synchronize() has to be called at
> + * the end of the page fault to complete the OOM handling.
> + *
> + * Returns %true if an ongoing memcg OOM situation was detected and
> + * completed, %false otherwise.
> + */
> +bool mem_cgroup_oom_synchronize(bool handle)
> +{
> +	struct mem_cgroup *memcg = current->memcg_in_oom;
> +	struct oom_wait_info owait;
> +	bool locked;
> +
> +	/* OOM is global, do not handle */
> +	if (!memcg)
> +		return false;
> +
> +	if (!handle)
> +		goto cleanup;
> +
> +	owait.memcg = memcg;
> +	owait.wait.flags = 0;
> +	owait.wait.func = memcg_oom_wake_function;
> +	owait.wait.private = current;
> +	INIT_LIST_HEAD(&owait.wait.entry);
> +
> +	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
> +	mem_cgroup_mark_under_oom(memcg);
> +
> +	locked = mem_cgroup_oom_trylock(memcg);
> +
> +	if (locked)
> +		mem_cgroup_oom_notify(memcg);
> +
> +	schedule();
> +	mem_cgroup_unmark_under_oom(memcg);
> +	finish_wait(&memcg_oom_waitq, &owait.wait);
> +
> +	if (locked)
> +		mem_cgroup_oom_unlock(memcg);
> +cleanup:
> +	current->memcg_in_oom = NULL;
> +	css_put(&memcg->css);
> +	return true;
> +}
> +
> +
> +bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
> +{
> +	/*
> +	 * We are in the middle of the charge context here, so we
> +	 * don't want to block when potentially sitting on a callstack
> +	 * that holds all kinds of filesystem and mm locks.
> +	 *
> +	 * cgroup1 allows disabling the OOM killer and waiting for outside
> +	 * handling until the charge can succeed; remember the context and put
> +	 * the task to sleep at the end of the page fault when all locks are
> +	 * released.
> +	 *
> +	 * On the other hand, in-kernel OOM killer allows for an async victim
> +	 * memory reclaim (oom_reaper) and that means that we are not solely
> +	 * relying on the oom victim to make a forward progress and we can
> +	 * invoke the oom killer here.
> +	 *
> +	 * Please note that mem_cgroup_out_of_memory might fail to find a
> +	 * victim and then we have to bail out from the charge path.
> +	 */
> +	if (READ_ONCE(memcg->oom_kill_disable)) {
> +		if (current->in_user_fault) {
> +			css_get(&memcg->css);
> +			current->memcg_in_oom = memcg;
> +		}
> +		return false;
> +	}
> +
> +	mem_cgroup_mark_under_oom(memcg);
> +
> +	*locked = mem_cgroup_oom_trylock(memcg);
> +
> +	if (*locked)
> +		mem_cgroup_oom_notify(memcg);
> +
> +	mem_cgroup_unmark_under_oom(memcg);
> +
> +	return true;
> +}
> +
> +void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
> +{
> +	if (locked)
> +		mem_cgroup_oom_unlock(memcg);
> +}
> +
>  static int __init memcg1_init(void)
>  {
>  	int node;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index ef1b7037cbdc..3de956b2422f 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -87,9 +87,10 @@ enum res_type {
>  bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>  				enum mem_cgroup_events_target target);
>  unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
> -void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
>  ssize_t memcg_write_event_control(struct kernfs_open_file *of,
>  				  char *buf, size_t nbytes, loff_t off);
>  
> +bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked);
> +void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);
>  
>  #endif	/* __MM_MEMCONTROL_V1_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 92fb72bbd494..8abd364ac837 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1616,130 +1616,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	return ret;
>  }
>  
> -#ifdef CONFIG_LOCKDEP
> -static struct lockdep_map memcg_oom_lock_dep_map = {
> -	.name = "memcg_oom_lock",
> -};
> -#endif
> -
> -DEFINE_SPINLOCK(memcg_oom_lock);
> -
> -/*
> - * Check OOM-Killer is already running under our hierarchy.
> - * If someone is running, return false.
> - */
> -static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *iter, *failed = NULL;
> -
> -	spin_lock(&memcg_oom_lock);
> -
> -	for_each_mem_cgroup_tree(iter, memcg) {
> -		if (iter->oom_lock) {
> -			/*
> -			 * this subtree of our hierarchy is already locked
> -			 * so we cannot give a lock.
> -			 */
> -			failed = iter;
> -			mem_cgroup_iter_break(memcg, iter);
> -			break;
> -		} else
> -			iter->oom_lock = true;
> -	}
> -
> -	if (failed) {
> -		/*
> -		 * OK, we failed to lock the whole subtree so we have
> -		 * to clean up what we set up to the failing subtree
> -		 */
> -		for_each_mem_cgroup_tree(iter, memcg) {
> -			if (iter == failed) {
> -				mem_cgroup_iter_break(memcg, iter);
> -				break;
> -			}
> -			iter->oom_lock = false;
> -		}
> -	} else
> -		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
> -
> -	spin_unlock(&memcg_oom_lock);
> -
> -	return !failed;
> -}
> -
> -static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *iter;
> -
> -	spin_lock(&memcg_oom_lock);
> -	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
> -	for_each_mem_cgroup_tree(iter, memcg)
> -		iter->oom_lock = false;
> -	spin_unlock(&memcg_oom_lock);
> -}
> -
> -static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *iter;
> -
> -	spin_lock(&memcg_oom_lock);
> -	for_each_mem_cgroup_tree(iter, memcg)
> -		iter->under_oom++;
> -	spin_unlock(&memcg_oom_lock);
> -}
> -
> -static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *iter;
> -
> -	/*
> -	 * Be careful about under_oom underflows because a child memcg
> -	 * could have been added after mem_cgroup_mark_under_oom.
> -	 */
> -	spin_lock(&memcg_oom_lock);
> -	for_each_mem_cgroup_tree(iter, memcg)
> -		if (iter->under_oom > 0)
> -			iter->under_oom--;
> -	spin_unlock(&memcg_oom_lock);
> -}
> -
> -static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
> -
> -struct oom_wait_info {
> -	struct mem_cgroup *memcg;
> -	wait_queue_entry_t	wait;
> -};
> -
> -static int memcg_oom_wake_function(wait_queue_entry_t *wait,
> -	unsigned mode, int sync, void *arg)
> -{
> -	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
> -	struct mem_cgroup *oom_wait_memcg;
> -	struct oom_wait_info *oom_wait_info;
> -
> -	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
> -	oom_wait_memcg = oom_wait_info->memcg;
> -
> -	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
> -	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
> -		return 0;
> -	return autoremove_wake_function(wait, mode, sync, arg);
> -}
> -
> -void memcg_oom_recover(struct mem_cgroup *memcg)
> -{
> -	/*
> -	 * For the following lockless ->under_oom test, the only required
> -	 * guarantee is that it must see the state asserted by an OOM when
> -	 * this function is called as a result of userland actions
> -	 * triggered by the notification of the OOM.  This is trivially
> -	 * achieved by invoking mem_cgroup_mark_under_oom() before
> -	 * triggering notification.
> -	 */
> -	if (memcg && memcg->under_oom)
> -		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
> -}
> -
>  /*
>   * Returns true if successfully killed one or more processes. Though in some
>   * corner cases it can return true even without killing any process.
> @@ -1753,104 +1629,16 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
>  
>  	memcg_memory_event(memcg, MEMCG_OOM);
>  
> -	/*
> -	 * We are in the middle of the charge context here, so we
> -	 * don't want to block when potentially sitting on a callstack
> -	 * that holds all kinds of filesystem and mm locks.
> -	 *
> -	 * cgroup1 allows disabling the OOM killer and waiting for outside
> -	 * handling until the charge can succeed; remember the context and put
> -	 * the task to sleep at the end of the page fault when all locks are
> -	 * released.
> -	 *
> -	 * On the other hand, in-kernel OOM killer allows for an async victim
> -	 * memory reclaim (oom_reaper) and that means that we are not solely
> -	 * relying on the oom victim to make a forward progress and we can
> -	 * invoke the oom killer here.
> -	 *
> -	 * Please note that mem_cgroup_out_of_memory might fail to find a
> -	 * victim and then we have to bail out from the charge path.
> -	 */
> -	if (READ_ONCE(memcg->oom_kill_disable)) {
> -		if (current->in_user_fault) {
> -			css_get(&memcg->css);
> -			current->memcg_in_oom = memcg;
> -		}
> +	if (!memcg1_oom_prepare(memcg, &locked))
>  		return false;
> -	}
> -
> -	mem_cgroup_mark_under_oom(memcg);
>  
> -	locked = mem_cgroup_oom_trylock(memcg);
> -
> -	if (locked)
> -		mem_cgroup_oom_notify(memcg);
> -
> -	mem_cgroup_unmark_under_oom(memcg);
>  	ret = mem_cgroup_out_of_memory(memcg, mask, order);
>  
> -	if (locked)
> -		mem_cgroup_oom_unlock(memcg);
> +	memcg1_oom_finish(memcg, locked);
>  
>  	return ret;
>  }
>  
> -/**
> - * mem_cgroup_oom_synchronize - complete memcg OOM handling
> - * @handle: actually kill/wait or just clean up the OOM state
> - *
> - * This has to be called at the end of a page fault if the memcg OOM
> - * handler was enabled.
> - *
> - * Memcg supports userspace OOM handling where failed allocations must
> - * sleep on a waitqueue until the userspace task resolves the
> - * situation.  Sleeping directly in the charge context with all kinds
> - * of locks held is not a good idea, instead we remember an OOM state
> - * in the task and mem_cgroup_oom_synchronize() has to be called at
> - * the end of the page fault to complete the OOM handling.
> - *
> - * Returns %true if an ongoing memcg OOM situation was detected and
> - * completed, %false otherwise.
> - */
> -bool mem_cgroup_oom_synchronize(bool handle)
> -{
> -	struct mem_cgroup *memcg = current->memcg_in_oom;
> -	struct oom_wait_info owait;
> -	bool locked;
> -
> -	/* OOM is global, do not handle */
> -	if (!memcg)
> -		return false;
> -
> -	if (!handle)
> -		goto cleanup;
> -
> -	owait.memcg = memcg;
> -	owait.wait.flags = 0;
> -	owait.wait.func = memcg_oom_wake_function;
> -	owait.wait.private = current;
> -	INIT_LIST_HEAD(&owait.wait.entry);
> -
> -	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
> -	mem_cgroup_mark_under_oom(memcg);
> -
> -	locked = mem_cgroup_oom_trylock(memcg);
> -
> -	if (locked)
> -		mem_cgroup_oom_notify(memcg);
> -
> -	schedule();
> -	mem_cgroup_unmark_under_oom(memcg);
> -	finish_wait(&memcg_oom_waitq, &owait.wait);
> -
> -	if (locked)
> -		mem_cgroup_oom_unlock(memcg);
> -cleanup:
> -	current->memcg_in_oom = NULL;
> -	css_put(&memcg->css);
> -	return true;
> -}
> -
>  /**
>   * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
>   * @victim: task to be killed by the OOM killer
> -- 
> 2.45.2
diff mbox series

Patch

diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index d7b5c4c14732..253d49d5fb12 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -110,7 +110,13 @@  struct mem_cgroup_event {
 	struct work_struct remove;
 };
 
-extern spinlock_t memcg_oom_lock;
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+	.name = "memcg_oom_lock",
+};
+#endif
+
+DEFINE_SPINLOCK(memcg_oom_lock);
 
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 					 struct mem_cgroup_tree_per_node *mctz,
@@ -1469,7 +1475,7 @@  static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 	return 0;
 }
 
-void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 
@@ -1959,6 +1965,225 @@  void memcg1_css_offline(struct mem_cgroup *memcg)
 	spin_unlock_irq(&memcg->event_list_lock);
 }
 
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter, *failed = NULL;
+
+	spin_lock(&memcg_oom_lock);
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		if (iter->oom_lock) {
+			/*
+			 * this subtree of our hierarchy is already locked
+			 * so we cannot give a lock.
+			 */
+			failed = iter;
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		} else
+			iter->oom_lock = true;
+	}
+
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
+		}
+	} else
+		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
+}
+
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	spin_lock(&memcg_oom_lock);
+	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
+	for_each_mem_cgroup_tree(iter, memcg)
+		iter->oom_lock = false;
+	spin_unlock(&memcg_oom_lock);
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	spin_lock(&memcg_oom_lock);
+	for_each_mem_cgroup_tree(iter, memcg)
+		iter->under_oom++;
+	spin_unlock(&memcg_oom_lock);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	/*
+	 * Be careful about under_oom underflows because a child memcg
+	 * could have been added after mem_cgroup_mark_under_oom.
+	 */
+	spin_lock(&memcg_oom_lock);
+	for_each_mem_cgroup_tree(iter, memcg)
+		if (iter->under_oom > 0)
+			iter->under_oom--;
+	spin_unlock(&memcg_oom_lock);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+struct oom_wait_info {
+	struct mem_cgroup *memcg;
+	wait_queue_entry_t	wait;
+};
+
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
+	unsigned mode, int sync, void *arg)
+{
+	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+	struct mem_cgroup *oom_wait_memcg;
+	struct oom_wait_info *oom_wait_info;
+
+	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+	oom_wait_memcg = oom_wait_info->memcg;
+
+	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
+	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, arg);
+}
+
+void memcg_oom_recover(struct mem_cgroup *memcg)
+{
+	/*
+	 * For the following lockless ->under_oom test, the only required
+	 * guarantee is that it must see the state asserted by an OOM when
+	 * this function is called as a result of userland actions
+	 * triggered by the notification of the OOM.  This is trivially
+	 * achieved by invoking mem_cgroup_mark_under_oom() before
+	 * triggering notification.
+	 */
+	if (memcg && memcg->under_oom)
+		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(bool handle)
+{
+	struct mem_cgroup *memcg = current->memcg_in_oom;
+	struct oom_wait_info owait;
+	bool locked;
+
+	/* OOM is global, do not handle */
+	if (!memcg)
+		return false;
+
+	if (!handle)
+		goto cleanup;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.entry);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	schedule();
+	mem_cgroup_unmark_under_oom(memcg);
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+
+	if (locked)
+		mem_cgroup_oom_unlock(memcg);
+cleanup:
+	current->memcg_in_oom = NULL;
+	css_put(&memcg->css);
+	return true;
+}
+
+
+bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
+{
+	/*
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * cgroup1 allows disabling the OOM killer and waiting for outside
+	 * handling until the charge can succeed; remember the context and put
+	 * the task to sleep at the end of the page fault when all locks are
+	 * released.
+	 *
+	 * On the other hand, in-kernel OOM killer allows for an async victim
+	 * memory reclaim (oom_reaper) and that means that we are not solely
+	 * relying on the oom victim to make a forward progress and we can
+	 * invoke the oom killer here.
+	 *
+	 * Please note that mem_cgroup_out_of_memory might fail to find a
+	 * victim and then we have to bail out from the charge path.
+	 */
+	if (READ_ONCE(memcg->oom_kill_disable)) {
+		if (current->in_user_fault) {
+			css_get(&memcg->css);
+			current->memcg_in_oom = memcg;
+		}
+		return false;
+	}
+
+	mem_cgroup_mark_under_oom(memcg);
+
+	*locked = mem_cgroup_oom_trylock(memcg);
+
+	if (*locked)
+		mem_cgroup_oom_notify(memcg);
+
+	mem_cgroup_unmark_under_oom(memcg);
+
+	return true;
+}
+
+void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
+{
+	if (locked)
+		mem_cgroup_oom_unlock(memcg);
+}
+
 static int __init memcg1_init(void)
 {
 	int node;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index ef1b7037cbdc..3de956b2422f 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -87,9 +87,10 @@  enum res_type {
 bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 				enum mem_cgroup_events_target target);
 unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
-void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 				  char *buf, size_t nbytes, loff_t off);
 
+bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked);
+void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);
 
 #endif	/* __MM_MEMCONTROL_V1_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 92fb72bbd494..8abd364ac837 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1616,130 +1616,6 @@  static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	return ret;
 }
 
-#ifdef CONFIG_LOCKDEP
-static struct lockdep_map memcg_oom_lock_dep_map = {
-	.name = "memcg_oom_lock",
-};
-#endif
-
-DEFINE_SPINLOCK(memcg_oom_lock);
-
-/*
- * Check OOM-Killer is already running under our hierarchy.
- * If someone is running, return false.
- */
-static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter, *failed = NULL;
-
-	spin_lock(&memcg_oom_lock);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter->oom_lock) {
-			/*
-			 * this subtree of our hierarchy is already locked
-			 * so we cannot give a lock.
-			 */
-			failed = iter;
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		} else
-			iter->oom_lock = true;
-	}
-
-	if (failed) {
-		/*
-		 * OK, we failed to lock the whole subtree so we have
-		 * to clean up what we set up to the failing subtree
-		 */
-		for_each_mem_cgroup_tree(iter, memcg) {
-			if (iter == failed) {
-				mem_cgroup_iter_break(memcg, iter);
-				break;
-			}
-			iter->oom_lock = false;
-		}
-	} else
-		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
-
-	spin_unlock(&memcg_oom_lock);
-
-	return !failed;
-}
-
-static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	spin_lock(&memcg_oom_lock);
-	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->oom_lock = false;
-	spin_unlock(&memcg_oom_lock);
-}
-
-static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	spin_lock(&memcg_oom_lock);
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->under_oom++;
-	spin_unlock(&memcg_oom_lock);
-}
-
-static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	/*
-	 * Be careful about under_oom underflows because a child memcg
-	 * could have been added after mem_cgroup_mark_under_oom.
-	 */
-	spin_lock(&memcg_oom_lock);
-	for_each_mem_cgroup_tree(iter, memcg)
-		if (iter->under_oom > 0)
-			iter->under_oom--;
-	spin_unlock(&memcg_oom_lock);
-}
-
-static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
-
-struct oom_wait_info {
-	struct mem_cgroup *memcg;
-	wait_queue_entry_t	wait;
-};
-
-static int memcg_oom_wake_function(wait_queue_entry_t *wait,
-	unsigned mode, int sync, void *arg)
-{
-	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
-	struct mem_cgroup *oom_wait_memcg;
-	struct oom_wait_info *oom_wait_info;
-
-	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
-	oom_wait_memcg = oom_wait_info->memcg;
-
-	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
-	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
-		return 0;
-	return autoremove_wake_function(wait, mode, sync, arg);
-}
-
-void memcg_oom_recover(struct mem_cgroup *memcg)
-{
-	/*
-	 * For the following lockless ->under_oom test, the only required
-	 * guarantee is that it must see the state asserted by an OOM when
-	 * this function is called as a result of userland actions
-	 * triggered by the notification of the OOM.  This is trivially
-	 * achieved by invoking mem_cgroup_mark_under_oom() before
-	 * triggering notification.
-	 */
-	if (memcg && memcg->under_oom)
-		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
 /*
  * Returns true if successfully killed one or more processes. Though in some
  * corner cases it can return true even without killing any process.
@@ -1753,104 +1629,16 @@  static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 
 	memcg_memory_event(memcg, MEMCG_OOM);
 
-	/*
-	 * We are in the middle of the charge context here, so we
-	 * don't want to block when potentially sitting on a callstack
-	 * that holds all kinds of filesystem and mm locks.
-	 *
-	 * cgroup1 allows disabling the OOM killer and waiting for outside
-	 * handling until the charge can succeed; remember the context and put
-	 * the task to sleep at the end of the page fault when all locks are
-	 * released.
-	 *
-	 * On the other hand, in-kernel OOM killer allows for an async victim
-	 * memory reclaim (oom_reaper) and that means that we are not solely
-	 * relying on the oom victim to make a forward progress and we can
-	 * invoke the oom killer here.
-	 *
-	 * Please note that mem_cgroup_out_of_memory might fail to find a
-	 * victim and then we have to bail out from the charge path.
-	 */
-	if (READ_ONCE(memcg->oom_kill_disable)) {
-		if (current->in_user_fault) {
-			css_get(&memcg->css);
-			current->memcg_in_oom = memcg;
-		}
+	if (!memcg1_oom_prepare(memcg, &locked))
 		return false;
-	}
-
-	mem_cgroup_mark_under_oom(memcg);
 
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	mem_cgroup_unmark_under_oom(memcg);
 	ret = mem_cgroup_out_of_memory(memcg, mask, order);
 
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
+	memcg1_oom_finish(memcg, locked);
 
 	return ret;
 }
 
-/**
- * mem_cgroup_oom_synchronize - complete memcg OOM handling
- * @handle: actually kill/wait or just clean up the OOM state
- *
- * This has to be called at the end of a page fault if the memcg OOM
- * handler was enabled.
- *
- * Memcg supports userspace OOM handling where failed allocations must
- * sleep on a waitqueue until the userspace task resolves the
- * situation.  Sleeping directly in the charge context with all kinds
- * of locks held is not a good idea, instead we remember an OOM state
- * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to complete the OOM handling.
- *
- * Returns %true if an ongoing memcg OOM situation was detected and
- * completed, %false otherwise.
- */
-bool mem_cgroup_oom_synchronize(bool handle)
-{
-	struct mem_cgroup *memcg = current->memcg_in_oom;
-	struct oom_wait_info owait;
-	bool locked;
-
-	/* OOM is global, do not handle */
-	if (!memcg)
-		return false;
-
-	if (!handle)
-		goto cleanup;
-
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.entry);
-
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	schedule();
-	mem_cgroup_unmark_under_oom(memcg);
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
-cleanup:
-	current->memcg_in_oom = NULL;
-	css_put(&memcg->css);
-	return true;
-}
-
 /**
  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
  * @victim: task to be killed by the OOM killer