
[v2,06/14] mm: memcg: move legacy memcg event code into memcontrol-v1.c

Message ID 20240625005906.106920-7-roman.gushchin@linux.dev (mailing list archive)
State New
Series mm: memcg: separate legacy cgroup v1 code and put under config option

Commit Message

Roman Gushchin June 25, 2024, 12:58 a.m. UTC
Cgroup v1's memory controller contains a pretty complicated
event notification mechanism, which is not used on cgroup v2.
Let's move the corresponding code into memcontrol-v1.c.

Please note that mem_cgroup_event_ratelimit() remains in
memcontrol.c; otherwise it would require exporting too many
details of memcg stats outside of memcontrol.c.

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 include/linux/memcontrol.h |  12 -
 mm/memcontrol-v1.c         | 653 +++++++++++++++++++++++++++++++++++
 mm/memcontrol-v1.h         |  51 +++
 mm/memcontrol.c            | 687 +------------------------------------
 4 files changed, 709 insertions(+), 694 deletions(-)
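
For context, the notification interface being relocated is driven entirely from userspace: a client creates an eventfd, opens one of the supported v1 control files, and registers the pair by writing "<event_fd> <control_fd> <args>" to cgroup.event_control (see memcg_write_event_control() in the diff below). A minimal sketch of registering a usage threshold follows; the mount point /sys/fs/cgroup/memory, the cgroup name "mygrp", and the 50M threshold are illustrative assumptions, not taken from the patch.

/*
 * Illustrative userspace sketch (not part of the patch): register a
 * cgroup v1 memory usage threshold and block until it is crossed.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        int efd = eventfd(0, 0);                        /* <event_fd> */
        int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
                       O_RDONLY);                       /* <control_fd> */
        int ctl = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
                       O_WRONLY);
        char buf[64];
        uint64_t count;

        if (efd < 0 || cfd < 0 || ctl < 0)
                return 1;

        /* "<event_fd> <control_fd> <args>": args here is the threshold */
        snprintf(buf, sizeof(buf), "%d %d 50M", efd, cfd);
        if (write(ctl, buf, strlen(buf)) < 0)
                return 1;

        /* Blocks until memory usage crosses the 50M threshold. */
        if (read(efd, &count, sizeof(count)) == sizeof(count))
                printf("threshold crossed %llu time(s)\n",
                       (unsigned long long)count);
        return 0;
}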

Comments

Michal Hocko June 25, 2024, 7:07 a.m. UTC | #1
On Mon 24-06-24 17:58:58, Roman Gushchin wrote:
> Cgroup v1's memory controller contains a pretty complicated
> event notification mechanism, which is not used on cgroup v2.
> Let's move the corresponding code into memcontrol-v1.c.
> 
> Please note that mem_cgroup_event_ratelimit() remains in
> memcontrol.c; otherwise it would require exporting too many
> details of memcg stats outside of memcontrol.c.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  include/linux/memcontrol.h |  12 -
>  mm/memcontrol-v1.c         | 653 +++++++++++++++++++++++++++++++++++
>  mm/memcontrol-v1.h         |  51 +++
>  mm/memcontrol.c            | 687 +------------------------------------
>  4 files changed, 709 insertions(+), 694 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 83c8327455d8..588179d29849 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -69,18 +69,6 @@ struct mem_cgroup_id {
>  	refcount_t ref;
>  };
>  
> -/*
> - * Per memcg event counter is incremented at every pagein/pageout. With THP,
> - * it will be incremented by the number of pages. This counter is used
> - * to trigger some periodic events. This is straightforward and better
> - * than using jiffies etc. to handle periodic memcg event.
> - */
> -enum mem_cgroup_events_target {
> -	MEM_CGROUP_TARGET_THRESH,
> -	MEM_CGROUP_TARGET_SOFTLIMIT,
> -	MEM_CGROUP_NTARGETS,
> -};
> -
>  struct memcg_vmstats_percpu;
>  struct memcg_vmstats;
>  struct lruvec_stats_percpu;
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index c25e038ac874..4b2290ceace6 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -6,6 +6,10 @@
>  #include <linux/pagewalk.h>
>  #include <linux/backing-dev.h>
>  #include <linux/swap_cgroup.h>
> +#include <linux/eventfd.h>
> +#include <linux/poll.h>
> +#include <linux/sort.h>
> +#include <linux/file.h>
>  
>  #include "internal.h"
>  #include "swap.h"
> @@ -60,6 +64,54 @@ static struct move_charge_struct {
>  	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
>  };
>  
> +/* for OOM */
> +struct mem_cgroup_eventfd_list {
> +	struct list_head list;
> +	struct eventfd_ctx *eventfd;
> +};
> +
> +/*
> + * cgroup_event represents events which userspace want to receive.
> + */
> +struct mem_cgroup_event {
> +	/*
> +	 * memcg which the event belongs to.
> +	 */
> +	struct mem_cgroup *memcg;
> +	/*
> +	 * eventfd to signal userspace about the event.
> +	 */
> +	struct eventfd_ctx *eventfd;
> +	/*
> +	 * Each of these stored in a list by the cgroup.
> +	 */
> +	struct list_head list;
> +	/*
> +	 * register_event() callback will be used to add new userspace
> +	 * waiter for changes related to this event.  Use eventfd_signal()
> +	 * on eventfd to send notification to userspace.
> +	 */
> +	int (*register_event)(struct mem_cgroup *memcg,
> +			      struct eventfd_ctx *eventfd, const char *args);
> +	/*
> +	 * unregister_event() callback will be called when userspace closes
> +	 * the eventfd or on cgroup removing.  This callback must be set,
> +	 * if you want provide notification functionality.
> +	 */
> +	void (*unregister_event)(struct mem_cgroup *memcg,
> +				 struct eventfd_ctx *eventfd);
> +	/*
> +	 * All fields below needed to unregister event when
> +	 * userspace closes eventfd.
> +	 */
> +	poll_table pt;
> +	wait_queue_head_t *wqh;
> +	wait_queue_entry_t wait;
> +	struct work_struct remove;
> +};
> +
> +extern spinlock_t memcg_oom_lock;
> +
>  static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
>  					 struct mem_cgroup_tree_per_node *mctz,
>  					 unsigned long new_usage_in_excess)
> @@ -1306,6 +1358,607 @@ void memcg1_move_task(void)
>  }
>  #endif
>  
> +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> +{
> +	struct mem_cgroup_threshold_ary *t;
> +	unsigned long usage;
> +	int i;
> +
> +	rcu_read_lock();
> +	if (!swap)
> +		t = rcu_dereference(memcg->thresholds.primary);
> +	else
> +		t = rcu_dereference(memcg->memsw_thresholds.primary);
> +
> +	if (!t)
> +		goto unlock;
> +
> +	usage = mem_cgroup_usage(memcg, swap);
> +
> +	/*
> +	 * current_threshold points to threshold just below or equal to usage.
> +	 * If it's not true, a threshold was crossed after last
> +	 * call of __mem_cgroup_threshold().
> +	 */
> +	i = t->current_threshold;
> +
> +	/*
> +	 * Iterate backward over array of thresholds starting from
> +	 * current_threshold and check if a threshold is crossed.
> +	 * If none of thresholds below usage is crossed, we read
> +	 * only one element of the array here.
> +	 */
> +	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
> +		eventfd_signal(t->entries[i].eventfd);
> +
> +	/* i = current_threshold + 1 */
> +	i++;
> +
> +	/*
> +	 * Iterate forward over array of thresholds starting from
> +	 * current_threshold+1 and check if a threshold is crossed.
> +	 * If none of thresholds above usage is crossed, we read
> +	 * only one element of the array here.
> +	 */
> +	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
> +		eventfd_signal(t->entries[i].eventfd);
> +
> +	/* Update current_threshold */
> +	t->current_threshold = i - 1;
> +unlock:
> +	rcu_read_unlock();
> +}
> +
> +static void mem_cgroup_threshold(struct mem_cgroup *memcg)
> +{
> +	while (memcg) {
> +		__mem_cgroup_threshold(memcg, false);
> +		if (do_memsw_account())
> +			__mem_cgroup_threshold(memcg, true);
> +
> +		memcg = parent_mem_cgroup(memcg);
> +	}
> +}
> +
> +/*
> + * Check events in order.
> + *
> + */
> +void memcg_check_events(struct mem_cgroup *memcg, int nid)
> +{
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +		return;
> +
> +	/* threshold event is triggered in finer grain than soft limit */
> +	if (unlikely(mem_cgroup_event_ratelimit(memcg,
> +						MEM_CGROUP_TARGET_THRESH))) {
> +		bool do_softlimit;
> +
> +		do_softlimit = mem_cgroup_event_ratelimit(memcg,
> +						MEM_CGROUP_TARGET_SOFTLIMIT);
> +		mem_cgroup_threshold(memcg);
> +		if (unlikely(do_softlimit))
> +			memcg1_update_tree(memcg, nid);
> +	}
> +}
> +
> +static int compare_thresholds(const void *a, const void *b)
> +{
> +	const struct mem_cgroup_threshold *_a = a;
> +	const struct mem_cgroup_threshold *_b = b;
> +
> +	if (_a->threshold > _b->threshold)
> +		return 1;
> +
> +	if (_a->threshold < _b->threshold)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup_eventfd_list *ev;
> +
> +	spin_lock(&memcg_oom_lock);
> +
> +	list_for_each_entry(ev, &memcg->oom_notify, list)
> +		eventfd_signal(ev->eventfd);
> +
> +	spin_unlock(&memcg_oom_lock);
> +	return 0;
> +}
> +
> +void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *iter;
> +
> +	for_each_mem_cgroup_tree(iter, memcg)
> +		mem_cgroup_oom_notify_cb(iter);
> +}
> +
> +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
> +{
> +	struct mem_cgroup_thresholds *thresholds;
> +	struct mem_cgroup_threshold_ary *new;
> +	unsigned long threshold;
> +	unsigned long usage;
> +	int i, size, ret;
> +
> +	ret = page_counter_memparse(args, "-1", &threshold);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&memcg->thresholds_lock);
> +
> +	if (type == _MEM) {
> +		thresholds = &memcg->thresholds;
> +		usage = mem_cgroup_usage(memcg, false);
> +	} else if (type == _MEMSWAP) {
> +		thresholds = &memcg->memsw_thresholds;
> +		usage = mem_cgroup_usage(memcg, true);
> +	} else
> +		BUG();
> +
> +	/* Check if a threshold crossed before adding a new one */
> +	if (thresholds->primary)
> +		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> +	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
> +
> +	/* Allocate memory for new array of thresholds */
> +	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
> +	if (!new) {
> +		ret = -ENOMEM;
> +		goto unlock;
> +	}
> +	new->size = size;
> +
> +	/* Copy thresholds (if any) to new array */
> +	if (thresholds->primary)
> +		memcpy(new->entries, thresholds->primary->entries,
> +		       flex_array_size(new, entries, size - 1));
> +
> +	/* Add new threshold */
> +	new->entries[size - 1].eventfd = eventfd;
> +	new->entries[size - 1].threshold = threshold;
> +
> +	/* Sort thresholds. Registering of new threshold isn't time-critical */
> +	sort(new->entries, size, sizeof(*new->entries),
> +			compare_thresholds, NULL);
> +
> +	/* Find current threshold */
> +	new->current_threshold = -1;
> +	for (i = 0; i < size; i++) {
> +		if (new->entries[i].threshold <= usage) {
> +			/*
> +			 * new->current_threshold will not be used until
> +			 * rcu_assign_pointer(), so it's safe to increment
> +			 * it here.
> +			 */
> +			++new->current_threshold;
> +		} else
> +			break;
> +	}
> +
> +	/* Free old spare buffer and save old primary buffer as spare */
> +	kfree(thresholds->spare);
> +	thresholds->spare = thresholds->primary;
> +
> +	rcu_assign_pointer(thresholds->primary, new);
> +
> +	/* To be sure that nobody uses thresholds */
> +	synchronize_rcu();
> +
> +unlock:
> +	mutex_unlock(&memcg->thresholds_lock);
> +
> +	return ret;
> +}
> +
> +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args)
> +{
> +	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
> +}
> +
> +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args)
> +{
> +	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
> +}
> +
> +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, enum res_type type)
> +{
> +	struct mem_cgroup_thresholds *thresholds;
> +	struct mem_cgroup_threshold_ary *new;
> +	unsigned long usage;
> +	int i, j, size, entries;
> +
> +	mutex_lock(&memcg->thresholds_lock);
> +
> +	if (type == _MEM) {
> +		thresholds = &memcg->thresholds;
> +		usage = mem_cgroup_usage(memcg, false);
> +	} else if (type == _MEMSWAP) {
> +		thresholds = &memcg->memsw_thresholds;
> +		usage = mem_cgroup_usage(memcg, true);
> +	} else
> +		BUG();
> +
> +	if (!thresholds->primary)
> +		goto unlock;
> +
> +	/* Check if a threshold crossed before removing */
> +	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> +	/* Calculate new number of threshold */
> +	size = entries = 0;
> +	for (i = 0; i < thresholds->primary->size; i++) {
> +		if (thresholds->primary->entries[i].eventfd != eventfd)
> +			size++;
> +		else
> +			entries++;
> +	}
> +
> +	new = thresholds->spare;
> +
> +	/* If no items related to eventfd have been cleared, nothing to do */
> +	if (!entries)
> +		goto unlock;
> +
> +	/* Set thresholds array to NULL if we don't have thresholds */
> +	if (!size) {
> +		kfree(new);
> +		new = NULL;
> +		goto swap_buffers;
> +	}
> +
> +	new->size = size;
> +
> +	/* Copy thresholds and find current threshold */
> +	new->current_threshold = -1;
> +	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
> +		if (thresholds->primary->entries[i].eventfd == eventfd)
> +			continue;
> +
> +		new->entries[j] = thresholds->primary->entries[i];
> +		if (new->entries[j].threshold <= usage) {
> +			/*
> +			 * new->current_threshold will not be used
> +			 * until rcu_assign_pointer(), so it's safe to increment
> +			 * it here.
> +			 */
> +			++new->current_threshold;
> +		}
> +		j++;
> +	}
> +
> +swap_buffers:
> +	/* Swap primary and spare array */
> +	thresholds->spare = thresholds->primary;
> +
> +	rcu_assign_pointer(thresholds->primary, new);
> +
> +	/* To be sure that nobody uses thresholds */
> +	synchronize_rcu();
> +
> +	/* If all events are unregistered, free the spare array */
> +	if (!new) {
> +		kfree(thresholds->spare);
> +		thresholds->spare = NULL;
> +	}
> +unlock:
> +	mutex_unlock(&memcg->thresholds_lock);
> +}
> +
> +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd)
> +{
> +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
> +}
> +
> +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd)
> +{
> +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
> +}
> +
> +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args)
> +{
> +	struct mem_cgroup_eventfd_list *event;
> +
> +	event = kmalloc(sizeof(*event),	GFP_KERNEL);
> +	if (!event)
> +		return -ENOMEM;
> +
> +	spin_lock(&memcg_oom_lock);
> +
> +	event->eventfd = eventfd;
> +	list_add(&event->list, &memcg->oom_notify);
> +
> +	/* already in OOM ? */
> +	if (memcg->under_oom)
> +		eventfd_signal(eventfd);
> +	spin_unlock(&memcg_oom_lock);
> +
> +	return 0;
> +}
> +
> +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd)
> +{
> +	struct mem_cgroup_eventfd_list *ev, *tmp;
> +
> +	spin_lock(&memcg_oom_lock);
> +
> +	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
> +		if (ev->eventfd == eventfd) {
> +			list_del(&ev->list);
> +			kfree(ev);
> +		}
> +	}
> +
> +	spin_unlock(&memcg_oom_lock);
> +}
> +
> +/*
> + * DO NOT USE IN NEW FILES.
> + *
> + * "cgroup.event_control" implementation.
> + *
> + * This is way over-engineered.  It tries to support fully configurable
> + * events for each user.  Such level of flexibility is completely
> + * unnecessary especially in the light of the planned unified hierarchy.
> + *
> + * Please deprecate this and replace with something simpler if at all
> + * possible.
> + */
> +
> +/*
> + * Unregister event and free resources.
> + *
> + * Gets called from workqueue.
> + */
> +static void memcg_event_remove(struct work_struct *work)
> +{
> +	struct mem_cgroup_event *event =
> +		container_of(work, struct mem_cgroup_event, remove);
> +	struct mem_cgroup *memcg = event->memcg;
> +
> +	remove_wait_queue(event->wqh, &event->wait);
> +
> +	event->unregister_event(memcg, event->eventfd);
> +
> +	/* Notify userspace the event is going away. */
> +	eventfd_signal(event->eventfd);
> +
> +	eventfd_ctx_put(event->eventfd);
> +	kfree(event);
> +	css_put(&memcg->css);
> +}
> +
> +/*
> + * Gets called on EPOLLHUP on eventfd when user closes it.
> + *
> + * Called with wqh->lock held and interrupts disabled.
> + */
> +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
> +			    int sync, void *key)
> +{
> +	struct mem_cgroup_event *event =
> +		container_of(wait, struct mem_cgroup_event, wait);
> +	struct mem_cgroup *memcg = event->memcg;
> +	__poll_t flags = key_to_poll(key);
> +
> +	if (flags & EPOLLHUP) {
> +		/*
> +		 * If the event has been detached at cgroup removal, we
> +		 * can simply return knowing the other side will cleanup
> +		 * for us.
> +		 *
> +		 * We can't race against event freeing since the other
> +		 * side will require wqh->lock via remove_wait_queue(),
> +		 * which we hold.
> +		 */
> +		spin_lock(&memcg->event_list_lock);
> +		if (!list_empty(&event->list)) {
> +			list_del_init(&event->list);
> +			/*
> +			 * We are in atomic context, but cgroup_event_remove()
> +			 * may sleep, so we have to call it in workqueue.
> +			 */
> +			schedule_work(&event->remove);
> +		}
> +		spin_unlock(&memcg->event_list_lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static void memcg_event_ptable_queue_proc(struct file *file,
> +		wait_queue_head_t *wqh, poll_table *pt)
> +{
> +	struct mem_cgroup_event *event =
> +		container_of(pt, struct mem_cgroup_event, pt);
> +
> +	event->wqh = wqh;
> +	add_wait_queue(wqh, &event->wait);
> +}
> +
> +/*
> + * DO NOT USE IN NEW FILES.
> + *
> + * Parse input and register new cgroup event handler.
> + *
> + * Input must be in format '<event_fd> <control_fd> <args>'.
> + * Interpretation of args is defined by control file implementation.
> + */
> +ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> +				  char *buf, size_t nbytes, loff_t off)
> +{
> +	struct cgroup_subsys_state *css = of_css(of);
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +	struct mem_cgroup_event *event;
> +	struct cgroup_subsys_state *cfile_css;
> +	unsigned int efd, cfd;
> +	struct fd efile;
> +	struct fd cfile;
> +	struct dentry *cdentry;
> +	const char *name;
> +	char *endp;
> +	int ret;
> +
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +		return -EOPNOTSUPP;
> +
> +	buf = strstrip(buf);
> +
> +	efd = simple_strtoul(buf, &endp, 10);
> +	if (*endp != ' ')
> +		return -EINVAL;
> +	buf = endp + 1;
> +
> +	cfd = simple_strtoul(buf, &endp, 10);
> +	if ((*endp != ' ') && (*endp != '\0'))
> +		return -EINVAL;
> +	buf = endp + 1;
> +
> +	event = kzalloc(sizeof(*event), GFP_KERNEL);
> +	if (!event)
> +		return -ENOMEM;
> +
> +	event->memcg = memcg;
> +	INIT_LIST_HEAD(&event->list);
> +	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
> +	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
> +	INIT_WORK(&event->remove, memcg_event_remove);
> +
> +	efile = fdget(efd);
> +	if (!efile.file) {
> +		ret = -EBADF;
> +		goto out_kfree;
> +	}
> +
> +	event->eventfd = eventfd_ctx_fileget(efile.file);
> +	if (IS_ERR(event->eventfd)) {
> +		ret = PTR_ERR(event->eventfd);
> +		goto out_put_efile;
> +	}
> +
> +	cfile = fdget(cfd);
> +	if (!cfile.file) {
> +		ret = -EBADF;
> +		goto out_put_eventfd;
> +	}
> +
> +	/* the process need read permission on control file */
> +	/* AV: shouldn't we check that it's been opened for read instead? */
> +	ret = file_permission(cfile.file, MAY_READ);
> +	if (ret < 0)
> +		goto out_put_cfile;
> +
> +	/*
> +	 * The control file must be a regular cgroup1 file. As a regular cgroup
> +	 * file can't be renamed, it's safe to access its name afterwards.
> +	 */
> +	cdentry = cfile.file->f_path.dentry;
> +	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
> +		ret = -EINVAL;
> +		goto out_put_cfile;
> +	}
> +
> +	/*
> +	 * Determine the event callbacks and set them in @event.  This used
> +	 * to be done via struct cftype but cgroup core no longer knows
> +	 * about these events.  The following is crude but the whole thing
> +	 * is for compatibility anyway.
> +	 *
> +	 * DO NOT ADD NEW FILES.
> +	 */
> +	name = cdentry->d_name.name;
> +
> +	if (!strcmp(name, "memory.usage_in_bytes")) {
> +		event->register_event = mem_cgroup_usage_register_event;
> +		event->unregister_event = mem_cgroup_usage_unregister_event;
> +	} else if (!strcmp(name, "memory.oom_control")) {
> +		event->register_event = mem_cgroup_oom_register_event;
> +		event->unregister_event = mem_cgroup_oom_unregister_event;
> +	} else if (!strcmp(name, "memory.pressure_level")) {
> +		event->register_event = vmpressure_register_event;
> +		event->unregister_event = vmpressure_unregister_event;
> +	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
> +		event->register_event = memsw_cgroup_usage_register_event;
> +		event->unregister_event = memsw_cgroup_usage_unregister_event;
> +	} else {
> +		ret = -EINVAL;
> +		goto out_put_cfile;
> +	}
> +
> +	/*
> +	 * Verify @cfile should belong to @css.  Also, remaining events are
> +	 * automatically removed on cgroup destruction but the removal is
> +	 * asynchronous, so take an extra ref on @css.
> +	 */
> +	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
> +					       &memory_cgrp_subsys);
> +	ret = -EINVAL;
> +	if (IS_ERR(cfile_css))
> +		goto out_put_cfile;
> +	if (cfile_css != css) {
> +		css_put(cfile_css);
> +		goto out_put_cfile;
> +	}
> +
> +	ret = event->register_event(memcg, event->eventfd, buf);
> +	if (ret)
> +		goto out_put_css;
> +
> +	vfs_poll(efile.file, &event->pt);
> +
> +	spin_lock_irq(&memcg->event_list_lock);
> +	list_add(&event->list, &memcg->event_list);
> +	spin_unlock_irq(&memcg->event_list_lock);
> +
> +	fdput(cfile);
> +	fdput(efile);
> +
> +	return nbytes;
> +
> +out_put_css:
> +	css_put(css);
> +out_put_cfile:
> +	fdput(cfile);
> +out_put_eventfd:
> +	eventfd_ctx_put(event->eventfd);
> +out_put_efile:
> +	fdput(efile);
> +out_kfree:
> +	kfree(event);
> +
> +	return ret;
> +}
> +
> +void memcg1_css_offline(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup_event *event, *tmp;
> +
> +	/*
> +	 * Unregister events and notify userspace.
> +	 * Notify userspace about cgroup removing only after rmdir of cgroup
> +	 * directory to avoid race between userspace and kernelspace.
> +	 */
> +	spin_lock_irq(&memcg->event_list_lock);
> +	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
> +		list_del_init(&event->list);
> +		schedule_work(&event->remove);
> +	}
> +	spin_unlock_irq(&memcg->event_list_lock);
> +}
> +
>  static int __init memcg1_init(void)
>  {
>  	int node;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index d377c0be9880..524a2c76ffc9 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
>  int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
>  				 struct cftype *cft, u64 val);
>  
> +/*
> + * Per memcg event counter is incremented at every pagein/pageout. With THP,
> + * it will be incremented by the number of pages. This counter is used
> + * to trigger some periodic events. This is straightforward and better
> + * than using jiffies etc. to handle periodic memcg event.
> + */
> +enum mem_cgroup_events_target {
> +	MEM_CGROUP_TARGET_THRESH,
> +	MEM_CGROUP_TARGET_SOFTLIMIT,
> +	MEM_CGROUP_NTARGETS,
> +};
> +
> +/* Whether legacy memory+swap accounting is active */
> +static bool do_memsw_account(void)
> +{
> +	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
> +}
> +
> +/*
> + * Iteration constructs for visiting all cgroups (under a tree).  If
> + * loops are exited prematurely (break), mem_cgroup_iter_break() must
> + * be used for reference counting.
> + */
> +#define for_each_mem_cgroup_tree(iter, root)		\
> +	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
> +	     iter != NULL;				\
> +	     iter = mem_cgroup_iter(root, iter, NULL))
> +
> +#define for_each_mem_cgroup(iter)			\
> +	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
> +	     iter != NULL;				\
> +	     iter = mem_cgroup_iter(NULL, iter, NULL))
> +
> +void memcg1_css_offline(struct mem_cgroup *memcg);
> +
> +/* for encoding cft->private value on file */
> +enum res_type {
> +	_MEM,
> +	_MEMSWAP,
> +	_KMEM,
> +	_TCP,
> +};
> +
> +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> +				enum mem_cgroup_events_target target);
> +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
> +void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> +ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> +				  char *buf, size_t nbytes, loff_t off);
> +
> +
>  #endif	/* __MM_MEMCONTROL_V1_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index da2c0fa0de1b..bd4b26a73596 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -46,9 +46,6 @@
>  #include <linux/slab.h>
>  #include <linux/swapops.h>
>  #include <linux/spinlock.h>
> -#include <linux/eventfd.h>
> -#include <linux/poll.h>
> -#include <linux/sort.h>
>  #include <linux/fs.h>
>  #include <linux/seq_file.h>
>  #include <linux/parser.h>
> @@ -59,7 +56,6 @@
>  #include <linux/cpu.h>
>  #include <linux/oom.h>
>  #include <linux/lockdep.h>
> -#include <linux/file.h>
>  #include <linux/resume_user_mode.h>
>  #include <linux/psi.h>
>  #include <linux/seq_buf.h>
> @@ -97,91 +93,13 @@ static bool cgroup_memory_nobpf __ro_after_init;
>  static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
>  #endif
>  
> -/* Whether legacy memory+swap accounting is active */
> -static bool do_memsw_account(void)
> -{
> -	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
> -}
> -
>  #define THRESHOLDS_EVENTS_TARGET 128
>  #define SOFTLIMIT_EVENTS_TARGET 1024
>  
> -/* for OOM */
> -struct mem_cgroup_eventfd_list {
> -	struct list_head list;
> -	struct eventfd_ctx *eventfd;
> -};
> -
> -/*
> - * cgroup_event represents events which userspace want to receive.
> - */
> -struct mem_cgroup_event {
> -	/*
> -	 * memcg which the event belongs to.
> -	 */
> -	struct mem_cgroup *memcg;
> -	/*
> -	 * eventfd to signal userspace about the event.
> -	 */
> -	struct eventfd_ctx *eventfd;
> -	/*
> -	 * Each of these stored in a list by the cgroup.
> -	 */
> -	struct list_head list;
> -	/*
> -	 * register_event() callback will be used to add new userspace
> -	 * waiter for changes related to this event.  Use eventfd_signal()
> -	 * on eventfd to send notification to userspace.
> -	 */
> -	int (*register_event)(struct mem_cgroup *memcg,
> -			      struct eventfd_ctx *eventfd, const char *args);
> -	/*
> -	 * unregister_event() callback will be called when userspace closes
> -	 * the eventfd or on cgroup removing.  This callback must be set,
> -	 * if you want provide notification functionality.
> -	 */
> -	void (*unregister_event)(struct mem_cgroup *memcg,
> -				 struct eventfd_ctx *eventfd);
> -	/*
> -	 * All fields below needed to unregister event when
> -	 * userspace closes eventfd.
> -	 */
> -	poll_table pt;
> -	wait_queue_head_t *wqh;
> -	wait_queue_entry_t wait;
> -	struct work_struct remove;
> -};
> -
> -static void mem_cgroup_threshold(struct mem_cgroup *memcg);
> -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> -
> -/* for encoding cft->private value on file */
> -enum res_type {
> -	_MEM,
> -	_MEMSWAP,
> -	_KMEM,
> -	_TCP,
> -};
> -
>  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
>  #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
>  #define MEMFILE_ATTR(val)	((val) & 0xffff)
>  
> -/*
> - * Iteration constructs for visiting all cgroups (under a tree).  If
> - * loops are exited prematurely (break), mem_cgroup_iter_break() must
> - * be used for reference counting.
> - */
> -#define for_each_mem_cgroup_tree(iter, root)		\
> -	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
> -	     iter != NULL;				\
> -	     iter = mem_cgroup_iter(root, iter, NULL))
> -
> -#define for_each_mem_cgroup(iter)			\
> -	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
> -	     iter != NULL;				\
> -	     iter = mem_cgroup_iter(NULL, iter, NULL))
> -
>  static inline bool task_is_dying(void)
>  {
>  	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
> @@ -940,8 +858,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
>  	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
>  }
>  
> -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> -				       enum mem_cgroup_events_target target)
> +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> +				enum mem_cgroup_events_target target)
>  {
>  	unsigned long val, next;
>  
> @@ -965,28 +883,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>  	return false;
>  }
>  
> -/*
> - * Check events in order.
> - *
> - */
> -void memcg_check_events(struct mem_cgroup *memcg, int nid)
> -{
> -	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> -		return;
> -
> -	/* threshold event is triggered in finer grain than soft limit */
> -	if (unlikely(mem_cgroup_event_ratelimit(memcg,
> -						MEM_CGROUP_TARGET_THRESH))) {
> -		bool do_softlimit;
> -
> -		do_softlimit = mem_cgroup_event_ratelimit(memcg,
> -						MEM_CGROUP_TARGET_SOFTLIMIT);
> -		mem_cgroup_threshold(memcg);
> -		if (unlikely(do_softlimit))
> -			memcg1_update_tree(memcg, nid);
> -	}
> -}
> -
>  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>  {
>  	/*
> @@ -1726,7 +1622,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = {
>  };
>  #endif
>  
> -static DEFINE_SPINLOCK(memcg_oom_lock);
> +DEFINE_SPINLOCK(memcg_oom_lock);
>  
>  /*
>   * Check OOM-Killer is already running under our hierarchy.
> @@ -3545,7 +3441,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
>  	return -EINVAL;
>  }
>  
> -static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
> +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
>  {
>  	unsigned long val;
>  
> @@ -4046,331 +3942,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
>  	return 0;
>  }
>  
> -static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> -{
> -	struct mem_cgroup_threshold_ary *t;
> -	unsigned long usage;
> -	int i;
> -
> -	rcu_read_lock();
> -	if (!swap)
> -		t = rcu_dereference(memcg->thresholds.primary);
> -	else
> -		t = rcu_dereference(memcg->memsw_thresholds.primary);
> -
> -	if (!t)
> -		goto unlock;
> -
> -	usage = mem_cgroup_usage(memcg, swap);
> -
> -	/*
> -	 * current_threshold points to threshold just below or equal to usage.
> -	 * If it's not true, a threshold was crossed after last
> -	 * call of __mem_cgroup_threshold().
> -	 */
> -	i = t->current_threshold;
> -
> -	/*
> -	 * Iterate backward over array of thresholds starting from
> -	 * current_threshold and check if a threshold is crossed.
> -	 * If none of thresholds below usage is crossed, we read
> -	 * only one element of the array here.
> -	 */
> -	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
> -		eventfd_signal(t->entries[i].eventfd);
> -
> -	/* i = current_threshold + 1 */
> -	i++;
> -
> -	/*
> -	 * Iterate forward over array of thresholds starting from
> -	 * current_threshold+1 and check if a threshold is crossed.
> -	 * If none of thresholds above usage is crossed, we read
> -	 * only one element of the array here.
> -	 */
> -	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
> -		eventfd_signal(t->entries[i].eventfd);
> -
> -	/* Update current_threshold */
> -	t->current_threshold = i - 1;
> -unlock:
> -	rcu_read_unlock();
> -}
> -
> -static void mem_cgroup_threshold(struct mem_cgroup *memcg)
> -{
> -	while (memcg) {
> -		__mem_cgroup_threshold(memcg, false);
> -		if (do_memsw_account())
> -			__mem_cgroup_threshold(memcg, true);
> -
> -		memcg = parent_mem_cgroup(memcg);
> -	}
> -}
> -
> -static int compare_thresholds(const void *a, const void *b)
> -{
> -	const struct mem_cgroup_threshold *_a = a;
> -	const struct mem_cgroup_threshold *_b = b;
> -
> -	if (_a->threshold > _b->threshold)
> -		return 1;
> -
> -	if (_a->threshold < _b->threshold)
> -		return -1;
> -
> -	return 0;
> -}
> -
> -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup_eventfd_list *ev;
> -
> -	spin_lock(&memcg_oom_lock);
> -
> -	list_for_each_entry(ev, &memcg->oom_notify, list)
> -		eventfd_signal(ev->eventfd);
> -
> -	spin_unlock(&memcg_oom_lock);
> -	return 0;
> -}
> -
> -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *iter;
> -
> -	for_each_mem_cgroup_tree(iter, memcg)
> -		mem_cgroup_oom_notify_cb(iter);
> -}
> -
> -static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
> -{
> -	struct mem_cgroup_thresholds *thresholds;
> -	struct mem_cgroup_threshold_ary *new;
> -	unsigned long threshold;
> -	unsigned long usage;
> -	int i, size, ret;
> -
> -	ret = page_counter_memparse(args, "-1", &threshold);
> -	if (ret)
> -		return ret;
> -
> -	mutex_lock(&memcg->thresholds_lock);
> -
> -	if (type == _MEM) {
> -		thresholds = &memcg->thresholds;
> -		usage = mem_cgroup_usage(memcg, false);
> -	} else if (type == _MEMSWAP) {
> -		thresholds = &memcg->memsw_thresholds;
> -		usage = mem_cgroup_usage(memcg, true);
> -	} else
> -		BUG();
> -
> -	/* Check if a threshold crossed before adding a new one */
> -	if (thresholds->primary)
> -		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> -
> -	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
> -
> -	/* Allocate memory for new array of thresholds */
> -	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
> -	if (!new) {
> -		ret = -ENOMEM;
> -		goto unlock;
> -	}
> -	new->size = size;
> -
> -	/* Copy thresholds (if any) to new array */
> -	if (thresholds->primary)
> -		memcpy(new->entries, thresholds->primary->entries,
> -		       flex_array_size(new, entries, size - 1));
> -
> -	/* Add new threshold */
> -	new->entries[size - 1].eventfd = eventfd;
> -	new->entries[size - 1].threshold = threshold;
> -
> -	/* Sort thresholds. Registering of new threshold isn't time-critical */
> -	sort(new->entries, size, sizeof(*new->entries),
> -			compare_thresholds, NULL);
> -
> -	/* Find current threshold */
> -	new->current_threshold = -1;
> -	for (i = 0; i < size; i++) {
> -		if (new->entries[i].threshold <= usage) {
> -			/*
> -			 * new->current_threshold will not be used until
> -			 * rcu_assign_pointer(), so it's safe to increment
> -			 * it here.
> -			 */
> -			++new->current_threshold;
> -		} else
> -			break;
> -	}
> -
> -	/* Free old spare buffer and save old primary buffer as spare */
> -	kfree(thresholds->spare);
> -	thresholds->spare = thresholds->primary;
> -
> -	rcu_assign_pointer(thresholds->primary, new);
> -
> -	/* To be sure that nobody uses thresholds */
> -	synchronize_rcu();
> -
> -unlock:
> -	mutex_unlock(&memcg->thresholds_lock);
> -
> -	return ret;
> -}
> -
> -static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args)
> -{
> -	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
> -}
> -
> -static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args)
> -{
> -	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
> -}
> -
> -static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, enum res_type type)
> -{
> -	struct mem_cgroup_thresholds *thresholds;
> -	struct mem_cgroup_threshold_ary *new;
> -	unsigned long usage;
> -	int i, j, size, entries;
> -
> -	mutex_lock(&memcg->thresholds_lock);
> -
> -	if (type == _MEM) {
> -		thresholds = &memcg->thresholds;
> -		usage = mem_cgroup_usage(memcg, false);
> -	} else if (type == _MEMSWAP) {
> -		thresholds = &memcg->memsw_thresholds;
> -		usage = mem_cgroup_usage(memcg, true);
> -	} else
> -		BUG();
> -
> -	if (!thresholds->primary)
> -		goto unlock;
> -
> -	/* Check if a threshold crossed before removing */
> -	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> -
> -	/* Calculate new number of threshold */
> -	size = entries = 0;
> -	for (i = 0; i < thresholds->primary->size; i++) {
> -		if (thresholds->primary->entries[i].eventfd != eventfd)
> -			size++;
> -		else
> -			entries++;
> -	}
> -
> -	new = thresholds->spare;
> -
> -	/* If no items related to eventfd have been cleared, nothing to do */
> -	if (!entries)
> -		goto unlock;
> -
> -	/* Set thresholds array to NULL if we don't have thresholds */
> -	if (!size) {
> -		kfree(new);
> -		new = NULL;
> -		goto swap_buffers;
> -	}
> -
> -	new->size = size;
> -
> -	/* Copy thresholds and find current threshold */
> -	new->current_threshold = -1;
> -	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
> -		if (thresholds->primary->entries[i].eventfd == eventfd)
> -			continue;
> -
> -		new->entries[j] = thresholds->primary->entries[i];
> -		if (new->entries[j].threshold <= usage) {
> -			/*
> -			 * new->current_threshold will not be used
> -			 * until rcu_assign_pointer(), so it's safe to increment
> -			 * it here.
> -			 */
> -			++new->current_threshold;
> -		}
> -		j++;
> -	}
> -
> -swap_buffers:
> -	/* Swap primary and spare array */
> -	thresholds->spare = thresholds->primary;
> -
> -	rcu_assign_pointer(thresholds->primary, new);
> -
> -	/* To be sure that nobody uses thresholds */
> -	synchronize_rcu();
> -
> -	/* If all events are unregistered, free the spare array */
> -	if (!new) {
> -		kfree(thresholds->spare);
> -		thresholds->spare = NULL;
> -	}
> -unlock:
> -	mutex_unlock(&memcg->thresholds_lock);
> -}
> -
> -static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd)
> -{
> -	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
> -}
> -
> -static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd)
> -{
> -	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
> -}
> -
> -static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args)
> -{
> -	struct mem_cgroup_eventfd_list *event;
> -
> -	event = kmalloc(sizeof(*event),	GFP_KERNEL);
> -	if (!event)
> -		return -ENOMEM;
> -
> -	spin_lock(&memcg_oom_lock);
> -
> -	event->eventfd = eventfd;
> -	list_add(&event->list, &memcg->oom_notify);
> -
> -	/* already in OOM ? */
> -	if (memcg->under_oom)
> -		eventfd_signal(eventfd);
> -	spin_unlock(&memcg_oom_lock);
> -
> -	return 0;
> -}
> -
> -static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd)
> -{
> -	struct mem_cgroup_eventfd_list *ev, *tmp;
> -
> -	spin_lock(&memcg_oom_lock);
> -
> -	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
> -		if (ev->eventfd == eventfd) {
> -			list_del(&ev->list);
> -			kfree(ev);
> -		}
> -	}
> -
> -	spin_unlock(&memcg_oom_lock);
> -}
> -
>  static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
> @@ -4611,243 +4182,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
>  
>  #endif	/* CONFIG_CGROUP_WRITEBACK */
>  
> -/*
> - * DO NOT USE IN NEW FILES.
> - *
> - * "cgroup.event_control" implementation.
> - *
> - * This is way over-engineered.  It tries to support fully configurable
> - * events for each user.  Such level of flexibility is completely
> - * unnecessary especially in the light of the planned unified hierarchy.
> - *
> - * Please deprecate this and replace with something simpler if at all
> - * possible.
> - */
> -
> -/*
> - * Unregister event and free resources.
> - *
> - * Gets called from workqueue.
> - */
> -static void memcg_event_remove(struct work_struct *work)
> -{
> -	struct mem_cgroup_event *event =
> -		container_of(work, struct mem_cgroup_event, remove);
> -	struct mem_cgroup *memcg = event->memcg;
> -
> -	remove_wait_queue(event->wqh, &event->wait);
> -
> -	event->unregister_event(memcg, event->eventfd);
> -
> -	/* Notify userspace the event is going away. */
> -	eventfd_signal(event->eventfd);
> -
> -	eventfd_ctx_put(event->eventfd);
> -	kfree(event);
> -	css_put(&memcg->css);
> -}
> -
> -/*
> - * Gets called on EPOLLHUP on eventfd when user closes it.
> - *
> - * Called with wqh->lock held and interrupts disabled.
> - */
> -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
> -			    int sync, void *key)
> -{
> -	struct mem_cgroup_event *event =
> -		container_of(wait, struct mem_cgroup_event, wait);
> -	struct mem_cgroup *memcg = event->memcg;
> -	__poll_t flags = key_to_poll(key);
> -
> -	if (flags & EPOLLHUP) {
> -		/*
> -		 * If the event has been detached at cgroup removal, we
> -		 * can simply return knowing the other side will cleanup
> -		 * for us.
> -		 *
> -		 * We can't race against event freeing since the other
> -		 * side will require wqh->lock via remove_wait_queue(),
> -		 * which we hold.
> -		 */
> -		spin_lock(&memcg->event_list_lock);
> -		if (!list_empty(&event->list)) {
> -			list_del_init(&event->list);
> -			/*
> -			 * We are in atomic context, but cgroup_event_remove()
> -			 * may sleep, so we have to call it in workqueue.
> -			 */
> -			schedule_work(&event->remove);
> -		}
> -		spin_unlock(&memcg->event_list_lock);
> -	}
> -
> -	return 0;
> -}
> -
> -static void memcg_event_ptable_queue_proc(struct file *file,
> -		wait_queue_head_t *wqh, poll_table *pt)
> -{
> -	struct mem_cgroup_event *event =
> -		container_of(pt, struct mem_cgroup_event, pt);
> -
> -	event->wqh = wqh;
> -	add_wait_queue(wqh, &event->wait);
> -}
> -
> -/*
> - * DO NOT USE IN NEW FILES.
> - *
> - * Parse input and register new cgroup event handler.
> - *
> - * Input must be in format '<event_fd> <control_fd> <args>'.
> - * Interpretation of args is defined by control file implementation.
> - */
> -static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> -					 char *buf, size_t nbytes, loff_t off)
> -{
> -	struct cgroup_subsys_state *css = of_css(of);
> -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> -	struct mem_cgroup_event *event;
> -	struct cgroup_subsys_state *cfile_css;
> -	unsigned int efd, cfd;
> -	struct fd efile;
> -	struct fd cfile;
> -	struct dentry *cdentry;
> -	const char *name;
> -	char *endp;
> -	int ret;
> -
> -	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> -		return -EOPNOTSUPP;
> -
> -	buf = strstrip(buf);
> -
> -	efd = simple_strtoul(buf, &endp, 10);
> -	if (*endp != ' ')
> -		return -EINVAL;
> -	buf = endp + 1;
> -
> -	cfd = simple_strtoul(buf, &endp, 10);
> -	if ((*endp != ' ') && (*endp != '\0'))
> -		return -EINVAL;
> -	buf = endp + 1;
> -
> -	event = kzalloc(sizeof(*event), GFP_KERNEL);
> -	if (!event)
> -		return -ENOMEM;
> -
> -	event->memcg = memcg;
> -	INIT_LIST_HEAD(&event->list);
> -	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
> -	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
> -	INIT_WORK(&event->remove, memcg_event_remove);
> -
> -	efile = fdget(efd);
> -	if (!efile.file) {
> -		ret = -EBADF;
> -		goto out_kfree;
> -	}
> -
> -	event->eventfd = eventfd_ctx_fileget(efile.file);
> -	if (IS_ERR(event->eventfd)) {
> -		ret = PTR_ERR(event->eventfd);
> -		goto out_put_efile;
> -	}
> -
> -	cfile = fdget(cfd);
> -	if (!cfile.file) {
> -		ret = -EBADF;
> -		goto out_put_eventfd;
> -	}
> -
> -	/* the process need read permission on control file */
> -	/* AV: shouldn't we check that it's been opened for read instead? */
> -	ret = file_permission(cfile.file, MAY_READ);
> -	if (ret < 0)
> -		goto out_put_cfile;
> -
> -	/*
> -	 * The control file must be a regular cgroup1 file. As a regular cgroup
> -	 * file can't be renamed, it's safe to access its name afterwards.
> -	 */
> -	cdentry = cfile.file->f_path.dentry;
> -	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
> -		ret = -EINVAL;
> -		goto out_put_cfile;
> -	}
> -
> -	/*
> -	 * Determine the event callbacks and set them in @event.  This used
> -	 * to be done via struct cftype but cgroup core no longer knows
> -	 * about these events.  The following is crude but the whole thing
> -	 * is for compatibility anyway.
> -	 *
> -	 * DO NOT ADD NEW FILES.
> -	 */
> -	name = cdentry->d_name.name;
> -
> -	if (!strcmp(name, "memory.usage_in_bytes")) {
> -		event->register_event = mem_cgroup_usage_register_event;
> -		event->unregister_event = mem_cgroup_usage_unregister_event;
> -	} else if (!strcmp(name, "memory.oom_control")) {
> -		event->register_event = mem_cgroup_oom_register_event;
> -		event->unregister_event = mem_cgroup_oom_unregister_event;
> -	} else if (!strcmp(name, "memory.pressure_level")) {
> -		event->register_event = vmpressure_register_event;
> -		event->unregister_event = vmpressure_unregister_event;
> -	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
> -		event->register_event = memsw_cgroup_usage_register_event;
> -		event->unregister_event = memsw_cgroup_usage_unregister_event;
> -	} else {
> -		ret = -EINVAL;
> -		goto out_put_cfile;
> -	}
> -
> -	/*
> -	 * Verify @cfile should belong to @css.  Also, remaining events are
> -	 * automatically removed on cgroup destruction but the removal is
> -	 * asynchronous, so take an extra ref on @css.
> -	 */
> -	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
> -					       &memory_cgrp_subsys);
> -	ret = -EINVAL;
> -	if (IS_ERR(cfile_css))
> -		goto out_put_cfile;
> -	if (cfile_css != css) {
> -		css_put(cfile_css);
> -		goto out_put_cfile;
> -	}
> -
> -	ret = event->register_event(memcg, event->eventfd, buf);
> -	if (ret)
> -		goto out_put_css;
> -
> -	vfs_poll(efile.file, &event->pt);
> -
> -	spin_lock_irq(&memcg->event_list_lock);
> -	list_add(&event->list, &memcg->event_list);
> -	spin_unlock_irq(&memcg->event_list_lock);
> -
> -	fdput(cfile);
> -	fdput(efile);
> -
> -	return nbytes;
> -
> -out_put_css:
> -	css_put(css);
> -out_put_cfile:
> -	fdput(cfile);
> -out_put_eventfd:
> -	eventfd_ctx_put(event->eventfd);
> -out_put_efile:
> -	fdput(efile);
> -out_kfree:
> -	kfree(event);
> -
> -	return ret;
> -}
> -
>  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
>  static int mem_cgroup_slab_show(struct seq_file *m, void *p)
>  {
> @@ -5314,19 +4648,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
>  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> -	struct mem_cgroup_event *event, *tmp;
>  
> -	/*
> -	 * Unregister events and notify userspace.
> -	 * Notify userspace about cgroup removing only after rmdir of cgroup
> -	 * directory to avoid race between userspace and kernelspace.
> -	 */
> -	spin_lock_irq(&memcg->event_list_lock);
> -	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
> -		list_del_init(&event->list);
> -		schedule_work(&event->remove);
> -	}
> -	spin_unlock_irq(&memcg->event_list_lock);
> +	memcg1_css_offline(memcg);
>  
>  	page_counter_set_min(&memcg->memory, 0);
>  	page_counter_set_low(&memcg->memory, 0);
> -- 
> 2.45.2
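
The trickiest part of the moved code is the threshold bookkeeping in __mem_cgroup_threshold(): thresholds live in an array sorted by value, and current_threshold caches the index of the highest entry at or below the last observed usage, so each check only scans the entries between the old and the new usage. The following stand-alone userspace model of that scan (illustrative only; the 4/8/16 threshold values are made up) shows which registered eventfds would be signalled as usage moves:

/* Userspace model of the scan in __mem_cgroup_threshold(). */
#include <stdio.h>

static unsigned long entries[] = { 4, 8, 16 }; /* sorted thresholds, MB */
static int size = 3;
static int current_threshold = 1;  /* last usage was between 8M and 16M */

static void scan(unsigned long usage)
{
        int i = current_threshold;

        /* Usage dropped: walk down past thresholds now above usage. */
        for (; i >= 0 && entries[i] > usage; i--)
                printf("signal eventfd for %luM (crossed downward)\n",
                       entries[i]);
        i++;
        /* Usage grew: walk up past thresholds now at or below usage. */
        for (; i < size && entries[i] <= usage; i++)
                printf("signal eventfd for %luM (crossed upward)\n",
                       entries[i]);
        current_threshold = i - 1;  /* highest entry <= usage, or -1 */
}

int main(void)
{
        scan(17);  /* grew past 16M: signals the 16M watcher */
        scan(3);   /* fell below 4M: signals 16M, 8M and 4M watchers */
        return 0;
}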
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83c8327455d8..588179d29849 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -69,18 +69,6 @@  struct mem_cgroup_id {
 	refcount_t ref;
 };
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremented by the number of pages. This counter is used
- * to trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-	MEM_CGROUP_TARGET_THRESH,
-	MEM_CGROUP_TARGET_SOFTLIMIT,
-	MEM_CGROUP_NTARGETS,
-};
-
 struct memcg_vmstats_percpu;
 struct memcg_vmstats;
 struct lruvec_stats_percpu;
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index c25e038ac874..4b2290ceace6 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -6,6 +6,10 @@ 
 #include <linux/pagewalk.h>
 #include <linux/backing-dev.h>
 #include <linux/swap_cgroup.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/sort.h>
+#include <linux/file.h>
 
 #include "internal.h"
 #include "swap.h"
@@ -60,6 +64,54 @@  static struct move_charge_struct {
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
+/* for OOM */
+struct mem_cgroup_eventfd_list {
+	struct list_head list;
+	struct eventfd_ctx *eventfd;
+};
+
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+	/*
+	 * memcg which the event belongs to.
+	 */
+	struct mem_cgroup *memcg;
+	/*
+	 * eventfd to signal userspace about the event.
+	 */
+	struct eventfd_ctx *eventfd;
+	/*
+	 * Each of these stored in a list by the cgroup.
+	 */
+	struct list_head list;
+	/*
+	 * register_event() callback will be used to add new userspace
+	 * waiter for changes related to this event.  Use eventfd_signal()
+	 * on eventfd to send notification to userspace.
+	 */
+	int (*register_event)(struct mem_cgroup *memcg,
+			      struct eventfd_ctx *eventfd, const char *args);
+	/*
+	 * unregister_event() callback will be called when userspace closes
+	 * the eventfd or on cgroup removing.  This callback must be set,
+	 * if you want provide notification functionality.
+	 */
+	void (*unregister_event)(struct mem_cgroup *memcg,
+				 struct eventfd_ctx *eventfd);
+	/*
+	 * All fields below needed to unregister event when
+	 * userspace closes eventfd.
+	 */
+	poll_table pt;
+	wait_queue_head_t *wqh;
+	wait_queue_entry_t wait;
+	struct work_struct remove;
+};
+
+extern spinlock_t memcg_oom_lock;
+
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 					 struct mem_cgroup_tree_per_node *mctz,
 					 unsigned long new_usage_in_excess)
@@ -1306,6 +1358,607 @@  void memcg1_move_task(void)
 }
 #endif
 
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+	struct mem_cgroup_threshold_ary *t;
+	unsigned long usage;
+	int i;
+
+	rcu_read_lock();
+	if (!swap)
+		t = rcu_dereference(memcg->thresholds.primary);
+	else
+		t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+	if (!t)
+		goto unlock;
+
+	usage = mem_cgroup_usage(memcg, swap);
+
+	/*
+	 * current_threshold points to threshold just below or equal to usage.
+	 * If it's not true, a threshold was crossed after last
+	 * call of __mem_cgroup_threshold().
+	 */
+	i = t->current_threshold;
+
+	/*
+	 * Iterate backward over array of thresholds starting from
+	 * current_threshold and check if a threshold is crossed.
+	 * If none of thresholds below usage is crossed, we read
+	 * only one element of the array here.
+	 */
+	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+		eventfd_signal(t->entries[i].eventfd);
+
+	/* i = current_threshold + 1 */
+	i++;
+
+	/*
+	 * Iterate forward over array of thresholds starting from
+	 * current_threshold+1 and check if a threshold is crossed.
+	 * If none of thresholds above usage is crossed, we read
+	 * only one element of the array here.
+	 */
+	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+		eventfd_signal(t->entries[i].eventfd);
+
+	/* Update current_threshold */
+	t->current_threshold = i - 1;
+unlock:
+	rcu_read_unlock();
+}
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+	while (memcg) {
+		__mem_cgroup_threshold(memcg, false);
+		if (do_memsw_account())
+			__mem_cgroup_threshold(memcg, true);
+
+		memcg = parent_mem_cgroup(memcg);
+	}
+}
+
+/*
+ * Check events in order.
+ *
+ */
+void memcg_check_events(struct mem_cgroup *memcg, int nid)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		return;
+
+	/* threshold event is triggered in finer grain than soft limit */
+	if (unlikely(mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_THRESH))) {
+		bool do_softlimit;
+
+		do_softlimit = mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_SOFTLIMIT);
+		mem_cgroup_threshold(memcg);
+		if (unlikely(do_softlimit))
+			memcg1_update_tree(memcg, nid);
+	}
+}
+
+static int compare_thresholds(const void *a, const void *b)
+{
+	const struct mem_cgroup_threshold *_a = a;
+	const struct mem_cgroup_threshold *_b = b;
+
+	if (_a->threshold > _b->threshold)
+		return 1;
+
+	if (_a->threshold < _b->threshold)
+		return -1;
+
+	return 0;
+}
+
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_eventfd_list *ev;
+
+	spin_lock(&memcg_oom_lock);
+
+	list_for_each_entry(ev, &memcg->oom_notify, list)
+		eventfd_signal(ev->eventfd);
+
+	spin_unlock(&memcg_oom_lock);
+	return 0;
+}
+
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, memcg)
+		mem_cgroup_oom_notify_cb(iter);
+}
+
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+{
+	struct mem_cgroup_thresholds *thresholds;
+	struct mem_cgroup_threshold_ary *new;
+	unsigned long threshold;
+	unsigned long usage;
+	int i, size, ret;
+
+	ret = page_counter_memparse(args, "-1", &threshold);
+	if (ret)
+		return ret;
+
+	mutex_lock(&memcg->thresholds_lock);
+
+	if (type == _MEM) {
+		thresholds = &memcg->thresholds;
+		usage = mem_cgroup_usage(memcg, false);
+	} else if (type == _MEMSWAP) {
+		thresholds = &memcg->memsw_thresholds;
+		usage = mem_cgroup_usage(memcg, true);
+	} else
+		BUG();
+
+	/* Check if a threshold crossed before adding a new one */
+	if (thresholds->primary)
+		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+	/* Allocate memory for new array of thresholds */
+	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+	if (!new) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	new->size = size;
+
+	/* Copy thresholds (if any) to new array */
+	if (thresholds->primary)
+		memcpy(new->entries, thresholds->primary->entries,
+		       flex_array_size(new, entries, size - 1));
+
+	/* Add new threshold */
+	new->entries[size - 1].eventfd = eventfd;
+	new->entries[size - 1].threshold = threshold;
+
+	/* Sort thresholds. Registering of new threshold isn't time-critical */
+	sort(new->entries, size, sizeof(*new->entries),
+			compare_thresholds, NULL);
+
+	/* Find current threshold */
+	new->current_threshold = -1;
+	for (i = 0; i < size; i++) {
+		if (new->entries[i].threshold <= usage) {
+			/*
+			 * new->current_threshold will not be used until
+			 * rcu_assign_pointer(), so it's safe to increment
+			 * it here.
+			 */
+			++new->current_threshold;
+		} else
+			break;
+	}
+
+	/* Free old spare buffer and save old primary buffer as spare */
+	kfree(thresholds->spare);
+	thresholds->spare = thresholds->primary;
+
+	rcu_assign_pointer(thresholds->primary, new);
+
+	/* To be sure that nobody uses thresholds */
+	synchronize_rcu();
+
+unlock:
+	mutex_unlock(&memcg->thresholds_lock);
+
+	return ret;
+}
+
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args)
+{
+	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args)
+{
+	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, enum res_type type)
+{
+	struct mem_cgroup_thresholds *thresholds;
+	struct mem_cgroup_threshold_ary *new;
+	unsigned long usage;
+	int i, j, size, entries;
+
+	mutex_lock(&memcg->thresholds_lock);
+
+	if (type == _MEM) {
+		thresholds = &memcg->thresholds;
+		usage = mem_cgroup_usage(memcg, false);
+	} else if (type == _MEMSWAP) {
+		thresholds = &memcg->memsw_thresholds;
+		usage = mem_cgroup_usage(memcg, true);
+	} else
+		BUG();
+
+	if (!thresholds->primary)
+		goto unlock;
+
+	/* Check if a threshold crossed before removing */
+	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+	/* Calculate new number of threshold */
+	size = entries = 0;
+	for (i = 0; i < thresholds->primary->size; i++) {
+		if (thresholds->primary->entries[i].eventfd != eventfd)
+			size++;
+		else
+			entries++;
+	}
+
+	new = thresholds->spare;
+
+	/* If no items related to eventfd have been cleared, nothing to do */
+	if (!entries)
+		goto unlock;
+
+	/* Set thresholds array to NULL if we don't have thresholds */
+	if (!size) {
+		kfree(new);
+		new = NULL;
+		goto swap_buffers;
+	}
+
+	new->size = size;
+
+	/* Copy thresholds and find current threshold */
+	new->current_threshold = -1;
+	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+		if (thresholds->primary->entries[i].eventfd == eventfd)
+			continue;
+
+		new->entries[j] = thresholds->primary->entries[i];
+		if (new->entries[j].threshold <= usage) {
+			/*
+			 * new->current_threshold will not be used
+			 * until rcu_assign_pointer(), so it's safe to increment
+			 * it here.
+			 */
+			++new->current_threshold;
+		}
+		j++;
+	}
+
+swap_buffers:
+	/* Swap primary and spare array */
+	thresholds->spare = thresholds->primary;
+
+	rcu_assign_pointer(thresholds->primary, new);
+
+	/* To be sure that nobody uses thresholds */
+	synchronize_rcu();
+
+	/* If all events are unregistered, free the spare array */
+	if (!new) {
+		kfree(thresholds->spare);
+		thresholds->spare = NULL;
+	}
+unlock:
+	mutex_unlock(&memcg->thresholds_lock);
+}
+
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd)
+{
+	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd)
+{
+	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args)
+{
+	struct mem_cgroup_eventfd_list *event;
+
+	event = kmalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return -ENOMEM;
+
+	spin_lock(&memcg_oom_lock);
+
+	event->eventfd = eventfd;
+	list_add(&event->list, &memcg->oom_notify);
+
+	/* already in OOM? */
+	if (memcg->under_oom)
+		eventfd_signal(eventfd);
+	spin_unlock(&memcg_oom_lock);
+
+	return 0;
+}
+
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd)
+{
+	struct mem_cgroup_eventfd_list *ev, *tmp;
+
+	spin_lock(&memcg_oom_lock);
+
+	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
+		if (ev->eventfd == eventfd) {
+			list_del(&ev->list);
+			kfree(ev);
+		}
+	}
+
+	spin_unlock(&memcg_oom_lock);
+}
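
On the userspace side, the registered eventfd is a plain counter: each OOM
notification increments it and read() returns the accumulated count. A
hypothetical listener (not part of the patch) could look like:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Block until the memcg signals OOM; efd was registered via
 * cgroup.event_control against memory.oom_control. */
static void wait_for_oom(int efd)
{
	uint64_t cnt;

	while (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		printf("memcg OOM fired %llu time(s)\n",
		       (unsigned long long)cnt);
}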
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered.  It tries to support fully configurable
+ * events for each user.  Such a level of flexibility is completely
+ * unnecessary, especially in light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from a workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+	struct mem_cgroup_event *event =
+		container_of(work, struct mem_cgroup_event, remove);
+	struct mem_cgroup *memcg = event->memcg;
+
+	remove_wait_queue(event->wqh, &event->wait);
+
+	event->unregister_event(memcg, event->eventfd);
+
+	/* Notify userspace the event is going away. */
+	eventfd_signal(event->eventfd);
+
+	eventfd_ctx_put(event->eventfd);
+	kfree(event);
+	css_put(&memcg->css);
+}
+
+/*
+ * Gets called on EPOLLHUP on the eventfd when the user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+			    int sync, void *key)
+{
+	struct mem_cgroup_event *event =
+		container_of(wait, struct mem_cgroup_event, wait);
+	struct mem_cgroup *memcg = event->memcg;
+	__poll_t flags = key_to_poll(key);
+
+	if (flags & EPOLLHUP) {
+		/*
+		 * If the event has been detached at cgroup removal, we
+		 * can simply return knowing the other side will cleanup
+		 * for us.
+		 *
+		 * We can't race against event freeing since the other
+		 * side will require wqh->lock via remove_wait_queue(),
+		 * which we hold.
+		 */
+		spin_lock(&memcg->event_list_lock);
+		if (!list_empty(&event->list)) {
+			list_del_init(&event->list);
+			/*
+			 * We are in atomic context, but memcg_event_remove()
+			 * may sleep, so we have to call it from a workqueue.
+			 */
+			schedule_work(&event->remove);
+		}
+		spin_unlock(&memcg->event_list_lock);
+	}
+
+	return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+		wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct mem_cgroup_event *event =
+		container_of(pt, struct mem_cgroup_event, pt);
+
+	event->wqh = wqh;
+	add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup_event *event;
+	struct cgroup_subsys_state *cfile_css;
+	unsigned int efd, cfd;
+	struct fd efile;
+	struct fd cfile;
+	struct dentry *cdentry;
+	const char *name;
+	char *endp;
+	int ret;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		return -EOPNOTSUPP;
+
+	buf = strstrip(buf);
+
+	efd = simple_strtoul(buf, &endp, 10);
+	if (*endp != ' ')
+		return -EINVAL;
+	buf = endp + 1;
+
+	cfd = simple_strtoul(buf, &endp, 10);
+	if ((*endp != ' ') && (*endp != '\0'))
+		return -EINVAL;
+	buf = endp + 1;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return -ENOMEM;
+
+	event->memcg = memcg;
+	INIT_LIST_HEAD(&event->list);
+	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+	INIT_WORK(&event->remove, memcg_event_remove);
+
+	efile = fdget(efd);
+	if (!efile.file) {
+		ret = -EBADF;
+		goto out_kfree;
+	}
+
+	event->eventfd = eventfd_ctx_fileget(efile.file);
+	if (IS_ERR(event->eventfd)) {
+		ret = PTR_ERR(event->eventfd);
+		goto out_put_efile;
+	}
+
+	cfile = fdget(cfd);
+	if (!cfile.file) {
+		ret = -EBADF;
+		goto out_put_eventfd;
+	}
+
+	/* the process needs read permission on the control file */
+	/* AV: shouldn't we check that it's been opened for read instead? */
+	ret = file_permission(cfile.file, MAY_READ);
+	if (ret < 0)
+		goto out_put_cfile;
+
+	/*
+	 * The control file must be a regular cgroup1 file. As a regular cgroup
+	 * file can't be renamed, it's safe to access its name afterwards.
+	 */
+	cdentry = cfile.file->f_path.dentry;
+	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+		ret = -EINVAL;
+		goto out_put_cfile;
+	}
+
+	/*
+	 * Determine the event callbacks and set them in @event.  This used
+	 * to be done via struct cftype but cgroup core no longer knows
+	 * about these events.  The following is crude but the whole thing
+	 * is for compatibility anyway.
+	 *
+	 * DO NOT ADD NEW FILES.
+	 */
+	name = cdentry->d_name.name;
+
+	if (!strcmp(name, "memory.usage_in_bytes")) {
+		event->register_event = mem_cgroup_usage_register_event;
+		event->unregister_event = mem_cgroup_usage_unregister_event;
+	} else if (!strcmp(name, "memory.oom_control")) {
+		event->register_event = mem_cgroup_oom_register_event;
+		event->unregister_event = mem_cgroup_oom_unregister_event;
+	} else if (!strcmp(name, "memory.pressure_level")) {
+		event->register_event = vmpressure_register_event;
+		event->unregister_event = vmpressure_unregister_event;
+	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+		event->register_event = memsw_cgroup_usage_register_event;
+		event->unregister_event = memsw_cgroup_usage_unregister_event;
+	} else {
+		ret = -EINVAL;
+		goto out_put_cfile;
+	}
+
+	/*
+	 * Verify that @cfile belongs to @css.  Also, remaining events are
+	 * automatically removed on cgroup destruction but the removal is
+	 * asynchronous, so take an extra ref on @css.
+	 */
+	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
+					       &memory_cgrp_subsys);
+	ret = -EINVAL;
+	if (IS_ERR(cfile_css))
+		goto out_put_cfile;
+	if (cfile_css != css) {
+		css_put(cfile_css);
+		goto out_put_cfile;
+	}
+
+	ret = event->register_event(memcg, event->eventfd, buf);
+	if (ret)
+		goto out_put_css;
+
+	vfs_poll(efile.file, &event->pt);
+
+	spin_lock_irq(&memcg->event_list_lock);
+	list_add(&event->list, &memcg->event_list);
+	spin_unlock_irq(&memcg->event_list_lock);
+
+	fdput(cfile);
+	fdput(efile);
+
+	return nbytes;
+
+out_put_css:
+	css_put(css);
+out_put_cfile:
+	fdput(cfile);
+out_put_eventfd:
+	eventfd_ctx_put(event->eventfd);
+out_put_efile:
+	fdput(efile);
+out_kfree:
+	kfree(event);
+
+	return ret;
+}
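
For reference, the userspace counterpart of the parser above; a sketch
assuming a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory
(error handling trimmed, helper name is hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* Register a usage threshold on cgroup directory @cg; returns an eventfd
 * that becomes readable whenever the threshold is crossed. */
static int register_usage_threshold(const char *cg, unsigned long bytes)
{
	char path[256], line[64];
	int efd, cfd, ctl;

	efd = eventfd(0, 0);				/* <event_fd> */

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
	cfd = open(path, O_RDONLY);			/* <control_fd> */

	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	ctl = open(path, O_WRONLY);

	/* '<event_fd> <control_fd> <args>', exactly as parsed above */
	snprintf(line, sizeof(line), "%d %d %lu", efd, cfd, bytes);
	if (write(ctl, line, strlen(line)) < 0)
		efd = -1;

	close(ctl);
	close(cfd);	/* the kernel took its own references */
	return efd;
}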
+
+void memcg1_css_offline(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_event *event, *tmp;
+
+	/*
+	 * Unregister events and notify userspace.
+	 * Notify userspace about cgroup removal only after rmdir of the
+	 * cgroup directory, to avoid a race between userspace and the kernel.
+	 */
+	spin_lock_irq(&memcg->event_list_lock);
+	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+		list_del_init(&event->list);
+		schedule_work(&event->remove);
+	}
+	spin_unlock_irq(&memcg->event_list_lock);
+}
+
 static int __init memcg1_init(void)
 {
 	int node;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index d377c0be9880..524a2c76ffc9 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -41,4 +41,55 @@  u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
 int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 				 struct cftype *cft, u64 val);
 
+/*
+ * The per-memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg events.
+ */
+enum mem_cgroup_events_target {
+	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_NTARGETS,
+};
+
+/* Whether legacy memory+swap accounting is active */
+static inline bool do_memsw_account(void)
+{
+	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
+}
+
+/*
+ * Iteration constructs for visiting all cgroups (under a tree).  If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root)		\
+	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter)			\
+	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(NULL, iter, NULL))
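
The reference-counting rule in the comment above in practice; a sketch (not
from this patch) of an early exit from the tree walk:

static bool tree_has_oom(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;
	bool ret = false;

	for_each_mem_cgroup_tree(iter, root) {
		if (iter->under_oom) {
			ret = true;
			/* leaving the loop early: drop the iterator's css ref */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
	return ret;
}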
+
+void memcg1_css_offline(struct mem_cgroup *memcg);
+
+/* for encoding cft->private value on file */
+enum res_type {
+	_MEM,
+	_MEMSWAP,
+	_KMEM,
+	_TCP,
+};
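
res_type values are packed into cft->private together with a resource
attribute via the MEMFILE_* macros that stay in memcontrol.c (type in the
high 16 bits, attribute in the low 16). A hypothetical read handler shows
the decode:

static u64 hypothetical_usage_read(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	enum res_type type = MEMFILE_TYPE(cft->private);

	/* mem_cgroup_usage() counts pages; report bytes like v1 files do */
	return (u64)mem_cgroup_usage(memcg, type == _MEMSWAP) * PAGE_SIZE;
}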
+
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+				enum mem_cgroup_events_target target);
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off);
+
 #endif	/* __MM_MEMCONTROL_V1_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index da2c0fa0de1b..bd4b26a73596 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -46,9 +46,6 @@ 
 #include <linux/slab.h>
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
-#include <linux/eventfd.h>
-#include <linux/poll.h>
-#include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/parser.h>
@@ -59,7 +56,6 @@ 
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include <linux/lockdep.h>
-#include <linux/file.h>
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
@@ -97,91 +93,13 @@  static bool cgroup_memory_nobpf __ro_after_init;
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
 
-/* Whether legacy memory+swap accounting is active */
-static bool do_memsw_account(void)
-{
-	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
-}
-
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
-/* for OOM */
-struct mem_cgroup_eventfd_list {
-	struct list_head list;
-	struct eventfd_ctx *eventfd;
-};
-
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct mem_cgroup_event {
-	/*
-	 * memcg which the event belongs to.
-	 */
-	struct mem_cgroup *memcg;
-	/*
-	 * eventfd to signal userspace about the event.
-	 */
-	struct eventfd_ctx *eventfd;
-	/*
-	 * Each of these stored in a list by the cgroup.
-	 */
-	struct list_head list;
-	/*
-	 * register_event() callback will be used to add new userspace
-	 * waiter for changes related to this event.  Use eventfd_signal()
-	 * on eventfd to send notification to userspace.
-	 */
-	int (*register_event)(struct mem_cgroup *memcg,
-			      struct eventfd_ctx *eventfd, const char *args);
-	/*
-	 * unregister_event() callback will be called when userspace closes
-	 * the eventfd or on cgroup removing.  This callback must be set,
-	 * if you want provide notification functionality.
-	 */
-	void (*unregister_event)(struct mem_cgroup *memcg,
-				 struct eventfd_ctx *eventfd);
-	/*
-	 * All fields below needed to unregister event when
-	 * userspace closes eventfd.
-	 */
-	poll_table pt;
-	wait_queue_head_t *wqh;
-	wait_queue_entry_t wait;
-	struct work_struct remove;
-};
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg);
-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-
-/* for encoding cft->private value on file */
-enum res_type {
-	_MEM,
-	_MEMSWAP,
-	_KMEM,
-	_TCP,
-};
-
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
 
-/*
- * Iteration constructs for visiting all cgroups (under a tree).  If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root)		\
-	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
-	     iter != NULL;				\
-	     iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter)			\
-	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
-	     iter != NULL;				\
-	     iter = mem_cgroup_iter(NULL, iter, NULL))
-
 static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@@ -940,8 +858,8 @@  void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
 	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 }
 
-static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
-				       enum mem_cgroup_events_target target)
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+				enum mem_cgroup_events_target target)
 {
 	unsigned long val, next;
 
@@ -965,28 +883,6 @@  static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 	return false;
 }
 
-/*
- * Check events in order.
- *
- */
-void memcg_check_events(struct mem_cgroup *memcg, int nid)
-{
-	if (IS_ENABLED(CONFIG_PREEMPT_RT))
-		return;
-
-	/* threshold event is triggered in finer grain than soft limit */
-	if (unlikely(mem_cgroup_event_ratelimit(memcg,
-						MEM_CGROUP_TARGET_THRESH))) {
-		bool do_softlimit;
-
-		do_softlimit = mem_cgroup_event_ratelimit(memcg,
-						MEM_CGROUP_TARGET_SOFTLIMIT);
-		mem_cgroup_threshold(memcg);
-		if (unlikely(do_softlimit))
-			memcg1_update_tree(memcg, nid);
-	}
-}
-
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
 	/*
@@ -1726,7 +1622,7 @@  static struct lockdep_map memcg_oom_lock_dep_map = {
 };
 #endif
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
+DEFINE_SPINLOCK(memcg_oom_lock);
 
 /*
  * Check OOM-Killer is already running under our hierarchy.
@@ -3545,7 +3441,7 @@  static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return -EINVAL;
 }
 
-static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	unsigned long val;
 
@@ -4046,331 +3942,6 @@  static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
-{
-	struct mem_cgroup_threshold_ary *t;
-	unsigned long usage;
-	int i;
-
-	rcu_read_lock();
-	if (!swap)
-		t = rcu_dereference(memcg->thresholds.primary);
-	else
-		t = rcu_dereference(memcg->memsw_thresholds.primary);
-
-	if (!t)
-		goto unlock;
-
-	usage = mem_cgroup_usage(memcg, swap);
-
-	/*
-	 * current_threshold points to threshold just below or equal to usage.
-	 * If it's not true, a threshold was crossed after last
-	 * call of __mem_cgroup_threshold().
-	 */
-	i = t->current_threshold;
-
-	/*
-	 * Iterate backward over array of thresholds starting from
-	 * current_threshold and check if a threshold is crossed.
-	 * If none of thresholds below usage is crossed, we read
-	 * only one element of the array here.
-	 */
-	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
-		eventfd_signal(t->entries[i].eventfd);
-
-	/* i = current_threshold + 1 */
-	i++;
-
-	/*
-	 * Iterate forward over array of thresholds starting from
-	 * current_threshold+1 and check if a threshold is crossed.
-	 * If none of thresholds above usage is crossed, we read
-	 * only one element of the array here.
-	 */
-	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
-		eventfd_signal(t->entries[i].eventfd);
-
-	/* Update current_threshold */
-	t->current_threshold = i - 1;
-unlock:
-	rcu_read_unlock();
-}
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg)
-{
-	while (memcg) {
-		__mem_cgroup_threshold(memcg, false);
-		if (do_memsw_account())
-			__mem_cgroup_threshold(memcg, true);
-
-		memcg = parent_mem_cgroup(memcg);
-	}
-}
-
-static int compare_thresholds(const void *a, const void *b)
-{
-	const struct mem_cgroup_threshold *_a = a;
-	const struct mem_cgroup_threshold *_b = b;
-
-	if (_a->threshold > _b->threshold)
-		return 1;
-
-	if (_a->threshold < _b->threshold)
-		return -1;
-
-	return 0;
-}
-
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup_eventfd_list *ev;
-
-	spin_lock(&memcg_oom_lock);
-
-	list_for_each_entry(ev, &memcg->oom_notify, list)
-		eventfd_signal(ev->eventfd);
-
-	spin_unlock(&memcg_oom_lock);
-	return 0;
-}
-
-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	for_each_mem_cgroup_tree(iter, memcg)
-		mem_cgroup_oom_notify_cb(iter);
-}
-
-static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
-{
-	struct mem_cgroup_thresholds *thresholds;
-	struct mem_cgroup_threshold_ary *new;
-	unsigned long threshold;
-	unsigned long usage;
-	int i, size, ret;
-
-	ret = page_counter_memparse(args, "-1", &threshold);
-	if (ret)
-		return ret;
-
-	mutex_lock(&memcg->thresholds_lock);
-
-	if (type == _MEM) {
-		thresholds = &memcg->thresholds;
-		usage = mem_cgroup_usage(memcg, false);
-	} else if (type == _MEMSWAP) {
-		thresholds = &memcg->memsw_thresholds;
-		usage = mem_cgroup_usage(memcg, true);
-	} else
-		BUG();
-
-	/* Check if a threshold crossed before adding a new one */
-	if (thresholds->primary)
-		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
-	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
-
-	/* Allocate memory for new array of thresholds */
-	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
-	if (!new) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
-	new->size = size;
-
-	/* Copy thresholds (if any) to new array */
-	if (thresholds->primary)
-		memcpy(new->entries, thresholds->primary->entries,
-		       flex_array_size(new, entries, size - 1));
-
-	/* Add new threshold */
-	new->entries[size - 1].eventfd = eventfd;
-	new->entries[size - 1].threshold = threshold;
-
-	/* Sort thresholds. Registering of new threshold isn't time-critical */
-	sort(new->entries, size, sizeof(*new->entries),
-			compare_thresholds, NULL);
-
-	/* Find current threshold */
-	new->current_threshold = -1;
-	for (i = 0; i < size; i++) {
-		if (new->entries[i].threshold <= usage) {
-			/*
-			 * new->current_threshold will not be used until
-			 * rcu_assign_pointer(), so it's safe to increment
-			 * it here.
-			 */
-			++new->current_threshold;
-		} else
-			break;
-	}
-
-	/* Free old spare buffer and save old primary buffer as spare */
-	kfree(thresholds->spare);
-	thresholds->spare = thresholds->primary;
-
-	rcu_assign_pointer(thresholds->primary, new);
-
-	/* To be sure that nobody uses thresholds */
-	synchronize_rcu();
-
-unlock:
-	mutex_unlock(&memcg->thresholds_lock);
-
-	return ret;
-}
-
-static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd, const char *args)
-{
-	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
-}
-
-static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd, const char *args)
-{
-	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
-}
-
-static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd, enum res_type type)
-{
-	struct mem_cgroup_thresholds *thresholds;
-	struct mem_cgroup_threshold_ary *new;
-	unsigned long usage;
-	int i, j, size, entries;
-
-	mutex_lock(&memcg->thresholds_lock);
-
-	if (type == _MEM) {
-		thresholds = &memcg->thresholds;
-		usage = mem_cgroup_usage(memcg, false);
-	} else if (type == _MEMSWAP) {
-		thresholds = &memcg->memsw_thresholds;
-		usage = mem_cgroup_usage(memcg, true);
-	} else
-		BUG();
-
-	if (!thresholds->primary)
-		goto unlock;
-
-	/* Check if a threshold crossed before removing */
-	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
-	/* Calculate new number of threshold */
-	size = entries = 0;
-	for (i = 0; i < thresholds->primary->size; i++) {
-		if (thresholds->primary->entries[i].eventfd != eventfd)
-			size++;
-		else
-			entries++;
-	}
-
-	new = thresholds->spare;
-
-	/* If no items related to eventfd have been cleared, nothing to do */
-	if (!entries)
-		goto unlock;
-
-	/* Set thresholds array to NULL if we don't have thresholds */
-	if (!size) {
-		kfree(new);
-		new = NULL;
-		goto swap_buffers;
-	}
-
-	new->size = size;
-
-	/* Copy thresholds and find current threshold */
-	new->current_threshold = -1;
-	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
-		if (thresholds->primary->entries[i].eventfd == eventfd)
-			continue;
-
-		new->entries[j] = thresholds->primary->entries[i];
-		if (new->entries[j].threshold <= usage) {
-			/*
-			 * new->current_threshold will not be used
-			 * until rcu_assign_pointer(), so it's safe to increment
-			 * it here.
-			 */
-			++new->current_threshold;
-		}
-		j++;
-	}
-
-swap_buffers:
-	/* Swap primary and spare array */
-	thresholds->spare = thresholds->primary;
-
-	rcu_assign_pointer(thresholds->primary, new);
-
-	/* To be sure that nobody uses thresholds */
-	synchronize_rcu();
-
-	/* If all events are unregistered, free the spare array */
-	if (!new) {
-		kfree(thresholds->spare);
-		thresholds->spare = NULL;
-	}
-unlock:
-	mutex_unlock(&memcg->thresholds_lock);
-}
-
-static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd)
-{
-	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
-}
-
-static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd)
-{
-	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
-}
-
-static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd, const char *args)
-{
-	struct mem_cgroup_eventfd_list *event;
-
-	event = kmalloc(sizeof(*event),	GFP_KERNEL);
-	if (!event)
-		return -ENOMEM;
-
-	spin_lock(&memcg_oom_lock);
-
-	event->eventfd = eventfd;
-	list_add(&event->list, &memcg->oom_notify);
-
-	/* already in OOM ? */
-	if (memcg->under_oom)
-		eventfd_signal(eventfd);
-	spin_unlock(&memcg_oom_lock);
-
-	return 0;
-}
-
-static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
-	struct eventfd_ctx *eventfd)
-{
-	struct mem_cgroup_eventfd_list *ev, *tmp;
-
-	spin_lock(&memcg_oom_lock);
-
-	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
-		if (ev->eventfd == eventfd) {
-			list_del(&ev->list);
-			kfree(ev);
-		}
-	}
-
-	spin_unlock(&memcg_oom_lock);
-}
-
 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
@@ -4611,243 +4182,6 @@  static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
-/*
- * DO NOT USE IN NEW FILES.
- *
- * "cgroup.event_control" implementation.
- *
- * This is way over-engineered.  It tries to support fully configurable
- * events for each user.  Such level of flexibility is completely
- * unnecessary especially in the light of the planned unified hierarchy.
- *
- * Please deprecate this and replace with something simpler if at all
- * possible.
- */
-
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void memcg_event_remove(struct work_struct *work)
-{
-	struct mem_cgroup_event *event =
-		container_of(work, struct mem_cgroup_event, remove);
-	struct mem_cgroup *memcg = event->memcg;
-
-	remove_wait_queue(event->wqh, &event->wait);
-
-	event->unregister_event(memcg, event->eventfd);
-
-	/* Notify userspace the event is going away. */
-	eventfd_signal(event->eventfd);
-
-	eventfd_ctx_put(event->eventfd);
-	kfree(event);
-	css_put(&memcg->css);
-}
-
-/*
- * Gets called on EPOLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
-			    int sync, void *key)
-{
-	struct mem_cgroup_event *event =
-		container_of(wait, struct mem_cgroup_event, wait);
-	struct mem_cgroup *memcg = event->memcg;
-	__poll_t flags = key_to_poll(key);
-
-	if (flags & EPOLLHUP) {
-		/*
-		 * If the event has been detached at cgroup removal, we
-		 * can simply return knowing the other side will cleanup
-		 * for us.
-		 *
-		 * We can't race against event freeing since the other
-		 * side will require wqh->lock via remove_wait_queue(),
-		 * which we hold.
-		 */
-		spin_lock(&memcg->event_list_lock);
-		if (!list_empty(&event->list)) {
-			list_del_init(&event->list);
-			/*
-			 * We are in atomic context, but cgroup_event_remove()
-			 * may sleep, so we have to call it in workqueue.
-			 */
-			schedule_work(&event->remove);
-		}
-		spin_unlock(&memcg->event_list_lock);
-	}
-
-	return 0;
-}
-
-static void memcg_event_ptable_queue_proc(struct file *file,
-		wait_queue_head_t *wqh, poll_table *pt)
-{
-	struct mem_cgroup_event *event =
-		container_of(pt, struct mem_cgroup_event, pt);
-
-	event->wqh = wqh;
-	add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * DO NOT USE IN NEW FILES.
- *
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
-					 char *buf, size_t nbytes, loff_t off)
-{
-	struct cgroup_subsys_state *css = of_css(of);
-	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	struct mem_cgroup_event *event;
-	struct cgroup_subsys_state *cfile_css;
-	unsigned int efd, cfd;
-	struct fd efile;
-	struct fd cfile;
-	struct dentry *cdentry;
-	const char *name;
-	char *endp;
-	int ret;
-
-	if (IS_ENABLED(CONFIG_PREEMPT_RT))
-		return -EOPNOTSUPP;
-
-	buf = strstrip(buf);
-
-	efd = simple_strtoul(buf, &endp, 10);
-	if (*endp != ' ')
-		return -EINVAL;
-	buf = endp + 1;
-
-	cfd = simple_strtoul(buf, &endp, 10);
-	if ((*endp != ' ') && (*endp != '\0'))
-		return -EINVAL;
-	buf = endp + 1;
-
-	event = kzalloc(sizeof(*event), GFP_KERNEL);
-	if (!event)
-		return -ENOMEM;
-
-	event->memcg = memcg;
-	INIT_LIST_HEAD(&event->list);
-	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
-	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
-	INIT_WORK(&event->remove, memcg_event_remove);
-
-	efile = fdget(efd);
-	if (!efile.file) {
-		ret = -EBADF;
-		goto out_kfree;
-	}
-
-	event->eventfd = eventfd_ctx_fileget(efile.file);
-	if (IS_ERR(event->eventfd)) {
-		ret = PTR_ERR(event->eventfd);
-		goto out_put_efile;
-	}
-
-	cfile = fdget(cfd);
-	if (!cfile.file) {
-		ret = -EBADF;
-		goto out_put_eventfd;
-	}
-
-	/* the process need read permission on control file */
-	/* AV: shouldn't we check that it's been opened for read instead? */
-	ret = file_permission(cfile.file, MAY_READ);
-	if (ret < 0)
-		goto out_put_cfile;
-
-	/*
-	 * The control file must be a regular cgroup1 file. As a regular cgroup
-	 * file can't be renamed, it's safe to access its name afterwards.
-	 */
-	cdentry = cfile.file->f_path.dentry;
-	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
-		ret = -EINVAL;
-		goto out_put_cfile;
-	}
-
-	/*
-	 * Determine the event callbacks and set them in @event.  This used
-	 * to be done via struct cftype but cgroup core no longer knows
-	 * about these events.  The following is crude but the whole thing
-	 * is for compatibility anyway.
-	 *
-	 * DO NOT ADD NEW FILES.
-	 */
-	name = cdentry->d_name.name;
-
-	if (!strcmp(name, "memory.usage_in_bytes")) {
-		event->register_event = mem_cgroup_usage_register_event;
-		event->unregister_event = mem_cgroup_usage_unregister_event;
-	} else if (!strcmp(name, "memory.oom_control")) {
-		event->register_event = mem_cgroup_oom_register_event;
-		event->unregister_event = mem_cgroup_oom_unregister_event;
-	} else if (!strcmp(name, "memory.pressure_level")) {
-		event->register_event = vmpressure_register_event;
-		event->unregister_event = vmpressure_unregister_event;
-	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
-		event->register_event = memsw_cgroup_usage_register_event;
-		event->unregister_event = memsw_cgroup_usage_unregister_event;
-	} else {
-		ret = -EINVAL;
-		goto out_put_cfile;
-	}
-
-	/*
-	 * Verify @cfile should belong to @css.  Also, remaining events are
-	 * automatically removed on cgroup destruction but the removal is
-	 * asynchronous, so take an extra ref on @css.
-	 */
-	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
-					       &memory_cgrp_subsys);
-	ret = -EINVAL;
-	if (IS_ERR(cfile_css))
-		goto out_put_cfile;
-	if (cfile_css != css) {
-		css_put(cfile_css);
-		goto out_put_cfile;
-	}
-
-	ret = event->register_event(memcg, event->eventfd, buf);
-	if (ret)
-		goto out_put_css;
-
-	vfs_poll(efile.file, &event->pt);
-
-	spin_lock_irq(&memcg->event_list_lock);
-	list_add(&event->list, &memcg->event_list);
-	spin_unlock_irq(&memcg->event_list_lock);
-
-	fdput(cfile);
-	fdput(efile);
-
-	return nbytes;
-
-out_put_css:
-	css_put(css);
-out_put_cfile:
-	fdput(cfile);
-out_put_eventfd:
-	eventfd_ctx_put(event->eventfd);
-out_put_efile:
-	fdput(efile);
-out_kfree:
-	kfree(event);
-
-	return ret;
-}
-
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
 static int mem_cgroup_slab_show(struct seq_file *m, void *p)
 {
@@ -5314,19 +4648,8 @@  static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	struct mem_cgroup_event *event, *tmp;
 
-	/*
-	 * Unregister events and notify userspace.
-	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace.
-	 */
-	spin_lock_irq(&memcg->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
-		list_del_init(&event->list);
-		schedule_work(&event->remove);
-	}
-	spin_unlock_irq(&memcg->event_list_lock);
+	memcg1_css_offline(memcg);
 
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);