Message ID | 20240625005906.106920-7-roman.gushchin@linux.dev (mailing list archive)
---|---
State | New
Series | mm: memcg: separate legacy cgroup v1 code and put under config option
On Mon 24-06-24 17:58:58, Roman Gushchin wrote: > Cgroup v1's memory controller contains a pretty complicated > event notifications mechanism which is not used on cgroup v2. > Let's move the corresponding code into memcontrol-v1.c. > > Please, note, that mem_cgroup_event_ratelimit() remains in > memcontrol.c, otherwise it would require exporting too many > details on memcg stats outside of memcontrol.c. > > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> Acked-by: Michal Hocko <mhocko@suse.com> > --- > include/linux/memcontrol.h | 12 - > mm/memcontrol-v1.c | 653 +++++++++++++++++++++++++++++++++++ > mm/memcontrol-v1.h | 51 +++ > mm/memcontrol.c | 687 +------------------------------------ > 4 files changed, 709 insertions(+), 694 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 83c8327455d8..588179d29849 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -69,18 +69,6 @@ struct mem_cgroup_id { > refcount_t ref; > }; > > -/* > - * Per memcg event counter is incremented at every pagein/pageout. With THP, > - * it will be incremented by the number of pages. This counter is used > - * to trigger some periodic events. This is straightforward and better > - * than using jiffies etc. to handle periodic memcg event. > - */ > -enum mem_cgroup_events_target { > - MEM_CGROUP_TARGET_THRESH, > - MEM_CGROUP_TARGET_SOFTLIMIT, > - MEM_CGROUP_NTARGETS, > -}; > - > struct memcg_vmstats_percpu; > struct memcg_vmstats; > struct lruvec_stats_percpu; > diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c > index c25e038ac874..4b2290ceace6 100644 > --- a/mm/memcontrol-v1.c > +++ b/mm/memcontrol-v1.c > @@ -6,6 +6,10 @@ > #include <linux/pagewalk.h> > #include <linux/backing-dev.h> > #include <linux/swap_cgroup.h> > +#include <linux/eventfd.h> > +#include <linux/poll.h> > +#include <linux/sort.h> > +#include <linux/file.h> > > #include "internal.h" > #include "swap.h" > @@ -60,6 +64,54 @@ static struct move_charge_struct { > .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), > }; > > +/* for OOM */ > +struct mem_cgroup_eventfd_list { > + struct list_head list; > + struct eventfd_ctx *eventfd; > +}; > + > +/* > + * cgroup_event represents events which userspace want to receive. > + */ > +struct mem_cgroup_event { > + /* > + * memcg which the event belongs to. > + */ > + struct mem_cgroup *memcg; > + /* > + * eventfd to signal userspace about the event. > + */ > + struct eventfd_ctx *eventfd; > + /* > + * Each of these stored in a list by the cgroup. > + */ > + struct list_head list; > + /* > + * register_event() callback will be used to add new userspace > + * waiter for changes related to this event. Use eventfd_signal() > + * on eventfd to send notification to userspace. > + */ > + int (*register_event)(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd, const char *args); > + /* > + * unregister_event() callback will be called when userspace closes > + * the eventfd or on cgroup removing. This callback must be set, > + * if you want provide notification functionality. > + */ > + void (*unregister_event)(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd); > + /* > + * All fields below needed to unregister event when > + * userspace closes eventfd. 
> + */ > + poll_table pt; > + wait_queue_head_t *wqh; > + wait_queue_entry_t wait; > + struct work_struct remove; > +}; > + > +extern spinlock_t memcg_oom_lock; > + > static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, > struct mem_cgroup_tree_per_node *mctz, > unsigned long new_usage_in_excess) > @@ -1306,6 +1358,607 @@ void memcg1_move_task(void) > } > #endif > > +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) > +{ > + struct mem_cgroup_threshold_ary *t; > + unsigned long usage; > + int i; > + > + rcu_read_lock(); > + if (!swap) > + t = rcu_dereference(memcg->thresholds.primary); > + else > + t = rcu_dereference(memcg->memsw_thresholds.primary); > + > + if (!t) > + goto unlock; > + > + usage = mem_cgroup_usage(memcg, swap); > + > + /* > + * current_threshold points to threshold just below or equal to usage. > + * If it's not true, a threshold was crossed after last > + * call of __mem_cgroup_threshold(). > + */ > + i = t->current_threshold; > + > + /* > + * Iterate backward over array of thresholds starting from > + * current_threshold and check if a threshold is crossed. > + * If none of thresholds below usage is crossed, we read > + * only one element of the array here. > + */ > + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) > + eventfd_signal(t->entries[i].eventfd); > + > + /* i = current_threshold + 1 */ > + i++; > + > + /* > + * Iterate forward over array of thresholds starting from > + * current_threshold+1 and check if a threshold is crossed. > + * If none of thresholds above usage is crossed, we read > + * only one element of the array here. > + */ > + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) > + eventfd_signal(t->entries[i].eventfd); > + > + /* Update current_threshold */ > + t->current_threshold = i - 1; > +unlock: > + rcu_read_unlock(); > +} > + > +static void mem_cgroup_threshold(struct mem_cgroup *memcg) > +{ > + while (memcg) { > + __mem_cgroup_threshold(memcg, false); > + if (do_memsw_account()) > + __mem_cgroup_threshold(memcg, true); > + > + memcg = parent_mem_cgroup(memcg); > + } > +} > + > +/* > + * Check events in order. 
> + * > + */ > +void memcg_check_events(struct mem_cgroup *memcg, int nid) > +{ > + if (IS_ENABLED(CONFIG_PREEMPT_RT)) > + return; > + > + /* threshold event is triggered in finer grain than soft limit */ > + if (unlikely(mem_cgroup_event_ratelimit(memcg, > + MEM_CGROUP_TARGET_THRESH))) { > + bool do_softlimit; > + > + do_softlimit = mem_cgroup_event_ratelimit(memcg, > + MEM_CGROUP_TARGET_SOFTLIMIT); > + mem_cgroup_threshold(memcg); > + if (unlikely(do_softlimit)) > + memcg1_update_tree(memcg, nid); > + } > +} > + > +static int compare_thresholds(const void *a, const void *b) > +{ > + const struct mem_cgroup_threshold *_a = a; > + const struct mem_cgroup_threshold *_b = b; > + > + if (_a->threshold > _b->threshold) > + return 1; > + > + if (_a->threshold < _b->threshold) > + return -1; > + > + return 0; > +} > + > +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) > +{ > + struct mem_cgroup_eventfd_list *ev; > + > + spin_lock(&memcg_oom_lock); > + > + list_for_each_entry(ev, &memcg->oom_notify, list) > + eventfd_signal(ev->eventfd); > + > + spin_unlock(&memcg_oom_lock); > + return 0; > +} > + > +void mem_cgroup_oom_notify(struct mem_cgroup *memcg) > +{ > + struct mem_cgroup *iter; > + > + for_each_mem_cgroup_tree(iter, memcg) > + mem_cgroup_oom_notify_cb(iter); > +} > + > +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd, const char *args, enum res_type type) > +{ > + struct mem_cgroup_thresholds *thresholds; > + struct mem_cgroup_threshold_ary *new; > + unsigned long threshold; > + unsigned long usage; > + int i, size, ret; > + > + ret = page_counter_memparse(args, "-1", &threshold); > + if (ret) > + return ret; > + > + mutex_lock(&memcg->thresholds_lock); > + > + if (type == _MEM) { > + thresholds = &memcg->thresholds; > + usage = mem_cgroup_usage(memcg, false); > + } else if (type == _MEMSWAP) { > + thresholds = &memcg->memsw_thresholds; > + usage = mem_cgroup_usage(memcg, true); > + } else > + BUG(); > + > + /* Check if a threshold crossed before adding a new one */ > + if (thresholds->primary) > + __mem_cgroup_threshold(memcg, type == _MEMSWAP); > + > + size = thresholds->primary ? thresholds->primary->size + 1 : 1; > + > + /* Allocate memory for new array of thresholds */ > + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); > + if (!new) { > + ret = -ENOMEM; > + goto unlock; > + } > + new->size = size; > + > + /* Copy thresholds (if any) to new array */ > + if (thresholds->primary) > + memcpy(new->entries, thresholds->primary->entries, > + flex_array_size(new, entries, size - 1)); > + > + /* Add new threshold */ > + new->entries[size - 1].eventfd = eventfd; > + new->entries[size - 1].threshold = threshold; > + > + /* Sort thresholds. Registering of new threshold isn't time-critical */ > + sort(new->entries, size, sizeof(*new->entries), > + compare_thresholds, NULL); > + > + /* Find current threshold */ > + new->current_threshold = -1; > + for (i = 0; i < size; i++) { > + if (new->entries[i].threshold <= usage) { > + /* > + * new->current_threshold will not be used until > + * rcu_assign_pointer(), so it's safe to increment > + * it here. 
> + */ > + ++new->current_threshold; > + } else > + break; > + } > + > + /* Free old spare buffer and save old primary buffer as spare */ > + kfree(thresholds->spare); > + thresholds->spare = thresholds->primary; > + > + rcu_assign_pointer(thresholds->primary, new); > + > + /* To be sure that nobody uses thresholds */ > + synchronize_rcu(); > + > +unlock: > + mutex_unlock(&memcg->thresholds_lock); > + > + return ret; > +} > + > +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd, const char *args) > +{ > + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); > +} > + > +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd, const char *args) > +{ > + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); > +} > + > +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd, enum res_type type) > +{ > + struct mem_cgroup_thresholds *thresholds; > + struct mem_cgroup_threshold_ary *new; > + unsigned long usage; > + int i, j, size, entries; > + > + mutex_lock(&memcg->thresholds_lock); > + > + if (type == _MEM) { > + thresholds = &memcg->thresholds; > + usage = mem_cgroup_usage(memcg, false); > + } else if (type == _MEMSWAP) { > + thresholds = &memcg->memsw_thresholds; > + usage = mem_cgroup_usage(memcg, true); > + } else > + BUG(); > + > + if (!thresholds->primary) > + goto unlock; > + > + /* Check if a threshold crossed before removing */ > + __mem_cgroup_threshold(memcg, type == _MEMSWAP); > + > + /* Calculate new number of threshold */ > + size = entries = 0; > + for (i = 0; i < thresholds->primary->size; i++) { > + if (thresholds->primary->entries[i].eventfd != eventfd) > + size++; > + else > + entries++; > + } > + > + new = thresholds->spare; > + > + /* If no items related to eventfd have been cleared, nothing to do */ > + if (!entries) > + goto unlock; > + > + /* Set thresholds array to NULL if we don't have thresholds */ > + if (!size) { > + kfree(new); > + new = NULL; > + goto swap_buffers; > + } > + > + new->size = size; > + > + /* Copy thresholds and find current threshold */ > + new->current_threshold = -1; > + for (i = 0, j = 0; i < thresholds->primary->size; i++) { > + if (thresholds->primary->entries[i].eventfd == eventfd) > + continue; > + > + new->entries[j] = thresholds->primary->entries[i]; > + if (new->entries[j].threshold <= usage) { > + /* > + * new->current_threshold will not be used > + * until rcu_assign_pointer(), so it's safe to increment > + * it here. 
> + */ > + ++new->current_threshold; > + } > + j++; > + } > + > +swap_buffers: > + /* Swap primary and spare array */ > + thresholds->spare = thresholds->primary; > + > + rcu_assign_pointer(thresholds->primary, new); > + > + /* To be sure that nobody uses thresholds */ > + synchronize_rcu(); > + > + /* If all events are unregistered, free the spare array */ > + if (!new) { > + kfree(thresholds->spare); > + thresholds->spare = NULL; > + } > +unlock: > + mutex_unlock(&memcg->thresholds_lock); > +} > + > +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd) > +{ > + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); > +} > + > +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd) > +{ > + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); > +} > + > +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd, const char *args) > +{ > + struct mem_cgroup_eventfd_list *event; > + > + event = kmalloc(sizeof(*event), GFP_KERNEL); > + if (!event) > + return -ENOMEM; > + > + spin_lock(&memcg_oom_lock); > + > + event->eventfd = eventfd; > + list_add(&event->list, &memcg->oom_notify); > + > + /* already in OOM ? */ > + if (memcg->under_oom) > + eventfd_signal(eventfd); > + spin_unlock(&memcg_oom_lock); > + > + return 0; > +} > + > +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, > + struct eventfd_ctx *eventfd) > +{ > + struct mem_cgroup_eventfd_list *ev, *tmp; > + > + spin_lock(&memcg_oom_lock); > + > + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { > + if (ev->eventfd == eventfd) { > + list_del(&ev->list); > + kfree(ev); > + } > + } > + > + spin_unlock(&memcg_oom_lock); > +} > + > +/* > + * DO NOT USE IN NEW FILES. > + * > + * "cgroup.event_control" implementation. > + * > + * This is way over-engineered. It tries to support fully configurable > + * events for each user. Such level of flexibility is completely > + * unnecessary especially in the light of the planned unified hierarchy. > + * > + * Please deprecate this and replace with something simpler if at all > + * possible. > + */ > + > +/* > + * Unregister event and free resources. > + * > + * Gets called from workqueue. > + */ > +static void memcg_event_remove(struct work_struct *work) > +{ > + struct mem_cgroup_event *event = > + container_of(work, struct mem_cgroup_event, remove); > + struct mem_cgroup *memcg = event->memcg; > + > + remove_wait_queue(event->wqh, &event->wait); > + > + event->unregister_event(memcg, event->eventfd); > + > + /* Notify userspace the event is going away. */ > + eventfd_signal(event->eventfd); > + > + eventfd_ctx_put(event->eventfd); > + kfree(event); > + css_put(&memcg->css); > +} > + > +/* > + * Gets called on EPOLLHUP on eventfd when user closes it. > + * > + * Called with wqh->lock held and interrupts disabled. > + */ > +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, > + int sync, void *key) > +{ > + struct mem_cgroup_event *event = > + container_of(wait, struct mem_cgroup_event, wait); > + struct mem_cgroup *memcg = event->memcg; > + __poll_t flags = key_to_poll(key); > + > + if (flags & EPOLLHUP) { > + /* > + * If the event has been detached at cgroup removal, we > + * can simply return knowing the other side will cleanup > + * for us. 
> + * > + * We can't race against event freeing since the other > + * side will require wqh->lock via remove_wait_queue(), > + * which we hold. > + */ > + spin_lock(&memcg->event_list_lock); > + if (!list_empty(&event->list)) { > + list_del_init(&event->list); > + /* > + * We are in atomic context, but cgroup_event_remove() > + * may sleep, so we have to call it in workqueue. > + */ > + schedule_work(&event->remove); > + } > + spin_unlock(&memcg->event_list_lock); > + } > + > + return 0; > +} > + > +static void memcg_event_ptable_queue_proc(struct file *file, > + wait_queue_head_t *wqh, poll_table *pt) > +{ > + struct mem_cgroup_event *event = > + container_of(pt, struct mem_cgroup_event, pt); > + > + event->wqh = wqh; > + add_wait_queue(wqh, &event->wait); > +} > + > +/* > + * DO NOT USE IN NEW FILES. > + * > + * Parse input and register new cgroup event handler. > + * > + * Input must be in format '<event_fd> <control_fd> <args>'. > + * Interpretation of args is defined by control file implementation. > + */ > +ssize_t memcg_write_event_control(struct kernfs_open_file *of, > + char *buf, size_t nbytes, loff_t off) > +{ > + struct cgroup_subsys_state *css = of_css(of); > + struct mem_cgroup *memcg = mem_cgroup_from_css(css); > + struct mem_cgroup_event *event; > + struct cgroup_subsys_state *cfile_css; > + unsigned int efd, cfd; > + struct fd efile; > + struct fd cfile; > + struct dentry *cdentry; > + const char *name; > + char *endp; > + int ret; > + > + if (IS_ENABLED(CONFIG_PREEMPT_RT)) > + return -EOPNOTSUPP; > + > + buf = strstrip(buf); > + > + efd = simple_strtoul(buf, &endp, 10); > + if (*endp != ' ') > + return -EINVAL; > + buf = endp + 1; > + > + cfd = simple_strtoul(buf, &endp, 10); > + if ((*endp != ' ') && (*endp != '\0')) > + return -EINVAL; > + buf = endp + 1; > + > + event = kzalloc(sizeof(*event), GFP_KERNEL); > + if (!event) > + return -ENOMEM; > + > + event->memcg = memcg; > + INIT_LIST_HEAD(&event->list); > + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); > + init_waitqueue_func_entry(&event->wait, memcg_event_wake); > + INIT_WORK(&event->remove, memcg_event_remove); > + > + efile = fdget(efd); > + if (!efile.file) { > + ret = -EBADF; > + goto out_kfree; > + } > + > + event->eventfd = eventfd_ctx_fileget(efile.file); > + if (IS_ERR(event->eventfd)) { > + ret = PTR_ERR(event->eventfd); > + goto out_put_efile; > + } > + > + cfile = fdget(cfd); > + if (!cfile.file) { > + ret = -EBADF; > + goto out_put_eventfd; > + } > + > + /* the process need read permission on control file */ > + /* AV: shouldn't we check that it's been opened for read instead? */ > + ret = file_permission(cfile.file, MAY_READ); > + if (ret < 0) > + goto out_put_cfile; > + > + /* > + * The control file must be a regular cgroup1 file. As a regular cgroup > + * file can't be renamed, it's safe to access its name afterwards. > + */ > + cdentry = cfile.file->f_path.dentry; > + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { > + ret = -EINVAL; > + goto out_put_cfile; > + } > + > + /* > + * Determine the event callbacks and set them in @event. This used > + * to be done via struct cftype but cgroup core no longer knows > + * about these events. The following is crude but the whole thing > + * is for compatibility anyway. > + * > + * DO NOT ADD NEW FILES. 
> + */ > + name = cdentry->d_name.name; > + > + if (!strcmp(name, "memory.usage_in_bytes")) { > + event->register_event = mem_cgroup_usage_register_event; > + event->unregister_event = mem_cgroup_usage_unregister_event; > + } else if (!strcmp(name, "memory.oom_control")) { > + event->register_event = mem_cgroup_oom_register_event; > + event->unregister_event = mem_cgroup_oom_unregister_event; > + } else if (!strcmp(name, "memory.pressure_level")) { > + event->register_event = vmpressure_register_event; > + event->unregister_event = vmpressure_unregister_event; > + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { > + event->register_event = memsw_cgroup_usage_register_event; > + event->unregister_event = memsw_cgroup_usage_unregister_event; > + } else { > + ret = -EINVAL; > + goto out_put_cfile; > + } > + > + /* > + * Verify @cfile should belong to @css. Also, remaining events are > + * automatically removed on cgroup destruction but the removal is > + * asynchronous, so take an extra ref on @css. > + */ > + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, > + &memory_cgrp_subsys); > + ret = -EINVAL; > + if (IS_ERR(cfile_css)) > + goto out_put_cfile; > + if (cfile_css != css) { > + css_put(cfile_css); > + goto out_put_cfile; > + } > + > + ret = event->register_event(memcg, event->eventfd, buf); > + if (ret) > + goto out_put_css; > + > + vfs_poll(efile.file, &event->pt); > + > + spin_lock_irq(&memcg->event_list_lock); > + list_add(&event->list, &memcg->event_list); > + spin_unlock_irq(&memcg->event_list_lock); > + > + fdput(cfile); > + fdput(efile); > + > + return nbytes; > + > +out_put_css: > + css_put(css); > +out_put_cfile: > + fdput(cfile); > +out_put_eventfd: > + eventfd_ctx_put(event->eventfd); > +out_put_efile: > + fdput(efile); > +out_kfree: > + kfree(event); > + > + return ret; > +} > + > +void memcg1_css_offline(struct mem_cgroup *memcg) > +{ > + struct mem_cgroup_event *event, *tmp; > + > + /* > + * Unregister events and notify userspace. > + * Notify userspace about cgroup removing only after rmdir of cgroup > + * directory to avoid race between userspace and kernelspace. > + */ > + spin_lock_irq(&memcg->event_list_lock); > + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { > + list_del_init(&event->list); > + schedule_work(&event->remove); > + } > + spin_unlock_irq(&memcg->event_list_lock); > +} > + > static int __init memcg1_init(void) > { > int node; > diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h > index d377c0be9880..524a2c76ffc9 100644 > --- a/mm/memcontrol-v1.h > +++ b/mm/memcontrol-v1.h > @@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, > int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, > struct cftype *cft, u64 val); > > +/* > + * Per memcg event counter is incremented at every pagein/pageout. With THP, > + * it will be incremented by the number of pages. This counter is used > + * to trigger some periodic events. This is straightforward and better > + * than using jiffies etc. to handle periodic memcg event. > + */ > +enum mem_cgroup_events_target { > + MEM_CGROUP_TARGET_THRESH, > + MEM_CGROUP_TARGET_SOFTLIMIT, > + MEM_CGROUP_NTARGETS, > +}; > + > +/* Whether legacy memory+swap accounting is active */ > +static bool do_memsw_account(void) > +{ > + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); > +} > + > +/* > + * Iteration constructs for visiting all cgroups (under a tree). 
If > + * loops are exited prematurely (break), mem_cgroup_iter_break() must > + * be used for reference counting. > + */ > +#define for_each_mem_cgroup_tree(iter, root) \ > + for (iter = mem_cgroup_iter(root, NULL, NULL); \ > + iter != NULL; \ > + iter = mem_cgroup_iter(root, iter, NULL)) > + > +#define for_each_mem_cgroup(iter) \ > + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ > + iter != NULL; \ > + iter = mem_cgroup_iter(NULL, iter, NULL)) > + > +void memcg1_css_offline(struct mem_cgroup *memcg); > + > +/* for encoding cft->private value on file */ > +enum res_type { > + _MEM, > + _MEMSWAP, > + _KMEM, > + _TCP, > +}; > + > +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > + enum mem_cgroup_events_target target); > +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); > +void mem_cgroup_oom_notify(struct mem_cgroup *memcg); > +ssize_t memcg_write_event_control(struct kernfs_open_file *of, > + char *buf, size_t nbytes, loff_t off); > + > + > #endif /* __MM_MEMCONTROL_V1_H */ > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index da2c0fa0de1b..bd4b26a73596 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -46,9 +46,6 @@ > #include <linux/slab.h> > #include <linux/swapops.h> > #include <linux/spinlock.h> > -#include <linux/eventfd.h> > -#include <linux/poll.h> > -#include <linux/sort.h> > #include <linux/fs.h> > #include <linux/seq_file.h> > #include <linux/parser.h> > @@ -59,7 +56,6 @@ > #include <linux/cpu.h> > #include <linux/oom.h> > #include <linux/lockdep.h> > -#include <linux/file.h> > #include <linux/resume_user_mode.h> > #include <linux/psi.h> > #include <linux/seq_buf.h> > @@ -97,91 +93,13 @@ static bool cgroup_memory_nobpf __ro_after_init; > static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); > #endif > > -/* Whether legacy memory+swap accounting is active */ > -static bool do_memsw_account(void) > -{ > - return !cgroup_subsys_on_dfl(memory_cgrp_subsys); > -} > - > #define THRESHOLDS_EVENTS_TARGET 128 > #define SOFTLIMIT_EVENTS_TARGET 1024 > > -/* for OOM */ > -struct mem_cgroup_eventfd_list { > - struct list_head list; > - struct eventfd_ctx *eventfd; > -}; > - > -/* > - * cgroup_event represents events which userspace want to receive. > - */ > -struct mem_cgroup_event { > - /* > - * memcg which the event belongs to. > - */ > - struct mem_cgroup *memcg; > - /* > - * eventfd to signal userspace about the event. > - */ > - struct eventfd_ctx *eventfd; > - /* > - * Each of these stored in a list by the cgroup. > - */ > - struct list_head list; > - /* > - * register_event() callback will be used to add new userspace > - * waiter for changes related to this event. Use eventfd_signal() > - * on eventfd to send notification to userspace. > - */ > - int (*register_event)(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd, const char *args); > - /* > - * unregister_event() callback will be called when userspace closes > - * the eventfd or on cgroup removing. This callback must be set, > - * if you want provide notification functionality. > - */ > - void (*unregister_event)(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd); > - /* > - * All fields below needed to unregister event when > - * userspace closes eventfd. 
> - */ > - poll_table pt; > - wait_queue_head_t *wqh; > - wait_queue_entry_t wait; > - struct work_struct remove; > -}; > - > -static void mem_cgroup_threshold(struct mem_cgroup *memcg); > -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); > - > -/* for encoding cft->private value on file */ > -enum res_type { > - _MEM, > - _MEMSWAP, > - _KMEM, > - _TCP, > -}; > - > #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) > #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) > #define MEMFILE_ATTR(val) ((val) & 0xffff) > > -/* > - * Iteration constructs for visiting all cgroups (under a tree). If > - * loops are exited prematurely (break), mem_cgroup_iter_break() must > - * be used for reference counting. > - */ > -#define for_each_mem_cgroup_tree(iter, root) \ > - for (iter = mem_cgroup_iter(root, NULL, NULL); \ > - iter != NULL; \ > - iter = mem_cgroup_iter(root, iter, NULL)) > - > -#define for_each_mem_cgroup(iter) \ > - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ > - iter != NULL; \ > - iter = mem_cgroup_iter(NULL, iter, NULL)) > - > static inline bool task_is_dying(void) > { > return tsk_is_oom_victim(current) || fatal_signal_pending(current) || > @@ -940,8 +858,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages) > __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); > } > > -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > - enum mem_cgroup_events_target target) > +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > + enum mem_cgroup_events_target target) > { > unsigned long val, next; > > @@ -965,28 +883,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > return false; > } > > -/* > - * Check events in order. > - * > - */ > -void memcg_check_events(struct mem_cgroup *memcg, int nid) > -{ > - if (IS_ENABLED(CONFIG_PREEMPT_RT)) > - return; > - > - /* threshold event is triggered in finer grain than soft limit */ > - if (unlikely(mem_cgroup_event_ratelimit(memcg, > - MEM_CGROUP_TARGET_THRESH))) { > - bool do_softlimit; > - > - do_softlimit = mem_cgroup_event_ratelimit(memcg, > - MEM_CGROUP_TARGET_SOFTLIMIT); > - mem_cgroup_threshold(memcg); > - if (unlikely(do_softlimit)) > - memcg1_update_tree(memcg, nid); > - } > -} > - > struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) > { > /* > @@ -1726,7 +1622,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = { > }; > #endif > > -static DEFINE_SPINLOCK(memcg_oom_lock); > +DEFINE_SPINLOCK(memcg_oom_lock); > > /* > * Check OOM-Killer is already running under our hierarchy. > @@ -3545,7 +3441,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, > return -EINVAL; > } > > -static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) > +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) > { > unsigned long val; > > @@ -4046,331 +3942,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, > return 0; > } > > -static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) > -{ > - struct mem_cgroup_threshold_ary *t; > - unsigned long usage; > - int i; > - > - rcu_read_lock(); > - if (!swap) > - t = rcu_dereference(memcg->thresholds.primary); > - else > - t = rcu_dereference(memcg->memsw_thresholds.primary); > - > - if (!t) > - goto unlock; > - > - usage = mem_cgroup_usage(memcg, swap); > - > - /* > - * current_threshold points to threshold just below or equal to usage. 
> - * If it's not true, a threshold was crossed after last > - * call of __mem_cgroup_threshold(). > - */ > - i = t->current_threshold; > - > - /* > - * Iterate backward over array of thresholds starting from > - * current_threshold and check if a threshold is crossed. > - * If none of thresholds below usage is crossed, we read > - * only one element of the array here. > - */ > - for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) > - eventfd_signal(t->entries[i].eventfd); > - > - /* i = current_threshold + 1 */ > - i++; > - > - /* > - * Iterate forward over array of thresholds starting from > - * current_threshold+1 and check if a threshold is crossed. > - * If none of thresholds above usage is crossed, we read > - * only one element of the array here. > - */ > - for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) > - eventfd_signal(t->entries[i].eventfd); > - > - /* Update current_threshold */ > - t->current_threshold = i - 1; > -unlock: > - rcu_read_unlock(); > -} > - > -static void mem_cgroup_threshold(struct mem_cgroup *memcg) > -{ > - while (memcg) { > - __mem_cgroup_threshold(memcg, false); > - if (do_memsw_account()) > - __mem_cgroup_threshold(memcg, true); > - > - memcg = parent_mem_cgroup(memcg); > - } > -} > - > -static int compare_thresholds(const void *a, const void *b) > -{ > - const struct mem_cgroup_threshold *_a = a; > - const struct mem_cgroup_threshold *_b = b; > - > - if (_a->threshold > _b->threshold) > - return 1; > - > - if (_a->threshold < _b->threshold) > - return -1; > - > - return 0; > -} > - > -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) > -{ > - struct mem_cgroup_eventfd_list *ev; > - > - spin_lock(&memcg_oom_lock); > - > - list_for_each_entry(ev, &memcg->oom_notify, list) > - eventfd_signal(ev->eventfd); > - > - spin_unlock(&memcg_oom_lock); > - return 0; > -} > - > -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) > -{ > - struct mem_cgroup *iter; > - > - for_each_mem_cgroup_tree(iter, memcg) > - mem_cgroup_oom_notify_cb(iter); > -} > - > -static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd, const char *args, enum res_type type) > -{ > - struct mem_cgroup_thresholds *thresholds; > - struct mem_cgroup_threshold_ary *new; > - unsigned long threshold; > - unsigned long usage; > - int i, size, ret; > - > - ret = page_counter_memparse(args, "-1", &threshold); > - if (ret) > - return ret; > - > - mutex_lock(&memcg->thresholds_lock); > - > - if (type == _MEM) { > - thresholds = &memcg->thresholds; > - usage = mem_cgroup_usage(memcg, false); > - } else if (type == _MEMSWAP) { > - thresholds = &memcg->memsw_thresholds; > - usage = mem_cgroup_usage(memcg, true); > - } else > - BUG(); > - > - /* Check if a threshold crossed before adding a new one */ > - if (thresholds->primary) > - __mem_cgroup_threshold(memcg, type == _MEMSWAP); > - > - size = thresholds->primary ? thresholds->primary->size + 1 : 1; > - > - /* Allocate memory for new array of thresholds */ > - new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); > - if (!new) { > - ret = -ENOMEM; > - goto unlock; > - } > - new->size = size; > - > - /* Copy thresholds (if any) to new array */ > - if (thresholds->primary) > - memcpy(new->entries, thresholds->primary->entries, > - flex_array_size(new, entries, size - 1)); > - > - /* Add new threshold */ > - new->entries[size - 1].eventfd = eventfd; > - new->entries[size - 1].threshold = threshold; > - > - /* Sort thresholds. 
Registering of new threshold isn't time-critical */ > - sort(new->entries, size, sizeof(*new->entries), > - compare_thresholds, NULL); > - > - /* Find current threshold */ > - new->current_threshold = -1; > - for (i = 0; i < size; i++) { > - if (new->entries[i].threshold <= usage) { > - /* > - * new->current_threshold will not be used until > - * rcu_assign_pointer(), so it's safe to increment > - * it here. > - */ > - ++new->current_threshold; > - } else > - break; > - } > - > - /* Free old spare buffer and save old primary buffer as spare */ > - kfree(thresholds->spare); > - thresholds->spare = thresholds->primary; > - > - rcu_assign_pointer(thresholds->primary, new); > - > - /* To be sure that nobody uses thresholds */ > - synchronize_rcu(); > - > -unlock: > - mutex_unlock(&memcg->thresholds_lock); > - > - return ret; > -} > - > -static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd, const char *args) > -{ > - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); > -} > - > -static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd, const char *args) > -{ > - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); > -} > - > -static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd, enum res_type type) > -{ > - struct mem_cgroup_thresholds *thresholds; > - struct mem_cgroup_threshold_ary *new; > - unsigned long usage; > - int i, j, size, entries; > - > - mutex_lock(&memcg->thresholds_lock); > - > - if (type == _MEM) { > - thresholds = &memcg->thresholds; > - usage = mem_cgroup_usage(memcg, false); > - } else if (type == _MEMSWAP) { > - thresholds = &memcg->memsw_thresholds; > - usage = mem_cgroup_usage(memcg, true); > - } else > - BUG(); > - > - if (!thresholds->primary) > - goto unlock; > - > - /* Check if a threshold crossed before removing */ > - __mem_cgroup_threshold(memcg, type == _MEMSWAP); > - > - /* Calculate new number of threshold */ > - size = entries = 0; > - for (i = 0; i < thresholds->primary->size; i++) { > - if (thresholds->primary->entries[i].eventfd != eventfd) > - size++; > - else > - entries++; > - } > - > - new = thresholds->spare; > - > - /* If no items related to eventfd have been cleared, nothing to do */ > - if (!entries) > - goto unlock; > - > - /* Set thresholds array to NULL if we don't have thresholds */ > - if (!size) { > - kfree(new); > - new = NULL; > - goto swap_buffers; > - } > - > - new->size = size; > - > - /* Copy thresholds and find current threshold */ > - new->current_threshold = -1; > - for (i = 0, j = 0; i < thresholds->primary->size; i++) { > - if (thresholds->primary->entries[i].eventfd == eventfd) > - continue; > - > - new->entries[j] = thresholds->primary->entries[i]; > - if (new->entries[j].threshold <= usage) { > - /* > - * new->current_threshold will not be used > - * until rcu_assign_pointer(), so it's safe to increment > - * it here. 
> - */ > - ++new->current_threshold; > - } > - j++; > - } > - > -swap_buffers: > - /* Swap primary and spare array */ > - thresholds->spare = thresholds->primary; > - > - rcu_assign_pointer(thresholds->primary, new); > - > - /* To be sure that nobody uses thresholds */ > - synchronize_rcu(); > - > - /* If all events are unregistered, free the spare array */ > - if (!new) { > - kfree(thresholds->spare); > - thresholds->spare = NULL; > - } > -unlock: > - mutex_unlock(&memcg->thresholds_lock); > -} > - > -static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd) > -{ > - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); > -} > - > -static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd) > -{ > - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); > -} > - > -static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd, const char *args) > -{ > - struct mem_cgroup_eventfd_list *event; > - > - event = kmalloc(sizeof(*event), GFP_KERNEL); > - if (!event) > - return -ENOMEM; > - > - spin_lock(&memcg_oom_lock); > - > - event->eventfd = eventfd; > - list_add(&event->list, &memcg->oom_notify); > - > - /* already in OOM ? */ > - if (memcg->under_oom) > - eventfd_signal(eventfd); > - spin_unlock(&memcg_oom_lock); > - > - return 0; > -} > - > -static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, > - struct eventfd_ctx *eventfd) > -{ > - struct mem_cgroup_eventfd_list *ev, *tmp; > - > - spin_lock(&memcg_oom_lock); > - > - list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { > - if (ev->eventfd == eventfd) { > - list_del(&ev->list); > - kfree(ev); > - } > - } > - > - spin_unlock(&memcg_oom_lock); > -} > - > static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) > { > struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); > @@ -4611,243 +4182,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) > > #endif /* CONFIG_CGROUP_WRITEBACK */ > > -/* > - * DO NOT USE IN NEW FILES. > - * > - * "cgroup.event_control" implementation. > - * > - * This is way over-engineered. It tries to support fully configurable > - * events for each user. Such level of flexibility is completely > - * unnecessary especially in the light of the planned unified hierarchy. > - * > - * Please deprecate this and replace with something simpler if at all > - * possible. > - */ > - > -/* > - * Unregister event and free resources. > - * > - * Gets called from workqueue. > - */ > -static void memcg_event_remove(struct work_struct *work) > -{ > - struct mem_cgroup_event *event = > - container_of(work, struct mem_cgroup_event, remove); > - struct mem_cgroup *memcg = event->memcg; > - > - remove_wait_queue(event->wqh, &event->wait); > - > - event->unregister_event(memcg, event->eventfd); > - > - /* Notify userspace the event is going away. */ > - eventfd_signal(event->eventfd); > - > - eventfd_ctx_put(event->eventfd); > - kfree(event); > - css_put(&memcg->css); > -} > - > -/* > - * Gets called on EPOLLHUP on eventfd when user closes it. > - * > - * Called with wqh->lock held and interrupts disabled. 
> - */ > -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, > - int sync, void *key) > -{ > - struct mem_cgroup_event *event = > - container_of(wait, struct mem_cgroup_event, wait); > - struct mem_cgroup *memcg = event->memcg; > - __poll_t flags = key_to_poll(key); > - > - if (flags & EPOLLHUP) { > - /* > - * If the event has been detached at cgroup removal, we > - * can simply return knowing the other side will cleanup > - * for us. > - * > - * We can't race against event freeing since the other > - * side will require wqh->lock via remove_wait_queue(), > - * which we hold. > - */ > - spin_lock(&memcg->event_list_lock); > - if (!list_empty(&event->list)) { > - list_del_init(&event->list); > - /* > - * We are in atomic context, but cgroup_event_remove() > - * may sleep, so we have to call it in workqueue. > - */ > - schedule_work(&event->remove); > - } > - spin_unlock(&memcg->event_list_lock); > - } > - > - return 0; > -} > - > -static void memcg_event_ptable_queue_proc(struct file *file, > - wait_queue_head_t *wqh, poll_table *pt) > -{ > - struct mem_cgroup_event *event = > - container_of(pt, struct mem_cgroup_event, pt); > - > - event->wqh = wqh; > - add_wait_queue(wqh, &event->wait); > -} > - > -/* > - * DO NOT USE IN NEW FILES. > - * > - * Parse input and register new cgroup event handler. > - * > - * Input must be in format '<event_fd> <control_fd> <args>'. > - * Interpretation of args is defined by control file implementation. > - */ > -static ssize_t memcg_write_event_control(struct kernfs_open_file *of, > - char *buf, size_t nbytes, loff_t off) > -{ > - struct cgroup_subsys_state *css = of_css(of); > - struct mem_cgroup *memcg = mem_cgroup_from_css(css); > - struct mem_cgroup_event *event; > - struct cgroup_subsys_state *cfile_css; > - unsigned int efd, cfd; > - struct fd efile; > - struct fd cfile; > - struct dentry *cdentry; > - const char *name; > - char *endp; > - int ret; > - > - if (IS_ENABLED(CONFIG_PREEMPT_RT)) > - return -EOPNOTSUPP; > - > - buf = strstrip(buf); > - > - efd = simple_strtoul(buf, &endp, 10); > - if (*endp != ' ') > - return -EINVAL; > - buf = endp + 1; > - > - cfd = simple_strtoul(buf, &endp, 10); > - if ((*endp != ' ') && (*endp != '\0')) > - return -EINVAL; > - buf = endp + 1; > - > - event = kzalloc(sizeof(*event), GFP_KERNEL); > - if (!event) > - return -ENOMEM; > - > - event->memcg = memcg; > - INIT_LIST_HEAD(&event->list); > - init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); > - init_waitqueue_func_entry(&event->wait, memcg_event_wake); > - INIT_WORK(&event->remove, memcg_event_remove); > - > - efile = fdget(efd); > - if (!efile.file) { > - ret = -EBADF; > - goto out_kfree; > - } > - > - event->eventfd = eventfd_ctx_fileget(efile.file); > - if (IS_ERR(event->eventfd)) { > - ret = PTR_ERR(event->eventfd); > - goto out_put_efile; > - } > - > - cfile = fdget(cfd); > - if (!cfile.file) { > - ret = -EBADF; > - goto out_put_eventfd; > - } > - > - /* the process need read permission on control file */ > - /* AV: shouldn't we check that it's been opened for read instead? */ > - ret = file_permission(cfile.file, MAY_READ); > - if (ret < 0) > - goto out_put_cfile; > - > - /* > - * The control file must be a regular cgroup1 file. As a regular cgroup > - * file can't be renamed, it's safe to access its name afterwards. 
> - */ > - cdentry = cfile.file->f_path.dentry; > - if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { > - ret = -EINVAL; > - goto out_put_cfile; > - } > - > - /* > - * Determine the event callbacks and set them in @event. This used > - * to be done via struct cftype but cgroup core no longer knows > - * about these events. The following is crude but the whole thing > - * is for compatibility anyway. > - * > - * DO NOT ADD NEW FILES. > - */ > - name = cdentry->d_name.name; > - > - if (!strcmp(name, "memory.usage_in_bytes")) { > - event->register_event = mem_cgroup_usage_register_event; > - event->unregister_event = mem_cgroup_usage_unregister_event; > - } else if (!strcmp(name, "memory.oom_control")) { > - event->register_event = mem_cgroup_oom_register_event; > - event->unregister_event = mem_cgroup_oom_unregister_event; > - } else if (!strcmp(name, "memory.pressure_level")) { > - event->register_event = vmpressure_register_event; > - event->unregister_event = vmpressure_unregister_event; > - } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { > - event->register_event = memsw_cgroup_usage_register_event; > - event->unregister_event = memsw_cgroup_usage_unregister_event; > - } else { > - ret = -EINVAL; > - goto out_put_cfile; > - } > - > - /* > - * Verify @cfile should belong to @css. Also, remaining events are > - * automatically removed on cgroup destruction but the removal is > - * asynchronous, so take an extra ref on @css. > - */ > - cfile_css = css_tryget_online_from_dir(cdentry->d_parent, > - &memory_cgrp_subsys); > - ret = -EINVAL; > - if (IS_ERR(cfile_css)) > - goto out_put_cfile; > - if (cfile_css != css) { > - css_put(cfile_css); > - goto out_put_cfile; > - } > - > - ret = event->register_event(memcg, event->eventfd, buf); > - if (ret) > - goto out_put_css; > - > - vfs_poll(efile.file, &event->pt); > - > - spin_lock_irq(&memcg->event_list_lock); > - list_add(&event->list, &memcg->event_list); > - spin_unlock_irq(&memcg->event_list_lock); > - > - fdput(cfile); > - fdput(efile); > - > - return nbytes; > - > -out_put_css: > - css_put(css); > -out_put_cfile: > - fdput(cfile); > -out_put_eventfd: > - eventfd_ctx_put(event->eventfd); > -out_put_efile: > - fdput(efile); > -out_kfree: > - kfree(event); > - > - return ret; > -} > - > #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) > static int mem_cgroup_slab_show(struct seq_file *m, void *p) > { > @@ -5314,19 +4648,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) > static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) > { > struct mem_cgroup *memcg = mem_cgroup_from_css(css); > - struct mem_cgroup_event *event, *tmp; > > - /* > - * Unregister events and notify userspace. > - * Notify userspace about cgroup removing only after rmdir of cgroup > - * directory to avoid race between userspace and kernelspace. > - */ > - spin_lock_irq(&memcg->event_list_lock); > - list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { > - list_del_init(&event->list); > - schedule_work(&event->remove); > - } > - spin_unlock_irq(&memcg->event_list_lock); > + memcg1_css_offline(memcg); > > page_counter_set_min(&memcg->memory, 0); > page_counter_set_low(&memcg->memory, 0); > -- > 2.45.2
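For reference, the notification interface that memcg_write_event_control() implements above is driven entirely from userspace: create an eventfd, open the cgroup v1 control file to be watched, and write "<event_fd> <control_fd> <args>" to cgroup.event_control. Below is a minimal sketch of registering a 64M usage threshold; the v1 mount point and the group name "mygroup" are assumptions, and a cgroup v1 memory hierarchy must be mounted for this to run.

```c
/* Hedged sketch: register a usage threshold on a cgroup v1 memcg.
 * Assumes a v1 memory hierarchy at /sys/fs/cgroup/memory and an
 * existing (hypothetical) group "mygroup". */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/mygroup";
	char path[256], cmd[64];
	uint64_t count;
	int efd, ufd, cfd;

	efd = eventfd(0, 0);			/* fd the kernel will signal */
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
	ufd = open(path, O_RDONLY);		/* control file being watched */
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	cfd = open(path, O_WRONLY);
	if (efd < 0 || ufd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}

	/* "<event_fd> <control_fd> <args>"; args is handed to the
	 * register_event callback, here page_counter_memparse() in
	 * __mem_cgroup_usage_register_event(). */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, cmd, strlen(cmd)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}

	/* Blocks until usage crosses the 64M threshold in either
	 * direction, at which point eventfd_signal() wakes us. */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("threshold crossed (%llu events)\n",
		       (unsigned long long)count);

	close(cfd);
	close(ufd);
	close(efd);
	return 0;
}
```

The same write protocol covers the other control files dispatched on in memcg_write_event_control(): writing "<event_fd> <fd of memory.oom_control>" registers an OOM notification via mem_cgroup_oom_register_event(), and memory.pressure_level / memory.memsw.usage_in_bytes follow the same pattern with their respective register callbacks.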
+ */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) + +void memcg1_css_offline(struct mem_cgroup *memcg); + +/* for encoding cft->private value on file */ +enum res_type { + _MEM, + _MEMSWAP, + _KMEM, + _TCP, +}; + +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target); +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); +void mem_cgroup_oom_notify(struct mem_cgroup *memcg); +ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + + #endif /* __MM_MEMCONTROL_V1_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da2c0fa0de1b..bd4b26a73596 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -46,9 +46,6 @@ #include <linux/slab.h> #include <linux/swapops.h> #include <linux/spinlock.h> -#include <linux/eventfd.h> -#include <linux/poll.h> -#include <linux/sort.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/parser.h> @@ -59,7 +56,6 @@ #include <linux/cpu.h> #include <linux/oom.h> #include <linux/lockdep.h> -#include <linux/file.h> #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> @@ -97,91 +93,13 @@ static bool cgroup_memory_nobpf __ro_after_init; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif -/* Whether legacy memory+swap accounting is active */ -static bool do_memsw_account(void) -{ - return !cgroup_subsys_on_dfl(memory_cgrp_subsys); -} - #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 -/* for OOM */ -struct mem_cgroup_eventfd_list { - struct list_head list; - struct eventfd_ctx *eventfd; -}; - -/* - * cgroup_event represents events which userspace want to receive. - */ -struct mem_cgroup_event { - /* - * memcg which the event belongs to. - */ - struct mem_cgroup *memcg; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * register_event() callback will be used to add new userspace - * waiter for changes related to this event. Use eventfd_signal() - * on eventfd to send notification to userspace. - */ - int (*register_event)(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args); - /* - * unregister_event() callback will be called when userspace closes - * the eventfd or on cgroup removing. This callback must be set, - * if you want provide notification functionality. - */ - void (*unregister_event)(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd); - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_entry_t wait; - struct work_struct remove; -}; - -static void mem_cgroup_threshold(struct mem_cgroup *memcg); -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); - -/* for encoding cft->private value on file */ -enum res_type { - _MEM, - _MEMSWAP, - _KMEM, - _TCP, -}; - #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -/* - * Iteration constructs for visiting all cgroups (under a tree). 
If - * loops are exited prematurely (break), mem_cgroup_iter_break() must - * be used for reference counting. - */ -#define for_each_mem_cgroup_tree(iter, root) \ - for (iter = mem_cgroup_iter(root, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(root, iter, NULL)) - -#define for_each_mem_cgroup(iter) \ - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(NULL, iter, NULL)) - static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || @@ -940,8 +858,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages) __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); } -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target) +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) { unsigned long val, next; @@ -965,28 +883,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, return false; } -/* - * Check events in order. - * - */ -void memcg_check_events(struct mem_cgroup *memcg, int nid) -{ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return; - - /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit; - - do_softlimit = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - mem_cgroup_threshold(memcg); - if (unlikely(do_softlimit)) - memcg1_update_tree(memcg, nid); - } -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -1726,7 +1622,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = { }; #endif -static DEFINE_SPINLOCK(memcg_oom_lock); +DEFINE_SPINLOCK(memcg_oom_lock); /* * Check OOM-Killer is already running under our hierarchy. @@ -3545,7 +3441,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return -EINVAL; } -static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -4046,331 +3942,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, return 0; } -static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) -{ - struct mem_cgroup_threshold_ary *t; - unsigned long usage; - int i; - - rcu_read_lock(); - if (!swap) - t = rcu_dereference(memcg->thresholds.primary); - else - t = rcu_dereference(memcg->memsw_thresholds.primary); - - if (!t) - goto unlock; - - usage = mem_cgroup_usage(memcg, swap); - - /* - * current_threshold points to threshold just below or equal to usage. - * If it's not true, a threshold was crossed after last - * call of __mem_cgroup_threshold(). - */ - i = t->current_threshold; - - /* - * Iterate backward over array of thresholds starting from - * current_threshold and check if a threshold is crossed. - * If none of thresholds below usage is crossed, we read - * only one element of the array here. - */ - for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) - eventfd_signal(t->entries[i].eventfd); - - /* i = current_threshold + 1 */ - i++; - - /* - * Iterate forward over array of thresholds starting from - * current_threshold+1 and check if a threshold is crossed. - * If none of thresholds above usage is crossed, we read - * only one element of the array here. 
- */ - for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) - eventfd_signal(t->entries[i].eventfd); - - /* Update current_threshold */ - t->current_threshold = i - 1; -unlock: - rcu_read_unlock(); -} - -static void mem_cgroup_threshold(struct mem_cgroup *memcg) -{ - while (memcg) { - __mem_cgroup_threshold(memcg, false); - if (do_memsw_account()) - __mem_cgroup_threshold(memcg, true); - - memcg = parent_mem_cgroup(memcg); - } -} - -static int compare_thresholds(const void *a, const void *b) -{ - const struct mem_cgroup_threshold *_a = a; - const struct mem_cgroup_threshold *_b = b; - - if (_a->threshold > _b->threshold) - return 1; - - if (_a->threshold < _b->threshold) - return -1; - - return 0; -} - -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) -{ - struct mem_cgroup_eventfd_list *ev; - - spin_lock(&memcg_oom_lock); - - list_for_each_entry(ev, &memcg->oom_notify, list) - eventfd_signal(ev->eventfd); - - spin_unlock(&memcg_oom_lock); - return 0; -} - -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) -{ - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - mem_cgroup_oom_notify_cb(iter); -} - -static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args, enum res_type type) -{ - struct mem_cgroup_thresholds *thresholds; - struct mem_cgroup_threshold_ary *new; - unsigned long threshold; - unsigned long usage; - int i, size, ret; - - ret = page_counter_memparse(args, "-1", &threshold); - if (ret) - return ret; - - mutex_lock(&memcg->thresholds_lock); - - if (type == _MEM) { - thresholds = &memcg->thresholds; - usage = mem_cgroup_usage(memcg, false); - } else if (type == _MEMSWAP) { - thresholds = &memcg->memsw_thresholds; - usage = mem_cgroup_usage(memcg, true); - } else - BUG(); - - /* Check if a threshold crossed before adding a new one */ - if (thresholds->primary) - __mem_cgroup_threshold(memcg, type == _MEMSWAP); - - size = thresholds->primary ? thresholds->primary->size + 1 : 1; - - /* Allocate memory for new array of thresholds */ - new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto unlock; - } - new->size = size; - - /* Copy thresholds (if any) to new array */ - if (thresholds->primary) - memcpy(new->entries, thresholds->primary->entries, - flex_array_size(new, entries, size - 1)); - - /* Add new threshold */ - new->entries[size - 1].eventfd = eventfd; - new->entries[size - 1].threshold = threshold; - - /* Sort thresholds. Registering of new threshold isn't time-critical */ - sort(new->entries, size, sizeof(*new->entries), - compare_thresholds, NULL); - - /* Find current threshold */ - new->current_threshold = -1; - for (i = 0; i < size; i++) { - if (new->entries[i].threshold <= usage) { - /* - * new->current_threshold will not be used until - * rcu_assign_pointer(), so it's safe to increment - * it here. 
- */ - ++new->current_threshold; - } else - break; - } - - /* Free old spare buffer and save old primary buffer as spare */ - kfree(thresholds->spare); - thresholds->spare = thresholds->primary; - - rcu_assign_pointer(thresholds->primary, new); - - /* To be sure that nobody uses thresholds */ - synchronize_rcu(); - -unlock: - mutex_unlock(&memcg->thresholds_lock); - - return ret; -} - -static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args) -{ - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); -} - -static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args) -{ - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); -} - -static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, enum res_type type) -{ - struct mem_cgroup_thresholds *thresholds; - struct mem_cgroup_threshold_ary *new; - unsigned long usage; - int i, j, size, entries; - - mutex_lock(&memcg->thresholds_lock); - - if (type == _MEM) { - thresholds = &memcg->thresholds; - usage = mem_cgroup_usage(memcg, false); - } else if (type == _MEMSWAP) { - thresholds = &memcg->memsw_thresholds; - usage = mem_cgroup_usage(memcg, true); - } else - BUG(); - - if (!thresholds->primary) - goto unlock; - - /* Check if a threshold crossed before removing */ - __mem_cgroup_threshold(memcg, type == _MEMSWAP); - - /* Calculate new number of threshold */ - size = entries = 0; - for (i = 0; i < thresholds->primary->size; i++) { - if (thresholds->primary->entries[i].eventfd != eventfd) - size++; - else - entries++; - } - - new = thresholds->spare; - - /* If no items related to eventfd have been cleared, nothing to do */ - if (!entries) - goto unlock; - - /* Set thresholds array to NULL if we don't have thresholds */ - if (!size) { - kfree(new); - new = NULL; - goto swap_buffers; - } - - new->size = size; - - /* Copy thresholds and find current threshold */ - new->current_threshold = -1; - for (i = 0, j = 0; i < thresholds->primary->size; i++) { - if (thresholds->primary->entries[i].eventfd == eventfd) - continue; - - new->entries[j] = thresholds->primary->entries[i]; - if (new->entries[j].threshold <= usage) { - /* - * new->current_threshold will not be used - * until rcu_assign_pointer(), so it's safe to increment - * it here. 
- */ - ++new->current_threshold; - } - j++; - } - -swap_buffers: - /* Swap primary and spare array */ - thresholds->spare = thresholds->primary; - - rcu_assign_pointer(thresholds->primary, new); - - /* To be sure that nobody uses thresholds */ - synchronize_rcu(); - - /* If all events are unregistered, free the spare array */ - if (!new) { - kfree(thresholds->spare); - thresholds->spare = NULL; - } -unlock: - mutex_unlock(&memcg->thresholds_lock); -} - -static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd) -{ - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); -} - -static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd) -{ - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); -} - -static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args) -{ - struct mem_cgroup_eventfd_list *event; - - event = kmalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - spin_lock(&memcg_oom_lock); - - event->eventfd = eventfd; - list_add(&event->list, &memcg->oom_notify); - - /* already in OOM ? */ - if (memcg->under_oom) - eventfd_signal(eventfd); - spin_unlock(&memcg_oom_lock); - - return 0; -} - -static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd) -{ - struct mem_cgroup_eventfd_list *ev, *tmp; - - spin_lock(&memcg_oom_lock); - - list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { - if (ev->eventfd == eventfd) { - list_del(&ev->list); - kfree(ev); - } - } - - spin_unlock(&memcg_oom_lock); -} - static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); @@ -4611,243 +4182,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) #endif /* CONFIG_CGROUP_WRITEBACK */ -/* - * DO NOT USE IN NEW FILES. - * - * "cgroup.event_control" implementation. - * - * This is way over-engineered. It tries to support fully configurable - * events for each user. Such level of flexibility is completely - * unnecessary especially in the light of the planned unified hierarchy. - * - * Please deprecate this and replace with something simpler if at all - * possible. - */ - -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void memcg_event_remove(struct work_struct *work) -{ - struct mem_cgroup_event *event = - container_of(work, struct mem_cgroup_event, remove); - struct mem_cgroup *memcg = event->memcg; - - remove_wait_queue(event->wqh, &event->wait); - - event->unregister_event(memcg, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(&memcg->css); -} - -/* - * Gets called on EPOLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. - */ -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, - int sync, void *key) -{ - struct mem_cgroup_event *event = - container_of(wait, struct mem_cgroup_event, wait); - struct mem_cgroup *memcg = event->memcg; - __poll_t flags = key_to_poll(key); - - if (flags & EPOLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. 
- */ - spin_lock(&memcg->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&memcg->event_list_lock); - } - - return 0; -} - -static void memcg_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct mem_cgroup_event *event = - container_of(pt, struct mem_cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * DO NOT USE IN NEW FILES. - * - * Parse input and register new cgroup event handler. - * - * Input must be in format '<event_fd> <control_fd> <args>'. - * Interpretation of args is defined by control file implementation. - */ -static ssize_t memcg_write_event_control(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup_subsys_state *css = of_css(of); - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - struct dentry *cdentry; - const char *name; - char *endp; - int ret; - - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return -EOPNOTSUPP; - - buf = strstrip(buf); - - efd = simple_strtoul(buf, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buf = endp + 1; - - cfd = simple_strtoul(buf, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buf = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - event->memcg = memcg; - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, memcg_event_wake); - INIT_WORK(&event->remove, memcg_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = file_permission(cfile.file, MAY_READ); - if (ret < 0) - goto out_put_cfile; - - /* - * The control file must be a regular cgroup1 file. As a regular cgroup - * file can't be renamed, it's safe to access its name afterwards. - */ - cdentry = cfile.file->f_path.dentry; - if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { - ret = -EINVAL; - goto out_put_cfile; - } - - /* - * Determine the event callbacks and set them in @event. This used - * to be done via struct cftype but cgroup core no longer knows - * about these events. The following is crude but the whole thing - * is for compatibility anyway. - * - * DO NOT ADD NEW FILES. 
- */ - name = cdentry->d_name.name; - - if (!strcmp(name, "memory.usage_in_bytes")) { - event->register_event = mem_cgroup_usage_register_event; - event->unregister_event = mem_cgroup_usage_unregister_event; - } else if (!strcmp(name, "memory.oom_control")) { - event->register_event = mem_cgroup_oom_register_event; - event->unregister_event = mem_cgroup_oom_unregister_event; - } else if (!strcmp(name, "memory.pressure_level")) { - event->register_event = vmpressure_register_event; - event->unregister_event = vmpressure_unregister_event; - } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { - event->register_event = memsw_cgroup_usage_register_event; - event->unregister_event = memsw_cgroup_usage_unregister_event; - } else { - ret = -EINVAL; - goto out_put_cfile; - } - - /* - * Verify @cfile should belong to @css. Also, remaining events are - * automatically removed on cgroup destruction but the removal is - * asynchronous, so take an extra ref on @css. - */ - cfile_css = css_tryget_online_from_dir(cdentry->d_parent, - &memory_cgrp_subsys); - ret = -EINVAL; - if (IS_ERR(cfile_css)) - goto out_put_cfile; - if (cfile_css != css) { - css_put(cfile_css); - goto out_put_cfile; - } - - ret = event->register_event(memcg, event->eventfd, buf); - if (ret) - goto out_put_css; - - vfs_poll(efile.file, &event->pt); - - spin_lock_irq(&memcg->event_list_lock); - list_add(&event->list, &memcg->event_list); - spin_unlock_irq(&memcg->event_list_lock); - - fdput(cfile); - fdput(efile); - - return nbytes; - -out_put_css: - css_put(css); -out_put_cfile: - fdput(cfile); -out_put_eventfd: - eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); -out_kfree: - kfree(event); - - return ret; -} - #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) static int mem_cgroup_slab_show(struct seq_file *m, void *p) { @@ -5314,19 +4648,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup_event *event, *tmp; - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. - */ - spin_lock_irq(&memcg->event_list_lock); - list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock_irq(&memcg->event_list_lock); + memcg1_css_offline(memcg); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0);
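For readers unfamiliar with the interface being moved: the "<event_fd> <control_fd> <args>" string parsed by memcg_write_event_control() is driven from userspace roughly as below. A minimal sketch, assuming a v1 hierarchy mounted at /sys/fs/cgroup/memory and a hypothetical "mygroup" cgroup; error handling on the open() calls is elided:

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *cg = "/sys/fs/cgroup/memory/mygroup"; /* hypothetical */
        char path[256], cmd[64];
        uint64_t ticks;
        int efd, ufd, cfd;

        efd = eventfd(0, 0);                    /* fd the kernel signals */

        snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
        ufd = open(path, O_RDONLY);             /* needs MAY_READ, see above */

        snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
        cfd = open(path, O_WRONLY);

        /* "<event_fd> <control_fd> <args>"; args = threshold in bytes */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd,
                 (unsigned long long)(64 << 20));
        if (write(cfd, cmd, strlen(cmd)) != (ssize_t)strlen(cmd)) {
                perror("cgroup.event_control");
                return 1;
        }

        /* blocks until usage crosses 64M in either direction */
        read(efd, &ticks, sizeof(ticks));
        printf("threshold crossed, count %llu\n",
               (unsigned long long)ticks);
        return 0;
}

The same write() format registers memory.oom_control and memory.pressure_level events; only the <args> interpretation differs per control file.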
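On the kernel side, the invariant that keeps __mem_cgroup_threshold() cheap is that t->entries[] is sorted and t->current_threshold caches the last entry at or below the usage seen by the previous call. Stripped of the RCU and eventfd plumbing, the scan amounts to the following; this is a model for illustration with my own names, not the kernel code verbatim:

struct thr {
        unsigned long threshold;
        /* struct eventfd_ctx *eventfd; omitted in this model */
};

static void scan(struct thr *e, int size, int *cur, unsigned long usage)
{
        int i = *cur;

        /* usage fell: walk left, firing entries now above usage */
        for (; i >= 0 && e[i].threshold > usage; i--)
                ;       /* eventfd_signal(e[i].eventfd); */

        /* usage grew: walk right over entries now at or below usage */
        for (i++; i < size && e[i].threshold <= usage; i++)
                ;       /* eventfd_signal(e[i].eventfd); */

        *cur = i - 1;   /* last entry <= usage, or -1 if none */
}

If usage did not cross anything since the last call, each loop reads exactly one array element, which is why the check can afford to run from the event-ratelimit path.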
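The register/unregister paths both rely on primary/spare double buffering of the threshold arrays: readers dereference ->primary under rcu_read_lock(), writers build a replacement array under thresholds_lock, publish it with rcu_assign_pointer(), and park the old array in ->spare once synchronize_rcu() guarantees it has no readers left. A generic sketch of the pattern, again with my names rather than the memcg ones:

struct arr {
        int size;
        unsigned long entries[];
};

static struct arr __rcu *primary;
static struct arr *spare;               /* old primary, reusable next time */
static DEFINE_MUTEX(update_lock);

static void publish(struct arr *new)
{
        mutex_lock(&update_lock);
        kfree(spare);                   /* nobody can reach it anymore */
        spare = rcu_dereference_protected(primary,
                                lockdep_is_held(&update_lock));
        rcu_assign_pointer(primary, new);       /* readers flip over */
        synchronize_rcu();              /* drain pre-flip readers */
        mutex_unlock(&update_lock);
}

A reader then only needs rcu_read_lock() plus rcu_dereference(primary), which is why __mem_cgroup_threshold() takes no lock at all, and why the unregister path can hand the spare buffer straight back as the next primary without reallocating.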