@@ -842,14 +842,16 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
- struct mem_cgroup *memcg, *old_memcg;
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg, *old_objcg;
if (retry)
gfp |= __GFP_NOFAIL;
/* The page lock pins the memcg */
memcg = page_memcg(page);
- old_memcg = set_active_memcg(memcg);
+ objcg = get_obj_cgroup_from_mem_cgroup(memcg);
+ old_objcg = set_active_objcg(objcg);
head = NULL;
offset = PAGE_SIZE;
@@ -868,7 +870,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
set_bh_page(bh, page, offset);
}
out:
- set_active_memcg(old_memcg);
+ set_active_objcg(old_objcg);
+ if (objcg)
+ obj_cgroup_put(objcg);
return head;
/*
* In case anything failed, we just free everything we got.
@@ -530,7 +530,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
- struct mem_cgroup *old_memcg;
+ struct obj_cgroup *old_objcg;
struct inode *child = NULL;
bool name_event = false;
@@ -580,7 +580,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
gfp |= __GFP_RETRY_MAYFAIL;
/* Whoever is interested in the event, pays for the allocation. */
- old_memcg = set_active_memcg(group->memcg);
+ old_objcg = set_active_objcg(group->objcg);
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
@@ -608,7 +608,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
event->pid = get_pid(task_tgid(current));
out:
- set_active_memcg(old_memcg);
+ set_active_objcg(old_objcg);
return event;
}
@@ -985,7 +985,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.user = user;
group->fanotify_data.flags = flags;
atomic_inc(&user->fanotify_listeners);
- group->memcg = get_mem_cgroup_from_mm(current->mm);
+ group->objcg = get_obj_cgroup_from_current();
group->overflow_event = fanotify_alloc_overflow_event();
if (unlikely(!group->overflow_event)) {
@@ -24,7 +24,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
- mem_cgroup_put(group->memcg);
+ if (group->objcg)
+ obj_cgroup_put(group->objcg);
mutex_destroy(&group->mark_mutex);
kfree(group);
@@ -66,7 +66,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
int ret;
int len = 0;
int alloc_len = sizeof(struct inotify_event_info);
- struct mem_cgroup *old_memcg;
+ struct obj_cgroup *old_objcg;
if (name) {
len = name->len;
@@ -81,12 +81,12 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
/*
* Whoever is interested in the event, pays for the allocation. Do not
- * trigger OOM killer in the target monitoring memcg as it may have
+ * trigger OOM killer in the target monitoring objcg as it may have
* security repercussion.
*/
- old_memcg = set_active_memcg(group->memcg);
+ old_objcg = set_active_objcg(group->objcg);
event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
- set_active_memcg(old_memcg);
+ set_active_objcg(old_objcg);
if (unlikely(!event)) {
/*
@@ -649,7 +649,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
oevent->name_len = 0;
group->max_events = max_events;
- group->memcg = get_mem_cgroup_from_mm(current->mm);
+ group->objcg = get_obj_cgroup_from_current();
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
@@ -158,7 +158,7 @@ struct bpf_map {
u32 btf_value_type_id;
struct btf *btf;
#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
#endif
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
@@ -220,7 +220,7 @@ struct fsnotify_group {
* notification list is too
* full */
- struct mem_cgroup *memcg; /* memcg to charge allocations */
+ struct obj_cgroup *objcg; /* objcg to charge allocations */
/* groups can define private fields here or use the void *private */
union {
@@ -1643,6 +1643,7 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);
+struct obj_cgroup *get_obj_cgroup_from_mem_cgroup(struct mem_cgroup *memcg);
struct obj_cgroup *get_obj_cgroup_from_current(void);
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
@@ -1693,6 +1694,20 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
struct mem_cgroup *mem_cgroup_from_obj(void *p);
#else
+static inline
+struct obj_cgroup *get_obj_cgroup_from_mem_cgroup(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
+
+static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+ return NULL;
+}
+
+static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+{
+}
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
int order)
@@ -390,37 +390,38 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
}
#ifdef CONFIG_MEMCG_KMEM
-static void bpf_map_save_memcg(struct bpf_map *map)
+static void bpf_map_save_objcg(struct bpf_map *map)
{
- map->memcg = get_mem_cgroup_from_mm(current->mm);
+ map->objcg = get_obj_cgroup_from_current();
}
-static void bpf_map_release_memcg(struct bpf_map *map)
+static void bpf_map_release_objcg(struct bpf_map *map)
{
- mem_cgroup_put(map->memcg);
+ if (map->objcg)
+ obj_cgroup_put(map->objcg);
}
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
- struct mem_cgroup *old_memcg;
+ struct obj_cgroup *old_objcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ old_objcg = set_active_objcg(map->objcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
- set_active_memcg(old_memcg);
+ set_active_objcg(old_objcg);
return ptr;
}
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct obj_cgroup *old_objcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ old_objcg = set_active_objcg(map->objcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
+ set_active_objcg(old_objcg);
return ptr;
}
@@ -428,22 +429,22 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct obj_cgroup *old_objcg;
void __percpu *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ old_objcg = set_active_objcg(map->objcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
+ set_active_objcg(old_objcg);
return ptr;
}
#else
-static void bpf_map_save_memcg(struct bpf_map *map)
+static void bpf_map_save_objcg(struct bpf_map *map)
{
}
-static void bpf_map_release_memcg(struct bpf_map *map)
+static void bpf_map_release_objcg(struct bpf_map *map)
{
}
#endif
@@ -454,7 +455,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
security_bpf_map_free(map);
- bpf_map_release_memcg(map);
+ bpf_map_release_objcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -877,7 +878,7 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_sec;
- bpf_map_save_memcg(map);
+ bpf_map_save_objcg(map);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
@@ -3023,6 +3023,22 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
return page_memcg_check(page);
}
+struct obj_cgroup *get_obj_cgroup_from_mem_cgroup(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg = NULL;
+
+ rcu_read_lock();
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ objcg = NULL;
+ }
+ rcu_read_unlock();
+
+ return objcg;
+}
+
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
struct obj_cgroup *objcg;
@@ -5356,16 +5372,33 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return ERR_PTR(error);
}
+#ifdef CONFIG_MEMCG_KMEM
+static inline struct obj_cgroup *memcg_obj_cgroup(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg->objcg : NULL;
+}
+#else
+static inline struct obj_cgroup *memcg_obj_cgroup(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
+#endif
+
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg, *old_memcg;
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *old_objcg;
long error = -ENOMEM;
- old_memcg = set_active_memcg(parent);
+ /*
+ * The @parent cannot be offlined, so @parent->objcg cannot be freed
+ * under us.
+ */
+ old_objcg = set_active_objcg(memcg_obj_cgroup(parent));
memcg = mem_cgroup_alloc();
- set_active_memcg(old_memcg);
+ set_active_objcg(old_objcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
Some users of remote memory cgroup charging (e.g. bpf and fsnotify) can switch to the remote objcg charging APIs. Because they just want to charge kernel memory to the remote memcg. Finally, all the kernel memory are charged by using obj_cgroup APIs and do not hold a refcount to the memory cgroup. It can prevent long-living objects from pinning the original memory cgroup in the memory efficiently. Signed-off-by: Muchun Song <songmuchun@bytedance.com> --- fs/buffer.c | 10 ++++++--- fs/notify/fanotify/fanotify.c | 6 +++--- fs/notify/fanotify/fanotify_user.c | 2 +- fs/notify/group.c | 3 ++- fs/notify/inotify/inotify_fsnotify.c | 8 ++++---- fs/notify/inotify/inotify_user.c | 2 +- include/linux/bpf.h | 2 +- include/linux/fsnotify_backend.h | 2 +- include/linux/memcontrol.h | 15 ++++++++++++++ kernel/bpf/syscall.c | 35 ++++++++++++++++---------------- mm/memcontrol.c | 39 +++++++++++++++++++++++++++++++++--- 11 files changed, 89 insertions(+), 35 deletions(-)