diff mbox series

[bpf-next,v7,06/34] bpf: prepare for memcg-based memory accounting for bpf maps

Message ID 20201119173754.4125257-7-guro@fb.com (mailing list archive)
State New, archived
Headers show
Series bpf: switch to memcg-based memory accounting | expand

Commit Message

Roman Gushchin Nov. 19, 2020, 5:37 p.m. UTC
In the absolute majority of cases if a process is making a kernel
allocation, it's memory cgroup is getting charged.

Bpf maps can be updated from an interrupt context and in such
case there is no process which can be charged. It makes the memory
accounting of bpf maps non-trivial.

Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
memcg accounting from interrupt contexts") and b87d8cefe43c
("mm, memcg: rework remote charging API to support nesting")
it's finally possible.

To do it, a pointer to the memory cgroup of the process, which created
the map, is saved, and this cgroup can be charged for all allocations
made from an interrupt context. This commit introduces 2 helpers:
bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
the bpf code for accounted memory allocations, both in the process and
interrupt contexts. In the interrupt context they're using the saved
memory cgroup, otherwise the current cgroup is getting charged.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/bpf.h  | 26 +++++++++++++++
 kernel/bpf/syscall.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

Comments

Song Liu Nov. 20, 2020, 1:05 a.m. UTC | #1
> On Nov 19, 2020, at 9:37 AM, Roman Gushchin <guro@fb.com> wrote:
> 
> In the absolute majority of cases if a process is making a kernel
> allocation, it's memory cgroup is getting charged.
> 
> Bpf maps can be updated from an interrupt context and in such
> case there is no process which can be charged. It makes the memory
> accounting of bpf maps non-trivial.
> 
> Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
> memcg accounting from interrupt contexts") and b87d8cefe43c
> ("mm, memcg: rework remote charging API to support nesting")
> it's finally possible.
> 
> To do it, a pointer to the memory cgroup of the process, which created
> the map, is saved, and this cgroup can be charged for all allocations
> made from an interrupt context. This commit introduces 2 helpers:
> bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
> the bpf code for accounted memory allocations, both in the process and
> interrupt contexts. In the interrupt context they're using the saved
> memory cgroup, otherwise the current cgroup is getting charged.
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>

Acked-by: Song Liu <songliubraving@fb.com>

[...]
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1bcb6d7345c..b11436cb9e3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -20,6 +20,7 @@ 
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/capability.h>
+#include <linux/slab.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -37,6 +38,7 @@  struct bpf_iter_aux_info;
 struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
+struct mem_cgroup;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -161,6 +163,9 @@  struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	struct bpf_map_memory memory;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg;
+#endif
 	char name[BPF_OBJ_NAME_LEN];
 	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
@@ -1240,6 +1245,27 @@  int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
+#ifdef CONFIG_MEMCG_KMEM
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node);
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t gfp);
+#else
+static inline void *
+bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+		     int node)
+{
+	return kmalloc_node(size, flags, node);
+}
+
+static inline void __percpu *
+bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
+		     gfp_t gfp)
+{
+	return __alloc_percpu_gfp(size, align, gfp);
+}
+#endif
+
 extern int sysctl_unprivileged_bpf_disabled;
 
 static inline bool bpf_allow_ptr_leaks(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f3fe9f53f93c..4154c616788c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -31,6 +31,8 @@ 
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/mm.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -456,6 +458,77 @@  void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 		__release(&map_idr_lock);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+	map->memcg = get_mem_cgroup_from_mm(current->mm);
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+	mem_cgroup_put(map->memcg);
+}
+
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	void *ptr;
+
+	/*
+	 * If the memory allocation is performed from an interrupt context,
+	 * the memory cgroup to charge can't be determined from the context
+	 * of the current task. Instead, we charge the memory cgroup, which
+	 * contained the process created the map.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ptr = kmalloc_node(size, flags, node);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t gfp)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	void *ptr;
+
+	/*
+	 * If the memory allocation is performed from an interrupt context,
+	 * the memory cgroup to charge can't be determined from the context
+	 * of the current task. Instead, we charge the memory cgroup, which
+	 * contained the process created the map.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ptr = __alloc_percpu_gfp(size, align, gfp);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+}
+#endif
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -464,6 +537,7 @@  static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
+	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 	bpf_map_charge_finish(&mem);
@@ -875,6 +949,8 @@  static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_sec;
 
+	bpf_map_save_memcg(map);
+
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.