diff mbox series

[bpf-next,v5,06/34] bpf: prepare for memcg-based memory accounting for bpf maps

Message ID 20201112221543.3621014-7-guro@fb.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: switch to memcg-based memory accounting | expand

Commit Message

Roman Gushchin Nov. 12, 2020, 10:15 p.m. UTC
In the vast majority of cases, when a process makes a kernel
allocation, its memory cgroup is charged.

Bpf maps can be updated from an interrupt context, and in such a
case there is no process which can be charged. This makes the memory
accounting of bpf maps non-trivial.

Fortunately, after commits 4127c6504f25 ("mm: kmem: enable kernel
memcg accounting from interrupt contexts") and b87d8cefe43c
("mm, memcg: rework remote charging API to support nesting")
it's finally possible.

To do this, a pointer to the memory cgroup of the process which created
the map is saved, and this cgroup is charged for all
allocations made from an interrupt context.

Allocations made from a process context will be accounted in the usual way.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/bpf.h  |  4 ++++
 kernel/bpf/helpers.c | 37 ++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c | 25 +++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 1 deletion(-)

Comments

Song Liu Nov. 13, 2020, 5:46 p.m. UTC | #1
> On Nov 12, 2020, at 2:15 PM, Roman Gushchin <guro@fb.com> wrote:

[...]

> 
> +#ifdef CONFIG_MEMCG_KMEM
> +static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
> +						 void *value, u64 flags)
> +{
> +	struct mem_cgroup *old_memcg;
> +	bool in_interrupt;
> +	int ret;
> +
> +	/*
> +	 * If update from an interrupt context results in a memory allocation,
> +	 * the memory cgroup to charge can't be determined from the context
> +	 * of the current task. Instead, we charge the memory cgroup, which
> +	 * contained a process created the map.
> +	 */
> +	in_interrupt = in_interrupt();
> +	if (in_interrupt)
> +		old_memcg = set_active_memcg(map->memcg);

set_active_memcg() checks in_interrupt() again. Maybe we can introduce another
helper to avoid checking it twice? Something like

static inline struct mem_cgroup *
set_active_memcg_int(struct mem_cgroup *memcg)
{
        struct mem_cgroup *old;

        old = this_cpu_read(int_active_memcg);
        this_cpu_write(int_active_memcg, memcg);
        return old;
}

Thanks,
Song

[...]
Roman Gushchin Nov. 13, 2020, 7:40 p.m. UTC | #2
On Fri, Nov 13, 2020 at 09:46:49AM -0800, Song Liu wrote:
> 
> 
> > On Nov 12, 2020, at 2:15 PM, Roman Gushchin <guro@fb.com> wrote:
> 
> [...]
> 
> > 
> > +#ifdef CONFIG_MEMCG_KMEM
> > +static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
> > +						 void *value, u64 flags)
> > +{
> > +	struct mem_cgroup *old_memcg;
> > +	bool in_interrupt;
> > +	int ret;
> > +
> > +	/*
> > +	 * If update from an interrupt context results in a memory allocation,
> > +	 * the memory cgroup to charge can't be determined from the context
> > +	 * of the current task. Instead, we charge the memory cgroup, which
> > +	 * contained a process created the map.
> > +	 */
> > +	in_interrupt = in_interrupt();
> > +	if (in_interrupt)
> > +		old_memcg = set_active_memcg(map->memcg);
> 
> set_active_memcg() checks in_interrupt() again. Maybe we can introduce another
> helper to avoid checking it twice? Something like
> 
> static inline struct mem_cgroup *
> set_active_memcg_int(struct mem_cgroup *memcg)
> {
>         struct mem_cgroup *old;
> 
>         old = this_cpu_read(int_active_memcg);
>         this_cpu_write(int_active_memcg, memcg);
>         return old;
> }

Yeah, it's a good idea!

The in_interrupt() check is very cheap (like checking some bits in a per-cpu variable),
so I don't think there will be any measurable difference. So I suggest implementing
it later as an enhancement on top (maybe in the next merge window), to avoid another
delay. Otherwise I'll need to send a patch to mm@, wait for reviews and inclusion
into the mm tree, etc. Does that work for you?

Thanks!
Song Liu Nov. 13, 2020, 8:48 p.m. UTC | #3
> On Nov 13, 2020, at 11:40 AM, Roman Gushchin <guro@fb.com> wrote:
> 
> On Fri, Nov 13, 2020 at 09:46:49AM -0800, Song Liu wrote:
>> 
>> 
>>> On Nov 12, 2020, at 2:15 PM, Roman Gushchin <guro@fb.com> wrote:
>> 
>> [...]
>> 
>>> 
>>> +#ifdef CONFIG_MEMCG_KMEM
>>> +static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
>>> +						 void *value, u64 flags)
>>> +{
>>> +	struct mem_cgroup *old_memcg;
>>> +	bool in_interrupt;
>>> +	int ret;
>>> +
>>> +	/*
>>> +	 * If update from an interrupt context results in a memory allocation,
>>> +	 * the memory cgroup to charge can't be determined from the context
>>> +	 * of the current task. Instead, we charge the memory cgroup, which
>>> +	 * contained a process created the map.
>>> +	 */
>>> +	in_interrupt = in_interrupt();
>>> +	if (in_interrupt)
>>> +		old_memcg = set_active_memcg(map->memcg);
>> 
>> set_active_memcg() checks in_interrupt() again. Maybe we can introduce another
>> helper to avoid checking it twice? Something like
>> 
>> static inline struct mem_cgroup *
>> set_active_memcg_int(struct mem_cgroup *memcg)
>> {
>>        struct mem_cgroup *old;
>> 
>>        old = this_cpu_read(int_active_memcg);
>>        this_cpu_write(int_active_memcg, memcg);
>>        return old;
>> }
> 
> Yeah, it's a good idea!
> 
> in_interrupt() check is very cheap (like checking some bits in a per-cpu variable),
> so I don't think there will be any measurable difference. So I suggest to implement
> it later as an enhancement on top (maybe in the next merge window), to avoid an another
> delay. Otherwise I'll need to send a patch to mm@, wait for reviews and an inclusion
> into the mm tree, etc). Does it work for you?

Yeah, that works. 

Acked-by: Song Liu <songliubraving@fb.com>
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 581b2a2e78eb..1d6e7b125877 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,7 @@  struct bpf_iter_aux_info;
 struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
+struct mem_cgroup;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -161,6 +162,9 @@  struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	struct bpf_map_memory memory;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg;
+#endif
 	char name[BPF_OBJ_NAME_LEN];
 	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 25520f5eeaf6..b6327cbe7e41 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -14,6 +14,7 @@ 
 #include <linux/jiffies.h>
 #include <linux/pid_namespace.h>
 #include <linux/proc_ns.h>
+#include <linux/sched/mm.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -41,11 +42,45 @@  const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
+#ifdef CONFIG_MEMCG_KMEM
+static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
+						 void *value, u64 flags)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	int ret;
+
+	/*
+	 * If update from an interrupt context results in a memory allocation,
+	 * the memory cgroup to charge can't be determined from the context
+	 * of the current task. Instead, we charge the memory cgroup that
+	 * contained the process which created the map.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ret = map->ops->map_update_elem(map, key, value, flags);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ret;
+}
+#else
+static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
+						 void *value, u64 flags)
+{
+	return map->ops->map_update_elem(map, key, value, flags);
+}
+#endif
+
 BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 	   void *, value, u64, flags)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return map->ops->map_update_elem(map, key, value, flags);
+
+	return __bpf_map_update_elem(map, key, value, flags);
 }
 
 const struct bpf_func_proto bpf_map_update_elem_proto = {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f3fe9f53f93c..2d77fc2496da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -31,6 +31,7 @@ 
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -456,6 +457,27 @@  void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 		__release(&map_idr_lock);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+	map->memcg = get_mem_cgroup_from_mm(current->mm);
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+	mem_cgroup_put(map->memcg);
+}
+
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+}
+#endif
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -464,6 +486,7 @@  static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
+	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 	bpf_map_charge_finish(&mem);
@@ -875,6 +898,8 @@  static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_sec;
 
+	bpf_map_save_memcg(map);
+
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.