
[RFC,bpf-next,4/6] bpf: Introduce BPF_MA_NO_REUSE for bpf memory allocator

Message ID 20221230041151.1231169-5-houtao@huaweicloud.com (mailing list archive)
State RFC
Delegated to: BPF
Series bpf: Handle reuse in bpf memory alloc

Checks

Context Check Description
bpf/vmtest-bpf-next-PR pending PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 pending Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 pending Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 50 this patch: 50
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 1 this patch: 1
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 50 this patch: 50
netdev/checkpatch warning WARNING: Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants WARNING: line length of 100 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 2
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc

Commit Message

Hou Tao Dec. 30, 2022, 4:11 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

Currently a freed element in the bpf memory allocator may be reused by a
new allocation, and this reuse can lead to two problems. The first
problem is that the lookup procedure may return an incorrect result if
the found element is freed and then reused. The second problem is that
the lookup procedure may still be using special fields in the map value
or allocated object while these fields are being reinitialized by the
new allocation. The latter problem can be mitigated by using a ctor in
the bpf memory allocator, but that only works when all elements have the
same type.
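
As a toy illustration of the second problem, here is a minimal userspace
sketch (hypothetical names, not kernel code) in which a stale pointer
held across the free observes the fields reinitialized by a new
allocation that reuses the element:

/* Toy userspace model: a one-element cache where a freed object becomes
 * reusable right away, as on the current free_llist path.
 */
#include <stdio.h>

struct elem {
	long key;
	long special;	/* stands in for e.g. a bpf_list_head in a map value */
};

static struct elem pool;
static struct elem *free_elem = &pool;

static struct elem *toy_alloc(void)
{
	struct elem *e = free_elem;

	free_elem = NULL;
	return e;
}

static void toy_free(struct elem *e)
{
	free_elem = e;		/* immediately available for reuse */
}

int main(void)
{
	struct elem *e = toy_alloc();
	struct elem *lookup, *n;

	e->key = 1;
	e->special = 42;

	/* a lookup running on another CPU still holds the element ... */
	lookup = e;

	/* ... the element is deleted and freed ... */
	toy_free(e);

	/* ... and a new allocation reuses and reinitializes it */
	n = toy_alloc();
	n->key = 2;
	n->special = 0;

	/* the still-running lookup now observes the reinitialized field */
	printf("lookup->special = %ld (was 42)\n", lookup->special);
	return 0;
}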

So introduce BPF_MA_NO_REUSE to disable immediate reuse of freed
elements. With this flag set, freed elements are moved into a global
per-cpu free list instead. Once the number of freed elements reaches a
threshold, they are moved into a dynamically allocated batch object and
freed by a global per-cpu worker through call_rcu_tasks_trace().
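
For illustration, a condensed userspace sketch of this flow follows
(hypothetical names; locking, the per-cpu contexts and the RCU tasks
trace grace period are omitted). The 256-element threshold and the
1 jiffy vs. HZ delay mirror bpf_ma_schedule_free_dwork() in the patch
below:

#include <stdio.h>
#include <stdlib.h>

#define HZ 1000			/* stand-in value for the toy */

struct pending_node {
	struct pending_node *next;
};

struct free_ctx {
	struct pending_node *to_free;
	unsigned int to_free_cnt;
};

/* delay (in jiffies) before the free worker should run */
static long free_delay(const struct free_ctx *ctx)
{
	return ctx->to_free_cnt >= 256 ? 1 : HZ;
}

static void queue_free(struct free_ctx *ctx, struct pending_node *n)
{
	n->next = ctx->to_free;
	ctx->to_free = n;
	ctx->to_free_cnt++;
}

/* worker body: splice out the pending list and free it as one batch
 * (the real worker hands the batch to call_rcu_tasks_trace() first) */
static void free_worker(struct free_ctx *ctx)
{
	struct pending_node *batch = ctx->to_free;

	ctx->to_free = NULL;
	ctx->to_free_cnt = 0;
	while (batch) {
		struct pending_node *next = batch->next;

		free(batch);
		batch = next;
	}
}

int main(void)
{
	struct free_ctx ctx = { NULL, 0 };
	unsigned int i;

	for (i = 0; i < 300; i++) {
		struct pending_node *n = malloc(sizeof(*n));

		if (!n)
			break;
		queue_free(&ctx, n);
	}

	printf("pending %u, worker delay %ld jiffies\n",
	       ctx.to_free_cnt, free_delay(&ctx));
	free_worker(&ctx);
	return 0;
}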

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf_mem_alloc.h |   2 +
 kernel/bpf/memalloc.c         | 175 +++++++++++++++++++++++++++++++++-
 2 files changed, 173 insertions(+), 4 deletions(-)

Patch

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index b9f6b9155fa5..2a10b721832d 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -19,6 +19,8 @@  struct bpf_mem_alloc {
 /* flags for bpf_mem_alloc_init() */
 enum {
 	BPF_MA_PERCPU = 1,
+	/* Don't reuse freed elements during allocation */
+	BPF_MA_NO_REUSE = 2,
 };
 
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags,
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 454c86596111..e5eaf765624b 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -35,6 +35,23 @@ 
  */
 #define LLIST_NODE_SZ sizeof(struct llist_node)
 
+#define BPF_MA_FREE_TYPE_NR 2
+
+struct bpf_ma_free_context {
+	raw_spinlock_t lock;
+	local_t active;
+	/* For both no per-cpu and per-cpu */
+	struct llist_head to_free[BPF_MA_FREE_TYPE_NR];
+	unsigned int to_free_cnt[BPF_MA_FREE_TYPE_NR];
+	struct llist_head to_free_extra[BPF_MA_FREE_TYPE_NR];
+	struct delayed_work dwork;
+};
+
+struct bpf_ma_free_batch {
+	struct rcu_head rcu;
+	struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+};
+
 /* similar to kmalloc, but sizeof == 8 bucket is gone */
 static u8 size_index[24] __ro_after_init = {
 	3,	/* 8 */
@@ -63,6 +80,9 @@  static u8 size_index[24] __ro_after_init = {
 	2	/* 192 */
 };
 
+static DEFINE_PER_CPU(struct bpf_ma_free_context, percpu_free_ctx);
+static struct workqueue_struct *bpf_ma_free_wq;
+
 static int bpf_mem_cache_idx(size_t size)
 {
 	if (!size || size > 4096)
@@ -609,14 +629,11 @@  static void notrace *unit_alloc(struct bpf_mem_cache *c)
  * add it to the free_llist of the current cpu.
  * Let kfree() logic deal with it when it's later called from irq_work.
  */
-static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+static void notrace reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
 {
-	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
 	unsigned long flags;
 	int cnt = 0;
 
-	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
-
 	local_irq_save(flags);
 	if (local_inc_return(&c->active) == 1) {
 		__llist_add(llnode, &c->free_llist);
@@ -638,6 +655,137 @@  static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		irq_work_raise(c);
 }
 
+static void batch_free_rcu(struct rcu_head *rcu)
+{
+	struct bpf_ma_free_batch *batch = container_of(rcu, struct bpf_ma_free_batch, rcu);
+
+	free_llist(batch->to_free[0], false);
+	free_llist(batch->to_free[1], true);
+	kfree(batch);
+}
+
+static void batch_free_rcu_tasks_trace(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		batch_free_rcu(rcu);
+	else
+		call_rcu(rcu, batch_free_rcu);
+}
+
+static void bpf_ma_schedule_free_dwork(struct bpf_ma_free_context *ctx)
+{
+	long delay, left;
+	u64 to_free_cnt;
+
+	to_free_cnt = ctx->to_free_cnt[0] + ctx->to_free_cnt[1];
+	delay = to_free_cnt >= 256 ? 1 : HZ;
+	if (delayed_work_pending(&ctx->dwork)) {
+		left = ctx->dwork.timer.expires - jiffies;
+		if (delay < left)
+			mod_delayed_work(bpf_ma_free_wq, &ctx->dwork, delay);
+		return;
+	}
+	queue_delayed_work(bpf_ma_free_wq, &ctx->dwork, delay);
+}
+
+static void bpf_ma_splice_to_free_list(struct bpf_ma_free_context *ctx, struct llist_node **to_free)
+{
+	struct llist_node *tmp[BPF_MA_FREE_TYPE_NR];
+	unsigned long flags;
+	unsigned int i;
+
+	raw_spin_lock_irqsave(&ctx->lock, flags);
+	for (i = 0; i < ARRAY_SIZE(tmp); i++) {
+		tmp[i] = __llist_del_all(&ctx->to_free[i]);
+		ctx->to_free_cnt[i] = 0;
+	}
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+	for (i = 0; i < ARRAY_SIZE(tmp); i++) {
+		struct llist_node *first, *last;
+
+		first = llist_del_all(&ctx->to_free_extra[i]);
+		if (!first) {
+			to_free[i] = tmp[i];
+			continue;
+		}
+		last = first;
+		while (last->next)
+			last = last->next;
+		to_free[i] = first;
+		last->next = tmp[i];
+	}
+}
+
+static inline bool bpf_ma_has_to_free(const struct bpf_ma_free_context *ctx)
+{
+	return !llist_empty(&ctx->to_free[0]) || !llist_empty(&ctx->to_free[1]) ||
+	       !llist_empty(&ctx->to_free_extra[0]) || !llist_empty(&ctx->to_free_extra[1]);
+}
+
+static void bpf_ma_free_dwork(struct work_struct *work)
+{
+	struct bpf_ma_free_context *ctx = container_of(to_delayed_work(work),
+						       struct bpf_ma_free_context, dwork);
+	struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+	struct bpf_ma_free_batch *batch;
+	unsigned long flags;
+
+	bpf_ma_splice_to_free_list(ctx, to_free);
+
+	batch = kmalloc(sizeof(*batch), GFP_NOWAIT | __GFP_NOWARN);
+	if (!batch) {
+		/* TODO: handle ENOMEM case better ? */
+		rcu_barrier_tasks_trace();
+		rcu_barrier();
+		free_llist(to_free[0], false);
+		free_llist(to_free[1], true);
+		goto check;
+	}
+
+	batch->to_free[0] = to_free[0];
+	batch->to_free[1] = to_free[1];
+	call_rcu_tasks_trace(&batch->rcu, batch_free_rcu_tasks_trace);
+check:
+	raw_spin_lock_irqsave(&ctx->lock, flags);
+	if (bpf_ma_has_to_free(ctx))
+		bpf_ma_schedule_free_dwork(ctx);
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+static void notrace direct_free(struct bpf_mem_cache *c, struct llist_node *llnode)
+{
+	struct bpf_ma_free_context *ctx;
+	bool percpu = !!c->percpu_size;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ctx = this_cpu_ptr(&percpu_free_ctx);
+	if (local_inc_return(&ctx->active) == 1) {
+		raw_spin_lock(&ctx->lock);
+		__llist_add(llnode, &ctx->to_free[percpu]);
+		ctx->to_free_cnt[percpu] += 1;
+		bpf_ma_schedule_free_dwork(ctx);
+		raw_spin_unlock(&ctx->lock);
+	} else {
+		llist_add(llnode, &ctx->to_free_extra[percpu]);
+	}
+	local_dec(&ctx->active);
+	local_irq_restore(flags);
+}
+
+static inline void unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+
+	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+	if (c->ma->flags & BPF_MA_NO_REUSE)
+		direct_free(c, llnode);
+	else
+		reuse_free(c, llnode);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
@@ -686,3 +834,22 @@  void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
 
 	unit_free(this_cpu_ptr(ma->cache), ptr);
 }
+
+static int __init bpf_ma_init(void)
+{
+	int cpu;
+
+	bpf_ma_free_wq = alloc_workqueue("bpf_ma_free", WQ_MEM_RECLAIM, 0);
+	BUG_ON(!bpf_ma_free_wq);
+
+	for_each_possible_cpu(cpu) {
+		struct bpf_ma_free_context *ctx;
+
+		ctx = per_cpu_ptr(&percpu_free_ctx, cpu);
+		raw_spin_lock_init(&ctx->lock);
+		INIT_DELAYED_WORK(&ctx->dwork, bpf_ma_free_dwork);
+	}
+
+	return 0;
+}
+fs_initcall(bpf_ma_init);