
[RFC,bpf-next,v3,4/6] bpf: Introduce BPF_MA_FREE_AFTER_RCU_GP

Message ID 20230429101215.111262-5-houtao@huaweicloud.com (mailing list archive)
State RFC
Delegated to: BPF
Series Handle immediate reuse in bpf memory allocator

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 182 this patch: 182
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 20 this patch: 20
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 182 this patch: 182
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Hou Tao April 29, 2023, 10:12 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

Besides REUSE_AFTER_RCU_GP, also introduce FREE_AFTER_RCU_GP to solve
the immediate reuse problem. Compared with REUSE_AFTER_RCU_GP, the
implementation of FREE_AFTER_RCU_GP is much simpler: it doesn't try to
reuse the freed elements after one RCU grace period has passed, it just
frees them back to the slab subsystem after one RCU GP. The shortcoming
of FREE_AFTER_RCU_GP is that sleepable programs must access these
elements under bpf_rcu_read_{lock,unlock}(), otherwise there will be
use-after-free problems.
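
For illustration only (this is not part of the patch): with
FREE_AFTER_RCU_GP, a sleepable program would need to bracket its
accesses roughly as sketched below. The map, value type and attach
point are arbitrary examples; only the bpf_rcu_read_{lock,unlock}()
kfuncs are real.

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  struct node {
  	long data;
  };

  /* Elements of a non-preallocated hash map come from the bpf memory
   * allocator, so with FREE_AFTER_RCU_GP they go back to slab after
   * one RCU GP.
   */
  struct {
  	__uint(type, BPF_MAP_TYPE_HASH);
  	__uint(map_flags, BPF_F_NO_PREALLOC);
  	__uint(max_entries, 64);
  	__type(key, u32);
  	__type(value, struct node);
  } nodes SEC(".maps");

  void bpf_rcu_read_lock(void) __ksym;
  void bpf_rcu_read_unlock(void) __ksym;

  SEC("fentry.s/inet_sock_destruct")	/* sleepable program */
  int BPF_PROG(read_node)
  {
  	struct node *n;
  	u32 key = 0;

  	bpf_rcu_read_lock();
  	n = bpf_map_lookup_elem(&nodes, &key);
  	if (n) {
  		/* Safe: the element cannot be returned to slab before
  		 * the RCU read-side critical section ends.
  		 */
  		bpf_printk("data %ld", n->data);
  	}
  	bpf_rcu_read_unlock();

  	return 0;
  }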

To simplify the implementation, FREE_AFTER_RCU_GP uses global per-cpu
free lists to temporarily hold the freed elements and uses a per-cpu
kworker to dynamically allocate an RCU callback which frees these
elements back to slab. The kworker is scheduled without delay once the
number of freed elements on a CPU reaches a threshold, otherwise it is
delayed.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf_mem_alloc.h |   1 +
 kernel/bpf/memalloc.c         | 139 ++++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+)

Patch

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index e7f68432713b..61e8556208a2 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -19,6 +19,7 @@  struct bpf_mem_alloc {
 enum {
 	BPF_MA_PERCPU = 1U << 0,
 	BPF_MA_REUSE_AFTER_RCU_GP = 1U << 1,
+	BPF_MA_FREE_AFTER_RCU_GP = 1U << 2,
 };
 
 /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 262100f89610..5f6a4f2cfd37 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -63,7 +63,26 @@  static u8 size_index[24] __ro_after_init = {
 	2	/* 192 */
 };
 
+#define BPF_MA_FREE_TYPE_NR 2
+
+struct bpf_ma_free_ctx {
+	raw_spinlock_t lock;
+	int cpu;
+	local_t active;
+	/* For both no per-cpu and per-cpu */
+	struct llist_head to_free[BPF_MA_FREE_TYPE_NR];
+	unsigned int to_free_cnt[BPF_MA_FREE_TYPE_NR];
+	struct llist_head to_free_extra[BPF_MA_FREE_TYPE_NR];
+	struct delayed_work dwork;
+};
+
+struct bpf_free_batch {
+	struct rcu_head rcu;
+	struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+};
+
 static struct workqueue_struct *bpf_ma_wq;
+static DEFINE_PER_CPU(struct bpf_ma_free_ctx, percpu_free_ctx);
 
 static void bpf_ma_prepare_reuse_work(struct work_struct *work);
 
@@ -910,6 +929,112 @@  static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_n
 		irq_work_raise(c);
 }
 
+static void bpf_ma_batch_free_cb(struct rcu_head *rcu)
+{
+	struct bpf_free_batch *batch = container_of(rcu, struct bpf_free_batch, rcu);
+
+	free_all(batch->to_free[0], false);
+	free_all(batch->to_free[1], true);
+	kfree(batch);
+}
+
+static void bpf_ma_schedule_free_dwork(struct bpf_ma_free_ctx *ctx)
+{
+	long delay, left;
+	u64 to_free_cnt;
+
+	/* TODO: More reasonable threshold ? */
+	to_free_cnt = ctx->to_free_cnt[0] + ctx->to_free_cnt[1];
+	delay = to_free_cnt >= 256 ? 0 : HZ;
+	if (delayed_work_pending(&ctx->dwork)) {
+		left = ctx->dwork.timer.expires - jiffies;
+		if (delay < left)
+			mod_delayed_work(bpf_ma_wq, &ctx->dwork, delay);
+		return;
+	}
+	queue_delayed_work(bpf_ma_wq, &ctx->dwork, delay);
+}
+
+static void splice_llist(struct llist_head *llist, struct llist_node **head)
+{
+	struct llist_node *first, *last;
+
+	first = llist_del_all(llist);
+	if (!first)
+		return;
+
+	last = first;
+	while (last->next)
+		last = last->next;
+	last->next = *head;
+	*head = first;
+}
+
+static void bpf_ma_splice_to_free_list(struct bpf_ma_free_ctx *ctx, struct llist_node **to_free)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	/* Might be interrupted by a NMI which invokes unit_free() */
+	if (ctx->cpu == smp_processor_id())
+		WARN_ON_ONCE(local_inc_return(&ctx->active) != 1);
+	raw_spin_lock(&ctx->lock);
+	to_free[0] = __llist_del_all(&ctx->to_free[0]);
+	to_free[1] = __llist_del_all(&ctx->to_free[1]);
+	ctx->to_free_cnt[0] = 0;
+	ctx->to_free_cnt[1] = 0;
+	raw_spin_unlock(&ctx->lock);
+	if (ctx->cpu == smp_processor_id())
+		local_dec(&ctx->active);
+	local_irq_restore(flags);
+
+	splice_llist(&ctx->to_free_extra[0], &to_free[0]);
+	splice_llist(&ctx->to_free_extra[1], &to_free[1]);
+}
+
+static void bpf_ma_free_dwork(struct work_struct *work)
+{
+	struct bpf_ma_free_ctx *ctx = container_of(to_delayed_work(work),
+						       struct bpf_ma_free_ctx, dwork);
+	struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+	struct bpf_free_batch *batch;
+
+	bpf_ma_splice_to_free_list(ctx, to_free);
+
+	batch = kmalloc(sizeof(*batch), GFP_KERNEL);
+	if (!batch) {
+		synchronize_rcu_expedited();
+		free_all(to_free[0], false);
+		free_all(to_free[1], true);
+		return;
+	}
+
+	batch->to_free[0] = to_free[0];
+	batch->to_free[1] = to_free[1];
+	call_rcu(&batch->rcu, bpf_ma_batch_free_cb);
+}
+
+static void notrace wait_gp_direct_free(struct bpf_mem_cache *c, struct llist_node *llnode)
+{
+	bool percpu = !!c->percpu_size;
+	struct bpf_ma_free_ctx *ctx;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ctx = this_cpu_ptr(&percpu_free_ctx);
+	if (local_inc_return(&ctx->active) == 1) {
+		raw_spin_lock(&ctx->lock);
+		__llist_add(llnode, &ctx->to_free[percpu]);
+		ctx->to_free_cnt[percpu] += 1;
+		bpf_ma_schedule_free_dwork(ctx);
+		raw_spin_unlock(&ctx->lock);
+	} else {
+		llist_add(llnode, &ctx->to_free_extra[percpu]);
+	}
+	local_dec(&ctx->active);
+	local_irq_restore(flags);
+}
+
 static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 {
 	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
@@ -918,6 +1043,8 @@  static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 
 	if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP)
 		wait_gp_reuse_free(c, llnode);
+	else if (c->flags & BPF_MA_FREE_AFTER_RCU_GP)
+		wait_gp_direct_free(c, llnode);
 	else
 		immediate_reuse_free(c, llnode);
 }
@@ -1016,8 +1143,20 @@  void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 
 static int __init bpf_ma_init(void)
 {
+	int cpu;
+
 	bpf_ma_wq = alloc_workqueue("bpf_ma", WQ_MEM_RECLAIM, 0);
 	BUG_ON(!bpf_ma_wq);
+
+	for_each_possible_cpu(cpu) {
+		struct bpf_ma_free_ctx *ctx;
+
+		ctx = per_cpu_ptr(&percpu_free_ctx, cpu);
+		raw_spin_lock_init(&ctx->lock);
+		ctx->cpu = cpu;
+		INIT_DELAYED_WORK(&ctx->dwork, bpf_ma_free_dwork);
+	}
+
 	return 0;
 }
 late_initcall(bpf_ma_init);
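
A usage note rather than part of the patch: assuming the earlier
patches in this series make bpf_mem_alloc_init() accept these flag
bits (as BPF_MA_PERCPU and BPF_MA_REUSE_AFTER_RCU_GP suggest), a map
implementation would opt in to the new mode roughly like this (the
htab names are only illustrative):

  err = bpf_mem_alloc_init(&htab->ma, htab->elem_size,
			   BPF_MA_FREE_AFTER_RCU_GP);
  if (err)
  	goto free_map_locked;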