
[RFC,bpf-next,v3,3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP

Message ID 20230429101215.111262-4-houtao@huaweicloud.com (mailing list archive)
State Superseded
Series Handle immediate reuse in bpf memory allocator

Commit Message

Hou Tao April 29, 2023, 10:12 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

Currently the objects freed by the bpf memory allocator may be reused
immediately by a new allocation. This introduces a use-after-bpf-ma-free
problem for non-preallocated hash maps and makes the lookup procedure
return incorrect results. The immediate reuse also makes introducing
new use cases (e.g. qp-trie) more difficult.

So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. For
BPF_MA_REUSE_AFTER_GP, the freed objects are reused only after one RCU
grace period and may be returned back to slab system after another
RCU-tasks-trace grace period. So bpf programs which care about the reuse
problem can use bpf_rcu_read_{lock,unlock}() to access the freed objects
safely, and for programs which don't care, a use-after-bpf-ma-free is
still memory-safe because the objects have not yet been freed back to the
slab system by the bpf memory allocator.
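
As an illustration, a minimal sketch of a sleepable bpf program that relies
on this guarantee; the map, value layout, attach point and kfunc declarations
below are assumptions, not part of this series:

/* Hypothetical sketch: under BPF_MA_REUSE_AFTER_RCU_GP, wrapping the
 * access in bpf_rcu_read_lock()/bpf_rcu_read_unlock() ensures a
 * concurrently deleted element cannot be reused by a new allocation
 * while it is being read.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* non-preallocated: uses bpf ma */
	__uint(max_entries, 1024);
	__type(key, u32);
	__type(value, u64);
} htab SEC(".maps");

SEC("fentry.s/__x64_sys_getpgid")	/* sleepable; x86-64 symbol assumed */
int BPF_PROG(read_under_rcu)
{
	u32 key = 0;
	u64 *val;

	bpf_rcu_read_lock();
	val = bpf_map_lookup_elem(&htab, &key);
	if (val)
		bpf_printk("val=%llu", *val);	/* element cannot be reused here */
	bpf_rcu_read_unlock();
	return 0;
}

char LICENSE[] SEC("license") = "GPL";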

To make the freed elements reusable quickly, BPF_MA_REUSE_AFTER_RCU_GP
dynamically allocates memory to create multiple inflight RCU callbacks
which mark the freed elements as reusable. The memory used for
bpf_reuse_batch is freed when the corresponding RCU callback completes.
When no memory is available, synchronize_rcu_expedited() is used instead
to make the freed elements reusable. To reduce the risk of OOM, part of
the reusable memory is freed back to the slab system after an additional
RCU-tasks-trace grace period. Until that freeing happens, the memory is
still available for reuse.
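
For context, a hedged sketch of how a map implementation could opt in to the
new flag; only bpf_mem_alloc_init(), bpf_mem_cache_free() and
BPF_MA_REUSE_AFTER_RCU_GP come from the allocator API touched by this series,
the surrounding helpers are hypothetical:

#include <linux/bpf_mem_alloc.h>

/* Sketch only: an example map init path opting into reuse-after-RCU-GP. */
static int example_map_alloc_init(struct bpf_mem_alloc *ma, int elem_size)
{
	/* Freed elements are handed out again only after an RCU grace
	 * period and go back to the slab system after a further
	 * RCU-tasks-trace grace period.
	 */
	return bpf_mem_alloc_init(ma, elem_size, BPF_MA_REUSE_AFTER_RCU_GP);
}

static void example_elem_free(struct bpf_mem_alloc *ma, void *elem)
{
	/* With the flag set this lands in wait_gp_reuse_free() below
	 * instead of making the element immediately reusable.
	 */
	bpf_mem_cache_free(ma, elem);
}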

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf_mem_alloc.h |   1 +
 kernel/bpf/memalloc.c         | 353 +++++++++++++++++++++++++++++++---
 2 files changed, 326 insertions(+), 28 deletions(-)

Comments

Martin KaFai Lau May 1, 2023, 11:59 p.m. UTC | #1
On 4/29/23 3:12 AM, Hou Tao wrote:
> +static void bpf_ma_prepare_reuse_work(struct work_struct *work)
> +{
> +	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, reuse_work);
> +	struct llist_node *head, *tail, *llnode, *tmp;
> +	struct bpf_reuse_batch *batch;
> +	unsigned long flags;
> +	bool do_free;
> +
> +	local_irq_save(flags);
> +	/* When CPU is offline, the running CPU may be different from
> +	 * the CPU which submitted the work. When these two CPUs are the same,
> +	 * kworker may be interrupted by NMI, so increase active to protect
> +	 * against such concurrency.
> +	 */
> +	if (c->cpu == smp_processor_id())
> +		WARN_ON_ONCE(local_inc_return(&c->active) != 1);
> +	raw_spin_lock(&c->reuse_lock);
> +	head = __llist_del_all(&c->prepare_reuse_head);
> +	tail = c->prepare_reuse_tail;
> +	c->prepare_reuse_tail = NULL;
> +	c->prepare_reuse_cnt = 0;
> +	if (c->cpu == smp_processor_id())
> +		local_dec(&c->active);
> +
> +	/* Try to free elements in reusable list. Before these elements are
> +	 * freed in RCU cb, these elements will still be available for reuse.
> +	 */
> +	do_free = bpf_ma_try_free_reuse_objs(c);
> +	raw_spin_unlock(&c->reuse_lock);
> +	local_irq_restore(flags);
> +
> +	if (do_free)
> +		call_rcu_tasks_trace(&c->rcu, bpf_ma_free_reusable_cb);
> +
> +	llist_for_each_safe(llnode, tmp, llist_del_all(&c->free_llist_extra)) {
> +		if (!head)
> +			tail = llnode;
> +		llnode->next = head;
> +		head = llnode->next;
> +	}
> +	/* Draining is in progress ? */
> +	if (!head) {
> +		/* kworker completes and no RCU callback */
> +		atomic_dec(&c->reuse_cb_in_progress);
> +		return;
> +	}
> +
> +	batch = kmalloc(sizeof(*batch), GFP_KERNEL);
> +	if (!batch) {
> +		synchronize_rcu_expedited();
> +		bpf_ma_add_to_reuse_ready_or_free(c, head, tail);
> +		/* kworker completes and no RCU callback */
> +		atomic_dec(&c->reuse_cb_in_progress);
> +		return;
> +	}
> +
> +	batch->c = c;
> +	batch->head = head;
> +	batch->tail = tail;
> +	call_rcu(&batch->rcu, bpf_ma_reuse_cb);
> +}
> +
> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
> +{
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	/* In case a NMI-context bpf program is also freeing object. */
> +	if (local_inc_return(&c->active) == 1) {
> +		bool try_queue_work = false;
> +
> +		/* kworker may remove elements from prepare_reuse_head */
> +		raw_spin_lock(&c->reuse_lock);
> +		if (llist_empty(&c->prepare_reuse_head))
> +			c->prepare_reuse_tail = llnode;
> +		__llist_add(llnode, &c->prepare_reuse_head);
> +		if (++c->prepare_reuse_cnt > c->high_watermark) {
> +			/* Zero out prepare_reuse_cnt early to prevent
> +			 * unnecessary queue_work().
> +			 */
> +			c->prepare_reuse_cnt = 0;
> +			try_queue_work = true;
> +		}
> +		raw_spin_unlock(&c->reuse_lock);
> +
> +		if (try_queue_work && !work_pending(&c->reuse_work)) {
> +			/* Use reuse_cb_in_progress to indicate there is
> +			 * inflight reuse kworker or reuse RCU callback.
> +			 */
> +			atomic_inc(&c->reuse_cb_in_progress);
> +			/* Already queued */
> +			if (!queue_work(bpf_ma_wq, &c->reuse_work))

queue_work will be called from a bpf program (e.g. bpf_mem_cache_free -> 
unit_free -> queue_work). Is it safe from recursion and deadlock?
eg. what if a tracing bpf prog is attached to some functions in workqueue.c 
after acquiring a workqueue related spin lock and that tracing bpf prog is doing 
unit_free?
Not a workqueue expert. Asking because it is not obvious to me considering there 
is a lot of ground to cover in workqueue.c.

I wonder what happened to the current bpf memalloc approach of postponing work
to irq_work. v2 mentioned it does not work well. Did you figure out why?
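
To make the concern concrete, a hypothetical sketch of such a tracing program;
the attach point (queue_work_on() entry, which runs before the pool lock is
taken; the riskier case is a function called with that lock already held) and
the map are assumptions:

/* Hypothetical illustration of the re-entrancy being asked about: a
 * tracing program attached inside the workqueue code that deletes an
 * element from a non-preallocated hash map, which ends up in
 * bpf_mem_cache_free() -> unit_free() -> queue_work() again.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* non-preallocated: uses bpf ma */
	__uint(max_entries, 128);
	__type(key, u32);
	__type(value, u64);
} htab SEC(".maps");

SEC("fentry/queue_work_on")		/* attach point is an assumption */
int BPF_PROG(on_queue_work, int cpu, struct workqueue_struct *wq,
	     struct work_struct *work)
{
	u32 key = 0;

	/* Deleting an element may free it via the bpf memory allocator,
	 * which with this patch can call queue_work() again.
	 */
	bpf_map_delete_elem(&htab, &key);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";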

> +				atomic_dec(&c->reuse_cb_in_progress);
> +		}
> +	} else {
> +		llist_add(llnode, &c->free_llist_extra);
> +	}
> +	local_dec(&c->active);
> +	local_irq_restore(flags);
> +}
> +
>   /* Though 'ptr' object could have been allocated on a different cpu
>    * add it to the free_llist of the current cpu.
>    * Let kfree() logic deal with it when it's later called from irq_work.
>    */
> -static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
> +static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
>   {
> -	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
>   	unsigned long flags;
>   	int cnt = 0;
>   
> -	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
> -
>   	local_irq_save(flags);
>   	if (local_inc_return(&c->active) == 1) {
>   		__llist_add(llnode, &c->free_llist);
> @@ -633,6 +910,18 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
>   		irq_work_raise(c);
>   }
>   
> +static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
> +{
> +	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
> +
> +	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
> +
> +	if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP)
> +		wait_gp_reuse_free(c, llnode);
> +	else
> +		immediate_reuse_free(c, llnode);
> +}
> +
Alexei Starovoitov May 3, 2023, 6:48 p.m. UTC | #2
On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
> +
> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
> +{
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	/* In case a NMI-context bpf program is also freeing object. */
> +	if (local_inc_return(&c->active) == 1) {
> +		bool try_queue_work = false;
> +
> +		/* kworker may remove elements from prepare_reuse_head */
> +		raw_spin_lock(&c->reuse_lock);
> +		if (llist_empty(&c->prepare_reuse_head))
> +			c->prepare_reuse_tail = llnode;
> +		__llist_add(llnode, &c->prepare_reuse_head);
> +		if (++c->prepare_reuse_cnt > c->high_watermark) {
> +			/* Zero out prepare_reuse_cnt early to prevent
> +			 * unnecessary queue_work().
> +			 */
> +			c->prepare_reuse_cnt = 0;
> +			try_queue_work = true;
> +		}
> +		raw_spin_unlock(&c->reuse_lock);
> +
> +		if (try_queue_work && !work_pending(&c->reuse_work)) {
> +			/* Use reuse_cb_in_progress to indicate there is
> +			 * inflight reuse kworker or reuse RCU callback.
> +			 */
> +			atomic_inc(&c->reuse_cb_in_progress);
> +			/* Already queued */
> +			if (!queue_work(bpf_ma_wq, &c->reuse_work))

As Martin pointed out queue_work() is not safe here.
The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
For the next version please drop workers and spin_lock from unit_free/alloc paths.
If lock has to be taken it should be done from irq_work.
Under no circumstances we can use alloc_workqueue(). No new kthreads.

We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:

- alloc_bulk() won't be trying to steal from c->free_by_rcu.

- do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.

- rcu_trace_implies_rcu_gp() is never used.

- after RCU_GP __free_rcu() moves all waiting_for_gp elements into 
  a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
  and does call_rcu_tasks_trace

- Let's call this list ma->free_by_rcu_tasks_trace
  (only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)

- any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
  is protected by ma->spin_lock (1 or NUM_CACHES such locks)

- ma->waiting_for_gp_tasks_trace will be freeing elements into slab

What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
(while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)

After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
Probably usage of bpf_mem_alloc in local storage can be simplified as well.
Martin wdyt?

I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
including needs of qp-trie.
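
A minimal sketch of the data-structure shape this proposal implies; the list
and lock names follow the proposal, while the struct layout and the stealing
helper are assumptions rather than an implementation (assuming the local
__llist_del_first() helper from kernel/bpf/memalloc.c):

/* Sketch only: per-bpf_mem_alloc (not per-CPU) state from the proposal
 * above. With size == 0 there would be NUM_CACHES copies of this.
 */
struct bpf_ma_shared_lists {
	raw_spinlock_t lock;				/* ma->spin_lock */
	struct llist_head free_by_rcu_tasks_trace;	/* reusable after RCU GP */
	struct llist_head waiting_for_gp_tasks_trace;	/* pending free to slab */
};

/* alloc_bulk() stealing up to 'cnt' objects from the shared list; any CPU
 * may call this, hence the spinlock.
 */
static int steal_from_rcu_tasks_trace(struct bpf_ma_shared_lists *s,
				      struct llist_node **objs, int cnt)
{
	unsigned long flags;
	int i = 0;

	raw_spin_lock_irqsave(&s->lock, flags);
	while (i < cnt) {
		struct llist_node *obj;

		obj = __llist_del_first(&s->free_by_rcu_tasks_trace);
		if (!obj)
			break;
		objs[i++] = obj;
	}
	raw_spin_unlock_irqrestore(&s->lock, flags);
	return i;
}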
Martin KaFai Lau May 3, 2023, 9:57 p.m. UTC | #3
On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
> 
> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> Martin wdyt?

If the bpf prog always does a bpf_rcu_read_lock() before accessing the (e.g.) 
task local storage, it can remove the reuse_now conditions in the 
bpf_local_storage and directly call the bpf_mem_cache_free().

The only corner use case is when the bpf_prog or syscall does 
bpf_task_storage_delete() instead of having the task storage stays with the 
whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change of 
this uaf guarantee to the sleepable program but it is still safe because it is 
freed after tasks_trace gp. We could take this chance to align this behavior of 
the local storage map to the other bpf maps.

For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage knows 
it can be freed without waiting tasks_trace gp. However, only task/cgroup 
storages are in bpf ma and I don't believe this optimization matter much for 
them. I would rather focus on the REUSE_AFTER_RCU_GP first.
Alexei Starovoitov May 3, 2023, 11:06 p.m. UTC | #4
On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
> On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
> > What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> > Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> > (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
> > 
> > After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> > Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> > Martin wdyt?
> 
> If the bpf prog always does a bpf_rcu_read_lock() before accessing the
> (e.g.) task local storage, it can remove the reuse_now conditions in the
> bpf_local_storage and directly call the bpf_mem_cache_free().
> 
> The only corner use case is when the bpf_prog or syscall does
> bpf_task_storage_delete() instead of having the task storage stays with the
> whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change
> of this uaf guarantee to the sleepable program but it is still safe because
> it is freed after tasks_trace gp. We could take this chance to align this
> behavior of the local storage map to the other bpf maps.
> 
> For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage
> knows it can be freed without waiting tasks_trace gp. However, only
> task/cgroup storages are in bpf ma and I don't believe this optimization
> matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP first.

I'm confused which REUSE_AFTER_RCU_GP you meant.
What I proposed above is REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace

Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2. BPF_MA_FREE_AFTER_single_RCU_GP

If I'm reading bpf_local_storage correctly it can remove reuse_now logic
in all conditions with REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
What am I missing?
Martin KaFai Lau May 3, 2023, 11:39 p.m. UTC | #5
On 5/3/23 4:06 PM, Alexei Starovoitov wrote:
> On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
>> On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
>>> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
>>> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
>>> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
>>>
>>> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
>>> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
>>> Martin wdyt?
>>
>> If the bpf prog always does a bpf_rcu_read_lock() before accessing the
>> (e.g.) task local storage, it can remove the reuse_now conditions in the
>> bpf_local_storage and directly call the bpf_mem_cache_free().
>>
>> The only corner use case is when the bpf_prog or syscall does
>> bpf_task_storage_delete() instead of having the task storage stays with the
>> whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change
>> of this uaf guarantee to the sleepable program but it is still safe because
>> it is freed after tasks_trace gp. We could take this chance to align this
>> behavior of the local storage map to the other bpf maps.
>>
>> For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage
>> knows it can be freed without waiting tasks_trace gp. However, only
>> task/cgroup storages are in bpf ma and I don't believe this optimization
>> matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP first.
> 
> I'm confused which REUSE_AFTER_RCU_GP you meant.
> What I proposed above is REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace

Regarding REUSE_AFTER_RCU_GP, I meant 
REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.

> 
> Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2. BPF_MA_FREE_AFTER_single_RCU_GP

It probably is where the confusion is. I thought Hou's BPF_MA_REUSE_AFTER_RCU_GP 
is already REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. From the commit 
message:

" ... So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. For
BPF_MA_REUSE_AFTER_GP, the freed objects are reused only after one RCU
grace period and may be returned back to slab system after another
RCU-tasks-trace grace period. ..."

[I assumed BPF_MA_REUSE_AFTER_GP is just a typo of BPF_MA_REUSE_AFTER_"RCU"_GP]

> 
> If I'm reading bpf_local_storage correctly it can remove reuse_now logic
> in all conditions with REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.

Right, for smap->bpf_ma == true (cgroup and task storage), all reuse_now logic 
can be gone and directly use the bpf_mem_cache_free(). Potentially the sk/inode 
can also move to bpf_ma after running some benchmark. This will simplify things 
a lot. For sk storage, the reuse_now was there to avoid the unnecessary 
tasks_trace gp because performance impact was reported on sk storage where 
connections can be open-and-close very frequently.
Hou Tao May 4, 2023, 1:35 a.m. UTC | #6
Hi,

On 5/4/2023 2:48 AM, Alexei Starovoitov wrote:
> On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
>> +
>> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
>> +{
>> +	unsigned long flags;
>> +
>> +	local_irq_save(flags);
>> +	/* In case a NMI-context bpf program is also freeing object. */
>> +	if (local_inc_return(&c->active) == 1) {
>> +		bool try_queue_work = false;
>> +
>> +		/* kworker may remove elements from prepare_reuse_head */
>> +		raw_spin_lock(&c->reuse_lock);
>> +		if (llist_empty(&c->prepare_reuse_head))
>> +			c->prepare_reuse_tail = llnode;
>> +		__llist_add(llnode, &c->prepare_reuse_head);
>> +		if (++c->prepare_reuse_cnt > c->high_watermark) {
>> +			/* Zero out prepare_reuse_cnt early to prevent
>> +			 * unnecessary queue_work().
>> +			 */
>> +			c->prepare_reuse_cnt = 0;
>> +			try_queue_work = true;
>> +		}
>> +		raw_spin_unlock(&c->reuse_lock);
>> +
>> +		if (try_queue_work && !work_pending(&c->reuse_work)) {
>> +			/* Use reuse_cb_in_progress to indicate there is
>> +			 * inflight reuse kworker or reuse RCU callback.
>> +			 */
>> +			atomic_inc(&c->reuse_cb_in_progress);
>> +			/* Already queued */
>> +			if (!queue_work(bpf_ma_wq, &c->reuse_work))
> As Martin pointed out queue_work() is not safe here.
> The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
I see. Didn't recognize these problems.
> For the next version please drop workers and spin_lock from unit_free/alloc paths.
> If lock has to be taken it should be done from irq_work.
> Under no circumstances we can use alloc_workqueue(). No new kthreads.
Is there any reason to prohibit the use of new kthread in irq_work ?
>
> We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
> and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:
>
> - alloc_bulk() won't be trying to steal from c->free_by_rcu.
>
> - do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.
No sure whether or not one inflight RCU callback is enough. Will check.
If one is not enough, I may use kmalloc(__GFP_NOWAIT) in irq work to
allocate multiple RCU callbacks.
> - rcu_trace_implies_rcu_gp() is never used.
>
> - after RCU_GP __free_rcu() moves all waiting_for_gp elements into 
>   a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
>   and does call_rcu_tasks_trace
>
> - Let's call this list ma->free_by_rcu_tasks_trace
>   (only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)
>
> - any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
>   is protected by ma->spin_lock (1 or NUM_CACHES such locks)
To reduce the lock contention, alloc_bulk() can steal from the global
list in batch. Had tried the global list before but I didn't do the
concurrent freeing, I think it could reduce the risk of OOM for
add_del_on_diff_cpu.
>
> - ma->waiting_for_gp_tasks_trace will be freeing elements into slab
>
> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
>
> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> Martin wdyt?
>
> I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
> including needs of qp-trie.
Thanks for these great suggestions. Will try to do it in v4.
Alexei Starovoitov May 4, 2023, 1:42 a.m. UTC | #7
On Wed, May 03, 2023 at 04:39:01PM -0700, Martin KaFai Lau wrote:
> On 5/3/23 4:06 PM, Alexei Starovoitov wrote:
> > On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
> > > On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
> > > > What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> > > > Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> > > > (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
> > > > 
> > > > After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> > > > Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> > > > Martin wdyt?
> > > 
> > > If the bpf prog always does a bpf_rcu_read_lock() before accessing the
> > > (e.g.) task local storage, it can remove the reuse_now conditions in the
> > > bpf_local_storage and directly call the bpf_mem_cache_free().
> > > 
> > > The only corner use case is when the bpf_prog or syscall does
> > > bpf_task_storage_delete() instead of having the task storage stays with the
> > > whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change
> > > of this uaf guarantee to the sleepable program but it is still safe because
> > > it is freed after tasks_trace gp. We could take this chance to align this
> > > behavior of the local storage map to the other bpf maps.
> > > 
> > > For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage
> > > knows it can be freed without waiting tasks_trace gp. However, only
> > > task/cgroup storages are in bpf ma and I don't believe this optimization
> > > matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP first.
> > 
> > I'm confused which REUSE_AFTER_RCU_GP you meant.
> > What I proposed above is REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
> 
> Regarding REUSE_AFTER_RCU_GP, I meant
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
> 
> > 
> > Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2. BPF_MA_FREE_AFTER_single_RCU_GP
> 
> It probably is where the confusion is. I thought Hou's
> BPF_MA_REUSE_AFTER_RCU_GP is already
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. From the commit message:

Sorry. My bad. You're correct.
The difference between my and Hou's #1 is whether rcu_tasks_trace is global or per-cpu.

> 
> " ... So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. For
> BPF_MA_REUSE_AFTER_GP, the freed objects are reused only after one RCU
> grace period and may be returned back to slab system after another
> RCU-tasks-trace grace period. ..."
> 
> [I assumed BPF_MA_REUSE_AFTER_GP is just a typo of BPF_MA_REUSE_AFTER_"RCU"_GP]
> 
> > 
> > If I'm reading bpf_local_storage correctly it can remove reuse_now logic
> > in all conditions with REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
> 
> Right, for smap->bpf_ma == true (cgroup and task storage), all reuse_now
> logic can be gone and directly use the bpf_mem_cache_free(). Potentially the
> sk/inode can also move to bpf_ma after running some benchmark. This will
> simplify things a lot. For sk storage, the reuse_now was there to avoid the
> unnecessary tasks_trace gp because performance impact was reported on sk
> storage where connections can be open-and-close very frequently.
Alexei Starovoitov May 4, 2023, 2 a.m. UTC | #8
On Thu, May 04, 2023 at 09:35:17AM +0800, Hou Tao wrote:
> Hi,
> 
> On 5/4/2023 2:48 AM, Alexei Starovoitov wrote:
> > On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
> >> +
> >> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
> >> +{
> >> +	unsigned long flags;
> >> +
> >> +	local_irq_save(flags);
> >> +	/* In case a NMI-context bpf program is also freeing object. */
> >> +	if (local_inc_return(&c->active) == 1) {
> >> +		bool try_queue_work = false;
> >> +
> >> +		/* kworker may remove elements from prepare_reuse_head */
> >> +		raw_spin_lock(&c->reuse_lock);
> >> +		if (llist_empty(&c->prepare_reuse_head))
> >> +			c->prepare_reuse_tail = llnode;
> >> +		__llist_add(llnode, &c->prepare_reuse_head);
> >> +		if (++c->prepare_reuse_cnt > c->high_watermark) {
> >> +			/* Zero out prepare_reuse_cnt early to prevent
> >> +			 * unnecessary queue_work().
> >> +			 */
> >> +			c->prepare_reuse_cnt = 0;
> >> +			try_queue_work = true;
> >> +		}
> >> +		raw_spin_unlock(&c->reuse_lock);
> >> +
> >> +		if (try_queue_work && !work_pending(&c->reuse_work)) {
> >> +			/* Use reuse_cb_in_progress to indicate there is
> >> +			 * inflight reuse kworker or reuse RCU callback.
> >> +			 */
> >> +			atomic_inc(&c->reuse_cb_in_progress);
> >> +			/* Already queued */
> >> +			if (!queue_work(bpf_ma_wq, &c->reuse_work))
> > As Martin pointed out queue_work() is not safe here.
> > The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
> I see. Didn't recognize these problems.
> > For the next version please drop workers and spin_lock from unit_free/alloc paths.
> > If lock has to be taken it should be done from irq_work.
> > Under no circumstances we can use alloc_workqueue(). No new kthreads.
> Is there any reason to prohibit the use of new kthread in irq_work ?

Because:
1. there is a workable solution without kthreads.
2. if there was no solution we would have to come up with one.
kthread is not an answer. It's hard to reason about a setup when kthreads
are in critical path due to scheduler. Assume the system is 100% cpu loaded.
kthreads delays and behavior is unpredictable. We cannot subject memory alloc/free to it.

> >
> > We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
> > and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:
> >
> > - alloc_bulk() won't be trying to steal from c->free_by_rcu.
> >
> > - do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.
> No sure whether or not one inflight RCU callback is enough. Will check.
> If one is not enough, I may use kmalloc(__GFP_NOWAIT) in irq work to
> allocate multiple RCU callbacks.

Pls dont. Just assume it will work, implement the proposal (if you agree),
come back with the numbers and then we will discuss again.
We cannot keep arguing about merits of complicated patch set that was done on partial data.
Just like the whole thing with kthreads.
I requested early on: "pls no kthreads" and weeks later we're still arguing.

> > - rcu_trace_implies_rcu_gp() is never used.
> >
> > - after RCU_GP __free_rcu() moves all waiting_for_gp elements into 
> >   a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
> >   and does call_rcu_tasks_trace
> >
> > - Let's call this list ma->free_by_rcu_tasks_trace
> >   (only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)
> >
> > - any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
> >   is protected by ma->spin_lock (1 or NUM_CACHES such locks)
> To reduce the lock contention, alloc_bulk() can steal from the global
> list in batch. 

Pls no special batches. The simplest implementation possible.
alloc_bulk() has 'int cnt' argument. It will try to steal 'cnt' from ma->free_by_rcu_tasks_trace.

> Had tried the global list before but I didn't do the
> concurrent freeing, I think it could reduce the risk of OOM for
> add_del_on_diff_cpu.

Maybe you've tried, but we didn't see the patches and we cannot take for granted
anyone saying: "I've tried *foo*. It didn't work. That's why I'm doing *bar* here".
Everything mm is tricky. Little details matter a lot.
It's also questionable whether we should make any design decisions based on this benchmark
and in particular based on add_del_on_diff_cpu part of it.
I'm not saying we shouldn't consider it, but all numbers have a "decision weight"
associated with them.
For example: there is existing samples/bpf/map_perf_test benchmark.
So far we haven't seen the numbers from it.
Is it more important than your new bench? Yes and no. All numbers matter.

> >
> > - ma->waiting_for_gp_tasks_trace will be freeing elements into slab
> >
> > What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> > Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> > (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
> >
> > After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> > Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> > Martin wdyt?
> >
> > I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
> > including needs of qp-trie.
> Thanks for these great suggestions. Will try to do it in v4.

Thanks.
Also for benchmark, pls don't hack htab and benchmark as 'non-landable patches' (as in this series).
Construct the patch series as:
- prep patches
- benchmark
- unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
  with numbers from bench(s) before and after this patch.
Hou Tao May 4, 2023, 2:08 a.m. UTC | #9
Hi,

On 5/4/2023 7:39 AM, Martin KaFai Lau wrote:
> On 5/3/23 4:06 PM, Alexei Starovoitov wrote:
>> On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
>>> On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
SNIP
>>>
>>> If the bpf prog always does a bpf_rcu_read_lock() before accessing the
>>> (e.g.) task local storage, it can remove the reuse_now conditions in
>>> the
>>> bpf_local_storage and directly call the bpf_mem_cache_free().
>>>
>>> The only corner use case is when the bpf_prog or syscall does
>>> bpf_task_storage_delete() instead of having the task storage stays
>>> with the
>>> whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be
>>> a change
>>> of this uaf guarantee to the sleepable program but it is still safe
>>> because
>>> it is freed after tasks_trace gp. We could take this chance to align
>>> this
>>> behavior of the local storage map to the other bpf maps.
>>>
>>> For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local
>>> storage
>>> knows it can be freed without waiting tasks_trace gp. However, only
>>> task/cgroup storages are in bpf ma and I don't believe this
>>> optimization
>>> matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP
>>> first.
OK.
>>
>> I'm confused which REUSE_AFTER_RCU_GP you meant.
>> What I proposed above is
>> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
>
> Regarding REUSE_AFTER_RCU_GP, I meant
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
>
>>
>> Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2.
>> BPF_MA_FREE_AFTER_single_RCU_GP
>
> It probably is where the confusion is. I thought Hou's
> BPF_MA_REUSE_AFTER_RCU_GP is already
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. From the commit
> message:
>
> " ... So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. For
> BPF_MA_REUSE_AFTER_GP, the freed objects are reused only after one RCU
> grace period and may be returned back to slab system after another
> RCU-tasks-trace grace period. ..."
>
> [I assumed BPF_MA_REUSE_AFTER_GP is just a typo of
> BPF_MA_REUSE_AFTER_"RCU"_GP]
Yes. The implementation of BPF_MA_REUSE_AFTER_RCU_GP is already
REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. It moves the freed
objects to the reuse_ready_head list after one RCU GP, splices the
elements from reuse_ready_head onto wait_for_free when reuse_ready_head
is not empty, and frees the elements on wait_for_free via
call_rcu_tasks_trace().
>
>>
>> If I'm reading bpf_local_storage correctly it can remove reuse_now logic
>> in all conditions with
>> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
>
> Right, for smap->bpf_ma == true (cgroup and task storage), all
> reuse_now logic can be gone and directly use the bpf_mem_cache_free().
> Potentially the sk/inode can also move to bpf_ma after running some
> benchmark. This will simplify things a lot. For sk storage, the
> reuse_now was there to avoid the unnecessary tasks_trace gp because
> performance impact was reported on sk storage where connections can be
> open-and-close very frequently.
Hou Tao May 4, 2023, 2:30 a.m. UTC | #10
Hi,

On 5/4/2023 10:00 AM, Alexei Starovoitov wrote:
> On Thu, May 04, 2023 at 09:35:17AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 5/4/2023 2:48 AM, Alexei Starovoitov wrote:
>>> On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
SNIP
>>> +			/* Already queued */
>>> +			if (!queue_work(bpf_ma_wq, &c->reuse_work))
>>> As Martin pointed out queue_work() is not safe here.
>>> The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
>> I see. Didn't recognize these problems.
>>> For the next version please drop workers and spin_lock from unit_free/alloc paths.
>>> If lock has to be taken it should be done from irq_work.
>>> Under no circumstances we can use alloc_workqueue(). No new kthreads.
>> Is there any reason to prohibit the use of new kthread in irq_work ?
> Because:
> 1. there is a workable solution without kthreads.
> 2. if there was no solution we would have to come up with one.
> kthread is not an answer. It's hard to reason about a setup when kthreads
> are in critical path due to scheduler. Assume the system is 100% cpu loaded.
> kthreads delays and behavior is unpredictable. We cannot subject memory alloc/free to it.
I see. Thanks for the explanation.
>
>>> We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
>>> and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:
>>>
>>> - alloc_bulk() won't be trying to steal from c->free_by_rcu.
>>>
>>> - do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.
>> No sure whether or not one inflight RCU callback is enough. Will check.
>> If one is not enough, I may use kmalloc(__GFP_NOWAIT) in irq work to
>> allocate multiple RCU callbacks.
> Pls dont. Just assume it will work, implement the proposal (if you agree),
> come back with the numbers and then we will discuss again.
> We cannot keep arguing about merits of complicated patch set that was done on partial data.
OK. Will do.
> Just like the whole thing with kthreads.
> I requested early on: "pls no kthreads" and weeks later we're still arguing.
Sorry about missing that part.
>
>>> - rcu_trace_implies_rcu_gp() is never used.
>>>
>>> - after RCU_GP __free_rcu() moves all waiting_for_gp elements into 
>>>   a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
>>>   and does call_rcu_tasks_trace
>>>
>>> - Let's call this list ma->free_by_rcu_tasks_trace
>>>   (only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)
>>>
>>> - any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
>>>   is protected by ma->spin_lock (1 or NUM_CACHES such locks)
>> To reduce the lock contention, alloc_bulk() can steal from the global
>> list in batch. 
> Pls no special batches. The simplest implementation possible.
> alloc_bulk() has 'int cnt' argument. It will try to steal 'cnt' from ma->free_by_rcu_tasks_trace.
I see. Will do.
>
>> Had tried the global list before but I didn't do the
>> concurrent freeing, I think it could reduce the risk of OOM for
>> add_del_on_diff_cpu.
> Maybe you've tried, but we didn't see the patches and we cannot take for granted
> anyone saying: "I've tried *foo*. It didn't work. That's why I'm doing *bar* here".
> Everything mm is tricky. Little details matter a lot.
OK. I think it will work. The reason I didn't post it is that I was
obsessed with lock-less bpf ma at that moment.
> It's also questionable whether we should make any design decisions based on this benchmark
> and in particular based on add_del_on_diff_cpu part of it.
> I'm not saying we shouldn't consider it, but all numbers have a "decision weight"
> associated with them.
I see. The reason for add_del_on_diff_cpu is just to complement the
possible use cases of bpf memory allocator.
> For example: there is existing samples/bpf/map_perf_test benchmark.
> So far we haven't seen the numbers from it.
> Is it more important than your new bench? Yes and no. All numbers matter.
Will post the benchmark result for map_perf_test in v4. Had planned to
migrate map_perf_test to selftests/bpf/benchs, but couldn't find enough
time to do that.
>
>>> - ma->waiting_for_gp_tasks_trace will be freeing elements into slab
>>>
>>> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
>>> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
>>> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
>>>
>>> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
>>> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
>>> Martin wdyt?
>>>
>>> I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
>>> including needs of qp-trie.
>> Thanks for these great suggestions. Will try to do it in v4.
> Thanks.
> Also for benchmark, pls don't hack htab and benchmark as 'non-landable patches' (as in this series).
> Construct the patch series as:
> - prep patches
> - benchmark
> - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
>   with numbers from bench(s) before and after this patch.
Thanks again for the suggestion. Will do in v4.
Alexei Starovoitov June 1, 2023, 5:36 p.m. UTC | #11
On Wed, May 3, 2023 at 7:30 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> > Construct the patch series as:
> > - prep patches
> > - benchmark
> > - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
> >   with numbers from bench(s) before and after this patch.
> Thanks again for the suggestion. Will do in v4.


It's been a month. Any update?

Should we take over this work if you're busy?
Hou Tao June 2, 2023, 2:39 a.m. UTC | #12
Hi,

On 6/2/2023 1:36 AM, Alexei Starovoitov wrote:
> On Wed, May 3, 2023 at 7:30 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>> Construct the patch series as:
>>> - prep patches
>>> - benchmark
>>> - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
>>>   with numbers from bench(s) before and after this patch.
>> Thanks again for the suggestion. Will do in v4.
>
> It's been a month. Any update?
>
> Should we take over this work if you're busy?
Sorry for the delay. I should have posted some progress information about
the patch set earlier. The patch set is simpler compared with v3; I
implemented v4 about two weeks ago. The problem is that v4 doesn't work as
expected: its memory usage is huge compared with v3. The following is the
output from the htab-mem benchmark:

overwrite:
Summary: loop   11.07 ±    1.25k/s, memory usage  995.08 ±  680.87MiB,
peak memory usage 2183.38MiB
batch_add_batch_del:
Summary: loop   11.48 ±    1.24k/s, memory usage 1393.36 ±  780.41MiB,
peak memory usage 2836.68MiB
add_del_on_diff_cpu:
Summary: loop    6.07 ±    0.69k/s, memory usage   14.44 ±    2.34MiB,
peak memory usage   20.30MiB

The direct reason for the huge memory usage is a slower RCU grace period.
The RCU grace period used for reuse is much longer compared with v3; it is
about 100ms or more (e.g. 2.6s). I am still trying to find out the root
cause of the slow RCU grace period. The first guess is that the running
time of the bpf program attached to getpgid() is longer, so the context
switch in the bench is slowed down. The histogram of getpgid() latency in
v4 indeed shows a lot of abnormal tail latencies compared with v3, as
shown below.

v3 getpgid() latency during overwrite benchmark:
@hist_ms:
[0]               193451 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1]                  767 |                                                    |
[2, 4)                75 |                                                    |
[4, 8)                 1 |                                                    |

v4 getpgid() latency during overwrite benchmark:
@hist_ms:
[0]                86270 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1]                31252 |@@@@@@@@@@@@@@@@@@                                  |
[2, 4)                 1 |                                                    |
[4, 8)                 0 |                                                    |
[8, 16)                0 |                                                    |
[16, 32)               0 |                                                    |
[32, 64)               0 |                                                    |
[64, 128)              0 |                                                    |
[128, 256)             3 |                                                    |
[256, 512)             2 |                                                    |
[512, 1K)              1 |                                                    |
[1K, 2K)               2 |                                                    |
[2K, 4K)               1 |                                                    |

I think the newly-added global spin-lock in the memory allocator and the
irq-work running in the context of the free procedure may lead to the
abnormal tail latency, and I am trying to demonstrate that by using
fine-grained locks and a kworker (just temporarily). On the other hand,
considering that the number of abnormal tail latencies is much smaller
than the total number of getpgid() syscalls, I think there may still be
other causes for the slow RCU GP.

Since the progress of v4 is delayed, how about I post v4 as soon as
possible for discussion (maybe I did it wrong) and at the same time
continue to investigate the slow RCU grace period problem (I will try to
get some help from the RCU community)?

Regards,
Tao
Alexei Starovoitov June 2, 2023, 4:25 p.m. UTC | #13
On Thu, Jun 1, 2023 at 7:40 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 6/2/2023 1:36 AM, Alexei Starovoitov wrote:
> > On Wed, May 3, 2023 at 7:30 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >>> Construct the patch series as:
> >>> - prep patches
> >>> - benchmark
> >>> - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
> >>>   with numbers from bench(s) before and after this patch.
> >> Thanks again for the suggestion. Will do in v4.
> >
> > It's been a month. Any update?
> >
> > Should we take over this work if you're busy?
> Sorry for the delay. I should have posted some progress information about
> the patch set earlier. The patch set is simpler compared with v3; I
> implemented v4 about two weeks ago. The problem is that v4 doesn't work as
> expected: its memory usage is huge compared with v3. The following is the
> output from the htab-mem benchmark:
>
> overwrite:
> Summary: loop   11.07 ±    1.25k/s, memory usage  995.08 ±  680.87MiB,
> peak memory usage 2183.38MiB
> batch_add_batch_del:
> Summary: loop   11.48 ±    1.24k/s, memory usage 1393.36 ±  780.41MiB,
> peak memory usage 2836.68MiB
> add_del_on_diff_cpu:
> Summary: loop    6.07 ±    0.69k/s, memory usage   14.44 ±    2.34MiB,
> peak memory usage   20.30MiB
>
> The direct reason for the huge memory usage is a slower RCU grace period.
> The RCU grace period used for reuse is much longer compared with v3; it is
> about 100ms or more (e.g. 2.6s). I am still trying to find out the root
> cause of the slow RCU grace period. The first guess is that the running
> time of the bpf program attached to getpgid() is longer, so the context
> switch in the bench is slowed down. The histogram of getpgid() latency in
> v4 indeed shows a lot of abnormal tail latencies compared with v3, as
> shown below.
>
> v3 getpgid() latency during overwrite benchmark:
> @hist_ms:
> [0]               193451 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [1]                  767 |                                                    |
> [2, 4)                75 |                                                    |
> [4, 8)                 1 |                                                    |
>
> v4 getpgid() latency during overwrite benchmark:
> @hist_ms:
> [0]                86270 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [1]                31252 |@@@@@@@@@@@@@@@@@@                                  |
> [2, 4)                 1 |                                                    |
> [4, 8)                 0 |                                                    |
> [8, 16)                0 |                                                    |
> [16, 32)               0 |                                                    |
> [32, 64)               0 |                                                    |
> [64, 128)              0 |                                                    |
> [128, 256)             3 |                                                    |
> [256, 512)             2 |                                                    |
> [512, 1K)              1 |                                                    |
> [1K, 2K)               2 |                                                    |
> [2K, 4K)               1 |                                                    |
>
> I think the newly-added global spin-lock in the memory allocator and the
> irq-work running in the context of the free procedure may lead to the
> abnormal tail latency, and I am trying to demonstrate that by using
> fine-grained locks and a kworker (just temporarily). On the other hand,
> considering that the number of abnormal tail latencies is much smaller
> than the total number of getpgid() syscalls, I think there may still be
> other causes for the slow RCU GP.
>
> Since the progress of v4 is delayed, how about I post v4 as soon as
> possible for discussion (maybe I did it wrong) and at the same time
> continue to investigate the slow RCU grace period problem (I will try to
> get some help from the RCU community)?

Yes. Please send v4. Let's investigate huge memory consumption together.

Patch

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 148347950e16..e7f68432713b 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -18,6 +18,7 @@  struct bpf_mem_alloc {
 /* flags for bpf_mem_alloc_init() */
 enum {
 	BPF_MA_PERCPU = 1U << 0,
+	BPF_MA_REUSE_AFTER_RCU_GP = 1U << 1,
 };
 
 /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 072102476019..262100f89610 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -63,6 +63,10 @@  static u8 size_index[24] __ro_after_init = {
 	2	/* 192 */
 };
 
+static struct workqueue_struct *bpf_ma_wq;
+
+static void bpf_ma_prepare_reuse_work(struct work_struct *work);
+
 static int bpf_mem_cache_idx(size_t size)
 {
 	if (!size || size > 4096)
@@ -98,18 +102,36 @@  struct bpf_mem_cache {
 	int free_cnt;
 	int low_watermark, high_watermark, batch;
 	int percpu_size;
+	int cpu;
 	unsigned int flags;
 
+	raw_spinlock_t reuse_lock;
+	bool abort_reuse;
+	struct llist_head reuse_ready_head;
+	struct llist_node *reuse_ready_tail;
+	struct llist_head wait_for_free;
+	struct llist_head prepare_reuse_head;
+	struct llist_node *prepare_reuse_tail;
+	unsigned int prepare_reuse_cnt;
+	atomic_t reuse_cb_in_progress;
+	struct work_struct reuse_work;
+
 	struct rcu_head rcu;
 	struct llist_head free_by_rcu;
 	struct llist_head waiting_for_gp;
-	atomic_t call_rcu_in_progress;
+	atomic_t free_cb_in_progress;
 };
 
 struct bpf_mem_caches {
 	struct bpf_mem_cache cache[NUM_CACHES];
 };
 
+struct bpf_reuse_batch {
+	struct bpf_mem_cache *c;
+	struct llist_node *head, *tail;
+	struct rcu_head rcu;
+};
+
 static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 {
 	struct llist_node *entry, *next;
@@ -154,6 +176,45 @@  static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
 #endif
 }
 
+static void *bpf_ma_get_reusable_obj(struct bpf_mem_cache *c)
+{
+	if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP) {
+		unsigned long flags;
+		void *obj;
+
+		if (llist_empty(&c->reuse_ready_head) && llist_empty(&c->wait_for_free))
+			return NULL;
+
+		/* reuse_ready_head and wait_for_free may be manipulated by
+		 * kworker and RCU callbacks.
+		 */
+		raw_spin_lock_irqsave(&c->reuse_lock, flags);
+		obj = __llist_del_first(&c->reuse_ready_head);
+		if (obj) {
+			if (llist_empty(&c->reuse_ready_head))
+				c->reuse_ready_tail = NULL;
+		} else {
+			obj = __llist_del_first(&c->wait_for_free);
+		}
+		raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+		return obj;
+	}
+
+	/*
+	 * free_by_rcu is only manipulated by irq work refill_work().
+	 * IRQ works on the same CPU are called sequentially, so it is
+	 * safe to use __llist_del_first() here. If alloc_bulk() is
+	 * invoked by the initial prefill, there will be no running
+	 * refill_work(), so __llist_del_first() is fine as well.
+	 *
+	 * In most cases, objects on free_by_rcu are from the same CPU.
+	 * If some objects come from other CPUs, it doesn't incur any
+	 * harm because NUMA_NO_NODE means the preference for current
+	 * numa node and it is not a guarantee.
+	 */
+	return __llist_del_first(&c->free_by_rcu);
+}
+
 /* Mostly runs from irq_work except __init phase. */
 static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 {
@@ -165,19 +226,7 @@  static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 	memcg = get_memcg(c);
 	old_memcg = set_active_memcg(memcg);
 	for (i = 0; i < cnt; i++) {
-		/*
-		 * free_by_rcu is only manipulated by irq work refill_work().
-		 * IRQ works on the same CPU are called sequentially, so it is
-		 * safe to use __llist_del_first() here. If alloc_bulk() is
-		 * invoked by the initial prefill, there will be no running
-		 * refill_work(), so __llist_del_first() is fine as well.
-		 *
-		 * In most cases, objects on free_by_rcu are from the same CPU.
-		 * If some objects come from other CPUs, it doesn't incur any
-		 * harm because NUMA_NO_NODE means the preference for current
-		 * numa node and it is not a guarantee.
-		 */
-		obj = __llist_del_first(&c->free_by_rcu);
+		obj = bpf_ma_get_reusable_obj(c);
 		if (!obj) {
 			/* Allocate, but don't deplete atomic reserves that typical
 			 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
@@ -236,7 +285,7 @@  static void __free_rcu(struct rcu_head *head)
 	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
 
 	free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
-	atomic_set(&c->call_rcu_in_progress, 0);
+	atomic_set(&c->free_cb_in_progress, 0);
 }
 
 static void __free_rcu_tasks_trace(struct rcu_head *head)
@@ -264,7 +313,7 @@  static void do_call_rcu(struct bpf_mem_cache *c)
 {
 	struct llist_node *llnode, *t;
 
-	if (atomic_xchg(&c->call_rcu_in_progress, 1))
+	if (atomic_xchg(&c->free_cb_in_progress, 1))
 		return;
 
 	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
@@ -409,6 +458,8 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags)
 			c->objcg = objcg;
 			c->percpu_size = percpu_size;
 			c->flags = flags;
+			c->cpu = cpu;
+			INIT_WORK(&c->reuse_work, bpf_ma_prepare_reuse_work);
 			prefill_mem_cache(c, cpu);
 		}
 		ma->cache = pc;
@@ -433,6 +484,8 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags)
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
 			c->flags = flags;
+			c->cpu = cpu;
+			INIT_WORK(&c->reuse_work, bpf_ma_prepare_reuse_work);
 			prefill_mem_cache(c, cpu);
 		}
 	}
@@ -444,18 +497,40 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags)
 static void drain_mem_cache(struct bpf_mem_cache *c)
 {
 	bool percpu = !!c->percpu_size;
+	struct llist_node *head[3];
+	unsigned long flags;
 
 	/* No progs are using this bpf_mem_cache, but htab_map_free() called
 	 * bpf_mem_cache_free() for all remaining elements and they can be in
 	 * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
 	 *
-	 * Except for waiting_for_gp list, there are no concurrent operations
-	 * on these lists, so it is safe to use __llist_del_all().
+	 * Except for waiting_for_gp and free_llist_extra list, there are no
+	 * concurrent operations on these lists, so it is safe to use
+	 * __llist_del_all().
 	 */
 	free_all(__llist_del_all(&c->free_by_rcu), percpu);
 	free_all(llist_del_all(&c->waiting_for_gp), percpu);
 	free_all(__llist_del_all(&c->free_llist), percpu);
-	free_all(__llist_del_all(&c->free_llist_extra), percpu);
+	free_all(llist_del_all(&c->free_llist_extra), percpu);
+
+	if (!(c->flags & BPF_MA_REUSE_AFTER_RCU_GP))
+		return;
+
+	raw_spin_lock_irqsave(&c->reuse_lock, flags);
+	/* Indicate kworker and RCU callback to free elements directly
+	 * instead of adding new elements into these lists.
+	 */
+	c->abort_reuse = true;
+	head[0] = __llist_del_all(&c->prepare_reuse_head);
+	c->prepare_reuse_tail = NULL;
+	head[1] = __llist_del_all(&c->reuse_ready_head);
+	c->reuse_ready_tail = NULL;
+	head[2] = __llist_del_all(&c->wait_for_free);
+	raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+
+	free_all(head[0], percpu);
+	free_all(head[1], percpu);
+	free_all(head[2], percpu);
 }
 
 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -466,10 +541,39 @@  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 	ma->caches = NULL;
 }
 
+static void bpf_ma_cancel_reuse_work(struct bpf_mem_alloc *ma)
+{
+	struct bpf_mem_caches *cc;
+	struct bpf_mem_cache *c;
+	int cpu, i;
+
+	if (ma->cache) {
+		for_each_possible_cpu(cpu) {
+			c = per_cpu_ptr(ma->cache, cpu);
+			cancel_work_sync(&c->reuse_work);
+		}
+	}
+	if (ma->caches) {
+		for_each_possible_cpu(cpu) {
+			cc = per_cpu_ptr(ma->caches, cpu);
+			for (i = 0; i < NUM_CACHES; i++) {
+				c = &cc->cache[i];
+				cancel_work_sync(&c->reuse_work);
+			}
+		}
+	}
+}
+
 static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
-	/* waiting_for_gp lists was drained, but __free_rcu might
-	 * still execute. Wait for it now before we freeing percpu caches.
+	bool reuse_after_rcu_gp = ma->flags & BPF_MA_REUSE_AFTER_RCU_GP;
+
+	/* Cancel the inflight kworkers */
+	if (reuse_after_rcu_gp)
+		bpf_ma_cancel_reuse_work(ma);
+
+	/* For normal bpf ma, waiting_for_gp lists was drained, but __free_rcu
+	 * might still execute. Wait for it now before we freeing percpu caches.
 	 *
 	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
 	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -477,9 +581,13 @@  static void free_mem_alloc(struct bpf_mem_alloc *ma)
 	 * so if call_rcu(head, __free_rcu) is skipped due to
 	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
 	 * using rcu_trace_implies_rcu_gp() as well.
+	 *
+	 * For reuse-after-rcu-gp bpf ma, use rcu_barrier_tasks_trace() to
+	 * wait for the pending bpf_ma_free_reusable_cb() and use rcu_barrier()
+	 * to wait for the pending bpf_ma_reuse_cb().
 	 */
 	rcu_barrier_tasks_trace();
-	if (!rcu_trace_implies_rcu_gp())
+	if (reuse_after_rcu_gp || !rcu_trace_implies_rcu_gp())
 		rcu_barrier();
 	free_mem_alloc_no_barrier(ma);
 }
@@ -512,6 +620,7 @@  static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
 	}
 
 	/* Defer barriers into worker to let the rest of map memory to be freed */
+	copy->flags = ma->flags;
 	copy->cache = ma->cache;
 	ma->cache = NULL;
 	copy->caches = ma->caches;
@@ -541,7 +650,9 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			 */
 			irq_work_sync(&c->refill_work);
 			drain_mem_cache(c);
-			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+			rcu_in_progress += atomic_read(&c->free_cb_in_progress);
+			/* Pending kworkers or RCU callbacks */
+			rcu_in_progress += atomic_read(&c->reuse_cb_in_progress);
 		}
 		/* objcg is the same across cpus */
 		if (c->objcg)
@@ -556,7 +667,8 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 				c = &cc->cache[i];
 				irq_work_sync(&c->refill_work);
 				drain_mem_cache(c);
-				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+				rcu_in_progress += atomic_read(&c->free_cb_in_progress);
+				rcu_in_progress += atomic_read(&c->reuse_cb_in_progress);
 			}
 		}
 		if (c->objcg)
@@ -600,18 +712,183 @@  static void notrace *unit_alloc(struct bpf_mem_cache *c)
 	return llnode;
 }
 
+static void bpf_ma_add_to_reuse_ready_or_free(struct bpf_mem_cache *c, struct llist_node *head,
+					      struct llist_node *tail)
+{
+	unsigned long flags;
+	bool abort;
+
+	raw_spin_lock_irqsave(&c->reuse_lock, flags);
+	abort = c->abort_reuse;
+	if (!abort) {
+		if (llist_empty(&c->reuse_ready_head))
+			c->reuse_ready_tail = tail;
+		__llist_add_batch(head, tail, &c->reuse_ready_head);
+	}
+	raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+
+	/* Don't move these objects to reuse_ready list and free
+	 * these objects directly.
+	 */
+	if (abort)
+		free_all(head, !!c->percpu_size);
+}
+
+static void bpf_ma_reuse_cb(struct rcu_head *rcu)
+{
+	struct bpf_reuse_batch *batch = container_of(rcu, struct bpf_reuse_batch, rcu);
+	struct bpf_mem_cache *c = batch->c;
+
+	bpf_ma_add_to_reuse_ready_or_free(c, batch->head, batch->tail);
+	atomic_dec(&c->reuse_cb_in_progress);
+	kfree(batch);
+}
+
+static bool bpf_ma_try_free_reuse_objs(struct bpf_mem_cache *c)
+{
+	struct llist_node *head, *tail;
+	bool do_free;
+
+	if (llist_empty(&c->reuse_ready_head))
+		return false;
+
+	do_free = !atomic_xchg(&c->free_cb_in_progress, 1);
+	if (!do_free)
+		return false;
+
+	head = __llist_del_all(&c->reuse_ready_head);
+	tail = c->reuse_ready_tail;
+	c->reuse_ready_tail = NULL;
+
+	__llist_add_batch(head, tail, &c->wait_for_free);
+
+	return true;
+}
+
+static void bpf_ma_free_reusable_cb(struct rcu_head *rcu)
+{
+	struct bpf_mem_cache *c = container_of(rcu, struct bpf_mem_cache, rcu);
+	struct llist_node *head;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&c->reuse_lock, flags);
+	head = __llist_del_all(&c->wait_for_free);
+	raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+
+	free_all(head, !!c->percpu_size);
+	atomic_set(&c->free_cb_in_progress, 0);
+}
+
+static void bpf_ma_prepare_reuse_work(struct work_struct *work)
+{
+	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, reuse_work);
+	struct llist_node *head, *tail, *llnode, *tmp;
+	struct bpf_reuse_batch *batch;
+	unsigned long flags;
+	bool do_free;
+
+	local_irq_save(flags);
+	/* When CPU is offline, the running CPU may be different from
+	 * the CPU which submitted the work. When these two CPUs are the same,
+	 * kworker may be interrupted by NMI, so increase active to protect
+	 * against such concurrency.
+	 */
+	if (c->cpu == smp_processor_id())
+		WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+	raw_spin_lock(&c->reuse_lock);
+	head = __llist_del_all(&c->prepare_reuse_head);
+	tail = c->prepare_reuse_tail;
+	c->prepare_reuse_tail = NULL;
+	c->prepare_reuse_cnt = 0;
+	if (c->cpu == smp_processor_id())
+		local_dec(&c->active);
+
+	/* Try to free elements in reusable list. Before these elements are
+	 * freed in RCU cb, these elements will still be available for reuse.
+	 */
+	do_free = bpf_ma_try_free_reuse_objs(c);
+	raw_spin_unlock(&c->reuse_lock);
+	local_irq_restore(flags);
+
+	if (do_free)
+		call_rcu_tasks_trace(&c->rcu, bpf_ma_free_reusable_cb);
+
+	llist_for_each_safe(llnode, tmp, llist_del_all(&c->free_llist_extra)) {
+		if (!head)
+			tail = llnode;
+		llnode->next = head;
+		head = llnode->next;
+	}
+	/* Draining is in progress ? */
+	if (!head) {
+		/* kworker completes and no RCU callback */
+		atomic_dec(&c->reuse_cb_in_progress);
+		return;
+	}
+
+	batch = kmalloc(sizeof(*batch), GFP_KERNEL);
+	if (!batch) {
+		synchronize_rcu_expedited();
+		bpf_ma_add_to_reuse_ready_or_free(c, head, tail);
+		/* kworker completes and no RCU callback */
+		atomic_dec(&c->reuse_cb_in_progress);
+		return;
+	}
+
+	batch->c = c;
+	batch->head = head;
+	batch->tail = tail;
+	call_rcu(&batch->rcu, bpf_ma_reuse_cb);
+}
+
+static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	/* In case a NMI-context bpf program is also freeing object. */
+	if (local_inc_return(&c->active) == 1) {
+		bool try_queue_work = false;
+
+		/* kworker may remove elements from prepare_reuse_head */
+		raw_spin_lock(&c->reuse_lock);
+		if (llist_empty(&c->prepare_reuse_head))
+			c->prepare_reuse_tail = llnode;
+		__llist_add(llnode, &c->prepare_reuse_head);
+		if (++c->prepare_reuse_cnt > c->high_watermark) {
+			/* Zero out prepare_reuse_cnt early to prevent
+			 * unnecessary queue_work().
+			 */
+			c->prepare_reuse_cnt = 0;
+			try_queue_work = true;
+		}
+		raw_spin_unlock(&c->reuse_lock);
+
+		if (try_queue_work && !work_pending(&c->reuse_work)) {
+			/* Use reuse_cb_in_progress to indicate there is
+			 * inflight reuse kworker or reuse RCU callback.
+			 */
+			atomic_inc(&c->reuse_cb_in_progress);
+			/* Already queued */
+			if (!queue_work(bpf_ma_wq, &c->reuse_work))
+				atomic_dec(&c->reuse_cb_in_progress);
+		}
+	} else {
+		llist_add(llnode, &c->free_llist_extra);
+	}
+	local_dec(&c->active);
+	local_irq_restore(flags);
+}
+
 /* Though 'ptr' object could have been allocated on a different cpu
  * add it to the free_llist of the current cpu.
  * Let kfree() logic deal with it when it's later called from irq_work.
  */
-static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
 {
-	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
 	unsigned long flags;
 	int cnt = 0;
 
-	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
-
 	local_irq_save(flags);
 	if (local_inc_return(&c->active) == 1) {
 		__llist_add(llnode, &c->free_llist);
@@ -633,6 +910,18 @@  static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		irq_work_raise(c);
 }
 
+static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+
+	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+	if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP)
+		wait_gp_reuse_free(c, llnode);
+	else
+		immediate_reuse_free(c, llnode);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
@@ -724,3 +1013,11 @@  void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
+
+static int __init bpf_ma_init(void)
+{
+	bpf_ma_wq = alloc_workqueue("bpf_ma", WQ_MEM_RECLAIM, 0);
+	BUG_ON(!bpf_ma_wq);
+	return 0;
+}
+late_initcall(bpf_ma_init);