@@ -4665,6 +4665,18 @@
allocator. This parameter is primarily for debugging
and performance comparison.
+ percpu_refcount.max_scan_count= [KNL]
+ Specifies the maximum number of percpu ref nodes
+ processed in one run of the percpu ref manager thread.
+
+ Default: 100
+
+ percpu_refcount.scan_interval= [KNL]
+ Specifies the interval (in ms) between two runs of the
+ manager thread.
+
+ Default: 5000 ms
+
pirq= [SMP,APIC] Manual mp-table setup
See Documentation/arch/x86/i386/IO-APIC.rst.
@@ -68,6 +68,11 @@ enum {
__PERCPU_REF_FLAG_BITS = 2,
};
+/* Auxiliary flags */
+enum {
+ __PERCPU_REL_MANAGED = 1LU << 0, /* operating in managed mode */
+};
+
/* @flags for percpu_ref_init() */
enum {
/*
@@ -90,6 +95,10 @@ enum {
* Allow switching from atomic mode to percpu mode.
*/
PERCPU_REF_ALLOW_REINIT = 1 << 2,
+ /*
+ * Manage release of the percpu ref.
+ */
+ PERCPU_REF_REL_MANAGED = 1 << 3,
};
struct percpu_ref_data {
@@ -100,6 +109,9 @@ struct percpu_ref_data {
bool allow_reinit:1;
struct rcu_head rcu;
struct percpu_ref *ref;
+ unsigned int aux_flags;
+ struct llist_node node;
+
};
struct percpu_ref {
@@ -126,6 +138,7 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
+int percpu_ref_switch_to_managed(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
@@ -5,6 +5,9 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/llist.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
#include <linux/mm.h>
#include <linux/percpu-refcount.h>
@@ -38,6 +41,7 @@
static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
+static LLIST_HEAD(percpu_ref_manage_head);
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
{
@@ -45,6 +49,8 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
(ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD);
}
+int percpu_ref_switch_to_managed(struct percpu_ref *ref);
+
/**
* percpu_ref_init - initialize a percpu refcount
* @ref: percpu_ref to initialize
@@ -80,6 +86,9 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
return -ENOMEM;
}
+ if (flags & PERCPU_REF_REL_MANAGED)
+ flags |= PERCPU_REF_ALLOW_REINIT;
+
data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT;
@@ -101,10 +110,73 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
data->confirm_switch = NULL;
data->ref = ref;
ref->data = data;
+ init_llist_node(&data->node);
+
+ if (flags & PERCPU_REF_REL_MANAGED)
+ percpu_ref_switch_to_managed(ref);
+
return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);
+static bool percpu_ref_is_managed(struct percpu_ref *ref)
+{
+ return (ref->data->aux_flags & __PERCPU_REL_MANAGED) != 0;
+}
+
+static void __percpu_ref_switch_mode(struct percpu_ref *ref,
+ percpu_ref_func_t *confirm_switch);
+
+static int __percpu_ref_switch_to_managed(struct percpu_ref *ref)
+{
+ unsigned long __percpu *percpu_count;
+ struct percpu_ref_data *data;
+ int ret = -1;
+
+ data = ref->data;
+
+ if (WARN_ONCE(!percpu_ref_tryget(ref), "Percpu ref is not active"))
+ return ret;
+
+ if (WARN_ONCE(!data->allow_reinit, "Percpu ref does not allow switch"))
+ goto err_switch_managed;
+
+ if (WARN_ONCE(percpu_ref_is_managed(ref), "Percpu ref is already managed"))
+ goto err_switch_managed;
+
+ data->aux_flags |= __PERCPU_REL_MANAGED;
+ data->force_atomic = false;
+ if (!__ref_is_percpu(ref, &percpu_count))
+ __percpu_ref_switch_mode(ref, NULL);
+ /* Ensure ordering of percpu mode switch and node scan */
+ smp_mb();
+ llist_add(&data->node, &percpu_ref_manage_head);
+
+ return 0;
+
+err_switch_managed:
+ percpu_ref_put(ref);
+ return ret;
+}
+
+/**
+ * percpu_ref_switch_to_managed - Switch an unmanaged ref to managed mode.
+ *
+ * @ref: percpu_ref to switch to managed mode
+ *
+ */
+int percpu_ref_switch_to_managed(struct percpu_ref *ref)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+ ret = __percpu_ref_switch_to_managed(ref);
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_managed);
+
static void __percpu_ref_exit(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
@@ -283,6 +355,27 @@ static void __percpu_ref_switch_mode(struct percpu_ref *ref,
__percpu_ref_switch_to_percpu(ref);
}
+static bool __percpu_ref_switch_to_atomic_checked(struct percpu_ref *ref,
+ percpu_ref_func_t *confirm_switch,
+ bool check_managed)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+ if (check_managed && WARN_ONCE(percpu_ref_is_managed(ref),
+ "Percpu ref is managed, cannot switch to atomic mode")) {
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+ return false;
+ }
+
+ ref->data->force_atomic = true;
+ __percpu_ref_switch_mode(ref, confirm_switch);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+
+ return true;
+}
+
/**
* percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
* @ref: percpu_ref to switch to atomic mode
@@ -306,17 +399,16 @@ static void __percpu_ref_switch_mode(struct percpu_ref *ref,
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
- unsigned long flags;
-
- spin_lock_irqsave(&percpu_ref_switch_lock, flags);
-
- ref->data->force_atomic = true;
- __percpu_ref_switch_mode(ref, confirm_switch);
-
- spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+ (void)__percpu_ref_switch_to_atomic_checked(ref, confirm_switch, true);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
+static void __percpu_ref_switch_to_atomic_sync_checked(struct percpu_ref *ref, bool check_managed)
+{
+ if (!__percpu_ref_switch_to_atomic_checked(ref, NULL, check_managed))
+ return;
+ wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
+}
/**
* percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
* @ref: percpu_ref to switch to atomic mode
@@ -327,11 +419,28 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
*/
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
{
- percpu_ref_switch_to_atomic(ref, NULL);
- wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
+ __percpu_ref_switch_to_atomic_sync_checked(ref, true);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
+static void __percpu_ref_switch_to_percpu_checked(struct percpu_ref *ref, bool check_managed)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
+ if (check_managed && WARN_ONCE(percpu_ref_is_managed(ref),
+ "Percpu ref is managed, cannot switch to percpu mode")) {
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+ return;
+ }
+
+ ref->data->force_atomic = false;
+ __percpu_ref_switch_mode(ref, NULL);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+}
+
/**
* percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
* @ref: percpu_ref to switch to percpu mode
@@ -352,14 +461,7 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
*/
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
- unsigned long flags;
-
- spin_lock_irqsave(&percpu_ref_switch_lock, flags);
-
- ref->data->force_atomic = false;
- __percpu_ref_switch_mode(ref, NULL);
-
- spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+ __percpu_ref_switch_to_percpu_checked(ref, true);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
@@ -472,8 +574,226 @@ void percpu_ref_resurrect(struct percpu_ref *ref)
ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
percpu_ref_get(ref);
- __percpu_ref_switch_mode(ref, NULL);
+ if (percpu_ref_is_managed(ref)) {
+ ref->data->aux_flags &= ~__PERCPU_REL_MANAGED;
+ __percpu_ref_switch_to_managed(ref);
+ } else {
+ __percpu_ref_switch_mode(ref, NULL);
+ }
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
+
+#define DEFAULT_SCAN_INTERVAL_MS 5000
+/* Interval (in ms) between two ref scans. */
+static ulong scan_interval = DEFAULT_SCAN_INTERVAL_MS;
+module_param(scan_interval, ulong, 0444);
+
+#define DEFAULT_MAX_SCAN_COUNT 100
+/* Number of percpu refs scanned in one iteration of worker execution. */
+static int max_scan_count = DEFAULT_MAX_SCAN_COUNT;
+module_param(max_scan_count, int, 0444);
+
+static void percpu_ref_release_work_fn(struct work_struct *work);
+
+/*
+ * Sentinel llist nodes for lockless list traversal and deletions by
+ * the percpu ref release worker, while nodes are added from
+ * percpu_ref_init() and percpu_ref_switch_to_managed().
+ *
+ * A sentinel node marks the start of list traversal for the current
+ * iteration of kworker execution.
+ */
+struct percpu_ref_sen_node {
+ bool inuse;
+ struct llist_node node;
+};
+
+/*
+ * We need two sentinel nodes for lockless list manipulations from the
+ * release worker - the first node is used in the current reclaim iteration,
+ * the second node in the next iteration. The next iteration marks the first
+ * node as free, for use in a subsequent iteration.
+ */
+#define PERCPU_REF_SEN_NODES_COUNT 2
+
+/* Track last processed percpu ref node */
+static struct llist_node *last_percpu_ref_node;
+
+static struct percpu_ref_sen_node
+ percpu_ref_sen_nodes[PERCPU_REF_SEN_NODES_COUNT];
+
+static DECLARE_DELAYED_WORK(percpu_ref_release_work, percpu_ref_release_work_fn);
+
+static bool percpu_ref_is_sen_node(struct llist_node *node)
+{
+ return &percpu_ref_sen_nodes[0].node <= node &&
+ node <= &percpu_ref_sen_nodes[PERCPU_REF_SEN_NODES_COUNT - 1].node;
+}
+
+static struct llist_node *percpu_ref_get_sen_node(void)
+{
+ int i;
+ struct percpu_ref_sen_node *sn;
+
+ for (i = 0; i < PERCPU_REF_SEN_NODES_COUNT; i++) {
+ sn = &percpu_ref_sen_nodes[i];
+ if (!sn->inuse) {
+ sn->inuse = true;
+ return &sn->node;
+ }
+ }
+
+ return NULL;
+}
+
+static void percpu_ref_put_sen_node(struct llist_node *node)
+{
+ struct percpu_ref_sen_node *sn = container_of(node, struct percpu_ref_sen_node, node);
+
+ sn->inuse = false;
+ init_llist_node(node);
+}
+
+static void percpu_ref_put_all_sen_nodes_except(struct llist_node *node)
+{
+ int i;
+
+ for (i = 0; i < PERCPU_REF_SEN_NODES_COUNT; i++) {
+ if (&percpu_ref_sen_nodes[i].node == node)
+ continue;
+ percpu_ref_sen_nodes[i].inuse = false;
+ init_llist_node(&percpu_ref_sen_nodes[i].node);
+ }
+}
+
+static struct workqueue_struct *percpu_ref_release_wq;
+
+static void percpu_ref_release_work_fn(struct work_struct *work)
+{
+ struct llist_node *pos, *first, *head, *prev, *next;
+ struct llist_node *sen_node;
+ struct percpu_ref *ref;
+ int count = 0;
+ bool held;
+
+ first = READ_ONCE(percpu_ref_manage_head.first);
+ if (!first)
+ goto queue_release_work;
+
+ /*
+ * Enqueue a dummy node to mark the start of the scan. This dummy
+ * node is used as the start point of the scan and ensures that
+ * no additional synchronization is required with new
+ * ref node additions to the llist. Any new refs will
+ * be processed in the next run of the kworker.
+ *
+ *               SCAN START PTR
+ *                     |
+ *                     v
+ *   +------+      +-------+     +------+     +------+
+ *   |      |      | dummy |     | ref  |     | ref  |
+ *   | head |----->| node  |---->| node |---->| node |---->NULL
+ *   |      |      |       |     |      |     |      |
+ *   +------+      +-------+     +------+     +------+
+ *
+ *
+ * New ref node addition:
+ *
+ *                            SCAN START PTR
+ *                                  |
+ *                                  v
+ *   +------+      +------+     +-------+     +------+     +------+
+ *   |      |      | ref  |     | dummy |     | ref  |     | ref  |
+ *   | head |----->| node |---->| node  |---->| node |---->| node |---->NULL
+ *   |      |      |      |     |       |     |      |     |      |
+ *   +------+      +------+     +-------+     +------+     +------+
+ *
+ */
+ if (last_percpu_ref_node == NULL || last_percpu_ref_node->next == NULL) {
+retry_sentinel_get:
+ sen_node = percpu_ref_get_sen_node();
+ /*
+ * All sentinel nodes are in use? This should not happen, as we
+ * require only one sentinel for the start of list traversal and
+ * the other sentinel node is freed during the traversal.
+ */
+ if (WARN_ONCE(!sen_node, "All sentinel nodes are in use")) {
+ /* Use first node as the sentinel node */
+ head = first->next;
+ if (!head) {
+ struct llist_node *ign_node = NULL;
+ /*
+ * We exhausted sentinel nodes. However, there aren't
+ * enough nodes in the llist. So, we have leaked
+ * sentinel nodes. Reclaim sentinels and retry.
+ */
+ if (percpu_ref_is_sen_node(first))
+ ign_node = first;
+ percpu_ref_put_all_sen_nodes_except(ign_node);
+ goto retry_sentinel_get;
+ }
+ prev = first;
+ } else {
+ llist_add(sen_node, &percpu_ref_manage_head);
+ prev = sen_node;
+ head = prev->next;
+ }
+ } else {
+ prev = last_percpu_ref_node;
+ head = prev->next;
+ }
+
+ last_percpu_ref_node = NULL;
+ llist_for_each_safe(pos, next, head) {
+ /* Free sentinel node which is present in the list */
+ if (percpu_ref_is_sen_node(pos)) {
+ prev->next = pos->next;
+ percpu_ref_put_sen_node(pos);
+ continue;
+ }
+
+ ref = container_of(pos, struct percpu_ref_data, node)->ref;
+ __percpu_ref_switch_to_atomic_sync_checked(ref, false);
+ /*
+ * Drop the ref while in an RCU read-side critical section to
+ * prevent the object from being freed while we manipulate its node.
+ */
+ rcu_read_lock();
+ percpu_ref_put(ref);
+ held = percpu_ref_tryget(ref);
+ if (!held) {
+ prev->next = pos->next;
+ init_llist_node(pos);
+ ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
+ }
+ rcu_read_unlock();
+ if (!held)
+ continue;
+ __percpu_ref_switch_to_percpu_checked(ref, false);
+ count++;
+ if (count == max_scan_count) {
+ last_percpu_ref_node = pos;
+ break;
+ }
+ prev = pos;
+ }
+
+queue_release_work:
+ queue_delayed_work(percpu_ref_release_wq, &percpu_ref_release_work,
+ msecs_to_jiffies(scan_interval));
+}
+
+static __init int percpu_ref_setup(void)
+{
+ percpu_ref_release_wq = alloc_workqueue("percpu_ref_release_wq",
+ WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
+ if (!percpu_ref_release_wq)
+ return -ENOMEM;
+
+ queue_delayed_work(percpu_ref_release_wq, &percpu_ref_release_work,
+ msecs_to_jiffies(scan_interval));
+ return 0;
+}
+early_initcall(percpu_ref_setup);
Add a new "managed mode" to percpu refcounts, to track initial reference drop for refs which use RCU grace period for their object reclaims. Typical usage pattern for such refs is: // Called with elevated refcount get() p = get_ptr(); kref_get(&p->count); return p; get() rcu_read_lock(); p = get_ptr(); if (p && !kref_get_unless_zero(&p->count)) p = NULL; rcu_read_unlock(); return p; release() remove_ptr(p); call_rcu(&p->rcu, freep); release() remove_ptr(p); kfree_rcu((p, rcu); Currently, percpu ref requires users to call percpu_ref_kill() when object usage enters a shutdown phase. Post killi operation, ref increment/ decrement are performed on a atomic counter. For cases where ref is actively acquired and released after percpu_ref_kill(), percpu ref does not provide any performance benefits over using an atomic reference counter. Managed mode offloads tracking of ref kill to a manager thread, thereby not requiring users to explicitly call percpu_ref_kill(). This helps avoid the problem of suboptimal performance if a percpu ref is actively acquired and released after percpu_ref_kill() operation. A percpu ref can be initialized as managed either during percpu_ref_init() by passing PERCPU_REF_REL_MANAGED flag or a reinitable ref can be switched to managed mode using percpu_ref_switch_to_managed() post its initialization. Deferred switch to managed mode can be used for cases like module initialization errors, where a inited percpu ref's initial reference is dropped before the object becomes active and is referenced by other contexts. One such case is Apparmor labels which are not associated yet with a namespace. These labels are freed without waiting for a RCU grace period. So, managed mode cannot be used for these labels until their initialization has completed. Following are the allowed initialization modes for managed ref: Atomic Percpu Dead Reinit Managed Managed-ref Y N Y Y Y Following are the allowed transitions for managed ref: To --> A P P(RI) M D D(RI) D(RI/M) KLL REI RES A y n y y n y y y y y P n n n n y n n y n n M n n n y n n y n y y P(RI) y n y y n y y y y y D(RI) y n y y n y y - y y D(RI/M) n n n y n n y - y y Modes: A - Atomic P - PerCPU M - Managed P(RI) - PerCPU with ReInit D(RI) - Dead with ReInit D(RI/M) - Dead with ReInit and Managed PerCPU Ref Ops: KLL - Kill REI - Reinit RES - Resurrect Once a percpu ref is switched to managed mode, it cannot be switched to any other active mode. On reinit/resurrect, managed ref is reinitialized in managed mode. Signed-off-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com> --- .../admin-guide/kernel-parameters.txt | 12 + include/linux/percpu-refcount.h | 13 + lib/percpu-refcount.c | 358 +++++++++++++++++- 3 files changed, 364 insertions(+), 19 deletions(-)