
[RFC,3/6] percpu-refcount: Extend managed mode to allow runtime switching

Message ID 20240916050811.473556-4-Neeraj.Upadhyay@amd.com (mailing list archive)
State Handled Elsewhere
Delegated to: Paul Moore
Series Managed Percpu Refcount

Commit Message

Neeraj Upadhyay Sept. 16, 2024, 5:08 a.m. UTC
Provide more flexibility in runtime mode switching for a managed
percpu ref. This is useful when a managed ref's object enters its
shutdown phase: instead of waiting for the manager thread to process
the ref, the user can directly invoke percpu_ref_kill() on the ref.
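
As an illustration, below is a minimal sketch of how a user might drive
this from an object's shutdown path. The my_obj structure, the
my_obj_release()/my_obj_create()/my_obj_shutdown() names and the exact
reference accounting are assumptions for illustration only, not part of
this series:

	#include <linux/percpu-refcount.h>
	#include <linux/slab.h>

	struct my_obj {
		struct percpu_ref ref;
		/* ... object state ... */
	};

	static void my_obj_release(struct percpu_ref *ref)
	{
		struct my_obj *obj = container_of(ref, struct my_obj, ref);

		/* percpu_ref_exit() and any RCU deferral omitted for brevity. */
		kfree(obj);
	}

	static int my_obj_create(struct my_obj *obj)
	{
		/* Start out in managed mode; the release worker tracks this ref. */
		return percpu_ref_init(&obj->ref, my_obj_release,
				       PERCPU_REF_REL_MANAGED, GFP_KERNEL);
	}

	static void my_obj_shutdown(struct my_obj *obj)
	{
		/*
		 * percpu_ref_switch_to_unmanaged() must be called with an
		 * elevated refcount; assume the shutdown path still holds a
		 * reference on obj.  After the switch the ref behaves like an
		 * ordinary reinit-capable percpu ref, so it can be killed
		 * directly instead of waiting for the manager thread.
		 */
		percpu_ref_switch_to_unmanaged(&obj->ref);
		percpu_ref_kill(&obj->ref);
	}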

The init modes are the same as in the existing code. Runtime mode
switching allows a managed ref to be switched back to unmanaged mode,
which in turn allows transitions from managed mode to all reinit modes.

To -->       A   P   P(RI)   M   D  D(RI)  D(RI/M)   EX   REI   RES

  A          y   n     y     y   n    y       y       y     y     y
  P          n   n     n     n   y    n       n       y     n     n
  M          y*  n     y*    y   n    y*      y       y*    y     y
  P(RI)      y   n     y     y   n    y       y       y     y     y
  D(RI)      y   n     y     y   n    y       y       -     y     y
  D(RI/M)    y*  n     y*    y   n    y*      y       -     y     y

Modes:
A - Atomic  P - PerCPU  M - Managed  P(RI) - PerCPU with ReInit
D - Dead  D(RI) - Dead with ReInit  D(RI/M) - Dead with ReInit and Managed

PerCPU Ref Ops:

KLL - Kill  EX - Exit  REI - Reinit  RES - Resurrect

(RI) marks modes which are initialized with PERCPU_REF_ALLOW_REINIT.
The table lists the allowed transitions; entries marked 'y*' are
indirect transitions. For example, a managed ref switches to P(RI)
mode when percpu_ref_switch_to_unmanaged() is called for it; from
P(RI) mode it can then be switched directly to A mode using
percpu_ref_switch_to_atomic().
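
Sketched in code, that indirect M -> P(RI) -> A path might look roughly
as follows (a sketch only, assuming the caller holds a reference on the
ref, as percpu_ref_switch_to_unmanaged() requires):

	/* ref was initialized with PERCPU_REF_REL_MANAGED (mode M). */
	percpu_ref_switch_to_unmanaged(ref);	/* M -> P(RI) */

	/*
	 * The ref is now an ordinary reinit-capable percpu ref and can
	 * be pinned to atomic mode directly.
	 */
	percpu_ref_switch_to_atomic(ref, NULL);	/* P(RI) -> A */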

Signed-off-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
---
 include/linux/percpu-refcount.h |   3 +-
 lib/percpu-refcount.c           | 248 +++++++++++---------------------
 2 files changed, 88 insertions(+), 163 deletions(-)

Patch

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index e6aea81b3d01..fe967db431a6 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -110,7 +110,7 @@  struct percpu_ref_data {
 	struct rcu_head		rcu;
 	struct percpu_ref	*ref;
 	unsigned int		aux_flags;
-	struct llist_node	node;
+	struct list_head	node;
 
 };
 
@@ -139,6 +139,7 @@  void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
 void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
 int percpu_ref_switch_to_managed(struct percpu_ref *ref);
+void percpu_ref_switch_to_unmanaged(struct percpu_ref *ref);
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill);
 void percpu_ref_resurrect(struct percpu_ref *ref);
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 7d0c85c7ce57..b79e36905aa4 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -5,7 +5,7 @@ 
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
-#include <linux/llist.h>
+#include <linux/list.h>
 #include <linux/moduleparam.h>
 #include <linux/types.h>
 #include <linux/mm.h>
@@ -43,7 +43,12 @@ 
 
 static DEFINE_SPINLOCK(percpu_ref_switch_lock);
 static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
-static LLIST_HEAD(percpu_ref_manage_head);
+static struct list_head percpu_ref_manage_head = LIST_HEAD_INIT(percpu_ref_manage_head);
+/* Spinlock protects node additions/deletions */
+static DEFINE_SPINLOCK(percpu_ref_manage_lock);
+/* Mutex synchronizes node deletions with the node being scanned */
+static DEFINE_MUTEX(percpu_ref_active_switch_mutex);
+static struct list_head *next_percpu_ref_node = &percpu_ref_manage_head;
 
 static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
 {
@@ -112,7 +117,7 @@  int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
 	data->confirm_switch = NULL;
 	data->ref = ref;
 	ref->data = data;
-	init_llist_node(&data->node);
+	INIT_LIST_HEAD(&data->node);
 
 	if (flags & PERCPU_REF_REL_MANAGED)
 		percpu_ref_switch_to_managed(ref);
@@ -150,9 +155,9 @@  static int __percpu_ref_switch_to_managed(struct percpu_ref *ref)
 	data->force_atomic = false;
 	if (!__ref_is_percpu(ref, &percpu_count))
 		__percpu_ref_switch_mode(ref, NULL);
-	/* Ensure ordering of percpu mode switch and node scan */
-	smp_mb();
-	llist_add(&data->node, &percpu_ref_manage_head);
+	spin_lock(&percpu_ref_manage_lock);
+	list_add(&data->node, &percpu_ref_manage_head);
+	spin_unlock(&percpu_ref_manage_lock);
 
 	return 0;
 
@@ -162,7 +167,7 @@  static int __percpu_ref_switch_to_managed(struct percpu_ref *ref)
 }
 
 /**
- * percpu_ref_switch_to_managed - Switch an unmanaged ref to percpu mode.
+ * percpu_ref_switch_to_managed - Switch an unmanaged ref to percpu managed mode.
  *
  * @ref: percpu_ref to switch to managed mode
  *
@@ -179,6 +184,47 @@  int percpu_ref_switch_to_managed(struct percpu_ref *ref)
 }
 EXPORT_SYMBOL_GPL(percpu_ref_switch_to_managed);
 
+/**
+ * percpu_ref_switch_to_unmanaged - Switch a managed ref to percpu mode.
+ *
+ * @ref: percpu_ref to switch back to unmanaged percpu mode
+ *
+ * Must only be called with elevated refcount.
+ */
+void percpu_ref_switch_to_unmanaged(struct percpu_ref *ref)
+{
+	bool mutex_taken = false;
+	struct list_head *node;
+	unsigned long flags;
+
+	might_sleep();
+
+	WARN_ONCE(!percpu_ref_is_managed(ref), "Percpu ref is not managed");
+
+	node = &ref->data->node;
+	spin_lock(&percpu_ref_manage_lock);
+	if (list_empty(node)) {
+		spin_unlock(&percpu_ref_manage_lock);
+		mutex_taken = true;
+		mutex_lock(&percpu_ref_active_switch_mutex);
+		spin_lock(&percpu_ref_manage_lock);
+	}
+
+	if (next_percpu_ref_node == node)
+		next_percpu_ref_node = next_percpu_ref_node->next;
+	list_del_init(node);
+	spin_unlock(&percpu_ref_manage_lock);
+	if (mutex_taken)
+		mutex_unlock(&percpu_ref_active_switch_mutex);
+
+	/* Drop the pseudo-init reference */
+	percpu_ref_put(ref);
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+	ref->data->aux_flags &= ~__PERCPU_REL_MANAGED;
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+}
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_unmanaged);
+
 static void __percpu_ref_exit(struct percpu_ref *ref)
 {
 	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
@@ -599,164 +645,35 @@  module_param(max_scan_count, int, 0444);
 
 static void percpu_ref_release_work_fn(struct work_struct *work);
 
-/*
- * Sentinel llist nodes for lockless list traveral and deletions by
- * the pcpu ref release worker, while nodes are added from
- * percpu_ref_init() and percpu_ref_switch_to_managed().
- *
- * Sentinel node marks the head of list traversal for the current
- * iteration of kworker execution.
- */
-struct percpu_ref_sen_node {
-	bool inuse;
-	struct llist_node node;
-};
-
-/*
- * We need two sentinel nodes for lockless list manipulations from release
- * worker - first node will be used in current reclaim iteration. The second
- * node will be used in next iteration. Next iteration marks the first node
- * as free, for use in subsequent iteration.
- */
-#define PERCPU_REF_SEN_NODES_COUNT     2
-
-/* Track last processed percpu ref node */
-static struct llist_node *last_percpu_ref_node;
-
-static struct percpu_ref_sen_node
-	percpu_ref_sen_nodes[PERCPU_REF_SEN_NODES_COUNT];
-
 static DECLARE_DELAYED_WORK(percpu_ref_release_work, percpu_ref_release_work_fn);
 
-static bool percpu_ref_is_sen_node(struct llist_node *node)
-{
-	return &percpu_ref_sen_nodes[0].node <= node &&
-		node <= &percpu_ref_sen_nodes[PERCPU_REF_SEN_NODES_COUNT - 1].node;
-}
-
-static struct llist_node *percpu_ref_get_sen_node(void)
-{
-	int i;
-	struct percpu_ref_sen_node *sn;
-
-	for (i = 0; i < PERCPU_REF_SEN_NODES_COUNT; i++) {
-		sn = &percpu_ref_sen_nodes[i];
-		if (!sn->inuse) {
-			sn->inuse = true;
-			return &sn->node;
-		}
-	}
-
-	return NULL;
-}
-
-static void percpu_ref_put_sen_node(struct llist_node *node)
-{
-	struct percpu_ref_sen_node *sn = container_of(node, struct percpu_ref_sen_node, node);
-
-	sn->inuse = false;
-	init_llist_node(node);
-}
-
-static void percpu_ref_put_all_sen_nodes_except(struct llist_node *node)
-{
-	int i;
-
-	for (i = 0; i < PERCPU_REF_SEN_NODES_COUNT; i++) {
-		if (&percpu_ref_sen_nodes[i].node == node)
-			continue;
-		percpu_ref_sen_nodes[i].inuse = false;
-		init_llist_node(&percpu_ref_sen_nodes[i].node);
-	}
-}
-
 static struct workqueue_struct *percpu_ref_release_wq;
 
 static void percpu_ref_release_work_fn(struct work_struct *work)
 {
-	struct llist_node *pos, *first, *head, *prev, *next;
-	struct llist_node *sen_node;
+	struct list_head *node;
 	struct percpu_ref *ref;
 	int count = 0;
 	bool held;
-	struct llist_node *last_node = READ_ONCE(last_percpu_ref_node);
 
-	first = READ_ONCE(percpu_ref_manage_head.first);
-	if (!first)
+	mutex_lock(&percpu_ref_active_switch_mutex);
+	spin_lock(&percpu_ref_manage_lock);
+	if (list_empty(&percpu_ref_manage_head)) {
+		next_percpu_ref_node = &percpu_ref_manage_head;
+		spin_unlock(&percpu_ref_manage_lock);
+		mutex_unlock(&percpu_ref_active_switch_mutex);
 		goto queue_release_work;
-
-	/*
-	 * Enqueue a dummy node to mark the start of scan. This dummy
-	 * node is used as start point of scan and ensures that
-	 * there is no additional synchronization required with new
-	 * label node additions to the llist. Any new labels will
-	 * be processed in next run of the kworker.
-	 *
-	 *                SCAN START PTR
-	 *                     |
-	 *                     v
-	 * +----------+     +------+    +------+    +------+
-	 * |          |     |      |    |      |    |      |
-	 * |   head   ------> dummy|--->|label |--->| label|--->NULL
-	 * |          |     | node |    |      |    |      |
-	 * +----------+     +------+    +------+    +------+
-	 *
-	 *
-	 * New label addition:
-	 *
-	 *                       SCAN START PTR
-	 *                            |
-	 *                            v
-	 * +----------+  +------+  +------+    +------+    +------+
-	 * |          |  |      |  |      |    |      |    |      |
-	 * |   head   |--> label|--> dummy|--->|label |--->| label|--->NULL
-	 * |          |  |      |  | node |    |      |    |      |
-	 * +----------+  +------+  +------+    +------+    +------+
-	 *
-	 */
-	if (last_node == NULL || last_node->next == NULL) {
-retry_sentinel_get:
-		sen_node = percpu_ref_get_sen_node();
-		/*
-		 * All sentinel nodes are in use? This should not happen, as we
-		 * require only one sentinel for the start of list traversal and
-		 * other sentinel node is freed during the traversal.
-		 */
-		if (WARN_ONCE(!sen_node, "All sentinel nodes are in use")) {
-			/* Use first node as the sentinel node */
-			head = first->next;
-			if (!head) {
-				struct llist_node *ign_node = NULL;
-				/*
-				 * We exhausted sentinel nodes. However, there aren't
-				 * enough nodes in the llist. So, we have leaked
-				 * sentinel nodes. Reclaim sentinels and retry.
-				 */
-				if (percpu_ref_is_sen_node(first))
-					ign_node = first;
-				percpu_ref_put_all_sen_nodes_except(ign_node);
-				goto retry_sentinel_get;
-			}
-			prev = first;
-		} else {
-			llist_add(sen_node, &percpu_ref_manage_head);
-			prev = sen_node;
-			head = prev->next;
-		}
-	} else {
-		prev = last_node;
-		head = prev->next;
 	}
+	if (next_percpu_ref_node == &percpu_ref_manage_head)
+		node = percpu_ref_manage_head.next;
+	else
+		node = next_percpu_ref_node;
+	next_percpu_ref_node = node->next;
+	list_del_init(node);
+	spin_unlock(&percpu_ref_manage_lock);
 
-	llist_for_each_safe(pos, next, head) {
-		/* Free sentinel node which is present in the list */
-		if (percpu_ref_is_sen_node(pos)) {
-			prev->next = pos->next;
-			percpu_ref_put_sen_node(pos);
-			continue;
-		}
-
-		ref = container_of(pos, struct percpu_ref_data, node)->ref;
+	while (!list_is_head(node, &percpu_ref_manage_head)) {
+		ref = container_of(node, struct percpu_ref_data, node)->ref;
 		__percpu_ref_switch_to_atomic_sync_checked(ref, false);
 		/*
 		 * Drop the ref while in RCU read critical section to
@@ -765,24 +682,31 @@  static void percpu_ref_release_work_fn(struct work_struct *work)
 		rcu_read_lock();
 		percpu_ref_put(ref);
 		held = percpu_ref_tryget(ref);
-		if (!held) {
-			prev->next = pos->next;
-			init_llist_node(pos);
+		if (held) {
+			spin_lock(&percpu_ref_manage_lock);
+			list_add(node, &percpu_ref_manage_head);
+			spin_unlock(&percpu_ref_manage_lock);
+			__percpu_ref_switch_to_percpu_checked(ref, false);
+		} else {
 			ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
 		}
 		rcu_read_unlock();
-		if (!held)
-			continue;
-		__percpu_ref_switch_to_percpu_checked(ref, false);
+		mutex_unlock(&percpu_ref_active_switch_mutex);
 		count++;
-		if (count == READ_ONCE(max_scan_count)) {
-			WRITE_ONCE(last_percpu_ref_node, pos);
+		if (count == READ_ONCE(max_scan_count))
 			goto queue_release_work;
+		mutex_lock(&percpu_ref_active_switch_mutex);
+		spin_lock(&percpu_ref_manage_lock);
+		node = next_percpu_ref_node;
+		if (!list_is_head(next_percpu_ref_node, &percpu_ref_manage_head)) {
+			next_percpu_ref_node = next_percpu_ref_node->next;
+			list_del_init(node);
 		}
-		prev = pos;
+		spin_unlock(&percpu_ref_manage_lock);
 	}
 
-	WRITE_ONCE(last_percpu_ref_node, NULL);
+	mutex_unlock(&percpu_ref_active_switch_mutex);
+
 queue_release_work:
 	queue_delayed_work(percpu_ref_release_wq, &percpu_ref_release_work,
 			   scan_interval);