[NET-PREV,07/51] net: Introduce nd_lock and primitives to work with it

Message ID 174265432664.356712.15568949959158340088.stgit@pro.pro (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series Kill rtnl_lock using fine-grained nd_lock

Checks

Context                  Check     Description
netdev/tree_selection    success   Guessing tree name failed - patch did not apply, async

Commit Message

Kirill Tkhai March 22, 2025, 2:38 p.m. UTC
Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
 include/linux/netdevice.h |   28 ++++
 net/core/dev.c            |  329 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 357 insertions(+)
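
For orientation only, a minimal sketch (not part of this patch) of how a
caller might take and release a device's nd_lock with the primitives
introduced below; example_set_param() and the configuration change it makes
are hypothetical:

static int example_set_param(struct net_device *dev, int val)
{
	struct nd_lock *nd_lock;

	if (!lock_netdev(dev, &nd_lock))
		return -ENODEV;		/* @dev is unregistering */

	/* dev->nd_lock cannot change while nd_lock->mutex is held, so the
	 * whole group of devices sharing this lock is stable here.
	 */
	/* ... modify @dev's configuration using @val ... */

	unlock_netdev(nd_lock);		/* drops both the mutex and the reference */
	return 0;
}
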
Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 614ec5d3d75b..e36e64310bd4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1716,6 +1716,14 @@  enum netdev_reg_state {
 	NETREG_DUMMY,		/* dummy device for NAPI poll */
 };
 
+struct nd_lock {
+	struct mutex mutex;
+	struct list_head list;
+	int nr; /* number of entries in list */
+	refcount_t usage;
+	struct rcu_head rcu;
+};
+
 /**
  *	struct net_device - The DEVICE structure.
  *
@@ -2081,6 +2089,8 @@  struct net_device {
 	char			name[IFNAMSIZ];
 	struct netdev_name_node	*name_node;
 	struct dev_ifalias	__rcu *ifalias;
+	struct nd_lock		__rcu *nd_lock; /* lock protecting this dev */
+	struct list_head	nd_lock_entry; /* entry in nd_lock::list */
 	/*
 	 *	I/O specific fields
 	 *	FIXME: Merge these and struct ifmap into one
@@ -3094,6 +3104,24 @@  static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 	return ret;
 }
 
+void unlock_netdev(struct nd_lock *nd_lock);
+bool lock_netdev(struct net_device *dev, struct nd_lock **nd_lock);
+bool lock_netdev_nested(struct net_device *dev, struct nd_lock **nd_lock,
+			struct nd_lock *held_lock);
+bool double_lock_netdev(struct net_device *dev, struct nd_lock **nd_lock,
+			struct net_device *dev2, struct nd_lock **nd_lock2);
+void double_unlock_netdev(struct nd_lock *nd_lock, struct nd_lock *nd_lock2);
+
+struct nd_lock *alloc_nd_lock(void);
+void put_nd_lock(struct nd_lock *nd_lock);
+void attach_nd_lock(struct net_device *dev, struct nd_lock *nd_lock);
+void detach_nd_lock(struct net_device *dev);
+struct nd_lock *attach_new_nd_lock(struct net_device *dev);
+
+extern struct nd_lock fallback_nd_lock;
+
+void nd_lock_transfer_devices(struct nd_lock **p_lock, struct nd_lock **p_lock2);
+
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
 void unregister_netdevice_many(struct list_head *head);
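
As a hedged illustration of the nested variant declared above (again, not part
of the patch), a batch path that already holds fallback_nd_lock could take
each device's own lock under it roughly as follows; example_unreg_batch() and
the walk over unreg_list are assumptions about how a caller might look:

static void example_unreg_batch(struct list_head *head)
{
	struct net_device *dev;

	mutex_lock(&fallback_nd_lock.mutex);
	list_for_each_entry(dev, head, unreg_list) {
		struct nd_lock *nd_lock;

		if (!lock_netdev_nested(dev, &nd_lock, &fallback_nd_lock))
			continue;	/* already detached from any nd_lock */

		/* ... per-device teardown under the device's lock ... */

		/* If dev->nd_lock was fallback_nd_lock itself, no extra
		 * mutex or reference was taken, so only unlock foreign locks.
		 */
		if (nd_lock != &fallback_nd_lock)
			unlock_netdev(nd_lock);
	}
	mutex_unlock(&fallback_nd_lock.mutex);
}
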
diff --git a/net/core/dev.c b/net/core/dev.c
index 0d0b983a6c21..9d98ab1e76bd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -170,6 +170,25 @@  static int call_netdevice_notifiers_extack(unsigned long val,
 					   struct net_device *dev,
 					   struct netlink_ext_ack *extack);
 
+/* While unregistering many devices at once, e.g., in ->exit_batch_rtnl
+ * methods, every netdev must be locked.
+ * Instead of taking all the devices' original nd_locks at once, we transfer
+ * the devices to this @fallback_nd_lock. This allows us to hold a single
+ * lock during the unregistration. See locks_ordered() for locking order
+ * details.
+ *
+ * A low-priority TODO is to change this algorithm to transfer every device
+ * to one of the devices' original locks instead.
+ *
+ * XXX: see the comment above nd_lock_transfer_devices().
+ */
+struct nd_lock fallback_nd_lock = {
+	.mutex = __MUTEX_INITIALIZER(fallback_nd_lock.mutex),
+	.list = LIST_HEAD_INIT(fallback_nd_lock.list),
+	.usage = REFCOUNT_INIT(1),
+};
+EXPORT_SYMBOL(fallback_nd_lock);
+
 static DEFINE_MUTEX(ifalias_mutex);
 
 /* protects napi_hash addition/deletion and napi_gen_id */
@@ -10322,6 +10341,315 @@  static void netdev_do_free_pcpu_stats(struct net_device *dev)
 	}
 }
 
+struct nd_lock *alloc_nd_lock(void)
+{
+	struct nd_lock *nd_lock = kmalloc(sizeof(*nd_lock), GFP_KERNEL);
+
+	if (!nd_lock)
+		return NULL;
+
+	mutex_init(&nd_lock->mutex);
+	INIT_LIST_HEAD(&nd_lock->list);
+	nd_lock->nr = 0;
+	refcount_set(&nd_lock->usage, 1);
+	return nd_lock;
+}
+EXPORT_SYMBOL(alloc_nd_lock);
+
+void put_nd_lock(struct nd_lock *nd_lock)
+{
+	if (!refcount_dec_and_test(&nd_lock->usage))
+		return;
+	BUG_ON(!list_empty(&nd_lock->list));
+	kfree_rcu(nd_lock, rcu);
+}
+EXPORT_SYMBOL(put_nd_lock);
+
+/* Locking order: fallback_nd_lock comes first,
+ * then the lock with the smaller address is preferred.
+ */
+static bool locks_ordered(struct nd_lock *nd_lock, struct nd_lock *nd_lock2)
+{
+	if ((nd_lock <= nd_lock2 && nd_lock2 != &fallback_nd_lock) ||
+	    nd_lock == &fallback_nd_lock)
+		return true;
+	return false;
+}
+
+/* Lock alive @dev or return false. @held_lock is an optional argument.
+ * If @held_lock is passed, the caller must guarantee that dev->nd_lock
+ * comes after @held_lock in the locking order (for details
+ * see locks_ordered()).
+ * Usually, held_lock is fallback_nd_lock.
+ */
+static bool __lock_netdev(struct net_device *dev, struct nd_lock **ret_nd_lock,
+			  struct nd_lock *held_lock)
+{
+	struct nd_lock *nd_lock;
+	bool got;
+
+	if (held_lock)
+		lockdep_assert_held(&held_lock->mutex);
+
+	while (1) {
+		rcu_read_lock();
+		nd_lock = rcu_dereference(dev->nd_lock);
+		if (nd_lock && nd_lock != held_lock)
+			got = refcount_inc_not_zero(&nd_lock->usage);
+		rcu_read_unlock();
+
+		if (unlikely(!nd_lock)) {
+			/* Someone is unregistering @dev in parallel */
+			*ret_nd_lock = NULL;
+			return false;
+		}
+
+		/* The same lock as we own. Nothing to do. */
+		if (nd_lock == held_lock)
+			break;
+
+		if (unlikely(!got)) {
+			/* @dev->nd_lock changed or @dev is unregistering */
+			cond_resched();
+			continue;
+		}
+
+		WARN_ON(held_lock && !locks_ordered(held_lock, nd_lock));
+
+		if (!held_lock)
+			mutex_lock(&nd_lock->mutex);
+		else
+			mutex_lock_nested(&nd_lock->mutex, SINGLE_DEPTH_NESTING);
+		/* Check after mutex is locked it has not changed */
+		/* Check that the lock has not changed now that the mutex is held */
+			break;
+		mutex_unlock(&nd_lock->mutex);
+		put_nd_lock(nd_lock);
+		cond_resched();
+	}
+
+	*ret_nd_lock = nd_lock;
+	return true;
+}
+
+bool lock_netdev(struct net_device *dev, struct nd_lock **nd_lock)
+{
+	return __lock_netdev(dev, nd_lock, NULL);
+}
+EXPORT_SYMBOL(lock_netdev);
+
+bool lock_netdev_nested(struct net_device *dev, struct nd_lock **nd_lock,
+			struct nd_lock *held_lock)
+{
+	return __lock_netdev(dev, nd_lock, held_lock);
+}
+EXPORT_SYMBOL(lock_netdev_nested);
+
+void unlock_netdev(struct nd_lock *nd_lock)
+{
+	mutex_unlock(&nd_lock->mutex);
+	put_nd_lock(nd_lock);
+}
+EXPORT_SYMBOL(unlock_netdev);
+
+/* Lock two devices and return the locks they are currently attached to.
+ * It is acceptable for one of the devices to be NULL, or for @dev and
+ * @dev2 to point to the same device. The paired bracket
+ * double_unlock_netdev() is able to handle such cases.
+ */
+bool double_lock_netdev(struct net_device *dev, struct nd_lock **ret_nd_lock,
+			struct net_device *dev2, struct nd_lock **ret_nd_lock2)
+{
+	struct nd_lock *nd_lock, *nd_lock2;
+	bool got, got2, ret;
+
+	if (WARN_ON_ONCE(!dev && !dev2))
+		return false;
+
+	if (dev == dev2 || !dev != !dev2) {
+		ret = lock_netdev(dev ? : dev2, ret_nd_lock);
+		*ret_nd_lock2 = *ret_nd_lock;
+		return ret;
+	}
+
+	while (1) {
+		got = got2 = false;
+		rcu_read_lock();
+		nd_lock = rcu_dereference(dev->nd_lock);
+		if (nd_lock)
+			got = refcount_inc_not_zero(&nd_lock->usage);
+		nd_lock2 = rcu_dereference(dev2->nd_lock);
+		if (nd_lock2) {
+			if (nd_lock2 != nd_lock)
+				got2 = refcount_inc_not_zero(&nd_lock2->usage);
+		}
+		rcu_read_unlock();
+		if (!got || (!got2 && nd_lock2 != nd_lock))
+			goto restart;
+
+		if (locks_ordered(nd_lock, nd_lock2)) {
+			mutex_lock(&nd_lock->mutex);
+			if (nd_lock != nd_lock2)
+				mutex_lock_nested(&nd_lock2->mutex, SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&nd_lock2->mutex);
+			if (nd_lock != nd_lock2)
+				mutex_lock_nested(&nd_lock->mutex, SINGLE_DEPTH_NESTING);
+		}
+
+		if (likely(nd_lock == rcu_access_pointer(dev->nd_lock) &&
+			   nd_lock2 == rcu_access_pointer(dev2->nd_lock))) {
+			/* Both locks are acquired and correct */
+			break;
+		}
+		if (nd_lock != nd_lock2)
+			mutex_unlock(&nd_lock2->mutex);
+		mutex_unlock(&nd_lock->mutex);
+restart:
+		if (got)
+			put_nd_lock(nd_lock);
+		if (got2)
+			put_nd_lock(nd_lock2);
+		if (!nd_lock || !nd_lock2) {
+			*ret_nd_lock = *ret_nd_lock2 = NULL;
+			return false;
+		}
+	}
+
+	*ret_nd_lock = nd_lock;
+	*ret_nd_lock2 = nd_lock2;
+	return true;
+}
+EXPORT_SYMBOL(double_lock_netdev);
+
+void double_unlock_netdev(struct nd_lock *nd_lock, struct nd_lock *nd_lock2)
+{
+	if (nd_lock != nd_lock2)
+		unlock_netdev(nd_lock);
+	unlock_netdev(nd_lock2);
+}
+EXPORT_SYMBOL(double_unlock_netdev);
+
+/* Make the set of devices protected by @p_lock and the set of devices
+ * protected by @p_lock2 be protected by the same lock (this function
+ * chooses one of @p_lock and @p_lock2 as that common lock).
+ *
+ * 1)We call this in drivers which bind two or more devices to each other.
+ *   E.g., drivers using newlink (like bonding, bridge and veth), or drivers
+ *   connecting several devices into a switch (like dsa). Nested configurations
+ *   are also handled so that they relate to the same nd_lock (e.g., if a veth
+ *   is attached to a bridge, the same lock will be shared by both veth peers,
+ *   all bridge ports and the bridge itself).
+ *
+ *   This allows us to introduce sane locking like:
+ *
+ *   lock_netdev(bridge, &nd_lock)
+ *   ioctl(change bridge)
+ *     netdevice notifier for bridge       // protected by nd_lock
+ *       netdevice notifier for veth       // protected by nd_lock
+ *         change veth parameter           // protected by nd_lock
+ *       netdevice notifier for other port // protected by nd_lock
+ *         change port device parameter    // protected by nd_lock
+ *   unlock_netdev(nd_lock)
+ *
+ * So, each lock protects some group of devices in the system, and all
+ * of the devices in the group are connected in some logical way.
+ *
+ * 2)The main rule for choosing the common lock is simple: we prefer
+ *   fallback_nd_lock. Why so? Along with commonly used virtual devices,
+ *   there are several hardware drivers which connect devices into groups
+ *   and touch or modify several devices together in one ioctl or netdevice
+ *   event (e.g., mlx5). Without having every device of that zoo physically,
+ *   it is impossible to organize them into small exact groups and test them.
+ *   So, we attach them to the bigger fallback group.
+ *
+ *   Say we have a converted bridge driver and an unconverted my_driver. If
+ *   we attach my_driver dev1 to the bridge, the bridge and my_driver dev1
+ *   must relate to the same nd_lock. But the only nd_lock we can attach is
+ *   fallback_nd_lock, otherwise my_driver dev1 may end up in a different
+ *   lock group than some my_driver dev2 after my_driver dev2 is loaded. This
+ *   would be wrong, since dev1 and dev2 may be used in the same ioctl or
+ *   netdevice event. So, fallback_nd_lock will be used as the resulting lock.
+ *
+ *   Note that after all hardware drivers organize their logically connected
+ *   devices into correct nd_lock groups, we can remove this rule.
+ *
+ *   The second rule is that we prefer to migrate from the smaller list,
+ *   since that requires fewer iterations.
+ *
+ * 3)Note that the reverse operation (splitting a lock into two locks) is
+ *   not implemented at the moment (and it may be useless).
+ *
+ * 4)The lock newly used for both sets is returned in the @p_lock2 argument.
+ */
+void nd_lock_transfer_devices(struct nd_lock **p_lock, struct nd_lock **p_lock2)
+{
+	struct nd_lock *lock = *p_lock, *lock2 = *p_lock2;
+	struct net_device *dev;
+
+	lockdep_assert_held(&lock->mutex);
+	lockdep_assert_held(&lock2->mutex);
+
+	if (lock == lock2)
+		return;
+
+	if (lock == &fallback_nd_lock ||
+	    (lock2 != &fallback_nd_lock && lock->nr > lock2->nr))
+		swap(lock, lock2);
+
+	list_for_each_entry(dev, &lock->list, nd_lock_entry)
+		rcu_assign_pointer(dev->nd_lock, lock2);
+
+	list_splice(&lock->list, &lock2->list);
+	refcount_add(lock->nr, &lock2->usage);
+	lock2->nr += lock->nr;
+	/* Our caller must own a reference to @lock since it is locked */
+	WARN_ON(refcount_sub_and_test(lock->nr, &lock->usage));
+	lock->nr = 0;
+
+	*p_lock = lock;
+	*p_lock2 = lock2;
+}
+EXPORT_SYMBOL(nd_lock_transfer_devices);
+
+void attach_nd_lock(struct net_device *dev, struct nd_lock *nd_lock)
+{
+	lockdep_assert_held(&nd_lock->mutex);
+	rcu_assign_pointer(dev->nd_lock, nd_lock);
+	list_add(&dev->nd_lock_entry, &nd_lock->list);
+	refcount_inc(&nd_lock->usage);
+	nd_lock->nr++;
+}
+EXPORT_SYMBOL(attach_nd_lock);
+
+void detach_nd_lock(struct net_device *dev)
+{
+	struct nd_lock *nd_lock = rcu_dereference_protected(dev->nd_lock, true);
+
+	lockdep_assert_held(&nd_lock->mutex);
+	rcu_assign_pointer(dev->nd_lock, NULL);
+	list_del_init(&dev->nd_lock_entry);
+	nd_lock->nr--;
+	/* Our caller must own a reference to @nd_lock since it is locked */
+	WARN_ON(refcount_dec_and_test(&nd_lock->usage));
+}
+EXPORT_SYMBOL(detach_nd_lock);
+
+struct nd_lock *attach_new_nd_lock(struct net_device *dev)
+{
+	struct nd_lock *nd_lock = alloc_nd_lock();
+	if (!nd_lock)
+		return NULL;
+
+	mutex_lock(&nd_lock->mutex);
+	attach_nd_lock(dev, nd_lock);
+	mutex_unlock(&nd_lock->mutex);
+	put_nd_lock(nd_lock);
+
+	return nd_lock;
+}
+EXPORT_SYMBOL(attach_new_nd_lock);
+
 /**
  * register_netdevice() - register a network device
  * @dev: device to register
@@ -11094,6 +11422,7 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->link_watch_list);
 	INIT_LIST_HEAD(&dev->adj_list.upper);
 	INIT_LIST_HEAD(&dev->adj_list.lower);
+	INIT_LIST_HEAD(&dev->nd_lock_entry);
 	INIT_LIST_HEAD(&dev->ptype_all);
 	INIT_LIST_HEAD(&dev->ptype_specific);
 	INIT_LIST_HEAD(&dev->net_notifier_list);
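
To make the comment above nd_lock_transfer_devices() concrete, a hypothetical
"enslave" path for a master/port pair might merge the two lock groups like
this; example_enslave() and the enslaving work itself are assumptions, not
code from this series:

static int example_enslave(struct net_device *master, struct net_device *port)
{
	struct nd_lock *m_lock, *p_lock;

	if (!double_lock_netdev(master, &m_lock, port, &p_lock))
		return -ENODEV;		/* one of the devices is unregistering */

	/* Merge both lock groups so that later notifiers touching master and
	 * port are serialized by a single nd_lock. After the call, p_lock is
	 * the common lock and m_lock is the (possibly emptied) source lock.
	 */
	nd_lock_transfer_devices(&m_lock, &p_lock);

	/* ... actual enslaving work, all under the common lock ... */

	double_unlock_netdev(m_lock, p_lock);
	return 0;
}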