diff mbox series

[NET-PREV,37/51] net: Introduce delayed event work

Message ID 174265454834.356712.6297354306843654837.stgit@pro.pro (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series Kill rtnl_lock using fine-grained nd_lock | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply, async

Commit Message

Kirill Tkhai March 22, 2025, 2:42 p.m. UTC
Some drivers (e.g., failover and netvsc) use netdevice notifiers
to link devices to each other by calling netdev_master_upper_dev_link().
We want 1) both devices to use the same lock after linking, and
2) netdevice notifiers to be called with nd_lock held. We can't
have both at the same time, because that leads to a lock-ordering
problem:

lock_netdev(dev1, &nd_lock1);
call_netdevice_notifier()
  lock_netdev(dev2, &nd_lock2); <--- problem here if !locks_ordered()
  nd_lock_transfer_devices(nd_lock, nd_lock2);
  netdev_master_upper_dev_link(dev1, dev2);

We can't use double_lock_netdev() instead of lock_netdev() here,
since dev2 is unknown at that moment.

This patch introduces an interface for handling events in delayed work.
It consists of three parts:
1) Delayed work that calls the event callback. The work starts with
   no locks held, so it can take the locks of both devices in the
   correct order;
2) A completion to notify the task that the delayed work is done;
3) A task_work that lets the task wait for the completion at
   a point where the task does not hold nd_lock.

Here is an example of what happens on module loading:

[Task]                                [Work]

insmod slave_netdev_drv.ko
  enter to kernel
    init_module()
      ...
      ...
      lock_netdev()
      call_netdevice_notifier()
        schedule_delayed_event()
      unlock_netdev()
                                       delayed_event_work()
                                         double_lock_netdev(dev1, &nd_lock1, dev2, &nd_lock2)
                                         nd_lock_transfer_devices(nd_lock, nd_lock2)
                                         netdev_master_upper_dev_link(dev1, dev2)
                                         double_unlock_netdev(nd_lock1, nd_lock2)
                                         complete()


    wait_for_delayed_event_work()
      wait_for_completion()
  exit to userspace

As can be seen, using task work preserves the user-visible behavior here.
We return from the syscall to userspace only after the delayed work has
completed and all events have been handled. This is why we need the task work.

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
 include/linux/netdevice.h |    2 +
 net/core/dev.c            |   95 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e9052e808a4..83b675ec2b0a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2991,6 +2991,8 @@  netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 int call_netdevice_notifiers_info(unsigned long val,
 				  struct netdev_notifier_info *info);
+int schedule_delayed_event(struct net_device *dev,
+			   void (*func)(struct net_device *dev));
 
 #define for_each_netdev(net, d)		\
 		list_for_each_entry(d, &(net)->dev_base_head, dev_list)
diff --git a/net/core/dev.c b/net/core/dev.c
index e6809a80644e..1c447446215d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -154,6 +154,7 @@ 
 #include <linux/pm_runtime.h>
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
+#include <linux/task_work.h>
 #include <net/netdev_rx_queue.h>
 #include <net/page_pool/types.h>
 #include <net/page_pool/helpers.h>
@@ -2088,6 +2089,100 @@  static int call_netdevice_notifiers_mtu(unsigned long val,
 	return call_netdevice_notifiers_info(val, &info.info);
 }
 
+struct event_info {	/* state of one deferred netdevice event */
+	struct work_struct work;	/* handler: delayed_event_work() */
+	struct net_device *dev;		/* held via dev_tracker; set to NULL after func runs */
+	netdevice_tracker dev_tracker;	/* tracks the reference taken on dev */
+	void (*func)(struct net_device *slave_dev);	/* event callback, called with dev */
+
+	struct callback_head task_work;	/* handler: wait_for_delayed_event_work() */
+	struct completion comp;		/* completed after func has returned */
+	refcount_t usage;		/* starts at 1; one more if task_work was queued */
+};
+
+static void put_delayed_reg_info(struct event_info *info)
+{	/* Drop one reference on @info; the last put frees it. */
+	if (refcount_dec_and_test(&info->usage))
+		kfree(info);
+}
+
+static void delayed_event_work(struct work_struct *work)
+{	/* Work handler: run the deferred callback, then signal completion. */
+	struct event_info *info;
+	struct net_device *dev;
+
+	info = container_of(work, struct event_info, work);
+	dev = info->dev;
+
+	info->func(dev);
+
+	/* There is no need to hold the device for the whole
+	 * lifetime of @info. Drop the reference right after
+	 * the callback has run, since the task that submitted
+	 * this work may be waiting for the @dev refcount.
+	 */
+	netdev_put(dev, &info->dev_tracker);
+	info->dev = NULL;
+
+	complete(&info->comp);	/* lets wait_for_delayed_event_work() proceed */
+	put_delayed_reg_info(info);	/* drop the initial reference */
+}
+
+static void wait_for_delayed_event_work(struct callback_head *task_work)
+{	/* task_work handler: block until delayed_event_work() signals completion. */
+	struct event_info *info;
+
+	info = container_of(task_work, struct event_info, task_work);
+	wait_for_completion(&info->comp);
+
+	put_delayed_reg_info(info);	/* drop the reference taken when task_work was queued */
+}
+
+static struct event_info *alloc_delayed_event_info(struct net_device *dev,
+				     void (*func)(struct net_device *dev))
+{	/* Allocate and set up an event_info for @dev and @func; NULL if kmalloc fails. */
+	struct event_info *info;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return NULL;
+
+	INIT_WORK(&info->work, delayed_event_work);
+	init_task_work(&info->task_work, wait_for_delayed_event_work);
+	init_completion(&info->comp);
+	refcount_set(&info->usage, 1);	/* initial reference, dropped by delayed_event_work() */
+	info->func = func;
+	info->dev = dev;
+	netdev_hold(dev, &info->dev_tracker, GFP_KERNEL);	/* released by delayed_event_work() */
+
+	return info;
+}
+
+int schedule_delayed_event(struct net_device *dev,
+			   void (*func)(struct net_device *dev))
+{	/* Queue @func(@dev) to run from a work item; returns NOTIFY_OK or NOTIFY_DONE. */
+	struct event_info *info;
+
+	info = alloc_delayed_event_info(dev, func);
+	if (!info)
+		return NOTIFY_DONE;	/* allocation failed: nothing is scheduled */
+
+	/* If the notifier is called from a regular (non-kthread)
+	 * task, make that task wait until the deferred work has
+	 * completed before it returns to userspace. E.g., a
+	 * syscall caller will find failover already connected
+	 * after loading the slave device driver.
+	 */
+	if (!(current->flags & PF_KTHREAD)) {
+		if (!task_work_add(current, &info->task_work, TWA_RESUME))
+			refcount_inc(&info->usage);	/* reference owned by the task_work */
+	}
+
+	schedule_work(&info->work);
+	return NOTIFY_OK;
+}
+EXPORT_SYMBOL_GPL(schedule_delayed_event);
+
 #ifdef CONFIG_NET_INGRESS
 static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);