@@ -2991,6 +2991,8 @@ netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info);
+int schedule_delayed_event(struct net_device *dev,
+ void (*func)(struct net_device *dev));
#define for_each_netdev(net, d) \
list_for_each_entry(d, &(net)->dev_base_head, dev_list)
@@ -154,6 +154,7 @@
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
+#include <linux/task_work.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
@@ -2088,6 +2089,100 @@ static int call_netdevice_notifiers_mtu(unsigned long val,
return call_netdevice_notifiers_info(val, &info.info);
}
+/* State of one netdevice event whose handling is deferred to a workqueue. */
+struct event_info {
+	/* Work item; its handler invokes @func. */
+	struct work_struct work;
+	/* Device handed to @func; reset to NULL once the callback has run. */
+	struct net_device *dev;
+	/* Tracker for the reference held on @dev. */
+	netdevice_tracker dev_tracker;
+	/* Callback executed from the work item. */
+	void (*func)(struct net_device *dev);
+
+	/* Task work that makes the submitting task wait on @comp. */
+	struct callback_head task_work;
+	/* Completed after @func has returned. */
+	struct completion comp;
+	/* Reference count; the structure is freed when it drops to zero. */
+	refcount_t usage;
+};
+
+/* Drop one reference to @info and free it if that was the last one. */
+static void put_delayed_reg_info(struct event_info *info)
+{
+	if (!refcount_dec_and_test(&info->usage))
+		return;
+
+	kfree(info);
+}
+
+/*
+ * Workqueue handler: run the deferred callback, release the device,
+ * signal completion and drop one reference on the event info.
+ */
+static void delayed_event_work(struct work_struct *work)
+{
+	struct event_info *ev = container_of(work, struct event_info, work);
+	struct net_device *netdev = ev->dev;
+
+	ev->func(netdev);
+
+	/* The device reference is only needed for the callback itself.
+	 * Release it right away instead of keeping it for the whole
+	 * life of @ev, since the task that submitted this work may be
+	 * waiting for the device reference counter.
+	 */
+	netdev_put(netdev, &ev->dev_tracker);
+	ev->dev = NULL;
+
+	complete(&ev->comp);
+	put_delayed_reg_info(ev);
+}
+
+/*
+ * Task work callback: block until the deferred work has signalled
+ * completion, then drop one reference on the event info.
+ */
+static void wait_for_delayed_event_work(struct callback_head *task_work)
+{
+	struct event_info *ev = container_of(task_work, struct event_info,
+					     task_work);
+
+	wait_for_completion(&ev->comp);
+	put_delayed_reg_info(ev);
+}
+
+/*
+ * Allocate and initialise an event_info for @dev and @func.
+ *
+ * The result starts with a single reference and holds a tracked
+ * reference on @dev. Returns NULL if the allocation fails.
+ */
+static struct event_info *alloc_delayed_event_info(struct net_device *dev,
+				void (*func)(struct net_device *dev))
+{
+	struct event_info *ev = kmalloc(sizeof(*ev), GFP_KERNEL);
+
+	if (!ev)
+		return NULL;
+
+	ev->dev = dev;
+	ev->func = func;
+	netdev_hold(dev, &ev->dev_tracker, GFP_KERNEL);
+
+	refcount_set(&ev->usage, 1);
+	init_completion(&ev->comp);
+	init_task_work(&ev->task_work, wait_for_delayed_event_work);
+	INIT_WORK(&ev->work, delayed_event_work);
+
+	return ev;
+}
+
+/**
+ * schedule_delayed_event - run a netdevice event callback from a workqueue
+ * @dev: device passed to @func
+ * @func: callback to invoke from the work item
+ *
+ * When the caller is not a kernel thread, a task work is queued as well,
+ * so that the task waits for @func to finish before returning to userspace.
+ *
+ * Return: NOTIFY_OK after the work has been scheduled, NOTIFY_DONE if the
+ * event info could not be allocated (@func is never called in that case).
+ */
+int schedule_delayed_event(struct net_device *dev,
+			   void (*func)(struct net_device *dev))
+{
+	struct event_info *ev = alloc_delayed_event_info(dev, func);
+
+	if (!ev)
+		return NOTIFY_DONE;
+
+	/* A regular task must not get back to userspace before the
+	 * deferred handling is done: e.g. a syscall caller that loaded
+	 * a slave device driver expects failover to be connected
+	 * already. The task work waits for the completion and holds
+	 * an extra reference on @ev while it is pending.
+	 */
+	if (!(current->flags & PF_KTHREAD) &&
+	    !task_work_add(current, &ev->task_work, TWA_RESUME))
+		refcount_inc(&ev->usage);
+
+	schedule_work(&ev->work);
+	return NOTIFY_OK;
+}
+EXPORT_SYMBOL_GPL(schedule_delayed_event);
+
#ifdef CONFIG_NET_INGRESS
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
Some drivers (e.g., failover and netvsc) use netdevice notifiers to link
devices to each other by calling netdev_master_upper_dev_link(). We want
1) both devices to use the same lock after linking, and 2) netdevice
notifiers to be called with nd_lock held. We can't have both at the same
time, because that leads to a lock-order inversion problem:

	lock_netdev(dev1, &nd_lock1);
	call_netdevice_notifier()
	  lock_netdev(dev2, &nd_lock2); <--- problem here if !locks_ordered()
	  nd_lock_transfer_devices(nd_lock, nd_lock2);
	  netdev_master_upper_dev_link(dev1, dev2);

We can't use double_lock_netdev() instead of lock_netdev() here, since
dev2 is not yet known at that moment.

This patch introduces an interface that allows events to be handled in
delayed work. It consists of three parts:

1) Delayed work to call the event callback. The work starts with no locks
   held, so it can take the locks of both devices in the correct order;
2) A completion to notify the task that the delayed work is done;
3) A task_work to let the task wait for the completion at a point where
   the task no longer holds nd_lock.

Here is an example of what happens on module loading:

  [Task]                              [Work]
  insmod slave_netdev_drv.ko
  enter to kernel
  init_module()
  ...
  ...
  lock_netdev()
  call_netdevice_notifier()
    schedule_delayed_event()
  unlock_netdev()
                                      delayed_event_work()
                                        double_lock_netdev(dev1, &nd_lock1, dev2, &nd_lock2)
                                        nd_lock_transfer_devices(nd_lock, nd_lock2)
                                        netdev_master_upper_dev_link(dev1, dev2)
                                        double_unlock_netdev(nd_lock1, nd_lock2)
                                        complete()
  wait_for_delayed_event_work()
    wait_for_completion()
  exit to userspace

As can be seen, using task work preserves the user-visible behavior here:
we return from the syscall to userspace only after the delayed work has
completed and all events have been handled. This is why the task work is
needed.

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
 include/linux/netdevice.h |  2 +
 net/core/dev.c            | 95 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)