diff mbox series

[net-next,v4,05/13] net: devlink: track netdev with devlink_port assigned

Message ID 20221102160211.662752-6-jiri@resnulli.us (mailing list archive)
State Accepted
Commit 02a68a47eadedf95748facfca6ced31fb0181d52
Delegated to: Netdev Maintainers
Headers show
Series net: fix netdev to devlink_port linkage and expose to user | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4343 this patch: 4343
netdev/cc_maintainers warning 1 maintainers not CCed: petrm@nvidia.com
netdev/build_clang success Errors and warnings before: 1048 this patch: 1048
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4530 this patch: 4530
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 189 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Jiri Pirko Nov. 2, 2022, 4:02 p.m. UTC
From: Jiri Pirko <jiri@nvidia.com>

Currently, ethernet drivers are using devlink_port_type_eth_set() and
devlink_port_type_clear() to set devlink port type and link to related
netdev.

Instead of calling them directly, let the driver use
SET_NETDEV_DEVLINK_PORT macro to assign devlink_port pointer and let
devlink track it. Note the devlink port pointer is static during
the time netdevice is registered.

In devlink code, use per-namespace netdev notifier to track
the netdevices with devlink_port assigned and change the internal
devlink_port type and related type pointer accordingly.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
---
v3->v4:
- s/_devlink_port/port in SET_NETDEV_DEVLINK_PORT() macro
- fixed register_netdevice() error path
- put "dev" into "()" in SET_NETDEV_DEVLINK_PORT() macro
v1->v2:
- added kdoc for devlink_port struct field
---
 include/linux/netdevice.h | 19 ++++++++++
 net/core/dev.c            | 14 +++++---
 net/core/devlink.c        | 75 ++++++++++++++++++++++++++++++++++++---
 3 files changed, 99 insertions(+), 9 deletions(-)

Comments

Ido Schimmel Nov. 6, 2022, 9:09 a.m. UTC | #1
On Wed, Nov 02, 2022 at 05:02:03PM +0100, Jiri Pirko wrote:
> @@ -9645,10 +9649,13 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
>  
>  	ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
>  			      &last_id, GFP_KERNEL);
> -	if (ret < 0) {
> -		kfree(devlink);
> -		return NULL;
> -	}
> +	if (ret < 0)
> +		goto err_xa_alloc;
> +
> +	devlink->netdevice_nb.notifier_call = devlink_netdevice_event;
> +	ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb);
> +	if (ret)
> +		goto err_register_netdevice_notifier;
>  
>  	devlink->dev = dev;
>  	devlink->ops = ops;
> @@ -9675,6 +9682,12 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
>  	init_completion(&devlink->comp);
>  
>  	return devlink;
> +
> +err_register_netdevice_notifier:
> +	xa_erase(&devlinks, devlink->index);
> +err_xa_alloc:
> +	kfree(devlink);
> +	return NULL;
>  }
>  EXPORT_SYMBOL_GPL(devlink_alloc_ns);
>  
> @@ -9828,6 +9841,10 @@ void devlink_free(struct devlink *devlink)
>  	WARN_ON(!list_empty(&devlink->port_list));
>  
>  	xa_destroy(&devlink->snapshot_ids);
> +
> +	unregister_netdevice_notifier_net(devlink_net(devlink),
> +					  &devlink->netdevice_nb);
> +
>  	xa_erase(&devlinks, devlink->index);
>  
>  	kfree(devlink);

The network namespace of the devlink instance can change throughout the
lifetime of the devlink instance, but the notifier block is always
registered in the initial namespace. This leads to
unregister_netdevice_notifier_net() failing to unregister the notifier
block, which leads to use-after-free. Reproduce (with KASAN enabled):

# echo "10 0" > /sys/bus/netdevsim/new_device
# ip netns add bla
# devlink dev reload netdevsim/netdevsim10 netns bla
# echo 10 > /sys/bus/netdevsim/del_device
# ip link add dummy10 up type dummy

I see two possible solutions:

1. Use register_netdevice_notifier() instead of
register_netdevice_notifier_net().

2. Move the notifier block to the correct namespace in devlink_reload().
Jiri Pirko Nov. 7, 2022, 7:40 a.m. UTC | #2
Sun, Nov 06, 2022 at 10:09:42AM CET, idosch@idosch.org wrote:
>On Wed, Nov 02, 2022 at 05:02:03PM +0100, Jiri Pirko wrote:
>> @@ -9645,10 +9649,13 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
>>  
>>  	ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
>>  			      &last_id, GFP_KERNEL);
>> -	if (ret < 0) {
>> -		kfree(devlink);
>> -		return NULL;
>> -	}
>> +	if (ret < 0)
>> +		goto err_xa_alloc;
>> +
>> +	devlink->netdevice_nb.notifier_call = devlink_netdevice_event;
>> +	ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb);
>> +	if (ret)
>> +		goto err_register_netdevice_notifier;
>>  
>>  	devlink->dev = dev;
>>  	devlink->ops = ops;
>> @@ -9675,6 +9682,12 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
>>  	init_completion(&devlink->comp);
>>  
>>  	return devlink;
>> +
>> +err_register_netdevice_notifier:
>> +	xa_erase(&devlinks, devlink->index);
>> +err_xa_alloc:
>> +	kfree(devlink);
>> +	return NULL;
>>  }
>>  EXPORT_SYMBOL_GPL(devlink_alloc_ns);
>>  
>> @@ -9828,6 +9841,10 @@ void devlink_free(struct devlink *devlink)
>>  	WARN_ON(!list_empty(&devlink->port_list));
>>  
>>  	xa_destroy(&devlink->snapshot_ids);
>> +
>> +	unregister_netdevice_notifier_net(devlink_net(devlink),
>> +					  &devlink->netdevice_nb);
>> +
>>  	xa_erase(&devlinks, devlink->index);
>>  
>>  	kfree(devlink);
>
>The network namespace of the devlink instance can change throughout the
>lifetime of the devlink instance, but the notifier block is always
>registered in the initial namespace. This leads to
>unregister_netdevice_notifier_net() failing to unregister the notifier
>block, which leads to use-after-free. Reproduce (with KASAN enabled):
>
># echo "10 0" > /sys/bus/netdevsim/new_device
># ip netns add bla
># devlink dev reload netdevsim/netdevsim10 netns bla
># echo 10 > /sys/bus/netdevsim/del_device
># ip link add dummy10 up type dummy
>
>I see two possible solutions:
>
>1. Use register_netdevice_notifier() instead of
>register_netdevice_notifier_net().
>
>2. Move the notifier block to the correct namespace in devlink_reload().

Yep, this was my intention, slipped my mind. Thanks! Will send a
follow-up.
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4b5052db978f..f048a30ea10b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1999,6 +1999,11 @@  enum netdev_ml_priv_type {
  *					registered
  *	@offload_xstats_l3:	L3 HW stats for this netdevice.
  *
+ *	@devlink_port:	Pointer to related devlink port structure.
+ *			Assigned by a driver before netdev registration using
+ *			SET_NETDEV_DEVLINK_PORT macro. This pointer is static
+ *			during the time netdevice is registered.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2349,9 +2354,22 @@  struct net_device {
 	netdevice_tracker	watchdog_dev_tracker;
 	netdevice_tracker	dev_registered_tracker;
 	struct rtnl_hw_stats64	*offload_xstats_l3;
+
+	struct devlink_port	*devlink_port;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
+/*
+ * Driver should use this to assign devlink port instance to a netdevice
+ * before it registers the netdevice. Therefore devlink_port is static
+ * during the netdev lifetime after it is registered.
+ */
+#define SET_NETDEV_DEVLINK_PORT(dev, port)			\
+({								\
+	WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED);	\
+	((dev)->devlink_port = (port));				\
+})
+
 static inline bool netif_elide_gro(const struct net_device *dev)
 {
 	if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
@@ -2785,6 +2803,7 @@  enum netdev_cmd {
 	NETDEV_PRE_TYPE_CHANGE,
 	NETDEV_POST_TYPE_CHANGE,
 	NETDEV_POST_INIT,
+	NETDEV_PRE_UNINIT,
 	NETDEV_RELEASE,
 	NETDEV_NOTIFY_PEERS,
 	NETDEV_JOIN,
diff --git a/net/core/dev.c b/net/core/dev.c
index 2e4f1c97b59e..3bacee3bee78 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1621,10 +1621,10 @@  const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
-	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
-	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
-	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
-	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
+	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
+	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
+	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
@@ -10060,7 +10060,7 @@  int register_netdevice(struct net_device *dev)
 	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
 	write_unlock(&dev_base_lock);
 	if (ret)
-		goto err_uninit;
+		goto err_uninit_notify;
 
 	__netdev_update_features(dev);
 
@@ -10107,6 +10107,8 @@  int register_netdevice(struct net_device *dev)
 out:
 	return ret;
 
+err_uninit_notify:
+	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
 err_uninit:
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
@@ -10856,6 +10858,8 @@  void unregister_netdevice_many_notify(struct list_head *head,
 		netdev_name_node_alt_flush(dev);
 		netdev_name_node_free(dev->name_node);
 
+		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+
 		if (dev->netdev_ops->ndo_uninit)
 			dev->netdev_ops->ndo_uninit(dev);
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3387dfbb80c5..6f06c05c7b1a 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -71,6 +71,7 @@  struct devlink {
 	refcount_t refcount;
 	struct completion comp;
 	struct rcu_head rcu;
+	struct notifier_block netdevice_nb;
 	char priv[] __aligned(NETDEV_ALIGN);
 };
 
@@ -9615,6 +9616,9 @@  void devlink_set_features(struct devlink *devlink, u64 features)
 }
 EXPORT_SYMBOL_GPL(devlink_set_features);
 
+static int devlink_netdevice_event(struct notifier_block *nb,
+				   unsigned long event, void *ptr);
+
 /**
  *	devlink_alloc_ns - Allocate new devlink instance resources
  *	in specific namespace
@@ -9645,10 +9649,13 @@  struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
 
 	ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
 			      &last_id, GFP_KERNEL);
-	if (ret < 0) {
-		kfree(devlink);
-		return NULL;
-	}
+	if (ret < 0)
+		goto err_xa_alloc;
+
+	devlink->netdevice_nb.notifier_call = devlink_netdevice_event;
+	ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb);
+	if (ret)
+		goto err_register_netdevice_notifier;
 
 	devlink->dev = dev;
 	devlink->ops = ops;
@@ -9675,6 +9682,12 @@  struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
 	init_completion(&devlink->comp);
 
 	return devlink;
+
+err_register_netdevice_notifier:
+	xa_erase(&devlinks, devlink->index);
+err_xa_alloc:
+	kfree(devlink);
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(devlink_alloc_ns);
 
@@ -9828,6 +9841,10 @@  void devlink_free(struct devlink *devlink)
 	WARN_ON(!list_empty(&devlink->port_list));
 
 	xa_destroy(&devlink->snapshot_ids);
+
+	unregister_netdevice_notifier_net(devlink_net(devlink),
+					  &devlink->netdevice_nb);
+
 	xa_erase(&devlinks, devlink->index);
 
 	kfree(devlink);
@@ -10121,6 +10138,56 @@  void devlink_port_type_clear(struct devlink_port *devlink_port)
 }
 EXPORT_SYMBOL_GPL(devlink_port_type_clear);
 
+static int devlink_netdevice_event(struct notifier_block *nb,
+				   unsigned long event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct devlink_port *devlink_port = netdev->devlink_port;
+	struct devlink *devlink;
+
+	devlink = container_of(nb, struct devlink, netdevice_nb);
+
+	if (!devlink_port || devlink_port->devlink != devlink)
+		return NOTIFY_OK;
+
+	switch (event) {
+	case NETDEV_POST_INIT:
+		/* Set the type but not netdev pointer. It is going to be set
+		 * later on by NETDEV_REGISTER event. Happens once during
+		 * netdevice register
+		 */
+		__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
+					NULL, true);
+		break;
+	case NETDEV_REGISTER:
+		/* Set the netdev on top of previously set type. Note this
+		 * event happens also during net namespace change so here
+		 * we take into account netdev pointer appearing in this
+		 * namespace.
+		 */
+		__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
+					netdev, true);
+		break;
+	case NETDEV_UNREGISTER:
+		/* Clear netdev pointer, but not the type. This event happens
+		 * also during net namespace change so we need to clear
+		 * pointer to netdev that is going to another net namespace.
+		 */
+		__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
+					NULL, true);
+		break;
+	case NETDEV_PRE_UNINIT:
+		/* Clear the type and the netdev pointer. Happens once during
+		 * netdevice unregister.
+		 */
+		__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET,
+					NULL, true);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
 static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
 				    enum devlink_port_flavour flavour)
 {