diff mbox series

[RFC,net-next,04/10] devlink: always check if the devlink instance is registered

Message ID 20221217011953.152487-5-kuba@kernel.org (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series devlink: remove the wait-for-references on unregister | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/apply fail Patch does not apply to net-next

Commit Message

Jakub Kicinski Dec. 17, 2022, 1:19 a.m. UTC
Always check under the instance lock whether the devlink instance
is still / already registered.

This is a no-op for the most part, as the unregistration path currently
waits for all references. On the init path, however, we may temporarily
open up a race with netdev code, if netdevs are registered before the
devlink instance. This is temporary, the next change fixes it, and this
commit has been split out for the ease of review.

Note that in case of iterating over sub-objects which have their
own lock (regions and line cards) we assume an implicit dependency
between those objects existing and devlink unregistration.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  1 +
 net/devlink/basic.c   | 35 +++++++++++++++++++++++++++++------
 net/devlink/core.c    | 25 +++++++++++++++++++++----
 net/devlink/netlink.c | 10 ++++++++--
 4 files changed, 59 insertions(+), 12 deletions(-)

Comments

Keller, Jacob E Dec. 19, 2022, 5:48 p.m. UTC | #1
On 12/16/2022 5:19 PM, Jakub Kicinski wrote:
> Always check under the instance lock whether the devlink instance
> is still / already registered.
> 

Ok. So now the reference ensures less about whats valid. It guarantees a
lock but doesn't ensure that the devlink remains registered unless you
acquire the lock and check that the devlink is alive under lock now?

> This is a no-op for the most part, as the unregistration path currently
> waits for all references. On the init path, however, we may temporarily
> open up a race with netdev code, if netdevs are registered before the
> devlink instance. This is temporary, the next change fixes it, and this
> commit has been split out for the ease of review.
> 

This means you're adding the problem here, but its fixed in next commit..?

> Note that in case of iterating over sub-objects which have their
> own lock (regions and line cards) we assume an implicit dependency
> between those objects existing and devlink unregistration.
> 

That seems reasonable.

> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
>  include/net/devlink.h |  1 +
>  net/devlink/basic.c   | 35 +++++++++++++++++++++++++++++------
>  net/devlink/core.c    | 25 +++++++++++++++++++++----
>  net/devlink/netlink.c | 10 ++++++++--
>  4 files changed, 59 insertions(+), 12 deletions(-)
> 
> diff --git a/include/net/devlink.h b/include/net/devlink.h
> index 6a2e4f21779f..36e013d3aa52 100644
> --- a/include/net/devlink.h
> +++ b/include/net/devlink.h
> @@ -1626,6 +1626,7 @@ struct device *devlink_to_dev(const struct devlink *devlink);
>  void devl_lock(struct devlink *devlink);
>  int devl_trylock(struct devlink *devlink);
>  void devl_unlock(struct devlink *devlink);
> +bool devl_is_alive(struct devlink *devlink);
>  void devl_assert_locked(struct devlink *devlink);
>  bool devl_lock_is_held(struct devlink *devlink);
>  
> diff --git a/net/devlink/basic.c b/net/devlink/basic.c
> index 5f33d74eef83..6b18e70a39fd 100644
> --- a/net/devlink/basic.c
> +++ b/net/devlink/basic.c
> @@ -2130,6 +2130,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
>  		int idx = 0;
>  
>  		mutex_lock(&devlink->linecards_lock);
> +		if (!devl_is_alive(devlink))
> +			goto next_devlink;
> +
>  		list_for_each_entry(linecard, &devlink->linecard_list, list) {
>  			if (idx < dump->idx) {
>  				idx++;
> @@ -2151,6 +2154,7 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
>  			}
>  			idx++;
>  		}
> +next_devlink:
>  		mutex_unlock(&devlink->linecards_lock);
>  		devlink_put(devlink);
>  	}
> @@ -7809,6 +7813,12 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
>  		int idx = 0;
>  
>  		mutex_lock(&devlink->reporters_lock);
> +		if (!devl_is_alive(devlink)) {
> +			mutex_unlock(&devlink->reporters_lock);
> +			devlink_put(devlink);
> +			continue;
> +		}
> +
>  		list_for_each_entry(reporter, &devlink->reporter_list,
>  				    list) {
>  			if (idx < dump->idx) {
> @@ -7830,6 +7840,9 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
>  		mutex_unlock(&devlink->reporters_lock);
>  
>  		devl_lock(devlink);
> +		if (!devl_is_alive(devlink))
> +			goto next_devlink;
> +
>  		xa_for_each(&devlink->ports, port_index, port) {
>  			mutex_lock(&port->reporters_lock);
>  			list_for_each_entry(reporter, &port->reporter_list, list) {
> @@ -7853,6 +7866,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
>  			}
>  			mutex_unlock(&port->reporters_lock);
>  		}
> +next_devlink:
>  		devl_unlock(devlink);
>  		devlink_put(devlink);
>  	}
> @@ -12218,7 +12232,8 @@ void devlink_compat_running_version(struct devlink *devlink,
>  		return;
>  
>  	devl_lock(devlink);
> -	__devlink_compat_running_version(devlink, buf, len);
> +	if (devl_is_alive(devlink))
> +		__devlink_compat_running_version(devlink, buf, len);
>  	devl_unlock(devlink);
>  }
>  
> @@ -12227,20 +12242,28 @@ int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
>  	struct devlink_flash_update_params params = {};
>  	int ret;
>  
> -	if (!devlink->ops->flash_update)
> -		return -EOPNOTSUPP;
> +	devl_lock(devlink);
> +	if (!devl_is_alive(devlink)) {
> +		ret = -ENODEV;
> +		goto out_unlock;
> +	}
> +
> +	if (!devlink->ops->flash_update) {
> +		ret = -EOPNOTSUPP;
> +		goto out_unlock;
> +	}
>  
>  	ret = request_firmware(&params.fw, file_name, devlink->dev);
>  	if (ret)
> -		return ret;
> +		goto out_unlock;
>  
> -	devl_lock(devlink);
>  	devlink_flash_update_begin_notify(devlink);
>  	ret = devlink->ops->flash_update(devlink, &params, NULL);
>  	devlink_flash_update_end_notify(devlink);
> -	devl_unlock(devlink);
>  
>  	release_firmware(params.fw);
> +out_unlock:
> +	devl_unlock(devlink);
>  
>  	return ret;
>  }
> diff --git a/net/devlink/core.c b/net/devlink/core.c
> index d3b8336946fd..2abad8247597 100644
> --- a/net/devlink/core.c
> +++ b/net/devlink/core.c
> @@ -67,6 +67,21 @@ void devl_unlock(struct devlink *devlink)
>  }
>  EXPORT_SYMBOL_GPL(devl_unlock);
>  
> +bool devl_is_alive(struct devlink *devlink)
> +{
> +	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
> +}
> +EXPORT_SYMBOL_GPL(devl_is_alive);
> +
> +/**
> + * devlink_try_get() - try to obtain a reference on a devlink instance
> + * @devlink: instance to reference
> + *
> + * Obtain a reference on a devlink instance. A reference on a devlink instance
> + * only implies that it's safe to take the instance lock. It does not imply
> + * that the instance is registered, use devl_is_alive() after taking
> + * the instance lock to check registration status.
> + */
>  struct devlink *__must_check devlink_try_get(struct devlink *devlink)
>  {
>  	if (refcount_inc_not_zero(&devlink->refcount))
> @@ -300,10 +315,12 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
>  	devlinks_xa_for_each_registered_get(net, index, devlink) {
>  		WARN_ON(!(devlink->features & DEVLINK_F_RELOAD));
>  		devl_lock(devlink);
> -		err = devlink_reload(devlink, &init_net,
> -				     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
> -				     DEVLINK_RELOAD_LIMIT_UNSPEC,
> -				     &actions_performed, NULL);
> +		err = 0;
> +		if (devl_is_alive(devlink))
> +			err = devlink_reload(devlink, &init_net,
> +					     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
> +					     DEVLINK_RELOAD_LIMIT_UNSPEC,
> +					     &actions_performed, NULL);
>  		devl_unlock(devlink);
>  		devlink_put(devlink);
>  
> diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
> index b38df704be1c..773efaabb6ad 100644
> --- a/net/devlink/netlink.c
> +++ b/net/devlink/netlink.c
> @@ -98,7 +98,8 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs)
>  
>  	devlinks_xa_for_each_registered_get(net, index, devlink) {
>  		devl_lock(devlink);
> -		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
> +		if (devl_is_alive(devlink) &&
> +		    strcmp(devlink->dev->bus->name, busname) == 0 &&
>  		    strcmp(dev_name(devlink->dev), devname) == 0)
>  			return devlink;
>  		devl_unlock(devlink);
> @@ -210,7 +211,12 @@ int devlink_instance_iter_dump(struct sk_buff *msg, struct netlink_callback *cb)
>  
>  	devlink_dump_for_each_instance_get(msg, dump, devlink) {
>  		devl_lock(devlink);
> -		err = cmd->dump_one(msg, devlink, cb);
> +
> +		if (devl_is_alive(devlink))
> +			err = cmd->dump_one(msg, devlink, cb);
> +		else
> +			err = 0;
> +
>  		devl_unlock(devlink);
>  		devlink_put(devlink);
>
Jakub Kicinski Dec. 19, 2022, 9:55 p.m. UTC | #2
On Mon, 19 Dec 2022 09:48:54 -0800 Jacob Keller wrote:
> On 12/16/2022 5:19 PM, Jakub Kicinski wrote:
> > Always check under the instance lock whether the devlink instance
> > is still / already registered.
> 
> Ok. So now the reference ensures less about whats valid. It guarantees a
> lock but doesn't ensure that the devlink remains registered unless you
> acquire the lock and check that the devlink is alive under lock now?

Correct.

> > This is a no-op for the most part, as the unregistration path currently
> > waits for all references. On the init path, however, we may temporarily
> > open up a race with netdev code, if netdevs are registered before the
> > devlink instance. This is temporary, the next change fixes it, and this
> > commit has been split out for the ease of review.
> >   
> 
> This means you're adding the problem here, but its fixed in next commit..?

Yes, I can squash when posting for applying, but TBH I think the clarity
of the changes outweighs the tiny and transient race.
Keller, Jacob E Dec. 19, 2022, 10:08 p.m. UTC | #3
On 12/19/2022 1:55 PM, Jakub Kicinski wrote:
> On Mon, 19 Dec 2022 09:48:54 -0800 Jacob Keller wrote:
>> On 12/16/2022 5:19 PM, Jakub Kicinski wrote:
>>> Always check under the instance lock whether the devlink instance
>>> is still / already registered.
>>
>> Ok. So now the reference ensures less about whats valid. It guarantees a
>> lock but doesn't ensure that the devlink remains registered unless you
>> acquire the lock and check that the devlink is alive under lock now?
> 
> Correct.
> 
>>> This is a no-op for the most part, as the unregistration path currently
>>> waits for all references. On the init path, however, we may temporarily
>>> open up a race with netdev code, if netdevs are registered before the
>>> devlink instance. This is temporary, the next change fixes it, and this
>>> commit has been split out for the ease of review.
>>>   
>>
>> This means you're adding the problem here, but its fixed in next commit..?
> 
> Yes, I can squash when posting for applying, but TBH I think the clarity
> of the changes outweighs the tiny and transient race.

I would agree. The only reason I could think it to be a problem is if a
bisect lands precisely on this commit and you happen to hit this... what
are the side effects of the race here? If the side effects don't include
significant issues I think its fine.
Jiri Pirko Jan. 2, 2023, 1:58 p.m. UTC | #4
Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:
>Always check under the instance lock whether the devlink instance
>is still / already registered.
>
>This is a no-op for the most part, as the unregistration path currently
>waits for all references. On the init path, however, we may temporarily
>open up a race with netdev code, if netdevs are registered before the
>devlink instance. This is temporary, the next change fixes it, and this
>commit has been split out for the ease of review.
>
>Note that in case of iterating over sub-objects which have their
>own lock (regions and line cards) we assume an implicit dependency
>between those objects existing and devlink unregistration.

This would be probably very valuable to add as a comment inside the code
for the future reader mind sake.


>
>Signed-off-by: Jakub Kicinski <kuba@kernel.org>
>---
> include/net/devlink.h |  1 +
> net/devlink/basic.c   | 35 +++++++++++++++++++++++++++++------
> net/devlink/core.c    | 25 +++++++++++++++++++++----
> net/devlink/netlink.c | 10 ++++++++--
> 4 files changed, 59 insertions(+), 12 deletions(-)
>
>diff --git a/include/net/devlink.h b/include/net/devlink.h
>index 6a2e4f21779f..36e013d3aa52 100644
>--- a/include/net/devlink.h
>+++ b/include/net/devlink.h
>@@ -1626,6 +1626,7 @@ struct device *devlink_to_dev(const struct devlink *devlink);
> void devl_lock(struct devlink *devlink);
> int devl_trylock(struct devlink *devlink);
> void devl_unlock(struct devlink *devlink);
>+bool devl_is_alive(struct devlink *devlink);
> void devl_assert_locked(struct devlink *devlink);
> bool devl_lock_is_held(struct devlink *devlink);
> 
>diff --git a/net/devlink/basic.c b/net/devlink/basic.c
>index 5f33d74eef83..6b18e70a39fd 100644
>--- a/net/devlink/basic.c
>+++ b/net/devlink/basic.c
>@@ -2130,6 +2130,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
> 		int idx = 0;
> 
> 		mutex_lock(&devlink->linecards_lock);
>+		if (!devl_is_alive(devlink))
>+			goto next_devlink;
>+
> 		list_for_each_entry(linecard, &devlink->linecard_list, list) {
> 			if (idx < dump->idx) {
> 				idx++;
>@@ -2151,6 +2154,7 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
> 			}
> 			idx++;
> 		}
>+next_devlink:
> 		mutex_unlock(&devlink->linecards_lock);
> 		devlink_put(devlink);
> 	}
>@@ -7809,6 +7813,12 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
> 		int idx = 0;
> 
> 		mutex_lock(&devlink->reporters_lock);
>+		if (!devl_is_alive(devlink)) {
>+			mutex_unlock(&devlink->reporters_lock);
>+			devlink_put(devlink);
>+			continue;
>+		}
>+
> 		list_for_each_entry(reporter, &devlink->reporter_list,
> 				    list) {
> 			if (idx < dump->idx) {
>@@ -7830,6 +7840,9 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
> 		mutex_unlock(&devlink->reporters_lock);
> 
> 		devl_lock(devlink);
>+		if (!devl_is_alive(devlink))
>+			goto next_devlink;
>+
> 		xa_for_each(&devlink->ports, port_index, port) {
> 			mutex_lock(&port->reporters_lock);
> 			list_for_each_entry(reporter, &port->reporter_list, list) {
>@@ -7853,6 +7866,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
> 			}
> 			mutex_unlock(&port->reporters_lock);
> 		}
>+next_devlink:
> 		devl_unlock(devlink);
> 		devlink_put(devlink);
> 	}
>@@ -12218,7 +12232,8 @@ void devlink_compat_running_version(struct devlink *devlink,
> 		return;
> 
> 	devl_lock(devlink);
>-	__devlink_compat_running_version(devlink, buf, len);
>+	if (devl_is_alive(devlink))
>+		__devlink_compat_running_version(devlink, buf, len);
> 	devl_unlock(devlink);
> }
> 
>@@ -12227,20 +12242,28 @@ int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
> 	struct devlink_flash_update_params params = {};
> 	int ret;
> 
>-	if (!devlink->ops->flash_update)
>-		return -EOPNOTSUPP;
>+	devl_lock(devlink);
>+	if (!devl_is_alive(devlink)) {
>+		ret = -ENODEV;
>+		goto out_unlock;
>+	}
>+
>+	if (!devlink->ops->flash_update) {
>+		ret = -EOPNOTSUPP;
>+		goto out_unlock;
>+	}
> 
> 	ret = request_firmware(&params.fw, file_name, devlink->dev);
> 	if (ret)
>-		return ret;
>+		goto out_unlock;
> 
>-	devl_lock(devlink);
> 	devlink_flash_update_begin_notify(devlink);
> 	ret = devlink->ops->flash_update(devlink, &params, NULL);
> 	devlink_flash_update_end_notify(devlink);
>-	devl_unlock(devlink);
> 
> 	release_firmware(params.fw);
>+out_unlock:
>+	devl_unlock(devlink);
> 
> 	return ret;
> }
>diff --git a/net/devlink/core.c b/net/devlink/core.c
>index d3b8336946fd..2abad8247597 100644
>--- a/net/devlink/core.c
>+++ b/net/devlink/core.c
>@@ -67,6 +67,21 @@ void devl_unlock(struct devlink *devlink)
> }
> EXPORT_SYMBOL_GPL(devl_unlock);
> 
>+bool devl_is_alive(struct devlink *devlink)

Why "alive"? To be consistent with the existing terminology, how about
to name it devl_is_registered()?

Also, "devl_" implicates that it should be called with devlink instance
lock held, so probably devlink_is_registered() would be better.


>+{
>+	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
>+}
>+EXPORT_SYMBOL_GPL(devl_is_alive);
>+
>+/**
>+ * devlink_try_get() - try to obtain a reference on a devlink instance
>+ * @devlink: instance to reference
>+ *
>+ * Obtain a reference on a devlink instance. A reference on a devlink instance
>+ * only implies that it's safe to take the instance lock. It does not imply
>+ * that the instance is registered, use devl_is_alive() after taking
>+ * the instance lock to check registration status.
>+ */

This comment is not related to the patch, should be added in a separate
one.


> struct devlink *__must_check devlink_try_get(struct devlink *devlink)
> {
> 	if (refcount_inc_not_zero(&devlink->refcount))
>@@ -300,10 +315,12 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
> 	devlinks_xa_for_each_registered_get(net, index, devlink) {
> 		WARN_ON(!(devlink->features & DEVLINK_F_RELOAD));
> 		devl_lock(devlink);
>-		err = devlink_reload(devlink, &init_net,
>-				     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
>-				     DEVLINK_RELOAD_LIMIT_UNSPEC,
>-				     &actions_performed, NULL);
>+		err = 0;
>+		if (devl_is_alive(devlink))
>+			err = devlink_reload(devlink, &init_net,
>+					     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
>+					     DEVLINK_RELOAD_LIMIT_UNSPEC,
>+					     &actions_performed, NULL);
> 		devl_unlock(devlink);
> 		devlink_put(devlink);
> 
>diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
>index b38df704be1c..773efaabb6ad 100644
>--- a/net/devlink/netlink.c
>+++ b/net/devlink/netlink.c
>@@ -98,7 +98,8 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs)
> 
> 	devlinks_xa_for_each_registered_get(net, index, devlink) {
> 		devl_lock(devlink);
>-		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
>+		if (devl_is_alive(devlink) &&
>+		    strcmp(devlink->dev->bus->name, busname) == 0 &&
> 		    strcmp(dev_name(devlink->dev), devname) == 0)
> 			return devlink;
> 		devl_unlock(devlink);
>@@ -210,7 +211,12 @@ int devlink_instance_iter_dump(struct sk_buff *msg, struct netlink_callback *cb)
> 
> 	devlink_dump_for_each_instance_get(msg, dump, devlink) {
> 		devl_lock(devlink);
>-		err = cmd->dump_one(msg, devlink, cb);
>+
>+		if (devl_is_alive(devlink))
>+			err = cmd->dump_one(msg, devlink, cb);
>+		else
>+			err = 0;
>+
> 		devl_unlock(devlink);
> 		devlink_put(devlink);
> 
>-- 
>2.38.1
>
Jiri Pirko Jan. 2, 2023, 2:57 p.m. UTC | #5
Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:
>Always check under the instance lock whether the devlink instance
>is still / already registered.
>
>This is a no-op for the most part, as the unregistration path currently
>waits for all references. On the init path, however, we may temporarily
>open up a race with netdev code, if netdevs are registered before the
>devlink instance. This is temporary, the next change fixes it, and this
>commit has been split out for the ease of review.
>
>Note that in case of iterating over sub-objects which have their
>own lock (regions and line cards) we assume an implicit dependency
>between those objects existing and devlink unregistration.
>
>Signed-off-by: Jakub Kicinski <kuba@kernel.org>
>---
> include/net/devlink.h |  1 +
> net/devlink/basic.c   | 35 +++++++++++++++++++++++++++++------
> net/devlink/core.c    | 25 +++++++++++++++++++++----
> net/devlink/netlink.c | 10 ++++++++--
> 4 files changed, 59 insertions(+), 12 deletions(-)
>
>diff --git a/include/net/devlink.h b/include/net/devlink.h
>index 6a2e4f21779f..36e013d3aa52 100644
>--- a/include/net/devlink.h
>+++ b/include/net/devlink.h
>@@ -1626,6 +1626,7 @@ struct device *devlink_to_dev(const struct devlink *devlink);
> void devl_lock(struct devlink *devlink);
> int devl_trylock(struct devlink *devlink);
> void devl_unlock(struct devlink *devlink);
>+bool devl_is_alive(struct devlink *devlink);
> void devl_assert_locked(struct devlink *devlink);
> bool devl_lock_is_held(struct devlink *devlink);
> 
>diff --git a/net/devlink/basic.c b/net/devlink/basic.c
>index 5f33d74eef83..6b18e70a39fd 100644
>--- a/net/devlink/basic.c
>+++ b/net/devlink/basic.c
>@@ -2130,6 +2130,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
> 		int idx = 0;
> 
> 		mutex_lock(&devlink->linecards_lock);
>+		if (!devl_is_alive(devlink))
>+			goto next_devlink;


Thinking about this a bit more, things would be cleaner if reporters and
linecards are converted to rely on instance lock as well. I don't see a
good reason for a separate lock in both cases, really.

Also, we could introduce devlinks_xa_for_each_registered_get_lock()
iterator that would lock the instance as well right away to avoid
this devl_is_alive() dance on multiple places when you iterate devlinks.


>+
> 		list_for_each_entry(linecard, &devlink->linecard_list, list) {
> 			if (idx < dump->idx) {
> 				idx++;
>@@ -2151,6 +2154,7 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
> 			}
> 			idx++;
> 		}
>+next_devlink:
> 		mutex_unlock(&devlink->linecards_lock);
> 		devlink_put(devlink);
> 	}
>@@ -7809,6 +7813,12 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
> 		int idx = 0;
> 
> 		mutex_lock(&devlink->reporters_lock);
>+		if (!devl_is_alive(devlink)) {
>+			mutex_unlock(&devlink->reporters_lock);
>+			devlink_put(devlink);
>+			continue;
>+		}
>+
> 		list_for_each_entry(reporter, &devlink->reporter_list,
> 				    list) {
> 			if (idx < dump->idx) {
>@@ -7830,6 +7840,9 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
> 		mutex_unlock(&devlink->reporters_lock);
> 
> 		devl_lock(devlink);
>+		if (!devl_is_alive(devlink))
>+			goto next_devlink;
>+
> 		xa_for_each(&devlink->ports, port_index, port) {
> 			mutex_lock(&port->reporters_lock);
> 			list_for_each_entry(reporter, &port->reporter_list, list) {
>@@ -7853,6 +7866,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
> 			}
> 			mutex_unlock(&port->reporters_lock);
> 		}
>+next_devlink:
> 		devl_unlock(devlink);
> 		devlink_put(devlink);
> 	}
>@@ -12218,7 +12232,8 @@ void devlink_compat_running_version(struct devlink *devlink,
> 		return;
> 
> 	devl_lock(devlink);

How about to have a helper, something like devl_lock_alive() (or
devl_lock_registered() with the naming scheme I suggest in the other
thread)? Then you can do:

	if (!devl_lock_alive(devlink))
		return;
	__devlink_compat_running_version(devlink, buf, len);
	devl_unlock(devlink);



>-	__devlink_compat_running_version(devlink, buf, len);
>+	if (devl_is_alive(devlink))
>+		__devlink_compat_running_version(devlink, buf, len);
> 	devl_unlock(devlink);
> }
> 
>@@ -12227,20 +12242,28 @@ int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
> 	struct devlink_flash_update_params params = {};
> 	int ret;
> 
>-	if (!devlink->ops->flash_update)
>-		return -EOPNOTSUPP;
>+	devl_lock(devlink);
>+	if (!devl_is_alive(devlink)) {
>+		ret = -ENODEV;
>+		goto out_unlock;
>+	}
>+
>+	if (!devlink->ops->flash_update) {
>+		ret = -EOPNOTSUPP;
>+		goto out_unlock;
>+	}
> 
> 	ret = request_firmware(&params.fw, file_name, devlink->dev);
> 	if (ret)
>-		return ret;
>+		goto out_unlock;
> 
>-	devl_lock(devlink);
> 	devlink_flash_update_begin_notify(devlink);
> 	ret = devlink->ops->flash_update(devlink, &params, NULL);
> 	devlink_flash_update_end_notify(devlink);
>-	devl_unlock(devlink);
> 
> 	release_firmware(params.fw);
>+out_unlock:
>+	devl_unlock(devlink);
> 
> 	return ret;
> }
>diff --git a/net/devlink/core.c b/net/devlink/core.c
>index d3b8336946fd..2abad8247597 100644
>--- a/net/devlink/core.c
>+++ b/net/devlink/core.c
>@@ -67,6 +67,21 @@ void devl_unlock(struct devlink *devlink)
> }
> EXPORT_SYMBOL_GPL(devl_unlock);
> 
>+bool devl_is_alive(struct devlink *devlink)
>+{
>+	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
>+}
>+EXPORT_SYMBOL_GPL(devl_is_alive);
>+
>+/**
>+ * devlink_try_get() - try to obtain a reference on a devlink instance
>+ * @devlink: instance to reference
>+ *
>+ * Obtain a reference on a devlink instance. A reference on a devlink instance
>+ * only implies that it's safe to take the instance lock. It does not imply
>+ * that the instance is registered, use devl_is_alive() after taking
>+ * the instance lock to check registration status.
>+ */
> struct devlink *__must_check devlink_try_get(struct devlink *devlink)
> {
> 	if (refcount_inc_not_zero(&devlink->refcount))
>@@ -300,10 +315,12 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
> 	devlinks_xa_for_each_registered_get(net, index, devlink) {
> 		WARN_ON(!(devlink->features & DEVLINK_F_RELOAD));
> 		devl_lock(devlink);
>-		err = devlink_reload(devlink, &init_net,
>-				     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
>-				     DEVLINK_RELOAD_LIMIT_UNSPEC,
>-				     &actions_performed, NULL);
>+		err = 0;
>+		if (devl_is_alive(devlink))
>+			err = devlink_reload(devlink, &init_net,
>+					     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
>+					     DEVLINK_RELOAD_LIMIT_UNSPEC,
>+					     &actions_performed, NULL);
> 		devl_unlock(devlink);
> 		devlink_put(devlink);
> 
>diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
>index b38df704be1c..773efaabb6ad 100644
>--- a/net/devlink/netlink.c
>+++ b/net/devlink/netlink.c
>@@ -98,7 +98,8 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs)
> 
> 	devlinks_xa_for_each_registered_get(net, index, devlink) {
> 		devl_lock(devlink);
>-		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
>+		if (devl_is_alive(devlink) &&
>+		    strcmp(devlink->dev->bus->name, busname) == 0 &&
> 		    strcmp(dev_name(devlink->dev), devname) == 0)
> 			return devlink;
> 		devl_unlock(devlink);
>@@ -210,7 +211,12 @@ int devlink_instance_iter_dump(struct sk_buff *msg, struct netlink_callback *cb)
> 
> 	devlink_dump_for_each_instance_get(msg, dump, devlink) {
> 		devl_lock(devlink);
>-		err = cmd->dump_one(msg, devlink, cb);
>+
>+		if (devl_is_alive(devlink))
>+			err = cmd->dump_one(msg, devlink, cb);
>+		else
>+			err = 0;
>+
> 		devl_unlock(devlink);
> 		devlink_put(devlink);
> 
>-- 
>2.38.1
>
Jiri Pirko Jan. 2, 2023, 3:12 p.m. UTC | #6
Mon, Jan 02, 2023 at 03:57:24PM CET, jiri@resnulli.us wrote:
>Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:
>>Always check under the instance lock whether the devlink instance
>>is still / already registered.
>>
>>This is a no-op for the most part, as the unregistration path currently
>>waits for all references. On the init path, however, we may temporarily
>>open up a race with netdev code, if netdevs are registered before the
>>devlink instance. This is temporary, the next change fixes it, and this
>>commit has been split out for the ease of review.
>>
>>Note that in case of iterating over sub-objects which have their
>>own lock (regions and line cards) we assume an implicit dependency
>>between those objects existing and devlink unregistration.
>>
>>Signed-off-by: Jakub Kicinski <kuba@kernel.org>
>>---
>> include/net/devlink.h |  1 +
>> net/devlink/basic.c   | 35 +++++++++++++++++++++++++++++------
>> net/devlink/core.c    | 25 +++++++++++++++++++++----
>> net/devlink/netlink.c | 10 ++++++++--
>> 4 files changed, 59 insertions(+), 12 deletions(-)
>>
>>diff --git a/include/net/devlink.h b/include/net/devlink.h
>>index 6a2e4f21779f..36e013d3aa52 100644
>>--- a/include/net/devlink.h
>>+++ b/include/net/devlink.h
>>@@ -1626,6 +1626,7 @@ struct device *devlink_to_dev(const struct devlink *devlink);
>> void devl_lock(struct devlink *devlink);
>> int devl_trylock(struct devlink *devlink);
>> void devl_unlock(struct devlink *devlink);
>>+bool devl_is_alive(struct devlink *devlink);
>> void devl_assert_locked(struct devlink *devlink);
>> bool devl_lock_is_held(struct devlink *devlink);
>> 
>>diff --git a/net/devlink/basic.c b/net/devlink/basic.c
>>index 5f33d74eef83..6b18e70a39fd 100644
>>--- a/net/devlink/basic.c
>>+++ b/net/devlink/basic.c
>>@@ -2130,6 +2130,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
>> 		int idx = 0;
>> 
>> 		mutex_lock(&devlink->linecards_lock);
>>+		if (!devl_is_alive(devlink))
>>+			goto next_devlink;
>
>
>Thinking about this a bit more, things would be cleaner if reporters and
>linecards are converted to rely on instance lock as well. I don't see a
>good reason for a separate lock in both cases, really.
>
>Also, we could introduce devlinks_xa_for_each_registered_get_lock()
>iterator that would lock the instance as well right away to avoid
>this devl_is_alive() dance on multiple places when you iterate devlinks.

devlinks_xa_find_get_locked()
would do the check&lock at the end.



>
>
>>+
>> 		list_for_each_entry(linecard, &devlink->linecard_list, list) {
>> 			if (idx < dump->idx) {
>> 				idx++;

[...]
Jakub Kicinski Jan. 2, 2023, 11:05 p.m. UTC | #7
On Mon, 2 Jan 2023 14:58:16 +0100 Jiri Pirko wrote:
> Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:
> >Always check under the instance lock whether the devlink instance
> >is still / already registered.
> >
> >This is a no-op for the most part, as the unregistration path currently
> >waits for all references. On the init path, however, we may temporarily
> >open up a race with netdev code, if netdevs are registered before the
> >devlink instance. This is temporary, the next change fixes it, and this
> >commit has been split out for the ease of review.
> >
> >Note that in case of iterating over sub-objects which have their
> >own lock (regions and line cards) we assume an implicit dependency
> >between those objects existing and devlink unregistration.  
> 
> This would be probably very valuable to add as a comment inside the code
> for the future reader mind sake.

Where, tho?

I'm strongly against the pointlessly fine-grained locking going forward
so hopefully there won't be any more per-subobject locks added anyway.

> >+bool devl_is_alive(struct devlink *devlink)  
> 
> Why "alive"? To be consistent with the existing terminology, how about
> to name it devl_is_registered()?

I dislike the similarity to device_is_registered() which has very
different semantics. I prefer alive.

> Also, "devl_" implicates that it should be called with devlink instance
> lock held, so probably devlink_is_registered() would be better.

I'm guessing you realized this isn't correct later on.

> >+{
> >+	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
> >+}
> >+EXPORT_SYMBOL_GPL(devl_is_alive);
> >+
> >+/**
> >+ * devlink_try_get() - try to obtain a reference on a devlink instance
> >+ * @devlink: instance to reference
> >+ *
> >+ * Obtain a reference on a devlink instance. A reference on a devlink instance
> >+ * only implies that it's safe to take the instance lock. It does not imply
> >+ * that the instance is registered, use devl_is_alive() after taking
> >+ * the instance lock to check registration status.
> >+ */  
> 
> This comment is not related to the patch, should be added in a separate
> one.

The point of adding this comment is to say that one has to use
devl_is_alive() after accessing an instance by reference.
It is very much in the right patch.
Jakub Kicinski Jan. 2, 2023, 11:16 p.m. UTC | #8
On Mon, 2 Jan 2023 15:57:24 +0100 Jiri Pirko wrote:
> Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:
> >Always check under the instance lock whether the devlink instance
> >is still / already registered.
> >
> >This is a no-op for the most part, as the unregistration path currently
> >waits for all references. On the init path, however, we may temporarily
> >open up a race with netdev code, if netdevs are registered before the
> >devlink instance. This is temporary, the next change fixes it, and this
> >commit has been split out for the ease of review.
> >
> >Note that in case of iterating over sub-objects which have their
> >own lock (regions and line cards) we assume an implicit dependency
> >between those objects existing and devlink unregistration.

> >diff --git a/net/devlink/basic.c b/net/devlink/basic.c
> >index 5f33d74eef83..6b18e70a39fd 100644
> >--- a/net/devlink/basic.c
> >+++ b/net/devlink/basic.c
> >@@ -2130,6 +2130,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
> > 		int idx = 0;
> > 
> > 		mutex_lock(&devlink->linecards_lock);
> >+		if (!devl_is_alive(devlink))
> >+			goto next_devlink;  
> 
> Thinking about this a bit more, things would be cleaner if reporters and
> linecards are converted to rely on instance lock as well. I don't see a
> good reason for a separate lock in both cases, really.

We had discussion before, I'm pretty sure.
IIRC you said that mlx4's locking prevents us from using the instance
lock for regions.

> Also, we could introduce devlinks_xa_for_each_registered_get_lock()
> iterator that would lock the instance as well right away to avoid
> this devl_is_alive() dance on multiple places when you iterate devlinks.

That's what I started with, but the ability to factor our the
unlock/put on error paths made the callback approach much cleaner.
And after using the callback for all the dumps there's only a couple
places which would use devlinks_xa_for_each_registered_get_lock().

> >@@ -12218,7 +12232,8 @@ void devlink_compat_running_version(struct devlink *devlink,
> > 		return;
> > 
> > 	devl_lock(devlink);  
> 
> How about to have a helper, something like devl_lock_alive() (or
> devl_lock_registered() with the naming scheme I suggest in the other
> thread)? Then you can do:
> 
> 	if (!devl_lock_alive(devlink))
> 		return;
> 	__devlink_compat_running_version(devlink, buf, len);
> 	devl_unlock(devlink);

I guess aesthetic preference.

If I had the cycles I'd make devlink_try_get() return a wrapped type

struct devlink_ref {
	struct devlink *devlink;
};

which one would have to pass to devl_lock_from_ref() or some such:

struct devlink *devl_lock_from_ref(struct devlink_ref dref)
{
	if (!dref.devlink)
		return NULL;
	devl_lock(dref.devlink);
	if (devl_lock_alive(dref.devlink))
		return dref.devlink;
	devl_unlock(dref.devlink);
	return NULL;
}

But the number of calls to devl_is_alive() is quite small after all
the cleanup, so I don't think the extra helpers are justified at this
point. "Normal coders" should not be exposed to any of the lifetime
details, not when coding the drivers, not when adding typical devlink
features/subobjects.
Jiri Pirko Jan. 3, 2023, 9:26 a.m. UTC | #9
Tue, Jan 03, 2023 at 12:05:14AM CET, kuba@kernel.org wrote:
>On Mon, 2 Jan 2023 14:58:16 +0100 Jiri Pirko wrote:
>> Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:
>> >Always check under the instance lock whether the devlink instance
>> >is still / already registered.
>> >
>> >This is a no-op for the most part, as the unregistration path currently
>> >waits for all references. On the init path, however, we may temporarily
>> >open up a race with netdev code, if netdevs are registered before the
>> >devlink instance. This is temporary, the next change fixes it, and this
>> >commit has been split out for the ease of review.
>> >
>> >Note that in case of iterating over sub-objects which have their
>> >own lock (regions and line cards) we assume an implicit dependency
>> >between those objects existing and devlink unregistration.  
>> 
>> This would be probably very valuable to add as a comment inside the code
>> for the future reader mind sake.
>
>Where, tho?
>
>I'm strongly against the pointlessly fine-grained locking going forward
>so hopefully there won't be any more per-subobject locks added anyway.

Agreed. That is what I suggested in the other thread too.


>
>> >+bool devl_is_alive(struct devlink *devlink)  
>> 
>> Why "alive"? To be consistent with the existing terminology, how about
>> to name it devl_is_registered()?
>
>I dislike the similarity to device_is_registered() which has very
>different semantics. I prefer alive.

Interesting. Didn't occur to me to look into device.h when reading
devlink.c code. I mean, is device_register() behaviour in sync with
devlink_register?

Your alive() helper is checking "register mark". It's an odd and unneded
inconsistency in newly added code :/


>
>> Also, "devl_" implicates that it should be called with devlink instance
>> lock held, so probably devlink_is_registered() would be better.
>
>I'm guessing you realized this isn't correct later on.

From what I see, no need to hold instance mutex for xa mark checking,
alhough I understand why you want the helper to be called with the lock.
Perhaps assert and a little comment would make this clear?


>
>> >+{
>> >+	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
>> >+}
>> >+EXPORT_SYMBOL_GPL(devl_is_alive);
>> >+
>> >+/**
>> >+ * devlink_try_get() - try to obtain a reference on a devlink instance
>> >+ * @devlink: instance to reference
>> >+ *
>> >+ * Obtain a reference on a devlink instance. A reference on a devlink instance
>> >+ * only implies that it's safe to take the instance lock. It does not imply
>> >+ * that the instance is registered, use devl_is_alive() after taking
>> >+ * the instance lock to check registration status.
>> >+ */  
>> 
>> This comment is not related to the patch, should be added in a separate
>> one.
>
>The point of adding this comment is to say that one has to use
>devl_is_alive() after accessing an instance by reference.
>It is very much in the right patch.

Gotha! My mistake, sorry.
Jiri Pirko Jan. 3, 2023, 9:30 a.m. UTC | #10
Tue, Jan 03, 2023 at 12:16:30AM CET, kuba@kernel.org wrote:
>On Mon, 2 Jan 2023 15:57:24 +0100 Jiri Pirko wrote:
>> Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:
>> >Always check under the instance lock whether the devlink instance
>> >is still / already registered.
>> >
>> >This is a no-op for the most part, as the unregistration path currently
>> >waits for all references. On the init path, however, we may temporarily
>> >open up a race with netdev code, if netdevs are registered before the
>> >devlink instance. This is temporary, the next change fixes it, and this
>> >commit has been split out for the ease of review.
>> >
>> >Note that in case of iterating over sub-objects which have their
>> >own lock (regions and line cards) we assume an implicit dependency
>> >between those objects existing and devlink unregistration.
>
>> >diff --git a/net/devlink/basic.c b/net/devlink/basic.c
>> >index 5f33d74eef83..6b18e70a39fd 100644
>> >--- a/net/devlink/basic.c
>> >+++ b/net/devlink/basic.c
>> >@@ -2130,6 +2130,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
>> > 		int idx = 0;
>> > 
>> > 		mutex_lock(&devlink->linecards_lock);
>> >+		if (!devl_is_alive(devlink))
>> >+			goto next_devlink;  
>> 
>> Thinking about this a bit more, things would be cleaner if reporters and
>> linecards are converted to rely on instance lock as well. I don't see a
>> good reason for a separate lock in both cases, really.
>
>We had discussion before, I'm pretty sure.
>IIRC you said that mlx4's locking prevents us from using the instance
>lock for regions.

Yeah, let me check it out again. For the linecards, that could be done.
Let me take care of these.


>
>> Also, we could introduce devlinks_xa_for_each_registered_get_lock()
>> iterator that would lock the instance as well right away to avoid
>> this devl_is_alive() dance on multiple places when you iterate devlinks.
>
>That's what I started with, but the ability to factor our the
>unlock/put on error paths made the callback approach much cleaner.
>And after using the callback for all the dumps there's only a couple
>places which would use devlinks_xa_for_each_registered_get_lock().

I see. Okay.


>
>> >@@ -12218,7 +12232,8 @@ void devlink_compat_running_version(struct devlink *devlink,
>> > 		return;
>> > 
>> > 	devl_lock(devlink);  
>> 
>> How about to have a helper, something like devl_lock_alive() (or
>> devl_lock_registered() with the naming scheme I suggest in the other
>> thread)? Then you can do:
>> 
>> 	if (!devl_lock_alive(devlink))
>> 		return;
>> 	__devlink_compat_running_version(devlink, buf, len);
>> 	devl_unlock(devlink);
>
>I guess aesthetic preference.
>
>If I had the cycles I'd make devlink_try_get() return a wrapped type
>
>struct devlink_ref {
>	struct devlink *devlink;
>};
>
>which one would have to pass to devl_lock_from_ref() or some such:
>
>struct devlink *devl_lock_from_ref(struct devlink_ref dref)
>{
>	if (!dref.devlink)
>		return NULL;
>	devl_lock(dref.devlink);
>	if (devl_lock_alive(dref.devlink))
>		return dref.devlink;
>	devl_unlock(dref.devlink);
>	return NULL;
>}
>
>But the number of calls to devl_is_alive() is quite small after all
>the cleanup, so I don't think the extra helpers are justified at this
>point. "Normal coders" should not be exposed to any of the lifetime
>details, not when coding the drivers, not when adding typical devlink
>features/subobjects.

Fair point.
Jiri Pirko Jan. 3, 2023, 12:26 p.m. UTC | #11
Sat, Dec 17, 2022 at 02:19:47AM CET, kuba@kernel.org wrote:

[...]


>+bool devl_is_alive(struct devlink *devlink)
>+{
>+	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
>+}
>+EXPORT_SYMBOL_GPL(devl_is_alive);

Why is this exported? Drivers should not use this, as you said.
Jakub Kicinski Jan. 4, 2023, 2:49 a.m. UTC | #12
On Tue, 3 Jan 2023 10:26:12 +0100 Jiri Pirko wrote:
> >> Why "alive"? To be consistent with the existing terminology, how about
> >> to name it devl_is_registered()?  
> >
> >I dislike the similarity to device_is_registered() which has very
> >different semantics. I prefer alive.  
> 
> Interesting. Didn't occur to me to look into device.h when reading
> devlink.c code. I mean, is device_register() behaviour in sync with
> devlink_register?
> 
> Your alive() helper is checking "register mark". It's an odd and unneded
> inconsistency in newly added code :/

Alright.

> >> Also, "devl_" implicates that it should be called with devlink instance
> >> lock held, so probably devlink_is_registered() would be better.  
> >
> >I'm guessing you realized this isn't correct later on.  
> 
> From what I see, no need to hold instance mutex for xa mark checking,
> alhough I understand why you want the helper to be called with the lock.
> Perhaps assert and a little comment would make this clear?

I'll add the comment. The assert would have to OR holding the subobject
locks. Is that what you had in mind?
Jakub Kicinski Jan. 4, 2023, 2:50 a.m. UTC | #13
On Tue, 3 Jan 2023 13:26:30 +0100 Jiri Pirko wrote:
> >+bool devl_is_alive(struct devlink *devlink)
> >+{
> >+	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
> >+}
> >+EXPORT_SYMBOL_GPL(devl_is_alive);  
> 
> Why is this exported? Drivers should not use this, as you said.

I'll make it a static inline in the internal header.
Jiri Pirko Jan. 4, 2023, 4:14 p.m. UTC | #14
Wed, Jan 04, 2023 at 03:49:59AM CET, kuba@kernel.org wrote:
>On Tue, 3 Jan 2023 10:26:12 +0100 Jiri Pirko wrote:
>> >> Why "alive"? To be consistent with the existing terminology, how about
>> >> to name it devl_is_registered()?  
>> >
>> >I dislike the similarity to device_is_registered() which has very
>> >different semantics. I prefer alive.  
>> 
>> Interesting. Didn't occur to me to look into device.h when reading
>> devlink.c code. I mean, is device_register() behaviour in sync with
>> devlink_register?
>> 
>> Your alive() helper is checking "register mark". It's an odd and unneded
>> inconsistency in newly added code :/
>
>Alright.
>
>> >> Also, "devl_" implicates that it should be called with devlink instance
>> >> lock held, so probably devlink_is_registered() would be better.  
>> >
>> >I'm guessing you realized this isn't correct later on.  
>> 
>> From what I see, no need to hold instance mutex for xa mark checking,
>> alhough I understand why you want the helper to be called with the lock.
>> Perhaps assert and a little comment would make this clear?
>
>I'll add the comment. The assert would have to OR holding the subobject
>locks. Is that what you had in mind?

Ah right, the subobject locks. That will go away I'm sure. Yes, that is
what I had in mind. After that, we can put assert here.
diff mbox series

Patch

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 6a2e4f21779f..36e013d3aa52 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1626,6 +1626,7 @@  struct device *devlink_to_dev(const struct devlink *devlink);
 void devl_lock(struct devlink *devlink);
 int devl_trylock(struct devlink *devlink);
 void devl_unlock(struct devlink *devlink);
+bool devl_is_alive(struct devlink *devlink);
 void devl_assert_locked(struct devlink *devlink);
 bool devl_lock_is_held(struct devlink *devlink);
 
diff --git a/net/devlink/basic.c b/net/devlink/basic.c
index 5f33d74eef83..6b18e70a39fd 100644
--- a/net/devlink/basic.c
+++ b/net/devlink/basic.c
@@ -2130,6 +2130,9 @@  static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
 		int idx = 0;
 
 		mutex_lock(&devlink->linecards_lock);
+		if (!devl_is_alive(devlink))
+			goto next_devlink;
+
 		list_for_each_entry(linecard, &devlink->linecard_list, list) {
 			if (idx < dump->idx) {
 				idx++;
@@ -2151,6 +2154,7 @@  static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
 			}
 			idx++;
 		}
+next_devlink:
 		mutex_unlock(&devlink->linecards_lock);
 		devlink_put(devlink);
 	}
@@ -7809,6 +7813,12 @@  devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 		int idx = 0;
 
 		mutex_lock(&devlink->reporters_lock);
+		if (!devl_is_alive(devlink)) {
+			mutex_unlock(&devlink->reporters_lock);
+			devlink_put(devlink);
+			continue;
+		}
+
 		list_for_each_entry(reporter, &devlink->reporter_list,
 				    list) {
 			if (idx < dump->idx) {
@@ -7830,6 +7840,9 @@  devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 		mutex_unlock(&devlink->reporters_lock);
 
 		devl_lock(devlink);
+		if (!devl_is_alive(devlink))
+			goto next_devlink;
+
 		xa_for_each(&devlink->ports, port_index, port) {
 			mutex_lock(&port->reporters_lock);
 			list_for_each_entry(reporter, &port->reporter_list, list) {
@@ -7853,6 +7866,7 @@  devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 			}
 			mutex_unlock(&port->reporters_lock);
 		}
+next_devlink:
 		devl_unlock(devlink);
 		devlink_put(devlink);
 	}
@@ -12218,7 +12232,8 @@  void devlink_compat_running_version(struct devlink *devlink,
 		return;
 
 	devl_lock(devlink);
-	__devlink_compat_running_version(devlink, buf, len);
+	if (devl_is_alive(devlink))
+		__devlink_compat_running_version(devlink, buf, len);
 	devl_unlock(devlink);
 }
 
@@ -12227,20 +12242,28 @@  int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
 	struct devlink_flash_update_params params = {};
 	int ret;
 
-	if (!devlink->ops->flash_update)
-		return -EOPNOTSUPP;
+	devl_lock(devlink);
+	if (!devl_is_alive(devlink)) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (!devlink->ops->flash_update) {
+		ret = -EOPNOTSUPP;
+		goto out_unlock;
+	}
 
 	ret = request_firmware(&params.fw, file_name, devlink->dev);
 	if (ret)
-		return ret;
+		goto out_unlock;
 
-	devl_lock(devlink);
 	devlink_flash_update_begin_notify(devlink);
 	ret = devlink->ops->flash_update(devlink, &params, NULL);
 	devlink_flash_update_end_notify(devlink);
-	devl_unlock(devlink);
 
 	release_firmware(params.fw);
+out_unlock:
+	devl_unlock(devlink);
 
 	return ret;
 }
diff --git a/net/devlink/core.c b/net/devlink/core.c
index d3b8336946fd..2abad8247597 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -67,6 +67,21 @@  void devl_unlock(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devl_unlock);
 
+bool devl_is_alive(struct devlink *devlink)
+{
+	return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+}
+EXPORT_SYMBOL_GPL(devl_is_alive);
+
+/**
+ * devlink_try_get() - try to obtain a reference on a devlink instance
+ * @devlink: instance to reference
+ *
+ * Obtain a reference on a devlink instance. A reference on a devlink instance
+ * only implies that it's safe to take the instance lock. It does not imply
+ * that the instance is registered, use devl_is_alive() after taking
+ * the instance lock to check registration status.
+ */
 struct devlink *__must_check devlink_try_get(struct devlink *devlink)
 {
 	if (refcount_inc_not_zero(&devlink->refcount))
@@ -300,10 +315,12 @@  static void __net_exit devlink_pernet_pre_exit(struct net *net)
 	devlinks_xa_for_each_registered_get(net, index, devlink) {
 		WARN_ON(!(devlink->features & DEVLINK_F_RELOAD));
 		devl_lock(devlink);
-		err = devlink_reload(devlink, &init_net,
-				     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
-				     DEVLINK_RELOAD_LIMIT_UNSPEC,
-				     &actions_performed, NULL);
+		err = 0;
+		if (devl_is_alive(devlink))
+			err = devlink_reload(devlink, &init_net,
+					     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+					     DEVLINK_RELOAD_LIMIT_UNSPEC,
+					     &actions_performed, NULL);
 		devl_unlock(devlink);
 		devlink_put(devlink);
 
diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
index b38df704be1c..773efaabb6ad 100644
--- a/net/devlink/netlink.c
+++ b/net/devlink/netlink.c
@@ -98,7 +98,8 @@  devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs)
 
 	devlinks_xa_for_each_registered_get(net, index, devlink) {
 		devl_lock(devlink);
-		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+		if (devl_is_alive(devlink) &&
+		    strcmp(devlink->dev->bus->name, busname) == 0 &&
 		    strcmp(dev_name(devlink->dev), devname) == 0)
 			return devlink;
 		devl_unlock(devlink);
@@ -210,7 +211,12 @@  int devlink_instance_iter_dump(struct sk_buff *msg, struct netlink_callback *cb)
 
 	devlink_dump_for_each_instance_get(msg, dump, devlink) {
 		devl_lock(devlink);
-		err = cmd->dump_one(msg, devlink, cb);
+
+		if (devl_is_alive(devlink))
+			err = cmd->dump_one(msg, devlink, cb);
+		else
+			err = 0;
+
 		devl_unlock(devlink);
 		devlink_put(devlink);