diff mbox series

wifi: wilc1000: validate chip id during bus probe

Message ID 20240122220350.1449413-1-davidm@egauge.net (mailing list archive)
State Changes Requested
Delegated to: Kalle Valo
Headers show
Series wifi: wilc1000: validate chip id during bus probe | expand

Commit Message

David Mosberger-Tang Jan. 22, 2024, 10:03 p.m. UTC
Previously, the driver created a net device (typically wlan0) as soon
as the module was loaded.  This commit changes the driver to follow
normal Linux convention of creating the net device only when bus
probing detects a supported chip.

Signed-off-by: David Mosberger-Tang <davidm@egauge.net>
---
V2 -> V3: Add missing forward declarations, actually :-(

 drivers/net/wireless/microchip/wilc1000/spi.c | 133 ++++++++++++------
 .../net/wireless/microchip/wilc1000/wlan.h    |   1 +
 2 files changed, 90 insertions(+), 44 deletions(-)

Comments

Alexis Lothoré (eBPF Foundation) Jan. 23, 2024, 10:18 a.m. UTC | #1
On 1/22/24 23:03, David Mosberger-Tang wrote:
> Previously, the driver created a net device (typically wlan0) as soon
> as the module was loaded.  This commit changes the driver to follow
> normal Linux convention of creating the net device only when bus
> probing detects a supported chip.

I would gladly help review/test the patch, but please give us some time between
versions to take a look (even if you can mention if you found issues yourself).
Also, each version should be a separate thread, bearing the new version in the
"Subject" line.
Additionally (to answer your cover letter), the patches must target the wireless
branches (likely wireless-testing), not linux-next
(https://wireless.wiki.kernel.org/en/developers/documentation/git-guide)

> Signed-off-by: David Mosberger-Tang <davidm@egauge.net>
> ---
> V2 -> V3: Add missing forward declarations, actually :-(
> 
>  drivers/net/wireless/microchip/wilc1000/spi.c | 133 ++++++++++++------
>  .../net/wireless/microchip/wilc1000/wlan.h    |   1 +
>  2 files changed, 90 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/net/wireless/microchip/wilc1000/spi.c b/drivers/net/wireless/microchip/wilc1000/spi.c
> index 77b4cdff73c3..dd6935dc1bc9 100644
> --- a/drivers/net/wireless/microchip/wilc1000/spi.c
> +++ b/drivers/net/wireless/microchip/wilc1000/spi.c
> @@ -42,7 +42,7 @@ MODULE_PARM_DESC(enable_crc16,
>  #define WILC_SPI_RSP_HDR_EXTRA_DATA	8
>  
>  struct wilc_spi {
> -	bool isinit;		/* true if SPI protocol has been configured */
> +	bool isinit;		/* true if wilc_spi_init was successful */
>  	bool probing_crc;	/* true if we're probing chip's CRC config */
>  	bool crc7_enabled;	/* true if crc7 is currently enabled */
>  	bool crc16_enabled;	/* true if crc16 is currently enabled */
> @@ -55,6 +55,8 @@ struct wilc_spi {
>  static const struct wilc_hif_func wilc_hif_spi;
>  
>  static int wilc_spi_reset(struct wilc *wilc);
> +static int wilc_spi_configure_bus_protocol(struct wilc *wilc);
> +static int wilc_validate_chipid(struct wilc *wilc);
>  
>  /********************************************
>   *
> @@ -232,6 +234,22 @@ static int wilc_bus_probe(struct spi_device *spi)
>  	}
>  	clk_prepare_enable(wilc->rtc_clk);
>  
> +	/* we need power to configure the bus protocol and to read the chip id: */
> +
> +	wilc_wlan_power(wilc, true);
> +
> +	ret = wilc_spi_configure_bus_protocol(wilc);
> +
> +	if (ret == 0)
> +		ret = wilc_validate_chipid(wilc);
> +
> +	wilc_wlan_power(wilc, false);
> +
> +	if (ret) {
> +		ret = -ENODEV;
> +		goto netdev_cleanup;

I have a working wilc-over-spi setup with which I can easily unplug the module,
so I gave a try to your series, and while the lack of chip detect indeed makes
the netdevice registration not executed, I've got a nasty kasan warning:

 driver_probe_device from __driver_attach+0x1a0/0x29c



                                                 [141/1863]
 __driver_attach from bus_for_each_dev+0xf0/0x14c
 bus_for_each_dev from bus_add_driver+0x130/0x288
 bus_add_driver from driver_register+0xd4/0x1c0
 driver_register from do_one_initcall+0xfc/0x204
 do_one_initcall from kernel_init_freeable+0x240/0x2a0
 kernel_init_freeable from kernel_init+0x20/0x144
 kernel_init from ret_from_fork+0x14/0x28
Exception stack(0xc3163fb0 to 0xc3163ff8)
3fa0:                                     00000000 00000000 00000000 00000000
3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
3fe0: 00000000 00000000 00000000 00000000 00000013 00000000

Allocated by task 1:
 kasan_set_track+0x3c/0x5c
 __kasan_kmalloc+0x8c/0x94
 __kmalloc_node+0x64/0x184
 kvmalloc_node+0x48/0x114
 alloc_netdev_mqs+0x68/0x664
 alloc_etherdev_mqs+0x28/0x34
 wilc_netdev_ifc_init+0x34/0x37c
 wilc_cfg80211_init+0x278/0x330
 wilc_bus_probe+0xb4/0x398
 spi_probe+0xb8/0xdc
 really_probe+0x134/0x588
 __driver_probe_device+0xe0/0x288
 driver_probe_device+0x60/0x118
 __driver_attach+0x1a0/0x29c
 bus_for_each_dev+0xf0/0x14c
 bus_add_driver+0x130/0x288
 driver_register+0xd4/0x1c0
 do_one_initcall+0xfc/0x204
 kernel_init_freeable+0x240/0x2a0
 kernel_init+0x20/0x144
 ret_from_fork+0x14/0x28

Freed by task 1:
 kasan_set_track+0x3c/0x5c
 kasan_save_free_info+0x30/0x3c
 __kasan_slab_free+0xe4/0x12c
 __kmem_cache_free+0x94/0x1cc
 device_release+0x54/0xf8
 kobject_put+0xf4/0x238
 netdev_run_todo+0x414/0x7dc
 wilc_netdev_cleanup+0xe4/0x244
 wilc_bus_probe+0x2b8/0x398
 spi_probe+0xb8/0xdc
 really_probe+0x134/0x588
 __driver_probe_device+0xe0/0x288
 driver_probe_device+0x60/0x118
 __driver_attach+0x1a0/0x29c
 bus_for_each_dev+0xf0/0x14c
 bus_add_driver+0x130/0x288
 driver_register+0xd4/0x1c0
 do_one_initcall+0xfc/0x204
 kernel_init_freeable+0x240/0x2a0
 kernel_init+0x20/0x144
 ret_from_fork+0x14/0x28

It looks like an already existing/dormant issue in the error-managing path of
spi probe of the driver (looks like we are trying to unregister a netdevice
which has never been registered ?), but since your series triggers it, it should
be handled too.

> +	}
> +
>  	return 0;
>  
>  netdev_cleanup:
> @@ -1074,58 +1092,43 @@ static int wilc_spi_write(struct wilc *wilc, u32 addr, u8 *buf, u32 size)
>   *
>   ********************************************/
>  
> -static int wilc_spi_reset(struct wilc *wilc)
> +static const char *
> +strbool(bool val)

I am not convinced we need a dedicated helper just to print boolean values (and
why the super short line break ?)
Also, such change should likely be in a separate patch since it generate quite
some lines of diff but none of those being about the initial topic

>  {
> -	struct spi_device *spi = to_spi_device(wilc->dev);
> -	struct wilc_spi *spi_priv = wilc->bus_data;
> -	int result;
> -
> -	result = wilc_spi_special_cmd(wilc, CMD_RESET);
> -	if (result && !spi_priv->probing_crc)
> -		dev_err(&spi->dev, "Failed cmd reset\n");
> -
> -	return result;
> -}
> -
> -static bool wilc_spi_is_init(struct wilc *wilc)
> -{
> -	struct wilc_spi *spi_priv = wilc->bus_data;
> -
> -	return spi_priv->isinit;
> +	return val ? "on" : "off";
>  }
>  
> -static int wilc_spi_deinit(struct wilc *wilc)
> +static int wilc_validate_chipid(struct wilc *wilc)
>  {
> -	struct wilc_spi *spi_priv = wilc->bus_data;
> +	struct spi_device *spi = to_spi_device(wilc->dev);
> +	u32 chipid, base_id;
> +	int ret;
>  
> -	spi_priv->isinit = false;
> -	wilc_wlan_power(wilc, false);
> +	/*
> +	 * make sure can read chip id without protocol error
> +	 */
> +	ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
> +	if (ret) {
> +		dev_err(&spi->dev, "Fail cmd read chip id...\n");
> +		return ret;
> +	}
> +	base_id = chipid & ~WILC_CHIP_REV_FIELD;
> +	if (base_id != WILC_1000_BASE_ID && base_id != WILC_3000_BASE_ID) {

- WILC3000 is currently not supported (yet) by the upstream driver, so there is
no reason to validate its presence if we can not handle it later. Any mention of
it should then be removed from this series
- I see that there is already a helper to handle masking and check chip id in
wlan.c (is_wilc1000). You should likely reuse this/avoid the duplication

> +		dev_err(&spi->dev, "Unknown chip id 0x%x\n", chipid);
> +		return ret;
> +	}
> +	dev_info(&spi->dev, "Detected chip id 0x%x (crc7=%s, crc16=%s)\n",
> +		 chipid, strbool(enable_crc7), strbool(enable_crc16));
>  	return 0;
>  }
>  
> -static int wilc_spi_init(struct wilc *wilc, bool resume)
> +static int wilc_spi_configure_bus_protocol(struct wilc *wilc)
>  {
>  	struct spi_device *spi = to_spi_device(wilc->dev);
>  	struct wilc_spi *spi_priv = wilc->bus_data;
>  	u32 reg;
> -	u32 chipid;
>  	int ret, i;
>  
> -	if (spi_priv->isinit) {
> -		/* Confirm we can read chipid register without error: */
> -		ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
> -		if (ret == 0)
> -			return 0;
> -
> -		dev_err(&spi->dev, "Fail cmd read chip id...\n");
> -	}
> -
> -	wilc_wlan_power(wilc, true);
> -
> -	/*
> -	 * configure protocol
> -	 */
> -
>  	/*
>  	 * Infer the CRC settings that are currently in effect.  This
>  	 * is necessary because we can't be sure that the chip has
> @@ -1176,12 +1179,54 @@ static int wilc_spi_init(struct wilc *wilc, bool resume)
>  
>  	spi_priv->probing_crc = false;
>  
> -	/*
> -	 * make sure can read chip id without protocol error
> -	 */
> -	ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
> +	return 0;
> +}
> +
> +static int wilc_spi_reset(struct wilc *wilc)
> +{
> +	struct spi_device *spi = to_spi_device(wilc->dev);
> +	struct wilc_spi *spi_priv = wilc->bus_data;
> +	int result;
> +
> +	result = wilc_spi_special_cmd(wilc, CMD_RESET);
> +	if (result && !spi_priv->probing_crc)
> +		dev_err(&spi->dev, "Failed cmd reset\n");
> +
> +	return result;
> +}
> +
> +static bool wilc_spi_is_init(struct wilc *wilc)
> +{
> +	struct wilc_spi *spi_priv = wilc->bus_data;
> +
> +	return spi_priv->isinit;
> +}
> +
> +static int wilc_spi_deinit(struct wilc *wilc)
> +{
> +	struct wilc_spi *spi_priv = wilc->bus_data;
> +
> +	spi_priv->isinit = false;
> +	wilc_wlan_power(wilc, false);
> +	return 0;
> +}
> +
> +static int wilc_spi_init(struct wilc *wilc, bool resume)
> +{
> +	struct wilc_spi *spi_priv = wilc->bus_data;
> +	int ret;
> +
> +	if (spi_priv->isinit) {

Ok, so indeed in this new version of the patch, the flag still makes sense for
upper layers.

> +		/* Confirm we can read chipid register without error: */
> +		if (wilc_validate_chipid(wilc) == 0)
> +			return 0;
> +	}
> +
> +	wilc_wlan_power(wilc, true);
> +
> +	ret = wilc_spi_configure_bus_protocol(wilc);
>  	if (ret) {
> -		dev_err(&spi->dev, "Fail cmd read chip id...\n");
> +		wilc_wlan_power(wilc, false);
>  		return ret;
>  	}
>  
> diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.h b/drivers/net/wireless/microchip/wilc1000/wlan.h
> index a72cd5cac81d..43dda9f0d9ca 100644
> --- a/drivers/net/wireless/microchip/wilc1000/wlan.h
> +++ b/drivers/net/wireless/microchip/wilc1000/wlan.h
> @@ -182,6 +182,7 @@
>  #define WILC_CORTUS_BOOT_FROM_IRAM	0x71
>  
>  #define WILC_1000_BASE_ID		0x100000
> +#define WILC_3000_BASE_ID		0x300000

See comment above for WILC3000

>  
>  #define WILC_1000_BASE_ID_2A		0x1002A0
>  #define WILC_1000_BASE_ID_2A_REV1	(WILC_1000_BASE_ID_2A + 1)
Kalle Valo Jan. 23, 2024, 11:06 a.m. UTC | #2
Alexis Lothoré <alexis.lothore@bootlin.com> writes:

> On 1/22/24 23:03, David Mosberger-Tang wrote:
>> Previously, the driver created a net device (typically wlan0) as soon
>> as the module was loaded.  This commit changes the driver to follow
>> normal Linux convention of creating the net device only when bus
>> probing detects a supported chip.
>
> I would gladly help review/test the patch, but please give us some time between
> versions to take a look (even if you can mention if you found issues yourself).
> Also, each version should be a separate thread, bearing the new version in the
> "Subject" line.
> Additionally (to answer your cover letter), the patches must target the wireless
> branches (likely wireless-testing), not linux-next
> (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide)

Actually wireless-next is preferred for the baseline (unless it's a fix
going to -rc releases):

https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git/
Alexis Lothoré (eBPF Foundation) Jan. 23, 2024, 12:44 p.m. UTC | #3
On 1/23/24 12:06, Kalle Valo wrote:
> Alexis Lothoré <alexis.lothore@bootlin.com> writes:
> 
>> On 1/22/24 23:03, David Mosberger-Tang wrote:
>>> Previously, the driver created a net device (typically wlan0) as soon
>>> as the module was loaded.  This commit changes the driver to follow
>>> normal Linux convention of creating the net device only when bus
>>> probing detects a supported chip.
>>
>> I would gladly help review/test the patch, but please give us some time between
>> versions to take a look (even if you can mention if you found issues yourself).
>> Also, each version should be a separate thread, bearing the new version in the
>> "Subject" line.
>> Additionally (to answer your cover letter), the patches must target the wireless
>> branches (likely wireless-testing), not linux-next
>> (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide)
> 
> Actually wireless-next is preferred for the baseline (unless it's a fix
> going to -rc releases):
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git/

Oh, ok, thanks for the correction, I may have misinterpreted the wiki then
Kalle Valo Jan. 23, 2024, 12:59 p.m. UTC | #4
Alexis Lothoré <alexis.lothore@bootlin.com> writes:

> On 1/23/24 12:06, Kalle Valo wrote:
>> Alexis Lothoré <alexis.lothore@bootlin.com> writes:
>> 
>>> On 1/22/24 23:03, David Mosberger-Tang wrote:
>>>> Previously, the driver created a net device (typically wlan0) as soon
>>>> as the module was loaded.  This commit changes the driver to follow
>>>> normal Linux convention of creating the net device only when bus
>>>> probing detects a supported chip.
>>>
>>> I would gladly help review/test the patch, but please give us some time between
>>> versions to take a look (even if you can mention if you found issues yourself).
>>> Also, each version should be a separate thread, bearing the new version in the
>>> "Subject" line.
>>> Additionally (to answer your cover letter), the patches must target the wireless
>>> branches (likely wireless-testing), not linux-next
>>> (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide)
>> 
>> Actually wireless-next is preferred for the baseline (unless it's a fix
>> going to -rc releases):
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git/
>
> Oh, ok, thanks for the correction, I may have misinterpreted the wiki then

Ah, we should update that page. That page was written before we had
common wireless and wireless-next trees.

I don't know Johannes thoughts on this but my recommendation for
baseline:

* use wireless tree for important fixes going to -rc releases

* for other patches use either driver specific tree (eg. iwlwifi, mt76,
  ath) or wireless-next (if no driver specific tree available)

* for automated testing etc. use wireless-testing as it's a merge of
  wireless and wireless-next and contains all latest code
Alexis Lothoré (eBPF Foundation) Jan. 23, 2024, 1:29 p.m. UTC | #5
On 1/23/24 13:59, Kalle Valo wrote:
> Alexis Lothoré <alexis.lothore@bootlin.com> writes:
> 
>> On 1/23/24 12:06, Kalle Valo wrote:
>>> Alexis Lothoré <alexis.lothore@bootlin.com> writes:
>>>
>>>> On 1/22/24 23:03, David Mosberger-Tang wrote:
>>>>> Previously, the driver created a net device (typically wlan0) as soon
>>>>> as the module was loaded.  This commit changes the driver to follow
>>>>> normal Linux convention of creating the net device only when bus
>>>>> probing detects a supported chip.
>>>>
>>>> I would gladly help review/test the patch, but please give us some time between
>>>> versions to take a look (even if you can mention if you found issues yourself).
>>>> Also, each version should be a separate thread, bearing the new version in the
>>>> "Subject" line.
>>>> Additionally (to answer your cover letter), the patches must target the wireless
>>>> branches (likely wireless-testing), not linux-next
>>>> (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide)
>>>
>>> Actually wireless-next is preferred for the baseline (unless it's a fix
>>> going to -rc releases):
>>>
>>> https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git/
>>
>> Oh, ok, thanks for the correction, I may have misinterpreted the wiki then
> 
> Ah, we should update that page. That page was written before we had
> common wireless and wireless-next trees.
> 
> I don't know Johannes thoughts on this but my recommendation for
> baseline:
> 
> * use wireless tree for important fixes going to -rc releases
> 
> * for other patches use either driver specific tree (eg. iwlwifi, mt76,
>   ath) or wireless-next (if no driver specific tree available)
> 
> * for automated testing etc. use wireless-testing as it's a merge of
>   wireless and wireless-next and contains all latest code

Thanks for the details. I'll wait a bit in case Johannes or anyone else wants to
add anything, then I can take care of updating the corresponding page
David Mosberger-Tang Jan. 23, 2024, 5:39 p.m. UTC | #6
On Tue, 2024-01-23 at 11:18 +0100, Alexis Lothoré wrote:
> On 1/22/24 23:03, David Mosberger-Tang wrote:
> > Previously, the driver created a net device (typically wlan0) as soon
> > as the module was loaded.  This commit changes the driver to follow
> > normal Linux convention of creating the net device only when bus
> > probing detects a supported chip.
> 
> I would gladly help review/test the patch, but please give us some time between
> versions to take a look (even if you can mention if you found issues yourself).
> Also, each version should be a separate thread, bearing the new version in the
> "Subject" line.
> Additionally (to answer your cover letter), the patches must target the wireless
> branches (likely wireless-testing), not linux-next
> (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide)

Yeah, sorry about that.

> I have a working wilc-over-spi setup with which I can easily unplug the module,
> so I gave a try to your series, and while the lack of chip detect indeed makes
> the netdevice registration not executed, I've got a nasty kasan warning:
> 
>  driver_probe_device from __driver_attach+0x1a0/0x29c
> 
> 
> 
>                                                  [141/1863]
>  __driver_attach from bus_for_each_dev+0xf0/0x14c
>  bus_for_each_dev from bus_add_driver+0x130/0x288
>  bus_add_driver from driver_register+0xd4/0x1c0
>  driver_register from do_one_initcall+0xfc/0x204
>  do_one_initcall from kernel_init_freeable+0x240/0x2a0
>  kernel_init_freeable from kernel_init+0x20/0x144
>  kernel_init from ret_from_fork+0x14/0x28
> Exception stack(0xc3163fb0 to 0xc3163ff8)
> 3fa0:                                     00000000 00000000 00000000 00000000
> 3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
> 3fe0: 00000000 00000000 00000000 00000000 00000013 00000000
> 
> Allocated by task 1:
>  kasan_set_track+0x3c/0x5c
>  __kasan_kmalloc+0x8c/0x94
>  __kmalloc_node+0x64/0x184
>  kvmalloc_node+0x48/0x114
>  alloc_netdev_mqs+0x68/0x664
>  alloc_etherdev_mqs+0x28/0x34
>  wilc_netdev_ifc_init+0x34/0x37c
>  wilc_cfg80211_init+0x278/0x330
>  wilc_bus_probe+0xb4/0x398
>  spi_probe+0xb8/0xdc
>  really_probe+0x134/0x588
>  __driver_probe_device+0xe0/0x288
>  driver_probe_device+0x60/0x118
>  __driver_attach+0x1a0/0x29c
>  bus_for_each_dev+0xf0/0x14c
>  bus_add_driver+0x130/0x288
>  driver_register+0xd4/0x1c0
>  do_one_initcall+0xfc/0x204
>  kernel_init_freeable+0x240/0x2a0
>  kernel_init+0x20/0x144
>  ret_from_fork+0x14/0x28
> 
> Freed by task 1:
>  kasan_set_track+0x3c/0x5c
>  kasan_save_free_info+0x30/0x3c
>  __kasan_slab_free+0xe4/0x12c
>  __kmem_cache_free+0x94/0x1cc
>  device_release+0x54/0xf8
>  kobject_put+0xf4/0x238
>  netdev_run_todo+0x414/0x7dc
>  wilc_netdev_cleanup+0xe4/0x244
>  wilc_bus_probe+0x2b8/0x398
>  spi_probe+0xb8/0xdc
>  really_probe+0x134/0x588
>  __driver_probe_device+0xe0/0x288
>  driver_probe_device+0x60/0x118
>  __driver_attach+0x1a0/0x29c
>  bus_for_each_dev+0xf0/0x14c
>  bus_add_driver+0x130/0x288
>  driver_register+0xd4/0x1c0
>  do_one_initcall+0xfc/0x204
>  kernel_init_freeable+0x240/0x2a0
>  kernel_init+0x20/0x144
>  ret_from_fork+0x14/0x28
> 
> It looks like an already existing/dormant issue in the error-managing path of
> spi probe of the driver (looks like we are trying to unregister a netdevice
> which has never been registered ?), but since your series triggers it, it should
> be handled too.

I need help interpreting this.  What does KASAN actually complain about?  A
double free or something else?

register_netdev() does get called (through wilc_cfg80211_init()) and then when
the chip detect fails, unregister_netdev() gets called (from
wilc_netdev_cleanup()), so I don't see any obvious issues, but there is a lot of
other stuff going on there that I'm not familiar with.

> 
> > +	}
> > +
> >  	return 0;
> >  
> >  netdev_cleanup:
> > @@ -1074,58 +1092,43 @@ static int wilc_spi_write(struct wilc *wilc, u32 addr, u8 *buf, u32 size)
> >   *
> >   ********************************************/
> >  
> > -static int wilc_spi_reset(struct wilc *wilc)
> > +static const char *
> > +strbool(bool val)
> 
> I am not convinced we need a dedicated helper just to print boolean values (and
> why the super short line break ?)

Sure, I can remove that.

> Also, such change should likely be in a separate patch since it generate quite
> some lines of diff but none of those being about the initial topic

Sure, I can rearrange the order of the functions to mimize the size of the diff.

> >  {
> > -	struct spi_device *spi = to_spi_device(wilc->dev);
> > -	struct wilc_spi *spi_priv = wilc->bus_data;
> > -	int result;
> > -
> > -	result = wilc_spi_special_cmd(wilc, CMD_RESET);
> > -	if (result && !spi_priv->probing_crc)
> > -		dev_err(&spi->dev, "Failed cmd reset\n");
> > -
> > -	return result;
> > -}
> > -
> > -static bool wilc_spi_is_init(struct wilc *wilc)
> > -{
> > -	struct wilc_spi *spi_priv = wilc->bus_data;
> > -
> > -	return spi_priv->isinit;
> > +	return val ? "on" : "off";
> >  }
> >  
> > -static int wilc_spi_deinit(struct wilc *wilc)
> > +static int wilc_validate_chipid(struct wilc *wilc)
> >  {
> > -	struct wilc_spi *spi_priv = wilc->bus_data;
> > +	struct spi_device *spi = to_spi_device(wilc->dev);
> > +	u32 chipid, base_id;
> > +	int ret;
> >  
> > -	spi_priv->isinit = false;
> > -	wilc_wlan_power(wilc, false);
> > +	/*
> > +	 * make sure can read chip id without protocol error
> > +	 */
> > +	ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
> > +	if (ret) {
> > +		dev_err(&spi->dev, "Fail cmd read chip id...\n");
> > +		return ret;
> > +	}
> > +	base_id = chipid & ~WILC_CHIP_REV_FIELD;
> > +	if (base_id != WILC_1000_BASE_ID && base_id != WILC_3000_BASE_ID) {
> 
> - WILC3000 is currently not supported (yet) by the upstream driver, so there is
> no reason to validate its presence if we can not handle it later. Any mention of
> it should then be removed from this series

Oh, I didn't realize that.  I was just going off of this web page:

 https://www.microchip.com/en-us/software-library/wilc1000_3000_linux_driver

as I never played with WILC3000.

> - I see that there is already a helper to handle masking and check chip id in
> wlan.c (is_wilc1000). You should likely reuse this/avoid the duplication

Sure, it'll need to be an exported symbol, but other than that, it's fine.

> 
> > +		dev_err(&spi->dev, "Unknown chip id 0x%x\n", chipid);
> > +		return ret;
> > +	}
> > +	dev_info(&spi->dev, "Detected chip id 0x%x (crc7=%s, crc16=%s)\n",
> > +		 chipid, strbool(enable_crc7), strbool(enable_crc16));
> >  	return 0;
> >  }
> >  
> > -static int wilc_spi_init(struct wilc *wilc, bool resume)
> > +static int wilc_spi_configure_bus_protocol(struct wilc *wilc)
> >  {
> >  	struct spi_device *spi = to_spi_device(wilc->dev);
> >  	struct wilc_spi *spi_priv = wilc->bus_data;
> >  	u32 reg;
> > -	u32 chipid;
> >  	int ret, i;
> >  
> > -	if (spi_priv->isinit) {
> > -		/* Confirm we can read chipid register without error: */
> > -		ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
> > -		if (ret == 0)
> > -			return 0;
> > -
> > -		dev_err(&spi->dev, "Fail cmd read chip id...\n");
> > -	}
> > -
> > -	wilc_wlan_power(wilc, true);
> > -
> > -	/*
> > -	 * configure protocol
> > -	 */
> > -
> >  	/*
> >  	 * Infer the CRC settings that are currently in effect.  This
> >  	 * is necessary because we can't be sure that the chip has
> > @@ -1176,12 +1179,54 @@ static int wilc_spi_init(struct wilc *wilc, bool resume)
> >  
> >  	spi_priv->probing_crc = false;
> >  
> > -	/*
> > -	 * make sure can read chip id without protocol error
> > -	 */
> > -	ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
> > +	return 0;
> > +}
> > +
> > +static int wilc_spi_reset(struct wilc *wilc)
> > +{
> > +	struct spi_device *spi = to_spi_device(wilc->dev);
> > +	struct wilc_spi *spi_priv = wilc->bus_data;
> > +	int result;
> > +
> > +	result = wilc_spi_special_cmd(wilc, CMD_RESET);
> > +	if (result && !spi_priv->probing_crc)
> > +		dev_err(&spi->dev, "Failed cmd reset\n");
> > +
> > +	return result;
> > +}
> > +
> > +static bool wilc_spi_is_init(struct wilc *wilc)
> > +{
> > +	struct wilc_spi *spi_priv = wilc->bus_data;
> > +
> > +	return spi_priv->isinit;
> > +}
> > +
> > +static int wilc_spi_deinit(struct wilc *wilc)
> > +{
> > +	struct wilc_spi *spi_priv = wilc->bus_data;
> > +
> > +	spi_priv->isinit = false;
> > +	wilc_wlan_power(wilc, false);
> > +	return 0;
> > +}
> > +
> > +static int wilc_spi_init(struct wilc *wilc, bool resume)
> > +{
> > +	struct wilc_spi *spi_priv = wilc->bus_data;
> > +	int ret;
> > +
> > +	if (spi_priv->isinit) {
> 
> Ok, so indeed in this new version of the patch, the flag still makes sense for
> upper layers.
> 
> > +		/* Confirm we can read chipid register without error: */
> > +		if (wilc_validate_chipid(wilc) == 0)
> > +			return 0;
> > +	}
> > +
> > +	wilc_wlan_power(wilc, true);
> > +
> > +	ret = wilc_spi_configure_bus_protocol(wilc);
> >  	if (ret) {
> > -		dev_err(&spi->dev, "Fail cmd read chip id...\n");
> > +		wilc_wlan_power(wilc, false);
> >  		return ret;
> >  	}
> >  
> > diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.h b/drivers/net/wireless/microchip/wilc1000/wlan.h
> > index a72cd5cac81d..43dda9f0d9ca 100644
> > --- a/drivers/net/wireless/microchip/wilc1000/wlan.h
> > +++ b/drivers/net/wireless/microchip/wilc1000/wlan.h
> > @@ -182,6 +182,7 @@
> >  #define WILC_CORTUS_BOOT_FROM_IRAM	0x71
> >  
> >  #define WILC_1000_BASE_ID		0x100000
> > +#define WILC_3000_BASE_ID		0x300000
> 
> See comment above for WILC3000
> 
> >  
> >  #define WILC_1000_BASE_ID_2A		0x1002A0
> >  #define WILC_1000_BASE_ID_2A_REV1	(WILC_1000_BASE_ID_2A + 1)
>
Alexis Lothoré (eBPF Foundation) Jan. 24, 2024, 9:01 a.m. UTC | #7
On 1/23/24 18:39, David Mosberger-Tang wrote:
> On Tue, 2024-01-23 at 11:18 +0100, Alexis Lothoré wrote:
>> On 1/22/24 23:03, David Mosberger-Tang wrote:

[...]

>> I have a working wilc-over-spi setup with which I can easily unplug the module,
>> so I gave a try to your series, and while the lack of chip detect indeed makes
>> the netdevice registration not executed, I've got a nasty kasan warning:
>>
>>  driver_probe_device from __driver_attach+0x1a0/0x29c
>>
>>
>>
>>                                                  [141/1863]
>>  __driver_attach from bus_for_each_dev+0xf0/0x14c
>>  bus_for_each_dev from bus_add_driver+0x130/0x288
>>  bus_add_driver from driver_register+0xd4/0x1c0
>>  driver_register from do_one_initcall+0xfc/0x204
>>  do_one_initcall from kernel_init_freeable+0x240/0x2a0
>>  kernel_init_freeable from kernel_init+0x20/0x144
>>  kernel_init from ret_from_fork+0x14/0x28
>> Exception stack(0xc3163fb0 to 0xc3163ff8)
>> 3fa0:                                     00000000 00000000 00000000 00000000
>> 3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
>> 3fe0: 00000000 00000000 00000000 00000000 00000013 00000000
>>
>> Allocated by task 1:
>>  kasan_set_track+0x3c/0x5c
>>  __kasan_kmalloc+0x8c/0x94
>>  __kmalloc_node+0x64/0x184
>>  kvmalloc_node+0x48/0x114
>>  alloc_netdev_mqs+0x68/0x664
>>  alloc_etherdev_mqs+0x28/0x34
>>  wilc_netdev_ifc_init+0x34/0x37c
>>  wilc_cfg80211_init+0x278/0x330
>>  wilc_bus_probe+0xb4/0x398
>>  spi_probe+0xb8/0xdc
>>  really_probe+0x134/0x588
>>  __driver_probe_device+0xe0/0x288
>>  driver_probe_device+0x60/0x118
>>  __driver_attach+0x1a0/0x29c
>>  bus_for_each_dev+0xf0/0x14c
>>  bus_add_driver+0x130/0x288
>>  driver_register+0xd4/0x1c0
>>  do_one_initcall+0xfc/0x204
>>  kernel_init_freeable+0x240/0x2a0
>>  kernel_init+0x20/0x144
>>  ret_from_fork+0x14/0x28
>>
>> Freed by task 1:
>>  kasan_set_track+0x3c/0x5c
>>  kasan_save_free_info+0x30/0x3c
>>  __kasan_slab_free+0xe4/0x12c
>>  __kmem_cache_free+0x94/0x1cc
>>  device_release+0x54/0xf8
>>  kobject_put+0xf4/0x238
>>  netdev_run_todo+0x414/0x7dc
>>  wilc_netdev_cleanup+0xe4/0x244
>>  wilc_bus_probe+0x2b8/0x398
>>  spi_probe+0xb8/0xdc
>>  really_probe+0x134/0x588
>>  __driver_probe_device+0xe0/0x288
>>  driver_probe_device+0x60/0x118
>>  __driver_attach+0x1a0/0x29c
>>  bus_for_each_dev+0xf0/0x14c
>>  bus_add_driver+0x130/0x288
>>  driver_register+0xd4/0x1c0
>>  do_one_initcall+0xfc/0x204
>>  kernel_init_freeable+0x240/0x2a0
>>  kernel_init+0x20/0x144
>>  ret_from_fork+0x14/0x28
>>
>> It looks like an already existing/dormant issue in the error-managing path of
>> spi probe of the driver (looks like we are trying to unregister a netdevice
>> which has never been registered ?), but since your series triggers it, it should
>> be handled too.
> 
> I need help interpreting this.  What does KASAN actually complain about?  A
> double free or something else?

I see that the kasan dump from my last email is truncated, but the first line
clearly mentions a use-after-free:

==================================================================
BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0
Read of size 4 at addr c3c91ce8 by task swapper/1

CPU: 0 PID: 1 Comm: swapper Not tainted 6.7.0-wt+ #843
Hardware name: Atmel SAMA5
 unwind_backtrace from show_stack+0x18/0x1c
 show_stack from dump_stack_lvl+0x34/0x48
 dump_stack_lvl from print_report+0x154/0x500
 print_report from kasan_report+0xd8/0x100
 kasan_report from wilc_netdev_cleanup+0x294/0x2c0
 wilc_netdev_cleanup from wilc_bus_probe+0x2b8/0x398
 wilc_bus_probe from spi_probe+0xb8/0xdc
 spi_probe from really_probe+0x134/0x588
 really_probe from __driver_probe_device+0xe0/0x288
 __driver_probe_device from driver_probe_device+0x60/0x118
 driver_probe_device from __driver_attach+0x1a0/0x29c
 __driver_attach from bus_for_each_dev+0xf0/0x14c
 bus_for_each_dev from bus_add_driver+0x130/0x288
 bus_add_driver from driver_register+0xd4/0x1c0
 driver_register from do_one_initcall+0xfc/0x204
 do_one_initcall from kernel_init_freeable+0x240/0x2a0
 kernel_init_freeable from kernel_init+0x20/0x144
 kernel_init from ret_from_fork+0x14/0x28

Not sure though what's wrong without digging further.

> register_netdev() does get called (through wilc_cfg80211_init()) and then when
> the chip detect fails, unregister_netdev() gets called (from
> wilc_netdev_cleanup()), so I don't see any obvious issues, but there is a lot of
> other stuff going on there that I'm not familiar with.

My bad, your statement made me realize I overlooked things here: aside from the
kasan warning, your patch makes the probe function do the following steps:
- create and register (wiphy and) netdevice
- check if chip is detected on bus
- unregister/clean up everything if chip does not respond

There's no point in pre-registering the netdevice so early if we add an error
path due to chip being absent, I would even say that the whole point of your
series is to prevent registering the device if chip can not be accessed. So IMHO
chip detection should be done before trying to register the netdevice. It will
prevent all the complications due to the whole reverse unregistration (and all
weird issues that can occur because of device being registered while not being
fully ready)

>>> +	if (base_id != WILC_1000_BASE_ID && base_id != WILC_3000_BASE_ID) {
>>
>> - WILC3000 is currently not supported (yet) by the upstream driver, so there is
>> no reason to validate its presence if we can not handle it later. Any mention of
>> it should then be removed from this series
> 
> Oh, I didn't realize that.  I was just going off of this web page:
> 
>  https://www.microchip.com/en-us/software-library/wilc1000_3000_linux_driver

Understood, but again, your patch must be based on upstream trees :)
David Mosberger-Tang Jan. 24, 2024, 4:15 p.m. UTC | #8
On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote:
> On 1/23/24 18:39, David Mosberger-Tang wrote:
> 
> > 
> > > What does KASAN actually complain about?  A double free or something else?
> 
> I see that the kasan dump from my last email is truncated, but the first line
> clearly mentions a use-after-free:
> 
> ==================================================================
> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0

Ah, that's helpful, thanks!  Can you map wilc_netdev_cleanup+0x294 to the
corresponding source line?  Are you on ARM64 or something else?

  --david
David Mosberger-Tang Jan. 24, 2024, 5:31 p.m. UTC | #9
Alexis,

On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote:
> ==================================================================
> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0
> Read of size 4 at addr c3c91ce8 by task swapper/1

OK, I think I see what's going on: it's the list traversal.  Here is what
wilc_netdev_cleanup() does:

	list_for_each_entry_rcu(vif, &wilc->vif_list, list) {
		if (vif->ndev)
			unregister_netdev(vif->ndev);
	}

The problem is that "vif" is the private part of the netdev, so when the netdev
is freed, the vif structure is no longer valid and list_for_each_entry_rcu()
ends up dereferencing a dangling pointer.

Ajay or Alexis, could you propose a fix for this - this is not something I'm
familiar with.

Thanks!

  --david
David Mosberger-Tang Jan. 24, 2024, 6:45 p.m. UTC | #10
On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote:
> Alexis,
> 
> On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote:
> > ==================================================================
> > BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0
> > Read of size 4 at addr c3c91ce8 by task swapper/1
> 
> OK, I think I see what's going on: it's the list traversal.  Here is what
> wilc_netdev_cleanup() does:
> 
> 	list_for_each_entry_rcu(vif, &wilc->vif_list, list) {
> 		if (vif->ndev)
> 			unregister_netdev(vif->ndev);
> 	}
> 
> The problem is that "vif" is the private part of the netdev, so when the netdev
> is freed, the vif structure is no longer valid and list_for_each_entry_rcu()
> ends up dereferencing a dangling pointer.
> 
> Ajay or Alexis, could you propose a fix for this - this is not something I'm
> familiar with.

Actually, after staring at the code a little longer, is there anything wrong
with delaying the unregister_netdev() call until after the vif has been removed
from the vif-list?  Something along the lines of the below.

  --david

diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.c
b/drivers/net/wireless/microchip/wilc1000/netdev.c
index 0bf6aef4661e..e78e7d971243 100644
--- a/drivers/net/wireless/microchip/wilc1000/netdev.c
+++ b/drivers/net/wireless/microchip/wilc1000/netdev.c
@@ -884,7 +884,7 @@ static const struct net_device_ops wilc_netdev_ops = {
 void wilc_netdev_cleanup(struct wilc *wilc)
 {
 	struct wilc_vif *vif;
-	int srcu_idx, ifc_cnt = 0;
+	int ifc_cnt = 0;
 
 	if (!wilc)
 		return;
@@ -894,13 +894,6 @@ void wilc_netdev_cleanup(struct wilc *wilc)
 		wilc->firmware = NULL;
 	}
 
-	srcu_idx = srcu_read_lock(&wilc->srcu);
-	list_for_each_entry_rcu(vif, &wilc->vif_list, list) {
-		if (vif->ndev)
-			unregister_netdev(vif->ndev);
-	}
-	srcu_read_unlock(&wilc->srcu, srcu_idx);
-
 	wilc_wfi_deinit_mon_interface(wilc, false);
 	destroy_workqueue(wilc->hif_workqueue);
 
@@ -918,6 +911,10 @@ void wilc_netdev_cleanup(struct wilc *wilc)
 		mutex_unlock(&wilc->vif_mutex);
 		synchronize_srcu(&wilc->srcu);
 		ifc_cnt++;
+
+		if (vif->ndev)
+			/* vif gets freed as part of this call: */
+			unregister_netdev(vif->ndev);
 	}
 
 	wilc_wlan_cfg_deinit(wilc);
Ajay Singh Jan. 25, 2024, 6:23 a.m. UTC | #11
On 1/24/24 11:45, David Mosberger-Tang wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
> 
> On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote:
>> Alexis,
>>
>> On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote:
>>> ==================================================================
>>> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0
>>> Read of size 4 at addr c3c91ce8 by task swapper/1
>>
>> OK, I think I see what's going on: it's the list traversal.  Here is what
>> wilc_netdev_cleanup() does:
>>
>>       list_for_each_entry_rcu(vif, &wilc->vif_list, list) {
>>               if (vif->ndev)
>>                       unregister_netdev(vif->ndev);
>>       }
>>
>> The problem is that "vif" is the private part of the netdev, so when the netdev
>> is freed, the vif structure is no longer valid and list_for_each_entry_rcu()
>> ends up dereferencing a dangling pointer.
>>
>> Ajay or Alexis, could you propose a fix for this - this is not something I'm
>> familiar with.
> 
> Actually, after staring at the code a little longer, is there anything wrong
> with delaying the unregister_netdev() call until after the vif has been removed
> from the vif-list?  Something along the lines of the below.

I think we need to investigate the actual reason for Kasan warning.
First, we need to confirm if this warning exists without the patch(test
by simulating a force error in wilc_bus_probe()). When
wilc_netdev_cleanup() is also called from wilc_bus_remove(), I believe
this warning was not observed. Once it is confirmed, the fix can be done
accordingly.

Avoiding netdev initialization in wilc_cfg80211_init() would require lot
of modification and changes in API call order. IMO the Kasan warning fix
should be independent of netdev initialization order and it should
free-up the resources for all cases. Suppose if the card is not present,
the probe API should clean-up and exit gracefully. For detecting the
chip_id, I have created the below patch considering the above scenarios,
please check if it makes sense.


--- a/drivers/net/wireless/microchip/wilc1000/spi.c

+++ b/drivers/net/wireless/microchip/wilc1000/spi.c

@@ -225,6 +225,11 @@ static int wilc_bus_probe(struct spi_device *spi)

        if (ret < 0)

                goto netdev_cleanup;



+       if (!is_wilc_chip_connected(wilc)) {

+               ret = -ENODEV;

+               goto netdev_cleanup;

+       }

+

        wilc->rtc_clk = devm_clk_get_optional(&spi->dev, "rtc");

        if (IS_ERR(wilc->rtc_clk)) {

                ret = PTR_ERR(wilc->rtc_clk);

diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.c
b/drivers/net/wireless/microchip/wilc1000/wlan.c
index 6b2f2269ddf8..6f95456b053b 100644

--- a/drivers/net/wireless/microchip/wilc1000/wlan.c

+++ b/drivers/net/wireless/microchip/wilc1000/wlan.c

@@ -1537,3 +1537,22 @@ int wilc_wlan_init(struct net_device *dev)



        return ret;

 }

+

+bool is_wilc_chip_connected(struct wilc *wl)

+{

+       u32 chipid = 0;

+       int ret;

+

+       acquire_bus(wl, WILC_BUS_ACQUIRE_ONLY);

+       ret = wl->hif_func->hif_init(wl, false);

+       if (!ret) {

+               wl->hif_func->hif_read_reg(wl, WILC_CHIPID, &chipid);

+               wl->hif_func->hif_deinit(wl);

+       }

+        release_bus(wl, WILC_BUS_RELEASE_ONLY);

+        if (ret || !is_wilc1000(chipid))

+               return false;

+

+       return true;

+}

+EXPORT_SYMBOL_GPL(is_wilc_chip_connected);

diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.h
b/drivers/net/wireless/microchip/wilc1000/wlan.h
index f02775f7e41f..8585dda0790c 100644

--- a/drivers/net/wireless/microchip/wilc1000/wlan.h

+++ b/drivers/net/wireless/microchip/wilc1000/wlan.h

@@ -440,4 +440,5 @@ int wilc_send_config_pkt(struct wilc_vif *vif, u8
mode, struct wid *wids,
                         u32 count);

 int wilc_wlan_init(struct net_device *dev);

 u32 wilc_get_chipid(struct wilc *wilc, bool update);

+bool is_wilc_chip_connected(struct wilc *wilc);

 #endif

--


Regards,
Ajay
Alexis Lothoré (eBPF Foundation) Jan. 25, 2024, 11:04 a.m. UTC | #12
On 1/25/24 07:23, Ajay.Kathat@microchip.com wrote:
> On 1/24/24 11:45, David Mosberger-Tang wrote:
>> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
>>
>> On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote:
>>> Alexis,
>>>
>>> On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote:
>>>> ==================================================================
>>>> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0
>>>> Read of size 4 at addr c3c91ce8 by task swapper/1

Replying a bit late to your initial questions:
- I am running an ARM32 platform (SAMA5D27)
- for the wilc_netdev_cleanup line, the 0x294 offset indeed points to
list_for_each_entry_rcu(vif, &wilc->vif_list, list) in my case, but you seemed
to have identified this already.

>>>
>>> OK, I think I see what's going on: it's the list traversal.  Here is what
>>> wilc_netdev_cleanup() does:
>>>
>>>       list_for_each_entry_rcu(vif, &wilc->vif_list, list) {
>>>               if (vif->ndev)
>>>                       unregister_netdev(vif->ndev);
>>>       }
>>>
>>> The problem is that "vif" is the private part of the netdev, so when the netdev
>>> is freed, the vif structure is no longer valid and list_for_each_entry_rcu()
>>> ends up dereferencing a dangling pointer.

Your diagnostic sounds relevant :)

>>> Ajay or Alexis, could you propose a fix for this - this is not something I'm
>>> familiar with.
>>
>> Actually, after staring at the code a little longer, is there anything wrong
>> with delaying the unregister_netdev() call until after the vif has been removed
>> from the vif-list?  Something along the lines of the below.

I guess you _could_ do something like this, but I think you have to be very
careful about potential races. If I'm not wrong the following could happen:
- you enter the wilc_netdev_cleanup and start removing vifs from list
- meanwhile, because your net device is still registered, userspace can start
calling concurrently some cgf80211_ops
- some of those ops may try to get the vif matching your netdevice from the
list, but it is not there anymore

Maybe some rtnl or wiphy lock (used from cfg80211 core or net core) will save
you from this for some of wilc_netdev_cleanup calls, but I think that won't be
true for the one in the error path of the probe chain.

> I think we need to investigate the actual reason for Kasan warning.
> First, we need to confirm if this warning exists without the patch(test
> by simulating a force error in wilc_bus_probe()). When
> wilc_netdev_cleanup() is also called from wilc_bus_remove(), I believe
> this warning was not observed. Once it is confirmed, the fix can be done
> accordingly.

It happens too in wilc_bus_remove:
echo "spi0.1" > /sys/bus/spi/drivers/wilc1000_spi/unbind

==================================================================
BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0xf0/0x244
Read of size 4 at addr c3c91ce8 by task sh/91

CPU: 0 PID: 91 Comm: sh Not tainted 6.7.0-wt+ #845
Hardware name: Atmel SAMA5
 unwind_backtrace from show_stack+0x18/0x1c
 show_stack from dump_stack_lvl+0x34/0x48
 dump_stack_lvl from print_report+0x154/0x500
 print_report from kasan_report+0xd8/0x100
 kasan_report from wilc_netdev_cleanup+0xf0/0x244
 wilc_netdev_cleanup from wilc_bus_remove+0x50/0x5c
 wilc_bus_remove from spi_remove+0x40/0x50
 spi_remove from device_release_driver_internal+0x21c/0x2ac
 device_release_driver_internal from unbind_store+0x70/0xac
 unbind_store from kernfs_fop_write_iter+0x1a0/0x284
 kernfs_fop_write_iter from vfs_write+0x38c/0x6f4
 vfs_write from ksys_write+0xd8/0x178
 ksys_write from ret_fast_syscall+0x0/0x1c
Exception stack(0xc588ffa8 to 0xc588fff0)
ffa0:                   00000007 004eff68 00000001 004eff68 00000007 00000001
ffc0: 00000007 004eff68 00000001 00000004 00000007 b69546c0 00000020 004ed190
ffe0: 00000004 b6954678 aec3a041 aebd1a26

> Avoiding netdev initialization in wilc_cfg80211_init() would require lot
> of modification and changes in API call order. IMO the Kasan warning fix
> should be independent of netdev initialization order and it should
> free-up the resources for all cases. Suppose if the card is not present,
> the probe API should clean-up and exit gracefully. For detecting the
> chip_id, I have created the below patch considering the above scenarios,
> please check if it makes sense.

Agree about the wilc_netdev_cleanup fix being a separated topic, to be handled
accordingly.
Still, the scenario I mention above, if true, makes me more confident that we
should not register at all the netdevice before being able to manage it. Maybe
those cases are already handled correctly with some checks to make sure no real
crash occurs, but all those checks could be avoided if we did not register the
netdevice so early

Alexis
Ajay Singh Jan. 25, 2024, 5:15 p.m. UTC | #13
On 1/25/24 04:04, Alexis Lothoré wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
> 
> On 1/25/24 07:23, Ajay.Kathat@microchip.com wrote:
>> On 1/24/24 11:45, David Mosberger-Tang wrote:
>>> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
>>>
>>> On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote:
>>>> Alexis,
>>>>
>>>> On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote:
>>>>> ==================================================================
>>>>> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0
>>>>> Read of size 4 at addr c3c91ce8 by task swapper/1
> 
> Replying a bit late to your initial questions:
> - I am running an ARM32 platform (SAMA5D27)
> - for the wilc_netdev_cleanup line, the 0x294 offset indeed points to
> list_for_each_entry_rcu(vif, &wilc->vif_list, list) in my case, but you seemed
> to have identified this already.
> 
>>>>
>>>> OK, I think I see what's going on: it's the list traversal.  Here is what
>>>> wilc_netdev_cleanup() does:
>>>>
>>>>       list_for_each_entry_rcu(vif, &wilc->vif_list, list) {
>>>>               if (vif->ndev)
>>>>                       unregister_netdev(vif->ndev);
>>>>       }
>>>>
>>>> The problem is that "vif" is the private part of the netdev, so when the netdev
>>>> is freed, the vif structure is no longer valid and list_for_each_entry_rcu()
>>>> ends up dereferencing a dangling pointer.
> 
> Your diagnostic sounds relevant :)
> 
>>>> Ajay or Alexis, could you propose a fix for this - this is not something I'm
>>>> familiar with.
>>>
>>> Actually, after staring at the code a little longer, is there anything wrong
>>> with delaying the unregister_netdev() call until after the vif has been removed
>>> from the vif-list?  Something along the lines of the below.
> 
> I guess you _could_ do something like this, but I think you have to be very
> careful about potential races. If I'm not wrong the following could happen:
> - you enter the wilc_netdev_cleanup and start removing vifs from list
> - meanwhile, because your net device is still registered, userspace can start
> calling concurrently some cgf80211_ops

IFAIK cfg80211_ops shouldn't get called until the netdev interface is
up(ifconfig wlan0 up). In the probe, only the netdev interface is
allocated and cfg80211_ops is registered and the cfg80211_ops callback
should be called when the interface is up.

> - some of those ops may try to get the vif matching your netdevice from the
> list, but it is not there anymore
> 
I have not tested this by enabling Kasan configuration so I haven't
observe this issue yet. As I understand, the warning(use-after-free) is
reported for the below line in wilc_netdev_cleanup() so that must be the
code where used-after-free warning is reported. Isn't it.

>>>>       list_for_each_entry_rcu(vif, &wilc->vif_list, list) {


> Maybe some rtnl or wiphy lock (used from cfg80211 core or net core) will save
> you from this for some of wilc_netdev_cleanup calls, but I think that won't be
> true for the one in the error path of the probe chain.
> 
>> I think we need to investigate the actual reason for Kasan warning.
>> First, we need to confirm if this warning exists without the patch(test
>> by simulating a force error in wilc_bus_probe()). When
>> wilc_netdev_cleanup() is also called from wilc_bus_remove(), I believe
>> this warning was not observed. Once it is confirmed, the fix can be done
>> accordingly.
> 
> It happens too in wilc_bus_remove:
> echo "spi0.1" > /sys/bus/spi/drivers/wilc1000_spi/unbind

Okay, so it confirms that this warning is not related to the probe
function.

> 
> ==================================================================
> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0xf0/0x244
> Read of size 4 at addr c3c91ce8 by task sh/91
> 
> CPU: 0 PID: 91 Comm: sh Not tainted 6.7.0-wt+ #845
> Hardware name: Atmel SAMA5
>  unwind_backtrace from show_stack+0x18/0x1c
>  show_stack from dump_stack_lvl+0x34/0x48
>  dump_stack_lvl from print_report+0x154/0x500
>  print_report from kasan_report+0xd8/0x100
>  kasan_report from wilc_netdev_cleanup+0xf0/0x244
>  wilc_netdev_cleanup from wilc_bus_remove+0x50/0x5c
>  wilc_bus_remove from spi_remove+0x40/0x50
>  spi_remove from device_release_driver_internal+0x21c/0x2ac
>  device_release_driver_internal from unbind_store+0x70/0xac
>  unbind_store from kernfs_fop_write_iter+0x1a0/0x284
>  kernfs_fop_write_iter from vfs_write+0x38c/0x6f4
>  vfs_write from ksys_write+0xd8/0x178
>  ksys_write from ret_fast_syscall+0x0/0x1c
> Exception stack(0xc588ffa8 to 0xc588fff0)
> ffa0:                   00000007 004eff68 00000001 004eff68 00000007 00000001
> ffc0: 00000007 004eff68 00000001 00000004 00000007 b69546c0 00000020 004ed190
> ffe0: 00000004 b6954678 aec3a041 aebd1a26
> 
>> Avoiding netdev initialization in wilc_cfg80211_init() would require lot
>> of modification and changes in API call order. IMO the Kasan warning fix
>> should be independent of netdev initialization order and it should
>> free-up the resources for all cases. Suppose if the card is not present,
>> the probe API should clean-up and exit gracefully. For detecting the
>> chip_id, I have created the below patch considering the above scenarios,
>> please check if it makes sense.
> 
> Agree about the wilc_netdev_cleanup fix being a separated topic, to be handled
> accordingly.
> Still, the scenario I mention above, if true, makes me more confident that we
> should not register at all the netdevice before being able to manage it. Maybe
> those cases are already handled correctly with some checks to make sure no real
> crash occurs, but all those checks could be avoided if we did not register the
> netdevice so early
> neering

I think this issue is not related to early registration of netdevice
since same behavior was observed with "echo "spi0.1" >
/sys/bus/spi/drivers/wilc1000_spi/unbind" command.

If needed, the netdevice allocation can be delayed until other
conditions(resources) are allocated/successful. But it is also possible
that the other conditions(resource) are successful but netdevice
allocation gets failed, in that case allocating other resources before
may not be correct. For the success case, all the condition should succeed.
Currently the driver initialization has a order that already invokes
"netdev_clean:" in wilc_bus_probe() for failure cases, so it should free
up the resources completely. Additionally, this warning was reported at
run time(not only in probe). I believe if it is fix in
wilc_netdev_cleanup() then it will cover for all the scenarios.

Regards,
Ajay
David Mosberger-Tang Jan. 25, 2024, 7:17 p.m. UTC | #14
On Thu, 2024-01-25 at 12:04 +0100, Alexis Lothoré wrote:
> On 1/25/24 07:23, Ajay.Kathat@microchip.com wrote:
> > On 1/24/24 11:45, David Mosberger-Tang wrote:
> > 
> > > > 
> > > > OK, I think I see what's going on: it's the list traversal.  Here is what
> > > > wilc_netdev_cleanup() does:
> > > > 
> > > >       list_for_each_entry_rcu(vif, &wilc->vif_list, list) {
> > > >               if (vif->ndev)
> > > >                       unregister_netdev(vif->ndev);
> > > >       }
> > > > 
> > > > The problem is that "vif" is the private part of the netdev, so when the netdev
> > > > is freed, the vif structure is no longer valid and list_for_each_entry_rcu()
> > > > ends up dereferencing a dangling pointer.
> 
> Your diagnostic sounds relevant :)

Yeah, it's definitely what's going on.  And it's not just the list traversal:
afterwards, wilc_netdev_cleanup() continues to access the vif structure while
removing them from the vif-list.

I think the original idea of calling unregister_netdev() is probably the right
one as, like you said, you want to remove the device from being visible to the
user before tearing down anything else.  If I understood the problem correctly,
the use-after-free caused by this line in wilc_netdev_ifc_init():

	ndev->needs_free_netdev = true;

This causes unregister_netdev() to implicitly call free_netdev().  Without that
code, I think you could call unregister_netdev() early on (as it is right now)
and when done with using the vifs, call free_netdev() while avoiding any
dangling references.

In any case, this is definitely not my area of expertise.  I don't fully
understand the motivition behind needs_free_netdev, even after reading
https://docs.kernel.org/networking/netdevices.html - I suspect the use of that
flag has evolved over the years and the docs may not be entirely up-to-date
anymore.  One driver I looked at (wireless/ath/wil6210/netdev.c) sets
needs_free_netdev only for secondary vifs (i.e., all but the first vif).

Hopefully someone else on this list can figure out what the right solution is
here.

Thanks,

  --david
diff mbox series

Patch

diff --git a/drivers/net/wireless/microchip/wilc1000/spi.c b/drivers/net/wireless/microchip/wilc1000/spi.c
index 77b4cdff73c3..dd6935dc1bc9 100644
--- a/drivers/net/wireless/microchip/wilc1000/spi.c
+++ b/drivers/net/wireless/microchip/wilc1000/spi.c
@@ -42,7 +42,7 @@  MODULE_PARM_DESC(enable_crc16,
 #define WILC_SPI_RSP_HDR_EXTRA_DATA	8
 
 struct wilc_spi {
-	bool isinit;		/* true if SPI protocol has been configured */
+	bool isinit;		/* true if wilc_spi_init was successful */
 	bool probing_crc;	/* true if we're probing chip's CRC config */
 	bool crc7_enabled;	/* true if crc7 is currently enabled */
 	bool crc16_enabled;	/* true if crc16 is currently enabled */
@@ -55,6 +55,8 @@  struct wilc_spi {
 static const struct wilc_hif_func wilc_hif_spi;
 
 static int wilc_spi_reset(struct wilc *wilc);
+static int wilc_spi_configure_bus_protocol(struct wilc *wilc);
+static int wilc_validate_chipid(struct wilc *wilc);
 
 /********************************************
  *
@@ -232,6 +234,22 @@  static int wilc_bus_probe(struct spi_device *spi)
 	}
 	clk_prepare_enable(wilc->rtc_clk);
 
+	/* we need power to configure the bus protocol and to read the chip id: */
+
+	wilc_wlan_power(wilc, true);
+
+	ret = wilc_spi_configure_bus_protocol(wilc);
+
+	if (ret == 0)
+		ret = wilc_validate_chipid(wilc);
+
+	wilc_wlan_power(wilc, false);
+
+	if (ret) {
+		ret = -ENODEV;
+		goto netdev_cleanup;
+	}
+
 	return 0;
 
 netdev_cleanup:
@@ -1074,58 +1092,43 @@  static int wilc_spi_write(struct wilc *wilc, u32 addr, u8 *buf, u32 size)
  *
  ********************************************/
 
-static int wilc_spi_reset(struct wilc *wilc)
+static const char *
+strbool(bool val)
 {
-	struct spi_device *spi = to_spi_device(wilc->dev);
-	struct wilc_spi *spi_priv = wilc->bus_data;
-	int result;
-
-	result = wilc_spi_special_cmd(wilc, CMD_RESET);
-	if (result && !spi_priv->probing_crc)
-		dev_err(&spi->dev, "Failed cmd reset\n");
-
-	return result;
-}
-
-static bool wilc_spi_is_init(struct wilc *wilc)
-{
-	struct wilc_spi *spi_priv = wilc->bus_data;
-
-	return spi_priv->isinit;
+	return val ? "on" : "off";
 }
 
-static int wilc_spi_deinit(struct wilc *wilc)
+static int wilc_validate_chipid(struct wilc *wilc)
 {
-	struct wilc_spi *spi_priv = wilc->bus_data;
+	struct spi_device *spi = to_spi_device(wilc->dev);
+	u32 chipid, base_id;
+	int ret;
 
-	spi_priv->isinit = false;
-	wilc_wlan_power(wilc, false);
+	/*
+	 * make sure can read chip id without protocol error
+	 */
+	ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
+	if (ret) {
+		dev_err(&spi->dev, "Fail cmd read chip id...\n");
+		return ret;
+	}
+	base_id = chipid & ~WILC_CHIP_REV_FIELD;
+	if (base_id != WILC_1000_BASE_ID && base_id != WILC_3000_BASE_ID) {
+		dev_err(&spi->dev, "Unknown chip id 0x%x\n", chipid);
+		return ret;
+	}
+	dev_info(&spi->dev, "Detected chip id 0x%x (crc7=%s, crc16=%s)\n",
+		 chipid, strbool(enable_crc7), strbool(enable_crc16));
 	return 0;
 }
 
-static int wilc_spi_init(struct wilc *wilc, bool resume)
+static int wilc_spi_configure_bus_protocol(struct wilc *wilc)
 {
 	struct spi_device *spi = to_spi_device(wilc->dev);
 	struct wilc_spi *spi_priv = wilc->bus_data;
 	u32 reg;
-	u32 chipid;
 	int ret, i;
 
-	if (spi_priv->isinit) {
-		/* Confirm we can read chipid register without error: */
-		ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
-		if (ret == 0)
-			return 0;
-
-		dev_err(&spi->dev, "Fail cmd read chip id...\n");
-	}
-
-	wilc_wlan_power(wilc, true);
-
-	/*
-	 * configure protocol
-	 */
-
 	/*
 	 * Infer the CRC settings that are currently in effect.  This
 	 * is necessary because we can't be sure that the chip has
@@ -1176,12 +1179,54 @@  static int wilc_spi_init(struct wilc *wilc, bool resume)
 
 	spi_priv->probing_crc = false;
 
-	/*
-	 * make sure can read chip id without protocol error
-	 */
-	ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid);
+	return 0;
+}
+
+static int wilc_spi_reset(struct wilc *wilc)
+{
+	struct spi_device *spi = to_spi_device(wilc->dev);
+	struct wilc_spi *spi_priv = wilc->bus_data;
+	int result;
+
+	result = wilc_spi_special_cmd(wilc, CMD_RESET);
+	if (result && !spi_priv->probing_crc)
+		dev_err(&spi->dev, "Failed cmd reset\n");
+
+	return result;
+}
+
+static bool wilc_spi_is_init(struct wilc *wilc)
+{
+	struct wilc_spi *spi_priv = wilc->bus_data;
+
+	return spi_priv->isinit;
+}
+
+static int wilc_spi_deinit(struct wilc *wilc)
+{
+	struct wilc_spi *spi_priv = wilc->bus_data;
+
+	spi_priv->isinit = false;
+	wilc_wlan_power(wilc, false);
+	return 0;
+}
+
+static int wilc_spi_init(struct wilc *wilc, bool resume)
+{
+	struct wilc_spi *spi_priv = wilc->bus_data;
+	int ret;
+
+	if (spi_priv->isinit) {
+		/* Confirm we can read chipid register without error: */
+		if (wilc_validate_chipid(wilc) == 0)
+			return 0;
+	}
+
+	wilc_wlan_power(wilc, true);
+
+	ret = wilc_spi_configure_bus_protocol(wilc);
 	if (ret) {
-		dev_err(&spi->dev, "Fail cmd read chip id...\n");
+		wilc_wlan_power(wilc, false);
 		return ret;
 	}
 
diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.h b/drivers/net/wireless/microchip/wilc1000/wlan.h
index a72cd5cac81d..43dda9f0d9ca 100644
--- a/drivers/net/wireless/microchip/wilc1000/wlan.h
+++ b/drivers/net/wireless/microchip/wilc1000/wlan.h
@@ -182,6 +182,7 @@ 
 #define WILC_CORTUS_BOOT_FROM_IRAM	0x71
 
 #define WILC_1000_BASE_ID		0x100000
+#define WILC_3000_BASE_ID		0x300000
 
 #define WILC_1000_BASE_ID_2A		0x1002A0
 #define WILC_1000_BASE_ID_2A_REV1	(WILC_1000_BASE_ID_2A + 1)