diff mbox series

[net] net: atlantic: keep rings across suspend/resume

Message ID 20241212023946.3979643-1-lorenz@brun.one (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net] net: atlantic: keep rings across suspend/resume | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: andrew+netdev@lunn.ch
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 5 this patch: 5
netdev/checkpatch warning WARNING: line length of 93 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-12-12--09-01 (tests: 795)

Commit Message

Lorenz Brun Dec. 12, 2024, 2:39 a.m. UTC
The rings are order-6 allocations which tend to fail on suspend due to
fragmentation. As memory is kept during suspend/resume, we don't need to
reallocate them.

This does not touch the PTP rings which, if enabled, still reallocate.
Fixing these is harder as the whole structure is reinitialized.

Fixes: cbe6c3a8f8f4 ("net: atlantic: invert deep par in pm functions, preventing null derefs")
Signed-off-by: Lorenz Brun <lorenz@brun.one>
---
 drivers/net/ethernet/aquantia/atlantic/aq_main.c     |  4 ++--
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c      |  7 ++++---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h      |  2 +-
 drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c |  4 ++--
 drivers/net/ethernet/aquantia/atlantic/aq_vec.c      | 10 ++++++++++
 5 files changed, 19 insertions(+), 8 deletions(-)

Comments

Michal Swiatkowski Dec. 12, 2024, 9:54 a.m. UTC | #1
On Thu, Dec 12, 2024 at 03:39:24AM +0100, Lorenz Brun wrote:
> The rings are order-6 allocations which tend to fail on suspend due to
> fragmentation. As memory is kept during suspend/resume, we don't need to
> reallocate them.
> 
> This does not touch the PTP rings which, if enabled, still reallocate.
> Fixing these is harder as the whole structure is reinitialized.
> 
> Fixes: cbe6c3a8f8f4 ("net: atlantic: invert deep par in pm functions, preventing null derefs")
> Signed-off-by: Lorenz Brun <lorenz@brun.one>
> ---
>  drivers/net/ethernet/aquantia/atlantic/aq_main.c     |  4 ++--
>  drivers/net/ethernet/aquantia/atlantic/aq_nic.c      |  7 ++++---
>  drivers/net/ethernet/aquantia/atlantic/aq_nic.h      |  2 +-
>  drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c |  4 ++--
>  drivers/net/ethernet/aquantia/atlantic/aq_vec.c      | 10 ++++++++++
>  5 files changed, 19 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
> index c1d1673c5749..cd3709ba7229 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
> @@ -84,7 +84,7 @@ int aq_ndev_open(struct net_device *ndev)
>  
>  err_exit:
>  	if (err < 0)
> -		aq_nic_deinit(aq_nic, true);
> +		aq_nic_deinit(aq_nic, true, false);
Only my suggestion:
Instead of passing another boolean to the function you can have:
aq_nic_deinit(...)
{
	always without free
}

aq_nic_deinit_and_free(...)
{
	aq_nic_deinit(...);
	free
}

It may be easier to read.

>  
>  	return err;
>  }
> @@ -95,7 +95,7 @@ int aq_ndev_close(struct net_device *ndev)
>  	int err = 0;
>  
>  	err = aq_nic_stop(aq_nic);
> -	aq_nic_deinit(aq_nic, true);
> +	aq_nic_deinit(aq_nic, true, false);
>  
>  	return err;
>  }
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> index fe0e3e2a8117..a6324ae88acf 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> @@ -1422,7 +1422,7 @@ void aq_nic_set_power(struct aq_nic_s *self)
>  		}
>  }
>  
> -void aq_nic_deinit(struct aq_nic_s *self, bool link_down)
Will be nice to have a kernel-doc.

> +void aq_nic_deinit(struct aq_nic_s *self, bool link_down, bool keep_rings)
>  {
>  	struct aq_vec_s *aq_vec = NULL;
>  	unsigned int i = 0U;
> @@ -1433,7 +1433,8 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down)
>  	for (i = 0U; i < self->aq_vecs; i++) {
>  		aq_vec = self->aq_vec[i];
>  		aq_vec_deinit(aq_vec);
> -		aq_vec_ring_free(aq_vec);
> +		if (!keep_rings)
> +			aq_vec_ring_free(aq_vec);
>  	}
>  
>  	aq_ptp_unregister(self);
> @@ -1499,7 +1500,7 @@ void aq_nic_shutdown(struct aq_nic_s *self)
>  		if (err < 0)
>  			goto err_exit;
>  	}
> -	aq_nic_deinit(self, !self->aq_hw->aq_nic_cfg->wol);
> +	aq_nic_deinit(self, !self->aq_hw->aq_nic_cfg->wol, false);
>  	aq_nic_set_power(self);
>  
>  err_exit:
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> index ad33f8586532..f0543a5cc087 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> @@ -189,7 +189,7 @@ int aq_nic_get_regs(struct aq_nic_s *self, struct ethtool_regs *regs, void *p);
>  int aq_nic_get_regs_count(struct aq_nic_s *self);
>  u64 *aq_nic_get_stats(struct aq_nic_s *self, u64 *data);
>  int aq_nic_stop(struct aq_nic_s *self);
> -void aq_nic_deinit(struct aq_nic_s *self, bool link_down);
> +void aq_nic_deinit(struct aq_nic_s *self, bool link_down, bool keep_rings);
>  void aq_nic_set_power(struct aq_nic_s *self);
>  void aq_nic_free_hot_resources(struct aq_nic_s *self);
>  void aq_nic_free_vectors(struct aq_nic_s *self);
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
> index 43c71f6b314f..1015eab5ee50 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
> @@ -390,7 +390,7 @@ static int aq_suspend_common(struct device *dev)
>  	if (netif_running(nic->ndev))
>  		aq_nic_stop(nic);
>  
> -	aq_nic_deinit(nic, !nic->aq_hw->aq_nic_cfg->wol);
> +	aq_nic_deinit(nic, !nic->aq_hw->aq_nic_cfg->wol, true);
>  	aq_nic_set_power(nic);
>  
>  	rtnl_unlock();
> @@ -426,7 +426,7 @@ static int atl_resume_common(struct device *dev)
>  
>  err_exit:
>  	if (ret < 0)
> -		aq_nic_deinit(nic, true);
> +		aq_nic_deinit(nic, true, false);
>  
>  	rtnl_unlock();
>  
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> index 9769ab4f9bef..3b51d6ee0812 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> @@ -132,6 +132,16 @@ int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic,
>  	unsigned int i = 0U;
>  	int err = 0;
>  
> +	if (self && self->tx_rings == aq_nic_cfg->tcs && self->rx_rings == aq_nic_cfg->tcs) {
> +		/* Correct rings already allocated, nothing to do here */
> +		return 0;
Is the same number of Tx/Rx always enough to say that the vector is the
same? It has more additinal data in the structure.

> +	} else if (self && (self->tx_rings > 0 || self->rx_rings > 0)) {
> +		/* Allocated rings are different, free rings and reallocate */
> +		pr_notice("%s: cannot reuse rings, have %d, need %d, reallocating", __func__,
> +			  self->tx_rings, aq_nic_cfg->tcs);
> +		aq_vec_ring_free(self);
> +	}
> +
>  	for (i = 0; i < aq_nic_cfg->tcs; ++i) {
>  		const unsigned int idx_ring = AQ_NIC_CFG_TCVEC2RING(aq_nic_cfg,
>  								    i, idx);

Thanks,
Michal
> -- 
> 2.44.1
Andrew Lunn Dec. 12, 2024, 5:20 p.m. UTC | #2
On Thu, Dec 12, 2024 at 03:39:24AM +0100, Lorenz Brun wrote:
> The rings are order-6 allocations which tend to fail on suspend due to
> fragmentation. As memory is kept during suspend/resume, we don't need to
> reallocate them.

I don't know this driver. Are there other reasons to reallocate the
rings? Change of MTU? ethtool settings? If they are also potentially
going to run into memory fragmentation issues, maybe it would be
better to use smaller order allocations, or vmalloc, if the hardware
supports that, etc.

	Andrew
Jakub Kicinski Dec. 13, 2024, 1:21 a.m. UTC | #3
On Thu, 12 Dec 2024 03:39:24 +0100 Lorenz Brun wrote:
> -void aq_nic_deinit(struct aq_nic_s *self, bool link_down)
> +void aq_nic_deinit(struct aq_nic_s *self, bool link_down, bool keep_rings)
>  {
>  	struct aq_vec_s *aq_vec = NULL;
>  	unsigned int i = 0U;
> @@ -1433,7 +1433,8 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down)
>  	for (i = 0U; i < self->aq_vecs; i++) {
>  		aq_vec = self->aq_vec[i];
>  		aq_vec_deinit(aq_vec);
> -		aq_vec_ring_free(aq_vec);
> +		if (!keep_rings)
> +			aq_vec_ring_free(aq_vec);
>  	}

I'd suggest to break out the memory freeing from aq_nic_deinit(),
and conversely allocating from aq_nic_init(). Then explicitly call
free / alloc from where aq_nic_deinit() / aq_nic_init() are called.

The booleans passed into init functions are pretty error prone.
Pretty quickly one needs to grep the entire driver to find which
callsites pass what.
Simon Horman Dec. 13, 2024, 1:58 p.m. UTC | #4
On Thu, Dec 12, 2024 at 03:39:24AM +0100, Lorenz Brun wrote:
> The rings are order-6 allocations which tend to fail on suspend due to
> fragmentation. As memory is kept during suspend/resume, we don't need to
> reallocate them.
> 
> This does not touch the PTP rings which, if enabled, still reallocate.
> Fixing these is harder as the whole structure is reinitialized.
> 
> Fixes: cbe6c3a8f8f4 ("net: atlantic: invert deep par in pm functions, preventing null derefs")
> Signed-off-by: Lorenz Brun <lorenz@brun.one>

...

> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> index 9769ab4f9bef..3b51d6ee0812 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> @@ -132,6 +132,16 @@ int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic,
>  	unsigned int i = 0U;
>  	int err = 0;
>  
> +	if (self && self->tx_rings == aq_nic_cfg->tcs && self->rx_rings == aq_nic_cfg->tcs) {
> +		/* Correct rings already allocated, nothing to do here */
> +		return 0;
> +	} else if (self && (self->tx_rings > 0 || self->rx_rings > 0)) {
> +		/* Allocated rings are different, free rings and reallocate */
> +		pr_notice("%s: cannot reuse rings, have %d, need %d, reallocating", __func__,
> +			  self->tx_rings, aq_nic_cfg->tcs);
> +		aq_vec_ring_free(self);
> +	}
> +

Hi Lorenzo,

Can self be NULL here?

In the for loop below it is dereferenced unconditionally and
thus assumed not to be NULL there.

Flagged by Smatch.

>  	for (i = 0; i < aq_nic_cfg->tcs; ++i) {
>  		const unsigned int idx_ring = AQ_NIC_CFG_TCVEC2RING(aq_nic_cfg,
>  								    i, idx);
> -- 
> 2.44.1
> 
>
Lorenz Brun Dec. 13, 2024, 5 p.m. UTC | #5
Am Do, 12. Dez 2024 um 18:20:26 +01:00:00 schrieb Andrew Lunn 
<andrew@lunn.ch>:
> On Thu, Dec 12, 2024 at 03:39:24AM +0100, Lorenz Brun wrote:
>>  The rings are order-6 allocations which tend to fail on suspend due 
>> to
>>  fragmentation. As memory is kept during suspend/resume, we don't 
>> need to
>>  reallocate them.
> 
> I don't know this driver. Are there other reasons to reallocate the
> rings? Change of MTU? ethtool settings? If they are also potentially
> going to run into memory fragmentation issues, maybe it would be
> better to use smaller order allocations, or vmalloc, if the hardware
> supports that, etc.
> 
> 	Andrew

ethtool settings do indeed reallocate, but not during unsuspend where 
we have GFP_NOIO. Smaller oder allocations would definitely be better, 
but on systems without IOMMU that would probably pose a problem as 
rings are generally assumed to be contigous by hardware (as far as I 
understand). I don't have access to hardware docs, so I don't know if 
you can make the HW work without physically-contiguous memory.

Linux just really doesn't handle high-order allocations well, I got one 
unsuspend failure with 6GiB (!!) of free space in the relevant region 
(but no order-6 or higher). I have no idea why it doesn't defragment 
before failing the allocation as it clearly has enough memory.

kworker/u97:14: page allocation failure: order:6, 
mode:0x40d00(GFP_NOIO|__GFP_COMP|__GFP_ZERO), 
nodemask=(null),cpuset=/,mems_allowed=0
Node 0 Normal: 787628*4kB (UME) 234026*8kB (UME) 50882*16kB (UME) 
13751*32kB (UME) 35*64kB (UME) 9*128kB (M) 0*256kB 0*512kB 0*1024kB 
1*2048kB (H) 0*4096kB = 6282304kB


Regards,
Lorenz
diff mbox series

Patch

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
index c1d1673c5749..cd3709ba7229 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
@@ -84,7 +84,7 @@  int aq_ndev_open(struct net_device *ndev)
 
 err_exit:
 	if (err < 0)
-		aq_nic_deinit(aq_nic, true);
+		aq_nic_deinit(aq_nic, true, false);
 
 	return err;
 }
@@ -95,7 +95,7 @@  int aq_ndev_close(struct net_device *ndev)
 	int err = 0;
 
 	err = aq_nic_stop(aq_nic);
-	aq_nic_deinit(aq_nic, true);
+	aq_nic_deinit(aq_nic, true, false);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index fe0e3e2a8117..a6324ae88acf 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -1422,7 +1422,7 @@  void aq_nic_set_power(struct aq_nic_s *self)
 		}
 }
 
-void aq_nic_deinit(struct aq_nic_s *self, bool link_down)
+void aq_nic_deinit(struct aq_nic_s *self, bool link_down, bool keep_rings)
 {
 	struct aq_vec_s *aq_vec = NULL;
 	unsigned int i = 0U;
@@ -1433,7 +1433,8 @@  void aq_nic_deinit(struct aq_nic_s *self, bool link_down)
 	for (i = 0U; i < self->aq_vecs; i++) {
 		aq_vec = self->aq_vec[i];
 		aq_vec_deinit(aq_vec);
-		aq_vec_ring_free(aq_vec);
+		if (!keep_rings)
+			aq_vec_ring_free(aq_vec);
 	}
 
 	aq_ptp_unregister(self);
@@ -1499,7 +1500,7 @@  void aq_nic_shutdown(struct aq_nic_s *self)
 		if (err < 0)
 			goto err_exit;
 	}
-	aq_nic_deinit(self, !self->aq_hw->aq_nic_cfg->wol);
+	aq_nic_deinit(self, !self->aq_hw->aq_nic_cfg->wol, false);
 	aq_nic_set_power(self);
 
 err_exit:
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index ad33f8586532..f0543a5cc087 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -189,7 +189,7 @@  int aq_nic_get_regs(struct aq_nic_s *self, struct ethtool_regs *regs, void *p);
 int aq_nic_get_regs_count(struct aq_nic_s *self);
 u64 *aq_nic_get_stats(struct aq_nic_s *self, u64 *data);
 int aq_nic_stop(struct aq_nic_s *self);
-void aq_nic_deinit(struct aq_nic_s *self, bool link_down);
+void aq_nic_deinit(struct aq_nic_s *self, bool link_down, bool keep_rings);
 void aq_nic_set_power(struct aq_nic_s *self);
 void aq_nic_free_hot_resources(struct aq_nic_s *self);
 void aq_nic_free_vectors(struct aq_nic_s *self);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
index 43c71f6b314f..1015eab5ee50 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
@@ -390,7 +390,7 @@  static int aq_suspend_common(struct device *dev)
 	if (netif_running(nic->ndev))
 		aq_nic_stop(nic);
 
-	aq_nic_deinit(nic, !nic->aq_hw->aq_nic_cfg->wol);
+	aq_nic_deinit(nic, !nic->aq_hw->aq_nic_cfg->wol, true);
 	aq_nic_set_power(nic);
 
 	rtnl_unlock();
@@ -426,7 +426,7 @@  static int atl_resume_common(struct device *dev)
 
 err_exit:
 	if (ret < 0)
-		aq_nic_deinit(nic, true);
+		aq_nic_deinit(nic, true, false);
 
 	rtnl_unlock();
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
index 9769ab4f9bef..3b51d6ee0812 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
@@ -132,6 +132,16 @@  int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic,
 	unsigned int i = 0U;
 	int err = 0;
 
+	if (self && self->tx_rings == aq_nic_cfg->tcs && self->rx_rings == aq_nic_cfg->tcs) {
+		/* Correct rings already allocated, nothing to do here */
+		return 0;
+	} else if (self && (self->tx_rings > 0 || self->rx_rings > 0)) {
+		/* Allocated rings are different, free rings and reallocate */
+		pr_notice("%s: cannot reuse rings, have %d, need %d, reallocating", __func__,
+			  self->tx_rings, aq_nic_cfg->tcs);
+		aq_vec_ring_free(self);
+	}
+
 	for (i = 0; i < aq_nic_cfg->tcs; ++i) {
 		const unsigned int idx_ring = AQ_NIC_CFG_TCVEC2RING(aq_nic_cfg,
 								    i, idx);