diff mbox series

[iwl-net,1/3] idpf: fix memory leaks and crashes while performing a soft reset

Message ID 20240724134024.2182959-2-aleksander.lobakin@intel.com (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series idpf: fix 3 bugs revealed by the Chapter I | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 273 this patch: 273
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers fail 3 blamed authors not CCed: sridhar.samudrala@intel.com willemb@google.com joshua.a.hay@intel.com; 4 maintainers not CCed: sridhar.samudrala@intel.com willemb@google.com przemyslaw.kitszel@intel.com joshua.a.hay@intel.com
netdev/build_clang success Errors and warnings before: 281 this patch: 281
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 281 this patch: 281
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 75 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 35 this patch: 35
netdev/source_inline success Was 0 now: 0

Commit Message

Alexander Lobakin July 24, 2024, 1:40 p.m. UTC
The second tagged commit introduced a UAF, as it removed the restoring of
q_vector->vport pointers after reinitializing the structures.
This happens because all queue allocation functions are performed here
with the new temporary vport structure, and those functions rewrite
the backpointers to the vport. Then, this new struct is freed and
the pointers start leading to nowhere.

But generally speaking, the current logic is very fragile. It claims
to be more reliable when the system is low on memory, but in fact, it
consumes twice as much memory, since at the moment of running this
function there are two vports allocated with their queues and vectors.
Moreover, it claims to prevent the driver from running into "bad state",
but in fact, any error during the rebuild leaves the old vport in the
partially allocated state.
Finally, if the interface is down when the function is called, it always
allocates a new queue set, but when the user decides to enable the
interface later on, vport_open() allocates them once again, IOW there's
a clear memory leak here.

Just don't allocate a new queue set when performing a reset; that solves
the crashes and memory leaks. Re-add the old queue number and reopen the
interface on rollback - that solves limbo states when the device is left
disabled and/or without HW queues enabled.

Fixes: 02cbfba1add5 ("idpf: add ethtool callbacks")
Fixes: e4891e4687c8 ("idpf: split &idpf_queue into 4 strictly-typed queue structures")
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 drivers/net/ethernet/intel/idpf/idpf_lib.c | 30 +++++++++++-----------
 1 file changed, 15 insertions(+), 15 deletions(-)

Comments

Simon Horman July 26, 2024, 4:09 p.m. UTC | #1
On Wed, Jul 24, 2024 at 03:40:22PM +0200, Alexander Lobakin wrote:
> The second tagged commit introduced a UAF, as it removed restoring
> q_vector->vport pointers after reinitializating the structures.
> This is due to that all queue allocation functions are performed here
> with the new temporary vport structure and those functions rewrite
> the backpointers to the vport. Then, this new struct is freed and
> the pointers start leading to nowhere.
> 
> But generally speaking, the current logic is very fragile. It claims
> to be more reliable when the system is low on memory, but in fact, it
> consumes two times more memory as at the moment of running this
> function, there are two vports allocated with their queues and vectors.
> Moreover, it claims to prevent the driver from running into "bad state",
> but in fact, any error during the rebuild leaves the old vport in the
> partially allocated state.
> Finally, if the interface is down when the function is called, it always
> allocates a new queue set, but when the user decides to enable the
> interface later on, vport_open() allocates them once again, IOW there's
> a clear memory leak here.
> 
> Just don't allocate a new queue set when performing a reset, that solves
> crashes and memory leaks. Readd the old queue number and reopen the
> interface on rollback - that solves limbo states when the device is left
> disabled and/or without HW queues enabled.
> 
> Fixes: 02cbfba1add5 ("idpf: add ethtool callbacks")
> Fixes: e4891e4687c8 ("idpf: split &idpf_queue into 4 strictly-typed queue structures")
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>  drivers/net/ethernet/intel/idpf/idpf_lib.c | 30 +++++++++++-----------
>  1 file changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c

...

> @@ -1932,17 +1926,23 @@ int idpf_initiate_soft_reset(struct idpf_vport *vport,
>  
>  	err = idpf_set_real_num_queues(vport);
>  	if (err)
> -		goto err_reset;
> +		goto err_open;
>  
>  	if (current_state == __IDPF_VPORT_UP)
> -		err = idpf_vport_open(vport, false);
> +		err = idpf_vport_open(vport);
>  
>  	kfree(new_vport);
>  
>  	return err;
>  
>  err_reset:
> -	idpf_vport_queues_rel(new_vport);
> +	idpf_send_add_queues_msg(vport, vport->num_txq, vport->num_complq,
> +				 vport->num_rxq, vport->num_bufq);
> +
> +err_open:
> +	if (current_state == __IDPF_VPORT_UP)
> +		idpf_vport_open(vport);

Hi Alexander,

Can the system end up in an odd state if this call to idpf_vport_open(), or
the one above, fails? Likewise, what happens if the above call to
idpf_send_add_queues_msg() fails?

> +
>  free_vport:
>  	kfree(new_vport);
>  

...
Alexander Lobakin July 29, 2024, 8:54 a.m. UTC | #2
From: Simon Horman <horms@kernel.org>
Date: Fri, 26 Jul 2024 17:09:54 +0100

> On Wed, Jul 24, 2024 at 03:40:22PM +0200, Alexander Lobakin wrote:
>> The second tagged commit introduced a UAF, as it removed restoring
>> q_vector->vport pointers after reinitializating the structures.
>> This is due to that all queue allocation functions are performed here
>> with the new temporary vport structure and those functions rewrite
>> the backpointers to the vport. Then, this new struct is freed and
>> the pointers start leading to nowhere.

[...]

>>  err_reset:
>> -	idpf_vport_queues_rel(new_vport);
>> +	idpf_send_add_queues_msg(vport, vport->num_txq, vport->num_complq,
>> +				 vport->num_rxq, vport->num_bufq);
>> +
>> +err_open:
>> +	if (current_state == __IDPF_VPORT_UP)
>> +		idpf_vport_open(vport);
> 
> Hi Alexander,
> 
> Can the system end up in an odd state if this call to idpf_vport_open(), or
> the one above, fails. Likewise if the above call to
> idpf_send_add_queues_msg() fails.

Adding the queues with the parameters that were before changing them
almost can't fail. But if any of these two fails, it really will be in
an odd state...

Perhaps we need to do a more powerful reset then? Can we somehow tell
the kernel that in fact our iface is down, so that the user could try
to enable it manually once again?
Anyway, feels like a separate series or patch to -next, what do you think?

> 
>> +
>>  free_vport:
>>  	kfree(new_vport);

Thanks,
Olek
Simon Horman July 30, 2024, 11:03 a.m. UTC | #3
On Mon, Jul 29, 2024 at 10:54:50AM +0200, Alexander Lobakin wrote:
> From: Simon Horman <horms@kernel.org>
> Date: Fri, 26 Jul 2024 17:09:54 +0100
> 
> > On Wed, Jul 24, 2024 at 03:40:22PM +0200, Alexander Lobakin wrote:
> >> The second tagged commit introduced a UAF, as it removed restoring
> >> q_vector->vport pointers after reinitializating the structures.
> >> This is due to that all queue allocation functions are performed here
> >> with the new temporary vport structure and those functions rewrite
> >> the backpointers to the vport. Then, this new struct is freed and
> >> the pointers start leading to nowhere.
> 
> [...]
> 
> >>  err_reset:
> >> -	idpf_vport_queues_rel(new_vport);
> >> +	idpf_send_add_queues_msg(vport, vport->num_txq, vport->num_complq,
> >> +				 vport->num_rxq, vport->num_bufq);
> >> +
> >> +err_open:
> >> +	if (current_state == __IDPF_VPORT_UP)
> >> +		idpf_vport_open(vport);
> > 
> > Hi Alexander,
> > 
> > Can the system end up in an odd state if this call to idpf_vport_open(), or
> > the one above, fails. Likewise if the above call to
> > idpf_send_add_queues_msg() fails.
> 
> Adding the queues with the parameters that were before changing them
> almost can't fail. But if any of these two fails, it really will be in
> an odd state...
> 
> Perhaps we need to do a more powerful reset then? Can we somehow tell
> the kernel that in fact our iface is down, so that the user could try
> to enable it manually once again?
> Anyway, feels like a separate series or patch to -next, what do you think?
> 
> > 
> >> +
> >>  free_vport:
> >>  	kfree(new_vport);
> 
> Thanks,
> Olek
>
Simon Horman July 30, 2024, 4:37 p.m. UTC | #4
On Mon, Jul 29, 2024 at 10:54:50AM +0200, Alexander Lobakin wrote:
> From: Simon Horman <horms@kernel.org>
> Date: Fri, 26 Jul 2024 17:09:54 +0100
> 
> > On Wed, Jul 24, 2024 at 03:40:22PM +0200, Alexander Lobakin wrote:
> >> The second tagged commit introduced a UAF, as it removed restoring
> >> q_vector->vport pointers after reinitializating the structures.
> >> This is due to that all queue allocation functions are performed here
> >> with the new temporary vport structure and those functions rewrite
> >> the backpointers to the vport. Then, this new struct is freed and
> >> the pointers start leading to nowhere.
> 
> [...]
> 
> >>  err_reset:
> >> -	idpf_vport_queues_rel(new_vport);
> >> +	idpf_send_add_queues_msg(vport, vport->num_txq, vport->num_complq,
> >> +				 vport->num_rxq, vport->num_bufq);
> >> +
> >> +err_open:
> >> +	if (current_state == __IDPF_VPORT_UP)
> >> +		idpf_vport_open(vport);
> > 
> > Hi Alexander,
> > 
> > Can the system end up in an odd state if this call to idpf_vport_open(), or
> > the one above, fails. Likewise if the above call to
> > idpf_send_add_queues_msg() fails.
> 
> Adding the queues with the parameters that were before changing them
> almost can't fail. But if any of these two fails, it really will be in
> an odd state...

Thanks for the clarification, this is my concern.

> Perhaps we need to do a more powerful reset then? Can we somehow tell
> the kernel that in fact our iface is down, so that the user could try
> to enable it manually once again?
> Anyway, feels like a separate series or patch to -next, what do you think?

Yes, sure. I agree that this patch improves things, and more extreme
corner cases can be addressed separately.

With the above in mind, I'm happy with this patch.

Reviewed-by: Simon Horman <horms@kernel.org>
Singh, Krishneil K Aug. 2, 2024, 12:21 a.m. UTC | #5
> -----Original Message-----
> From: Simon Horman <horms@kernel.org>
> Sent: Tuesday, July 30, 2024 9:37 AM
> To: Lobakin, Aleksander <aleksander.lobakin@intel.com>
> Cc: intel-wired-lan@lists.osuosl.org; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo
> Abeni <pabeni@redhat.com>; NEX SW NCIS OSDT ITP Upstreaming
> <nex.sw.ncis.osdt.itp.upstreaming@intel.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH iwl-net 1/3] idpf: fix memory leaks and crashes while
> performing a soft reset
> 
> On Mon, Jul 29, 2024 at 10:54:50AM +0200, Alexander Lobakin wrote:
> > From: Simon Horman <horms@kernel.org>
> > Date: Fri, 26 Jul 2024 17:09:54 +0100
> >
> > > On Wed, Jul 24, 2024 at 03:40:22PM +0200, Alexander Lobakin wrote:
> > >> The second tagged commit introduced a UAF, as it removed restoring
> > >> q_vector->vport pointers after reinitializating the structures.
> > >> This is due to that all queue allocation functions are performed here
> > >> with the new temporary vport structure and those functions rewrite
> > >> the backpointers to the vport. Then, this new struct is freed and
> > >> the pointers start leading to nowhere.
> >
> > [...]
> >
> > >>  err_reset:
> > >> -	idpf_vport_queues_rel(new_vport);
> > >> +	idpf_send_add_queues_msg(vport, vport->num_txq, vport-
> >num_complq,
> > >> +				 vport->num_rxq, vport->num_bufq);
> > >> +
> > >> +err_open:
> > >> +	if (current_state == __IDPF_VPORT_UP)
> > >> +		idpf_vport_open(vport);
> > >
> > > Hi Alexander,
> > >
> > > Can the system end up in an odd state if this call to idpf_vport_open(), or
> > > the one above, fails. Likewise if the above call to
> > > idpf_send_add_queues_msg() fails.
> >
> > Adding the queues with the parameters that were before changing them
> > almost can't fail. But if any of these two fails, it really will be in
> > an odd state...
> 
> Thanks for the clarification, this is my concern.
> 
> > Perhaps we need to do a more powerful reset then? Can we somehow tell
> > the kernel that in fact our iface is down, so that the user could try
> > to enable it manually once again?
> > Anyway, feels like a separate series or patch to -next, what do you think?
> 
> Yes, sure. I agree that this patch improves things, and more extreme
> corner cases can be addressed separately.
> 
> With the above in mind, I'm happy with this patch.
> 
> Reviewed-by: Simon Horman <horms@kernel.org>
> 

Tested-by: Krishneil Singh <krishneil.k.singh@intel.com>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index 5dbf2b4ba1b0..10b884dd3475 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -1335,9 +1335,8 @@  static void idpf_rx_init_buf_tail(struct idpf_vport *vport)
 /**
  * idpf_vport_open - Bring up a vport
  * @vport: vport to bring up
- * @alloc_res: allocate queue resources
  */
-static int idpf_vport_open(struct idpf_vport *vport, bool alloc_res)
+static int idpf_vport_open(struct idpf_vport *vport)
 {
 	struct idpf_netdev_priv *np = netdev_priv(vport->netdev);
 	struct idpf_adapter *adapter = vport->adapter;
@@ -1350,11 +1349,9 @@  static int idpf_vport_open(struct idpf_vport *vport, bool alloc_res)
 	/* we do not allow interface up just yet */
 	netif_carrier_off(vport->netdev);
 
-	if (alloc_res) {
-		err = idpf_vport_queues_alloc(vport);
-		if (err)
-			return err;
-	}
+	err = idpf_vport_queues_alloc(vport);
+	if (err)
+		return err;
 
 	err = idpf_vport_intr_alloc(vport);
 	if (err) {
@@ -1539,7 +1536,7 @@  void idpf_init_task(struct work_struct *work)
 	np = netdev_priv(vport->netdev);
 	np->state = __IDPF_VPORT_DOWN;
 	if (test_and_clear_bit(IDPF_VPORT_UP_REQUESTED, vport_config->flags))
-		idpf_vport_open(vport, true);
+		idpf_vport_open(vport);
 
 	/* Spawn and return 'idpf_init_task' work queue until all the
 	 * default vports are created
@@ -1898,9 +1895,6 @@  int idpf_initiate_soft_reset(struct idpf_vport *vport,
 		goto free_vport;
 	}
 
-	err = idpf_vport_queues_alloc(new_vport);
-	if (err)
-		goto free_vport;
 	if (current_state <= __IDPF_VPORT_DOWN) {
 		idpf_send_delete_queues_msg(vport);
 	} else {
@@ -1932,17 +1926,23 @@  int idpf_initiate_soft_reset(struct idpf_vport *vport,
 
 	err = idpf_set_real_num_queues(vport);
 	if (err)
-		goto err_reset;
+		goto err_open;
 
 	if (current_state == __IDPF_VPORT_UP)
-		err = idpf_vport_open(vport, false);
+		err = idpf_vport_open(vport);
 
 	kfree(new_vport);
 
 	return err;
 
 err_reset:
-	idpf_vport_queues_rel(new_vport);
+	idpf_send_add_queues_msg(vport, vport->num_txq, vport->num_complq,
+				 vport->num_rxq, vport->num_bufq);
+
+err_open:
+	if (current_state == __IDPF_VPORT_UP)
+		idpf_vport_open(vport);
+
 free_vport:
 	kfree(new_vport);
 
@@ -2171,7 +2171,7 @@  static int idpf_open(struct net_device *netdev)
 	idpf_vport_ctrl_lock(netdev);
 	vport = idpf_netdev_to_vport(netdev);
 
-	err = idpf_vport_open(vport, true);
+	err = idpf_vport_open(vport);
 
 	idpf_vport_ctrl_unlock(netdev);