diff mbox series

crypto: qat - Fix ADF_DEV_RESET_SYNC memory leak

Message ID Zjs6VxtkL8QLtHIH@gondor.apana.org.au (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Headers show
Series crypto: qat - Fix ADF_DEV_RESET_SYNC memory leak | expand

Commit Message

Herbert Xu May 8, 2024, 8:39 a.m. UTC
On Fri, Feb 09, 2024 at 01:43:42PM +0100, Damian Muszynski wrote:
>
> @@ -146,11 +147,19 @@ static void adf_device_reset_worker(struct work_struct *work)
>  	adf_dev_restarted_notify(accel_dev);
>  	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
>  
> -	/* The dev is back alive. Notify the caller if in sync mode */
> -	if (reset_data->mode == ADF_DEV_RESET_SYNC)
> -		complete(&reset_data->compl);
> -	else
> +	/*
> +	 * The dev is back alive. Notify the caller if in sync mode
> +	 *
> +	 * If device restart will take a more time than expected,
> +	 * the schedule_reset() function can timeout and exit. This can be
> +	 * detected by calling the completion_done() function. In this case
> +	 * the reset_data structure needs to be freed here.
> +	 */
> +	if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
> +	    completion_done(&reset_data->compl))
>  		kfree(reset_data);
> +	else
> +		complete(&reset_data->compl);

This doesn't work because, until complete() has been called,
completion_done() will always return false.  In other words, we now
have a memory leak instead of a use-after-free (UAF).

---8<---
Using completion_done to determine whether the caller has gone
away only works after a complete call.  Furthermore it's still
possible that the caller has not yet called wait_for_completion,
resulting in another potential UAF.

Fix this by making the caller use cancel_work_sync and then freeing
the memory safely.

Fixes: 7d42e097607c ("crypto: qat - resolve race condition during AER recovery")
Cc: <stable@vger.kernel.org> #6.8+
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Comments

Damian Muszynski May 8, 2024, 10:23 a.m. UTC | #1
Hi Herbert,

Thanks for your vigilance. I based this fix on the description of
completion_done(), which, as can be seen, is easy to misunderstand.

--
Damian

On 2024-05-08 at 16:39:51 +0800, Herbert Xu wrote:
> On Fri, Feb 09, 2024 at 01:43:42PM +0100, Damian Muszynski wrote:
> >
> > @@ -146,11 +147,19 @@ static void adf_device_reset_worker(struct work_struct *work)
> >  	adf_dev_restarted_notify(accel_dev);
> >  	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
> >
> > -	/* The dev is back alive. Notify the caller if in sync mode */
> > -	if (reset_data->mode == ADF_DEV_RESET_SYNC)
> > -		complete(&reset_data->compl);
> > -	else
> > +	/*
> > +	 * The dev is back alive. Notify the caller if in sync mode
> > +	 *
> > +	 * If device restart will take a more time than expected,
> > +	 * the schedule_reset() function can timeout and exit. This can be
> > +	 * detected by calling the completion_done() function. In this case
> > +	 * the reset_data structure needs to be freed here.
> > +	 */
> > +	if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
> > +	    completion_done(&reset_data->compl))
> >  		kfree(reset_data);
> > +	else
> > +		complete(&reset_data->compl);
>
> This doesn't work because until you call complete, completion_done
> will always return false.  IOW we now have a memory leak instead of
> a UAF.
>
> ---8<---
> Using completion_done to determine whether the caller has gone
> away only works after a complete call.  Furthermore it's still
> possible that the caller has not yet called wait_for_completion,
> resulting in another potential UAF.
>
> Fix this by making the caller use cancel_work_sync and then freeing
> the memory safely.
>
> Fixes: 7d42e097607c ("crypto: qat - resolve race condition during AER recovery")
> Cc: <stable@vger.kernel.org> #6.8+
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/drivers/crypto/intel/qat/qat_common/adf_aer.c b/drivers/crypto/intel/qat/qat_common/adf_aer.c
> index 9da2278bd5b7..04260f61d042 100644
> --- a/drivers/crypto/intel/qat/qat_common/adf_aer.c
> +++ b/drivers/crypto/intel/qat/qat_common/adf_aer.c
> @@ -130,8 +130,7 @@ static void adf_device_reset_worker(struct work_struct *work)
>  	if (adf_dev_restart(accel_dev)) {
>  		/* The device hanged and we can't restart it so stop here */
>  		dev_err(&GET_DEV(accel_dev), "Restart device failed\n");
> -		if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
> -		    completion_done(&reset_data->compl))
> +		if (reset_data->mode == ADF_DEV_RESET_ASYNC)
>  			kfree(reset_data);
>  		WARN(1, "QAT: device restart failed. Device is unusable\n");
>  		return;
> @@ -147,16 +146,8 @@ static void adf_device_reset_worker(struct work_struct *work)
>  	adf_dev_restarted_notify(accel_dev);
>  	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
>
> -	/*
> -	 * The dev is back alive. Notify the caller if in sync mode
> -	 *
> -	 * If device restart will take a more time than expected,
> -	 * the schedule_reset() function can timeout and exit. This can be
> -	 * detected by calling the completion_done() function. In this case
> -	 * the reset_data structure needs to be freed here.
> -	 */
> -	if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
> -	    completion_done(&reset_data->compl))
> +	/* The dev is back alive. Notify the caller if in sync mode */
> +	if (reset_data->mode == ADF_DEV_RESET_ASYNC)
>  		kfree(reset_data);
>  	else
>  		complete(&reset_data->compl);
> @@ -191,10 +182,10 @@ static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev,
>  		if (!timeout) {
>  			dev_err(&GET_DEV(accel_dev),
>  				"Reset device timeout expired\n");
> +			cancel_work_sync(&reset_data->reset_work);
>  			ret = -EFAULT;
> -		} else {
> -			kfree(reset_data);
>  		}
> +		kfree(reset_data);
>  		return ret;
>  	}
>  	return 0;
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Cabiddu, Giovanni May 16, 2024, 9:16 p.m. UTC | #2
On Wed, May 08, 2024 at 04:39:51PM +0800, Herbert Xu wrote:
> On Fri, Feb 09, 2024 at 01:43:42PM +0100, Damian Muszynski wrote:
> >
> > @@ -146,11 +147,19 @@ static void adf_device_reset_worker(struct work_struct *work)
> >  	adf_dev_restarted_notify(accel_dev);
> >  	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
> >  
> > -	/* The dev is back alive. Notify the caller if in sync mode */
> > -	if (reset_data->mode == ADF_DEV_RESET_SYNC)
> > -		complete(&reset_data->compl);
> > -	else
> > +	/*
> > +	 * The dev is back alive. Notify the caller if in sync mode
> > +	 *
> > +	 * If device restart will take a more time than expected,
> > +	 * the schedule_reset() function can timeout and exit. This can be
> > +	 * detected by calling the completion_done() function. In this case
> > +	 * the reset_data structure needs to be freed here.
> > +	 */
> > +	if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
> > +	    completion_done(&reset_data->compl))
> >  		kfree(reset_data);
> > +	else
> > +		complete(&reset_data->compl);
> 
> This doesn't work because until you call complete, completion_done
> will always return false.  IOW we now have a memory leak instead of
> a UAF.
> 
> ---8<---
> Using completion_done to determine whether the caller has gone
> away only works after a complete call.  Furthermore it's still
> possible that the caller has not yet called wait_for_completion,
> resulting in another potential UAF.
> 
> Fix this by making the caller use cancel_work_sync and then freeing
> the memory safely.
> 
> Fixes: 7d42e097607c ("crypto: qat - resolve race condition during AER recovery")
> Cc: <stable@vger.kernel.org> #6.8+
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>

This is also present in 6.6+ and 6.7+.
diff mbox series

Patch

diff --git a/drivers/crypto/intel/qat/qat_common/adf_aer.c b/drivers/crypto/intel/qat/qat_common/adf_aer.c
index 9da2278bd5b7..04260f61d042 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_aer.c
@@ -130,8 +130,7 @@  static void adf_device_reset_worker(struct work_struct *work)
 	if (adf_dev_restart(accel_dev)) {
 		/* The device hanged and we can't restart it so stop here */
 		dev_err(&GET_DEV(accel_dev), "Restart device failed\n");
-		if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
-		    completion_done(&reset_data->compl))
+		if (reset_data->mode == ADF_DEV_RESET_ASYNC)
 			kfree(reset_data);
 		WARN(1, "QAT: device restart failed. Device is unusable\n");
 		return;
@@ -147,16 +146,8 @@  static void adf_device_reset_worker(struct work_struct *work)
 	adf_dev_restarted_notify(accel_dev);
 	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
 
-	/*
-	 * The dev is back alive. Notify the caller if in sync mode
-	 *
-	 * If device restart will take a more time than expected,
-	 * the schedule_reset() function can timeout and exit. This can be
-	 * detected by calling the completion_done() function. In this case
-	 * the reset_data structure needs to be freed here.
-	 */
-	if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
-	    completion_done(&reset_data->compl))
+	/* The dev is back alive. Notify the caller if in sync mode */
+	if (reset_data->mode == ADF_DEV_RESET_ASYNC)
 		kfree(reset_data);
 	else
 		complete(&reset_data->compl);
@@ -191,10 +182,10 @@  static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev,
 		if (!timeout) {
 			dev_err(&GET_DEV(accel_dev),
 				"Reset device timeout expired\n");
+			cancel_work_sync(&reset_data->reset_work);
 			ret = -EFAULT;
-		} else {
-			kfree(reset_data);
 		}
+		kfree(reset_data);
 		return ret;
 	}
 	return 0;