diff mbox series

[RESEND,v3,1/5] drm/amdgpu: Create a module param to disable soft recovery

Message ID 20230810192330.198326-2-andrealmeid@igalia.com (mailing list archive)
State New, archived
Headers show
Series drm/amdgpu: Add new reset option and rework coredump | expand

Commit Message

André Almeida Aug. 10, 2023, 7:23 p.m. UTC
Create a module parameter to disable soft recoveries on amdgpu, making
every recovery go through the device reset path. This option makes
easier to force device resets for testing and debugging purposes.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 6 +++++-
 3 files changed, 15 insertions(+), 1 deletion(-)

Comments

Christian König Aug. 15, 2023, 6:44 a.m. UTC | #1
Am 10.08.23 um 21:23 schrieb André Almeida:
> Create a module parameter to disable soft recoveries on amdgpu, making
> every recovery go through the device reset path. This option makes
> easier to force device resets for testing and debugging purposes.

I'm still torn apart on this. On the one hand it's certainly useful for 
developers on the other hand module parameters are not meant to be used 
by developers, they are meant to be used by end users.

Now we have to ask what's the use case to disable soft recovery by an 
end user? I don't see any.

Maybe we can overload the amdgpu_gpu_recovery module option with this. 
Or even better merge all the developer module parameter into a 
amdgpu_debug option. This way it should be pretty obvious that this 
isn't meant to be used by someone who doesn't know how to use it.

Regards,
Christian.

>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  | 9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 6 +++++-
>   3 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2e3c7c15cb8e..9c6a332261ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -189,6 +189,7 @@ extern uint amdgpu_force_long_training;
>   extern int amdgpu_lbpw;
>   extern int amdgpu_compute_multipipe;
>   extern int amdgpu_gpu_recovery;
> +extern bool amdgpu_soft_recovery;
>   extern int amdgpu_emu_mode;
>   extern uint amdgpu_smu_memory_pool_size;
>   extern int amdgpu_smu_pptable_id;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 0fec81d6a7df..27e7fa36cc60 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -163,6 +163,7 @@ uint amdgpu_force_long_training;
>   int amdgpu_lbpw = -1;
>   int amdgpu_compute_multipipe = -1;
>   int amdgpu_gpu_recovery = -1; /* auto */
> +bool amdgpu_soft_recovery = true;
>   int amdgpu_emu_mode;
>   uint amdgpu_smu_memory_pool_size;
>   int amdgpu_smu_pptable_id = -1;
> @@ -538,6 +539,14 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>   MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>   
> +/**
> + * DOC: gpu_soft_recovery (bool)
> + * Set true to allow the driver to try soft recoveries if a job get stuck. Set
> + * to false to always force a GPU reset during recovery.
> + */
> +MODULE_PARM_DESC(gpu_soft_recovery, "Enable GPU soft recovery mechanism (default: true)");
> +module_param_named(gpu_soft_recovery, amdgpu_soft_recovery, bool, 0644);
> +
>   /**
>    * DOC: emu_mode (int)
>    * Set value 1 to enable emulation mode. This is only needed when running on an emulator. The default is 0 (disabled).
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 80d6e132e409..40678d9fb17e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -434,8 +434,12 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>   			       struct dma_fence *fence)
>   {
>   	unsigned long flags;
> +	ktime_t deadline;
>   
> -	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
> +	if (!amdgpu_soft_recovery)
> +		return false;
> +
> +	deadline = ktime_add_us(ktime_get(), 10000);
>   
>   	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
>   		return false;
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2e3c7c15cb8e..9c6a332261ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -189,6 +189,7 @@  extern uint amdgpu_force_long_training;
 extern int amdgpu_lbpw;
 extern int amdgpu_compute_multipipe;
 extern int amdgpu_gpu_recovery;
+extern bool amdgpu_soft_recovery;
 extern int amdgpu_emu_mode;
 extern uint amdgpu_smu_memory_pool_size;
 extern int amdgpu_smu_pptable_id;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 0fec81d6a7df..27e7fa36cc60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -163,6 +163,7 @@  uint amdgpu_force_long_training;
 int amdgpu_lbpw = -1;
 int amdgpu_compute_multipipe = -1;
 int amdgpu_gpu_recovery = -1; /* auto */
+bool amdgpu_soft_recovery = true;
 int amdgpu_emu_mode;
 uint amdgpu_smu_memory_pool_size;
 int amdgpu_smu_pptable_id = -1;
@@ -538,6 +539,14 @@  module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
 MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
+/**
+ * DOC: gpu_soft_recovery (bool)
+ * Set true to allow the driver to try soft recoveries if a job get stuck. Set
+ * to false to always force a GPU reset during recovery.
+ */
+MODULE_PARM_DESC(gpu_soft_recovery, "Enable GPU soft recovery mechanism (default: true)");
+module_param_named(gpu_soft_recovery, amdgpu_soft_recovery, bool, 0644);
+
 /**
  * DOC: emu_mode (int)
  * Set value 1 to enable emulation mode. This is only needed when running on an emulator. The default is 0 (disabled).
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 80d6e132e409..40678d9fb17e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -434,8 +434,12 @@  bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
 			       struct dma_fence *fence)
 {
 	unsigned long flags;
+	ktime_t deadline;
 
-	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
+	if (!amdgpu_soft_recovery)
+		return false;
+
+	deadline = ktime_add_us(ktime_get(), 10000);
 
 	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
 		return false;