diff mbox series

[1/2] drm: Create an app info option for wedge events

Message ID 20250228121353.1442591-2-andrealmeid@igalia.com (mailing list archive)
State New
Headers show
Series drm: Create an app info option for wedge events | expand

Commit Message

André Almeida Feb. 28, 2025, 12:13 p.m. UTC
When a device get wedged, it might be caused by a guilty application.
For userspace, knowing which app was the cause can be useful for some
situations, like for implementing a policy, logs or for giving a chance
for the compositor to let the user know what app caused the problem.
This is an optional argument, when `PID=-1` there's no information about
the app caused the problem, or if any app was involved during the hang.

Sometimes just the PID isn't enough giving that the app might be already
dead by the time userspace will try to check what was this PID's name,
so to make the life easier also notify what's the app's name in the user
event.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  2 +-
 drivers/gpu/drm/drm_drv.c                  | 16 +++++++++++++---
 drivers/gpu/drm/i915/gt/intel_reset.c      |  3 ++-
 drivers/gpu/drm/xe/xe_device.c             |  3 ++-
 include/drm/drm_device.h                   |  8 ++++++++
 include/drm/drm_drv.h                      |  3 ++-
 7 files changed, 29 insertions(+), 8 deletions(-)

Comments

Raag Jadav Feb. 28, 2025, 2:20 p.m. UTC | #1
Cc: Lucas

On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
> When a device get wedged, it might be caused by a guilty application.
> For userspace, knowing which app was the cause can be useful for some
> situations, like for implementing a policy, logs or for giving a chance
> for the compositor to let the user know what app caused the problem.
> This is an optional argument, when `PID=-1` there's no information about
> the app caused the problem, or if any app was involved during the hang.
> 
> Sometimes just the PID isn't enough giving that the app might be already
> dead by the time userspace will try to check what was this PID's name,
> so to make the life easier also notify what's the app's name in the user
> event.
> 
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  2 +-
>  drivers/gpu/drm/drm_drv.c                  | 16 +++++++++++++---
>  drivers/gpu/drm/i915/gt/intel_reset.c      |  3 ++-
>  drivers/gpu/drm/xe/xe_device.c             |  3 ++-
>  include/drm/drm_device.h                   |  8 ++++++++
>  include/drm/drm_drv.h                      |  3 ++-
>  7 files changed, 29 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 24ba52d76045..00b9b87dafd8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6124,7 +6124,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  	atomic_set(&adev->reset_domain->reset_res, r);
>  
>  	if (!r)
> -		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> +		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
>  
>  	return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index ef1b77f1e88f..3ed9cbcab1ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -150,7 +150,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>  			amdgpu_fence_driver_force_completion(ring);
>  			if (amdgpu_ring_sched_ready(ring))
>  				drm_sched_start(&ring->sched, 0);
> -			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> +			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
>  			dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
>  			goto exit;
>  		}
> diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> index 17fc5dc708f4..48faafd82a99 100644
> --- a/drivers/gpu/drm/drm_drv.c
> +++ b/drivers/gpu/drm/drm_drv.c
> @@ -522,6 +522,7 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
>   * drm_dev_wedged_event - generate a device wedged uevent
>   * @dev: DRM device
>   * @method: method(s) to be used for recovery
> + * @info: optional information about the guilty app
>   *
>   * This generates a device wedged uevent for the DRM device specified by @dev.
>   * Recovery @method\(s) of choice will be sent in the uevent environment as
> @@ -534,13 +535,14 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
>   *
>   * Returns: 0 on success, negative error code otherwise.
>   */
> -int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
> +int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
> +			 struct drm_wedge_app_info *info)
>  {
>  	const char *recovery = NULL;
>  	unsigned int len, opt;
>  	/* Event string length up to 28+ characters with available methods */
> -	char event_string[32];
> -	char *envp[] = { event_string, NULL };
> +	char event_string[32], pid_string[15], comm_string[TASK_COMM_LEN];
> +	char *envp[] = { event_string, pid_string, comm_string, NULL };
>  
>  	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
>  
> @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
>  	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
>  		 "but recovered through reset" : "needs recovery");
>  
> +	if (info) {
> +		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
> +		snprintf(comm_string, sizeof(comm_string), "APP=%s", info->comm);
> +	} else {
> +		snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
> +		snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
> +	}

This is not much use for wedge cases that needs recovery, since at that point
the userspace will need to clean house anyway.

Which leaves us with only 'none' case and perhaps the need for standardization
of "optional telemetry collection".

Thoughts?

Raag
André Almeida Feb. 28, 2025, 9:54 p.m. UTC | #2
Hi Raag,

On 2/28/25 11:20, Raag Jadav wrote:
> Cc: Lucas
>
> On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
>> When a device get wedged, it might be caused by a guilty application.
>> For userspace, knowing which app was the cause can be useful for some
>> situations, like for implementing a policy, logs or for giving a chance
>> for the compositor to let the user know what app caused the problem.
>> This is an optional argument, when `PID=-1` there's no information about
>> the app caused the problem, or if any app was involved during the hang.
>>
>> Sometimes just the PID isn't enough giving that the app might be already
>> dead by the time userspace will try to check what was this PID's name,
>> so to make the life easier also notify what's the app's name in the user
>> event.
>>
>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  2 +-
>>   drivers/gpu/drm/drm_drv.c                  | 16 +++++++++++++---
>>   drivers/gpu/drm/i915/gt/intel_reset.c      |  3 ++-
>>   drivers/gpu/drm/xe/xe_device.c             |  3 ++-
>>   include/drm/drm_device.h                   |  8 ++++++++
>>   include/drm/drm_drv.h                      |  3 ++-
>>   7 files changed, 29 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 24ba52d76045..00b9b87dafd8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -6124,7 +6124,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>   	atomic_set(&adev->reset_domain->reset_res, r);
>>   
>>   	if (!r)
>> -		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
>> +		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
>>   
>>   	return r;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index ef1b77f1e88f..3ed9cbcab1ad 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -150,7 +150,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>>   			amdgpu_fence_driver_force_completion(ring);
>>   			if (amdgpu_ring_sched_ready(ring))
>>   				drm_sched_start(&ring->sched, 0);
>> -			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
>> +			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
>>   			dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
>>   			goto exit;
>>   		}
>> diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
>> index 17fc5dc708f4..48faafd82a99 100644
>> --- a/drivers/gpu/drm/drm_drv.c
>> +++ b/drivers/gpu/drm/drm_drv.c
>> @@ -522,6 +522,7 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
>>    * drm_dev_wedged_event - generate a device wedged uevent
>>    * @dev: DRM device
>>    * @method: method(s) to be used for recovery
>> + * @info: optional information about the guilty app
>>    *
>>    * This generates a device wedged uevent for the DRM device specified by @dev.
>>    * Recovery @method\(s) of choice will be sent in the uevent environment as
>> @@ -534,13 +535,14 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
>>    *
>>    * Returns: 0 on success, negative error code otherwise.
>>    */
>> -int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
>> +int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
>> +			 struct drm_wedge_app_info *info)
>>   {
>>   	const char *recovery = NULL;
>>   	unsigned int len, opt;
>>   	/* Event string length up to 28+ characters with available methods */
>> -	char event_string[32];
>> -	char *envp[] = { event_string, NULL };
>> +	char event_string[32], pid_string[15], comm_string[TASK_COMM_LEN];
>> +	char *envp[] = { event_string, pid_string, comm_string, NULL };
>>   
>>   	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
>>   
>> @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
>>   	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
>>   		 "but recovered through reset" : "needs recovery");
>>   
>> +	if (info) {
>> +		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
>> +		snprintf(comm_string, sizeof(comm_string), "APP=%s", info->comm);
>> +	} else {
>> +		snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
>> +		snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
>> +	}
> This is not much use for wedge cases that needs recovery, since at that point
> the userspace will need to clean house anyway.
>
> Which leaves us with only 'none' case and perhaps the need for standardization
> of "optional telemetry collection".
>
> Thoughts?

I had the feeling that 'none' was already meant to be used for that. Do 
you think we should move to another naming? Given that we didn't reach 
the merge window yet we could potentially change that name without much 
damage.

> Raag
>
Raag Jadav March 1, 2025, 5:53 a.m. UTC | #3
On Fri, Feb 28, 2025 at 06:54:12PM -0300, André Almeida wrote:
> Hi Raag,
> 
> On 2/28/25 11:20, Raag Jadav wrote:
> > Cc: Lucas
> > 
> > On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
> > > When a device get wedged, it might be caused by a guilty application.
> > > For userspace, knowing which app was the cause can be useful for some
> > > situations, like for implementing a policy, logs or for giving a chance
> > > for the compositor to let the user know what app caused the problem.
> > > This is an optional argument, when `PID=-1` there's no information about
> > > the app caused the problem, or if any app was involved during the hang.
> > > 
> > > Sometimes just the PID isn't enough giving that the app might be already
> > > dead by the time userspace will try to check what was this PID's name,
> > > so to make the life easier also notify what's the app's name in the user
> > > event.
> > > 
> > > Signed-off-by: André Almeida <andrealmeid@igalia.com>
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  2 +-
> > >   drivers/gpu/drm/drm_drv.c                  | 16 +++++++++++++---
> > >   drivers/gpu/drm/i915/gt/intel_reset.c      |  3 ++-
> > >   drivers/gpu/drm/xe/xe_device.c             |  3 ++-
> > >   include/drm/drm_device.h                   |  8 ++++++++
> > >   include/drm/drm_drv.h                      |  3 ++-
> > >   7 files changed, 29 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 24ba52d76045..00b9b87dafd8 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -6124,7 +6124,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> > >   	atomic_set(&adev->reset_domain->reset_res, r);
> > >   	if (!r)
> > > -		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> > > +		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> > >   	return r;
> > >   }
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > index ef1b77f1e88f..3ed9cbcab1ad 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > @@ -150,7 +150,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> > >   			amdgpu_fence_driver_force_completion(ring);
> > >   			if (amdgpu_ring_sched_ready(ring))
> > >   				drm_sched_start(&ring->sched, 0);
> > > -			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> > > +			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> > >   			dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
> > >   			goto exit;
> > >   		}
> > > diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> > > index 17fc5dc708f4..48faafd82a99 100644
> > > --- a/drivers/gpu/drm/drm_drv.c
> > > +++ b/drivers/gpu/drm/drm_drv.c
> > > @@ -522,6 +522,7 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
> > >    * drm_dev_wedged_event - generate a device wedged uevent
> > >    * @dev: DRM device
> > >    * @method: method(s) to be used for recovery
> > > + * @info: optional information about the guilty app
> > >    *
> > >    * This generates a device wedged uevent for the DRM device specified by @dev.
> > >    * Recovery @method\(s) of choice will be sent in the uevent environment as
> > > @@ -534,13 +535,14 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
> > >    *
> > >    * Returns: 0 on success, negative error code otherwise.
> > >    */
> > > -int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
> > > +int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
> > > +			 struct drm_wedge_app_info *info)
> > >   {
> > >   	const char *recovery = NULL;
> > >   	unsigned int len, opt;
> > >   	/* Event string length up to 28+ characters with available methods */
> > > -	char event_string[32];
> > > -	char *envp[] = { event_string, NULL };
> > > +	char event_string[32], pid_string[15], comm_string[TASK_COMM_LEN];
> > > +	char *envp[] = { event_string, pid_string, comm_string, NULL };
> > >   	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
> > > @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
> > >   	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
> > >   		 "but recovered through reset" : "needs recovery");
> > > +	if (info) {
> > > +		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
> > > +		snprintf(comm_string, sizeof(comm_string), "APP=%s", info->comm);
> > > +	} else {
> > > +		snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
> > > +		snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
> > > +	}
> > This is not much use for wedge cases that needs recovery, since at that point
> > the userspace will need to clean house anyway.
> > 
> > Which leaves us with only 'none' case and perhaps the need for standardization
> > of "optional telemetry collection".
> > 
> > Thoughts?
> 
> I had the feeling that 'none' was already meant to be used for that. Do you
> think we should move to another naming? Given that we didn't reach the merge
> window yet we could potentially change that name without much damage.

No, I meant thoughts on possible telemetry data that the drivers might
think is useful for userspace (along with PID) and can be presented in
a vendor agnostic manner (just like wedged event).

Raag
André Almeida March 10, 2025, 9:27 p.m. UTC | #4
Em 01/03/2025 02:53, Raag Jadav escreveu:
> On Fri, Feb 28, 2025 at 06:54:12PM -0300, André Almeida wrote:
>> Hi Raag,
>>
>> On 2/28/25 11:20, Raag Jadav wrote:
>>> Cc: Lucas
>>>
>>> On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
>>>> When a device get wedged, it might be caused by a guilty application.
>>>> For userspace, knowing which app was the cause can be useful for some
>>>> situations, like for implementing a policy, logs or for giving a chance
>>>> for the compositor to let the user know what app caused the problem.
>>>> This is an optional argument, when `PID=-1` there's no information about
>>>> the app caused the problem, or if any app was involved during the hang.
>>>>
>>>> Sometimes just the PID isn't enough giving that the app might be already
>>>> dead by the time userspace will try to check what was this PID's name,
>>>> so to make the life easier also notify what's the app's name in the user
>>>> event.
>>>>
>>>> Signed-off-by: André Almeida <andrealmeid@igalia.com>

[...]

>>>>    	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
>>>> @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
>>>>    	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
>>>>    		 "but recovered through reset" : "needs recovery");
>>>> +	if (info) {
>>>> +		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
>>>> +		snprintf(comm_string, sizeof(comm_string), "APP=%s", info->comm);
>>>> +	} else {
>>>> +		snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
>>>> +		snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
>>>> +	}
>>> This is not much use for wedge cases that needs recovery, since at that point
>>> the userspace will need to clean house anyway.
>>>
>>> Which leaves us with only 'none' case and perhaps the need for standardization
>>> of "optional telemetry collection".
>>>
>>> Thoughts?
>>
>> I had the feeling that 'none' was already meant to be used for that. Do you
>> think we should move to another naming? Given that we didn't reach the merge
>> window yet we could potentially change that name without much damage.
> 
> No, I meant thoughts on possible telemetry data that the drivers might
> think is useful for userspace (along with PID) and can be presented in
> a vendor agnostic manner (just like wedged event).

I'm not if I agree that this will only be used for telemetry and for the 
`none` use case. As stated by Xaver, there's use case to know which app 
caused the device to get wedged (like switching to software rendering) 
and to display something for the user after the recovery is done (e.g. 
"The game <app name> stopped working and Plasma has reset").
Raag Jadav March 11, 2025, 5:09 p.m. UTC | #5
On Mon, Mar 10, 2025 at 06:27:53PM -0300, André Almeida wrote:
> Em 01/03/2025 02:53, Raag Jadav escreveu:
> > On Fri, Feb 28, 2025 at 06:54:12PM -0300, André Almeida wrote:
> > > Hi Raag,
> > > 
> > > On 2/28/25 11:20, Raag Jadav wrote:
> > > > Cc: Lucas
> > > > 
> > > > On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
> > > > > When a device get wedged, it might be caused by a guilty application.
> > > > > For userspace, knowing which app was the cause can be useful for some
> > > > > situations, like for implementing a policy, logs or for giving a chance
> > > > > for the compositor to let the user know what app caused the problem.
> > > > > This is an optional argument, when `PID=-1` there's no information about
> > > > > the app caused the problem, or if any app was involved during the hang.
> > > > > 
> > > > > Sometimes just the PID isn't enough giving that the app might be already
> > > > > dead by the time userspace will try to check what was this PID's name,
> > > > > so to make the life easier also notify what's the app's name in the user
> > > > > event.
> > > > > 
> > > > > Signed-off-by: André Almeida <andrealmeid@igalia.com>
> 
> [...]
> 
> > > > >    	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
> > > > > @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
> > > > >    	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
> > > > >    		 "but recovered through reset" : "needs recovery");
> > > > > +	if (info) {
> > > > > +		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
> > > > > +		snprintf(comm_string, sizeof(comm_string), "APP=%s", info->comm);
> > > > > +	} else {
> > > > > +		snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
> > > > > +		snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
> > > > > +	}
> > > > This is not much use for wedge cases that needs recovery, since at that point
> > > > the userspace will need to clean house anyway.
> > > > 
> > > > Which leaves us with only 'none' case and perhaps the need for standardization
> > > > of "optional telemetry collection".
> > > > 
> > > > Thoughts?
> > > 
> > > I had the feeling that 'none' was already meant to be used for that. Do you
> > > think we should move to another naming? Given that we didn't reach the merge
> > > window yet we could potentially change that name without much damage.
> > 
> > No, I meant thoughts on possible telemetry data that the drivers might
> > think is useful for userspace (along with PID) and can be presented in
> > a vendor agnostic manner (just like wedged event).
> 
> I'm not if I agree that this will only be used for telemetry and for the
> `none` use case. As stated by Xaver, there's use case to know which app
> caused the device to get wedged (like switching to software rendering) and
> to display something for the user after the recovery is done (e.g. "The game
> <app name> stopped working and Plasma has reset").

Sure, but since this information is already available in coredump, I was
hoping to have something like a standardized DRM level coredump with both
vendor specific and agnostic sections, which the drivers can (and hopefully
transition to) use in conjunction with wedged event to provide wider
telemetry and is useful for all wedge cases.

Would that serve the usecase here?

Raag
Raag Jadav March 12, 2025, 10:06 a.m. UTC | #6
On Tue, Mar 11, 2025 at 07:09:45PM +0200, Raag Jadav wrote:
> On Mon, Mar 10, 2025 at 06:27:53PM -0300, André Almeida wrote:
> > Em 01/03/2025 02:53, Raag Jadav escreveu:
> > > On Fri, Feb 28, 2025 at 06:54:12PM -0300, André Almeida wrote:
> > > > Hi Raag,
> > > > 
> > > > On 2/28/25 11:20, Raag Jadav wrote:
> > > > > Cc: Lucas
> > > > > 
> > > > > On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
> > > > > > When a device get wedged, it might be caused by a guilty application.
> > > > > > For userspace, knowing which app was the cause can be useful for some
> > > > > > situations, like for implementing a policy, logs or for giving a chance
> > > > > > for the compositor to let the user know what app caused the problem.
> > > > > > This is an optional argument, when `PID=-1` there's no information about
> > > > > > the app caused the problem, or if any app was involved during the hang.
> > > > > > 
> > > > > > Sometimes just the PID isn't enough giving that the app might be already
> > > > > > dead by the time userspace will try to check what was this PID's name,
> > > > > > so to make the life easier also notify what's the app's name in the user
> > > > > > event.
> > > > > > 
> > > > > > Signed-off-by: André Almeida <andrealmeid@igalia.com>
> > 
> > [...]
> > 
> > > > > >    	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
> > > > > > @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
> > > > > >    	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
> > > > > >    		 "but recovered through reset" : "needs recovery");
> > > > > > +	if (info) {
> > > > > > +		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
> > > > > > +		snprintf(comm_string, sizeof(comm_string), "APP=%s", info->comm);
> > > > > > +	} else {
> > > > > > +		snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
> > > > > > +		snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
> > > > > > +	}
> > > > > This is not much use for wedge cases that needs recovery, since at that point
> > > > > the userspace will need to clean house anyway.
> > > > > 
> > > > > Which leaves us with only 'none' case and perhaps the need for standardization
> > > > > of "optional telemetry collection".
> > > > > 
> > > > > Thoughts?
> > > > 
> > > > I had the feeling that 'none' was already meant to be used for that. Do you
> > > > think we should move to another naming? Given that we didn't reach the merge
> > > > window yet we could potentially change that name without much damage.
> > > 
> > > No, I meant thoughts on possible telemetry data that the drivers might
> > > think is useful for userspace (along with PID) and can be presented in
> > > a vendor agnostic manner (just like wedged event).
> > 
> > I'm not if I agree that this will only be used for telemetry and for the
> > `none` use case. As stated by Xaver, there's use case to know which app
> > caused the device to get wedged (like switching to software rendering) and
> > to display something for the user after the recovery is done (e.g. "The game
> > <app name> stopped working and Plasma has reset").
> 
> Sure, but since this information is already available in coredump, I was
> hoping to have something like a standardized DRM level coredump with both
> vendor specific and agnostic sections, which the drivers can (and hopefully
> transition to) use in conjunction with wedged event to provide wider
> telemetry and is useful for all wedge cases.

This is more useful because,

1. It gives drivers an opportunity to present the telemetry that they are
   interested in without needing to add a new event string (like PID or APP)
   for their case.

2. When we consider wedging as a usecase, there's a lot more that goes
   into it than an application that might be behaving strangely. So a wider
   telemetry is what I would hope to look at in such a scenario.

Raag
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 24ba52d76045..00b9b87dafd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6124,7 +6124,7 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	atomic_set(&adev->reset_domain->reset_res, r);
 
 	if (!r)
-		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
 
 	return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index ef1b77f1e88f..3ed9cbcab1ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -150,7 +150,7 @@  static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 			amdgpu_fence_driver_force_completion(ring);
 			if (amdgpu_ring_sched_ready(ring))
 				drm_sched_start(&ring->sched, 0);
-			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
 			dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
 			goto exit;
 		}
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 17fc5dc708f4..48faafd82a99 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -522,6 +522,7 @@  static const char *drm_get_wedge_recovery(unsigned int opt)
  * drm_dev_wedged_event - generate a device wedged uevent
  * @dev: DRM device
  * @method: method(s) to be used for recovery
+ * @info: optional information about the guilty app
  *
  * This generates a device wedged uevent for the DRM device specified by @dev.
  * Recovery @method\(s) of choice will be sent in the uevent environment as
@@ -534,13 +535,14 @@  static const char *drm_get_wedge_recovery(unsigned int opt)
  *
  * Returns: 0 on success, negative error code otherwise.
  */
-int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
+int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
+			 struct drm_wedge_app_info *info)
 {
 	const char *recovery = NULL;
 	unsigned int len, opt;
 	/* Event string length up to 28+ characters with available methods */
-	char event_string[32];
-	char *envp[] = { event_string, NULL };
+	char event_string[32], pid_string[15], comm_string[TASK_COMM_LEN];
+	char *envp[] = { event_string, pid_string, comm_string, NULL };
 
 	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
 
@@ -562,6 +564,14 @@  int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
 	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
 		 "but recovered through reset" : "needs recovery");
 
+	if (info) {
+		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
+		snprintf(comm_string, sizeof(comm_string), "APP=%s", info->comm);
+	} else {
+		snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
+		snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
+	}
+
 	return kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
 }
 EXPORT_SYMBOL(drm_dev_wedged_event);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index d6dc12fd87c1..928b9f048b6a 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1424,7 +1424,8 @@  static void intel_gt_reset_global(struct intel_gt *gt,
 		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
 	else
 		drm_dev_wedged_event(&gt->i915->drm,
-				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
+				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
+				     NULL);
 }
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index fc4a49f25c09..8a349c7daf24 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -1037,7 +1037,8 @@  void xe_device_declare_wedged(struct xe_device *xe)
 
 		/* Notify userspace of wedged device */
 		drm_dev_wedged_event(&xe->drm,
-				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
+				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
+				     NULL);
 	}
 
 	for_each_gt(gt, xe, id)
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index 6ea54a578cda..8b9d614257e6 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -30,6 +30,14 @@  struct pci_controller;
 #define DRM_WEDGE_RECOVERY_REBIND	BIT(1)	/* unbind + bind driver */
 #define DRM_WEDGE_RECOVERY_BUS_RESET	BIT(2)	/* unbind + reset bus device + bind */
 
+/**
+ * struct drm_wedge_app_info - information about the guilty app of a wedge dev
+ */
+struct drm_wedge_app_info {
+	pid_t pid;
+	char *comm;
+};
+
 /**
  * enum switch_power_state - power state of drm device
  */
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index a43d707b5f36..8fc6412a6345 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -482,7 +482,8 @@  void drm_put_dev(struct drm_device *dev);
 bool drm_dev_enter(struct drm_device *dev, int *idx);
 void drm_dev_exit(int idx);
 void drm_dev_unplug(struct drm_device *dev);
-int drm_dev_wedged_event(struct drm_device *dev, unsigned long method);
+int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
+			 struct drm_wedge_app_info *info);
 
 /**
  * drm_dev_is_unplugged - is a DRM device unplugged