diff mbox series

[v1,3/3] drm/amdgpu: update the error logging for more information

Message ID 20250411130428.4104957-3-sunil.khatri@amd.com (mailing list archive)
State New
Headers show
Series [v1,1/3] drm: function to get process name and pid | expand

Commit Message

Sunil Khatri April 11, 2025, 1:04 p.m. UTC
Add process name and pid information to the userqueue error
logging so that the logs are more useful when debugging the
error.

Sample log:
[   42.444297] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=000000001c74d978 for comm:Xwayland pid:3427
[   42.444669] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:Xwayland pid:3427
[   42.824729] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=0000000074407d3e for comm:systemd-logind pid:1058
[   42.825082] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:systemd-logind pid:1058

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 45 +++++++++++++++----
 1 file changed, 37 insertions(+), 8 deletions(-)

Comments

Alex Deucher April 11, 2025, 2:24 p.m. UTC | #1
On Fri, Apr 11, 2025 at 9:05 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>
> add process and pid information in the userqueue error
> logging to make it more useful in resolving the error
> by logs.
>
> Sample log:
> [   42.444297] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=000000001c74d978 for comm:Xwayland pid:3427
> [   42.444669] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:Xwayland pid:3427
> [   42.824729] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=0000000074407d3e for comm:systemd-logind pid:1058
> [   42.825082] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:systemd-logind pid:1058
>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 45 +++++++++++++++----
>  1 file changed, 37 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> index ecd49cf15b2a..5b58c41618ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> @@ -62,12 +62,17 @@ amdgpu_userqueue_cleanup(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
>         struct dma_fence *f = queue->last_fence;
> +       struct drm_file *file;
> +       char proc_log[50];
>         int ret;
>
>         if (f && !dma_fence_is_signaled(f)) {
>                 ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>                 if (ret <= 0) {
> -                       DRM_ERROR("Timed out waiting for fence f=%p\n", f);
> +                       file = uq_mgr->file;
> +                       drm_process_info(file, proc_log, sizeof(proc_log));
> +                       DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
> +                                 f, proc_log);

Use drm_err() here and below so we get proper handling of multiple devices.

Alex

>                         return;
>                 }
>         }
> @@ -427,6 +432,8 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
>         const struct amdgpu_userq_funcs *userq_funcs;
>         struct amdgpu_usermode_queue *queue;
>         int queue_id;
> +       struct drm_file *file;
> +       char proc_log[50];
>         int ret = 0;
>
>         /* Resume all the queues for this process */
> @@ -435,8 +442,12 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
>                 ret = userq_funcs->resume(uq_mgr, queue);
>         }
>
> -       if (ret)
> -               DRM_ERROR("Failed to resume all the queue\n");
> +       if (ret) {
> +               file = uq_mgr->file;
> +               drm_process_info(file, proc_log, sizeof(proc_log));
> +               DRM_ERROR("Failed to resume all the queue for %s\n",
> +                         proc_log);
> +               }
>         return ret;
>  }
>
> @@ -585,6 +596,8 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>         const struct amdgpu_userq_funcs *userq_funcs;
>         struct amdgpu_usermode_queue *queue;
>         int queue_id;
> +       struct drm_file *file;
> +       char proc_log[50];
>         int ret = 0;
>
>         /* Try to suspend all the queues in this process ctx */
> @@ -593,8 +606,12 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>                 ret += userq_funcs->suspend(uq_mgr, queue);
>         }
>
> -       if (ret)
> -               DRM_ERROR("Couldn't suspend all the queues\n");
> +       if (ret) {
> +               file = uq_mgr->file;
> +               drm_process_info(file, proc_log, sizeof(proc_log));
> +               DRM_ERROR("Couldn't suspend all the queues for %s\n",
> +                         proc_log);
> +               }
>         return ret;
>  }
>
> @@ -602,6 +619,8 @@ static int
>  amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
>         struct amdgpu_usermode_queue *queue;
> +       struct drm_file *file;
> +       char proc_log[50];
>         int queue_id, ret;
>
>         idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> @@ -611,7 +630,10 @@ amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>                         continue;
>                 ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>                 if (ret <= 0) {
> -                       DRM_ERROR("Timed out waiting for fence f=%p\n", f);
> +                       file = uq_mgr->file;
> +                       drm_process_info(file, proc_log, sizeof(proc_log));
> +                       DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
> +                                 f, proc_log);
>                         return -ETIMEDOUT;
>                 }
>         }
> @@ -624,19 +646,26 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr,
>                          struct amdgpu_eviction_fence *ev_fence)
>  {
>         int ret;
> +       struct drm_file *file;
> +       char proc_log[50];
>         struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>         struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
>
>         /* Wait for any pending userqueue fence work to finish */
>         ret = amdgpu_userqueue_wait_for_signal(uq_mgr);
>         if (ret) {
> -               DRM_ERROR("Not suspending userqueue, timeout waiting for work\n");
> +               file = uq_mgr->file;
> +               drm_process_info(file, proc_log, sizeof(proc_log));
> +               DRM_ERROR("Not suspending userqueue, timeout waiting for %s\n",
> +                         proc_log);
>                 return;
>         }
>
>         ret = amdgpu_userqueue_suspend_all(uq_mgr);
>         if (ret) {
> -               DRM_ERROR("Failed to evict userqueue\n");
> +               file = uq_mgr->file;
> +               drm_process_info(file, proc_log, sizeof(proc_log));
> +               DRM_ERROR("Failed to evict userqueue for %s\n", proc_log);
>                 return;
>         }
>
> --
> 2.34.1
>
Khatri, Sunil April 11, 2025, 4:01 p.m. UTC | #2
On 4/11/2025 7:54 PM, Alex Deucher wrote:
> On Fri, Apr 11, 2025 at 9:05 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>> add process and pid information in the userqueue error
>> logging to make it more useful in resolving the error
>> by logs.
>>
>> Sample log:
>> [   42.444297] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=000000001c74d978 for comm:Xwayland pid:3427
>> [   42.444669] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:Xwayland pid:3427
>> [   42.824729] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=0000000074407d3e for comm:systemd-logind pid:1058
>> [   42.825082] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:systemd-logind pid:1058
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 45 +++++++++++++++----
>>   1 file changed, 37 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> index ecd49cf15b2a..5b58c41618ee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> @@ -62,12 +62,17 @@ amdgpu_userqueue_cleanup(struct amdgpu_userq_mgr *uq_mgr,
>>          struct amdgpu_device *adev = uq_mgr->adev;
>>          const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
>>          struct dma_fence *f = queue->last_fence;
>> +       struct drm_file *file;
>> +       char proc_log[50];
>>          int ret;
>>
>>          if (f && !dma_fence_is_signaled(f)) {
>>                  ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>>                  if (ret <= 0) {
>> -                       DRM_ERROR("Timed out waiting for fence f=%p\n", f);
>> +                       file = uq_mgr->file;
>> +                       drm_process_info(file, proc_log, sizeof(proc_log));
>> +                       DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
>> +                                 f, proc_log);
> Use drm_err() here and below so we get proper handling of multiple devices.
>
> Alex
Sure, Alex. Once I have the main drm patch reviewed, I will update these too.
Sunil
>
>>                          return;
>>                  }
>>          }
>> @@ -427,6 +432,8 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
>>          const struct amdgpu_userq_funcs *userq_funcs;
>>          struct amdgpu_usermode_queue *queue;
>>          int queue_id;
>> +       struct drm_file *file;
>> +       char proc_log[50];
>>          int ret = 0;
>>
>>          /* Resume all the queues for this process */
>> @@ -435,8 +442,12 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
>>                  ret = userq_funcs->resume(uq_mgr, queue);
>>          }
>>
>> -       if (ret)
>> -               DRM_ERROR("Failed to resume all the queue\n");
>> +       if (ret) {
>> +               file = uq_mgr->file;
>> +               drm_process_info(file, proc_log, sizeof(proc_log));
>> +               DRM_ERROR("Failed to resume all the queue for %s\n",
>> +                         proc_log);
>> +               }
>>          return ret;
>>   }
>>
>> @@ -585,6 +596,8 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>>          const struct amdgpu_userq_funcs *userq_funcs;
>>          struct amdgpu_usermode_queue *queue;
>>          int queue_id;
>> +       struct drm_file *file;
>> +       char proc_log[50];
>>          int ret = 0;
>>
>>          /* Try to suspend all the queues in this process ctx */
>> @@ -593,8 +606,12 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>>                  ret += userq_funcs->suspend(uq_mgr, queue);
>>          }
>>
>> -       if (ret)
>> -               DRM_ERROR("Couldn't suspend all the queues\n");
>> +       if (ret) {
>> +               file = uq_mgr->file;
>> +               drm_process_info(file, proc_log, sizeof(proc_log));
>> +               DRM_ERROR("Couldn't suspend all the queues for %s\n",
>> +                         proc_log);
>> +               }
>>          return ret;
>>   }
>>
>> @@ -602,6 +619,8 @@ static int
>>   amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>>   {
>>          struct amdgpu_usermode_queue *queue;
>> +       struct drm_file *file;
>> +       char proc_log[50];
>>          int queue_id, ret;
>>
>>          idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
>> @@ -611,7 +630,10 @@ amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>>                          continue;
>>                  ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>>                  if (ret <= 0) {
>> -                       DRM_ERROR("Timed out waiting for fence f=%p\n", f);
>> +                       file = uq_mgr->file;
>> +                       drm_process_info(file, proc_log, sizeof(proc_log));
>> +                       DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
>> +                                 f, proc_log);
>>                          return -ETIMEDOUT;
>>                  }
>>          }
>> @@ -624,19 +646,26 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr,
>>                           struct amdgpu_eviction_fence *ev_fence)
>>   {
>>          int ret;
>> +       struct drm_file *file;
>> +       char proc_log[50];
>>          struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>>          struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
>>
>>          /* Wait for any pending userqueue fence work to finish */
>>          ret = amdgpu_userqueue_wait_for_signal(uq_mgr);
>>          if (ret) {
>> -               DRM_ERROR("Not suspending userqueue, timeout waiting for work\n");
>> +               file = uq_mgr->file;
>> +               drm_process_info(file, proc_log, sizeof(proc_log));
>> +               DRM_ERROR("Not suspending userqueue, timeout waiting for %s\n",
>> +                         proc_log);
>>                  return;
>>          }
>>
>>          ret = amdgpu_userqueue_suspend_all(uq_mgr);
>>          if (ret) {
>> -               DRM_ERROR("Failed to evict userqueue\n");
>> +               file = uq_mgr->file;
>> +               drm_process_info(file, proc_log, sizeof(proc_log));
>> +               DRM_ERROR("Failed to evict userqueue for %s\n", proc_log);
>>                  return;
>>          }
>>
>> --
>> 2.34.1
>>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
index ecd49cf15b2a..5b58c41618ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
@@ -62,12 +62,17 @@  amdgpu_userqueue_cleanup(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
 	struct dma_fence *f = queue->last_fence;
+	struct drm_file *file;
+	char proc_log[50];
 	int ret;
 
 	if (f && !dma_fence_is_signaled(f)) {
 		ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
 		if (ret <= 0) {
-			DRM_ERROR("Timed out waiting for fence f=%p\n", f);
+			file = uq_mgr->file;
+			drm_process_info(file, proc_log, sizeof(proc_log));
+			DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
+				  f, proc_log);
 			return;
 		}
 	}
@@ -427,6 +432,8 @@  amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
 	const struct amdgpu_userq_funcs *userq_funcs;
 	struct amdgpu_usermode_queue *queue;
 	int queue_id;
+	struct drm_file *file;
+	char proc_log[50];
 	int ret = 0;
 
 	/* Resume all the queues for this process */
@@ -435,8 +442,12 @@  amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
 		ret = userq_funcs->resume(uq_mgr, queue);
 	}
 
-	if (ret)
-		DRM_ERROR("Failed to resume all the queue\n");
+	if (ret) {
+		file = uq_mgr->file;
+		drm_process_info(file, proc_log, sizeof(proc_log));
+		DRM_ERROR("Failed to resume all the queue for %s\n",
+			  proc_log);
+		}
 	return ret;
 }
 
@@ -585,6 +596,8 @@  amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
 	const struct amdgpu_userq_funcs *userq_funcs;
 	struct amdgpu_usermode_queue *queue;
 	int queue_id;
+	struct drm_file *file;
+	char proc_log[50];
 	int ret = 0;
 
 	/* Try to suspend all the queues in this process ctx */
@@ -593,8 +606,12 @@  amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
 		ret += userq_funcs->suspend(uq_mgr, queue);
 	}
 
-	if (ret)
-		DRM_ERROR("Couldn't suspend all the queues\n");
+	if (ret) {
+		file = uq_mgr->file;
+		drm_process_info(file, proc_log, sizeof(proc_log));
+		DRM_ERROR("Couldn't suspend all the queues for %s\n",
+			  proc_log);
+		}
 	return ret;
 }
 
@@ -602,6 +619,8 @@  static int
 amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 {
 	struct amdgpu_usermode_queue *queue;
+	struct drm_file *file;
+	char proc_log[50];
 	int queue_id, ret;
 
 	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
@@ -611,7 +630,10 @@  amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 			continue;
 		ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
 		if (ret <= 0) {
-			DRM_ERROR("Timed out waiting for fence f=%p\n", f);
+			file = uq_mgr->file;
+			drm_process_info(file, proc_log, sizeof(proc_log));
+			DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
+				  f, proc_log);
 			return -ETIMEDOUT;
 		}
 	}
@@ -624,19 +646,26 @@  amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr,
 			 struct amdgpu_eviction_fence *ev_fence)
 {
 	int ret;
+	struct drm_file *file;
+	char proc_log[50];
 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
 	struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
 
 	/* Wait for any pending userqueue fence work to finish */
 	ret = amdgpu_userqueue_wait_for_signal(uq_mgr);
 	if (ret) {
-		DRM_ERROR("Not suspending userqueue, timeout waiting for work\n");
+		file = uq_mgr->file;
+		drm_process_info(file, proc_log, sizeof(proc_log));
+		DRM_ERROR("Not suspending userqueue, timeout waiting for %s\n",
+			  proc_log);
 		return;
 	}
 
 	ret = amdgpu_userqueue_suspend_all(uq_mgr);
 	if (ret) {
-		DRM_ERROR("Failed to evict userqueue\n");
+		file = uq_mgr->file;
+		drm_process_info(file, proc_log, sizeof(proc_log));
+		DRM_ERROR("Failed to evict userqueue for %s\n", proc_log);
 		return;
 	}