Message ID | 20250411130428.4104957-3-sunil.khatri@amd.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [v1,1/3] drm: function to get process name and pid | expand |
On Fri, Apr 11, 2025 at 9:05 AM Sunil Khatri <sunil.khatri@amd.com> wrote: > > add process and pid information in the userqueue error > logging to make it more useful in resolving the error > by logs. > > Sample log: > [ 42.444297] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=000000001c74d978 for comm:Xwayland pid:3427 > [ 42.444669] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:Xwayland pid:3427 > [ 42.824729] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=0000000074407d3e for comm:systemd-logind pid:1058 > [ 42.825082] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:systemd-logind pid:1058 > > Signed-off-by: Sunil Khatri <sunil.khatri@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 45 +++++++++++++++---- > 1 file changed, 37 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c > index ecd49cf15b2a..5b58c41618ee 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c > @@ -62,12 +62,17 @@ amdgpu_userqueue_cleanup(struct amdgpu_userq_mgr *uq_mgr, > struct amdgpu_device *adev = uq_mgr->adev; > const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type]; > struct dma_fence *f = queue->last_fence; > + struct drm_file *file; > + char proc_log[50]; > int ret; > > if (f && !dma_fence_is_signaled(f)) { > ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); > if (ret <= 0) { > - DRM_ERROR("Timed out waiting for fence f=%p\n", f); > + file = uq_mgr->file; > + drm_process_info(file, proc_log, sizeof(proc_log)); > + DRM_ERROR("Timed out waiting for fence f=%p for %s\n", > + f, proc_log); use drm_err() here and below so we get proper handling of multiple devices. 
Alex > return; > } > } > @@ -427,6 +432,8 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr) > const struct amdgpu_userq_funcs *userq_funcs; > struct amdgpu_usermode_queue *queue; > int queue_id; > + struct drm_file *file; > + char proc_log[50]; > int ret = 0; > > /* Resume all the queues for this process */ > @@ -435,8 +442,12 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr) > ret = userq_funcs->resume(uq_mgr, queue); > } > > - if (ret) > - DRM_ERROR("Failed to resume all the queue\n"); > + if (ret) { > + file = uq_mgr->file; > + drm_process_info(file, proc_log, sizeof(proc_log)); > + DRM_ERROR("Failed to resume all the queue for %s\n", > + proc_log); > + } > return ret; > } > > @@ -585,6 +596,8 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr) > const struct amdgpu_userq_funcs *userq_funcs; > struct amdgpu_usermode_queue *queue; > int queue_id; > + struct drm_file *file; > + char proc_log[50]; > int ret = 0; > > /* Try to suspend all the queues in this process ctx */ > @@ -593,8 +606,12 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr) > ret += userq_funcs->suspend(uq_mgr, queue); > } > > - if (ret) > - DRM_ERROR("Couldn't suspend all the queues\n"); > + if (ret) { > + file = uq_mgr->file; > + drm_process_info(file, proc_log, sizeof(proc_log)); > + DRM_ERROR("Couldn't suspend all the queues for %s\n", > + proc_log); > + } > return ret; > } > > @@ -602,6 +619,8 @@ static int > amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) > { > struct amdgpu_usermode_queue *queue; > + struct drm_file *file; > + char proc_log[50]; > int queue_id, ret; > > idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) { > @@ -611,7 +630,10 @@ amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) > continue; > ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); > if (ret <= 0) { > - DRM_ERROR("Timed out waiting for fence f=%p\n", f); > + file = uq_mgr->file; > + drm_process_info(file, 
proc_log, sizeof(proc_log)); > + DRM_ERROR("Timed out waiting for fence f=%p for %s\n", > + f, proc_log); > return -ETIMEDOUT; > } > } > @@ -624,19 +646,26 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr, > struct amdgpu_eviction_fence *ev_fence) > { > int ret; > + struct drm_file *file; > + char proc_log[50]; > struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); > struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; > > /* Wait for any pending userqueue fence work to finish */ > ret = amdgpu_userqueue_wait_for_signal(uq_mgr); > if (ret) { > - DRM_ERROR("Not suspending userqueue, timeout waiting for work\n"); > + file = uq_mgr->file; > + drm_process_info(file, proc_log, sizeof(proc_log)); > + DRM_ERROR("Not suspending userqueue, timeout waiting for %s\n", > + proc_log); > return; > } > > ret = amdgpu_userqueue_suspend_all(uq_mgr); > if (ret) { > - DRM_ERROR("Failed to evict userqueue\n"); > + file = uq_mgr->file; > + drm_process_info(file, proc_log, sizeof(proc_log)); > + DRM_ERROR("Failed to evict userqueue for %s\n", proc_log); > return; > } > > -- > 2.34.1 >
On 4/11/2025 7:54 PM, Alex Deucher wrote: > On Fri, Apr 11, 2025 at 9:05 AM Sunil Khatri <sunil.khatri@amd.com> wrote: >> add process and pid information in the userqueue error >> logging to make it more useful in resolving the error >> by logs. >> >> Sample log: >> [ 42.444297] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=000000001c74d978 for comm:Xwayland pid:3427 >> [ 42.444669] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:Xwayland pid:3427 >> [ 42.824729] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=0000000074407d3e for comm:systemd-logind pid:1058 >> [ 42.825082] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:systemd-logind pid:1058 >> >> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 45 +++++++++++++++---- >> 1 file changed, 37 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c >> index ecd49cf15b2a..5b58c41618ee 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c >> @@ -62,12 +62,17 @@ amdgpu_userqueue_cleanup(struct amdgpu_userq_mgr *uq_mgr, >> struct amdgpu_device *adev = uq_mgr->adev; >> const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type]; >> struct dma_fence *f = queue->last_fence; >> + struct drm_file *file; >> + char proc_log[50]; >> int ret; >> >> if (f && !dma_fence_is_signaled(f)) { >> ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); >> if (ret <= 0) { >> - DRM_ERROR("Timed out waiting for fence f=%p\n", f); >> + file = uq_mgr->file; >> + drm_process_info(file, proc_log, sizeof(proc_log)); >> + DRM_ERROR("Timed out waiting for fence f=%p for %s\n", >> + f, proc_log); > user drm_err() here and below so we get 
proper handling of multiple devices. > > Alex Sure, Alex. Once I have the main drm patch reviewed, I will update these too. Sunil > >> return; >> } >> } >> @@ -427,6 +432,8 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr) >> const struct amdgpu_userq_funcs *userq_funcs; >> struct amdgpu_usermode_queue *queue; >> int queue_id; >> + struct drm_file *file; >> + char proc_log[50]; >> int ret = 0; >> >> /* Resume all the queues for this process */ >> @@ -435,8 +442,12 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr) >> ret = userq_funcs->resume(uq_mgr, queue); >> } >> >> - if (ret) >> - DRM_ERROR("Failed to resume all the queue\n"); >> + if (ret) { >> + file = uq_mgr->file; >> + drm_process_info(file, proc_log, sizeof(proc_log)); >> + DRM_ERROR("Failed to resume all the queue for %s\n", >> + proc_log); >> + } >> return ret; >> } >> >> @@ -585,6 +596,8 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr) >> const struct amdgpu_userq_funcs *userq_funcs; >> struct amdgpu_usermode_queue *queue; >> int queue_id; >> + struct drm_file *file; >> + char proc_log[50]; >> int ret = 0; >> >> /* Try to suspend all the queues in this process ctx */ >> @@ -593,8 +606,12 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr) >> ret += userq_funcs->suspend(uq_mgr, queue); >> } >> >> - if (ret) >> - DRM_ERROR("Couldn't suspend all the queues\n"); >> + if (ret) { >> + file = uq_mgr->file; >> + drm_process_info(file, proc_log, sizeof(proc_log)); >> + DRM_ERROR("Couldn't suspend all the queues for %s\n", >> + proc_log); >> + } >> return ret; >> } >> >> @@ -602,6 +619,8 @@ static int >> amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) >> { >> struct amdgpu_usermode_queue *queue; >> + struct drm_file *file; >> + char proc_log[50]; >> int queue_id, ret; >> >> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) { >> @@ -611,7 +630,10 @@ amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) >> continue; >> ret = 
dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); >> if (ret <= 0) { >> - DRM_ERROR("Timed out waiting for fence f=%p\n", f); >> + file = uq_mgr->file; >> + drm_process_info(file, proc_log, sizeof(proc_log)); >> + DRM_ERROR("Timed out waiting for fence f=%p for %s\n", >> + f, proc_log); >> return -ETIMEDOUT; >> } >> } >> @@ -624,19 +646,26 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr, >> struct amdgpu_eviction_fence *ev_fence) >> { >> int ret; >> + struct drm_file *file; >> + char proc_log[50]; >> struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); >> struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; >> >> /* Wait for any pending userqueue fence work to finish */ >> ret = amdgpu_userqueue_wait_for_signal(uq_mgr); >> if (ret) { >> - DRM_ERROR("Not suspending userqueue, timeout waiting for work\n"); >> + file = uq_mgr->file; >> + drm_process_info(file, proc_log, sizeof(proc_log)); >> + DRM_ERROR("Not suspending userqueue, timeout waiting for %s\n", >> + proc_log); >> return; >> } >> >> ret = amdgpu_userqueue_suspend_all(uq_mgr); >> if (ret) { >> - DRM_ERROR("Failed to evict userqueue\n"); >> + file = uq_mgr->file; >> + drm_process_info(file, proc_log, sizeof(proc_log)); >> + DRM_ERROR("Failed to evict userqueue for %s\n", proc_log); >> return; >> } >> >> -- >> 2.34.1 >>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c index ecd49cf15b2a..5b58c41618ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c @@ -62,12 +62,17 @@ amdgpu_userqueue_cleanup(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type]; struct dma_fence *f = queue->last_fence; + struct drm_file *file; + char proc_log[50]; int ret; if (f && !dma_fence_is_signaled(f)) { ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); if (ret <= 0) { - DRM_ERROR("Timed out waiting for fence f=%p\n", f); + file = uq_mgr->file; + drm_process_info(file, proc_log, sizeof(proc_log)); + DRM_ERROR("Timed out waiting for fence f=%p for %s\n", + f, proc_log); return; } } @@ -427,6 +432,8 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr) const struct amdgpu_userq_funcs *userq_funcs; struct amdgpu_usermode_queue *queue; int queue_id; + struct drm_file *file; + char proc_log[50]; int ret = 0; /* Resume all the queues for this process */ @@ -435,8 +442,12 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr) ret = userq_funcs->resume(uq_mgr, queue); } - if (ret) - DRM_ERROR("Failed to resume all the queue\n"); + if (ret) { + file = uq_mgr->file; + drm_process_info(file, proc_log, sizeof(proc_log)); + DRM_ERROR("Failed to resume all the queue for %s\n", + proc_log); + } return ret; } @@ -585,6 +596,8 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr) const struct amdgpu_userq_funcs *userq_funcs; struct amdgpu_usermode_queue *queue; int queue_id; + struct drm_file *file; + char proc_log[50]; int ret = 0; /* Try to suspend all the queues in this process ctx */ @@ -593,8 +606,12 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr) ret += userq_funcs->suspend(uq_mgr, queue); } - if (ret) - DRM_ERROR("Couldn't suspend all the 
queues\n"); + if (ret) { + file = uq_mgr->file; + drm_process_info(file, proc_log, sizeof(proc_log)); + DRM_ERROR("Couldn't suspend all the queues for %s\n", + proc_log); + } return ret; } @@ -602,6 +619,8 @@ static int amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) { struct amdgpu_usermode_queue *queue; + struct drm_file *file; + char proc_log[50]; int queue_id, ret; idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) { @@ -611,7 +630,10 @@ amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) continue; ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); if (ret <= 0) { - DRM_ERROR("Timed out waiting for fence f=%p\n", f); + file = uq_mgr->file; + drm_process_info(file, proc_log, sizeof(proc_log)); + DRM_ERROR("Timed out waiting for fence f=%p for %s\n", + f, proc_log); return -ETIMEDOUT; } } @@ -624,19 +646,26 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_eviction_fence *ev_fence) { int ret; + struct drm_file *file; + char proc_log[50]; struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; /* Wait for any pending userqueue fence work to finish */ ret = amdgpu_userqueue_wait_for_signal(uq_mgr); if (ret) { - DRM_ERROR("Not suspending userqueue, timeout waiting for work\n"); + file = uq_mgr->file; + drm_process_info(file, proc_log, sizeof(proc_log)); + DRM_ERROR("Not suspending userqueue, timeout waiting for %s\n", + proc_log); return; } ret = amdgpu_userqueue_suspend_all(uq_mgr); if (ret) { - DRM_ERROR("Failed to evict userqueue\n"); + file = uq_mgr->file; + drm_process_info(file, proc_log, sizeof(proc_log)); + DRM_ERROR("Failed to evict userqueue for %s\n", proc_log); return; }
Add the process name and PID to the userqueue error logging so that the logs are more useful for resolving errors. Sample log: [ 42.444297] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=000000001c74d978 for comm:Xwayland pid:3427 [ 42.444669] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:Xwayland pid:3427 [ 42.824729] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=0000000074407d3e for comm:systemd-logind pid:1058 [ 42.825082] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:systemd-logind pid:1058 Signed-off-by: Sunil Khatri <sunil.khatri@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-)