Message ID | 20250131110328.706695-6-pierre-eric.pelloux-prayer@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Improve gpu_scheduler trace events + uAPI | expand |
On 31/01/2025 11:03, Pierre-Eric Pelloux-Prayer wrote: > Trace the fence dependencies similarly to how we print fences: > > ... , dependencies:{fence=606:38006} > > This allows tools to analyze the dependencies between the jobs (previously > it was only possible for fences traced by drm_sched_job_wait_dep). > > Since drm_sched_job and drm_run_job use the same base event class, > the caller of trace_drm_run_job have to pass a dep_count of 0 (which > is ignored because dependencies are only considered at submit time). > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> > --- > .../gpu/drm/scheduler/gpu_scheduler_trace.h | 59 ++++++++++++++++--- > drivers/gpu/drm/scheduler/sched_entity.c | 8 ++- > drivers/gpu/drm/scheduler/sched_main.c | 2 +- > 3 files changed, 58 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h > index 3cdd8d8f8021..ca19cd9a146a 100644 > --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h > +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h > @@ -26,15 +26,41 @@ > > #include <linux/stringify.h> > #include <linux/types.h> > +#include <linux/trace_seq.h> > #include <linux/tracepoint.h> > > #undef TRACE_SYSTEM > #define TRACE_SYSTEM gpu_scheduler > #define TRACE_INCLUDE_FILE gpu_scheduler_trace > > +#ifndef __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN > +#define __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN > +/* Similar to trace_print_array_seq but for fences. 
*/ > +static inline const char *__print_dma_fence_array(struct trace_seq *p, const void *buf, int count) > +{ > + const char *ret = trace_seq_buffer_ptr(p); > + u64 *fences = (u64 *) buf; > + const char *prefix = ""; > + > + trace_seq_putc(p, '{'); > + for (int i = 0; i < count; i++) { > + u64 context = fences[2 * i], seqno = fences[2 * i + 1]; > + > + trace_seq_printf(p, "%sfence=%llu:%llu", > + prefix, context, seqno); > + prefix = ","; > + } > + trace_seq_putc(p, '}'); > + trace_seq_putc(p, 0); > + > + return ret; > +} > +#endif > + > DECLARE_EVENT_CLASS(drm_sched_job, > - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), > - TP_ARGS(sched_job, entity), > + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity, > + unsigned int dep_count), > + TP_ARGS(sched_job, entity, dep_count), > TP_STRUCT__entry( > __field(uint64_t, id) > __string(name, sched_job->sched->name) > @@ -43,9 +69,14 @@ DECLARE_EVENT_CLASS(drm_sched_job, > __string(dev, dev_name(sched_job->sched->dev)) > __field(uint64_t, fence_context) > __field(uint64_t, fence_seqno) > + __field(int, n_deps) > + __dynamic_array(u64, deps, dep_count * 2) > ), > > TP_fast_assign( > + unsigned long idx; > + struct dma_fence *fence; > + u64 *dyn_arr; > __entry->id = sched_job->id; > __assign_str(name); > __entry->job_count = spsc_queue_count(&entity->job_queue); > @@ -54,22 +85,32 @@ DECLARE_EVENT_CLASS(drm_sched_job, > __assign_str(dev); > __entry->fence_context = sched_job->s_fence->finished.context; > __entry->fence_seqno = sched_job->s_fence->finished.seqno; > - > + __entry->n_deps = dep_count; > + if (dep_count) { > + dyn_arr = __get_dynamic_array(deps); > + xa_for_each(&sched_job->dependencies, idx, fence) { > + dyn_arr[2 * idx] = fence->context; > + dyn_arr[2 * idx + 1] = fence->seqno; > + } > + } > ), > - TP_printk("dev=%s, id=%llu, fence=%llu:%llu, ring=%s, job count:%u, hw job count:%d", > + TP_printk("dev=%s, id=%llu, fence=%llu:%llu, ring=%s, job count:%u, 
hw job count:%d, dependencies:%s", > __get_str(dev), __entry->id, > __entry->fence_context, __entry->fence_seqno, __get_str(name), > - __entry->job_count, __entry->hw_job_count) > + __entry->job_count, __entry->hw_job_count, > + __print_dma_fence_array(p, __get_dynamic_array(deps), __entry->n_deps)) > ); > > DEFINE_EVENT(drm_sched_job, drm_sched_job, > - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), > - TP_ARGS(sched_job, entity) > + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity, > + unsigned int dep_count), > + TP_ARGS(sched_job, entity, dep_count) > ); > > DEFINE_EVENT(drm_sched_job, drm_run_job, > - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), > - TP_ARGS(sched_job, entity) > + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity, > + unsigned int dep_count), > + TP_ARGS(sched_job, entity, 0) > ); > > TRACE_EVENT(drm_sched_process_job, > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c > index 69bcf0e99d57..0ce3a82fe6fd 100644 > --- a/drivers/gpu/drm/scheduler/sched_entity.c > +++ b/drivers/gpu/drm/scheduler/sched_entity.c > @@ -592,7 +592,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) > bool first; > ktime_t submit_ts; > > - trace_drm_sched_job(sched_job, entity); > + if (trace_drm_sched_job_enabled()) { > + unsigned long index, n = 0; > + void *f; > + > + xa_for_each(&sched_job->dependencies, index, f) { n++; } > + trace_drm_sched_job(sched_job, entity, n); > + } I wonder if it would be good enough to simply trace dependencies one by one in drm_sched_job_add_dependency and drop all the array magic/support code? In which case that was the drm_sched_job_dependency tracepoint I hinted in my reply to 4/7. I know you are at v7 but as we are going towards the stable ABI route I am thinking it is worth discussing this option. 
We could even leave the separate "wait dep" vs "add dep" and have a set like: drm_sched_job_add_dependency drm_sched_job_queue drm_sched_job_wait_dependency drm_sched_job_run drm_sched_job_done By tracing them one by one we remove the complications from the kernel side and let the userspace parse multiple lines. Shouldn't be a problem there. Regards, Tvrtko > atomic_inc(entity->rq->sched->score); > WRITE_ONCE(entity->last_user, current->group_leader); > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c > index ad306d3d7282..c645f07ebe26 100644 > --- a/drivers/gpu/drm/scheduler/sched_main.c > +++ b/drivers/gpu/drm/scheduler/sched_main.c > @@ -1214,7 +1214,7 @@ static void drm_sched_run_job_work(struct work_struct *w) > atomic_add(sched_job->credits, &sched->credit_count); > drm_sched_job_begin(sched_job); > > - trace_drm_run_job(sched_job, entity); > + trace_drm_run_job(sched_job, entity, 0); > fence = sched->ops->run_job(sched_job); > complete_all(&entity->entity_idle); > drm_sched_fence_scheduled(s_fence, fence);
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h index 3cdd8d8f8021..ca19cd9a146a 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h @@ -26,15 +26,41 @@ #include <linux/stringify.h> #include <linux/types.h> +#include <linux/trace_seq.h> #include <linux/tracepoint.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM gpu_scheduler #define TRACE_INCLUDE_FILE gpu_scheduler_trace +#ifndef __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN +#define __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN +/* Similar to trace_print_array_seq but for fences. */ +static inline const char *__print_dma_fence_array(struct trace_seq *p, const void *buf, int count) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 *fences = (u64 *) buf; + const char *prefix = ""; + + trace_seq_putc(p, '{'); + for (int i = 0; i < count; i++) { + u64 context = fences[2 * i], seqno = fences[2 * i + 1]; + + trace_seq_printf(p, "%sfence=%llu:%llu", + prefix, context, seqno); + prefix = ","; + } + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +#endif + DECLARE_EVENT_CLASS(drm_sched_job, - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), - TP_ARGS(sched_job, entity), + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity, + unsigned int dep_count), + TP_ARGS(sched_job, entity, dep_count), TP_STRUCT__entry( __field(uint64_t, id) __string(name, sched_job->sched->name) @@ -43,9 +69,14 @@ DECLARE_EVENT_CLASS(drm_sched_job, __string(dev, dev_name(sched_job->sched->dev)) __field(uint64_t, fence_context) __field(uint64_t, fence_seqno) + __field(int, n_deps) + __dynamic_array(u64, deps, dep_count * 2) ), TP_fast_assign( + unsigned long idx; + struct dma_fence *fence; + u64 *dyn_arr; __entry->id = sched_job->id; __assign_str(name); __entry->job_count = spsc_queue_count(&entity->job_queue); @@ -54,22 +85,32 @@ DECLARE_EVENT_CLASS(drm_sched_job, 
__assign_str(dev); __entry->fence_context = sched_job->s_fence->finished.context; __entry->fence_seqno = sched_job->s_fence->finished.seqno; - + __entry->n_deps = dep_count; + if (dep_count) { + dyn_arr = __get_dynamic_array(deps); + xa_for_each(&sched_job->dependencies, idx, fence) { + dyn_arr[2 * idx] = fence->context; + dyn_arr[2 * idx + 1] = fence->seqno; + } + } ), - TP_printk("dev=%s, id=%llu, fence=%llu:%llu, ring=%s, job count:%u, hw job count:%d", + TP_printk("dev=%s, id=%llu, fence=%llu:%llu, ring=%s, job count:%u, hw job count:%d, dependencies:%s", __get_str(dev), __entry->id, __entry->fence_context, __entry->fence_seqno, __get_str(name), - __entry->job_count, __entry->hw_job_count) + __entry->job_count, __entry->hw_job_count, + __print_dma_fence_array(p, __get_dynamic_array(deps), __entry->n_deps)) ); DEFINE_EVENT(drm_sched_job, drm_sched_job, - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), - TP_ARGS(sched_job, entity) + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity, + unsigned int dep_count), + TP_ARGS(sched_job, entity, dep_count) ); DEFINE_EVENT(drm_sched_job, drm_run_job, - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), - TP_ARGS(sched_job, entity) + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity, + unsigned int dep_count), + TP_ARGS(sched_job, entity, 0) ); TRACE_EVENT(drm_sched_process_job, diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 69bcf0e99d57..0ce3a82fe6fd 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -592,7 +592,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) bool first; ktime_t submit_ts; - trace_drm_sched_job(sched_job, entity); + if (trace_drm_sched_job_enabled()) { + unsigned long index, n = 0; + void *f; + + xa_for_each(&sched_job->dependencies, index, f) { n++; } + 
trace_drm_sched_job(sched_job, entity, n); + } atomic_inc(entity->rq->sched->score); WRITE_ONCE(entity->last_user, current->group_leader); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index ad306d3d7282..c645f07ebe26 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1214,7 +1214,7 @@ static void drm_sched_run_job_work(struct work_struct *w) atomic_add(sched_job->credits, &sched->credit_count); drm_sched_job_begin(sched_job); - trace_drm_run_job(sched_job, entity); + trace_drm_run_job(sched_job, entity, 0); fence = sched->ops->run_job(sched_job); complete_all(&entity->entity_idle); drm_sched_fence_scheduled(s_fence, fence);
Trace the fence dependencies similarly to how we print fences: ... , dependencies:{fence=606:38006} This allows tools to analyze the dependencies between the jobs (previously it was only possible for fences traced by drm_sched_job_wait_dep). Since drm_sched_job and drm_run_job use the same base event class, the caller of trace_drm_run_job has to pass a dep_count of 0 (which is ignored because dependencies are only considered at submit time). Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> --- .../gpu/drm/scheduler/gpu_scheduler_trace.h | 59 ++++++++++++++++--- drivers/gpu/drm/scheduler/sched_entity.c | 8 ++- drivers/gpu/drm/scheduler/sched_main.c | 2 +- 3 files changed, 58 insertions(+), 11 deletions(-)