Message ID | 20240614081657.408397-4-pierre-eric.pelloux-prayer@amd.com (mailing list archive)
---|---
State | New, archived
Series | Improve gpu_scheduler trace events + uAPI
On Friday, 2024-06-14 at 10:16 +0200, Pierre-Eric Pelloux-Prayer wrote:
> Trace the fence dependencies similarly to how we print fences:
>
> ... , dependencies:{(context:606, seqno:38006)}
>
> This allows tools to analyze the dependencies between the jobs (previously
> it was only possible for fences traced by drm_sched_job_wait_dep).
>
> Since drm_sched_job and drm_run_job use the same base event class,
> the caller of trace_drm_run_job has to pass a dep_count of 0 (which
> is ignored because dependencies are only considered at submit time).
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
> ---
>  .../gpu/drm/scheduler/gpu_scheduler_trace.h | 59 ++++++++++++++++---
>  drivers/gpu/drm/scheduler/sched_entity.c    |  8 ++-
>  drivers/gpu/drm/scheduler/sched_main.c      |  2 +-
>  3 files changed, 58 insertions(+), 11 deletions(-)
>
> [...]
>
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 58c8161289fe..aa33a9d29b14 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -583,7 +583,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>  	bool first;
>  	ktime_t submit_ts;
>
> -	trace_drm_sched_job(sched_job, entity);
> +	if (trace_drm_sched_job_enabled()) {
> +		unsigned long index;
> +		void *f;
> +
> +		xa_for_each(&sched_job->dependencies, index, f) { }

I did need to do a double take on that one. While using index for the
total number of dependencies is correct for the dense and at this point
in time fully populated dependencies array, it is a bit non-obvious.
Maybe add a small comment here or have an actual counter in the loop
body?

Regards,
Lucas

> +		trace_drm_sched_job(sched_job, entity, index);
> +	}
>
> [...]
On 03/07/2024 at 13:05, Lucas Stach wrote:
> On Friday, 2024-06-14 at 10:16 +0200, Pierre-Eric Pelloux-Prayer wrote:
>> [...]
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 58c8161289fe..aa33a9d29b14 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -583,7 +583,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>  	bool first;
>>  	ktime_t submit_ts;
>>
>> -	trace_drm_sched_job(sched_job, entity);
>> +	if (trace_drm_sched_job_enabled()) {
>> +		unsigned long index;
>> +		void *f;
>> +
>> +		xa_for_each(&sched_job->dependencies, index, f) { }
>
> I did need to do a double take on that one. While using index for the
> total number of dependencies is correct for the dense and at this point
> in time fully populated dependencies array, it is a bit non-obvious.
> Maybe add a small comment here or have an actual counter in the loop
> body?

Makes sense. I'll add a counter in v6.

Thanks,
Pierre-Eric

> Regards,
> Lucas
>
>> +		trace_drm_sched_job(sched_job, entity, index);
>> +	}
>>
>> [...]
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
index f6da377cdb99..0abcad26839c 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
@@ -26,15 +26,41 @@
 
 #include <linux/stringify.h>
 #include <linux/types.h>
+#include <linux/trace_seq.h>
 #include <linux/tracepoint.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM gpu_scheduler
 #define TRACE_INCLUDE_FILE gpu_scheduler_trace
 
+#ifndef __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN
+#define __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN
+/* Similar to trace_print_array_seq but for fences. */
+static inline const char *__print_dma_fence_array(struct trace_seq *p, const void *buf, int count)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 *fences = (u64 *) buf;
+	const char *prefix = "";
+
+	trace_seq_putc(p, '{');
+	for (int i = 0; i < count; i++) {
+		u64 context = fences[2 * i], seqno = fences[2 * i + 1];
+
+		trace_seq_printf(p, "%s(context:%llu, seqno:%lld)",
+				 prefix, context, seqno);
+		prefix = ",";
+	}
+	trace_seq_putc(p, '}');
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+#endif
+
 DECLARE_EVENT_CLASS(drm_sched_job,
-	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity),
-	    TP_ARGS(sched_job, entity),
+	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity,
+		     unsigned int dep_count),
+	    TP_ARGS(sched_job, entity, dep_count),
 	    TP_STRUCT__entry(
 			     __string(name, sched_job->sched->name)
 			     __field(uint64_t, id)
@@ -43,9 +69,14 @@ DECLARE_EVENT_CLASS(drm_sched_job,
 			     __string(dev, dev_name(sched_job->sched->dev))
 			     __field(uint64_t, fence_context)
 			     __field(uint64_t, fence_seqno)
+			     __field(int, n_deps)
+			     __dynamic_array(u64, deps, dep_count * 2)
 			     ),
 
 	    TP_fast_assign(
+			   unsigned long idx;
+			   struct dma_fence *fence;
+			   u64 *dyn_arr;
 			   __entry->id = sched_job->id;
 			   __assign_str(name);
 			   __entry->job_count = spsc_queue_count(&entity->job_queue);
@@ -54,22 +85,32 @@ DECLARE_EVENT_CLASS(drm_sched_job,
 			   __assign_str(dev);
 			   __entry->fence_context = sched_job->s_fence->finished.context;
 			   __entry->fence_seqno = sched_job->s_fence->finished.seqno;
-
+			   __entry->n_deps = dep_count;
+			   if (dep_count) {
+				   dyn_arr = __get_dynamic_array(deps);
+				   xa_for_each(&sched_job->dependencies, idx, fence) {
+					   dyn_arr[2 * idx] = fence->context;
+					   dyn_arr[2 * idx + 1] = fence->seqno;
+				   }
+			   }
 			   ),
-	    TP_printk("id=%llu, fence=(context:%llu, seqno:%lld), ring=%s, job count:%u, hw job count:%d",
+	    TP_printk("id=%llu, fence=(context:%llu, seqno:%lld), ring=%s, job count:%u, hw job count:%d, dependencies:%s",
 		      __entry->id,
 		      __entry->fence_context, __entry->fence_seqno, __get_str(name),
-		      __entry->job_count, __entry->hw_job_count)
+		      __entry->job_count, __entry->hw_job_count,
+		      __print_dma_fence_array(p, __get_dynamic_array(deps), __entry->n_deps))
 );
 
 DEFINE_EVENT(drm_sched_job, drm_sched_job,
-	     TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity),
-	     TP_ARGS(sched_job, entity)
+	     TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity,
+		      unsigned int dep_count),
+	     TP_ARGS(sched_job, entity, dep_count)
 );
 
 DEFINE_EVENT_PRINT(drm_sched_job, drm_run_job,
-	     TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity),
-	     TP_ARGS(sched_job, entity),
+	     TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity,
+		      unsigned int dep_count),
+	     TP_ARGS(sched_job, entity, 0),
 	     TP_printk("dev=%s, id=%llu, fence=(context:%llu, seqno:%lld), ring=%s, job count:%u, hw job count:%d",
 		       __get_str(dev), __entry->id,
 		       __entry->fence_context, __entry->fence_seqno, __get_str(name),
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 58c8161289fe..aa33a9d29b14 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -583,7 +583,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 	bool first;
 	ktime_t submit_ts;
 
-	trace_drm_sched_job(sched_job, entity);
+	if (trace_drm_sched_job_enabled()) {
+		unsigned long index;
+		void *f;
+
+		xa_for_each(&sched_job->dependencies, index, f) { }
+		trace_drm_sched_job(sched_job, entity, index);
+	}
 	atomic_inc(entity->rq->sched->score);
 	WRITE_ONCE(entity->last_user, current->group_leader);
 
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 7e90c9f95611..0bc9b7b24b34 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1201,7 +1201,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
 	atomic_add(sched_job->credits, &sched->credit_count);
 	drm_sched_job_begin(sched_job);
 
-	trace_drm_run_job(sched_job, entity);
+	trace_drm_run_job(sched_job, entity, 0);
 	fence = sched->ops->run_job(sched_job);
 	complete_all(&entity->entity_idle);
 	drm_sched_fence_scheduled(s_fence, fence);
Trace the fence dependencies similarly to how we print fences:

  ... , dependencies:{(context:606, seqno:38006)}

This allows tools to analyze the dependencies between the jobs
(previously it was only possible for fences traced by
drm_sched_job_wait_dep).

Since drm_sched_job and drm_run_job use the same base event class, the
caller of trace_drm_run_job has to pass a dep_count of 0 (which is
ignored, because dependencies are only considered at submit time).

Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
---
 .../gpu/drm/scheduler/gpu_scheduler_trace.h | 59 ++++++++++++++++---
 drivers/gpu/drm/scheduler/sched_entity.c    |  8 ++-
 drivers/gpu/drm/scheduler/sched_main.c      |  2 +-
 3 files changed, 58 insertions(+), 11 deletions(-)
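To make the "tools can analyze the dependencies" point concrete, here is a small, self-contained userspace sketch that pulls the (context, seqno) pairs out of the dependencies:{...} field rendered by the new TP_printk format. It is not part of the patch; the function name and the sample line are made up for the example, and only the field layout comes from the format string above.

```c
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

/* Extract (context, seqno) pairs from the "dependencies:{...}" part of a
 * drm_sched_job trace line and print them; returns the number of pairs.
 */
static int parse_dependencies(const char *line)
{
	const char *p = strstr(line, "dependencies:{");
	uint64_t context, seqno;
	int n = 0;

	if (!p)
		return 0;
	p += strlen("dependencies:{");

	/* Pairs are rendered as "(context:X, seqno:Y)" separated by ','. */
	while (sscanf(p, "(context:%" SCNu64 ", seqno:%" SCNu64 ")",
		      &context, &seqno) == 2) {
		printf("dep %d: context=%" PRIu64 " seqno=%" PRIu64 "\n",
		       n++, context, seqno);
		p = strchr(p, ')') + 1;
		if (*p == ',')
			p++;
	}
	return n;
}

int main(void)
{
	/* Hypothetical line following the new event's format string. */
	const char *line =
		"id=1234, fence=(context:607, seqno:12), ring=gfx_0.0.0, "
		"job count:0, hw job count:1, "
		"dependencies:{(context:606, seqno:38006),(context:606, seqno:38007)}";

	return parse_dependencies(line) ? 0 : 1;
}
```

A tool consuming the trace stream can use these pairs to match each dependency against the fence=(context, seqno) of earlier drm_sched_job/drm_run_job events and so rebuild the job dependency graph.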