Message ID | 20240913124857.389630-2-adrian.larumbe@collabora.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Support fdinfo runtime and memory stats on Panthor | expand |
Hi Adrián, kernel test robot noticed the following build warnings: [auto build test WARNING on linus/master] [also build test WARNING on v6.11-rc7 next-20240913] [cannot apply to drm-misc/drm-misc-next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Adri-n-Larumbe/drm-panthor-introduce-job-cycle-and-timestamp-accounting/20240913-205038 base: linus/master patch link: https://lore.kernel.org/r/20240913124857.389630-2-adrian.larumbe%40collabora.com patch subject: [PATCH v6 1/5] drm/panthor: introduce job cycle and timestamp accounting config: alpha-allyesconfig (https://download.01.org/0day-ci/archive/20240914/202409140506.OBoqSiVk-lkp@intel.com/config) compiler: alpha-linux-gcc (GCC) 13.3.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240914/202409140506.OBoqSiVk-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202409140506.OBoqSiVk-lkp@intel.com/ All warnings (new ones prefixed by >>): drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'runnable' description in 'panthor_scheduler' drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'idle' description in 'panthor_scheduler' drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'waiting' description in 'panthor_scheduler' drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'has_ref' description in 'panthor_scheduler' drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'in_progress' description in 'panthor_scheduler' drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'stopped_groups' description in 'panthor_scheduler' >> drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Function parameter or struct member 'profiling' not described in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'mem' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'input' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'output' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'input_fw_va' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'output_fw_va' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'gpu_va' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'ref' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct 
member 'gt' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'sync64' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'bo' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'offset' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'kmap' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'lock' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'id' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'seqno' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'last_fence' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'in_flight_jobs' description in 'panthor_queue' >> drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'profiling_info' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'slots' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'slot_count' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'profiling_seqno' description in 'panthor_queue' drivers/gpu/drm/panthor/panthor_sched.c:813: warning: Excess struct member 'start' description in 'panthor_job' drivers/gpu/drm/panthor/panthor_sched.c:813: warning: Excess struct member 'size' description in 'panthor_job' drivers/gpu/drm/panthor/panthor_sched.c:813: warning: Excess struct member 'latest_flush' description in 'panthor_job' drivers/gpu/drm/panthor/panthor_sched.c:813: warning: Excess struct member 'start' description in 
'panthor_job' drivers/gpu/drm/panthor/panthor_sched.c:813: warning: Excess struct member 'end' description in 'panthor_job' >> drivers/gpu/drm/panthor/panthor_sched.c:813: warning: Excess struct member 'mask' description in 'panthor_job' >> drivers/gpu/drm/panthor/panthor_sched.c:813: warning: Excess struct member 'slot' description in 'panthor_job' drivers/gpu/drm/panthor/panthor_sched.c:1734: warning: Function parameter or struct member 'ptdev' not described in 'panthor_sched_report_fw_events' drivers/gpu/drm/panthor/panthor_sched.c:1734: warning: Function parameter or struct member 'events' not described in 'panthor_sched_report_fw_events' drivers/gpu/drm/panthor/panthor_sched.c:2626: warning: Function parameter or struct member 'ptdev' not described in 'panthor_sched_report_mmu_fault' vim +494 drivers/gpu/drm/panthor/panthor_sched.c de85488138247d Boris Brezillon 2024-02-29 397 de85488138247d Boris Brezillon 2024-02-29 398 /** @ringbuf: Command stream ring-buffer. */ de85488138247d Boris Brezillon 2024-02-29 399 struct panthor_kernel_bo *ringbuf; de85488138247d Boris Brezillon 2024-02-29 400 de85488138247d Boris Brezillon 2024-02-29 401 /** @iface: Firmware interface. */ de85488138247d Boris Brezillon 2024-02-29 402 struct { de85488138247d Boris Brezillon 2024-02-29 403 /** @mem: FW memory allocated for this interface. */ de85488138247d Boris Brezillon 2024-02-29 404 struct panthor_kernel_bo *mem; de85488138247d Boris Brezillon 2024-02-29 405 de85488138247d Boris Brezillon 2024-02-29 406 /** @input: Input interface. */ de85488138247d Boris Brezillon 2024-02-29 407 struct panthor_fw_ringbuf_input_iface *input; de85488138247d Boris Brezillon 2024-02-29 408 de85488138247d Boris Brezillon 2024-02-29 409 /** @output: Output interface. 
*/ de85488138247d Boris Brezillon 2024-02-29 410 const struct panthor_fw_ringbuf_output_iface *output; de85488138247d Boris Brezillon 2024-02-29 411 de85488138247d Boris Brezillon 2024-02-29 412 /** @input_fw_va: FW virtual address of the input interface buffer. */ de85488138247d Boris Brezillon 2024-02-29 413 u32 input_fw_va; de85488138247d Boris Brezillon 2024-02-29 414 de85488138247d Boris Brezillon 2024-02-29 415 /** @output_fw_va: FW virtual address of the output interface buffer. */ de85488138247d Boris Brezillon 2024-02-29 416 u32 output_fw_va; de85488138247d Boris Brezillon 2024-02-29 417 } iface; de85488138247d Boris Brezillon 2024-02-29 418 de85488138247d Boris Brezillon 2024-02-29 419 /** de85488138247d Boris Brezillon 2024-02-29 420 * @syncwait: Stores information about the synchronization object this de85488138247d Boris Brezillon 2024-02-29 421 * queue is waiting on. de85488138247d Boris Brezillon 2024-02-29 422 */ de85488138247d Boris Brezillon 2024-02-29 423 struct { de85488138247d Boris Brezillon 2024-02-29 424 /** @gpu_va: GPU address of the synchronization object. */ de85488138247d Boris Brezillon 2024-02-29 425 u64 gpu_va; de85488138247d Boris Brezillon 2024-02-29 426 de85488138247d Boris Brezillon 2024-02-29 427 /** @ref: Reference value to compare against. */ de85488138247d Boris Brezillon 2024-02-29 428 u64 ref; de85488138247d Boris Brezillon 2024-02-29 429 de85488138247d Boris Brezillon 2024-02-29 430 /** @gt: True if this is a greater-than test. */ de85488138247d Boris Brezillon 2024-02-29 431 bool gt; de85488138247d Boris Brezillon 2024-02-29 432 de85488138247d Boris Brezillon 2024-02-29 433 /** @sync64: True if this is a 64-bit sync object. */ de85488138247d Boris Brezillon 2024-02-29 434 bool sync64; de85488138247d Boris Brezillon 2024-02-29 435 de85488138247d Boris Brezillon 2024-02-29 436 /** @bo: Buffer object holding the synchronization object. 
*/ de85488138247d Boris Brezillon 2024-02-29 437 struct drm_gem_object *obj; de85488138247d Boris Brezillon 2024-02-29 438 de85488138247d Boris Brezillon 2024-02-29 439 /** @offset: Offset of the synchronization object inside @bo. */ de85488138247d Boris Brezillon 2024-02-29 440 u64 offset; de85488138247d Boris Brezillon 2024-02-29 441 de85488138247d Boris Brezillon 2024-02-29 442 /** de85488138247d Boris Brezillon 2024-02-29 443 * @kmap: Kernel mapping of the buffer object holding the de85488138247d Boris Brezillon 2024-02-29 444 * synchronization object. de85488138247d Boris Brezillon 2024-02-29 445 */ de85488138247d Boris Brezillon 2024-02-29 446 void *kmap; de85488138247d Boris Brezillon 2024-02-29 447 } syncwait; de85488138247d Boris Brezillon 2024-02-29 448 de85488138247d Boris Brezillon 2024-02-29 449 /** @fence_ctx: Fence context fields. */ de85488138247d Boris Brezillon 2024-02-29 450 struct { de85488138247d Boris Brezillon 2024-02-29 451 /** @lock: Used to protect access to all fences allocated by this context. */ de85488138247d Boris Brezillon 2024-02-29 452 spinlock_t lock; de85488138247d Boris Brezillon 2024-02-29 453 de85488138247d Boris Brezillon 2024-02-29 454 /** de85488138247d Boris Brezillon 2024-02-29 455 * @id: Fence context ID. de85488138247d Boris Brezillon 2024-02-29 456 * de85488138247d Boris Brezillon 2024-02-29 457 * Allocated with dma_fence_context_alloc(). de85488138247d Boris Brezillon 2024-02-29 458 */ de85488138247d Boris Brezillon 2024-02-29 459 u64 id; de85488138247d Boris Brezillon 2024-02-29 460 de85488138247d Boris Brezillon 2024-02-29 461 /** @seqno: Sequence number of the last initialized fence. */ de85488138247d Boris Brezillon 2024-02-29 462 atomic64_t seqno; de85488138247d Boris Brezillon 2024-02-29 463 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 464 /** 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 465 * @last_fence: Fence of the last submitted job. 
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 466 * 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 467 * We return this fence when we get an empty command stream. 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 468 * This way, we are guaranteed that all earlier jobs have completed 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 469 * when drm_sched_job::s_fence::finished without having to feed 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 470 * the CS ring buffer with a dummy job that only signals the fence. 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 471 */ 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 472 struct dma_fence *last_fence; 7b6f9ec6ad5112 Boris Brezillon 2024-07-03 473 de85488138247d Boris Brezillon 2024-02-29 474 /** de85488138247d Boris Brezillon 2024-02-29 475 * @in_flight_jobs: List containing all in-flight jobs. de85488138247d Boris Brezillon 2024-02-29 476 * de85488138247d Boris Brezillon 2024-02-29 477 * Used to keep track and signal panthor_job::done_fence when the de85488138247d Boris Brezillon 2024-02-29 478 * synchronization object attached to the queue is signaled. de85488138247d Boris Brezillon 2024-02-29 479 */ de85488138247d Boris Brezillon 2024-02-29 480 struct list_head in_flight_jobs; de85488138247d Boris Brezillon 2024-02-29 481 } fence_ctx; a706810cebb072 Adrián Larumbe 2024-09-13 482 a706810cebb072 Adrián Larumbe 2024-09-13 483 /** @profiling_info: Job profiling data slots and access information. */ a706810cebb072 Adrián Larumbe 2024-09-13 484 struct { a706810cebb072 Adrián Larumbe 2024-09-13 485 /** @slots: Kernel BO holding the slots. */ a706810cebb072 Adrián Larumbe 2024-09-13 486 struct panthor_kernel_bo *slots; a706810cebb072 Adrián Larumbe 2024-09-13 487 a706810cebb072 Adrián Larumbe 2024-09-13 488 /** @slot_count: Number of jobs ringbuffer can hold at once. 
*/ a706810cebb072 Adrián Larumbe 2024-09-13 489 u32 slot_count; a706810cebb072 Adrián Larumbe 2024-09-13 490 a706810cebb072 Adrián Larumbe 2024-09-13 491 /** @profiling_seqno: Index of the next available profiling information slot. */ a706810cebb072 Adrián Larumbe 2024-09-13 492 u32 seqno; a706810cebb072 Adrián Larumbe 2024-09-13 493 } profiling; de85488138247d Boris Brezillon 2024-02-29 @494 }; de85488138247d Boris Brezillon 2024-02-29 495
Hi Adrián, kernel test robot noticed the following build errors: [auto build test ERROR on linus/master] [also build test ERROR on v6.11-rc7 next-20240913] [cannot apply to drm-misc/drm-misc-next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Adri-n-Larumbe/drm-panthor-introduce-job-cycle-and-timestamp-accounting/20240913-205038 base: linus/master patch link: https://lore.kernel.org/r/20240913124857.389630-2-adrian.larumbe%40collabora.com patch subject: [PATCH v6 1/5] drm/panthor: introduce job cycle and timestamp accounting config: i386-buildonly-randconfig-003-20240915 (https://download.01.org/0day-ci/archive/20240915/202409152243.r3t2jdOJ-lkp@intel.com/config) compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240915/202409152243.r3t2jdOJ-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202409152243.r3t2jdOJ-lkp@intel.com/ All errors (new ones prefixed by >>): >> drivers/gpu/drm/panthor/panthor_sched.c:2885:12: error: call to '__compiletime_assert_371' declared with 'error' attribute: min(ringbuf_size - start, size) signedness error 2885 | written = min(ringbuf_size - start, size); | ^ include/linux/minmax.h:129:19: note: expanded from macro 'min' 129 | #define min(x, y) __careful_cmp(min, x, y) | ^ include/linux/minmax.h:105:2: note: expanded from macro '__careful_cmp' 105 | __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) | ^ include/linux/minmax.h:100:2: note: expanded from macro '__careful_cmp_once' 100 | BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ | ^ note: (skipping 2 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all) include/linux/compiler_types.h:498:2: note: expanded from macro '_compiletime_assert' 498 | __compiletime_assert(condition, msg, prefix, suffix) | ^ include/linux/compiler_types.h:491:4: note: expanded from macro '__compiletime_assert' 491 | prefix ## suffix(); \ | ^ <scratch space>:68:1: note: expanded from here 68 | __compiletime_assert_371 | ^ 1 error generated. vim +2885 drivers/gpu/drm/panthor/panthor_sched.c 2862 2863 #define JOB_INSTR(__prof, __instr) \ 2864 { \ 2865 .profile_mask = __prof, \ 2866 .instr = __instr, \ 2867 } 2868 2869 static void 2870 copy_instrs_to_ringbuf(struct panthor_queue *queue, 2871 struct panthor_job *job, 2872 struct panthor_job_ringbuf_instrs *instrs) 2873 { 2874 ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); 2875 u32 start = job->ringbuf.start & (ringbuf_size - 1); 2876 ssize_t size, written; 2877 2878 /* 2879 * We need to write a whole slot, including any trailing zeroes 2880 * that may come at the end of it. 
Also, because instrs.buffer has 2881 * been zero-initialised, there's no need to pad it with 0's 2882 */ 2883 instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); 2884 size = instrs->count * sizeof(u64); > 2885 written = min(ringbuf_size - start, size); 2886 2887 memcpy(queue->ringbuf->kmap + start, instrs->buffer, written); 2888 2889 if (written < size) 2890 memcpy(queue->ringbuf->kmap, 2891 &instrs->buffer[(ringbuf_size - start)/sizeof(u64)], 2892 size - written); 2893 } 2894
Hi Adrián,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.11 next-20240913]
[cannot apply to drm-misc/drm-misc-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Adri-n-Larumbe/drm-panthor-introduce-job-cycle-and-timestamp-accounting/20240913-205038
base: linus/master
patch link: https://lore.kernel.org/r/20240913124857.389630-2-adrian.larumbe%40collabora.com
patch subject: [PATCH v6 1/5] drm/panthor: introduce job cycle and timestamp accounting
config: i386-buildonly-randconfig-002-20240915 (https://download.01.org/0day-ci/archive/20240916/202409160050.m7VEd3pY-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240916/202409160050.m7VEd3pY-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409160050.m7VEd3pY-lkp@intel.com/
All errors (new ones prefixed by >>):
>> ld.lld: error: call to __compiletime_assert_341 marked "dontcall-error": min(ringbuf_size - start, size) signedness error
On 13/09/2024 13:42, Adrián Larumbe wrote: > Enable calculations of job submission times in clock cycles and wall > time. This is done by expanding the boilerplate command stream when running > a job to include instructions that compute said times right before and > after a user CS. > > A separate kernel BO is created per queue to store those values. Jobs can > access their sampled data through an index different from that of the > queue's ringbuffer. The reason for this is saving memory on the profiling > information kernel BO, since the amount of simultaneous profiled jobs we > can write into the queue's ringbuffer might be much smaller than for > regular jobs, as the former take more CSF instructions. > > This commit is done in preparation for enabling DRM fdinfo support in the > Panthor driver, which depends on the numbers calculated herein. > > A profile mode mask has been added that will in a future commit allow UM to > toggle performance metric sampling behaviour, which is disabled by default > to save power. When a ringbuffer CS is constructed, timestamp and cycling > sampling instructions are added depending on the enabled flags in the > profiling mask. > > A helper was provided that calculates the number of instructions for a > given set of enablement mask, and these are passed as the number of credits > when initialising a DRM scheduler job. > > Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com> > Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> > Reviewed-by: Liviu Dudau <liviu.dudau@arm.com> Sorry I've been a bit slow about reviewing this. The kernel bot has pointed out a few minor issues and there are a few more below. 
> --- > drivers/gpu/drm/panthor/panthor_device.h | 22 ++ > drivers/gpu/drm/panthor/panthor_sched.c | 337 ++++++++++++++++++++--- > 2 files changed, 315 insertions(+), 44 deletions(-) > > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h > index e388c0472ba7..a48e30d0af30 100644 > --- a/drivers/gpu/drm/panthor/panthor_device.h > +++ b/drivers/gpu/drm/panthor/panthor_device.h > @@ -66,6 +66,25 @@ struct panthor_irq { > atomic_t suspended; > }; > > +/** > + * enum panthor_device_profiling_mode - Profiling state > + */ > +enum panthor_device_profiling_flags { > + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */ > + PANTHOR_DEVICE_PROFILING_DISABLED = 0, > + > + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */ > + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0), > + > + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */ > + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1), > + > + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */ > + PANTHOR_DEVICE_PROFILING_ALL = > + PANTHOR_DEVICE_PROFILING_CYCLES | > + PANTHOR_DEVICE_PROFILING_TIMESTAMP, > +}; > + > /** > * struct panthor_device - Panthor device > */ > @@ -162,6 +181,9 @@ struct panthor_device { > */ > struct page *dummy_latest_flush; > } pm; > + > + /** @profile_mask: User-set profiling flags for job accounting. 
*/ > + u32 profile_mask; > }; > > /** > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c > index 42afdf0ddb7e..bcba52558f1e 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.c > +++ b/drivers/gpu/drm/panthor/panthor_sched.c > @@ -93,6 +93,9 @@ > #define MIN_CSGS 3 > #define MAX_CSG_PRIO 0xf > > +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64)) > +#define MAX_INSTRS_PER_JOB 24 > + > struct panthor_group; > > /** > @@ -476,6 +479,18 @@ struct panthor_queue { > */ > struct list_head in_flight_jobs; > } fence_ctx; > + > + /** @profiling_info: Job profiling data slots and access information. */ kerneldoc name doesn't match: s/profiling_info/profiling/ > + struct { > + /** @slots: Kernel BO holding the slots. */ > + struct panthor_kernel_bo *slots; > + > + /** @slot_count: Number of jobs ringbuffer can hold at once. */ > + u32 slot_count; > + > + /** @profiling_seqno: Index of the next available profiling information slot. */ s/profiling_seqno/seqno/ > + u32 seqno; > + } profiling; > }; > > /** > @@ -661,6 +676,18 @@ struct panthor_group { > struct list_head wait_node; > }; > > +struct panthor_job_profiling_data { > + struct { > + u64 before; > + u64 after; > + } cycles; > + > + struct { > + u64 before; > + u64 after; > + } time; > +}; > + > /** > * group_queue_work() - Queue a group work > * @group: Group to queue the work for. > @@ -774,6 +801,15 @@ struct panthor_job { > > /** @done_fence: Fence signaled when the job is finished or cancelled. */ > struct dma_fence *done_fence; > + > + /** @profiling: Job profiling information. */ > + struct { > + /** @mask: Current device job profiling enablement bitmask. */ > + u32 mask; > + > + /** @slot: Job index in the profiling slots BO. 
*/ > + u32 slot; > + } profiling; > }; > > static void > @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue * > > panthor_kernel_bo_destroy(queue->ringbuf); > panthor_kernel_bo_destroy(queue->iface.mem); > + panthor_kernel_bo_destroy(queue->profiling.slots); > > /* Release the last_fence we were holding, if any. */ > dma_fence_put(queue->fence_ctx.last_fence); > @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched, > } > } > > -#define NUM_INSTRS_PER_SLOT 16 > - > static void > group_term_post_processing(struct panthor_group *group) > { > @@ -2815,65 +2850,211 @@ static void group_sync_upd_work(struct work_struct *work) > group_put(group); > } > > -static struct dma_fence * > -queue_run_job(struct drm_sched_job *sched_job) > +struct panthor_job_ringbuf_instrs { > + u64 buffer[MAX_INSTRS_PER_JOB]; > + u32 count; > +}; > + > +struct panthor_job_instr { > + u32 profile_mask; > + u64 instr; > +}; > + > +#define JOB_INSTR(__prof, __instr) \ > + { \ > + .profile_mask = __prof, \ > + .instr = __instr, \ > + } > + > +static void > +copy_instrs_to_ringbuf(struct panthor_queue *queue, > + struct panthor_job *job, > + struct panthor_job_ringbuf_instrs *instrs) > +{ > + ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); > + u32 start = job->ringbuf.start & (ringbuf_size - 1); > + ssize_t size, written; > + > + /* > + * We need to write a whole slot, including any trailing zeroes > + * that may come at the end of it. Also, because instrs.buffer has > + * been zero-initialised, there's no need to pad it with 0's > + */ > + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); > + size = instrs->count * sizeof(u64); > + written = min(ringbuf_size - start, size); This causes a signedness error - I think the easiest thing is to just make size/written unsigned. 
You might also want to consider a WARN_ON(size > ringbuf_size) or similar to catch that (impossible) case which would cause the below logic to fail. > + > + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written); > + > + if (written < size) > + memcpy(queue->ringbuf->kmap, > + &instrs->buffer[(ringbuf_size - start)/sizeof(u64)], ^^^^^^^^^^^^^^^^^^^^ I believe this is equal to 'written', so this can be rewritten as: + &instrs->buffer[written / sizeof(u64)], which I find clearer, especially since you've used 'written' just below. > + size - written); > +} > + > +struct panthor_job_cs_params { > + u32 profile_mask; > + u64 addr_reg; u64 val_reg; > + u64 cycle_reg; u64 time_reg; > + u64 sync_addr; u64 times_addr; > + u64 cs_start; u64 cs_size; > + u32 last_flush; u32 waitall_mask; > +}; > + > +static void > +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params) > { > - struct panthor_job *job = container_of(sched_job, struct panthor_job, base); > struct panthor_group *group = job->group; > struct panthor_queue *queue = group->queues[job->queue_idx]; > struct panthor_device *ptdev = group->ptdev; > struct panthor_scheduler *sched = ptdev->scheduler; > - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); > - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1); > - u64 addr_reg = ptdev->csif_info.cs_reg_count - > - ptdev->csif_info.unpreserved_cs_reg_count; > - u64 val_reg = addr_reg + 2; > - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + > - job->queue_idx * sizeof(struct panthor_syncobj_64b); > - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); > - struct dma_fence *done_fence; > - int ret; > > - u64 call_instrs[NUM_INSTRS_PER_SLOT] = { > + params->addr_reg = ptdev->csif_info.cs_reg_count - > + ptdev->csif_info.unpreserved_cs_reg_count; > + params->val_reg = params->addr_reg + 2; > + params->cycle_reg = params->addr_reg; > + params->time_reg = params->val_reg; > + > + params->sync_addr = 
panthor_kernel_bo_gpuva(group->syncobjs) + > + job->queue_idx * sizeof(struct panthor_syncobj_64b); > + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) + > + (job->profiling.slot * sizeof(struct panthor_job_profiling_data)); > + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); > + > + params->cs_start = job->call_info.start; > + params->cs_size = job->call_info.size; > + params->last_flush = job->call_info.latest_flush; > + > + params->profile_mask = job->profiling.mask; > +} > + > +static void > +prepare_job_instrs(const struct panthor_job_cs_params *params, > + struct panthor_job_ringbuf_instrs *instrs) > +{ > + const struct panthor_job_instr instr_seq[] = { > /* MOV32 rX+2, cs.latest_flush */ > - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush, > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (2ull << 56) | (params->val_reg << 48) | params->last_flush), > > /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ > - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233, > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233), > + > + /* MOV48 rX:rX+1, cycles_offset */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, > + (1ull << 56) | (params->cycle_reg << 48) | > + (params->times_addr + > + offsetof(struct panthor_job_profiling_data, cycles.before))), > + /* STORE_STATE cycles */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, > + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), > + /* MOV48 rX:rX+1, time_offset */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, > + (1ull << 56) | (params->time_reg << 48) | > + (params->times_addr + > + offsetof(struct panthor_job_profiling_data, time.before))), > + /* STORE_STATE timer */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, > + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)), > > /* MOV48 rX:rX+1, cs.start */ > - (1ull << 56) | (addr_reg << 48) | 
job->call_info.start, > - > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (1ull << 56) | (params->addr_reg << 48) | params->cs_start), > /* MOV32 rX+2, cs.size */ > - (2ull << 56) | (val_reg << 48) | job->call_info.size, > - > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (2ull << 56) | (params->val_reg << 48) | params->cs_size), > /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ > - (3ull << 56) | (1 << 16), > - > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (3ull << 56) | (1 << 16)), > /* CALL rX:rX+1, rX+2 */ > - (32ull << 56) | (addr_reg << 40) | (val_reg << 32), > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)), > + > + /* MOV48 rX:rX+1, cycles_offset */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, > + (1ull << 56) | (params->cycle_reg << 48) | > + (params->times_addr + > + offsetof(struct panthor_job_profiling_data, cycles.after))), > + /* STORE_STATE cycles */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, > + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), > + > + /* MOV48 rX:rX+1, time_offset */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, > + (1ull << 56) | (params->time_reg << 48) | > + (params->times_addr + > + offsetof(struct panthor_job_profiling_data, time.after))), > + /* STORE_STATE timer */ > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, > + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)), > > /* MOV48 rX:rX+1, sync_addr */ > - (1ull << 56) | (addr_reg << 48) | sync_addr, > - > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (1ull << 56) | (params->addr_reg << 48) | params->sync_addr), > /* MOV48 rX+2, #1 */ > - (1ull << 56) | (val_reg << 48) | 1, > - > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (1ull << 56) | (params->val_reg << 48) | 1), > /* WAIT(all) */ > - (3ull << 56) | (waitall_mask << 16), > - > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (3ull << 56) | (params->waitall_mask << 16)), > /* 
SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/ > - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1, > - > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, > + (51ull << 56) | (0ull << 48) | (params->addr_reg << 40) | > + (params->val_reg << 32) | (0 << 16) | 1), > /* ERROR_BARRIER, so we can recover from faults at job > * boundaries. > */ > - (47ull << 56), > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (47ull << 56)), Personally I think this would be easier to read if instead of using JOB_INSTR directly we define a few extra helper macros of the below form: #define JOB_INSTR_ALWAYS(instr) \ JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, instr) #define JOB_INSTR_TIMESTAMP(instr) \ JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, instr) #define JOB_INSTR_CYCLES(instr) \ JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, instr) In particular I think the ...PROFILING_DISABLED flag is somewhat confusing because actually means "always" not only when profiling is disabled. > + }; > + u32 pad; > + > + /* NEED to be cacheline aligned to please the prefetcher. */ > + static_assert(sizeof(instrs->buffer) % 64 == 0, > + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline"); > + > + /* Make sure we have enough storage to store the whole sequence. */ > + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) == > + ARRAY_SIZE(instrs->buffer), > + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch"); > + > + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) { > + /* If the profile mask of this instruction is not enabled, skip it. 
*/ > + if (instr_seq[i].profile_mask && > + !(instr_seq[i].profile_mask & params->profile_mask)) > + continue; > + > + instrs->buffer[instrs->count++] = instr_seq[i].instr; > + } > + > + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); > + memset(&instrs->buffer[instrs->count], 0, > + (pad - instrs->count) * sizeof(instrs->buffer[0])); > + instrs->count = pad; > +} > + > +static u32 calc_job_credits(u32 profile_mask) > +{ > + struct panthor_job_ringbuf_instrs instrs; You need to initialize this as instrs.count is read by prepare_job_instrs(). > + struct panthor_job_cs_params params = { > + .profile_mask = profile_mask, > }; > > - /* Need to be cacheline aligned to please the prefetcher. */ > - static_assert(sizeof(call_instrs) % 64 == 0, > - "call_instrs is not aligned on a cacheline"); > + prepare_job_instrs(¶ms, &instrs); > + return instrs.count; > +} > + > +static struct dma_fence * > +queue_run_job(struct drm_sched_job *sched_job) > +{ > + struct panthor_job *job = container_of(sched_job, struct panthor_job, base); > + struct panthor_group *group = job->group; > + struct panthor_queue *queue = group->queues[job->queue_idx]; > + struct panthor_device *ptdev = group->ptdev; > + struct panthor_scheduler *sched = ptdev->scheduler; > + struct panthor_job_ringbuf_instrs instrs; > + struct panthor_job_cs_params cs_params; > + struct dma_fence *done_fence; > + int ret; > > /* Stream size is zero, nothing to do except making sure all previously > * submitted jobs are done before we signal the > @@ -2900,17 +3081,23 @@ queue_run_job(struct drm_sched_job *sched_job) > queue->fence_ctx.id, > atomic64_inc_return(&queue->fence_ctx.seqno)); > > - memcpy(queue->ringbuf->kmap + ringbuf_insert, > - call_instrs, sizeof(call_instrs)); > + job->profiling.slot = queue->profiling.seqno++; > + if (queue->profiling.seqno == queue->profiling.slot_count) > + queue->profiling.seqno = 0; > + > + job->ringbuf.start = queue->iface.input->insert; > + > + get_job_cs_params(job, 
&cs_params); > + prepare_job_instrs(&cs_params, &instrs); > + copy_instrs_to_ringbuf(queue, job, &instrs); > + > + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64)); > > panthor_job_get(&job->base); > spin_lock(&queue->fence_ctx.lock); > list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs); > spin_unlock(&queue->fence_ctx.lock); > > - job->ringbuf.start = queue->iface.input->insert; > - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs); > - > /* Make sure the ring buffer is updated before the INSERT > * register. > */ > @@ -3003,6 +3190,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = { > .free_job = queue_free_job, > }; > > +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev, > + u32 cs_ringbuf_size) > +{ > + u32 min_profiled_job_instrs = U32_MAX; > + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL); > + > + /* > + * We want to calculate the minimum size of a profiled job's CS, > + * because since they need additional instructions for the sampling > + * of performance metrics, they might take up further slots in > + * the queue's ringbuffer. This means we might not need as many job > + * slots for keeping track of their profiling information. What we > + * need is the maximum number of slots we should allocate to this end, > + * which matches the maximum number of profiled jobs we can place > + * simultaneously in the queue's ring buffer. > + * That has to be calculated separately for every single job profiling > + * flag, but not in the case job profiling is disabled, since unprofiled > + * jobs don't need to keep track of this at all. 
> + */ > + for (u32 i = 0; i < last_flag; i++) { > + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL) > + min_profiled_job_instrs = > + min(min_profiled_job_instrs, calc_job_credits(BIT(i))); > + } > + > + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64)); > +} I may be missing something, but is there a situation where this is different to calc_job_credits(0)? AFAICT the infrastructure you've added can only add extra instructions to the no-flags case - whereas this implies you're thinking that instructions may also be removed (or replaced). Steve > + > static struct panthor_queue * > group_create_queue(struct panthor_group *group, > const struct drm_panthor_queue_create *args) > @@ -3056,9 +3271,35 @@ group_create_queue(struct panthor_group *group, > goto err_free_queue; > } > > + queue->profiling.slot_count = > + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size); > + > + queue->profiling.slots = > + panthor_kernel_bo_create(group->ptdev, group->vm, > + queue->profiling.slot_count * > + sizeof(struct panthor_job_profiling_data), > + DRM_PANTHOR_BO_NO_MMAP, > + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | > + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, > + PANTHOR_VM_KERNEL_AUTO_VA); > + > + if (IS_ERR(queue->profiling.slots)) { > + ret = PTR_ERR(queue->profiling.slots); > + goto err_free_queue; > + } > + > + ret = panthor_kernel_bo_vmap(queue->profiling.slots); > + if (ret) > + goto err_free_queue; > + > + /* > + * Credit limit argument tells us the total number of instructions > + * across all CS slots in the ringbuffer, with some jobs requiring > + * twice as many as others, depending on their profiling status. 
> + */ > ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops, > group->ptdev->scheduler->wq, 1, > - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)), > + args->ringbuf_size / sizeof(u64), > 0, msecs_to_jiffies(JOB_TIMEOUT_MS), > group->ptdev->reset.wq, > NULL, "panthor-queue", group->ptdev->base.dev); > @@ -3354,6 +3595,7 @@ panthor_job_create(struct panthor_file *pfile, > { > struct panthor_group_pool *gpool = pfile->groups; > struct panthor_job *job; > + u32 credits; > int ret; > > if (qsubmit->pad) > @@ -3407,9 +3649,16 @@ panthor_job_create(struct panthor_file *pfile, > } > } > > + job->profiling.mask = pfile->ptdev->profile_mask; > + credits = calc_job_credits(job->profiling.mask); > + if (credits == 0) { > + ret = -EINVAL; > + goto err_put_job; > + } > + > ret = drm_sched_job_init(&job->base, > &job->group->queues[job->queue_idx]->entity, > - 1, job->group); > + credits, job->group); > if (ret) > goto err_put_job; >
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h index e388c0472ba7..a48e30d0af30 100644 --- a/drivers/gpu/drm/panthor/panthor_device.h +++ b/drivers/gpu/drm/panthor/panthor_device.h @@ -66,6 +66,25 @@ struct panthor_irq { atomic_t suspended; }; +/** + * enum panthor_device_profiling_mode - Profiling state + */ +enum panthor_device_profiling_flags { + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */ + PANTHOR_DEVICE_PROFILING_DISABLED = 0, + + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */ + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0), + + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */ + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1), + + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */ + PANTHOR_DEVICE_PROFILING_ALL = + PANTHOR_DEVICE_PROFILING_CYCLES | + PANTHOR_DEVICE_PROFILING_TIMESTAMP, +}; + /** * struct panthor_device - Panthor device */ @@ -162,6 +181,9 @@ struct panthor_device { */ struct page *dummy_latest_flush; } pm; + + /** @profile_mask: User-set profiling flags for job accounting. */ + u32 profile_mask; }; /** diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 42afdf0ddb7e..bcba52558f1e 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -93,6 +93,9 @@ #define MIN_CSGS 3 #define MAX_CSG_PRIO 0xf +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64)) +#define MAX_INSTRS_PER_JOB 24 + struct panthor_group; /** @@ -476,6 +479,18 @@ struct panthor_queue { */ struct list_head in_flight_jobs; } fence_ctx; + + /** @profiling_info: Job profiling data slots and access information. */ + struct { + /** @slots: Kernel BO holding the slots. */ + struct panthor_kernel_bo *slots; + + /** @slot_count: Number of jobs ringbuffer can hold at once. */ + u32 slot_count; + + /** @profiling_seqno: Index of the next available profiling information slot. 
*/ + u32 seqno; + } profiling; }; /** @@ -661,6 +676,18 @@ struct panthor_group { struct list_head wait_node; }; +struct panthor_job_profiling_data { + struct { + u64 before; + u64 after; + } cycles; + + struct { + u64 before; + u64 after; + } time; +}; + /** * group_queue_work() - Queue a group work * @group: Group to queue the work for. @@ -774,6 +801,15 @@ struct panthor_job { /** @done_fence: Fence signaled when the job is finished or cancelled. */ struct dma_fence *done_fence; + + /** @profiling: Job profiling information. */ + struct { + /** @mask: Current device job profiling enablement bitmask. */ + u32 mask; + + /** @slot: Job index in the profiling slots BO. */ + u32 slot; + } profiling; }; static void @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue * panthor_kernel_bo_destroy(queue->ringbuf); panthor_kernel_bo_destroy(queue->iface.mem); + panthor_kernel_bo_destroy(queue->profiling.slots); /* Release the last_fence we were holding, if any. 
*/ dma_fence_put(queue->fence_ctx.last_fence); @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched, } } -#define NUM_INSTRS_PER_SLOT 16 - static void group_term_post_processing(struct panthor_group *group) { @@ -2815,65 +2850,211 @@ static void group_sync_upd_work(struct work_struct *work) group_put(group); } -static struct dma_fence * -queue_run_job(struct drm_sched_job *sched_job) +struct panthor_job_ringbuf_instrs { + u64 buffer[MAX_INSTRS_PER_JOB]; + u32 count; +}; + +struct panthor_job_instr { + u32 profile_mask; + u64 instr; +}; + +#define JOB_INSTR(__prof, __instr) \ + { \ + .profile_mask = __prof, \ + .instr = __instr, \ + } + +static void +copy_instrs_to_ringbuf(struct panthor_queue *queue, + struct panthor_job *job, + struct panthor_job_ringbuf_instrs *instrs) +{ + ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); + u32 start = job->ringbuf.start & (ringbuf_size - 1); + ssize_t size, written; + + /* + * We need to write a whole slot, including any trailing zeroes + * that may come at the end of it. 
Also, because instrs.buffer has + * been zero-initialised, there's no need to pad it with 0's + */ + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); + size = instrs->count * sizeof(u64); + written = min(ringbuf_size - start, size); + + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written); + + if (written < size) + memcpy(queue->ringbuf->kmap, + &instrs->buffer[(ringbuf_size - start)/sizeof(u64)], + size - written); +} + +struct panthor_job_cs_params { + u32 profile_mask; + u64 addr_reg; u64 val_reg; + u64 cycle_reg; u64 time_reg; + u64 sync_addr; u64 times_addr; + u64 cs_start; u64 cs_size; + u32 last_flush; u32 waitall_mask; +}; + +static void +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params) { - struct panthor_job *job = container_of(sched_job, struct panthor_job, base); struct panthor_group *group = job->group; struct panthor_queue *queue = group->queues[job->queue_idx]; struct panthor_device *ptdev = group->ptdev; struct panthor_scheduler *sched = ptdev->scheduler; - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1); - u64 addr_reg = ptdev->csif_info.cs_reg_count - - ptdev->csif_info.unpreserved_cs_reg_count; - u64 val_reg = addr_reg + 2; - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + - job->queue_idx * sizeof(struct panthor_syncobj_64b); - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); - struct dma_fence *done_fence; - int ret; - u64 call_instrs[NUM_INSTRS_PER_SLOT] = { + params->addr_reg = ptdev->csif_info.cs_reg_count - + ptdev->csif_info.unpreserved_cs_reg_count; + params->val_reg = params->addr_reg + 2; + params->cycle_reg = params->addr_reg; + params->time_reg = params->val_reg; + + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + + job->queue_idx * sizeof(struct panthor_syncobj_64b); + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) + + (job->profiling.slot * 
sizeof(struct panthor_job_profiling_data)); + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); + + params->cs_start = job->call_info.start; + params->cs_size = job->call_info.size; + params->last_flush = job->call_info.latest_flush; + + params->profile_mask = job->profiling.mask; +} + +static void +prepare_job_instrs(const struct panthor_job_cs_params *params, + struct panthor_job_ringbuf_instrs *instrs) +{ + const struct panthor_job_instr instr_seq[] = { /* MOV32 rX+2, cs.latest_flush */ - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush, + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (2ull << 56) | (params->val_reg << 48) | params->last_flush), /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233, + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233), + + /* MOV48 rX:rX+1, cycles_offset */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, + (1ull << 56) | (params->cycle_reg << 48) | + (params->times_addr + + offsetof(struct panthor_job_profiling_data, cycles.before))), + /* STORE_STATE cycles */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), + /* MOV48 rX:rX+1, time_offset */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, + (1ull << 56) | (params->time_reg << 48) | + (params->times_addr + + offsetof(struct panthor_job_profiling_data, time.before))), + /* STORE_STATE timer */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)), /* MOV48 rX:rX+1, cs.start */ - (1ull << 56) | (addr_reg << 48) | job->call_info.start, - + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (1ull << 56) | (params->addr_reg << 48) | params->cs_start), /* MOV32 rX+2, cs.size */ - (2ull << 56) | (val_reg << 48) | job->call_info.size, - + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (2ull << 56) | (params->val_reg << 48) 
| params->cs_size), /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ - (3ull << 56) | (1 << 16), - + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (3ull << 56) | (1 << 16)), /* CALL rX:rX+1, rX+2 */ - (32ull << 56) | (addr_reg << 40) | (val_reg << 32), + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)), + + /* MOV48 rX:rX+1, cycles_offset */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, + (1ull << 56) | (params->cycle_reg << 48) | + (params->times_addr + + offsetof(struct panthor_job_profiling_data, cycles.after))), + /* STORE_STATE cycles */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)), + + /* MOV48 rX:rX+1, time_offset */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, + (1ull << 56) | (params->time_reg << 48) | + (params->times_addr + + offsetof(struct panthor_job_profiling_data, time.after))), + /* STORE_STATE timer */ + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)), /* MOV48 rX:rX+1, sync_addr */ - (1ull << 56) | (addr_reg << 48) | sync_addr, - + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (1ull << 56) | (params->addr_reg << 48) | params->sync_addr), /* MOV48 rX+2, #1 */ - (1ull << 56) | (val_reg << 48) | 1, - + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (1ull << 56) | (params->val_reg << 48) | 1), /* WAIT(all) */ - (3ull << 56) | (waitall_mask << 16), - + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (3ull << 56) | (params->waitall_mask << 16)), /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/ - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1, - + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, + (51ull << 56) | (0ull << 48) | (params->addr_reg << 40) | + (params->val_reg << 32) | (0 << 16) | 1), /* ERROR_BARRIER, so we can recover from faults at job * boundaries. 
*/ - (47ull << 56), + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (47ull << 56)), + }; + u32 pad; + + /* NEED to be cacheline aligned to please the prefetcher. */ + static_assert(sizeof(instrs->buffer) % 64 == 0, + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline"); + + /* Make sure we have enough storage to store the whole sequence. */ + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) == + ARRAY_SIZE(instrs->buffer), + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch"); + + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) { + /* If the profile mask of this instruction is not enabled, skip it. */ + if (instr_seq[i].profile_mask && + !(instr_seq[i].profile_mask & params->profile_mask)) + continue; + + instrs->buffer[instrs->count++] = instr_seq[i].instr; + } + + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE); + memset(&instrs->buffer[instrs->count], 0, + (pad - instrs->count) * sizeof(instrs->buffer[0])); + instrs->count = pad; +} + +static u32 calc_job_credits(u32 profile_mask) +{ + struct panthor_job_ringbuf_instrs instrs; + struct panthor_job_cs_params params = { + .profile_mask = profile_mask, }; - /* Need to be cacheline aligned to please the prefetcher. 
*/ - static_assert(sizeof(call_instrs) % 64 == 0, - "call_instrs is not aligned on a cacheline"); + prepare_job_instrs(¶ms, &instrs); + return instrs.count; +} + +static struct dma_fence * +queue_run_job(struct drm_sched_job *sched_job) +{ + struct panthor_job *job = container_of(sched_job, struct panthor_job, base); + struct panthor_group *group = job->group; + struct panthor_queue *queue = group->queues[job->queue_idx]; + struct panthor_device *ptdev = group->ptdev; + struct panthor_scheduler *sched = ptdev->scheduler; + struct panthor_job_ringbuf_instrs instrs; + struct panthor_job_cs_params cs_params; + struct dma_fence *done_fence; + int ret; /* Stream size is zero, nothing to do except making sure all previously * submitted jobs are done before we signal the @@ -2900,17 +3081,23 @@ queue_run_job(struct drm_sched_job *sched_job) queue->fence_ctx.id, atomic64_inc_return(&queue->fence_ctx.seqno)); - memcpy(queue->ringbuf->kmap + ringbuf_insert, - call_instrs, sizeof(call_instrs)); + job->profiling.slot = queue->profiling.seqno++; + if (queue->profiling.seqno == queue->profiling.slot_count) + queue->profiling.seqno = 0; + + job->ringbuf.start = queue->iface.input->insert; + + get_job_cs_params(job, &cs_params); + prepare_job_instrs(&cs_params, &instrs); + copy_instrs_to_ringbuf(queue, job, &instrs); + + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64)); panthor_job_get(&job->base); spin_lock(&queue->fence_ctx.lock); list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs); spin_unlock(&queue->fence_ctx.lock); - job->ringbuf.start = queue->iface.input->insert; - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs); - /* Make sure the ring buffer is updated before the INSERT * register. 
*/ @@ -3003,6 +3190,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = { .free_job = queue_free_job, }; +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev, + u32 cs_ringbuf_size) +{ + u32 min_profiled_job_instrs = U32_MAX; + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL); + + /* + * We want to calculate the minimum size of a profiled job's CS, + * because since they need additional instructions for the sampling + * of performance metrics, they might take up further slots in + * the queue's ringbuffer. This means we might not need as many job + * slots for keeping track of their profiling information. What we + * need is the maximum number of slots we should allocate to this end, + * which matches the maximum number of profiled jobs we can place + * simultaneously in the queue's ring buffer. + * That has to be calculated separately for every single job profiling + * flag, but not in the case job profiling is disabled, since unprofiled + * jobs don't need to keep track of this at all. 
+ */ + for (u32 i = 0; i < last_flag; i++) { + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL) + min_profiled_job_instrs = + min(min_profiled_job_instrs, calc_job_credits(BIT(i))); + } + + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64)); +} + static struct panthor_queue * group_create_queue(struct panthor_group *group, const struct drm_panthor_queue_create *args) @@ -3056,9 +3271,35 @@ group_create_queue(struct panthor_group *group, goto err_free_queue; } + queue->profiling.slot_count = + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size); + + queue->profiling.slots = + panthor_kernel_bo_create(group->ptdev, group->vm, + queue->profiling.slot_count * + sizeof(struct panthor_job_profiling_data), + DRM_PANTHOR_BO_NO_MMAP, + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, + PANTHOR_VM_KERNEL_AUTO_VA); + + if (IS_ERR(queue->profiling.slots)) { + ret = PTR_ERR(queue->profiling.slots); + goto err_free_queue; + } + + ret = panthor_kernel_bo_vmap(queue->profiling.slots); + if (ret) + goto err_free_queue; + + /* + * Credit limit argument tells us the total number of instructions + * across all CS slots in the ringbuffer, with some jobs requiring + * twice as many as others, depending on their profiling status. 
+ */ ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops, group->ptdev->scheduler->wq, 1, - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)), + args->ringbuf_size / sizeof(u64), 0, msecs_to_jiffies(JOB_TIMEOUT_MS), group->ptdev->reset.wq, NULL, "panthor-queue", group->ptdev->base.dev); @@ -3354,6 +3595,7 @@ panthor_job_create(struct panthor_file *pfile, { struct panthor_group_pool *gpool = pfile->groups; struct panthor_job *job; + u32 credits; int ret; if (qsubmit->pad) @@ -3407,9 +3649,16 @@ panthor_job_create(struct panthor_file *pfile, } } + job->profiling.mask = pfile->ptdev->profile_mask; + credits = calc_job_credits(job->profiling.mask); + if (credits == 0) { + ret = -EINVAL; + goto err_put_job; + } + ret = drm_sched_job_init(&job->base, &job->group->queues[job->queue_idx]->entity, - 1, job->group); + credits, job->group); if (ret) goto err_put_job;