diff mbox series

[RFC,08/10] dma-buf/dma-fence: Introduce long-running completion fences

Message ID 20230404002211.3611376-9-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Xe DRM scheduler and long running workload plans | expand

Commit Message

Matthew Brost April 4, 2023, 12:22 a.m. UTC
From: Thomas Hellström <thomas.hellstrom@linux.intel.com>

For long-running workloads, drivers either need to open-code completion
waits, invent their own synchronization primitives or internally use
dma-fences that do not obey the cross-driver dma-fence protocol, but
without any lockdep annotation all these approaches are error prone.

So since, for example, the drm scheduler uses dma-fences, it is desirable
for a driver to be able to use it for throttling and error handling also
with internal dma-fences that do not obey the cross-driver dma-fence
protocol.

Introduce long-running completion fences in the form of dma-fences, and
add lockdep annotation for them. In particular:

* Do not allow waiting under any memory management locks.
* Do not allow attaching them to a dma-resv object.
* Introduce a new interface for adding callbacks, making the helper that
  adds a callback sign off on being aware that the dma-fence may not
  complete anytime soon. Typically this will be the scheduler chaining
  a new long-running fence on another one (see the usage sketch below).
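
For illustration, a rough usage sketch of the new annotations (not part
of the patch; the driver-side helpers and the fence itself are made up):

static void my_lr_fence_cb(struct dma_fence *f, struct dma_fence_cb *cb)
{
        /* Runs whenever the long-running fence finally signals. */
}

static void my_driver_lr_example(struct dma_fence *fence,
                                 struct dma_fence_cb *cb)
{
        bool cookie;

        /* Tag the fence as long-running, normally right after dma_fence_init(). */
        set_bit(DMA_FENCE_FLAG_LR_BIT, &fence->flags);

        /*
         * Callbacks must go through the lr variant, which signs off on the
         * fact that completion may take an arbitrarily long time.
         */
        dma_fence_lr_add_callback(fence, cb, my_lr_fence_cb);

        /*
         * Everything required to eventually signal the fence runs inside an
         * lr signalling section, so lockdep can catch memory management
         * locks taken on the signalling path.
         */
        cookie = dma_fence_lr_begin_signalling();
        dma_fence_signal(fence);
        dma_fence_lr_end_signalling(cookie);
}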

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/dma-buf/dma-fence.c | 142 ++++++++++++++++++++++++++----------
 drivers/dma-buf/dma-resv.c  |   5 ++
 include/linux/dma-fence.h   |  55 +++++++++++++-
 3 files changed, 160 insertions(+), 42 deletions(-)

Comments

Christian König April 4, 2023, 9:09 a.m. UTC | #1
Am 04.04.23 um 02:22 schrieb Matthew Brost:
> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>
> For long-running workloads, drivers either need to open-code completion
> waits, invent their own synchronization primitives or internally use
> dma-fences that do not obey the cross-driver dma-fence protocol, but
> without any lockdep annotation all these approaches are error prone.
>
> So since for example the drm scheduler uses dma-fences it is desirable for
> a driver to be able to use it for throttling and error handling also with
> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
>
> Introduce long-running completion fences in form of dma-fences, and add
> lockdep annotation for them. In particular:
>
> * Do not allow waiting under any memory management locks.
> * Do not allow to attach them to a dma-resv object.
> * Introduce a new interface for adding callbacks making the helper adding
>    a callback sign off on that it is aware that the dma-fence may not
>    complete anytime soon. Typically this will be the scheduler chaining
>    a new long-running fence on another one.

Well that's pretty much what I tried before: 
https://lwn.net/Articles/893704/

And the reasons why it was rejected haven't changed.

Regards,
Christian.

>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>   drivers/dma-buf/dma-fence.c | 142 ++++++++++++++++++++++++++----------
>   drivers/dma-buf/dma-resv.c  |   5 ++
>   include/linux/dma-fence.h   |  55 +++++++++++++-
>   3 files changed, 160 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> index f177c56269bb..9726b2a3c67d 100644
> --- a/drivers/dma-buf/dma-fence.c
> +++ b/drivers/dma-buf/dma-fence.c
> @@ -111,6 +111,20 @@ static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(1);
>    * drivers/gpu should ever call dma_fence_wait() in such contexts.
>    */
>   
> +/**
> + * DOC: Long-Running (lr) dma-fences.
> + *
> + * * Long-running dma-fences are NOT required to complete in reasonable time.
> + *   Typically they signal completion of user-space controlled workloads and
> + *   as such, need to never be part of a cross-driver contract, never waited
> + *   for inside a kernel lock, nor attached to a dma-resv. There are helpers
> + *   and warnings in place to help ensure that this never happens.
> + *
> + * * The motivation for their existence is that helpers that are intended to
> + *   be used by drivers may use dma-fences that, given the workloads mentioned
> + *   above, become long-running.
> + */
> +
>   static const char *dma_fence_stub_get_name(struct dma_fence *fence)
>   {
>           return "stub";
> @@ -284,6 +298,34 @@ static struct lockdep_map dma_fence_lockdep_map = {
>   	.name = "dma_fence_map"
>   };
>   
> +static struct lockdep_map dma_fence_lr_lockdep_map = {
> +	.name = "dma_fence_lr_map"
> +};
> +
> +static bool __dma_fence_begin_signalling(struct lockdep_map *map)
> +{
> +	/* explicitly nesting ... */
> +	if (lock_is_held_type(map, 1))
> +		return true;
> +
> +	/* rely on might_sleep check for soft/hardirq locks */
> +	if (in_atomic())
> +		return true;
> +
> +	/* ... and non-recursive readlock */
> +	lock_acquire(map, 0, 0, 1, 1, NULL, _RET_IP_);
> +
> +	return false;
> +}
> +
> +static void __dma_fence_end_signalling(bool cookie, struct lockdep_map *map)
> +{
> +	if (cookie)
> +		return;
> +
> +	lock_release(map, _RET_IP_);
> +}
> +
>   /**
>    * dma_fence_begin_signalling - begin a critical DMA fence signalling section
>    *
> @@ -300,18 +342,7 @@ static struct lockdep_map dma_fence_lockdep_map = {
>    */
>   bool dma_fence_begin_signalling(void)
>   {
> -	/* explicitly nesting ... */
> -	if (lock_is_held_type(&dma_fence_lockdep_map, 1))
> -		return true;
> -
> -	/* rely on might_sleep check for soft/hardirq locks */
> -	if (in_atomic())
> -		return true;
> -
> -	/* ... and non-recursive readlock */
> -	lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _RET_IP_);
> -
> -	return false;
> +	return __dma_fence_begin_signalling(&dma_fence_lockdep_map);
>   }
>   EXPORT_SYMBOL(dma_fence_begin_signalling);
>   
> @@ -323,25 +354,61 @@ EXPORT_SYMBOL(dma_fence_begin_signalling);
>    */
>   void dma_fence_end_signalling(bool cookie)
>   {
> -	if (cookie)
> -		return;
> -
> -	lock_release(&dma_fence_lockdep_map, _RET_IP_);
> +	__dma_fence_end_signalling(cookie, &dma_fence_lockdep_map);
>   }
>   EXPORT_SYMBOL(dma_fence_end_signalling);
>   
> -void __dma_fence_might_wait(void)
> +/**
> + * dma_fence_lr_begin_signalling - begin a critical long-running DMA fence
> + * signalling section
> + *
> + * Drivers should use this to annotate the beginning of any code section
> + * required to eventually complete &dma_fence by calling dma_fence_signal().
> + *
> + * The end of these critical sections are annotated with
> + * dma_fence_lr_end_signalling(). Ideally the section should encompass all
> + * locks that are ever required to signal a long-running dma-fence.
> + *
> + * Return: An opaque cookie needed by the implementation, which needs to be
> + * passed to dma_fence_lr_end_signalling().
> + */
> +bool dma_fence_lr_begin_signalling(void)
> +{
> +	return __dma_fence_begin_signalling(&dma_fence_lr_lockdep_map);
> +}
> +EXPORT_SYMBOL(dma_fence_lr_begin_signalling);
> +
> +/**
> + * dma_fence_lr_end_signalling - end a critical DMA fence signalling section
> + * @cookie: opaque cookie from dma_fence_lr_begin_signalling()
> + *
> + * Closes a critical section annotation opened by
> + * dma_fence_lr_begin_signalling().
> + */
> +void dma_fence_lr_end_signalling(bool cookie)
> +{
> +	__dma_fence_end_signalling(cookie, &dma_fence_lr_lockdep_map);
> +}
> +EXPORT_SYMBOL(dma_fence_lr_end_signalling);
> +
> +static void ___dma_fence_might_wait(struct lockdep_map *map)
>   {
>   	bool tmp;
>   
> -	tmp = lock_is_held_type(&dma_fence_lockdep_map, 1);
> +	tmp = lock_is_held_type(map, 1);
>   	if (tmp)
> -		lock_release(&dma_fence_lockdep_map, _THIS_IP_);
> -	lock_map_acquire(&dma_fence_lockdep_map);
> -	lock_map_release(&dma_fence_lockdep_map);
> +		lock_release(map, _THIS_IP_);
> +	lock_map_acquire(map);
> +	lock_map_release(map);
>   	if (tmp)
> -		lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _THIS_IP_);
> +		lock_acquire(map, 0, 0, 1, 1, NULL, _THIS_IP_);
> +}
> +
> +void __dma_fence_might_wait(void)
> +{
> +	___dma_fence_might_wait(&dma_fence_lockdep_map);
>   }
> +
>   #endif
>   
>   
> @@ -506,7 +573,11 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout)
>   
>   	might_sleep();
>   
> -	__dma_fence_might_wait();
> +#ifdef CONFIG_LOCKDEP
> +	___dma_fence_might_wait(dma_fence_is_lr(fence) ?
> +				&dma_fence_lr_lockdep_map :
> +				&dma_fence_lockdep_map);
> +#endif
>   
>   	dma_fence_enable_sw_signaling(fence);
>   
> @@ -618,29 +689,22 @@ void dma_fence_enable_sw_signaling(struct dma_fence *fence)
>   EXPORT_SYMBOL(dma_fence_enable_sw_signaling);
>   
>   /**
> - * dma_fence_add_callback - add a callback to be called when the fence
> + * dma_fence_lr_add_callback - add a callback to be called when the fence
>    * is signaled
>    * @fence: the fence to wait on
>    * @cb: the callback to register
>    * @func: the function to call
>    *
> - * Add a software callback to the fence. The caller should keep a reference to
> - * the fence.
> - *
> - * @cb will be initialized by dma_fence_add_callback(), no initialization
> - * by the caller is required. Any number of callbacks can be registered
> - * to a fence, but a callback can only be registered to one fence at a time.
> - *
> - * If fence is already signaled, this function will return -ENOENT (and
> - * *not* call the callback).
> - *
> - * Note that the callback can be called from an atomic context or irq context.
> + * This function is identical to dma_fence_add_callback() but allows adding
> + * callbacks also to lr dma-fences. The naming helps annotate the fact that
> + * we're adding a callback to an lr fence and that the callback might therefore
> + * not be called within a reasonable amount of time.
>    *
> - * Returns 0 in case of success, -ENOENT if the fence is already signaled
> + * Return: 0 in case of success, -ENOENT if the fence is already signaled
>    * and -EINVAL in case of error.
>    */
> -int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
> -			   dma_fence_func_t func)
> +int dma_fence_lr_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
> +			      dma_fence_func_t func)
>   {
>   	unsigned long flags;
>   	int ret = 0;
> @@ -667,7 +731,7 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
>   
>   	return ret;
>   }
> -EXPORT_SYMBOL(dma_fence_add_callback);
> +EXPORT_SYMBOL(dma_fence_lr_add_callback);
>   
>   /**
>    * dma_fence_get_status - returns the status upon completion
> diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
> index 2a594b754af1..fa0210c1442e 100644
> --- a/drivers/dma-buf/dma-resv.c
> +++ b/drivers/dma-buf/dma-resv.c
> @@ -292,6 +292,7 @@ void dma_resv_add_fence(struct dma_resv *obj, struct dma_fence *fence,
>   	 * individually.
>   	 */
>   	WARN_ON(dma_fence_is_container(fence));
> +	WARN_ON_ONCE(dma_fence_is_lr(fence));
>   
>   	fobj = dma_resv_fences_list(obj);
>   	count = fobj->num_fences;
> @@ -340,6 +341,7 @@ void dma_resv_replace_fences(struct dma_resv *obj, uint64_t context,
>   	unsigned int i;
>   
>   	dma_resv_assert_held(obj);
> +	WARN_ON_ONCE(dma_fence_is_lr(replacement));
>   
>   	list = dma_resv_fences_list(obj);
>   	for (i = 0; list && i < list->num_fences; ++i) {
> @@ -764,6 +766,7 @@ static int __init dma_resv_lockdep(void)
>   	struct ww_acquire_ctx ctx;
>   	struct dma_resv obj;
>   	struct address_space mapping;
> +	bool lr_cookie;
>   	int ret;
>   
>   	if (!mm)
> @@ -772,6 +775,7 @@ static int __init dma_resv_lockdep(void)
>   	dma_resv_init(&obj);
>   	address_space_init_once(&mapping);
>   
> +	lr_cookie = dma_fence_lr_begin_signalling();
>   	mmap_read_lock(mm);
>   	ww_acquire_init(&ctx, &reservation_ww_class);
>   	ret = dma_resv_lock(&obj, &ctx);
> @@ -792,6 +796,7 @@ static int __init dma_resv_lockdep(void)
>   	ww_mutex_unlock(&obj.lock);
>   	ww_acquire_fini(&ctx);
>   	mmap_read_unlock(mm);
> +	dma_fence_lr_end_signalling(lr_cookie);
>   
>   	mmput(mm);
>   
> diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
> index d54b595a0fe0..08d21e26782b 100644
> --- a/include/linux/dma-fence.h
> +++ b/include/linux/dma-fence.h
> @@ -99,6 +99,7 @@ enum dma_fence_flag_bits {
>   	DMA_FENCE_FLAG_SIGNALED_BIT,
>   	DMA_FENCE_FLAG_TIMESTAMP_BIT,
>   	DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> +	DMA_FENCE_FLAG_LR_BIT,
>   	DMA_FENCE_FLAG_USER_BITS, /* must always be last member */
>   };
>   
> @@ -279,6 +280,11 @@ struct dma_fence_ops {
>   	void (*set_deadline)(struct dma_fence *fence, ktime_t deadline);
>   };
>   
> +static inline bool dma_fence_is_lr(const struct dma_fence *fence)
> +{
> +	return test_bit(DMA_FENCE_FLAG_LR_BIT, &fence->flags);
> +}
> +
>   void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops,
>   		    spinlock_t *lock, u64 context, u64 seqno);
>   
> @@ -377,13 +383,23 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep)
>   #ifdef CONFIG_LOCKDEP
>   bool dma_fence_begin_signalling(void);
>   void dma_fence_end_signalling(bool cookie);
> +bool dma_fence_lr_begin_signalling(void);
> +void dma_fence_lr_end_signalling(bool cookie);
>   void __dma_fence_might_wait(void);
>   #else
> +
>   static inline bool dma_fence_begin_signalling(void)
>   {
>   	return true;
>   }
> +
>   static inline void dma_fence_end_signalling(bool cookie) {}
> +static inline bool dma_fence_lr_begin_signalling(void)
> +{
> +	return true;
> +}
> +
> +static inline void dma_fence_lr_end_signalling(bool cookie) {}
>   static inline void __dma_fence_might_wait(void) {}
>   #endif
>   
> @@ -394,9 +410,42 @@ int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
>   				      ktime_t timestamp);
>   signed long dma_fence_default_wait(struct dma_fence *fence,
>   				   bool intr, signed long timeout);
> -int dma_fence_add_callback(struct dma_fence *fence,
> -			   struct dma_fence_cb *cb,
> -			   dma_fence_func_t func);
> +
> +int dma_fence_lr_add_callback(struct dma_fence *fence,
> +			      struct dma_fence_cb *cb,
> +			      dma_fence_func_t func);
> +
> +/**
> + * dma_fence_add_callback - add a callback to be called when the fence
> + * is signaled
> + * @fence: the fence to wait on
> + * @cb: the callback to register
> + * @func: the function to call
> + *
> + * Add a software callback to the fence. The caller should keep a reference to
> + * the fence.
> + *
> + * @cb will be initialized by dma_fence_add_callback(), no initialization
> + * by the caller is required. Any number of callbacks can be registered
> + * to a fence, but a callback can only be registered to one fence at a time.
> + *
> + * If fence is already signaled, this function will return -ENOENT (and
> + * *not* call the callback).
> + *
> + * Note that the callback can be called from an atomic context or irq context.
> + *
> + * Returns 0 in case of success, -ENOENT if the fence is already signaled
> + * and -EINVAL in case of error.
> + */
> +static inline int dma_fence_add_callback(struct dma_fence *fence,
> +					 struct dma_fence_cb *cb,
> +					 dma_fence_func_t func)
> +{
> +	WARN_ON(IS_ENABLED(CONFIG_LOCKDEP) && dma_fence_is_lr(fence));
> +
> +	return dma_fence_lr_add_callback(fence, cb, func);
> +}
> +
>   bool dma_fence_remove_callback(struct dma_fence *fence,
>   			       struct dma_fence_cb *cb);
>   void dma_fence_enable_sw_signaling(struct dma_fence *fence);
Thomas Hellstrom April 4, 2023, 12:54 p.m. UTC | #2
Hi, Christian,

On 4/4/23 11:09, Christian König wrote:
> Am 04.04.23 um 02:22 schrieb Matthew Brost:
>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>
>> For long-running workloads, drivers either need to open-code completion
>> waits, invent their own synchronization primitives or internally use
>> dma-fences that do not obey the cross-driver dma-fence protocol, but
>> without any lockdep annotation all these approaches are error prone.
>>
>> So since for example the drm scheduler uses dma-fences it is 
>> desirable for
>> a driver to be able to use it for throttling and error handling also 
>> with
>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
>>
>> Introduce long-running completion fences in form of dma-fences, and add
>> lockdep annotation for them. In particular:
>>
>> * Do not allow waiting under any memory management locks.
>> * Do not allow to attach them to a dma-resv object.
>> * Introduce a new interface for adding callbacks making the helper 
>> adding
>>    a callback sign off on that it is aware that the dma-fence may not
>>    complete anytime soon. Typically this will be the scheduler chaining
>>    a new long-running fence on another one.
>
> Well that's pretty much what I tried before: 
> https://lwn.net/Articles/893704/
>
> And the reasons why it was rejected haven't changed.
>
> Regards,
> Christian.
>
Yes, TBH this was mostly to get discussion going on how we'd best tackle 
this problem while being able to reuse the scheduler for long-running 
workloads.

I couldn't see any clear decision on your series, though, but one main 
difference I see is that this is intended for driver-internal use only 
(I'm counting using the drm_scheduler as a helper for driver-private 
use). This is by no means a way to try to tackle the indefinite fence 
problem.

We could ofc invent a completely different data-type that abstracts the 
synchronization the scheduler needs in the long-running case, or each 
driver could hack something up, like sleeping in the prepare_job() or 
run_job() callback for throttling, but those waits should still be 
annotated one way or another (and probably in a similar way across 
drivers) to make sure we don't do anything bad.

So any suggestions as to what would be the better solution here would 
be appreciated.

Thanks,

Thomas
Christian König April 4, 2023, 1:10 p.m. UTC | #3
Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> Hi, Christian,
>
> On 4/4/23 11:09, Christian König wrote:
>> Am 04.04.23 um 02:22 schrieb Matthew Brost:
>>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>>
>>> For long-running workloads, drivers either need to open-code completion
>>> waits, invent their own synchronization primitives or internally use
>>> dma-fences that do not obey the cross-driver dma-fence protocol, but
>>> without any lockdep annotation all these approaches are error prone.
>>>
>>> So since for example the drm scheduler uses dma-fences it is 
>>> desirable for
>>> a driver to be able to use it for throttling and error handling also 
>>> with
>>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
>>>
>>> Introduce long-running completion fences in form of dma-fences, and add
>>> lockdep annotation for them. In particular:
>>>
>>> * Do not allow waiting under any memory management locks.
>>> * Do not allow to attach them to a dma-resv object.
>>> * Introduce a new interface for adding callbacks making the helper 
>>> adding
>>>    a callback sign off on that it is aware that the dma-fence may not
>>>    complete anytime soon. Typically this will be the scheduler chaining
>>>    a new long-running fence on another one.
>>
>> Well that's pretty much what I tried before: 
>> https://lwn.net/Articles/893704/
>>
>> And the reasons why it was rejected haven't changed.
>>
>> Regards,
>> Christian.
>>
> Yes, TBH this was mostly to get discussion going how we'd best tackle 
> this problem while being able to reuse the scheduler for long-running 
> workloads.
>
> I couldn't see any clear decision on your series, though, but one main 
> difference I see is that this is intended for driver-internal use 
> only. (I'm counting using the drm_scheduler as a helper for 
> driver-private use). This is by no means a way to try tackle the 
> indefinite fence problem.

Well this was just my latest try to tackle this, but essentially the 
problems are the same as with your approach: when we express such 
operations as dma_fence there is always the chance that we leak that 
somewhere.

My approach of adding a flag, noting that this operation is dangerous and 
can't be synced with something memory management depends on, tried to 
contain this as much as possible, but Daniel still pretty clearly 
rejected it (for good reasons, I think).

>
> We could ofc invent a completely different data-type that abstracts 
> the synchronization the scheduler needs in the long-running case, or 
> each driver could hack something up, like sleeping in the 
> prepare_job() or run_job() callback for throttling, but those waits 
> should still be annotated in one way or annotated one way or another 
> (and probably in a similar way across drivers) to make sure we don't 
> do anything bad.
>
>  So any suggestions as to what would be the better solution here would 
> be appreciated.

Mhm, do we really need the GPU scheduler for that?

I mean in the 1 to 1 case you basically just need a component which 
collects the dependencies as dma_fence and, if all of them are fulfilled, 
schedules a work item.

As long as the work item itself doesn't produce a dma_fence it can then 
still just wait for other non-dma_fence dependencies.

Then the work function could submit the work and wait for the result.

The work item would then pretty much represent what you want: you can 
wait for it to finish and pass it along as a long-running dependency.

Maybe give it a funky name and wrap it up in a structure, but that's 
basically it.
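
Rough sketch of what I mean (all names made up; one
dma_fence_add_callback() registering lr_work_dep_cb() per collected
dependency is assumed):

struct lr_work {
        struct work_struct work;
        atomic_t pending;       /* unfulfilled dma_fence dependencies */
        struct completion done; /* long-running result, not a dma_fence */
};

struct lr_work_dep {
        struct dma_fence_cb cb;
        struct lr_work *lw;
};

static void lr_work_dep_cb(struct dma_fence *f, struct dma_fence_cb *cb)
{
        struct lr_work_dep *dep = container_of(cb, struct lr_work_dep, cb);

        /* Last dma_fence dependency signalled, kick off the work item. */
        if (atomic_dec_and_test(&dep->lw->pending))
                queue_work(system_unbound_wq, &dep->lw->work);
}

static void lr_work_fn(struct work_struct *work)
{
        struct lr_work *lw = container_of(work, struct lr_work, work);

        /* Submit to the hw/fw here; may block on non-dma_fence deps. */
        complete_all(&lw->done);
}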

Regards,
Christian.

>
> Thanks,
>
> Thomas
>
>
>
>
>
Thomas Hellström (Intel) April 4, 2023, 6:14 p.m. UTC | #4
On 4/4/23 15:10, Christian König wrote:
> Am 04.04.23 um 14:54 schrieb Thomas Hellström:
>> Hi, Christian,
>>
>> On 4/4/23 11:09, Christian König wrote:
>>> Am 04.04.23 um 02:22 schrieb Matthew Brost:
>>>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>>>
>>>> For long-running workloads, drivers either need to open-code 
>>>> completion
>>>> waits, invent their own synchronization primitives or internally use
>>>> dma-fences that do not obey the cross-driver dma-fence protocol, but
>>>> without any lockdep annotation all these approaches are error prone.
>>>>
>>>> So since for example the drm scheduler uses dma-fences it is 
>>>> desirable for
>>>> a driver to be able to use it for throttling and error handling 
>>>> also with
>>>> internal dma-fences tha do not obey the cros-driver dma-fence 
>>>> protocol.
>>>>
>>>> Introduce long-running completion fences in form of dma-fences, and 
>>>> add
>>>> lockdep annotation for them. In particular:
>>>>
>>>> * Do not allow waiting under any memory management locks.
>>>> * Do not allow to attach them to a dma-resv object.
>>>> * Introduce a new interface for adding callbacks making the helper 
>>>> adding
>>>>    a callback sign off on that it is aware that the dma-fence may not
>>>>    complete anytime soon. Typically this will be the scheduler 
>>>> chaining
>>>>    a new long-running fence on another one.
>>>
>>> Well that's pretty much what I tried before: 
>>> https://lwn.net/Articles/893704/
>>>
>>> And the reasons why it was rejected haven't changed.
>>>
>>> Regards,
>>> Christian.
>>>
>> Yes, TBH this was mostly to get discussion going how we'd best tackle 
>> this problem while being able to reuse the scheduler for long-running 
>> workloads.
>>
>> I couldn't see any clear decision on your series, though, but one 
>> main difference I see is that this is intended for driver-internal 
>> use only. (I'm counting using the drm_scheduler as a helper for 
>> driver-private use). This is by no means a way to try tackle the 
>> indefinite fence problem.
>
> Well this was just my latest try to tackle this, but essentially the 
> problems are the same as with your approach: When we express such 
> operations as dma_fence there is always the change that we leak that 
> somewhere.
>
> My approach of adding a flag noting that this operation is dangerous 
> and can't be synced with something memory management depends on tried 
> to contain this as much as possible, but Daniel still pretty clearly 
> rejected it (for good reasons I think).
>
>>
>> We could ofc invent a completely different data-type that abstracts 
>> the synchronization the scheduler needs in the long-running case, or 
>> each driver could hack something up, like sleeping in the 
>> prepare_job() or run_job() callback for throttling, but those waits 
>> should still be annotated in one way or annotated one way or another 
>> (and probably in a similar way across drivers) to make sure we don't 
>> do anything bad.
>>
>>  So any suggestions as to what would be the better solution here 
>> would be appreciated.
>
> Mhm, do we really the the GPU scheduler for that?
>
> I mean in the 1 to 1 case  you basically just need a component which 
> collects the dependencies as dma_fence and if all of them are 
> fulfilled schedules a work item.
>
> As long as the work item itself doesn't produce a dma_fence it can 
> then still just wait for other none dma_fence dependencies.
>
> Then the work function could submit the work and wait for the result.
>
> The work item would then pretty much represent what you want, you can 
> wait for it to finish and pass it along as long running dependency.
>
> Maybe give it a funky name and wrap it up in a structure, but that's 
> basically it.
>
This very much sounds like an i915_sw_fence for the dependency tracking 
and dma_fence_work for the actual work, although its completion fence is 
a dma_fence.

That goes against the whole idea, though, that a condition for merging 
the xe driver would be that we implement some sort of minimal scaffolding 
for long-running workloads in the drm scheduler; the thinking behind 
that is to avoid implementing intel-specific solutions like those...

Thanks,

Thomas



> Regards,
> Christian.
>
>>
>> Thanks,
>>
>> Thomas
>>
>>
>>
>>
>>
Daniel Vetter April 4, 2023, 7 p.m. UTC | #5
On Tue, 4 Apr 2023 at 15:10, Christian König <christian.koenig@amd.com> wrote:
>
> Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > Hi, Christian,
> >
> > On 4/4/23 11:09, Christian König wrote:
> >> Am 04.04.23 um 02:22 schrieb Matthew Brost:
> >>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> >>>
> >>> For long-running workloads, drivers either need to open-code completion
> >>> waits, invent their own synchronization primitives or internally use
> >>> dma-fences that do not obey the cross-driver dma-fence protocol, but
> >>> without any lockdep annotation all these approaches are error prone.
> >>>
> >>> So since for example the drm scheduler uses dma-fences it is
> >>> desirable for
> >>> a driver to be able to use it for throttling and error handling also
> >>> with
> >>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
> >>>
> >>> Introduce long-running completion fences in form of dma-fences, and add
> >>> lockdep annotation for them. In particular:
> >>>
> >>> * Do not allow waiting under any memory management locks.
> >>> * Do not allow to attach them to a dma-resv object.
> >>> * Introduce a new interface for adding callbacks making the helper
> >>> adding
> >>>    a callback sign off on that it is aware that the dma-fence may not
> >>>    complete anytime soon. Typically this will be the scheduler chaining
> >>>    a new long-running fence on another one.
> >>
> >> Well that's pretty much what I tried before:
> >> https://lwn.net/Articles/893704/
> >>
> >> And the reasons why it was rejected haven't changed.
> >>
> >> Regards,
> >> Christian.
> >>
> > Yes, TBH this was mostly to get discussion going how we'd best tackle
> > this problem while being able to reuse the scheduler for long-running
> > workloads.
> >
> > I couldn't see any clear decision on your series, though, but one main
> > difference I see is that this is intended for driver-internal use
> > only. (I'm counting using the drm_scheduler as a helper for
> > driver-private use). This is by no means a way to try tackle the
> > indefinite fence problem.
>
> Well this was just my latest try to tackle this, but essentially the
> problems are the same as with your approach: When we express such
> operations as dma_fence there is always the change that we leak that
> somewhere.
>
> My approach of adding a flag noting that this operation is dangerous and
> can't be synced with something memory management depends on tried to
> contain this as much as possible, but Daniel still pretty clearly
> rejected it (for good reasons I think).

Yeah I still don't like dma_fences that somehow have totally different
semantics in that critical piece of "will it complete or will it
deadlock?" :-)
>
> >
> > We could ofc invent a completely different data-type that abstracts
> > the synchronization the scheduler needs in the long-running case, or
> > each driver could hack something up, like sleeping in the
> > prepare_job() or run_job() callback for throttling, but those waits
> > should still be annotated in one way or annotated one way or another
> > (and probably in a similar way across drivers) to make sure we don't
> > do anything bad.
> >
> >  So any suggestions as to what would be the better solution here would
> > be appreciated.
>
> Mhm, do we really the the GPU scheduler for that?
>
> I mean in the 1 to 1 case  you basically just need a component which
> collects the dependencies as dma_fence and if all of them are fulfilled
> schedules a work item.
>
> As long as the work item itself doesn't produce a dma_fence it can then
> still just wait for other none dma_fence dependencies.

Yeah that's the important thing, for long-running jobs dependencies as
dma_fence should be totally fine. You're just not allowed to have any
outgoing dma_fences at all (except the magic preemption fence).

> Then the work function could submit the work and wait for the result.
>
> The work item would then pretty much represent what you want, you can
> wait for it to finish and pass it along as long running dependency.
>
> Maybe give it a funky name and wrap it up in a structure, but that's
> basically it.

Like do we need this? If the kernel ever waits for a long-running
compute job to finish I'd call that a bug. Any functional
dependencies between engines or whatever are userspace's problem only,
which it needs to sort out using userspace memory fences.

The only things the kernel needs are some way to track dependencies as
dma_fence (because memory management may move the memory away and we need
to move it back in, ideally pipelined). And it needs the special
preempt fence (if we don't have pagefaults) so that you have a fence
to attach to all the dma_resv for memory management purposes. Now the
scheduler already has almost all the pieces (at least if we assume
there's some magic fw which time-slices these contexts on its own),
and we just need a few minimal changes:
- allowing the scheduler to ignore the completion fence and just
immediately push the next "job" in if its dependencies are ready
- maybe minimal amounts of scaffolding to handle the preemption
dma_fence because that's not entirely trivial. I think ideally we'd
put that into drm_sched_entity since you can only ever have one active
preempt dma_fence per gpu ctx/entity.

None of this needs a dma_fence_is_lr anywhere at all.
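
A sketch of the first change above, assuming the backend can sink
everything the scheduler pushes (the firmware submission helper is made
up):

static struct dma_fence *my_lr_run_job(struct drm_sched_job *job)
{
        /*
         * Hand the job to the firmware and return no hw fence at all, so
         * the scheduler exports nothing and can push the next job as soon
         * as its dma_fence dependencies have signalled.
         */
        my_fw_submit(job);

        return NULL;
}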

Of course there's the somewhat related issue of "how do we transport
these userspace memory fences around from app to compositor", and
that's a lot more gnarly. I still don't think dma_fence_is_lr is
anywhere near what the solution should look like for that.
-Daniel


> Regards,
> Christian.
>
> >
> > Thanks,
> >
> > Thomas
> >
> >
> >
> >
> >
>
Matthew Brost April 4, 2023, 7:02 p.m. UTC | #6
On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> 
> On 4/4/23 15:10, Christian König wrote:
> > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > Hi, Christian,
> > > 
> > > On 4/4/23 11:09, Christian König wrote:
> > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > 
> > > > > For long-running workloads, drivers either need to open-code
> > > > > completion
> > > > > waits, invent their own synchronization primitives or internally use
> > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > without any lockdep annotation all these approaches are error prone.
> > > > > 
> > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > desirable for
> > > > > a driver to be able to use it for throttling and error
> > > > > handling also with
> > > > > internal dma-fences tha do not obey the cros-driver
> > > > > dma-fence protocol.
> > > > > 
> > > > > Introduce long-running completion fences in form of
> > > > > dma-fences, and add
> > > > > lockdep annotation for them. In particular:
> > > > > 
> > > > > * Do not allow waiting under any memory management locks.
> > > > > * Do not allow to attach them to a dma-resv object.
> > > > > * Introduce a new interface for adding callbacks making the
> > > > > helper adding
> > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > >    complete anytime soon. Typically this will be the
> > > > > scheduler chaining
> > > > >    a new long-running fence on another one.
> > > > 
> > > > Well that's pretty much what I tried before:
> > > > https://lwn.net/Articles/893704/
> > > > 

I don't think this is quite the same: this explicitly enforces that we
don't break the dma-fence rules (in the path of memory allocations,
exported in any way); essentially this is just a SW sync point reusing
the dma-fence infrastructure for signaling / callbacks. I believe your
series tried to export these fences to user space (admittedly I haven't
fully read your series).

In this use case we essentially just want to flow control the ring via
the DRM scheduler + maintain a list of pending jobs so the TDR can be
used for cleanup if an LR entity encounters an error. To me this seems
perfectly reasonable but I know the dma-fence rules are akin to a holy
war.

If we return NULL in run_job, now we have to be able to sink all jobs
in the backend regardless of ring space, maintain a list of jobs pending
for cleanup after errors, and write a different cleanup path as now the
TDR doesn't work. It seems very, very silly to duplicate all of this code
when the DRM scheduler provides all of this for us. Also, if we go this
route, now all drivers are going to invent ways to handle LR jobs with
the DRM scheduler.

This solution is pretty clear: mark the scheduler as LR, and don't
export any fences from the scheduler. If you try to export these fences,
a blow-up happens.
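
As a sketch of what "mark the scheduler as LR" looks like on the fence
side (the drm_sched side of the series is not shown here; names made up):

static void my_sched_fence_init(struct dma_fence *fence,
                                const struct dma_fence_ops *ops,
                                spinlock_t *lock, u64 ctx, u64 seqno,
                                bool sched_is_lr)
{
        dma_fence_init(fence, ops, lock, ctx, seqno);

        /*
         * Fences from an LR scheduler get tagged so the WARNs in this
         * patch fire if anyone attaches them to a dma-resv or adds a
         * callback through the regular dma_fence_add_callback().
         */
        if (sched_is_lr)
                set_bit(DMA_FENCE_FLAG_LR_BIT, &fence->flags);
}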

> > > > And the reasons why it was rejected haven't changed.
> > > > 
> > > > Regards,
> > > > Christian.
> > > > 
> > > Yes, TBH this was mostly to get discussion going how we'd best
> > > tackle this problem while being able to reuse the scheduler for
> > > long-running workloads.
> > > 
> > > I couldn't see any clear decision on your series, though, but one
> > > main difference I see is that this is intended for driver-internal
> > > use only. (I'm counting using the drm_scheduler as a helper for
> > > driver-private use). This is by no means a way to try tackle the
> > > indefinite fence problem.
> > 
> > Well this was just my latest try to tackle this, but essentially the
> > problems are the same as with your approach: When we express such
> > operations as dma_fence there is always the change that we leak that
> > somewhere.
> > 
> > My approach of adding a flag noting that this operation is dangerous and
> > can't be synced with something memory management depends on tried to
> > contain this as much as possible, but Daniel still pretty clearly
> > rejected it (for good reasons I think).
> > 
> > > 
> > > We could ofc invent a completely different data-type that abstracts
> > > the synchronization the scheduler needs in the long-running case, or
> > > each driver could hack something up, like sleeping in the
> > > prepare_job() or run_job() callback for throttling, but those waits
> > > should still be annotated in one way or annotated one way or another
> > > (and probably in a similar way across drivers) to make sure we don't
> > > do anything bad.
> > > 
> > >  So any suggestions as to what would be the better solution here
> > > would be appreciated.
> > 
> > Mhm, do we really the the GPU scheduler for that?
> > 

I think we need to solve this within the DRM scheduler one way or
another.

> > I mean in the 1 to 1 case  you basically just need a component which
> > collects the dependencies as dma_fence and if all of them are fulfilled
> > schedules a work item.
> > 
> > As long as the work item itself doesn't produce a dma_fence it can then
> > still just wait for other none dma_fence dependencies.
> > 
> > Then the work function could submit the work and wait for the result.
> > 
> > The work item would then pretty much represent what you want, you can
> > wait for it to finish and pass it along as long running dependency.
> > 
> > Maybe give it a funky name and wrap it up in a structure, but that's
> > basically it.
> > 
> This very much sounds like a i915_sw_fence for the dependency tracking and
> dma_fence_work for the actual work although it's completion fence is a
> dma_fence.
>

Agree this does sound too i915ish; as stated below, one of the mandates
in Xe was to use the DRM scheduler. Beyond that, as someone who wrote a
submission backend in the i915 and Xe, I love how the DRM scheduler works
(single entry point); it makes everything so much easier.

Matt

> Although that goes against the whole idea of a condition for merging the xe
> driver would be that we implement some sort of minimal scaffolding for
> long-running workloads in the drm scheduler, and the thinking behind that is
> to avoid implementing intel-specific solutions like those...
> 
> Thanks,
> 
> Thomas
> 
> 
> 
> > Regards,
> > Christian.
> > 
> > > 
> > > Thanks,
> > > 
> > > Thomas
> > > 
> > > 
> > > 
> > > 
> > >
Daniel Vetter April 4, 2023, 7:25 p.m. UTC | #7
On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> > 
> > On 4/4/23 15:10, Christian König wrote:
> > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > Hi, Christian,
> > > > 
> > > > On 4/4/23 11:09, Christian König wrote:
> > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > 
> > > > > > For long-running workloads, drivers either need to open-code
> > > > > > completion
> > > > > > waits, invent their own synchronization primitives or internally use
> > > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > without any lockdep annotation all these approaches are error prone.
> > > > > > 
> > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > desirable for
> > > > > > a driver to be able to use it for throttling and error
> > > > > > handling also with
> > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > dma-fence protocol.
> > > > > > 
> > > > > > Introduce long-running completion fences in form of
> > > > > > dma-fences, and add
> > > > > > lockdep annotation for them. In particular:
> > > > > > 
> > > > > > * Do not allow waiting under any memory management locks.
> > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > helper adding
> > > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > > >    complete anytime soon. Typically this will be the
> > > > > > scheduler chaining
> > > > > >    a new long-running fence on another one.
> > > > > 
> > > > > Well that's pretty much what I tried before:
> > > > > https://lwn.net/Articles/893704/
> > > > > 
> 
> I don't think this quite the same, this explictly enforces that we don't
> break the dma-fence rules (in path of memory allocations, exported in
> any way), essentially this just SW sync point reusing dma-fence the
> infrastructure for signaling / callbacks. I believe your series tried to
> export these fences to user space (admittedly I haven't fully read your
> series).
> 
> In this use case we essentially just want to flow control the ring via
> the dma-scheduler + maintain a list of pending jobs so the TDR can be
> used for cleanup if LR entity encounters an error. To me this seems
> perfectly reasonable but I know dma-femce rules are akin to a holy war.
> 
> If we return NULL in run_job, now we have to be able to sink all jobs
> in the backend regardless on ring space, maintain a list of jobs pending
> for cleanup after errors, and write a different cleanup path as now the
> TDR doesn't work. Seems very, very silly to duplicate all of this code
> when the DRM scheduler provides all of this for us. Also if we go this
> route, now all drivers are going to invent ways to handle LR jobs /w the
> DRM scheduler.
> 
> This solution is pretty clear, mark the scheduler as LR, and don't
> export any fences from the scheduler. If you try to export these fences
> a blow up happens.

The problem is if you mix things up. Like for resets you need all the
schedulers on an engine/set-of-engines to quiesce or things get
potentially hilarious. If you now have a scheduler in forever limbo, the
dma_fence guarantees are right out the window.

But the issue you're having is fairly specific if it's just about
ringspace. I think the dumbest fix is to just block in submit if you run
out of per-ctx ringspace, and call it a day. This notion that somehow the
kernel is supposed to provide a bottomless queue of anything userspace
submits simply doesn't hold up in reality (as much as userspace standards
committees would like it to), and as long as it doesn't have a real-world
perf impact it doesn't really matter why we end up blocking in the submit
ioctl. It might also be a simple memory allocation that hits a snag in
page reclaim.
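
Something as dumb as this in the submit ioctl would do (sketch only,
names made up):

static int my_submit_wait_ring_space(struct my_exec_queue *q, u32 needed)
{
        /* Interruptible, so userspace can still be killed while throttled. */
        return wait_event_interruptible(q->ring_space_wq,
                                        my_ring_space_available(q) >= needed);
}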

> > > > > And the reasons why it was rejected haven't changed.
> > > > > 
> > > > > Regards,
> > > > > Christian.
> > > > > 
> > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > tackle this problem while being able to reuse the scheduler for
> > > > long-running workloads.
> > > > 
> > > > I couldn't see any clear decision on your series, though, but one
> > > > main difference I see is that this is intended for driver-internal
> > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > driver-private use). This is by no means a way to try tackle the
> > > > indefinite fence problem.
> > > 
> > > Well this was just my latest try to tackle this, but essentially the
> > > problems are the same as with your approach: When we express such
> > > operations as dma_fence there is always the change that we leak that
> > > somewhere.
> > > 
> > > My approach of adding a flag noting that this operation is dangerous and
> > > can't be synced with something memory management depends on tried to
> > > contain this as much as possible, but Daniel still pretty clearly
> > > rejected it (for good reasons I think).
> > > 
> > > > 
> > > > We could ofc invent a completely different data-type that abstracts
> > > > the synchronization the scheduler needs in the long-running case, or
> > > > each driver could hack something up, like sleeping in the
> > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > should still be annotated in one way or annotated one way or another
> > > > (and probably in a similar way across drivers) to make sure we don't
> > > > do anything bad.
> > > > 
> > > >  So any suggestions as to what would be the better solution here
> > > > would be appreciated.
> > > 
> > > Mhm, do we really the the GPU scheduler for that?
> > > 
> 
> I think we need to solve this within the DRM scheduler one way or
> another.

Yeah so if we conclude that the queue really must be bottomless then I
agree drm-sched should help sort out the mess. Because I'm guessing
that every driver will have this issue. But that's a big if.

I guess if we teach the drm scheduler that some jobs are fairly endless
then maybe it wouldn't be too far-fetched to also teach it to wait for a
previous one to finish (but not with the dma_fence that preempts, which we
put into the dma_resv for memory management, but some other struct
completion). The scheduler already has a concept of not stuffing too much
stuff into the same queue after all, so this should fit?
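
Sketch of the shape I have in mind (names made up): the entity carries a
plain struct completion for the previous endless job, and the scheduler
waits on that instead of on a dma_fence before pushing the next one.

struct my_lr_entity {
        struct drm_sched_entity base;
        struct completion prev_job_done;        /* not a dma_fence */
};

static void my_lr_push_next(struct my_lr_entity *e)
{
        /* Throttle on the previous long-running job without a dma_fence. */
        wait_for_completion(&e->prev_job_done);
        reinit_completion(&e->prev_job_done);

        /* ... push the next job to the firmware ... */
}
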
-Daniel


> > > I mean in the 1 to 1 case  you basically just need a component which
> > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > schedules a work item.
> > > 
> > > As long as the work item itself doesn't produce a dma_fence it can then
> > > still just wait for other none dma_fence dependencies.
> > > 
> > > Then the work function could submit the work and wait for the result.
> > > 
> > > The work item would then pretty much represent what you want, you can
> > > wait for it to finish and pass it along as long running dependency.
> > > 
> > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > basically it.
> > > 
> > This very much sounds like a i915_sw_fence for the dependency tracking and
> > dma_fence_work for the actual work although it's completion fence is a
> > dma_fence.
> >
> 
> Agree this does sound to i915ish as stated below one of mandates in Xe
> was to use the DRM scheduler. Beyond that as someone who a submission
> backend in the i915 and Xe, I love how the DRM scheduler works (single
> entry point), it makes everything so much easier.
> 
> Matt
> 
> > Although that goes against the whole idea of a condition for merging the xe
> > driver would be that we implement some sort of minimal scaffolding for
> > long-running workloads in the drm scheduler, and the thinking behind that is
> > to avoid implementing intel-specific solutions like those...
> > 
> > Thanks,
> > 
> > Thomas
> > 
> > 
> > 
> > > Regards,
> > > Christian.
> > > 
> > > > 
> > > > Thanks,
> > > > 
> > > > Thomas
> > > > 
> > > > 
> > > > 
> > > > 
> > > >
Matthew Brost April 4, 2023, 7:48 p.m. UTC | #8
On Tue, Apr 04, 2023 at 09:25:52PM +0200, Daniel Vetter wrote:
> On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> > On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> > > 
> > > On 4/4/23 15:10, Christian König wrote:
> > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > Hi, Christian,
> > > > > 
> > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > > 
> > > > > > > For long-running workloads, drivers either need to open-code
> > > > > > > completion
> > > > > > > waits, invent their own synchronization primitives or internally use
> > > > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > > without any lockdep annotation all these approaches are error prone.
> > > > > > > 
> > > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > > desirable for
> > > > > > > a driver to be able to use it for throttling and error
> > > > > > > handling also with
> > > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > > dma-fence protocol.
> > > > > > > 
> > > > > > > Introduce long-running completion fences in form of
> > > > > > > dma-fences, and add
> > > > > > > lockdep annotation for them. In particular:
> > > > > > > 
> > > > > > > * Do not allow waiting under any memory management locks.
> > > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > > helper adding
> > > > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > > > >    complete anytime soon. Typically this will be the
> > > > > > > scheduler chaining
> > > > > > >    a new long-running fence on another one.
> > > > > > 
> > > > > > Well that's pretty much what I tried before:
> > > > > > https://lwn.net/Articles/893704/
> > > > > > 
> > 
> > I don't think this quite the same, this explictly enforces that we don't
> > break the dma-fence rules (in path of memory allocations, exported in
> > any way), essentially this just SW sync point reusing dma-fence the
> > infrastructure for signaling / callbacks. I believe your series tried to
> > export these fences to user space (admittedly I haven't fully read your
> > series).
> > 
> > In this use case we essentially just want to flow control the ring via
> > the dma-scheduler + maintain a list of pending jobs so the TDR can be
> > used for cleanup if LR entity encounters an error. To me this seems
> > perfectly reasonable but I know dma-femce rules are akin to a holy war.
> > 
> > If we return NULL in run_job, now we have to be able to sink all jobs
> > in the backend regardless on ring space, maintain a list of jobs pending
> > for cleanup after errors, and write a different cleanup path as now the
> > TDR doesn't work. Seems very, very silly to duplicate all of this code
> > when the DRM scheduler provides all of this for us. Also if we go this
> > route, now all drivers are going to invent ways to handle LR jobs /w the
> > DRM scheduler.
> > 
> > This solution is pretty clear, mark the scheduler as LR, and don't
> > export any fences from the scheduler. If you try to export these fences
> > a blow up happens.
> 
> The problem is if you mix things up. Like for resets you need all the
> schedulers on an engine/set-of-engines to quiescent or things get
> potentially hilarious. If you now have a scheduler in forever limbo, the
> dma_fence guarantees are right out the window.
> 

Right, a GT reset on Xe is:

Stop all schedulers
Do a reset
Ban any schedulers which we think caused the GT reset
Resubmit all schedulers which we think were good
Restart all schedulers

None of this flow depends on LR dma-fences; all of this uses the DRM
sched infrastructure and works very well compared to the i915. Rewriting
all this with a driver-specific implementation is what we are trying to
avoid.

Similarly, if an LR entity hangs on its own (not a GT reset, rather the
firmware does the reset for us) we use all the DRM scheduler
infrastructure to handle this. Again this works rather well...
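
Roughly, as a sketch (driver-specific helpers made up, the drm_sched
calls are the existing drm_sched_stop() / drm_sched_resubmit_jobs() /
drm_sched_start() helpers):

static void my_gt_reset(struct my_gt *gt)
{
        struct my_exec_queue *q;

        /* Stop all schedulers */
        list_for_each_entry(q, &gt->queues, link)
                drm_sched_stop(&q->sched, NULL);

        /* Do a reset */
        my_hw_reset(gt);

        list_for_each_entry(q, &gt->queues, link) {
                /* Ban the queues we think caused the reset ... */
                if (my_queue_caused_reset(q))
                        my_queue_ban(q);
                else
                        /* ... resubmit the ones we think were good ... */
                        drm_sched_resubmit_jobs(&q->sched);

                /* ... and restart all schedulers. */
                drm_sched_start(&q->sched, true);
        }
}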

> But the issue you're having is fairly specific if it's just about
> ringspace. I think the dumbest fix is to just block in submit if you run
> out of per-ctx ringspace, and call it a day. This notion that somehow the

How does that not break the dma-fence rules? A job can publish its
finished fence after ARM; if the finished fence waits on ring space that
may not free up in a reasonable amount of time, we have now broken the
dma-fence rules. My understanding is that a dma-fence must only wait on
other dma-fences; Christian seems to agree and NAK'd just blocking if no
space is available [1]. IMO this series ensures we don't break the
dma-fence rules by restricting how the finished fence can be used.

> kernel is supposed to provide a bottomless queue of anything userspace
> submits simply doesn't hold up in reality (as much as userspace standards
> committees would like it to), and as long as it doesn't have a real-world
> perf impact it doesn't really matter why we end up blocking in the submit
> ioctl. It might also be a simple memory allocation that hits a snag in
> page reclaim.
> 
> > > > > > And the reasons why it was rejected haven't changed.
> > > > > > 
> > > > > > Regards,
> > > > > > Christian.
> > > > > > 
> > > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > > tackle this problem while being able to reuse the scheduler for
> > > > > long-running workloads.
> > > > > 
> > > > > I couldn't see any clear decision on your series, though, but one
> > > > > main difference I see is that this is intended for driver-internal
> > > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > > driver-private use). This is by no means a way to try tackle the
> > > > > indefinite fence problem.
> > > > 
> > > > Well this was just my latest try to tackle this, but essentially the
> > > > problems are the same as with your approach: When we express such
> > > > operations as dma_fence there is always the change that we leak that
> > > > somewhere.
> > > > 
> > > > My approach of adding a flag noting that this operation is dangerous and
> > > > can't be synced with something memory management depends on tried to
> > > > contain this as much as possible, but Daniel still pretty clearly
> > > > rejected it (for good reasons I think).
> > > > 
> > > > > 
> > > > > We could ofc invent a completely different data-type that abstracts
> > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > each driver could hack something up, like sleeping in the
> > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > should still be annotated in one way or annotated one way or another
> > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > do anything bad.
> > > > > 
> > > > >  So any suggestions as to what would be the better solution here
> > > > > would be appreciated.
> > > > 
> > > > Mhm, do we really the the GPU scheduler for that?
> > > > 
> > 
> > I think we need to solve this within the DRM scheduler one way or
> > another.
> 
> Yeah so if we conclude that the queue really must be bottomless then I
> agree drm-sched should help out sort out the mess. Because I'm guessing
> that every driver will have this issue. But that's a big if.
> 
> I guess if we teach the drm scheduler that some jobs are fairly endless
> then maybe it wouldn't be too far-fetched to also teach it to wait for a
> previous one to finish (but not with the dma_fence that preempts, which we
> put into the dma_resv for memory management, but some other struct
> completion). The scheduler already has a concept of not stuffing too much
> stuff into the same queue after all, so this should fit?

See above; it's the exact same situation as spinning on flow controlling
the ring, which IMO absolutely breaks the dma-fence rules. IMO the correct
solution is to have a DRM scheduler that doesn't export dma-fences, and
this is exactly what this series does: if we try to, boom, lockdep /
WARN_ON blows up.

Matt

[1] https://patchwork.freedesktop.org/patch/525461/?series=114772&rev=2

> -Daniel
> 
> 
> > > > I mean in the 1 to 1 case  you basically just need a component which
> > > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > > schedules a work item.
> > > > 
> > > > As long as the work item itself doesn't produce a dma_fence it can then
> > > > still just wait for other none dma_fence dependencies.
> > > > 
> > > > Then the work function could submit the work and wait for the result.
> > > > 
> > > > The work item would then pretty much represent what you want, you can
> > > > wait for it to finish and pass it along as long running dependency.
> > > > 
> > > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > > basically it.
> > > > 
> > > This very much sounds like a i915_sw_fence for the dependency tracking and
> > > dma_fence_work for the actual work although it's completion fence is a
> > > dma_fence.
> > >
> > 
> > Agree this does sound to i915ish as stated below one of mandates in Xe
> > was to use the DRM scheduler. Beyond that as someone who a submission
> > backend in the i915 and Xe, I love how the DRM scheduler works (single
> > entry point), it makes everything so much easier.
> > 
> > Matt
> > 
> > > Although that goes against the whole idea of a condition for merging the xe
> > > driver would be that we implement some sort of minimal scaffolding for
> > > long-running workloads in the drm scheduler, and the thinking behind that is
> > > to avoid implementing intel-specific solutions like those...
> > > 
> > > Thanks,
> > > 
> > > Thomas
> > > 
> > > 
> > > 
> > > > Regards,
> > > > Christian.
> > > > 
> > > > > 
> > > > > Thanks,
> > > > > 
> > > > > Thomas
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Matthew Brost April 4, 2023, 8:03 p.m. UTC | #9
On Tue, Apr 04, 2023 at 09:00:59PM +0200, Daniel Vetter wrote:
> On Tue, 4 Apr 2023 at 15:10, Christian König <christian.koenig@amd.com> wrote:
> >
> > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > Hi, Christian,
> > >
> > > On 4/4/23 11:09, Christian König wrote:
> > >> Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > >>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > >>>
> > >>> For long-running workloads, drivers either need to open-code completion
> > >>> waits, invent their own synchronization primitives or internally use
> > >>> dma-fences that do not obey the cross-driver dma-fence protocol, but
> > >>> without any lockdep annotation all these approaches are error prone.
> > >>>
> > >>> So since for example the drm scheduler uses dma-fences it is
> > >>> desirable for
> > >>> a driver to be able to use it for throttling and error handling also
> > >>> with
> > >>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
> > >>>
> > >>> Introduce long-running completion fences in form of dma-fences, and add
> > >>> lockdep annotation for them. In particular:
> > >>>
> > >>> * Do not allow waiting under any memory management locks.
> > >>> * Do not allow to attach them to a dma-resv object.
> > >>> * Introduce a new interface for adding callbacks making the helper
> > >>> adding
> > >>>    a callback sign off on that it is aware that the dma-fence may not
> > >>>    complete anytime soon. Typically this will be the scheduler chaining
> > >>>    a new long-running fence on another one.
> > >>
> > >> Well that's pretty much what I tried before:
> > >> https://lwn.net/Articles/893704/
> > >>
> > >> And the reasons why it was rejected haven't changed.
> > >>
> > >> Regards,
> > >> Christian.
> > >>
> > > Yes, TBH this was mostly to get discussion going how we'd best tackle
> > > this problem while being able to reuse the scheduler for long-running
> > > workloads.
> > >
> > > I couldn't see any clear decision on your series, though, but one main
> > > difference I see is that this is intended for driver-internal use
> > > only. (I'm counting using the drm_scheduler as a helper for
> > > driver-private use). This is by no means a way to try tackle the
> > > indefinite fence problem.
> >
> > Well this was just my latest try to tackle this, but essentially the
> > problems are the same as with your approach: When we express such
> > operations as dma_fence there is always the change that we leak that
> > somewhere.
> >
> > My approach of adding a flag noting that this operation is dangerous and
> > can't be synced with something memory management depends on tried to
> > contain this as much as possible, but Daniel still pretty clearly
> > rejected it (for good reasons I think).
> 
> Yeah I still don't like dma_fence that somehow have totally different
> semantics in that critical piece of "will it complete or will it
> deadlock?" :-)

Not going to touch LR dma-fences in this reply; I think we can continue
the LR fence discussion in the fork of this thread I just responded to.
I have a response to the preempt fence discussion below.

> >
> > >
> > > We could ofc invent a completely different data-type that abstracts
> > > the synchronization the scheduler needs in the long-running case, or
> > > each driver could hack something up, like sleeping in the
> > > prepare_job() or run_job() callback for throttling, but those waits
> > > should still be annotated in one way or annotated one way or another
> > > (and probably in a similar way across drivers) to make sure we don't
> > > do anything bad.
> > >
> > >  So any suggestions as to what would be the better solution here would
> > > be appreciated.
> >
> > Mhm, do we really the the GPU scheduler for that?
> >
> > I mean in the 1 to 1 case  you basically just need a component which
> > collects the dependencies as dma_fence and if all of them are fulfilled
> > schedules a work item.
> >
> > As long as the work item itself doesn't produce a dma_fence it can then
> > still just wait for other none dma_fence dependencies.
> 
> Yeah that's the important thing, for long-running jobs dependencies as
> dma_fence should be totally fine. You're just not allowed to have any
> outgoing dma_fences at all (except the magic preemption fence).
> 
> > Then the work function could submit the work and wait for the result.
> >
> > The work item would then pretty much represent what you want, you can
> > wait for it to finish and pass it along as long running dependency.
> >
> > Maybe give it a funky name and wrap it up in a structure, but that's
> > basically it.
> 
> Like do we need this? If the kernel ever waits for a long-running
> compute job to finnish I'd call that a bug. Any functional
> dependencies between engines or whatever are userspace's problem only,
> which it needs to sort out using userspace memory fences.
> 
> The only things the kernel needs are some way to track dependencies as
> dma_fence (because memory management move the memory away and we need
> to move it back in, ideally pipelined). And it needs the special
> preempt fence (if we don't have pagefaults) so that you have a fence
> to attach to all the dma_resv for memory management purposes. Now the
> scheduler already has almost all the pieces (at least if we assume
> there's some magic fw which time-slices these contexts on its own),
> and we just need a few minimal changes:
> - allowing the scheduler to ignore the completion fence and just
> immediately push the next "job" in if its dependencies are ready
> - maybe minimal amounts of scaffolding to handle the preemption
> dma_fence because that's not entirely trivial. I think ideally we'd
> put that into drm_sched_entity since you can only ever have one active
> preempt dma_fence per gpu ctx/entity.
> 

Yep, the preempt fence is per entity in Xe (xe_engine). We install these
into the VM's dma-resv and into the dma-resv slots of all external BOs
mapped in the VM. Wondering if we can make all of this very generic
between the DRM scheduler + GPUVA...
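
Roughly what that install looks like (hand-waved sketch only; the helper
and list names below are made up, the BOOKKEEP usage slot is an
assumption, and the real code also needs the relevant dma_resv locks held
and fence slots reserved):

static void install_preempt_fence(struct xe_vm *vm, struct dma_fence *pfence)
{
        struct xe_bo *bo;

        /* VM-private BOs share the VM's reservation object. */
        dma_resv_add_fence(xe_vm_resv(vm), pfence, DMA_RESV_USAGE_BOOKKEEP);

        /* External BOs have their own dma_resv, install there as well. */
        list_for_each_entry(bo, &vm->external_objs, vm_link)
                dma_resv_add_fence(bo->ttm.base.resv, pfence,
                                   DMA_RESV_USAGE_BOOKKEEP);
}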

Matt

> None of this needs a dma_fence_is_lr anywhere at all.
> 
> Of course there's the somewhat related issue of "how do we transport
> these userspace memory fences around from app to compositor", and
> that's a lot more gnarly. I still don't think dma_fence_is_lr is
> anywhere near what the solution should look like for that.
> -Daniel
> 
> 
> > Regards,
> > Christian.
> >
> > >
> > > Thanks,
> > >
> > > Thomas
> > >
> > >
> > >
> > >
> > >
> >
> 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter April 4, 2023, 8:11 p.m. UTC | #10
On Tue, 4 Apr 2023 at 22:04, Matthew Brost <matthew.brost@intel.com> wrote:
>
> On Tue, Apr 04, 2023 at 09:00:59PM +0200, Daniel Vetter wrote:
> > On Tue, 4 Apr 2023 at 15:10, Christian König <christian.koenig@amd.com> wrote:
> > >
> > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > Hi, Christian,
> > > >
> > > > On 4/4/23 11:09, Christian König wrote:
> > > >> Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > >>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > >>>
> > > >>> For long-running workloads, drivers either need to open-code completion
> > > >>> waits, invent their own synchronization primitives or internally use
> > > >>> dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > >>> without any lockdep annotation all these approaches are error prone.
> > > >>>
> > > >>> So since for example the drm scheduler uses dma-fences it is
> > > >>> desirable for
> > > >>> a driver to be able to use it for throttling and error handling also
> > > >>> with
> > > >>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
> > > >>>
> > > >>> Introduce long-running completion fences in form of dma-fences, and add
> > > >>> lockdep annotation for them. In particular:
> > > >>>
> > > >>> * Do not allow waiting under any memory management locks.
> > > >>> * Do not allow to attach them to a dma-resv object.
> > > >>> * Introduce a new interface for adding callbacks making the helper
> > > >>> adding
> > > >>>    a callback sign off on that it is aware that the dma-fence may not
> > > >>>    complete anytime soon. Typically this will be the scheduler chaining
> > > >>>    a new long-running fence on another one.
> > > >>
> > > >> Well that's pretty much what I tried before:
> > > >> https://lwn.net/Articles/893704/
> > > >>
> > > >> And the reasons why it was rejected haven't changed.
> > > >>
> > > >> Regards,
> > > >> Christian.
> > > >>
> > > > Yes, TBH this was mostly to get discussion going how we'd best tackle
> > > > this problem while being able to reuse the scheduler for long-running
> > > > workloads.
> > > >
> > > > I couldn't see any clear decision on your series, though, but one main
> > > > difference I see is that this is intended for driver-internal use
> > > > only. (I'm counting using the drm_scheduler as a helper for
> > > > driver-private use). This is by no means a way to try tackle the
> > > > indefinite fence problem.
> > >
> > > Well this was just my latest try to tackle this, but essentially the
> > > problems are the same as with your approach: When we express such
> > > operations as dma_fence there is always the change that we leak that
> > > somewhere.
> > >
> > > My approach of adding a flag noting that this operation is dangerous and
> > > can't be synced with something memory management depends on tried to
> > > contain this as much as possible, but Daniel still pretty clearly
> > > rejected it (for good reasons I think).
> >
> > Yeah I still don't like dma_fence that somehow have totally different
> > semantics in that critical piece of "will it complete or will it
> > deadlock?" :-)
>
> Not going to touch LR dma-fences in this reply, I think we can continue
> the LR fence discussion of the fork of this thread I just responded to.
> Have a response the preempt fence discussion below.
>
> > >
> > > >
> > > > We could ofc invent a completely different data-type that abstracts
> > > > the synchronization the scheduler needs in the long-running case, or
> > > > each driver could hack something up, like sleeping in the
> > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > should still be annotated in one way or annotated one way or another
> > > > (and probably in a similar way across drivers) to make sure we don't
> > > > do anything bad.
> > > >
> > > >  So any suggestions as to what would be the better solution here would
> > > > be appreciated.
> > >
> > > Mhm, do we really the the GPU scheduler for that?
> > >
> > > I mean in the 1 to 1 case  you basically just need a component which
> > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > schedules a work item.
> > >
> > > As long as the work item itself doesn't produce a dma_fence it can then
> > > still just wait for other none dma_fence dependencies.
> >
> > Yeah that's the important thing, for long-running jobs dependencies as
> > dma_fence should be totally fine. You're just not allowed to have any
> > outgoing dma_fences at all (except the magic preemption fence).
> >
> > > Then the work function could submit the work and wait for the result.
> > >
> > > The work item would then pretty much represent what you want, you can
> > > wait for it to finish and pass it along as long running dependency.
> > >
> > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > basically it.
> >
> > Like do we need this? If the kernel ever waits for a long-running
> > compute job to finnish I'd call that a bug. Any functional
> > dependencies between engines or whatever are userspace's problem only,
> > which it needs to sort out using userspace memory fences.
> >
> > The only things the kernel needs are some way to track dependencies as
> > dma_fence (because memory management move the memory away and we need
> > to move it back in, ideally pipelined). And it needs the special
> > preempt fence (if we don't have pagefaults) so that you have a fence
> > to attach to all the dma_resv for memory management purposes. Now the
> > scheduler already has almost all the pieces (at least if we assume
> > there's some magic fw which time-slices these contexts on its own),
> > and we just need a few minimal changes:
> > - allowing the scheduler to ignore the completion fence and just
> > immediately push the next "job" in if its dependencies are ready
> > - maybe minimal amounts of scaffolding to handle the preemption
> > dma_fence because that's not entirely trivial. I think ideally we'd
> > put that into drm_sched_entity since you can only ever have one active
> > preempt dma_fence per gpu ctx/entity.
> >
>
> Yep, preempt fence is per entity in Xe (xe_engine). We install these
> into the VM and all external BOs mapped in the VM dma-resv slots.
> Wondering if we can make all of this very generic between the DRM
> scheduler + GPUVA...

I think if the drm/sched just takes care of the preempt ctx dma_fence
(and still stores it in the same slot in the drm_sched_job struct like
an end-of-batch dma_fence job would), and then the gpuva shared code
just has functions to smash these into the right dma_resv structures
then you have all the pieces. Maybe for a bit more flexibility the
gpuva code takes dma_fence and not directly the drm_sched_job, but
maybe even that level of integration makes sense (but imo optional, a
bit of driver glue code is fine).

Yeah that's roughly what I think we should at least aim for since
there's quite a few drivers in-flight that all need these pieces (more
or less at least).
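
Hand-waved sketch of that split (none of these helpers exist yet, all
names including the manager type are placeholders, and the BOOKKEEP usage
is an assumption): drm/sched owns the per-entity preempt fence, the gpuva
code only ever sees a dma_fence, and the driver glue is a couple of lines.

struct dma_fence *
drm_sched_entity_preempt_fence(struct drm_sched_entity *entity);

void drm_gpuva_mgr_add_fence(struct drm_gpuva_manager *mgr,
                             struct dma_fence *fence,
                             enum dma_resv_usage usage);

/* driver glue, once per long-running entity: */
static void driver_install_preempt_fence(struct drm_gpuva_manager *mgr,
                                         struct drm_sched_entity *entity)
{
        struct dma_fence *fence = drm_sched_entity_preempt_fence(entity);

        drm_gpuva_mgr_add_fence(mgr, fence, DMA_RESV_USAGE_BOOKKEEP);
        dma_fence_put(fence);
}
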
-Daniel
>
> Matt
>
> > None of this needs a dma_fence_is_lr anywhere at all.
> >
> > Of course there's the somewhat related issue of "how do we transport
> > these userspace memory fences around from app to compositor", and
> > that's a lot more gnarly. I still don't think dma_fence_is_lr is
> > anywhere near what the solution should look like for that.
> > -Daniel
> >
> >
> > > Regards,
> > > Christian.
> > >
> > > >
> > > > Thanks,
> > > >
> > > > Thomas
> > > >
> > > >
> > > >
> > > >
> > > >
> > >
> >
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Matthew Brost April 4, 2023, 8:19 p.m. UTC | #11
On Tue, Apr 04, 2023 at 10:11:59PM +0200, Daniel Vetter wrote:
> On Tue, 4 Apr 2023 at 22:04, Matthew Brost <matthew.brost@intel.com> wrote:
> >
> > On Tue, Apr 04, 2023 at 09:00:59PM +0200, Daniel Vetter wrote:
> > > On Tue, 4 Apr 2023 at 15:10, Christian König <christian.koenig@amd.com> wrote:
> > > >
> > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > Hi, Christian,
> > > > >
> > > > > On 4/4/23 11:09, Christian König wrote:
> > > > >> Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > >>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > >>>
> > > > >>> For long-running workloads, drivers either need to open-code completion
> > > > >>> waits, invent their own synchronization primitives or internally use
> > > > >>> dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > >>> without any lockdep annotation all these approaches are error prone.
> > > > >>>
> > > > >>> So since for example the drm scheduler uses dma-fences it is
> > > > >>> desirable for
> > > > >>> a driver to be able to use it for throttling and error handling also
> > > > >>> with
> > > > >>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
> > > > >>>
> > > > >>> Introduce long-running completion fences in form of dma-fences, and add
> > > > >>> lockdep annotation for them. In particular:
> > > > >>>
> > > > >>> * Do not allow waiting under any memory management locks.
> > > > >>> * Do not allow to attach them to a dma-resv object.
> > > > >>> * Introduce a new interface for adding callbacks making the helper
> > > > >>> adding
> > > > >>>    a callback sign off on that it is aware that the dma-fence may not
> > > > >>>    complete anytime soon. Typically this will be the scheduler chaining
> > > > >>>    a new long-running fence on another one.
> > > > >>
> > > > >> Well that's pretty much what I tried before:
> > > > >> https://lwn.net/Articles/893704/
> > > > >>
> > > > >> And the reasons why it was rejected haven't changed.
> > > > >>
> > > > >> Regards,
> > > > >> Christian.
> > > > >>
> > > > > Yes, TBH this was mostly to get discussion going how we'd best tackle
> > > > > this problem while being able to reuse the scheduler for long-running
> > > > > workloads.
> > > > >
> > > > > I couldn't see any clear decision on your series, though, but one main
> > > > > difference I see is that this is intended for driver-internal use
> > > > > only. (I'm counting using the drm_scheduler as a helper for
> > > > > driver-private use). This is by no means a way to try tackle the
> > > > > indefinite fence problem.
> > > >
> > > > Well this was just my latest try to tackle this, but essentially the
> > > > problems are the same as with your approach: When we express such
> > > > operations as dma_fence there is always the change that we leak that
> > > > somewhere.
> > > >
> > > > My approach of adding a flag noting that this operation is dangerous and
> > > > can't be synced with something memory management depends on tried to
> > > > contain this as much as possible, but Daniel still pretty clearly
> > > > rejected it (for good reasons I think).
> > >
> > > Yeah I still don't like dma_fence that somehow have totally different
> > > semantics in that critical piece of "will it complete or will it
> > > deadlock?" :-)
> >
> > Not going to touch LR dma-fences in this reply, I think we can continue
> > the LR fence discussion of the fork of this thread I just responded to.
> > Have a response the preempt fence discussion below.
> >
> > > >
> > > > >
> > > > > We could ofc invent a completely different data-type that abstracts
> > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > each driver could hack something up, like sleeping in the
> > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > should still be annotated in one way or annotated one way or another
> > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > do anything bad.
> > > > >
> > > > >  So any suggestions as to what would be the better solution here would
> > > > > be appreciated.
> > > >
> > > > Mhm, do we really the the GPU scheduler for that?
> > > >
> > > > I mean in the 1 to 1 case  you basically just need a component which
> > > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > > schedules a work item.
> > > >
> > > > As long as the work item itself doesn't produce a dma_fence it can then
> > > > still just wait for other none dma_fence dependencies.
> > >
> > > Yeah that's the important thing, for long-running jobs dependencies as
> > > dma_fence should be totally fine. You're just not allowed to have any
> > > outgoing dma_fences at all (except the magic preemption fence).
> > >
> > > > Then the work function could submit the work and wait for the result.
> > > >
> > > > The work item would then pretty much represent what you want, you can
> > > > wait for it to finish and pass it along as long running dependency.
> > > >
> > > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > > basically it.
> > >
> > > Like do we need this? If the kernel ever waits for a long-running
> > > compute job to finnish I'd call that a bug. Any functional
> > > dependencies between engines or whatever are userspace's problem only,
> > > which it needs to sort out using userspace memory fences.
> > >
> > > The only things the kernel needs are some way to track dependencies as
> > > dma_fence (because memory management move the memory away and we need
> > > to move it back in, ideally pipelined). And it needs the special
> > > preempt fence (if we don't have pagefaults) so that you have a fence
> > > to attach to all the dma_resv for memory management purposes. Now the
> > > scheduler already has almost all the pieces (at least if we assume
> > > there's some magic fw which time-slices these contexts on its own),
> > > and we just need a few minimal changes:
> > > - allowing the scheduler to ignore the completion fence and just
> > > immediately push the next "job" in if its dependencies are ready
> > > - maybe minimal amounts of scaffolding to handle the preemption
> > > dma_fence because that's not entirely trivial. I think ideally we'd
> > > put that into drm_sched_entity since you can only ever have one active
> > > preempt dma_fence per gpu ctx/entity.
> > >
> >
> > Yep, preempt fence is per entity in Xe (xe_engine). We install these
> > into the VM and all external BOs mapped in the VM dma-resv slots.
> > Wondering if we can make all of this very generic between the DRM
> > scheduler + GPUVA...
> 
> I think if the drm/sched just takes care of the preempt ctx dma_fence
> (and still stores it in the same slot in the drm_sched_job struct like
> a end-of-batch dma_fence job would), and then the gpuva shared code
> just has functions to smash these into the right dma_resv structures
> then you have all the pieces. Maybe for a bit more flexibility the
> gpuva code takes dma_fence and not directly the drm_sched_job, but
> maybe even that level of integration makes sense (but imo optional, a
> bit of driver glue code is fine).
> 
> Yeah that's roughly what I think we should at least aim for since
> there's quiet a few drivers in-flight that all need these pieces (more
> or less at least).

That is very close to what I'm thinking too. We also want to tackle
userptr + GPUVA; I think that will be next, but we can add this to the
list of things to do.

Matt

> -Daniel
> >
> > Matt
> >
> > > None of this needs a dma_fence_is_lr anywhere at all.
> > >
> > > Of course there's the somewhat related issue of "how do we transport
> > > these userspace memory fences around from app to compositor", and
> > > that's a lot more gnarly. I still don't think dma_fence_is_lr is
> > > anywhere near what the solution should look like for that.
> > > -Daniel
> > >
> > >
> > > > Regards,
> > > > Christian.
> > > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > > Thomas
> > > > >
> > > > >
> > > > >
> > > > >
> > > > >
> > > >
> > >
> > >
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
> 
> 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter April 4, 2023, 8:31 p.m. UTC | #12
On Tue, Apr 04, 2023 at 08:19:37PM +0000, Matthew Brost wrote:
> On Tue, Apr 04, 2023 at 10:11:59PM +0200, Daniel Vetter wrote:
> > On Tue, 4 Apr 2023 at 22:04, Matthew Brost <matthew.brost@intel.com> wrote:
> > >
> > > On Tue, Apr 04, 2023 at 09:00:59PM +0200, Daniel Vetter wrote:
> > > > On Tue, 4 Apr 2023 at 15:10, Christian König <christian.koenig@amd.com> wrote:
> > > > >
> > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > Hi, Christian,
> > > > > >
> > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > >> Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > >>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > >>>
> > > > > >>> For long-running workloads, drivers either need to open-code completion
> > > > > >>> waits, invent their own synchronization primitives or internally use
> > > > > >>> dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > >>> without any lockdep annotation all these approaches are error prone.
> > > > > >>>
> > > > > >>> So since for example the drm scheduler uses dma-fences it is
> > > > > >>> desirable for
> > > > > >>> a driver to be able to use it for throttling and error handling also
> > > > > >>> with
> > > > > >>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
> > > > > >>>
> > > > > >>> Introduce long-running completion fences in form of dma-fences, and add
> > > > > >>> lockdep annotation for them. In particular:
> > > > > >>>
> > > > > >>> * Do not allow waiting under any memory management locks.
> > > > > >>> * Do not allow to attach them to a dma-resv object.
> > > > > >>> * Introduce a new interface for adding callbacks making the helper
> > > > > >>> adding
> > > > > >>>    a callback sign off on that it is aware that the dma-fence may not
> > > > > >>>    complete anytime soon. Typically this will be the scheduler chaining
> > > > > >>>    a new long-running fence on another one.
> > > > > >>
> > > > > >> Well that's pretty much what I tried before:
> > > > > >> https://lwn.net/Articles/893704/
> > > > > >>
> > > > > >> And the reasons why it was rejected haven't changed.
> > > > > >>
> > > > > >> Regards,
> > > > > >> Christian.
> > > > > >>
> > > > > > Yes, TBH this was mostly to get discussion going how we'd best tackle
> > > > > > this problem while being able to reuse the scheduler for long-running
> > > > > > workloads.
> > > > > >
> > > > > > I couldn't see any clear decision on your series, though, but one main
> > > > > > difference I see is that this is intended for driver-internal use
> > > > > > only. (I'm counting using the drm_scheduler as a helper for
> > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > indefinite fence problem.
> > > > >
> > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > problems are the same as with your approach: When we express such
> > > > > operations as dma_fence there is always the change that we leak that
> > > > > somewhere.
> > > > >
> > > > > My approach of adding a flag noting that this operation is dangerous and
> > > > > can't be synced with something memory management depends on tried to
> > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > rejected it (for good reasons I think).
> > > >
> > > > Yeah I still don't like dma_fence that somehow have totally different
> > > > semantics in that critical piece of "will it complete or will it
> > > > deadlock?" :-)
> > >
> > > Not going to touch LR dma-fences in this reply, I think we can continue
> > > the LR fence discussion of the fork of this thread I just responded to.
> > > Have a response the preempt fence discussion below.
> > >
> > > > >
> > > > > >
> > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > each driver could hack something up, like sleeping in the
> > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > should still be annotated in one way or annotated one way or another
> > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > do anything bad.
> > > > > >
> > > > > >  So any suggestions as to what would be the better solution here would
> > > > > > be appreciated.
> > > > >
> > > > > Mhm, do we really the the GPU scheduler for that?
> > > > >
> > > > > I mean in the 1 to 1 case  you basically just need a component which
> > > > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > > > schedules a work item.
> > > > >
> > > > > As long as the work item itself doesn't produce a dma_fence it can then
> > > > > still just wait for other none dma_fence dependencies.
> > > >
> > > > Yeah that's the important thing, for long-running jobs dependencies as
> > > > dma_fence should be totally fine. You're just not allowed to have any
> > > > outgoing dma_fences at all (except the magic preemption fence).
> > > >
> > > > > Then the work function could submit the work and wait for the result.
> > > > >
> > > > > The work item would then pretty much represent what you want, you can
> > > > > wait for it to finish and pass it along as long running dependency.
> > > > >
> > > > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > > > basically it.
> > > >
> > > > Like do we need this? If the kernel ever waits for a long-running
> > > > compute job to finnish I'd call that a bug. Any functional
> > > > dependencies between engines or whatever are userspace's problem only,
> > > > which it needs to sort out using userspace memory fences.
> > > >
> > > > The only things the kernel needs are some way to track dependencies as
> > > > dma_fence (because memory management move the memory away and we need
> > > > to move it back in, ideally pipelined). And it needs the special
> > > > preempt fence (if we don't have pagefaults) so that you have a fence
> > > > to attach to all the dma_resv for memory management purposes. Now the
> > > > scheduler already has almost all the pieces (at least if we assume
> > > > there's some magic fw which time-slices these contexts on its own),
> > > > and we just need a few minimal changes:
> > > > - allowing the scheduler to ignore the completion fence and just
> > > > immediately push the next "job" in if its dependencies are ready
> > > > - maybe minimal amounts of scaffolding to handle the preemption
> > > > dma_fence because that's not entirely trivial. I think ideally we'd
> > > > put that into drm_sched_entity since you can only ever have one active
> > > > preempt dma_fence per gpu ctx/entity.
> > > >
> > >
> > > Yep, preempt fence is per entity in Xe (xe_engine). We install these
> > > into the VM and all external BOs mapped in the VM dma-resv slots.
> > > Wondering if we can make all of this very generic between the DRM
> > > scheduler + GPUVA...
> > 
> > I think if the drm/sched just takes care of the preempt ctx dma_fence
> > (and still stores it in the same slot in the drm_sched_job struct like
> > a end-of-batch dma_fence job would), and then the gpuva shared code
> > just has functions to smash these into the right dma_resv structures
> > then you have all the pieces. Maybe for a bit more flexibility the
> > gpuva code takes dma_fence and not directly the drm_sched_job, but
> > maybe even that level of integration makes sense (but imo optional, a
> > bit of driver glue code is fine).
> > 
> > Yeah that's roughly what I think we should at least aim for since
> > there's quiet a few drivers in-flight that all need these pieces (more
> > or less at least).
> 
> That is very close to what I'm thinking too, we want to tackle userptr +
> GPUVA too, think that will be next but can add this to the list of
> things to do.

I discussed userptr+gpuva a bit with Rodrigo (and maybe Thomas H, not sure
anymore) and it sounded a bit like that's maybe a bridge too far, at least
until we have some other drivers that also need that combo. But it can't
hurt to at least think about how it would ideally integrate from xe's pov.
-Daniel

> 
> Matt
> 
> > -Daniel
> > >
> > > Matt
> > >
> > > > None of this needs a dma_fence_is_lr anywhere at all.
> > > >
> > > > Of course there's the somewhat related issue of "how do we transport
> > > > these userspace memory fences around from app to compositor", and
> > > > that's a lot more gnarly. I still don't think dma_fence_is_lr is
> > > > anywhere near what the solution should look like for that.
> > > > -Daniel
> > > >
> > > >
> > > > > Regards,
> > > > > Christian.
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > Thomas
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > >
> > > >
> > > >
> > > > --
> > > > Daniel Vetter
> > > > Software Engineer, Intel Corporation
> > > > http://blog.ffwll.ch
> > 
> > 
> > 
> > -- 
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Matthew Brost April 4, 2023, 8:46 p.m. UTC | #13
On Tue, Apr 04, 2023 at 10:31:58PM +0200, Daniel Vetter wrote:
> On Tue, Apr 04, 2023 at 08:19:37PM +0000, Matthew Brost wrote:
> > On Tue, Apr 04, 2023 at 10:11:59PM +0200, Daniel Vetter wrote:
> > > On Tue, 4 Apr 2023 at 22:04, Matthew Brost <matthew.brost@intel.com> wrote:
> > > >
> > > > On Tue, Apr 04, 2023 at 09:00:59PM +0200, Daniel Vetter wrote:
> > > > > On Tue, 4 Apr 2023 at 15:10, Christian König <christian.koenig@amd.com> wrote:
> > > > > >
> > > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > > Hi, Christian,
> > > > > > >
> > > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > >> Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > >>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > >>>
> > > > > > >>> For long-running workloads, drivers either need to open-code completion
> > > > > > >>> waits, invent their own synchronization primitives or internally use
> > > > > > >>> dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > >>> without any lockdep annotation all these approaches are error prone.
> > > > > > >>>
> > > > > > >>> So since for example the drm scheduler uses dma-fences it is
> > > > > > >>> desirable for
> > > > > > >>> a driver to be able to use it for throttling and error handling also
> > > > > > >>> with
> > > > > > >>> internal dma-fences tha do not obey the cros-driver dma-fence protocol.
> > > > > > >>>
> > > > > > >>> Introduce long-running completion fences in form of dma-fences, and add
> > > > > > >>> lockdep annotation for them. In particular:
> > > > > > >>>
> > > > > > >>> * Do not allow waiting under any memory management locks.
> > > > > > >>> * Do not allow to attach them to a dma-resv object.
> > > > > > >>> * Introduce a new interface for adding callbacks making the helper
> > > > > > >>> adding
> > > > > > >>>    a callback sign off on that it is aware that the dma-fence may not
> > > > > > >>>    complete anytime soon. Typically this will be the scheduler chaining
> > > > > > >>>    a new long-running fence on another one.
> > > > > > >>
> > > > > > >> Well that's pretty much what I tried before:
> > > > > > >> https://lwn.net/Articles/893704/
> > > > > > >>
> > > > > > >> And the reasons why it was rejected haven't changed.
> > > > > > >>
> > > > > > >> Regards,
> > > > > > >> Christian.
> > > > > > >>
> > > > > > > Yes, TBH this was mostly to get discussion going how we'd best tackle
> > > > > > > this problem while being able to reuse the scheduler for long-running
> > > > > > > workloads.
> > > > > > >
> > > > > > > I couldn't see any clear decision on your series, though, but one main
> > > > > > > difference I see is that this is intended for driver-internal use
> > > > > > > only. (I'm counting using the drm_scheduler as a helper for
> > > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > > indefinite fence problem.
> > > > > >
> > > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > > problems are the same as with your approach: When we express such
> > > > > > operations as dma_fence there is always the change that we leak that
> > > > > > somewhere.
> > > > > >
> > > > > > My approach of adding a flag noting that this operation is dangerous and
> > > > > > can't be synced with something memory management depends on tried to
> > > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > > rejected it (for good reasons I think).
> > > > >
> > > > > Yeah I still don't like dma_fence that somehow have totally different
> > > > > semantics in that critical piece of "will it complete or will it
> > > > > deadlock?" :-)
> > > >
> > > > Not going to touch LR dma-fences in this reply, I think we can continue
> > > > the LR fence discussion of the fork of this thread I just responded to.
> > > > Have a response the preempt fence discussion below.
> > > >
> > > > > >
> > > > > > >
> > > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > > each driver could hack something up, like sleeping in the
> > > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > > should still be annotated in one way or annotated one way or another
> > > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > > do anything bad.
> > > > > > >
> > > > > > >  So any suggestions as to what would be the better solution here would
> > > > > > > be appreciated.
> > > > > >
> > > > > > Mhm, do we really the the GPU scheduler for that?
> > > > > >
> > > > > > I mean in the 1 to 1 case  you basically just need a component which
> > > > > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > > > > schedules a work item.
> > > > > >
> > > > > > As long as the work item itself doesn't produce a dma_fence it can then
> > > > > > still just wait for other none dma_fence dependencies.
> > > > >
> > > > > Yeah that's the important thing, for long-running jobs dependencies as
> > > > > dma_fence should be totally fine. You're just not allowed to have any
> > > > > outgoing dma_fences at all (except the magic preemption fence).
> > > > >
> > > > > > Then the work function could submit the work and wait for the result.
> > > > > >
> > > > > > The work item would then pretty much represent what you want, you can
> > > > > > wait for it to finish and pass it along as long running dependency.
> > > > > >
> > > > > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > > > > basically it.
> > > > >
> > > > > Like do we need this? If the kernel ever waits for a long-running
> > > > > compute job to finnish I'd call that a bug. Any functional
> > > > > dependencies between engines or whatever are userspace's problem only,
> > > > > which it needs to sort out using userspace memory fences.
> > > > >
> > > > > The only things the kernel needs are some way to track dependencies as
> > > > > dma_fence (because memory management move the memory away and we need
> > > > > to move it back in, ideally pipelined). And it needs the special
> > > > > preempt fence (if we don't have pagefaults) so that you have a fence
> > > > > to attach to all the dma_resv for memory management purposes. Now the
> > > > > scheduler already has almost all the pieces (at least if we assume
> > > > > there's some magic fw which time-slices these contexts on its own),
> > > > > and we just need a few minimal changes:
> > > > > - allowing the scheduler to ignore the completion fence and just
> > > > > immediately push the next "job" in if its dependencies are ready
> > > > > - maybe minimal amounts of scaffolding to handle the preemption
> > > > > dma_fence because that's not entirely trivial. I think ideally we'd
> > > > > put that into drm_sched_entity since you can only ever have one active
> > > > > preempt dma_fence per gpu ctx/entity.
> > > > >
> > > >
> > > > Yep, preempt fence is per entity in Xe (xe_engine). We install these
> > > > into the VM and all external BOs mapped in the VM dma-resv slots.
> > > > Wondering if we can make all of this very generic between the DRM
> > > > scheduler + GPUVA...
> > > 
> > > I think if the drm/sched just takes care of the preempt ctx dma_fence
> > > (and still stores it in the same slot in the drm_sched_job struct like
> > > a end-of-batch dma_fence job would), and then the gpuva shared code
> > > just has functions to smash these into the right dma_resv structures
> > > then you have all the pieces. Maybe for a bit more flexibility the
> > > gpuva code takes dma_fence and not directly the drm_sched_job, but
> > > maybe even that level of integration makes sense (but imo optional, a
> > > bit of driver glue code is fine).
> > > 
> > > Yeah that's roughly what I think we should at least aim for since
> > > there's quiet a few drivers in-flight that all need these pieces (more
> > > or less at least).
> > 
> > That is very close to what I'm thinking too, we want to tackle userptr +
> > GPUVA too, think that will be next but can add this to the list of
> > things to do.
> 
> I discussed userptr+gpuva a bit with Rodrigo (and maybe Thomas H not sure
> anymore) and it sounded a bit like that's maybe a bridge too far. At least
> until we have some other drivers that also need that combo. But can't hurt
> to at least think how it would ideally integrate from xe's pov.
> -Daniel

I spoke with dakr about this today on IRC; Nouveau is going to implement
userptr soon. I think the idea would be for Xe and Nouveau to collaborate
on what we stick in GPUVA for userptr and on whether we can have common
DRM helper functions. We may land on something really small (e.g. we
store the userptr address with a NULL gem in the gpuva structure) or we
might land on common locking, page population, and MMU notifiers.
Interested to see where we land.
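
The "really small" option could be as little as the following (purely
illustrative; it assumes the gpuva structure ends up with a GEM object
pointer / offset pair roughly like the in-flight GPUVA manager proposes,
so the field names are not final):

static bool vma_is_userptr(struct drm_gpuva *va)
{
        /* no backing GEM object means the range is backed by userptr */
        return !va->gem.obj;
}

static u64 vma_userptr_address(struct drm_gpuva *va)
{
        /* reuse the gem offset field to carry the CPU virtual address */
        return va->gem.offset;
}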

Matt

> 
> > 
> > Matt
> > 
> > > -Daniel
> > > >
> > > > Matt
> > > >
> > > > > None of this needs a dma_fence_is_lr anywhere at all.
> > > > >
> > > > > Of course there's the somewhat related issue of "how do we transport
> > > > > these userspace memory fences around from app to compositor", and
> > > > > that's a lot more gnarly. I still don't think dma_fence_is_lr is
> > > > > anywhere near what the solution should look like for that.
> > > > > -Daniel
> > > > >
> > > > >
> > > > > > Regards,
> > > > > > Christian.
> > > > > >
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > Thomas
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Daniel Vetter
> > > > > Software Engineer, Intel Corporation
> > > > > http://blog.ffwll.ch
> > > 
> > > 
> > > 
> > > -- 
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Thomas Hellstrom April 5, 2023, 12:35 p.m. UTC | #14
Hi,

On 4/4/23 21:25, Daniel Vetter wrote:
> On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
>> On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
>>> On 4/4/23 15:10, Christian König wrote:
>>>> Am 04.04.23 um 14:54 schrieb Thomas Hellström:
>>>>> Hi, Christian,
>>>>>
>>>>> On 4/4/23 11:09, Christian König wrote:
>>>>>> Am 04.04.23 um 02:22 schrieb Matthew Brost:
>>>>>>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>>>>>>
>>>>>>> For long-running workloads, drivers either need to open-code
>>>>>>> completion
>>>>>>> waits, invent their own synchronization primitives or internally use
>>>>>>> dma-fences that do not obey the cross-driver dma-fence protocol, but
>>>>>>> without any lockdep annotation all these approaches are error prone.
>>>>>>>
>>>>>>> So since for example the drm scheduler uses dma-fences it is
>>>>>>> desirable for
>>>>>>> a driver to be able to use it for throttling and error
>>>>>>> handling also with
>>>>>>> internal dma-fences tha do not obey the cros-driver
>>>>>>> dma-fence protocol.
>>>>>>>
>>>>>>> Introduce long-running completion fences in form of
>>>>>>> dma-fences, and add
>>>>>>> lockdep annotation for them. In particular:
>>>>>>>
>>>>>>> * Do not allow waiting under any memory management locks.
>>>>>>> * Do not allow to attach them to a dma-resv object.
>>>>>>> * Introduce a new interface for adding callbacks making the
>>>>>>> helper adding
>>>>>>>     a callback sign off on that it is aware that the dma-fence may not
>>>>>>>     complete anytime soon. Typically this will be the
>>>>>>> scheduler chaining
>>>>>>>     a new long-running fence on another one.
>>>>>> Well that's pretty much what I tried before:
>>>>>> https://lwn.net/Articles/893704/
>>>>>>
>> I don't think this quite the same, this explictly enforces that we don't
>> break the dma-fence rules (in path of memory allocations, exported in
>> any way), essentially this just SW sync point reusing dma-fence the
>> infrastructure for signaling / callbacks. I believe your series tried to
>> export these fences to user space (admittedly I haven't fully read your
>> series).
>>
>> In this use case we essentially just want to flow control the ring via
>> the dma-scheduler + maintain a list of pending jobs so the TDR can be
>> used for cleanup if LR entity encounters an error. To me this seems
>> perfectly reasonable but I know dma-femce rules are akin to a holy war.
>>
>> If we return NULL in run_job, now we have to be able to sink all jobs
>> in the backend regardless on ring space, maintain a list of jobs pending
>> for cleanup after errors, and write a different cleanup path as now the
>> TDR doesn't work. Seems very, very silly to duplicate all of this code
>> when the DRM scheduler provides all of this for us. Also if we go this
>> route, now all drivers are going to invent ways to handle LR jobs /w the
>> DRM scheduler.
>>
>> This solution is pretty clear, mark the scheduler as LR, and don't
>> export any fences from the scheduler. If you try to export these fences
>> a blow up happens.
> The problem is if you mix things up. Like for resets you need all the
> schedulers on an engine/set-of-engines to quiescent or things get
> potentially hilarious. If you now have a scheduler in forever limbo, the
> dma_fence guarantees are right out the window.
>
> But the issue you're having is fairly specific if it's just about
> ringspace. I think the dumbest fix is to just block in submit if you run
> out of per-ctx ringspace, and call it a day. This notion that somehow the
> kernel is supposed to provide a bottomless queue of anything userspace
> submits simply doesn't hold up in reality (as much as userspace standards
> committees would like it to), and as long as it doesn't have a real-world
> perf impact it doesn't really matter why we end up blocking in the submit
> ioctl. It might also be a simple memory allocation that hits a snag in
> page reclaim.

So it seems the discussion around long-running synchronization has
diverged a bit between threads, and this thread was hijacked for
preempt fences and userptr.

Do I understand it correctly that the recommendation from both Daniel 
and Christian is to *not* use the drm scheduler for long-running compute 
jobs, but track any internal dma-fence dependencies (pipelined clearing 
or whatever) in a separate mechanism and handle unresolved dependencies 
on other long-running jobs using -EWOULDBLOCK?

Thanks,
Thomas





>>>>>> And the reasons why it was rejected haven't changed.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>> Yes, TBH this was mostly to get discussion going how we'd best
>>>>> tackle this problem while being able to reuse the scheduler for
>>>>> long-running workloads.
>>>>>
>>>>> I couldn't see any clear decision on your series, though, but one
>>>>> main difference I see is that this is intended for driver-internal
>>>>> use only. (I'm counting using the drm_scheduler as a helper for
>>>>> driver-private use). This is by no means a way to try tackle the
>>>>> indefinite fence problem.
>>>> Well this was just my latest try to tackle this, but essentially the
>>>> problems are the same as with your approach: When we express such
>>>> operations as dma_fence there is always the change that we leak that
>>>> somewhere.
>>>>
>>>> My approach of adding a flag noting that this operation is dangerous and
>>>> can't be synced with something memory management depends on tried to
>>>> contain this as much as possible, but Daniel still pretty clearly
>>>> rejected it (for good reasons I think).
>>>>
>>>>> We could ofc invent a completely different data-type that abstracts
>>>>> the synchronization the scheduler needs in the long-running case, or
>>>>> each driver could hack something up, like sleeping in the
>>>>> prepare_job() or run_job() callback for throttling, but those waits
>>>>> should still be annotated in one way or annotated one way or another
>>>>> (and probably in a similar way across drivers) to make sure we don't
>>>>> do anything bad.
>>>>>
>>>>>   So any suggestions as to what would be the better solution here
>>>>> would be appreciated.
>>>> Mhm, do we really the the GPU scheduler for that?
>>>>
>> I think we need to solve this within the DRM scheduler one way or
>> another.
> Yeah so if we conclude that the queue really must be bottomless then I
> agree drm-sched should help out sort out the mess. Because I'm guessing
> that every driver will have this issue. But that's a big if.
>
> I guess if we teach the drm scheduler that some jobs are fairly endless
> then maybe it wouldn't be too far-fetched to also teach it to wait for a
> previous one to finish (but not with the dma_fence that preempts, which we
> put into the dma_resv for memory management, but some other struct
> completion). The scheduler already has a concept of not stuffing too much
> stuff into the same queue after all, so this should fit?
> -Daniel
>
>
>>>> I mean in the 1 to 1 case  you basically just need a component which
>>>> collects the dependencies as dma_fence and if all of them are fulfilled
>>>> schedules a work item.
>>>>
>>>> As long as the work item itself doesn't produce a dma_fence it can then
>>>> still just wait for other none dma_fence dependencies.
>>>>
>>>> Then the work function could submit the work and wait for the result.
>>>>
>>>> The work item would then pretty much represent what you want, you can
>>>> wait for it to finish and pass it along as long running dependency.
>>>>
>>>> Maybe give it a funky name and wrap it up in a structure, but that's
>>>> basically it.
>>>>
>>> This very much sounds like a i915_sw_fence for the dependency tracking and
>>> dma_fence_work for the actual work although it's completion fence is a
>>> dma_fence.
>>>
>> Agree this does sound to i915ish as stated below one of mandates in Xe
>> was to use the DRM scheduler. Beyond that as someone who a submission
>> backend in the i915 and Xe, I love how the DRM scheduler works (single
>> entry point), it makes everything so much easier.
>>
>> Matt
>>
>>> Although that goes against the whole idea of a condition for merging the xe
>>> driver would be that we implement some sort of minimal scaffolding for
>>> long-running workloads in the drm scheduler, and the thinking behind that is
>>> to avoid implementing intel-specific solutions like those...
>>>
>>> Thanks,
>>>
>>> Thomas
>>>
>>>
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Thanks,
>>>>>
>>>>> Thomas
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
Christian König April 5, 2023, 12:39 p.m. UTC | #15
Am 05.04.23 um 14:35 schrieb Thomas Hellström:
> Hi,
>
> On 4/4/23 21:25, Daniel Vetter wrote:
>> On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
>>> On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) 
>>> wrote:
>>>> On 4/4/23 15:10, Christian König wrote:
>>>>> Am 04.04.23 um 14:54 schrieb Thomas Hellström:
>>>>>> Hi, Christian,
>>>>>>
>>>>>> On 4/4/23 11:09, Christian König wrote:
>>>>>>> Am 04.04.23 um 02:22 schrieb Matthew Brost:
>>>>>>>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>>>>>>>
>>>>>>>> For long-running workloads, drivers either need to open-code
>>>>>>>> completion
>>>>>>>> waits, invent their own synchronization primitives or 
>>>>>>>> internally use
>>>>>>>> dma-fences that do not obey the cross-driver dma-fence 
>>>>>>>> protocol, but
>>>>>>>> without any lockdep annotation all these approaches are error 
>>>>>>>> prone.
>>>>>>>>
>>>>>>>> So since for example the drm scheduler uses dma-fences it is
>>>>>>>> desirable for
>>>>>>>> a driver to be able to use it for throttling and error
>>>>>>>> handling also with
>>>>>>>> internal dma-fences tha do not obey the cros-driver
>>>>>>>> dma-fence protocol.
>>>>>>>>
>>>>>>>> Introduce long-running completion fences in form of
>>>>>>>> dma-fences, and add
>>>>>>>> lockdep annotation for them. In particular:
>>>>>>>>
>>>>>>>> * Do not allow waiting under any memory management locks.
>>>>>>>> * Do not allow to attach them to a dma-resv object.
>>>>>>>> * Introduce a new interface for adding callbacks making the
>>>>>>>> helper adding
>>>>>>>>     a callback sign off on that it is aware that the dma-fence 
>>>>>>>> may not
>>>>>>>>     complete anytime soon. Typically this will be the
>>>>>>>> scheduler chaining
>>>>>>>>     a new long-running fence on another one.
>>>>>>> Well that's pretty much what I tried before:
>>>>>>> https://lwn.net/Articles/893704/
>>>>>>>
>>> I don't think this quite the same, this explictly enforces that we 
>>> don't
>>> break the dma-fence rules (in path of memory allocations, exported in
>>> any way), essentially this just SW sync point reusing dma-fence the
>>> infrastructure for signaling / callbacks. I believe your series 
>>> tried to
>>> export these fences to user space (admittedly I haven't fully read your
>>> series).
>>>
>>> In this use case we essentially just want to flow control the ring via
>>> the dma-scheduler + maintain a list of pending jobs so the TDR can be
>>> used for cleanup if LR entity encounters an error. To me this seems
>>> perfectly reasonable but I know dma-femce rules are akin to a holy war.
>>>
>>> If we return NULL in run_job, now we have to be able to sink all jobs
>>> in the backend regardless on ring space, maintain a list of jobs 
>>> pending
>>> for cleanup after errors, and write a different cleanup path as now the
>>> TDR doesn't work. Seems very, very silly to duplicate all of this code
>>> when the DRM scheduler provides all of this for us. Also if we go this
>>> route, now all drivers are going to invent ways to handle LR jobs /w 
>>> the
>>> DRM scheduler.
>>>
>>> This solution is pretty clear, mark the scheduler as LR, and don't
>>> export any fences from the scheduler. If you try to export these fences
>>> a blow up happens.
>> The problem is if you mix things up. Like for resets you need all the
>> schedulers on an engine/set-of-engines to quiescent or things get
>> potentially hilarious. If you now have a scheduler in forever limbo, the
>> dma_fence guarantees are right out the window.
>>
>> But the issue you're having is fairly specific if it's just about
>> ringspace. I think the dumbest fix is to just block in submit if you run
>> out of per-ctx ringspace, and call it a day. This notion that somehow 
>> the
>> kernel is supposed to provide a bottomless queue of anything userspace
>> submits simply doesn't hold up in reality (as much as userspace 
>> standards
>> committees would like it to), and as long as it doesn't have a 
>> real-world
>> perf impact it doesn't really matter why we end up blocking in the 
>> submit
>> ioctl. It might also be a simple memory allocation that hits a snag in
>> page reclaim.
>
> So it seems the discussion around the long-running synchronization 
> diverged a bit between threads and this thread was hijacked for 
> preempt-fences and userptr.
>
> Do I understand it correctly that the recommendation from both Daniel 
> and Christian is to *not* use the drm scheduler for long-running 
> compute jobs, but track any internal dma-fence dependencies (pipelined 
> clearing or whatever) in a separate mechanism and handle unresolved 
> dependencies on other long-running jobs using -EWOULDBLOCK?

Yeah, I think that's a good summary.

If needed we could extract some scheduler functionality into separate
components, but the fundamental problem is that the GPU scheduler
provides a dma_fence interface to the outside to signal job completion,
and Daniel and I seem to agree that you really don't want that.
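
For illustration, a very rough sketch of what such a separate component
(collect dma_fence dependencies, then run a work item which is allowed to
block on non-dma-fence things) could look like; everything below is made
up for the sake of discussion:

struct lr_job {
        struct work_struct      work;           /* submits and may block */
        atomic_t                pending;        /* unsignaled deps + 1 arm ref */
};

struct lr_job_cb {
        struct dma_fence_cb     base;
        struct lr_job           *job;
};

static void lr_job_dep_signaled(struct dma_fence *fence,
                                struct dma_fence_cb *cb)
{
        struct lr_job_cb *jcb = container_of(cb, struct lr_job_cb, base);

        if (atomic_dec_and_test(&jcb->job->pending))
                queue_work(system_unbound_wq, &jcb->job->work);
        kfree(jcb);
}

static int lr_job_add_dependency(struct lr_job *job, struct dma_fence *fence)
{
        struct lr_job_cb *jcb = kmalloc(sizeof(*jcb), GFP_KERNEL);
        int ret;

        if (!jcb)
                return -ENOMEM;

        jcb->job = job;
        atomic_inc(&job->pending);
        ret = dma_fence_add_callback(fence, &jcb->base, lr_job_dep_signaled);
        if (ret) {      /* -ENOENT: already signaled, otherwise an error */
                atomic_dec(&job->pending);
                kfree(jcb);
        }
        return ret == -ENOENT ? 0 : ret;
}

/* pending starts at 1 and the work is INIT_WORK()ed before adding deps;
 * call this once all dependencies have been added. */
static void lr_job_arm(struct lr_job *job)
{
        if (atomic_dec_and_test(&job->pending))
                queue_work(system_unbound_wq, &job->work);
}

Unresolved dependencies on other long-running jobs would then, as you
suggest, bounce back to userspace from the submit ioctl with -EWOULDBLOCK
instead of ever showing up here.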

Regards,
Christian.

>
> Thanks,
> Thomas
>
>
>
>
>
>>>>>>> And the reasons why it was rejected haven't changed.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>> Yes, TBH this was mostly to get discussion going how we'd best
>>>>>> tackle this problem while being able to reuse the scheduler for
>>>>>> long-running workloads.
>>>>>>
>>>>>> I couldn't see any clear decision on your series, though, but one
>>>>>> main difference I see is that this is intended for driver-internal
>>>>>> use only. (I'm counting using the drm_scheduler as a helper for
>>>>>> driver-private use). This is by no means a way to try tackle the
>>>>>> indefinite fence problem.
>>>>> Well this was just my latest try to tackle this, but essentially the
>>>>> problems are the same as with your approach: When we express such
>>>>> operations as dma_fence there is always the change that we leak that
>>>>> somewhere.
>>>>>
>>>>> My approach of adding a flag noting that this operation is 
>>>>> dangerous and
>>>>> can't be synced with something memory management depends on tried to
>>>>> contain this as much as possible, but Daniel still pretty clearly
>>>>> rejected it (for good reasons I think).
>>>>>
>>>>>> We could ofc invent a completely different data-type that abstracts
>>>>>> the synchronization the scheduler needs in the long-running case, or
>>>>>> each driver could hack something up, like sleeping in the
>>>>>> prepare_job() or run_job() callback for throttling, but those waits
>>>>>> should still be annotated in one way or annotated one way or another
>>>>>> (and probably in a similar way across drivers) to make sure we don't
>>>>>> do anything bad.
>>>>>>
>>>>>>   So any suggestions as to what would be the better solution here
>>>>>> would be appreciated.
>>>>> Mhm, do we really the the GPU scheduler for that?
>>>>>
>>> I think we need to solve this within the DRM scheduler one way or
>>> another.
>> Yeah so if we conclude that the queue really must be bottomless then I
>> agree drm-sched should help out sort out the mess. Because I'm guessing
>> that every driver will have this issue. But that's a big if.
>>
>> I guess if we teach the drm scheduler that some jobs are fairly endless
>> then maybe it wouldn't be too far-fetched to also teach it to wait for a
>> previous one to finish (but not with the dma_fence that preempts, 
>> which we
>> put into the dma_resv for memory management, but some other struct
>> completion). The scheduler already has a concept of not stuffing too 
>> much
>> stuff into the same queue after all, so this should fit?
>> -Daniel
>>
>>
>>>>> I mean in the 1 to 1 case  you basically just need a component which
>>>>> collects the dependencies as dma_fence and if all of them are 
>>>>> fulfilled
>>>>> schedules a work item.
>>>>>
>>>>> As long as the work item itself doesn't produce a dma_fence it can 
>>>>> then
>>>>> still just wait for other none dma_fence dependencies.
>>>>>
>>>>> Then the work function could submit the work and wait for the result.
>>>>>
>>>>> The work item would then pretty much represent what you want, you can
>>>>> wait for it to finish and pass it along as long running dependency.
>>>>>
>>>>> Maybe give it a funky name and wrap it up in a structure, but that's
>>>>> basically it.
>>>>>
>>>> This very much sounds like a i915_sw_fence for the dependency 
>>>> tracking and
>>>> dma_fence_work for the actual work although it's completion fence is a
>>>> dma_fence.
>>>>
>>> Agree this does sound to i915ish as stated below one of mandates in Xe
>>> was to use the DRM scheduler. Beyond that as someone who a submission
>>> backend in the i915 and Xe, I love how the DRM scheduler works (single
>>> entry point), it makes everything so much easier.
>>>
>>> Matt
>>>
>>>> Although that goes against the whole idea of a condition for 
>>>> merging the xe
>>>> driver would be that we implement some sort of minimal scaffolding for
>>>> long-running workloads in the drm scheduler, and the thinking 
>>>> behind that is
>>>> to avoid implementing intel-specific solutions like those...
>>>>
>>>> Thanks,
>>>>
>>>> Thomas
>>>>
>>>>
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Thomas
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
Daniel Vetter April 5, 2023, 12:45 p.m. UTC | #16
On Wed, Apr 05, 2023 at 02:39:35PM +0200, Christian König wrote:
> Am 05.04.23 um 14:35 schrieb Thomas Hellström:
> > Hi,
> > 
> > On 4/4/23 21:25, Daniel Vetter wrote:
> > > On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> > > > On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström
> > > > (Intel) wrote:
> > > > > On 4/4/23 15:10, Christian König wrote:
> > > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > > Hi, Christian,
> > > > > > > 
> > > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > > > > 
> > > > > > > > > For long-running workloads, drivers either need to open-code
> > > > > > > > > completion
> > > > > > > > > waits, invent their own synchronization
> > > > > > > > > primitives or internally use
> > > > > > > > > dma-fences that do not obey the cross-driver
> > > > > > > > > dma-fence protocol, but
> > > > > > > > > without any lockdep annotation all these
> > > > > > > > > approaches are error prone.
> > > > > > > > > 
> > > > > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > > > > desirable for
> > > > > > > > > a driver to be able to use it for throttling and error
> > > > > > > > > handling also with
> > > > > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > > > > dma-fence protocol.
> > > > > > > > > 
> > > > > > > > > Introduce long-running completion fences in form of
> > > > > > > > > dma-fences, and add
> > > > > > > > > lockdep annotation for them. In particular:
> > > > > > > > > 
> > > > > > > > > * Do not allow waiting under any memory management locks.
> > > > > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > > > > helper adding
> > > > > > > > >     a callback sign off on that it is aware
> > > > > > > > > that the dma-fence may not
> > > > > > > > >     complete anytime soon. Typically this will be the
> > > > > > > > > scheduler chaining
> > > > > > > > >     a new long-running fence on another one.
> > > > > > > > Well that's pretty much what I tried before:
> > > > > > > > https://lwn.net/Articles/893704/
> > > > > > > > 
> > > > I don't think this quite the same, this explictly enforces that
> > > > we don't
> > > > break the dma-fence rules (in path of memory allocations, exported in
> > > > any way), essentially this just SW sync point reusing dma-fence the
> > > > infrastructure for signaling / callbacks. I believe your series
> > > > tried to
> > > > export these fences to user space (admittedly I haven't fully read your
> > > > series).
> > > > 
> > > > In this use case we essentially just want to flow control the ring via
> > > > the dma-scheduler + maintain a list of pending jobs so the TDR can be
> > > > used for cleanup if LR entity encounters an error. To me this seems
> > > > perfectly reasonable but I know dma-femce rules are akin to a holy war.
> > > > 
> > > > If we return NULL in run_job, now we have to be able to sink all jobs
> > > > in the backend regardless on ring space, maintain a list of jobs
> > > > pending
> > > > for cleanup after errors, and write a different cleanup path as now the
> > > > TDR doesn't work. Seems very, very silly to duplicate all of this code
> > > > when the DRM scheduler provides all of this for us. Also if we go this
> > > > route, now all drivers are going to invent ways to handle LR
> > > > jobs /w the
> > > > DRM scheduler.
> > > > 
> > > > This solution is pretty clear, mark the scheduler as LR, and don't
> > > > export any fences from the scheduler. If you try to export these fences
> > > > a blow up happens.
> > > The problem is if you mix things up. Like for resets you need all the
> > > schedulers on an engine/set-of-engines to quiescent or things get
> > > potentially hilarious. If you now have a scheduler in forever limbo, the
> > > dma_fence guarantees are right out the window.
> > > 
> > > But the issue you're having is fairly specific if it's just about
> > > ringspace. I think the dumbest fix is to just block in submit if you run
> > > out of per-ctx ringspace, and call it a day. This notion that
> > > somehow the
> > > kernel is supposed to provide a bottomless queue of anything userspace
> > > submits simply doesn't hold up in reality (as much as userspace
> > > standards
> > > committees would like it to), and as long as it doesn't have a
> > > real-world
> > > perf impact it doesn't really matter why we end up blocking in the
> > > submit
> > > ioctl. It might also be a simple memory allocation that hits a snag in
> > > page reclaim.
> > 
> > So it seems the discussion around the long-running synchronization
> > diverged a bit between threads and this thread was hijacked for
> > preempt-fences and userptr.
> > 
> > Do I understand it correctly that the recommendation from both Daniel
> > and Christian is to *not* use the drm scheduler for long-running compute
> > jobs, but track any internal dma-fence dependencies (pipelined clearing
> > or whatever) in a separate mechanism and handle unresolved dependencies
> > on other long-running jobs using -EWOULDBLOCK?
> 
> Yeah, I think that's a good summary.
> 
> If needed we could extract some scheduler functionality into separate
> components, but the fundamental problem is that to the GPU scheduler
> provides a dma_fence interface to the outside to signal job completion and
> Daniel and I seem to agree that you really don't want that.

I think I'm on something slightly different:

- For anything which semantically is not a dma_fence I agree it probably
  should be handled with EWOULDBLOCK and passed to userspace. Either with
  a submit thread or userspace memory fences. Note that in practice you
  will have a bunch of blocking left in the ioctl, stuff like mutexes or
  memory allocations when things get really tight and you end up in
  synchronous reclaim. Not any different from userspace ending up in
  synchronous reclaim due to a page fault really. Trying to shoehorn
  userspace memory fences or anything else long-running into drm/sched
  dependency handling is just way too much of a can of worms.

- For the memory management dependencies, which are all dma_fence when
  pipelined, I do think pushing them through the drm/sched makes sense. It
  has all the stuff to handle that already, plus it's imo also the ideal
  place to handle the preempt-ctx dma_fence scaffolding/semantics. Which
  would give you a really neatly unified command submission interface
  since in both cases (end-of-batch and long-running) you fish the
  dma_fence you need to stuff into all the right dma_resv objects (for
  memory management purposes) out of the same place: the drm_sched_job
  struct.

So I'm _not_ on the "do not use drm/sched for long-running jobs at all"
side. That doesn't make much sense to me because you'd just be reinventing
the exact same dma_fence dependency handling and memory management
shuffling we already have. That seems silly.
-Daniel
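
To make the second point concrete, a minimal sketch of that unified path
could look roughly like the following. struct my_job, the preempt_fence
member and the locking assumptions are made up for illustration;
drm_sched_job_init()/drm_sched_job_arm()/drm_sched_entity_push_job(), the
scheduler's finished fence and dma_resv_add_fence() are the existing
interfaces being referred to above.

#include <drm/gpu_scheduler.h>
#include <drm/drm_gem.h>
#include <linux/dma-resv.h>

struct my_job {
        struct drm_sched_job base;
        struct dma_fence *preempt_fence;        /* long-running jobs only */
};

/*
 * Caller is assumed to hold the dma_resv locks of all BOs and to have
 * called dma_resv_reserve_fences() on each of them.
 */
static int my_submit(struct my_job *job, struct drm_sched_entity *entity,
                     struct drm_gem_object **bos, unsigned int num_bos,
                     bool long_running)
{
        struct dma_fence *fence;
        enum dma_resv_usage usage;
        unsigned int i;
        int ret;

        ret = drm_sched_job_init(&job->base, entity, job);
        if (ret)
                return ret;

        drm_sched_job_arm(&job->base);

        /*
         * Memory management always sees a well-behaved dma_fence: the
         * scheduler's finished fence for end-of-batch jobs, the driver's
         * preempt-ctx fence for long-running jobs.
         */
        if (long_running) {
                fence = job->preempt_fence;
                usage = DMA_RESV_USAGE_BOOKKEEP;
        } else {
                fence = &job->base.s_fence->finished;
                usage = DMA_RESV_USAGE_WRITE;   /* simplified; per-BO READ/WRITE in reality */
        }

        for (i = 0; i < num_bos; i++)
                dma_resv_add_fence(bos[i]->resv, fence, usage);

        drm_sched_entity_push_job(&job->base);
        return 0;
}

The only difference between the two cases is which fence gets installed;
the submission flow itself is identical.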

> 
> Regards,
> Christian.
> 
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > 
> > 
> > 
> > > > > > > > And the reasons why it was rejected haven't changed.
> > > > > > > > 
> > > > > > > > Regards,
> > > > > > > > Christian.
> > > > > > > > 
> > > > > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > > > > tackle this problem while being able to reuse the scheduler for
> > > > > > > long-running workloads.
> > > > > > > 
> > > > > > > I couldn't see any clear decision on your series, though, but one
> > > > > > > main difference I see is that this is intended for driver-internal
> > > > > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > > indefinite fence problem.
> > > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > > problems are the same as with your approach: When we express such
> > > > > > operations as dma_fence there is always the change that we leak that
> > > > > > somewhere.
> > > > > > 
> > > > > > My approach of adding a flag noting that this operation
> > > > > > is dangerous and
> > > > > > can't be synced with something memory management depends on tried to
> > > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > > rejected it (for good reasons I think).
> > > > > > 
> > > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > > each driver could hack something up, like sleeping in the
> > > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > > should still be annotated in one way or annotated one way or another
> > > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > > do anything bad.
> > > > > > > 
> > > > > > >   So any suggestions as to what would be the better solution here
> > > > > > > would be appreciated.
> > > > > > Mhm, do we really the the GPU scheduler for that?
> > > > > > 
> > > > I think we need to solve this within the DRM scheduler one way or
> > > > another.
> > > Yeah so if we conclude that the queue really must be bottomless then I
> > > agree drm-sched should help out sort out the mess. Because I'm guessing
> > > that every driver will have this issue. But that's a big if.
> > > 
> > > I guess if we teach the drm scheduler that some jobs are fairly endless
> > > then maybe it wouldn't be too far-fetched to also teach it to wait for a
> > > previous one to finish (but not with the dma_fence that preempts,
> > > which we
> > > put into the dma_resv for memory management, but some other struct
> > > completion). The scheduler already has a concept of not stuffing too
> > > much
> > > stuff into the same queue after all, so this should fit?
> > > -Daniel
> > > 
> > > 
> > > > > > I mean in the 1 to 1 case  you basically just need a component which
> > > > > > collects the dependencies as dma_fence and if all of
> > > > > > them are fulfilled
> > > > > > schedules a work item.
> > > > > > 
> > > > > > As long as the work item itself doesn't produce a
> > > > > > dma_fence it can then
> > > > > > still just wait for other none dma_fence dependencies.
> > > > > > 
> > > > > > Then the work function could submit the work and wait for the result.
> > > > > > 
> > > > > > The work item would then pretty much represent what you want, you can
> > > > > > wait for it to finish and pass it along as long running dependency.
> > > > > > 
> > > > > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > > > > basically it.
> > > > > > 
> > > > > This very much sounds like a i915_sw_fence for the
> > > > > dependency tracking and
> > > > > dma_fence_work for the actual work although it's completion fence is a
> > > > > dma_fence.
> > > > > 
> > > > Agree this does sound to i915ish as stated below one of mandates in Xe
> > > > was to use the DRM scheduler. Beyond that as someone who a submission
> > > > backend in the i915 and Xe, I love how the DRM scheduler works (single
> > > > entry point), it makes everything so much easier.
> > > > 
> > > > Matt
> > > > 
> > > > > Although that goes against the whole idea of a condition for
> > > > > merging the xe
> > > > > driver would be that we implement some sort of minimal scaffolding for
> > > > > long-running workloads in the drm scheduler, and the
> > > > > thinking behind that is
> > > > > to avoid implementing intel-specific solutions like those...
> > > > > 
> > > > > Thanks,
> > > > > 
> > > > > Thomas
> > > > > 
> > > > > 
> > > > > 
> > > > > > Regards,
> > > > > > Christian.
> > > > > > 
> > > > > > > Thanks,
> > > > > > > 
> > > > > > > Thomas
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
>
Daniel Vetter April 5, 2023, 1:09 p.m. UTC | #17
On Tue, Apr 04, 2023 at 07:48:27PM +0000, Matthew Brost wrote:
> On Tue, Apr 04, 2023 at 09:25:52PM +0200, Daniel Vetter wrote:
> > On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> > > On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> > > > 
> > > > On 4/4/23 15:10, Christian König wrote:
> > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > Hi, Christian,
> > > > > > 
> > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > > > 
> > > > > > > > For long-running workloads, drivers either need to open-code
> > > > > > > > completion
> > > > > > > > waits, invent their own synchronization primitives or internally use
> > > > > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > > > without any lockdep annotation all these approaches are error prone.
> > > > > > > > 
> > > > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > > > desirable for
> > > > > > > > a driver to be able to use it for throttling and error
> > > > > > > > handling also with
> > > > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > > > dma-fence protocol.
> > > > > > > > 
> > > > > > > > Introduce long-running completion fences in form of
> > > > > > > > dma-fences, and add
> > > > > > > > lockdep annotation for them. In particular:
> > > > > > > > 
> > > > > > > > * Do not allow waiting under any memory management locks.
> > > > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > > > helper adding
> > > > > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > > > > >    complete anytime soon. Typically this will be the
> > > > > > > > scheduler chaining
> > > > > > > >    a new long-running fence on another one.
> > > > > > > 
> > > > > > > Well that's pretty much what I tried before:
> > > > > > > https://lwn.net/Articles/893704/
> > > > > > > 
> > > 
> > > I don't think this quite the same, this explictly enforces that we don't
> > > break the dma-fence rules (in path of memory allocations, exported in
> > > any way), essentially this just SW sync point reusing dma-fence the
> > > infrastructure for signaling / callbacks. I believe your series tried to
> > > export these fences to user space (admittedly I haven't fully read your
> > > series).
> > > 
> > > In this use case we essentially just want to flow control the ring via
> > > the dma-scheduler + maintain a list of pending jobs so the TDR can be
> > > used for cleanup if LR entity encounters an error. To me this seems
> > > perfectly reasonable but I know dma-femce rules are akin to a holy war.
> > > 
> > > If we return NULL in run_job, now we have to be able to sink all jobs
> > > in the backend regardless on ring space, maintain a list of jobs pending
> > > for cleanup after errors, and write a different cleanup path as now the
> > > TDR doesn't work. Seems very, very silly to duplicate all of this code
> > > when the DRM scheduler provides all of this for us. Also if we go this
> > > route, now all drivers are going to invent ways to handle LR jobs /w the
> > > DRM scheduler.
> > > 
> > > This solution is pretty clear, mark the scheduler as LR, and don't
> > > export any fences from the scheduler. If you try to export these fences
> > > a blow up happens.
> > 
> > The problem is if you mix things up. Like for resets you need all the
> > schedulers on an engine/set-of-engines to quiescent or things get
> > potentially hilarious. If you now have a scheduler in forever limbo, the
> > dma_fence guarantees are right out the window.
> > 
> 
> Right, a GT reset on Xe is:
> 
> Stop all schedulers
> Do a reset
> Ban any schedulers which we think caused the GT reset
> Resubmit all schedulers which we think were good
> Restart all schedulers
> 
> None of this flow depends on LR dma-fences, all of this uses the DRM
> sched infrastructure and work very well compared to the i915. Rewriting
> all this with a driver specific implementation is what we are trying to
> avoid.
> 
> Similarly if LR entity hangs on its own (not a GT reset, rather the
> firmware does the reset for us) we use all the DRM scheduler
> infrastructure to handle this. Again this works rather well...

Yeah this is why I don't think duplicating everything that long-running
jobs need makes any sense. iow I agree with you.

> > But the issue you're having is fairly specific if it's just about
> > ringspace. I think the dumbest fix is to just block in submit if you run
> > out of per-ctx ringspace, and call it a day. This notion that somehow the
> 
> How does that not break the dma-fence rules? A job can publish its
> finished fence after ARM, if the finished fence fence waits on ring
> space that may not free up in a reasonable amount of time we now have
> broken the dma-dence rules. My understanding is any dma-fence must only
> on other dma-fence, Christian seems to agree and NAK'd just blocking if
> no space available [1]. IMO this series ensures we don't break dma-fence
> rules by restricting how the finished fence can be used.

Oh I meant in the submit ioctl, _before_ you even call
drm_sched_job_arm(). It's ok to block in there indefinitely.

> > kernel is supposed to provide a bottomless queue of anything userspace
> > submits simply doesn't hold up in reality (as much as userspace standards
> > committees would like it to), and as long as it doesn't have a real-world
> > perf impact it doesn't really matter why we end up blocking in the submit
> > ioctl. It might also be a simple memory allocation that hits a snag in
> > page reclaim.
> > 
> > > > > > > And the reasons why it was rejected haven't changed.
> > > > > > > 
> > > > > > > Regards,
> > > > > > > Christian.
> > > > > > > 
> > > > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > > > tackle this problem while being able to reuse the scheduler for
> > > > > > long-running workloads.
> > > > > > 
> > > > > > I couldn't see any clear decision on your series, though, but one
> > > > > > main difference I see is that this is intended for driver-internal
> > > > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > indefinite fence problem.
> > > > > 
> > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > problems are the same as with your approach: When we express such
> > > > > operations as dma_fence there is always the change that we leak that
> > > > > somewhere.
> > > > > 
> > > > > My approach of adding a flag noting that this operation is dangerous and
> > > > > can't be synced with something memory management depends on tried to
> > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > rejected it (for good reasons I think).
> > > > > 
> > > > > > 
> > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > each driver could hack something up, like sleeping in the
> > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > should still be annotated in one way or annotated one way or another
> > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > do anything bad.
> > > > > > 
> > > > > >  So any suggestions as to what would be the better solution here
> > > > > > would be appreciated.
> > > > > 
> > > > > Mhm, do we really the the GPU scheduler for that?
> > > > > 
> > > 
> > > I think we need to solve this within the DRM scheduler one way or
> > > another.
> > 
> > Yeah so if we conclude that the queue really must be bottomless then I
> > agree drm-sched should help out sort out the mess. Because I'm guessing
> > that every driver will have this issue. But that's a big if.
> > 
> > I guess if we teach the drm scheduler that some jobs are fairly endless
> > then maybe it wouldn't be too far-fetched to also teach it to wait for a
> > previous one to finish (but not with the dma_fence that preempts, which we
> > put into the dma_resv for memory management, but some other struct
> > completion). The scheduler already has a concept of not stuffing too much
> > stuff into the same queue after all, so this should fit?
> 
> See above, exact same situation as spinning on flow controling the ring,
> this IMO absolutely breaks the dma-fence rules. IMO the correct solution
> is to have a DRM that doesn't export dma-fences, this is exactly what
> this series does as if we try to, boom lockdep / warn on blow up.

I don't think it's impossible to do this correctly, but definitely very,
very hard. Which is why neither Christian nor I like the idea :-)

Essentially you'd have to make sure that any indefinite wait will still
react to drm_sched_job, so that you're not holding up a gt reset or
anything like that, but only ever hold up forward progress for this
specific scheduler/drm_sched_entity. Which you can do as long as (and
again, another hugely tricky detail) you still obey the preempt-ctx
dma_fence and manage to preempt the underlying long-running ctx even when
the drm/sched is stuck waiting for an indefinite fence (like waiting for
ringspace or something like that).

So I don't think it's impossible, but very far away from "a good idea" :-)

Hence the proposal to bail out of this entire mess by throwing EWOULDBLOCK
back to userspace directly from the ioctl function, where you can still do
that without breaking any dma_fence rules. Or if it's not a case that
matters in practice, simply block in the ioctl handler instead of
returning EWOULDBLOCK.
-Daniel
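
A sketch of where that throttling would sit in the ioctl, assuming
made-up ring-space helpers; only the drm_sched_job_*() calls are existing
API, and whether to block or return -EWOULDBLOCK is the policy question
discussed in this thread.

#include <drm/gpu_scheduler.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Assumed driver helpers, declared only so the sketch is self-contained. */
bool my_ring_space_available(struct drm_sched_job *job);
void my_ring_wait_space(struct drm_sched_job *job);

static int my_exec_submit(struct drm_sched_job *job,
                          struct drm_sched_entity *entity, bool nonblock)
{
        int ret;

        ret = drm_sched_job_init(job, entity, NULL);
        if (ret)
                return ret;

        /*
         * Throttle here, before drm_sched_job_arm(): the finished fence
         * has not been published yet, so blocking indefinitely (or
         * bailing out with -EWOULDBLOCK) cannot break dma_fence rules.
         */
        if (!my_ring_space_available(job)) {
                if (nonblock) {
                        drm_sched_job_cleanup(job);
                        return -EWOULDBLOCK;
                }
                my_ring_wait_space(job);        /* may block for a long time */
        }

        drm_sched_job_arm(job);
        drm_sched_entity_push_job(job);
        return 0;
}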

> 
> Matt
> 
> [1] https://patchwork.freedesktop.org/patch/525461/?series=114772&rev=2
> 
> > -Daniel
> > 
> > 
> > > > > I mean in the 1 to 1 case  you basically just need a component which
> > > > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > > > schedules a work item.
> > > > > 
> > > > > As long as the work item itself doesn't produce a dma_fence it can then
> > > > > still just wait for other none dma_fence dependencies.
> > > > > 
> > > > > Then the work function could submit the work and wait for the result.
> > > > > 
> > > > > The work item would then pretty much represent what you want, you can
> > > > > wait for it to finish and pass it along as long running dependency.
> > > > > 
> > > > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > > > basically it.
> > > > > 
> > > > This very much sounds like a i915_sw_fence for the dependency tracking and
> > > > dma_fence_work for the actual work although it's completion fence is a
> > > > dma_fence.
> > > >
> > > 
> > > Agree this does sound to i915ish as stated below one of mandates in Xe
> > > was to use the DRM scheduler. Beyond that as someone who a submission
> > > backend in the i915 and Xe, I love how the DRM scheduler works (single
> > > entry point), it makes everything so much easier.
> > > 
> > > Matt
> > > 
> > > > Although that goes against the whole idea of a condition for merging the xe
> > > > driver would be that we implement some sort of minimal scaffolding for
> > > > long-running workloads in the drm scheduler, and the thinking behind that is
> > > > to avoid implementing intel-specific solutions like those...
> > > > 
> > > > Thanks,
> > > > 
> > > > Thomas
> > > > 
> > > > 
> > > > 
> > > > > Regards,
> > > > > Christian.
> > > > > 
> > > > > > 
> > > > > > Thanks,
> > > > > > 
> > > > > > Thomas
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > 
> > -- 
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Christian König April 5, 2023, 2:08 p.m. UTC | #18
Am 05.04.23 um 14:45 schrieb Daniel Vetter:
> On Wed, Apr 05, 2023 at 02:39:35PM +0200, Christian König wrote:
>> Am 05.04.23 um 14:35 schrieb Thomas Hellström:
>>> Hi,
>>>
>>> On 4/4/23 21:25, Daniel Vetter wrote:
>>>> On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
>>>>> On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström
>>>>> (Intel) wrote:
>>>>>> On 4/4/23 15:10, Christian König wrote:
>>>>>>> Am 04.04.23 um 14:54 schrieb Thomas Hellström:
>>>>>>>> Hi, Christian,
>>>>>>>>
>>>>>>>> On 4/4/23 11:09, Christian König wrote:
>>>>>>>>> Am 04.04.23 um 02:22 schrieb Matthew Brost:
>>>>>>>>>> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>>>>>>>>>
>>>>>>>>>> For long-running workloads, drivers either need to open-code
>>>>>>>>>> completion
>>>>>>>>>> waits, invent their own synchronization
>>>>>>>>>> primitives or internally use
>>>>>>>>>> dma-fences that do not obey the cross-driver
>>>>>>>>>> dma-fence protocol, but
>>>>>>>>>> without any lockdep annotation all these
>>>>>>>>>> approaches are error prone.
>>>>>>>>>>
>>>>>>>>>> So since for example the drm scheduler uses dma-fences it is
>>>>>>>>>> desirable for
>>>>>>>>>> a driver to be able to use it for throttling and error
>>>>>>>>>> handling also with
>>>>>>>>>> internal dma-fences tha do not obey the cros-driver
>>>>>>>>>> dma-fence protocol.
>>>>>>>>>>
>>>>>>>>>> Introduce long-running completion fences in form of
>>>>>>>>>> dma-fences, and add
>>>>>>>>>> lockdep annotation for them. In particular:
>>>>>>>>>>
>>>>>>>>>> * Do not allow waiting under any memory management locks.
>>>>>>>>>> * Do not allow to attach them to a dma-resv object.
>>>>>>>>>> * Introduce a new interface for adding callbacks making the
>>>>>>>>>> helper adding
>>>>>>>>>>      a callback sign off on that it is aware
>>>>>>>>>> that the dma-fence may not
>>>>>>>>>>      complete anytime soon. Typically this will be the
>>>>>>>>>> scheduler chaining
>>>>>>>>>>      a new long-running fence on another one.
>>>>>>>>> Well that's pretty much what I tried before:
>>>>>>>>> https://lwn.net/Articles/893704/
>>>>>>>>>
>>>>> I don't think this quite the same, this explictly enforces that
>>>>> we don't
>>>>> break the dma-fence rules (in path of memory allocations, exported in
>>>>> any way), essentially this just SW sync point reusing dma-fence the
>>>>> infrastructure for signaling / callbacks. I believe your series
>>>>> tried to
>>>>> export these fences to user space (admittedly I haven't fully read your
>>>>> series).
>>>>>
>>>>> In this use case we essentially just want to flow control the ring via
>>>>> the dma-scheduler + maintain a list of pending jobs so the TDR can be
>>>>> used for cleanup if LR entity encounters an error. To me this seems
>>>>> perfectly reasonable but I know dma-femce rules are akin to a holy war.
>>>>>
>>>>> If we return NULL in run_job, now we have to be able to sink all jobs
>>>>> in the backend regardless on ring space, maintain a list of jobs
>>>>> pending
>>>>> for cleanup after errors, and write a different cleanup path as now the
>>>>> TDR doesn't work. Seems very, very silly to duplicate all of this code
>>>>> when the DRM scheduler provides all of this for us. Also if we go this
>>>>> route, now all drivers are going to invent ways to handle LR
>>>>> jobs /w the
>>>>> DRM scheduler.
>>>>>
>>>>> This solution is pretty clear, mark the scheduler as LR, and don't
>>>>> export any fences from the scheduler. If you try to export these fences
>>>>> a blow up happens.
>>>> The problem is if you mix things up. Like for resets you need all the
>>>> schedulers on an engine/set-of-engines to quiescent or things get
>>>> potentially hilarious. If you now have a scheduler in forever limbo, the
>>>> dma_fence guarantees are right out the window.
>>>>
>>>> But the issue you're having is fairly specific if it's just about
>>>> ringspace. I think the dumbest fix is to just block in submit if you run
>>>> out of per-ctx ringspace, and call it a day. This notion that
>>>> somehow the
>>>> kernel is supposed to provide a bottomless queue of anything userspace
>>>> submits simply doesn't hold up in reality (as much as userspace
>>>> standards
>>>> committees would like it to), and as long as it doesn't have a
>>>> real-world
>>>> perf impact it doesn't really matter why we end up blocking in the
>>>> submit
>>>> ioctl. It might also be a simple memory allocation that hits a snag in
>>>> page reclaim.
>>> So it seems the discussion around the long-running synchronization
>>> diverged a bit between threads and this thread was hijacked for
>>> preempt-fences and userptr.
>>>
>>> Do I understand it correctly that the recommendation from both Daniel
>>> and Christian is to *not* use the drm scheduler for long-running compute
>>> jobs, but track any internal dma-fence dependencies (pipelined clearing
>>> or whatever) in a separate mechanism and handle unresolved dependencies
>>> on other long-running jobs using -EWOULDBLOCK?
>> Yeah, I think that's a good summary.
>>
>> If needed we could extract some scheduler functionality into separate
>> components, but the fundamental problem is that to the GPU scheduler
>> provides a dma_fence interface to the outside to signal job completion and
>> Daniel and I seem to agree that you really don't want that.
> I think I'm on something slightly different:
>
> - For anything which semantically is not a dma_fence I agree it probably
>    should be handled with EWOULDBLOCK and passed to userspace. Either with
>    a submit thread or userspace memory fences. Note that in practice you
>    will have a bunch of blocking left in the ioctl, stuff like mutexes or
>    memory allocations when things get really tight and you end up in
>    synchronous reclaim. Not any different from userspace ending up in
>    synchronous reclaim due to a page fault really. Trying to shoehorn
>    userspace memory fences or anything else long-running into drm/sched
>    dependency handling is just way too much a can of worms.
>
> - For the memory management dependencies, which are all dma_fence when
>    pipeline, I do think pushing them through the drm/sched makes sense. It
>    has all the stuff to handle that already, plus it's imo also the ideal
>    place to handle the preempt-ctx dma_fence scaffolding/semantics. Which
>    would give you a really neatly unified command submission interface
>    since in both cases (end-of-batch and long-running) you fish the
>    dma_fence you need to stuff in all the right dma_resv object (for memory
>    management purpose) out of the same place: The drm_sched_job struct.
>
> So I'm _not_ on the "do not use drm/sched for long-running jobs at all".
> That doesn't make much sense to me because you'll just reinventing the
> exact same dma_fence dependency handling and memory management shuffling
> we already have. That seems silly.

How about we stuff the functionality we still want to have into a 
drm_job object?

I mean that really isn't that much, basically just looking at
drm_syncobj, dma_resv etc. and extracting all the dependencies.

Christian.
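
A rough sketch of what such a drm_job object could look like; the struct
and helper names here are made up, while dma_resv_for_each_fence(),
drm_syncobj_find_fence() and the xarray API are the existing pieces, and
the resv walk mirrors what the drm_sched dependency helpers already do.

#include <drm/drm_file.h>
#include <drm/drm_syncobj.h>
#include <linux/dma-resv.h>
#include <linux/xarray.h>

struct drm_job {
        struct xarray deps;     /* collected dma_fence dependencies */
};

static void drm_job_init(struct drm_job *job)
{
        xa_init_flags(&job->deps, XA_FLAGS_ALLOC);
}

/* Takes ownership of the fence reference on success. */
static int drm_job_add_dependency(struct drm_job *job, struct dma_fence *fence)
{
        u32 id;

        return xa_alloc(&job->deps, &id, fence, xa_limit_32b, GFP_KERNEL);
}

/* Caller must hold the dma_resv lock of @resv. */
static int drm_job_add_resv_dependencies(struct drm_job *job,
                                         struct dma_resv *resv,
                                         enum dma_resv_usage usage)
{
        struct dma_resv_iter cursor;
        struct dma_fence *fence;
        int ret;

        dma_resv_for_each_fence(&cursor, resv, usage, fence) {
                ret = drm_job_add_dependency(job, dma_fence_get(fence));
                if (ret) {
                        dma_fence_put(fence);
                        return ret;
                }
        }
        return 0;
}

static int drm_job_add_syncobj_dependency(struct drm_job *job,
                                          struct drm_file *file,
                                          u32 handle, u64 point)
{
        struct dma_fence *fence;
        int ret;

        ret = drm_syncobj_find_fence(file, handle, point, 0, &fence);
        if (ret)
                return ret;

        ret = drm_job_add_dependency(job, fence);
        if (ret)
                dma_fence_put(fence);
        return ret;
}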

> -Daniel
>
>> Regards,
>> Christian.
>>
>>> Thanks,
>>> Thomas
>>>
>>>
>>>
>>>
>>>
>>>>>>>>> And the reasons why it was rejected haven't changed.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>> Yes, TBH this was mostly to get discussion going how we'd best
>>>>>>>> tackle this problem while being able to reuse the scheduler for
>>>>>>>> long-running workloads.
>>>>>>>>
>>>>>>>> I couldn't see any clear decision on your series, though, but one
>>>>>>>> main difference I see is that this is intended for driver-internal
>>>>>>>> use only. (I'm counting using the drm_scheduler as a helper for
>>>>>>>> driver-private use). This is by no means a way to try tackle the
>>>>>>>> indefinite fence problem.
>>>>>>> Well this was just my latest try to tackle this, but essentially the
>>>>>>> problems are the same as with your approach: When we express such
>>>>>>> operations as dma_fence there is always the change that we leak that
>>>>>>> somewhere.
>>>>>>>
>>>>>>> My approach of adding a flag noting that this operation
>>>>>>> is dangerous and
>>>>>>> can't be synced with something memory management depends on tried to
>>>>>>> contain this as much as possible, but Daniel still pretty clearly
>>>>>>> rejected it (for good reasons I think).
>>>>>>>
>>>>>>>> We could ofc invent a completely different data-type that abstracts
>>>>>>>> the synchronization the scheduler needs in the long-running case, or
>>>>>>>> each driver could hack something up, like sleeping in the
>>>>>>>> prepare_job() or run_job() callback for throttling, but those waits
>>>>>>>> should still be annotated in one way or annotated one way or another
>>>>>>>> (and probably in a similar way across drivers) to make sure we don't
>>>>>>>> do anything bad.
>>>>>>>>
>>>>>>>>    So any suggestions as to what would be the better solution here
>>>>>>>> would be appreciated.
>>>>>>> Mhm, do we really the the GPU scheduler for that?
>>>>>>>
>>>>> I think we need to solve this within the DRM scheduler one way or
>>>>> another.
>>>> Yeah so if we conclude that the queue really must be bottomless then I
>>>> agree drm-sched should help out sort out the mess. Because I'm guessing
>>>> that every driver will have this issue. But that's a big if.
>>>>
>>>> I guess if we teach the drm scheduler that some jobs are fairly endless
>>>> then maybe it wouldn't be too far-fetched to also teach it to wait for a
>>>> previous one to finish (but not with the dma_fence that preempts,
>>>> which we
>>>> put into the dma_resv for memory management, but some other struct
>>>> completion). The scheduler already has a concept of not stuffing too
>>>> much
>>>> stuff into the same queue after all, so this should fit?
>>>> -Daniel
>>>>
>>>>
>>>>>>> I mean in the 1 to 1 case  you basically just need a component which
>>>>>>> collects the dependencies as dma_fence and if all of
>>>>>>> them are fulfilled
>>>>>>> schedules a work item.
>>>>>>>
>>>>>>> As long as the work item itself doesn't produce a
>>>>>>> dma_fence it can then
>>>>>>> still just wait for other none dma_fence dependencies.
>>>>>>>
>>>>>>> Then the work function could submit the work and wait for the result.
>>>>>>>
>>>>>>> The work item would then pretty much represent what you want, you can
>>>>>>> wait for it to finish and pass it along as long running dependency.
>>>>>>>
>>>>>>> Maybe give it a funky name and wrap it up in a structure, but that's
>>>>>>> basically it.
>>>>>>>
>>>>>> This very much sounds like a i915_sw_fence for the
>>>>>> dependency tracking and
>>>>>> dma_fence_work for the actual work although it's completion fence is a
>>>>>> dma_fence.
>>>>>>
>>>>> Agree this does sound to i915ish as stated below one of mandates in Xe
>>>>> was to use the DRM scheduler. Beyond that as someone who a submission
>>>>> backend in the i915 and Xe, I love how the DRM scheduler works (single
>>>>> entry point), it makes everything so much easier.
>>>>>
>>>>> Matt
>>>>>
>>>>>> Although that goes against the whole idea of a condition for
>>>>>> merging the xe
>>>>>> driver would be that we implement some sort of minimal scaffolding for
>>>>>> long-running workloads in the drm scheduler, and the
>>>>>> thinking behind that is
>>>>>> to avoid implementing intel-specific solutions like those...
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Thomas
>>>>>>
>>>>>>
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Thanks,
>>>>>>>>
>>>>>>>> Thomas
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
Matthew Brost April 5, 2023, 11:58 p.m. UTC | #19
On Wed, Apr 05, 2023 at 03:09:08PM +0200, Daniel Vetter wrote:
> On Tue, Apr 04, 2023 at 07:48:27PM +0000, Matthew Brost wrote:
> > On Tue, Apr 04, 2023 at 09:25:52PM +0200, Daniel Vetter wrote:
> > > On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> > > > On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> > > > > 
> > > > > On 4/4/23 15:10, Christian König wrote:
> > > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > > Hi, Christian,
> > > > > > > 
> > > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > > > > 
> > > > > > > > > For long-running workloads, drivers either need to open-code
> > > > > > > > > completion
> > > > > > > > > waits, invent their own synchronization primitives or internally use
> > > > > > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > > > > without any lockdep annotation all these approaches are error prone.
> > > > > > > > > 
> > > > > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > > > > desirable for
> > > > > > > > > a driver to be able to use it for throttling and error
> > > > > > > > > handling also with
> > > > > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > > > > dma-fence protocol.
> > > > > > > > > 
> > > > > > > > > Introduce long-running completion fences in form of
> > > > > > > > > dma-fences, and add
> > > > > > > > > lockdep annotation for them. In particular:
> > > > > > > > > 
> > > > > > > > > * Do not allow waiting under any memory management locks.
> > > > > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > > > > helper adding
> > > > > > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > > > > > >    complete anytime soon. Typically this will be the
> > > > > > > > > scheduler chaining
> > > > > > > > >    a new long-running fence on another one.
> > > > > > > > 
> > > > > > > > Well that's pretty much what I tried before:
> > > > > > > > https://lwn.net/Articles/893704/
> > > > > > > > 
> > > > 
> > > > I don't think this quite the same, this explictly enforces that we don't
> > > > break the dma-fence rules (in path of memory allocations, exported in
> > > > any way), essentially this just SW sync point reusing dma-fence the
> > > > infrastructure for signaling / callbacks. I believe your series tried to
> > > > export these fences to user space (admittedly I haven't fully read your
> > > > series).
> > > > 
> > > > In this use case we essentially just want to flow control the ring via
> > > > the dma-scheduler + maintain a list of pending jobs so the TDR can be
> > > > used for cleanup if LR entity encounters an error. To me this seems
> > > > perfectly reasonable but I know dma-femce rules are akin to a holy war.
> > > > 
> > > > If we return NULL in run_job, now we have to be able to sink all jobs
> > > > in the backend regardless on ring space, maintain a list of jobs pending
> > > > for cleanup after errors, and write a different cleanup path as now the
> > > > TDR doesn't work. Seems very, very silly to duplicate all of this code
> > > > when the DRM scheduler provides all of this for us. Also if we go this
> > > > route, now all drivers are going to invent ways to handle LR jobs /w the
> > > > DRM scheduler.
> > > > 
> > > > This solution is pretty clear, mark the scheduler as LR, and don't
> > > > export any fences from the scheduler. If you try to export these fences
> > > > a blow up happens.
> > > 
> > > The problem is if you mix things up. Like for resets you need all the
> > > schedulers on an engine/set-of-engines to quiescent or things get
> > > potentially hilarious. If you now have a scheduler in forever limbo, the
> > > dma_fence guarantees are right out the window.
> > > 
> > 
> > Right, a GT reset on Xe is:
> > 
> > Stop all schedulers
> > Do a reset
> > Ban any schedulers which we think caused the GT reset
> > Resubmit all schedulers which we think were good
> > Restart all schedulers
> > 
> > None of this flow depends on LR dma-fences, all of this uses the DRM
> > sched infrastructure and work very well compared to the i915. Rewriting
> > all this with a driver specific implementation is what we are trying to
> > avoid.
> > 
> > Similarly if LR entity hangs on its own (not a GT reset, rather the
> > firmware does the reset for us) we use all the DRM scheduler
> > infrastructure to handle this. Again this works rather well...
> 
> Yeah this is why I don't think duplicating everything that long-running
> jobs need makes any sense. iow I agree with you.
> 

Glad we agree.

> > > But the issue you're having is fairly specific if it's just about
> > > ringspace. I think the dumbest fix is to just block in submit if you run
> > > out of per-ctx ringspace, and call it a day. This notion that somehow the
> > 
> > How does that not break the dma-fence rules? A job can publish its
> > finished fence after ARM, if the finished fence fence waits on ring
> > space that may not free up in a reasonable amount of time we now have
> > broken the dma-dence rules. My understanding is any dma-fence must only
> > on other dma-fence, Christian seems to agree and NAK'd just blocking if
> > no space available [1]. IMO this series ensures we don't break dma-fence
> > rules by restricting how the finished fence can be used.
> 
> Oh I meant in the submit ioctl, _before_ you even call
> drm_sched_job_arm(). It's ok to block in there indefinitely.
>

Ok, but how do we determine if there is ring space? By waiting on an
xe_hw_fence, which is a dma-fence. We just move a wait from the scheduler
to the exec IOCTL and I really fail to see the point of that.

> > > kernel is supposed to provide a bottomless queue of anything userspace
> > > submits simply doesn't hold up in reality (as much as userspace standards
> > > committees would like it to), and as long as it doesn't have a real-world
> > > perf impact it doesn't really matter why we end up blocking in the submit
> > > ioctl. It might also be a simple memory allocation that hits a snag in
> > > page reclaim.
> > > 
> > > > > > > > And the reasons why it was rejected haven't changed.
> > > > > > > > 
> > > > > > > > Regards,
> > > > > > > > Christian.
> > > > > > > > 
> > > > > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > > > > tackle this problem while being able to reuse the scheduler for
> > > > > > > long-running workloads.
> > > > > > > 
> > > > > > > I couldn't see any clear decision on your series, though, but one
> > > > > > > main difference I see is that this is intended for driver-internal
> > > > > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > > indefinite fence problem.
> > > > > > 
> > > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > > problems are the same as with your approach: When we express such
> > > > > > operations as dma_fence there is always the change that we leak that
> > > > > > somewhere.
> > > > > > 
> > > > > > My approach of adding a flag noting that this operation is dangerous and
> > > > > > can't be synced with something memory management depends on tried to
> > > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > > rejected it (for good reasons I think).
> > > > > > 
> > > > > > > 
> > > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > > each driver could hack something up, like sleeping in the
> > > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > > should still be annotated in one way or annotated one way or another
> > > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > > do anything bad.
> > > > > > > 
> > > > > > >  So any suggestions as to what would be the better solution here
> > > > > > > would be appreciated.
> > > > > > 
> > > > > > Mhm, do we really the the GPU scheduler for that?
> > > > > > 
> > > > 
> > > > I think we need to solve this within the DRM scheduler one way or
> > > > another.
> > > 
> > > Yeah so if we conclude that the queue really must be bottomless then I
> > > agree drm-sched should help out sort out the mess. Because I'm guessing
> > > that every driver will have this issue. But that's a big if.
> > > 
> > > I guess if we teach the drm scheduler that some jobs are fairly endless
> > > then maybe it wouldn't be too far-fetched to also teach it to wait for a
> > > previous one to finish (but not with the dma_fence that preempts, which we
> > > put into the dma_resv for memory management, but some other struct
> > > completion). The scheduler already has a concept of not stuffing too much
> > > stuff into the same queue after all, so this should fit?
> > 
> > See above, exact same situation as spinning on flow controling the ring,
> > this IMO absolutely breaks the dma-fence rules. IMO the correct solution
> > is to have a DRM that doesn't export dma-fences, this is exactly what
> > this series does as if we try to, boom lockdep / warn on blow up.
> 
> I dont think it's impossible to do this correctly, but definitely very,
> very hard. Which is why neither Christian nor me like the idea :-)
> 
> Essentially you'd have to make sure that any indefinite way will still
> react to drm_sched_job, so that you're not holding up a gt reset or
> anything like that, but only ever hold up forward progress for this
> specific scheduler/drm_sched_entity. Which you can do as long (and again,
> another hugely tricky detail) you still obey the preempt-ctx dma_fence and
> manage to preempt the underlying long-running ctx even when the drm/sched
> is stuck waiting for an indefinite fence (like waiting for ringspace or
> something like that).
> 
> So I don't think it's impossible, but very far away from "a good idea" :-)
> 
> Hence to proposal to bail out of this entire mess by throwing EWOULDBLCK
> back to userspace directly from the ioctl function, where you still can do
> that without breaking any dma_fence rules. Or if it's not a case that
> matters in practice, simply block in the ioctl handler instead of
> returning EWOULDBLCK.

Returning EWOULDBLOCK on a full ring is reasonable I guess, but again,
without returning a fence in run_job the TDR can't be used for cleanup
on LR entities, which will result in duplicate code open-coded by each
driver. Same goes for checking ring full in exec.

How about this:
- We mark xe_hw_fence as LR to ensure it can't be exported, return this
  in run_job which gives flow control on the ring + the handy TDR
  functionality
- When a scheduler is marked as LR, we do not generate finished fences
  for jobs
- We heavily, heavily scrutinize any usage of the LR fence flag going
  forward
- We document all of this very loudly

Is this reasonable?

Matt
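
For the first bullet, the kind of guard being proposed could look roughly
like this; the flag name and the wrapper are made up for illustration and
are not taken from the RFC patch itself.

#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/dma-fence.h>
#include <linux/dma-resv.h>

/* Made-up long-running flag, stored in the driver-owned user bits. */
#define MY_DMA_FENCE_FLAG_LR_BIT        DMA_FENCE_FLAG_USER_BITS

static bool my_fence_is_lr(struct dma_fence *fence)
{
        return test_bit(MY_DMA_FENCE_FLAG_LR_BIT, &fence->flags);
}

/* Wrapper a driver could use so an LR fence never lands in a dma_resv. */
static void my_resv_add_fence(struct dma_resv *resv, struct dma_fence *fence,
                              enum dma_resv_usage usage)
{
        if (WARN_ON_ONCE(my_fence_is_lr(fence)))
                return;

        dma_resv_add_fence(resv, fence, usage);
}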

> -Daniel
> 
> > 
> > Matt
> > 
> > [1] https://patchwork.freedesktop.org/patch/525461/?series=114772&rev=2
> > 
> > > -Daniel
> > > 
> > > 
> > > > > > I mean in the 1 to 1 case  you basically just need a component which
> > > > > > collects the dependencies as dma_fence and if all of them are fulfilled
> > > > > > schedules a work item.
> > > > > > 
> > > > > > As long as the work item itself doesn't produce a dma_fence it can then
> > > > > > still just wait for other none dma_fence dependencies.
> > > > > > 
> > > > > > Then the work function could submit the work and wait for the result.
> > > > > > 
> > > > > > The work item would then pretty much represent what you want, you can
> > > > > > wait for it to finish and pass it along as long running dependency.
> > > > > > 
> > > > > > Maybe give it a funky name and wrap it up in a structure, but that's
> > > > > > basically it.
> > > > > > 
> > > > > This very much sounds like a i915_sw_fence for the dependency tracking and
> > > > > dma_fence_work for the actual work although it's completion fence is a
> > > > > dma_fence.
> > > > >
> > > > 
> > > > Agree this does sound to i915ish as stated below one of mandates in Xe
> > > > was to use the DRM scheduler. Beyond that as someone who a submission
> > > > backend in the i915 and Xe, I love how the DRM scheduler works (single
> > > > entry point), it makes everything so much easier.
> > > > 
> > > > Matt
> > > > 
> > > > > Although that goes against the whole idea of a condition for merging the xe
> > > > > driver would be that we implement some sort of minimal scaffolding for
> > > > > long-running workloads in the drm scheduler, and the thinking behind that is
> > > > > to avoid implementing intel-specific solutions like those...
> > > > > 
> > > > > Thanks,
> > > > > 
> > > > > Thomas
> > > > > 
> > > > > 
> > > > > 
> > > > > > Regards,
> > > > > > Christian.
> > > > > > 
> > > > > > > 
> > > > > > > Thanks,
> > > > > > > 
> > > > > > > Thomas
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > 
> > > -- 
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter April 6, 2023, 6:32 a.m. UTC | #20
On Wed, Apr 05, 2023 at 11:58:44PM +0000, Matthew Brost wrote:
> On Wed, Apr 05, 2023 at 03:09:08PM +0200, Daniel Vetter wrote:
> > On Tue, Apr 04, 2023 at 07:48:27PM +0000, Matthew Brost wrote:
> > > On Tue, Apr 04, 2023 at 09:25:52PM +0200, Daniel Vetter wrote:
> > > > On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> > > > > On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> > > > > > 
> > > > > > On 4/4/23 15:10, Christian König wrote:
> > > > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > > > Hi, Christian,
> > > > > > > > 
> > > > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > > > > > 
> > > > > > > > > > For long-running workloads, drivers either need to open-code
> > > > > > > > > > completion
> > > > > > > > > > waits, invent their own synchronization primitives or internally use
> > > > > > > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > > > > > without any lockdep annotation all these approaches are error prone.
> > > > > > > > > > 
> > > > > > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > > > > > desirable for
> > > > > > > > > > a driver to be able to use it for throttling and error
> > > > > > > > > > handling also with
> > > > > > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > > > > > dma-fence protocol.
> > > > > > > > > > 
> > > > > > > > > > Introduce long-running completion fences in form of
> > > > > > > > > > dma-fences, and add
> > > > > > > > > > lockdep annotation for them. In particular:
> > > > > > > > > > 
> > > > > > > > > > * Do not allow waiting under any memory management locks.
> > > > > > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > > > > > helper adding
> > > > > > > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > > > > > > >    complete anytime soon. Typically this will be the
> > > > > > > > > > scheduler chaining
> > > > > > > > > >    a new long-running fence on another one.
> > > > > > > > > 
> > > > > > > > > Well that's pretty much what I tried before:
> > > > > > > > > https://lwn.net/Articles/893704/
> > > > > > > > > 
> > > > > 
> > > > > I don't think this quite the same, this explictly enforces that we don't
> > > > > break the dma-fence rules (in path of memory allocations, exported in
> > > > > any way), essentially this just SW sync point reusing dma-fence the
> > > > > infrastructure for signaling / callbacks. I believe your series tried to
> > > > > export these fences to user space (admittedly I haven't fully read your
> > > > > series).
> > > > > 
> > > > > In this use case we essentially just want to flow control the ring via
> > > > > the dma-scheduler + maintain a list of pending jobs so the TDR can be
> > > > > used for cleanup if LR entity encounters an error. To me this seems
> > > > > perfectly reasonable but I know dma-femce rules are akin to a holy war.
> > > > > 
> > > > > If we return NULL in run_job, now we have to be able to sink all jobs
> > > > > in the backend regardless on ring space, maintain a list of jobs pending
> > > > > for cleanup after errors, and write a different cleanup path as now the
> > > > > TDR doesn't work. Seems very, very silly to duplicate all of this code
> > > > > when the DRM scheduler provides all of this for us. Also if we go this
> > > > > route, now all drivers are going to invent ways to handle LR jobs /w the
> > > > > DRM scheduler.
> > > > > 
> > > > > This solution is pretty clear, mark the scheduler as LR, and don't
> > > > > export any fences from the scheduler. If you try to export these fences
> > > > > a blow up happens.
> > > > 
> > > > The problem is if you mix things up. Like for resets you need all the
> > > > schedulers on an engine/set-of-engines to quiescent or things get
> > > > potentially hilarious. If you now have a scheduler in forever limbo, the
> > > > dma_fence guarantees are right out the window.
> > > > 
> > > 
> > > Right, a GT reset on Xe is:
> > > 
> > > Stop all schedulers
> > > Do a reset
> > > Ban any schedulers which we think caused the GT reset
> > > Resubmit all schedulers which we think were good
> > > Restart all schedulers
> > > 
> > > None of this flow depends on LR dma-fences, all of this uses the DRM
> > > sched infrastructure and work very well compared to the i915. Rewriting
> > > all this with a driver specific implementation is what we are trying to
> > > avoid.
> > > 
> > > Similarly if LR entity hangs on its own (not a GT reset, rather the
> > > firmware does the reset for us) we use all the DRM scheduler
> > > infrastructure to handle this. Again this works rather well...
> > 
> > Yeah this is why I don't think duplicating everything that long-running
> > jobs need makes any sense. iow I agree with you.
> > 
> 
> Glad we agree.
> 
> > > > But the issue you're having is fairly specific if it's just about
> > > > ringspace. I think the dumbest fix is to just block in submit if you run
> > > > out of per-ctx ringspace, and call it a day. This notion that somehow the
> > > 
> > > How does that not break the dma-fence rules? A job can publish its
> > > finished fence after ARM, if the finished fence fence waits on ring
> > > space that may not free up in a reasonable amount of time we now have
> > > broken the dma-dence rules. My understanding is any dma-fence must only
> > > on other dma-fence, Christian seems to agree and NAK'd just blocking if
> > > no space available [1]. IMO this series ensures we don't break dma-fence
> > > rules by restricting how the finished fence can be used.
> > 
> > Oh I meant in the submit ioctl, _before_ you even call
> > drm_sched_job_arm(). It's ok to block in there indefinitely.
> >
> 
> Ok, but how do we determine if their is ring space, wait on xe_hw_fence
> which is a dma-fence. We just move a wait from the scheduler to the exec
> IOCTL and I realy fail to see the point of that.

Fill in anything you need into the ring at ioctl time, but don't update
the tail pointers? If there's no space, then EWOULDBLCK.

> > > > kernel is supposed to provide a bottomless queue of anything userspace
> > > > submits simply doesn't hold up in reality (as much as userspace standards
> > > > committees would like it to), and as long as it doesn't have a real-world
> > > > perf impact it doesn't really matter why we end up blocking in the submit
> > > > ioctl. It might also be a simple memory allocation that hits a snag in
> > > > page reclaim.
> > > > 
> > > > > > > > > And the reasons why it was rejected haven't changed.
> > > > > > > > > 
> > > > > > > > > Regards,
> > > > > > > > > Christian.
> > > > > > > > > 
> > > > > > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > > > > > tackle this problem while being able to reuse the scheduler for
> > > > > > > > long-running workloads.
> > > > > > > > 
> > > > > > > > I couldn't see any clear decision on your series, though, but one
> > > > > > > > main difference I see is that this is intended for driver-internal
> > > > > > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > > > indefinite fence problem.
> > > > > > > 
> > > > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > > > problems are the same as with your approach: When we express such
> > > > > > > operations as dma_fence there is always the change that we leak that
> > > > > > > somewhere.
> > > > > > > 
> > > > > > > My approach of adding a flag noting that this operation is dangerous and
> > > > > > > can't be synced with something memory management depends on tried to
> > > > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > > > rejected it (for good reasons I think).
> > > > > > > 
> > > > > > > > 
> > > > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > > > each driver could hack something up, like sleeping in the
> > > > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > > > should still be annotated in one way or annotated one way or another
> > > > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > > > do anything bad.
> > > > > > > > 
> > > > > > > >  So any suggestions as to what would be the better solution here
> > > > > > > > would be appreciated.
> > > > > > > 
> > > > > > > Mhm, do we really the the GPU scheduler for that?
> > > > > > > 
> > > > > 
> > > > > I think we need to solve this within the DRM scheduler one way or
> > > > > another.
> > > > 
> > > > Yeah so if we conclude that the queue really must be bottomless then I
> > > > agree drm-sched should help out sort out the mess. Because I'm guessing
> > > > that every driver will have this issue. But that's a big if.
> > > > 
> > > > I guess if we teach the drm scheduler that some jobs are fairly endless
> > > > then maybe it wouldn't be too far-fetched to also teach it to wait for a
> > > > previous one to finish (but not with the dma_fence that preempts, which we
> > > > put into the dma_resv for memory management, but some other struct
> > > > completion). The scheduler already has a concept of not stuffing too much
> > > > stuff into the same queue after all, so this should fit?
> > > 
> > > See above, exact same situation as spinning on flow controling the ring,
> > > this IMO absolutely breaks the dma-fence rules. IMO the correct solution
> > > is to have a DRM that doesn't export dma-fences, this is exactly what
> > > this series does as if we try to, boom lockdep / warn on blow up.
> > 
> > I dont think it's impossible to do this correctly, but definitely very,
> > very hard. Which is why neither Christian nor me like the idea :-)
> > 
> > Essentially you'd have to make sure that any indefinite way will still
> > react to drm_sched_job, so that you're not holding up a gt reset or
> > anything like that, but only ever hold up forward progress for this
> > specific scheduler/drm_sched_entity. Which you can do as long (and again,
> > another hugely tricky detail) you still obey the preempt-ctx dma_fence and
> > manage to preempt the underlying long-running ctx even when the drm/sched
> > is stuck waiting for an indefinite fence (like waiting for ringspace or
> > something like that).
> > 
> > So I don't think it's impossible, but very far away from "a good idea" :-)
> > 
> > Hence to proposal to bail out of this entire mess by throwing EWOULDBLCK
> > back to userspace directly from the ioctl function, where you still can do
> > that without breaking any dma_fence rules. Or if it's not a case that
> > matters in practice, simply block in the ioctl handler instead of
> > returning EWOULDBLCK.
> 
> Returning EWOULDBLCK on a full ring is reasonsible I guess but again
> without returning a fence in run job the TDR can't be used for clean up
> on LR entities which will result in duplicate code open coded by each
> driver. Same goes for checking ring full in exec.
> 
> How about this:
> - We mark xe_hw_fence as LR to ensure it can't be exported, return this
>   in run_job which gives flow control on the ring + the handy TDR
>   functionality
> - When a scheduler is marked as LR, we do not generate finished fences
>   for jobs
> - We heavily, heavily scrutinize any usage of the LR fence flag going
>   foward
> - We document all of this very loudly
> 
> Is this reasonable?

I'm not seeing why it's needed? If you're worried about TDR duplication
then I think we need something else. Because for long-running ctx we never
have a timeout of the ctx itself (by definition). The only thing we time
out on is the preempt, so I guess what could be done:
- have the minimal scaffolding to support the preempt-ctx fence in
  drm_sched_entity
- when the preempt ctx fence enables signalling a) callback to the driver
  to start the preempt (which should signal the fence) b) start a timer,
  which should catch if the preempt takes too long
- if the timeout fires first (importantly we only enable that when the
  preemption is triggered, not by default), kick off the normal drm/sched tdr
  flow. maybe needs some adjustments in case there's different handling
  needed for when a preemption times out compared to just a job timing out

I think this might make sense for sharing timeout handling needs for
long-running context. What you proposed I don't really follow why it
should exist, because that kind of timeout handling should not ever happen
for long-running jobs.
-Daniel
Matthew Brost April 6, 2023, 4:58 p.m. UTC | #21
On Thu, Apr 06, 2023 at 08:32:59AM +0200, Daniel Vetter wrote:
> On Wed, Apr 05, 2023 at 11:58:44PM +0000, Matthew Brost wrote:
> > On Wed, Apr 05, 2023 at 03:09:08PM +0200, Daniel Vetter wrote:
> > > On Tue, Apr 04, 2023 at 07:48:27PM +0000, Matthew Brost wrote:
> > > > On Tue, Apr 04, 2023 at 09:25:52PM +0200, Daniel Vetter wrote:
> > > > > On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> > > > > > On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> > > > > > > 
> > > > > > > On 4/4/23 15:10, Christian König wrote:
> > > > > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > > > > Hi, Christian,
> > > > > > > > > 
> > > > > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > > > > > > 
> > > > > > > > > > > For long-running workloads, drivers either need to open-code
> > > > > > > > > > > completion
> > > > > > > > > > > waits, invent their own synchronization primitives or internally use
> > > > > > > > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > > > > > > without any lockdep annotation all these approaches are error prone.
> > > > > > > > > > > 
> > > > > > > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > > > > > > desirable for
> > > > > > > > > > > a driver to be able to use it for throttling and error
> > > > > > > > > > > handling also with
> > > > > > > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > > > > > > dma-fence protocol.
> > > > > > > > > > > 
> > > > > > > > > > > Introduce long-running completion fences in form of
> > > > > > > > > > > dma-fences, and add
> > > > > > > > > > > lockdep annotation for them. In particular:
> > > > > > > > > > > 
> > > > > > > > > > > * Do not allow waiting under any memory management locks.
> > > > > > > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > > > > > > helper adding
> > > > > > > > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > > > > > > > >    complete anytime soon. Typically this will be the
> > > > > > > > > > > scheduler chaining
> > > > > > > > > > >    a new long-running fence on another one.
> > > > > > > > > > 
> > > > > > > > > > Well that's pretty much what I tried before:
> > > > > > > > > > https://lwn.net/Articles/893704/
> > > > > > > > > > 
> > > > > > 
> > > > > > I don't think this quite the same, this explictly enforces that we don't
> > > > > > break the dma-fence rules (in path of memory allocations, exported in
> > > > > > any way), essentially this just SW sync point reusing dma-fence the
> > > > > > infrastructure for signaling / callbacks. I believe your series tried to
> > > > > > export these fences to user space (admittedly I haven't fully read your
> > > > > > series).
> > > > > > 
> > > > > > In this use case we essentially just want to flow control the ring via
> > > > > > the dma-scheduler + maintain a list of pending jobs so the TDR can be
> > > > > > used for cleanup if LR entity encounters an error. To me this seems
> > > > > > perfectly reasonable but I know dma-femce rules are akin to a holy war.
> > > > > > 
> > > > > > If we return NULL in run_job, now we have to be able to sink all jobs
> > > > > > in the backend regardless on ring space, maintain a list of jobs pending
> > > > > > for cleanup after errors, and write a different cleanup path as now the
> > > > > > TDR doesn't work. Seems very, very silly to duplicate all of this code
> > > > > > when the DRM scheduler provides all of this for us. Also if we go this
> > > > > > route, now all drivers are going to invent ways to handle LR jobs /w the
> > > > > > DRM scheduler.
> > > > > > 
> > > > > > This solution is pretty clear, mark the scheduler as LR, and don't
> > > > > > export any fences from the scheduler. If you try to export these fences
> > > > > > a blow up happens.
> > > > > 
> > > > > The problem is if you mix things up. Like for resets you need all the
> > > > > schedulers on an engine/set-of-engines to quiescent or things get
> > > > > potentially hilarious. If you now have a scheduler in forever limbo, the
> > > > > dma_fence guarantees are right out the window.
> > > > > 
> > > > 
> > > > Right, a GT reset on Xe is:
> > > > 
> > > > Stop all schedulers
> > > > Do a reset
> > > > Ban any schedulers which we think caused the GT reset
> > > > Resubmit all schedulers which we think were good
> > > > Restart all schedulers
> > > > 
> > > > None of this flow depends on LR dma-fences, all of this uses the DRM
> > > > sched infrastructure and work very well compared to the i915. Rewriting
> > > > all this with a driver specific implementation is what we are trying to
> > > > avoid.
> > > > 
> > > > Similarly if LR entity hangs on its own (not a GT reset, rather the
> > > > firmware does the reset for us) we use all the DRM scheduler
> > > > infrastructure to handle this. Again this works rather well...
> > > 
> > > Yeah this is why I don't think duplicating everything that long-running
> > > jobs need makes any sense. iow I agree with you.
> > > 
> > 
> > Glad we agree.
> > 
> > > > > But the issue you're having is fairly specific if it's just about
> > > > > ringspace. I think the dumbest fix is to just block in submit if you run
> > > > > out of per-ctx ringspace, and call it a day. This notion that somehow the
> > > > 
> > > > How does that not break the dma-fence rules? A job can publish its
> > > > finished fence after ARM, if the finished fence fence waits on ring
> > > > space that may not free up in a reasonable amount of time we now have
> > > > broken the dma-dence rules. My understanding is any dma-fence must only
> > > > on other dma-fence, Christian seems to agree and NAK'd just blocking if
> > > > no space available [1]. IMO this series ensures we don't break dma-fence
> > > > rules by restricting how the finished fence can be used.
> > > 
> > > Oh I meant in the submit ioctl, _before_ you even call
> > > drm_sched_job_arm(). It's ok to block in there indefinitely.
> > >
> > 
> > Ok, but how do we determine if their is ring space, wait on xe_hw_fence
> > which is a dma-fence. We just move a wait from the scheduler to the exec
> > IOCTL and I realy fail to see the point of that.
> 
> Fill in anything you need into the ring at ioctl time, but don't update
> the tail pointers? If there's no space, then EWOULDBLCK.
> 

Ok, I can maybe buy this approach and this is fairly easy to do. I'm
going to do this for LR jobs only though (non-LR jobs will still be
flow controlled on the ring via the scheduler + write the ring in
run_job). A bit of duplicate code but I can live with this.
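
A minimal standalone model of that flow-control idea, purely for
illustration (this is not Xe code; the ring layout, sizes and all toy_*
names are invented):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_RING_SIZE 4096u	/* bytes, power of two */

struct toy_ring {
	uint8_t  buf[TOY_RING_SIZE];
	uint32_t head;		/* advanced by the "hardware" */
	uint32_t tail;		/* advanced by the exec ioctl */
};

static uint32_t toy_ring_space(const struct toy_ring *ring)
{
	/* Free bytes, keeping one byte unused to tell full from empty. */
	return (ring->head - ring->tail - 1) & (TOY_RING_SIZE - 1);
}

/* Model of the exec ioctl: either the whole job fits, or fail right away. */
static int toy_ring_emit(struct toy_ring *ring, const uint8_t *cmds,
			 uint32_t len)
{
	uint32_t i;

	if (len > toy_ring_space(ring))
		return -EWOULDBLOCK;	/* no blocking, no dma-fence wait */

	for (i = 0; i < len; i++)
		ring->buf[(ring->tail + i) & (TOY_RING_SIZE - 1)] = cmds[i];

	/* Only now publish the commands by moving the tail. */
	ring->tail = (ring->tail + len) & (TOY_RING_SIZE - 1);
	return 0;
}

int main(void)
{
	struct toy_ring ring = { .head = 0, .tail = 0 };
	uint8_t job[256] = { 0 };
	int submitted = 0;

	while (toy_ring_emit(&ring, job, sizeof(job)) == 0)
		submitted++;

	printf("submitted %d jobs before -EWOULDBLOCK\n", submitted);
	return 0;
}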

> > > > > kernel is supposed to provide a bottomless queue of anything userspace
> > > > > submits simply doesn't hold up in reality (as much as userspace standards
> > > > > committees would like it to), and as long as it doesn't have a real-world
> > > > > perf impact it doesn't really matter why we end up blocking in the submit
> > > > > ioctl. It might also be a simple memory allocation that hits a snag in
> > > > > page reclaim.
> > > > > 
> > > > > > > > > > And the reasons why it was rejected haven't changed.
> > > > > > > > > > 
> > > > > > > > > > Regards,
> > > > > > > > > > Christian.
> > > > > > > > > > 
> > > > > > > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > > > > > > tackle this problem while being able to reuse the scheduler for
> > > > > > > > > long-running workloads.
> > > > > > > > > 
> > > > > > > > > I couldn't see any clear decision on your series, though, but one
> > > > > > > > > main difference I see is that this is intended for driver-internal
> > > > > > > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > > > > indefinite fence problem.
> > > > > > > > 
> > > > > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > > > > problems are the same as with your approach: When we express such
> > > > > > > > operations as dma_fence there is always the change that we leak that
> > > > > > > > somewhere.
> > > > > > > > 
> > > > > > > > My approach of adding a flag noting that this operation is dangerous and
> > > > > > > > can't be synced with something memory management depends on tried to
> > > > > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > > > > rejected it (for good reasons I think).
> > > > > > > > 
> > > > > > > > > 
> > > > > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > > > > each driver could hack something up, like sleeping in the
> > > > > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > > > > should still be annotated in one way or annotated one way or another
> > > > > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > > > > do anything bad.
> > > > > > > > > 
> > > > > > > > >  So any suggestions as to what would be the better solution here
> > > > > > > > > would be appreciated.
> > > > > > > > 
> > > > > > > > Mhm, do we really the the GPU scheduler for that?
> > > > > > > > 
> > > > > > 
> > > > > > I think we need to solve this within the DRM scheduler one way or
> > > > > > another.
> > > > > 
> > > > > Yeah so if we conclude that the queue really must be bottomless then I
> > > > > agree drm-sched should help out sort out the mess. Because I'm guessing
> > > > > that every driver will have this issue. But that's a big if.
> > > > > 
> > > > > I guess if we teach the drm scheduler that some jobs are fairly endless
> > > > > then maybe it wouldn't be too far-fetched to also teach it to wait for a
> > > > > previous one to finish (but not with the dma_fence that preempts, which we
> > > > > put into the dma_resv for memory management, but some other struct
> > > > > completion). The scheduler already has a concept of not stuffing too much
> > > > > stuff into the same queue after all, so this should fit?
> > > > 
> > > > See above, exact same situation as spinning on flow controling the ring,
> > > > this IMO absolutely breaks the dma-fence rules. IMO the correct solution
> > > > is to have a DRM that doesn't export dma-fences, this is exactly what
> > > > this series does as if we try to, boom lockdep / warn on blow up.
> > > 
> > > I dont think it's impossible to do this correctly, but definitely very,
> > > very hard. Which is why neither Christian nor me like the idea :-)
> > > 
> > > Essentially you'd have to make sure that any indefinite way will still
> > > react to drm_sched_job, so that you're not holding up a gt reset or
> > > anything like that, but only ever hold up forward progress for this
> > > specific scheduler/drm_sched_entity. Which you can do as long (and again,
> > > another hugely tricky detail) you still obey the preempt-ctx dma_fence and
> > > manage to preempt the underlying long-running ctx even when the drm/sched
> > > is stuck waiting for an indefinite fence (like waiting for ringspace or
> > > something like that).
> > > 
> > > So I don't think it's impossible, but very far away from "a good idea" :-)
> > > 
> > > Hence to proposal to bail out of this entire mess by throwing EWOULDBLCK
> > > back to userspace directly from the ioctl function, where you still can do
> > > that without breaking any dma_fence rules. Or if it's not a case that
> > > matters in practice, simply block in the ioctl handler instead of
> > > returning EWOULDBLCK.
> > 
> > Returning EWOULDBLCK on a full ring is reasonsible I guess but again
> > without returning a fence in run job the TDR can't be used for clean up
> > on LR entities which will result in duplicate code open coded by each
> > driver. Same goes for checking ring full in exec.
> > 
> > How about this:
> > - We mark xe_hw_fence as LR to ensure it can't be exported, return this
> >   in run_job which gives flow control on the ring + the handy TDR
> >   functionality
> > - When a scheduler is marked as LR, we do not generate finished fences
> >   for jobs
> > - We heavily, heavily scrutinize any usage of the LR fence flag going
> >   foward
> > - We document all of this very loudly
> > 
> > Is this reasonable?
> 
> I'm not seeing why it's needed? If you're worried about TDR duplication
> then I think we need something else. Because for long-running ctx we never
> have a timeout of the ctx itself (by definition). The only thing we time
> out on is the preempt, so I guess what could be done:
> - have the minimal scaffolding to support the preempt-ctx fence in
>   drm_sched_entity
> - when the preempt ctx fence enables signalling a) callback to the driver
>   to start the preempt (which should signal the fence) b) start a timer,
>   which should catch if the preempt takes too long

The GuC does this for us, no need.

> - if the timeout fires first (importantly we only enable that when the
>   preemption is triggered, not by default), kick off the normal drm/sched tdr
>   flow. maybe needs some adjustments in case there's different handling
>   needed for when a preemption times out compared to just a job timing out
>

The GuC informs us of this and yes, we kick the TDR.

> I think this might make sense for sharing timeout handling needs for
> long-running context. What you proposed I don't really follow why it
> should exist, because that kind of timeout handling should not ever happen
> for long-running jobs.

We use the TDR as a single cleanup point for all entities. In the
case of an LR entity this occurs if the GuC issues a reset on the
entity (likely a preempt timeout), the entity takes a non-recoverable
page fault, or the entity is the root cause of a GT reset. The pending
job list here is handy, which is why I wanted to return xe_hw_fence in
run_job to hold the job in the scheduler's pending list. The TDR won't
fire if the pending list is empty.

Based on what you are saying, my new proposal:

1. Write the ring in exec for LR jobs, return -EWOULDBLOCK if no space
in the ring
2. Return NULL in run_job (or alternatively a signaled fence)
3. Have a special-cased cleanup flow for LR entities (not the TDR,
rather likely a different worker we kick, owned by the xe_engine).
4. Document all of this so that this is how drivers are expected to do
LR workloads with the DRM scheduler

1 & 3 are pretty clearly duplicated code, but I can live with that if
I can get an Ack on the plan and move on. The coding will not be all
that difficult either, I am just being difficult. In the end it is
probably 100-ish lines of code.
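
A rough sketch of what points 2 and 3 could look like on top of the DRM
scheduler. The drm_sched_backend_ops hook is the real scheduler
interface, but xe_lr_engine, the cleanup worker and every function name
below are hypothetical placeholders, not actual Xe code:

#include <drm/gpu_scheduler.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct xe_lr_engine {				/* hypothetical */
	struct drm_gpu_scheduler sched;
	struct work_struct lr_cleanup_work;	/* point 3: not the TDR */
};

static struct dma_fence *xe_lr_run_job(struct drm_sched_job *job)
{
	/*
	 * Point 1 already wrote the ring from the exec ioctl, so there is
	 * nothing left to flow control here.  Point 2: return NULL so the
	 * scheduler never gets (and thus never exports) a finished fence
	 * for an LR job.
	 */
	return NULL;
}

static void xe_lr_cleanup_work(struct work_struct *w)
{
	struct xe_lr_engine *e = container_of(w, struct xe_lr_engine,
					      lr_cleanup_work);

	/*
	 * Point 3: kicked by the driver when the GuC resets the entity
	 * (e.g. preempt timeout), on a non-recoverable page fault, or when
	 * the entity caused a GT reset; tear the entity down here instead
	 * of relying on the scheduler TDR.
	 */
	(void)e;
}

static void xe_lr_engine_init(struct xe_lr_engine *e)
{
	INIT_WORK(&e->lr_cleanup_work, xe_lr_cleanup_work);
}

static const struct drm_sched_backend_ops xe_lr_sched_ops = {
	.run_job = xe_lr_run_job,
	/* .timedout_job / .free_job wired up as for regular entities */
};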

What do you think, Daniel, does this seem like a reasonable plan?

Matt

> -Daniel
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter April 6, 2023, 5:09 p.m. UTC | #22
On Thu, 6 Apr 2023 at 18:58, Matthew Brost <matthew.brost@intel.com> wrote:
>
> On Thu, Apr 06, 2023 at 08:32:59AM +0200, Daniel Vetter wrote:
> > On Wed, Apr 05, 2023 at 11:58:44PM +0000, Matthew Brost wrote:
> > > On Wed, Apr 05, 2023 at 03:09:08PM +0200, Daniel Vetter wrote:
> > > > On Tue, Apr 04, 2023 at 07:48:27PM +0000, Matthew Brost wrote:
> > > > > On Tue, Apr 04, 2023 at 09:25:52PM +0200, Daniel Vetter wrote:
> > > > > > On Tue, Apr 04, 2023 at 07:02:23PM +0000, Matthew Brost wrote:
> > > > > > > On Tue, Apr 04, 2023 at 08:14:01PM +0200, Thomas Hellström (Intel) wrote:
> > > > > > > >
> > > > > > > > On 4/4/23 15:10, Christian König wrote:
> > > > > > > > > Am 04.04.23 um 14:54 schrieb Thomas Hellström:
> > > > > > > > > > Hi, Christian,
> > > > > > > > > >
> > > > > > > > > > On 4/4/23 11:09, Christian König wrote:
> > > > > > > > > > > Am 04.04.23 um 02:22 schrieb Matthew Brost:
> > > > > > > > > > > > From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > > > > > > >
> > > > > > > > > > > > For long-running workloads, drivers either need to open-code
> > > > > > > > > > > > completion
> > > > > > > > > > > > waits, invent their own synchronization primitives or internally use
> > > > > > > > > > > > dma-fences that do not obey the cross-driver dma-fence protocol, but
> > > > > > > > > > > > without any lockdep annotation all these approaches are error prone.
> > > > > > > > > > > >
> > > > > > > > > > > > So since for example the drm scheduler uses dma-fences it is
> > > > > > > > > > > > desirable for
> > > > > > > > > > > > a driver to be able to use it for throttling and error
> > > > > > > > > > > > handling also with
> > > > > > > > > > > > internal dma-fences tha do not obey the cros-driver
> > > > > > > > > > > > dma-fence protocol.
> > > > > > > > > > > >
> > > > > > > > > > > > Introduce long-running completion fences in form of
> > > > > > > > > > > > dma-fences, and add
> > > > > > > > > > > > lockdep annotation for them. In particular:
> > > > > > > > > > > >
> > > > > > > > > > > > * Do not allow waiting under any memory management locks.
> > > > > > > > > > > > * Do not allow to attach them to a dma-resv object.
> > > > > > > > > > > > * Introduce a new interface for adding callbacks making the
> > > > > > > > > > > > helper adding
> > > > > > > > > > > >    a callback sign off on that it is aware that the dma-fence may not
> > > > > > > > > > > >    complete anytime soon. Typically this will be the
> > > > > > > > > > > > scheduler chaining
> > > > > > > > > > > >    a new long-running fence on another one.
> > > > > > > > > > >
> > > > > > > > > > > Well that's pretty much what I tried before:
> > > > > > > > > > > https://lwn.net/Articles/893704/
> > > > > > > > > > >
> > > > > > >
> > > > > > > I don't think this quite the same, this explictly enforces that we don't
> > > > > > > break the dma-fence rules (in path of memory allocations, exported in
> > > > > > > any way), essentially this just SW sync point reusing dma-fence the
> > > > > > > infrastructure for signaling / callbacks. I believe your series tried to
> > > > > > > export these fences to user space (admittedly I haven't fully read your
> > > > > > > series).
> > > > > > >
> > > > > > > In this use case we essentially just want to flow control the ring via
> > > > > > > the dma-scheduler + maintain a list of pending jobs so the TDR can be
> > > > > > > used for cleanup if LR entity encounters an error. To me this seems
> > > > > > > perfectly reasonable but I know dma-femce rules are akin to a holy war.
> > > > > > >
> > > > > > > If we return NULL in run_job, now we have to be able to sink all jobs
> > > > > > > in the backend regardless on ring space, maintain a list of jobs pending
> > > > > > > for cleanup after errors, and write a different cleanup path as now the
> > > > > > > TDR doesn't work. Seems very, very silly to duplicate all of this code
> > > > > > > when the DRM scheduler provides all of this for us. Also if we go this
> > > > > > > route, now all drivers are going to invent ways to handle LR jobs /w the
> > > > > > > DRM scheduler.
> > > > > > >
> > > > > > > This solution is pretty clear, mark the scheduler as LR, and don't
> > > > > > > export any fences from the scheduler. If you try to export these fences
> > > > > > > a blow up happens.
> > > > > >
> > > > > > The problem is if you mix things up. Like for resets you need all the
> > > > > > schedulers on an engine/set-of-engines to quiescent or things get
> > > > > > potentially hilarious. If you now have a scheduler in forever limbo, the
> > > > > > dma_fence guarantees are right out the window.
> > > > > >
> > > > >
> > > > > Right, a GT reset on Xe is:
> > > > >
> > > > > Stop all schedulers
> > > > > Do a reset
> > > > > Ban any schedulers which we think caused the GT reset
> > > > > Resubmit all schedulers which we think were good
> > > > > Restart all schedulers
> > > > >
> > > > > None of this flow depends on LR dma-fences, all of this uses the DRM
> > > > > sched infrastructure and work very well compared to the i915. Rewriting
> > > > > all this with a driver specific implementation is what we are trying to
> > > > > avoid.
> > > > >
> > > > > Similarly if LR entity hangs on its own (not a GT reset, rather the
> > > > > firmware does the reset for us) we use all the DRM scheduler
> > > > > infrastructure to handle this. Again this works rather well...
> > > >
> > > > Yeah this is why I don't think duplicating everything that long-running
> > > > jobs need makes any sense. iow I agree with you.
> > > >
> > >
> > > Glad we agree.
> > >
> > > > > > But the issue you're having is fairly specific if it's just about
> > > > > > ringspace. I think the dumbest fix is to just block in submit if you run
> > > > > > out of per-ctx ringspace, and call it a day. This notion that somehow the
> > > > >
> > > > > How does that not break the dma-fence rules? A job can publish its
> > > > > finished fence after ARM, if the finished fence fence waits on ring
> > > > > space that may not free up in a reasonable amount of time we now have
> > > > > broken the dma-dence rules. My understanding is any dma-fence must only
> > > > > on other dma-fence, Christian seems to agree and NAK'd just blocking if
> > > > > no space available [1]. IMO this series ensures we don't break dma-fence
> > > > > rules by restricting how the finished fence can be used.
> > > >
> > > > Oh I meant in the submit ioctl, _before_ you even call
> > > > drm_sched_job_arm(). It's ok to block in there indefinitely.
> > > >
> > >
> > > Ok, but how do we determine if their is ring space, wait on xe_hw_fence
> > > which is a dma-fence. We just move a wait from the scheduler to the exec
> > > IOCTL and I realy fail to see the point of that.
> >
> > Fill in anything you need into the ring at ioctl time, but don't update
> > the tail pointers? If there's no space, then EWOULDBLCK.
> >
>
> Ok, I can maybe buy this approach and this is fairly easy to do. I'm
> going to do this for LR jobs only though (non-LR jobs will still be
> flow controlled on the ring via the scheduler + write the ring in
> run_job). A bit of duplicate code but I can live with this.
>
> > > > > > kernel is supposed to provide a bottomless queue of anything userspace
> > > > > > submits simply doesn't hold up in reality (as much as userspace standards
> > > > > > committees would like it to), and as long as it doesn't have a real-world
> > > > > > perf impact it doesn't really matter why we end up blocking in the submit
> > > > > > ioctl. It might also be a simple memory allocation that hits a snag in
> > > > > > page reclaim.
> > > > > >
> > > > > > > > > > > And the reasons why it was rejected haven't changed.
> > > > > > > > > > >
> > > > > > > > > > > Regards,
> > > > > > > > > > > Christian.
> > > > > > > > > > >
> > > > > > > > > > Yes, TBH this was mostly to get discussion going how we'd best
> > > > > > > > > > tackle this problem while being able to reuse the scheduler for
> > > > > > > > > > long-running workloads.
> > > > > > > > > >
> > > > > > > > > > I couldn't see any clear decision on your series, though, but one
> > > > > > > > > > main difference I see is that this is intended for driver-internal
> > > > > > > > > > use only. (I'm counting using the drm_scheduler as a helper for
> > > > > > > > > > driver-private use). This is by no means a way to try tackle the
> > > > > > > > > > indefinite fence problem.
> > > > > > > > >
> > > > > > > > > Well this was just my latest try to tackle this, but essentially the
> > > > > > > > > problems are the same as with your approach: When we express such
> > > > > > > > > operations as dma_fence there is always the change that we leak that
> > > > > > > > > somewhere.
> > > > > > > > >
> > > > > > > > > My approach of adding a flag noting that this operation is dangerous and
> > > > > > > > > can't be synced with something memory management depends on tried to
> > > > > > > > > contain this as much as possible, but Daniel still pretty clearly
> > > > > > > > > rejected it (for good reasons I think).
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > We could ofc invent a completely different data-type that abstracts
> > > > > > > > > > the synchronization the scheduler needs in the long-running case, or
> > > > > > > > > > each driver could hack something up, like sleeping in the
> > > > > > > > > > prepare_job() or run_job() callback for throttling, but those waits
> > > > > > > > > > should still be annotated in one way or annotated one way or another
> > > > > > > > > > (and probably in a similar way across drivers) to make sure we don't
> > > > > > > > > > do anything bad.
> > > > > > > > > >
> > > > > > > > > >  So any suggestions as to what would be the better solution here
> > > > > > > > > > would be appreciated.
> > > > > > > > >
> > > > > > > > > Mhm, do we really the the GPU scheduler for that?
> > > > > > > > >
> > > > > > >
> > > > > > > I think we need to solve this within the DRM scheduler one way or
> > > > > > > another.
> > > > > >
> > > > > > Yeah so if we conclude that the queue really must be bottomless then I
> > > > > > agree drm-sched should help out sort out the mess. Because I'm guessing
> > > > > > that every driver will have this issue. But that's a big if.
> > > > > >
> > > > > > I guess if we teach the drm scheduler that some jobs are fairly endless
> > > > > > then maybe it wouldn't be too far-fetched to also teach it to wait for a
> > > > > > previous one to finish (but not with the dma_fence that preempts, which we
> > > > > > put into the dma_resv for memory management, but some other struct
> > > > > > completion). The scheduler already has a concept of not stuffing too much
> > > > > > stuff into the same queue after all, so this should fit?
> > > > >
> > > > > See above, exact same situation as spinning on flow controling the ring,
> > > > > this IMO absolutely breaks the dma-fence rules. IMO the correct solution
> > > > > is to have a DRM that doesn't export dma-fences, this is exactly what
> > > > > this series does as if we try to, boom lockdep / warn on blow up.
> > > >
> > > > I dont think it's impossible to do this correctly, but definitely very,
> > > > very hard. Which is why neither Christian nor me like the idea :-)
> > > >
> > > > Essentially you'd have to make sure that any indefinite way will still
> > > > react to drm_sched_job, so that you're not holding up a gt reset or
> > > > anything like that, but only ever hold up forward progress for this
> > > > specific scheduler/drm_sched_entity. Which you can do as long (and again,
> > > > another hugely tricky detail) you still obey the preempt-ctx dma_fence and
> > > > manage to preempt the underlying long-running ctx even when the drm/sched
> > > > is stuck waiting for an indefinite fence (like waiting for ringspace or
> > > > something like that).
> > > >
> > > > So I don't think it's impossible, but very far away from "a good idea" :-)
> > > >
> > > > Hence to proposal to bail out of this entire mess by throwing EWOULDBLCK
> > > > back to userspace directly from the ioctl function, where you still can do
> > > > that without breaking any dma_fence rules. Or if it's not a case that
> > > > matters in practice, simply block in the ioctl handler instead of
> > > > returning EWOULDBLCK.
> > >
> > > Returning EWOULDBLCK on a full ring is reasonsible I guess but again
> > > without returning a fence in run job the TDR can't be used for clean up
> > > on LR entities which will result in duplicate code open coded by each
> > > driver. Same goes for checking ring full in exec.
> > >
> > > How about this:
> > > - We mark xe_hw_fence as LR to ensure it can't be exported, return this
> > >   in run_job which gives flow control on the ring + the handy TDR
> > >   functionality
> > > - When a scheduler is marked as LR, we do not generate finished fences
> > >   for jobs
> > > - We heavily, heavily scrutinize any usage of the LR fence flag going
> > >   foward
> > > - We document all of this very loudly
> > >
> > > Is this reasonable?
> >
> > I'm not seeing why it's needed? If you're worried about TDR duplication
> > then I think we need something else. Because for long-running ctx we never
> > have a timeout of the ctx itself (by definition). The only thing we time
> > out on is the preempt, so I guess what could be done:
> > - have the minimal scaffolding to support the preempt-ctx fence in
> >   drm_sched_entity
> > - when the preempt ctx fence enables signalling a) callback to the driver
> >   to start the preempt (which should signal the fence) b) start a timer,
> >   which should catch if the preempt takes too long
>
> The GuC does this for us, no need.
>
> > - if the timeout fires first (importantly we only enable that when the
> >   preemption is triggered, not by default), kick off the normal drm/sched tdr
> >   flow. maybe needs some adjustments in case there's different handling
> >   needed for when a preemption times out compared to just a job timing out
> >
>
> The GuC informs us of this and yes, we kick the TDR.

You might still need the kernel fallback for when the GuC dies? But
yeah, not sure how much similarity is then left with the end-of-batch
timeout case.

> > I think this might make sense for sharing timeout handling needs for
> > long-running context. What you proposed I don't really follow why it
> > should exist, because that kind of timeout handling should not ever happen
> > for long-running jobs.
>
> We use the TDR as a single cleanup point for all entities. In the
> case of an LR entity this occurs if the GuC issues a reset on the
> entity (likely a preempt timeout), the entity takes a non-recoverable
> page fault, or the entity is the root cause of a GT reset. The pending
> job list here is handy, which is why I wanted to return xe_hw_fence in
> run_job to hold the job in the scheduler's pending list. The TDR won't
> fire if the pending list is empty.
>
> Based on what you are saying, my new proposal:
>
> 1. Write the ring in exec for LR jobs, return -EWOULDBLOCK if no space
> in the ring
> 2. Return NULL in run_job (or alternatively a signaled fence)
> 3. Have a special-cased cleanup flow for LR entities (not the TDR,
> rather likely a different worker we kick, owned by the xe_engine).
> 4. Document all of this so that this is how drivers are expected to do
> LR workloads with the DRM scheduler
>
> 1 & 3 are pretty clearly duplicated code, but I can live with that if
> I can get an Ack on the plan and move on. The coding will not be all
> that difficult either, I am just being difficult. In the end it is
> probably 100-ish lines of code.

For 1 I guess you could also write the ring contents for end-of-batch
jobs from the ioctl (and then just block until the ring has advanced
enough, if needed). That is how we had i915-gem operate before the
i915 scheduler happened.

For 3 I'm not sure there's really that much to share; the end-of-batch
and preempt-ctx dma_fences are just rather different. I'm not too clear
on why LR needs a special cleanup flow, isn't it roughly the same:
- stop drm/sched from pushing in new jobs
- preempt the ctx to kill it

With xe I don't think we ever want to let existing jobs complete,
neither for legacy nor for lr ctx.
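
Roughly, that kill flow as a sketch; drm_sched_stop() is the real
scheduler helper, while the toy_engine struct and the preempt stub are
assumptions standing in for the driver/GuC-specific part:

#include <drm/gpu_scheduler.h>

struct toy_engine {			/* hypothetical driver object */
	struct drm_gpu_scheduler sched;
};

static void toy_engine_preempt(struct toy_engine *e)
{
	/* driver/GuC specific: force the context off the hardware */
}

static void toy_engine_kill(struct toy_engine *e)
{
	/* 1) stop drm/sched from pushing in new jobs */
	drm_sched_stop(&e->sched, NULL);

	/* 2) preempt the ctx to kill it; existing jobs, legacy or LR,
	 *    are never allowed to complete */
	toy_engine_preempt(e);
}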


> What do you think, Daniel, does this seem like a reasonable plan?

Yeah, my comments above are just minor side questions; I think this is
roughly what we want, plus/minus polish details of how much code
sharing makes sense between legacy/LR ctx in xe and for LR with other
drivers. I think the in-fences handling (because that ties into memory
management when evicting and restoring a VM) is the big part which
ideally has as much shared as possible, which I think this achieves.
-Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
diff mbox series

Patch

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index f177c56269bb..9726b2a3c67d 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -111,6 +111,20 @@  static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(1);
  * drivers/gpu should ever call dma_fence_wait() in such contexts.
  */
 
+/**
+ * DOC: Long-Running (lr) dma-fences.
+ *
+ * * Long-running dma-fences are NOT required to complete in reasonable time.
+ *   Typically they signal completion of user-space controlled workloads and
+ *   as such, need to never be part of a cross-driver contract, never waited
+ *   for inside a kernel lock, nor attached to a dma-resv. There are helpers
+ *   and warnings in place to help facilitate that that never happens.
+ *
+ * * The motivation for their existence is that helpers that are intended to
+ *   be used by drivers may use dma-fences that, given the workloads mentioned
+ *   above, become long-running.
+ */
+
 static const char *dma_fence_stub_get_name(struct dma_fence *fence)
 {
         return "stub";
@@ -284,6 +298,34 @@  static struct lockdep_map dma_fence_lockdep_map = {
 	.name = "dma_fence_map"
 };
 
+static struct lockdep_map dma_fence_lr_lockdep_map = {
+	.name = "dma_fence_lr_map"
+};
+
+static bool __dma_fence_begin_signalling(struct lockdep_map *map)
+{
+	/* explicitly nesting ... */
+	if (lock_is_held_type(map, 1))
+		return true;
+
+	/* rely on might_sleep check for soft/hardirq locks */
+	if (in_atomic())
+		return true;
+
+	/* ... and non-recursive readlock */
+	lock_acquire(map, 0, 0, 1, 1, NULL, _RET_IP_);
+
+	return false;
+}
+
+static void __dma_fence_end_signalling(bool cookie, struct lockdep_map *map)
+{
+	if (cookie)
+		return;
+
+	lock_release(map, _RET_IP_);
+}
+
 /**
  * dma_fence_begin_signalling - begin a critical DMA fence signalling section
  *
@@ -300,18 +342,7 @@  static struct lockdep_map dma_fence_lockdep_map = {
  */
 bool dma_fence_begin_signalling(void)
 {
-	/* explicitly nesting ... */
-	if (lock_is_held_type(&dma_fence_lockdep_map, 1))
-		return true;
-
-	/* rely on might_sleep check for soft/hardirq locks */
-	if (in_atomic())
-		return true;
-
-	/* ... and non-recursive readlock */
-	lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _RET_IP_);
-
-	return false;
+	return __dma_fence_begin_signalling(&dma_fence_lockdep_map);
 }
 EXPORT_SYMBOL(dma_fence_begin_signalling);
 
@@ -323,25 +354,61 @@  EXPORT_SYMBOL(dma_fence_begin_signalling);
  */
 void dma_fence_end_signalling(bool cookie)
 {
-	if (cookie)
-		return;
-
-	lock_release(&dma_fence_lockdep_map, _RET_IP_);
+	__dma_fence_end_signalling(cookie, &dma_fence_lockdep_map);
 }
 EXPORT_SYMBOL(dma_fence_end_signalling);
 
-void __dma_fence_might_wait(void)
+/**
+ * dma_fence_lr_begin_signalling - begin a critical long-running DMA fence
+ * signalling section
+ *
+ * Drivers should use this to annotate the beginning of any code section
+ * required to eventually complete &dma_fence by calling dma_fence_signal().
+ *
+ * The end of these critical sections are annotated with
+ * dma_fence_lr_end_signalling(). Ideally the section should encompass all
+ * locks that are ever required to signal a long-running dma-fence.
+ *
+ * Return: An opaque cookie needed by the implementation, which needs to be
+ * passed to dma_fence_lr_end_signalling().
+ */
+bool dma_fence_lr_begin_signalling(void)
+{
+	return __dma_fence_begin_signalling(&dma_fence_lr_lockdep_map);
+}
+EXPORT_SYMBOL(dma_fence_lr_begin_signalling);
+
+/**
+ * dma_fence_lr_end_signalling - end a critical DMA fence signalling section
+ * @cookie: opaque cookie from dma_fence_lr_begin_signalling()
+ *
+ * Closes a critical section annotation opened by
+ * dma_fence_lr_begin_signalling().
+ */
+void dma_fence_lr_end_signalling(bool cookie)
+{
+	__dma_fence_end_signalling(cookie, &dma_fence_lr_lockdep_map);
+}
+EXPORT_SYMBOL(dma_fence_lr_end_signalling);
+
+static void ___dma_fence_might_wait(struct lockdep_map *map)
 {
 	bool tmp;
 
-	tmp = lock_is_held_type(&dma_fence_lockdep_map, 1);
+	tmp = lock_is_held_type(map, 1);
 	if (tmp)
-		lock_release(&dma_fence_lockdep_map, _THIS_IP_);
-	lock_map_acquire(&dma_fence_lockdep_map);
-	lock_map_release(&dma_fence_lockdep_map);
+		lock_release(map, _THIS_IP_);
+	lock_map_acquire(map);
+	lock_map_release(map);
 	if (tmp)
-		lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _THIS_IP_);
+		lock_acquire(map, 0, 0, 1, 1, NULL, _THIS_IP_);
+}
+
+void __dma_fence_might_wait(void)
+{
+	___dma_fence_might_wait(&dma_fence_lockdep_map);
 }
+
 #endif
 
 
@@ -506,7 +573,11 @@  dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout)
 
 	might_sleep();
 
-	__dma_fence_might_wait();
+#ifdef CONFIG_LOCKDEP
+	___dma_fence_might_wait(dma_fence_is_lr(fence) ?
+				&dma_fence_lr_lockdep_map :
+				&dma_fence_lockdep_map);
+#endif
 
 	dma_fence_enable_sw_signaling(fence);
 
@@ -618,29 +689,22 @@  void dma_fence_enable_sw_signaling(struct dma_fence *fence)
 EXPORT_SYMBOL(dma_fence_enable_sw_signaling);
 
 /**
- * dma_fence_add_callback - add a callback to be called when the fence
+ * dma_fence_lr_add_callback - add a callback to be called when the fence
  * is signaled
  * @fence: the fence to wait on
  * @cb: the callback to register
  * @func: the function to call
  *
- * Add a software callback to the fence. The caller should keep a reference to
- * the fence.
- *
- * @cb will be initialized by dma_fence_add_callback(), no initialization
- * by the caller is required. Any number of callbacks can be registered
- * to a fence, but a callback can only be registered to one fence at a time.
- *
- * If fence is already signaled, this function will return -ENOENT (and
- * *not* call the callback).
- *
- * Note that the callback can be called from an atomic context or irq context.
+ * This function is identical to dma_fence_add_callback() but allows adding
+ * callbacks also to lr dma-fences. The naming helps annotate the fact that
+ * we're adding a callback to an lr fence and that the callback might therefore
+ * not be called within a reasonable amount of time.
  *
- * Returns 0 in case of success, -ENOENT if the fence is already signaled
+ * Return: 0 in case of success, -ENOENT if the fence is already signaled
  * and -EINVAL in case of error.
  */
-int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
-			   dma_fence_func_t func)
+int dma_fence_lr_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
+			      dma_fence_func_t func)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -667,7 +731,7 @@  int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
 
 	return ret;
 }
-EXPORT_SYMBOL(dma_fence_add_callback);
+EXPORT_SYMBOL(dma_fence_lr_add_callback);
 
 /**
  * dma_fence_get_status - returns the status upon completion
diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 2a594b754af1..fa0210c1442e 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -292,6 +292,7 @@  void dma_resv_add_fence(struct dma_resv *obj, struct dma_fence *fence,
 	 * individually.
 	 */
 	WARN_ON(dma_fence_is_container(fence));
+	WARN_ON_ONCE(dma_fence_is_lr(fence));
 
 	fobj = dma_resv_fences_list(obj);
 	count = fobj->num_fences;
@@ -340,6 +341,7 @@  void dma_resv_replace_fences(struct dma_resv *obj, uint64_t context,
 	unsigned int i;
 
 	dma_resv_assert_held(obj);
+	WARN_ON_ONCE(dma_fence_is_lr(replacement));
 
 	list = dma_resv_fences_list(obj);
 	for (i = 0; list && i < list->num_fences; ++i) {
@@ -764,6 +766,7 @@  static int __init dma_resv_lockdep(void)
 	struct ww_acquire_ctx ctx;
 	struct dma_resv obj;
 	struct address_space mapping;
+	bool lr_cookie;
 	int ret;
 
 	if (!mm)
@@ -772,6 +775,7 @@  static int __init dma_resv_lockdep(void)
 	dma_resv_init(&obj);
 	address_space_init_once(&mapping);
 
+	lr_cookie = dma_fence_lr_begin_signalling();
 	mmap_read_lock(mm);
 	ww_acquire_init(&ctx, &reservation_ww_class);
 	ret = dma_resv_lock(&obj, &ctx);
@@ -792,6 +796,7 @@  static int __init dma_resv_lockdep(void)
 	ww_mutex_unlock(&obj.lock);
 	ww_acquire_fini(&ctx);
 	mmap_read_unlock(mm);
+	dma_fence_lr_end_signalling(lr_cookie);
 
 	mmput(mm);
 
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index d54b595a0fe0..08d21e26782b 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -99,6 +99,7 @@  enum dma_fence_flag_bits {
 	DMA_FENCE_FLAG_SIGNALED_BIT,
 	DMA_FENCE_FLAG_TIMESTAMP_BIT,
 	DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+	DMA_FENCE_FLAG_LR_BIT,
 	DMA_FENCE_FLAG_USER_BITS, /* must always be last member */
 };
 
@@ -279,6 +280,11 @@  struct dma_fence_ops {
 	void (*set_deadline)(struct dma_fence *fence, ktime_t deadline);
 };
 
+static inline bool dma_fence_is_lr(const struct dma_fence *fence)
+{
+	return test_bit(DMA_FENCE_FLAG_LR_BIT, &fence->flags);
+}
+
 void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops,
 		    spinlock_t *lock, u64 context, u64 seqno);
 
@@ -377,13 +383,23 @@  dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep)
 #ifdef CONFIG_LOCKDEP
 bool dma_fence_begin_signalling(void);
 void dma_fence_end_signalling(bool cookie);
+bool dma_fence_lr_begin_signalling(void);
+void dma_fence_lr_end_signalling(bool cookie);
 void __dma_fence_might_wait(void);
 #else
+
 static inline bool dma_fence_begin_signalling(void)
 {
 	return true;
 }
+
 static inline void dma_fence_end_signalling(bool cookie) {}
+static inline bool dma_fence_lr_begin_signalling(void)
+{
+	return true;
+}
+
+static inline void dma_fence_lr_end_signalling(bool cookie) {}
 static inline void __dma_fence_might_wait(void) {}
 #endif
 
@@ -394,9 +410,42 @@  int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
 				      ktime_t timestamp);
 signed long dma_fence_default_wait(struct dma_fence *fence,
 				   bool intr, signed long timeout);
-int dma_fence_add_callback(struct dma_fence *fence,
-			   struct dma_fence_cb *cb,
-			   dma_fence_func_t func);
+
+int dma_fence_lr_add_callback(struct dma_fence *fence,
+			      struct dma_fence_cb *cb,
+			      dma_fence_func_t func);
+
+/**
+ * dma_fence_add_callback - add a callback to be called when the fence
+ * is signaled
+ * @fence: the fence to wait on
+ * @cb: the callback to register
+ * @func: the function to call
+ *
+ * Add a software callback to the fence. The caller should keep a reference to
+ * the fence.
+ *
+ * @cb will be initialized by dma_fence_add_callback(), no initialization
+ * by the caller is required. Any number of callbacks can be registered
+ * to a fence, but a callback can only be registered to one fence at a time.
+ *
+ * If fence is already signaled, this function will return -ENOENT (and
+ * *not* call the callback).
+ *
+ * Note that the callback can be called from an atomic context or irq context.
+ *
+ * Returns 0 in case of success, -ENOENT if the fence is already signaled
+ * and -EINVAL in case of error.
+ */
+static inline int dma_fence_add_callback(struct dma_fence *fence,
+					 struct dma_fence_cb *cb,
+					 dma_fence_func_t func)
+{
+	WARN_ON(IS_ENABLED(CONFIG_LOCKDEP) && dma_fence_is_lr(fence));
+
+	return dma_fence_lr_add_callback(fence, cb, func);
+}
+
 bool dma_fence_remove_callback(struct dma_fence *fence,
 			       struct dma_fence_cb *cb);
 void dma_fence_enable_sw_signaling(struct dma_fence *fence);
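
As a usage illustration of the API added above (a hedged sketch only:
the patch itself does not add an init helper that sets
DMA_FENCE_FLAG_LR_BIT, so flipping the flag by hand after
dma_fence_init() is an assumption, as are the my_* names):

#include <linux/bitops.h>
#include <linux/dma-fence.h>
#include <linux/spinlock.h>

static void my_lr_fence_init(struct dma_fence *f,
			     const struct dma_fence_ops *ops,
			     spinlock_t *lock, u64 context, u64 seqno)
{
	dma_fence_init(f, ops, lock, context, seqno);
	/* Mark as long-running so dma-resv and dma_fence_add_callback()
	 * warn if this fence ever leaks into the cross-driver paths. */
	set_bit(DMA_FENCE_FLAG_LR_BIT, &f->flags);
}

static void my_lr_signal_path(struct dma_fence *f)
{
	bool cookie;

	/* Annotate everything needed to eventually signal the LR fence. */
	cookie = dma_fence_lr_begin_signalling();

	/* ... only take locks that are safe on the LR signalling path ... */
	dma_fence_signal(f);

	dma_fence_lr_end_signalling(cookie);
}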