diff mbox series

[v3,4/6] drm/i915: Request watchdog infrastructure

Message ID 20210322132937.2165901-1-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Tvrtko Ursulin March 22, 2021, 1:29 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Prepares the plumbing for setting request/fence expiration time. All code
is put in place but is never activeted due yet missing ability to actually
configure the timer.

Outline of the basic operation:

A timer is started when request is ready for execution. If the request
completes (retires) before the timer fires, timer is cancelled and nothing
further happens.

If the timer fires request is added to a lockless list and worker queued.
Purpose of this is twofold: a) It allows request cancellation from a more
friendly context and b) coalesces multiple expirations into a single event
of consuming the list.

Worker locklessly consumes the list of expired requests and cancels them
all using previous added i915_request_cancel().

Associated timeout value is stored in rq->context.watchdog.timeout_us.

v2:
 * Log expiration.

v3:
 * Include more information about user timeline in the log message.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++
 .../drm/i915/gt/intel_execlists_submission.h  |  2 +
 drivers/gpu/drm/i915/gt/intel_gt.c            |  3 +
 drivers/gpu/drm/i915/gt/intel_gt.h            |  2 +
 drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 28 ++++++++++
 drivers/gpu/drm/i915/gt/intel_gt_types.h      |  7 +++
 drivers/gpu/drm/i915/i915_request.c           | 56 +++++++++++++++++++
 drivers/gpu/drm/i915/i915_request.h           |  8 +++
 8 files changed, 110 insertions(+)

Comments

Matthew Auld March 23, 2021, 10:54 a.m. UTC | #1
On Mon, 22 Mar 2021 at 13:29, Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
>
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Prepares the plumbing for setting request/fence expiration time. All code
> is put in place but is never activeted due yet missing ability to actually

                 activated

> configure the timer.
>
> Outline of the basic operation:
>
> A timer is started when request is ready for execution. If the request
> completes (retires) before the timer fires, timer is cancelled and nothing
> further happens.
>
> If the timer fires request is added to a lockless list and worker queued.
> Purpose of this is twofold: a) It allows request cancellation from a more
> friendly context and b) coalesces multiple expirations into a single event
> of consuming the list.
>
> Worker locklessly consumes the list of expired requests and cancels them
> all using previous added i915_request_cancel().
>
> Associated timeout value is stored in rq->context.watchdog.timeout_us.
>
> v2:
>  * Log expiration.
>
> v3:
>  * Include more information about user timeline in the log message.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++
>  .../drm/i915/gt/intel_execlists_submission.h  |  2 +
>  drivers/gpu/drm/i915/gt/intel_gt.c            |  3 +
>  drivers/gpu/drm/i915/gt/intel_gt.h            |  2 +
>  drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 28 ++++++++++
>  drivers/gpu/drm/i915/gt/intel_gt_types.h      |  7 +++
>  drivers/gpu/drm/i915/i915_request.c           | 56 +++++++++++++++++++
>  drivers/gpu/drm/i915/i915_request.h           |  8 +++
>  8 files changed, 110 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 0ea18c9e2aca..65a5730a4f5b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -99,6 +99,10 @@ struct intel_context {
>  #define CONTEXT_FORCE_SINGLE_SUBMISSION        7
>  #define CONTEXT_NOPREEMPT              8
>
> +       struct {
> +               u64 timeout_us;
> +       } watchdog;
> +
>         u32 *lrc_reg_state;
>         union {
>                 struct {
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> index f7bd3fccfee8..4ca9b475e252 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> @@ -6,6 +6,7 @@
>  #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
>  #define __INTEL_EXECLISTS_SUBMISSION_H__
>
> +#include <linux/llist.h>
>  #include <linux/types.h>
>
>  struct drm_printer;
> @@ -13,6 +14,7 @@ struct drm_printer;
>  struct i915_request;
>  struct intel_context;
>  struct intel_engine_cs;
> +struct intel_gt;
>
>  enum {
>         INTEL_CONTEXT_SCHEDULE_IN = 0,
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index ca76f93bc03d..8d77dcbad059 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
>         INIT_LIST_HEAD(&gt->closed_vma);
>         spin_lock_init(&gt->closed_lock);
>
> +       init_llist_head(&gt->watchdog.list);
> +       INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
> +
>         intel_gt_init_buffer_pool(gt);
>         intel_gt_init_reset(gt);
>         intel_gt_init_requests(gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index a17bd8b3195f..7ec395cace69 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
>  void intel_gt_info_print(const struct intel_gt_info *info,
>                          struct drm_printer *p);
>
> +void intel_gt_watchdog_work(struct work_struct *work);
> +
>  #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 36ec97f79174..fbfd19b2e5f2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -8,6 +8,7 @@
>  #include "i915_drv.h" /* for_each_engine() */
>  #include "i915_request.h"
>  #include "intel_engine_heartbeat.h"
> +#include "intel_execlists_submission.h"
>  #include "intel_gt.h"
>  #include "intel_gt_pm.h"
>  #include "intel_gt_requests.h"
> @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt)
>  {
>         /* Wait until the work is marked as finished before unloading! */
>         cancel_delayed_work_sync(&gt->requests.retire_work);
> +
> +       flush_work(&gt->watchdog.work);
> +}
> +
> +void intel_gt_watchdog_work(struct work_struct *work)
> +{
> +       struct intel_gt *gt =
> +               container_of(work, typeof(*gt), watchdog.work);
> +       struct i915_request *rq, *rn;
> +       struct llist_node *first;
> +
> +       first = llist_del_all(&gt->watchdog.list);
> +       if (!first)
> +               return;
> +
> +       llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
> +               if (!i915_request_completed(rq)) {
> +                       struct dma_fence *f = &rq->fence;
> +
> +                       pr_notice("Fence expiration time out i915-%s:%s:%llx!\n",
> +                                 f->ops->get_driver_name(f),
> +                                 f->ops->get_timeline_name(f),
> +                                 f->seqno);
> +                       i915_request_cancel(rq, -EINTR);
> +               }
> +               i915_request_put(rq);
> +       }
>  }
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index 626af37c7790..d70ebcc6f19f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -8,10 +8,12 @@
>
>  #include <linux/ktime.h>
>  #include <linux/list.h>
> +#include <linux/llist.h>
>  #include <linux/mutex.h>
>  #include <linux/notifier.h>
>  #include <linux/spinlock.h>
>  #include <linux/types.h>
> +#include <linux/workqueue.h>
>
>  #include "uc/intel_uc.h"
>
> @@ -62,6 +64,11 @@ struct intel_gt {
>                 struct delayed_work retire_work;
>         } requests;
>
> +       struct {
> +               struct llist_head list;
> +               struct work_struct work;
> +       } watchdog;
> +
>         struct intel_wakeref wakeref;
>         atomic_t user_wakeref;
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index b4511ac05e9a..9dd5e588b0a4 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq)
>         __notify_execute_cb_imm(rq);
>  }
>
> +static void __rq_init_watchdog(struct i915_request *rq)
> +{
> +       rq->watchdog.timer.function = NULL;
> +}
> +
> +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer)
> +{
> +       struct i915_request *rq =
> +               container_of(hrtimer, struct i915_request, watchdog.timer);
> +       struct intel_gt *gt = rq->engine->gt;
> +
> +       if (!i915_request_completed(rq)) {
> +               if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
> +                       schedule_work(&gt->watchdog.work);
> +       } else {
> +               i915_request_put(rq);
> +       }
> +
> +       return HRTIMER_NORESTART;
> +}
> +
> +static void __rq_arm_watchdog(struct i915_request *rq)
> +{
> +       struct i915_request_watchdog *wdg = &rq->watchdog;
> +       struct intel_context *ce = rq->context;
> +
> +       if (!ce->watchdog.timeout_us)
> +               return;
> +
> +       hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +       wdg->timer.function = __rq_watchdog_expired;
> +       hrtimer_start_range_ns(&wdg->timer,
> +                              ns_to_ktime(ce->watchdog.timeout_us *
> +                                          NSEC_PER_USEC),
> +                               /*
> +                                * FIXME check if it gives the "not sooner"
> +                                * guarantee or slack is both ways
> +                                */

It looks like the slack/fuzziness just delays the timer, in case it
can coalesce multiple timer events. So shouldn't be sooner I think?

> +                               NSEC_PER_MSEC,

Formatting.

Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Tvrtko Ursulin March 23, 2021, 11:09 a.m. UTC | #2
On 23/03/2021 10:54, Matthew Auld wrote:
> On Mon, 22 Mar 2021 at 13:29, Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
>>
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Prepares the plumbing for setting request/fence expiration time. All code
>> is put in place but is never activeted due yet missing ability to actually
> 
>                   activated
> 
>> configure the timer.
>>
>> Outline of the basic operation:
>>
>> A timer is started when request is ready for execution. If the request
>> completes (retires) before the timer fires, timer is cancelled and nothing
>> further happens.
>>
>> If the timer fires request is added to a lockless list and worker queued.
>> Purpose of this is twofold: a) It allows request cancellation from a more
>> friendly context and b) coalesces multiple expirations into a single event
>> of consuming the list.
>>
>> Worker locklessly consumes the list of expired requests and cancels them
>> all using previous added i915_request_cancel().
>>
>> Associated timeout value is stored in rq->context.watchdog.timeout_us.
>>
>> v2:
>>   * Log expiration.
>>
>> v3:
>>   * Include more information about user timeline in the log message.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>> ---
>>   drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++
>>   .../drm/i915/gt/intel_execlists_submission.h  |  2 +
>>   drivers/gpu/drm/i915/gt/intel_gt.c            |  3 +
>>   drivers/gpu/drm/i915/gt/intel_gt.h            |  2 +
>>   drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 28 ++++++++++
>>   drivers/gpu/drm/i915/gt/intel_gt_types.h      |  7 +++
>>   drivers/gpu/drm/i915/i915_request.c           | 56 +++++++++++++++++++
>>   drivers/gpu/drm/i915/i915_request.h           |  8 +++
>>   8 files changed, 110 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
>> index 0ea18c9e2aca..65a5730a4f5b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>> @@ -99,6 +99,10 @@ struct intel_context {
>>   #define CONTEXT_FORCE_SINGLE_SUBMISSION        7
>>   #define CONTEXT_NOPREEMPT              8
>>
>> +       struct {
>> +               u64 timeout_us;
>> +       } watchdog;
>> +
>>          u32 *lrc_reg_state;
>>          union {
>>                  struct {
>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
>> index f7bd3fccfee8..4ca9b475e252 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
>> @@ -6,6 +6,7 @@
>>   #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
>>   #define __INTEL_EXECLISTS_SUBMISSION_H__
>>
>> +#include <linux/llist.h>
>>   #include <linux/types.h>
>>
>>   struct drm_printer;
>> @@ -13,6 +14,7 @@ struct drm_printer;
>>   struct i915_request;
>>   struct intel_context;
>>   struct intel_engine_cs;
>> +struct intel_gt;
>>
>>   enum {
>>          INTEL_CONTEXT_SCHEDULE_IN = 0,
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
>> index ca76f93bc03d..8d77dcbad059 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
>> @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
>>          INIT_LIST_HEAD(&gt->closed_vma);
>>          spin_lock_init(&gt->closed_lock);
>>
>> +       init_llist_head(&gt->watchdog.list);
>> +       INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
>> +
>>          intel_gt_init_buffer_pool(gt);
>>          intel_gt_init_reset(gt);
>>          intel_gt_init_requests(gt);
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
>> index a17bd8b3195f..7ec395cace69 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
>> @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
>>   void intel_gt_info_print(const struct intel_gt_info *info,
>>                           struct drm_printer *p);
>>
>> +void intel_gt_watchdog_work(struct work_struct *work);
>> +
>>   #endif /* __INTEL_GT_H__ */
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>> index 36ec97f79174..fbfd19b2e5f2 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>> @@ -8,6 +8,7 @@
>>   #include "i915_drv.h" /* for_each_engine() */
>>   #include "i915_request.h"
>>   #include "intel_engine_heartbeat.h"
>> +#include "intel_execlists_submission.h"
>>   #include "intel_gt.h"
>>   #include "intel_gt_pm.h"
>>   #include "intel_gt_requests.h"
>> @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt)
>>   {
>>          /* Wait until the work is marked as finished before unloading! */
>>          cancel_delayed_work_sync(&gt->requests.retire_work);
>> +
>> +       flush_work(&gt->watchdog.work);
>> +}
>> +
>> +void intel_gt_watchdog_work(struct work_struct *work)
>> +{
>> +       struct intel_gt *gt =
>> +               container_of(work, typeof(*gt), watchdog.work);
>> +       struct i915_request *rq, *rn;
>> +       struct llist_node *first;
>> +
>> +       first = llist_del_all(&gt->watchdog.list);
>> +       if (!first)
>> +               return;
>> +
>> +       llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
>> +               if (!i915_request_completed(rq)) {
>> +                       struct dma_fence *f = &rq->fence;
>> +
>> +                       pr_notice("Fence expiration time out i915-%s:%s:%llx!\n",
>> +                                 f->ops->get_driver_name(f),
>> +                                 f->ops->get_timeline_name(f),
>> +                                 f->seqno);
>> +                       i915_request_cancel(rq, -EINTR);
>> +               }
>> +               i915_request_put(rq);
>> +       }
>>   }
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> index 626af37c7790..d70ebcc6f19f 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> @@ -8,10 +8,12 @@
>>
>>   #include <linux/ktime.h>
>>   #include <linux/list.h>
>> +#include <linux/llist.h>
>>   #include <linux/mutex.h>
>>   #include <linux/notifier.h>
>>   #include <linux/spinlock.h>
>>   #include <linux/types.h>
>> +#include <linux/workqueue.h>
>>
>>   #include "uc/intel_uc.h"
>>
>> @@ -62,6 +64,11 @@ struct intel_gt {
>>                  struct delayed_work retire_work;
>>          } requests;
>>
>> +       struct {
>> +               struct llist_head list;
>> +               struct work_struct work;
>> +       } watchdog;
>> +
>>          struct intel_wakeref wakeref;
>>          atomic_t user_wakeref;
>>
>> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
>> index b4511ac05e9a..9dd5e588b0a4 100644
>> --- a/drivers/gpu/drm/i915/i915_request.c
>> +++ b/drivers/gpu/drm/i915/i915_request.c
>> @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq)
>>          __notify_execute_cb_imm(rq);
>>   }
>>
>> +static void __rq_init_watchdog(struct i915_request *rq)
>> +{
>> +       rq->watchdog.timer.function = NULL;
>> +}
>> +
>> +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer)
>> +{
>> +       struct i915_request *rq =
>> +               container_of(hrtimer, struct i915_request, watchdog.timer);
>> +       struct intel_gt *gt = rq->engine->gt;
>> +
>> +       if (!i915_request_completed(rq)) {
>> +               if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
>> +                       schedule_work(&gt->watchdog.work);
>> +       } else {
>> +               i915_request_put(rq);
>> +       }
>> +
>> +       return HRTIMER_NORESTART;
>> +}
>> +
>> +static void __rq_arm_watchdog(struct i915_request *rq)
>> +{
>> +       struct i915_request_watchdog *wdg = &rq->watchdog;
>> +       struct intel_context *ce = rq->context;
>> +
>> +       if (!ce->watchdog.timeout_us)
>> +               return;
>> +
>> +       hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> +       wdg->timer.function = __rq_watchdog_expired;
>> +       hrtimer_start_range_ns(&wdg->timer,
>> +                              ns_to_ktime(ce->watchdog.timeout_us *
>> +                                          NSEC_PER_USEC),
>> +                               /*
>> +                                * FIXME check if it gives the "not sooner"
>> +                                * guarantee or slack is both ways
>> +                                */
> 
> It looks like the slack/fuzziness just delays the timer, in case it
> can coalesce multiple timer events. So shouldn't be sooner I think?

I couldn't quickly figure it out when I looked at the implementation so 
I left this comment. But it was only relevant at a time I thought we 
would be exposing context param to allow userspace control. With the 
only user being default expiry which is not sensitive to precision or 
accuracy, I simply need to remove this comment.

> 
>> +                               NSEC_PER_MSEC,
> 
> Formatting.

Which part? I think indentation/alignment is correct.

> 
> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> 

Thanks,

Tvrtko
Matthew Auld March 23, 2021, 11:40 a.m. UTC | #3
On Tue, 23 Mar 2021 at 11:09, Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
>
>
> On 23/03/2021 10:54, Matthew Auld wrote:
> > On Mon, 22 Mar 2021 at 13:29, Tvrtko Ursulin
> > <tvrtko.ursulin@linux.intel.com> wrote:
> >>
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> Prepares the plumbing for setting request/fence expiration time. All code
> >> is put in place but is never activeted due yet missing ability to actually
> >
> >                   activated
> >
> >> configure the timer.
> >>
> >> Outline of the basic operation:
> >>
> >> A timer is started when request is ready for execution. If the request
> >> completes (retires) before the timer fires, timer is cancelled and nothing
> >> further happens.
> >>
> >> If the timer fires request is added to a lockless list and worker queued.
> >> Purpose of this is twofold: a) It allows request cancellation from a more
> >> friendly context and b) coalesces multiple expirations into a single event
> >> of consuming the list.
> >>
> >> Worker locklessly consumes the list of expired requests and cancels them
> >> all using previous added i915_request_cancel().
> >>
> >> Associated timeout value is stored in rq->context.watchdog.timeout_us.
> >>
> >> v2:
> >>   * Log expiration.
> >>
> >> v3:
> >>   * Include more information about user timeline in the log message.
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> >> ---
> >>   drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++
> >>   .../drm/i915/gt/intel_execlists_submission.h  |  2 +
> >>   drivers/gpu/drm/i915/gt/intel_gt.c            |  3 +
> >>   drivers/gpu/drm/i915/gt/intel_gt.h            |  2 +
> >>   drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 28 ++++++++++
> >>   drivers/gpu/drm/i915/gt/intel_gt_types.h      |  7 +++
> >>   drivers/gpu/drm/i915/i915_request.c           | 56 +++++++++++++++++++
> >>   drivers/gpu/drm/i915/i915_request.h           |  8 +++
> >>   8 files changed, 110 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> index 0ea18c9e2aca..65a5730a4f5b 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> @@ -99,6 +99,10 @@ struct intel_context {
> >>   #define CONTEXT_FORCE_SINGLE_SUBMISSION        7
> >>   #define CONTEXT_NOPREEMPT              8
> >>
> >> +       struct {
> >> +               u64 timeout_us;
> >> +       } watchdog;
> >> +
> >>          u32 *lrc_reg_state;
> >>          union {
> >>                  struct {
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> >> index f7bd3fccfee8..4ca9b475e252 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> >> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> >> @@ -6,6 +6,7 @@
> >>   #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
> >>   #define __INTEL_EXECLISTS_SUBMISSION_H__
> >>
> >> +#include <linux/llist.h>
> >>   #include <linux/types.h>
> >>
> >>   struct drm_printer;
> >> @@ -13,6 +14,7 @@ struct drm_printer;
> >>   struct i915_request;
> >>   struct intel_context;
> >>   struct intel_engine_cs;
> >> +struct intel_gt;
> >>
> >>   enum {
> >>          INTEL_CONTEXT_SCHEDULE_IN = 0,
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> >> index ca76f93bc03d..8d77dcbad059 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> >> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> >> @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
> >>          INIT_LIST_HEAD(&gt->closed_vma);
> >>          spin_lock_init(&gt->closed_lock);
> >>
> >> +       init_llist_head(&gt->watchdog.list);
> >> +       INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
> >> +
> >>          intel_gt_init_buffer_pool(gt);
> >>          intel_gt_init_reset(gt);
> >>          intel_gt_init_requests(gt);
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> >> index a17bd8b3195f..7ec395cace69 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> >> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> >> @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
> >>   void intel_gt_info_print(const struct intel_gt_info *info,
> >>                           struct drm_printer *p);
> >>
> >> +void intel_gt_watchdog_work(struct work_struct *work);
> >> +
> >>   #endif /* __INTEL_GT_H__ */
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> >> index 36ec97f79174..fbfd19b2e5f2 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> >> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> >> @@ -8,6 +8,7 @@
> >>   #include "i915_drv.h" /* for_each_engine() */
> >>   #include "i915_request.h"
> >>   #include "intel_engine_heartbeat.h"
> >> +#include "intel_execlists_submission.h"
> >>   #include "intel_gt.h"
> >>   #include "intel_gt_pm.h"
> >>   #include "intel_gt_requests.h"
> >> @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt)
> >>   {
> >>          /* Wait until the work is marked as finished before unloading! */
> >>          cancel_delayed_work_sync(&gt->requests.retire_work);
> >> +
> >> +       flush_work(&gt->watchdog.work);
> >> +}
> >> +
> >> +void intel_gt_watchdog_work(struct work_struct *work)
> >> +{
> >> +       struct intel_gt *gt =
> >> +               container_of(work, typeof(*gt), watchdog.work);
> >> +       struct i915_request *rq, *rn;
> >> +       struct llist_node *first;
> >> +
> >> +       first = llist_del_all(&gt->watchdog.list);
> >> +       if (!first)
> >> +               return;
> >> +
> >> +       llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
> >> +               if (!i915_request_completed(rq)) {
> >> +                       struct dma_fence *f = &rq->fence;
> >> +
> >> +                       pr_notice("Fence expiration time out i915-%s:%s:%llx!\n",
> >> +                                 f->ops->get_driver_name(f),
> >> +                                 f->ops->get_timeline_name(f),
> >> +                                 f->seqno);
> >> +                       i915_request_cancel(rq, -EINTR);
> >> +               }
> >> +               i915_request_put(rq);
> >> +       }
> >>   }
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> >> index 626af37c7790..d70ebcc6f19f 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> >> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> >> @@ -8,10 +8,12 @@
> >>
> >>   #include <linux/ktime.h>
> >>   #include <linux/list.h>
> >> +#include <linux/llist.h>
> >>   #include <linux/mutex.h>
> >>   #include <linux/notifier.h>
> >>   #include <linux/spinlock.h>
> >>   #include <linux/types.h>
> >> +#include <linux/workqueue.h>
> >>
> >>   #include "uc/intel_uc.h"
> >>
> >> @@ -62,6 +64,11 @@ struct intel_gt {
> >>                  struct delayed_work retire_work;
> >>          } requests;
> >>
> >> +       struct {
> >> +               struct llist_head list;
> >> +               struct work_struct work;
> >> +       } watchdog;
> >> +
> >>          struct intel_wakeref wakeref;
> >>          atomic_t user_wakeref;
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> >> index b4511ac05e9a..9dd5e588b0a4 100644
> >> --- a/drivers/gpu/drm/i915/i915_request.c
> >> +++ b/drivers/gpu/drm/i915/i915_request.c
> >> @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq)
> >>          __notify_execute_cb_imm(rq);
> >>   }
> >>
> >> +static void __rq_init_watchdog(struct i915_request *rq)
> >> +{
> >> +       rq->watchdog.timer.function = NULL;
> >> +}
> >> +
> >> +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer)
> >> +{
> >> +       struct i915_request *rq =
> >> +               container_of(hrtimer, struct i915_request, watchdog.timer);
> >> +       struct intel_gt *gt = rq->engine->gt;
> >> +
> >> +       if (!i915_request_completed(rq)) {
> >> +               if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
> >> +                       schedule_work(&gt->watchdog.work);
> >> +       } else {
> >> +               i915_request_put(rq);
> >> +       }
> >> +
> >> +       return HRTIMER_NORESTART;
> >> +}
> >> +
> >> +static void __rq_arm_watchdog(struct i915_request *rq)
> >> +{
> >> +       struct i915_request_watchdog *wdg = &rq->watchdog;
> >> +       struct intel_context *ce = rq->context;
> >> +
> >> +       if (!ce->watchdog.timeout_us)
> >> +               return;
> >> +
> >> +       hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> >> +       wdg->timer.function = __rq_watchdog_expired;
> >> +       hrtimer_start_range_ns(&wdg->timer,
> >> +                              ns_to_ktime(ce->watchdog.timeout_us *
> >> +                                          NSEC_PER_USEC),
> >> +                               /*
> >> +                                * FIXME check if it gives the "not sooner"
> >> +                                * guarantee or slack is both ways
> >> +                                */
> >
> > It looks like the slack/fuzziness just delays the timer, in case it
> > can coalesce multiple timer events. So shouldn't be sooner I think?
>
> I couldn't quickly figure it out when I looked at the implementation so
> I left this comment. But it was only relevant at a time I thought we
> would be exposing context param to allow userspace control. With the
> only user being default expiry which is not sensitive to precision or
> accuracy, I simply need to remove this comment.
>
> >
> >> +                               NSEC_PER_MSEC,
> >
> > Formatting.
>
> Which part? I think indentation/alignment is correct.

@@ -311,11 +311,11 @@ static void __rq_arm_watchdog(struct i915_request *rq)
        hrtimer_start_range_ns(&wdg->timer,
                               ns_to_ktime(ce->watchdog.timeout_us *
                                           NSEC_PER_USEC),
-                               /*
-                                * FIXME check if it gives the "not sooner"
-                                * guarantee or slack is both ways
-                                */
-                               NSEC_PER_MSEC,
+                              /*
+                               * FIXME check if it gives the "not sooner"
+                               * guarantee or slack is both ways
+                               */
+                              NSEC_PER_MSEC,
                               HRTIMER_MODE_REL);
        i915_request_get(rq);
 }


>
> >
> > Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> >
>
> Thanks,
>
> Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 0ea18c9e2aca..65a5730a4f5b 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -99,6 +99,10 @@  struct intel_context {
 #define CONTEXT_FORCE_SINGLE_SUBMISSION	7
 #define CONTEXT_NOPREEMPT		8
 
+	struct {
+		u64 timeout_us;
+	} watchdog;
+
 	u32 *lrc_reg_state;
 	union {
 		struct {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
index f7bd3fccfee8..4ca9b475e252 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
@@ -6,6 +6,7 @@ 
 #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
 #define __INTEL_EXECLISTS_SUBMISSION_H__
 
+#include <linux/llist.h>
 #include <linux/types.h>
 
 struct drm_printer;
@@ -13,6 +14,7 @@  struct drm_printer;
 struct i915_request;
 struct intel_context;
 struct intel_engine_cs;
+struct intel_gt;
 
 enum {
 	INTEL_CONTEXT_SCHEDULE_IN = 0,
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index ca76f93bc03d..8d77dcbad059 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -31,6 +31,9 @@  void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
 	INIT_LIST_HEAD(&gt->closed_vma);
 	spin_lock_init(&gt->closed_lock);
 
+	init_llist_head(&gt->watchdog.list);
+	INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
+
 	intel_gt_init_buffer_pool(gt);
 	intel_gt_init_reset(gt);
 	intel_gt_init_requests(gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
index a17bd8b3195f..7ec395cace69 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h
@@ -78,4 +78,6 @@  static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
 void intel_gt_info_print(const struct intel_gt_info *info,
 			 struct drm_printer *p);
 
+void intel_gt_watchdog_work(struct work_struct *work);
+
 #endif /* __INTEL_GT_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 36ec97f79174..fbfd19b2e5f2 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -8,6 +8,7 @@ 
 #include "i915_drv.h" /* for_each_engine() */
 #include "i915_request.h"
 #include "intel_engine_heartbeat.h"
+#include "intel_execlists_submission.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
 #include "intel_gt_requests.h"
@@ -242,4 +243,31 @@  void intel_gt_fini_requests(struct intel_gt *gt)
 {
 	/* Wait until the work is marked as finished before unloading! */
 	cancel_delayed_work_sync(&gt->requests.retire_work);
+
+	flush_work(&gt->watchdog.work);
+}
+
+void intel_gt_watchdog_work(struct work_struct *work)
+{
+	struct intel_gt *gt =
+		container_of(work, typeof(*gt), watchdog.work);
+	struct i915_request *rq, *rn;
+	struct llist_node *first;
+
+	first = llist_del_all(&gt->watchdog.list);
+	if (!first)
+		return;
+
+	llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
+		if (!i915_request_completed(rq)) {
+			struct dma_fence *f = &rq->fence;
+
+			pr_notice("Fence expiration time out i915-%s:%s:%llx!\n",
+				  f->ops->get_driver_name(f),
+				  f->ops->get_timeline_name(f),
+				  f->seqno);
+			i915_request_cancel(rq, -EINTR);
+		}
+		i915_request_put(rq);
+	}
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index 626af37c7790..d70ebcc6f19f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -8,10 +8,12 @@ 
 
 #include <linux/ktime.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/workqueue.h>
 
 #include "uc/intel_uc.h"
 
@@ -62,6 +64,11 @@  struct intel_gt {
 		struct delayed_work retire_work;
 	} requests;
 
+	struct {
+		struct llist_head list;
+		struct work_struct work;
+	} watchdog;
+
 	struct intel_wakeref wakeref;
 	atomic_t user_wakeref;
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index b4511ac05e9a..9dd5e588b0a4 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -277,6 +277,57 @@  static void remove_from_engine(struct i915_request *rq)
 	__notify_execute_cb_imm(rq);
 }
 
+static void __rq_init_watchdog(struct i915_request *rq)
+{
+	rq->watchdog.timer.function = NULL;
+}
+
+static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer)
+{
+	struct i915_request *rq =
+		container_of(hrtimer, struct i915_request, watchdog.timer);
+	struct intel_gt *gt = rq->engine->gt;
+
+	if (!i915_request_completed(rq)) {
+		if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
+			schedule_work(&gt->watchdog.work);
+	} else {
+		i915_request_put(rq);
+	}
+
+	return HRTIMER_NORESTART;
+}
+
+static void __rq_arm_watchdog(struct i915_request *rq)
+{
+	struct i915_request_watchdog *wdg = &rq->watchdog;
+	struct intel_context *ce = rq->context;
+
+	if (!ce->watchdog.timeout_us)
+		return;
+
+	hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	wdg->timer.function = __rq_watchdog_expired;
+	hrtimer_start_range_ns(&wdg->timer,
+			       ns_to_ktime(ce->watchdog.timeout_us *
+					   NSEC_PER_USEC),
+				/*
+				 * FIXME check if it gives the "not sooner"
+				 * guarantee or slack is both ways
+				 */
+				NSEC_PER_MSEC,
+			       HRTIMER_MODE_REL);
+	i915_request_get(rq);
+}
+
+static void __rq_cancel_watchdog(struct i915_request *rq)
+{
+	struct i915_request_watchdog *wdg = &rq->watchdog;
+
+	if (wdg->timer.function && hrtimer_try_to_cancel(&wdg->timer) > 0)
+		i915_request_put(rq);
+}
+
 bool i915_request_retire(struct i915_request *rq)
 {
 	if (!__i915_request_is_complete(rq))
@@ -288,6 +339,8 @@  bool i915_request_retire(struct i915_request *rq)
 	trace_i915_request_retire(rq);
 	i915_request_mark_complete(rq);
 
+	__rq_cancel_watchdog(rq);
+
 	/*
 	 * We know the GPU must have read the request to have
 	 * sent us the seqno + interrupt, so use the position
@@ -667,6 +720,8 @@  submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 
 		if (unlikely(fence->error))
 			i915_request_set_error_once(request, fence->error);
+		else
+			__rq_arm_watchdog(request);
 
 		/*
 		 * We need to serialize use of the submit_request() callback
@@ -854,6 +909,7 @@  __i915_request_create(struct intel_context *ce, gfp_t gfp)
 
 	/* No zalloc, everything must be cleared after use */
 	rq->batch = NULL;
+	__rq_init_watchdog(rq);
 	GEM_BUG_ON(rq->capture_list);
 	GEM_BUG_ON(!llist_empty(&rq->execute_cb));
 
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 64869a313b3e..294f16e2163d 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -26,7 +26,9 @@ 
 #define I915_REQUEST_H
 
 #include <linux/dma-fence.h>
+#include <linux/hrtimer.h>
 #include <linux/irq_work.h>
+#include <linux/llist.h>
 #include <linux/lockdep.h>
 
 #include "gem/i915_gem_context_types.h"
@@ -289,6 +291,12 @@  struct i915_request {
 	/** timeline->request entry for this request */
 	struct list_head link;
 
+	/** Watchdog support fields. */
+	struct i915_request_watchdog {
+		struct llist_node link;
+		struct hrtimer timer;
+	} watchdog;
+
 	I915_SELFTEST_DECLARE(struct {
 		struct list_head link;
 		unsigned long delay;