Message ID | 20210322132937.2165901-1-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | None | expand |
On Mon, 22 Mar 2021 at 13:29, Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote: > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > Prepares the plumbing for setting request/fence expiration time. All code > is put in place but is never activeted due yet missing ability to actually activated > configure the timer. > > Outline of the basic operation: > > A timer is started when request is ready for execution. If the request > completes (retires) before the timer fires, timer is cancelled and nothing > further happens. > > If the timer fires request is added to a lockless list and worker queued. > Purpose of this is twofold: a) It allows request cancellation from a more > friendly context and b) coalesces multiple expirations into a single event > of consuming the list. > > Worker locklessly consumes the list of expired requests and cancels them > all using previous added i915_request_cancel(). > > Associated timeout value is stored in rq->context.watchdog.timeout_us. > > v2: > * Log expiration. > > v3: > * Include more information about user timeline in the log message. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > --- > drivers/gpu/drm/i915/gt/intel_context_types.h | 4 ++ > .../drm/i915/gt/intel_execlists_submission.h | 2 + > drivers/gpu/drm/i915/gt/intel_gt.c | 3 + > drivers/gpu/drm/i915/gt/intel_gt.h | 2 + > drivers/gpu/drm/i915/gt/intel_gt_requests.c | 28 ++++++++++ > drivers/gpu/drm/i915/gt/intel_gt_types.h | 7 +++ > drivers/gpu/drm/i915/i915_request.c | 56 +++++++++++++++++++ > drivers/gpu/drm/i915/i915_request.h | 8 +++ > 8 files changed, 110 insertions(+) > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h > index 0ea18c9e2aca..65a5730a4f5b 100644 > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h > @@ -99,6 +99,10 @@ struct intel_context { > #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 > #define CONTEXT_NOPREEMPT 8 > > + struct { > + u64 timeout_us; > + } watchdog; > + > u32 *lrc_reg_state; > union { > struct { > diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h > index f7bd3fccfee8..4ca9b475e252 100644 > --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h > +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h > @@ -6,6 +6,7 @@ > #ifndef __INTEL_EXECLISTS_SUBMISSION_H__ > #define __INTEL_EXECLISTS_SUBMISSION_H__ > > +#include <linux/llist.h> > #include <linux/types.h> > > struct drm_printer; > @@ -13,6 +14,7 @@ struct drm_printer; > struct i915_request; > struct intel_context; > struct intel_engine_cs; > +struct intel_gt; > > enum { > INTEL_CONTEXT_SCHEDULE_IN = 0, > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c > index ca76f93bc03d..8d77dcbad059 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c > @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915) > INIT_LIST_HEAD(>->closed_vma); > spin_lock_init(>->closed_lock); > > + init_llist_head(>->watchdog.list); > + INIT_WORK(>->watchdog.work, intel_gt_watchdog_work); > + > intel_gt_init_buffer_pool(gt); > intel_gt_init_reset(gt); > intel_gt_init_requests(gt); > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h > index a17bd8b3195f..7ec395cace69 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt.h > @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt) > void intel_gt_info_print(const struct intel_gt_info *info, > struct drm_printer *p); > > +void intel_gt_watchdog_work(struct work_struct *work); > + > #endif /* __INTEL_GT_H__ */ > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c > index 36ec97f79174..fbfd19b2e5f2 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c > @@ -8,6 +8,7 @@ > #include "i915_drv.h" /* for_each_engine() */ > #include "i915_request.h" > #include "intel_engine_heartbeat.h" > +#include "intel_execlists_submission.h" > #include "intel_gt.h" > #include "intel_gt_pm.h" > #include "intel_gt_requests.h" > @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt) > { > /* Wait until the work is marked as finished before unloading! */ > cancel_delayed_work_sync(>->requests.retire_work); > + > + flush_work(>->watchdog.work); > +} > + > +void intel_gt_watchdog_work(struct work_struct *work) > +{ > + struct intel_gt *gt = > + container_of(work, typeof(*gt), watchdog.work); > + struct i915_request *rq, *rn; > + struct llist_node *first; > + > + first = llist_del_all(>->watchdog.list); > + if (!first) > + return; > + > + llist_for_each_entry_safe(rq, rn, first, watchdog.link) { > + if (!i915_request_completed(rq)) { > + struct dma_fence *f = &rq->fence; > + > + pr_notice("Fence expiration time out i915-%s:%s:%llx!\n", > + f->ops->get_driver_name(f), > + f->ops->get_timeline_name(f), > + f->seqno); > + i915_request_cancel(rq, -EINTR); > + } > + i915_request_put(rq); > + } > } > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h > index 626af37c7790..d70ebcc6f19f 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h > @@ -8,10 +8,12 @@ > > #include <linux/ktime.h> > #include <linux/list.h> > +#include <linux/llist.h> > #include <linux/mutex.h> > #include <linux/notifier.h> > #include <linux/spinlock.h> > #include <linux/types.h> > +#include <linux/workqueue.h> > > #include "uc/intel_uc.h" > > @@ -62,6 +64,11 @@ struct intel_gt { > struct delayed_work retire_work; > } requests; > > + struct { > + struct llist_head list; > + struct work_struct work; > + } watchdog; > + > struct intel_wakeref wakeref; > atomic_t user_wakeref; > > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > index b4511ac05e9a..9dd5e588b0a4 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq) > __notify_execute_cb_imm(rq); > } > > +static void __rq_init_watchdog(struct i915_request *rq) > +{ > + rq->watchdog.timer.function = NULL; > +} > + > +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer) > +{ > + struct i915_request *rq = > + container_of(hrtimer, struct i915_request, watchdog.timer); > + struct intel_gt *gt = rq->engine->gt; > + > + if (!i915_request_completed(rq)) { > + if (llist_add(&rq->watchdog.link, >->watchdog.list)) > + schedule_work(>->watchdog.work); > + } else { > + i915_request_put(rq); > + } > + > + return HRTIMER_NORESTART; > +} > + > +static void __rq_arm_watchdog(struct i915_request *rq) > +{ > + struct i915_request_watchdog *wdg = &rq->watchdog; > + struct intel_context *ce = rq->context; > + > + if (!ce->watchdog.timeout_us) > + return; > + > + hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > + wdg->timer.function = __rq_watchdog_expired; > + hrtimer_start_range_ns(&wdg->timer, > + ns_to_ktime(ce->watchdog.timeout_us * > + NSEC_PER_USEC), > + /* > + * FIXME check if it gives the "not sooner" > + * guarantee or slack is both ways > + */ It looks like the slack/fuzziness just delays the timer, in case it can coalesce multiple timer events. So shouldn't be sooner I think? > + NSEC_PER_MSEC, Formatting. Reviewed-by: Matthew Auld <matthew.auld@intel.com>
On 23/03/2021 10:54, Matthew Auld wrote: > On Mon, 22 Mar 2021 at 13:29, Tvrtko Ursulin > <tvrtko.ursulin@linux.intel.com> wrote: >> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> >> Prepares the plumbing for setting request/fence expiration time. All code >> is put in place but is never activeted due yet missing ability to actually > > activated > >> configure the timer. >> >> Outline of the basic operation: >> >> A timer is started when request is ready for execution. If the request >> completes (retires) before the timer fires, timer is cancelled and nothing >> further happens. >> >> If the timer fires request is added to a lockless list and worker queued. >> Purpose of this is twofold: a) It allows request cancellation from a more >> friendly context and b) coalesces multiple expirations into a single event >> of consuming the list. >> >> Worker locklessly consumes the list of expired requests and cancels them >> all using previous added i915_request_cancel(). >> >> Associated timeout value is stored in rq->context.watchdog.timeout_us. >> >> v2: >> * Log expiration. >> >> v3: >> * Include more information about user timeline in the log message. >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> >> --- >> drivers/gpu/drm/i915/gt/intel_context_types.h | 4 ++ >> .../drm/i915/gt/intel_execlists_submission.h | 2 + >> drivers/gpu/drm/i915/gt/intel_gt.c | 3 + >> drivers/gpu/drm/i915/gt/intel_gt.h | 2 + >> drivers/gpu/drm/i915/gt/intel_gt_requests.c | 28 ++++++++++ >> drivers/gpu/drm/i915/gt/intel_gt_types.h | 7 +++ >> drivers/gpu/drm/i915/i915_request.c | 56 +++++++++++++++++++ >> drivers/gpu/drm/i915/i915_request.h | 8 +++ >> 8 files changed, 110 insertions(+) >> >> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h >> index 0ea18c9e2aca..65a5730a4f5b 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h >> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h >> @@ -99,6 +99,10 @@ struct intel_context { >> #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 >> #define CONTEXT_NOPREEMPT 8 >> >> + struct { >> + u64 timeout_us; >> + } watchdog; >> + >> u32 *lrc_reg_state; >> union { >> struct { >> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h >> index f7bd3fccfee8..4ca9b475e252 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h >> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h >> @@ -6,6 +6,7 @@ >> #ifndef __INTEL_EXECLISTS_SUBMISSION_H__ >> #define __INTEL_EXECLISTS_SUBMISSION_H__ >> >> +#include <linux/llist.h> >> #include <linux/types.h> >> >> struct drm_printer; >> @@ -13,6 +14,7 @@ struct drm_printer; >> struct i915_request; >> struct intel_context; >> struct intel_engine_cs; >> +struct intel_gt; >> >> enum { >> INTEL_CONTEXT_SCHEDULE_IN = 0, >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c >> index ca76f93bc03d..8d77dcbad059 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_gt.c >> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c >> @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915) >> INIT_LIST_HEAD(>->closed_vma); >> spin_lock_init(>->closed_lock); >> >> + init_llist_head(>->watchdog.list); >> + INIT_WORK(>->watchdog.work, intel_gt_watchdog_work); >> + >> intel_gt_init_buffer_pool(gt); >> intel_gt_init_reset(gt); >> intel_gt_init_requests(gt); >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h >> index a17bd8b3195f..7ec395cace69 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_gt.h >> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h >> @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt) >> void intel_gt_info_print(const struct intel_gt_info *info, >> struct drm_printer *p); >> >> +void intel_gt_watchdog_work(struct work_struct *work); >> + >> #endif /* __INTEL_GT_H__ */ >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c >> index 36ec97f79174..fbfd19b2e5f2 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c >> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c >> @@ -8,6 +8,7 @@ >> #include "i915_drv.h" /* for_each_engine() */ >> #include "i915_request.h" >> #include "intel_engine_heartbeat.h" >> +#include "intel_execlists_submission.h" >> #include "intel_gt.h" >> #include "intel_gt_pm.h" >> #include "intel_gt_requests.h" >> @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt) >> { >> /* Wait until the work is marked as finished before unloading! */ >> cancel_delayed_work_sync(>->requests.retire_work); >> + >> + flush_work(>->watchdog.work); >> +} >> + >> +void intel_gt_watchdog_work(struct work_struct *work) >> +{ >> + struct intel_gt *gt = >> + container_of(work, typeof(*gt), watchdog.work); >> + struct i915_request *rq, *rn; >> + struct llist_node *first; >> + >> + first = llist_del_all(>->watchdog.list); >> + if (!first) >> + return; >> + >> + llist_for_each_entry_safe(rq, rn, first, watchdog.link) { >> + if (!i915_request_completed(rq)) { >> + struct dma_fence *f = &rq->fence; >> + >> + pr_notice("Fence expiration time out i915-%s:%s:%llx!\n", >> + f->ops->get_driver_name(f), >> + f->ops->get_timeline_name(f), >> + f->seqno); >> + i915_request_cancel(rq, -EINTR); >> + } >> + i915_request_put(rq); >> + } >> } >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h >> index 626af37c7790..d70ebcc6f19f 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h >> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h >> @@ -8,10 +8,12 @@ >> >> #include <linux/ktime.h> >> #include <linux/list.h> >> +#include <linux/llist.h> >> #include <linux/mutex.h> >> #include <linux/notifier.h> >> #include <linux/spinlock.h> >> #include <linux/types.h> >> +#include <linux/workqueue.h> >> >> #include "uc/intel_uc.h" >> >> @@ -62,6 +64,11 @@ struct intel_gt { >> struct delayed_work retire_work; >> } requests; >> >> + struct { >> + struct llist_head list; >> + struct work_struct work; >> + } watchdog; >> + >> struct intel_wakeref wakeref; >> atomic_t user_wakeref; >> >> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c >> index b4511ac05e9a..9dd5e588b0a4 100644 >> --- a/drivers/gpu/drm/i915/i915_request.c >> +++ b/drivers/gpu/drm/i915/i915_request.c >> @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq) >> __notify_execute_cb_imm(rq); >> } >> >> +static void __rq_init_watchdog(struct i915_request *rq) >> +{ >> + rq->watchdog.timer.function = NULL; >> +} >> + >> +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer) >> +{ >> + struct i915_request *rq = >> + container_of(hrtimer, struct i915_request, watchdog.timer); >> + struct intel_gt *gt = rq->engine->gt; >> + >> + if (!i915_request_completed(rq)) { >> + if (llist_add(&rq->watchdog.link, >->watchdog.list)) >> + schedule_work(>->watchdog.work); >> + } else { >> + i915_request_put(rq); >> + } >> + >> + return HRTIMER_NORESTART; >> +} >> + >> +static void __rq_arm_watchdog(struct i915_request *rq) >> +{ >> + struct i915_request_watchdog *wdg = &rq->watchdog; >> + struct intel_context *ce = rq->context; >> + >> + if (!ce->watchdog.timeout_us) >> + return; >> + >> + hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); >> + wdg->timer.function = __rq_watchdog_expired; >> + hrtimer_start_range_ns(&wdg->timer, >> + ns_to_ktime(ce->watchdog.timeout_us * >> + NSEC_PER_USEC), >> + /* >> + * FIXME check if it gives the "not sooner" >> + * guarantee or slack is both ways >> + */ > > It looks like the slack/fuzziness just delays the timer, in case it > can coalesce multiple timer events. So shouldn't be sooner I think? I couldn't quickly figure it out when I looked at the implementation so I left this comment. But it was only relevant at a time I thought we would be exposing context param to allow userspace control. With the only user being default expiry which is not sensitive to precision or accuracy, I simply need to remove this comment. > >> + NSEC_PER_MSEC, > > Formatting. Which part? I think indentation/alignment is correct. > > Reviewed-by: Matthew Auld <matthew.auld@intel.com> > Thanks, Tvrtko
On Tue, 23 Mar 2021 at 11:09, Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote: > > > On 23/03/2021 10:54, Matthew Auld wrote: > > On Mon, 22 Mar 2021 at 13:29, Tvrtko Ursulin > > <tvrtko.ursulin@linux.intel.com> wrote: > >> > >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> > >> Prepares the plumbing for setting request/fence expiration time. All code > >> is put in place but is never activeted due yet missing ability to actually > > > > activated > > > >> configure the timer. > >> > >> Outline of the basic operation: > >> > >> A timer is started when request is ready for execution. If the request > >> completes (retires) before the timer fires, timer is cancelled and nothing > >> further happens. > >> > >> If the timer fires request is added to a lockless list and worker queued. > >> Purpose of this is twofold: a) It allows request cancellation from a more > >> friendly context and b) coalesces multiple expirations into a single event > >> of consuming the list. > >> > >> Worker locklessly consumes the list of expired requests and cancels them > >> all using previous added i915_request_cancel(). > >> > >> Associated timeout value is stored in rq->context.watchdog.timeout_us. > >> > >> v2: > >> * Log expiration. > >> > >> v3: > >> * Include more information about user timeline in the log message. > >> > >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > >> --- > >> drivers/gpu/drm/i915/gt/intel_context_types.h | 4 ++ > >> .../drm/i915/gt/intel_execlists_submission.h | 2 + > >> drivers/gpu/drm/i915/gt/intel_gt.c | 3 + > >> drivers/gpu/drm/i915/gt/intel_gt.h | 2 + > >> drivers/gpu/drm/i915/gt/intel_gt_requests.c | 28 ++++++++++ > >> drivers/gpu/drm/i915/gt/intel_gt_types.h | 7 +++ > >> drivers/gpu/drm/i915/i915_request.c | 56 +++++++++++++++++++ > >> drivers/gpu/drm/i915/i915_request.h | 8 +++ > >> 8 files changed, 110 insertions(+) > >> > >> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h > >> index 0ea18c9e2aca..65a5730a4f5b 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h > >> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h > >> @@ -99,6 +99,10 @@ struct intel_context { > >> #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 > >> #define CONTEXT_NOPREEMPT 8 > >> > >> + struct { > >> + u64 timeout_us; > >> + } watchdog; > >> + > >> u32 *lrc_reg_state; > >> union { > >> struct { > >> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h > >> index f7bd3fccfee8..4ca9b475e252 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h > >> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h > >> @@ -6,6 +6,7 @@ > >> #ifndef __INTEL_EXECLISTS_SUBMISSION_H__ > >> #define __INTEL_EXECLISTS_SUBMISSION_H__ > >> > >> +#include <linux/llist.h> > >> #include <linux/types.h> > >> > >> struct drm_printer; > >> @@ -13,6 +14,7 @@ struct drm_printer; > >> struct i915_request; > >> struct intel_context; > >> struct intel_engine_cs; > >> +struct intel_gt; > >> > >> enum { > >> INTEL_CONTEXT_SCHEDULE_IN = 0, > >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c > >> index ca76f93bc03d..8d77dcbad059 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_gt.c > >> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c > >> @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915) > >> INIT_LIST_HEAD(>->closed_vma); > >> spin_lock_init(>->closed_lock); > >> > >> + init_llist_head(>->watchdog.list); > >> + INIT_WORK(>->watchdog.work, intel_gt_watchdog_work); > >> + > >> intel_gt_init_buffer_pool(gt); > >> intel_gt_init_reset(gt); > >> intel_gt_init_requests(gt); > >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h > >> index a17bd8b3195f..7ec395cace69 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_gt.h > >> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h > >> @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt) > >> void intel_gt_info_print(const struct intel_gt_info *info, > >> struct drm_printer *p); > >> > >> +void intel_gt_watchdog_work(struct work_struct *work); > >> + > >> #endif /* __INTEL_GT_H__ */ > >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c > >> index 36ec97f79174..fbfd19b2e5f2 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c > >> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c > >> @@ -8,6 +8,7 @@ > >> #include "i915_drv.h" /* for_each_engine() */ > >> #include "i915_request.h" > >> #include "intel_engine_heartbeat.h" > >> +#include "intel_execlists_submission.h" > >> #include "intel_gt.h" > >> #include "intel_gt_pm.h" > >> #include "intel_gt_requests.h" > >> @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt) > >> { > >> /* Wait until the work is marked as finished before unloading! */ > >> cancel_delayed_work_sync(>->requests.retire_work); > >> + > >> + flush_work(>->watchdog.work); > >> +} > >> + > >> +void intel_gt_watchdog_work(struct work_struct *work) > >> +{ > >> + struct intel_gt *gt = > >> + container_of(work, typeof(*gt), watchdog.work); > >> + struct i915_request *rq, *rn; > >> + struct llist_node *first; > >> + > >> + first = llist_del_all(>->watchdog.list); > >> + if (!first) > >> + return; > >> + > >> + llist_for_each_entry_safe(rq, rn, first, watchdog.link) { > >> + if (!i915_request_completed(rq)) { > >> + struct dma_fence *f = &rq->fence; > >> + > >> + pr_notice("Fence expiration time out i915-%s:%s:%llx!\n", > >> + f->ops->get_driver_name(f), > >> + f->ops->get_timeline_name(f), > >> + f->seqno); > >> + i915_request_cancel(rq, -EINTR); > >> + } > >> + i915_request_put(rq); > >> + } > >> } > >> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h > >> index 626af37c7790..d70ebcc6f19f 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h > >> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h > >> @@ -8,10 +8,12 @@ > >> > >> #include <linux/ktime.h> > >> #include <linux/list.h> > >> +#include <linux/llist.h> > >> #include <linux/mutex.h> > >> #include <linux/notifier.h> > >> #include <linux/spinlock.h> > >> #include <linux/types.h> > >> +#include <linux/workqueue.h> > >> > >> #include "uc/intel_uc.h" > >> > >> @@ -62,6 +64,11 @@ struct intel_gt { > >> struct delayed_work retire_work; > >> } requests; > >> > >> + struct { > >> + struct llist_head list; > >> + struct work_struct work; > >> + } watchdog; > >> + > >> struct intel_wakeref wakeref; > >> atomic_t user_wakeref; > >> > >> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > >> index b4511ac05e9a..9dd5e588b0a4 100644 > >> --- a/drivers/gpu/drm/i915/i915_request.c > >> +++ b/drivers/gpu/drm/i915/i915_request.c > >> @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq) > >> __notify_execute_cb_imm(rq); > >> } > >> > >> +static void __rq_init_watchdog(struct i915_request *rq) > >> +{ > >> + rq->watchdog.timer.function = NULL; > >> +} > >> + > >> +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer) > >> +{ > >> + struct i915_request *rq = > >> + container_of(hrtimer, struct i915_request, watchdog.timer); > >> + struct intel_gt *gt = rq->engine->gt; > >> + > >> + if (!i915_request_completed(rq)) { > >> + if (llist_add(&rq->watchdog.link, >->watchdog.list)) > >> + schedule_work(>->watchdog.work); > >> + } else { > >> + i915_request_put(rq); > >> + } > >> + > >> + return HRTIMER_NORESTART; > >> +} > >> + > >> +static void __rq_arm_watchdog(struct i915_request *rq) > >> +{ > >> + struct i915_request_watchdog *wdg = &rq->watchdog; > >> + struct intel_context *ce = rq->context; > >> + > >> + if (!ce->watchdog.timeout_us) > >> + return; > >> + > >> + hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > >> + wdg->timer.function = __rq_watchdog_expired; > >> + hrtimer_start_range_ns(&wdg->timer, > >> + ns_to_ktime(ce->watchdog.timeout_us * > >> + NSEC_PER_USEC), > >> + /* > >> + * FIXME check if it gives the "not sooner" > >> + * guarantee or slack is both ways > >> + */ > > > > It looks like the slack/fuzziness just delays the timer, in case it > > can coalesce multiple timer events. So shouldn't be sooner I think? > > I couldn't quickly figure it out when I looked at the implementation so > I left this comment. But it was only relevant at a time I thought we > would be exposing context param to allow userspace control. With the > only user being default expiry which is not sensitive to precision or > accuracy, I simply need to remove this comment. > > > > >> + NSEC_PER_MSEC, > > > > Formatting. > > Which part? I think indentation/alignment is correct. @@ -311,11 +311,11 @@ static void __rq_arm_watchdog(struct i915_request *rq) hrtimer_start_range_ns(&wdg->timer, ns_to_ktime(ce->watchdog.timeout_us * NSEC_PER_USEC), - /* - * FIXME check if it gives the "not sooner" - * guarantee or slack is both ways - */ - NSEC_PER_MSEC, + /* + * FIXME check if it gives the "not sooner" + * guarantee or slack is both ways + */ + NSEC_PER_MSEC, HRTIMER_MODE_REL); i915_request_get(rq); } > > > > > Reviewed-by: Matthew Auld <matthew.auld@intel.com> > > > > Thanks, > > Tvrtko
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 0ea18c9e2aca..65a5730a4f5b 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -99,6 +99,10 @@ struct intel_context { #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 #define CONTEXT_NOPREEMPT 8 + struct { + u64 timeout_us; + } watchdog; + u32 *lrc_reg_state; union { struct { diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h index f7bd3fccfee8..4ca9b475e252 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h @@ -6,6 +6,7 @@ #ifndef __INTEL_EXECLISTS_SUBMISSION_H__ #define __INTEL_EXECLISTS_SUBMISSION_H__ +#include <linux/llist.h> #include <linux/types.h> struct drm_printer; @@ -13,6 +14,7 @@ struct drm_printer; struct i915_request; struct intel_context; struct intel_engine_cs; +struct intel_gt; enum { INTEL_CONTEXT_SCHEDULE_IN = 0, diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index ca76f93bc03d..8d77dcbad059 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915) INIT_LIST_HEAD(>->closed_vma); spin_lock_init(>->closed_lock); + init_llist_head(>->watchdog.list); + INIT_WORK(>->watchdog.work, intel_gt_watchdog_work); + intel_gt_init_buffer_pool(gt); intel_gt_init_reset(gt); intel_gt_init_requests(gt); diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index a17bd8b3195f..7ec395cace69 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt) void intel_gt_info_print(const struct intel_gt_info *info, struct drm_printer *p); +void intel_gt_watchdog_work(struct work_struct *work); + #endif /* __INTEL_GT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c index 36ec97f79174..fbfd19b2e5f2 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c @@ -8,6 +8,7 @@ #include "i915_drv.h" /* for_each_engine() */ #include "i915_request.h" #include "intel_engine_heartbeat.h" +#include "intel_execlists_submission.h" #include "intel_gt.h" #include "intel_gt_pm.h" #include "intel_gt_requests.h" @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt) { /* Wait until the work is marked as finished before unloading! */ cancel_delayed_work_sync(>->requests.retire_work); + + flush_work(>->watchdog.work); +} + +void intel_gt_watchdog_work(struct work_struct *work) +{ + struct intel_gt *gt = + container_of(work, typeof(*gt), watchdog.work); + struct i915_request *rq, *rn; + struct llist_node *first; + + first = llist_del_all(>->watchdog.list); + if (!first) + return; + + llist_for_each_entry_safe(rq, rn, first, watchdog.link) { + if (!i915_request_completed(rq)) { + struct dma_fence *f = &rq->fence; + + pr_notice("Fence expiration time out i915-%s:%s:%llx!\n", + f->ops->get_driver_name(f), + f->ops->get_timeline_name(f), + f->seqno); + i915_request_cancel(rq, -EINTR); + } + i915_request_put(rq); + } } diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index 626af37c7790..d70ebcc6f19f 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -8,10 +8,12 @@ #include <linux/ktime.h> #include <linux/list.h> +#include <linux/llist.h> #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/types.h> +#include <linux/workqueue.h> #include "uc/intel_uc.h" @@ -62,6 +64,11 @@ struct intel_gt { struct delayed_work retire_work; } requests; + struct { + struct llist_head list; + struct work_struct work; + } watchdog; + struct intel_wakeref wakeref; atomic_t user_wakeref; diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index b4511ac05e9a..9dd5e588b0a4 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq) __notify_execute_cb_imm(rq); } +static void __rq_init_watchdog(struct i915_request *rq) +{ + rq->watchdog.timer.function = NULL; +} + +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer) +{ + struct i915_request *rq = + container_of(hrtimer, struct i915_request, watchdog.timer); + struct intel_gt *gt = rq->engine->gt; + + if (!i915_request_completed(rq)) { + if (llist_add(&rq->watchdog.link, >->watchdog.list)) + schedule_work(>->watchdog.work); + } else { + i915_request_put(rq); + } + + return HRTIMER_NORESTART; +} + +static void __rq_arm_watchdog(struct i915_request *rq) +{ + struct i915_request_watchdog *wdg = &rq->watchdog; + struct intel_context *ce = rq->context; + + if (!ce->watchdog.timeout_us) + return; + + hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wdg->timer.function = __rq_watchdog_expired; + hrtimer_start_range_ns(&wdg->timer, + ns_to_ktime(ce->watchdog.timeout_us * + NSEC_PER_USEC), + /* + * FIXME check if it gives the "not sooner" + * guarantee or slack is both ways + */ + NSEC_PER_MSEC, + HRTIMER_MODE_REL); + i915_request_get(rq); +} + +static void __rq_cancel_watchdog(struct i915_request *rq) +{ + struct i915_request_watchdog *wdg = &rq->watchdog; + + if (wdg->timer.function && hrtimer_try_to_cancel(&wdg->timer) > 0) + i915_request_put(rq); +} + bool i915_request_retire(struct i915_request *rq) { if (!__i915_request_is_complete(rq)) @@ -288,6 +339,8 @@ bool i915_request_retire(struct i915_request *rq) trace_i915_request_retire(rq); i915_request_mark_complete(rq); + __rq_cancel_watchdog(rq); + /* * We know the GPU must have read the request to have * sent us the seqno + interrupt, so use the position @@ -667,6 +720,8 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state) if (unlikely(fence->error)) i915_request_set_error_once(request, fence->error); + else + __rq_arm_watchdog(request); /* * We need to serialize use of the submit_request() callback @@ -854,6 +909,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) /* No zalloc, everything must be cleared after use */ rq->batch = NULL; + __rq_init_watchdog(rq); GEM_BUG_ON(rq->capture_list); GEM_BUG_ON(!llist_empty(&rq->execute_cb)); diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 64869a313b3e..294f16e2163d 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -26,7 +26,9 @@ #define I915_REQUEST_H #include <linux/dma-fence.h> +#include <linux/hrtimer.h> #include <linux/irq_work.h> +#include <linux/llist.h> #include <linux/lockdep.h> #include "gem/i915_gem_context_types.h" @@ -289,6 +291,12 @@ struct i915_request { /** timeline->request entry for this request */ struct list_head link; + /** Watchdog support fields. */ + struct i915_request_watchdog { + struct llist_node link; + struct hrtimer timer; + } watchdog; + I915_SELFTEST_DECLARE(struct { struct list_head link; unsigned long delay;