[3/5] drm/i915: Fixup preempt-to-busy vs resubmission of a virtual request

Message ID 20190921095542.23205-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [1/5] drm/i915/execlists: Refactor -EIO markup of hung requests

Commit Message

Chris Wilson Sept. 21, 2019, 9:55 a.m. UTC
As preempt-to-busy leaves the request on the HW as the resubmission is
processed, that request may complete in the background and even cause a
second virtual request to enter the queue. This second virtual request
breaks our "single request in the virtual pipeline" assumptions.
Furthermore, as the virtual request may be completed and retired, we
lose the reference the virtual engine assumes is held. Normally, just
removing the request from the scheduler queue removes it from the
engine, but the virtual engine keeps track of its singleton request via
its ve->request. This pointer needs protecting with a reference.

Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 34 ++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 8 deletions(-)
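
[Editor's note: a minimal sketch of the reference lifecycle the patch
establishes for ve->request; the function and field names are taken from
the patch itself, with everything around them elided:]

	/* virtual_submit_request(): stash the singleton request and pin it
	 * with a reference, since preempt-to-busy means it can complete and
	 * be retired behind our back. */
	ve->request = i915_request_get(rq);

	/* execlists_dequeue() / execlists_cancel_requests(): once the
	 * request has been consumed (moved to a physical engine or
	 * cancelled) and ve->request cleared, drop that reference again. */
	i915_request_put(rq);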

Comments

Tvrtko Ursulin Sept. 23, 2019, 1:32 p.m. UTC | #1
On 21/09/2019 10:55, Chris Wilson wrote:
> As preempt-to-busy leaves the request on the HW as the resubmission is
> processed, that request may complete in the background and even cause a
> second virtual request to enter the queue. This second virtual request
> breaks our "single request in the virtual pipeline" assumptions.
> Furthermore, as the virtual request may be completed and retired, we
> lose the reference the virtual engine assumes is held. Normally, just
> removing the request from the scheduler queue removes it from the
> engine, but the virtual engine keeps track of its singleton request via
> its ve->request. This pointer needs protecting with a reference.
> 
> Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c | 34 ++++++++++++++++++++++-------
>   1 file changed, 26 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 53bc4308793c..1b2bacc60300 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -529,7 +529,6 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   				i915_request_cancel_breadcrumb(rq);
>   				spin_unlock(&rq->lock);
>   			}
> -			rq->engine = owner;
>   			owner->submit_request(rq);
>   			active = NULL;
>   		}
> @@ -1248,6 +1247,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				submit = true;
>   				last = rq;
>   			}
> +			i915_request_put(rq);
>   
>   			if (!submit) {
>   				spin_unlock(&ve->base.active.lock);
> @@ -2535,6 +2535,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   
>   			rq->engine = engine;
>   			__i915_request_submit(rq);
> +			i915_request_put(rq);
>   
>   			ve->base.execlists.queue_priority_hint = INT_MIN;
>   		}
> @@ -3787,7 +3788,9 @@ static void virtual_submission_tasklet(unsigned long data)
>   
>   static void virtual_submit_request(struct i915_request *rq)
>   {
> -	struct virtual_engine *ve = to_virtual_engine(rq->engine);
> +	struct virtual_engine *ve = to_virtual_engine(rq->hw_context->engine);
> +	struct i915_request *stale;
> +	unsigned long flags;
>   
>   	GEM_TRACE("%s: rq=%llx:%lld\n",
>   		  ve->base.name,
> @@ -3796,15 +3799,30 @@ static void virtual_submit_request(struct i915_request *rq)
>   
>   	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
>   
> -	GEM_BUG_ON(ve->request);
> -	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
> +	spin_lock_irqsave(&ve->base.active.lock, flags);
> +
> +	stale = ve->request;

fetch_and_zero so you don't have to set it to NULL a bit lower?

s/stale/completed/, plus a comment describing preempt-to-busy is to blame?
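
[Editor's note: fetch_and_zero() is an existing i915 helper from
drivers/gpu/drm/i915/i915_utils.h; paraphrased, it reads a value and
zeroes its location in one expression. Tvrtko's suggestion would read
roughly as follows:]

	#define fetch_and_zero(ptr) ({			\
		typeof(*ptr) __T = *(ptr);		\
		*(ptr) = (typeof(*ptr))0;		\
		__T;					\
	})

	/* ve->request is already NULL afterwards, so the completed branch
	 * below would not need to clear it explicitly. */
	stale = fetch_and_zero(&ve->request);
	if (stale) {
		GEM_BUG_ON(!i915_request_completed(stale));
		__i915_request_submit(stale);
		i915_request_put(stale);
	}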

> +	if (stale) {
> +		GEM_BUG_ON(!i915_request_completed(stale));
> +		__i915_request_submit(stale);
> +		i915_request_put(stale);
> +	}
> +
> +	if (i915_request_completed(rq)) {
> +		__i915_request_submit(rq);
> +		ve->request = NULL;
> +	} else {
> +		ve->base.execlists.queue_priority_hint = rq_prio(rq);
> +		ve->request = i915_request_get(rq);
> +		rq->engine = &ve->base; /* fixup from unwind */

The last line has me confused. Isn't this the normal veng rq submission 
path? In which case rq->engine will already be set to veng. But on the 
unwind path you have removed reset-back of rq->engine to owner. Ah.. the 
unwind calls veng->submit_request on it, and then we end up in here.. 
Okay, this is outside the normal path, I mean the else block has two 
functions/paths, and this should be explained in a comment.
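
[Editor's note: a sketch of the two paths that reach
virtual_submit_request() after this patch, which is what the
"fixup from unwind" comment covers:]

	/* Path 1, normal submission via i915_request_submit():
	 * rq->engine is already &ve->base, so the assignment is a no-op.
	 *
	 * Path 2, preempt-to-busy unwind: __unwind_incomplete_requests()
	 * now calls owner->submit_request(rq) without resetting rq->engine,
	 * which therefore still points at the physical engine the request
	 * was unwound from; hence:
	 */
	rq->engine = &ve->base; /* fixup from unwind */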

>   
> -	ve->base.execlists.queue_priority_hint = rq_prio(rq);
> -	WRITE_ONCE(ve->request, rq);
> +		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
> +		list_move_tail(&rq->sched.link, virtual_queue(ve));
>   
> -	list_move_tail(&rq->sched.link, virtual_queue(ve));
> +		tasklet_schedule(&ve->base.execlists.tasklet);
> +	}
>   
> -	tasklet_schedule(&ve->base.execlists.tasklet);
> +	spin_unlock_irqrestore(&ve->base.active.lock, flags);
>   }
>   
>   static struct ve_bond *
> 

Otherwise I *think* it's okay.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Chris Wilson Sept. 23, 2019, 1:39 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-09-23 14:32:23)
> 
> On 21/09/2019 10:55, Chris Wilson wrote:
> > As preempt-to-busy leaves the request on the HW as the resubmission is
> > processed, that request may complete in the background and even cause a
> > second virtual request to enter the queue. This second virtual request
> > breaks our "single request in the virtual pipeline" assumptions.
> > Furthermore, as the virtual request may be completed and retired, we
> > lose the reference the virtual engine assumes is held. Normally, just
> > removing the request from the scheduler queue removes it from the
> > engine, but the virtual engine keeps track of its singleton request via
> > its ve->request. This pointer needs protecting with a reference.
> > 
> > Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_lrc.c | 34 ++++++++++++++++++++++-------
> >   1 file changed, 26 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > index 53bc4308793c..1b2bacc60300 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > @@ -529,7 +529,6 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
> >                               i915_request_cancel_breadcrumb(rq);
> >                               spin_unlock(&rq->lock);
> >                       }
> > -                     rq->engine = owner;
> >                       owner->submit_request(rq);
> >                       active = NULL;
> >               }
> > @@ -1248,6 +1247,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >                               submit = true;
> >                               last = rq;
> >                       }
> > +                     i915_request_put(rq);
> >   
> >                       if (!submit) {
> >                               spin_unlock(&ve->base.active.lock);
> > @@ -2535,6 +2535,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> >   
> >                       rq->engine = engine;
> >                       __i915_request_submit(rq);
> > +                     i915_request_put(rq);
> >   
> >                       ve->base.execlists.queue_priority_hint = INT_MIN;
> >               }
> > @@ -3787,7 +3788,9 @@ static void virtual_submission_tasklet(unsigned long data)
> >   
> >   static void virtual_submit_request(struct i915_request *rq)
> >   {
> > -     struct virtual_engine *ve = to_virtual_engine(rq->engine);
> > +     struct virtual_engine *ve = to_virtual_engine(rq->hw_context->engine);
> > +     struct i915_request *stale;
> > +     unsigned long flags;
> >   
> >       GEM_TRACE("%s: rq=%llx:%lld\n",
> >                 ve->base.name,
> > @@ -3796,15 +3799,30 @@ static void virtual_submit_request(struct i915_request *rq)
> >   
> >       GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
> >   
> > -     GEM_BUG_ON(ve->request);
> > -     GEM_BUG_ON(!list_empty(virtual_queue(ve)));
> > +     spin_lock_irqsave(&ve->base.active.lock, flags);
> > +
> > +     stale = ve->request;
> 
> fetch_and_zero so you don't have to set it to NULL a bit lower?

I iterated through xchg then fetch_and_zero, before settling on this
variant. My feeling was setting ve->request in both sides of the if()
was more balanced.
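
[Editor's note: the variants Chris mentions, sketched as alternatives
under the same ve->base.active.lock:]

	/* variant 1: atomic swap (overkill while holding the lock) */
	stale = xchg(&ve->request, NULL);

	/* variant 2: i915's local helper, same shape */
	stale = fetch_and_zero(&ve->request);

	/* chosen variant: plain load, with ve->request then written
	 * explicitly on both sides of the following if() */
	stale = ve->request;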

> s/stale/completed/, plus a comment describing preempt-to-busy is to blame?

completed is a little long, old? Already added a comment to remind about
the preempt-to-busy link :)

> > +     if (stale) {
> > +             GEM_BUG_ON(!i915_request_completed(stale));
> > +             __i915_request_submit(stale);
> > +             i915_request_put(stale);
> > +     }
> > +
> > +     if (i915_request_completed(rq)) {
> > +             __i915_request_submit(rq);
> > +             ve->request = NULL;
> > +     } else {
> > +             ve->base.execlists.queue_priority_hint = rq_prio(rq);
> > +             ve->request = i915_request_get(rq);
> > +             rq->engine = &ve->base; /* fixup from unwind */
> 
> The last line has me confused. Isn't this the normal veng rq submission 
> path?

It is also on the normal submission path.

> In which case rq->engine will already be set to veng.

Yes

> But on the 
> unwind path you have removed reset-back of rq->engine to owner. Ah.. the 
> unwind calls veng->submit_request on it, and then we end up in here.. 
> Okay, this is outside the normal path, I mean the else block has two 
> functions/paths, and this should be explained in a comment.

That was the intent of "fixup from unwind"?

I can squeeze in /* fixup __unwind_incomplete_requests */; is that more
clueful? Or do you think it needs more?
-Chris
Chris Wilson Sept. 23, 2019, 1:46 p.m. UTC | #3
Quoting Chris Wilson (2019-09-23 14:39:20)
> Quoting Tvrtko Ursulin (2019-09-23 14:32:23)
> > 
> > On 21/09/2019 10:55, Chris Wilson wrote:
> > > +     if (i915_request_completed(rq)) {
> > > +             __i915_request_submit(rq);
> > > +             ve->request = NULL;
> > > +     } else {
> > > +             ve->base.execlists.queue_priority_hint = rq_prio(rq);
> > > +             ve->request = i915_request_get(rq);
> > > +             rq->engine = &ve->base; /* fixup from unwind */
> > 
> > The last line has me confused. Isn't this the normal veng rq submission 
> > path?
> 
> It is also on the normal submission path.
> 
> > In which case rq->engine will already be set to veng.
> 
> Yes
> 
> > But on the 
> > unwind path you have removed reset-back of rq->engine to owner. Ah.. the 
> > unwind calls veng->submit_request on it, and then we end up in here.. 
> > Okay, this is outside the normal path, I mean the else block has two 
> > functions/paths, and this should be explained in a comment.
> 
> That was the intent of "fixup from unwind"?
> 
> I can squeeze in /* fixup __unwind_incomplete_requests */; is that more
> clueful? Or do you think it needs more?

Also it doesn't strictly have to be moved. And certainly there's no
good reason why it needs to be done in this patch -- initially I was
moving it to avoid dereferencing incomplete virtual engine state inside
__i915_request_submit(), but that's better handled by the earlier
patches.
-Chris
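
[Editor's note: the discussion above also explains the change at the top
of virtual_submit_request(); a sketch of the reasoning:]

	/* rq->engine is transient: on the unwind path it may still name the
	 * physical engine the request last ran on.  rq->hw_context was
	 * created for the virtual engine and its ->engine back-pointer is
	 * stable, so it is the safe handle to recover ve from:
	 */
	struct virtual_engine *ve = to_virtual_engine(rq->hw_context->engine);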
Tvrtko Ursulin Sept. 23, 2019, 1:58 p.m. UTC | #4
On 23/09/2019 14:39, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-09-23 14:32:23)
>>
>> On 21/09/2019 10:55, Chris Wilson wrote:
>>> As preempt-to-busy leaves the request on the HW as the resubmission is
>>> processed, that request may complete in the background and even cause a
>>> second virtual request to enter the queue. This second virtual request
>>> breaks our "single request in the virtual pipeline" assumptions.
>>> Furthermore, as the virtual request may be completed and retired, we
>>> lose the reference the virtual engine assumes is held. Normally, just
>>> removing the request from the scheduler queue removes it from the
>>> engine, but the virtual engine keeps track of its singleton request via
>>> its ve->request. This pointer needs protecting with a reference.
>>>
>>> Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy")
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/intel_lrc.c | 34 ++++++++++++++++++++++-------
>>>    1 file changed, 26 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> index 53bc4308793c..1b2bacc60300 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> @@ -529,7 +529,6 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
>>>                                i915_request_cancel_breadcrumb(rq);
>>>                                spin_unlock(&rq->lock);
>>>                        }
>>> -                     rq->engine = owner;
>>>                        owner->submit_request(rq);
>>>                        active = NULL;
>>>                }
>>> @@ -1248,6 +1247,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>>                                submit = true;
>>>                                last = rq;
>>>                        }
>>> +                     i915_request_put(rq);
>>>    
>>>                        if (!submit) {
>>>                                spin_unlock(&ve->base.active.lock);
>>> @@ -2535,6 +2535,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>>>    
>>>                        rq->engine = engine;
>>>                        __i915_request_submit(rq);
>>> +                     i915_request_put(rq);
>>>    
>>>                        ve->base.execlists.queue_priority_hint = INT_MIN;
>>>                }
>>> @@ -3787,7 +3788,9 @@ static void virtual_submission_tasklet(unsigned long data)
>>>    
>>>    static void virtual_submit_request(struct i915_request *rq)
>>>    {
>>> -     struct virtual_engine *ve = to_virtual_engine(rq->engine);
>>> +     struct virtual_engine *ve = to_virtual_engine(rq->hw_context->engine);
>>> +     struct i915_request *stale;
>>> +     unsigned long flags;
>>>    
>>>        GEM_TRACE("%s: rq=%llx:%lld\n",
>>>                  ve->base.name,
>>> @@ -3796,15 +3799,30 @@ static void virtual_submit_request(struct i915_request *rq)
>>>    
>>>        GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
>>>    
>>> -     GEM_BUG_ON(ve->request);
>>> -     GEM_BUG_ON(!list_empty(virtual_queue(ve)));
>>> +     spin_lock_irqsave(&ve->base.active.lock, flags);
>>> +
>>> +     stale = ve->request;
>>
>> fetch_and_zero so you don't have to set it to NULL a bit lower?
> 
> I iterated through xchg then fetch_and_zero, before settling on this
> variant. My feeling was setting ve->request in both sides of the if()
> was more balanced.

Ok.

> 
>> s/stale/completed/, plus a comment describing preempt-to-busy is to blame?
> 
> completed is a little long, old? Already added a comment to remind about
> the preempt-to-busy link :)

I disliked stale since it has negative connotations and this is actually a 
normal/expected situation (albeit rare), but maybe it's just me. Old is 
perhaps better. But you can also keep stale I guess.

>>> +     if (stale) {
>>> +             GEM_BUG_ON(!i915_request_completed(stale));
>>> +             __i915_request_submit(stale);
>>> +             i915_request_put(stale);
>>> +     }
>>> +
>>> +     if (i915_request_completed(rq)) {
>>> +             __i915_request_submit(rq);
>>> +             ve->request = NULL;
>>> +     } else {
>>> +             ve->base.execlists.queue_priority_hint = rq_prio(rq);
>>> +             ve->request = i915_request_get(rq);
>>> +             rq->engine = &ve->base; /* fixup from unwind */
>>
>> The last line has me confused. Isn't this the normal veng rq submission
>> path?
> 
> It is also on the normal submission path.
> 
>> In which case rq->engine will already be set to veng.
> 
> Yes
> 
>> But on the
>> unwind path you have removed reset-back of rq->engine to owner. Ah.. the
>> unwind calls veng->submit_request on it, and then we end up in here..
>> Okay, this is outside the normal path, I mean the else block has two
>> functions/paths, and this should be explained in a comment.
> 
> That was the intent of "fixup from unwind"?
> 
> I can squeeze in /* fixup __unwind_incomplete_requests */; is that more
> clueful? Or do you think it needs more?

I thought it did, but then I could not think of what I would write, so 
it's useful as it is. So I think it's fine after all.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> -Chris
>
Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 53bc4308793c..1b2bacc60300 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -529,7 +529,6 @@  __unwind_incomplete_requests(struct intel_engine_cs *engine)
 				i915_request_cancel_breadcrumb(rq);
 				spin_unlock(&rq->lock);
 			}
-			rq->engine = owner;
 			owner->submit_request(rq);
 			active = NULL;
 		}
@@ -1248,6 +1247,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 				submit = true;
 				last = rq;
 			}
+			i915_request_put(rq);
 
 			if (!submit) {
 				spin_unlock(&ve->base.active.lock);
@@ -2535,6 +2535,7 @@  static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 			rq->engine = engine;
 			__i915_request_submit(rq);
+			i915_request_put(rq);
 
 			ve->base.execlists.queue_priority_hint = INT_MIN;
 		}
@@ -3787,7 +3788,9 @@  static void virtual_submission_tasklet(unsigned long data)
 
 static void virtual_submit_request(struct i915_request *rq)
 {
-	struct virtual_engine *ve = to_virtual_engine(rq->engine);
+	struct virtual_engine *ve = to_virtual_engine(rq->hw_context->engine);
+	struct i915_request *stale;
+	unsigned long flags;
 
 	GEM_TRACE("%s: rq=%llx:%lld\n",
 		  ve->base.name,
@@ -3796,15 +3799,30 @@  static void virtual_submit_request(struct i915_request *rq)
 
 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
 
-	GEM_BUG_ON(ve->request);
-	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
+	spin_lock_irqsave(&ve->base.active.lock, flags);
+
+	stale = ve->request;
+	if (stale) {
+		GEM_BUG_ON(!i915_request_completed(stale));
+		__i915_request_submit(stale);
+		i915_request_put(stale);
+	}
+
+	if (i915_request_completed(rq)) {
+		__i915_request_submit(rq);
+		ve->request = NULL;
+	} else {
+		ve->base.execlists.queue_priority_hint = rq_prio(rq);
+		ve->request = i915_request_get(rq);
+		rq->engine = &ve->base; /* fixup from unwind */
 
-	ve->base.execlists.queue_priority_hint = rq_prio(rq);
-	WRITE_ONCE(ve->request, rq);
+		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
+		list_move_tail(&rq->sched.link, virtual_queue(ve));
 
-	list_move_tail(&rq->sched.link, virtual_queue(ve));
+		tasklet_schedule(&ve->base.execlists.tasklet);
+	}
 
-	tasklet_schedule(&ve->base.execlists.tasklet);
+	spin_unlock_irqrestore(&ve->base.active.lock, flags);
 }
 
 static struct ve_bond *