[7/8] drm/i915: Immediately execute the fenced work

Message ID	20200323092841.22240-7-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=Atw5=5I=lists.freedesktop.org=intel-gfx-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 2B97C2074D From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Mon, 23 Mar 2020 09:28:40 +0000 Message-Id: <20200323092841.22240-7-chris@chris-wilson.co.uk> In-Reply-To: <20200323092841.22240-1-chris@chris-wilson.co.uk> References: <20200323092841.22240-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 7/8] drm/i915: Immediately execute the fenced work Precedence: list Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[1/8] drm/i915/gt: Mark timeline->cacheline as destroyed after rcu grace period | expand [1/8] drm/i915/gt: Mark timeline->cacheline as destroyed after rcu grace period [2/8] drm/i915: Avoid live-lock with i915_vma_parked() [3/8] drm/i915: Extend intel_wakeref to support delayed puts [4/8] drm/i915/gt: Delay release of engine-pm after last retirement [5/8] drm/i915: Rely on direct submission to the queue [6/8] drm/i915/execlists: Pull tasklet interrupt-bh local to direct submission [7/8] drm/i915: Immediately execute the fenced work [8/8] drm/i915/gem: Avoid gem_context->mutex for simple vma lookup

Message ID

20200323092841.22240-7-chris@chris-wilson.co.uk (mailing list archive)

State

New, archived

Headers

DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 2B97C2074D
From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Date: Mon, 23 Mar 2020 09:28:40 +0000
Message-Id: <20200323092841.22240-7-chris@chris-wilson.co.uk>
In-Reply-To: <20200323092841.22240-1-chris@chris-wilson.co.uk>
References: <20200323092841.22240-1-chris@chris-wilson.co.uk>
MIME-Version: 1.0
Subject: [Intel-gfx] [PATCH 7/8] drm/i915: Immediately execute the fenced
 work
Precedence: list
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Series

[1/8] drm/i915/gt: Mark timeline->cacheline as destroyed after rcu grace period | expand

Commit Message

Chris Wilson March 23, 2020, 9:28 a.m. UTC

If the caller allows and we do not have to wait for any signals,
immediately execute the work within the caller's process. By doing so we
avoid the overhead of scheduling a new task, and the latency in
executing it, at the cost of pulling that work back into the immediate
context. (Sometimes we still prefer to offload the task to another cpu,
especially if we plan on executing many such tasks in parallel for this
client.)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  2 +-
 drivers/gpu/drm/i915/i915_sw_fence_work.c      |  5 ++++-
 drivers/gpu/drm/i915/i915_sw_fence_work.h      | 12 ++++++++++++
 drivers/gpu/drm/i915/i915_vma.c                |  2 +-
 4 files changed, 18 insertions(+), 3 deletions(-)

Comments

Tvrtko Ursulin March 23, 2020, 10:37 a.m. UTC | #1

On 23/03/2020 09:28, Chris Wilson wrote:
> If the caller allows and we do not have to wait for any signals,
> immediately execute the work within the caller's process. By doing so we
> avoid the overhead of scheduling a new task, and the latency in
> executing it, at the cost of pulling that work back into the immediate
> context. (Sometimes we still prefer to offload the task to another cpu,
> especially if we plan on executing many such tasks in parallel for this
> client.)
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  2 +-
>   drivers/gpu/drm/i915/i915_sw_fence_work.c      |  5 ++++-
>   drivers/gpu/drm/i915/i915_sw_fence_work.h      | 12 ++++++++++++
>   drivers/gpu/drm/i915/i915_vma.c                |  2 +-
>   4 files changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index c2bd5accde0c..e80c6f613feb 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -1784,7 +1784,7 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
>   	dma_resv_add_excl_fence(shadow->resv, &pw->base.dma);
>   	dma_resv_unlock(shadow->resv);
>   
> -	dma_fence_work_commit(&pw->base);
> +	dma_fence_work_commit_imm(&pw->base);
>   	return 0;
>   
>   err_batch_unlock:
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.c b/drivers/gpu/drm/i915/i915_sw_fence_work.c
> index 997b2998f1f2..a3a81bb8f2c3 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence_work.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.c
> @@ -38,7 +38,10 @@ fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>   
>   		if (!f->dma.error) {
>   			dma_fence_get(&f->dma);
> -			queue_work(system_unbound_wq, &f->work);
> +			if (test_bit(DMA_FENCE_WORK_IMM, &f->dma.flags))
> +				fence_work(&f->work);
> +			else
> +				queue_work(system_unbound_wq, &f->work);
>   		} else {
>   			fence_complete(f);
>   		}
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.h b/drivers/gpu/drm/i915/i915_sw_fence_work.h
> index 3a22b287e201..0719d661dc9c 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence_work.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.h
> @@ -32,6 +32,10 @@ struct dma_fence_work {
>   	const struct dma_fence_work_ops *ops;
>   };
>   
> +enum {
> +	DMA_FENCE_WORK_IMM = DMA_FENCE_FLAG_USER_BITS,
> +};
> +
>   void dma_fence_work_init(struct dma_fence_work *f,
>   			 const struct dma_fence_work_ops *ops);
>   int dma_fence_work_chain(struct dma_fence_work *f, struct dma_fence *signal);
> @@ -41,4 +45,12 @@ static inline void dma_fence_work_commit(struct dma_fence_work *f)
>   	i915_sw_fence_commit(&f->chain);
>   }
>   
> +static inline void dma_fence_work_commit_imm(struct dma_fence_work *f)
> +{
> +	if (atomic_read(&f->chain.pending) <= 1)
> +		__set_bit(DMA_FENCE_WORK_IMM, &f->dma.flags);
> +

What if someone bumps pending to 2 at this point?

Regards,

Tvrtko

> +	dma_fence_work_commit(f);
> +}
> +
>   #endif /* I915_SW_FENCE_WORK_H */
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 08699fa069aa..191577a98390 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -980,7 +980,7 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>   	mutex_unlock(&vma->vm->mutex);
>   err_fence:
>   	if (work)
> -		dma_fence_work_commit(&work->base);
> +		dma_fence_work_commit_imm(&work->base);
>   	if (wakeref)
>   		intel_runtime_pm_put(&vma->vm->i915->runtime_pm, wakeref);
>   err_pages:
>

Chris Wilson March 23, 2020, 10:46 a.m. UTC | #2

Quoting Tvrtko Ursulin (2020-03-23 10:37:22)
> 
> On 23/03/2020 09:28, Chris Wilson wrote:
> > If the caller allows and we do not have to wait for any signals,
> > immediately execute the work within the caller's process. By doing so we
> > avoid the overhead of scheduling a new task, and the latency in
> > executing it, at the cost of pulling that work back into the immediate
> > context. (Sometimes we still prefer to offload the task to another cpu,
> > especially if we plan on executing many such tasks in parallel for this
> > client.)
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  2 +-
> >   drivers/gpu/drm/i915/i915_sw_fence_work.c      |  5 ++++-
> >   drivers/gpu/drm/i915/i915_sw_fence_work.h      | 12 ++++++++++++
> >   drivers/gpu/drm/i915/i915_vma.c                |  2 +-
> >   4 files changed, 18 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > index c2bd5accde0c..e80c6f613feb 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > @@ -1784,7 +1784,7 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
> >       dma_resv_add_excl_fence(shadow->resv, &pw->base.dma);
> >       dma_resv_unlock(shadow->resv);
> >   
> > -     dma_fence_work_commit(&pw->base);
> > +     dma_fence_work_commit_imm(&pw->base);
> >       return 0;
> >   
> >   err_batch_unlock:
> > diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.c b/drivers/gpu/drm/i915/i915_sw_fence_work.c
> > index 997b2998f1f2..a3a81bb8f2c3 100644
> > --- a/drivers/gpu/drm/i915/i915_sw_fence_work.c
> > +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.c
> > @@ -38,7 +38,10 @@ fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >   
> >               if (!f->dma.error) {
> >                       dma_fence_get(&f->dma);
> > -                     queue_work(system_unbound_wq, &f->work);
> > +                     if (test_bit(DMA_FENCE_WORK_IMM, &f->dma.flags))
> > +                             fence_work(&f->work);
> > +                     else
> > +                             queue_work(system_unbound_wq, &f->work);
> >               } else {
> >                       fence_complete(f);
> >               }
> > diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.h b/drivers/gpu/drm/i915/i915_sw_fence_work.h
> > index 3a22b287e201..0719d661dc9c 100644
> > --- a/drivers/gpu/drm/i915/i915_sw_fence_work.h
> > +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.h
> > @@ -32,6 +32,10 @@ struct dma_fence_work {
> >       const struct dma_fence_work_ops *ops;
> >   };
> >   
> > +enum {
> > +     DMA_FENCE_WORK_IMM = DMA_FENCE_FLAG_USER_BITS,
> > +};
> > +
> >   void dma_fence_work_init(struct dma_fence_work *f,
> >                        const struct dma_fence_work_ops *ops);
> >   int dma_fence_work_chain(struct dma_fence_work *f, struct dma_fence *signal);
> > @@ -41,4 +45,12 @@ static inline void dma_fence_work_commit(struct dma_fence_work *f)
> >       i915_sw_fence_commit(&f->chain);
> >   }
> >   
> > +static inline void dma_fence_work_commit_imm(struct dma_fence_work *f)
> > +{
> > +     if (atomic_read(&f->chain.pending) <= 1)
> > +             __set_bit(DMA_FENCE_WORK_IMM, &f->dma.flags);
> > +
> 
> > What if someone bumps pending to 2 at this point?

That's invalid. The sequence is

create a worker
... add all async waits ...
commit

Since the worker fires when pending waits hits zero, you cannot add any
more async waits after commit in a race free manner. You have to play
games, such as "is this fence already signaled? no, let's delay it
again". If you are playing such games, you know already and shouldn't be
trying to execute synchronously/immediately.

A BUG_ON(!dma_fence_signaled(&f->dma)) would suffice to catch most such
races.
-Chris

Tvrtko Ursulin March 24, 2020, 4:13 p.m. UTC | #3

On 23/03/2020 10:46, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-03-23 10:37:22)
>>
>> On 23/03/2020 09:28, Chris Wilson wrote:
>>> If the caller allows and we do not have to wait for any signals,
>>> immediately execute the work within the caller's process. By doing so we
>>> avoid the overhead of scheduling a new task, and the latency in
>>> executing it, at the cost of pulling that work back into the immediate
>>> context. (Sometimes we still prefer to offload the task to another cpu,
>>> especially if we plan on executing many such tasks in parallel for this
>>> client.)
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  2 +-
>>>    drivers/gpu/drm/i915/i915_sw_fence_work.c      |  5 ++++-
>>>    drivers/gpu/drm/i915/i915_sw_fence_work.h      | 12 ++++++++++++
>>>    drivers/gpu/drm/i915/i915_vma.c                |  2 +-
>>>    4 files changed, 18 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> index c2bd5accde0c..e80c6f613feb 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> @@ -1784,7 +1784,7 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
>>>        dma_resv_add_excl_fence(shadow->resv, &pw->base.dma);
>>>        dma_resv_unlock(shadow->resv);
>>>    
>>> -     dma_fence_work_commit(&pw->base);
>>> +     dma_fence_work_commit_imm(&pw->base);
>>>        return 0;
>>>    
>>>    err_batch_unlock:
>>> diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.c b/drivers/gpu/drm/i915/i915_sw_fence_work.c
>>> index 997b2998f1f2..a3a81bb8f2c3 100644
>>> --- a/drivers/gpu/drm/i915/i915_sw_fence_work.c
>>> +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.c
>>> @@ -38,7 +38,10 @@ fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>>>    
>>>                if (!f->dma.error) {
>>>                        dma_fence_get(&f->dma);
>>> -                     queue_work(system_unbound_wq, &f->work);
>>> +                     if (test_bit(DMA_FENCE_WORK_IMM, &f->dma.flags))
>>> +                             fence_work(&f->work);
>>> +                     else
>>> +                             queue_work(system_unbound_wq, &f->work);
>>>                } else {
>>>                        fence_complete(f);
>>>                }
>>> diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.h b/drivers/gpu/drm/i915/i915_sw_fence_work.h
>>> index 3a22b287e201..0719d661dc9c 100644
>>> --- a/drivers/gpu/drm/i915/i915_sw_fence_work.h
>>> +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.h
>>> @@ -32,6 +32,10 @@ struct dma_fence_work {
>>>        const struct dma_fence_work_ops *ops;
>>>    };
>>>    
>>> +enum {
>>> +     DMA_FENCE_WORK_IMM = DMA_FENCE_FLAG_USER_BITS,
>>> +};
>>> +
>>>    void dma_fence_work_init(struct dma_fence_work *f,
>>>                         const struct dma_fence_work_ops *ops);
>>>    int dma_fence_work_chain(struct dma_fence_work *f, struct dma_fence *signal);
>>> @@ -41,4 +45,12 @@ static inline void dma_fence_work_commit(struct dma_fence_work *f)
>>>        i915_sw_fence_commit(&f->chain);
>>>    }
>>>    
>>> +static inline void dma_fence_work_commit_imm(struct dma_fence_work *f)
>>> +{
>>> +     if (atomic_read(&f->chain.pending) <= 1)
>>> +             __set_bit(DMA_FENCE_WORK_IMM, &f->dma.flags);
>>> +
>>
>> What if someone bumps pending to 2 at this point?
> 
> That's invalid. The sequence is
> 
> create a worker
> ... add all async waits ...
> commit
> 
> Since the worker fires when pending waits hits zero, you cannot add any
> more async waits after commit in a race free manner. You have to play
> games, such as "is this fence already signaled? no, let's delay it
> again". If you are playing such games, you know already and shouldn't be
> trying to execute synchronously/immediately.

> A BUG_ON(!dma_fence_signaled(&f->dma)) would suffice to catch most such
> races.

So the "if the caller allows" from the commit message means something
like "if pending is only one at the time of commit"?

Regards,

Tvrtko

Chris Wilson March 24, 2020, 4:19 p.m. UTC | #4

Quoting Tvrtko Ursulin (2020-03-24 16:13:04)
> 
> On 23/03/2020 10:46, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-03-23 10:37:22)
> >>
> >> On 23/03/2020 09:28, Chris Wilson wrote:
> >>> If the caller allows and we do not have to wait for any signals,
> >>> immediately execute the work within the caller's process. By doing so we
> >>> avoid the overhead of scheduling a new task, and the latency in
> >>> executing it, at the cost of pulling that work back into the immediate
> >>> context. (Sometimes we still prefer to offload the task to another cpu,
> >>> especially if we plan on executing many such tasks in parallel for this
> >>> client.)
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> ---
> >>>    drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  2 +-
> >>>    drivers/gpu/drm/i915/i915_sw_fence_work.c      |  5 ++++-
> >>>    drivers/gpu/drm/i915/i915_sw_fence_work.h      | 12 ++++++++++++
> >>>    drivers/gpu/drm/i915/i915_vma.c                |  2 +-
> >>>    4 files changed, 18 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>> index c2bd5accde0c..e80c6f613feb 100644
> >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>> @@ -1784,7 +1784,7 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
> >>>        dma_resv_add_excl_fence(shadow->resv, &pw->base.dma);
> >>>        dma_resv_unlock(shadow->resv);
> >>>    
> >>> -     dma_fence_work_commit(&pw->base);
> >>> +     dma_fence_work_commit_imm(&pw->base);
> >>>        return 0;
> >>>    
> >>>    err_batch_unlock:
> >>> diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.c b/drivers/gpu/drm/i915/i915_sw_fence_work.c
> >>> index 997b2998f1f2..a3a81bb8f2c3 100644
> >>> --- a/drivers/gpu/drm/i915/i915_sw_fence_work.c
> >>> +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.c
> >>> @@ -38,7 +38,10 @@ fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >>>    
> >>>                if (!f->dma.error) {
> >>>                        dma_fence_get(&f->dma);
> >>> -                     queue_work(system_unbound_wq, &f->work);
> >>> +                     if (test_bit(DMA_FENCE_WORK_IMM, &f->dma.flags))
> >>> +                             fence_work(&f->work);
> >>> +                     else
> >>> +                             queue_work(system_unbound_wq, &f->work);
> >>>                } else {
> >>>                        fence_complete(f);
> >>>                }
> >>> diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.h b/drivers/gpu/drm/i915/i915_sw_fence_work.h
> >>> index 3a22b287e201..0719d661dc9c 100644
> >>> --- a/drivers/gpu/drm/i915/i915_sw_fence_work.h
> >>> +++ b/drivers/gpu/drm/i915/i915_sw_fence_work.h
> >>> @@ -32,6 +32,10 @@ struct dma_fence_work {
> >>>        const struct dma_fence_work_ops *ops;
> >>>    };
> >>>    
> >>> +enum {
> >>> +     DMA_FENCE_WORK_IMM = DMA_FENCE_FLAG_USER_BITS,
> >>> +};
> >>> +
> >>>    void dma_fence_work_init(struct dma_fence_work *f,
> >>>                         const struct dma_fence_work_ops *ops);
> >>>    int dma_fence_work_chain(struct dma_fence_work *f, struct dma_fence *signal);
> >>> @@ -41,4 +45,12 @@ static inline void dma_fence_work_commit(struct dma_fence_work *f)
> >>>        i915_sw_fence_commit(&f->chain);
> >>>    }
> >>>    
> >>> +static inline void dma_fence_work_commit_imm(struct dma_fence_work *f)
> >>> +{
> >>> +     if (atomic_read(&f->chain.pending) <= 1)
> >>> +             __set_bit(DMA_FENCE_WORK_IMM, &f->dma.flags);
> >>> +
> >>
> >> What if someone bumps pending to 2 at this point?
> > 
> > That's invalid. The sequence is
> > 
> > create a worker
> > ... add all async waits ...
> > commit
> > 
> > Since the worker fires when pending waits hits zero, you cannot add any
> > more async waits after commit in a race free manner. You have to play
> > games, such as "is this fence already signaled? no, let's delay it
> > again". If you are playing such games, you know already and shouldn't be
> > trying to execute synchronously/immediately.
> 
> > A BUG_ON(!dma_fence_signaled(&f->dma)) would suffice to catch most such
> > races.
> 
> So the "if the caller allows" from the commit message means something
> like "if pending is only one at the time of commit"?

"If the caller allows" means that it is valid to call the callback from
within the current context (i.e. within all the locks the caller holds).
And then there's the clflush where we not only worry about the locks in
the deep callpath, but also where we know that offloading the slow task
means we can parallelise better.
-Chris

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index c2bd5accde0c..e80c6f613feb 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1784,7 +1784,7 @@  static int eb_parse_pipeline(struct i915_execbuffer *eb,
 	dma_resv_add_excl_fence(shadow->resv, &pw->base.dma);
 	dma_resv_unlock(shadow->resv);
 
-	dma_fence_work_commit(&pw->base);
+	dma_fence_work_commit_imm(&pw->base);
 	return 0;
 
 err_batch_unlock:
diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.c b/drivers/gpu/drm/i915/i915_sw_fence_work.c
index 997b2998f1f2..a3a81bb8f2c3 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence_work.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence_work.c
@@ -38,7 +38,10 @@  fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 
 		if (!f->dma.error) {
 			dma_fence_get(&f->dma);
-			queue_work(system_unbound_wq, &f->work);
+			if (test_bit(DMA_FENCE_WORK_IMM, &f->dma.flags))
+				fence_work(&f->work);
+			else
+				queue_work(system_unbound_wq, &f->work);
 		} else {
 			fence_complete(f);
 		}
diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.h b/drivers/gpu/drm/i915/i915_sw_fence_work.h
index 3a22b287e201..0719d661dc9c 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence_work.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence_work.h
@@ -32,6 +32,10 @@  struct dma_fence_work {
 	const struct dma_fence_work_ops *ops;
 };
 
+enum {
+	DMA_FENCE_WORK_IMM = DMA_FENCE_FLAG_USER_BITS,
+};
+
 void dma_fence_work_init(struct dma_fence_work *f,
 			 const struct dma_fence_work_ops *ops);
 int dma_fence_work_chain(struct dma_fence_work *f, struct dma_fence *signal);
@@ -41,4 +45,12 @@  static inline void dma_fence_work_commit(struct dma_fence_work *f)
 	i915_sw_fence_commit(&f->chain);
 }
 
+static inline void dma_fence_work_commit_imm(struct dma_fence_work *f)
+{
+	if (atomic_read(&f->chain.pending) <= 1)
+		__set_bit(DMA_FENCE_WORK_IMM, &f->dma.flags);
+
+	dma_fence_work_commit(f);
+}
+
 #endif /* I915_SW_FENCE_WORK_H */
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 08699fa069aa..191577a98390 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -980,7 +980,7 @@  int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	mutex_unlock(&vma->vm->mutex);
 err_fence:
 	if (work)
-		dma_fence_work_commit(&work->base);
+		dma_fence_work_commit_imm(&work->base);
 	if (wakeref)
 		intel_runtime_pm_put(&vma->vm->i915->runtime_pm, wakeref);
 err_pages:

[7/8] drm/i915: Immediately execute the fenced work

Commit Message

Comments

Patch