diff mbox series

[01/11] drm/i915: Release i915_gem_context from a worker

Message ID 20210813203033.3179400-1-daniel.vetter@ffwll.ch (mailing list archive)
State New, archived
Headers show
Series [01/11] drm/i915: Release i915_gem_context from a worker | expand

Commit Message

Daniel Vetter Aug. 13, 2021, 8:30 p.m. UTC
The only reason for this really is the i915_gem_engines->fence
callback engines_notify(), which exists purely as a fairly funky
reference counting scheme for that. Otherwise all other callers are
from process context, and generally fairly benign locking context.

Unfortunately untangling that requires some major surgery, and we have
a few i915_gem_context reference counting bugs that need fixing, and
they blow up in the current hardirq calling context, so we need a
stop-gap measure.

Put in a FIXME comment noting when this should be removable again.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c       | 13 +++++++++++--
 drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 12 ++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

Comments

Tvrtko Ursulin Sept. 2, 2021, 12:42 p.m. UTC | #1
On 13/08/2021 21:30, Daniel Vetter wrote:
> The only reason for this really is the i915_gem_engines->fence
> callback engines_notify(), which exists purely as a fairly funky
> reference counting scheme for that. Otherwise all other callers are
> from process context, and generally fairly benign locking context.

There is reset which definitely isn't process context.

Otherwise I did not really get from the commit message whether this patch
is fixing an existing problem or preparing something for the future. If the
former then as I wrote above - I am pretty sure there are call sites
from the tasklet already.

Regards,

Tvrtko

> Unfortunately untangling that requires some major surgery, and we have
> a few i915_gem_context reference counting bugs that need fixing, and
> they blow in the current hardirq calling context, so we need a
> stop-gap measure.
> 
> Put a FIXME comment in when this should be removable again.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Dave Airlie <airlied@redhat.com>
> Cc: Jason Ekstrand <jason@jlekstrand.net>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c       | 13 +++++++++++--
>   drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 12 ++++++++++++
>   2 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index fd169cf2f75a..051bc357ff65 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -986,9 +986,10 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx,
>   	return err;
>   }
>   
> -void i915_gem_context_release(struct kref *ref)
> +static void i915_gem_context_release_work(struct work_struct *work)
>   {
> -	struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
> +	struct i915_gem_context *ctx = container_of(work, typeof(*ctx),
> +						    release_work);
>   
>   	trace_i915_context_free(ctx);
>   	GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
> @@ -1002,6 +1003,13 @@ void i915_gem_context_release(struct kref *ref)
>   	kfree_rcu(ctx, rcu);
>   }
>   
> +void i915_gem_context_release(struct kref *ref)
> +{
> +	struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
> +
> +	queue_work(ctx->i915->wq, &ctx->release_work);
> +}
> +
>   static inline struct i915_gem_engines *
>   __context_engines_static(const struct i915_gem_context *ctx)
>   {
> @@ -1303,6 +1311,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
>   	ctx->sched = pc->sched;
>   	mutex_init(&ctx->mutex);
>   	INIT_LIST_HEAD(&ctx->link);
> +	INIT_WORK(&ctx->release_work, i915_gem_context_release_work);
>   
>   	spin_lock_init(&ctx->stale.lock);
>   	INIT_LIST_HEAD(&ctx->stale.engines);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index 94c03a97cb77..0c38789bd4a8 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -288,6 +288,18 @@ struct i915_gem_context {
>   	 */
>   	struct kref ref;
>   
> +	/**
> +	 * @release_work:
> +	 *
> +	 * Work item for deferred cleanup, since i915_gem_context_put() tends to
> +	 * be called from hardirq context.
> +	 *
> +	 * FIXME: The only real reason for this is &i915_gem_engines.fence, all
> +	 * other callers are from process context and need at most some mild
> +	 * shuffling to pull the i915_gem_context_put() call out of a spinlock.
> +	 */
> +	struct work_struct release_work;
> +
>   	/**
>   	 * @rcu: rcu_head for deferred freeing.
>   	 */
>
Daniel Vetter Sept. 2, 2021, 3:05 p.m. UTC | #2
On Thu, Sep 2, 2021 at 2:42 PM Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
>
>
> On 13/08/2021 21:30, Daniel Vetter wrote:
> > The only reason for this really is the i915_gem_engines->fence
> > callback engines_notify(), which exists purely as a fairly funky
> > reference counting scheme for that. Otherwise all other callers are
> > from process context, and generally fairly benign locking context.
>
> There is reset which definitely isn't process context.

gpu reset runs in process context. The tasklet context is the
engines_notify I'm talking about above.

> Otherwise I did not really get from the commit message is this patch
> fixing an existing problem or preparing something for the future. If the
> former then as I wrote above - I am pretty sure there are call sites
> from the tasklet already.
>
> Regards,
>
> Tvrtko
>
> > Unfortunately untangling that requires some major surgery, and we have
> > a few i915_gem_context reference counting bugs that need fixing, and
> > they blow in the current hardirq calling context, so we need a
> > stop-gap measure.

I guess this para wasn't clear, but subsequent patches fix the
refcount bugs and need this prep patch here.
-Daniel

> >
> > Put a FIXME comment in when this should be removable again.
> >
> > Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> > Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
> > Cc: Matthew Auld <matthew.auld@intel.com>
> > Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> > Cc: Dave Airlie <airlied@redhat.com>
> > Cc: Jason Ekstrand <jason@jlekstrand.net>
> > ---
> >   drivers/gpu/drm/i915/gem/i915_gem_context.c       | 13 +++++++++++--
> >   drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 12 ++++++++++++
> >   2 files changed, 23 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > index fd169cf2f75a..051bc357ff65 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > @@ -986,9 +986,10 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx,
> >       return err;
> >   }
> >
> > -void i915_gem_context_release(struct kref *ref)
> > +static void i915_gem_context_release_work(struct work_struct *work)
> >   {
> > -     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
> > +     struct i915_gem_context *ctx = container_of(work, typeof(*ctx),
> > +                                                 release_work);
> >
> >       trace_i915_context_free(ctx);
> >       GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
> > @@ -1002,6 +1003,13 @@ void i915_gem_context_release(struct kref *ref)
> >       kfree_rcu(ctx, rcu);
> >   }
> >
> > +void i915_gem_context_release(struct kref *ref)
> > +{
> > +     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
> > +
> > +     queue_work(ctx->i915->wq, &ctx->release_work);
> > +}
> > +
> >   static inline struct i915_gem_engines *
> >   __context_engines_static(const struct i915_gem_context *ctx)
> >   {
> > @@ -1303,6 +1311,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
> >       ctx->sched = pc->sched;
> >       mutex_init(&ctx->mutex);
> >       INIT_LIST_HEAD(&ctx->link);
> > +     INIT_WORK(&ctx->release_work, i915_gem_context_release_work);
> >
> >       spin_lock_init(&ctx->stale.lock);
> >       INIT_LIST_HEAD(&ctx->stale.engines);
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > index 94c03a97cb77..0c38789bd4a8 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > @@ -288,6 +288,18 @@ struct i915_gem_context {
> >        */
> >       struct kref ref;
> >
> > +     /**
> > +      * @release_work:
> > +      *
> > +      * Work item for deferred cleanup, since i915_gem_context_put() tends to
> > +      * be called from hardirq context.
> > +      *
> > +      * FIXME: The only real reason for this is &i915_gem_engines.fence, all
> > +      * other callers are from process context and need at most some mild
> > +      * shuffling to pull the i915_gem_context_put() call out of a spinlock.
> > +      */
> > +     struct work_struct release_work;
> > +
> >       /**
> >        * @rcu: rcu_head for deferred freeing.
> >        */
> >
Tvrtko Ursulin Sept. 2, 2021, 4:20 p.m. UTC | #3
On 02/09/2021 16:05, Daniel Vetter wrote:
> On Thu, Sep 2, 2021 at 2:42 PM Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
>>
>>
>> On 13/08/2021 21:30, Daniel Vetter wrote:
>>> The only reason for this really is the i915_gem_engines->fence
>>> callback engines_notify(), which exists purely as a fairly funky
>>> reference counting scheme for that. Otherwise all other callers are
>>> from process context, and generally fairly benign locking context.
>>
>> There is reset which definitely isn't process context.
> 
> gpu reset runs in process context. The tasklet context is the
> engines_notify I'm talking about above.

I haven't looked very deeply but please double check the path from 
execlists_submission_tasklet -> execlists_reset -> intel_engine_reset -> 
__intel_engine_reset -> execlists_reset_rewind -> execlists_reset_csb -> 
execlists_reset_active -> __i915_request_reset -> mark_guilty -> 
i915_gem_context_put.

>> Otherwise I did not really get from the commit message is this patch
>> fixing an existing problem or preparing something for the future. If the
>> former then as I wrote above - I am pretty sure there are call sites
>> from the tasklet already.
>>
>> Regards,
>>
>> Tvrtko
>>
>>> Unfortunately untangling that requires some major surgery, and we have
>>> a few i915_gem_context reference counting bugs that need fixing, and
>>> they blow in the current hardirq calling context, so we need a
>>> stop-gap measure.
> 
> I guess this para wasn't clear, but subsequent patches fix the
> refcount bugs and need this prep patch here.

So up to where in the series are those fixes and where other stuff
follows? Worth splitting and having cover letters perhaps? Is the fixing
part applicable to the existing code or does it only come into play with
the syncobj single timeline changes?

Regards,

Tvrtko

> -Daniel
> 
>>>
>>> Put a FIXME comment in when this should be removable again.
>>>
>>> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
>>> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>> Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
>>> Cc: Matthew Auld <matthew.auld@intel.com>
>>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>> Cc: Dave Airlie <airlied@redhat.com>
>>> Cc: Jason Ekstrand <jason@jlekstrand.net>
>>> ---
>>>    drivers/gpu/drm/i915/gem/i915_gem_context.c       | 13 +++++++++++--
>>>    drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 12 ++++++++++++
>>>    2 files changed, 23 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> index fd169cf2f75a..051bc357ff65 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> @@ -986,9 +986,10 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx,
>>>        return err;
>>>    }
>>>
>>> -void i915_gem_context_release(struct kref *ref)
>>> +static void i915_gem_context_release_work(struct work_struct *work)
>>>    {
>>> -     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
>>> +     struct i915_gem_context *ctx = container_of(work, typeof(*ctx),
>>> +                                                 release_work);
>>>
>>>        trace_i915_context_free(ctx);
>>>        GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
>>> @@ -1002,6 +1003,13 @@ void i915_gem_context_release(struct kref *ref)
>>>        kfree_rcu(ctx, rcu);
>>>    }
>>>
>>> +void i915_gem_context_release(struct kref *ref)
>>> +{
>>> +     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
>>> +
>>> +     queue_work(ctx->i915->wq, &ctx->release_work);
>>> +}
>>> +
>>>    static inline struct i915_gem_engines *
>>>    __context_engines_static(const struct i915_gem_context *ctx)
>>>    {
>>> @@ -1303,6 +1311,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
>>>        ctx->sched = pc->sched;
>>>        mutex_init(&ctx->mutex);
>>>        INIT_LIST_HEAD(&ctx->link);
>>> +     INIT_WORK(&ctx->release_work, i915_gem_context_release_work);
>>>
>>>        spin_lock_init(&ctx->stale.lock);
>>>        INIT_LIST_HEAD(&ctx->stale.engines);
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>> index 94c03a97cb77..0c38789bd4a8 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>> @@ -288,6 +288,18 @@ struct i915_gem_context {
>>>         */
>>>        struct kref ref;
>>>
>>> +     /**
>>> +      * @release_work:
>>> +      *
>>> +      * Work item for deferred cleanup, since i915_gem_context_put() tends to
>>> +      * be called from hardirq context.
>>> +      *
>>> +      * FIXME: The only real reason for this is &i915_gem_engines.fence, all
>>> +      * other callers are from process context and need at most some mild
>>> +      * shuffling to pull the i915_gem_context_put() call out of a spinlock.
>>> +      */
>>> +     struct work_struct release_work;
>>> +
>>>        /**
>>>         * @rcu: rcu_head for deferred freeing.
>>>         */
>>>
> 
> 
>
Daniel Vetter Sept. 2, 2021, 8:02 p.m. UTC | #4
On Thu, Sep 2, 2021 at 6:20 PM Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
> On 02/09/2021 16:05, Daniel Vetter wrote:
> > On Thu, Sep 2, 2021 at 2:42 PM Tvrtko Ursulin
> > <tvrtko.ursulin@linux.intel.com> wrote:
> >>
> >>
> >> On 13/08/2021 21:30, Daniel Vetter wrote:
> >>> The only reason for this really is the i915_gem_engines->fence
> >>> callback engines_notify(), which exists purely as a fairly funky
> >>> reference counting scheme for that. Otherwise all other callers are
> >>> from process context, and generally fairly benign locking context.
> >>
> >> There is reset which definitely isn't process context.
> >
> > gpu reset runs in process context. The tasklet context is the
> > engines_notify I'm talking about above.
>
> I haven't looked very deeply but please double check the path from
> execlists_submission_tasklet -> execlists_reset -> intel_engine_reset ->
> __intel_engine_reset -> execlists_reset_rewind -> execlists_reset_csb ->
> execlists_reset_active -> __i915_request_reset -> mark_guilty ->
> i915_gem_context_put.

Thanks for pointing this out, I'll add it to the commit message.

More stuff to fix, yay.

> >> Otherwise I did not really get from the commit message is this patch
> >> fixing an existing problem or preparing something for the future. If the
> >> former then as I wrote above - I am pretty sure there are call sites
> >> from the tasklet already.
> >>
> >> Regards,
> >>
> >> Tvrtko
> >>
> >>> Unfortunately untangling that requires some major surgery, and we have
> >>> a few i915_gem_context reference counting bugs that need fixing, and
> >>> they blow in the current hardirq calling context, so we need a
> >>> stop-gap measure.
> >
> > I guess this para wasn't clear, but subsequent patches fix the
> > refcount bugs and need this prep patch here.
>
> So up to where in the series are those fixes and where other stuff
> follows? Worth spliting and having cover letters perhaps? Is the fixing
> part applicable to the existing code or only comes to play with the
> syncobj single timeline changes?

There's Fixes: lines. One is timeline syncobj, the other is 2 years old.
-Daniel

>
> Regards,
>
> Tvrtko
>
> > -Daniel
> >
> >>>
> >>> Put a FIXME comment in when this should be removable again.
> >>>
> >>> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> >>> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> >>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> >>> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> >>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> >>> Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
> >>> Cc: Matthew Auld <matthew.auld@intel.com>
> >>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> >>> Cc: Dave Airlie <airlied@redhat.com>
> >>> Cc: Jason Ekstrand <jason@jlekstrand.net>
> >>> ---
> >>>    drivers/gpu/drm/i915/gem/i915_gem_context.c       | 13 +++++++++++--
> >>>    drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 12 ++++++++++++
> >>>    2 files changed, 23 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> >>> index fd169cf2f75a..051bc357ff65 100644
> >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> >>> @@ -986,9 +986,10 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx,
> >>>        return err;
> >>>    }
> >>>
> >>> -void i915_gem_context_release(struct kref *ref)
> >>> +static void i915_gem_context_release_work(struct work_struct *work)
> >>>    {
> >>> -     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
> >>> +     struct i915_gem_context *ctx = container_of(work, typeof(*ctx),
> >>> +                                                 release_work);
> >>>
> >>>        trace_i915_context_free(ctx);
> >>>        GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
> >>> @@ -1002,6 +1003,13 @@ void i915_gem_context_release(struct kref *ref)
> >>>        kfree_rcu(ctx, rcu);
> >>>    }
> >>>
> >>> +void i915_gem_context_release(struct kref *ref)
> >>> +{
> >>> +     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
> >>> +
> >>> +     queue_work(ctx->i915->wq, &ctx->release_work);
> >>> +}
> >>> +
> >>>    static inline struct i915_gem_engines *
> >>>    __context_engines_static(const struct i915_gem_context *ctx)
> >>>    {
> >>> @@ -1303,6 +1311,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
> >>>        ctx->sched = pc->sched;
> >>>        mutex_init(&ctx->mutex);
> >>>        INIT_LIST_HEAD(&ctx->link);
> >>> +     INIT_WORK(&ctx->release_work, i915_gem_context_release_work);
> >>>
> >>>        spin_lock_init(&ctx->stale.lock);
> >>>        INIT_LIST_HEAD(&ctx->stale.engines);
> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> >>> index 94c03a97cb77..0c38789bd4a8 100644
> >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> >>> @@ -288,6 +288,18 @@ struct i915_gem_context {
> >>>         */
> >>>        struct kref ref;
> >>>
> >>> +     /**
> >>> +      * @release_work:
> >>> +      *
> >>> +      * Work item for deferred cleanup, since i915_gem_context_put() tends to
> >>> +      * be called from hardirq context.
> >>> +      *
> >>> +      * FIXME: The only real reason for this is &i915_gem_engines.fence, all
> >>> +      * other callers are from process context and need at most some mild
> >>> +      * shuffling to pull the i915_gem_context_put() call out of a spinlock.
> >>> +      */
> >>> +     struct work_struct release_work;
> >>> +
> >>>        /**
> >>>         * @rcu: rcu_head for deferred freeing.
> >>>         */
> >>>
> >
> >
> >
Tvrtko Ursulin Sept. 3, 2021, 10:40 a.m. UTC | #5
On 02/09/2021 21:02, Daniel Vetter wrote:
> On Thu, Sep 2, 2021 at 6:20 PM Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
>> On 02/09/2021 16:05, Daniel Vetter wrote:
>>> On Thu, Sep 2, 2021 at 2:42 PM Tvrtko Ursulin
>>> <tvrtko.ursulin@linux.intel.com> wrote:
>>>>
>>>>
>>>> On 13/08/2021 21:30, Daniel Vetter wrote:
>>>>> The only reason for this really is the i915_gem_engines->fence
>>>>> callback engines_notify(), which exists purely as a fairly funky
>>>>> reference counting scheme for that. Otherwise all other callers are
>>>>> from process context, and generally fairly benign locking context.
>>>>
>>>> There is reset which definitely isn't process context.
>>>
>>> gpu reset runs in process context. The tasklet context is the
>>> engines_notify I'm talking about above.
>>
>> I haven't looked very deeply but please double check the path from
>> execlists_submission_tasklet -> execlists_reset -> intel_engine_reset ->
>> __intel_engine_reset -> execlists_reset_rewind -> execlists_reset_csb ->
>> execlists_reset_active -> __i915_request_reset -> mark_guilty ->
>> i915_gem_context_put.
> 
> Thanks for pointing this out, I'll add it to the commit message.
> 
> More stuff to fix, yay.
> 
>>>> Otherwise I did not really get from the commit message is this patch
>>>> fixing an existing problem or preparing something for the future. If the
>>>> former then as I wrote above - I am pretty sure there are call sites
>>>> from the tasklet already.
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>>> Unfortunately untangling that requires some major surgery, and we have
>>>>> a few i915_gem_context reference counting bugs that need fixing, and
>>>>> they blow in the current hardirq calling context, so we need a
>>>>> stop-gap measure.
>>>
>>> I guess this para wasn't clear, but subsequent patches fix the
>>> refcount bugs and need this prep patch here.
>>
>> So up to where in the series are those fixes and where other stuff
>> follows? Worth spliting and having cover letters perhaps? Is the fixing
>> part applicable to the existing code or only comes to play with the
>> syncobj single timeline changes?
> 
> There's Fixes: lines. One is timeline syncobj, the other is 2 years old.

So first two patches are standalone and fix the immediate bug? Could you
describe the composition and doings of the series in a cover letter so
it's possible to have an overview of the chunk of work tackled?

Regards,

Tvrtko

> -Daniel
> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>> -Daniel
>>>
>>>>>
>>>>> Put a FIXME comment in when this should be removable again.
>>>>>
>>>>> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
>>>>> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
>>>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>>>> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>>>> Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
>>>>> Cc: Matthew Auld <matthew.auld@intel.com>
>>>>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>>>> Cc: Dave Airlie <airlied@redhat.com>
>>>>> Cc: Jason Ekstrand <jason@jlekstrand.net>
>>>>> ---
>>>>>     drivers/gpu/drm/i915/gem/i915_gem_context.c       | 13 +++++++++++--
>>>>>     drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 12 ++++++++++++
>>>>>     2 files changed, 23 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>>>> index fd169cf2f75a..051bc357ff65 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>>>> @@ -986,9 +986,10 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx,
>>>>>         return err;
>>>>>     }
>>>>>
>>>>> -void i915_gem_context_release(struct kref *ref)
>>>>> +static void i915_gem_context_release_work(struct work_struct *work)
>>>>>     {
>>>>> -     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
>>>>> +     struct i915_gem_context *ctx = container_of(work, typeof(*ctx),
>>>>> +                                                 release_work);
>>>>>
>>>>>         trace_i915_context_free(ctx);
>>>>>         GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
>>>>> @@ -1002,6 +1003,13 @@ void i915_gem_context_release(struct kref *ref)
>>>>>         kfree_rcu(ctx, rcu);
>>>>>     }
>>>>>
>>>>> +void i915_gem_context_release(struct kref *ref)
>>>>> +{
>>>>> +     struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
>>>>> +
>>>>> +     queue_work(ctx->i915->wq, &ctx->release_work);
>>>>> +}
>>>>> +
>>>>>     static inline struct i915_gem_engines *
>>>>>     __context_engines_static(const struct i915_gem_context *ctx)
>>>>>     {
>>>>> @@ -1303,6 +1311,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
>>>>>         ctx->sched = pc->sched;
>>>>>         mutex_init(&ctx->mutex);
>>>>>         INIT_LIST_HEAD(&ctx->link);
>>>>> +     INIT_WORK(&ctx->release_work, i915_gem_context_release_work);
>>>>>
>>>>>         spin_lock_init(&ctx->stale.lock);
>>>>>         INIT_LIST_HEAD(&ctx->stale.engines);
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>>>> index 94c03a97cb77..0c38789bd4a8 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>>>> @@ -288,6 +288,18 @@ struct i915_gem_context {
>>>>>          */
>>>>>         struct kref ref;
>>>>>
>>>>> +     /**
>>>>> +      * @release_work:
>>>>> +      *
>>>>> +      * Work item for deferred cleanup, since i915_gem_context_put() tends to
>>>>> +      * be called from hardirq context.
>>>>> +      *
>>>>> +      * FIXME: The only real reason for this is &i915_gem_engines.fence, all
>>>>> +      * other callers are from process context and need at most some mild
>>>>> +      * shuffling to pull the i915_gem_context_put() call out of a spinlock.
>>>>> +      */
>>>>> +     struct work_struct release_work;
>>>>> +
>>>>>         /**
>>>>>          * @rcu: rcu_head for deferred freeing.
>>>>>          */
>>>>>
>>>
>>>
>>>
> 
> 
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index fd169cf2f75a..051bc357ff65 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -986,9 +986,10 @@  static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx,
 	return err;
 }
 
-void i915_gem_context_release(struct kref *ref)
+static void i915_gem_context_release_work(struct work_struct *work)
 {
-	struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
+	struct i915_gem_context *ctx = container_of(work, typeof(*ctx),
+						    release_work);
 
 	trace_i915_context_free(ctx);
 	GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
@@ -1002,6 +1003,13 @@  void i915_gem_context_release(struct kref *ref)
 	kfree_rcu(ctx, rcu);
 }
 
+void i915_gem_context_release(struct kref *ref)
+{
+	struct i915_gem_context *ctx = container_of(ref, typeof(*ctx), ref);
+
+	queue_work(ctx->i915->wq, &ctx->release_work);
+}
+
 static inline struct i915_gem_engines *
 __context_engines_static(const struct i915_gem_context *ctx)
 {
@@ -1303,6 +1311,7 @@  i915_gem_create_context(struct drm_i915_private *i915,
 	ctx->sched = pc->sched;
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->link);
+	INIT_WORK(&ctx->release_work, i915_gem_context_release_work);
 
 	spin_lock_init(&ctx->stale.lock);
 	INIT_LIST_HEAD(&ctx->stale.engines);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 94c03a97cb77..0c38789bd4a8 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -288,6 +288,18 @@  struct i915_gem_context {
 	 */
 	struct kref ref;
 
+	/**
+	 * @release_work:
+	 *
+	 * Work item for deferred cleanup, since i915_gem_context_put() tends to
+	 * be called from hardirq context.
+	 *
+	 * FIXME: The only real reason for this is &i915_gem_engines.fence, all
+	 * other callers are from process context and need at most some mild
+	 * shuffling to pull the i915_gem_context_put() call out of a spinlock.
+	 */
+	struct work_struct release_work;
+
 	/**
 	 * @rcu: rcu_head for deferred freeing.
 	 */