diff mbox series

[BrownBag] drm/i915/gtt: Neuter the deferred unbind callback from gen6_ppgtt_cleanup

Message ID 20190524064529.20514-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [BrownBag] drm/i915/gtt: Neuter the deferred unbind callback from gen6_ppgtt_cleanup | expand

Commit Message

Chris Wilson May 24, 2019, 6:45 a.m. UTC
Having deferred the vma destruction to a worker where we can acquire the
struct_mutex, we have to avoid chasing back into the now destroyed
ppgtt. The pd_vma is special in having a custom unbind function to scan
for unused pages despite the VMA itself being notionally part of the
GGTT. As such, we need to disable that callback to avoid a
use-after-free.

This unfortunately blew up so early during boot that CI declared the
machine unreachable as opposed to being the major failure it was. Oops.

Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

Comments

Tvrtko Ursulin May 24, 2019, 8:13 a.m. UTC | #1
On 24/05/2019 07:45, Chris Wilson wrote:
> Having deferred the vma destruction to a worker where we can acquire the
> struct_mutex, we have to avoid chasing back into the now destroyed
> ppgtt. The pd_vma is special in having a custom unbind function to scan
> for unused pages despite the VMA itself being notionally part of the
> GGTT. As such, we need to disable that callback to avoid a
> use-after-free.
> 
> This unfortunately blew up so early during boot that CI declared the
> machine unreachable as opposed to being the major failure it was. Oops.
> 
> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
>   1 file changed, 28 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 8d8a4b0ad4d9..266baa11df64 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
>   	kfree(work);
>   }
>   
> +static int nop_set_pages(struct i915_vma *vma)
> +{
> +	return -ENODEV;
> +}
> +
> +static void nop_clear_pages(struct i915_vma *vma)
> +{
> +}
> +
> +static int nop_bind(struct i915_vma *vma,
> +		    enum i915_cache_level cache_level,
> +		    u32 unused)
> +{
> +	return -ENODEV;
> +}
> +
> +static void nop_unbind(struct i915_vma *vma)
> +{
> +}
> +
> +static const struct i915_vma_ops nop_vma_ops = {
> +	.set_pages = nop_set_pages,
> +	.clear_pages = nop_clear_pages,
> +	.bind_vma = nop_bind,
> +	.unbind_vma = nop_unbind,
> +};
> +
>   static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>   {
>   	struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>   	/* FIXME remove the struct_mutex to bring the locking under control */
>   	INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
>   	work->vma = ppgtt->vma;
> +	work->vma->ops = &nop_vma_ops;

Could we use some asserts before overriding the vma ops? Like 
GEM_BUG_ON(vma->pages)? And something for still bound?

>   	schedule_work(&work->base);
>   
>   	gen6_ppgtt_free_pd(ppgtt);
> 

Regards,

Tvrtko
Chris Wilson May 24, 2019, 8:17 a.m. UTC | #2
Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
> 
> On 24/05/2019 07:45, Chris Wilson wrote:
> > Having deferred the vma destruction to a worker where we can acquire the
> > struct_mutex, we have to avoid chasing back into the now destroyed
> > ppgtt. The pd_vma is special in having a custom unbind function to scan
> > for unused pages despite the VMA itself being notionally part of the
> > GGTT. As such, we need to disable that callback to avoid a
> > use-after-free.
> > 
> > This unfortunately blew up so early during boot that CI declared the
> > machine unreachable as opposed to being the major failure it was. Oops.
> > 
> > Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
> >   1 file changed, 28 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index 8d8a4b0ad4d9..266baa11df64 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
> >       kfree(work);
> >   }
> >   
> > +static int nop_set_pages(struct i915_vma *vma)
> > +{
> > +     return -ENODEV;
> > +}
> > +
> > +static void nop_clear_pages(struct i915_vma *vma)
> > +{
> > +}
> > +
> > +static int nop_bind(struct i915_vma *vma,
> > +                 enum i915_cache_level cache_level,
> > +                 u32 unused)
> > +{
> > +     return -ENODEV;
> > +}
> > +
> > +static void nop_unbind(struct i915_vma *vma)
> > +{
> > +}
> > +
> > +static const struct i915_vma_ops nop_vma_ops = {
> > +     .set_pages = nop_set_pages,
> > +     .clear_pages = nop_clear_pages,
> > +     .bind_vma = nop_bind,
> > +     .unbind_vma = nop_unbind,
> > +};
> > +
> >   static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >   {
> >       struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
> > @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >       /* FIXME remove the struct_mutex to bring the locking under control */
> >       INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
> >       work->vma = ppgtt->vma;
> > +     work->vma->ops = &nop_vma_ops;
> 
> Could we use some asserts before overriding the vma ops? Like 
> GEM_BUG_ON(vma->pages)? And something for still bound?

It technically still is bound as it is in the GGTT but currently
unpinned -- that will be checked on destroy, it's just we also get an
unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
-Chris
Tvrtko Ursulin May 24, 2019, 8:23 a.m. UTC | #3
On 24/05/2019 09:17, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
>>
>> On 24/05/2019 07:45, Chris Wilson wrote:
>>> Having deferred the vma destruction to a worker where we can acquire the
>>> struct_mutex, we have to avoid chasing back into the now destroyed
>>> ppgtt. The pd_vma is special in having a custom unbind function to scan
>>> for unused pages despite the VMA itself being notionally part of the
>>> GGTT. As such, we need to disable that callback to avoid a
>>> use-after-free.
>>>
>>> This unfortunately blew up so early during boot that CI declared the
>>> machine unreachable as opposed to being the major failure it was. Oops.
>>>
>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
>>>    1 file changed, 28 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> index 8d8a4b0ad4d9..266baa11df64 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
>>>        kfree(work);
>>>    }
>>>    
>>> +static int nop_set_pages(struct i915_vma *vma)
>>> +{
>>> +     return -ENODEV;
>>> +}
>>> +
>>> +static void nop_clear_pages(struct i915_vma *vma)
>>> +{
>>> +}
>>> +
>>> +static int nop_bind(struct i915_vma *vma,
>>> +                 enum i915_cache_level cache_level,
>>> +                 u32 unused)
>>> +{
>>> +     return -ENODEV;
>>> +}
>>> +
>>> +static void nop_unbind(struct i915_vma *vma)
>>> +{
>>> +}
>>> +
>>> +static const struct i915_vma_ops nop_vma_ops = {
>>> +     .set_pages = nop_set_pages,
>>> +     .clear_pages = nop_clear_pages,
>>> +     .bind_vma = nop_bind,
>>> +     .unbind_vma = nop_unbind,
>>> +};
>>> +
>>>    static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>    {
>>>        struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>        /* FIXME remove the struct_mutex to bring the locking under control */
>>>        INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
>>>        work->vma = ppgtt->vma;
>>> +     work->vma->ops = &nop_vma_ops;
>>
>> Could we use some asserts before overriding the vma ops? Like
>> GEM_BUG_ON(vma->pages)? And something for still bound?
> 
> It technically still is bound as it is in the GGTT but currently
> unpinned -- that will be checked on destroy, it's just we also get an
> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).

If we are getting the unbind callback and we nop-ed it, who will 
actually do it's job?

Regards,

Tvrtko
Chris Wilson May 24, 2019, 8:29 a.m. UTC | #4
Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
> 
> On 24/05/2019 09:17, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
> >>
> >> On 24/05/2019 07:45, Chris Wilson wrote:
> >>> Having deferred the vma destruction to a worker where we can acquire the
> >>> struct_mutex, we have to avoid chasing back into the now destroyed
> >>> ppgtt. The pd_vma is special in having a custom unbind function to scan
> >>> for unused pages despite the VMA itself being notionally part of the
> >>> GGTT. As such, we need to disable that callback to avoid a
> >>> use-after-free.
> >>>
> >>> This unfortunately blew up so early during boot that CI declared the
> >>> machine unreachable as opposed to being the major failure it was. Oops.
> >>>
> >>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> >>> ---
> >>>    drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
> >>>    1 file changed, 28 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>> index 8d8a4b0ad4d9..266baa11df64 100644
> >>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>> @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
> >>>        kfree(work);
> >>>    }
> >>>    
> >>> +static int nop_set_pages(struct i915_vma *vma)
> >>> +{
> >>> +     return -ENODEV;
> >>> +}
> >>> +
> >>> +static void nop_clear_pages(struct i915_vma *vma)
> >>> +{
> >>> +}
> >>> +
> >>> +static int nop_bind(struct i915_vma *vma,
> >>> +                 enum i915_cache_level cache_level,
> >>> +                 u32 unused)
> >>> +{
> >>> +     return -ENODEV;
> >>> +}
> >>> +
> >>> +static void nop_unbind(struct i915_vma *vma)
> >>> +{
> >>> +}
> >>> +
> >>> +static const struct i915_vma_ops nop_vma_ops = {
> >>> +     .set_pages = nop_set_pages,
> >>> +     .clear_pages = nop_clear_pages,
> >>> +     .bind_vma = nop_bind,
> >>> +     .unbind_vma = nop_unbind,
> >>> +};
> >>> +
> >>>    static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >>>    {
> >>>        struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
> >>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >>>        /* FIXME remove the struct_mutex to bring the locking under control */
> >>>        INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
> >>>        work->vma = ppgtt->vma;
> >>> +     work->vma->ops = &nop_vma_ops;
> >>
> >> Could we use some asserts before overriding the vma ops? Like
> >> GEM_BUG_ON(vma->pages)? And something for still bound?
> > 
> > It technically still is bound as it is in the GGTT but currently
> > unpinned -- that will be checked on destroy, it's just we also get an
> > unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
> 
> If we are getting the unbind callback and we nop-ed it, who will 
> actually do it's job?

The callback is just a hook for us to prune within the ppgtt.
It still is removed from GGTT by i915_vma_unbind().
-Chris
Tvrtko Ursulin May 24, 2019, 8:31 a.m. UTC | #5
On 24/05/2019 09:29, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
>>
>> On 24/05/2019 09:17, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
>>>>
>>>> On 24/05/2019 07:45, Chris Wilson wrote:
>>>>> Having deferred the vma destruction to a worker where we can acquire the
>>>>> struct_mutex, we have to avoid chasing back into the now destroyed
>>>>> ppgtt. The pd_vma is special in having a custom unbind function to scan
>>>>> for unused pages despite the VMA itself being notionally part of the
>>>>> GGTT. As such, we need to disable that callback to avoid a
>>>>> use-after-free.
>>>>>
>>>>> This unfortunately blew up so early during boot that CI declared the
>>>>> machine unreachable as opposed to being the major failure it was. Oops.
>>>>>
>>>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>>>>> ---
>>>>>     drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
>>>>>     1 file changed, 28 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> index 8d8a4b0ad4d9..266baa11df64 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
>>>>>         kfree(work);
>>>>>     }
>>>>>     
>>>>> +static int nop_set_pages(struct i915_vma *vma)
>>>>> +{
>>>>> +     return -ENODEV;
>>>>> +}
>>>>> +
>>>>> +static void nop_clear_pages(struct i915_vma *vma)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static int nop_bind(struct i915_vma *vma,
>>>>> +                 enum i915_cache_level cache_level,
>>>>> +                 u32 unused)
>>>>> +{
>>>>> +     return -ENODEV;
>>>>> +}
>>>>> +
>>>>> +static void nop_unbind(struct i915_vma *vma)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static const struct i915_vma_ops nop_vma_ops = {
>>>>> +     .set_pages = nop_set_pages,
>>>>> +     .clear_pages = nop_clear_pages,
>>>>> +     .bind_vma = nop_bind,
>>>>> +     .unbind_vma = nop_unbind,
>>>>> +};
>>>>> +
>>>>>     static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>>>     {
>>>>>         struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
>>>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>>>         /* FIXME remove the struct_mutex to bring the locking under control */
>>>>>         INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
>>>>>         work->vma = ppgtt->vma;
>>>>> +     work->vma->ops = &nop_vma_ops;
>>>>
>>>> Could we use some asserts before overriding the vma ops? Like
>>>> GEM_BUG_ON(vma->pages)? And something for still bound?
>>>
>>> It technically still is bound as it is in the GGTT but currently
>>> unpinned -- that will be checked on destroy, it's just we also get an
>>> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
>>
>> If we are getting the unbind callback and we nop-ed it, who will
>> actually do it's job?
> 
> The callback is just a hook for us to prune within the ppgtt.
> It still is removed from GGTT by i915_vma_unbind().

So it needs GEM_BUG_ON(ppgtt->scan_for_unused_pt) before overriding the 
unbind?

Regards,

Tvrtko
Chris Wilson May 24, 2019, 8:36 a.m. UTC | #6
Quoting Tvrtko Ursulin (2019-05-24 09:31:45)
> 
> On 24/05/2019 09:29, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
> >>
> >> On 24/05/2019 09:17, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
> >>>>
> >>>> On 24/05/2019 07:45, Chris Wilson wrote:
> >>>>> Having deferred the vma destruction to a worker where we can acquire the
> >>>>> struct_mutex, we have to avoid chasing back into the now destroyed
> >>>>> ppgtt. The pd_vma is special in having a custom unbind function to scan
> >>>>> for unused pages despite the VMA itself being notionally part of the
> >>>>> GGTT. As such, we need to disable that callback to avoid a
> >>>>> use-after-free.
> >>>>>
> >>>>> This unfortunately blew up so early during boot that CI declared the
> >>>>> machine unreachable as opposed to being the major failure it was. Oops.
> >>>>>
> >>>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
> >>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> >>>>> ---
> >>>>>     drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
> >>>>>     1 file changed, 28 insertions(+)
> >>>>>
> >>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>> index 8d8a4b0ad4d9..266baa11df64 100644
> >>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>> @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
> >>>>>         kfree(work);
> >>>>>     }
> >>>>>     
> >>>>> +static int nop_set_pages(struct i915_vma *vma)
> >>>>> +{
> >>>>> +     return -ENODEV;
> >>>>> +}
> >>>>> +
> >>>>> +static void nop_clear_pages(struct i915_vma *vma)
> >>>>> +{
> >>>>> +}
> >>>>> +
> >>>>> +static int nop_bind(struct i915_vma *vma,
> >>>>> +                 enum i915_cache_level cache_level,
> >>>>> +                 u32 unused)
> >>>>> +{
> >>>>> +     return -ENODEV;
> >>>>> +}
> >>>>> +
> >>>>> +static void nop_unbind(struct i915_vma *vma)
> >>>>> +{
> >>>>> +}
> >>>>> +
> >>>>> +static const struct i915_vma_ops nop_vma_ops = {
> >>>>> +     .set_pages = nop_set_pages,
> >>>>> +     .clear_pages = nop_clear_pages,
> >>>>> +     .bind_vma = nop_bind,
> >>>>> +     .unbind_vma = nop_unbind,
> >>>>> +};
> >>>>> +
> >>>>>     static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >>>>>     {
> >>>>>         struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
> >>>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >>>>>         /* FIXME remove the struct_mutex to bring the locking under control */
> >>>>>         INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
> >>>>>         work->vma = ppgtt->vma;
> >>>>> +     work->vma->ops = &nop_vma_ops;
> >>>>
> >>>> Could we use some asserts before overriding the vma ops? Like
> >>>> GEM_BUG_ON(vma->pages)? And something for still bound?
> >>>
> >>> It technically still is bound as it is in the GGTT but currently
> >>> unpinned -- that will be checked on destroy, it's just we also get an
> >>> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
> >>
> >> If we are getting the unbind callback and we nop-ed it, who will
> >> actually do it's job?
> > 
> > The callback is just a hook for us to prune within the ppgtt.
> > It still is removed from GGTT by i915_vma_unbind().
> 
> So it needs GEM_BUG_ON(ppgtt->scan_for_unused_pt) before overriding the 
> unbind?

No. They get freed by the cleanup itself. The scan is just an
opportunistic prune if either the context/mm is evicted but still alive.
-Chris
Tvrtko Ursulin May 24, 2019, 8:51 a.m. UTC | #7
On 24/05/2019 09:36, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-05-24 09:31:45)
>>
>> On 24/05/2019 09:29, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
>>>>
>>>> On 24/05/2019 09:17, Chris Wilson wrote:
>>>>> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
>>>>>>
>>>>>> On 24/05/2019 07:45, Chris Wilson wrote:
>>>>>>> Having deferred the vma destruction to a worker where we can acquire the
>>>>>>> struct_mutex, we have to avoid chasing back into the now destroyed
>>>>>>> ppgtt. The pd_vma is special in having a custom unbind function to scan
>>>>>>> for unused pages despite the VMA itself being notionally part of the
>>>>>>> GGTT. As such, we need to disable that callback to avoid a
>>>>>>> use-after-free.
>>>>>>>
>>>>>>> This unfortunately blew up so early during boot that CI declared the
>>>>>>> machine unreachable as opposed to being the major failure it was. Oops.
>>>>>>>
>>>>>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>>>>>>> ---
>>>>>>>      drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
>>>>>>>      1 file changed, 28 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>> index 8d8a4b0ad4d9..266baa11df64 100644
>>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>> @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
>>>>>>>          kfree(work);
>>>>>>>      }
>>>>>>>      
>>>>>>> +static int nop_set_pages(struct i915_vma *vma)
>>>>>>> +{
>>>>>>> +     return -ENODEV;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void nop_clear_pages(struct i915_vma *vma)
>>>>>>> +{
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int nop_bind(struct i915_vma *vma,
>>>>>>> +                 enum i915_cache_level cache_level,
>>>>>>> +                 u32 unused)
>>>>>>> +{
>>>>>>> +     return -ENODEV;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void nop_unbind(struct i915_vma *vma)
>>>>>>> +{
>>>>>>> +}
>>>>>>> +
>>>>>>> +static const struct i915_vma_ops nop_vma_ops = {
>>>>>>> +     .set_pages = nop_set_pages,
>>>>>>> +     .clear_pages = nop_clear_pages,
>>>>>>> +     .bind_vma = nop_bind,
>>>>>>> +     .unbind_vma = nop_unbind,
>>>>>>> +};
>>>>>>> +
>>>>>>>      static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>>>>>      {
>>>>>>>          struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
>>>>>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>>>>>          /* FIXME remove the struct_mutex to bring the locking under control */
>>>>>>>          INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
>>>>>>>          work->vma = ppgtt->vma;
>>>>>>> +     work->vma->ops = &nop_vma_ops;
>>>>>>
>>>>>> Could we use some asserts before overriding the vma ops? Like
>>>>>> GEM_BUG_ON(vma->pages)? And something for still bound?
>>>>>
>>>>> It technically still is bound as it is in the GGTT but currently
>>>>> unpinned -- that will be checked on destroy, it's just we also get an
>>>>> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
>>>>
>>>> If we are getting the unbind callback and we nop-ed it, who will
>>>> actually do it's job?
>>>
>>> The callback is just a hook for us to prune within the ppgtt.
>>> It still is removed from GGTT by i915_vma_unbind().
>>
>> So it needs GEM_BUG_ON(ppgtt->scan_for_unused_pt) before overriding the
>> unbind?
> 
> No. They get freed by the cleanup itself. The scan is just an
> opportunistic prune if either the context/mm is evicted but still alive.

Then the same assert in gen6_ppgtt_cleanup_work? :)

Regards,

Tvrtko
Chris Wilson May 24, 2019, 8:55 a.m. UTC | #8
Quoting Tvrtko Ursulin (2019-05-24 09:51:46)
> 
> On 24/05/2019 09:36, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-05-24 09:31:45)
> >>
> >> On 24/05/2019 09:29, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
> >>>>
> >>>> On 24/05/2019 09:17, Chris Wilson wrote:
> >>>>> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
> >>>>>>
> >>>>>> On 24/05/2019 07:45, Chris Wilson wrote:
> >>>>>>> Having deferred the vma destruction to a worker where we can acquire the
> >>>>>>> struct_mutex, we have to avoid chasing back into the now destroyed
> >>>>>>> ppgtt. The pd_vma is special in having a custom unbind function to scan
> >>>>>>> for unused pages despite the VMA itself being notionally part of the
> >>>>>>> GGTT. As such, we need to disable that callback to avoid a
> >>>>>>> use-after-free.
> >>>>>>>
> >>>>>>> This unfortunately blew up so early during boot that CI declared the
> >>>>>>> machine unreachable as opposed to being the major failure it was. Oops.
> >>>>>>>
> >>>>>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex for gen6_ppgtt_cleanup")
> >>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>>>>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> >>>>>>> ---
> >>>>>>>      drivers/gpu/drm/i915/i915_gem_gtt.c | 28 ++++++++++++++++++++++++++++
> >>>>>>>      1 file changed, 28 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>> index 8d8a4b0ad4d9..266baa11df64 100644
> >>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>> @@ -1847,6 +1847,33 @@ static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
> >>>>>>>          kfree(work);
> >>>>>>>      }
> >>>>>>>      
> >>>>>>> +static int nop_set_pages(struct i915_vma *vma)
> >>>>>>> +{
> >>>>>>> +     return -ENODEV;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static void nop_clear_pages(struct i915_vma *vma)
> >>>>>>> +{
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static int nop_bind(struct i915_vma *vma,
> >>>>>>> +                 enum i915_cache_level cache_level,
> >>>>>>> +                 u32 unused)
> >>>>>>> +{
> >>>>>>> +     return -ENODEV;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static void nop_unbind(struct i915_vma *vma)
> >>>>>>> +{
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static const struct i915_vma_ops nop_vma_ops = {
> >>>>>>> +     .set_pages = nop_set_pages,
> >>>>>>> +     .clear_pages = nop_clear_pages,
> >>>>>>> +     .bind_vma = nop_bind,
> >>>>>>> +     .unbind_vma = nop_unbind,
> >>>>>>> +};
> >>>>>>> +
> >>>>>>>      static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >>>>>>>      {
> >>>>>>>          struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
> >>>>>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >>>>>>>          /* FIXME remove the struct_mutex to bring the locking under control */
> >>>>>>>          INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
> >>>>>>>          work->vma = ppgtt->vma;
> >>>>>>> +     work->vma->ops = &nop_vma_ops;
> >>>>>>
> >>>>>> Could we use some asserts before overriding the vma ops? Like
> >>>>>> GEM_BUG_ON(vma->pages)? And something for still bound?
> >>>>>
> >>>>> It technically still is bound as it is in the GGTT but currently
> >>>>> unpinned -- that will be checked on destroy, it's just we also get an
> >>>>> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
> >>>>
> >>>> If we are getting the unbind callback and we nop-ed it, who will
> >>>> actually do it's job?
> >>>
> >>> The callback is just a hook for us to prune within the ppgtt.
> >>> It still is removed from GGTT by i915_vma_unbind().
> >>
> >> So it needs GEM_BUG_ON(ppgtt->scan_for_unused_pt) before overriding the
> >> unbind?
> > 
> > No. They get freed by the cleanup itself. The scan is just an
> > opportunistic prune if either the context/mm is evicted but still alive.
> 
> Then the same assert in gen6_ppgtt_cleanup_work? :)

ppgtt is dead.
-Chris
Tvrtko Ursulin May 24, 2019, 8:57 a.m. UTC | #9
On 24/05/2019 09:51, Tvrtko Ursulin wrote:
> 
> On 24/05/2019 09:36, Chris Wilson wrote:
>> Quoting Tvrtko Ursulin (2019-05-24 09:31:45)
>>>
>>> On 24/05/2019 09:29, Chris Wilson wrote:
>>>> Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
>>>>>
>>>>> On 24/05/2019 09:17, Chris Wilson wrote:
>>>>>> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
>>>>>>>
>>>>>>> On 24/05/2019 07:45, Chris Wilson wrote:
>>>>>>>> Having deferred the vma destruction to a worker where we can 
>>>>>>>> acquire the
>>>>>>>> struct_mutex, we have to avoid chasing back into the now destroyed
>>>>>>>> ppgtt. The pd_vma is special in having a custom unbind function 
>>>>>>>> to scan
>>>>>>>> for unused pages despite the VMA itself being notionally part of 
>>>>>>>> the
>>>>>>>> GGTT. As such, we need to disable that callback to avoid a
>>>>>>>> use-after-free.
>>>>>>>>
>>>>>>>> This unfortunately blew up so early during boot that CI declared 
>>>>>>>> the
>>>>>>>> machine unreachable as opposed to being the major failure it 
>>>>>>>> was. Oops.
>>>>>>>>
>>>>>>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex 
>>>>>>>> for gen6_ppgtt_cleanup")
>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/i915/i915_gem_gtt.c | 28 
>>>>>>>> ++++++++++++++++++++++++++++
>>>>>>>>      1 file changed, 28 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
>>>>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>> index 8d8a4b0ad4d9..266baa11df64 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>> @@ -1847,6 +1847,33 @@ static void 
>>>>>>>> gen6_ppgtt_cleanup_work(struct work_struct *wrk)
>>>>>>>>          kfree(work);
>>>>>>>>      }
>>>>>>>> +static int nop_set_pages(struct i915_vma *vma)
>>>>>>>> +{
>>>>>>>> +     return -ENODEV;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void nop_clear_pages(struct i915_vma *vma)
>>>>>>>> +{
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int nop_bind(struct i915_vma *vma,
>>>>>>>> +                 enum i915_cache_level cache_level,
>>>>>>>> +                 u32 unused)
>>>>>>>> +{
>>>>>>>> +     return -ENODEV;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void nop_unbind(struct i915_vma *vma)
>>>>>>>> +{
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static const struct i915_vma_ops nop_vma_ops = {
>>>>>>>> +     .set_pages = nop_set_pages,
>>>>>>>> +     .clear_pages = nop_clear_pages,
>>>>>>>> +     .bind_vma = nop_bind,
>>>>>>>> +     .unbind_vma = nop_unbind,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>      static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>>>>>>      {
>>>>>>>>          struct gen6_hw_ppgtt *ppgtt = 
>>>>>>>> to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
>>>>>>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct 
>>>>>>>> i915_address_space *vm)
>>>>>>>>          /* FIXME remove the struct_mutex to bring the locking 
>>>>>>>> under control */
>>>>>>>>          INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
>>>>>>>>          work->vma = ppgtt->vma;
>>>>>>>> +     work->vma->ops = &nop_vma_ops;
>>>>>>>
>>>>>>> Could we use some asserts before overriding the vma ops? Like
>>>>>>> GEM_BUG_ON(vma->pages)? And something for still bound?
>>>>>>
>>>>>> It technically still is bound as it is in the GGTT but currently
>>>>>> unpinned -- that will be checked on destroy, it's just we also get an
>>>>>> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
>>>>>
>>>>> If we are getting the unbind callback and we nop-ed it, who will
>>>>> actually do it's job?
>>>>
>>>> The callback is just a hook for us to prune within the ppgtt.
>>>> It still is removed from GGTT by i915_vma_unbind().
>>>
>>> So it needs GEM_BUG_ON(ppgtt->scan_for_unused_pt) before overriding the
>>> unbind?
>>
>> No. They get freed by the cleanup itself. The scan is just an
>> opportunistic prune if either the context/mm is evicted but still alive.
> 
> Then the same assert in gen6_ppgtt_cleanup_work? :)

Okay ppgtt is gone so can't do it.. annoying.. Cleanup seems to support 
your claims but I think we need a BFC (big fat comment) above the vma 
ops override to explains this. With that:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Chris Wilson May 24, 2019, 9:01 a.m. UTC | #10
Quoting Tvrtko Ursulin (2019-05-24 09:57:42)
> 
> On 24/05/2019 09:51, Tvrtko Ursulin wrote:
> > 
> > On 24/05/2019 09:36, Chris Wilson wrote:
> >> Quoting Tvrtko Ursulin (2019-05-24 09:31:45)
> >>>
> >>> On 24/05/2019 09:29, Chris Wilson wrote:
> >>>> Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
> >>>>>
> >>>>> On 24/05/2019 09:17, Chris Wilson wrote:
> >>>>>> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
> >>>>>>>
> >>>>>>> On 24/05/2019 07:45, Chris Wilson wrote:
> >>>>>>>> Having deferred the vma destruction to a worker where we can 
> >>>>>>>> acquire the
> >>>>>>>> struct_mutex, we have to avoid chasing back into the now destroyed
> >>>>>>>> ppgtt. The pd_vma is special in having a custom unbind function 
> >>>>>>>> to scan
> >>>>>>>> for unused pages despite the VMA itself being notionally part of 
> >>>>>>>> the
> >>>>>>>> GGTT. As such, we need to disable that callback to avoid a
> >>>>>>>> use-after-free.
> >>>>>>>>
> >>>>>>>> This unfortunately blew up so early during boot that CI declared 
> >>>>>>>> the
> >>>>>>>> machine unreachable as opposed to being the major failure it 
> >>>>>>>> was. Oops.
> >>>>>>>>
> >>>>>>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex 
> >>>>>>>> for gen6_ppgtt_cleanup")
> >>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>>>>>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> >>>>>>>> ---
> >>>>>>>>      drivers/gpu/drm/i915/i915_gem_gtt.c | 28 
> >>>>>>>> ++++++++++++++++++++++++++++
> >>>>>>>>      1 file changed, 28 insertions(+)
> >>>>>>>>
> >>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
> >>>>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>> index 8d8a4b0ad4d9..266baa11df64 100644
> >>>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>> @@ -1847,6 +1847,33 @@ static void 
> >>>>>>>> gen6_ppgtt_cleanup_work(struct work_struct *wrk)
> >>>>>>>>          kfree(work);
> >>>>>>>>      }
> >>>>>>>> +static int nop_set_pages(struct i915_vma *vma)
> >>>>>>>> +{
> >>>>>>>> +     return -ENODEV;
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static void nop_clear_pages(struct i915_vma *vma)
> >>>>>>>> +{
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static int nop_bind(struct i915_vma *vma,
> >>>>>>>> +                 enum i915_cache_level cache_level,
> >>>>>>>> +                 u32 unused)
> >>>>>>>> +{
> >>>>>>>> +     return -ENODEV;
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static void nop_unbind(struct i915_vma *vma)
> >>>>>>>> +{
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static const struct i915_vma_ops nop_vma_ops = {
> >>>>>>>> +     .set_pages = nop_set_pages,
> >>>>>>>> +     .clear_pages = nop_clear_pages,
> >>>>>>>> +     .bind_vma = nop_bind,
> >>>>>>>> +     .unbind_vma = nop_unbind,
> >>>>>>>> +};
> >>>>>>>> +
> >>>>>>>>      static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
> >>>>>>>>      {
> >>>>>>>>          struct gen6_hw_ppgtt *ppgtt = 
> >>>>>>>> to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
> >>>>>>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct 
> >>>>>>>> i915_address_space *vm)
> >>>>>>>>          /* FIXME remove the struct_mutex to bring the locking 
> >>>>>>>> under control */
> >>>>>>>>          INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
> >>>>>>>>          work->vma = ppgtt->vma;
> >>>>>>>> +     work->vma->ops = &nop_vma_ops;
> >>>>>>>
> >>>>>>> Could we use some asserts before overriding the vma ops? Like
> >>>>>>> GEM_BUG_ON(vma->pages)? And something for still bound?
> >>>>>>
> >>>>>> It technically still is bound as it is in the GGTT but currently
> >>>>>> unpinned -- that will be checked on destroy, it's just we also get an
> >>>>>> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
> >>>>>
> >>>>> If we are getting the unbind callback and we nop-ed it, who will
> >>>>> actually do it's job?
> >>>>
> >>>> The callback is just a hook for us to prune within the ppgtt.
> >>>> It still is removed from GGTT by i915_vma_unbind().
> >>>
> >>> So it needs GEM_BUG_ON(ppgtt->scan_for_unused_pt) before overriding the
> >>> unbind?
> >>
> >> No. They get freed by the cleanup itself. The scan is just an
> >> opportunistic prune if either the context/mm is evicted but still alive.
> > 
> > Then the same assert in gen6_ppgtt_cleanup_work? :)
> 
> Okay ppgtt is gone so can't do it.. annoying.. Cleanup seems to support 
> your claims but I think we need a BFC (big fat comment) above the vma 
> ops override to explains this. With that:

It has FIXME! I really do hope this is short term...
-Chris
Tvrtko Ursulin May 24, 2019, 9:08 a.m. UTC | #11
On 24/05/2019 10:01, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-05-24 09:57:42)
>>
>> On 24/05/2019 09:51, Tvrtko Ursulin wrote:
>>>
>>> On 24/05/2019 09:36, Chris Wilson wrote:
>>>> Quoting Tvrtko Ursulin (2019-05-24 09:31:45)
>>>>>
>>>>> On 24/05/2019 09:29, Chris Wilson wrote:
>>>>>> Quoting Tvrtko Ursulin (2019-05-24 09:23:40)
>>>>>>>
>>>>>>> On 24/05/2019 09:17, Chris Wilson wrote:
>>>>>>>> Quoting Tvrtko Ursulin (2019-05-24 09:13:14)
>>>>>>>>>
>>>>>>>>> On 24/05/2019 07:45, Chris Wilson wrote:
>>>>>>>>>> Having deferred the vma destruction to a worker where we can
>>>>>>>>>> acquire the
>>>>>>>>>> struct_mutex, we have to avoid chasing back into the now destroyed
>>>>>>>>>> ppgtt. The pd_vma is special in having a custom unbind function
>>>>>>>>>> to scan
>>>>>>>>>> for unused pages despite the VMA itself being notionally part of
>>>>>>>>>> the
>>>>>>>>>> GGTT. As such, we need to disable that callback to avoid a
>>>>>>>>>> use-after-free.
>>>>>>>>>>
>>>>>>>>>> This unfortunately blew up so early during boot that CI declared
>>>>>>>>>> the
>>>>>>>>>> machine unreachable as opposed to being the major failure it
>>>>>>>>>> was. Oops.
>>>>>>>>>>
>>>>>>>>>> Fixes: d3622099c76f ("drm/i915/gtt: Always acquire struct_mutex
>>>>>>>>>> for gen6_ppgtt_cleanup")
>>>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>>>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/gpu/drm/i915/i915_gem_gtt.c | 28
>>>>>>>>>> ++++++++++++++++++++++++++++
>>>>>>>>>>       1 file changed, 28 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> index 8d8a4b0ad4d9..266baa11df64 100644
>>>>>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> @@ -1847,6 +1847,33 @@ static void
>>>>>>>>>> gen6_ppgtt_cleanup_work(struct work_struct *wrk)
>>>>>>>>>>           kfree(work);
>>>>>>>>>>       }
>>>>>>>>>> +static int nop_set_pages(struct i915_vma *vma)
>>>>>>>>>> +{
>>>>>>>>>> +     return -ENODEV;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void nop_clear_pages(struct i915_vma *vma)
>>>>>>>>>> +{
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static int nop_bind(struct i915_vma *vma,
>>>>>>>>>> +                 enum i915_cache_level cache_level,
>>>>>>>>>> +                 u32 unused)
>>>>>>>>>> +{
>>>>>>>>>> +     return -ENODEV;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void nop_unbind(struct i915_vma *vma)
>>>>>>>>>> +{
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static const struct i915_vma_ops nop_vma_ops = {
>>>>>>>>>> +     .set_pages = nop_set_pages,
>>>>>>>>>> +     .clear_pages = nop_clear_pages,
>>>>>>>>>> +     .bind_vma = nop_bind,
>>>>>>>>>> +     .unbind_vma = nop_unbind,
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>>       static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
>>>>>>>>>>       {
>>>>>>>>>>           struct gen6_hw_ppgtt *ppgtt =
>>>>>>>>>> to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
>>>>>>>>>> @@ -1855,6 +1882,7 @@ static void gen6_ppgtt_cleanup(struct
>>>>>>>>>> i915_address_space *vm)
>>>>>>>>>>           /* FIXME remove the struct_mutex to bring the locking
>>>>>>>>>> under control */
>>>>>>>>>>           INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
>>>>>>>>>>           work->vma = ppgtt->vma;
>>>>>>>>>> +     work->vma->ops = &nop_vma_ops;
>>>>>>>>>
>>>>>>>>> Could we use some asserts before overriding the vma ops? Like
>>>>>>>>> GEM_BUG_ON(vma->pages)? And something for still bound?
>>>>>>>>
>>>>>>>> It technically still is bound as it is in the GGTT but currently
>>>>>>>> unpinned -- that will be checked on destroy, it's just we also get an
>>>>>>>> unbind callback. vma->pages doesn't exist for this (set to ERR_PTR).
>>>>>>>
>>>>>>> If we are getting the unbind callback and we nop-ed it, who will
>>>>>>> actually do it's job?
>>>>>>
>>>>>> The callback is just a hook for us to prune within the ppgtt.
>>>>>> It still is removed from GGTT by i915_vma_unbind().
>>>>>
>>>>> So it needs GEM_BUG_ON(ppgtt->scan_for_unused_pt) before overriding the
>>>>> unbind?
>>>>
>>>> No. They get freed by the cleanup itself. The scan is just an
>>>> opportunistic prune if either the context/mm is evicted but still alive.
>>>
>>> Then the same assert in gen6_ppgtt_cleanup_work? :)
>>
>> Okay ppgtt is gone so can't do it.. annoying.. Cleanup seems to support
>> your claims but I think we need a BFC (big fat comment) above the vma
>> ops override to explains this. With that:
> 
> It has FIXME! I really do hope this is short term...

Ok, that's good enough.

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 8d8a4b0ad4d9..266baa11df64 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1847,6 +1847,33 @@  static void gen6_ppgtt_cleanup_work(struct work_struct *wrk)
 	kfree(work);
 }
 
+static int nop_set_pages(struct i915_vma *vma)
+{
+	return -ENODEV;
+}
+
+static void nop_clear_pages(struct i915_vma *vma)
+{
+}
+
+static int nop_bind(struct i915_vma *vma,
+		    enum i915_cache_level cache_level,
+		    u32 unused)
+{
+	return -ENODEV;
+}
+
+static void nop_unbind(struct i915_vma *vma)
+{
+}
+
+static const struct i915_vma_ops nop_vma_ops = {
+	.set_pages = nop_set_pages,
+	.clear_pages = nop_clear_pages,
+	.bind_vma = nop_bind,
+	.unbind_vma = nop_unbind,
+};
+
 static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
 {
 	struct gen6_hw_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
@@ -1855,6 +1882,7 @@  static void gen6_ppgtt_cleanup(struct i915_address_space *vm)
 	/* FIXME remove the struct_mutex to bring the locking under control */
 	INIT_WORK(&work->base, gen6_ppgtt_cleanup_work);
 	work->vma = ppgtt->vma;
+	work->vma->ops = &nop_vma_ops;
 	schedule_work(&work->base);
 
 	gen6_ppgtt_free_pd(ppgtt);