diff mbox

[v1] drm/i915: Fix a false alert of memory leak when free LRC

Message ID 1447978226-1588-1-git-send-email-yu.dai@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

yu.dai@intel.com Nov. 20, 2015, 12:10 a.m. UTC
From: Alex Dai <yu.dai@intel.com>

There is a memory leak warning message from i915_gem_context_clean
when GuC submission is enabled. The reason is that when LRC is
released, its ppgtt could be still referenced. The assumption that
all VMAs are unbound during release of LRC is not true.

v1: Move the code inside i915_gem_context_clean() to where ppgtt is
released because it is not cleaning context anyway but ppgtt.

Signed-off-by: Alex Dai <yu.dai@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c | 24 ------------------------
 drivers/gpu/drm/i915/i915_gem_gtt.c     | 12 ++++++++++++
 2 files changed, 12 insertions(+), 24 deletions(-)

Comments

Daniel Vetter Nov. 20, 2015, 8:31 a.m. UTC | #1
On Thu, Nov 19, 2015 at 04:10:26PM -0800, yu.dai@intel.com wrote:
> From: Alex Dai <yu.dai@intel.com>
> 
> There is a memory leak warning message from i915_gem_context_clean
> when GuC submission is enabled. The reason is that when LRC is
> released, its ppgtt could be still referenced. The assumption that
> all VMAs are unbound during release of LRC is not true.
> 
> v1: Move the code inside i915_gem_context_clean() to where ppgtt is
> released because it is not cleaning context anyway but ppgtt.
> 
> Signed-off-by: Alex Dai <yu.dai@intel.com>

retire__read drops the ctx (and hence ppgtt) reference too early,
resulting in us hitting the WARNING. See the giant thread with Tvrtko,
Chris and me:

http://www.spinics.net/lists/intel-gfx/msg78918.html

Would be great if someone could test the diff I posted in there.
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_gem_context.c | 24 ------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 12 ++++++++++++
>  2 files changed, 12 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 204dc7c..cc5c8e6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -133,23 +133,6 @@ static int get_context_size(struct drm_device *dev)
>  	return ret;
>  }
>  
> -static void i915_gem_context_clean(struct intel_context *ctx)
> -{
> -	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
> -	struct i915_vma *vma, *next;
> -
> -	if (!ppgtt)
> -		return;
> -
> -	WARN_ON(!list_empty(&ppgtt->base.active_list));
> -
> -	list_for_each_entry_safe(vma, next, &ppgtt->base.inactive_list,
> -				 mm_list) {
> -		if (WARN_ON(__i915_vma_unbind_no_wait(vma)))
> -			break;
> -	}
> -}
> -
>  void i915_gem_context_free(struct kref *ctx_ref)
>  {
>  	struct intel_context *ctx = container_of(ctx_ref, typeof(*ctx), ref);
> @@ -159,13 +142,6 @@ void i915_gem_context_free(struct kref *ctx_ref)
>  	if (i915.enable_execlists)
>  		intel_lr_context_free(ctx);
>  
> -	/*
> -	 * This context is going away and we need to remove all VMAs still
> -	 * around. This is to handle imported shared objects for which
> -	 * destructor did not run when their handles were closed.
> -	 */
> -	i915_gem_context_clean(ctx);
> -
>  	i915_ppgtt_put(ctx->ppgtt);
>  
>  	if (ctx->legacy_hw_ctx.rcs_state)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 016739e..d36943c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2214,6 +2214,7 @@ void  i915_ppgtt_release(struct kref *kref)
>  {
>  	struct i915_hw_ppgtt *ppgtt =
>  		container_of(kref, struct i915_hw_ppgtt, ref);
> +	struct i915_vma *vma, *next;
>  
>  	trace_i915_ppgtt_release(&ppgtt->base);
>  
> @@ -2221,6 +2222,17 @@ void  i915_ppgtt_release(struct kref *kref)
>  	WARN_ON(!list_empty(&ppgtt->base.active_list));
>  	WARN_ON(!list_empty(&ppgtt->base.inactive_list));
>  
> +	/*
> +	 * This ppgtt is going away and we need to remove all VMAs still
> +	 * around. This is to handle imported shared objects for which
> +	 * destructor did not run when their handles were closed.
> +	 */
> +	list_for_each_entry_safe(vma, next, &ppgtt->base.inactive_list,
> +				 mm_list) {
> +		if (WARN_ON(__i915_vma_unbind_no_wait(vma)))
> +			break;
> +	}
> +
>  	list_del(&ppgtt->base.global_link);
>  	drm_mm_takedown(&ppgtt->base.mm);
>  
> -- 
> 2.5.0
>
yu.dai@intel.com Nov. 20, 2015, 6:38 p.m. UTC | #2
On 11/20/2015 12:31 AM, Daniel Vetter wrote:
> On Thu, Nov 19, 2015 at 04:10:26PM -0800, yu.dai@intel.com wrote:
> > From: Alex Dai <yu.dai@intel.com>
> >
> > There is a memory leak warning message from i915_gem_context_clean
> > when GuC submission is enabled. The reason is that when LRC is
> > released, its ppgtt could be still referenced. The assumption that
> > all VMAs are unbound during release of LRC is not true.
> >
> > v1: Move the code inside i915_gem_context_clean() to where ppgtt is
> > released because it is not cleaning context anyway but ppgtt.
> >
> > Signed-off-by: Alex Dai <yu.dai@intel.com>
>
> retire__read drops the ctx (and hence ppgtt) reference too early,
> resulting in us hitting the WARNING. See the giant thread with Tvrtko,
> Chris and me:
>
> http://www.spinics.net/lists/intel-gfx/msg78918.html
>
> Would be great if someone could test the diff I posted in there.

I have to recall my patch because of calling vma_unbind inside 
ppgtt_release is not right.

> > ---
> >  drivers/gpu/drm/i915/i915_gem_context.c | 24 ------------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 12 ++++++++++++
> >  2 files changed, 12 insertions(+), 24 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 204dc7c..cc5c8e6 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -133,23 +133,6 @@ static int get_context_size(struct drm_device *dev)
> >  	return ret;
> >  }
> >
> > -static void i915_gem_context_clean(struct intel_context *ctx)
> > -{
> > -	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
> > -	struct i915_vma *vma, *next;
> > -
> > -	if (!ppgtt)
> > -		return;
> > -
> > -	WARN_ON(!list_empty(&ppgtt->base.active_list));
> > -
> > -	list_for_each_entry_safe(vma, next, &ppgtt->base.inactive_list,
> > -				 mm_list) {
> > -		if (WARN_ON(__i915_vma_unbind_no_wait(vma)))
> > -			break;
> > -	}
> > -}
> > -
> >  void i915_gem_context_free(struct kref *ctx_ref)
> >  {
> >  	struct intel_context *ctx = container_of(ctx_ref, typeof(*ctx), ref);
> > @@ -159,13 +142,6 @@ void i915_gem_context_free(struct kref *ctx_ref)
> >  	if (i915.enable_execlists)
> >  		intel_lr_context_free(ctx);
> >
> > -	/*
> > -	 * This context is going away and we need to remove all VMAs still
> > -	 * around. This is to handle imported shared objects for which
> > -	 * destructor did not run when their handles were closed.
> > -	 */
> > -	i915_gem_context_clean(ctx);
> > -
> >  	i915_ppgtt_put(ctx->ppgtt);
> >
> >  	if (ctx->legacy_hw_ctx.rcs_state)
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index 016739e..d36943c 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -2214,6 +2214,7 @@ void  i915_ppgtt_release(struct kref *kref)
> >  {
> >  	struct i915_hw_ppgtt *ppgtt =
> >  		container_of(kref, struct i915_hw_ppgtt, ref);
> > +	struct i915_vma *vma, *next;
> >
> >  	trace_i915_ppgtt_release(&ppgtt->base);
> >
> > @@ -2221,6 +2222,17 @@ void  i915_ppgtt_release(struct kref *kref)
> >  	WARN_ON(!list_empty(&ppgtt->base.active_list));
> >  	WARN_ON(!list_empty(&ppgtt->base.inactive_list));
> >
> > +	/*
> > +	 * This ppgtt is going away and we need to remove all VMAs still
> > +	 * around. This is to handle imported shared objects for which
> > +	 * destructor did not run when their handles were closed.
> > +	 */
> > +	list_for_each_entry_safe(vma, next, &ppgtt->base.inactive_list,
> > +				 mm_list) {
> > +		if (WARN_ON(__i915_vma_unbind_no_wait(vma)))
> > +			break;
> > +	}
> > +
> >  	list_del(&ppgtt->base.global_link);
> >  	drm_mm_takedown(&ppgtt->base.mm);
> >
> > --
> > 2.5.0
> >
>
Tvrtko Ursulin Nov. 23, 2015, 10:34 a.m. UTC | #3
On 20/11/15 08:31, Daniel Vetter wrote:
> On Thu, Nov 19, 2015 at 04:10:26PM -0800, yu.dai@intel.com wrote:
>> From: Alex Dai <yu.dai@intel.com>
>>
>> There is a memory leak warning message from i915_gem_context_clean
>> when GuC submission is enabled. The reason is that when LRC is
>> released, its ppgtt could be still referenced. The assumption that
>> all VMAs are unbound during release of LRC is not true.
>>
>> v1: Move the code inside i915_gem_context_clean() to where ppgtt is
>> released because it is not cleaning context anyway but ppgtt.
>>
>> Signed-off-by: Alex Dai <yu.dai@intel.com>
>
> retire__read drops the ctx (and hence ppgtt) reference too early,
> resulting in us hitting the WARNING. See the giant thread with Tvrtko,
> Chris and me:
>
> http://www.spinics.net/lists/intel-gfx/msg78918.html
>
> Would be great if someone could test the diff I posted in there.

It doesn't work - I have posted my IGT snippet which I thought explained it.

Problem req unreference in obj->active case. When it hits that path it 
will not move the VMA to inactive and the 
intel_execlists_retire_requests will be the last unreference from the 
retire worker which will trigger the WARN.

I posted an IGT which hits that -> 
http://patchwork.freedesktop.org/patch/65369/

And posted one give up on the active VMA mem leak patch -> 
http://patchwork.freedesktop.org/patch/65529/

I have no idea yet of GuC implications, I just spotted this parallel thread.

And Mika has proposed something interesting - that we could just clean 
up the active VMA in context cleanup since we know it is a false one.

However, again I don't know how that interacts with the GuC. Surely it 
cannot be freeing the context with stuff genuinely still active in the GuC?

Regards,

Tvrtko
yu.dai@intel.com Nov. 23, 2015, 10:30 p.m. UTC | #4
On 11/23/2015 02:34 AM, Tvrtko Ursulin wrote:
> On 20/11/15 08:31, Daniel Vetter wrote:
> > On Thu, Nov 19, 2015 at 04:10:26PM -0800, yu.dai@intel.com wrote:
> >> From: Alex Dai <yu.dai@intel.com>
> >>
> >> There is a memory leak warning message from i915_gem_context_clean
> >> when GuC submission is enabled. The reason is that when LRC is
> >> released, its ppgtt could be still referenced. The assumption that
> >> all VMAs are unbound during release of LRC is not true.
> >>
> >> v1: Move the code inside i915_gem_context_clean() to where ppgtt is
> >> released because it is not cleaning context anyway but ppgtt.
> >>
> >> Signed-off-by: Alex Dai <yu.dai@intel.com>
> >
> > retire__read drops the ctx (and hence ppgtt) reference too early,
> > resulting in us hitting the WARNING. See the giant thread with Tvrtko,
> > Chris and me:
> >
> > http://www.spinics.net/lists/intel-gfx/msg78918.html
> >
> > Would be great if someone could test the diff I posted in there.
>
> It doesn't work - I have posted my IGT snippet which I thought explained it.

I thought moving the VMA list clean up into i915_ppgtt_release() should 
work. However, it creates a chicken & egg problem. ppgtt_release() rely 
on vma_unbound() to be called first to decrease its refcount. So calling 
vma_unbound() inside ppgtt_release() is not right.
> Problem req unreference in obj->active case. When it hits that path it
> will not move the VMA to inactive and the
> intel_execlists_retire_requests will be the last unreference from the
> retire worker which will trigger the WARN.

I still think the problem comes from the assumption that when lrc is 
released, its all VMAs should be unbound. Precisely I mean the comments 
made for i915_gem_context_clean() - "This context is going away and we 
need to remove all VMAs still around." Really the lrc life cycle is 
different from ppgtt / VMAs. Check the line after 
i915_gem_context_clean(). It is ppgtt_put(). In the case lrc is freed 
early, It won't release ppgtt anyway because it is still referenced by 
VMAs. An it will be freed when no ref of GEM obj.

> I posted an IGT which hits that ->
> http://patchwork.freedesktop.org/patch/65369/
>
> And posted one give up on the active VMA mem leak patch ->
> http://patchwork.freedesktop.org/patch/65529/

This patch will silent the warning. But I think the 
i915_gem_context_clean() itself is unnecessary. I don't see any issue by 
deleting it. The check of VMA list is inside ppgtt_release() and the 
unbound should be aligned to GEM obj's life cycle but not lrc life cycle.
> I have no idea yet of GuC implications, I just spotted this parallel thread.
>
> And Mika has proposed something interesting - that we could just clean
> up the active VMA in context cleanup since we know it is a false one.
>
> However, again I don't know how that interacts with the GuC. Surely it
> cannot be freeing the context with stuff genuinely still active in the GuC?
>

There is no interacts with GuC though. Just very easy to see the warning 
when GuC is enabled, says when run gem_close_race. The reason is that 
GuC does not use the execlist_queue (execlist_retired_req_list) which is 
deferred to retire worker. Same as ring submission mode, when GuC is 
enabled, whenever driver submits a new batch, it will try to release 
previous request. I don't know why  intel_execlists_retire_requests is 
not called for this case. Probably because of the unpin. Deferring the 
retirement may just hide the issue. I bet you will see the warning more 
often if you change i915_gem_retire_requests_ring() to 
i915_gem_retire_requests() in i915_gem_execbuffer_reserve().

Thanks,
Alex
Tvrtko Ursulin Nov. 24, 2015, 10:46 a.m. UTC | #5
On 23/11/15 22:30, Yu Dai wrote:
> On 11/23/2015 02:34 AM, Tvrtko Ursulin wrote:
>> On 20/11/15 08:31, Daniel Vetter wrote:
>> > On Thu, Nov 19, 2015 at 04:10:26PM -0800, yu.dai@intel.com wrote:
>> >> From: Alex Dai <yu.dai@intel.com>
>> >>
>> >> There is a memory leak warning message from i915_gem_context_clean
>> >> when GuC submission is enabled. The reason is that when LRC is
>> >> released, its ppgtt could be still referenced. The assumption that
>> >> all VMAs are unbound during release of LRC is not true.
>> >>
>> >> v1: Move the code inside i915_gem_context_clean() to where ppgtt is
>> >> released because it is not cleaning context anyway but ppgtt.
>> >>
>> >> Signed-off-by: Alex Dai <yu.dai@intel.com>
>> >
>> > retire__read drops the ctx (and hence ppgtt) reference too early,
>> > resulting in us hitting the WARNING. See the giant thread with Tvrtko,
>> > Chris and me:
>> >
>> > http://www.spinics.net/lists/intel-gfx/msg78918.html
>> >
>> > Would be great if someone could test the diff I posted in there.
>>
>> It doesn't work - I have posted my IGT snippet which I thought
>> explained it.
>
> I thought moving the VMA list clean up into i915_ppgtt_release() should
> work. However, it creates a chicken & egg problem. ppgtt_release() rely
> on vma_unbound() to be called first to decrease its refcount. So calling
> vma_unbound() inside ppgtt_release() is not right.
>> Problem req unreference in obj->active case. When it hits that path it
>> will not move the VMA to inactive and the
>> intel_execlists_retire_requests will be the last unreference from the
>> retire worker which will trigger the WARN.
>
> I still think the problem comes from the assumption that when lrc is
> released, its all VMAs should be unbound. Precisely I mean the comments
> made for i915_gem_context_clean() - "This context is going away and we
> need to remove all VMAs still around." Really the lrc life cycle is
> different from ppgtt / VMAs. Check the line after
> i915_gem_context_clean(). It is ppgtt_put(). In the case lrc is freed
> early, It won't release ppgtt anyway because it is still referenced by
> VMAs. An it will be freed when no ref of GEM obj.

Yes, but the last is just a consequence of how the code was laid out, 
not a hard requirement as far as I understand it.

When context destructor runs that means two things:

1. GPU is done with all VMAs belonging to the VM.
2. USerspace also cannot reach them any more either.

So VMAs have no reason to exist beyond that point. If they do, they can 
be left dangling under certain circumstances. See below.

>> I posted an IGT which hits that ->
>> http://patchwork.freedesktop.org/patch/65369/
>>
>> And posted one give up on the active VMA mem leak patch ->
>> http://patchwork.freedesktop.org/patch/65529/
>
> This patch will silent the warning. But I think the
> i915_gem_context_clean() itself is unnecessary. I don't see any issue by
> deleting it. The check of VMA list is inside ppgtt_release() and the
> unbound should be aligned to GEM obj's life cycle but not lrc life cycle.

i915_gem_context_clean solves a memory leak which is explained in the 
commit.

If there is an extra reference on the GEM obj, like in the flink case, 
VMAs will not get unbound.

Apart from various IGTs, you can also demonstrate this leak with fbcon 
and Xorg. Every time you re-start Xorg one VMA will be leaked since it 
imports the fbcon fb via flink.

Second part of the story is that the WARN in i915_gem_context_clean is 
wrong because of how retirement works. So if we removed the WARN, 
cleanup still has some value to avoid memory leak in the above described 
Xorg case, or in any case where flink is in the picture and VMAs have 
been retired to inactive.

Bug left standing would be that if the VMA happens to be flinked and on 
the active list, because of say being shared between rings and contexts, 
it would still be leaked. But it is less leaking than without the cleanup.

I proposed another way of fixing that: 
http://patchwork.freedesktop.org/patch/65861/

>> I have no idea yet of GuC implications, I just spotted this parallel
>> thread.
>>
>> And Mika has proposed something interesting - that we could just clean
>> up the active VMA in context cleanup since we know it is a false one.
>>
>> However, again I don't know how that interacts with the GuC. Surely it
>> cannot be freeing the context with stuff genuinely still active in the
>> GuC?
>>
>
> There is no interacts with GuC though. Just very easy to see the warning
> when GuC is enabled, says when run gem_close_race. The reason is that
> GuC does not use the execlist_queue (execlist_retired_req_list) which is
> deferred to retire worker. Same as ring submission mode, when GuC is
> enabled, whenever driver submits a new batch, it will try to release
> previous request. I don't know why  intel_execlists_retire_requests is
> not called for this case. Probably because of the unpin. Deferring the
> retirement may just hide the issue. I bet you will see the warning more
> often if you change i915_gem_retire_requests_ring() to
> i915_gem_retire_requests() in i915_gem_execbuffer_reserve().

Hmmm.. gem_close_race uses flink, but why would it trigger context 
destruction with active VMAs? How does the backtrace looks like from the 
context cleanup WARN?

Regards,

Tvrtko
Daniel Vetter Nov. 24, 2015, 10:57 a.m. UTC | #6
On Mon, Nov 23, 2015 at 10:34:59AM +0000, Tvrtko Ursulin wrote:
> 
> On 20/11/15 08:31, Daniel Vetter wrote:
> >On Thu, Nov 19, 2015 at 04:10:26PM -0800, yu.dai@intel.com wrote:
> >>From: Alex Dai <yu.dai@intel.com>
> >>
> >>There is a memory leak warning message from i915_gem_context_clean
> >>when GuC submission is enabled. The reason is that when LRC is
> >>released, its ppgtt could be still referenced. The assumption that
> >>all VMAs are unbound during release of LRC is not true.
> >>
> >>v1: Move the code inside i915_gem_context_clean() to where ppgtt is
> >>released because it is not cleaning context anyway but ppgtt.
> >>
> >>Signed-off-by: Alex Dai <yu.dai@intel.com>
> >
> >retire__read drops the ctx (and hence ppgtt) reference too early,
> >resulting in us hitting the WARNING. See the giant thread with Tvrtko,
> >Chris and me:
> >
> >http://www.spinics.net/lists/intel-gfx/msg78918.html
> >
> >Would be great if someone could test the diff I posted in there.
> 
> It doesn't work - I have posted my IGT snippet which I thought explained it.
> 
> Problem req unreference in obj->active case. When it hits that path it will
> not move the VMA to inactive and the intel_execlists_retire_requests will be
> the last unreference from the retire worker which will trigger the WARN.
> 
> I posted an IGT which hits that ->
> http://patchwork.freedesktop.org/patch/65369/
> 
> And posted one give up on the active VMA mem leak patch ->
> http://patchwork.freedesktop.org/patch/65529/

Ok, I get things now. Fundamentally the problem is that we track active
per-obj, but we want it per-vma. In a way this patch just diggs us deeper
into that hole, which doesn't make me all too happy. Oh well.

I'll pull in the warning removal.
-Daniel

> I have no idea yet of GuC implications, I just spotted this parallel thread.
> 
> And Mika has proposed something interesting - that we could just clean up
> the active VMA in context cleanup since we know it is a false one.
> 
> However, again I don't know how that interacts with the GuC. Surely it
> cannot be freeing the context with stuff genuinely still active in the GuC?
> 
> Regards,
> 
> Tvrtko
>
Chris Wilson Nov. 24, 2015, 12:50 p.m. UTC | #7
On Tue, Nov 24, 2015 at 11:57:22AM +0100, Daniel Vetter wrote:
> On Mon, Nov 23, 2015 at 10:34:59AM +0000, Tvrtko Ursulin wrote:
> > 
> > On 20/11/15 08:31, Daniel Vetter wrote:
> > >On Thu, Nov 19, 2015 at 04:10:26PM -0800, yu.dai@intel.com wrote:
> > >>From: Alex Dai <yu.dai@intel.com>
> > >>
> > >>There is a memory leak warning message from i915_gem_context_clean
> > >>when GuC submission is enabled. The reason is that when LRC is
> > >>released, its ppgtt could be still referenced. The assumption that
> > >>all VMAs are unbound during release of LRC is not true.
> > >>
> > >>v1: Move the code inside i915_gem_context_clean() to where ppgtt is
> > >>released because it is not cleaning context anyway but ppgtt.
> > >>
> > >>Signed-off-by: Alex Dai <yu.dai@intel.com>
> > >
> > >retire__read drops the ctx (and hence ppgtt) reference too early,
> > >resulting in us hitting the WARNING. See the giant thread with Tvrtko,
> > >Chris and me:
> > >
> > >http://www.spinics.net/lists/intel-gfx/msg78918.html
> > >
> > >Would be great if someone could test the diff I posted in there.
> > 
> > It doesn't work - I have posted my IGT snippet which I thought explained it.
> > 
> > Problem req unreference in obj->active case. When it hits that path it will
> > not move the VMA to inactive and the intel_execlists_retire_requests will be
> > the last unreference from the retire worker which will trigger the WARN.
> > 
> > I posted an IGT which hits that ->
> > http://patchwork.freedesktop.org/patch/65369/
> > 
> > And posted one give up on the active VMA mem leak patch ->
> > http://patchwork.freedesktop.org/patch/65529/
> 
> Ok, I get things now. Fundamentally the problem is that we track active
> per-obj, but we want it per-vma. In a way this patch just diggs us deeper
> into that hole, which doesn't make me all too happy. Oh well.
> 
> I'll pull in the warning removal.

Just revert the incomplete patch, rather than the warning that the
patch is suspect.
-Chris
Chris Wilson Nov. 24, 2015, 12:51 p.m. UTC | #8
On Tue, Nov 24, 2015 at 11:57:22AM +0100, Daniel Vetter wrote:
> Ok, I get things now. Fundamentally the problem is that we track active
> per-obj, but we want it per-vma. In a way this patch just diggs us deeper
> into that hole, which doesn't make me all too happy. Oh well.

Why not, I've been chasing reviewers for vma tracking for the last 15
months.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 204dc7c..cc5c8e6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -133,23 +133,6 @@  static int get_context_size(struct drm_device *dev)
 	return ret;
 }
 
-static void i915_gem_context_clean(struct intel_context *ctx)
-{
-	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
-	struct i915_vma *vma, *next;
-
-	if (!ppgtt)
-		return;
-
-	WARN_ON(!list_empty(&ppgtt->base.active_list));
-
-	list_for_each_entry_safe(vma, next, &ppgtt->base.inactive_list,
-				 mm_list) {
-		if (WARN_ON(__i915_vma_unbind_no_wait(vma)))
-			break;
-	}
-}
-
 void i915_gem_context_free(struct kref *ctx_ref)
 {
 	struct intel_context *ctx = container_of(ctx_ref, typeof(*ctx), ref);
@@ -159,13 +142,6 @@  void i915_gem_context_free(struct kref *ctx_ref)
 	if (i915.enable_execlists)
 		intel_lr_context_free(ctx);
 
-	/*
-	 * This context is going away and we need to remove all VMAs still
-	 * around. This is to handle imported shared objects for which
-	 * destructor did not run when their handles were closed.
-	 */
-	i915_gem_context_clean(ctx);
-
 	i915_ppgtt_put(ctx->ppgtt);
 
 	if (ctx->legacy_hw_ctx.rcs_state)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 016739e..d36943c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2214,6 +2214,7 @@  void  i915_ppgtt_release(struct kref *kref)
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(kref, struct i915_hw_ppgtt, ref);
+	struct i915_vma *vma, *next;
 
 	trace_i915_ppgtt_release(&ppgtt->base);
 
@@ -2221,6 +2222,17 @@  void  i915_ppgtt_release(struct kref *kref)
 	WARN_ON(!list_empty(&ppgtt->base.active_list));
 	WARN_ON(!list_empty(&ppgtt->base.inactive_list));
 
+	/*
+	 * This ppgtt is going away and we need to remove all VMAs still
+	 * around. This is to handle imported shared objects for which
+	 * destructor did not run when their handles were closed.
+	 */
+	list_for_each_entry_safe(vma, next, &ppgtt->base.inactive_list,
+				 mm_list) {
+		if (WARN_ON(__i915_vma_unbind_no_wait(vma)))
+			break;
+	}
+
 	list_del(&ppgtt->base.global_link);
 	drm_mm_takedown(&ppgtt->base.mm);